Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
@@ -1,42 +1,42 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'


def cohort_dataset_forward_bar(var, editor, **kwargs):
    '''
    The forward looking bar charts looks for every month that an editor
    was part of the Wikimedia community whether this person made at least cutoff
    value edits. If yes, then include this person in the analysis, else skip the
    person.
    '''
    new_wikipedian = editor['new_wikipedian']
    last_edit = editor['final_edit']
    monthly_edits = editor['monthly_edits']
    yearly_edits = editor['edits_by_year']
    n = editor['edit_count']

    if n >= var.cum_cutoff:
        for year in xrange(new_wikipedian.year, var.max_year):
            max_edits = max(monthly_edits.get(str(year), {0:0}).values())
            if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:
                continue
            else:
                experience = (year - new_wikipedian.year) + 1
                var.add(new_wikipedian, 1, {'experience':experience})
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
___________________________________________________________________
Added: svn:eol-style
   + native
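The plugin only touches a few attributes on var (cum_cutoff, cutoff, max_year and an add() method) plus a dictionary describing one editor. A minimal sketch of how it could be exercised in isolation; the stub variable class and the sample editor document below are made up for illustration and are not part of this commit:

import datetime

class StubVariable(object):
    '''Hypothetical stand-in for the framework's dataset variable.'''
    def __init__(self, cum_cutoff, cutoff, max_year):
        self.cum_cutoff = cum_cutoff   # minimum career edit count
        self.cutoff = cutoff           # minimum edits in the busiest month of a year
        self.max_year = max_year       # first year that is no longer analyzed
        self.observations = []

    def add(self, date, value, meta):
        self.observations.append((date, value, meta))

editor = {
    'new_wikipedian': datetime.datetime(2008, 3, 1),
    'final_edit': datetime.datetime(2010, 7, 1),
    'monthly_edits': {'2008': {'3': 12, '4': 2}, '2009': {'1': 30}},
    'edits_by_year': {'2008': 14, '2009': 30, '2010': 0},
    'edit_count': 44,
}

var = cohort_dataset_forward_bar(StubVariable(10, 10, 2011), editor)
# var.observations now holds one entry per qualifying year:
# experience 1 for 2008 and experience 2 for 2009; 2010 is skipped.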
Index: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
@@ -1,26 +1,26 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'

def time_to_new_wikipedian(var, editor, **kwargs):
# headers = ['year', 'time_to_new_wikipedian']
    new_wikipedian = editor['new_wikipedian']
    first_edit = editor['first_edit']
    dt = new_wikipedian - first_edit
    var.add(new_wikipedian, dt.days)
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
___________________________________________________________________
Added: svn:eol-style
   + native
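The analysis reduces each editor to a single number: the days between the first edit and the moment the editor qualified as a new wikipedian. A small standalone illustration of that timedelta arithmetic (the dates below are made up):

import datetime

first_edit = datetime.datetime(2007, 11, 2)
new_wikipedian = datetime.datetime(2008, 1, 15)
dt = new_wikipedian - first_edit   # a datetime.timedelta
assert dt.days == 74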
Property changes on: trunk/tools/editor_trends/analyses/plugins/__init__.py
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
@@ -1,25 +1,25 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'



def total_number_of_new_wikipedians(var, editor, **kwargs):
    new_wikipedian = editor['new_wikipedian']
    var.add(new_wikipedian, 1, {'year':new_wikipedian.year})
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
___________________________________________________________________
Added: svn:eol-style
   + native
Property changes on: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
@@ -1,51 +1,51 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'

import datetime
from dateutil.relativedelta import *

def cohort_dataset_forward_histogram(var, editor, **kwargs):
# headers = ['year', 'month', 'edits']
    '''
    The forward looking histogram looks for every month that an editor
    was part of the Wikimedia community whether this person made at least cutoff
    value edits. If yes, then include this person in the analysis, else skip the
    person.
    '''

    new_wikipedian = editor['new_wikipedian']
    final_edit = editor['final_edit'].year + 1
    yearly_edits = editor['edits_by_year']
    n = editor['edit_count']

    if n >= var.cum_cutoff:
        for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):
            edits = editor['monthly_edits'].get(str(year), {0:0})
            if year == new_wikipedian.year:
                start = new_wikipedian.month
            else:
                start = 1

            for month in xrange(start, 13):
                if edits.get(str(month), 0) >= var.cutoff:
                    dt = datetime.datetime(year, month, 1)
                    experience = relativedelta(dt - new_wikipedian)
                    experience = experience.years * 12 + experience.months
                    var.add(new_wikipedian, 1, {'experience': experience})
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
___________________________________________________________________
Added: svn:eol-style
   + native
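The experience value recorded above is the number of whole months between the month being scored and the editor's new_wikipedian date. A standalone sketch of that month arithmetic, using the two-datetime form of relativedelta (the dates below are made up):

import datetime
from dateutil.relativedelta import relativedelta

new_wikipedian = datetime.datetime(2008, 3, 10)
scored_month = datetime.datetime(2009, 1, 1)

rd = relativedelta(scored_month, new_wikipedian)   # calendar-aware difference
experience = rd.years * 12 + rd.months             # 9 whole months here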
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -1,50 +1,50 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'

import datetime
from dateutil.relativedelta import relativedelta
from utils import data_converter


def cohort_dataset_backward_bar(var, editor, **kwargs):
    '''
    The backward looking bar chart looks for every year that an editor
    was part of the Wikimedia community whether this person made at least cutoff
    value edits. If yes, then include this person in the analysis, else skip the
    person.
    '''
    break_down = kwargs.pop('break_down', False)
    new_wikipedian = editor['new_wikipedian']
    n = editor['edit_count']

    if n >= var.cum_cutoff:
        windows = data_converter.create_windows(var, break_down_first_year=break_down)
        for year in xrange(new_wikipedian.year, var.max_year):
            year = str(year)
            if editor['edits_by_year'][year] >= var.cutoff:
                last_edit = editor['last_edit_by_year'][year]
                if last_edit != 0.0:
                    editor_dt = relativedelta(last_edit, new_wikipedian)
                    editor_dt = (editor_dt.years * 12) + editor_dt.months
                    for w in windows:
                        if w >= editor_dt:
                            datum = datetime.datetime(int(year), 12, 31)
                            var.add(datum, 1, {'window':w})
                            break
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
___________________________________________________________________
Added: svn:eol-style
   + native
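data_converter.create_windows() is defined elsewhere in the repository and its exact output is not visible in this diff; the sketch below assumes it yields an ascending list of month counts, only to show how the loop above assigns an editor to the first window that covers their tenure:

def first_window(windows, months_active):
    # Mirrors the inner loop above: return the smallest window (in months)
    # that still covers the editor's tenure, or None if none does.
    for w in windows:
        if w >= months_active:
            return w
    return None

windows = [3, 6, 9, 12, 24, 48]   # assumed shape of create_windows() output
assert first_window(windows, 7) == 9
assert first_window(windows, 24) == 24
assert first_window(windows, 60) is None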
Index: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
@@ -1,26 +1,26 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'


def histogram_edits(var, editor, **kwargs):
# headers = ['year', 'num_edits', 'frequency']
    cnt = editor['edit_count']
    new_wikipedian = editor['new_wikipedian']
    var.add(new_wikipedian, cnt)
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
@@ -1,28 +1,28 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'


def total_number_of_articles(var, editor, **kwargs):
    for year in editor['edits']:
        edits = editor['edits'][year]
        for edit in edits:
            article = edit['article']
            date = edit['date']
            var.add(date, 1, {'article':article})
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
@@ -1,31 +1,31 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http://www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-01-25'
__version__ = '0.1'


def new_editor_count(var, editor, **kwargs):
    '''
    Summary: This function generates an overview of the number of
    new_wikipedians for a given year / month combination.
    Purpose: This data can be used to compare with Erik Zachte's
    stats.download.org to make sure that we are using the same numbers.
    '''
# headers = ['year', 'month', 'count']
    new_wikipedian = editor['new_wikipedian']
    var.add(new_wikipedian, 1)
    return var

Property changes on: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
___________________________________________________________________
Added: svn:eol-style
   + native
Index: trunk/tools/editor_trends/analyses/inventory.py
@@ -1,70 +1,70 @@ (all lines removed and re-added unchanged; line-ending normalization only)
#!/usr/bin/python
# coding=utf-8
'''
Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details, at
http,//www.fsf.org/licenses/gpl.html
'''

__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
__author__email = 'dvanliere at gmail dot com'
__date__ = '2011-02-11'
__version__ = '0.1'


import os
import sys
import types

def available_analyses(caller='manage'):
    '''
    Generates a dictionary:
    key: name of analysis
    value: function that generates the dataset
    ignore: a list of functions that should never be called from manage.py,
    they are not valid entry points.
    '''
    assert caller == 'django' or caller == 'manage'
    ignore = ['__init__']
    functions = {}

    fn = os.path.realpath(__file__)
    pos = fn.rfind(os.sep)
    loc = fn[:pos]
    path = os.path.join(loc , 'plugins')
    plugins = import_libs(path)

    for plugin in plugins:
        if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
            functions[plugin.func_name] = plugin
    if caller == 'manage':
        return functions
    elif caller == 'django':
        django_functions = []
        for function in functions:
            fancy_name = function.replace('_', ' ').title()
            django_functions.append((function, fancy_name))

        return django_functions


def import_libs(path):
    '''
    Dynamically importing functions from the plugins directory.
    '''
    library_list = []
    sys.path.append(path)
    for f in os.listdir(os.path.abspath(path)):
        module_name, ext = os.path.splitext(f)
        if ext == '.py':
            module = __import__(module_name)
            func = getattr(module, module_name)
            library_list.append(func)

    return library_list

Property changes on: trunk/tools/editor_trends/analyses/inventory.py
___________________________________________________________________
Added: svn:eol-style
   + native
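import_libs() relies on a naming convention: every module in analyses/plugins exposes exactly one public function whose name equals the module name. A rough equivalent of that discovery step written with importlib and pkgutil, shown only as an illustration of the convention; it assumes the plugins directory is importable as a package called analyses.plugins, which this commit does not establish:

import importlib
import pkgutil

def load_plugins(package_name='analyses.plugins'):
    # Collect a name -> callable mapping for every module that follows the
    # module-name == function-name convention used by the plugins above.
    package = importlib.import_module(package_name)
    plugins = {}
    for _, module_name, _ in pkgutil.iter_modules(package.__path__):
        module = importlib.import_module('%s.%s' % (package_name, module_name))
        func = getattr(module, module_name, None)
        if callable(func):
            plugins[module_name] = func
    return plugins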
Property changes on: trunk/tools/editor_trends/manage.py
___________________________________________________________________
Deleted: svn:mime-type
   - text/plain

Property changes on: trunk/tools/editor_trends/wikitree/parser.py
___________________________________________________________________
Deleted: svn:mime-type
   - text/plain

Property changes on: trunk/tools/editor_trends/wikitree/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
   - text/plain
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -1,462 +1,462 @@ |
2 | | -#!/usr/bin/python
|
3 | | -# -*- coding: utf-8 -*-
|
4 | | -'''
|
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
|
6 | | -This program is free software; you can redistribute it and/or
|
7 | | -modify it under the terms of the GNU General Public License version 2
|
8 | | -as published by the Free Software Foundation.
|
9 | | -This program is distributed in the hope that it will be useful,
|
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
12 | | -See the GNU General Public License for more details, at
|
13 | | -http://www.fsf.org/licenses/gpl.html
|
14 | | -'''
|
15 | | -
|
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
|
17 | | -__email__ = 'dvanliere at gmail dot com'
|
18 | | -__date__ = '2011-02-06'
|
19 | | -__version__ = '0.1'
|
20 | | -
|
21 | | -
|
22 | | -import bz2
|
23 | | -import cStringIO
|
24 | | -import hashlib
|
25 | | -import codecs
|
26 | | -import re
|
27 | | -import sys
|
28 | | -import progressbar
|
29 | | -from multiprocessing import JoinableQueue, Process, cpu_count, current_process
|
30 | | -from xml.etree.cElementTree import fromstring, iterparse
|
31 | | -from collections import deque
|
32 | | -
|
33 | | -if '..' not in sys.path:
|
34 | | - sys.path.append('..')
|
35 | | -
|
36 | | -try:
|
37 | | - from database import cassandra
|
38 | | - import pycassa
|
39 | | -
|
40 | | -except ImportError:
|
41 | | - print 'I am not going to use Cassandra today, it\'s my off day.'
|
42 | | -
|
43 | | -
|
44 | | -
|
45 | | -from database import db
|
46 | | -from bots import detector
|
47 | | -from utils import file_utils
|
48 | | -import extracter
|
49 | | -
|
50 | | -RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')
|
51 | | -
|
52 | | -NAMESPACE = {
|
53 | | - #0:'Main',
|
54 | | - #1:'Talk',
|
55 | | - #2:'User',
|
56 | | - #3:'User talk',
|
57 | | - 4:'Wikipedia',
|
58 | | - #5:'Wikipedia talk',
|
59 | | - 6:'File',
|
60 | | - #7:'File talk',
|
61 | | - 8:'MediaWiki',
|
62 | | - #9:'MediaWiki talk',
|
63 | | - 10:'Template',
|
64 | | - #11:'Template talk',
|
65 | | - 12:'Help',
|
66 | | - #13:'Help talk',
|
67 | | - 14:'Category',
|
68 | | - #15:'Category talk',
|
69 | | - 90:'Thread',
|
70 | | - #91:'Thread talk',
|
71 | | - 92:'Summary',
|
72 | | - #93:'Summary talk',
|
73 | | - 100:'Portal',
|
74 | | - #101:'Portal talk',
|
75 | | - 108:'Book',
|
76 | | - #109:'Book talk'
|
77 | | -}
|
78 | | -
|
79 | | -
|
80 | | -class Buffer:
|
81 | | - def __init__(self, storage, id):
|
82 | | - assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \
|
83 | | - 'Valid storage options are cassandra and mongo.'
|
84 | | - self.storage = storage
|
85 | | - self.revisions = {}
|
86 | | - self.comments = {}
|
87 | | - self.id = id
|
88 | | - self.keyspace_name = 'enwiki'
|
89 | | - self.keys = ['revision_id', 'article_id', 'id', 'namespace',
|
90 | | - 'title', 'timestamp', 'hash', 'revert', 'bot', 'prev_size',
|
91 | | - 'cur_size', 'delta']
|
92 | | - self.setup_storage()
|
93 | | -
|
94 | | - def setup_storage(self):
|
95 | | - if self.storage == 'cassandra':
|
96 | | - self.db = pycassa.connect(self.keyspace_name)
|
97 | | - self.collection = pycassa.ColumnFamily(self.db, 'revisions')
|
98 | | -
|
99 | | - elif self.storage == 'mongo':
|
100 | | - self.db = db.init_mongo_db(self.keyspace_name)
|
101 | | - self.collection = self.db['kaggle']
|
102 | | -
|
103 | | - else:
|
104 | | - kaggle_file = 'kaggle_%s.csv' % self.id
|
105 | | - comment_file = 'kaggle_comments_%s.csv' % self.id
|
106 | | - file_utils.delete_file('', kaggle_file, directory=False)
|
107 | | - file_utils.delete_file('', comment_file, directory=False)
|
108 | | - self.fh_main = codecs.open(kaggle_file, 'a', 'utf-8')
|
109 | | - self.fh_extra = codecs.open(comment_file, 'a', 'utf-8')
|
110 | | -
|
111 | | - def add(self, revision):
|
112 | | - self.stringify(revision)
|
113 | | - id = revision['revision_id']
|
114 | | - self.revisions[id] = revision
|
115 | | - if len(self.revisions) == 1000:
|
116 | | - self.store()
|
117 | | - self.clear()
|
118 | | -
|
119 | | - def stringify(self, revision):
|
120 | | - for key, value in revision.iteritems():
|
121 | | - try:
|
122 | | - value = str(value)
|
123 | | - except UnicodeEncodeError:
|
124 | | - value = value.encode('utf-8')
|
125 | | - revision[key] = value
|
126 | | -
|
127 | | - def empty(self):
|
128 | | - self.store()
|
129 | | - self.clear()
|
130 | | - if self.storage == 'csv':
|
131 | | - self.fh_main.close()
|
132 | | - self.fh_extra.close()
|
133 | | -
|
134 | | - def clear(self):
|
135 | | - self.revisions = {}
|
136 | | - self.comments = {}
|
137 | | -
|
138 | | - def store(self):
|
139 | | - if self.storage == 'cassandra':
|
140 | | - self.collection.batch_insert(self.revisions)
|
141 | | - elif self.storage == 'mongo':
|
142 | | - print 'insert into mongo'
|
143 | | - else:
|
144 | | - for revision in self.revisions.itervalues():
|
145 | | - values = []
|
146 | | - for key in self.keys:
|
147 | | - values.append(revision[key].decode('utf-8'))
|
148 | | -
|
149 | | - value = '\t'.join(values) + '\n'
|
150 | | - row = '\t'.join([key, value])
|
151 | | - self.fh_main.write(row)
|
152 | | -
|
153 | | - for revision_id, comment in self.comments.iteritems():
|
154 | | - comment = comment.decode('utf-8')
|
155 | | - row = '\t'.join([revision_id, comment]) + '\n'
|
156 | | - self.fh_extra.write(row)
|
157 | | -
|
158 | | -
|
159 | | -def extract_categories():
|
160 | | - '''
|
161 | | - Field 1: page id
|
162 | | - Field 2: name category
|
163 | | - Field 3: sort key
|
164 | | - Field 4: timestamp last change
|
165 | | - '''
|
166 | | - filename = 'C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-categorylinks.sql'
|
167 | | - output = codecs.open('categories.csv', 'w', encoding='utf-8')
|
168 | | - fh = codecs.open(filename, 'r', encoding='utf-8')
|
169 | | -
|
170 | | - try:
|
171 | | - for line in fh:
|
172 | | - if line.startswith('INSERT INTO `categorylinks` VALUES ('):
|
173 | | - line = line.replace('INSERT INTO `categorylinks` VALUES (', '')
|
174 | | - line = line.replace("'", '')
|
175 | | - categories = line.split('),(')
|
176 | | - for category in categories:
|
177 | | - category = category.split(',')
|
178 | | - if len(category) == 4:
|
179 | | - output.write('%s\t%s\n' % (category[0], category[1]))
|
180 | | - except UnicodeDecodeError, e:
|
181 | | - print e
|
182 | | -
|
183 | | - output.close()
|
184 | | - fh.close()
|
185 | | -
|
186 | | -
|
187 | | -def extract_revision_text(revision):
|
188 | | - rev = revision.find('text')
|
189 | | - if rev != None:
|
190 | | - if rev.text == None:
|
191 | | - rev = fix_revision_text(revision)
|
192 | | - return rev.text.encode('utf-8')
|
193 | | - else:
|
194 | | - return ''
|
195 | | -
|
196 | | -
|
197 | | -def fix_revision_text(revision):
|
198 | | - if revision.text == None:
|
199 | | - revision.text = ''
|
200 | | - return revision
|
201 | | -
|
202 | | -
|
203 | | -def create_md5hash(text):
|
204 | | - hash = {}
|
205 | | - if text != None:
|
206 | | - m = hashlib.md5()
|
207 | | - m.update(text)
|
208 | | - #echo m.digest()
|
209 | | - hash['hash'] = m.hexdigest()
|
210 | | - else:
|
211 | | - hash['hash'] = -1
|
212 | | - return hash
|
213 | | -
|
214 | | -
|
215 | | -def calculate_delta_article_size(size, text):
|
216 | | - if 'prev_size' not in size:
|
217 | | - size['prev_size'] = 0
|
218 | | - size['cur_size'] = len(text)
|
219 | | - size['delta'] = len(text)
|
220 | | - else:
|
221 | | - size['prev_size'] = size['cur_size']
|
222 | | - delta = len(text) - size['prev_size']
|
223 | | - size['cur_size'] = len(text)
|
224 | | - size['delta'] = delta
|
225 | | - return size
|
226 | | -
|
227 | | -
|
228 | | -def parse_contributor(contributor, bots):
|
229 | | - username = extracter.extract_username(contributor)
|
230 | | - user_id = extracter.extract_contributor_id(contributor)
|
231 | | - bot = extracter.determine_username_is_bot(contributor, bots=bots)
|
232 | | - contributor = {}
|
233 | | - contributor['username'] = username
|
234 | | - contributor['bot'] = bot
|
235 | | - if user_id != None:
|
236 | | - contributor.update(user_id)
|
237 | | - else:
|
238 | | - contributor = False
|
239 | | - return contributor
|
240 | | -
|
241 | | -
|
242 | | -def determine_namespace(title):
|
243 | | - namespaces = {'User': 2,
|
244 | | - 'Talk': 1,
|
245 | | - 'User Talk': 3,
|
246 | | - }
|
247 | | - ns = {}
|
248 | | - if title.text != None:
|
249 | | - title = title.text
|
250 | | - for namespace in namespaces:
|
251 | | - if title.startswith(namespace):
|
252 | | - ns['namespace'] = namespaces[namespace]
|
253 | | - if ns == {}:
|
254 | | - for namespace in NAMESPACE:
|
255 | | - if title.startswith(namespace):
|
256 | | - ns = False #article does not belong to either the main namespace, user, talk or user talk namespace.
|
257 | | - break
|
258 | | - ns['namespace'] = 0
|
259 | | - else:
|
260 | | - ns = False
|
261 | | - return ns
|
262 | | -
|
263 | | -
|
264 | | -def prefill_row(title, article_id, namespace):
|
265 | | - row = {}
|
266 | | - row['title'] = title.text
|
267 | | - row['article_id'] = article_id
|
268 | | - row.update(namespace)
|
269 | | - return row
|
270 | | -
|
271 | | -
|
272 | | -def is_revision_reverted(hash_cur, hashes):
|
273 | | - revert = {}
|
274 | | - if hash_cur in hashes:
|
275 | | - revert['revert'] = 1
|
276 | | - else:
|
277 | | - revert['revert'] = 0
|
278 | | - return revert
|
279 | | -
|
280 | | -
|
281 | | -def create_variables(result_queue, storage, id):
|
282 | | - bots = detector.retrieve_bots('en')
|
283 | | - buffer = Buffer(storage, id)
|
284 | | - i = 0
|
285 | | - while True:
|
286 | | - article = result_queue.get(block=True)
|
287 | | - result_queue.task_done()
|
288 | | - if article == None:
|
289 | | - break
|
290 | | - i += 1
|
291 | | - #article = fromstring(article)
|
292 | | - title = article['title'].text
|
293 | | - namespace = determine_namespace(title)
|
294 | | - if namespace != False:
|
295 | | - #revisions = article.findall('revision')
|
296 | | - article_id = article['id'].text
|
297 | | - hashes = deque(maxlen=1000)
|
298 | | - size = {}
|
299 | | - for revision in article['revision']:
|
300 | | - if revision == None:
|
301 | | - #the entire revision is empty, weird.
|
302 | | - continue
|
303 | | -
|
304 | | - contributor = revision.find('contributor')
|
305 | | - contributor = parse_contributor(contributor, bots)
|
306 | | - if not contributor:
|
307 | | - #editor is anonymous, ignore
|
308 | | - continue
|
309 | | -
|
310 | | - revision_id = revision.find('id')
|
311 | | - revision_id = extracter.extract_revision_id(revision_id)
|
312 | | - if revision_id == None:
|
313 | | - #revision_id is missing, which is weird
|
314 | | - continue
|
315 | | -
|
316 | | - row = prefill_row(title, article_id, namespace)
|
317 | | - row['revision_id'] = revision_id
|
318 | | - text = extract_revision_text(revision)
|
319 | | - row.update(contributor)
|
320 | | -
|
321 | | -
|
322 | | - timestamp = revision.find('timestamp').text
|
323 | | - row['timestamp'] = timestamp
|
324 | | -
|
325 | | - hash = create_md5hash(text)
|
326 | | - revert = is_revision_reverted(hash['hash'], hashes)
|
327 | | - hashes.append(hash['hash'])
|
328 | | - size = calculate_delta_article_size(size, text)
|
329 | | -
|
330 | | - row.update(hash)
|
331 | | - row.update(size)
|
332 | | - row.update(revert)
|
333 | | - # print row
|
334 | | - # if row['username'] == None:
|
335 | | - # contributor = revision.find('contributor')
|
336 | | - # attrs = contributor.getchildren()
|
337 | | - # for attr in attrs:
|
338 | | - # print attr.text
|
339 | | - #print revision_id, hash, delta, prev_size\
|
340 | | -
|
341 | | - buffer.add(row)
|
342 | | - if i % 10000 == 0:
|
343 | | - print 'Parsed %s articles' % i
|
344 | | -# except ValueError, e:
|
345 | | -# print e
|
346 | | -# except UnicodeDecodeError, e:
|
347 | | -# print e
|
348 | | - buffer.empty()
|
349 | | - print 'Buffer is empty'
|
350 | | -
|
351 | | -
|
352 | | -def parse_xml(source, result_queue):
|
353 | | - context = iterparse(source, events=('end',))
|
354 | | - context = iter(context)
|
355 | | - event, root = context.next()
|
356 | | -
|
357 | | - article = {}
|
358 | | - id = False
|
359 | | - for event, elem in context:
|
360 | | - if event == 'end' and elem.tag == 'revision':
|
361 | | - article[elem.tag] = elem
|
362 | | - elif event == 'end' and elem.tag == 'id' and id == False:
|
363 | | - article[elem.tag] = elem
|
364 | | - id = True
|
365 | | - article[root.tag] = root
|
366 | | - result_queue.put(article)
|
367 | | - root.clear()
|
368 | | -
|
369 | | -
|
370 | | -def stream_raw_xml(input_queue, result_queue):
|
371 | | - buffer = cStringIO.StringIO()
|
372 | | - parsing = False
|
373 | | -
|
374 | | - while True:
|
375 | | - filename = input_queue.get()
|
376 | | - input_queue.task_done()
|
377 | | - if filename == None:
|
378 | | - break
|
379 | | -
|
380 | | - #filesize = file_utils.determine_filesize('', filename)
|
381 | | - #pbar = progressbar.ProgressBar().start()
|
382 | | -
|
383 | | - for data in unzip(filename):
|
384 | | - if data.startswith('<page>'):
|
385 | | - parsing = True
|
386 | | - if parsing:
|
387 | | - buffer.write(data)
|
388 | | - buffer.write('\n')
|
389 | | - if data == '</page>':
|
390 | | - buffer.seek(0)
|
391 | | - parse_xml(buffer, result_queue)
|
392 | | - buffer = cStringIO.StringIO()
|
393 | | - #pbar.update(pbar.currval + len(data)) #is inaccurate!!!
|
394 | | -
|
395 | | -
|
396 | | - for x in xrange(cpu_count()):
|
397 | | - result_queue.put(None)
|
398 | | - print 'Finished parsing bz2 archives'
|
399 | | -
|
400 | | -
|
401 | | -def debug():
|
402 | | - input_queue = JoinableQueue()
|
403 | | - result_queue = JoinableQueue()
|
404 | | - files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
|
405 | | -
|
406 | | - for file in files:
|
407 | | - input_queue.put(file)
|
408 | | -
|
409 | | - stream_raw_xml(input_queue, result_queue)
|
410 | | -
|
411 | | -
|
412 | | -def unzip(filename):
|
413 | | - '''
|
414 | | - Filename should be a fully qualified path to the bz2 file that will be
|
415 | | - decompressed. It will iterate line by line and yield this back to
|
416 | | - create_article
|
417 | | - '''
|
418 | | - fh = bz2.BZ2File(filename, 'r')
|
419 | | - for line in fh:
|
420 | | - line = line.strip()
|
421 | | - yield line
|
422 | | - fh.close()
|
423 | | -
|
424 | | -
|
425 | | -def setup(storage):
|
426 | | - keyspace_name = 'enwiki'
|
427 | | - if storage == 'cassandra':
|
428 | | - cassandra.install_schema(keyspace_name, drop_first=True)
|
429 | | -
|
430 | | -
|
431 | | -def launcher():
|
432 | | -
|
433 | | - storage = 'csv'
|
434 | | - setup(storage)
|
435 | | - input_queue = JoinableQueue()
|
436 | | - result_queue = JoinableQueue()
|
437 | | - #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
|
438 | | - files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']
|
439 | | -
|
440 | | - for file in files:
|
441 | | - input_queue.put(file)
|
442 | | -
|
443 | | - for x in xrange(cpu_count()):
|
444 | | - input_queue.put(None)
|
445 | | -
|
446 | | - extracters = [Process(target=stream_raw_xml, args=[input_queue, result_queue])
|
447 | | - for x in xrange(cpu_count())]
|
448 | | - for extracter in extracters:
|
449 | | - extracter.start()
|
450 | | -
|
451 | | - creators = [Process(target=create_variables, args=[result_queue, storage, x])
|
452 | | - for x in xrange(cpu_count())]
|
453 | | - for creator in creators:
|
454 | | - creator.start()
|
455 | | -
|
456 | | -
|
457 | | - input_queue.join()
|
458 | | - result_queue.join()
|
459 | | -
|
460 | | -
|
461 | | -if __name__ == '__main__':
|
462 | | - #debug()
|
463 | | - launcher()
|
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-02-06' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import bz2 |
| 23 | +import cStringIO |
| 24 | +import hashlib |
| 25 | +import codecs |
| 26 | +import re |
| 27 | +import sys |
| 28 | +import progressbar |
| 29 | +from multiprocessing import JoinableQueue, Process, cpu_count, current_process |
| 30 | +from xml.etree.cElementTree import fromstring, iterparse |
| 31 | +from collections import deque |
| 32 | + |
| 33 | +if '..' not in sys.path: |
| 34 | + sys.path.append('..') |
| 35 | + |
| 36 | +try: |
| 37 | + from database import cassandra |
| 38 | + import pycassa |
| 39 | + |
| 40 | +except ImportError: |
| 41 | + print 'I am not going to use Cassandra today, it\'s my off day.' |
| 42 | + |
| 43 | + |
| 44 | + |
| 45 | +from database import db |
| 46 | +from bots import detector |
| 47 | +from utils import file_utils |
| 48 | +import extracter |
| 49 | + |
| 50 | +RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)') |
| 51 | + |
| 52 | +NAMESPACE = { |
| 53 | + #0:'Main', |
| 54 | + #1:'Talk', |
| 55 | + #2:'User', |
| 56 | + #3:'User talk', |
| 57 | + 4:'Wikipedia', |
| 58 | + #5:'Wikipedia talk', |
| 59 | + 6:'File', |
| 60 | + #7:'File talk', |
| 61 | + 8:'MediaWiki', |
| 62 | + #9:'MediaWiki talk', |
| 63 | + 10:'Template', |
| 64 | + #11:'Template talk', |
| 65 | + 12:'Help', |
| 66 | + #13:'Help talk', |
| 67 | + 14:'Category', |
| 68 | + #15:'Category talk', |
| 69 | + 90:'Thread', |
| 70 | + #91:'Thread talk', |
| 71 | + 92:'Summary', |
| 72 | + #93:'Summary talk', |
| 73 | + 100:'Portal', |
| 74 | + #101:'Portal talk', |
| 75 | + 108:'Book', |
| 76 | + #109:'Book talk' |
| 77 | +} |
| 78 | + |
| 79 | + |
| 80 | +class Buffer: |
| 81 | + def __init__(self, storage, id): |
| 82 | + assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \ |
| 83 | + 'Valid storage options are cassandra and mongo.' |
| 84 | + self.storage = storage |
| 85 | + self.revisions = {} |
| 86 | + self.comments = {} |
| 87 | + self.id = id |
| 88 | + self.keyspace_name = 'enwiki' |
| 89 | + self.keys = ['revision_id', 'article_id', 'id', 'namespace', |
| 90 | + 'title', 'timestamp', 'hash', 'revert', 'bot', 'prev_size', |
| 91 | + 'cur_size', 'delta'] |
| 92 | + self.setup_storage() |
| 93 | + |
| 94 | + def setup_storage(self): |
| 95 | + if self.storage == 'cassandra': |
| 96 | + self.db = pycassa.connect(self.keyspace_name) |
| 97 | + self.collection = pycassa.ColumnFamily(self.db, 'revisions') |
| 98 | + |
| 99 | + elif self.storage == 'mongo': |
| 100 | + self.db = db.init_mongo_db(self.keyspace_name) |
| 101 | + self.collection = self.db['kaggle'] |
| 102 | + |
| 103 | + else: |
| 104 | + kaggle_file = 'kaggle_%s.csv' % self.id |
| 105 | + comment_file = 'kaggle_comments_%s.csv' % self.id |
| 106 | + file_utils.delete_file('', kaggle_file, directory=False) |
| 107 | + file_utils.delete_file('', comment_file, directory=False) |
| 108 | + self.fh_main = codecs.open(kaggle_file, 'a', 'utf-8') |
| 109 | + self.fh_extra = codecs.open(comment_file, 'a', 'utf-8') |
| 110 | + |
| 111 | + def add(self, revision): |
| 112 | + self.stringify(revision) |
| 113 | + id = revision['revision_id'] |
| 114 | + self.revisions[id] = revision |
| 115 | + if len(self.revisions) == 1000: |
| 116 | + self.store() |
| 117 | + self.clear() |
| 118 | + |
| 119 | + def stringify(self, revision): |
| 120 | + for key, value in revision.iteritems(): |
| 121 | + try: |
| 122 | + value = str(value) |
| 123 | + except UnicodeEncodeError: |
| 124 | + value = value.encode('utf-8') |
| 125 | + revision[key] = value |
| 126 | + |
| 127 | + def empty(self): |
| 128 | + self.store() |
| 129 | + self.clear() |
| 130 | + if self.storage == 'csv': |
| 131 | + self.fh_main.close() |
| 132 | + self.fh_extra.close() |
| 133 | + |
| 134 | + def clear(self): |
| 135 | + self.revisions = {} |
| 136 | + self.comments = {} |
| 137 | + |
| 138 | + def store(self): |
| 139 | + if self.storage == 'cassandra': |
| 140 | + self.collection.batch_insert(self.revisions) |
| 141 | + elif self.storage == 'mongo': |
| 142 | + print 'insert into mongo' |
| 143 | + else: |
| 144 | + for revision in self.revisions.itervalues(): |
| 145 | + values = [] |
| 146 | + for key in self.keys: |
| 147 | + values.append(revision[key].decode('utf-8')) |
| 148 | + |
| 149 | + value = '\t'.join(values) + '\n' |
| 150 | + row = '\t'.join([key, value]) |
| 151 | + self.fh_main.write(row) |
| 152 | + |
| 153 | + for revision_id, comment in self.comments.iteritems(): |
| 154 | + comment = comment.decode('utf-8') |
| 155 | + row = '\t'.join([revision_id, comment]) + '\n' |
| 156 | + self.fh_extra.write(row) |
| 157 | + |
| 158 | + |
| 159 | +def extract_categories(): |
| 160 | + ''' |
| 161 | + Field 1: page id |
| 162 | + Field 2: name category |
| 163 | + Field 3: sort key |
| 164 | + Field 4: timestamp last change |
| 165 | + ''' |
| 166 | + filename = 'C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-categorylinks.sql' |
| 167 | + output = codecs.open('categories.csv', 'w', encoding='utf-8') |
| 168 | + fh = codecs.open(filename, 'r', encoding='utf-8') |
| 169 | + |
| 170 | + try: |
| 171 | + for line in fh: |
| 172 | + if line.startswith('INSERT INTO `categorylinks` VALUES ('): |
| 173 | + line = line.replace('INSERT INTO `categorylinks` VALUES (', '') |
| 174 | + line = line.replace("'", '') |
| 175 | + categories = line.split('),(') |
| 176 | + for category in categories: |
| 177 | + category = category.split(',') |
| 178 | + if len(category) == 4: |
| 179 | + output.write('%s\t%s\n' % (category[0], category[1])) |
| 180 | + except UnicodeDecodeError, e: |
| 181 | + print e |
| 182 | + |
| 183 | + output.close() |
| 184 | + fh.close() |
| 185 | + |
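For clarity, the same splitting that extract_categories applies, run on one invented dump line; note that sort keys containing commas fail the len == 4 check and are skipped.

line = "INSERT INTO `categorylinks` VALUES (12,'Anarchism','ANARCHISM','2010-01-01 00:00:00'),(25,'Political_theory','POLITICAL THEORY','2010-01-02 12:00:00');"
line = line.replace('INSERT INTO `categorylinks` VALUES (', '').replace("'", '')
for category in line.split('),('):
    category = category.split(',')
    if len(category) == 4:
        print '%s\t%s' % (category[0], category[1])
#prints:
#12    Anarchism
#25    Political_theory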
| 186 | + |
| 187 | +def extract_revision_text(revision): |
| 188 | + rev = revision.find('text') |
| 189 | + if rev != None: |
| 190 | + if rev.text == None: |
| 191 | +            rev = fix_revision_text(rev) |
| 192 | + return rev.text.encode('utf-8') |
| 193 | + else: |
| 194 | + return '' |
| 195 | + |
| 196 | + |
| 197 | +def fix_revision_text(revision): |
| 198 | + if revision.text == None: |
| 199 | + revision.text = '' |
| 200 | + return revision |
| 201 | + |
| 202 | + |
| 203 | +def create_md5hash(text): |
| 204 | + hash = {} |
| 205 | + if text != None: |
| 206 | + m = hashlib.md5() |
| 207 | + m.update(text) |
| 209 | + hash['hash'] = m.hexdigest() |
| 210 | + else: |
| 211 | + hash['hash'] = -1 |
| 212 | + return hash |
| 213 | + |
| 214 | + |
| 215 | +def calculate_delta_article_size(size, text): |
| 216 | + if 'prev_size' not in size: |
| 217 | + size['prev_size'] = 0 |
| 218 | + size['cur_size'] = len(text) |
| 219 | + size['delta'] = len(text) |
| 220 | + else: |
| 221 | + size['prev_size'] = size['cur_size'] |
| 222 | + delta = len(text) - size['prev_size'] |
| 223 | + size['cur_size'] = len(text) |
| 224 | + size['delta'] = delta |
| 225 | + return size |
| 226 | + |
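A short worked example of the size bookkeeping above, with invented revision texts (assuming the function is importable as etl.enricher.calculate_delta_article_size): the first call initialises the counters, later calls compute the delta against the previous revision.

from etl.enricher import calculate_delta_article_size

size = {}
size = calculate_delta_article_size(size, 'Hello')         #prev_size 0, cur_size 5, delta 5
size = calculate_delta_article_size(size, 'Hello, world')  #prev_size 5, cur_size 12, delta 7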
| 227 | + |
| 228 | +def parse_contributor(contributor, bots): |
| 229 | + username = extracter.extract_username(contributor) |
| 230 | + user_id = extracter.extract_contributor_id(contributor) |
| 231 | + bot = extracter.determine_username_is_bot(contributor, bots=bots) |
| 232 | + contributor = {} |
| 233 | + contributor['username'] = username |
| 234 | + contributor['bot'] = bot |
| 235 | + if user_id != None: |
| 236 | + contributor.update(user_id) |
| 237 | + else: |
| 238 | + contributor = False |
| 239 | + return contributor |
| 240 | + |
| 241 | + |
| 242 | +def determine_namespace(title): |
| 243 | +    namespaces = {'Talk:': 1, 'User:': 2, 'User talk:': 3} |
| 244 | +    ns = {} |
| 245 | +    if title.text != None: |
| 246 | +        title = title.text |
| 247 | +        for namespace in namespaces: |
| 248 | +            if title.startswith(namespace): |
| 249 | +                ns['namespace'] = namespaces[namespace] |
| 250 | +        if ns == {}: |
| 251 | +            for namespace in NAMESPACE: |
| 252 | +                if title.startswith(namespace): |
| 253 | +                    ns = False #article belongs to neither the main, user, talk nor user talk namespace. |
| 254 | +                    break |
| 255 | +            else: |
| 256 | +                ns['namespace'] = 0 |
| 257 | +    else: |
| 258 | +        ns = False |
| 259 | +    return ns |
| 262 | + |
| 263 | + |
| 264 | +def prefill_row(title, article_id, namespace): |
| 265 | + row = {} |
| 266 | + row['title'] = title.text |
| 267 | + row['article_id'] = article_id |
| 268 | + row.update(namespace) |
| 269 | + return row |
| 270 | + |
| 271 | + |
| 272 | +def is_revision_reverted(hash_cur, hashes): |
| 273 | + revert = {} |
| 274 | + if hash_cur in hashes: |
| 275 | + revert['revert'] = 1 |
| 276 | + else: |
| 277 | + revert['revert'] = 0 |
| 278 | + return revert |
| 279 | + |
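The revert heuristic above marks a revision as a revert when its text hash already occurs among the hashes of the preceding revisions (a rolling window of at most 1000, matching the deque used in create_variables below). A minimal sketch, assuming the two helpers are importable as shown:

from collections import deque
from etl.enricher import create_md5hash, is_revision_reverted

hashes = deque(maxlen=1000)
for text in ['first version', 'vandalised version', 'first version']:
    hash = create_md5hash(text)
    revert = is_revision_reverted(hash['hash'], hashes)
    hashes.append(hash['hash'])
    print revert['revert']   #prints 0, 0, 1: the third text restores the first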
| 280 | + |
| 281 | +def create_variables(result_queue, storage, id): |
| 282 | + bots = detector.retrieve_bots('en') |
| 283 | + buffer = Buffer(storage, id) |
| 284 | + i = 0 |
| 285 | + while True: |
| 286 | + article = result_queue.get(block=True) |
| 287 | + result_queue.task_done() |
| 288 | + if article == None: |
| 289 | + break |
| 290 | + i += 1 |
| 291 | + #article = fromstring(article) |
| 292 | +        title = article['title'] #keep the element: determine_namespace and prefill_row read title.text |
| 293 | + namespace = determine_namespace(title) |
| 294 | + if namespace != False: |
| 295 | + #revisions = article.findall('revision') |
| 296 | + article_id = article['id'].text |
| 297 | + hashes = deque(maxlen=1000) |
| 298 | + size = {} |
| 299 | +            for revision in article['revisions']: |
| 300 | + if revision == None: |
| 301 | + #the entire revision is empty, weird. |
| 302 | + continue |
| 303 | + |
| 304 | + contributor = revision.find('contributor') |
| 305 | + contributor = parse_contributor(contributor, bots) |
| 306 | + if not contributor: |
| 307 | + #editor is anonymous, ignore |
| 308 | + continue |
| 309 | + |
| 310 | + revision_id = revision.find('id') |
| 311 | + revision_id = extracter.extract_revision_id(revision_id) |
| 312 | + if revision_id == None: |
| 313 | + #revision_id is missing, which is weird |
| 314 | + continue |
| 315 | + |
| 316 | + row = prefill_row(title, article_id, namespace) |
| 317 | + row['revision_id'] = revision_id |
| 318 | + text = extract_revision_text(revision) |
| 319 | + row.update(contributor) |
| 320 | + |
| 321 | + |
| 322 | + timestamp = revision.find('timestamp').text |
| 323 | + row['timestamp'] = timestamp |
| 324 | + |
| 325 | + hash = create_md5hash(text) |
| 326 | + revert = is_revision_reverted(hash['hash'], hashes) |
| 327 | + hashes.append(hash['hash']) |
| 328 | + size = calculate_delta_article_size(size, text) |
| 329 | + |
| 330 | + row.update(hash) |
| 331 | + row.update(size) |
| 332 | + row.update(revert) |
| 340 | + |
| 341 | + buffer.add(row) |
| 342 | + if i % 10000 == 0: |
| 343 | + print 'Parsed %s articles' % i |
| 344 | +# except ValueError, e: |
| 345 | +# print e |
| 346 | +# except UnicodeDecodeError, e: |
| 347 | +# print e |
| 348 | + buffer.empty() |
| 349 | + print 'Buffer is empty' |
| 350 | + |
| 351 | + |
| 352 | +def parse_xml(source, result_queue): |
| 353 | +    context = iterparse(source, events=('start', 'end')) |
| 354 | +    context = iter(context) |
| 355 | +    event, root = context.next() |
| 356 | + |
| 357 | +    article = {'revisions': []} |
| 358 | +    id = False |
| 359 | +    for event, elem in context: |
| 360 | +        if event == 'end' and elem.tag == 'revision': |
| 361 | +            article['revisions'].append(elem) |
| 362 | +        elif event == 'end' and elem.tag == 'title': |
| 363 | +            article['title'] = elem |
| 364 | +        elif event == 'end' and elem.tag == 'id' and id == False: |
| 365 | +            article['id'] = elem #the first <id> that closes is the page id, later ones belong to revisions |
| 366 | +            id = True |
| 367 | +    result_queue.put(article) |
| 368 | +    root.clear() |
| 368 | + |
| 369 | + |
| 370 | +def stream_raw_xml(input_queue, result_queue): |
| 371 | + buffer = cStringIO.StringIO() |
| 372 | + parsing = False |
| 373 | + |
| 374 | + while True: |
| 375 | + filename = input_queue.get() |
| 376 | + input_queue.task_done() |
| 377 | + if filename == None: |
| 378 | + break |
| 379 | + |
| 380 | + #filesize = file_utils.determine_filesize('', filename) |
| 381 | + #pbar = progressbar.ProgressBar().start() |
| 382 | + |
| 383 | + for data in unzip(filename): |
| 384 | + if data.startswith('<page>'): |
| 385 | + parsing = True |
| 386 | + if parsing: |
| 387 | + buffer.write(data) |
| 388 | + buffer.write('\n') |
| 389 | + if data == '</page>': |
| 390 | + buffer.seek(0) |
| 391 | + parse_xml(buffer, result_queue) |
| 392 | + buffer = cStringIO.StringIO() |
| 393 | + #pbar.update(pbar.currval + len(data)) #is inaccurate!!! |
| 394 | + |
| 395 | + |
| 396 | +    #one sentinel per consumer: launcher starts as many creator processes as extracters |
| 397 | +    result_queue.put(None) |
| 398 | +    print 'Finished parsing bz2 archives' |
| 399 | + |
| 400 | + |
| 401 | +def debug(): |
| 402 | + input_queue = JoinableQueue() |
| 403 | + result_queue = JoinableQueue() |
| 404 | + files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2'] |
| 405 | + |
| 406 | + for file in files: |
| 407 | + input_queue.put(file) |
| 408 | + |
| 409 | + stream_raw_xml(input_queue, result_queue) |
| 410 | + |
| 411 | + |
| 412 | +def unzip(filename): |
| 413 | +    ''' |
| 414 | +    Filename should be a fully qualified path to the bz2 file that will be |
| 415 | +    decompressed. The file is read line by line and each stripped line is |
| 416 | +    yielded back to stream_raw_xml. |
| 417 | +    ''' |
| 418 | + fh = bz2.BZ2File(filename, 'r') |
| 419 | + for line in fh: |
| 420 | + line = line.strip() |
| 421 | + yield line |
| 422 | + fh.close() |
| 423 | + |
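A small usage sketch of the generator above (the dump path is a placeholder): it shows that the caller receives stripped lines, which is what stream_raw_xml relies on when it compares a line against '</page>'.

from etl.enricher import unzip

pages = 0
for line in unzip('/tmp/enwiki-latest-pages-articles1.xml.bz2'):
    if line == '</page>':
        pages += 1
print '%s pages in the archive' % pages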
| 424 | + |
| 425 | +def setup(storage): |
| 426 | + keyspace_name = 'enwiki' |
| 427 | + if storage == 'cassandra': |
| 428 | + cassandra.install_schema(keyspace_name, drop_first=True) |
| 429 | + |
| 430 | + |
| 431 | +def launcher(): |
| 432 | + |
| 433 | + storage = 'csv' |
| 434 | + setup(storage) |
| 435 | + input_queue = JoinableQueue() |
| 436 | + result_queue = JoinableQueue() |
| 437 | + #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2'] |
| 438 | + files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2'] |
| 439 | + |
| 440 | + for file in files: |
| 441 | + input_queue.put(file) |
| 442 | + |
| 443 | + for x in xrange(cpu_count()): |
| 444 | + input_queue.put(None) |
| 445 | + |
| 446 | + extracters = [Process(target=stream_raw_xml, args=[input_queue, result_queue]) |
| 447 | + for x in xrange(cpu_count())] |
| 448 | + for extracter in extracters: |
| 449 | + extracter.start() |
| 450 | + |
| 451 | + creators = [Process(target=create_variables, args=[result_queue, storage, x]) |
| 452 | + for x in xrange(cpu_count())] |
| 453 | + for creator in creators: |
| 454 | + creator.start() |
| 455 | + |
| 456 | + |
| 457 | + input_queue.join() |
| 458 | + result_queue.join() |
| 459 | + |
| 460 | + |
| 461 | +if __name__ == '__main__': |
| 462 | + #debug() |
| 463 | + launcher() |
Property changes on: trunk/tools/editor_trends/etl/enricher.py |
___________________________________________________________________ |
Added: svn:eol-style |
464 | 464 | + native |
Property changes on: trunk/tools/editor_trends/__init__.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
465 | 465 | - text/plain |
Property changes on: trunk/tools/editor_trends/classes/consumers.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
466 | 466 | - text/plain |
Property changes on: trunk/tools/editor_trends/classes/runtime_settings.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
467 | 467 | - text/plain |
Property changes on: trunk/tools/editor_trends/utils/__init__.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
468 | 468 | - text/plain |
Property changes on: trunk/tools/editor_trends/utils/file_utils.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
469 | 469 | - text/plain |
Property changes on: trunk/tools/editor_trends/utils/http_utils.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
470 | 470 | - text/plain |
Property changes on: trunk/tools/editor_trends/utils/ordered_dict.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
471 | 471 | - text/plain |
Property changes on: trunk/tools/editor_trends/database/__init__.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
472 | 472 | - text/plain |
Property changes on: trunk/tools/editor_trends/database/cache.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
473 | 473 | - text/plain |
Property changes on: trunk/tools/editor_trends/database/db.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
474 | 474 | - text/plain |
Index: trunk/tools/editor_trends/mapreduce/__init__.py |
— | — | @@ -1 +1 @@ |
2 | | -
|
| 2 | + |
Property changes on: trunk/tools/editor_trends/mapreduce/__init__.py |
___________________________________________________________________ |
Added: svn:eol-style |
3 | 3 | + native |
Property changes on: trunk/tools/editor_trends/bots/detector.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
4 | 4 | - text/plain |
Property changes on: trunk/tools/editor_trends/bots/__init__.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
5 | 5 | - text/plain |
Property changes on: trunk/tools/editor_trends/code-snippets/chunker.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
6 | 6 | - text/plain |
Property changes on: trunk/tools/editor_trends/code-snippets/exporter.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
7 | 7 | - text/plain |
Property changes on: trunk/tools/editor_trends/code-snippets/process_constructor.py |
___________________________________________________________________ |
Deleted: svn:mime-type |
8 | 8 | - text/plain |