r84505 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84504 | r84505 | r84506 >
Date:00:15, 22 March 2011
Author:reedy
Status:deferred
Tags:
Comment:
Fix svn:eol-style native, remove svn:mime-type text/plain
Modified paths:
  • /trunk/tools/editor_trends/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/inventory.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_edits.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/new_editor_count.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py (modified) (history)
  • /trunk/tools/editor_trends/bots/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/bots/detector.py (modified) (history)
  • /trunk/tools/editor_trends/classes/consumers.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/code-snippets/chunker.py (modified) (history)
  • /trunk/tools/editor_trends/code-snippets/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/code-snippets/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/database/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/mapreduce/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/utils/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/http_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/ordered_dict.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/parser.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
@@ -1,42 +1,42 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -
22 -def cohort_dataset_forward_bar(var, editor, **kwargs):
23 - '''
24 - The forward looking bar charts looks for every month that an editor
25 - was part of the Wikimedia community whether this person made at least cutoff
26 - value edits. If yes, then include this person in the analysis, else skip the
27 - person.
28 - '''
29 - new_wikipedian = editor['new_wikipedian']
30 - last_edit = editor['final_edit']
31 - monthly_edits = editor['monthly_edits']
32 - yearly_edits = editor['edits_by_year']
33 - n = editor['edit_count']
34 -
35 - if n >= var.cum_cutoff:
36 - for year in xrange(new_wikipedian.year, var.max_year):
37 - max_edits = max(monthly_edits.get(str(year), {0:0}).values())
38 - if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:
39 - continue
40 - else:
41 - experience = (year - new_wikipedian.year) + 1
42 - var.add(new_wikipedian, 1, {'experience':experience})
43 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
 22+def cohort_dataset_forward_bar(var, editor, **kwargs):
 23+ '''
 24+ The forward looking bar charts looks for every month that an editor
 25+ was part of the Wikimedia community whether this person made at least cutoff
 26+ value edits. If yes, then include this person in the analysis, else skip the
 27+ person.
 28+ '''
 29+ new_wikipedian = editor['new_wikipedian']
 30+ last_edit = editor['final_edit']
 31+ monthly_edits = editor['monthly_edits']
 32+ yearly_edits = editor['edits_by_year']
 33+ n = editor['edit_count']
 34+
 35+ if n >= var.cum_cutoff:
 36+ for year in xrange(new_wikipedian.year, var.max_year):
 37+ max_edits = max(monthly_edits.get(str(year), {0:0}).values())
 38+ if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:
 39+ continue
 40+ else:
 41+ experience = (year - new_wikipedian.year) + 1
 42+ var.add(new_wikipedian, 1, {'experience':experience})
 43+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
___________________________________________________________________
Added: svn:eol-style
4444 + native
Index: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
@@ -1,26 +1,26 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -def time_to_new_wikipedian(var, editor, **kwargs):
22 -# headers = ['year', 'time_to_new_wikipedian']
23 - new_wikipedian = editor['new_wikipedian']
24 - first_edit = editor['first_edit']
25 - dt = new_wikipedian - first_edit
26 - var.add(new_wikipedian, dt.days)
27 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+def time_to_new_wikipedian(var, editor, **kwargs):
 22+# headers = ['year', 'time_to_new_wikipedian']
 23+ new_wikipedian = editor['new_wikipedian']
 24+ first_edit = editor['first_edit']
 25+ dt = new_wikipedian - first_edit
 26+ var.add(new_wikipedian, dt.days)
 27+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
___________________________________________________________________
Added: svn:eol-style
2828 + native
Property changes on: trunk/tools/editor_trends/analyses/plugins/__init__.py
___________________________________________________________________
Added: svn:eol-style
2929 + native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
@@ -1,25 +1,25 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -
22 -
23 -def total_number_of_new_wikipedians(var, editor, **kwargs):
24 - new_wikipedian = editor['new_wikipedian']
25 - var.add(new_wikipedian, 1, {'year':new_wikipedian.year})
26 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
 22+
 23+def total_number_of_new_wikipedians(var, editor, **kwargs):
 24+ new_wikipedian = editor['new_wikipedian']
 25+ var.add(new_wikipedian, 1, {'year':new_wikipedian.year})
 26+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
___________________________________________________________________
Added: svn:eol-style
2727 + native
Property changes on: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
___________________________________________________________________
Added: svn:eol-style
2828 + native
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
@@ -1,51 +1,51 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -import datetime
22 -from dateutil.relativedelta import *
23 -
24 -def cohort_dataset_forward_histogram(var, editor, **kwargs):
25 -# headers = ['year', 'month', 'edits']
26 - '''
27 - The forward looking histogram looks for every month that an editor
28 - was part of the Wikimedia community whether this person made at least cutoff
29 - value edits. If yes, then include this person in the analysis, else skip the
30 - person.
31 - '''
32 -
33 - new_wikipedian = editor['new_wikipedian']
34 - final_edit = editor['final_edit'].year + 1
35 - yearly_edits = editor['edits_by_year']
36 - n = editor['edit_count']
37 -
38 - if n >= var.cum_cutoff:
39 - for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):
40 - edits = editor['monthly_edits'].get(str(year), {0:0})
41 - if year == new_wikipedian.year:
42 - start = new_wikipedian.month
43 - else:
44 - start = 1
45 -
46 - for month in xrange(start, 13):
47 - if edits.get(str(month), 0) >= var.cutoff:
48 - dt = datetime.datetime(year, month, 1)
49 - experience = relativedelta(dt - new_wikipedian)
50 - experience = experience.years * 12 + experience.months
51 - var.add(new_wikipedian, 1, {'experience': experience})
52 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+from dateutil.relativedelta import *
 23+
 24+def cohort_dataset_forward_histogram(var, editor, **kwargs):
 25+# headers = ['year', 'month', 'edits']
 26+ '''
 27+ The forward looking histogram looks for every month that an editor
 28+ was part of the Wikimedia community whether this person made at least cutoff
 29+ value edits. If yes, then include this person in the analysis, else skip the
 30+ person.
 31+ '''
 32+
 33+ new_wikipedian = editor['new_wikipedian']
 34+ final_edit = editor['final_edit'].year + 1
 35+ yearly_edits = editor['edits_by_year']
 36+ n = editor['edit_count']
 37+
 38+ if n >= var.cum_cutoff:
 39+ for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):
 40+ edits = editor['monthly_edits'].get(str(year), {0:0})
 41+ if year == new_wikipedian.year:
 42+ start = new_wikipedian.month
 43+ else:
 44+ start = 1
 45+
 46+ for month in xrange(start, 13):
 47+ if edits.get(str(month), 0) >= var.cutoff:
 48+ dt = datetime.datetime(year, month, 1)
 49+ experience = relativedelta(dt - new_wikipedian)
 50+ experience = experience.years * 12 + experience.months
 51+ var.add(new_wikipedian, 1, {'experience': experience})
 52+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
___________________________________________________________________
Added: svn:eol-style
5353 + native
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -1,50 +1,50 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -import datetime
22 -from dateutil.relativedelta import relativedelta
23 -from utils import data_converter
24 -
25 -
26 -def cohort_dataset_backward_bar(var, editor, **kwargs):
27 - '''
28 - The backward looking bar chart looks for every year that an editor
29 - was part of the Wikimedia community whether this person made at least cutoff
30 - value edits. If yes, then include this person in the analysis, else skip the
31 - person.
32 - '''
33 - break_down = kwargs.pop('break_down', False)
34 - new_wikipedian = editor['new_wikipedian']
35 - n = editor['edit_count']
36 -
37 - if n >= var.cum_cutoff:
38 - windows = data_converter.create_windows(var, break_down_first_year=break_down)
39 - for year in xrange(new_wikipedian.year, var.max_year):
40 - year = str(year)
41 - if editor['edits_by_year'][year] >= var.cutoff:
42 - last_edit = editor['last_edit_by_year'][year]
43 - if last_edit != 0.0:
44 - editor_dt = relativedelta(last_edit, new_wikipedian)
45 - editor_dt = (editor_dt.years * 12) + editor_dt.months
46 - for w in windows:
47 - if w >= editor_dt:
48 - datum = datetime.datetime(int(year), 12, 31)
49 - var.add(datum, 1, {'window':w})
50 - break
51 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+import datetime
 22+from dateutil.relativedelta import relativedelta
 23+from utils import data_converter
 24+
 25+
 26+def cohort_dataset_backward_bar(var, editor, **kwargs):
 27+ '''
 28+ The backward looking bar chart looks for every year that an editor
 29+ was part of the Wikimedia community whether this person made at least cutoff
 30+ value edits. If yes, then include this person in the analysis, else skip the
 31+ person.
 32+ '''
 33+ break_down = kwargs.pop('break_down', False)
 34+ new_wikipedian = editor['new_wikipedian']
 35+ n = editor['edit_count']
 36+
 37+ if n >= var.cum_cutoff:
 38+ windows = data_converter.create_windows(var, break_down_first_year=break_down)
 39+ for year in xrange(new_wikipedian.year, var.max_year):
 40+ year = str(year)
 41+ if editor['edits_by_year'][year] >= var.cutoff:
 42+ last_edit = editor['last_edit_by_year'][year]
 43+ if last_edit != 0.0:
 44+ editor_dt = relativedelta(last_edit, new_wikipedian)
 45+ editor_dt = (editor_dt.years * 12) + editor_dt.months
 46+ for w in windows:
 47+ if w >= editor_dt:
 48+ datum = datetime.datetime(int(year), 12, 31)
 49+ var.add(datum, 1, {'window':w})
 50+ break
 51+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
___________________________________________________________________
Added: svn:eol-style
5252 + native
Index: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
@@ -1,26 +1,26 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -
22 -def histogram_edits(var, editor, **kwargs):
23 -# headers = ['year', 'num_edits', 'frequency']
24 - cnt = editor['edit_count']
25 - new_wikipedian = editor['new_wikipedian']
26 - var.add(new_wikipedian, cnt)
27 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
 22+def histogram_edits(var, editor, **kwargs):
 23+# headers = ['year', 'num_edits', 'frequency']
 24+ cnt = editor['edit_count']
 25+ new_wikipedian = editor['new_wikipedian']
 26+ var.add(new_wikipedian, cnt)
 27+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
___________________________________________________________________
Added: svn:eol-style
2828 + native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
@@ -1,28 +1,28 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -
22 -def total_number_of_articles(var, editor, **kwargs):
23 - for year in editor['edits']:
24 - edits = editor['edits'][year]
25 - for edit in edits:
26 - article = edit['article']
27 - date = edit['date']
28 - var.add(date, 1, {'article':article})
29 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
 22+def total_number_of_articles(var, editor, **kwargs):
 23+ for year in editor['edits']:
 24+ edits = editor['edits'][year]
 25+ for edit in edits:
 26+ article = edit['article']
 27+ date = edit['date']
 28+ var.add(date, 1, {'article':article})
 29+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
___________________________________________________________________
Added: svn:eol-style
3030 + native
Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
@@ -1,31 +1,31 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-25'
19 -__version__ = '0.1'
20 -
21 -
22 -def new_editor_count(var, editor, **kwargs):
23 - '''
24 - Summary: This function generates an overview of the number of
25 - new_wikipedians for a given year / month combination.
26 - Purpose: This data can be used to compare with Erik Zachte's
27 - stats.download.org to make sure that we are using the same numbers.
28 - '''
29 -# headers = ['year', 'month', 'count']
30 - new_wikipedian = editor['new_wikipedian']
31 - var.add(new_wikipedian, 1)
32 - return var
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-25'
 19+__version__ = '0.1'
 20+
 21+
 22+def new_editor_count(var, editor, **kwargs):
 23+ '''
 24+ Summary: This function generates an overview of the number of
 25+ new_wikipedians for a given year / month combination.
 26+ Purpose: This data can be used to compare with Erik Zachte's
 27+ stats.download.org to make sure that we are using the same numbers.
 28+ '''
 29+# headers = ['year', 'month', 'count']
 30+ new_wikipedian = editor['new_wikipedian']
 31+ var.add(new_wikipedian, 1)
 32+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
___________________________________________________________________
Added: svn:eol-style
3333 + native
Index: trunk/tools/editor_trends/analyses/inventory.py
@@ -1,70 +1,70 @@
2 -#!/usr/bin/python
3 -# coding=utf-8
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http,//www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2011-02-11'
19 -__version__ = '0.1'
20 -
21 -
22 -import os
23 -import sys
24 -import types
25 -
26 -def available_analyses(caller='manage'):
27 - '''
28 - Generates a dictionary:
29 - key: name of analysis
30 - value: function that generates the dataset
31 - ignore: a list of functions that should never be called from manage.py,
32 - they are not valid entry points.
33 - '''
34 - assert caller == 'django' or caller == 'manage'
35 - ignore = ['__init__']
36 - functions = {}
37 -
38 - fn = os.path.realpath(__file__)
39 - pos = fn.rfind(os.sep)
40 - loc = fn[:pos]
41 - path = os.path.join(loc , 'plugins')
42 - plugins = import_libs(path)
43 -
44 - for plugin in plugins:
45 - if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
46 - functions[plugin.func_name] = plugin
47 - if caller == 'manage':
48 - return functions
49 - elif caller == 'django':
50 - django_functions = []
51 - for function in functions:
52 - fancy_name = function.replace('_', ' ').title()
53 - django_functions.append((function, fancy_name))
54 -
55 - return django_functions
56 -
57 -
58 -def import_libs(path):
59 - '''
60 - Dynamically importing functions from the plugins directory.
61 - '''
62 - library_list = []
63 - sys.path.append(path)
64 - for f in os.listdir(os.path.abspath(path)):
65 - module_name, ext = os.path.splitext(f)
66 - if ext == '.py':
67 - module = __import__(module_name)
68 - func = getattr(module, module_name)
69 - library_list.append(func)
70 -
71 - return library_list
 2+#!/usr/bin/python
 3+# coding=utf-8
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http,//www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2011-02-11'
 19+__version__ = '0.1'
 20+
 21+
 22+import os
 23+import sys
 24+import types
 25+
 26+def available_analyses(caller='manage'):
 27+ '''
 28+ Generates a dictionary:
 29+ key: name of analysis
 30+ value: function that generates the dataset
 31+ ignore: a list of functions that should never be called from manage.py,
 32+ they are not valid entry points.
 33+ '''
 34+ assert caller == 'django' or caller == 'manage'
 35+ ignore = ['__init__']
 36+ functions = {}
 37+
 38+ fn = os.path.realpath(__file__)
 39+ pos = fn.rfind(os.sep)
 40+ loc = fn[:pos]
 41+ path = os.path.join(loc , 'plugins')
 42+ plugins = import_libs(path)
 43+
 44+ for plugin in plugins:
 45+ if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
 46+ functions[plugin.func_name] = plugin
 47+ if caller == 'manage':
 48+ return functions
 49+ elif caller == 'django':
 50+ django_functions = []
 51+ for function in functions:
 52+ fancy_name = function.replace('_', ' ').title()
 53+ django_functions.append((function, fancy_name))
 54+
 55+ return django_functions
 56+
 57+
 58+def import_libs(path):
 59+ '''
 60+ Dynamically importing functions from the plugins directory.
 61+ '''
 62+ library_list = []
 63+ sys.path.append(path)
 64+ for f in os.listdir(os.path.abspath(path)):
 65+ module_name, ext = os.path.splitext(f)
 66+ if ext == '.py':
 67+ module = __import__(module_name)
 68+ func = getattr(module, module_name)
 69+ library_list.append(func)
 70+
 71+ return library_list
Property changes on: trunk/tools/editor_trends/analyses/inventory.py
___________________________________________________________________
Added: svn:eol-style
7272 + native
Property changes on: trunk/tools/editor_trends/manage.py
___________________________________________________________________
Deleted: svn:mime-type
7373 - text/plain
Property changes on: trunk/tools/editor_trends/wikitree/parser.py
___________________________________________________________________
Deleted: svn:mime-type
7474 - text/plain
Property changes on: trunk/tools/editor_trends/wikitree/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
7575 - text/plain
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -1,462 +1,462 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-02-06'
19 -__version__ = '0.1'
20 -
21 -
22 -import bz2
23 -import cStringIO
24 -import hashlib
25 -import codecs
26 -import re
27 -import sys
28 -import progressbar
29 -from multiprocessing import JoinableQueue, Process, cpu_count, current_process
30 -from xml.etree.cElementTree import fromstring, iterparse
31 -from collections import deque
32 -
33 -if '..' not in sys.path:
34 - sys.path.append('..')
35 -
36 -try:
37 - from database import cassandra
38 - import pycassa
39 -
40 -except ImportError:
41 - print 'I am not going to use Cassandra today, it\'s my off day.'
42 -
43 -
44 -
45 -from database import db
46 -from bots import detector
47 -from utils import file_utils
48 -import extracter
49 -
50 -RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')
51 -
52 -NAMESPACE = {
53 - #0:'Main',
54 - #1:'Talk',
55 - #2:'User',
56 - #3:'User talk',
57 - 4:'Wikipedia',
58 - #5:'Wikipedia talk',
59 - 6:'File',
60 - #7:'File talk',
61 - 8:'MediaWiki',
62 - #9:'MediaWiki talk',
63 - 10:'Template',
64 - #11:'Template talk',
65 - 12:'Help',
66 - #13:'Help talk',
67 - 14:'Category',
68 - #15:'Category talk',
69 - 90:'Thread',
70 - #91:'Thread talk',
71 - 92:'Summary',
72 - #93:'Summary talk',
73 - 100:'Portal',
74 - #101:'Portal talk',
75 - 108:'Book',
76 - #109:'Book talk'
77 -}
78 -
79 -
80 -class Buffer:
81 - def __init__(self, storage, id):
82 - assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \
83 - 'Valid storage options are cassandra and mongo.'
84 - self.storage = storage
85 - self.revisions = {}
86 - self.comments = {}
87 - self.id = id
88 - self.keyspace_name = 'enwiki'
89 - self.keys = ['revision_id', 'article_id', 'id', 'namespace',
90 - 'title', 'timestamp', 'hash', 'revert', 'bot', 'prev_size',
91 - 'cur_size', 'delta']
92 - self.setup_storage()
93 -
94 - def setup_storage(self):
95 - if self.storage == 'cassandra':
96 - self.db = pycassa.connect(self.keyspace_name)
97 - self.collection = pycassa.ColumnFamily(self.db, 'revisions')
98 -
99 - elif self.storage == 'mongo':
100 - self.db = db.init_mongo_db(self.keyspace_name)
101 - self.collection = self.db['kaggle']
102 -
103 - else:
104 - kaggle_file = 'kaggle_%s.csv' % self.id
105 - comment_file = 'kaggle_comments_%s.csv' % self.id
106 - file_utils.delete_file('', kaggle_file, directory=False)
107 - file_utils.delete_file('', comment_file, directory=False)
108 - self.fh_main = codecs.open(kaggle_file, 'a', 'utf-8')
109 - self.fh_extra = codecs.open(comment_file, 'a', 'utf-8')
110 -
111 - def add(self, revision):
112 - self.stringify(revision)
113 - id = revision['revision_id']
114 - self.revisions[id] = revision
115 - if len(self.revisions) == 1000:
116 - self.store()
117 - self.clear()
118 -
119 - def stringify(self, revision):
120 - for key, value in revision.iteritems():
121 - try:
122 - value = str(value)
123 - except UnicodeEncodeError:
124 - value = value.encode('utf-8')
125 - revision[key] = value
126 -
127 - def empty(self):
128 - self.store()
129 - self.clear()
130 - if self.storage == 'csv':
131 - self.fh_main.close()
132 - self.fh_extra.close()
133 -
134 - def clear(self):
135 - self.revisions = {}
136 - self.comments = {}
137 -
138 - def store(self):
139 - if self.storage == 'cassandra':
140 - self.collection.batch_insert(self.revisions)
141 - elif self.storage == 'mongo':
142 - print 'insert into mongo'
143 - else:
144 - for revision in self.revisions.itervalues():
145 - values = []
146 - for key in self.keys:
147 - values.append(revision[key].decode('utf-8'))
148 -
149 - value = '\t'.join(values) + '\n'
150 - row = '\t'.join([key, value])
151 - self.fh_main.write(row)
152 -
153 - for revision_id, comment in self.comments.iteritems():
154 - comment = comment.decode('utf-8')
155 - row = '\t'.join([revision_id, comment]) + '\n'
156 - self.fh_extra.write(row)
157 -
158 -
159 -def extract_categories():
160 - '''
161 - Field 1: page id
162 - Field 2: name category
163 - Field 3: sort key
164 - Field 4: timestamp last change
165 - '''
166 - filename = 'C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-categorylinks.sql'
167 - output = codecs.open('categories.csv', 'w', encoding='utf-8')
168 - fh = codecs.open(filename, 'r', encoding='utf-8')
169 -
170 - try:
171 - for line in fh:
172 - if line.startswith('INSERT INTO `categorylinks` VALUES ('):
173 - line = line.replace('INSERT INTO `categorylinks` VALUES (', '')
174 - line = line.replace("'", '')
175 - categories = line.split('),(')
176 - for category in categories:
177 - category = category.split(',')
178 - if len(category) == 4:
179 - output.write('%s\t%s\n' % (category[0], category[1]))
180 - except UnicodeDecodeError, e:
181 - print e
182 -
183 - output.close()
184 - fh.close()
185 -
186 -
187 -def extract_revision_text(revision):
188 - rev = revision.find('text')
189 - if rev != None:
190 - if rev.text == None:
191 - rev = fix_revision_text(revision)
192 - return rev.text.encode('utf-8')
193 - else:
194 - return ''
195 -
196 -
197 -def fix_revision_text(revision):
198 - if revision.text == None:
199 - revision.text = ''
200 - return revision
201 -
202 -
203 -def create_md5hash(text):
204 - hash = {}
205 - if text != None:
206 - m = hashlib.md5()
207 - m.update(text)
208 - #echo m.digest()
209 - hash['hash'] = m.hexdigest()
210 - else:
211 - hash['hash'] = -1
212 - return hash
213 -
214 -
215 -def calculate_delta_article_size(size, text):
216 - if 'prev_size' not in size:
217 - size['prev_size'] = 0
218 - size['cur_size'] = len(text)
219 - size['delta'] = len(text)
220 - else:
221 - size['prev_size'] = size['cur_size']
222 - delta = len(text) - size['prev_size']
223 - size['cur_size'] = len(text)
224 - size['delta'] = delta
225 - return size
226 -
227 -
228 -def parse_contributor(contributor, bots):
229 - username = extracter.extract_username(contributor)
230 - user_id = extracter.extract_contributor_id(contributor)
231 - bot = extracter.determine_username_is_bot(contributor, bots=bots)
232 - contributor = {}
233 - contributor['username'] = username
234 - contributor['bot'] = bot
235 - if user_id != None:
236 - contributor.update(user_id)
237 - else:
238 - contributor = False
239 - return contributor
240 -
241 -
242 -def determine_namespace(title):
243 - namespaces = {'User': 2,
244 - 'Talk': 1,
245 - 'User Talk': 3,
246 - }
247 - ns = {}
248 - if title.text != None:
249 - title = title.text
250 - for namespace in namespaces:
251 - if title.startswith(namespace):
252 - ns['namespace'] = namespaces[namespace]
253 - if ns == {}:
254 - for namespace in NAMESPACE:
255 - if title.startswith(namespace):
256 - ns = False #article does not belong to either the main namespace, user, talk or user talk namespace.
257 - break
258 - ns['namespace'] = 0
259 - else:
260 - ns = False
261 - return ns
262 -
263 -
264 -def prefill_row(title, article_id, namespace):
265 - row = {}
266 - row['title'] = title.text
267 - row['article_id'] = article_id
268 - row.update(namespace)
269 - return row
270 -
271 -
272 -def is_revision_reverted(hash_cur, hashes):
273 - revert = {}
274 - if hash_cur in hashes:
275 - revert['revert'] = 1
276 - else:
277 - revert['revert'] = 0
278 - return revert
279 -
280 -
281 -def create_variables(result_queue, storage, id):
282 - bots = detector.retrieve_bots('en')
283 - buffer = Buffer(storage, id)
284 - i = 0
285 - while True:
286 - article = result_queue.get(block=True)
287 - result_queue.task_done()
288 - if article == None:
289 - break
290 - i += 1
291 - #article = fromstring(article)
292 - title = article['title'].text
293 - namespace = determine_namespace(title)
294 - if namespace != False:
295 - #revisions = article.findall('revision')
296 - article_id = article['id'].text
297 - hashes = deque(maxlen=1000)
298 - size = {}
299 - for revision in article['revision']:
300 - if revision == None:
301 - #the entire revision is empty, weird.
302 - continue
303 -
304 - contributor = revision.find('contributor')
305 - contributor = parse_contributor(contributor, bots)
306 - if not contributor:
307 - #editor is anonymous, ignore
308 - continue
309 -
310 - revision_id = revision.find('id')
311 - revision_id = extracter.extract_revision_id(revision_id)
312 - if revision_id == None:
313 - #revision_id is missing, which is weird
314 - continue
315 -
316 - row = prefill_row(title, article_id, namespace)
317 - row['revision_id'] = revision_id
318 - text = extract_revision_text(revision)
319 - row.update(contributor)
320 -
321 -
322 - timestamp = revision.find('timestamp').text
323 - row['timestamp'] = timestamp
324 -
325 - hash = create_md5hash(text)
326 - revert = is_revision_reverted(hash['hash'], hashes)
327 - hashes.append(hash['hash'])
328 - size = calculate_delta_article_size(size, text)
329 -
330 - row.update(hash)
331 - row.update(size)
332 - row.update(revert)
333 - # print row
334 - # if row['username'] == None:
335 - # contributor = revision.find('contributor')
336 - # attrs = contributor.getchildren()
337 - # for attr in attrs:
338 - # print attr.text
339 - #print revision_id, hash, delta, prev_size\
340 -
341 - buffer.add(row)
342 - if i % 10000 == 0:
343 - print 'Parsed %s articles' % i
344 -# except ValueError, e:
345 -# print e
346 -# except UnicodeDecodeError, e:
347 -# print e
348 - buffer.empty()
349 - print 'Buffer is empty'
350 -
351 -
352 -def parse_xml(source, result_queue):
353 - context = iterparse(source, events=('end',))
354 - context = iter(context)
355 - event, root = context.next()
356 -
357 - article = {}
358 - id = False
359 - for event, elem in context:
360 - if event == 'end' and elem.tag == 'revision':
361 - article[elem.tag] = elem
362 - elif event == 'end' and elem.tag == 'id' and id == False:
363 - article[elem.tag] = elem
364 - id = True
365 - article[root.tag] = root
366 - result_queue.put(article)
367 - root.clear()
368 -
369 -
370 -def stream_raw_xml(input_queue, result_queue):
371 - buffer = cStringIO.StringIO()
372 - parsing = False
373 -
374 - while True:
375 - filename = input_queue.get()
376 - input_queue.task_done()
377 - if filename == None:
378 - break
379 -
380 - #filesize = file_utils.determine_filesize('', filename)
381 - #pbar = progressbar.ProgressBar().start()
382 -
383 - for data in unzip(filename):
384 - if data.startswith('<page>'):
385 - parsing = True
386 - if parsing:
387 - buffer.write(data)
388 - buffer.write('\n')
389 - if data == '</page>':
390 - buffer.seek(0)
391 - parse_xml(buffer, result_queue)
392 - buffer = cStringIO.StringIO()
393 - #pbar.update(pbar.currval + len(data)) #is inaccurate!!!
394 -
395 -
396 - for x in xrange(cpu_count()):
397 - result_queue.put(None)
398 - print 'Finished parsing bz2 archives'
399 -
400 -
401 -def debug():
402 - input_queue = JoinableQueue()
403 - result_queue = JoinableQueue()
404 - files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
405 -
406 - for file in files:
407 - input_queue.put(file)
408 -
409 - stream_raw_xml(input_queue, result_queue)
410 -
411 -
412 -def unzip(filename):
413 - '''
414 - Filename should be a fully qualified path to the bz2 file that will be
415 - decompressed. It will iterate line by line and yield this back to
416 - create_article
417 - '''
418 - fh = bz2.BZ2File(filename, 'r')
419 - for line in fh:
420 - line = line.strip()
421 - yield line
422 - fh.close()
423 -
424 -
425 -def setup(storage):
426 - keyspace_name = 'enwiki'
427 - if storage == 'cassandra':
428 - cassandra.install_schema(keyspace_name, drop_first=True)
429 -
430 -
431 -def launcher():
432 -
433 - storage = 'csv'
434 - setup(storage)
435 - input_queue = JoinableQueue()
436 - result_queue = JoinableQueue()
437 - #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
438 - files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']
439 -
440 - for file in files:
441 - input_queue.put(file)
442 -
443 - for x in xrange(cpu_count()):
444 - input_queue.put(None)
445 -
446 - extracters = [Process(target=stream_raw_xml, args=[input_queue, result_queue])
447 - for x in xrange(cpu_count())]
448 - for extracter in extracters:
449 - extracter.start()
450 -
451 - creators = [Process(target=create_variables, args=[result_queue, storage, x])
452 - for x in xrange(cpu_count())]
453 - for creator in creators:
454 - creator.start()
455 -
456 -
457 - input_queue.join()
458 - result_queue.join()
459 -
460 -
461 -if __name__ == '__main__':
462 - #debug()
463 - launcher()
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-02-06'
 19+__version__ = '0.1'
 20+
 21+
 22+import bz2
 23+import cStringIO
 24+import hashlib
 25+import codecs
 26+import re
 27+import sys
 28+import progressbar
 29+from multiprocessing import JoinableQueue, Process, cpu_count, current_process
 30+from xml.etree.cElementTree import fromstring, iterparse
 31+from collections import deque
 32+
 33+if '..' not in sys.path:
 34+ sys.path.append('..')
 35+
 36+try:
 37+ from database import cassandra
 38+ import pycassa
 39+
 40+except ImportError:
 41+ print 'I am not going to use Cassandra today, it\'s my off day.'
 42+
 43+
 44+
 45+from database import db
 46+from bots import detector
 47+from utils import file_utils
 48+import extracter
 49+
# NOTE(review): this pattern is not referenced in the visible part of this
# file, confirm whether it is still needed.
RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')

# Namespace id -> namespace name. determine_namespace() walks this mapping to
# recognise titles that should be ignored. The entries that are commented out
# are not part of the mapping and are therefore not checked here.
NAMESPACE = {
    #0:'Main',
    #1:'Talk',
    #2:'User',
    #3:'User talk',
    4:'Wikipedia',
    #5:'Wikipedia talk',
    6:'File',
    #7:'File talk',
    8:'MediaWiki',
    #9:'MediaWiki talk',
    10:'Template',
    #11:'Template talk',
    12:'Help',
    #13:'Help talk',
    14:'Category',
    #15:'Category talk',
    90:'Thread',
    #91:'Thread talk',
    92:'Summary',
    #93:'Summary talk',
    100:'Portal',
    #101:'Portal talk',
    108:'Book',
    #109:'Book talk'
}
 78+
 79+
class Buffer:
    '''
    Collect revision rows in memory and flush them in batches to the
    configured storage backend (cassandra, mongo or csv).
    '''
    def __init__(self, storage, id):
        '''
        storage: name of the backend, one of 'cassandra', 'mongo' or 'csv'.
        id: identifier of this buffer, it is used to build the names of the
        csv output files.
        '''
        assert storage in ('cassandra', 'mongo', 'csv'), \
            'Valid storage options are cassandra, mongo and csv.'
        self.storage = storage
        self.revisions = {}
        self.comments = {}
        self.id = id
        self.keyspace_name = 'enwiki'
        # Column order of a row in the main csv file.
        self.keys = ['revision_id', 'article_id', 'id', 'namespace',
                     'title', 'timestamp', 'hash', 'revert', 'bot',
                     'prev_size', 'cur_size', 'delta']
        self.setup_storage()

    def setup_storage(self):
        '''
        Open the connection or the output files of the selected backend.
        '''
        if self.storage == 'cassandra':
            self.db = pycassa.connect(self.keyspace_name)
            self.collection = pycassa.ColumnFamily(self.db, 'revisions')

        elif self.storage == 'mongo':
            self.db = db.init_mongo_db(self.keyspace_name)
            self.collection = self.db['kaggle']

        else:
            kaggle_file = 'kaggle_%s.csv' % self.id
            comment_file = 'kaggle_comments_%s.csv' % self.id
            # Remove the output of a previous run, the files are opened in
            # append mode below.
            file_utils.delete_file('', kaggle_file, directory=False)
            file_utils.delete_file('', comment_file, directory=False)
            self.fh_main = codecs.open(kaggle_file, 'a', 'utf-8')
            self.fh_extra = codecs.open(comment_file, 'a', 'utf-8')

    def add(self, revision):
        '''
        Queue a revision row, the buffer is flushed once it holds 1000 rows.

        revision: dictionary with the variables of a single revision, it is
        stringified in place and has to contain the key 'revision_id'.
        '''
        self.stringify(revision)
        revision_id = revision['revision_id']
        self.revisions[revision_id] = revision
        if len(self.revisions) == 1000:
            self.store()
            self.clear()

    def stringify(self, revision):
        '''
        Convert all values of revision to strings, in place. Values that
        cannot be converted with str() are encoded as utf-8.
        '''
        # Iterate over a snapshot because the dictionary is updated in place.
        for key, value in list(revision.items()):
            try:
                value = str(value)
            except UnicodeEncodeError:
                value = value.encode('utf-8')
            revision[key] = value

    def empty(self):
        '''
        Flush the remaining rows and close the csv files, if any.
        '''
        self.store()
        self.clear()
        if self.storage == 'csv':
            self.fh_main.close()
            self.fh_extra.close()

    def clear(self):
        '''
        Forget all buffered revisions and comments.
        '''
        self.revisions = {}
        self.comments = {}

    def store(self):
        '''
        Write the buffered rows to the backend. The buffer itself is left
        untouched, call clear() afterwards.
        '''
        if self.storage == 'cassandra':
            self.collection.batch_insert(self.revisions)
        elif self.storage == 'mongo':
            # NOTE: nothing is written for mongo, the rows are dropped.
            print('insert into mongo')
        else:
            for revision in self.revisions.values():
                values = [self._as_text(revision[key]) for key in self.keys]
                # One tab separated line per revision. The row used to be
                # prefixed with the leftover loop variable `key`, which put
                # the literal name of the last column in front of each line.
                self.fh_main.write('\t'.join(values) + '\n')

            for revision_id, comment in self.comments.items():
                fields = [self._as_text(revision_id), self._as_text(comment)]
                self.fh_extra.write('\t'.join(fields) + '\n')

    def _as_text(self, value):
        '''
        Return value as text, byte strings are decoded as utf-8.
        '''
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value
 157+
 158+
def extract_categories(filename='C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-categorylinks.sql',
                       output_filename='categories.csv'):
    '''
    Extract the page id and the category name from a categorylinks sql dump
    and write them, tab separated, to a csv file.

    The tuples of an INSERT statement are expected to have four fields:
    Field 1: page id
    Field 2: name category
    Field 3: sort key
    Field 4: timestamp last change
    Tuples that do not split in exactly four comma separated fields are
    skipped.

    filename: path to the sql dump, the default is the original hard-coded
    location.
    output_filename: path of the csv file that is written.
    '''
    prefix = 'INSERT INTO `categorylinks` VALUES ('
    fh = codecs.open(filename, 'r', encoding='utf-8')
    try:
        output = codecs.open(output_filename, 'w', encoding='utf-8')
        try:
            for line in fh:
                if not line.startswith(prefix):
                    continue
                line = line.replace(prefix, '')
                line = line.replace("'", '')
                for category in line.split('),('):
                    fields = category.split(',')
                    if len(fields) == 4:
                        output.write('%s\t%s\n' % (fields[0], fields[1]))
        except UnicodeDecodeError as e:
            # A decoding error ends the extraction, the rows that were
            # written so far are kept.
            print(e)
        finally:
            output.close()
    finally:
        # Close the handles on every exit path, not only on success.
        fh.close()
 185+
 186+
def extract_revision_text(revision):
    '''
    Return the utf-8 encoded text of a revision.

    revision: element that may contain a 'text' child element.

    An empty byte string is returned when the 'text' child is missing or
    when it has no content. The empty case used to return the text of the
    revision element itself, because the parent element instead of the
    'text' child was inspected.
    '''
    rev = revision.find('text')
    if rev is None or rev.text is None:
        return b''
    return rev.text.encode('utf-8')
 195+
 196+
def fix_revision_text(revision):
    '''
    Make sure that the text attribute of revision is a string: a missing
    text (None) is replaced by an empty string. The same object is returned.
    '''
    text_is_missing = revision.text is None
    if text_is_missing:
        revision.text = ''
    return revision
 201+
 202+
def create_md5hash(text):
    '''
    Return a dictionary with the key 'hash' that holds the hexadecimal md5
    digest of text, or -1 when text is None.
    '''
    if text is None:
        return {'hash': -1}
    digest = hashlib.md5(text).hexdigest()
    return {'hash': digest}
 213+
 214+
def calculate_delta_article_size(size, text):
    '''
    Update the size bookkeeping of an article with the text of its next
    revision.

    size: dictionary that is updated in place; an empty dictionary marks
    the first revision of an article.
    text: text of the current revision.

    Returns size with 'prev_size' (length of the previous text, 0 for the
    first revision), 'cur_size' (length of text) and 'delta' (difference
    between the two).
    '''
    length = len(text)
    previous = size['cur_size'] if 'prev_size' in size else 0
    size['prev_size'] = previous
    size['cur_size'] = length
    size['delta'] = length - previous
    return size
 226+
 227+
def parse_contributor(contributor, bots):
    '''
    Collect the variables that describe the author of a revision.

    contributor: 'contributor' element of a revision.
    bots: bot information that is handed to
    extracter.determine_username_is_bot.

    Returns a dictionary with 'username', 'bot' and the items of the
    dictionary returned by extracter.extract_contributor_id, or False when
    no contributor id could be extracted.
    '''
    username = extracter.extract_username(contributor)
    user_id = extracter.extract_contributor_id(contributor)
    bot = extracter.determine_username_is_bot(contributor, bots=bots)
    if user_id is None:
        return False
    details = {'username': username, 'bot': bot}
    details.update(user_id)
    return details
 240+
 241+
def determine_namespace(title):
    '''
    Determine the namespace of an article from its title.

    title: element whose text attribute holds the title; a plain string is
    accepted as well.

    Returns {'namespace': number} for the main (0), Talk (1), User (2) and
    User talk (3) namespaces. Returns False when the title is missing or
    when the prefix in front of the first colon is one of the names in
    NAMESPACE or the talk variant of such a name.

    The prefix in front of the colon is compared as a whole and without
    regard to case, so 'User talk:Foo' is not mistaken for a User page and
    an article such as 'Talking Heads' stays in the main namespace.
    '''
    tracked = {'talk': 1, 'user': 2, 'user talk': 3}
    text = getattr(title, 'text', title)
    if text is None:
        return False

    prefix, colon, _ = text.partition(':')
    if not colon:
        # Titles without a colon cannot carry a namespace prefix.
        return {'namespace': 0}

    prefix = prefix.strip().lower()
    if prefix in tracked:
        return {'namespace': tracked[prefix]}

    # NAMESPACE maps ids to names, the names are what titles start with.
    for name in NAMESPACE.values():
        name = name.lower()
        if prefix == name or prefix == name + ' talk':
            # article does not belong to either the main namespace, user,
            # talk or user talk namespace.
            return False
    return {'namespace': 0}
 262+
 263+
def prefill_row(title, article_id, namespace):
    '''
    Create a new row with the variables that all revisions of an article
    have in common.

    title: element whose text attribute holds the title of the article.
    article_id: id of the article.
    namespace: dictionary whose items are copied into the row.
    '''
    row = dict(title=title.text, article_id=article_id)
    row.update(namespace)
    return row
 270+
 271+
def is_revision_reverted(hash_cur, hashes):
    '''
    Flag a revision as a revert when its hash was seen before.

    hash_cur: hash of the current revision.
    hashes: collection of hashes of earlier revisions.

    Returns {'revert': 1} when hash_cur is in hashes, else {'revert': 0}.
    '''
    seen_before = hash_cur in hashes
    return {'revert': int(seen_before)}
 279+
 280+
def create_variables(result_queue, storage, id):
    '''
    Consume parsed articles from result_queue, create a row of variables
    for every usable revision and hand the rows to a Buffer.

    result_queue: queue with articles; an article is a dictionary that is
    read through the keys 'title', 'id' and 'revision'. A None item is the
    signal to stop.
    storage: name of the storage backend, it is passed to Buffer.
    id: identifier of this consumer, it is passed to Buffer.
    '''
    bots = detector.retrieve_bots('en')
    buffer = Buffer(storage, id)
    i = 0
    while True:
        article = result_queue.get(block=True)
        # NOTE: the item is acknowledged before it is processed, so
        # result_queue.join() may return while rows are still being created.
        result_queue.task_done()
        if article is None:
            break
        i += 1

        # Keep the element, not its text: determine_namespace and
        # prefill_row both read title.text themselves.
        title = article['title']
        namespace = determine_namespace(title)
        if namespace is not False:
            article_id = article['id'].text
            # Hashes of the most recent revisions, used to detect reverts.
            hashes = deque(maxlen=1000)
            size = {}
            for revision in article['revision']:
                if revision is None:
                    #the entire revision is empty, weird.
                    continue

                contributor = revision.find('contributor')
                contributor = parse_contributor(contributor, bots)
                if not contributor:
                    #editor is anonymous, ignore
                    continue

                revision_id = revision.find('id')
                revision_id = extracter.extract_revision_id(revision_id)
                if revision_id is None:
                    #revision_id is missing, which is weird
                    continue

                row = prefill_row(title, article_id, namespace)
                row['revision_id'] = revision_id
                text = extract_revision_text(revision)
                row.update(contributor)
                row['timestamp'] = revision.find('timestamp').text

                md5 = create_md5hash(text)
                revert = is_revision_reverted(md5['hash'], hashes)
                hashes.append(md5['hash'])
                size = calculate_delta_article_size(size, text)

                row.update(md5)
                row.update(size)
                row.update(revert)
                buffer.add(row)

        if i % 10000 == 0:
            print('Parsed %s articles' % i)

    buffer.empty()
    print('Buffer is empty')
 350+
 351+
def parse_xml(source, result_queue):
    '''
    Parse the xml of a single page and put the result on result_queue.

    source: file-like object with the xml of exactly one <page> element.
    result_queue: queue that receives one dictionary per page:
        'title': the first title element,
        'id': the first id element, which is the id of the page because it
        precedes the ids inside the revisions,
        'revision': list with all revision elements, in document order.

    All revisions are collected; storing them under a single key used to
    overwrite every revision but the last one. The elements are not cleared
    after they are queued, the consumer still has to read them.
    '''
    article = {'revision': []}
    for event, elem in iterparse(source, events=('end',)):
        tag = elem.tag
        if tag == 'revision':
            article['revision'].append(elem)
        elif tag in ('title', 'id') and tag not in article:
            # Only the first occurrence counts, later id elements belong to
            # revisions and contributors.
            article[tag] = elem
    result_queue.put(article)
 368+
 369+
def stream_raw_xml(input_queue, result_queue):
    '''
    Read bz2 compressed xml dumps line by line, cut them into pages and
    hand every page to parse_xml.

    input_queue: queue with the filenames of the dumps, a None item is the
    signal to stop.
    result_queue: queue that receives the parsed pages (through parse_xml)
    and, when the input is exhausted, cpu_count() None items.
    '''
    buffer = cStringIO.StringIO()
    parsing = False

    while True:
        filename = input_queue.get()
        input_queue.task_done()
        if filename is None:
            break

        for data in unzip(filename):
            if data.startswith('<page>'):
                parsing = True
            if parsing:
                buffer.write(data)
                buffer.write('\n')
                if data == '</page>':
                    buffer.seek(0)
                    parse_xml(buffer, result_queue)
                    buffer = cStringIO.StringIO()
                    # The page is complete; without this reset the lines
                    # between two pages (the dump trailer and the header of
                    # the next file) end up in the buffer of the next page.
                    parsing = False

    # NOTE(review): every call emits cpu_count() stop signals; when several
    # producers share result_queue this is more than one signal per
    # consumer - confirm that this is intended.
    for x in range(cpu_count()):
        result_queue.put(None)
    print('Finished parsing bz2 archives')
 399+
 400+
def debug(files=None):
    '''
    Run stream_raw_xml in the current process, without worker processes.

    files: list with the paths of the bz2 dumps to parse, the default is
    the original hard-coded test file.
    '''
    if files is None:
        files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']

    input_queue = JoinableQueue()
    result_queue = JoinableQueue()

    for filename in files:
        input_queue.put(filename)
    # stream_raw_xml only stops when it reads None, without this item it
    # would wait forever for more input.
    input_queue.put(None)

    stream_raw_xml(input_queue, result_queue)
 410+
 411+
def unzip(filename):
    '''
    Filename should be a fully qualified path to the bz2 file that will be
    decompressed. It will iterate line by line and yield every line back to
    the caller, stripped of leading and trailing whitespace.
    '''
    fh = bz2.BZ2File(filename, 'r')
    try:
        for line in fh:
            yield line.strip()
    finally:
        # Also runs when the generator is closed before it is exhausted or
        # when reading fails, so the file handle never leaks.
        fh.close()
 423+
 424+
 425+def setup(storage):
 426+ keyspace_name = 'enwiki'
 427+ if storage == 'cassandra':
 428+ cassandra.install_schema(keyspace_name, drop_first=True)
 429+
 430+
def launcher(files=None, storage='csv'):
    '''
    Start the worker processes that parse the dumps and create the
    variables, then wait until both queues are processed.

    files: list with the paths of the bz2 dumps, the default is the
    original hard-coded dump.
    storage: name of the storage backend, it is passed to setup and to
    create_variables.

    cpu_count() processes run stream_raw_xml and cpu_count() processes run
    create_variables.
    '''
    if files is None:
        #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
        files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']

    setup(storage)
    input_queue = JoinableQueue()
    result_queue = JoinableQueue()
    workers = cpu_count()

    for filename in files:
        input_queue.put(filename)

    # One stop signal for every process that reads from input_queue.
    for x in range(workers):
        input_queue.put(None)

    # The process variables are deliberately not called `extracter`, that
    # is the name of a module this file imports.
    producers = [Process(target=stream_raw_xml, args=[input_queue, result_queue])
                 for x in range(workers)]
    for producer in producers:
        producer.start()

    consumers = [Process(target=create_variables, args=[result_queue, storage, x])
                 for x in range(workers)]
    for consumer in consumers:
        consumer.start()

    input_queue.join()
    result_queue.join()
 459+
 460+
# Script entry point: start the multi-process launcher. The single-process
# debug() call is kept, commented out, as an alternative.
if __name__ == '__main__':
    #debug()
    launcher()
Property changes on: trunk/tools/editor_trends/etl/enricher.py
___________________________________________________________________
Added: svn:eol-style
464464 + native
Property changes on: trunk/tools/editor_trends/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
465465 - text/plain
Property changes on: trunk/tools/editor_trends/classes/consumers.py
___________________________________________________________________
Deleted: svn:mime-type
466466 - text/plain
Property changes on: trunk/tools/editor_trends/classes/runtime_settings.py
___________________________________________________________________
Deleted: svn:mime-type
467467 - text/plain
Property changes on: trunk/tools/editor_trends/utils/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
468468 - text/plain
Property changes on: trunk/tools/editor_trends/utils/file_utils.py
___________________________________________________________________
Deleted: svn:mime-type
469469 - text/plain
Property changes on: trunk/tools/editor_trends/utils/http_utils.py
___________________________________________________________________
Deleted: svn:mime-type
470470 - text/plain
Property changes on: trunk/tools/editor_trends/utils/ordered_dict.py
___________________________________________________________________
Deleted: svn:mime-type
471471 - text/plain
Property changes on: trunk/tools/editor_trends/database/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
472472 - text/plain
Property changes on: trunk/tools/editor_trends/database/cache.py
___________________________________________________________________
Deleted: svn:mime-type
473473 - text/plain
Property changes on: trunk/tools/editor_trends/database/db.py
___________________________________________________________________
Deleted: svn:mime-type
474474 - text/plain
Index: trunk/tools/editor_trends/mapreduce/__init__.py
@@ -1 +1 @@
2 -
 2+
Property changes on: trunk/tools/editor_trends/mapreduce/__init__.py
___________________________________________________________________
Added: svn:eol-style
33 + native
Property changes on: trunk/tools/editor_trends/bots/detector.py
___________________________________________________________________
Deleted: svn:mime-type
44 - text/plain
Property changes on: trunk/tools/editor_trends/bots/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
55 - text/plain
Property changes on: trunk/tools/editor_trends/code-snippets/chunker.py
___________________________________________________________________
Deleted: svn:mime-type
66 - text/plain
Property changes on: trunk/tools/editor_trends/code-snippets/exporter.py
___________________________________________________________________
Deleted: svn:mime-type
77 - text/plain
Property changes on: trunk/tools/editor_trends/code-snippets/process_constructor.py
___________________________________________________________________
Deleted: svn:mime-type
88 - text/plain

Status & tagging log