r84505 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r84504 | r84505 | r84506 >
Date:	00:15, 22 March 2011
Author:	reedy
Status:	deferred
Tags:
Comment:	Fix svn:eol-style native, remove svn:mime-type text/plain
Modified paths:	/trunk/tools/editor_trends/__init__.py (modified) (history) /trunk/tools/editor_trends/analyses/inventory.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/__init__.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/histogram_edits.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/new_editor_count.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py (modified) (history) /trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py (modified) (history) /trunk/tools/editor_trends/bots/__init__.py (modified) (history) /trunk/tools/editor_trends/bots/detector.py (modified) (history) /trunk/tools/editor_trends/classes/consumers.py (modified) (history) /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history) /trunk/tools/editor_trends/code-snippets/chunker.py (modified) (history) /trunk/tools/editor_trends/code-snippets/exporter.py (modified) (history) /trunk/tools/editor_trends/code-snippets/process_constructor.py (modified) (history) /trunk/tools/editor_trends/database/__init__.py (modified) (history) /trunk/tools/editor_trends/database/cache.py (modified) (history) /trunk/tools/editor_trends/database/db.py (modified) (history) /trunk/tools/editor_trends/etl/enricher.py (modified) (history) /trunk/tools/editor_trends/manage.py (modified) (history) /trunk/tools/editor_trends/mapreduce/__init__.py (modified) (history) 
/trunk/tools/editor_trends/utils/__init__.py (modified) (history) /trunk/tools/editor_trends/utils/file_utils.py (modified) (history) /trunk/tools/editor_trends/utils/http_utils.py (modified) (history) /trunk/tools/editor_trends/utils/ordered_dict.py (modified) (history) /trunk/tools/editor_trends/wikitree/__init__.py (modified) (history) /trunk/tools/editor_trends/wikitree/parser.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
—	—	@@ -1,42 +1,42 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		~~-def cohort_dataset_forward_bar(var, editor, **kwargs):~~
23		~~- '''~~
24		~~- The forward looking bar charts looks for every month that an editor~~
25		~~- was part of the Wikimedia community whether this person made at least cutoff~~
26		~~- value edits. If yes, then include this person in the analysis, else skip the~~
27		~~- person.~~
28		~~- '''~~
29		~~- new_wikipedian = editor['new_wikipedian']~~
30		~~- last_edit = editor['final_edit']~~
31		~~- monthly_edits = editor['monthly_edits']~~
32		~~- yearly_edits = editor['edits_by_year']~~
33		~~- n = editor['edit_count']~~
34		-
35		~~- if n >= var.cum_cutoff:~~
36		~~- for year in xrange(new_wikipedian.year, var.max_year):~~
37		~~- max_edits = max(monthly_edits.get(str(year), {0:0}).values())~~
38		~~- if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:~~
39		~~- continue~~
40		~~- else:~~
41		~~- experience = (year - new_wikipedian.year) + 1~~
42		~~- var.add(new_wikipedian, 1, {'experience':experience})~~
43		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+def cohort_dataset_forward_bar(var, editor, **kwargs):
	23	+ '''
	24	+ The forward looking bar charts looks for every month that an editor
	25	+ was part of the Wikimedia community whether this person made at least cutoff
	26	+ value edits. If yes, then include this person in the analysis, else skip the
	27	+ person.
	28	+ '''
	29	+ new_wikipedian = editor['new_wikipedian']
	30	+ last_edit = editor['final_edit']
	31	+ monthly_edits = editor['monthly_edits']
	32	+ yearly_edits = editor['edits_by_year']
	33	+ n = editor['edit_count']
	34	+
	35	+ if n >= var.cum_cutoff:
	36	+ for year in xrange(new_wikipedian.year, var.max_year):
	37	+ max_edits = max(monthly_edits.get(str(year), {0:0}).values())
	38	+ if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:
	39	+ continue
	40	+ else:
	41	+ experience = (year - new_wikipedian.year) + 1
	42	+ var.add(new_wikipedian, 1, {'experience':experience})
	43	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_bar.py
___________________________________________________________________
Added: svn:eol-style
44	44	+ native
Index: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
—	—	@@ -1,26 +1,26 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		~~-def time_to_new_wikipedian(var, editor, **kwargs):~~
22		~~-# headers = ['year', 'time_to_new_wikipedian']~~
23		~~- new_wikipedian = editor['new_wikipedian']~~
24		~~- first_edit = editor['first_edit']~~
25		~~- dt = new_wikipedian - first_edit~~
26		~~- var.add(new_wikipedian, dt.days)~~
27		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+def time_to_new_wikipedian(var, editor, **kwargs):
	22	+# headers = ['year', 'time_to_new_wikipedian']
	23	+ new_wikipedian = editor['new_wikipedian']
	24	+ first_edit = editor['first_edit']
	25	+ dt = new_wikipedian - first_edit
	26	+ var.add(new_wikipedian, dt.days)
	27	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/time_to_new_wikipedian.py
___________________________________________________________________
Added: svn:eol-style
28	28	+ native
Property changes on: trunk/tools/editor_trends/analyses/plugins/__init__.py
___________________________________________________________________
Added: svn:eol-style
29	29	+ native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
—	—	@@ -1,25 +1,25 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		-
23		~~-def total_number_of_new_wikipedians(var, editor, **kwargs):~~
24		~~- new_wikipedian = editor['new_wikipedian']~~
25		~~- var.add(new_wikipedian, 1, {'year':new_wikipedian.year})~~
26		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+
	23	+def total_number_of_new_wikipedians(var, editor, **kwargs):
	24	+ new_wikipedian = editor['new_wikipedian']
	25	+ var.add(new_wikipedian, 1, {'year':new_wikipedian.year})
	26	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/total_number_of_new_wikipedians.py
___________________________________________________________________
Added: svn:eol-style
27	27	+ native
Property changes on: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
___________________________________________________________________
Added: svn:eol-style
28	28	+ native
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
—	—	@@ -1,51 +1,51 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		~~-import datetime~~
22		-from dateutil.relativedelta import *
23		-
24		~~-def cohort_dataset_forward_histogram(var, editor, **kwargs):~~
25		~~-# headers = ['year', 'month', 'edits']~~
26		~~- '''~~
27		~~- The forward looking histogram looks for every month that an editor~~
28		~~- was part of the Wikimedia community whether this person made at least cutoff~~
29		~~- value edits. If yes, then include this person in the analysis, else skip the~~
30		~~- person.~~
31		~~- '''~~
32		-
33		~~- new_wikipedian = editor['new_wikipedian']~~
34		~~- final_edit = editor['final_edit'].year + 1~~
35		~~- yearly_edits = editor['edits_by_year']~~
36		~~- n = editor['edit_count']~~
37		-
38		~~- if n >= var.cum_cutoff:~~
39		~~- for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):~~
40		~~- edits = editor['monthly_edits'].get(str(year), {0:0})~~
41		~~- if year == new_wikipedian.year:~~
42		~~- start = new_wikipedian.month~~
43		~~- else:~~
44		~~- start = 1~~
45		-
46		~~- for month in xrange(start, 13):~~
47		~~- if edits.get(str(month), 0) >= var.cutoff:~~
48		~~- dt = datetime.datetime(year, month, 1)~~
49		~~- experience = relativedelta(dt - new_wikipedian)~~
50		~~- experience = experience.years * 12 + experience.months~~
51		~~- var.add(new_wikipedian, 1, {'experience': experience})~~
52		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+import datetime
	22	+from dateutil.relativedelta import *
	23	+
	24	+def cohort_dataset_forward_histogram(var, editor, **kwargs):
	25	+# headers = ['year', 'month', 'edits']
	26	+ '''
	27	+ The forward looking histogram looks for every month that an editor
	28	+ was part of the Wikimedia community whether this person made at least cutoff
	29	+ value edits. If yes, then include this person in the analysis, else skip the
	30	+ person.
	31	+ '''
	32	+
	33	+ new_wikipedian = editor['new_wikipedian']
	34	+ final_edit = editor['final_edit'].year + 1
	35	+ yearly_edits = editor['edits_by_year']
	36	+ n = editor['edit_count']
	37	+
	38	+ if n >= var.cum_cutoff:
	39	+ for i, year in enumerate(xrange(new_wikipedian.year, final_edit)):
	40	+ edits = editor['monthly_edits'].get(str(year), {0:0})
	41	+ if year == new_wikipedian.year:
	42	+ start = new_wikipedian.month
	43	+ else:
	44	+ start = 1
	45	+
	46	+ for month in xrange(start, 13):
	47	+ if edits.get(str(month), 0) >= var.cutoff:
	48	+ dt = datetime.datetime(year, month, 1)
	49	+ experience = relativedelta(dt - new_wikipedian)
	50	+ experience = experience.years * 12 + experience.months
	51	+ var.add(new_wikipedian, 1, {'experience': experience})
	52	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_forward_histogram.py
___________________________________________________________________
Added: svn:eol-style
53	53	+ native
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
—	—	@@ -1,50 +1,50 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		~~-import datetime~~
22		~~-from dateutil.relativedelta import relativedelta~~
23		~~-from utils import data_converter~~
24		-
25		-
26		~~-def cohort_dataset_backward_bar(var, editor, **kwargs):~~
27		~~- '''~~
28		~~- The backward looking bar chart looks for every year that an editor~~
29		~~- was part of the Wikimedia community whether this person made at least cutoff~~
30		~~- value edits. If yes, then include this person in the analysis, else skip the~~
31		~~- person.~~
32		~~- '''~~
33		~~- break_down = kwargs.pop('break_down', False)~~
34		~~- new_wikipedian = editor['new_wikipedian']~~
35		~~- n = editor['edit_count']~~
36		-
37		~~- if n >= var.cum_cutoff:~~
38		~~- windows = data_converter.create_windows(var, break_down_first_year=break_down)~~
39		~~- for year in xrange(new_wikipedian.year, var.max_year):~~
40		~~- year = str(year)~~
41		~~- if editor['edits_by_year'][year] >= var.cutoff:~~
42		~~- last_edit = editor['last_edit_by_year'][year]~~
43		~~- if last_edit != 0.0:~~
44		~~- editor_dt = relativedelta(last_edit, new_wikipedian)~~
45		~~- editor_dt = (editor_dt.years * 12) + editor_dt.months~~
46		~~- for w in windows:~~
47		~~- if w >= editor_dt:~~
48		~~- datum = datetime.datetime(int(year), 12, 31)~~
49		~~- var.add(datum, 1, {'window':w})~~
50		~~- break~~
51		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+import datetime
	22	+from dateutil.relativedelta import relativedelta
	23	+from utils import data_converter
	24	+
	25	+
	26	+def cohort_dataset_backward_bar(var, editor, **kwargs):
	27	+ '''
	28	+ The backward looking bar chart looks for every year that an editor
	29	+ was part of the Wikimedia community whether this person made at least cutoff
	30	+ value edits. If yes, then include this person in the analysis, else skip the
	31	+ person.
	32	+ '''
	33	+ break_down = kwargs.pop('break_down', False)
	34	+ new_wikipedian = editor['new_wikipedian']
	35	+ n = editor['edit_count']
	36	+
	37	+ if n >= var.cum_cutoff:
	38	+ windows = data_converter.create_windows(var, break_down_first_year=break_down)
	39	+ for year in xrange(new_wikipedian.year, var.max_year):
	40	+ year = str(year)
	41	+ if editor['edits_by_year'][year] >= var.cutoff:
	42	+ last_edit = editor['last_edit_by_year'][year]
	43	+ if last_edit != 0.0:
	44	+ editor_dt = relativedelta(last_edit, new_wikipedian)
	45	+ editor_dt = (editor_dt.years * 12) + editor_dt.months
	46	+ for w in windows:
	47	+ if w >= editor_dt:
	48	+ datum = datetime.datetime(int(year), 12, 31)
	49	+ var.add(datum, 1, {'window':w})
	50	+ break
	51	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
___________________________________________________________________
Added: svn:eol-style
52	52	+ native
Index: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
—	—	@@ -1,26 +1,26 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		~~-def histogram_edits(var, editor, **kwargs):~~
23		~~-# headers = ['year', 'num_edits', 'frequency']~~
24		~~- cnt = editor['edit_count']~~
25		~~- new_wikipedian = editor['new_wikipedian']~~
26		~~- var.add(new_wikipedian, cnt)~~
27		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+def histogram_edits(var, editor, **kwargs):
	23	+# headers = ['year', 'num_edits', 'frequency']
	24	+ cnt = editor['edit_count']
	25	+ new_wikipedian = editor['new_wikipedian']
	26	+ var.add(new_wikipedian, cnt)
	27	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/histogram_edits.py
___________________________________________________________________
Added: svn:eol-style
28	28	+ native
Index: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
—	—	@@ -1,28 +1,28 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		~~-def total_number_of_articles(var, editor, **kwargs):~~
23		~~- for year in editor['edits']:~~
24		~~- edits = editor['edits'][year]~~
25		~~- for edit in edits:~~
26		~~- article = edit['article']~~
27		~~- date = edit['date']~~
28		~~- var.add(date, 1, {'article':article})~~
29		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+def total_number_of_articles(var, editor, **kwargs):
	23	+ for year in editor['edits']:
	24	+ edits = editor['edits'][year]
	25	+ for edit in edits:
	26	+ article = edit['article']
	27	+ date = edit['date']
	28	+ var.add(date, 1, {'article':article})
	29	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/total_number_of_articles.py
___________________________________________________________________
Added: svn:eol-style
30	30	+ native
Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
—	—	@@ -1,31 +1,31 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-01-25'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		~~-def new_editor_count(var, editor, **kwargs):~~
23		~~- '''~~
24		~~- Summary: This function generates an overview of the number of~~
25		~~- new_wikipedians for a given year / month combination.~~
26		~~- Purpose: This data can be used to compare with Erik Zachte's~~
27		~~- stats.download.org to make sure that we are using the same numbers.~~
28		~~- '''~~
29		~~-# headers = ['year', 'month', 'count']~~
30		~~- new_wikipedian = editor['new_wikipedian']~~
31		~~- var.add(new_wikipedian, 1)~~
32		~~- return var~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-25'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+def new_editor_count(var, editor, **kwargs):
	23	+ '''
	24	+ Summary: This function generates an overview of the number of
	25	+ new_wikipedians for a given year / month combination.
	26	+ Purpose: This data can be used to compare with Erik Zachte's
	27	+ stats.download.org to make sure that we are using the same numbers.
	28	+ '''
	29	+# headers = ['year', 'month', 'count']
	30	+ new_wikipedian = editor['new_wikipedian']
	31	+ var.add(new_wikipedian, 1)
	32	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
___________________________________________________________________
Added: svn:eol-style
33	33	+ native
Index: trunk/tools/editor_trends/analyses/inventory.py
—	—	@@ -1,70 +1,70 @@
2		~~-#!/usr/bin/python~~
3		~~-# coding=utf-8~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http,//www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__author__email = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-02-11'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		~~-import os~~
23		~~-import sys~~
24		~~-import types~~
25		-
26		~~-def available_analyses(caller='manage'):~~
27		~~- '''~~
28		~~- Generates a dictionary:~~
29		~~- key: name of analysis~~
30		~~- value: function that generates the dataset~~
31		~~- ignore: a list of functions that should never be called from manage.py,~~
32		~~- they are not valid entry points.~~
33		~~- '''~~
34		~~- assert caller == 'django' or caller == 'manage'~~
35		~~- ignore = ['__init__']~~
36		~~- functions = {}~~
37		-
38		~~- fn = os.path.realpath(__file__)~~
39		~~- pos = fn.rfind(os.sep)~~
40		~~- loc = fn[:pos]~~
41		~~- path = os.path.join(loc , 'plugins')~~
42		~~- plugins = import_libs(path)~~
43		-
44		~~- for plugin in plugins:~~
45		~~- if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:~~
46		~~- functions[plugin.func_name] = plugin~~
47		~~- if caller == 'manage':~~
48		~~- return functions~~
49		~~- elif caller == 'django':~~
50		~~- django_functions = []~~
51		~~- for function in functions:~~
52		~~- fancy_name = function.replace('_', ' ').title()~~
53		~~- django_functions.append((function, fancy_name))~~
54		-
55		~~- return django_functions~~
56		-
57		-
58		~~-def import_libs(path):~~
59		~~- '''~~
60		~~- Dynamically importing functions from the plugins directory.~~
61		~~- '''~~
62		~~- library_list = []~~
63		~~- sys.path.append(path)~~
64		~~- for f in os.listdir(os.path.abspath(path)):~~
65		~~- module_name, ext = os.path.splitext(f)~~
66		~~- if ext == '.py':~~
67		~~- module = __import__(module_name)~~
68		~~- func = getattr(module, module_name)~~
69		~~- library_list.append(func)~~
70		-
71		~~- return library_list~~
	2	+#!/usr/bin/python
	3	+# coding=utf-8
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http,//www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__author__email = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-02-11'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+import os
	23	+import sys
	24	+import types
	25	+
	26	+def available_analyses(caller='manage'):
	27	+ '''
	28	+ Generates a dictionary:
	29	+ key: name of analysis
	30	+ value: function that generates the dataset
	31	+ ignore: a list of functions that should never be called from manage.py,
	32	+ they are not valid entry points.
	33	+ '''
	34	+ assert caller == 'django' or caller == 'manage'
	35	+ ignore = ['__init__']
	36	+ functions = {}
	37	+
	38	+ fn = os.path.realpath(__file__)
	39	+ pos = fn.rfind(os.sep)
	40	+ loc = fn[:pos]
	41	+ path = os.path.join(loc , 'plugins')
	42	+ plugins = import_libs(path)
	43	+
	44	+ for plugin in plugins:
	45	+ if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
	46	+ functions[plugin.func_name] = plugin
	47	+ if caller == 'manage':
	48	+ return functions
	49	+ elif caller == 'django':
	50	+ django_functions = []
	51	+ for function in functions:
	52	+ fancy_name = function.replace('_', ' ').title()
	53	+ django_functions.append((function, fancy_name))
	54	+
	55	+ return django_functions
	56	+
	57	+
	58	+def import_libs(path):
	59	+ '''
	60	+ Dynamically importing functions from the plugins directory.
	61	+ '''
	62	+ library_list = []
	63	+ sys.path.append(path)
	64	+ for f in os.listdir(os.path.abspath(path)):
	65	+ module_name, ext = os.path.splitext(f)
	66	+ if ext == '.py':
	67	+ module = __import__(module_name)
	68	+ func = getattr(module, module_name)
	69	+ library_list.append(func)
	70	+
	71	+ return library_list
Property changes on: trunk/tools/editor_trends/analyses/inventory.py
___________________________________________________________________
Added: svn:eol-style
72	72	+ native
Property changes on: trunk/tools/editor_trends/manage.py
___________________________________________________________________
Deleted: svn:mime-type
73	73	- text/plain
Property changes on: trunk/tools/editor_trends/wikitree/parser.py
___________________________________________________________________
Deleted: svn:mime-type
74	74	- text/plain
Property changes on: trunk/tools/editor_trends/wikitree/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
75	75	- text/plain
Index: trunk/tools/editor_trends/etl/enricher.py
—	—	@@ -1,462 +1,462 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2011-02-06'~~
19		~~-__version__ = '0.1'~~
20		-
21		-
22		~~-import bz2~~
23		~~-import cStringIO~~
24		~~-import hashlib~~
25		~~-import codecs~~
26		~~-import re~~
27		~~-import sys~~
28		~~-import progressbar~~
29		~~-from multiprocessing import JoinableQueue, Process, cpu_count, current_process~~
30		~~-from xml.etree.cElementTree import fromstring, iterparse~~
31		~~-from collections import deque~~
32		-
33		~~-if '..' not in sys.path:~~
34		~~- sys.path.append('..')~~
35		-
36		~~-try:~~
37		~~- from database import cassandra~~
38		~~- import pycassa~~
39		-
40		~~-except ImportError:~~
41		~~- print 'I am not going to use Cassandra today, it\'s my off day.'~~
42		-
43		-
44		-
45		~~-from database import db~~
46		~~-from bots import detector~~
47		~~-from utils import file_utils~~
48		~~-import extracter~~
49		-
50		~~-RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')~~
51		-
52		~~-NAMESPACE = {~~
53		~~- #0:'Main',~~
54		~~- #1:'Talk',~~
55		~~- #2:'User',~~
56		~~- #3:'User talk',~~
57		~~- 4:'Wikipedia',~~
58		~~- #5:'Wikipedia talk',~~
59		~~- 6:'File',~~
60		~~- #7:'File talk',~~
61		~~- 8:'MediaWiki',~~
62		~~- #9:'MediaWiki talk',~~
63		~~- 10:'Template',~~
64		~~- #11:'Template talk',~~
65		~~- 12:'Help',~~
66		~~- #13:'Help talk',~~
67		~~- 14:'Category',~~
68		~~- #15:'Category talk',~~
69		~~- 90:'Thread',~~
70		~~- #91:'Thread talk',~~
71		~~- 92:'Summary',~~
72		~~- #93:'Summary talk',~~
73		~~- 100:'Portal',~~
74		~~- #101:'Portal talk',~~
75		~~- 108:'Book',~~
76		~~- #109:'Book talk'~~
77		-}
78		-
79		-
80		~~-class Buffer:~~
81		~~- def __init__(self, storage, id):~~
82		~~- assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \~~
83		~~- 'Valid storage options are cassandra and mongo.'~~
84		~~- self.storage = storage~~
85		~~- self.revisions = {}~~
86		~~- self.comments = {}~~
87		~~- self.id = id~~
88		~~- self.keyspace_name = 'enwiki'~~
89		~~- self.keys = ['revision_id', 'article_id', 'id', 'namespace',~~
90		~~- 'title', 'timestamp', 'hash', 'revert', 'bot', 'prev_size',~~
91		~~- 'cur_size', 'delta']~~
92		~~- self.setup_storage()~~
93		-
94		~~- def setup_storage(self):~~
95		~~- if self.storage == 'cassandra':~~
96		~~- self.db = pycassa.connect(self.keyspace_name)~~
97		~~- self.collection = pycassa.ColumnFamily(self.db, 'revisions')~~
98		-
99		~~- elif self.storage == 'mongo':~~
100		~~- self.db = db.init_mongo_db(self.keyspace_name)~~
101		~~- self.collection = self.db['kaggle']~~
102		-
103		~~- else:~~
104		~~- kaggle_file = 'kaggle_%s.csv' % self.id~~
105		~~- comment_file = 'kaggle_comments_%s.csv' % self.id~~
106		~~- file_utils.delete_file('', kaggle_file, directory=False)~~
107		~~- file_utils.delete_file('', comment_file, directory=False)~~
108		~~- self.fh_main = codecs.open(kaggle_file, 'a', 'utf-8')~~
109		~~- self.fh_extra = codecs.open(comment_file, 'a', 'utf-8')~~
110		-
111		~~- def add(self, revision):~~
112		~~- self.stringify(revision)~~
113		~~- id = revision['revision_id']~~
114		~~- self.revisions[id] = revision~~
115		~~- if len(self.revisions) == 1000:~~
116		~~- self.store()~~
117		~~- self.clear()~~
118		-
119		~~- def stringify(self, revision):~~
120		~~- for key, value in revision.iteritems():~~
121		~~- try:~~
122		~~- value = str(value)~~
123		~~- except UnicodeEncodeError:~~
124		~~- value = value.encode('utf-8')~~
125		~~- revision[key] = value~~
126		-
127		~~- def empty(self):~~
128		~~- self.store()~~
129		~~- self.clear()~~
130		~~- if self.storage == 'csv':~~
131		~~- self.fh_main.close()~~
132		~~- self.fh_extra.close()~~
133		-
134		~~- def clear(self):~~
135		~~- self.revisions = {}~~
136		~~- self.comments = {}~~
137		-
138		~~- def store(self):~~
139		~~- if self.storage == 'cassandra':~~
140		~~- self.collection.batch_insert(self.revisions)~~
141		~~- elif self.storage == 'mongo':~~
142		~~- print 'insert into mongo'~~
143		~~- else:~~
144		~~- for revision in self.revisions.itervalues():~~
145		~~- values = []~~
146		~~- for key in self.keys:~~
147		~~- values.append(revision[key].decode('utf-8'))~~
148		-
149		~~- value = '\t'.join(values) + '\n'~~
150		~~- row = '\t'.join([key, value])~~
151		~~- self.fh_main.write(row)~~
152		-
153		~~- for revision_id, comment in self.comments.iteritems():~~
154		~~- comment = comment.decode('utf-8')~~
155		~~- row = '\t'.join([revision_id, comment]) + '\n'~~
156		~~- self.fh_extra.write(row)~~
157		-
158		-
159		~~-def extract_categories():~~
160		~~- '''~~
161		~~- Field 1: page id~~
162		~~- Field 2: name category~~
163		~~- Field 3: sort key~~
164		~~- Field 4: timestamp last change~~
165		~~- '''~~
166		~~- filename = 'C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-categorylinks.sql'~~
167		~~- output = codecs.open('categories.csv', 'w', encoding='utf-8')~~
168		~~- fh = codecs.open(filename, 'r', encoding='utf-8')~~
169		-
170		~~- try:~~
171		~~- for line in fh:~~
172		~~- if line.startswith('INSERT INTO `categorylinks` VALUES ('):~~
173		~~- line = line.replace('INSERT INTO `categorylinks` VALUES (', '')~~
174		~~- line = line.replace("'", '')~~
175		~~- categories = line.split('),(')~~
176		~~- for category in categories:~~
177		~~- category = category.split(',')~~
178		~~- if len(category) == 4:~~
179		~~- output.write('%s\t%s\n' % (category[0], category[1]))~~
180		~~- except UnicodeDecodeError, e:~~
181		~~- print e~~
182		-
183		~~- output.close()~~
184		~~- fh.close()~~
185		-
186		-
187		~~-def extract_revision_text(revision):~~
188		~~- rev = revision.find('text')~~
189		~~- if rev != None:~~
190		~~- if rev.text == None:~~
191		~~- rev = fix_revision_text(revision)~~
192		~~- return rev.text.encode('utf-8')~~
193		~~- else:~~
194		~~- return ''~~
195		-
196		-
197		~~-def fix_revision_text(revision):~~
198		~~- if revision.text == None:~~
199		~~- revision.text = ''~~
200		~~- return revision~~
201		-
202		-
203		~~-def create_md5hash(text):~~
204		~~- hash = {}~~
205		~~- if text != None:~~
206		~~- m = hashlib.md5()~~
207		~~- m.update(text)~~
208		~~- #echo m.digest()~~
209		~~- hash['hash'] = m.hexdigest()~~
210		~~- else:~~
211		~~- hash['hash'] = -1~~
212		~~- return hash~~
213		-
214		-
215		~~-def calculate_delta_article_size(size, text):~~
216		~~- if 'prev_size' not in size:~~
217		~~- size['prev_size'] = 0~~
218		~~- size['cur_size'] = len(text)~~
219		~~- size['delta'] = len(text)~~
220		~~- else:~~
221		~~- size['prev_size'] = size['cur_size']~~
222		~~- delta = len(text) - size['prev_size']~~
223		~~- size['cur_size'] = len(text)~~
224		~~- size['delta'] = delta~~
225		~~- return size~~
226		-
227		-
228		~~-def parse_contributor(contributor, bots):~~
229		~~- username = extracter.extract_username(contributor)~~
230		~~- user_id = extracter.extract_contributor_id(contributor)~~
231		~~- bot = extracter.determine_username_is_bot(contributor, bots=bots)~~
232		~~- contributor = {}~~
233		~~- contributor['username'] = username~~
234		~~- contributor['bot'] = bot~~
235		~~- if user_id != None:~~
236		~~- contributor.update(user_id)~~
237		~~- else:~~
238		~~- contributor = False~~
239		~~- return contributor~~
240		-
241		-
242		~~-def determine_namespace(title):~~
243		~~- namespaces = {'User': 2,~~
244		~~- 'Talk': 1,~~
245		~~- 'User Talk': 3,~~
246		~~- }~~
247		~~- ns = {}~~
248		~~- if title.text != None:~~
249		~~- title = title.text~~
250		~~- for namespace in namespaces:~~
251		~~- if title.startswith(namespace):~~
252		~~- ns['namespace'] = namespaces[namespace]~~
253		~~- if ns == {}:~~
254		~~- for namespace in NAMESPACE:~~
255		~~- if title.startswith(namespace):~~
256		~~- ns = False #article does not belong to either the main namespace, user, talk or user talk namespace.~~
257		~~- break~~
258		~~- ns['namespace'] = 0~~
259		~~- else:~~
260		~~- ns = False~~
261		~~- return ns~~
262		-
263		-
264		~~-def prefill_row(title, article_id, namespace):~~
265		~~- row = {}~~
266		~~- row['title'] = title.text~~
267		~~- row['article_id'] = article_id~~
268		~~- row.update(namespace)~~
269		~~- return row~~
270		-
271		-
272		~~-def is_revision_reverted(hash_cur, hashes):~~
273		~~- revert = {}~~
274		~~- if hash_cur in hashes:~~
275		~~- revert['revert'] = 1~~
276		~~- else:~~
277		~~- revert['revert'] = 0~~
278		~~- return revert~~
279		-
280		-
281		~~-def create_variables(result_queue, storage, id):~~
282		~~- bots = detector.retrieve_bots('en')~~
283		~~- buffer = Buffer(storage, id)~~
284		~~- i = 0~~
285		~~- while True:~~
286		~~- article = result_queue.get(block=True)~~
287		~~- result_queue.task_done()~~
288		~~- if article == None:~~
289		~~- break~~
290		~~- i += 1~~
291		~~- #article = fromstring(article)~~
292		~~- title = article['title'].text~~
293		~~- namespace = determine_namespace(title)~~
294		~~- if namespace != False:~~
295		~~- #revisions = article.findall('revision')~~
296		~~- article_id = article['id'].text~~
297		~~- hashes = deque(maxlen=1000)~~
298		~~- size = {}~~
299		~~- for revision in article['revision']:~~
300		~~- if revision == None:~~
301		~~- #the entire revision is empty, weird.~~
302		~~- continue~~
303		-
304		~~- contributor = revision.find('contributor')~~
305		~~- contributor = parse_contributor(contributor, bots)~~
306		~~- if not contributor:~~
307		~~- #editor is anonymous, ignore~~
308		~~- continue~~
309		-
310		~~- revision_id = revision.find('id')~~
311		~~- revision_id = extracter.extract_revision_id(revision_id)~~
312		~~- if revision_id == None:~~
313		~~- #revision_id is missing, which is weird~~
314		~~- continue~~
315		-
316		~~- row = prefill_row(title, article_id, namespace)~~
317		~~- row['revision_id'] = revision_id~~
318		~~- text = extract_revision_text(revision)~~
319		~~- row.update(contributor)~~
320		-
321		-
322		~~- timestamp = revision.find('timestamp').text~~
323		~~- row['timestamp'] = timestamp~~
324		-
325		~~- hash = create_md5hash(text)~~
326		~~- revert = is_revision_reverted(hash['hash'], hashes)~~
327		~~- hashes.append(hash['hash'])~~
328		~~- size = calculate_delta_article_size(size, text)~~
329		-
330		~~- row.update(hash)~~
331		~~- row.update(size)~~
332		~~- row.update(revert)~~
333		~~- # print row~~
334		~~- # if row['username'] == None:~~
335		~~- # contributor = revision.find('contributor')~~
336		~~- # attrs = contributor.getchildren()~~
337		~~- # for attr in attrs:~~
338		~~- # print attr.text~~
339		~~- #print revision_id, hash, delta, prev_size\~~
340		-
341		~~- buffer.add(row)~~
342		~~- if i % 10000 == 0:~~
343		~~- print 'Parsed %s articles' % i~~
344		~~-# except ValueError, e:~~
345		~~-# print e~~
346		~~-# except UnicodeDecodeError, e:~~
347		~~-# print e~~
348		~~- buffer.empty()~~
349		~~- print 'Buffer is empty'~~
350		-
351		-
352		~~-def parse_xml(source, result_queue):~~
353		~~- context = iterparse(source, events=('end',))~~
354		~~- context = iter(context)~~
355		~~- event, root = context.next()~~
356		-
357		~~- article = {}~~
358		~~- id = False~~
359		~~- for event, elem in context:~~
360		~~- if event == 'end' and elem.tag == 'revision':~~
361		~~- article[elem.tag] = elem~~
362		~~- elif event == 'end' and elem.tag == 'id' and id == False:~~
363		~~- article[elem.tag] = elem~~
364		~~- id = True~~
365		~~- article[root.tag] = root~~
366		~~- result_queue.put(article)~~
367		~~- root.clear()~~
368		-
369		-
370		~~-def stream_raw_xml(input_queue, result_queue):~~
371		~~- buffer = cStringIO.StringIO()~~
372		~~- parsing = False~~
373		-
374		~~- while True:~~
375		~~- filename = input_queue.get()~~
376		~~- input_queue.task_done()~~
377		~~- if filename == None:~~
378		~~- break~~
379		-
380		~~- #filesize = file_utils.determine_filesize('', filename)~~
381		~~- #pbar = progressbar.ProgressBar().start()~~
382		-
383		~~- for data in unzip(filename):~~
384		~~- if data.startswith('<page>'):~~
385		~~- parsing = True~~
386		~~- if parsing:~~
387		~~- buffer.write(data)~~
388		~~- buffer.write('\n')~~
389		~~- if data == '</page>':~~
390		~~- buffer.seek(0)~~
391		~~- parse_xml(buffer, result_queue)~~
392		~~- buffer = cStringIO.StringIO()~~
393		~~- #pbar.update(pbar.currval + len(data)) #is inaccurate!!!~~
394		-
395		-
396		~~- for x in xrange(cpu_count()):~~
397		~~- result_queue.put(None)~~
398		~~- print 'Finished parsing bz2 archives'~~
399		-
400		-
401		~~-def debug():~~
402		~~- input_queue = JoinableQueue()~~
403		~~- result_queue = JoinableQueue()~~
404		~~- files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']~~
405		-
406		~~- for file in files:~~
407		~~- input_queue.put(file)~~
408		-
409		~~- stream_raw_xml(input_queue, result_queue)~~
410		-
411		-
412		~~-def unzip(filename):~~
413		~~- '''~~
414		~~- Filename should be a fully qualified path to the bz2 file that will be~~
415		~~- decompressed. It will iterate line by line and yield this back to~~
416		~~- create_article~~
417		~~- '''~~
418		~~- fh = bz2.BZ2File(filename, 'r')~~
419		~~- for line in fh:~~
420		~~- line = line.strip()~~
421		~~- yield line~~
422		~~- fh.close()~~
423		-
424		-
425		~~-def setup(storage):~~
426		~~- keyspace_name = 'enwiki'~~
427		~~- if storage == 'cassandra':~~
428		~~- cassandra.install_schema(keyspace_name, drop_first=True)~~
429		-
430		-
431		~~-def launcher():~~
432		-
433		~~- storage = 'csv'~~
434		~~- setup(storage)~~
435		~~- input_queue = JoinableQueue()~~
436		~~- result_queue = JoinableQueue()~~
437		~~- #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']~~
438		~~- files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']~~
439		-
440		~~- for file in files:~~
441		~~- input_queue.put(file)~~
442		-
443		~~- for x in xrange(cpu_count()):~~
444		~~- input_queue.put(None)~~
445		-
446		~~- extracters = [Process(target=stream_raw_xml, args=[input_queue, result_queue])~~
447		~~- for x in xrange(cpu_count())]~~
448		~~- for extracter in extracters:~~
449		~~- extracter.start()~~
450		-
451		~~- creators = [Process(target=create_variables, args=[result_queue, storage, x])~~
452		~~- for x in xrange(cpu_count())]~~
453		~~- for creator in creators:~~
454		~~- creator.start()~~
455		-
456		-
457		~~- input_queue.join()~~
458		~~- result_queue.join()~~
459		-
460		-
461		~~-if __name__ == '__main__':~~
462		~~- #debug()~~
463		~~- launcher()~~
	2	+#!/usr/bin/python
	3	+# -- coding: utf-8 --
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__email__ = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-02-06'
	19	+__version__ = '0.1'
	20	+
	21	+
	22	+import bz2
	23	+import cStringIO
	24	+import hashlib
	25	+import codecs
	26	+import re
	27	+import sys
	28	+import progressbar
	29	+from multiprocessing import JoinableQueue, Process, cpu_count, current_process
	30	+from xml.etree.cElementTree import fromstring, iterparse
	31	+from collections import deque
	32	+
	33	+if '..' not in sys.path:
	34	+ sys.path.append('..')
	35	+
	36	+try:
	37	+ from database import cassandra
	38	+ import pycassa
	39	+
	40	+except ImportError:
	41	+ print 'I am not going to use Cassandra today, it\'s my off day.'
	42	+
	43	+
	44	+
	45	+from database import db
	46	+from bots import detector
	47	+from utils import file_utils
	48	+import extracter
	49	+
	50	+RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')
	51	+
	52	+NAMESPACE = {
	53	+ #0:'Main',
	54	+ #1:'Talk',
	55	+ #2:'User',
	56	+ #3:'User talk',
	57	+ 4:'Wikipedia',
	58	+ #5:'Wikipedia talk',
	59	+ 6:'File',
	60	+ #7:'File talk',
	61	+ 8:'MediaWiki',
	62	+ #9:'MediaWiki talk',
	63	+ 10:'Template',
	64	+ #11:'Template talk',
	65	+ 12:'Help',
	66	+ #13:'Help talk',
	67	+ 14:'Category',
	68	+ #15:'Category talk',
	69	+ 90:'Thread',
	70	+ #91:'Thread talk',
	71	+ 92:'Summary',
	72	+ #93:'Summary talk',
	73	+ 100:'Portal',
	74	+ #101:'Portal talk',
	75	+ 108:'Book',
	76	+ #109:'Book talk'
	77	+}
	78	+
	79	+
	80	+class Buffer:
	81	+ def __init__(self, storage, id):
	82	+ assert storage == 'cassandra' or storage == 'mongo' or storage == 'csv', \
	83	+ 'Valid storage options are cassandra and mongo.'
	84	+ self.storage = storage
	85	+ self.revisions = {}
	86	+ self.comments = {}
	87	+ self.id = id
	88	+ self.keyspace_name = 'enwiki'
	89	+ self.keys = ['revision_id', 'article_id', 'id', 'namespace',
	90	+ 'title', 'timestamp', 'hash', 'revert', 'bot', 'prev_size',
	91	+ 'cur_size', 'delta']
	92	+ self.setup_storage()
	93	+
	94	+ def setup_storage(self):
	95	+ if self.storage == 'cassandra':
	96	+ self.db = pycassa.connect(self.keyspace_name)
	97	+ self.collection = pycassa.ColumnFamily(self.db, 'revisions')
	98	+
	99	+ elif self.storage == 'mongo':
	100	+ self.db = db.init_mongo_db(self.keyspace_name)
	101	+ self.collection = self.db['kaggle']
	102	+
	103	+ else:
	104	+ kaggle_file = 'kaggle_%s.csv' % self.id
	105	+ comment_file = 'kaggle_comments_%s.csv' % self.id
	106	+ file_utils.delete_file('', kaggle_file, directory=False)
	107	+ file_utils.delete_file('', comment_file, directory=False)
	108	+ self.fh_main = codecs.open(kaggle_file, 'a', 'utf-8')
	109	+ self.fh_extra = codecs.open(comment_file, 'a', 'utf-8')
	110	+
	111	+ def add(self, revision):
	112	+ self.stringify(revision)
	113	+ id = revision['revision_id']
	114	+ self.revisions[id] = revision
	115	+ if len(self.revisions) == 1000:
	116	+ self.store()
	117	+ self.clear()
	118	+
	119	+ def stringify(self, revision):
	120	+ for key, value in revision.iteritems():
	121	+ try:
	122	+ value = str(value)
	123	+ except UnicodeEncodeError:
	124	+ value = value.encode('utf-8')
	125	+ revision[key] = value
	126	+
	127	+ def empty(self):
	128	+ self.store()
	129	+ self.clear()
	130	+ if self.storage == 'csv':
	131	+ self.fh_main.close()
	132	+ self.fh_extra.close()
	133	+
	134	+ def clear(self):
	135	+ self.revisions = {}
	136	+ self.comments = {}
	137	+
	138	+ def store(self):
	139	+ if self.storage == 'cassandra':
	140	+ self.collection.batch_insert(self.revisions)
	141	+ elif self.storage == 'mongo':
	142	+ print 'insert into mongo'
	143	+ else:
	144	+ for revision in self.revisions.itervalues():
	145	+ values = []
	146	+ for key in self.keys:
	147	+ values.append(revision[key].decode('utf-8'))
	148	+
	149	+ value = '\t'.join(values) + '\n'
	150	+ row = '\t'.join([key, value])
	151	+ self.fh_main.write(row)
	152	+
	153	+ for revision_id, comment in self.comments.iteritems():
	154	+ comment = comment.decode('utf-8')
	155	+ row = '\t'.join([revision_id, comment]) + '\n'
	156	+ self.fh_extra.write(row)
	157	+
	158	+
	159	+def extract_categories():
	160	+ '''
	161	+ Field 1: page id
	162	+ Field 2: name category
	163	+ Field 3: sort key
	164	+ Field 4: timestamp last change
	165	+ '''
	166	+ filename = 'C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-categorylinks.sql'
	167	+ output = codecs.open('categories.csv', 'w', encoding='utf-8')
	168	+ fh = codecs.open(filename, 'r', encoding='utf-8')
	169	+
	170	+ try:
	171	+ for line in fh:
	172	+ if line.startswith('INSERT INTO `categorylinks` VALUES ('):
	173	+ line = line.replace('INSERT INTO `categorylinks` VALUES (', '')
	174	+ line = line.replace("'", '')
	175	+ categories = line.split('),(')
	176	+ for category in categories:
	177	+ category = category.split(',')
	178	+ if len(category) == 4:
	179	+ output.write('%s\t%s\n' % (category[0], category[1]))
	180	+ except UnicodeDecodeError, e:
	181	+ print e
	182	+
	183	+ output.close()
	184	+ fh.close()
	185	+
	186	+
	187	+def extract_revision_text(revision):
	188	+ rev = revision.find('text')
	189	+ if rev != None:
	190	+ if rev.text == None:
	191	+ rev = fix_revision_text(revision)
	192	+ return rev.text.encode('utf-8')
	193	+ else:
	194	+ return ''
	195	+
	196	+
	197	+def fix_revision_text(revision):
	198	+ if revision.text == None:
	199	+ revision.text = ''
	200	+ return revision
	201	+
	202	+
	203	+def create_md5hash(text):
	204	+ hash = {}
	205	+ if text != None:
	206	+ m = hashlib.md5()
	207	+ m.update(text)
	208	+ #echo m.digest()
	209	+ hash['hash'] = m.hexdigest()
	210	+ else:
	211	+ hash['hash'] = -1
	212	+ return hash
	213	+
	214	+
	215	+def calculate_delta_article_size(size, text):
	216	+ if 'prev_size' not in size:
	217	+ size['prev_size'] = 0
	218	+ size['cur_size'] = len(text)
	219	+ size['delta'] = len(text)
	220	+ else:
	221	+ size['prev_size'] = size['cur_size']
	222	+ delta = len(text) - size['prev_size']
	223	+ size['cur_size'] = len(text)
	224	+ size['delta'] = delta
	225	+ return size
	226	+
	227	+
	228	+def parse_contributor(contributor, bots):
	229	+ username = extracter.extract_username(contributor)
	230	+ user_id = extracter.extract_contributor_id(contributor)
	231	+ bot = extracter.determine_username_is_bot(contributor, bots=bots)
	232	+ contributor = {}
	233	+ contributor['username'] = username
	234	+ contributor['bot'] = bot
	235	+ if user_id != None:
	236	+ contributor.update(user_id)
	237	+ else:
	238	+ contributor = False
	239	+ return contributor
	240	+
	241	+
	242	+def determine_namespace(title):
	243	+ namespaces = {'User': 2,
	244	+ 'Talk': 1,
	245	+ 'User Talk': 3,
	246	+ }
	247	+ ns = {}
	248	+ if title.text != None:
	249	+ title = title.text
	250	+ for namespace in namespaces:
	251	+ if title.startswith(namespace):
	252	+ ns['namespace'] = namespaces[namespace]
	253	+ if ns == {}:
	254	+ for namespace in NAMESPACE:
	255	+ if title.startswith(namespace):
	256	+ ns = False #article does not belong to either the main namespace, user, talk or user talk namespace.
	257	+ break
	258	+ ns['namespace'] = 0
	259	+ else:
	260	+ ns = False
	261	+ return ns
	262	+
	263	+
	264	+def prefill_row(title, article_id, namespace):
	265	+ row = {}
	266	+ row['title'] = title.text
	267	+ row['article_id'] = article_id
	268	+ row.update(namespace)
	269	+ return row
	270	+
	271	+
	272	+def is_revision_reverted(hash_cur, hashes):
	273	+ revert = {}
	274	+ if hash_cur in hashes:
	275	+ revert['revert'] = 1
	276	+ else:
	277	+ revert['revert'] = 0
	278	+ return revert
	279	+
	280	+
	281	+def create_variables(result_queue, storage, id):
	282	+ bots = detector.retrieve_bots('en')
	283	+ buffer = Buffer(storage, id)
	284	+ i = 0
	285	+ while True:
	286	+ article = result_queue.get(block=True)
	287	+ result_queue.task_done()
	288	+ if article == None:
	289	+ break
	290	+ i += 1
	291	+ #article = fromstring(article)
	292	+ title = article['title'].text
	293	+ namespace = determine_namespace(title)
	294	+ if namespace != False:
	295	+ #revisions = article.findall('revision')
	296	+ article_id = article['id'].text
	297	+ hashes = deque(maxlen=1000)
	298	+ size = {}
	299	+ for revision in article['revision']:
	300	+ if revision == None:
	301	+ #the entire revision is empty, weird.
	302	+ continue
	303	+
	304	+ contributor = revision.find('contributor')
	305	+ contributor = parse_contributor(contributor, bots)
	306	+ if not contributor:
	307	+ #editor is anonymous, ignore
	308	+ continue
	309	+
	310	+ revision_id = revision.find('id')
	311	+ revision_id = extracter.extract_revision_id(revision_id)
	312	+ if revision_id == None:
	313	+ #revision_id is missing, which is weird
	314	+ continue
	315	+
	316	+ row = prefill_row(title, article_id, namespace)
	317	+ row['revision_id'] = revision_id
	318	+ text = extract_revision_text(revision)
	319	+ row.update(contributor)
	320	+
	321	+
	322	+ timestamp = revision.find('timestamp').text
	323	+ row['timestamp'] = timestamp
	324	+
	325	+ hash = create_md5hash(text)
	326	+ revert = is_revision_reverted(hash['hash'], hashes)
	327	+ hashes.append(hash['hash'])
	328	+ size = calculate_delta_article_size(size, text)
	329	+
	330	+ row.update(hash)
	331	+ row.update(size)
	332	+ row.update(revert)
	333	+ # print row
	334	+ # if row['username'] == None:
	335	+ # contributor = revision.find('contributor')
	336	+ # attrs = contributor.getchildren()
	337	+ # for attr in attrs:
	338	+ # print attr.text
	339	+ #print revision_id, hash, delta, prev_size\
	340	+
	341	+ buffer.add(row)
	342	+ if i % 10000 == 0:
	343	+ print 'Parsed %s articles' % i
	344	+# except ValueError, e:
	345	+# print e
	346	+# except UnicodeDecodeError, e:
	347	+# print e
	348	+ buffer.empty()
	349	+ print 'Buffer is empty'
	350	+
	351	+
	352	+def parse_xml(source, result_queue):
	353	+ context = iterparse(source, events=('end',))
	354	+ context = iter(context)
	355	+ event, root = context.next()
	356	+
	357	+ article = {}
	358	+ id = False
	359	+ for event, elem in context:
	360	+ if event == 'end' and elem.tag == 'revision':
	361	+ article[elem.tag] = elem
	362	+ elif event == 'end' and elem.tag == 'id' and id == False:
	363	+ article[elem.tag] = elem
	364	+ id = True
	365	+ article[root.tag] = root
	366	+ result_queue.put(article)
	367	+ root.clear()
	368	+
	369	+
	370	+def stream_raw_xml(input_queue, result_queue):
	371	+ buffer = cStringIO.StringIO()
	372	+ parsing = False
	373	+
	374	+ while True:
	375	+ filename = input_queue.get()
	376	+ input_queue.task_done()
	377	+ if filename == None:
	378	+ break
	379	+
	380	+ #filesize = file_utils.determine_filesize('', filename)
	381	+ #pbar = progressbar.ProgressBar().start()
	382	+
	383	+ for data in unzip(filename):
	384	+ if data.startswith('<page>'):
	385	+ parsing = True
	386	+ if parsing:
	387	+ buffer.write(data)
	388	+ buffer.write('\n')
	389	+ if data == '</page>':
	390	+ buffer.seek(0)
	391	+ parse_xml(buffer, result_queue)
	392	+ buffer = cStringIO.StringIO()
	393	+ #pbar.update(pbar.currval + len(data)) #is inaccurate!!!
	394	+
	395	+
	396	+ for x in xrange(cpu_count()):
	397	+ result_queue.put(None)
	398	+ print 'Finished parsing bz2 archives'
	399	+
	400	+
	401	+def debug():
	402	+ input_queue = JoinableQueue()
	403	+ result_queue = JoinableQueue()
	404	+ files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
	405	+
	406	+ for file in files:
	407	+ input_queue.put(file)
	408	+
	409	+ stream_raw_xml(input_queue, result_queue)
	410	+
	411	+
	412	+def unzip(filename):
	413	+ '''
	414	+ Filename should be a fully qualified path to the bz2 file that will be
	415	+ decompressed. It will iterate line by line and yield this back to
	416	+ create_article
	417	+ '''
	418	+ fh = bz2.BZ2File(filename, 'r')
	419	+ for line in fh:
	420	+ line = line.strip()
	421	+ yield line
	422	+ fh.close()
	423	+
	424	+
	425	+def setup(storage):
	426	+ keyspace_name = 'enwiki'
	427	+ if storage == 'cassandra':
	428	+ cassandra.install_schema(keyspace_name, drop_first=True)
	429	+
	430	+
	431	+def launcher():
	432	+
	433	+ storage = 'csv'
	434	+ setup(storage)
	435	+ input_queue = JoinableQueue()
	436	+ result_queue = JoinableQueue()
	437	+ #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
	438	+ files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']
	439	+
	440	+ for file in files:
	441	+ input_queue.put(file)
	442	+
	443	+ for x in xrange(cpu_count()):
	444	+ input_queue.put(None)
	445	+
	446	+ extracters = [Process(target=stream_raw_xml, args=[input_queue, result_queue])
	447	+ for x in xrange(cpu_count())]
	448	+ for extracter in extracters:
	449	+ extracter.start()
	450	+
	451	+ creators = [Process(target=create_variables, args=[result_queue, storage, x])
	452	+ for x in xrange(cpu_count())]
	453	+ for creator in creators:
	454	+ creator.start()
	455	+
	456	+
	457	+ input_queue.join()
	458	+ result_queue.join()
	459	+
	460	+
	461	+if __name__ == '__main__':
	462	+ #debug()
	463	+ launcher()
Property changes on: trunk/tools/editor_trends/etl/enricher.py
___________________________________________________________________
Added: svn:eol-style
464	464	+ native
Property changes on: trunk/tools/editor_trends/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
465	465	- text/plain
Property changes on: trunk/tools/editor_trends/classes/consumers.py
___________________________________________________________________
Deleted: svn:mime-type
466	466	- text/plain
Property changes on: trunk/tools/editor_trends/classes/runtime_settings.py
___________________________________________________________________
Deleted: svn:mime-type
467	467	- text/plain
Property changes on: trunk/tools/editor_trends/utils/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
468	468	- text/plain
Property changes on: trunk/tools/editor_trends/utils/file_utils.py
___________________________________________________________________
Deleted: svn:mime-type
469	469	- text/plain
Property changes on: trunk/tools/editor_trends/utils/http_utils.py
___________________________________________________________________
Deleted: svn:mime-type
470	470	- text/plain
Property changes on: trunk/tools/editor_trends/utils/ordered_dict.py
___________________________________________________________________
Deleted: svn:mime-type
471	471	- text/plain
Property changes on: trunk/tools/editor_trends/database/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
472	472	- text/plain
Property changes on: trunk/tools/editor_trends/database/cache.py
___________________________________________________________________
Deleted: svn:mime-type
473	473	- text/plain
Property changes on: trunk/tools/editor_trends/database/db.py
___________________________________________________________________
Deleted: svn:mime-type
474	474	- text/plain
Index: trunk/tools/editor_trends/mapreduce/__init__.py
—	—	@@ -1 +1 @@
2		-
	2	+
Property changes on: trunk/tools/editor_trends/mapreduce/__init__.py
___________________________________________________________________
Added: svn:eol-style
3	3	+ native
Property changes on: trunk/tools/editor_trends/bots/detector.py
___________________________________________________________________
Deleted: svn:mime-type
4	4	- text/plain
Property changes on: trunk/tools/editor_trends/bots/__init__.py
___________________________________________________________________
Deleted: svn:mime-type
5	5	- text/plain
Property changes on: trunk/tools/editor_trends/code-snippets/chunker.py
___________________________________________________________________
Deleted: svn:mime-type
6	6	- text/plain
Property changes on: trunk/tools/editor_trends/code-snippets/exporter.py
___________________________________________________________________
Deleted: svn:mime-type
7	7	- text/plain
Property changes on: trunk/tools/editor_trends/code-snippets/process_constructor.py
___________________________________________________________________
Deleted: svn:mime-type
8	8	- text/plain

Status & tagging log

00:15, 22 March 2011 Reedy (talk | contribs) changed the status of r84505 [removed: new added: deferred]