r81236 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r81235 | r81236 | r81237 >
Date:	06:45, 31 January 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Sync
Modified paths:	/trunk/tools/editor_trends/analyses/analyzer.py (modified) (history) /trunk/tools/editor_trends/analyses/count_editors.py (deleted) (history) /trunk/tools/editor_trends/analyses/plugins/edit_patterns.py (added) (history) /trunk/tools/editor_trends/database/cache.py (modified) (history) /trunk/tools/editor_trends/run.py (deleted) (history)

Diff [purge]

Index: trunk/tools/editor_trends/run.py
—	—	@@ -1,82 +0,0 @@
2		~~-import os~~
3		~~-import configuration~~
4		~~-settings = configuration.Settings()~~
5		~~-#from utils import namespace_downloader as nd~~
6		~~-#nd.launch_downloader()~~
7		-
8		-
9		~~-#def which(program):~~
10		~~-# import os~~
11		~~-# def is_exe(fpath):~~
12		~~-# return os.path.exists(fpath) and os.access(fpath, os.X_OK)~~
13		-#
14		~~-# fpath, fname = os.path.split(program)~~
15		~~-# if fpath:~~
16		~~-# if is_exe(program):~~
17		~~-# return program~~
18		~~-# else:~~
19		~~-# for path in os.environ["PATH"].split(os.pathsep):~~
20		~~-# exe_file = os.path.join(path, program)~~
21		~~-# if is_exe(exe_file):~~
22		~~-# return exe_file~~
23		-#
24		~~-# return None~~
25		-#
26		-#
27		~~-#result = which('7z.exe')~~
28		~~-#print result~~
29		-
30		~~-#from database import launcher~~
31		~~-#launcher.launcher()~~
32		~~-from utils import sort~~
33		~~-input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')~~
34		~~-output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')~~
35		~~-dbname = 'enwiki'~~
36		~~-#sort.debug_mergesort_feeder(input, output)~~
37		~~-#sort.mergesort_launcher(input, output)~~
38		~~-#sort.mergesort_external_launcher(dbname, output, output)~~
39		-
40		-
41		-
42		-
43		-
44		~~-from analyses import cohort_charts~~
45		~~-cohort_charts.prepare_cohort_dataset()~~
46		~~-import os~~
47		-
48		~~-import configuration~~
49		~~-settings = configuration.Settings()~~
50		~~-#from utils import namespace_downloader as nd~~
51		~~-#nd.launch_downloader()~~
52		-
53		-
54		~~-#def which(program):~~
55		~~-# import os~~
56		~~-# def is_exe(fpath):~~
57		~~-# return os.path.exists(fpath) and os.access(fpath, os.X_OK)~~
58		-#
59		~~-# fpath, fname = os.path.split(program)~~
60		~~-# if fpath:~~
61		~~-# if is_exe(program):~~
62		~~-# return program~~
63		~~-# else:~~
64		~~-# for path in os.environ["PATH"].split(os.pathsep):~~
65		~~-# exe_file = os.path.join(path, program)~~
66		~~-# if is_exe(exe_file):~~
67		~~-# return exe_file~~
68		-#
69		~~-# return None~~
70		-#
71		-#
72		~~-#result = which('7z.exe')~~
73		~~-#print result~~
74		-
75		~~-#from database import launcher~~
76		~~-#launcher.launcher()~~
77		~~-from etl import loader~~
78		~~-input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')~~
79		~~-output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')~~
80		~~-dbname = 'enwiki'~~
81		~~-#sort.debug_mergesort_feeder(input, output)~~
82		~~-#sort.mergesort_launcher(input, output)~~
83		~~-loader.mergesort_external_launcher(dbname, output, output)~~
\ No newline at end of file
Index: trunk/tools/editor_trends/analyses/count_editors.py
—	—	@@ -1,251 +0,0 @@
2		~~-#!/usr/bin/python~~
3		~~-# -- coding: utf-8 --~~
4		~~-'''~~
5		~~-Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)~~
6		~~-This program is free software; you can redistribute it and/or~~
7		~~-modify it under the terms of the GNU General Public License version 2~~
8		~~-as published by the Free Software Foundation.~~
9		~~-This program is distributed in the hope that it will be useful,~~
10		~~-but WITHOUT ANY WARRANTY; without even the implied warranty of~~
11		~~-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.~~
12		~~-See the GNU General Public License for more details, at~~
13		~~-http://www.fsf.org/licenses/gpl.html~~
14		~~-'''~~
15		-
16		~~-__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])~~
17		~~-__email__ = 'dvanliere at gmail dot com'~~
18		~~-__date__ = '2010-12-10'~~
19		~~-__version__ = '0.1'~~
20		-
21		~~-import datetime~~
22		~~-import multiprocessing~~
23		~~-import calendar~~
24		~~-import sys~~
25		~~-import os~~
26		~~-import progressbar~~
27		~~-import types~~
28		~~-from dateutil.relativedelta import relativedelta~~
29		-
30		~~-sys.path.append('..')~~
31		-
32		~~-import configuration~~
33		~~-settings = configuration.Settings()~~
34		~~-from database import db~~
35		~~-from etl import shaper~~
36		~~-from utils import file_utils~~
37		~~-from utils import timer~~
38		~~-from utils import messages~~
39		~~-from utils import log~~
40		~~-import dataset~~
41		-
42		~~-def available_analyses(caller='manage'):~~
43		~~- '''~~
44		~~- Generates a dictionary:~~
45		~~- key: name of analysis~~
46		~~- value: function that generates the dataset~~
47		~~- ignore: a list of functions that should never be called from manage.py,~~
48		~~- they are not valid entry points.~~
49		~~- '''~~
50		~~- assert caller == 'django' or caller == 'manage'~~
51		~~- ignore = ['analyses', 'determine_project_year_range', 'create_windows',~~
52		~~- 'generate_chart_data', 'loop_editors']~~
53		~~- functions = {}~~
54		~~- for func in globals():~~
55		~~- func = globals()[func]~~
56		~~- if isinstance(func, types.FunctionType) and func.func_name not in ignore:~~
57		~~- functions[func.func_name] = func~~
58		~~- if caller == 'manage':~~
59		~~- return functions~~
60		~~- elif caller == 'django':~~
61		~~- django_functions = []~~
62		~~- for function in functions:~~
63		~~- fancy_name = function.replace('_', ' ').title()~~
64		~~- django_functions.append((function, fancy_name))~~
65		-
66		~~- return django_functions~~
67		-
68		-
69		~~-def determine_project_year_range(dbname, collection, var):~~
70		~~- '''~~
71		~~- Determine the first and final year for the observed data~~
72		~~- '''~~
73		~~- max_year = db.run_query(dbname, collection, var, 'max')~~
74		~~- max_year = max_year[var].year + 1~~
75		~~- min_year = db.run_query(dbname, collection, var, 'min')~~
76		~~- min_year = min_year[var].year~~
77		~~- return min_year, max_year~~
78		-
79		-
80		~~-def create_windows(var, break_down_first_year=True):~~
81		~~- '''~~
82		~~- This function creates a list of months. If break_down_first_year = True then~~
83		~~- the first year will be split in 3, 6, 9 months as well.~~
84		~~- '''~~
85		~~- years = var.max_year - var.min_year~~
86		~~- windows = [y * 12 for y in xrange(1, years)]~~
87		~~- if break_down_first_year:~~
88		~~- windows = [3, 6, 9] + windows~~
89		~~- return windows~~
90		-
91		-
92		~~-def generate_chart_data(project, collection, language_code, func, **kwargs):~~
93		~~- '''~~
94		~~- This is the entry function to be called to generate data for creating charts.~~
95		~~- '''~~
96		~~- stopwatch = timer.Timer()~~
97		~~- dbname = '%s%s' % (language_code, project)~~
98		~~- print 'Exporting data for chart: %s' % func.func_name~~
99		~~- print 'Project: %s' % dbname~~
100		~~- print 'Dataset: %s' % collection~~
101		~~- ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)~~
102		~~- file = '%s_%s.csv' % (dbname, func.func_name)~~
103		~~- print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)~~
104		~~- ds.write(format='csv')~~
105		~~- print 'Serializing dataset to %s_%s' % (dbname, 'charts')~~
106		~~- log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')~~
107		~~- ds.write(format='mongo')~~
108		~~- stopwatch.elapsed()~~
109		~~- log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')~~
110		-
111		-
112		~~-def loop_editors(dbname, project, collection, language_code, func, **kwargs):~~
113		~~- '''~~
114		~~- Generic loop function that loops over all the editors of a Wikipedia project~~
115		~~- and then calls the function that does the actual aggregation.~~
116		~~- '''~~
117		-
118		~~- editors = db.retrieve_distinct_keys(dbname, collection, 'editor')~~
119		~~- pbar = progressbar.ProgressBar(maxval=len(editors)).start()~~
120		~~- min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')~~
121		~~- print 'Number of editors: %s' % len(editors)~~
122		~~- mongo = db.init_mongo_db(dbname)~~
123		~~- coll = mongo[collection]~~
124		~~- format = kwargs.pop('format', 'long')~~
125		~~- kwargs['min_year'] = min_year~~
126		~~- kwargs['max_year'] = max_year~~
127		~~- vars = []~~
128		~~- ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format)~~
129		~~- var = dataset.Variable('count', **kwargs)~~
130		~~-# cutoff=cutoff,~~
131		~~-# cum_cutoff=cum_cutoff,~~
132		~~-# min_year=min_year,~~
133		~~-# max_year=max_year)~~
134		~~- for editor in editors:~~
135		~~- editor = coll.find_one({'editor': editor})~~
136		~~- data = func(var, editor, dbname=dbname)~~
137		~~- pbar.update(pbar.currval + 1)~~
138		-
139		~~- ds.add_variable(var)~~
140		~~- return ds~~
141		-
142		-
143		~~-def cohort_dataset_forward_histogram(var, editor, **kwargs):~~
144		~~-# headers = ['year', 'month', 'edits']~~
145		~~- new_wikipedian = editor['new_wikipedian']~~
146		~~- final_edit = editor['final_edit']~~
147		~~- yearly_edits = editor['edits_by_year']~~
148		~~- n = editor['edit_count']~~
149		-
150		~~- if n >= var.cum_cutoff:~~
151		~~- for i, year in enumerate(xrange(new_wikipedian.year, final_edit.year)):~~
152		~~- edits = editor['monthly_edits'].get(str(year), {0:0})~~
153		~~- if year == new_wikipedian.year:~~
154		~~- start = new_wikipedian.month~~
155		~~- else:~~
156		~~- start = 1~~
157		~~- for month in xrange(start, 13):~~
158		~~- if edits.get(str(month), 0) >= var.cutoff:~~
159		~~- experience = i * 12 + (month - new_wikipedian.month)~~
160		~~- var.add(new_wikipedian, {experience: 1})~~
161		~~- return var~~
162		-
163		-
164		~~-def cohort_dataset_backward_bar(var, editor, **kwargs):~~
165		~~- #first_edit = editor['first_edit']~~
166		~~- new_wikipedian = editor['new_wikipedian']~~
167		~~- n = editor['edit_count']~~
168		-
169		~~- if n >= var.cum_cutoff:~~
170		~~- windows = create_windows(var, break_down_first_year=False)~~
171		~~- for year in xrange(new_wikipedian.year, var.max_year):~~
172		~~- year = str(year)~~
173		~~- if editor['edits_by_year'][year] >= var.cutoff:~~
174		~~- last_edit = editor['last_edit_by_year'][year]~~
175		~~- if last_edit != 0.0:~~
176		~~- editor_dt = relativedelta(last_edit, new_wikipedian)~~
177		~~- editor_dt = (editor_dt.years * 12) + editor_dt.months~~
178		~~- for w in windows:~~
179		~~- if w >= editor_dt:~~
180		~~- datum = datetime.datetime(int(year), 12, 31)~~
181		~~- var.add(datum, {w:1})~~
182		~~- break~~
183		~~- return var~~
184		-
185		-
186		~~-def cohort_dataset_forward_bar(var, editor, **kwargs):~~
187		~~- new_wikipedian = editor['new_wikipedian']~~
188		~~- last_edit = editor['final_edit']~~
189		~~- monthly_edits = editor['monthly_edits']~~
190		~~- yearly_edits = editor['edits_by_year']~~
191		~~- n = editor['edit_count']~~
192		-
193		~~- if n >= var.cum_cutoff:~~
194		~~- for year in xrange(new_wikipedian.year, var.max_year):~~
195		~~- max_edits = max(monthly_edits.get(str(year), {0:0}).values())~~
196		~~- if yearly_edits.get(str(year), 0) == 0 or max_edits < var.cutoff:~~
197		~~- continue~~
198		~~- else:~~
199		~~- experience = (year - new_wikipedian.year) + 1~~
200		~~- var.add(new_wikipedian, {experience: 1 })~~
201		~~- return var~~
202		-
203		-
204		~~-def new_editor_count(var, editor, **kwargs):~~
205		~~- '''~~
206		~~- Summary: This function generates an overview of the number of~~
207		~~- new_wikipedians for a given year / month combination.~~
208		~~- Purpose: This data can be used to compare with Erik Zachte's~~
209		~~- stats.download.org to make sure that we are using the same numbers.~~
210		~~- '''~~
211		~~-# headers = ['year', 'month', 'count']~~
212		~~- new_wikipedian = editor['new_wikipedian']~~
213		~~- var.add(new_wikipedian, {0:1})~~
214		~~- return var~~
215		-
216		-
217		~~-def active_editor_count(var, editor, **kwargs):~~
218		~~- monthly_edits = editor['monthly_edits']~~
219		~~- for year in xrange(ds.count.min_year, var.max_year):~~
220		~~- for month in xrange(1, 13):~~
221		~~- if monthly_edits[str(year)][str(month)] >= var.cutoff:~~
222		~~- datum = datetime.date(year, month, 1)~~
223		~~- var.add(datum, {0:1})~~
224		~~- return var~~
225		-
226		-
227		~~-def histogram_edits(var, editor, **kwargs):~~
228		~~-# headers = ['year', 'num_edits', 'frequency']~~
229		~~- cnt = editor['edit_count']~~
230		~~- new_wikipedian = editor['new_wikipedian']~~
231		~~- var.add(new_wikipedian, {0: cnt})~~
232		~~- return var~~
233		-
234		-
235		~~-def time_to_new_wikipedian(var, editor, **kwargs):~~
236		~~-# headers = ['year', 'time_to_new_wikipedian']~~
237		~~- new_wikipedian = editor['new_wikipedian']~~
238		~~- first_edit = editor['first_edit']~~
239		~~- dt = new_wikipedian - first_edit~~
240		~~- var.add(new_wikipedian, {0:dt.days}, update=False)~~
241		~~- return var~~
242		-
243		-
244		~~-if __name__ == '__main__':~~
245		~~- generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_histogram, time_unit='month', cutoff=1, cum_cutoff=50)~~
246		~~- generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')~~
247		~~- generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')~~
248		~~- #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0)~~
249		~~- #generate_chart_data('wiki', 'editors_dataset','en', time_to_new_wikipedian, time_unit='year', cutoff=0)~~
250		~~- #generate_chart_data('wiki', 'editors_dataset','en', new_editor_count, time_unit='month', cutoff=0)~~
251		-
252		~~- #analyses()~~
Index: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py
—	—	@@ -0,0 +1,47 @@
	2	+#!/usr/bin/python
	3	+# -*- coding: utf-8 -*-
	4	+'''
	5	+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
	6	+This program is free software; you can redistribute it and/or
	7	+modify it under the terms of the GNU General Public License version 2
	8	+as published by the Free Software Foundation.
	9	+This program is distributed in the hope that it will be useful,
	10	+but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	12	+See the GNU General Public License for more details, at
	13	+http://www.fsf.org/licenses/gpl.html
	14	+'''
	15	+
	16	+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
	17	+__author__email = 'dvanliere at gmail dot com'
	18	+__date__ = '2011-01-28'
	19	+__version__ = '0.1'
	20	+
	21	+import datetime
	22	+
	23	+def edit_patterns(var, editor, **kwargs):
	24	+ monthly = editor['monthly_edits']
	25	+ new_wikipedian = editor['new_wikipedian']
	26	+ final_edit = editor['final_edit']
	27	+ dt = final_edit - new_wikipedian
	28	+ if dt.days < 366:
	29	+ return var
	30	+
	31	+ m = 0
	32	+ obs = {}
	33	+ for year in xrange(new_wikipedian.year, new_wikipedian.year + 2):
	34	+ if m == 12:
	35	+ break
	36	+ for month in xrange(new_wikipedian.month, 13):
	37	+ n = monthly[str(year)][str(month)]
	38	+ date = datetime.datetime(year, month, 1)
	39	+ if n >= var.cutoff:
	40	+ obs[m] = True
	41	+ else:
	42	+ obs[m] = False
	43	+ m += 1
	44	+ if m == 12:
	45	+ break
	46	+ if m == 12:
	47	+ var.add(date, obs, update=False)
	48	+ return var
Property changes on: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py
___________________________________________________________________
Added: svn:eol-style
1	49	+ native
Index: trunk/tools/editor_trends/analyses/analyzer.py
—	—	@@ -55,7 +55,7 @@
56	56	fn = '%s_%s.csv' % (dbname, func.func_name)
57	57
58	58	print 'Storing dataset: %s' % os.path.join(settings.dataset_location, fn)
59		~~- ds.write(format='csv')~~
	59	+ #ds.write(format='csv')
60	60
61	61	print 'Serializing dataset to %s_%s' % (dbname, 'charts')
62	62	log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
—	—	@@ -168,14 +168,14 @@
169	169
170	170	if __name__ == '__main__':
171	171	generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
172		~~- #generate_chart_data('wiki', 'editors_dataset','en', 'total_number_of_new_wikipedians', time_unit='year')~~
173		~~- #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')~~
174		~~- #generate_chart_data('wiki', 'editors_dataset','en', 'total_cumulative_edits', time_unit='year')~~
175		~~- #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)~~
176		~~- #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')~~
177		~~- #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')~~
178		~~- #generate_chart_data('wiki', 'editors_dataset','en', 'histogram_edits', time_unit='year', cutoff=0)~~
179		~~- #generate_chart_data('wiki', 'editors_dataset','en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)~~
180		~~- #generate_chart_data('wiki', 'editors_dataset','en', 'new_editor_count', time_unit='month', cutoff=0)~~
	172	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year')
	173	+ generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
	174	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year')
	175	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
	176	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
	177	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
	178	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0)
	179	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
	180	+ generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0)
181	181
182	182	#available_analyses()
Index: trunk/tools/editor_trends/database/cache.py
—	—	@@ -76,8 +76,8 @@
77	77
78	78	def insert(self, editor, values, username):
79	79	'''
80		~~- Adding the safe=True statement slows down the insert process but this assures that all data~~
81		~~- will be written.~~
	80	+ Adding the safe=True statement slows down the insert process but this
	81	+ assures that all data will be written.
82	82	'''
83	83	try:
84	84	self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True)

Status & tagging log

07:24, 31 January 2011 Reedy (talk | contribs) changed the status of r81236 [removed: new added: deferred]