r81005 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r81004 | r81005 | r81006 >
Date:23:09, 25 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added some initial documentation.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -22,7 +22,6 @@
2323 import calendar
2424 import sys
2525 import os
26 -import inspect
2726 import progressbar
2827 import types
2928 from dateutil.relativedelta import relativedelta
@@ -41,6 +40,66 @@
4241 import dataset
4342
4443
 44+def generate_chart_data(project, collection, language_code, func, **kwargs):
 45+ '''
 46+ This is the entry function to be called to generate data for creating charts.
 47+ '''
 48+ stopwatch = timer.Timer()
 49+ res = True
 50+ dbname = '%s%s' % (language_code, project)
 51+ functions = available_analyses()
 52+ try:
 53+ func = functions[func]
 54+ except KeyError:
 55+ return False
 56+
 57+ print 'Exporting data for chart: %s' % func.func_name
 58+ print 'Project: %s' % dbname
 59+ print 'Dataset: %s' % collection
 60+ ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)
 61+ file = '%s_%s.csv' % (dbname, func.func_name)
 62+ print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)
 63+ ds.write(format='csv')
 64+ print 'Serializing dataset to %s_%s' % (dbname, 'charts')
 65+ log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
 66+ ds.write(format='mongo')
 67+ stopwatch.elapsed()
 68+ log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
 69+
 70+ return res
 71+
 72+
 73+def loop_editors(dbname, project, collection, language_code, func, **kwargs):
 74+ '''
 75+ Generic loop function that loops over all the editors of a Wikipedia project
 76+ and then calls the function that does the actual aggregation.
 77+ '''
 78+
 79+ editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
 80+
 81+ pbar = progressbar.ProgressBar(maxval=len(editors)).start()
 82+ min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
 83+ print 'Number of editors: %s' % len(editors)
 84+ mongo = db.init_mongo_db(dbname)
 85+ coll = mongo[collection]
 86+ format = kwargs.pop('format', 'long')
 87+ kwargs['min_year'] = min_year
 88+ kwargs['max_year'] = max_year
 89+ vars = []
 90+ ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format)
 91+ var = dataset.Variable('count', **kwargs)
 92+
 93+
 94+
 95+ for editor in editors:
 96+ editor = coll.find_one({'editor': editor})
 97+ data = func(var, editor, dbname=dbname)
 98+ pbar.update(pbar.currval + 1)
 99+
 100+ ds.add_variable(var)
 101+ return ds
 102+
 103+
45104 def available_analyses(caller='manage'):
46105 '''
47106 Generates a dictionary:
@@ -53,8 +112,8 @@
54113 ignore = ['__init__']
55114 functions = {}
56115
57 - fn = '%s.py' % inspect.getmodulename(__file__)
58 - loc = __file__.replace(fn, '')
 116+ pos = __file__.rfind(os.sep)
 117+ loc = __file__[:pos]
59118 path = os.path.join(loc , 'plugins')
60119 plugins = import_libs(path)
61120
@@ -73,9 +132,9 @@
74133
75134
76135 def import_libs(path):
77 - """
 136+ '''
78137 Dynamically importing functions from the plugins directory.
79 - """
 138+ '''
80139
81140 library_list = []
82141 sys.path.append(path)
@@ -112,58 +171,6 @@
113172 return windows
114173
115174
116 -def generate_chart_data(project, collection, language_code, func, **kwargs):
117 - '''
118 - This is the entry function to be called to generate data for creating charts.
119 - '''
120 - stopwatch = timer.Timer()
121 - dbname = '%s%s' % (language_code, project)
122 - print 'Exporting data for chart: %s' % func
123 - print 'Project: %s' % dbname
124 - print 'Dataset: %s' % collection
125 - ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)
126 - file = '%s_%s.csv' % (dbname, func.func_name)
127 - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)
128 - ds.write(format='csv')
129 - print 'Serializing dataset to %s_%s' % (dbname, 'charts')
130 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
131 - ds.write(format='mongo')
132 - stopwatch.elapsed()
133 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
134 -
135 -
136 -def loop_editors(dbname, project, collection, language_code, func, **kwargs):
137 - '''
138 - Generic loop function that loops over all the editors of a Wikipedia project
139 - and then calls the function that does the actual aggregation.
140 - '''
141 -
142 - editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
143 -
144 - pbar = progressbar.ProgressBar(maxval=len(editors)).start()
145 - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
146 - print 'Number of editors: %s' % len(editors)
147 - mongo = db.init_mongo_db(dbname)
148 - coll = mongo[collection]
149 - format = kwargs.pop('format', 'long')
150 - kwargs['min_year'] = min_year
151 - kwargs['max_year'] = max_year
152 - vars = []
153 - ds = dataset.Dataset(func, project, coll.name, language_code, vars, format=format)
154 - var = dataset.Variable('count', **kwargs)
155 -
156 - functions = available_analyses()
157 - func = functions[func]
158 -
159 - for editor in editors:
160 - editor = coll.find_one({'editor': editor})
161 - data = func(var, editor, dbname=dbname)
162 - pbar.update(pbar.currval + 1)
163 -
164 - ds.add_variable(var)
165 - return ds
166 -
167 -
168175 if __name__ == '__main__':
169176
170177 generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -34,6 +34,11 @@
3535 from database import db
3636
3737 class Transform(SONManipulator):
 38+ '''
 39+ This encoder transforms a Dataset to a MongoDB bson document.
 40+ To use this encoder initialize a mongo database instance and then add:
 41+ mongo.add_son_manipulator(Transform())
 42+ '''
3843 def transform_incoming(self, son, collection):
3944 for (key, ds) in son.items():
4045 son[key] = {}
@@ -67,6 +72,10 @@
6873
6974
7075 class Data:
 76+ '''
 77+ Some generic functions that are required by the Observation, Variable, and
 78+ Dataset classes.
 79+ '''
7180 def __hash__(self, date):
7281 #return hash(self.convert_date_to_epoch(date))
7382 return int(self.convert_date_to_epoch(date))
@@ -79,8 +88,6 @@
8089 kwargs[key] = d
8190 return kwargs
8291
83 -
84 -
8592 def convert_date_to_epoch(self, date):
8693 assert self.time_unit == 'year' or self.time_unit == 'month' \
8794 or self.time_unit == 'day'
@@ -96,8 +103,13 @@
97104
98105
99106 class Observation(Data):
 107+ '''
 108+ The smallest unit, here the actual data is being stored.
 109+ Time_unit should either be 'year', 'month' or 'day'.
 110+ '''
100111 def __init__(self, date, time_unit):
101112 assert isinstance(date, datetime.datetime)
 113+ assert time_unit == 'year' or time_unit == 'month' or time_unit == 'day'
102114 self.time_unit = time_unit
103115 self.t0 = self.set_start_date(date)
104116 self.t1 = self.set_end_date(date)
@@ -142,6 +154,11 @@
143155 return datetime.datetime(date.year, date.month, date.day)
144156
145157 def add(self, value, update):
 158+ '''
 159+ If update == True then data[i] will be incremented else data[i] will be
 160+ created, in that case make sure that i is unique. Update is useful for
 161+ tallying a variable.
 162+ '''
146163 if hasattr(value, '__iter__') == False:
147164 d = {}
148165 d[0] = value
@@ -209,57 +226,11 @@
210227 key = self.__hash__(date)
211228 return self.obs.get(key, Observation(date, self.time_unit))
212229
213 - def min(self):
214 - return min([obs for obs in self])
215 - #return min([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
216 -
217 - def max(self):
218 - return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
219 -
220 - def get_standard_deviation(self, number_list):
221 - mean = get_mean(number_list)
222 - std = 0
223 - n = len(number_list)
224 - for i in number_list:
225 - std = std + (i - mean) ** 2
226 - return math.sqrt(std / float(n - 1))
227 -
228 -
229 - def get_median(self, number_list):
230 - #print number_list
231 - if number_list == []: return '.'
232 - data = sorted(number_list)
233 - data = [float(x) for x in data]
234 - if len(data) % 2 == 1:
235 - return data[(len(data) + 1) / 2 - 1]
236 - else:
237 - lower = data[len(data) / 2 - 1]
238 - upper = data[len(data) / 2]
239 - #print upper, lower
240 - return (lower + upper) / 2
241 -
242 -
243 - def get_mean(self, number_list):
244 - #print number_list
245 - if number_list == []: return '.'
246 - float_nums = [float(x) for x in number_list]
247 - return sum(float_nums) / len(number_list)
248 -
249 - def summary(self):
250 - print 'Variable: %s' % self.name
251 - print 'Mean: %s' % self.get_mean(self)
252 - print 'Median: %s' % self.get_median(self)
253 - print 'Standard Deviation: %s' % self.get_standard_deviation(self)
254 - print 'Minimum: %s' % self.min()
255 - print 'Maximum: %s' % self.max()
256 -
257 -
258230 def add(self, date, value, update=True):
259231 data = self.get_observation(date)
260232 data.add(value, update)
261233 self.obs[data.hash] = data
262234
263 -
264235 def encode(self):
265236 bson = {}
266237 for prop in self.props:
@@ -327,7 +298,6 @@
328299 else:
329300 raise TypeError('You can only instance of Variable to a dataset.')
330301
331 -
332302 def write(self, format='csv'):
333303 if format == 'csv':
334304 self.to_csv()
@@ -357,56 +327,77 @@
358328 props[prop] = getattr(self, prop)
359329 return props
360330
 331+ def min(self):
 332+ return min([obs for obs in self])
 333+ #return min([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
361334
 335+ def max(self):
 336+ return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
362337
363 -# def transform_to_stacked_bar_json(self):
364 -# '''
365 -# This function outputs data in a format that is understood by jquery
366 -# flot plugin.
367 -# '''
368 -# options = {}
369 -# options['xaxis'] = {}
370 -# options['xaxis']['ticks'] = []
371 -# data = []
372 -# obs, all_keys = ds.convert_dataset_to_lists()
373 -#
374 -# for ob in obs:
375 -# d = {}
376 -# d['label'] = ob[0].year
377 -# d['data'] = []
378 -# ob = ob[1:]
379 -# for x, o in enumerate(ob):
380 -# d['data'].append([x, o])
381 -# data.append(d)
382 -# for x, date in enumerate(obs[0]):
383 -# options['xaxis']['ticks'].append([x, date.year])
384 -#
385 -# return data, options
 338+ def get_standard_deviation(self, number_list):
 339+ mean = get_mean(number_list)
 340+ std = 0
 341+ n = len(number_list)
 342+ for i in number_list:
 343+ std = std + (i - mean) ** 2
 344+ return math.sqrt(std / float(n - 1))
386345
387346
 347+ def get_median(self, number_list):
 348+ #print number_list
 349+ if number_list == []: return '.'
 350+ data = sorted(number_list)
 351+ data = [float(x) for x in data]
 352+ if len(data) % 2 == 1:
 353+ return data[(len(data) + 1) / 2 - 1]
 354+ else:
 355+ lower = data[len(data) / 2 - 1]
 356+ upper = data[len(data) / 2]
 357+ #print upper, lower
 358+ return (lower + upper) / 2
 359+
 360+
 361+ def get_mean(self, number_list):
 362+ #print number_list
 363+ if number_list == []: return '.'
 364+ float_nums = [float(x) for x in number_list]
 365+ return sum(float_nums) / len(number_list)
 366+
 367+ def summary(self):
 368+ print 'Variable: %s' % self.name
 369+ print 'Mean: %s' % self.get_mean(self)
 370+ print 'Median: %s' % self.get_median(self)
 371+ print 'Standard Deviation: %s' % self.get_standard_deviation(self)
 372+ print 'Minimum: %s' % self.min()
 373+ print 'Maximum: %s' % self.max()
 374+
 375+
def debug():
    '''
    Ad-hoc manual test of the Dataset / Variable / Transform machinery
    against a live local MongoDB ('enwiki' database).
    '''
    mongo = db.init_mongo_db('enwiki')
    rawdata = mongo['enwiki_charts']
    # Register the Transform SON manipulator so Dataset objects round-trip
    # through MongoDB as bson documents.
    mongo.add_son_manipulator(Transform())

    d1 = datetime.datetime.today()
    d2 = datetime.datetime(2007, 6, 7)
    # NOTE(review): loop_editors constructs Dataset with
    # (name, project, collection, language_code, vars, format=...); this call
    # passes one positional argument fewer — confirm the constructor accepts
    # both shapes.
    ds = Dataset('test', 'enwiki', 'editors_dataset', [
        {'name': 'count', 'time_unit': 'year'},
        {'name': 'testest', 'time_unit': 'year'}
    ])
    ds.count.add(d1, 5)
    ds.count.add(d2, 514)
    ds.testest.add(d1, 135)
    ds.testest.add(d2, 535)
    #ds.summary()
    # NOTE(review): the visible Dataset API is write(format='csv');
    # write_to_csv is not defined anywhere in this diff — verify it exists.
    ds.write_to_csv()
    v = Variable('test', 'year')
    ds.encode()
    mongo.test.insert({'variables': ds})

    # NOTE(review): 'date' is never assigned in this function, so these two
    # re-enabled lines raise NameError at runtime — they need a concrete
    # datetime value (e.g. d1).
    v.add(date, 5)
    o = v.get_observation(date)
    ds = rawdata.find_one({'project': 'wiki',
                           'language_code': 'en',
                           'hash': 'cohort_dataset_backward_bar'})
    print ds
412403
413404

Status & tagging log