r81005 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r81004 | r81005 | r81006 >
Date:23:09, 25 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added some initial documentation.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -22,7 +22,6 @@
2323 import calendar
2424 import sys
2525 import os
26 -import inspect
2726 import progressbar
2827 import types
2928 from dateutil.relativedelta import relativedelta
@@ -41,6 +40,66 @@
4241 import dataset
4342
4443
 44+def generate_chart_data(project, collection, language_code, func, **kwargs):
 45+ '''
 46+ This is the entry function to be called to generate data for creating charts.
 47+ '''
 48+ stopwatch = timer.Timer()
 49+ res = True
 50+ dbname = '%s%s' % (language_code, project)
 51+ functions = available_analyses()
 52+ try:
 53+ func = functions[func]
 54+ except KeyError:
 55+ return False
 56+
 57+ print 'Exporting data for chart: %s' % func.func_name
 58+ print 'Project: %s' % dbname
 59+ print 'Dataset: %s' % collection
 60+ ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)
 61+ file = '%s_%s.csv' % (dbname, func.func_name)
 62+ print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)
 63+ ds.write(format='csv')
 64+ print 'Serializing dataset to %s_%s' % (dbname, 'charts')
 65+ log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
 66+ ds.write(format='mongo')
 67+ stopwatch.elapsed()
 68+ log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
 69+
 70+ return res
 71+
 72+
 73+def loop_editors(dbname, project, collection, language_code, func, **kwargs):
 74+ '''
 75+ Generic loop function that loops over all the editors of a Wikipedia project
 76+ and then calls the function that does the actual aggregation.
 77+ '''
 78+
 79+ editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
 80+
 81+ pbar = progressbar.ProgressBar(maxval=len(editors)).start()
 82+ min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
 83+ print 'Number of editors: %s' % len(editors)
 84+ mongo = db.init_mongo_db(dbname)
 85+ coll = mongo[collection]
 86+ format = kwargs.pop('format', 'long')
 87+ kwargs['min_year'] = min_year
 88+ kwargs['max_year'] = max_year
 89+ vars = []
 90+ ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format)
 91+ var = dataset.Variable('count', **kwargs)
 92+
 93+
 94+
 95+ for editor in editors:
 96+ editor = coll.find_one({'editor': editor})
 97+ data = func(var, editor, dbname=dbname)
 98+ pbar.update(pbar.currval + 1)
 99+
 100+ ds.add_variable(var)
 101+ return ds
 102+
 103+
45104 def available_analyses(caller='manage'):
46105 '''
47106 Generates a dictionary:
@@ -53,8 +112,8 @@
54113 ignore = ['__init__']
55114 functions = {}
56115
57 - fn = '%s.py' % inspect.getmodulename(__file__)
58 - loc = __file__.replace(fn, '')
 116+ pos = __file__.rfind(os.sep)
 117+ loc = __file__[:pos]
59118 path = os.path.join(loc , 'plugins')
60119 plugins = import_libs(path)
61120
@@ -73,9 +132,9 @@
74133
75134
76135 def import_libs(path):
77 - """
 136+ '''
78137 Dynamically importing functions from the plugins directory.
79 - """
 138+ '''
80139
81140 library_list = []
82141 sys.path.append(path)
@@ -112,58 +171,6 @@
113172 return windows
114173
115174
116 -def generate_chart_data(project, collection, language_code, func, **kwargs):
117 - '''
118 - This is the entry function to be called to generate data for creating charts.
119 - '''
120 - stopwatch = timer.Timer()
121 - dbname = '%s%s' % (language_code, project)
122 - print 'Exporting data for chart: %s' % func
123 - print 'Project: %s' % dbname
124 - print 'Dataset: %s' % collection
125 - ds = loop_editors(dbname, project, collection, language_code, func, **kwargs)
126 - file = '%s_%s.csv' % (dbname, func.func_name)
127 - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file)
128 - ds.write(format='csv')
129 - print 'Serializing dataset to %s_%s' % (dbname, 'charts')
130 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
131 - ds.write(format='mongo')
132 - stopwatch.elapsed()
133 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
134 -
135 -
136 -def loop_editors(dbname, project, collection, language_code, func, **kwargs):
137 - '''
138 - Generic loop function that loops over all the editors of a Wikipedia project
139 - and then calls the function that does the actual aggregation.
140 - '''
141 -
142 - editors = db.retrieve_distinct_keys(dbname, collection, 'editor')
143 -
144 - pbar = progressbar.ProgressBar(maxval=len(editors)).start()
145 - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian')
146 - print 'Number of editors: %s' % len(editors)
147 - mongo = db.init_mongo_db(dbname)
148 - coll = mongo[collection]
149 - format = kwargs.pop('format', 'long')
150 - kwargs['min_year'] = min_year
151 - kwargs['max_year'] = max_year
152 - vars = []
153 - ds = dataset.Dataset(func, project, coll.name, language_code, vars, format=format)
154 - var = dataset.Variable('count', **kwargs)
155 -
156 - functions = available_analyses()
157 - func = functions[func]
158 -
159 - for editor in editors:
160 - editor = coll.find_one({'editor': editor})
161 - data = func(var, editor, dbname=dbname)
162 - pbar.update(pbar.currval + 1)
163 -
164 - ds.add_variable(var)
165 - return ds
166 -
167 -
168175 if __name__ == '__main__':
169176
170177 generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -34,6 +34,11 @@
3535 from database import db
3636
3737 class Transform(SONManipulator):
 38+ '''
 39+ This encoder transforms a Dataset to a MongoDB bson document.
 40+ To use this encoder initialize a mongo database instance and then add:
 41+ mongo.add_son_manipulator(Transform())
 42+ '''
3843 def transform_incoming(self, son, collection):
3944 for (key, ds) in son.items():
4045 son[key] = {}
@@ -67,6 +72,10 @@
6873
6974
7075 class Data:
 76+ '''
 77+ Some generic functions that are required by the Observation, Variable, and
 78+ Dataset classes.
 79+ '''
7180 def __hash__(self, date):
7281 #return hash(self.convert_date_to_epoch(date))
7382 return int(self.convert_date_to_epoch(date))
@@ -79,8 +88,6 @@
8089 kwargs[key] = d
8190 return kwargs
8291
83 -
84 -
8592 def convert_date_to_epoch(self, date):
8693 assert self.time_unit == 'year' or self.time_unit == 'month' \
8794 or self.time_unit == 'day'
@@ -96,8 +103,13 @@
97104
98105
99106 class Observation(Data):
 107+ '''
 108+ The smallest unit, here the actual data is being stored.
 109+ Time_unit should either be 'year', 'month' or 'day'.
 110+ '''
100111 def __init__(self, date, time_unit):
101112 assert isinstance(date, datetime.datetime)
 113+ assert time_unit == 'year' or time_unit == 'month' or time_unit == 'day'
102114 self.time_unit = time_unit
103115 self.t0 = self.set_start_date(date)
104116 self.t1 = self.set_end_date(date)
@@ -142,6 +154,11 @@
143155 return datetime.datetime(date.year, date.month, date.day)
144156
145157 def add(self, value, update):
 158+ '''
 159+ If update == True then data[i] will be incremented else data[i] will be
 160+ created, in that case make sure that i is unique. Update is useful for
 161+ tallying a variable.
 162+ '''
146163 if hasattr(value, '__iter__') == False:
147164 d = {}
148165 d[0] = value
@@ -209,57 +226,11 @@
210227 key = self.__hash__(date)
211228 return self.obs.get(key, Observation(date, self.time_unit))
212229
213 - def min(self):
214 - return min([obs for obs in self])
215 - #return min([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
216 -
217 - def max(self):
218 - return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
219 -
220 - def get_standard_deviation(self, number_list):
221 - mean = get_mean(number_list)
222 - std = 0
223 - n = len(number_list)
224 - for i in number_list:
225 - std = std + (i - mean) ** 2
226 - return math.sqrt(std / float(n - 1))
227 -
228 -
229 - def get_median(self, number_list):
230 - #print number_list
231 - if number_list == []: return '.'
232 - data = sorted(number_list)
233 - data = [float(x) for x in data]
234 - if len(data) % 2 == 1:
235 - return data[(len(data) + 1) / 2 - 1]
236 - else:
237 - lower = data[len(data) / 2 - 1]
238 - upper = data[len(data) / 2]
239 - #print upper, lower
240 - return (lower + upper) / 2
241 -
242 -
243 - def get_mean(self, number_list):
244 - #print number_list
245 - if number_list == []: return '.'
246 - float_nums = [float(x) for x in number_list]
247 - return sum(float_nums) / len(number_list)
248 -
249 - def summary(self):
250 - print 'Variable: %s' % self.name
251 - print 'Mean: %s' % self.get_mean(self)
252 - print 'Median: %s' % self.get_median(self)
253 - print 'Standard Deviation: %s' % self.get_standard_deviation(self)
254 - print 'Minimum: %s' % self.min()
255 - print 'Maximum: %s' % self.max()
256 -
257 -
258230 def add(self, date, value, update=True):
259231 data = self.get_observation(date)
260232 data.add(value, update)
261233 self.obs[data.hash] = data
262234
263 -
264235 def encode(self):
265236 bson = {}
266237 for prop in self.props:
@@ -327,7 +298,6 @@
328299 else:
329300 raise TypeError('You can only instance of Variable to a dataset.')
330301
331 -
332302 def write(self, format='csv'):
333303 if format == 'csv':
334304 self.to_csv()
@@ -357,56 +327,77 @@
358328 props[prop] = getattr(self, prop)
359329 return props
360330
 331+ def min(self):
 332+ return min([obs for obs in self])
 333+ #return min([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
361334
 335+ def max(self):
 336+ return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
362337
363 -# def transform_to_stacked_bar_json(self):
364 -# '''
365 -# This function outputs data in a format that is understood by jquery
366 -# flot plugin.
367 -# '''
368 -# options = {}
369 -# options['xaxis'] = {}
370 -# options['xaxis']['ticks'] = []
371 -# data = []
372 -# obs, all_keys = ds.convert_dataset_to_lists()
373 -#
374 -# for ob in obs:
375 -# d = {}
376 -# d['label'] = ob[0].year
377 -# d['data'] = []
378 -# ob = ob[1:]
379 -# for x, o in enumerate(ob):
380 -# d['data'].append([x, o])
381 -# data.append(d)
382 -# for x, date in enumerate(obs[0]):
383 -# options['xaxis']['ticks'].append([x, date.year])
384 -#
385 -# return data, options
 338+ def get_standard_deviation(self, number_list):
 339+ mean = get_mean(number_list)
 340+ std = 0
 341+ n = len(number_list)
 342+ for i in number_list:
 343+ std = std + (i - mean) ** 2
 344+ return math.sqrt(std / float(n - 1))
386345
387346
 347+ def get_median(self, number_list):
 348+ #print number_list
 349+ if number_list == []: return '.'
 350+ data = sorted(number_list)
 351+ data = [float(x) for x in data]
 352+ if len(data) % 2 == 1:
 353+ return data[(len(data) + 1) / 2 - 1]
 354+ else:
 355+ lower = data[len(data) / 2 - 1]
 356+ upper = data[len(data) / 2]
 357+ #print upper, lower
 358+ return (lower + upper) / 2
 359+
 360+
 361+ def get_mean(self, number_list):
 362+ #print number_list
 363+ if number_list == []: return '.'
 364+ float_nums = [float(x) for x in number_list]
 365+ return sum(float_nums) / len(number_list)
 366+
 367+ def summary(self):
 368+ print 'Variable: %s' % self.name
 369+ print 'Mean: %s' % self.get_mean(self)
 370+ print 'Median: %s' % self.get_median(self)
 371+ print 'Standard Deviation: %s' % self.get_standard_deviation(self)
 372+ print 'Minimum: %s' % self.min()
 373+ print 'Maximum: %s' % self.max()
 374+
 375+
def debug():
    '''
    Ad-hoc manual test of the Dataset / Variable / Transform machinery
    against a live local MongoDB ('enwiki' database).
    '''
    mongo = db.init_mongo_db('enwiki')
    rawdata = mongo['enwiki_charts']
    # Register the Transform SON manipulator so Dataset objects round-trip
    # through MongoDB as bson documents.
    mongo.add_son_manipulator(Transform())

    d1 = datetime.datetime.today()
    d2 = datetime.datetime(2007, 6, 7)
    # NOTE(review): loop_editors constructs Dataset with
    # (name, project, collection, language_code, vars, format=...); this call
    # passes one positional argument fewer — confirm the constructor accepts
    # both shapes.
    ds = Dataset('test', 'enwiki', 'editors_dataset', [
        {'name': 'count', 'time_unit': 'year'},
        {'name': 'testest', 'time_unit': 'year'}
    ])
    ds.count.add(d1, 5)
    ds.count.add(d2, 514)
    ds.testest.add(d1, 135)
    ds.testest.add(d2, 535)
    #ds.summary()
    # NOTE(review): the visible Dataset API is write(format='csv');
    # write_to_csv is not defined anywhere in this diff — verify it exists.
    ds.write_to_csv()
    v = Variable('test', 'year')
    ds.encode()
    mongo.test.insert({'variables': ds})

    # NOTE(review): 'date' is never assigned in this function, so these two
    # re-enabled lines raise NameError at runtime — they need a concrete
    # datetime value (e.g. d1).
    v.add(date, 5)
    o = v.get_observation(date)
    ds = rawdata.find_one({'project': 'wiki',
                           'language_code': 'en',
                           'hash': 'cohort_dataset_backward_bar'})
    print ds
412403
413404

Status & tagging log