Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -22,7 +22,6 @@ |
23 | 23 | import calendar |
24 | 24 | import sys |
25 | 25 | import os |
26 | | -import inspect |
27 | 26 | import progressbar |
28 | 27 | import types |
29 | 28 | from dateutil.relativedelta import relativedelta |
— | — | @@ -41,6 +40,66 @@ |
42 | 41 | import dataset |
43 | 42 | |
44 | 43 | |
| 44 | +def generate_chart_data(project, collection, language_code, func, **kwargs): |
| 45 | + ''' |
| 46 | + Entry point: call this function to generate the data needed for creating charts. |
| 47 | + ''' |
| 48 | + stopwatch = timer.Timer() |
| 49 | + res = True |
| 50 | + dbname = '%s%s' % (language_code, project) |
| 51 | + functions = available_analyses() |
| 52 | + try: |
| 53 | + func = functions[func] |
| 54 | + except KeyError: |
| 55 | + return False # unknown analysis name |
| 56 | + |
| 57 | + print 'Exporting data for chart: %s' % func.func_name |
| 58 | + print 'Project: %s' % dbname |
| 59 | + print 'Dataset: %s' % collection |
| 60 | + ds = loop_editors(dbname, project, collection, language_code, func, **kwargs) |
| 61 | + file = '%s_%s.csv' % (dbname, func.func_name) |
| 62 | + print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file) |
| 63 | + ds.write(format='csv') |
| 64 | + print 'Serializing dataset to %s_%s' % (dbname, 'charts') |
| 65 | + log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start') |
| 66 | + ds.write(format='mongo') |
| 67 | + stopwatch.elapsed() |
| 68 | + log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
| 69 | + |
| 70 | + return res |
| 71 | + |
| 72 | + |
| 73 | +def loop_editors(dbname, project, collection, language_code, func, **kwargs): |
| 74 | + ''' |
| 75 | + Generic loop that iterates over all the editors of a Wikipedia project |
| 76 | + and calls func to do the actual aggregation. |
| 77 | + ''' |
| 78 | + |
| 79 | + editors = db.retrieve_distinct_keys(dbname, collection, 'editor') |
| 80 | + |
| 81 | + pbar = progressbar.ProgressBar(maxval=len(editors)).start() |
| 82 | + min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian') |
| 83 | + print 'Number of editors: %s' % len(editors) |
| 84 | + mongo = db.init_mongo_db(dbname) |
| 85 | + coll = mongo[collection] |
| 86 | + format = kwargs.pop('format', 'long') |
| 87 | + kwargs['min_year'] = min_year |
| 88 | + kwargs['max_year'] = max_year |
| 89 | + vars = [] |
| 90 | + ds = dataset.Dataset(func.func_name, project, coll.name, language_code, vars, format=format) |
| 91 | + var = dataset.Variable('count', **kwargs) |
| 92 | + |
| 93 | + # each plugin function aggregates one editor's data into var, |
| 94 | + # updating it in place; the return value is not used |
| 95 | + for editor in editors: |
| 96 | + editor = coll.find_one({'editor': editor}) |
| 97 | + func(var, editor, dbname=dbname) |
| 98 | + pbar.update(pbar.currval + 1) |
| 99 | + |
| 100 | + ds.add_variable(var) |
| 101 | + return ds |
| 102 | + |
| 103 | + |
45 | 104 | def available_analyses(caller='manage'): |
46 | 105 | ''' |
47 | 106 | Generates a dictionary: |
— | — | @@ -53,8 +112,8 @@ |
54 | 113 | ignore = ['__init__'] |
55 | 114 | functions = {} |
56 | 115 | |
57 | | - fn = '%s.py' % inspect.getmodulename(__file__) |
58 | | - loc = __file__.replace(fn, '') |
| 116 | + # directory containing this module |
| 117 | + loc = os.path.dirname(__file__) |
59 | 118 | path = os.path.join(loc , 'plugins') |
60 | 119 | plugins = import_libs(path) |
61 | 120 | |
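
The plugin-discovery pattern used by available_analyses() and import_libs() above can be sketched standalone. A minimal sketch, assuming a plugins/ directory exists next to the module; the helper mirrors the visible parts of import_libs() and fills in the elided loop with a plausible __import__-based body:

    import os
    import sys

    def import_libs(path):
        # append the plugins directory to sys.path, then import every
        # module in it except __init__ (mirrors the ignore list above)
        library_list = []
        sys.path.append(path)
        for f in os.listdir(path):
            name, ext = os.path.splitext(f)
            if ext == '.py' and name != '__init__':
                library_list.append(__import__(name))
        return library_list

    loc = os.path.dirname(os.path.abspath(__file__))
    plugins = import_libs(os.path.join(loc, 'plugins'))
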
— | — | @@ -73,9 +132,9 @@ |
74 | 133 | |
75 | 134 | |
76 | 135 | def import_libs(path): |
77 | | - """ |
| 136 | + ''' |
78 | 137 | Dynamically importing functions from the plugins directory. |
79 | | - """ |
| 138 | + ''' |
80 | 139 | |
81 | 140 | library_list = [] |
82 | 141 | sys.path.append(path) |
— | — | @@ -112,58 +171,6 @@ |
113 | 172 | return windows |
114 | 173 | |
115 | 174 | |
116 | | -def generate_chart_data(project, collection, language_code, func, **kwargs): |
117 | | - ''' |
118 | | - This is the entry function to be called to generate data for creating charts. |
119 | | - ''' |
120 | | - stopwatch = timer.Timer() |
121 | | - dbname = '%s%s' % (language_code, project) |
122 | | - print 'Exporting data for chart: %s' % func |
123 | | - print 'Project: %s' % dbname |
124 | | - print 'Dataset: %s' % collection |
125 | | - ds = loop_editors(dbname, project, collection, language_code, func, **kwargs) |
126 | | - file = '%s_%s.csv' % (dbname, func.func_name) |
127 | | - print 'Storing dataset: %s' % os.path.join(settings.dataset_location, file) |
128 | | - ds.write(format='csv') |
129 | | - print 'Serializing dataset to %s_%s' % (dbname, 'charts') |
130 | | - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start') |
131 | | - ds.write(format='mongo') |
132 | | - stopwatch.elapsed() |
133 | | - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
134 | | - |
135 | | - |
136 | | -def loop_editors(dbname, project, collection, language_code, func, **kwargs): |
137 | | - ''' |
138 | | - Generic loop function that loops over all the editors of a Wikipedia project |
139 | | - and then calls the function that does the actual aggregation. |
140 | | - ''' |
141 | | - |
142 | | - editors = db.retrieve_distinct_keys(dbname, collection, 'editor') |
143 | | - |
144 | | - pbar = progressbar.ProgressBar(maxval=len(editors)).start() |
145 | | - min_year, max_year = determine_project_year_range(dbname, collection, 'new_wikipedian') |
146 | | - print 'Number of editors: %s' % len(editors) |
147 | | - mongo = db.init_mongo_db(dbname) |
148 | | - coll = mongo[collection] |
149 | | - format = kwargs.pop('format', 'long') |
150 | | - kwargs['min_year'] = min_year |
151 | | - kwargs['max_year'] = max_year |
152 | | - vars = [] |
153 | | - ds = dataset.Dataset(func, project, coll.name, language_code, vars, format=format) |
154 | | - var = dataset.Variable('count', **kwargs) |
155 | | - |
156 | | - functions = available_analyses() |
157 | | - func = functions[func] |
158 | | - |
159 | | - for editor in editors: |
160 | | - editor = coll.find_one({'editor': editor}) |
161 | | - data = func(var, editor, dbname=dbname) |
162 | | - pbar.update(pbar.currval + 1) |
163 | | - |
164 | | - ds.add_variable(var) |
165 | | - return ds |
166 | | - |
167 | | - |
168 | 175 | if __name__ == '__main__': |
169 | 176 | |
170 | 177 | generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=50) |
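
The new generate_chart_data() resolves its func argument by name before doing any work, so an unknown analysis fails fast instead of halfway through the editor loop. A minimal sketch of that dispatch, with a hypothetical stub registry standing in for available_analyses():

    def available_analyses_stub():
        # hypothetical registry; the real one is built by scanning the plugins directory
        def cohort_dataset_forward_histogram(var, editor, **kwargs):
            return var  # a real plugin aggregates the editor's data into var
        return {'cohort_dataset_forward_histogram': cohort_dataset_forward_histogram}

    def resolve(func_name):
        functions = available_analyses_stub()
        try:
            return functions[func_name]
        except KeyError:
            return None  # generate_chart_data() returns False in this case

    func = resolve('cohort_dataset_forward_histogram')
    assert func is not None
    print 'Exporting data for chart: %s' % func.func_name
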
Index: trunk/tools/editor_trends/analyses/dataset.py |
— | — | @@ -34,6 +34,11 @@ |
35 | 35 | from database import db |
36 | 36 | |
37 | 37 | class Transform(SONManipulator): |
| 38 | + ''' |
| 39 | + This encoder transforms a Dataset into a MongoDB BSON document. |
| 40 | + To use this encoder, initialize a mongo database instance and then add: |
| 41 | + mongo.add_son_manipulator(Transform()) |
| 42 | + ''' |
38 | 43 | def transform_incoming(self, son, collection): |
39 | 44 | for (key, ds) in son.items(): |
40 | 45 | son[key] = {} |
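
The new docstring on Transform documents how the encoder is attached to a database handle. A minimal sketch of that wiring against the pymongo 1.x/2.x API (Connection and add_son_manipulator were later deprecated); the no-op manipulator below only stands in for the real Transform:

    from pymongo import Connection
    from pymongo.son_manipulator import SONManipulator

    class NoopTransform(SONManipulator):
        def transform_incoming(self, son, collection):
            return son  # the real Transform encodes Dataset objects here

        def transform_outgoing(self, son, collection):
            return son  # and decodes them on the way out

    mongo = Connection()['enwiki']
    mongo.add_son_manipulator(NoopTransform())
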
— | — | @@ -67,6 +72,10 @@ |
68 | 73 | |
69 | 74 | |
70 | 75 | class Data: |
| 76 | + ''' |
| 77 | + Generic helper functions shared by the Observation, Variable, and |
| 78 | + Dataset classes. |
| 79 | + ''' |
71 | 80 | def __hash__(self, date): |
72 | 81 | #return hash(self.convert_date_to_epoch(date)) |
73 | 82 | return int(self.convert_date_to_epoch(date)) |
— | — | @@ -79,8 +88,6 @@ |
80 | 89 | kwargs[key] = d |
81 | 90 | return kwargs |
82 | 91 | |
83 | | - |
84 | | - |
85 | 92 | def convert_date_to_epoch(self, date): |
86 | 93 | assert self.time_unit == 'year' or self.time_unit == 'month' \ |
87 | 94 | or self.time_unit == 'day' |
— | — | @@ -96,8 +103,13 @@ |
97 | 104 | |
98 | 105 | |
99 | 106 | class Observation(Data): |
| 107 | + ''' |
| 108 | + The smallest unit; this is where the actual data is stored. |
| 109 | + time_unit should be either 'year', 'month' or 'day'. |
| 110 | + ''' |
100 | 111 | def __init__(self, date, time_unit): |
101 | 112 | assert isinstance(date, datetime.datetime) |
| 113 | + assert time_unit in ('year', 'month', 'day') |
102 | 114 | self.time_unit = time_unit |
103 | 115 | self.t0 = self.set_start_date(date) |
104 | 116 | self.t1 = self.set_end_date(date) |
— | — | @@ -142,6 +154,11 @@ |
143 | 155 | return datetime.datetime(date.year, date.month, date.day) |
144 | 156 | |
145 | 157 | def add(self, value, update): |
| 158 | + ''' |
| 159 | + If update == True then data[i] is incremented, otherwise data[i] is |
| 160 | + created; in the latter case make sure that i is unique. Updating is |
| 161 | + useful for tallying a variable. |
| 162 | + ''' |
146 | 163 | if hasattr(value, '__iter__') == False: |
147 | 164 | d = {} |
148 | 165 | d[0] = value |
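
The add() docstring distinguishes tallying from inserting. A simplified stand-in for Observation (dates and time units omitted) that reproduces the scalar-wrapping visible above and the update semantics the docstring describes:

    class ObsSketch(object):
        def __init__(self):
            self.data = {}

        def add(self, value, update):
            if hasattr(value, '__iter__') == False:
                value = {0: value}  # scalars are wrapped, as in the hunk above
            for i, v in value.items():
                if update:
                    self.data[i] = self.data.get(i, 0) + v  # tally
                else:
                    self.data[i] = v  # insert; i must be unique

    o = ObsSketch()
    o.add(5, True)
    o.add(5, True)
    assert o.data[0] == 10  # update=True tallies repeated values
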
— | — | @@ -209,57 +226,11 @@ |
210 | 227 | key = self.__hash__(date) |
211 | 228 | return self.obs.get(key, Observation(date, self.time_unit)) |
212 | 229 | |
213 | | - def min(self): |
214 | | - return min([obs for obs in self]) |
215 | | - #return min([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()]) |
216 | | - |
217 | | - def max(self): |
218 | | - return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()]) |
219 | | - |
220 | | - def get_standard_deviation(self, number_list): |
221 | | - mean = get_mean(number_list) |
222 | | - std = 0 |
223 | | - n = len(number_list) |
224 | | - for i in number_list: |
225 | | - std = std + (i - mean) ** 2 |
226 | | - return math.sqrt(std / float(n - 1)) |
227 | | - |
228 | | - |
229 | | - def get_median(self, number_list): |
230 | | - #print number_list |
231 | | - if number_list == []: return '.' |
232 | | - data = sorted(number_list) |
233 | | - data = [float(x) for x in data] |
234 | | - if len(data) % 2 == 1: |
235 | | - return data[(len(data) + 1) / 2 - 1] |
236 | | - else: |
237 | | - lower = data[len(data) / 2 - 1] |
238 | | - upper = data[len(data) / 2] |
239 | | - #print upper, lower |
240 | | - return (lower + upper) / 2 |
241 | | - |
242 | | - |
243 | | - def get_mean(self, number_list): |
244 | | - #print number_list |
245 | | - if number_list == []: return '.' |
246 | | - float_nums = [float(x) for x in number_list] |
247 | | - return sum(float_nums) / len(number_list) |
248 | | - |
249 | | - def summary(self): |
250 | | - print 'Variable: %s' % self.name |
251 | | - print 'Mean: %s' % self.get_mean(self) |
252 | | - print 'Median: %s' % self.get_median(self) |
253 | | - print 'Standard Deviation: %s' % self.get_standard_deviation(self) |
254 | | - print 'Minimum: %s' % self.min() |
255 | | - print 'Maximum: %s' % self.max() |
256 | | - |
257 | | - |
258 | 230 | def add(self, date, value, update=True): |
259 | 231 | data = self.get_observation(date) |
260 | 232 | data.add(value, update) |
261 | 233 | self.obs[data.hash] = data |
262 | 234 | |
263 | | - |
264 | 235 | def encode(self): |
265 | 236 | bson = {} |
266 | 237 | for prop in self.props: |
— | — | @@ -327,7 +298,6 @@ |
328 | 299 | else: |
329 | 300 | raise TypeError('You can only instance of Variable to a dataset.') |
330 | 301 | |
331 | | - |
332 | 302 | def write(self, format='csv'): |
333 | 303 | if format == 'csv': |
334 | 304 | self.to_csv() |
— | — | @@ -357,56 +327,77 @@ |
358 | 328 | props[prop] = getattr(self, prop) |
359 | 329 | return props |
360 | 330 | |
| 331 | + def min(self): |
| 332 | + return min([obs for obs in self]) |
| 333 | + #return min([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()]) |
361 | 334 | |
| 335 | + def max(self): |
| 336 | + return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()]) |
362 | 337 | |
363 | | -# def transform_to_stacked_bar_json(self): |
364 | | -# ''' |
365 | | -# This function outputs data in a format that is understood by jquery |
366 | | -# flot plugin. |
367 | | -# ''' |
368 | | -# options = {} |
369 | | -# options['xaxis'] = {} |
370 | | -# options['xaxis']['ticks'] = [] |
371 | | -# data = [] |
372 | | -# obs, all_keys = ds.convert_dataset_to_lists() |
373 | | -# |
374 | | -# for ob in obs: |
375 | | -# d = {} |
376 | | -# d['label'] = ob[0].year |
377 | | -# d['data'] = [] |
378 | | -# ob = ob[1:] |
379 | | -# for x, o in enumerate(ob): |
380 | | -# d['data'].append([x, o]) |
381 | | -# data.append(d) |
382 | | -# for x, date in enumerate(obs[0]): |
383 | | -# options['xaxis']['ticks'].append([x, date.year]) |
384 | | -# |
385 | | -# return data, options |
| 338 | + def get_standard_deviation(self, number_list): |
| 339 | + mean = self.get_mean(number_list) |
| 340 | + std = 0 |
| 341 | + n = len(number_list) |
| 342 | + for i in number_list: |
| 343 | + std = std + (i - mean) ** 2 |
| 344 | + return math.sqrt(std / float(n - 1)) |
386 | 345 | |
387 | 346 | |
| 347 | + def get_median(self, number_list): |
| 348 | + # '.' is the missing-value marker |
| 349 | + if number_list == []: return '.' |
| 350 | + data = sorted(number_list) |
| 351 | + data = [float(x) for x in data] |
| 352 | + if len(data) % 2 == 1: |
| 353 | + return data[(len(data) + 1) / 2 - 1] |
| 354 | + else: |
| 355 | + lower = data[len(data) / 2 - 1] |
| 356 | + upper = data[len(data) / 2] |
| 357 | + # average the two middle values |
| 358 | + return (lower + upper) / 2 |
| 359 | + |
| 360 | + |
| 361 | + def get_mean(self, number_list): |
| 362 | + # returns '.' for an empty list |
| 363 | + if number_list == []: return '.' |
| 364 | + float_nums = [float(x) for x in number_list] |
| 365 | + return sum(float_nums) / len(number_list) |
| 366 | + |
| 367 | + def summary(self): |
| 368 | + print 'Variable: %s' % self.name |
| 369 | + print 'Mean: %s' % self.get_mean(self) |
| 370 | + print 'Median: %s' % self.get_median(self) |
| 371 | + print 'Standard Deviation: %s' % self.get_standard_deviation(self) |
| 372 | + print 'Minimum: %s' % self.min() |
| 373 | + print 'Maximum: %s' % self.max() |
| 374 | + |
| 375 | + |
388 | 376 | def debug(): |
389 | 377 | mongo = db.init_mongo_db('enwiki') |
390 | 378 | rawdata = mongo['enwiki_charts'] |
391 | 379 | mongo.add_son_manipulator(Transform()) |
392 | | -# d1 = datetime.datetime.today() |
393 | | -# d2 = datetime.datetime(2007, 6, 7) |
394 | | -# ds = Dataset('test', 'enwiki', 'editors_dataset', [{'name': 'count', 'time_unit': 'year'}, |
395 | | -# {'name': 'testest', 'time_unit': 'year'}]) |
396 | | -# ds.count.add(d1, 5) |
397 | | -# ds.count.add(d2, 514) |
398 | | -# ds.testest.add(d1, 135) |
399 | | -# ds.testest.add(d2, 535) |
400 | | -# #ds.summary() |
401 | | -# #ds.write_to_csv() |
402 | | -# v = Variable('test', 'year') |
403 | | -# ds.encode() |
404 | | -# mongo.test.insert({'variables': ds}) |
405 | 380 | |
406 | | - #v.add(date , 5) |
407 | | - #o = v.get_observation(date) |
408 | | - ds = rawdata.find_one({'project': 'wiki', 'language_code': 'en', 'hash': 'cohort_dataset_backward_bar'}) |
409 | | - transform_to_stacked_bar_json(ds) |
410 | | - #v.summary() |
| 381 | + d1 = datetime.datetime.today() |
| 382 | + d2 = datetime.datetime(2007, 6, 7) |
| 383 | + ds = Dataset('test', 'enwiki', 'editors_dataset', [ |
| 384 | + {'name': 'count', 'time_unit': 'year'}, |
| 385 | + {'name': 'testest', 'time_unit': 'year'} |
| 386 | + ]) |
| 387 | + ds.count.add(d1, 5) |
| 388 | + ds.count.add(d2, 514) |
| 389 | + ds.testest.add(d1, 135) |
| 390 | + ds.testest.add(d2, 535) |
| 391 | + #ds.summary() |
| 392 | + ds.write(format='csv') |
| 393 | + v = Variable('test', 'year') |
| 394 | + ds.encode() |
| 395 | + mongo.test.insert({'variables': ds}) |
| 396 | + |
| 397 | + v.add(d1, 5) |
| 398 | + o = v.get_observation(d1) |
| 399 | + ds = rawdata.find_one({'project': 'wiki', |
| 400 | + 'language_code': 'en', |
| 401 | + 'hash': 'cohort_dataset_backward_bar'}) |
411 | 402 | print ds |
412 | 403 | |
413 | 404 | |
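
The summary statistics that this diff moves from Dataset into Variable are plain-number helpers at heart. Standalone mirrors of their arithmetic (names reused for illustration only): the standard deviation is the sample flavour with an n - 1 denominator, the median averages the two middle values for even-length input, and '.' is the missing value for empty lists:

    import math

    def get_mean(number_list):
        if number_list == []: return '.'
        return sum(float(x) for x in number_list) / len(number_list)

    def get_median(number_list):
        if number_list == []: return '.'
        data = sorted(float(x) for x in number_list)
        n = len(data)
        if n % 2 == 1:
            return data[(n + 1) // 2 - 1]
        return (data[n // 2 - 1] + data[n // 2]) / 2

    def get_standard_deviation(number_list):
        mean = get_mean(number_list)
        squares = sum((x - mean) ** 2 for x in number_list)
        return math.sqrt(squares / float(len(number_list) - 1))

    assert get_mean([2, 4]) == 3.0
    assert get_median([1, 2, 3, 4]) == 2.5
    assert round(get_standard_deviation([2, 4, 4, 4, 5, 5, 7, 9]), 4) == 2.1381
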