Index: trunk/tools/editor_trends/analyses/count_editors.py |
— | — | @@ -107,6 +107,7 @@ |
108 | 108 | stopwatch.elapsed() |
109 | 109 | log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
110 | 110 | |
| 111 | + |
111 | 112 | def loop_editors(dbname, project, collection, language_code, func, **kwargs): |
112 | 113 | ''' |
113 | 114 | Generic loop function that loops over all the editors of a Wikipedia project |
— | — | @@ -119,7 +120,7 @@ |
120 | 121 | print 'Number of editors: %s' % len(editors) |
121 | 122 | mongo = db.init_mongo_db(dbname) |
122 | 123 | coll = mongo[collection] |
123 | | - format = kwargs.pop('format') |
| 124 | + format = kwargs.pop('format', 'long') |
124 | 125 | kwargs['min_year'] = min_year |
125 | 126 | kwargs['max_year'] = max_year |
126 | 127 | vars = [] |
— | — | @@ -145,7 +146,7 @@ |
146 | 147 | yearly_edits = editor['edits_by_year'] |
147 | 148 | n = editor['edit_count'] |
148 | 149 | |
149 | | - if n >= ds.count.cum_cutoff: |
| 150 | + if n >= var.cum_cutoff: |
150 | 151 | for i, year in enumerate(xrange(new_wikipedian.year, final_edit.year)): |
151 | 152 | edits = editor['monthly_edits'].get(str(year), {0:0}) |
152 | 153 | if year == new_wikipedian.year: |
— | — | @@ -153,7 +154,7 @@ |
154 | 155 | else: |
155 | 156 | start = 1 |
156 | 157 | for month in xrange(start, 13): |
157 | | - if edits.get(str(month), 0) >= ds.count.cutoff: |
| 158 | + if edits.get(str(month), 0) >= var.cutoff: |
158 | 159 | experience = i * 12 + (month - new_wikipedian.month) |
159 | 160 | var.add(new_wikipedian, {experience: 1}) |
160 | 161 | return var |
— | — | @@ -240,7 +241,7 @@ |
241 | 242 | |
242 | 243 | |
243 | 244 | if __name__ == '__main__': |
244 | | - #generate_chart_data('wiki', 'editors_dataset', 'en',cohort_dataset_forward_histogram, time_unit='month', cutoff=1, cum_cutoff=50) |
| 245 | + generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_histogram, time_unit='month', cutoff=1, cum_cutoff=50) |
245 | 246 | generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide') |
246 | 247 | generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide') |
247 | 248 | #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0) |
Index: trunk/tools/editor_trends/analyses/dataset.py |
— | — | @@ -30,6 +30,7 @@ |
31 | 31 | settings = configuration.Settings() |
32 | 32 | |
33 | 33 | from utils import file_utils |
| 34 | +from utils import data_converter |
34 | 35 | from database import db |
35 | 36 | |
36 | 37 | class Transform(SONManipulator): |
— | — | @@ -46,11 +47,23 @@ |
47 | 48 | def transform_outgoing(self, son, collection): |
48 | 49 | for (key, value) in son.items(): |
49 | 50 | if isinstance(value, dict): |
50 | | - if "_type" in value and value["_type"] == "custom": |
51 | | - son[key] = decode_custom(value) |
| 51 | + names = value.keys() |
| 52 | + for name in names: |
| 53 | + var = Variable(name, None) |
| 54 | + var.decode(value) |
| 55 | + son['variables'][name] = var |
52 | 56 | else: # Again, make sure to recurse into sub-docs |
53 | | - son[key] = self.transform_outgoing(value, collection) |
54 | | - return son |
| 57 | + son[key] = value |
| 58 | + name = son.pop('name', None) |
| 59 | + project = son.pop('project', None) |
| 60 | + collection = son.pop('collection', None) |
| 61 | + language_code = son.pop('language_code', None) |
| 62 | + variables = son.pop('variables', []) |
| 63 | + ds = Dataset(name, project, collection, language_code, **son) |
| 64 | + for var in variables: |
| 65 | + var = variables[var] |
| 66 | + ds.add_variable(var) |
| 67 | + return ds |
55 | 68 | |
56 | 69 | |
57 | 70 | class Data: |
— | — | @@ -66,10 +79,8 @@ |
67 | 80 | kwargs[key] = d |
68 | 81 | return kwargs |
69 | 82 | |
70 | | - def convert_seconds_to_date(self, secs): |
71 | | - #return time.gmtime(secs) |
72 | | - return datetime.datetime.fromtimestamp(secs) |
73 | 83 | |
| 84 | + |
74 | 85 | def convert_date_to_epoch(self, date): |
75 | 86 | assert self.time_unit == 'year' or self.time_unit == 'month' \ |
76 | 87 | or self.time_unit == 'day' |
— | — | @@ -91,6 +102,7 @@ |
92 | 103 | self.t0 = self.set_start_date(date) |
93 | 104 | self.t1 = self.set_end_date(date) |
94 | 105 | self.hash = self.__hash__(date) |
| 106 | + self.date = date |
95 | 107 | self.data = {} |
96 | 108 | self._type = 'observation' |
97 | 109 | |
— | — | @@ -104,6 +116,9 @@ |
105 | 117 | for obs in self.obs: |
106 | 118 | yield self.obs[obs] |
107 | 119 | |
| 120 | + def __getitem__(self, key): |
| 121 | + return getattr(self, key, []) |
| 122 | + |
108 | 123 | def next(self): |
109 | 124 | try: |
110 | 125 | return len(self.data.keys()) + 1 |
— | — | @@ -153,9 +168,10 @@ |
154 | 169 | self.obs = {} |
155 | 170 | self.time_unit = time_unit |
156 | 171 | self._type = 'variable' |
157 | | - #self.stats = stats |
| 172 | + self.props = ['name', 'time_unit', '_type'] |
158 | 173 | for kw in kwargs: |
159 | 174 | setattr(self, kw, kwargs[kw]) |
| 175 | + self.props.append(kw) |
160 | 176 | |
161 | 177 | def __str__(self): |
162 | 178 | return self.name |
— | — | @@ -164,7 +180,7 @@ |
165 | 181 | return self.name |
166 | 182 | |
167 | 183 | def __getitem__(self, key): |
168 | | - return self.obs[key] |
| 184 | + return getattr(self, key, []) |
169 | 185 | |
170 | 186 | def __iter__(self): |
171 | 187 | dates = self.obs.keys() |
— | — | @@ -246,13 +262,30 @@ |
247 | 263 | |
248 | 264 | def encode(self): |
249 | 265 | bson = {} |
250 | | - for x, obs in enumerate(self): |
251 | | - obs = self[obs] |
252 | | - x = str(x) |
253 | | - bson[x] = obs.encode_to_bson() |
254 | | - #print bson |
| 266 | + for prop in self.props: |
| 267 | + bson[prop] = getattr(self, prop) |
| 268 | + |
| 269 | + bson['obs'] = {} |
| 270 | + for obs in self: |
| 271 | + data = self.obs[obs] |
| 272 | + obs = str(obs) |
| 273 | + bson['obs'][obs] = data.encode_to_bson() |
255 | 274 | return bson |
256 | 275 | |
| 276 | + def decode(self, values): |
| 277 | + for varname in values: |
| 278 | + for prop in values[varname]: |
| 279 | + if isinstance(values[varname][prop], dict): |
| 280 | + data = values[varname][prop] |
| 281 | + for d in data: |
| 282 | + date = data[d]['date'] |
| 283 | + obs = data[d]['data'] |
| 284 | + self.add(date, obs) |
| 285 | + else: |
| 286 | + setattr(self, prop, values[varname][prop]) |
| 287 | + self.props.append(prop) |
| 288 | + |
| 289 | + |
257 | 290 | class Dataset: |
258 | 291 | ''' |
259 | 292 | This class acts as a container for the Variable class and has some methods |
— | — | @@ -268,98 +301,33 @@ |
269 | 302 | self._type = 'dataset' |
270 | 303 | self.filename = '%s_%s.csv' % (self.project, self.name) |
271 | 304 | self.created = datetime.datetime.now() |
272 | | - self.props = self.__dict__.keys() |
273 | | - self.vars = [] |
274 | 305 | for kw in kwargs: |
275 | | - if kw == 'format': |
276 | | - setattr(self, kw, kwargs.get(kw, 'long')) |
277 | | - else: |
278 | | - setattr(self, kw, kwargs[kw]) |
| 306 | + setattr(self, kw, kwargs[kw]) |
| 307 | + self.props = self.__dict__.keys() |
| 308 | + |
| 309 | + self.variables = [] |
279 | 310 | if vars != None: |
280 | 311 | for kwargs in vars: |
281 | 312 | name = kwargs.pop('name') |
282 | 313 | setattr(self, name, Variable(name, **kwargs)) |
283 | | - self.vars.append(name) |
| 314 | + self.variables.append(name) |
284 | 315 | |
285 | 316 | def __repr__(self): |
286 | 317 | return 'Dataset contains %s variables' % (len(self.vars)) |
287 | 318 | |
288 | 319 | def __iter__(self): |
289 | | - for var in self.vars: |
| 320 | + for var in self.variables: |
290 | 321 | yield getattr(self, var) |
291 | 322 | |
292 | 323 | def add_variable(self, var): |
293 | 324 | if isinstance(var, Variable): |
294 | | - self.vars.append(var.name) |
| 325 | + self.variables.append(var.name) |
295 | 326 | setattr(self, var.name, var) |
296 | 327 | |
297 | 328 | else: |
298 | 329 | raise TypeError('You can only instance of Variable to a dataset.') |
299 | 330 | |
300 | | - def get_all_keys(self, data): |
301 | | - all_keys = [] |
302 | | - for d in data: |
303 | | - for key in d: |
304 | | - if key not in all_keys: |
305 | | - all_keys.append(key) |
306 | | - all_keys.sort() |
307 | | - all_keys.insert(0, all_keys[-1]) |
308 | | - del all_keys[-1] |
309 | | - return all_keys |
310 | 331 | |
311 | | - def make_data_rectangular(self, data, all_keys): |
312 | | - for i, d in enumerate(data): |
313 | | - for key in all_keys: |
314 | | - if key not in d: |
315 | | - d[key] = 0 |
316 | | - data[i] = d |
317 | | - return data |
318 | | - |
319 | | - def sort(self, data, all_keys): |
320 | | - dates = [date['date'] for date in data] |
321 | | - dates.sort() |
322 | | - cube = [] |
323 | | - for date in dates: |
324 | | - for i, d in enumerate(data): |
325 | | - if d['date'] == date: |
326 | | - raw_data = d |
327 | | - del data[i] |
328 | | - break |
329 | | - obs = [] |
330 | | - for key in all_keys: |
331 | | - obs.append(raw_data[key]) |
332 | | - cube.append(obs) |
333 | | - return cube |
334 | | - |
335 | | - def convert_dataset_to_lists(self): |
336 | | - assert self.format == 'long' or self.format == 'wide' |
337 | | - data, all_keys = [], [] |
338 | | - for var in self: |
339 | | - for date in var.obs.keys(): |
340 | | - datum = var.convert_seconds_to_date(date) |
341 | | - if self.format == 'long': |
342 | | - o = [] |
343 | | - else: |
344 | | - o = {} |
345 | | - o['date'] = datum |
346 | | - |
347 | | - for obs in var[date].data: |
348 | | - if self.format == 'long': |
349 | | - o.append([datum, obs, var.obs[date].data[obs]]) |
350 | | - data.extend(o) |
351 | | - o = [] |
352 | | - else: |
353 | | - o[obs] = var.obs[date].data[obs] |
354 | | - #o.append({obs:var.obs[date].data[obs]}) |
355 | | - if self.format == 'wide': |
356 | | - data.append(o) |
357 | | - if self.format == 'wide': |
358 | | - #Make sure that each variable / observation combination exists. |
359 | | - all_keys = self.get_all_keys(data) |
360 | | - data = self.make_data_rectangular(data, all_keys) |
361 | | - data = self.sort(data, all_keys) |
362 | | - return data, all_keys |
363 | | - |
364 | 332 | def write(self, format='csv'): |
365 | 333 | if format == 'csv': |
366 | 334 | self.to_csv() |
— | — | @@ -371,74 +339,79 @@ |
372 | 340 | mongo = db.init_mongo_db(dbname) |
373 | 341 | coll = mongo['%s_%s' % (dbname, 'charts')] |
374 | 342 | mongo.add_son_manipulator(Transform()) |
375 | | - #transform = Transform() |
376 | | - #bson = transform.transform_incoming(self, coll) |
377 | | -# coll.update( |
378 | | -# {'$set': {'variables': self}}) |
379 | 343 | coll.remove({'hash':self.hash, 'project':self.project, |
380 | 344 | 'language_code':self.language_code}) |
381 | 345 | coll.insert({'variables': self}) |
382 | 346 | |
383 | 347 | def to_csv(self): |
384 | | - |
385 | | - data, all_keys = self.convert_dataset_to_lists() |
386 | | - headers = self.add_headers(all_keys) |
387 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding) |
| 348 | + data, all_keys = data_converter.convert_dataset_to_lists(self, 'manage') |
| 349 | + headers = data_converter.add_headers(self, all_keys) |
| 350 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding) |
388 | 351 | file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format) |
389 | 352 | file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format) |
390 | 353 | fh.close() |
391 | 354 | |
392 | | - def add_headers(self, all_keys): |
393 | | - assert self.format == 'long' or self.format == 'wide' |
394 | | - headers = [] |
395 | | - if self.format == 'long': |
396 | | - headers.append('date') |
397 | | - for var in self: |
398 | | - if self.format == 'long': |
399 | | - headers.extend([var.time_unit, var.name]) |
400 | | - else: |
401 | | - for key in all_keys: |
402 | | - header = '%s_%s' % (key, var.name) |
403 | | - headers.append(header) |
404 | | - return headers |
405 | | - |
406 | 355 | def encode(self): |
407 | 356 | props = {} |
408 | 357 | for prop in self.props: |
409 | 358 | props[prop] = getattr(self, prop) |
410 | 359 | return props |
411 | 360 | |
412 | | - def encode_to_bson(self, var): |
413 | | - return {'_type': 'dataset', 'x': var.x()} |
414 | 361 | |
415 | | - def decode_from_bson(self, document): |
416 | | - assert document["_type"] == "custom" |
417 | | - return self(document["x"]) |
418 | 362 | |
| 363 | +# def transform_to_stacked_bar_json(self): |
| 364 | +# ''' |
| 365 | +# This function outputs data in a format that is understood by jquery |
| 366 | +# flot plugin. |
| 367 | +# ''' |
| 368 | +# options = {} |
| 369 | +# options['xaxis'] = {} |
| 370 | +# options['xaxis']['ticks'] = [] |
| 371 | +# data = [] |
| 372 | +# obs, all_keys = ds.convert_dataset_to_lists() |
| 373 | +# |
| 374 | +# for ob in obs: |
| 375 | +# d = {} |
| 376 | +# d['label'] = ob[0].year |
| 377 | +# d['data'] = [] |
| 378 | +# ob = ob[1:] |
| 379 | +# for x, o in enumerate(ob): |
| 380 | +# d['data'].append([x, o]) |
| 381 | +# data.append(d) |
| 382 | +# for x, date in enumerate(obs[0]): |
| 383 | +# options['xaxis']['ticks'].append([x, date.year]) |
| 384 | +# |
| 385 | +# return data, options |
| 386 | + |
| 387 | + |
419 | 388 | def debug(): |
420 | 389 | mongo = db.init_mongo_db('enwiki') |
421 | | - rawdata = mongo['test'] |
| 390 | + rawdata = mongo['enwiki_charts'] |
422 | 391 | mongo.add_son_manipulator(Transform()) |
423 | | - d1 = datetime.datetime.today() |
424 | | - d2 = datetime.datetime(2007, 6, 7) |
425 | | - ds = Dataset('test', 'enwiki', 'editors_dataset', [{'name': 'count', 'time_unit': 'year'}, |
426 | | - {'name': 'testest', 'time_unit': 'year'}]) |
427 | | - ds.count.add(d1, 5) |
428 | | - ds.count.add(d2, 514) |
429 | | - ds.testest.add(d1, 135) |
430 | | - ds.testest.add(d2, 535) |
431 | | - #ds.summary() |
432 | | - #ds.write_to_csv() |
433 | | - v = Variable('test', 'year') |
434 | | - ds.encode() |
435 | | - mongo.test.insert({'variables': ds}) |
| 392 | +# d1 = datetime.datetime.today() |
| 393 | +# d2 = datetime.datetime(2007, 6, 7) |
| 394 | +# ds = Dataset('test', 'enwiki', 'editors_dataset', [{'name': 'count', 'time_unit': 'year'}, |
| 395 | +# {'name': 'testest', 'time_unit': 'year'}]) |
| 396 | +# ds.count.add(d1, 5) |
| 397 | +# ds.count.add(d2, 514) |
| 398 | +# ds.testest.add(d1, 135) |
| 399 | +# ds.testest.add(d2, 535) |
| 400 | +# #ds.summary() |
| 401 | +# #ds.write_to_csv() |
| 402 | +# v = Variable('test', 'year') |
| 403 | +# ds.encode() |
| 404 | +# mongo.test.insert({'variables': ds}) |
436 | 405 | |
437 | 406 | #v.add(date , 5) |
438 | 407 | #o = v.get_observation(date) |
439 | | - |
| 408 | + ds = rawdata.find_one({'project': 'wiki', 'language_code': 'en', 'hash': 'cohort_dataset_backward_bar'}) |
| 409 | + transform_to_stacked_bar_json(ds) |
440 | 410 | #v.summary() |
441 | 411 | print ds |
442 | 412 | |
443 | 413 | |
| 414 | + |
| 415 | + |
| 416 | + |
444 | 417 | if __name__ == '__main__': |
445 | 418 | debug() |
Index: trunk/tools/editor_trends/classes/wikiprojects.py |
— | — | @@ -29,11 +29,11 @@ |
30 | 30 | import datetime |
31 | 31 | import time |
32 | 32 | import re |
33 | | -import ordered_dict as odict |
| 33 | +sys.path.append('..') |
34 | 34 | |
35 | | -import text_utils |
| 35 | +from utils import text_utils |
| 36 | +from utils import ordered_dict as odict |
36 | 37 | |
37 | | - |
38 | 38 | class Wiki: |
39 | 39 | ''' |
40 | 40 | This class keeps track of the commands issued by the user and is used to |
Index: trunk/tools/editor_trends/bots/detector.py |
— | — | @@ -33,7 +33,7 @@ |
34 | 34 | from utils import file_utils |
35 | 35 | from utils import messages |
36 | 36 | |
37 | | -from etl import models |
| 37 | +from classes import consumers |
38 | 38 | from classes import bots |
39 | 39 | |
40 | 40 | import cProfile |
— | — | @@ -61,7 +61,7 @@ |
62 | 62 | bots = bots.split('|') |
63 | 63 | for bot in bots: |
64 | 64 | if bot not in bot_dict: |
65 | | - b = botmodels.Bot(bot) |
| 65 | + b = botconsumers.Bot(bot) |
66 | 66 | b.id = None |
67 | 67 | else: |
68 | 68 | b = bot_dict[bot] |
— | — | @@ -156,7 +156,7 @@ |
157 | 157 | |
158 | 158 | #print username.encode('utf-8') |
159 | 159 | if username.find('bot') > -1 or username.find('script') > -1: |
160 | | - bot = bots.get(username, botmodels.Bot(username, verified=False)) |
| 160 | + bot = bots.get(username, botconsumers.Bot(username, verified=False)) |
161 | 161 | bot.id = contributor.find('id').text |
162 | 162 | timestamp = revision.find('timestamp').text |
163 | 163 | if timestamp != None: |
— | — | @@ -192,15 +192,14 @@ |
193 | 193 | input_queue = pc.load_queue(files, poison_pill=True) |
194 | 194 | bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
195 | 195 | for file in files: |
196 | | - tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
| 196 | + tasks.put(consumers.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
197 | 197 | |
198 | 198 | else: |
199 | 199 | output_file = 'bots_predictionset.csv' |
200 | 200 | files = file_utils.retrieve_file_list(input_xml, 'xml', mask=None) |
201 | | - input_queue = pc.load_queue(files, poison_pill=True) |
202 | 201 | bots = {} |
203 | 202 | for file in files: |
204 | | - tasks.put(models.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
| 203 | + tasks.put(consumers.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
205 | 204 | |
206 | 205 | #lock = mgr.Lock() |
207 | 206 | if manager: |
— | — | @@ -253,7 +252,7 @@ |
254 | 253 | ''' |
255 | 254 | This is the launcher that uses multiprocesses. |
256 | 255 | ''' |
257 | | - consumers = [models.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
| 256 | + consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
258 | 257 | for x in xrange(settings.number_of_processes): |
259 | 258 | tasks.put(None) |
260 | 259 | |