r80941 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r80941
Date: 23:43, 24 January 2011
Author: diederik
Status: deferred
Tags:
Comment:
Synchronize repo with local machine.
Modified paths:
  • /trunk/tools/editor_trends/analyses/count_editors.py (modified)
  • /trunk/tools/editor_trends/analyses/dataset.py (modified)
  • /trunk/tools/editor_trends/bots/detector.py (modified)
  • /trunk/tools/editor_trends/classes/wikiprojects.py (modified)

Diff

Index: trunk/tools/editor_trends/analyses/count_editors.py
@@ -107,6 +107,7 @@
108108 stopwatch.elapsed()
109109 log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
110110
 111+
111112 def loop_editors(dbname, project, collection, language_code, func, **kwargs):
112113 '''
113114 Generic loop function that loops over all the editors of a Wikipedia project
@@ -119,7 +120,7 @@
120121 print 'Number of editors: %s' % len(editors)
121122 mongo = db.init_mongo_db(dbname)
122123 coll = mongo[collection]
123 - format = kwargs.pop('format')
 124+ format = kwargs.pop('format', 'long')
124125 kwargs['min_year'] = min_year
125126 kwargs['max_year'] = max_year
126127 vars = []
@@ -145,7 +146,7 @@
146147 yearly_edits = editor['edits_by_year']
147148 n = editor['edit_count']
148149
149 - if n >= ds.count.cum_cutoff:
 150+ if n >= var.cum_cutoff:
150151 for i, year in enumerate(xrange(new_wikipedian.year, final_edit.year)):
151152 edits = editor['monthly_edits'].get(str(year), {0:0})
152153 if year == new_wikipedian.year:
@@ -153,7 +154,7 @@
154155 else:
155156 start = 1
156157 for month in xrange(start, 13):
157 - if edits.get(str(month), 0) >= ds.count.cutoff:
 158+ if edits.get(str(month), 0) >= var.cutoff:
158159 experience = i * 12 + (month - new_wikipedian.month)
159160 var.add(new_wikipedian, {experience: 1})
160161 return var
@@ -240,7 +241,7 @@
241242
242243
243244 if __name__ == '__main__':
244 - #generate_chart_data('wiki', 'editors_dataset', 'en',cohort_dataset_forward_histogram, time_unit='month', cutoff=1, cum_cutoff=50)
 245+ generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_histogram, time_unit='month', cutoff=1, cum_cutoff=50)
245246 generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_backward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
246247 generate_chart_data('wiki', 'editors_dataset', 'en', cohort_dataset_forward_bar, time_unit='year', cutoff=0, cum_cutoff=50, format='wide')
247248 #generate_chart_data('wiki', 'editors_dataset','en', histogram_edits, time_unit='year', cutoff=0)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -30,6 +30,7 @@
3131 settings = configuration.Settings()
3232
3333 from utils import file_utils
 34+from utils import data_converter
3435 from database import db
3536
3637 class Transform(SONManipulator):
@@ -46,11 +47,23 @@
4748 def transform_outgoing(self, son, collection):
4849 for (key, value) in son.items():
4950 if isinstance(value, dict):
50 - if "_type" in value and value["_type"] == "custom":
51 - son[key] = decode_custom(value)
 51+ names = value.keys()
 52+ for name in names:
 53+ var = Variable(name, None)
 54+ var.decode(value)
 55+ son['variables'][name] = var
5256 else: # Again, make sure to recurse into sub-docs
53 - son[key] = self.transform_outgoing(value, collection)
54 - return son
 57+ son[key] = value
 58+ name = son.pop('name', None)
 59+ project = son.pop('project', None)
 60+ collection = son.pop('collection', None)
 61+ language_code = son.pop('language_code', None)
 62+ variables = son.pop('variables', [])
 63+ ds = Dataset(name, project, collection, language_code, **son)
 64+ for var in variables:
 65+ var = variables[var]
 66+ ds.add_variable(var)
 67+ return ds
5568
5669
5770 class Data:
@@ -66,10 +79,8 @@
6780 kwargs[key] = d
6881 return kwargs
6982
70 - def convert_seconds_to_date(self, secs):
71 - #return time.gmtime(secs)
72 - return datetime.datetime.fromtimestamp(secs)
7383
 84+
7485 def convert_date_to_epoch(self, date):
7586 assert self.time_unit == 'year' or self.time_unit == 'month' \
7687 or self.time_unit == 'day'
@@ -91,6 +102,7 @@
92103 self.t0 = self.set_start_date(date)
93104 self.t1 = self.set_end_date(date)
94105 self.hash = self.__hash__(date)
 106+ self.date = date
95107 self.data = {}
96108 self._type = 'observation'
97109
@@ -104,6 +116,9 @@
105117 for obs in self.obs:
106118 yield self.obs[obs]
107119
 120+ def __getitem__(self, key):
 121+ return getattr(self, key, [])
 122+
108123 def next(self):
109124 try:
110125 return len(self.data.keys()) + 1
@@ -153,9 +168,10 @@
154169 self.obs = {}
155170 self.time_unit = time_unit
156171 self._type = 'variable'
157 - #self.stats = stats
 172+ self.props = ['name', 'time_unit', '_type']
158173 for kw in kwargs:
159174 setattr(self, kw, kwargs[kw])
 175+ self.props.append(kw)
160176
161177 def __str__(self):
162178 return self.name
@@ -164,7 +180,7 @@
165181 return self.name
166182
167183 def __getitem__(self, key):
168 - return self.obs[key]
 184+ return getattr(self, key, [])
169185
170186 def __iter__(self):
171187 dates = self.obs.keys()
@@ -246,13 +262,30 @@
247263
248264 def encode(self):
249265 bson = {}
250 - for x, obs in enumerate(self):
251 - obs = self[obs]
252 - x = str(x)
253 - bson[x] = obs.encode_to_bson()
254 - #print bson
 266+ for prop in self.props:
 267+ bson[prop] = getattr(self, prop)
 268+
 269+ bson['obs'] = {}
 270+ for obs in self:
 271+ data = self.obs[obs]
 272+ obs = str(obs)
 273+ bson['obs'][obs] = data.encode_to_bson()
255274 return bson
256275
 276+ def decode(self, values):
 277+ for varname in values:
 278+ for prop in values[varname]:
 279+ if isinstance(values[varname][prop], dict):
 280+ data = values[varname][prop]
 281+ for d in data:
 282+ date = data[d]['date']
 283+ obs = data[d]['data']
 284+ self.add(date, obs)
 285+ else:
 286+ setattr(self, prop, values[varname][prop])
 287+ self.props.append(prop)
 288+
 289+
257290 class Dataset:
258291 '''
259292 This class acts as a container for the Variable class and has some methods
@@ -268,98 +301,33 @@
269302 self._type = 'dataset'
270303 self.filename = '%s_%s.csv' % (self.project, self.name)
271304 self.created = datetime.datetime.now()
272 - self.props = self.__dict__.keys()
273 - self.vars = []
274305 for kw in kwargs:
275 - if kw == 'format':
276 - setattr(self, kw, kwargs.get(kw, 'long'))
277 - else:
278 - setattr(self, kw, kwargs[kw])
 306+ setattr(self, kw, kwargs[kw])
 307+ self.props = self.__dict__.keys()
 308+
 309+ self.variables = []
279310 if vars != None:
280311 for kwargs in vars:
281312 name = kwargs.pop('name')
282313 setattr(self, name, Variable(name, **kwargs))
283 - self.vars.append(name)
 314+ self.variables.append(name)
284315
285316 def __repr__(self):
286317 return 'Dataset contains %s variables' % (len(self.vars))
287318
288319 def __iter__(self):
289 - for var in self.vars:
 320+ for var in self.variables:
290321 yield getattr(self, var)
291322
292323 def add_variable(self, var):
293324 if isinstance(var, Variable):
294 - self.vars.append(var.name)
 325+ self.variables.append(var.name)
295326 setattr(self, var.name, var)
296327
297328 else:
298329 raise TypeError('You can only instance of Variable to a dataset.')
299330
300 - def get_all_keys(self, data):
301 - all_keys = []
302 - for d in data:
303 - for key in d:
304 - if key not in all_keys:
305 - all_keys.append(key)
306 - all_keys.sort()
307 - all_keys.insert(0, all_keys[-1])
308 - del all_keys[-1]
309 - return all_keys
310331
311 - def make_data_rectangular(self, data, all_keys):
312 - for i, d in enumerate(data):
313 - for key in all_keys:
314 - if key not in d:
315 - d[key] = 0
316 - data[i] = d
317 - return data
318 -
319 - def sort(self, data, all_keys):
320 - dates = [date['date'] for date in data]
321 - dates.sort()
322 - cube = []
323 - for date in dates:
324 - for i, d in enumerate(data):
325 - if d['date'] == date:
326 - raw_data = d
327 - del data[i]
328 - break
329 - obs = []
330 - for key in all_keys:
331 - obs.append(raw_data[key])
332 - cube.append(obs)
333 - return cube
334 -
335 - def convert_dataset_to_lists(self):
336 - assert self.format == 'long' or self.format == 'wide'
337 - data, all_keys = [], []
338 - for var in self:
339 - for date in var.obs.keys():
340 - datum = var.convert_seconds_to_date(date)
341 - if self.format == 'long':
342 - o = []
343 - else:
344 - o = {}
345 - o['date'] = datum
346 -
347 - for obs in var[date].data:
348 - if self.format == 'long':
349 - o.append([datum, obs, var.obs[date].data[obs]])
350 - data.extend(o)
351 - o = []
352 - else:
353 - o[obs] = var.obs[date].data[obs]
354 - #o.append({obs:var.obs[date].data[obs]})
355 - if self.format == 'wide':
356 - data.append(o)
357 - if self.format == 'wide':
358 - #Make sure that each variable / observation combination exists.
359 - all_keys = self.get_all_keys(data)
360 - data = self.make_data_rectangular(data, all_keys)
361 - data = self.sort(data, all_keys)
362 - return data, all_keys
363 -
364332 def write(self, format='csv'):
365333 if format == 'csv':
366334 self.to_csv()
@@ -371,74 +339,79 @@
372340 mongo = db.init_mongo_db(dbname)
373341 coll = mongo['%s_%s' % (dbname, 'charts')]
374342 mongo.add_son_manipulator(Transform())
375 - #transform = Transform()
376 - #bson = transform.transform_incoming(self, coll)
377 -# coll.update(
378 -# {'$set': {'variables': self}})
379343 coll.remove({'hash':self.hash, 'project':self.project,
380344 'language_code':self.language_code})
381345 coll.insert({'variables': self})
382346
383347 def to_csv(self):
384 -
385 - data, all_keys = self.convert_dataset_to_lists()
386 - headers = self.add_headers(all_keys)
387 - fh = file_utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding)
 348+ data, all_keys = data_converter.convert_dataset_to_lists(self, 'manage')
 349+ headers = data_converter.add_headers(self, all_keys)
 350+ fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding)
388351 file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format)
389352 file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
390353 fh.close()
391354
392 - def add_headers(self, all_keys):
393 - assert self.format == 'long' or self.format == 'wide'
394 - headers = []
395 - if self.format == 'long':
396 - headers.append('date')
397 - for var in self:
398 - if self.format == 'long':
399 - headers.extend([var.time_unit, var.name])
400 - else:
401 - for key in all_keys:
402 - header = '%s_%s' % (key, var.name)
403 - headers.append(header)
404 - return headers
405 -
406355 def encode(self):
407356 props = {}
408357 for prop in self.props:
409358 props[prop] = getattr(self, prop)
410359 return props
411360
412 - def encode_to_bson(self, var):
413 - return {'_type': 'dataset', 'x': var.x()}
414361
415 - def decode_from_bson(self, document):
416 - assert document["_type"] == "custom"
417 - return self(document["x"])
418362
 363+# def transform_to_stacked_bar_json(self):
 364+# '''
 365+# This function outputs data in a format that is understood by jquery
 366+# flot plugin.
 367+# '''
 368+# options = {}
 369+# options['xaxis'] = {}
 370+# options['xaxis']['ticks'] = []
 371+# data = []
 372+# obs, all_keys = ds.convert_dataset_to_lists()
 373+#
 374+# for ob in obs:
 375+# d = {}
 376+# d['label'] = ob[0].year
 377+# d['data'] = []
 378+# ob = ob[1:]
 379+# for x, o in enumerate(ob):
 380+# d['data'].append([x, o])
 381+# data.append(d)
 382+# for x, date in enumerate(obs[0]):
 383+# options['xaxis']['ticks'].append([x, date.year])
 384+#
 385+# return data, options
 386+
 387+
419388 def debug():
420389 mongo = db.init_mongo_db('enwiki')
421 - rawdata = mongo['test']
 390+ rawdata = mongo['enwiki_charts']
422391 mongo.add_son_manipulator(Transform())
423 - d1 = datetime.datetime.today()
424 - d2 = datetime.datetime(2007, 6, 7)
425 - ds = Dataset('test', 'enwiki', 'editors_dataset', [{'name': 'count', 'time_unit': 'year'},
426 - {'name': 'testest', 'time_unit': 'year'}])
427 - ds.count.add(d1, 5)
428 - ds.count.add(d2, 514)
429 - ds.testest.add(d1, 135)
430 - ds.testest.add(d2, 535)
431 - #ds.summary()
432 - #ds.write_to_csv()
433 - v = Variable('test', 'year')
434 - ds.encode()
435 - mongo.test.insert({'variables': ds})
 392+# d1 = datetime.datetime.today()
 393+# d2 = datetime.datetime(2007, 6, 7)
 394+# ds = Dataset('test', 'enwiki', 'editors_dataset', [{'name': 'count', 'time_unit': 'year'},
 395+# {'name': 'testest', 'time_unit': 'year'}])
 396+# ds.count.add(d1, 5)
 397+# ds.count.add(d2, 514)
 398+# ds.testest.add(d1, 135)
 399+# ds.testest.add(d2, 535)
 400+# #ds.summary()
 401+# #ds.write_to_csv()
 402+# v = Variable('test', 'year')
 403+# ds.encode()
 404+# mongo.test.insert({'variables': ds})
436405
437406 #v.add(date , 5)
438407 #o = v.get_observation(date)
439 -
 408+ ds = rawdata.find_one({'project': 'wiki', 'language_code': 'en', 'hash': 'cohort_dataset_backward_bar'})
 409+ transform_to_stacked_bar_json(ds)
440410 #v.summary()
441411 print ds
442412
443413
 414+
 415+
 416+
444417 if __name__ == '__main__':
445418 debug()
Index: trunk/tools/editor_trends/classes/wikiprojects.py
@@ -29,11 +29,11 @@
3030 import datetime
3131 import time
3232 import re
33 -import ordered_dict as odict
 33+sys.path.append('..')
3434
35 -import text_utils
 35+from utils import text_utils
 36+from utils import ordered_dict as odict
3637
37 -
3838 class Wiki:
3939 '''
4040 This class keeps track of the commands issued by the user and is used to
Index: trunk/tools/editor_trends/bots/detector.py
@@ -33,7 +33,7 @@
3434 from utils import file_utils
3535 from utils import messages
3636
37 -from etl import models
 37+from classes import consumers
3838 from classes import bots
3939
4040 import cProfile
@@ -61,7 +61,7 @@
6262 bots = bots.split('|')
6363 for bot in bots:
6464 if bot not in bot_dict:
65 - b = botmodels.Bot(bot)
 65+ b = botconsumers.Bot(bot)
6666 b.id = None
6767 else:
6868 b = bot_dict[bot]
@@ -156,7 +156,7 @@
157157
158158 #print username.encode('utf-8')
159159 if username.find('bot') > -1 or username.find('script') > -1:
160 - bot = bots.get(username, botmodels.Bot(username, verified=False))
 160+ bot = bots.get(username, botconsumers.Bot(username, verified=False))
161161 bot.id = contributor.find('id').text
162162 timestamp = revision.find('timestamp').text
163163 if timestamp != None:
@@ -192,15 +192,14 @@
193193 input_queue = pc.load_queue(files, poison_pill=True)
194194 bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager)
195195 for file in files:
196 - tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys))
 196+ tasks.put(consumers.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys))
197197
198198 else:
199199 output_file = 'bots_predictionset.csv'
200200 files = file_utils.retrieve_file_list(input_xml, 'xml', mask=None)
201 - input_queue = pc.load_queue(files, poison_pill=True)
202201 bots = {}
203202 for file in files:
204 - tasks.put(models.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys))
 203+ tasks.put(consumers.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys))
205204
206205 #lock = mgr.Lock()
207206 if manager:
@@ -253,7 +252,7 @@
254253 '''
255254 This is the launcher that uses multiprocesses.
256255 '''
257 - consumers = [models.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
 256+ consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
258257 for x in xrange(settings.number_of_processes):
259258 tasks.put(None)
260259
