r87049 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r87048 | r87049 | r87050 >
Date: 21:54, 27 April 2011
Author: diederik
Status: deferred
Tags:
Comment:
Updated files to start ml hunt.
Modified paths:
  • /trunk/tools/editor_trends (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/bot_detector.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/community_graph.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py (modified) (history)
  • /trunk/tools/editor_trends/classes/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/classes/queue.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/storage.py (modified) (history)
  • /trunk/tools/editor_trends/cronjobs.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py
@@ -20,6 +20,9 @@
2121 from datetime import datetime
2222
2323 def ppi_editor_productivity(var, editor, **kwargs):
 24+ #print editor
 25+ if editor == None:
 26+ return var
2427 new_wikipedian = editor['new_wikipedian']
2528 if not new_wikipedian:
2629 return var
@@ -27,43 +30,42 @@
2831 edits = editor['character_count']
2932 username = editor['username']
3033 x = 0
31 -
3234 try:
3335 added = edits['2010']['11']['0']['added']
3436 x += 1
3537 except KeyError:
36 - added = 0
37 - try:
38 - removed = edits['2010']['11']['0']['removed']
39 - x += 1
40 - except KeyError:
41 - removed = 0
 38+ added = 2
 39+# try:
 40+# removed = edits['2010']['11']['0']['removed']
 41+# x += 1
 42+# except KeyError:
 43+# removed = 0
4244
4345
4446 key = datetime(2010, 11, 30)
4547 if added > 0:
4648 var.add(key, added, {'username': username, 'added': 'added'})
47 - if removed > 0:
48 - var.add(key, removed, {'username': username, 'removed': 'removed'})
49 - var.add(key, x, {'username': username, 'total': 'total'})
 49+# if removed > 0:
 50+# var.add(key, removed, {'username': username, 'removed': 'removed'})
 51+# var.add(key, x, {'username': username, 'total': 'total'})
5052
5153 y = 0
5254 try:
5355 added = edits['2010']['12']['0']['added']
5456 y += 1
5557 except KeyError:
56 - added = 0
57 - try:
58 - removed = edits['2010']['12']['0']['removed']
59 - y += 1
60 - except KeyError:
61 - removed = 0
 58+ added = 4
 59+# try:
 60+# removed = edits['2010']['12']['0']['removed']
 61+# y += 1
 62+# except KeyError:
 63+# removed = 0
6264
6365 key = datetime(2010, 12, 31)
6466 if added > 0:
6567 var.add(key, added, {'username': username, 'added': 'added'})
66 - if removed > 0:
67 - var.add(key, removed, {'username': username, 'removed': 'removed'})
68 - var.add(key, y, {'username': username, 'total': 'total'})
 68+# if removed > 0:
 69+# var.add(key, removed, {'username': username, 'removed': 'removed'})
 70+# var.add(key, y, {'username': username, 'total': 'total'})
6971
7072 return var
Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
@@ -70,7 +70,7 @@
7171 Loader function to retrieve list of id's of known Wikipedia bots.
7272 '''
7373 bots = []
74 - db = storage.Database(db_type, 'bots', 'ids')
 74+ db = storage.init_database(db_type, 'bots', 'ids')
7575 cursor = db.find()
7676 for bot in cursor:
7777 if bot['verified'] == 'True' and language_code in bot['projects']:
@@ -88,7 +88,7 @@
8989 'bots_ids.csv',
9090 'utf-8',
9191 keys)
92 - db = storage.Database(rts.storage, 'wikilytics', 'bots')
 92+ db = storage.init_database(rts.storage, 'wikilytics', 'bots')
9393 db.drop_collection()
9494 for id in bots:
9595 bot = bots[id]
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
@@ -42,7 +42,7 @@
4343
4444
4545 def create_edgelist(project, collection):
46 - db = storage.Database(rts.storage, project, collection)
 46+ db = storage.init_database(rts.storage, project, collection)
4747 ids = db.retrieve_distinct_keys('editor')
4848 ids.sort()
4949 fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8')
Index: trunk/tools/editor_trends/manage.py
@@ -17,7 +17,7 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
21 -
 21+import cProfile
2222 import os
2323 import logging
2424 import logging.handlers
@@ -337,7 +337,7 @@
338338 stopwatch = timer.Timer()
339339 log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
340340 log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
341 - transformer.transform_editors_single_launcher(rts)
 341+ transformer.transform_editors_multi_launcher(rts)
342342 stopwatch.elapsed()
343343 log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
344344 log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
@@ -353,6 +353,7 @@
354354 log.to_db(rts, 'dataset', 'export', stopwatch, event='start')
355355
356356 for plugin in rts.plugins:
 357+ #cProfile.runctx('analyzer.generate_chart_data(rts, plugin, **rts.keywords)', globals(), locals(), filename="analyzer.cprof")
357358 analyzer.generate_chart_data(rts, plugin, **rts.keywords)
358359 log.to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher,
359360 plugin=plugin,
Index: trunk/tools/editor_trends/etl/store.py
@@ -75,30 +75,26 @@
7676 Prepare a single line to store in the database, this entails converting
7777 to proper variable and taking care of the encoding.
7878 '''
79 - try:
80 - article_id = int(line[1])
81 - username = line[3].encode('utf-8')
82 - ns = int(line[4])
83 - date = text_utils.convert_timestamp_to_datetime_utc(line[6])
84 - md5 = line[7]
85 - revert = int(line[8])
86 - bot = int(line[9])
87 - cur_size = int(line[10])
88 - delta = int(line[11])
 79+ article_id = int(line[1])
 80+ username = line[3].encode('utf-8')
 81+ ns = int(line[4])
 82+ date = text_utils.convert_timestamp_to_datetime_utc(line[6])
 83+ md5 = line[7]
 84+ revert = int(line[8])
 85+ bot = int(line[9])
 86+ cur_size = int(line[10])
 87+ delta = int(line[11])
8988
90 - data = {'date': date,
91 - 'article': article_id,
92 - 'username': username,
93 - 'ns': ns,
94 - 'hash': md5,
95 - 'revert':revert,
96 - 'cur_size':cur_size,
97 - 'delta':delta,
98 - 'bot':bot
99 - }
100 - except:
101 - print line, len(line)
102 - return {}
 89+ data = {'date': date,
 90+ 'article': article_id,
 91+ 'username': username,
 92+ 'ns': ns,
 93+ 'hash': md5,
 94+ 'revert':revert,
 95+ 'cur_size':cur_size,
 96+ 'delta':delta,
 97+ 'bot':bot
 98+ }
10399 return data
104100
105101
@@ -200,17 +196,15 @@
201197 storer.start()
202198
203199 ppills = rts.number_of_processes
204 - while True:
205 - while ppills > 0:
206 - try:
207 - res = result.get(block=False)
208 - if res == True:
209 - pbar.update(pbar.currval + 1)
210 - else:
211 - ppills -= 1
212 - except Empty:
213 - pass
214 - break
 200+ while ppills > 0:
 201+ try:
 202+ res = result.get(block=False)
 203+ if res == True:
 204+ pbar.update(pbar.currval + 1)
 205+ else:
 206+ ppills -= 1
 207+ except Empty:
 208+ pass
215209
216210 tasks.join()
217211 print '\nCreating indexes...'
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -20,41 +20,49 @@
2121 import multiprocessing
2222 from operator import itemgetter
2323 from copy import deepcopy
 24+from Queue import Empty
2425
2526 import progressbar
2627 from classes import storage
2728 from utils import file_utils
28 -from utils import messages
2929 from utils import data_converter
3030 from classes import consumers
 31+from classes import queue
3132
3233
 34+class EditorDatabase(object):
 35+ def __init__(self, rts, tasks, result):
 36+ self.db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
 37+ self.db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
 38+
3339 class EditorConsumer(consumers.BaseConsumer):
3440 '''
3541 A simple class takes care of fetching an editor from the queue and start
3642 processing its edits.
3743 '''
38 - def __init__(self, rts, tasks, db_raw, db_dataset):
39 - super(EditorConsumer, self).__init__(rts, tasks)
40 - self.db_raw = db_raw
41 - self.db_dataset = db_dataset
 44+ def __init__(self, rts, tasks, result):
 45+ super(EditorConsumer, self).__init__(rts, tasks, result)
 46+ self.db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
 47+ self.db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
 48+ self.rts = rts
4249
4350 def run(self):
4451 while True:
45 - editor = self.tasks.get()
 52+ editor_id = self.tasks.get()
4653 self.tasks.task_done()
47 - print '%s editors to go...' % messages.show(self.tasks.qsize)
48 - if editor == None:
 54+ if editor_id == None:
4955 break
50 - editor = Editor(self.db_raw, self.db_dataset, editor)
 56+ editor = Editor(self.rts, editor_id, self.db_raw, self.db_dataset)
5157 editor()
 58+ self.result.put(True)
5259
5360
5461 class Editor:
55 - def __init__(self, db_raw, db_dataset, editor_id):
 62+ def __init__(self, rts, editor_id, db_raw, db_dataset):
5663 self.editor_id = editor_id
57 - self.db_raw = db_raw #storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_raw)
58 - self.db_dataset = db_dataset #storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset)
 64+ self.db_raw = db_raw
 65+ self.db_dataset = db_dataset
 66+ self.rts = rts
5967 self.cutoff = 9
6068
6169 def __str__(self):
@@ -87,13 +95,14 @@
8896 totals = calculate_totals(totals, counts, article_count, 'article_count')
8997 totals = calculate_totals(totals, counts, edit_count, 'edit_count')
9098
 99+ cum_edit_count_main_ns, cum_edit_count_other_ns = calculate_cum_edits(edits)
 100+
 101+ edits = sort_edits(edits)
91102 if len(edits) > self.cutoff:
92103 new_wikipedian = edits[self.cutoff]['date']
93104 else:
94105 new_wikipedian = False
95 - cum_edit_count_main_ns, cum_edit_count_other_ns = calculate_cum_edits(edits)
96106
97 - edits = sort_edits(edits)
98107 first_edit = edits[0]['date']
99108 final_edit = edits[-1]['date']
100109
@@ -309,6 +318,14 @@
310319 return sorted(edits, key=itemgetter('date'))
311320
312321
 322+def add_indexes(rts):
 323+ db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
 324+ print '\nCreating indexes...'
 325+ db_dataset.add_index('editor')
 326+ db_dataset.add_index('new_wikipedian')
 327+ print 'Finished creating indexes...'
 328+
 329+
313330 def setup_database(rts):
314331 '''
315332 Initialize the database, including setting indexes and dropping the older
@@ -318,12 +335,16 @@
319336 db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
320337 db_dataset.drop_collection()
321338 editors = db_raw.retrieve_editors()
322 - return db_raw, db_dataset, editors
 339+ return editors
323340
324341
325342 def transform_editors_multi_launcher(rts):
326 - db_raw, db_dataset, editors = setup_database(rts)
327 - transformers = [EditorConsumer(rts, editors, db_raw, db_dataset) for i in xrange(rts.number_of_processes)]
 343+ editors = setup_database(rts)
 344+ n = editors.size()
 345+ result = queue.JoinableRetryQueue()
 346+ pbar = progressbar.ProgressBar(maxval=n).start()
 347+ transformers = [EditorConsumer(rts, editors, result) \
 348+ for i in xrange(rts.number_of_processes)]
328349
329350
330351 for x in xrange(rts.number_of_processes):
@@ -332,16 +353,24 @@
333354 for transformer in transformers:
334355 transformer.start()
335356
 357+ while n > 0:
 358+ try:
 359+ res = result.get(block=False)
 360+ if res == True:
 361+ pbar.update(pbar.currval + 1)
 362+ n -= 1
 363+ except Empty:
 364+ pass
 365+
336366 editors.join()
 367+ add_indexes(rts)
337368
338 - db_dataset.add_index('editor')
339 - db_dataset.add_index('new_wikipedian')
340369
341370
342371 def transform_editors_single_launcher(rts):
343372 print rts.dbname, rts.editors_raw
344 - db_raw, db_dataset, editors = setup_database(rts)
345 - n = db_raw.count()
 373+ editors = setup_database(rts)
 374+ n = editors.size()
346375 pbar = progressbar.ProgressBar(maxval=n).start()
347376
348377 for x in xrange(rts.number_of_processes):
@@ -352,13 +381,12 @@
353382 editors.task_done()
354383 if editor == None:
355384 break
356 - editor = Editor(db_raw, db_dataset, editor)
 385+ editor = Editor(rts, editor)
357386 editor()
358387
359388 pbar.update(pbar.currval + 1)
360389
361 - db_dataset.add_index('editor')
362 - db_dataset.add_index('new_wikipedian')
 390+ add_indexes(rts)
363391
364392
365393 if __name__ == '__main__':
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -135,7 +135,7 @@
136136 '''
137137 plugin = self.get_value('charts')
138138 requested_plugins = []
139 - if plugin != None and getattr(plugin, 'func_name', None) == None:
 139+ if plugin != None and isinstance(plugin, type('module')) == False:
140140 plugins = plugin.split(',')
141141 available_plugins = inventory.available_analyses()
142142 for plugin in plugins:
@@ -143,8 +143,8 @@
144144 raise exceptions.UnknownPluginError(plugin, available_plugins)
145145 else:
146146 requested_plugins.append(plugin)
147 - elif getattr(plugin, 'func_name', None) != None:
148 - requested_plugins.append(plugin.func_name)
 147+ elif isinstance(plugin, type('module')) != None:
 148+ requested_plugins.append(plugin)
149149 return requested_plugins
150150
151151 def set_input_location(self):
Index: trunk/tools/editor_trends/classes/queue.py
@@ -37,7 +37,7 @@
3838 def get(self, block=True, timeout=None):
3939 return retry_on_eintr(Queue.get, self, block, timeout)
4040
41 - def qsize(self):
 41+ def size(self):
4242 try:
4343 return self.qsize()
4444 except:
@@ -50,9 +50,10 @@
5151 def get(self, block=True, timeout=None):
5252 return retry_on_eintr(Queue.get, self, block, timeout)
5353
54 - def qsize(self):
 54+ def size(self):
5555 try:
5656 return self.qsize()
57 - except:
 57+ except Exception, error:
 58+ print error
5859 #OSX does not support the qsize function so we return unknown
5960 return 'unknown'
Index: trunk/tools/editor_trends/classes/storage.py
@@ -212,7 +212,8 @@
213213 reducer = Code("function()")
214214
215215 ids = []
216 - cursor = self.db[self.collection].map_reduce(mapper, reducer)
 216+ collection = '%s_%s' % (self.dbname, 'mapreduce_editors')
 217+ cursor = self.db[self.collection].map_reduce(mapper, reducer, collection)
217218 for c in cursor.find():
218219 ids.append(c['_id'])
219220 return ids
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -445,7 +445,6 @@
446446 attrs)
447447 return filename
448448
449 -
450449 def add_variable(self, variables):
451450 '''
452451 Call this function to add a Variable to a dataset.
@@ -535,7 +534,7 @@
536535 print self.details()
537536
538537 def get_standard_deviation(number_list):
539 - '''Given a list of numbers, calculate the standard devition of the list'''
 538+ '''Given a list of numbers, calculate the standard deviation of the list'''
540539 mean = get_mean(number_list)
541540 std = 0
542541 n = len(number_list)
Index: trunk/tools/editor_trends/cronjobs.py
@@ -86,7 +86,7 @@
8787 This is the main entry point, it creates a queue with jobs and determines
8888 the type of job and fires it off
8989 '''
90 - db = storage.Database(rts.storage, 'wikilytics', 'jobs')
 90+ db = storage.init_database(rts.storage, 'wikilytics', 'jobs')
9191 tasks = []
9292 project, language, parser = manager.init_args_parser()
9393 args = parser.parse_args(['django'])
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Modified: svn:ignore
9494 - wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
libs
9595 + wikistats
zips
notes.txt
*.pyc
*.xml
*.db
*.bin
*.zip
*.csv
datasets
errors
.settings
.project
.pydevproject
wiki.cfg
fabric.py
fabfile.py
deployment
data
libs
code-snippets