Index: trunk/tools/editor_trends/analyses/plugins/ppi_editor_productivity.py |
— | — | @@ -20,6 +20,9 @@ |
21 | 21 | from datetime import datetime |
22 | 22 | |
23 | 23 | def ppi_editor_productivity(var, editor, **kwargs): |
| 24 | + #print editor |
| 25 | + if editor == None: |
| 26 | + return var |
24 | 27 | new_wikipedian = editor['new_wikipedian'] |
25 | 28 | if not new_wikipedian: |
26 | 29 | return var |
— | — | @@ -27,43 +30,42 @@ |
28 | 31 | edits = editor['character_count'] |
29 | 32 | username = editor['username'] |
30 | 33 | x = 0 |
31 | | - |
32 | 34 | try: |
33 | 35 | added = edits['2010']['11']['0']['added'] |
34 | 36 | x += 1 |
35 | 37 | except KeyError: |
36 | | - added = 0 |
37 | | - try: |
38 | | - removed = edits['2010']['11']['0']['removed'] |
39 | | - x += 1 |
40 | | - except KeyError: |
41 | | - removed = 0 |
| 38 | + added = 0 |
| 39 | +# try: |
| 40 | +# removed = edits['2010']['11']['0']['removed'] |
| 41 | +# x += 1 |
| 42 | +# except KeyError: |
| 43 | +# removed = 0 |
42 | 44 | |
43 | 45 | |
44 | 46 | key = datetime(2010, 11, 30) |
45 | 47 | if added > 0: |
46 | 48 | var.add(key, added, {'username': username, 'added': 'added'}) |
47 | | - if removed > 0: |
48 | | - var.add(key, removed, {'username': username, 'removed': 'removed'}) |
49 | | - var.add(key, x, {'username': username, 'total': 'total'}) |
| 49 | +# if removed > 0: |
| 50 | +# var.add(key, removed, {'username': username, 'removed': 'removed'}) |
| 51 | +# var.add(key, x, {'username': username, 'total': 'total'}) |
50 | 52 | |
51 | 53 | y = 0 |
52 | 54 | try: |
53 | 55 | added = edits['2010']['12']['0']['added'] |
54 | 56 | y += 1 |
55 | 57 | except KeyError: |
56 | | - added = 0 |
57 | | - try: |
58 | | - removed = edits['2010']['12']['0']['removed'] |
59 | | - y += 1 |
60 | | - except KeyError: |
61 | | - removed = 0 |
| 58 | + added = 0 |
| 59 | +# try: |
| 60 | +# removed = edits['2010']['12']['0']['removed'] |
| 61 | +# y += 1 |
| 62 | +# except KeyError: |
| 63 | +# removed = 0 |
62 | 64 | |
63 | 65 | key = datetime(2010, 12, 31) |
64 | 66 | if added > 0: |
65 | 67 | var.add(key, added, {'username': username, 'added': 'added'}) |
66 | | - if removed > 0: |
67 | | - var.add(key, removed, {'username': username, 'removed': 'removed'}) |
68 | | - var.add(key, y, {'username': username, 'total': 'total'}) |
| 68 | +# if removed > 0: |
| 69 | +# var.add(key, removed, {'username': username, 'removed': 'removed'}) |
| 70 | +# var.add(key, y, {'username': username, 'total': 'total'}) |
69 | 71 | |
70 | 72 | return var |
Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py |
— | — | @@ -70,7 +70,7 @@ |
71 | 71 | Loader function to retrieve list of id's of known Wikipedia bots. |
72 | 72 | ''' |
73 | 73 | bots = [] |
74 | | - db = storage.Database(db_type, 'bots', 'ids') |
| 74 | + db = storage.init_database(db_type, 'bots', 'ids') |
75 | 75 | cursor = db.find() |
76 | 76 | for bot in cursor: |
77 | 77 | if bot['verified'] == 'True' and language_code in bot['projects']: |
— | — | @@ -88,7 +88,7 @@ |
89 | 89 | 'bots_ids.csv', |
90 | 90 | 'utf-8', |
91 | 91 | keys) |
92 | | - db = storage.Database(rts.storage, 'wikilytics', 'bots') |
| 92 | + db = storage.init_database(rts.storage, 'wikilytics', 'bots') |
93 | 93 | db.drop_collection() |
94 | 94 | for id in bots: |
95 | 95 | bot = bots[id] |
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py |
— | — | @@ -42,7 +42,7 @@ |
43 | 43 | |
44 | 44 | |
45 | 45 | def create_edgelist(project, collection): |
46 | | - db = storage.Database(rts.storage, project, collection) |
| 46 | + db = storage.init_database(rts.storage, project, collection) |
47 | 47 | ids = db.retrieve_distinct_keys('editor') |
48 | 48 | ids.sort() |
49 | 49 | fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8') |
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -17,7 +17,7 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | - |
| 21 | +import cProfile |
22 | 22 | import os |
23 | 23 | import logging |
24 | 24 | import logging.handlers |
— | — | @@ -337,7 +337,7 @@ |
338 | 338 | stopwatch = timer.Timer() |
339 | 339 | log.to_db(rts, 'dataset', 'transform', stopwatch, event='start') |
340 | 340 | log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher) |
341 | | - transformer.transform_editors_single_launcher(rts) |
| 341 | + transformer.transform_editors_multi_launcher(rts) |
342 | 342 | stopwatch.elapsed() |
343 | 343 | log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish') |
344 | 344 | log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher) |
— | — | @@ -353,6 +353,7 @@ |
354 | 354 | log.to_db(rts, 'dataset', 'export', stopwatch, event='start') |
355 | 355 | |
356 | 356 | for plugin in rts.plugins: |
| 357 | + #cProfile.runctx('analyzer.generate_chart_data(rts, plugin, **rts.keywords)', globals(), locals(), filename="analyzer.cprof") |
357 | 358 | analyzer.generate_chart_data(rts, plugin, **rts.keywords) |
358 | 359 | log.to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher, |
359 | 360 | plugin=plugin, |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -75,30 +75,26 @@ |
76 | 76 | Prepare a single line to store in the database, this entails converting |
77 | 77 | to proper variable and taking care of the encoding. |
78 | 78 | ''' |
79 | | - try: |
80 | | - article_id = int(line[1]) |
81 | | - username = line[3].encode('utf-8') |
82 | | - ns = int(line[4]) |
83 | | - date = text_utils.convert_timestamp_to_datetime_utc(line[6]) |
84 | | - md5 = line[7] |
85 | | - revert = int(line[8]) |
86 | | - bot = int(line[9]) |
87 | | - cur_size = int(line[10]) |
88 | | - delta = int(line[11]) |
| 79 | + article_id = int(line[1]) |
| 80 | + username = line[3].encode('utf-8') |
| 81 | + ns = int(line[4]) |
| 82 | + date = text_utils.convert_timestamp_to_datetime_utc(line[6]) |
| 83 | + md5 = line[7] |
| 84 | + revert = int(line[8]) |
| 85 | + bot = int(line[9]) |
| 86 | + cur_size = int(line[10]) |
| 87 | + delta = int(line[11]) |
89 | 88 | |
90 | | - data = {'date': date, |
91 | | - 'article': article_id, |
92 | | - 'username': username, |
93 | | - 'ns': ns, |
94 | | - 'hash': md5, |
95 | | - 'revert':revert, |
96 | | - 'cur_size':cur_size, |
97 | | - 'delta':delta, |
98 | | - 'bot':bot |
99 | | - } |
100 | | - except: |
101 | | - print line, len(line) |
102 | | - return {} |
| 89 | + data = {'date': date, |
| 90 | + 'article': article_id, |
| 91 | + 'username': username, |
| 92 | + 'ns': ns, |
| 93 | + 'hash': md5, |
| 94 | + 'revert':revert, |
| 95 | + 'cur_size':cur_size, |
| 96 | + 'delta':delta, |
| 97 | + 'bot':bot |
| 98 | + } |
103 | 99 | return data |
104 | 100 | |
105 | 101 | |
— | — | @@ -200,17 +196,15 @@ |
201 | 197 | storer.start() |
202 | 198 | |
203 | 199 | ppills = rts.number_of_processes |
204 | | - while True: |
205 | | - while ppills > 0: |
206 | | - try: |
207 | | - res = result.get(block=False) |
208 | | - if res == True: |
209 | | - pbar.update(pbar.currval + 1) |
210 | | - else: |
211 | | - ppills -= 1 |
212 | | - except Empty: |
213 | | - pass |
214 | | - break |
| 200 | + while ppills > 0: |
| 201 | + try: |
| 202 | + res = result.get(block=False) |
| 203 | + if res == True: |
| 204 | + pbar.update(pbar.currval + 1) |
| 205 | + else: |
| 206 | + ppills -= 1 |
| 207 | + except Empty: |
| 208 | + pass |
215 | 209 | |
216 | 210 | tasks.join() |
217 | 211 | print '\nCreating indexes...' |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -20,41 +20,49 @@ |
21 | 21 | import multiprocessing |
22 | 22 | from operator import itemgetter |
23 | 23 | from copy import deepcopy |
| 24 | +from Queue import Empty |
24 | 25 | |
25 | 26 | import progressbar |
26 | 27 | from classes import storage |
27 | 28 | from utils import file_utils |
28 | | -from utils import messages |
29 | 29 | from utils import data_converter |
30 | 30 | from classes import consumers |
| 31 | +from classes import queue |
31 | 32 | |
32 | 33 | |
| 34 | +class EditorDatabase(object): |
| 35 | + def __init__(self, rts, tasks, result): |
| 36 | + self.db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw) |
| 37 | + self.db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
| 38 | + |
33 | 39 | class EditorConsumer(consumers.BaseConsumer): |
34 | 40 | ''' |
35 | 41 | A simple class takes care of fetching an editor from the queue and start |
36 | 42 | processing its edits. |
37 | 43 | ''' |
38 | | - def __init__(self, rts, tasks, db_raw, db_dataset): |
39 | | - super(EditorConsumer, self).__init__(rts, tasks) |
40 | | - self.db_raw = db_raw |
41 | | - self.db_dataset = db_dataset |
| 44 | + def __init__(self, rts, tasks, result): |
| 45 | + super(EditorConsumer, self).__init__(rts, tasks, result) |
| 46 | + self.db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw) |
| 47 | + self.db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
| 48 | + self.rts = rts |
42 | 49 | |
43 | 50 | def run(self): |
44 | 51 | while True: |
45 | | - editor = self.tasks.get() |
| 52 | + editor_id = self.tasks.get() |
46 | 53 | self.tasks.task_done() |
47 | | - print '%s editors to go...' % messages.show(self.tasks.qsize) |
48 | | - if editor == None: |
| 54 | + if editor_id == None: |
49 | 55 | break |
50 | | - editor = Editor(self.db_raw, self.db_dataset, editor) |
| 56 | + editor = Editor(self.rts, editor_id, self.db_raw, self.db_dataset) |
51 | 57 | editor() |
| 58 | + self.result.put(True) |
52 | 59 | |
53 | 60 | |
54 | 61 | class Editor: |
55 | | - def __init__(self, db_raw, db_dataset, editor_id): |
| 62 | + def __init__(self, rts, editor_id, db_raw, db_dataset): |
56 | 63 | self.editor_id = editor_id |
57 | | - self.db_raw = db_raw #storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_raw) |
58 | | - self.db_dataset = db_dataset #storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset) |
| 64 | + self.db_raw = db_raw |
| 65 | + self.db_dataset = db_dataset |
| 66 | + self.rts = rts |
59 | 67 | self.cutoff = 9 |
60 | 68 | |
61 | 69 | def __str__(self): |
— | — | @@ -87,13 +95,14 @@ |
88 | 96 | totals = calculate_totals(totals, counts, article_count, 'article_count') |
89 | 97 | totals = calculate_totals(totals, counts, edit_count, 'edit_count') |
90 | 98 | |
| 99 | + cum_edit_count_main_ns, cum_edit_count_other_ns = calculate_cum_edits(edits) |
| 100 | + |
| 101 | + edits = sort_edits(edits) |
91 | 102 | if len(edits) > self.cutoff: |
92 | 103 | new_wikipedian = edits[self.cutoff]['date'] |
93 | 104 | else: |
94 | 105 | new_wikipedian = False |
95 | | - cum_edit_count_main_ns, cum_edit_count_other_ns = calculate_cum_edits(edits) |
96 | 106 | |
97 | | - edits = sort_edits(edits) |
98 | 107 | first_edit = edits[0]['date'] |
99 | 108 | final_edit = edits[-1]['date'] |
100 | 109 | |
— | — | @@ -309,6 +318,14 @@ |
310 | 319 | return sorted(edits, key=itemgetter('date')) |
311 | 320 | |
312 | 321 | |
| 322 | +def add_indexes(rts): |
| 323 | + db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
| 324 | + print '\nCreating indexes...' |
| 325 | + db_dataset.add_index('editor') |
| 326 | + db_dataset.add_index('new_wikipedian') |
| 327 | + print 'Finished creating indexes...' |
| 328 | + |
| 329 | + |
313 | 330 | def setup_database(rts): |
314 | 331 | ''' |
315 | 332 | Initialize the database, including setting indexes and dropping the older |
— | — | @@ -318,12 +335,16 @@ |
319 | 336 | db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
320 | 337 | db_dataset.drop_collection() |
321 | 338 | editors = db_raw.retrieve_editors() |
322 | | - return db_raw, db_dataset, editors |
| 339 | + return editors |
323 | 340 | |
324 | 341 | |
325 | 342 | def transform_editors_multi_launcher(rts): |
326 | | - db_raw, db_dataset, editors = setup_database(rts) |
327 | | - transformers = [EditorConsumer(rts, editors, db_raw, db_dataset) for i in xrange(rts.number_of_processes)] |
| 343 | + editors = setup_database(rts) |
| 344 | + n = editors.size() |
| 345 | + result = queue.JoinableRetryQueue() |
| 346 | + pbar = progressbar.ProgressBar(maxval=n).start() |
| 347 | + transformers = [EditorConsumer(rts, editors, result) \ |
| 348 | + for i in xrange(rts.number_of_processes)] |
328 | 349 | |
329 | 350 | |
330 | 351 | for x in xrange(rts.number_of_processes): |
— | — | @@ -332,16 +353,24 @@ |
333 | 354 | for transformer in transformers: |
334 | 355 | transformer.start() |
335 | 356 | |
| 357 | + while n > 0: |
| 358 | + try: |
| 359 | + res = result.get(block=False) |
| 360 | + if res == True: |
| 361 | + pbar.update(pbar.currval + 1) |
| 362 | + n -= 1 |
| 363 | + except Empty: |
| 364 | + pass |
| 365 | + |
336 | 366 | editors.join() |
| 367 | + add_indexes(rts) |
337 | 368 | |
338 | | - db_dataset.add_index('editor') |
339 | | - db_dataset.add_index('new_wikipedian') |
340 | 369 | |
341 | 370 | |
342 | 371 | def transform_editors_single_launcher(rts): |
343 | 372 | print rts.dbname, rts.editors_raw |
344 | | - db_raw, db_dataset, editors = setup_database(rts) |
345 | | - n = db_raw.count() |
| 373 | + editors = setup_database(rts) |
| 374 | + n = editors.size() |
346 | 375 | pbar = progressbar.ProgressBar(maxval=n).start() |
347 | 376 | |
348 | 377 | for x in xrange(rts.number_of_processes): |
— | — | @@ -352,13 +381,12 @@ |
353 | 382 | editors.task_done() |
354 | 383 | if editor == None: |
355 | 384 | break |
356 | | - editor = Editor(db_raw, db_dataset, editor) |
| 385 | + editor = Editor(rts, editor) |
357 | 386 | editor() |
358 | 387 | |
359 | 388 | pbar.update(pbar.currval + 1) |
360 | 389 | |
361 | | - db_dataset.add_index('editor') |
362 | | - db_dataset.add_index('new_wikipedian') |
| 390 | + add_indexes(rts) |
363 | 391 | |
364 | 392 | |
365 | 393 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -135,7 +135,7 @@ |
136 | 136 | ''' |
137 | 137 | plugin = self.get_value('charts') |
138 | 138 | requested_plugins = [] |
139 | | - if plugin != None and getattr(plugin, 'func_name', None) == None: |
| 139 | + if plugin != None and not isinstance(plugin, str): |
140 | 140 | plugins = plugin.split(',') |
141 | 141 | available_plugins = inventory.available_analyses() |
142 | 142 | for plugin in plugins: |
— | — | @@ -143,8 +143,8 @@ |
144 | 144 | raise exceptions.UnknownPluginError(plugin, available_plugins) |
145 | 145 | else: |
146 | 146 | requested_plugins.append(plugin) |
147 | | - elif getattr(plugin, 'func_name', None) != None: |
148 | | - requested_plugins.append(plugin.func_name) |
| 147 | + elif isinstance(plugin, str): |
| 148 | + requested_plugins.append(plugin) |
149 | 149 | return requested_plugins |
150 | 150 | |
151 | 151 | def set_input_location(self): |
Index: trunk/tools/editor_trends/classes/queue.py |
— | — | @@ -37,7 +37,7 @@ |
38 | 38 | def get(self, block=True, timeout=None): |
39 | 39 | return retry_on_eintr(Queue.get, self, block, timeout) |
40 | 40 | |
41 | | - def qsize(self): |
| 41 | + def size(self): |
42 | 42 | try: |
43 | 43 | return self.qsize() |
44 | 44 | except: |
— | — | @@ -50,9 +50,10 @@ |
51 | 51 | def get(self, block=True, timeout=None): |
52 | 52 | return retry_on_eintr(Queue.get, self, block, timeout) |
53 | 53 | |
54 | | - def qsize(self): |
| 54 | + def size(self): |
55 | 55 | try: |
56 | 56 | return self.qsize() |
57 | | - except: |
| 57 | + except Exception, error: |
| 58 | + print error |
58 | 59 | #OSX does not support the qsize function so we return unknown |
59 | 60 | return 'unknown' |
Index: trunk/tools/editor_trends/classes/storage.py |
— | — | @@ -212,7 +212,8 @@ |
213 | 213 | reducer = Code("function()") |
214 | 214 | |
215 | 215 | ids = [] |
216 | | - cursor = self.db[self.collection].map_reduce(mapper, reducer) |
| 216 | + collection = '%s_%s' % (self.dbname, 'mapreduce_editors') |
| 217 | + cursor = self.db[self.collection].map_reduce(mapper, reducer, collection) |
217 | 218 | for c in cursor.find(): |
218 | 219 | ids.append(c['_id']) |
219 | 220 | return ids |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -445,7 +445,6 @@ |
446 | 446 | attrs) |
447 | 447 | return filename |
448 | 448 | |
449 | | - |
450 | 449 | def add_variable(self, variables): |
451 | 450 | ''' |
452 | 451 | Call this function to add a Variable to a dataset. |
— | — | @@ -535,7 +534,7 @@ |
536 | 535 | print self.details() |
537 | 536 | |
538 | 537 | def get_standard_deviation(number_list): |
539 | | - '''Given a list of numbers, calculate the standard devition of the list''' |
| 538 | + '''Given a list of numbers, calculate the standard deviation of the list''' |
540 | 539 | mean = get_mean(number_list) |
541 | 540 | std = 0 |
542 | 541 | n = len(number_list) |
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -86,7 +86,7 @@ |
87 | 87 | This is the main entry point, it creates a queue with jobs and determines |
88 | 88 | the type of job and fires it off |
89 | 89 | ''' |
90 | | - db = storage.Database(rts.storage, 'wikilytics', 'jobs') |
| 90 | + db = storage.init_database(rts.storage, 'wikilytics', 'jobs') |
91 | 91 | tasks = [] |
92 | 92 | project, language, parser = manager.init_args_parser() |
93 | 93 | args = parser.parse_args(['django']) |
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
94 | 94 | - wikistats |
zips |
notes.txt |
*.pyc |
*.xml |
*.db |
*.bin |
*.zip |
*.csv |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |
data |
libs |
95 | 95 | + wikistats |
zips |
notes.txt |
*.pyc |
*.xml |
*.db |
*.bin |
*.zip |
*.csv |
datasets |
errors |
.settings |
.project |
.pydevproject |
wiki.cfg |
fabric.py |
fabfile.py |
deployment |
data |
libs |
code-snippets |