r86522 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r86521‎ | r86522 | r86523 >
Date:18:40, 20 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Implemented pylint suggestions
Modified paths:
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/transformer.py
@@ -17,10 +17,7 @@
1818 __date__ = '2010-11-02'
1919 __version__ = '0.1'
2020
21 -import sys
22 -import datetime
2321 import multiprocessing
24 -from Queue import Empty
2522 from operator import itemgetter
2623 from copy import deepcopy
2724
@@ -31,7 +28,14 @@
3229 from utils import data_converter
3330 from classes import consumers
3431
 32+
3533 class EditorConsumer(consumers.BaseConsumer):
 34+ '''
 35+ A simple class that takes care of fetching an editor from the queue and starts
 36+ processing its edits.
 37+ '''
 38+ def __init__(self):
 39+ super(EditorConsumer, self).__init__()
3640
3741 def run(self):
3842 while True:
@@ -43,9 +47,9 @@
4448 new_editor()
4549
4650
47 -class Editor(object):
48 - def __init__(self, id, db_raw, db_dataset, **kwargs):
49 - self.id = id
 51+class Editor:
 52+ def __init__(self, editor_id, db_raw, db_dataset, **kwargs):
 53+ self.id = editor_id
5054 self.db_raw = db_raw
5155 self.db_dataset = db_dataset
5256 for kw in kwargs:
@@ -121,6 +125,13 @@
122126
123127
124128 def calculate_totals(totals, counts, dc, var):
 129+ '''
 130+ So far, counting a variable for an editor happens per month per year but
 131+ this makes it cumbersome to determine how many edits an editor has made in
 132+ a single year (you need to iterate over all the months and that can become
 133+ quite expensive when you have 10000s of editors). Hence, this little helper
 134+ function counts the total number of actions on a yearly basis.
 135+ '''
125136 cnts = deepcopy(counts)
126137 totals.setdefault(var, {})
127138 for year in dc:
@@ -140,6 +151,9 @@
141152
142153
143154 def determine_number_edits(edits, first_year, final_year):
 155+ '''
 156+ This function counts the number of edits per namespace per month per year.
 157+ '''
144158 dc = data_converter.create_datacontainer(first_year, final_year)
145159 dc = data_converter.add_months_to_datacontainer(dc, 'dict')
146160 for edit in edits:
@@ -152,6 +166,10 @@
153167
154168
155169 def determine_articles_workedon(edits, first_year, final_year):
 170+ '''
 171+ This function creates a list of article_ids that an editor has worked on in
 172+ a given month/year.
 173+ '''
156174 dc = data_converter.create_datacontainer(first_year, final_year)
157175 dc = data_converter.add_months_to_datacontainer(dc, 'dict')
158176 for year in edits:
@@ -161,6 +179,7 @@
162180 dc[year][month].setdefault(ns, set())
163181 dc[year][month][ns].add(edit['article'])
164182
 183+ #convert the set to a list as mongo cannot store sets.
165184 for year in dc:
166185 for month in dc[year]:
167186 for ns in dc[year][month]:
@@ -170,6 +189,10 @@
171190
172191
173192 def determine_namespaces_workedon(edits, first_year, final_year):
 193+ '''
 194+ This function creates a list of namespaces that an editor has worked on in
 195+ a given month/year.
 196+ '''
174197 dc = data_converter.create_datacontainer(first_year, final_year)
175198 dc = data_converter.add_months_to_datacontainer(dc, 'set')
176199 for year in edits:
@@ -184,6 +207,10 @@
185208
186209
187210 def determine_number_reverts(edits, first_year, final_year):
 211+ '''
 212+ This function counts the number of times an edit was reverted in a given
 213+ month/year.
 214+ '''
188215 dc = data_converter.create_datacontainer(first_year, final_year)
189216 dc = data_converter.add_months_to_datacontainer(dc, 'dict')
190217 for year in edits:
@@ -220,6 +247,9 @@
221248
222249
223250 def determine_year_range(edits):
 251+ '''
 252+ This function determines the first and final year that an editor was active.
 253+ '''
224254 years = [year for year in edits if edits[year] != []]
225255 first_year = int(min(years))
226256 final_year = int(max(years)) + 1
@@ -227,6 +257,10 @@
228258
229259
230260 def determine_last_edit_by_year(edits, first_year, final_year):
 261+ '''
 262+ This function determines the date of the last edit in a given year for a
 263+ given editor.
 264+ '''
231265 dc = data_converter.create_datacontainer(first_year, final_year, 0)
232266 for year in edits:
233267 for edit in edits[year]:
@@ -259,23 +293,29 @@
260294
261295 def transform_editors_multi_launcher(rts):
262296 tasks = multiprocessing.JoinableQueue()
263 - consumers = [EditorConsumer(tasks, None) for i in xrange(rts.number_of_processes)]
 297+ input_db, output_db, editors = setup_database(rts)
 298+ transformers = [EditorConsumer(tasks, None) for i in xrange(rts.number_of_processes)]
264299
265 - for id in ids:
266 - tasks.put(Editor(rts.dbname, rts.editors_raw, id))
 300+ for editor in editors:
 301+ tasks.put(Editor(rts.dbname, rts.editors_raw, editor))
 302+
267303 for x in xrange(rts.number_of_processes):
268304 tasks.put(None)
269305
270306 print messages.show(tasks.qsize)
271 - for w in consumers:
272 - w.start()
 307+ for transformer in transformers:
 308+ transformer.start()
273309
274310 tasks.join()
275311
276312
277313 def setup_database(rts):
278 - db_raw = storage.Database(rts.storage, rts.dbname, rts.editors_raw)
279 - db_dataset = storage.Database(rts.storage, rts.dbname, rts.editors_dataset)
 314+ '''
 315+ Initialize the database, including setting indexes and dropping the older
 316+ version of the collection.
 317+ '''
 318+ db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
 319+ db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
280320 db_dataset.drop_collection()
281321 ids = db_dataset.retrieve_distinct_keys('editor')
282322 db_dataset.add_index('editor')
@@ -286,14 +326,15 @@
287327
288328 def transform_editors_single_launcher(rts):
289329 print rts.dbname, rts.editors_raw
290 - input_db, output_db, ids = setup_database(rts)
291 - pbar = progressbar.ProgressBar(maxval=len(ids)).start()
292 - for x, id in enumerate(ids):
 330+ input_db, output_db, editors = setup_database(rts)
 331+ pbar = progressbar.ProgressBar(maxval=len(editors)).start()
 332+ for x, editor in enumerate(editors):
293333 editor = Editor(id, input_db, output_db)
294334 editor()
295335 pbar.update(pbar.currval + 1)
296336
297337
298338 if __name__ == '__main__':
299 - transform_editors_single_launcher('enwiki', 'editors')
300 - #transform_editors_multi_launcher('enwiki', 'editors')
 339+ rts = None
 340+ transform_editors_single_launcher(rts)
 341+ #transform_editors_multi_launcher(rts)