r84956 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84955 | r84956 | r84957 >
Date:14:17, 29 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Fixed some bugs for the data competition.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -295,11 +295,7 @@
296296 return editor
297297
298298
299 -def determine_namespace(title):
300 - namespaces = {'User': 2,
301 - 'Talk': 1,
302 - 'User Talk': 3,
303 - }
 299+def determine_namespace(title, namespaces):
304300 ns = {}
305301 if title != None:
306302 for namespace in namespaces:
@@ -346,8 +342,9 @@
347343
348344
349345 def count_edits(article, counts, bots):
 346+ namespaces = {}
350347 title = article['title'].text
351 - namespace = determine_namespace(title)
 348+ namespace = determine_namespace(title, namespaces)
352349 xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
353350 if namespace != False:
354351 article_id = article['id'].text
@@ -370,8 +367,12 @@
371368
372369
373370 def create_variables(article, cache, bots):
 371+ namespaces = {'User': 2,
 372+ 'Talk': 1,
 373+ 'User Talk': 3,
 374+ }
374375 title = article['title']
375 - namespace = determine_namespace(title)
 376+ namespace = determine_namespace(title, namespaces)
376377
377378 if namespace != False:
378379 cache.stats.count_articles += 1
@@ -544,7 +545,7 @@
545546 function = count_edits
546547 storage = 'csv'
547548 dataset = 'prediction'
548 - processors = 1
 549+ processors = 7
549550 launcher(function, path, dataset, storage, processors)
550551
551552
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -150,7 +150,7 @@
151151
152152 def extract_username(contributor, **kwargs):
153153 contributor = contributor.find('username')
154 - if contributor != None:
 154+ if contributor != None and contributor.text != None:
155155 contributor = contributor.text.encode('utf-8')
156156 return contributor.decode('utf-8')
157157 else:
@@ -187,11 +187,16 @@
188188
189189
190190 def parse_title(title):
 191+ title_data = {}
191192 if type(title.text) == type('str'):
192 - title = title.text.decode('utf-8')
 193+ title_data['title'] = title.text.decode('utf-8')
193194 else:
194 - title = title.text
195 - return title
 195+ title_data['title'] = title.text
 196+ if title_data['title'].startswith('List of'):
 197+ title_data['list'] = True
 198+ else:
 199+ title_data['list'] = False
 200+ return title_data
196201
197202
198203 def output_editor_information(revisions, page, bots, rts):
@@ -300,7 +305,7 @@
301306 output = output_editor_information(revisions, article_id, bot_ids, rts)
302307 output = add_namespace_to_output(output, namespace)
303308 write_output(output, filehandles, lock, rts)
304 - file_utils.write_list_to_csv([article_id, title], fh2)
 309+ file_utils.write_list_to_csv([article_id, title.values()], fh2)
305310 processed += 1
306311 page.clear()
307312 pbar.update(pbar.currval + article_size)
Index: trunk/tools/editor_trends/etl/sort.py
@@ -56,7 +56,7 @@
5757 write_sorted_file(sorted_data, filename, self.rts)
5858 self.result.put(True)
5959 except UnicodeDecodeError, e:
60 - print e
 60+ print 'Error: %s, (%s)' % (e, filename)
6161 except Empty:
6262 pass
6363
@@ -142,7 +142,6 @@
143143 rts is an instance of RunTimeSettings
144144 '''
145145 files = file_utils.retrieve_file_list(rts.txt, 'csv')
146 - #files = files[0:6]
147146
148147 pbar = progressbar.ProgressBar(maxval=len(files)).start()
149148 tasks = multiprocessing.JoinableQueue()