r84851 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84850 | r84851 | r84852 >
Date:15:52, 27 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Cleaned up old code.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -304,7 +304,7 @@
305305
306306
307307 def count_edits(article, counts, bots):
308 - title = article['title']
 308+ title = article['title'].text
309309 namespace = determine_namespace(title)
310310
311311 if namespace != False:
@@ -380,30 +380,27 @@
381381
382382
383383 def parse_xml(fh):
384 - context = iterparse(fh, events=('end',))
 384+ context = iterparse(fh, events=('start', 'end'))
385385 context = iter(context)
386 - event, root = context.next()
387 -# try:
388 -#
389 -# except SyntaxError, e:
390 -# print e
391 -# print buffer.getvalue()
 386+ x = 0
392387
393388 article = {}
394389 article['revisions'] = []
395390 id = False
396 -# article[root.tag] = root.text
397 -# root.clear()
 391+ namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
398392
399393 for event, elem in context:
400 - if event == 'end' and elem.tag == 'revision':
 394+ if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'):
 395+ article['title'] = elem
 396+ x += 1
 397+ if x == 100:
 398+ break
 399+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'):
401400 article['revisions'].append(elem)
402 - elif event == 'end' and elem.tag == 'id' and id == False:
403 - article[elem.tag] = elem
 401+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False:
 402+ article['id'] = elem
404403 id = True
405 - elif event == 'end' and elem.tag == 'title':
406 - article[elem.tag] = elem
407 - elif event == 'end' and elem.tag == 'page':
 404+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
408405 print article
409406 yield article
410407 article = {}
@@ -411,13 +408,13 @@
412409 id = False
413410 else:
414411 elem.clear()
415 - #return article
416412
417413
 414+
418415 def stream_raw_xml(input_queue, storage, id, function, dataset):
419416 bots = detector.retrieve_bots('en')
420417 t0 = datetime.datetime.now()
421 -
 418+ i = 0
422419 if dataset == 'training':
423420 cache = Buffer(storage, id)
424421 else: