r84850 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84849 | r84850 | r84851 >
Date:15:06, 27 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Replaced cStringIO buffer with BZ2 file-like object (part 3)
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -382,34 +382,41 @@
383383 def parse_xml(fh):
384384 context = iterparse(fh, events=('end',))
385385 context = iter(context)
386 - try:
387 - event, root = context.next()
388 - except SyntaxError, e:
389 - print e
390 - print buffer.getvalue()
 386+ event, root = context.next()
 387+# try:
 388+#
 389+# except SyntaxError, e:
 390+# print e
 391+# print buffer.getvalue()
391392
392393 article = {}
 394+ article['revisions'] = []
393395 id = False
394 - article[root.tag] = root.text
395 - root.clear()
396 - article['revisions'] = []
 396+# article[root.tag] = root.text
 397+# root.clear()
 398+
397399 for event, elem in context:
398400 if event == 'end' and elem.tag == 'revision':
399401 article['revisions'].append(elem)
400402 elif event == 'end' and elem.tag == 'id' and id == False:
401403 article[elem.tag] = elem
402404 id = True
 405+ elif event == 'end' and elem.tag == 'title':
 406+ article[elem.tag] = elem
 407+ elif event == 'end' and elem.tag == 'page':
 408+ print article
 409+ yield article
 410+ article = {}
 411+ article['revisions'] = []
 412+ id = False
403413 else:
404414 elem.clear()
405 - return article
 415+ #return article
406416
407417
408418 def stream_raw_xml(input_queue, storage, id, function, dataset):
409419 bots = detector.retrieve_bots('en')
410 - buffer = cStringIO.StringIO()
411 - parsing = False
412420 t0 = datetime.datetime.now()
413 - i = 0
414421
415422 if dataset == 'training':
416423 cache = Buffer(storage, id)
@@ -422,16 +429,20 @@
423430 if filename == None:
424431 break
425432
 433+ fh = bz2.BZ2File(filename, 'rb')
 434+ for article in parse_xml(fh):
 435+ if dataset == 'training':
 436+ function(article, cache, bots)
 437+ else:
 438+ counts = function(article, counts, bots)
 439+ i += 1
 440+ if i % 10000 == 0:
 441+ print 'Worker %s parsed %s articles' % (id, i)
 442+ fh.close()
 443+
426444 t1 = datetime.datetime.now()
427445 print 'Processing took %s' % (t1 - t0)
428446 t0 = t1
429 - fh = bz2.BZ2File(filename, 'rb')
430 - article = parse_xml(fh)
431 - if dataset == 'training':
432 - function(article, cache, bots)
433 - else:
434 - counts = function(article, counts, bots)
435 - fh.close()
436447 # for data in unzip(filename):
437448 # if data.find('<page>') > -1:
438449 # parsing = True
@@ -454,10 +465,9 @@
455466 # counts = function(article, counts, bots)
456467 # buffer = cStringIO.StringIO()
457468 # parsing = False
458 -# if i % 10000 == 0:
459 -# print 'Worker %s parsed %s articles' % (id, i)
460469
461470
 471+
462472 if dataset == 'training':
463473 cache.empty()
464474 cache.stats.summary()