r84847 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r84846‎ | r84847 | r84848 >
Date:14:47, 27 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Replaced cStringIO buffer with BZ2 file-like object.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -379,8 +379,8 @@
380380
381381
382382
383 -def parse_xml(buffer):
384 - context = iterparse(buffer, events=('end',))
 383+def parse_xml(fh):
 384+ context = iterparse(fh, events=('end',))
385385 context = iter(context)
386386 try:
387387 event, root = context.next()
@@ -425,33 +425,38 @@
426426 t1 = datetime.datetime.now()
427427 print 'Processing took %s' % (t1 - t0)
428428 t0 = t1
429 - for data in unzip(filename):
430 - if data.find('<page>') > -1:
431 - parsing = True
 429+ fh = bz2.BZ2File(filename, 'rb')
 430+ if dataset == 'training':
 431+ function(fh, cache, bots)
 432+ else:
 433+ counts = function(fh, counts, bots)
 434+ fh.close()
 435+# for data in unzip(filename):
 436+# if data.find('<page>') > -1:
 437+# parsing = True
 438+#
 439+# if parsing:
 440+# try:
 441+# buffer.write(data)
 442+# except MemoryError, e:
 443+# print e
 444+# parsing = False
 445+# buffer = cStringIO.StringIO()
 446+#
 447+# if data.find('</page>') > -1:
 448+# i += 1
 449+# buffer.seek(0)
 450+# article = parse_xml(buffer)
 451+# if dataset == 'training':
 452+# function(article, cache, bots)
 453+# else:
 454+# counts = function(article, counts, bots)
 455+# buffer = cStringIO.StringIO()
 456+# parsing = False
 457+# if i % 10000 == 0:
 458+# print 'Worker %s parsed %s articles' % (id, i)
432459
433 - if parsing:
434 - try:
435 - buffer.write(data)
436 - except MemoryError, e:
437 - print e
438 - parsing = False
439 - buffer = cStringIO.StringIO()
440460
441 - if data.find('</page>') > -1:
442 - i += 1
443 - buffer.seek(0)
444 - article = parse_xml(buffer)
445 - if dataset == 'training':
446 - function(article, cache, bots)
447 - else:
448 - #counts = function(article, counts, bots)
449 - pass
450 - buffer = cStringIO.StringIO()
451 - parsing = False
452 - if i % 10000 == 0:
453 - print 'Worker %s parsed %s articles' % (id, i)
454 -
455 -
456461 if dataset == 'training':
457462 cache.empty()
458463 cache.stats.summary()