Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -379,8 +379,8 @@ |
380 | 380 | |
381 | 381 | |
382 | 382 | |
383 | | -def parse_xml(buffer): |
384 | | - context = iterparse(buffer, events=('end',)) |
| 383 | +def parse_xml(fh): |
| 384 | + context = iterparse(fh, events=('end',)) |
385 | 385 | context = iter(context) |
386 | 386 | try: |
387 | 387 | event, root = context.next() |
— | — | @@ -425,33 +425,38 @@ |
426 | 426 | t1 = datetime.datetime.now() |
427 | 427 | print 'Processing took %s' % (t1 - t0) |
428 | 428 | t0 = t1 |
429 | | - for data in unzip(filename): |
430 | | - if data.find('<page>') > -1: |
431 | | - parsing = True |
| 429 | + fh = bz2.BZ2File(filename, 'rb') |
| 430 | + if dataset == 'training': |
| 431 | + function(fh, cache, bots) |
| 432 | + else: |
| 433 | + counts = function(fh, counts, bots) |
| 434 | + fh.close() |
| 435 | +# for data in unzip(filename): |
| 436 | +# if data.find('<page>') > -1: |
| 437 | +# parsing = True |
| 438 | +# |
| 439 | +# if parsing: |
| 440 | +# try: |
| 441 | +# buffer.write(data) |
| 442 | +# except MemoryError, e: |
| 443 | +# print e |
| 444 | +# parsing = False |
| 445 | +# buffer = cStringIO.StringIO() |
| 446 | +# |
| 447 | +# if data.find('</page>') > -1: |
| 448 | +# i += 1 |
| 449 | +# buffer.seek(0) |
| 450 | +# article = parse_xml(buffer) |
| 451 | +# if dataset == 'training': |
| 452 | +# function(article, cache, bots) |
| 453 | +# else: |
| 454 | +# counts = function(article, counts, bots) |
| 455 | +# buffer = cStringIO.StringIO() |
| 456 | +# parsing = False |
| 457 | +# if i % 10000 == 0: |
| 458 | +# print 'Worker %s parsed %s articles' % (id, i) |
432 | 459 | |
433 | | - if parsing: |
434 | | - try: |
435 | | - buffer.write(data) |
436 | | - except MemoryError, e: |
437 | | - print e |
438 | | - parsing = False |
439 | | - buffer = cStringIO.StringIO() |
440 | 460 | |
441 | | - if data.find('</page>') > -1: |
442 | | - i += 1 |
443 | | - buffer.seek(0) |
444 | | - article = parse_xml(buffer) |
445 | | - if dataset == 'training': |
446 | | - function(article, cache, bots) |
447 | | - else: |
448 | | - #counts = function(article, counts, bots) |
449 | | - pass |
450 | | - buffer = cStringIO.StringIO() |
451 | | - parsing = False |
452 | | - if i % 10000 == 0: |
453 | | - print 'Worker %s parsed %s articles' % (id, i) |
454 | | - |
455 | | - |
456 | 461 | if dataset == 'training': |
457 | 462 | cache.empty() |
458 | 463 | cache.stats.summary() |