Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -382,34 +382,41 @@ |
383 | 383 | def parse_xml(fh): |
384 | 384 | context = iterparse(fh, events=('end',)) |
385 | 385 | context = iter(context) |
386 | | - try: |
387 | | - event, root = context.next() |
388 | | - except SyntaxError, e: |
389 | | - print e |
390 | | - print buffer.getvalue() |
| 386 | + event, root = context.next() |
| 387 | +# try: |
| 388 | +# |
| 389 | +# except SyntaxError, e: |
| 390 | +# print e |
| 391 | +# print buffer.getvalue() |
391 | 392 | |
392 | 393 | article = {} |
| 394 | + article['revisions'] = [] |
393 | 395 | id = False |
394 | | - article[root.tag] = root.text |
395 | | - root.clear() |
396 | | - article['revisions'] = [] |
| 396 | +# article[root.tag] = root.text |
| 397 | +# root.clear() |
| 398 | + |
397 | 399 | for event, elem in context: |
398 | 400 | if event == 'end' and elem.tag == 'revision': |
399 | 401 | article['revisions'].append(elem) |
400 | 402 | elif event == 'end' and elem.tag == 'id' and id == False: |
401 | 403 | article[elem.tag] = elem |
402 | 404 | id = True |
| 405 | + elif event == 'end' and elem.tag == 'title': |
| 406 | + article[elem.tag] = elem |
| 407 | + elif event == 'end' and elem.tag == 'page': |
| 408 | + print article |
| 409 | + yield article |
| 410 | + article = {} |
| 411 | + article['revisions'] = [] |
| 412 | + id = False |
403 | 413 | else: |
404 | 414 | elem.clear() |
405 | | - return article |
| 415 | + #return article |
406 | 416 | |
407 | 417 | |
408 | 418 | def stream_raw_xml(input_queue, storage, id, function, dataset): |
409 | 419 | bots = detector.retrieve_bots('en') |
410 | | - buffer = cStringIO.StringIO() |
411 | | - parsing = False |
412 | 420 | t0 = datetime.datetime.now() |
413 | | - i = 0 |
414 | 421 | |
415 | 422 | if dataset == 'training': |
416 | 423 | cache = Buffer(storage, id) |
— | — | @@ -422,16 +429,20 @@ |
423 | 430 | if filename == None: |
424 | 431 | break |
425 | 432 | |
| 433 | + fh = bz2.BZ2File(filename, 'rb') |
| 434 | + for article in parse_xml(fh): |
| 435 | + if dataset == 'training': |
| 436 | + function(article, cache, bots) |
| 437 | + else: |
| 438 | + counts = function(article, counts, bots) |
| 439 | + i += 1 |
| 440 | + if i % 10000 == 0: |
| 441 | + print 'Worker %s parsed %s articles' % (id, i) |
| 442 | + fh.close() |
| 443 | + |
426 | 444 | t1 = datetime.datetime.now() |
427 | 445 | print 'Processing took %s' % (t1 - t0) |
428 | 446 | t0 = t1 |
429 | | - fh = bz2.BZ2File(filename, 'rb') |
430 | | - article = parse_xml(fh) |
431 | | - if dataset == 'training': |
432 | | - function(article, cache, bots) |
433 | | - else: |
434 | | - counts = function(article, counts, bots) |
435 | | - fh.close() |
436 | 447 | # for data in unzip(filename): |
437 | 448 | # if data.find('<page>') > -1: |
438 | 449 | # parsing = True |
— | — | @@ -454,10 +465,9 @@ |
455 | 466 | # counts = function(article, counts, bots) |
456 | 467 | # buffer = cStringIO.StringIO() |
457 | 468 | # parsing = False |
458 | | -# if i % 10000 == 0: |
459 | | -# print 'Worker %s parsed %s articles' % (id, i) |
460 | 469 | |
461 | 470 | |
| 471 | + |
462 | 472 | if dataset == 'training': |
463 | 473 | cache.empty() |
464 | 474 | cache.stats.summary() |