Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -20,16 +20,13 @@ |
21 | 21 | |
22 | 22 | import bz2 |
23 | 23 | import os |
24 | | -import cStringIO |
25 | 24 | import hashlib |
26 | 25 | import codecs |
27 | | -import re |
28 | 26 | import sys |
29 | 27 | import datetime |
30 | | -import gc |
31 | 28 | import progressbar |
32 | 29 | from multiprocessing import JoinableQueue, Process, cpu_count, current_process |
33 | | -from xml.etree.cElementTree import fromstring, iterparse |
| 30 | +from xml.etree.cElementTree import iterparse, dump |
34 | 31 | from collections import deque |
35 | 32 | |
36 | 33 | if '..' not in sys.path: |
— | — | @@ -49,8 +46,6 @@ |
50 | 47 | from utils import file_utils |
51 | 48 | import extracter |
52 | 49 | |
53 | | -#RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)') |
54 | | - |
55 | 50 | NAMESPACE = { |
56 | 51 | #0:'Main', |
57 | 52 | #1:'Talk', |
— | — | @@ -314,6 +309,7 @@ |
315 | 310 | if revision == None: |
316 | 311 | #the entire revision is empty, weird. |
317 | 312 | continue |
| 313 | + dump(revision) |
318 | 314 | contributor = revision.find('contributor') |
319 | 315 | contributor = parse_contributor(contributor, bots) |
320 | 316 | if not contributor: |
— | — | @@ -382,7 +378,6 @@ |
383 | 379 | def parse_xml(fh): |
384 | 380 | context = iterparse(fh, events=('start', 'end')) |
385 | 381 | context = iter(context) |
386 | | - x = 0 |
387 | 382 | |
388 | 383 | article = {} |
389 | 384 | article['revisions'] = [] |
— | — | @@ -392,25 +387,20 @@ |
393 | 388 | for event, elem in context: |
394 | 389 | if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'): |
395 | 390 | article['title'] = elem |
396 | | - x += 1 |
397 | | - if x == 100: |
398 | | - break |
399 | 391 | elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'): |
400 | 392 | article['revisions'].append(elem) |
401 | 393 | elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False: |
402 | 394 | article['id'] = elem |
403 | 395 | id = True |
404 | 396 | elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'): |
405 | | - print article |
406 | 397 | yield article |
407 | 398 | article = {} |
408 | 399 | article['revisions'] = [] |
409 | 400 | id = False |
410 | | - else: |
411 | | - elem.clear() |
412 | 401 | |
413 | 402 | |
414 | 403 | |
| 404 | + |
415 | 405 | def stream_raw_xml(input_queue, storage, id, function, dataset): |
416 | 406 | bots = detector.retrieve_bots('en') |
417 | 407 | t0 = datetime.datetime.now() |
— | — | @@ -440,31 +430,7 @@ |
441 | 431 | t1 = datetime.datetime.now() |
442 | 432 | print 'Processing took %s' % (t1 - t0) |
443 | 433 | t0 = t1 |
444 | | -# for data in unzip(filename): |
445 | | -# if data.find('<page>') > -1: |
446 | | -# parsing = True |
447 | | -# |
448 | | -# if parsing: |
449 | | -# try: |
450 | | -# buffer.write(data) |
451 | | -# except MemoryError, e: |
452 | | -# print e |
453 | | -# parsing = False |
454 | | -# buffer = cStringIO.StringIO() |
455 | | -# |
456 | | -# if data.find('</page>') > -1: |
457 | | -# i += 1 |
458 | | -# buffer.seek(0) |
459 | | -# article = parse_xml(buffer) |
460 | | -# if dataset == 'training': |
461 | | -# function(article, cache, bots) |
462 | | -# else: |
463 | | -# counts = function(article, counts, bots) |
464 | | -# buffer = cStringIO.StringIO() |
465 | | -# parsing = False |
466 | 434 | |
467 | | - |
468 | | - |
469 | 435 | if dataset == 'training': |
470 | 436 | cache.empty() |
471 | 437 | cache.stats.summary() |
— | — | @@ -475,19 +441,6 @@ |
476 | 442 | file_utils.store_object(counts, location, filename) |
477 | 443 | |
478 | 444 | |
479 | | -def unzip(filename): |
480 | | - ''' |
481 | | - Filename should be a fully qualified path to the bz2 file that will be |
482 | | - decompressed. It will iterate line by line and yield this back to |
483 | | - create_article |
484 | | - ''' |
485 | | - fh = bz2.BZ2File(filename, 'r') |
486 | | - for line in fh: |
487 | | - yield line |
488 | | - fh.close() |
489 | | - print 'Reached end of BZ2 file.' |
490 | | - |
491 | | - |
492 | 445 | def setup(storage): |
493 | 446 | keyspace_name = 'enwiki' |
494 | 447 | if storage == 'cassandra': |
— | — | @@ -549,6 +502,5 @@ |
550 | 503 | |
551 | 504 | if __name__ == '__main__': |
552 | 505 | #launcher_training() |
553 | | - gc.enable() |
554 | 506 | debug() |
555 | 507 | launcher_prediction() |