r84853 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r84852 | r84853 | r84854 >
Date: 16:22, 27 March 2011
Author: diederik
Status: deferred
Tags:
Comment:
Removed some unused imports.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -20,16 +20,13 @@
2121
2222 import bz2
2323 import os
24 -import cStringIO
2524 import hashlib
2625 import codecs
27 -import re
2826 import sys
2927 import datetime
30 -import gc
3128 import progressbar
3229 from multiprocessing import JoinableQueue, Process, cpu_count, current_process
33 -from xml.etree.cElementTree import fromstring, iterparse
 30+from xml.etree.cElementTree import iterparse, dump
3431 from collections import deque
3532
3633 if '..' not in sys.path:
@@ -49,8 +46,6 @@
5047 from utils import file_utils
5148 import extracter
5249
53 -#RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')
54 -
5550 NAMESPACE = {
5651 #0:'Main',
5752 #1:'Talk',
@@ -314,6 +309,7 @@
315310 if revision == None:
316311 #the entire revision is empty, weird.
317312 continue
 313+ dump(revision)
318314 contributor = revision.find('contributor')
319315 contributor = parse_contributor(contributor, bots)
320316 if not contributor:
@@ -382,7 +378,6 @@
383379 def parse_xml(fh):
384380 context = iterparse(fh, events=('start', 'end'))
385381 context = iter(context)
386 - x = 0
387382
388383 article = {}
389384 article['revisions'] = []
@@ -392,25 +387,20 @@
393388 for event, elem in context:
394389 if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'):
395390 article['title'] = elem
396 - x += 1
397 - if x == 100:
398 - break
399391 elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'):
400392 article['revisions'].append(elem)
401393 elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False:
402394 article['id'] = elem
403395 id = True
404396 elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
405 - print article
406397 yield article
407398 article = {}
408399 article['revisions'] = []
409400 id = False
410 - else:
411 - elem.clear()
412401
413402
414403
 404+
415405 def stream_raw_xml(input_queue, storage, id, function, dataset):
416406 bots = detector.retrieve_bots('en')
417407 t0 = datetime.datetime.now()
@@ -440,31 +430,7 @@
441431 t1 = datetime.datetime.now()
442432 print 'Processing took %s' % (t1 - t0)
443433 t0 = t1
444 -# for data in unzip(filename):
445 -# if data.find('<page>') > -1:
446 -# parsing = True
447 -#
448 -# if parsing:
449 -# try:
450 -# buffer.write(data)
451 -# except MemoryError, e:
452 -# print e
453 -# parsing = False
454 -# buffer = cStringIO.StringIO()
455 -#
456 -# if data.find('</page>') > -1:
457 -# i += 1
458 -# buffer.seek(0)
459 -# article = parse_xml(buffer)
460 -# if dataset == 'training':
461 -# function(article, cache, bots)
462 -# else:
463 -# counts = function(article, counts, bots)
464 -# buffer = cStringIO.StringIO()
465 -# parsing = False
466434
467 -
468 -
469435 if dataset == 'training':
470436 cache.empty()
471437 cache.stats.summary()
@@ -475,19 +441,6 @@
476442 file_utils.store_object(counts, location, filename)
477443
478444
479 -def unzip(filename):
480 - '''
481 - Filename should be a fully qualified path to the bz2 file that will be
482 - decompressed. It will iterate line by line and yield this back to
483 - create_article
484 - '''
485 - fh = bz2.BZ2File(filename, 'r')
486 - for line in fh:
487 - yield line
488 - fh.close()
489 - print 'Reached end of BZ2 file.'
490 -
491 -
492445 def setup(storage):
493446 keyspace_name = 'enwiki'
494447 if storage == 'cassandra':
@@ -549,6 +502,5 @@
550503
551504 if __name__ == '__main__':
552505 #launcher_training()
553 - gc.enable()
554506 debug()
555507 launcher_prediction()