r84853 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r84852 | r84853 | r84854 >
Date:	16:22, 27 March 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Removed some unused imports.
Modified paths:	/trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
—	—	@@ -20,16 +20,13 @@
21	21
22	22	import bz2
23	23	import os
24		~~-import cStringIO~~
25	24	import hashlib
26	25	import codecs
27		~~-import re~~
28	26	import sys
29	27	import datetime
30		~~-import gc~~
31	28	import progressbar
32	29	from multiprocessing import JoinableQueue, Process, cpu_count, current_process
33		~~-from xml.etree.cElementTree import fromstring, iterparse~~
	30	+from xml.etree.cElementTree import iterparse, dump
34	31	from collections import deque
35	32
36	33	if '..' not in sys.path:
—	—	@@ -49,8 +46,6 @@
50	47	from utils import file_utils
51	48	import extracter
52	49
53		~~-#RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')~~
54		-
55	50	NAMESPACE = {
56	51	#0:'Main',
57	52	#1:'Talk',
—	—	@@ -314,6 +309,7 @@
315	310	if revision == None:
316	311	#the entire revision is empty, weird.
317	312	continue
	313	+ dump(revision)
318	314	contributor = revision.find('contributor')
319	315	contributor = parse_contributor(contributor, bots)
320	316	if not contributor:
—	—	@@ -382,7 +378,6 @@
383	379	def parse_xml(fh):
384	380	context = iterparse(fh, events=('start', 'end'))
385	381	context = iter(context)
386		~~- x = 0~~
387	382
388	383	article = {}
389	384	article['revisions'] = []
—	—	@@ -392,25 +387,20 @@
393	388	for event, elem in context:
394	389	if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'):
395	390	article['title'] = elem
396		~~- x += 1~~
397		~~- if x == 100:~~
398		~~- break~~
399	391	elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'):
400	392	article['revisions'].append(elem)
401	393	elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False:
402	394	article['id'] = elem
403	395	id = True
404	396	elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
405		~~- print article~~
406	397	yield article
407	398	article = {}
408	399	article['revisions'] = []
409	400	id = False
410		~~- else:~~
411		~~- elem.clear()~~
412	401
413	402
414	403
	404	+
415	405	def stream_raw_xml(input_queue, storage, id, function, dataset):
416	406	bots = detector.retrieve_bots('en')
417	407	t0 = datetime.datetime.now()
—	—	@@ -440,31 +430,7 @@
441	431	t1 = datetime.datetime.now()
442	432	print 'Processing took %s' % (t1 - t0)
443	433	t0 = t1
444		~~-# for data in unzip(filename):~~
445		~~-# if data.find('<page>') > -1:~~
446		~~-# parsing = True~~
447		-#
448		~~-# if parsing:~~
449		~~-# try:~~
450		~~-# buffer.write(data)~~
451		~~-# except MemoryError, e:~~
452		~~-# print e~~
453		~~-# parsing = False~~
454		~~-# buffer = cStringIO.StringIO()~~
455		-#
456		~~-# if data.find('</page>') > -1:~~
457		~~-# i += 1~~
458		~~-# buffer.seek(0)~~
459		~~-# article = parse_xml(buffer)~~
460		~~-# if dataset == 'training':~~
461		~~-# function(article, cache, bots)~~
462		~~-# else:~~
463		~~-# counts = function(article, counts, bots)~~
464		~~-# buffer = cStringIO.StringIO()~~
465		~~-# parsing = False~~
466	434
467		-
468		-
469	435	if dataset == 'training':
470	436	cache.empty()
471	437	cache.stats.summary()
—	—	@@ -475,19 +441,6 @@
476	442	file_utils.store_object(counts, location, filename)
477	443
478	444
479		~~-def unzip(filename):~~
480		~~- '''~~
481		~~- Filename should be a fully qualified path to the bz2 file that will be~~
482		~~- decompressed. It will iterate line by line and yield this back to~~
483		~~- create_article~~
484		~~- '''~~
485		~~- fh = bz2.BZ2File(filename, 'r')~~
486		~~- for line in fh:~~
487		~~- yield line~~
488		~~- fh.close()~~
489		~~- print 'Reached end of BZ2 file.'~~
490		-
491		-
492	445	def setup(storage):
493	446	keyspace_name = 'enwiki'
494	447	if storage == 'cassandra':
—	—	@@ -549,6 +502,5 @@
550	503
551	504	if __name__ == '__main__':
552	505	#launcher_training()
553		~~- gc.enable()~~
554	506	debug()
555	507	launcher_prediction()