r84850 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r84849 | r84850 | r84851 >
Date:	15:06, 27 March 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Replaced cStringIO buffer with BZ2 file-like object (part 3)
Modified paths:	/trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
—	—	@@ -382,34 +382,41 @@
383	383	def parse_xml(fh):
384	384	context = iterparse(fh, events=('end',))
385	385	context = iter(context)
386		~~- try:~~
387		~~- event, root = context.next()~~
388		~~- except SyntaxError, e:~~
389		~~- print e~~
390		~~- print buffer.getvalue()~~
	386	+ event, root = context.next()
	387	+# try:
	388	+#
	389	+# except SyntaxError, e:
	390	+# print e
	391	+# print buffer.getvalue()
391	392
392	393	article = {}
	394	+ article['revisions'] = []
393	395	id = False
394		~~- article[root.tag] = root.text~~
395		~~- root.clear()~~
396		~~- article['revisions'] = []~~
	396	+# article[root.tag] = root.text
	397	+# root.clear()
	398	+
397	399	for event, elem in context:
398	400	if event == 'end' and elem.tag == 'revision':
399	401	article['revisions'].append(elem)
400	402	elif event == 'end' and elem.tag == 'id' and id == False:
401	403	article[elem.tag] = elem
402	404	id = True
	405	+ elif event == 'end' and elem.tag == 'title':
	406	+ article[elem.tag] = elem
	407	+ elif event == 'end' and elem.tag == 'page':
	408	+ print article
	409	+ yield article
	410	+ article = {}
	411	+ article['revisions'] = []
	412	+ id = False
403	413	else:
404	414	elem.clear()
405		~~- return article~~
	415	+ #return article
406	416
407	417
408	418	def stream_raw_xml(input_queue, storage, id, function, dataset):
409	419	bots = detector.retrieve_bots('en')
410		~~- buffer = cStringIO.StringIO()~~
411		~~- parsing = False~~
412	420	t0 = datetime.datetime.now()
413		~~- i = 0~~
414	421
415	422	if dataset == 'training':
416	423	cache = Buffer(storage, id)
—	—	@@ -422,16 +429,20 @@
423	430	if filename == None:
424	431	break
425	432
	433	+ fh = bz2.BZ2File(filename, 'rb')
	434	+ for article in parse_xml(fh):
	435	+ if dataset == 'training':
	436	+ function(article, cache, bots)
	437	+ else:
	438	+ counts = function(article, counts, bots)
	439	+ i += 1
	440	+ if i % 10000 == 0:
	441	+ print 'Worker %s parsed %s articles' % (id, i)
	442	+ fh.close()
	443	+
426	444	t1 = datetime.datetime.now()
427	445	print 'Processing took %s' % (t1 - t0)
428	446	t0 = t1
429		~~- fh = bz2.BZ2File(filename, 'rb')~~
430		~~- article = parse_xml(fh)~~
431		~~- if dataset == 'training':~~
432		~~- function(article, cache, bots)~~
433		~~- else:~~
434		~~- counts = function(article, counts, bots)~~
435		~~- fh.close()~~
436	447	# for data in unzip(filename):
437	448	# if data.find('<page>') > -1:
438	449	# parsing = True
—	—	@@ -454,10 +465,9 @@
455	466	# counts = function(article, counts, bots)
456	467	# buffer = cStringIO.StringIO()
457	468	# parsing = False
458		~~-# if i % 10000 == 0:~~
459		~~-# print 'Worker %s parsed %s articles' % (id, i)~~
460	469
461	470
	471	+
462	472	if dataset == 'training':
463	473	cache.empty()
464	474	cache.stats.summary()