r84851 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r84850 | r84851 | r84852 >
Date:	15:52, 27 March 2011
Author:	diederik
Status:	deferred
Tags:
Comment:	Cleaned up old code.
Modified paths:	/trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
—	—	@@ -304,7 +304,7 @@
305	305
306	306
307	307	def count_edits(article, counts, bots):
308		~~- title = article['title']~~
	308	+ title = article['title'].text
309	309	namespace = determine_namespace(title)
310	310
311	311	if namespace != False:
—	—	@@ -380,30 +380,27 @@
381	381
382	382
383	383	def parse_xml(fh):
384		~~- context = iterparse(fh, events=('end',))~~
	384	+ context = iterparse(fh, events=('start', 'end'))
385	385	context = iter(context)
386		~~- event, root = context.next()~~
387		~~-# try:~~
388		-#
389		~~-# except SyntaxError, e:~~
390		~~-# print e~~
391		~~-# print buffer.getvalue()~~
	386	+ x = 0
392	387
393	388	article = {}
394	389	article['revisions'] = []
395	390	id = False
396		~~-# article[root.tag] = root.text~~
397		~~-# root.clear()~~
	391	+ namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
398	392
399	393	for event, elem in context:
400		~~- if event == 'end' and elem.tag == 'revision':~~
	394	+ if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'):
	395	+ article['title'] = elem
	396	+ x += 1
	397	+ if x == 100:
	398	+ break
	399	+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'):
401	400	article['revisions'].append(elem)
402		~~- elif event == 'end' and elem.tag == 'id' and id == False:~~
403		~~- article[elem.tag] = elem~~
	401	+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False:
	402	+ article['id'] = elem
404	403	id = True
405		~~- elif event == 'end' and elem.tag == 'title':~~
406		~~- article[elem.tag] = elem~~
407		~~- elif event == 'end' and elem.tag == 'page':~~
	404	+ elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
408	405	print article
409	406	yield article
410	407	article = {}
—	—	@@ -411,13 +408,13 @@
412	409	id = False
413	410	else:
414	411	elem.clear()
415		~~- #return article~~
416	412
417	413
	414	+
418	415	def stream_raw_xml(input_queue, storage, id, function, dataset):
419	416	bots = detector.retrieve_bots('en')
420	417	t0 = datetime.datetime.now()
421		-
	418	+ i = 0
422	419	if dataset == 'training':
423	420	cache = Buffer(storage, id)
424	421	else: