Index: trunk/tools/editor_trends/etl/extracter.py
@@ -80,17 +80,31 @@
         cache.add(article)
     return md5hashes, size
 
-def setup_parser(rts):
-    bots = bot_detector.retrieve_bots(rts.language.code)
-    include_ns = {3: 'User Talk',
-                  5: 'Wikipedia Talk',
-                  1: 'Talk',
-                  2: 'User',
-                  4: 'Wikipedia'}
 
-    return bots, include_ns
+def datacompetition_parse_revision(revision, xml_namespace, bots, counts):
+    '''
+    This function has as input a single revision from a Wikipedia dump file,
+    the xml_namespace of the Wikipedia dump file, a list of known bots and a
+    counts dictionary that maps editor ids to the number of edits made so far.
+    It returns the counts dictionary, incremented by one for the revision's
+    contributor unless the revision is empty or the editor is anonymous.
+    '''
+    if revision == None:
+        #the entire revision is empty, weird.
+        #dump(revision)
+        return counts
 
+    contributor = revision.find('%s%s' % (xml_namespace, 'contributor'))
+    contributor = variables.parse_contributor(contributor, bots, xml_namespace)
+    if not contributor:
+        #editor is anonymous, ignore
+        return counts
+    else:
+        counts.setdefault(contributor['id'], 0)
+        counts[contributor['id']] += 1
+        return counts
 
+
 def datacompetition_count_edits(fh, rts, file_id):
     '''
     This function counts for every editor the total number of edits that person
@@ -99,7 +113,8 @@
     edits. This function is only to be used to create the prediction dataset
     for the datacompetition.
     '''
-    bots, include_ns = setup_parser(rts)
+    bots = bot_detector.retrieve_bots(rts.language.code)
+    include_ns = {}
 
     start = 'start'; end = 'end'
     context = iterparse(fh, events=(start, end))
@@ -132,8 +147,7 @@
             if event is start:
                 clear = False
             else:
-                print 'IMPLEMENT'
-                #md5hashes, size = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size)
+                counts = datacompetition_parse_revision(revision, xml_namespace, bots, counts)
                 cache.count_revisions += 1
                 clear = True
             if clear:
@@ -159,12 +173,17 @@
     file_utils.write_dict_to_csv(counts, fh, keys)
     fh.close()
 
-    filename = 'counts_kaggle_%s.bin' % file_id
-    file_utils.store_object(counts, location, filename)
+    #filename = 'counts_kaggle_%s.bin' % file_id
+    #file_utils.store_object(counts, location, filename)
 
 
 def parse_xml(fh, rts, cache, process_id, file_id):
-    bots, include_ns = setup_parser(rts)
+    bots = bot_detector.retrieve_bots(rts.language.code)
+    include_ns = {3: 'User Talk',
+                  5: 'Wikipedia Talk',
+                  1: 'Talk',
+                  2: 'User',
+                  4: 'Wikipedia'}
 
     start = 'start'; end = 'end'
     context = iterparse(fh, events=(start, end))
@@ -239,7 +258,8 @@
 def stream_raw_xml(input_queue, process_id, lock, rts):
     t0 = datetime.now()
     file_id = 0
-    cache = buffer.CSVBuffer(process_id, rts, lock)
+    if rts.kaggle:
+        cache = buffer.CSVBuffer(process_id, rts, lock)
 
     while True:
         filename = input_queue.get()
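
The core of this change is the per-contributor tally built by datacompetition_parse_revision: every non-anonymous, non-bot revision increments a counter keyed by the editor's id, and datacompetition_count_edits later writes the resulting dictionary to CSV. Below is a minimal, self-contained sketch of that tallying pattern. The XML snippet, the tally_revision helper and the plain contributor/id lookup are simplified stand-ins for the real dump format, variables.parse_contributor and the bot filtering; they are illustrative only and not part of this patch.

# Minimal sketch of the edit-tallying pattern introduced by
# datacompetition_parse_revision. Hypothetical, simplified input: a tiny
# in-memory <page> with three <revision> elements instead of a real dump.
import xml.etree.ElementTree as ElementTree

SAMPLE = '''
<page>
  <revision><contributor><id>42</id></contributor></revision>
  <revision><contributor><ip>127.0.0.1</ip></contributor></revision>
  <revision><contributor><id>42</id></contributor></revision>
</page>
'''

def tally_revision(revision, counts):
    # Empty revision: nothing to count (mirrors the revision == None guard).
    if revision is None:
        return counts
    contributor = revision.find('contributor')
    editor_id = contributor.find('id') if contributor is not None else None
    # Anonymous editors (no <id> element) are ignored, as in the patch.
    if editor_id is None:
        return counts
    counts.setdefault(editor_id.text, 0)
    counts[editor_id.text] += 1
    return counts

counts = {}
for revision in ElementTree.fromstring(SAMPLE).findall('revision'):
    counts = tally_revision(revision, counts)

assert counts == {'42': 2}

Using setdefault keeps the increment a single unconditional statement, whether or not the editor has been seen before.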