r85971 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r85970 | r85971 | r85972 >
Date: 18:32, 13 April 2011
Author: diederik
Status: deferred
Tags:
Comment:
Create prediction dataset for datacompetition
Modified paths:
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/extracter.py
@@ -80,17 +80,31 @@
8181 cache.add(article)
8282 return md5hashes, size
8383
84 -def setup_parser(rts):
85 - bots = bot_detector.retrieve_bots(rts.language.code)
86 - include_ns = {3: 'User Talk',
87 - 5: 'Wikipedia Talk',
88 - 1: 'Talk',
89 - 2: 'User',
90 - 4: 'Wikipedia'}
9184
92 - return bots, include_ns
 85+def datacompetition_parse_revision(revision, xml_namespace, bots, counts):
 86+ '''
 87+ This function has as input a single revision from a Wikipedia dump file,
 88+ article information it belongs to, the xml_namespace of the Wikipedia dump
 89+ file, the cache object that collects parsed revisions, a list of md5hashes
 90+ to determine whether an edit was reverted and a size dictionary to determine
 91+ how many characters were added and removed compared to the previous revision.
 92+ '''
 93+ if revision == None:
 94+ #the entire revision is empty, weird.
 95+ #dump(revision)
 96+ return counts
9397
 98+ contributor = revision.find('%s%s' % (xml_namespace, 'contributor'))
 99+ contributor = variables.parse_contributor(contributor, bots, xml_namespace)
 100+ if not contributor:
 101+ #editor is anonymous, ignore
 102+ return counts
 103+ else:
 104+ counts.setdefault(contributor['id'], 0)
 105+ counts[contributor['id']] += 1
 106+ return counts
94107
 108+
95109 def datacompetition_count_edits(fh, rts, file_id):
96110 '''
97111 This function counts for every editor the total number of edits that person
@@ -99,7 +113,8 @@
100114 edits. This function is only to be used to create the prediction dataset
101115 for the datacompetition.
102116 '''
103 - bots, include_ns = setup_parser(rts)
 117+ bots = bot_detector.retrieve_bots(rts.language.code)
 118+ include_ns = {}
104119
105120 start = 'start'; end = 'end'
106121 context = iterparse(fh, events=(start, end))
@@ -132,8 +147,7 @@
133148 if event is start:
134149 clear = False
135150 else:
136 - print 'IMPLEMENT'
137 - #md5hashes, size = parse_revision(elem, article, xml_namespace, cache, bots, md5hashes, size)
 151+ counts = datacompetition_parse_revision(revision, xml_namespace, bots, counts)
138152 cache.count_revisions += 1
139153 clear = True
140154 if clear:
@@ -159,12 +173,17 @@
160174 file_utils.write_dict_to_csv(counts, fh, keys)
161175 fh.close()
162176
163 - filename = 'counts_kaggle_%s.bin' % file_id
164 - file_utils.store_object(counts, location, filename)
 177+ #filename = 'counts_kaggle_%s.bin' % file_id
 178+ #file_utils.store_object(counts, location, filename)
165179
166180
167181 def parse_xml(fh, rts, cache, process_id, file_id):
168 - bots, include_ns = setup_parser(rts)
 182+ bots = bot_detector.retrieve_bots(rts.language.code)
 183+ include_ns = {3: 'User Talk',
 184+ 5: 'Wikipedia Talk',
 185+ 1: 'Talk',
 186+ 2: 'User',
 187+ 4: 'Wikipedia'}
169188
170189 start = 'start'; end = 'end'
171190 context = iterparse(fh, events=(start, end))
@@ -239,7 +258,8 @@
240259 def stream_raw_xml(input_queue, process_id, lock, rts):
241260 t0 = datetime.now()
242261 file_id = 0
243 - cache = buffer.CSVBuffer(process_id, rts, lock)
 262+ if rts.kaggle:
 263+ cache = buffer.CSVBuffer(process_id, rts, lock)
244264
245265 while True:
246266 filename = input_queue.get()