r85921 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r85920 | r85921 | r85922 >
Date:21:22, 12 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Some minor issues.
Modified paths:
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/store.py
@@ -38,7 +38,7 @@
3939 The threshold is currently more than 9 edits and is not yet configurable.
4040 '''
4141 def run(self):
42 - db = storage.Database(rts.storage, self.rts.dbname, self.rts.editors_raw)
 42+ db = storage.Database(self.rts.storage, self.rts.dbname, self.rts.editors_raw)
4343 editor_cache = cache.EditorCache(db)
4444 prev_editor = -1
4545 while True:
@@ -121,14 +121,12 @@
122122 x, y = 0, 1
123123 while y < len(line):
124124 key, value = line[x], line[y]
125 - data[key] = value
 125+ if key == 'ns' or key == 'id':
 126+ data[key] = int(value)
 127+ else:
 128+ data[key] = value
126129 x += 2
127130 y += 2
128 - for key, value in data.iteritems():
129 - try:
130 - data[key] = int(value)
131 - except ValueError:
132 - pass
133131 db.insert(data)
134132 fh.close()
135133 print 'Done storing articles...'
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -35,6 +35,13 @@
3636 from analyses.adhoc import bot_detector
3737
3838 def parse_revision(revision, article, xml_namespace, cache, bots, md5hashes, size):
 39+ '''
 40+ This function has as input a single revision from a Wikipedia dump file,
 41+ article information it belongs to, the xml_namespace of the Wikipedia dump
 42+ file, the cache object that collects parsed revisions, a list of md5hashes
 43+ to determine whether an edit was reverted and a size dictionary to determine
 44+ how many characters were added and removed compared to the previous revision.
 45+ '''
3946 if revision == None:
4047 #the entire revision is empty, weird.
4148 #dump(revision)
@@ -85,6 +92,13 @@
8693
8794
8895 def datacompetition_count_edits(fh, rts, file_id):
 96+ '''
 97+ This function counts for every editor the total number of edits that person
 98+ made. It follows the same logic as the parse_xml function although it
 99+ skips a bunch of extraction phases that are not relevant for counting
 100+ edits. This function is only to be used to create the prediction dataset
 101+ for the datacompetition.
 102+ '''
89103 bots, include_ns = setup_parser(rts)
90104
91105 start = 'start'; end = 'end'
@@ -196,7 +210,7 @@
197211
198212 elif event is end and elem.tag.endswith('id') and id == False:
199213 article['article_id'] = elem.text
200 - if current_namespace:
 214+ if isinstance(current_namespace, int):
201215 cache.articles[article['article_id']] = title_meta
202216 id = True
203217 elem.clear()

Follow-up revisions

Revision | Commit summary | Author | Date
r86002 | Follow up r85991. For some reason I committed the test for r85921 parser, not... | platonides | 23:12, 13 April 2011