Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -38,7 +38,7 @@ |
39 | 39 | The threshold is currently more than 9 edits and is not yet configurable. |
40 | 40 | ''' |
41 | 41 | def run(self): |
42 | | - db = storage.Database(rts.storage, self.rts.dbname, self.rts.editors_raw) |
| 42 | + db = storage.Database(self.rts.storage, self.rts.dbname, self.rts.editors_raw) |
43 | 43 | editor_cache = cache.EditorCache(db) |
44 | 44 | prev_editor = -1 |
45 | 45 | while True: |
— | — | @@ -121,14 +121,12 @@ |
122 | 122 | x, y = 0, 1 |
123 | 123 | while y < len(line): |
124 | 124 | key, value = line[x], line[y] |
125 | | - data[key] = value |
| 125 | + if key == 'ns' or key == 'id': |
| 126 | + data[key] = int(value) |
| 127 | + else: |
| 128 | + data[key] = value |
126 | 129 | x += 2 |
127 | 130 | y += 2 |
128 | | - for key, value in data.iteritems(): |
129 | | - try: |
130 | | - data[key] = int(value) |
131 | | - except ValueError: |
132 | | - pass |
133 | 131 | db.insert(data) |
134 | 132 | fh.close() |
135 | 133 | print 'Done storing articles...' |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -35,6 +35,13 @@ |
36 | 36 | from analyses.adhoc import bot_detector
|
37 | 37 |
|
38 | 38 | def parse_revision(revision, article, xml_namespace, cache, bots, md5hashes, size):
|
| 39 | + '''
|
| 40 | + This function has as input a single revision from a Wikipedia dump file,
|
| 41 | + article information it belongs to, the xml_namespace of the Wikipedia dump
|
| 42 | + file, the cache object that collects parsed revisions, a list of md5hashes
|
| 43 | + to determine whether an edit was reverted and a size dictionary to determine
|
| 44 | + how many characters were added and removed compared to the previous revision.
|
| 45 | + '''
|
39 | 46 | if revision == None:
|
40 | 47 | #the entire revision is empty, weird.
|
41 | 48 | #dump(revision)
|
— | — | @@ -85,6 +92,13 @@ |
86 | 93 |
|
87 | 94 |
|
88 | 95 | def datacompetition_count_edits(fh, rts, file_id):
|
| 96 | + '''
|
| 97 | + This function counts for every editor the total number of edits that person
|
| 98 | + made. It follows the same logic as the parse_xml function although it
|
| 99 | + skips a bunch of extraction phases that are not relevant for counting
|
| 100 | + edits. This function is only to be used to create the prediction dataset
|
| 101 | + for the datacompetition.
|
| 102 | + '''
|
89 | 103 | bots, include_ns = setup_parser(rts)
|
90 | 104 |
|
91 | 105 | start = 'start'; end = 'end'
|
— | — | @@ -196,7 +210,7 @@ |
197 | 211 |
|
198 | 212 | elif event is end and elem.tag.endswith('id') and id == False:
|
199 | 213 | article['article_id'] = elem.text
|
200 | | - if current_namespace:
|
| 214 | + if isinstance(current_namespace, int):
|
201 | 215 | cache.articles[article['article_id']] = title_meta
|
202 | 216 | id = True
|
203 | 217 | elem.clear()
|