Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -295,11 +295,7 @@ |
296 | 296 | return editor |
297 | 297 | |
298 | 298 | |
299 | | -def determine_namespace(title): |
300 | | - namespaces = {'User': 2, |
301 | | - 'Talk': 1, |
302 | | - 'User Talk': 3, |
303 | | - } |
| 299 | +def determine_namespace(title, namespaces): |
304 | 300 | ns = {} |
305 | 301 | if title != None: |
306 | 302 | for namespace in namespaces: |
— | — | @@ -346,8 +342,9 @@ |
347 | 343 | |
348 | 344 | |
349 | 345 | def count_edits(article, counts, bots): |
| 346 | + namespaces = {} |
350 | 347 | title = article['title'].text |
351 | | - namespace = determine_namespace(title) |
| 348 | + namespace = determine_namespace(title, namespaces) |
352 | 349 | xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}' |
353 | 350 | if namespace != False: |
354 | 351 | article_id = article['id'].text |
— | — | @@ -370,8 +367,12 @@ |
371 | 368 | |
372 | 369 | |
373 | 370 | def create_variables(article, cache, bots): |
| 371 | + namespaces = {'User': 2, |
| 372 | + 'Talk': 1, |
| 373 | + 'User Talk': 3, |
| 374 | + } |
374 | 375 | title = article['title'] |
375 | | - namespace = determine_namespace(title) |
| 376 | + namespace = determine_namespace(title, namespaces) |
376 | 377 | |
377 | 378 | if namespace != False: |
378 | 379 | cache.stats.count_articles += 1 |
— | — | @@ -544,7 +545,7 @@ |
545 | 546 | function = count_edits |
546 | 547 | storage = 'csv' |
547 | 548 | dataset = 'prediction' |
548 | | - processors = 1 |
| 549 | + processors = 7 |
549 | 550 | launcher(function, path, dataset, storage, processors) |
550 | 551 | |
551 | 552 | |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -150,7 +150,7 @@ |
151 | 151 | |
152 | 152 | def extract_username(contributor, **kwargs): |
153 | 153 | contributor = contributor.find('username') |
154 | | - if contributor != None: |
| 154 | + if contributor != None and contributor.text != None: |
155 | 155 | contributor = contributor.text.encode('utf-8') |
156 | 156 | return contributor.decode('utf-8') |
157 | 157 | else: |
— | — | @@ -187,11 +187,16 @@ |
188 | 188 | |
189 | 189 | |
190 | 190 | def parse_title(title): |
| 191 | + title_data = {} |
191 | 192 | if type(title.text) == type('str'): |
192 | | - title = title.text.decode('utf-8') |
| 193 | + title_data['title'] = title.text.decode('utf-8') |
193 | 194 | else: |
194 | | - title = title.text |
195 | | - return title |
| 195 | + title_data['title'] = title.text |
| 196 | + if title_data['title'].startswith('List of'): |
| 197 | + title_data['list'] = True |
| 198 | + else: |
| 199 | + title_data['list'] = False |
| 200 | + return title_data |
196 | 201 | |
197 | 202 | |
198 | 203 | def output_editor_information(revisions, page, bots, rts): |
— | — | @@ -300,7 +305,7 @@ |
301 | 306 | output = output_editor_information(revisions, article_id, bot_ids, rts) |
302 | 307 | output = add_namespace_to_output(output, namespace) |
303 | 308 | write_output(output, filehandles, lock, rts) |
304 | | - file_utils.write_list_to_csv([article_id, title], fh2) |
| 309 | + file_utils.write_list_to_csv([article_id, title.values()], fh2) |
305 | 310 | processed += 1 |
306 | 311 | page.clear() |
307 | 312 | pbar.update(pbar.currval + article_size) |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -56,7 +56,7 @@ |
57 | 57 | write_sorted_file(sorted_data, filename, self.rts) |
58 | 58 | self.result.put(True) |
59 | 59 | except UnicodeDecodeError, e: |
60 | | - print e |
| 60 | + print 'Error: %s, (%s)' % (e, filename) |
61 | 61 | except Empty: |
62 | 62 | pass |
63 | 63 | |
— | — | @@ -142,7 +142,6 @@ |
143 | 143 | rts is an instance of RunTimeSettings |
144 | 144 | ''' |
145 | 145 | files = file_utils.retrieve_file_list(rts.txt, 'csv') |
146 | | - #files = files[0:6] |
147 | 146 | |
148 | 147 | pbar = progressbar.ProgressBar(maxval=len(files)).start() |
149 | 148 | tasks = multiprocessing.JoinableQueue() |