Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -304,7 +304,7 @@ |
305 | 305 | |
306 | 306 | |
307 | 307 | def count_edits(article, counts, bots): |
308 | | - title = article['title'] |
| 308 | + title = article['title'].text |
309 | 309 | namespace = determine_namespace(title) |
310 | 310 | |
311 | 311 | if namespace != False: |
— | — | @@ -380,30 +380,27 @@ |
381 | 381 | |
382 | 382 | |
383 | 383 | def parse_xml(fh): |
384 | | - context = iterparse(fh, events=('end',)) |
| 384 | + context = iterparse(fh, events=('start', 'end')) |
385 | 385 | context = iter(context) |
386 | | - event, root = context.next() |
387 | | -# try: |
388 | | -# |
389 | | -# except SyntaxError, e: |
390 | | -# print e |
391 | | -# print buffer.getvalue() |
| 386 | + x = 0 |
392 | 387 | |
393 | 388 | article = {} |
394 | 389 | article['revisions'] = [] |
395 | 390 | id = False |
396 | | -# article[root.tag] = root.text |
397 | | -# root.clear() |
| 391 | + namespace = '{http://www.mediawiki.org/xml/export-0.4/}' |
398 | 392 | |
399 | 393 | for event, elem in context: |
400 | | - if event == 'end' and elem.tag == 'revision': |
| 394 | + if event == 'end' and elem.tag == '%s%s' % (namespace, 'title'): |
| 395 | + article['title'] = elem |
| 396 | + x += 1 |
| 397 | + if x == 100: |
| 398 | + break |
| 399 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'revision'): |
401 | 400 | article['revisions'].append(elem) |
402 | | - elif event == 'end' and elem.tag == 'id' and id == False: |
403 | | - article[elem.tag] = elem |
| 401 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'id') and id == False: |
| 402 | + article['id'] = elem |
404 | 403 | id = True |
405 | | - elif event == 'end' and elem.tag == 'title': |
406 | | - article[elem.tag] = elem |
407 | | - elif event == 'end' and elem.tag == 'page': |
| 404 | + elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'): |
408 | 405 | print article |
409 | 406 | yield article |
410 | 407 | article = {} |
— | — | @@ -411,13 +408,13 @@ |
412 | 409 | id = False |
413 | 410 | else: |
414 | 411 | elem.clear() |
415 | | - #return article |
416 | 412 | |
417 | 413 | |
| 414 | + |
418 | 415 | def stream_raw_xml(input_queue, storage, id, function, dataset): |
419 | 416 | bots = detector.retrieve_bots('en') |
420 | 417 | t0 = datetime.datetime.now() |
421 | | - |
| 418 | + i = 0 |
422 | 419 | if dataset == 'training': |
423 | 420 | cache = Buffer(storage, id) |
424 | 421 | else: |