Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -394,7 +394,7 @@ |
395 | 395 | return article |
396 | 396 | |
397 | 397 | |
398 | | -def stream_raw_xml(input_queue, storage, id, dataset='training'): |
| 398 | +def stream_raw_xml(input_queue, storage, id, function, dataset='training'): |
399 | 399 | buffer = cStringIO.StringIO() |
400 | 400 | parsing = False |
401 | 401 | i = 0 |
— | — | @@ -457,7 +457,7 @@ |
458 | 458 | cassandra.install_schema(keyspace_name, drop_first=True) |
459 | 459 | |
460 | 460 | |
461 | | -def launcher(function, path): |
| 461 | +def launcher(function, path, dataset): |
462 | 462 | storage = 'csv' |
463 | 463 | setup(storage) |
464 | 464 | input_queue = JoinableQueue() |
— | — | @@ -474,7 +474,7 @@ |
475 | 475 | for x in xrange(cpu_count()): |
476 | 476 | input_queue.put(None) |
477 | 477 | |
478 | | - extracters = [Process(target=stream_raw_xml, args=[input_queue, function, storage, x]) |
 | 478 | +    extracters = [Process(target=stream_raw_xml, args=[input_queue, storage, x, function, dataset]) |
479 | 479 | for x in xrange(cpu_count())] |
480 | 480 | for extracter in extracters: |
481 | 481 | extracter.start() |
— | — | @@ -483,10 +483,12 @@ |
484 | 484 | |
485 | 485 | |
486 | 486 | if __name__ == '__main__': |
487 | | - path1 = '/media/wikipedia_dumps/batch1/' |
488 | | - path2 = '/media/wikipedia_dumps/batch2/' |
| 487 | + path1 = '/media/wikipedia_dumps/batch2/' |
| 488 | + path2 = '/media/wikipedia_dumps/batch1/' |
489 | 489 | function1 = create_variables |
490 | 490 | function2 = count_edits |
491 | 491 | |
492 | | - launcher(function1, path1) # launcher for creating training data |
493 | | - launcher(function2, path2) # launcher for creating test data |
| 492 | + dataset1 = 'training' |
| 493 | + dataset2 = 'prediction' |
| 494 | + #launcher(function1, path1, dataset1) # launcher for creating training data |
 | 495 | +    launcher(function2, path2, dataset2) # launcher for creating prediction (test) data |