r84696 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r84695 | r84696 | r84697 >
Date:20:00, 24 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added counting of the number of edits for the prediction dataset.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -394,7 +394,7 @@
395395 return article
396396
397397
398 -def stream_raw_xml(input_queue, storage, id, dataset='training'):
 398+def stream_raw_xml(input_queue, storage, id, function, dataset='training'):
399399 buffer = cStringIO.StringIO()
400400 parsing = False
401401 i = 0
@@ -457,7 +457,7 @@
458458 cassandra.install_schema(keyspace_name, drop_first=True)
459459
460460
461 -def launcher(function, path):
 461+def launcher(function, path, dataset):
462462 storage = 'csv'
463463 setup(storage)
464464 input_queue = JoinableQueue()
@@ -474,7 +474,7 @@
475475 for x in xrange(cpu_count()):
476476 input_queue.put(None)
477477
478 - extracters = [Process(target=stream_raw_xml, args=[input_queue, function, storage, x])
 478+ extracters = [Process(target=stream_raw_xml, args=[input_queue, function, storage, x, dataset])
479479 for x in xrange(cpu_count())]
480480 for extracter in extracters:
481481 extracter.start()
@@ -483,10 +483,12 @@
484484
485485
486486 if __name__ == '__main__':
487 - path1 = '/media/wikipedia_dumps/batch1/'
488 - path2 = '/media/wikipedia_dumps/batch2/'
 487+ path1 = '/media/wikipedia_dumps/batch2/'
 488+ path2 = '/media/wikipedia_dumps/batch1/'
489489 function1 = create_variables
490490 function2 = count_edits
491491
492 - launcher(function1, path1) # launcher for creating training data
493 - launcher(function2, path2) # launcher for creating test data
 492+ dataset1 = 'training'
 493+ dataset2 = 'prediction'
 494+ #launcher(function1, path1, dataset1) # launcher for creating training data
 495+ launcher(function2, path2, dataset2) # launcher for creating test data

Status & tagging log