Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -433,13 +433,13 @@ |
434 | 434 | |
435 | 435 | if i % 1000 == 0: |
436 | 436 | print 'Worker %s parsed %s articles' % (id, i) |
437 | | - print gc.get_count() |
438 | | - gc.collect() |
439 | 437 | print '************************' |
440 | 438 | gc.DEBUG_COLLECTABLE |
441 | 439 | gc.DEBUG_UNCOLLECTABLE |
442 | 440 | gc.DEBUG_STATS |
443 | 441 | print '************************' |
| 442 | + gc.collect() |
| 443 | + print gc.get_count() |
444 | 444 | |
445 | 445 | if dataset == 'training': |
446 | 446 | cache.empty() |
— | — | @@ -494,6 +494,15 @@ |
495 | 495 | input_queue.join() |
496 | 496 | |
497 | 497 | |
| 498 | + |
| 499 | +def debug(): |
| 500 | + path = '/media/wikipedia_dumps/batch2/' |
| 501 | + files = file_utils.retrieve_file_list(path, 'bz2') |
| 502 | + for file in files: |
| 503 | + filename = os.path.join(path, file) |
| 504 | + unzip(filename) |
| 505 | + |
| 506 | + |
498 | 507 | def launcher_training(): |
499 | 508 | # launcher for creating training data |
500 | 509 | path = '/media/wikipedia_dumps/batch2/' |
— | — | @@ -517,4 +526,5 @@ |
518 | 527 | if __name__ == '__main__': |
519 | 528 | #launcher_training() |
520 | 529 | gc.enable() |
| 530 | + debug() |
521 | 531 | launcher_prediction() |