Index: trunk/tools/editor_trends/manage.py
@@ -33,9 +33,9 @@
 from utils import utils
 from utils import dump_downloader
 from etl import chunker
-import map_wiki_editors
-import optimize_editors
-import construct_datasets
+from etl import extract
+from etl import optimize_editors
+from etl import construct_datasets
 import config
 
 
@@ -155,7 +155,7 @@
 
 def mongodb_script_launcher(args, location, filename, project, full_project, language_code, language):
     print 'mongodb_script_launcher'
-    map_wiki_editors.run_parse_editors(project, language_code, location)
+    extract.run_parse_editors(project, language_code, location)
 
 
 def sort_launcher(args, location, filename, project, full_project, language_code):
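
The manage.py hunks only reroute imports: map_wiki_editors.py has become etl/extract.py, and optimize_editors and construct_datasets now live in the etl package as well. Because manage.py sits at the project root it can import the moved modules directly, while the modules inside etl/ need the sys.path.append('..') shim added in the hunks below before import configuration resolves. A minimal sketch of the new call path, using only names that appear in this diff (Python 2, like the rest of the tree):

    # manage.py sits at trunk/tools/editor_trends/, so the etl package is
    # importable without any path manipulation.
    from etl import extract

    def mongodb_script_launcher(args, location, filename, project,
                                full_project, language_code, language):
        print 'mongodb_script_launcher'
        # Same entry point as before the rename, reached via its new module path.
        extract.run_parse_editors(project, language_code, location)
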
Index: trunk/tools/editor_trends/etl/optimize_editors.py
@@ -21,7 +21,9 @@
 from Queue import Empty
 from operator import itemgetter
 import datetime
+import sys
 
+sys.path.append('..')
 import configuration
 settings = configuration.Settings()
 from database import db
@@ -154,7 +156,7 @@
     }
     print len(ids)
     ids = list(ids)
-    chunks = dict(0, ids)
+    chunks = {0: ids}
     pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
 
 
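
Two separate things happen in optimize_editors.py. The first hunk adds the sys.path.append('..') shim so that import configuration still resolves now that the module lives one level below the project root. The second hunk fixes a real bug: dict(0, ids) is not a valid constructor call, because dict() accepts a mapping, an iterable of key/value pairs, or keyword arguments, never a key and a value as two positional arguments, so the old line raised a TypeError before build_scaffolding was ever reached. The dict literal builds the intended one-entry mapping from chunk number to the list of editor ids. A quick illustration (Python 2, as elsewhere in the tree):

    ids = ['editor_1', 'editor_2', 'editor_3']

    # Broken: raises "TypeError: dict expected at most 1 arguments, got 2".
    # chunks = dict(0, ids)

    # Fixed: a dict literal builds the single bucket, keyed by chunk number,
    # that pc.build_scaffolding consumes.
    chunks = {0: ids}
    print chunks[0]    # ['editor_1', 'editor_2', 'editor_3']
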
Index: trunk/tools/editor_trends/etl/construct_datasets.py
@@ -21,9 +21,10 @@
 from Queue import Empty
 import datetime
 from dateutil.relativedelta import *
-
+import sys
 import progressbar
 
+sys.path.append('..')
 import configuration
 settings = configuration.Settings()
 from utils import models, utils
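
construct_datasets.py gets the same import shim. Appending '..' to sys.path makes import configuration work only when the interpreter is started from inside the etl/ directory, because a relative sys.path entry is resolved against the current working directory, not against the file. A sketch of the shim together with a path-independent variant; the __file__-based line is an illustration of an alternative, not something this commit adds:

    import os
    import sys

    # What the commit does: rely on the process running inside etl/ so that
    # '..' points at the project root where configuration.py lives.
    sys.path.append('..')

    # Path-independent variant (illustration only): derive the parent
    # directory from this file's location instead of the working directory.
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

    import configuration
    settings = configuration.Settings()
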
Index: trunk/tools/editor_trends/etl/loader.py
@@ -17,8 +17,9 @@
 __date__ = '2010-11-16'
 __version__ = '0.1'
 
-
+import os
 import sys
+from Queue import Empty
 
 sys.path.append('..')
 import configuration
@@ -26,9 +27,11 @@
 from database import db
 from database import cache
 from utils import utils
+from utils import sort
 import process_constructor as pc
 
 
+
 def store_editors(input, filename, dbname):
     fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
     mongo = db.init_mongo_db(dbname)
@@ -77,17 +80,17 @@
     chunks = utils.split_list(files, int(x))
     '''1st iteration external mergesort'''
     for chunk in chunks:
-        filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
-        filename = merge_sorted_files(output, filehandles, chunk)
-        filehandles = [fh.close() for fh in filehandles]
+        #filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
+        #filename = sort.merge_sorted_files(output, filehandles, chunk)
+        #filehandles = [fh.close() for fh in filehandles]
        pass
     '''2nd iteration external mergesort, if necessary'''
     if len(chunks) > 1:
         files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
         filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.encoding) for file in files]
-        filename = merge_sorted_files(output, filehandles, 'final')
+        filename = sort.merge_sorted_files(output, filehandles, 'final')
         filehandles = [fh.close() for fh in filehandles]
-    filename = 'merged_final.txt'
+    filename = 'merged_final.txt'
     store_editors(output, filename, dbname)
 
 
@@ -102,8 +105,8 @@
             fh.close()
             data = [d.replace('\n', '') for d in data]
             data = [d.split('\t') for d in data]
-            sorted_data = mergesort(data)
-            write_sorted_file(sorted_data, file, output)
+            sorted_data = sort.mergesort(data)
+            sort.write_sorted_file(sorted_data, file, output)
         except Empty:
             break
 
@@ -136,5 +139,5 @@
     input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
     output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
     dbname = 'enwiki'
-    mergesort_launcher(input, output)
+    #mergesort_launcher(input, output)
     mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
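
The loader.py changes wire the new utils/sort module into a two-pass external mergesort: each chunk file is sorted in memory by the worker (sort.mergesort plus sort.write_sorted_file), the first merge pass over the chunk files is currently commented out, and a second pass via sort.merge_sorted_files runs only when more than one merged file exists. The sort helpers' internals are not part of this diff, so the following is a generic sketch of the same two-pass idea built on the standard library's heapq.merge, meant to illustrate the technique rather than the repository's implementation; all names and signatures below are assumptions:

    import heapq
    import os

    def merge_sorted_files(output_dir, paths, suffix):
        # k-way merge of already line-sorted text files into merged_<suffix>.txt.
        # heapq.merge streams its inputs, so memory use stays flat no matter
        # how large the chunk files are.
        target = os.path.join(output_dir, 'merged_%s.txt' % suffix)
        handles = [open(p) for p in paths]
        with open(target, 'w') as out:
            for line in heapq.merge(*handles):
                out.write(line)
        for handle in handles:
            handle.close()
        return target

    def external_mergesort(sorted_chunks, output_dir, group_size=10):
        '''1st pass merges groups of chunk files; a 2nd pass runs only if
        the 1st pass produced more than one file.'''
        groups = [sorted_chunks[i:i + group_size]
                  for i in range(0, len(sorted_chunks), group_size)]
        merged = [merge_sorted_files(output_dir, group, n)
                  for n, group in enumerate(groups)]
        if len(merged) > 1:
            return merge_sorted_files(output_dir, merged, 'final')
        return merged[0]
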
Index: trunk/tools/editor_trends/utils/utils.py
@@ -331,6 +331,8 @@
     all_files = os.listdir(location)
     files = []
     for file in all_files:
+        if file == 'merged_1.txt':
+            print 'debug'
         file = file.split('.')
         if len(file) == 1:
             continue
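
The only change to utils.retrieve_file_list is a temporary trace: when the directory listing reaches 'merged_1.txt' the function prints 'debug', presumably as an anchor while chasing the merge problem above. The visible lines show only the start of the extension filter (split on '.', skip names without an extension); the completion below, including how the mask argument is applied, is an assumed reconstruction rather than the repository's code:

    import os

    def retrieve_file_list(location, extension, mask=None):
        files = []
        for name in os.listdir(location):
            parts = name.split('.')
            if len(parts) == 1:
                # No extension at all, skip (this check is visible in the hunk).
                continue
            if parts[-1] != extension:
                continue
            if mask is not None and mask not in name:
                # Assumption: treat mask as a plain substring filter.
                continue
            files.append(name)
        return files
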
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -74,7 +74,7 @@
         else:
             result_queues[i] = False
 
-    if settings.progress_bar:
+    if settings.progressbar:
         size = sum([input_queues[q].qsize() for q in input_queues])
         pbar = progressbar.ProgressBar(maxval=size).start()
         kwargs['pbar'] = pbar
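
The Settings object evidently exposes the progress-bar switch as progressbar, so the old settings.progress_bar lookup raised an AttributeError whenever build_scaffolding ran with the bar enabled; the hunk simply corrects the attribute name. The surrounding lines size the bar from the summed backlog of the per-process input queues using the classic progressbar package API (ProgressBar(maxval=...), start(), update(), finish()). A small, self-contained sketch of that usage; the fixed total stands in for the qsize() sum, which is approximate and not implemented on every platform:

    import progressbar

    size = 250    # stand-in for sum([input_queues[q].qsize() for q in input_queues])
    pbar = progressbar.ProgressBar(maxval=size).start()
    for done in range(size):
        # update() expects the absolute amount of completed work, not a delta.
        pbar.update(done + 1)
    pbar.finish()
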
Index: trunk/tools/editor_trends/utils/sort.py
@@ -28,9 +28,17 @@
 from multiprocessing import Queue
 from Queue import Empty
 import datetime
+import sys
 
+sys.path.append('..')
+import configuration
+settings = configuration.Settings()
 
 
+import utils
+
+
+
 def quick_sort(obs):
     if obs == []:
         return []
@@ -68,6 +76,7 @@
 
 def readline(file):
     for line in file:
+        print file.stream.name
         line = line.replace('\n', '')
         if line == '':
             continue
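
sort.py picks up the same configuration boilerplate as the etl modules, imports its sibling utils module, and gains one temporary trace: readline() now prints file.stream.name for every line it reads, which suggests the handles passed in are codecs wrappers (codecs.open() returns a StreamReaderWriter whose underlying file object is exposed as .stream). Only the top of readline() appears in the hunk; the sketch below completes it as a generator that strips newlines and skips blank lines, with the debug print left out, so the yield and the usage comment are assumptions:

    import codecs

    def readline(fh):
        # Walk a text file handle, dropping the trailing newline and skipping
        # empty lines; works with plain files and codecs-wrapped handles alike.
        for line in fh:
            line = line.replace('\n', '')
            if line == '':
                continue
            yield line

    # Assumed usage with a codecs handle such as utils.create_txt_filehandle
    # appears to return:
    #   fh = codecs.open('merged_final.txt', 'r', 'utf-8')
    #   for line in readline(fh):
    #       ...
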