Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -138,9 +138,8 @@ |
139 | 139 | |
140 | 140 | Output is the data_queue that will be used by store_editors() |
141 | 141 | ''' |
142 | | - input = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'), kwargs.get('project', 'wiki')) |
143 | | - output = os.path.join(input, 'txt') |
144 | | - utils.create_directory(output) |
| 142 | + input = kwargs.get('input', None) |
| 143 | + output = kwargs.get('output', None) |
145 | 144 | debug = kwargs.get('debug', False) |
146 | 145 | destination = kwargs.get('destination', 'file') |
147 | 146 | |
— | — | @@ -301,6 +300,9 @@ |
302 | 301 | |
303 | 302 | def run_parse_editors(location, language, project): |
304 | 303 | ids = load_bot_ids() |
| 304 | + input = os.path.join(location, language, project) |
| 305 | + output = os.path.join(input, 'txt') |
| 306 | + |
305 | 307 | kwargs = {'bots': ids, |
306 | 308 | 'dbname': language + project, |
307 | 309 | 'language': language, |
— | — | @@ -309,26 +311,32 @@ |
310 | 312 | 'destination': 'file', |
311 | 313 | 'nr_input_processors': settings.NUMBER_OF_PROCESSES, |
312 | 314 | 'nr_output_processors': settings.NUMBER_OF_PROCESSES, |
| 315 | + 'input': input, |
| 316 | + 'output': output, |
313 | 317 | } |
314 | 318 | chunks = {} |
315 | 319 | source = os.path.join(location, language, project) |
316 | 320 | files = utils.retrieve_file_list(source, 'xml') |
317 | 321 | parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0)) |
318 | 322 | a = 0 |
| 323 | + |
| 324 | + if not os.path.exists(input): |
| 325 | + utils.create_directory(input) |
| 326 | + if not os.path.exists(output): |
| 327 | + utils.create_directory(output) |
| 328 | + |
319 | 329 | for x in xrange(settings.NUMBER_OF_PROCESSES): |
320 | 330 | b = a + parts |
321 | 331 | chunks[x] = files[a:b] |
322 | 332 | a = (x + 1) * parts |
323 | 333 | |
324 | 334 | pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs) |
325 | | - #search_cache_for_missed_editors(dbname) |
326 | 335 | |
327 | 336 | |
328 | 337 | def debug_parse_editors(dbname): |
329 | 338 | q = JoinableQueue() |
330 | 339 | parse_editors('522.xml', q, None, None, debug=True, destination='file') |
331 | 340 | store_editors(q, [], dbname) |
332 | | - #search_cache_for_missed_editors(dbname) |
333 | 341 | |
334 | 342 | |
335 | 343 | if __name__ == "__main__": |
Index: trunk/tools/editor_trends/utils/sort.py |
— | — | @@ -28,6 +28,7 @@ |
29 | 29 | |
30 | 30 | import settings |
31 | 31 | import utils |
| 32 | +from database import cache, db |
32 | 33 | |
33 | 34 | def quick_sort(obs): |
34 | 35 | if obs == []: |
— | — | @@ -92,6 +93,23 @@ |
93 | 94 | fh.close() |
94 | 95 | |
95 | 96 | |
| 97 | +def store_editors(input, dbname): |
| 98 | + fh = utils.create_txt_filehandle(input, 'merged.txt', 'r', settings.ENCODING) |
| 99 | + mongo = db.init_mongo_db(dbname) |
| 100 | + collection = mongo['editors'] |
| 101 | + collection.ensure_index('editor') |
| 102 | + editor_cache = cache.EditorCache(collection) |
| 103 | + prev_contributor = '' |
| 104 | + for line in readline(fh): |
| 105 | + contributor = line[0] |
| 106 | + if prev_contributor != contributor: |
| 107 | + editor_cache.add('NEXT', '') |
| 108 | + value = {'date': line[1], 'article': line[2]} |
| 109 | + editor_cache.add(contributor, value) |
| 110 | + prev_contributor = contributor |
| 111 | + fh.close() |
| 112 | + |
| 113 | + |
96 | 114 | def debug_merge_sorted_files(input, output): |
97 | 115 | files = utils.retrieve_file_list(input, 'txt', mask='') |
98 | 116 | filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in files] |