r76348 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r76347 | r76348 | r76349 >
Date:23:09, 8 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Added store in MongoDB function.
Modified paths:
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/utils/sort.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -138,9 +138,8 @@
139139
140140 Output is the data_queue that will be used by store_editors()
141141 '''
142 - input = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'), kwargs.get('project', 'wiki'))
143 - output = os.path.join(input, 'txt')
144 - utils.create_directory(output)
 142+ input = kwargs.get('input', None)
 143+ output = kwargs.get('output', None)
145144 debug = kwargs.get('debug', False)
146145 destination = kwargs.get('destination', 'file')
147146
@@ -301,6 +300,9 @@
302301
303302 def run_parse_editors(location, language, project):
304303 ids = load_bot_ids()
 304+ input = os.path.join(location, language, project)
 305+ output = os.path.join(input, 'txt')
 306+
305307 kwargs = {'bots': ids,
306308 'dbname': language + project,
307309 'language': language,
@@ -309,26 +311,32 @@
310312 'destination': 'file',
311313 'nr_input_processors': settings.NUMBER_OF_PROCESSES,
312314 'nr_output_processors': settings.NUMBER_OF_PROCESSES,
 315+ 'input': input,
 316+ 'output': output,
313317 }
314318 chunks = {}
315319 source = os.path.join(location, language, project)
316320 files = utils.retrieve_file_list(source, 'xml')
317321 parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
318322 a = 0
 323+
 324+ if not os.path.exists(input):
 325+ utils.create_directory(input)
 326+ if not os.path.exists(output):
 327+ utils.create_directory(output)
 328+
319329 for x in xrange(settings.NUMBER_OF_PROCESSES):
320330 b = a + parts
321331 chunks[x] = files[a:b]
322332 a = (x + 1) * parts
323333
324334 pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
325 - #search_cache_for_missed_editors(dbname)
326335
327336
328337 def debug_parse_editors(dbname):
329338 q = JoinableQueue()
330339 parse_editors('522.xml', q, None, None, debug=True, destination='file')
331340 store_editors(q, [], dbname)
332 - #search_cache_for_missed_editors(dbname)
333341
334342
335343 if __name__ == "__main__":
Index: trunk/tools/editor_trends/utils/sort.py
@@ -28,6 +28,7 @@
2929
3030 import settings
3131 import utils
 32+from database import cache
3233
3334 def quick_sort(obs):
3435 if obs == []:
@@ -92,6 +93,23 @@
9394 fh.close()
9495
9596
 97+def store_editors(input, dbname):
 98+ fh = utils.create_txt_filehandle(input, 'merged.txt', 'r', settings.ENCODING)
 99+ mongo = db.init_mongo_db(dbname)
 100+ collection = mongo['editors']
 101+ mongo.collection.ensure_index('editor')
 102+ editor_cache = cache.EditorCache(collection)
 103+ prev_contributor = ''
 104+ for line in readline(file):
 105+ contributor = line[0]
 106+ if prev_contributor != contributor:
 107+ editor_cache.add('NEXT', '')
 108+ value = {'date': line[1], 'article': line[2]}
 109+ editor_cache.add(contributor, value)
 110+ prev_contributor = contributor
 111+ fh.close()
 112+
 113+
96114 def debug_merge_sorted_files(input, output):
97115 files = utils.retrieve_file_list(input, 'txt', mask='')
98116 filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in files]

Status & tagging log