r76455 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r76454 | r76455 | r76456 >
Date:16:39, 10 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Small bugfixes for mergesort functionality.
Modified paths:
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/optimize_editors.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/sort.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/optimize_editors.py
@@ -115,6 +115,8 @@
116116 'nr_input_processors': 1,
117117 'nr_output_processors': 0,
118118 }
 119+ print len(ids)
 120+ ids = list(ids)
119121 chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
120122 # chunks = {}
121123 # parts = int(round(float(len(ids)) / 1, 0))
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -55,6 +55,7 @@
5656
5757 nr_input_processors = kwargs.pop('nr_input_processors')
5858 nr_output_processors = kwargs.pop('nr_output_processors')
 59+ poison_pill = kwargs.get('poison_pill', True)
5960 input_queues = {}
6061 result_queues = {}
6162
@@ -63,7 +64,7 @@
6465 # assert len(obj)== nr_output_processors
6566
6667 for i, o in enumerate(obj):
67 - input_queues[i] = load_input_queue(obj[o], poison_pill=True)
 68+ input_queues[i] = load_input_queue(obj[o], poison_pill=poison_pill)
6869 if result_queue:
6970 result_queues[i] = JoinableQueue()
7071 else:
Index: trunk/tools/editor_trends/utils/sort.py
@@ -27,6 +27,7 @@
2828 import heapq
2929 from multiprocessing import Queue
3030 from Queue import Empty
 31+import datetime
3132
3233 import settings
3334 import utils
@@ -71,10 +72,10 @@
7273
7374 def readline(file):
7475 for line in file:
 76+ line = line.replace('\n', '')
7577 if line == '':
7678 continue
7779 else:
78 - line = line.replace('\n', '')
7980 line = line.split('\t')
8081 yield line
8182
@@ -102,26 +103,29 @@
103104 def store_editors(input, filename, dbname):
104105 fh = utils.create_txt_filehandle(input, filename, 'r', settings.ENCODING)
105106 mongo = db.init_mongo_db(dbname)
106 - collection = mongo['editors']
 107+ collection = mongo['test']
107108 mongo.collection.ensure_index('editor')
108109 editor_cache = cache.EditorCache(collection)
109 - prev_contributor = ''
 110+ prev_contributor = -1
110111 x = 0
111112 edits = 0
112113 editors = set()
113114 for line in readline(fh):
114 - contributor = line[0]
115 -
 115+ if len(line) == 0:
 116+ continue
 117+ contributor = int(line[0])
116118 if prev_contributor != contributor:
117 - result = editor_cache.add(prev_contributor, 'NEXT')
118 - print 'Stored %s editors' % x
 119+ if edits >= 10:
 120+ result = editor_cache.add(prev_contributor, 'NEXT')
 121+ if result:
 122+ editors.add(contributor)
 123+ x += 1
 124+ print 'Stored %s editors' % x
 125+ else:
 126+ editor_cache.clear(prev_contributor)
119127 edits = 0
120 - x += 1
121 - else:
122 - edits += 1
123 - if edits == 10:
124 - editors.add(contributor)
125 - date = utils.convert_timestamp_to_date(line[1])
 128+ edits += 1
 129+ date = utils.convert_timestamp_to_date(line[1]) #+ datetime.timedelta(days=1)
126130 article_id = int(line[2])
127131 value = {'date': date, 'article': article_id}
128132 editor_cache.add(contributor, value)
@@ -177,11 +181,13 @@
178182 'nr_output_processors': settings.NUMBER_OF_PROCESSES,
179183 'input': input,
180184 'output': output,
 185+ 'poison_pill': False
181186 }
182187 files = utils.retrieve_file_list(input, 'txt')
183188 chunks = utils.split_list(files, settings.NUMBER_OF_PROCESSES)
184189 pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs)
185190
 191+
186192 def debug_mergesort_feeder(input, output):
187193 kwargs = {
188194 'input': input,
@@ -192,6 +198,7 @@
193199 q = pc.load_queue(chunks[0])
194200 mergesort_feeder(q, False, **kwargs)
195201
 202+
196203 if __name__ == '__main__':
197204 input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
198205 output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -36,9 +36,9 @@
3737
3838 def retrieve_editor_ids_mongo(dbname, collection):
3939 if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
40 - retrieve_editor_ids_mongo):
 40+ 'editors.bin'):
4141 ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
42 - retrieve_editor_ids_mongo)
 42+ 'editors.bin')
4343 else:
4444 mongo = db.init_mongo_db(dbname)
4545 editors = mongo[collection]

Follow-up revisions

Revision | Commit summary | Author | Date
r91115 | MFT r76455, r80805 and r83564-91107 | awjrichards | 21:53, 29 June 2011

Status & tagging log