Index: trunk/tools/editor_trends/optimize_editors.py
@@ -115,6 +115,8 @@
         'nr_input_processors': 1,
         'nr_output_processors': 0,
     }
+    print len(ids)
+    ids = list(ids)
     chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
     # chunks = {}
     # parts = int(round(float(len(ids)) / 1, 0))
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -55,6 +55,7 @@
 
     nr_input_processors = kwargs.pop('nr_input_processors')
     nr_output_processors = kwargs.pop('nr_output_processors')
+    poison_pill = kwargs.get('poison_pill', True)
     input_queues = {}
     result_queues = {}
 
@@ -63,7 +64,7 @@
     # assert len(obj)== nr_output_processors
 
     for i, o in enumerate(obj):
-        input_queues[i] = load_input_queue(obj[o], poison_pill=True)
+        input_queues[i] = load_input_queue(obj[o], poison_pill=poison_pill)
         if result_queue:
             result_queues[i] = JoinableQueue()
         else:
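This change stops hardcoding `poison_pill=True` and instead reads the flag from `**kwargs` with a default, so callers such as the mergesort launcher in sort.py can opt out. A poison pill is simply a sentinel placed on a queue so that consumer processes know when to stop. A generic sketch of the pattern (the project's actual load_input_queue is not shown in this diff, so the names below are illustrative):

    from multiprocessing import JoinableQueue

    # Illustrative loader: fill a queue with work items and, optionally, append
    # one poison pill (None) per consumer so workers can shut themselves down.
    def load_input_queue(items, poison_pill=True, consumers=1):
        queue = JoinableQueue()
        for item in items:
            queue.put(item)
        if poison_pill:
            for _ in range(consumers):
                queue.put(None)          # sentinel marking the end of the work
        return queue

    def worker(queue):
        while True:
            task = queue.get()
            if task is None:             # poison pill received: stop consuming
                queue.task_done()
                break
            # ... process task ...
            queue.task_done()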
Index: trunk/tools/editor_trends/utils/sort.py
@@ -27,6 +27,7 @@
 import heapq
 from multiprocessing import Queue
 from Queue import Empty
+import datetime
 
 import settings
 import utils
@@ -71,10 +72,10 @@
 
 def readline(file):
     for line in file:
+        line = line.replace('\n', '')
         if line == '':
             continue
         else:
-            line = line.replace('\n', '')
             line = line.split('\t')
             yield line
 
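Moving the newline strip above the empty-line test matters: a line containing only '\n' previously compared unequal to '' and fell through to split('\t'), yielding a bogus single-field record. A self-contained version of the corrected generator, fed with a list standing in for a filehandle:

    # Minimal standalone version of the corrected readline generator; the real
    # one in utils/sort.py reads from a text filehandle created by utils.
    def readline(file):
        for line in file:
            line = line.replace('\n', '')   # strip the newline *before* the test
            if line == '':                  # a line that was just '\n' is skipped
                continue
            yield line.split('\t')

    fake_file = ['12\t2005-01-01T00:00:00Z\t42\n', '\n', '7\t2005-01-02T00:00:00Z\t9\n']
    print(list(readline(fake_file)))
    # [['12', '2005-01-01T00:00:00Z', '42'], ['7', '2005-01-02T00:00:00Z', '9']]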
@@ -102,26 +103,29 @@
 def store_editors(input, filename, dbname):
     fh = utils.create_txt_filehandle(input, filename, 'r', settings.ENCODING)
     mongo = db.init_mongo_db(dbname)
-    collection = mongo['editors']
+    collection = mongo['test']
     mongo.collection.ensure_index('editor')
     editor_cache = cache.EditorCache(collection)
-    prev_contributor = ''
+    prev_contributor = -1
     x = 0
     edits = 0
     editors = set()
     for line in readline(fh):
-        contributor = line[0]
-
+        if len(line) == 0:
+            continue
+        contributor = int(line[0])
         if prev_contributor != contributor:
-            result = editor_cache.add(prev_contributor, 'NEXT')
-            print 'Stored %s editors' % x
+            if edits >= 10:
+                result = editor_cache.add(prev_contributor, 'NEXT')
+                if result:
+                    editors.add(contributor)
+                    x += 1
+                    print 'Stored %s editors' % x
+            else:
+                editor_cache.clear(prev_contributor)
             edits = 0
-            x += 1
-        else:
-            edits += 1
-            if edits == 10:
-                editors.add(contributor)
-        date = utils.convert_timestamp_to_date(line[1])
+        edits += 1
+        date = utils.convert_timestamp_to_date(line[1]) #+ datetime.timedelta(days=1)
         article_id = int(line[2])
         value = {'date': date, 'article': article_id}
         editor_cache.add(contributor, value)
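The rewritten loop now flushes an editor to the cache only once at least 10 edits have been seen for the previous contributor, and clears contributors below that threshold instead of storing them. Stripped of the Mongo-backed EditorCache, the grouping logic over a contributor-sorted stream looks roughly like this (a simplified sketch, not the project's exact bookkeeping):

    # Sketch of the thresholding idea: keep an editor only if the
    # contributor-sorted stream contains at least `threshold` of their edits.
    def filter_active_editors(rows, threshold=10):
        stored = {}
        prev_contributor, edits, buffered = -1, 0, []
        for contributor, date, article_id in rows:
            if prev_contributor != contributor:
                if edits >= threshold:
                    stored[prev_contributor] = buffered   # flush previous editor
                buffered, edits = [], 0                    # otherwise discard
            edits += 1
            buffered.append({'date': date, 'article': article_id})
            prev_contributor = contributor
        if edits >= threshold:                             # flush the final editor
            stored[prev_contributor] = buffered
        return stored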
@@ -177,11 +181,13 @@
         'nr_output_processors': settings.NUMBER_OF_PROCESSES,
         'input': input,
         'output': output,
+        'poison_pill': False
     }
     files = utils.retrieve_file_list(input, 'txt')
     chunks = utils.split_list(files, settings.NUMBER_OF_PROCESSES)
     pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs)
 
+
 def debug_mergesort_feeder(input, output):
     kwargs = {
         'input': input,
@@ -192,6 +198,7 @@
     q = pc.load_queue(chunks[0])
     mergesort_feeder(q, False, **kwargs)
 
+
 if __name__ == '__main__':
     input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'txt')
     output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
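sort.py already imports heapq, and the launcher above splits the sorted text chunks over multiple processes. For reference, merging several already-sorted files back into one stream can be done with the standard library's heapq.merge; this is a generic illustration of that merge step, not necessarily how mergesort_feeder itself works:

    import heapq

    # Generic k-way merge of already-sorted text files; file objects iterate
    # line by line, and heapq.merge interleaves the lines in sorted order.
    def merge_sorted_files(paths):
        handles = [open(path) for path in paths]
        try:
            for line in heapq.merge(*handles):
                yield line
        finally:
            for fh in handles:
                fh.close()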
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -36,9 +36,9 @@
 
 def retrieve_editor_ids_mongo(dbname, collection):
     if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
-                               retrieve_editor_ids_mongo):
+                               'editors.bin'):
         ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
-                                retrieve_editor_ids_mongo)
+                                'editors.bin')
     else:
         mongo = db.init_mongo_db(dbname)
         editors = mongo[collection]
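The fix passes the literal filename 'editors.bin' where the function object retrieve_editor_ids_mongo was previously being used as the cache filename by mistake. The cache-on-disk pattern itself, under the assumption that utils.load_object and its counterpart wrap pickle (the real helpers live in utils.py and may differ), looks roughly like:

    import os
    try:
        import cPickle as pickle   # Python 2
    except ImportError:
        import pickle              # Python 3

    # Sketch of the load-or-build cache pattern, assuming pickle-backed helpers.
    def load_or_build_ids(location, filename='editors.bin', build=set):
        path = os.path.join(location, filename)
        if os.path.exists(path):                 # cached binary object found
            with open(path, 'rb') as fh:
                return pickle.load(fh)
        ids = build()                            # e.g. query MongoDB for editor ids
        with open(path, 'wb') as fh:
            pickle.dump(ids, fh)
        return ids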