r76417 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r76416 | r76417 | r76418 >
Date:	22:43, 9 November 2010
Author:	diederik
Status:	deferred
Tags:
Comment:	Changed MongoDB schema to handle cases where editors have more than 4 MB of edit observations.
Modified paths:	/trunk/tools/editor_trends/construct_datasets.py (modified) (history) /trunk/tools/editor_trends/database/cache.py (modified) (history) /trunk/tools/editor_trends/optimize_editors.py (modified) (history) /trunk/tools/editor_trends/run.py (modified) (history) /trunk/tools/editor_trends/settings.py (modified) (history) /trunk/tools/editor_trends/utils/sort.py (modified) (history) /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/optimize_editors.py
—	—	@@ -25,6 +25,7 @@
26	26	import settings
27	27	from database import db
28	28	from utils import process_constructor as pc
	29	+from utils import utils
29	30	import construct_datasets
30	31
31	32
—	—	@@ -67,20 +68,25 @@
68	69	return articles
69	70
70	71
71		~~-def optimize_editors(input_queue, result_queue, pbar, kwargs):~~
	72	+def sort_edits(edits):
	73	+ edits = utils.merge_list(edits)
	74	+ return sorted(edits, key=itemgetter('date'))
	75	+
	76	+
	77	+def optimize_editors(input_queue, result_queue, pbar, **kwargs):
72	78	dbname = kwargs.pop('dbname')
73	79	mongo = db.init_mongo_db(dbname)
74	80	input = mongo['editors']
75	81	output = mongo['dataset']
76		~~- mongo.output.ensure_index('editor')~~
77		~~- mongo.output.ensure_index('year_joined')~~
	82	+ output.ensure_index('editor')
	83	+ output.ensure_index('year_joined')
78	84	definition = kwargs.pop('definition')
79	85	while True:
80	86	try:
81	87	id = input_queue.get(block=False)
82	88	editor = input.find_one({'editor': id})
83	89	edits = editor['edits']
84		~~- edits = sorted(edits, key=itemgetter('date'))~~
	90	+ edits = sort_edits(edits)
85	91	edit_count = len(edits)
86	92	new_wikipedian = edits[9]['date']
87	93	first_edit = edits[0]['date']
—	—	@@ -100,6 +106,7 @@
101	107	except Empty:
102	108	break
103	109
	110	+
104	111	def run_optimize_editors(dbname):
105	112	ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
106	113	kwargs = {'definition': 'traditional',
—	—	@@ -108,15 +115,16 @@
109	116	'nr_input_processors': 1,
110	117	'nr_output_processors': 0,
111	118	}
112		~~- chunks = {}~~
113		~~- parts = int(round(float(len(ids)) / 1, 0))~~
114		~~- a = 0~~
115		~~- for x in xrange(settings.NUMBER_OF_PROCESSES):~~
116		~~- b = a + parts~~
117		~~- chunks[x] = ids[a:b]~~
118		~~- a = (x + 1) * parts~~
119		~~- if a >= len(ids):~~
120		~~- break~~
	119	+ chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
	120	+# chunks = {}
	121	+# parts = int(round(float(len(ids)) / 1, 0))
	122	+# a = 0
	123	+# for x in xrange(settings.NUMBER_OF_PROCESSES):
	124	+# b = a + parts
	125	+# chunks[x] = ids[a:b]
	126	+# a = (x + 1) * parts
	127	+# if a >= len(ids):
	128	+# break
121	129
122	130	pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
123	131
—	—	@@ -131,5 +139,5 @@
132	140
133	141
134	142	if __name__ == '__main__':
135		~~- debug_optimize_editors('test')~~
136		~~- #run_optimize_editors('test')~~
	143	+ #debug_optimize_editors('test')
	144	+ run_optimize_editors('enwiki')
Index: trunk/tools/editor_trends/settings.py
—	—	@@ -22,19 +22,19 @@
23	23	the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
24	24	'''
25	25
26		-
27	26	from multiprocessing import cpu_count
28	27	import os
29	28	import sys
30	29	import platform
31		~~-#try:~~
32		~~-# from pywin import win32file~~
33		~~-# '''increase the maximum number of open files on Windows to 1024'''~~
34		~~-# win32file._setmaxstdio(1024)~~
35		~~-#except ImportError:~~
36		~~-# pass~~
37	30
38	31	try:
	32	+ from pywin import win32file
	33	+ '''increase the maximum number of open files on Windows to 1024'''
	34	+ win32file._setmaxstdio(1024)
	35	+except ImportError:
	36	+ pass
	37	+
	38	+try:
39	39	import resource
40	40	except ImportError:
41	41	pass
—	—	@@ -151,7 +151,7 @@
152	152	NUMBER_OF_PROCESSES = cpu_count() * 1
153	153
154	154	#Extensions of ascii files, this is used to determine the filemode to use
155		~~-ASCII = ['txt', 'csv', 'xml', 'sql']~~
	155	+ASCII = ['txt', 'csv', 'xml', 'sql', 'json']
156	156
157	157	WP_DUMP_LOCATION = 'http://download.wikimedia.org'
158	158
Index: trunk/tools/editor_trends/run.py
—	—	@@ -33,5 +33,5 @@
34	34	output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
35	35	dbname = 'enwiki'
36	36	#sort.debug_mergesort_feeder(input, output)
37		~~-sort.mergesort_launcher(input, output)~~
38		~~-#sort.mergesort_external_launcher(dbname, output, output)~~
\ No newline at end of file
	37	+#sort.mergesort_launcher(input, output)
	38	+sort.mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
Index: trunk/tools/editor_trends/utils/utils.py
—	—	@@ -326,6 +326,13 @@
327	327	files.append('.'.join(file))
328	328	return files
329	329
	330	+def merge_list(datalist):
	331	+ merged = []
	332	+ for d in datalist:
	333	+ for x in datalist[d]:
	334	+ merged.append(x)
	335	+ return merged
	336	+
330	337	def split_list(datalist, maxval):
331	338	chunks = {}
332	339	a = 0
Index: trunk/tools/editor_trends/utils/sort.py
—	—	@@ -32,8 +32,8 @@
33	33	import utils
34	34	import process_constructor as pc
35	35	from database import cache
	36	+from database import db
36	37
37		-
38	38	def quick_sort(obs):
39	39	if obs == []:
40	40	return []
—	—	@@ -106,14 +106,28 @@
107	107	mongo.collection.ensure_index('editor')
108	108	editor_cache = cache.EditorCache(collection)
109	109	prev_contributor = ''
110		~~- for line in readline(file):~~
	110	+ x = 0
	111	+ edits = 0
	112	+ editors = set()
	113	+ for line in readline(fh):
111	114	contributor = line[0]
	115	+
112	116	if prev_contributor != contributor:
113		~~- editor_cache.add('NEXT', '')~~
114		~~- value = {'date': line[1], 'article': line[2]}~~
	117	+ result = editor_cache.add(prev_contributor, 'NEXT')
	118	+ print 'Stored %s editors' % x
	119	+ edits = 0
	120	+ x += 1
	121	+ else:
	122	+ edits += 1
	123	+ if edits == 10:
	124	+ editors.add(contributor)
	125	+ date = utils.convert_timestamp_to_date(line[1])
	126	+ article_id = int(line[2])
	127	+ value = {'date': date, 'article': article_id}
115	128	editor_cache.add(contributor, value)
116	129	prev_contributor = contributor
117	130	fh.close()
	131	+ utils.store_object(editors, settings.BINARY_OBJECT_FILE_LOCATION, 'editors')
118	132
119	133
120	134	def mergesort_external_launcher(dbname, input, output):
—	—	@@ -126,15 +140,17 @@
127	141	chunks = utils.split_list(files, int(x))
128	142	'''1st iteration external mergesort'''
129	143	for chunk in chunks:
130		~~- filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in chunks[chunk]]~~
131		~~- filename = merge_sorted_files(output, filehandles, chunk)~~
132		~~- filehandles = [fh.close() for fh in filehandles]~~
	144	+# filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in chunks[chunk]]
	145	+# filename = merge_sorted_files(output, filehandles, chunk)
	146	+# filehandles = [fh.close() for fh in filehandles]
	147	+ pass
133	148	'''2nd iteration external mergesort, if necessary'''
134	149	if len(chunks) > 1:
135		~~- files = utils.retrieve_file_list(output, 'txt', mask='[merged]')~~
136		~~- filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.ENCODING) for file in files]~~
137		~~- filename = merge_sorted_files(output, filehandles, 'final')~~
138		~~- filehandles = [fh.close() for fh in filehandles]~~
	150	+# files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
	151	+# filehandles = [utils.create_txt_filehandle(output, file, 'r', settings.ENCODING) for file in files]
	152	+# filename = merge_sorted_files(output, filehandles, 'final')
	153	+# filehandles = [fh.close() for fh in filehandles]
	154	+ filename = 'merged_final.txt'
139	155	store_editors(output, filename, dbname)
140	156
141	157
Index: trunk/tools/editor_trends/construct_datasets.py
—	—	@@ -82,7 +82,7 @@
83	83	return headers
84	84
85	85
86		~~-def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):~~
	86	+def generate_editor_dataset(input_queue, data_queue, pbar, **kwargs):
87	87	debug = kwargs.pop('debug')
88	88	dbname = kwargs.pop('dbname')
89	89	mongo = db.init_mongo_db(dbname)
—	—	@@ -143,16 +143,17 @@
144	144	'dbname': dbname,
145	145	}
146	146	ids = retrieve_editor_ids_mongo(dbname, 'editors')
147		~~- chunks = {}~~
148		~~- parts = int(round(float(len(ids)) / 1, 0))~~
149		~~- a = 0~~
150		~~- for x in xrange(settings.NUMBER_OF_PROCESSES):~~
151		~~- b = a + parts~~
152		~~- chunks[x] = ids[a:b]~~
153		~~- a = (x + 1) * parts~~
154		~~- if a >= len(ids):~~
155		~~- break~~
156		-
	147	+ chunks = utils.split_list(ids, settings.NUMBER_OF_PROCESSES)
	148	+# chunks = {}
	149	+# parts = int(round(float(len(ids)) / 1, 0))
	150	+# a = 0
	151	+# for x in xrange(settings.NUMBER_OF_PROCESSES):
	152	+# b = a + parts
	153	+# chunks[x] = ids[a:b]
	154	+# a = (x + 1) * parts
	155	+# if a >= len(ids):
	156	+# break
	157	+#
157	158	pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks, False, False, **kwargs)
158	159
159	160
—	—	@@ -169,5 +170,5 @@
170	171
171	172	if __name__ == '__main__':
172	173	#generate_editor_dataset_debug('test')
173		~~- generate_editor_dataset_launcher('test')~~
	174	+ generate_editor_dataset_launcher('enwiki')
174	175	#debug_retrieve_edits_by_contributor_launcher()
Index: trunk/tools/editor_trends/database/cache.py
—	—	@@ -64,12 +64,14 @@
65	65	return sum([self.editors[k].get('obs', 0) for k in self.editors])
66	66
67	67	def add(self, key, value):
68		~~- if key == 'NEXT':~~
	68	+ if value == 'NEXT':
69	69	for editor in self.treshold_editors:
70		~~- self.update(editor, self.editors[editor]['edits'])~~
	70	+ self.insert (editor, self.editors[editor]['edits'])
71	71	self.n -= self.editors[editor]['obs']
72	72	self.number_editors -= 1
73	73	del self.editors[editor]
	74	+ if key in self.editors:
	75	+ del self.editors[key]
74	76	self.treshold_editors = set()
75	77	else:
76	78	self.cumulative_n += 1
—	—	@@ -77,19 +79,33 @@
78	80	if key not in self.editors:
79	81	self.editors[key] = {}
80	82	self.editors[key]['obs'] = 0
81		~~- self.editors[key]['edits'] = []~~
	83	+ self.editors[key]['edits'] = {}
	84	+ self.add_years(key)
82	85	self.number_editors += 1
83		-
	86	+
84	87	id = str(self.editors[key]['obs'])
85		~~- self.editors[key]['edits'].append(value)~~
	88	+ year = str(value['date'].year)
	89	+ self.editors[key]['edits'][year].append(value)
86	90	self.editors[key]['obs'] += 1
87	91
88	92	if self.editors[key]['obs'] == self.treshold:
89	93	self.treshold_editors.add(key)
90	94
	95	+ def add_years(self, key):
	96	+ now = datetime.datetime.now().year + 1
	97	+ for year in xrange(2001, now):
	98	+ self.editors[key]['edits'][str(year)] = []
	99	+
	100	+
91	101	def update(self, editor, values):
92	102	self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
93	103
	104	+ def insert(self, editor, values):
	105	+ try:
	106	+ self.collection.insert({'editor': editor, 'edits': values})
	107	+ except:
	108	+ pass
	109	+
94	110	def store(self):
95	111	utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__())
96	112

Status & tagging log

10:07, 3 December 2010 Reedy (talk | contribs) changed the status of r76417 [removed: new added: deferred]