r76345 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r76344 | r76345 | r76346 >
Date:22:12, 8 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Comment:Added mergesort module. By presorting data, significant reductions in processing time are achieved.
Modified paths:
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/sort.py (added) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -79,17 +79,19 @@
8080 return project
8181
8282
83 -def generate_wikidump_filename(args):
84 - return '%s-%s-%s' % (retrieve_projectname(args), 'latest', get_value(args, 'file'))
 83+def generate_wikidump_filename(project, args):
 84+ return '%s-%s-%s' % (project, 'latest', get_value(args, 'file'))
8585
8686
8787 def determine_file_locations(args):
8888 locations = {}
8989 location = get_value(args, 'location') if get_value(args, 'location') != None else settings.XML_FILE_LOCATION
90 - locations['language_code'] = retrieve_language(args)
91 - locations['location'] = os.path.join(location, retrieve_language(args))
 90+ project = retrieve_project(args)
 91+ language_code = retrieve_language(args)
 92+ locations['language_code'] = language_code
 93+ locations['location'] = os.path.join(location, language_code, project)
9294 locations['project'] = retrieve_projectname(args)
93 - locations['filename'] = generate_wikidump_filename(args)
 95+ locations['filename'] = generate_wikidump_filename(project, args)
9496 return locations
9597
9698
@@ -189,6 +191,12 @@
190192 except UnicodeEncodeError:
191193 print '%s' % language
192194
 195+
 196+def detect_python_version():
 197+ version = ''.join(sys.version_info[0:2])
 198+ if version < settings.MINIMUM_PYTHON_VERSION:
 199+ raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).'
 200+
193201 def about():
194202 print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.'
195203 print 'Written by Diederik van Liere (dvanliere@gmail.com).'
@@ -253,6 +261,7 @@
254262 parser.add_argument('-prog', '--progress', action='store_true', default=True,
255263 help='Indicate whether you want to have a progressbar.')
256264
 265+ detect_python_version()
257266 args = parser.parse_args()
258267 config.load_configuration(args)
259268 locations = determine_file_locations(args)
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -88,20 +88,22 @@
8989 return - 1
9090
9191
92 -def output_editor_information(elem, data_queue, **kwargs):
 92+def output_editor_information(elem, output, **kwargs):
9393 '''
9494 @elem is an XML element containing 1 revision from a page
95 - @data_queue is where to store the data
 95+ @output is where to store the data, either a queue or a filehandle
9696 @**kwargs contains extra information
9797
9898 the variable tags determines which attributes are being parsed, the values in
9999 this dictionary are the functions used to extract the data.
100100 '''
101 - tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot},
 101+ tags = {'contributor': {'editor': extract_contributor_id,
 102+ 'bot': determine_username_is_bot},
102103 'timestamp': {'date': xml.extract_text},
103104 }
104105 vars = {}
105 -
 106+ headers = ['editor', 'date', 'article']
 107+ destination = kwargs.pop('destination')
106108 revisions = elem.findall('revision')
107109 for revision in revisions:
108110 vars['article'] = elem.find('id').text.decode(settings.ENCODING)
@@ -114,12 +116,19 @@
115117 #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
116118 if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
117119 vars.pop('bot')
118 - vars['date'] = utils.convert_timestamp_to_date(vars['date'])
119 - data_queue.put(vars)
 120+ if destination == 'queue':
 121+ output.put(vars)
 122+ vars['date'] = utils.convert_timestamp_to_date(vars['date'])
 123+ elif destination == 'file':
 124+ data =[]
 125+ for head in headers:
 126+ data.append(vars[head])
 127+ utils.write_list_to_csv(data, output)
 128+ output.write('\n')
120129 vars = {}
121130
122131
123 -def parse_editors(xml_queue, data_queue, pbar, bots, **kwargs):
 132+def parse_editors(xml_queue, output, pbar, bots, **kwargs):
124133 '''
125134 @xml_queue contains the filenames of the files to be parsed
126135 @data_queue is an instance of Queue where the extracted data is stored for
@@ -130,8 +139,10 @@
131140
132141 Output is the data_queue that will be used by store_editors()
133142 '''
134 - file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'))
135 - debug = kwargs.get('debug', None)
 143+ file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'), kwargs.get('project', 'wiki'))
 144+ debug = kwargs.get('debug', False)
 145+ destination = kwargs.get('destination', 'file')
 146+
136147 if settings.DEBUG:
137148 messages = {}
138149 vars = {}
@@ -145,9 +156,13 @@
146157 if file == None:
147158 print 'Swallowed a poison pill'
148159 break
 160+
149161 data = xml.read_input(utils.create_txt_filehandle(file_location,
150162 file, 'r',
151163 encoding=settings.ENCODING))
 164+ if destination == 'file':
 165+ name = file[:-4] + '.txt'
 166+ output = utils.create_txt_filehandle(file_location, name, 'w', settings.ENCODING)
152167 for raw_data in data:
153168 xml_buffer = cStringIO.StringIO()
154169 raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
@@ -156,7 +171,7 @@
157172 raw_data = ''.join(raw_data)
158173 xml_buffer.write(raw_data)
159174 elem = cElementTree.XML(xml_buffer.getvalue())
160 - output_editor_information(elem, data_queue, bots=bots)
 175+ output_editor_information(elem, output, bots=bots, destination=destination)
161176 except SyntaxError, error:
162177 print error
163178 '''
@@ -176,26 +191,30 @@
177192 print file, error
178193 print raw_data[:12]
179194 print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
 195+ if destination == 'queue':
 196+ output.put('NEXT')
 197+ while True:
 198+ if output.qsize() < 100000:
 199+ break
 200+ else:
 201+ time.sleep(10)
 202+ print 'Still sleeping, queue is %s items long' % output.qsize()
180203
181 - data_queue.put('NEXT')
 204+ else:
 205+ output.close()
 206+
182207 if pbar:
183 - print file, xml_queue.qsize(), data_queue.qsize()
 208+ print file, xml_queue.qsize()
184209 #utils.update_progressbar(pbar, xml_queue)
 210+
185211 if debug:
186212 break
187 -
188 - while True:
189 - if data_queue.qsize() < 100000:
190 - break
191 - else:
192 - time.sleep(10)
193 - print 'Still sleeping, queue is %s items long' % data_queue.qsize()
194 -
 213+
195214 except Empty:
196215 break
197216
198 - #for x in xrange(4):
199 - data_queue.put(None)
 217+ if destination == 'queue':
 218+ data_queue.put(None)
200219
201220 if settings.DEBUG:
202221 utils.report_error_messages(messages, parse_editors)
@@ -263,9 +282,9 @@
264283 cache[c] = {}
265284 editor_cache.add('NEXT', '')
266285 cache = {}
267 -
268286
269287
 288+
270289 def load_bot_ids():
271290 '''
272291 Loader function to retrieve list of id's of known Wikipedia bots.
@@ -279,17 +298,20 @@
280299 return ids
281300
282301
283 -def run_parse_editors(dbname, language, location):
 302+def run_parse_editors(location, language, project):
284303 ids = load_bot_ids()
285304 kwargs = {'bots': ids,
286 - 'dbname': dbname,
 305+ 'dbname': language + project,
 306+ 'language': language,
 307+ 'project': project,
287308 'pbar': True,
288 - 'nr_input_processors': 2,
289 - 'nr_output_processors': 2,
290 - 'language': language,
 309+ 'destination': 'file',
 310+ 'nr_input_processors': settings.NUMBER_OF_PROCESSES,
 311+ 'nr_output_processors': settings.NUMBER_OF_PROCESSES,
291312 }
292313 chunks = {}
293 - files = utils.retrieve_file_list(location, 'xml')
 314+ source = os.path.join(location, language, project)
 315+ files = utils.retrieve_file_list(source, 'xml')
294316 parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
295317 a = 0
296318 for x in xrange(settings.NUMBER_OF_PROCESSES):
@@ -297,18 +319,18 @@
298320 chunks[x] = files[a:b]
299321 a = (x + 1) * parts
300322
301 - pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs)
302 - search_cache_for_missed_editors(dbname)
 323+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
 324+ #search_cache_for_missed_editors(dbname)
303325
304326
305327 def debug_parse_editors(dbname):
306328 q = JoinableQueue()
307 - parse_editors('en\\522.xml', q, None, None, True)
 329+ parse_editors('522.xml', q, None, None, debug=True, destination='file')
308330 store_editors(q, [], dbname)
309 - search_cache_for_missed_editors(dbname)
 331+ #search_cache_for_missed_editors(dbname)
310332
311333
312334 if __name__ == "__main__":
313 - #debug_parse_editors('test')
314 - run_parse_editors('test', 'en')
 335+ #debug_parse_editors('test2')
 336+ run_parse_editors(settings.XML_FILE_LOCATION, 'en', 'wiki')
315337 pass
Index: trunk/tools/editor_trends/settings.py
@@ -41,6 +41,7 @@
4242 IGNORE_DIRS = ['wikistats', 'zips']
4343 ROOT = '/' if OS != 'Windows' else 'c:\\'
4444
 45+MINIMUM_PYTHON_VERSION = 2.6
4546
4647 dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
4748 os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
Index: trunk/tools/editor_trends/utils/utils.py
@@ -132,6 +132,11 @@
133133
134134 # read / write data related functions
135135 def read_data_from_csv(filename, encoding):
 136+ '''
 137+ @filename is the path (either absolute or relative) including the name of
 138+ of the file
 139+ @encoding is usually utf-8
 140+ '''
136141 if hasattr(filename, '__call__'):
137142 filename = construct_filename(filename)
138143
@@ -156,6 +161,10 @@
157162
158163
159164 def determine_file_mode(extension):
 165+ '''
 166+ Checks if a given extension is an ASCII extension or not. The settings file
 167+ provides known ASCII extensions.
 168+ '''
160169 if extension in settings.ASCII:
161170 return 'w'
162171 else:
@@ -163,15 +172,30 @@
164173
165174
166175 def write_list_to_csv(data, fh, recursive=False):
 176+ '''
 177+ @data is a list which can contain other lists that will be written as a
 178+ single line to a textfile
 179+ @fh is a handle to an open text
 180+
 181+ The calling function is responsible for:
 182+ 1) writing a newline
 183+ 2) closing the filehandle
 184+ '''
 185+ tab = False
167186 if recursive:
168187 recursive = False
169 - for d in data:
 188+ for x, d in enumerate(data):
 189+ if tab:
 190+ fh.write('\t')
170191 if type(d) == type([]):
171192 recursive = write_list_to_csv(d, fh, True)
172193 else:
173 - fh.write('%s\t' % d)
 194+ fh.write('%s' % d)
 195+ tab = True
174196 if recursive:
 197+ tab = False
175198 return True
 199+ fh.write('\n')
176200
177201
178202 def write_dict_to_csv(data, fh):
@@ -267,31 +291,37 @@
268292
269293
270294 def create_dict_from_csv_file(filename, encoding):
 295+ '''
 296+ Constructs a dictionary from a txtfile
 297+ '''
271298 d = {}
272299 for line in read_data_from_csv(filename, encoding):
273300 line = clean_string(line)
274301 value, key = line.split('\t')
275302 d[key] = value
276 -
277303 return d
278304
279305
280 -def retrieve_file_list(location, extension, mask=''):
 306+def retrieve_file_list(location, extension, mask=None):
281307 '''
282308 Retrieve a list of files from a specified location.
283309 @location: either an absolute or relative path
284310 @extension: only include files with extension (optional)
285 - @mask: only include files that start with mask (optional)
 311+ @mask: only include files that start with mask (optional), this is
 312+ interpreted as a regular expression.
286313
287314 @return: a list of files matching the criteria
288315 '''
 316+ if mask:
 317+ mask = re.compile(mask)
 318+ else:
 319+ mask = re.compile('[\w\d*]')
289320 all_files = os.listdir(location)
290 - if not extension.startswith('.'):
291 - extension = '.' + extension
292321 files = []
293322 for file in all_files:
294 - if file.startswith(mask) and file.endswith(extension):
295 - files.append(file)
 323+ file = file.split('.')
 324+ if re.match(mask, file[0]) and file[1].endswith(extension):
 325+ files.append('.'.join(file))
296326 return files
297327
298328
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -57,6 +57,7 @@
5858 nr_output_processors = kwargs.pop('nr_output_processors')
5959 input_queues = {}
6060 result_queues = {}
 61+
6162 #assert len(obj) == nr_input_processors
6263 #if result_queue:
6364 # assert len(obj)== nr_output_processors
Index: trunk/tools/editor_trends/utils/sort.py
@@ -0,0 +1,119 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+
 5+'''
 6+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 7+This program is free software; you can redistribute it and/or
 8+modify it under the terms of the GNU General Public License version 2
 9+as published by the Free Software Foundation.
 10+This program is distributed in the hope that it will be useful,
 11+but WITHOUT ANY WARRANTY; without even the implied warranty of
 12+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 13+See the GNU General Public License for more details, at
 14+http://www.fsf.org/licenses/gpl.html
 15+'''
 16+
 17+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 18+__author__email = 'dvanliere at gmail dot com'
 19+__date__ = '2010-11-07'
 20+__version__ = '0.1'
 21+
 22+'''
 23+This module provides a small number of sorting algorithms including mergesort,
 24+external mergesort and quicksort. By presorting the data, considerable
 25+efficiency gains can be realized when inserting the data in MongoDB.
 26+'''
 27+
 28+import heapq
 29+
 30+import settings
 31+import utils
 32+
def quick_sort(obs):
    """Return a new, ascending-sorted list of the items in *obs*.

    Classic recursive quicksort: the first element is the pivot, and the
    remainder is partitioned into items below it and items at-or-above it.
    """
    if obs == []:
        return []
    pivot, rest = obs[0], obs[1:]
    below = [item for item in rest if item < pivot]
    at_or_above = [item for item in rest if item >= pivot]
    return quick_sort(below) + [pivot] + quick_sort(at_or_above)
 41+
def _merge_ordered(front, back):
    """Merge two sorted lists into a new sorted list in O(len(front) + len(back)).

    Index-based replacement for the module's ``merge`` helper, whose
    ``list.pop(0)`` calls (each O(n)) made every merge pass quadratic.
    Stable: on ties the element from *front* is taken first.
    """
    result = []
    i = j = 0
    while i < len(front) and j < len(back):
        if front[i] <= back[j]:
            result.append(front[i])
            i += 1
        else:
            result.append(back[j])
            j += 1
    result.extend(front[i:])
    result.extend(back[j:])
    return result


def mergesort(n):
    """Recursively merge sort a list. Returns a new sorted list; *n* is unmodified.

    Fixes vs. the original:
    - the midpoint uses floor division (``//``); ``len(n) / 2`` yields a
      float under Python 3, which cannot be used as a slice index;
    - merging is delegated to an O(n) index-based helper instead of the
      quadratic ``pop(0)``-based merge.
    """
    if len(n) <= 1:
        return list(n)
    mid = len(n) // 2
    return _merge_ordered(mergesort(n[:mid]), mergesort(n[mid:]))
 53+
 54+
def merge(front, back):
    """Merge two sorted lists together. Returns the merged list.

    Two-pointer, index-based merge running in O(len(front) + len(back)).
    The original popped from the head of a list — ``list.pop(0)`` is
    O(n) — giving quadratic running time overall, and it emptied its
    input lists as a side effect; this version leaves both inputs
    intact. Stable: on ties the element from *front* comes first.
    """
    result = []
    i = j = 0
    while i < len(front) and j < len(back):
        if front[i] <= back[j]:
            result.append(front[i])
            i += 1
        else:
            result.append(back[j])
            j += 1
    # append whichever tail remains
    result.extend(front[i:])
    result.extend(back[j:])
    return result
 65+
 66+
def readline(file):
    """Yield a list of tab-separated fields for each non-blank line of *file*.

    Bug fix: the original compared the raw line against '' to skip
    blanks, but lines obtained by iterating a file keep their trailing
    newline, so a blank line arrived as '\n', slipped past the guard,
    and was yielded as ['']. Strip the newline first, then skip lines
    that are genuinely empty.
    """
    for line in file:
        line = line.rstrip('\n')
        if line == '':
            continue
        yield line.split('\t')
 75+
 76+
def merge_sorted_files(output, files):
    """k-way merge pre-sorted files into <output>/merged.txt; return line count.

    @output is the directory for the merged file
    @files is a sequence of open file handles, each already sorted

    Bug fix: readline() yields *lists* of fields and file.write() only
    accepts strings, so the original raised TypeError on the very first
    line. Re-join the fields with tabs and restore the newline that
    readline() stripped.
    """
    output = utils.create_txt_filehandle(output, 'merged.txt', 'w', settings.ENCODING)
    lines = 0
    # heapq.merge lazily merges the already-sorted streams in O(total) time
    for line in heapq.merge(*[readline(file) for file in files]):
        output.write('\t'.join(line) + '\n')
        lines += 1
    output.close()
    return lines
 85+
 86+
def write_sorted_file(sorted_data, file, output):
    """Write *sorted_data* to directory *output* as '<stem>_sorted.<ext>'.

    The target filename is derived from *file* by appending '_sorted'
    to the part before the first dot.
    """
    stem, sep, rest = file.partition('.')
    target = stem + '_sorted' + sep + rest
    fh = utils.create_txt_filehandle(output, target, 'w', settings.ENCODING)
    utils.write_list_to_csv(sorted_data, fh)
    fh.close()
 94+
 95+
 96+def debug_merge_sorted_files(input, output):
 97+ files = utils.retrieve_file_list(input, 'txt', mask='')
 98+ filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in files]
 99+ lines = merge_sorted_files(output, filehandles)
 100+ filehandles = [fh.close() for fh in filehandles]
 101+ print lines
 102+
 103+
def debug_mergesort(input, output):
    """Debug driver: mergesort each unsorted .txt file in *input*.

    Files whose names already carry the '_sorted' marker are excluded
    by the mask; each remaining file is read, split into tab-separated
    rows, sorted, and written out via write_sorted_file().
    """
    for file in utils.retrieve_file_list(input, 'txt', mask='((?!_sorted)\d)'):
        fh = utils.create_txt_filehandle(input, file, 'r', settings.ENCODING)
        data = fh.readlines()
        fh.close()
        rows = [line.replace('\n', '') for line in data]
        rows = [line.split('\t') for line in rows]
        write_sorted_file(mergesort(rows), file, output)
 114+
 115+
if __name__ == '__main__':
    # Bug fix: sort.py imports only heapq, settings and utils, so the
    # os.path.join calls below raised NameError; import os locally.
    import os
    input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki')
    output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
    debug_mergesort(input, output)
    #debug_merge_sorted_files(input, output)
Property changes on: trunk/tools/editor_trends/utils/sort.py
___________________________________________________________________
Added: svn:eol-style
1121 + native
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -126,11 +126,11 @@
127127 pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
128128
129129
130 -def debug_retrieve_edits_by_contributor_launcher():
 130+def debug_retrieve_edits_by_contributor_launcher(dbname):
131131 kwargs = {'debug': False,
132 - 'dbname': 'enwiki',
 132+ 'dbname': dbname,
133133 }
134 - ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 134+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
135135 input_queue = pc.load_queue(ids)
136136 q = Queue()
137137 generate_editor_dataset(input_queue, q, False, kwargs)
@@ -159,7 +159,6 @@
160160 def generate_editor_dataset_debug(dbname):
161161 ids = retrieve_editor_ids_mongo(dbname, 'editors')
162162 input_queue = pc.load_queue(ids)
163 - #write_dataset(input_queue, [], 'enwiki')
164163 kwargs = {'nr_input_processors': 1,
165164 'nr_output_processors': 1,
166165 'debug': True,
Index: trunk/tools/editor_trends/database/cache.py
@@ -86,25 +86,10 @@
8787
8888 if self.editors[key]['obs'] == self.treshold:
8989 self.treshold_editors.add(key)
90 -# self.update(key, self.editors[key]['edits'])
91 -# del self.editors[key]
92 -# self.n -= 10
93 -# self.number_editors -= 1
9490
9591 def update(self, editor, values):
96 - #t = datetime.datetime.now()
9792 self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
98 - #print 'It took %s to store editor %s;and the cache contains %s editors and %s items' % (datetime.datetime.now() - t, editor, self.number_editors, self.n)
9993
100 - def quick_sort(self, obs):
101 - if obs == []:
102 - return []
103 - else:
104 - pivot = obs[0]
105 - lesser = self.quick_sort([x for x in obs[1:] if x < pivot])
106 - greater = self.quick_sort([x for x in obs[1:] if x >= pivot])
107 - return lesser + [pivot] + greater
108 -
10994 def store(self):
11095 utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__())
11196

Status & tagging log