r76201 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r76200 | r76201 | r76202 >
Date:	17:42, 6 November 2010
Author:	diederik
Status:	deferred
Tags:
Comment:	Various bugfixes
Modified paths:	/trunk/tools/editor_trends/config.py (modified) (history) /trunk/tools/editor_trends/manage.py (modified) (history) /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history) /trunk/tools/editor_trends/optimize_editors.py (modified) (history) /trunk/tools/editor_trends/split_xml_file.py (modified) (history) /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history) /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
—	—	@@ -22,8 +22,8 @@
23	23	import subprocess
24	24	from argparse import ArgumentParser
25	25	from argparse import RawTextHelpFormatter
	26	+import locale
26	27
27		-
28	28	import progressbar
29	29
30	30	import settings
—	—	@@ -43,6 +43,11 @@
44	44	config.load_configuration(args)
45	45
46	46
	47	+def determine_default_language():
	48	+ language_code = locale.getdefaultlocale()[0]
	49	+ return language_code.split('_')[0]
	50	+
	51	+
47	52	def retrieve_projectname(args):
48	53	language_code = retrieve_language(args)
49	54	if language_code == None:
—	—	@@ -53,13 +58,16 @@
54	59	if project == None:
55	60	print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project')
56	61	sys.exit(-1)
	62	+ if project == 'commonswiki':
	63	+ return project
	64	+ else:
	65	+ return '%s%s' % (language_code, project)
57	66
58		~~- return '%s%s' % (language_code, project)~~
59	67
60	68	def retrieve_language(args):
61	69	language = get_value(args, 'language')
62	70	language = language.title()
63		~~- return languages.MAPPING.get(language, None)~~
	71	+ return languages.MAPPING.get(language, 'en')
64	72
65	73
66	74	def retrieve_project(args):
—	—	@@ -75,13 +83,24 @@
76	84
77	85	def determine_file_locations(args):
78	86	locations = {}
	87	+ location = get_value(args, 'location') if get_value(args, 'location') != None else settings.XML_FILE_LOCATION
79	88	locations['language_code'] = retrieve_language(args)
80		~~- locations['location'] = os.path.join(get_value(args, 'location'), retrieve_language(args))~~
	89	+ locations['location'] = os.path.join(location, retrieve_language(args))
81	90	locations['project'] = retrieve_projectname(args)
82	91	locations['filename'] = generate_wikidump_filename(args)
83	92	return locations
84	93
85	94
	95	+def show_settings(args, location, filename, project, language_code):
	96	+ project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki')
	97	+ project = project.title()
	98	+ language_map = utils.invert_dict(languages.MAPPING)
	99	+ print 'Project: %s' % (project)
	100	+ print 'Language: %s' % language_map[language_code]
	101	+ print 'Input directory: %s' % location
	102	+ print 'Output directory: TODO'
	103	+
	104	+
86	105	def dump_downloader_launcher(args, location, filename, project, language_code):
87	106	print 'dump downloader'
88	107	pbar = get_value(args, 'progress')
—	—	@@ -113,8 +132,8 @@
114	133	path = config.detect_installed_program('7zip')
115	134
116	135	source = os.path.join(location, file)
117		~~- retcode = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)])~~
118		~~- return retcode~~
	136	+ p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)])
	137	+ return p
119	138
120	139
121	140	def mongodb_script_launcher(args, location, filename, project, language_code):
—	—	@@ -153,6 +172,7 @@
154	173
155	174
156	175	def main():
	176	+ default_language = determine_default_language()
157	177	file_choices = ('stub-meta-history.xml.gz',
158	178	'stub-meta-current.xml.gz',
159	179	'pages-meta-history.xml.7z',
—	—	@@ -188,7 +208,7 @@
189	209	parser.add_argument('-l', '--language', action='store',
190	210	help='Example of valid languages.',
191	211	choices=supported_languages(),
192		~~- default='Russian')~~
	212	+ default=default_language)
193	213
194	214	parser.add_argument('-p', '--project', action='store',
195	215	help='Specify the Wikimedia project that you would like to download',
—	—	@@ -210,6 +230,7 @@
211	231	args = parser.parse_args()
212	232	config.load_configuration(args)
213	233	locations = determine_file_locations(args)
	234	+ show_settings(args, **locations)
214	235	args.func(args, **locations)
215	236
216	237
Index: trunk/tools/editor_trends/optimize_editors.py
—	—	@@ -17,11 +17,15 @@
18	18	__date__ = '2010-11-02'
19	19	__version__ = '0.1'
20	20
	21	+from multiprocessing import Queue
	22	+from Queue import Empty
	23	+from operator import itemgetter
	24	+import datetime
21	25
22		-
23	26	import settings
24	27	from database import db
25	28	from utils import process_constructor as pc
	29	+import construct_datasets
26	30
27	31
28	32	def create_datacontainer(init_value=0):
—	—	@@ -37,7 +41,7 @@
38	42	data[str(x)] = init_value
39	43	return data
40	44
41		-
	45	+
42	46	def determine_edits_by_year(dates):
43	47	'''
44	48	This function counts the number of edits by year made by a particular editor.
—	—	@@ -87,7 +91,7 @@
88	92
89	93	output.insert({'editor': id, 'edits': edits,
90	94	'edits_by_year': edits_by_year,
91		~~- 'year_joined': year,~~
	95	+ 'year_joined': new_wikipedian,
92	96	'edit_count': edit_count,
93	97	'final_edit': final_edit,
94	98	'first_edit': first_edit,
—	—	@@ -101,20 +105,31 @@
102	106	kwargs = {'definition': 'traditional',
103	107	'pbar': True,
104	108	'dbname': 'enwiki',
105		~~- 'nr_input_processors': 2,~~
	109	+ 'nr_input_processors': 1,
106	110	'nr_output_processors': 0,
107	111	}
108		~~- pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)~~
	112	+ chunks = {}
	113	+ parts = int(round(float(len(ids)) / 1, 0))
	114	+ a = 0
	115	+ for x in xrange(settings.NUMBER_OF_PROCESSES):
	116	+ b = a + parts
	117	+ chunks[x] = ids[a:b]
	118	+ a = (x + 1) * parts
	119	+ if a >= len(ids):
	120	+ break
109	121
	122	+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
110	123
	124	+
111	125	def debug_optimize_editors(dbname):
112	126	ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
113	127	q = pc.load_queue(ids)
114	128	kwargs = {'definition': 'traditional',
115		~~- 'dbname': 'enwiki'~~
	129	+ 'dbname': dbname
116	130	}
117	131	optimize_editors(q, False, True, kwargs)
118	132
119	133
120	134	if __name__ == '__main__':
121		~~- run_optimize_editors('enwiki')~~
\ No newline at end of file
	135	+ debug_optimize_editors('test')
	136	+ #run_optimize_editors('test')
Index: trunk/tools/editor_trends/map_wiki_editors.py
—	—	@@ -244,6 +244,26 @@
245	245	print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
246	246
247	247
	248	+def load_cache_objects():
	249	+ cache = {}
	250	+ files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION, '.bin')
	251	+ for x, file in enumerate(files):
	252	+ cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, file)
	253	+ return cache
	254	+
	255	+
	256	+def search_cache_for_missed_editors(dbname):
	257	+ mongo = db.init_mongo_db(dbname)
	258	+ collection = mongo['editors']
	259	+ editor_cache = cache.EditorCache(collection)
	260	+ cache = load_cache_objects()
	261	+ for c in cache:
	262	+ for editor in cache[c]:
	263	+ editor_cache.add(editor, cache[c][editor])
	264	+ cache[c] = {}
	265	+ editor_cache.add('NEXT', '')
	266	+
	267	+
248	268	def load_bot_ids():
249	269	'''
250	270	Loader function to retrieve list of id's of known Wikipedia bots.
—	—	@@ -267,7 +287,6 @@
268	288	'language': language,
269	289	}
270	290	chunks = {}
271		~~- #file_location = os.path.join(settings.XML_FILE_LOCATION, language)~~
272	291	files = utils.retrieve_file_list(location, 'xml')
273	292	parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
274	293	a = 0
—	—	@@ -277,12 +296,14 @@
278	297	a = (x + 1) * parts
279	298
280	299	pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs)
	300	+ search_cache_for_missed_editors(dbname)
281	301
282	302
283	303	def debug_parse_editors(dbname):
284	304	q = JoinableQueue()
285	305	parse_editors('en\\522.xml', q, None, None, True)
286	306	store_editors(q, [], dbname)
	307	+ search_cache_for_missed_editors(dbname)
287	308
288	309
289	310	if __name__ == "__main__":
Index: trunk/tools/editor_trends/config.py
—	—	@@ -20,13 +20,14 @@
21	21
22	22	import os
23	23	import ConfigParser
24		-from _winreg import *
25	24
	25	+
26	26	import settings
27	27	from utils import utils
28	28
29	29
30	30	def detect_windows_program(program):
	31	+ from _winreg import *
31	32	entry = settings.WINDOWS_REGISTER[program]
32	33	try:
33	34	key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
Index: trunk/tools/editor_trends/utils/utils.py
—	—	@@ -32,6 +32,7 @@
33	33	import ctypes
34	34
35	35	import settings
	36	+import exceptions
36	37
37	38
38	39	try:
—	—	@@ -160,6 +161,7 @@
161	162	else:
162	163	return 'wb'
163	164
	165	+
164	166	def write_list_to_csv(data, fh, recursive=False):
165	167	if recursive:
166	168	recursive = False
—	—	@@ -170,6 +172,7 @@
171	173	fh.write('%s\t' % d)
172	174	if recursive:
173	175	return True
	176	+
174	177
175	178	def write_dict_to_csv(data, fh):
176	179	keys = data.keys()
—	—	@@ -225,7 +228,7 @@
226	229	if is_exe(exe_file):
227	230	return exe_file
228	231
229		~~- return None~~
	232	+ raise exceptions.FileNotFoundException(program)
230	233
231	234
232	235	def store_object(object, location, filename):
—	—	@@ -254,6 +257,15 @@
255	258	return string
256	259
257	260
	261	+def invert_dict(dictionary):
	262	+ '''
	263	+ @dictionary is a simple dictionary containing simple values, ie. no lists,
	264	+ or other dictionaries
	265	+ output: dictionary where key and value are swapped.
	266	+ '''
	267	+ return dict([[v,k] for k,v in dictionary.items()])
	268	+
	269	+
258	270	def create_dict_from_csv_file(filename, encoding):
259	271	d = {}
260	272	for line in read_data_from_csv(filename, encoding):
Index: trunk/tools/editor_trends/utils/process_constructor.py
—	—	@@ -57,14 +57,16 @@
58	58	nr_output_processors = kwargs.pop('nr_output_processors')
59	59	input_queues = {}
60	60	result_queues = {}
61		~~- assert len(obj) == nr_input_processors~~
62		~~- if result_queue:~~
63		~~- assert len(obj)== nr_output_processors~~
	61	+ #assert len(obj) == nr_input_processors
	62	+ #if result_queue:
	63	+ # assert len(obj)== nr_output_processors
64	64
65	65	for i, o in enumerate(obj):
66	66	input_queues[i] = load_input_queue(obj[o], poison_pill=True)
67	67	if result_queue:
68	68	result_queues[i] = JoinableQueue()
	69	+ else:
	70	+ result_queues[i] = False
69	71
70	72	if settings.PROGRESS_BAR:
71	73	size = sum([input_queues[q].qsize() for q in input_queues])
Index: trunk/tools/editor_trends/split_xml_file.py
—	—	@@ -172,7 +172,7 @@
173	173	#elem = parse_comments(elem, remove_ascii_control_characters)
174	174	#print cElementTree.tostring(elem)
175	175	except SyntaxError:
176		~~- fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING)~~
	176	+ fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING)
177	177	fh.write(cElementTree.tostring(elem))
178	178	fh.close()
179	179

Status & tagging log

10:07, 3 December 2010 Reedy (talk | contribs) changed the status of r76201 [removed: new added: deferred]