r77589 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r77588 | r77589 | r77590 >
Date:	17:12, 2 December 2010
Author:	diederik
Status:	deferred
Tags:
Comment:	* Handle more edge cases * Pipe line works
Modified paths:	/trunk/tools/editor_trends/config.py (modified) (history) /trunk/tools/editor_trends/configuration.py (modified) (history) /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
—	—	@@ -31,17 +31,20 @@
32	32	settings = configuration.Settings()
33	33
34	34	import languages
	35	+import config
35	36	from utils import utils
36	37	from utils import dump_downloader
37	38	from utils import compression
	39	+from utils import ordered_dict
	40	+from database import db
38	41	from etl import chunker
39	42	from etl import extract
40	43	from etl import loader
41	44	from etl import transformer
42	45	from etl import exporter
43		~~-import config~~
44	46
45	47
	48	+
46	49	class Timer(object):
47	50	def __init__(self):
48	51	self.t0 = datetime.datetime.now()
—	—	@@ -58,7 +61,7 @@
59	62	return getattr(args, key, None)
60	63
61	64
62		~~-def config_launcher(args, **kwargs):~~
	65	+def config_launcher(args, logger, **kwargs):
63	66	settings.load_configuration()
64	67
65	68
—	—	@@ -151,13 +154,15 @@
152	155	language = kwargs.pop('language')
153	156	language_code = kwargs.pop('language_code')
154	157	config = {}
155		~~- config['Project'] = settings.projects.get(kwargs.pop('project'), 'wiki').title()~~
156		~~- config['Language'] = '%s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding))~~
157		~~- config['Input directory'] = kwargs.get('location')~~
	158	+ config['Project'] = '\t\t%s' % settings.projects.get(kwargs.pop('project'), 'wiki').title()
	159	+ config['Language'] = '\t%s / %s' % (language_map[language_code], language) #.decode(settings.encoding)
	160	+ config['Input directory'] = '\t%s' % kwargs.get('location')
158	161	config['Output directory'] = '%s and subdirectories' % kwargs.get('location')
159	162
160	163	message = 'Final settings after parsing command line arguments:'
161	164	write_message_to_log(logger, args, message, None, **config)
	165	+ for c in config:
	166	+ print '%s\t%s' % (c, config[c])
162	167
163	168
164	169	def dump_downloader_launcher(args, logger, **kwargs):
—	—	@@ -177,7 +182,7 @@
178	183
179	184
180	185	def chunker_launcher(args, logger, **kwargs):
181		~~- print 'split_settings.input_filename_launcher'~~
	186	+ print 'chunker_launcher'
182	187	timer = Timer()
183	188	write_message_to_log(logger, args, **kwargs)
184	189	filename = kwargs.pop('filename')
—	—	@@ -191,7 +196,7 @@
192	197	file = filename.replace('.' + ext, '')
193	198	result = utils.check_file_exists(location, file)
194	199	if not result:
195		~~- retcode = launch_zip_extractor(args, location, filename)~~
	200	+ retcode = launch_zip_extractor(args, logger, location, filename)
196	201	else:
197	202	retcode = 0
198	203	if retcode != 0:
—	—	@@ -201,7 +206,7 @@
202	207	timer.elapsed()
203	208
204	209
205		~~-def launch_zip_extractor(args, location, file):~~
	210	+def launch_zip_extractor(args, logger, location, file):
206	211	timer = Timer()
207	212	write_message_to_log(logger, args, location=location, file=file)
208	213	compressor = compression.Compressor(location, file)
—	—	@@ -226,9 +231,8 @@
227	232	input = os.path.join(location, 'txt')
228	233	output = os.path.join(location, 'sorted')
229	234	final_output = os.path.join(location, 'dbready')
230		~~- dbname = kwargs.pop('full_project')~~
231	235	loader.mergesort_launcher(input, output)
232		~~- loader.mergesort_external_launcher(dbname, output, final_output)~~
	236	+ loader.mergesort_external_launcher(output, final_output)
233	237	timer.elapsed()
234	238
235	239
—	—	@@ -249,40 +253,48 @@
250	254	write_message_to_log(logger, args, **kwargs)
251	255	project = kwargs.pop('full_project')
252	256	collection = kwargs.pop('collection')
253		~~- transformer.run_optimize_editors(project, collection)~~
	257	+ transformer.transform_editors_single_launcher(project, collection)
254	258	timer.elapsed()
255	259
256	260
257	261	def exporter_launcher(args, logger, **kwargs):
258	262	timer = Timer()
259	263	write_message_to_log(logger, args, **kwargs)
260		~~- project = kwargs.pop('full_project')~~
261		~~- exporter.generate_editor_dataset_launcher(project)~~
	264	+ collection = get_value(args, 'collection')
	265	+ dbname = kwargs.pop('full_project')
	266	+ targets = get_value(args, 'datasets')
	267	+ targets = targets.split(',')
	268	+ for target in targets:
	269	+ exporter.dataset_launcher(dbname, collection, target)
262	270	timer.elapsed()
263	271
264	272
265	273	def all_launcher(args, logger, **kwargs):
266	274	print 'all_launcher'
267	275	timer = Timer()
268		~~- message = 'Starting '~~
	276	+ full_project = kwargs.get('full_project', None)
	277	+ message = 'Start of building %s dataset.' % full_project
	278	+ db.cleanup_database(full_project)
269	279	write_message_to_log(logger, args, message, **kwargs)
270	280	ignore = get_value(args, 'except')
271		~~- clean = get_value(args, 'clean')~~
	281	+ clean = get_value(args, 'new')
272	282	if clean:
273	283	dirs = kwargs.get('directories')[1:]
274	284	for dir in dirs:
275	285	write_message_to_log(logger, args, verb='Deleting', **kwargs)
276	286	utils.delete_file(dir, '')
277		~~- functions = {dump_downloader_launcher: 'download',~~
278		~~- chunker_launcher: 'split',~~
279		~~- extract_launcher: 'extract',~~
280		~~- sort_launcher: 'sort',~~
281		~~- transformer_launcher: 'transform',~~
282		~~- exporter_launcher: 'export'~~
283		~~- }~~
	287	+
	288	+ functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'),
	289	+ (chunker_launcher, 'split'),
	290	+ (extract_launcher, 'extract'),
	291	+ (sort_launcher, 'sort'),
	292	+ (store_launcher, 'store'),
	293	+ (transformer_launcher, 'transform'),
	294	+ (exporter_launcher, 'export')))
	295	+
284	296	for function, callname in functions.iteritems():
285	297	if callname not in ignore:
286		~~- function(args, **kwargs)~~
	298	+ function(args, logger, **kwargs)
287	299
288	300	timer.elapsed()
289	301
—	—	@@ -293,7 +305,7 @@
294	306	return tuple(choices)
295	307
296	308
297		~~-def show_languages(args, location, filename, project, full_project, language_code, language):~~
	309	+def show_languages(args, logger, **kwargs):
298	310	first = get_value(args, 'startswith')
299	311	if first != None:
300	312	first = first.title()
—	—	@@ -331,13 +343,20 @@
332	344	logger.setLevel(logging.DEBUG)
333	345
334	346	default_language = determine_default_language()
	347	+
	348	+ datasets = {'cohort': 'generate_cohort_dataset',
	349	+ 'long': 'generate_long_editor_dataset',
	350	+ 'wide': 'generate_wide_editor_dataset',
	351	+ }
	352	+
335	353	file_choices = ('stub-meta-history.xml.gz',
336	354	'stub-meta-current.xml.gz',
337	355	'pages-meta-history.xml.7z',
338	356	'pages-meta-current.xml.bz2')
339	357
	358	+
340	359	parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
341		~~- subparsers = parser.add_subparsers(help='sub-command help')~~
	360	+ subparsers = parser.add_subparsers(help='sub - command help')
342	361
343	362	parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.')
344	363	parser_languages.add_argument('-s', '--startswith',
—	—	@@ -365,15 +384,9 @@
366	385
367	386	parser_store = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
368	387	parser_store.set_defaults(func=store_launcher)
369		~~- parser_store.add_argument('-c', '--collection', action='store',~~
370		~~- help='Name of MongoDB collection',~~
371		~~- default='editors')~~
372	388
373	389	parser_transform = subparsers.add_parser('transform', help='Transform the raw datatable to an enriched dataset that can be exported.')
374	390	parser_transform.set_defaults(func=transformer_launcher)
375		~~- parser_transform.add_argument('-c', '--collection', action='store',~~
376		~~- help='Name of MongoDB collection',~~
377		~~- default='editors')~~
378	391
379	392	parser_dataset = subparsers.add_parser('export', help='Create a dataset from the MongoDB and write it to a csv file.')
380	393	parser_dataset.set_defaults(func=exporter_launcher)
—	—	@@ -383,11 +396,11 @@
384	397	parser_all.add_argument('-e', '--except', action='store',
385	398	help='Should be a list of functions that are to be ignored when executing \'all\'.',
386	399	default=[])
	400	+
387	401	parser_all.add_argument('-n', '--new', action='store_false',
388	402	help='This will delete all previous output and starts from scratch. Mostly useful for debugging purposes.',
389	403	default=False)
390	404
391		-
392	405	parser.add_argument('-l', '--language', action='store',
393	406	help='Example of valid languages.',
394	407	choices=supported_languages(),
—	—	@@ -398,8 +411,14 @@
399	412	choices=settings.projects.keys(),
400	413	default='wiki')
401	414
	415	+ parser.add_argument('-c', '--collection', action='store',
	416	+ help='Name of MongoDB collection',
	417	+ default='editors')
	418	+
	419	+
402	420	parser.add_argument('-o', '--location', action='store',
403	421	help='Indicate where you want to store the downloaded file.',
	422	+ default=settings.input_location
404	423	)
405	424
406	425	parser.add_argument('-n', '--namespace', action='store',
—	—	@@ -412,6 +431,16 @@
413	432	help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
414	433	default='stub-meta-history.xml.gz')
415	434
	435	+ parser.add_argument('-dv', '--dumpversion', action='store',
	436	+ choices=settings.dumpversions.keys(),
	437	+ help='Indicate the Wikidump version that you are parsing.',
	438	+ default=settings.dumpversions['0'])
	439	+
	440	+ parser.add_argument('-d', '--datasets', action='store',
	441	+ choices=datasets.keys(),
	442	+ help='Indicate what type of data should be exported.',
	443	+ default=datasets['cohort'])
	444	+
416	445	parser.add_argument('-prog', '--progress', action='store_true', default=True,
417	446	help='Indicate whether you want to have a progressbar.')
418	447
—	—	@@ -422,9 +451,7 @@
423	452	locations = determine_file_locations(args, logger)
424	453	settings.verify_environment(locations['directories'])
425	454	show_settings(args, logger, **locations)
426		~~- #locations['settings'] = settings~~
427	455	args.func(args, logger, **locations)
428		~~- t1 = datetime.datetime.now()~~
429	456
430	457
431	458	if __name__ == '__main__':
Index: trunk/tools/editor_trends/config.py
—	—	@@ -24,12 +24,22 @@
25	25	from utils import utils
26	26	import languages
27	27
	28	+def show_choices(settings, attr):
	29	+ choices = getattr(settings, attr).items()
	30	+ choices.sort()
	31	+ choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices]
	32	+ #print '\n'.join(choices)
	33	+ return choices
	34	+ #for choice in choices:
	35	+ # print '%s\t%s' % (choice[0], choice[1])
	36	+
28	37	def create_configuration(settings, args):
29	38	force = getattr(args, 'force', False)
30	39	if not os.path.exists('wiki.cfg') or force:
31	40	config = ConfigParser.RawConfigParser()
32	41	project = None
33	42	language = None
	43	+ dumpversion = None
34	44	language_map = languages.language_map()
35	45	working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd())
36	46	input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location)
—	—	@@ -46,6 +56,16 @@
47	57	language = language_map[args.language]
48	58	language = language if language in languages.MAPPING else args.language
49	59
	60	+ while dumpversion not in settings.dumpversions.keys():
	61	+ choices = '\n'.join(show_choices(settings, 'dumpversions'))
	62	+ dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0']))
	63	+ if len(dumpversion) == 0:
	64	+ dumpversion = settings.dumpversions['0']
	65	+
	66	+
	67	+ #dumpversion = dumpversion if dumpversion in settings.dumpversions.keys() else args.dumpversion
	68	+
	69	+ dumpversion = settings.dumpversions[dumpversion]
50	70	input_location = input_location if len(input_location) > 0 else settings.input_location
51	71	working_directory = working_directory if len(working_directory) > 0 else os.getcwd()
52	72
—	—	@@ -56,6 +76,7 @@
57	77	config.add_section('wiki')
58	78	config.set('wiki', 'project', project)
59	79	config.set('wiki', 'language', language)
	80	+ config.set('wiki', 'dumpversion', dumpversion)
60	81
61	82	fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
62	83	config.write(fh)
—	—	@@ -63,6 +84,7 @@
64	85
65	86	settings.working_directory = config.get('file_locations', 'working_directory')
66	87	settings.input_location = config.get('file_locations', 'input_location')
	88	+ settings.xml_namespace = config.get('wiki', 'dumpversion')
67	89	return settings
68	90
69	91
Index: trunk/tools/editor_trends/configuration.py
—	—	@@ -57,12 +57,11 @@
58	58	#Change this to match your computers configuration (RAM / CPU)
59	59	self.minimum_python_version = (2, 6)
60	60	self.wp_dump_location = 'http://download.wikimedia.org'
61		~~- self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'~~
	61	+ self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/'
62	62	self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
63	63	self.windows_register = {'7z.exe': 'Software\\7-Zip', }
64	64	#Extensions of ascii files, this is used to determine the filemode to use
65	65	self.platform = self.determine_platform()
66		~~- #self.compression = self.init_compression_tools()~~
67	66
68	67	self.architecture = platform.machine()
69	68	self.working_directory = self.determine_working_directory()
—	—	@@ -75,6 +74,9 @@
76	75
77	76	self.load_configuration()
78	77	self.set_custom_settings(**kwargs)
	78	+ self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/',
	79	+ '1': 'http://www.mediawiki.org/xml/export-0.3/',
	80	+ }
79	81	self.projects = {'wiki': 'wikipedia',
80	82	'commons': 'commonswiki',
81	83	'books': 'wikibooks',

Status & tagging log

10:07, 3 December 2010 Reedy (talk | contribs) changed the status of r77589 [removed: new added: deferred]