Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -19,7 +19,6 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import sys |
23 | | -import subprocess |
24 | 23 | import datetime |
25 | 24 | from argparse import ArgumentParser |
26 | 25 | from argparse import RawTextHelpFormatter |
— | — | @@ -44,15 +43,15 @@ |
45 | 44 | class Timer(object): |
46 | 45 | def __init__(self): |
47 | 46 | self.t0 = datetime.datetime.now() |
48 | | - |
| 47 | + |
49 | 48 | def stop(self): |
50 | 49 | self.t1 = datetime.datetime.now() |
51 | | - |
| 50 | + |
52 | 51 | def elapsed(self): |
53 | 52 | self.stop() |
54 | 53 | print 'Processing time: %s' % (self.t1 - self.t0) |
55 | | - |
56 | 54 | |
| 55 | + |
57 | 56 | def get_value(args, key): |
58 | 57 | return getattr(args, key, None) |
59 | 58 | |
— | — | @@ -69,12 +68,12 @@ |
70 | 69 | def retrieve_projectname(args): |
71 | 70 | language_code = retrieve_language(args) |
72 | 71 | if language_code == None: |
73 | | - print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language') |
| 72 | + print 'Entered language: %s is not a valid Wikimedia language' % get_value(args, 'language') |
74 | 73 | sys.exit(-1) |
75 | 74 | project = retrieve_project(args) |
76 | 75 | |
77 | 76 | if project == None: |
78 | | - print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project') |
| 77 | + print 'Entered project: %s is not a valid Wikimedia Foundation project.' % get_value(args, 'project') |
79 | 78 | sys.exit(-1) |
80 | 79 | if project == 'commonswiki': |
81 | 80 | return project |
— | — | @@ -91,7 +90,7 @@ |
92 | 91 | def retrieve_project(args): |
93 | 92 | project = get_value(args, 'project') |
94 | 93 | if project != 'wiki': |
95 | | - project = settings.WIKIMEDIA_PROJECTS.get(project, None) |
| 94 | + project = settings.projects.get(project, None) |
96 | 95 | return project |
97 | 96 | |
98 | 97 | |
— | — | @@ -107,9 +106,15 @@ |
108 | 107 | locations['language_code'] = language_code |
109 | 108 | locations['language'] = get_value(args, 'language') |
110 | 109 | locations['location'] = os.path.join(location, language_code, project) |
| 110 | + locations['chunks'] = os.path.join(locations['location'], 'chunks') |
| 111 | + locations['txt'] = os.path.join(locations['location'], 'txt') |
| 112 | + locations['sorted'] = os.path.join(locations['location'], 'sorted') |
| 113 | + locations['dbready'] = os.path.join(locations['location'], 'dbready') |
111 | 114 | locations['project'] = project |
112 | 115 | locations['full_project'] = retrieve_projectname(args) |
113 | 116 | locations['filename'] = generate_wikidump_filename(project, args) |
| 117 | + locations['collection'] = get_value(args, 'collection') |
| 118 | + locations['directories'] = [locations['chunks'], locations['location'], locations['txt'], locations['sorted'], locations['dbready']] |
114 | 119 | return locations |
115 | 120 | |
116 | 121 | |
— | — | @@ -119,7 +124,7 @@ |
120 | 125 | language = kwargs.pop('language') |
121 | 126 | location = kwargs.pop('location') |
122 | 127 | project = project.title() |
123 | | - language_map = utils.invert_dict(languages.MAPPING) |
| 128 | + language_map = languages.language_map() |
124 | 129 | print 'Project: %s' % (project) |
125 | 130 | print 'Language: %s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding)) |
126 | 131 | print 'Input directory: %s' % location |
— | — | @@ -163,29 +168,15 @@ |
164 | 169 | sys.exit(retcode) |
165 | 170 | chunker.split_file(location, file, project, language_code) |
166 | 171 | timer.elapsed() |
167 | | - #settings.set_custom_settings(xml_namespace='http://www.mediawiki.org/xml/export-0.3/') |
168 | 172 | |
169 | 173 | |
170 | 174 | def launch_zip_extractor(args, location, file): |
171 | 175 | timer = Timer() |
172 | | - path = settings.detect_installed_program('7zip') |
173 | | - source = os.path.join(location, file) |
174 | | - p = None |
175 | | - |
176 | | - if settings.platform == 'Windows': |
177 | | - p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
178 | | - elif settings.platform == 'Linux': |
179 | | - raise NotImplementedError |
180 | | - elif settings.platform == 'OSX': |
181 | | - raise NotImplementedError |
182 | | - else: |
183 | | - raise exceptions.PlatformNotSupportedError |
| 176 | + utils.zip_extract(location, file, compression='7z') |
184 | 177 | timer.elapsed() |
185 | | - return p |
186 | 178 | |
187 | 179 | |
188 | 180 | def extract_launcher(args, **kwargs): |
189 | | - print 'mongodb_script_launcher' |
190 | 181 | timer = Timer() |
191 | 182 | location = kwargs.pop('location') |
192 | 183 | language_code = kwargs.pop('language_code') |
— | — | @@ -199,13 +190,23 @@ |
200 | 191 | location = kwargs.pop('location') |
201 | 192 | input = os.path.join(location, 'txt') |
202 | 193 | output = os.path.join(location, 'sorted') |
| 194 | + final_output = os.path.join(location, 'dbready') |
203 | 195 | dbname = kwargs.pop('full_project') |
204 | 196 | loader.mergesort_launcher(input, output) |
205 | | - filename = loader.mergesort_external_launcher(dbname, output, output) |
206 | | - loader.store_editors(output, filename, dbname, 'editors') |
| 197 | + loader.mergesort_external_launcher(dbname, output, output, final_output) |
207 | 198 | timer.elapsed() |
208 | 199 | |
209 | 200 | |
| 201 | +def store_launcher(args, **kwargs): |
| 202 | + timer = Timer() |
| 203 | + location = kwargs.pop('location') |
| 204 | + input = os.path.join(location, 'dbready') |
| 205 | + dbname = kwargs.pop('full_project') |
| 206 | + collection = kwargs.pop('collection') |
| 207 | + loader.store_editors(input, dbname, collection) |
| 208 | + timer.elapsed() |
| 209 | + |
| 210 | + |
210 | 211 | def transformer_launcher(args, **kwargs): |
211 | 212 | print 'dataset launcher' |
212 | 213 | timer = Timer() |
— | — | @@ -289,6 +290,8 @@ |
290 | 291 | |
291 | 292 | parser_config = subparsers.add_parser('config', help='The config sub command allows you to set the location where files are stored.') |
292 | 293 | parser_config.set_defaults(func=config_launcher) |
| 294 | + parser_config.add_argument('-f', '--force', action='store_true', |
| 295 | + help='Reconfigure Editor Toolkit (this will replace wiki.cfg).') |
293 | 296 | |
294 | 297 | parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.') |
295 | 298 | parser_download.set_defaults(func=dump_downloader_launcher) |
— | — | @@ -296,11 +299,17 @@ |
297 | 300 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file into smaller chunks to parallelize information extraction.') |
298 | 301 | parser_split.set_defaults(func=chunker_launcher) |
299 | 302 | |
| 303 | + parser_create = subparsers.add_parser('extract', help='The extract sub command parses the XML chunk files and extracts the editor information.') |
| 304 | + parser_create.set_defaults(func=extract_launcher) |
| 305 | + |
300 | 306 | parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant processing time reductions are achieved.') |
301 | 307 | parser_sort.set_defaults(func=sort_launcher) |
302 | 308 | |
303 | | - parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') |
304 | | - parser_create.set_defaults(func=extract_launcher) |
| 309 | + parser_store = subparsers.add_parser('store', help='The store sub command reads the sorted editor data and stores it in a MongoDB collection.') |
| 310 | + parser_store.set_defaults(func=store_launcher) |
| 311 | + parser_store.add_argument('-c', '--collection', action='store', |
| 312 | + help='Name of MongoDB collection', |
| 313 | + default='editors') |
305 | 314 | |
306 | 315 | parser_transform = subparsers.add_parser('transform', help='Transform the raw database to an enriched dataset that can be exported.') |
307 | 316 | parser_transform.set_defaults(func=transformer_launcher) |
— | — | @@ -337,10 +346,9 @@ |
338 | 347 | detect_python_version() |
339 | 348 | about() |
340 | 349 | args = parser.parse_args() |
341 | | - if not os.path.exists('wiki.cfg'): |
342 | | - config.create_configuration(settings, args) |
| 350 | + config.create_configuration(settings, args) |
343 | 351 | locations = determine_file_locations(args) |
344 | | - settings.verify_environment([locations['location']]) |
| 352 | + settings.verify_environment(locations['directories']) |
345 | 353 | show_settings(args, **locations) |
346 | 354 | #locations['settings'] = settings |
347 | 355 | args.func(args, **locations) |
— | — | @@ -348,5 +356,4 @@ |
349 | 357 | |
350 | 358 | |
351 | 359 | if __name__ == '__main__': |
352 | | - #args = ['download', '-l', 'Russian'] |
353 | 360 | main() |
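The net effect of the manage.py changes is a per-stage directory layout: determine_file_locations now registers chunks, txt, sorted and dbready directories plus the MongoDB collection name, and verify_environment creates all of them up front. A minimal sketch of the resulting dict, with an assumed base path that is not part of the patch:

    import os

    location = os.path.join('data', 'en', 'wiki')        # hypothetical base path
    locations = {
        'location': location,
        'chunks': os.path.join(location, 'chunks'),      # split XML chunks
        'txt': os.path.join(location, 'txt'),            # extracted editor data
        'sorted': os.path.join(location, 'sorted'),      # first mergesort pass
        'dbready': os.path.join(location, 'dbready'),    # final merged files
        'collection': 'editors',                         # default of the new -c flag
    }
    locations['directories'] = [locations['chunks'], locations['location'],
                                locations['txt'], locations['sorted'],
                                locations['dbready']]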
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -28,8 +28,10 @@ |
29 | 29 | |
30 | 30 | def extract_text(elem, kwargs): |
31 | 31 | if elem != None and elem.text != None: |
32 | | - return elem.text.decode(settings.encoding) |
33 | | - return None |
| 32 | + return elem.text |
| 33 | + return None |
34 | 36 | |
35 | 37 | |
36 | 38 | def retrieve_xml_node(xml_nodes, name): |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -63,11 +63,12 @@ |
64 | 64 | new_xmlfile() |
65 | 65 | |
66 | 66 | class XMLFile(object): |
67 | | - def __init__(self, input, output, file, bots, **kwargs): |
| 67 | + def __init__(self, input, output, file, bots, target, **kwargs): |
68 | 68 | self.file = file |
69 | 69 | self.input = input |
70 | 70 | self.output = output |
71 | 71 | self.bots = bots |
| 72 | + self.target = target |
72 | 73 | for kw in kwargs: |
73 | 74 | setattr(self, kw, kwargs[kw]) |
74 | 75 | |
— | — | @@ -96,7 +97,7 @@ |
97 | 98 | raw_data = ''.join(raw_data) |
98 | 99 | xml_buffer.write(raw_data) |
99 | 100 | elem = cElementTree.XML(xml_buffer.getvalue()) |
100 | | - output_editor_information(elem, self.fh, bots=self.bots, destination=self.destination) |
| 101 | + self.target(elem, self.fh, bots=self.bots, destination=self.destination) |
101 | 102 | except SyntaxError, error: |
102 | 103 | print error |
103 | 104 | ''' |
— | — | @@ -160,6 +161,7 @@ |
161 | 162 | else: |
162 | 163 | return None |
163 | 164 | |
| 165 | + |
164 | 166 | def extract_contributor_id(contributor, kwargs): |
165 | 167 | ''' |
166 | 168 | @contributor is the xml contributor node containing a number of attributes |
— | — | @@ -339,20 +341,17 @@ |
340 | 342 | tasks = multiprocessing.JoinableQueue() |
341 | 343 | consumers = [XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
342 | 344 | for file in files: |
343 | | - tasks.put(XMLFile(input, output, file, bots, **kwargs)) |
| 345 | + tasks.put(XMLFile(input, output, file, bots, output_editor_information, **kwargs)) |
| 346 | + print 'The queue contains %s files.' % tasks.qsize() |
344 | 347 | for x in xrange(settings.number_of_processes): |
345 | 348 | tasks.put(None) |
346 | 349 | |
347 | | - print tasks.qsize() |
348 | 350 | for w in consumers: |
349 | 351 | w.start() |
350 | 352 | |
351 | 353 | tasks.join() |
352 | 354 | |
353 | | - #chunks = utils.split_list(files, settings.number_of_processes) |
354 | | - #pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs) |
355 | 355 | |
356 | | - |
357 | 356 | def debug_parse_editors(dbname): |
358 | 357 | q = JoinableQueue() |
359 | 358 | parse_editors('522.xml', q, None, None, debug=True, destination='file') |
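Injecting target means XMLFile no longer hard-codes output_editor_information; any callable with the same signature can be queued. A hedged sketch of a drop-in replacement (the function below is illustrative, not part of the patch):

    def count_revisions(elem, fh, **kwargs):
        # same signature as output_editor_information
        print '%s revisions' % len(elem.findall('revision'))

    # queued exactly like the real target:
    # tasks.put(XMLFile(input, output, '0.xml', bots, count_revisions, **kwargs))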
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -38,8 +38,9 @@ |
39 | 39 | @dbname is the name of the MongoDB database in which to store the information. |
40 | 40 | ''' |
41 | 41 | dbname = kwargs.get('dbname', None) |
| 42 | + collection = kwargs.pop('collection') |
42 | 43 | mongo = db.init_mongo_db(dbname) |
43 | | - collection = mongo['editors'] |
| 44 | + collection = mongo[collection] |
44 | 45 | collection.ensure_index('editor') |
45 | 46 | editor_cache = cache.EditorCache(collection) |
46 | 47 | |
— | — | @@ -82,9 +83,9 @@ |
83 | 84 | return cache |
84 | 85 | |
85 | 86 | |
86 | | -def search_cache_for_missed_editors(dbname): |
| 87 | +def search_cache_for_missed_editors(dbname, collection): |
87 | 88 | mongo = db.init_mongo_db(dbname) |
88 | | - collection = mongo['editors'] |
| 89 | + collection = mongo[collection] |
89 | 90 | editor_cache = cache.EditorCache(collection) |
90 | 91 | cache = load_cache_objects() |
91 | 92 | for c in cache: |
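With the collection name parameterized, one database can hold several editor collections side by side. An illustrative call, assuming a running MongoDB instance and the enwiki database used elsewhere in the tree:

    from etl import store

    # 'editors' is the default collection created by the new store sub command
    store.search_cache_for_missed_editors('enwiki', 'editors')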
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -24,6 +24,7 @@ |
25 | 25 | import json |
26 | 26 | import os |
27 | 27 | |
| 28 | + |
28 | 29 | import progressbar |
29 | 30 | |
30 | 31 | |
— | — | @@ -79,14 +80,15 @@ |
80 | 81 | return ns |
81 | 82 | |
82 | 83 | |
83 | | -def build_namespaces_locale(namespaces): |
| 84 | +def build_namespaces_locale(namespaces, include=[0]): |
84 | 85 | ''' |
85 | | - Construct a list of all the non-main namespaces |
| 86 | + @include is a list of namespace keys that should not be ignored, the default |
| 87 | + setting is to ignore all namespaces except the main namespace. |
86 | 88 | ''' |
87 | 89 | ns = [] |
88 | 90 | for namespace in namespaces: |
89 | | - value = namespaces[namespace].get(u'*', None) |
90 | | - if value != None and value != '': |
| 91 | + if int(namespace) not in include: |
| 92 | + value = namespaces[namespace].get(u'*', None) |
91 | 93 | ns.append(value) |
92 | 94 | return ns |
93 | 95 | |
— | — | @@ -114,32 +116,39 @@ |
115 | 117 | |
116 | 118 | def write_xml_file(element, fh, output, counter): |
117 | 119 | '''Get file handle and write xml element to file''' |
118 | | - size = len(cElementTree.tostring(element)) |
119 | | - fh, counter = create_file_handle(fh, output, counter, size) |
| 120 | + xml_string = cElementTree.tostring(element) |
| 121 | + size = len(xml_string) |
| 122 | + fh, counter, new_file = create_file_handle(fh, output, counter, size) |
120 | 123 | try: |
121 | | - fh.write(cElementTree.tostring(element)) |
| 124 | + fh.write(xml_string) |
122 | 125 | except MemoryError: |
123 | 126 | print 'Add error capturing logic' |
124 | 127 | fh.write('\n') |
125 | | - return fh, counter |
| 128 | + return fh, counter, new_file |
126 | 129 | |
127 | 130 | |
128 | 131 | def create_file_handle(fh, output, counter, size): |
129 | | - '''Create file handle if none is supplied or if file size > max file size.''' |
130 | | - if not counter: |
| 132 | + ''' |
| 133 | + @fh is the file handle; if none is supplied, or if the current file |
| 134 | + exceeds the maximum file size, a new file handle is created |
| 135 | + @output is the location where to store the files |
| 136 | + @counter indicates which chunk it is |
| 137 | + @size is the length of the xml element about to be written to file. |
| 138 | + ''' |
| 139 | + if not fh: |
131 | 140 | counter = 0 |
132 | | - path = os.path.join(output, '%s.xml' % counter) |
133 | | - if not fh: |
| 141 | + path = os.path.join(output, '%s.xml' % counter) |
134 | 142 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
135 | | - return fh, counter |
136 | | - elif (fh.tell() + size) > settings.max_settings_xmlfile_size: |
137 | | - print 'Created chunk %s' % counter |
| 143 | + return fh, counter, False |
| 144 | + elif (fh.tell() + size) > settings.max_xmlfile_size: |
| 145 | + print 'Created chunk %s' % (counter + 1) |
138 | 146 | fh.close() |
139 | 147 | counter += 1 |
| 148 | + path = os.path.join(output, '%s.xml' % counter) |
140 | 149 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
141 | | - return fh, counter |
| 150 | + return fh, counter, True |
142 | 151 | else: |
143 | | - return fh, counter |
| 152 | + return fh, counter, False |
144 | 153 | |
145 | 154 | |
146 | 155 | def flatten_xml_elements(data, page): |
— | — | @@ -154,9 +163,9 @@ |
155 | 164 | else: |
156 | 165 | flat[x].append(xml.extract_text(elem, None)) |
157 | 166 | return flat |
158 | | - |
159 | 167 | |
160 | | -def split_file(location, file, project, language_code, language, format='xml'): |
| 168 | + |
| 169 | +def split_file(location, file, project, language_code, include=[0], format='xml', zip=False): |
161 | 170 | '''Reads xml file and splits it in N chunks''' |
162 | 171 | #location = os.path.join(settings.input_location, language) |
163 | 172 | input = os.path.join(location, file) |
— | — | @@ -167,12 +176,11 @@ |
168 | 177 | else: |
169 | 178 | f = input.replace('.xml', '') |
170 | 179 | fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding) |
171 | | - |
| 180 | + |
172 | 181 | ns = load_namespace(language_code) |
173 | | - ns = build_namespaces_locale(ns) |
| 182 | + ns = build_namespaces_locale(ns, include) |
174 | 183 | |
175 | | - settings.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/' |
176 | | - counter = None |
| 184 | + counter = 0 |
177 | 185 | tag = '{%s}page' % settings.xml_namespace |
178 | 186 | context = cElementTree.iterparse(input, events=('start', 'end')) |
179 | 187 | context = iter(context) |
— | — | @@ -186,7 +194,11 @@ |
187 | 195 | page = elem.find('id').text |
188 | 196 | elem = parse_comments(elem, remove_numeric_character_references) |
189 | 197 | if format == 'xml': |
190 | | - fh, counter = write_xml_file(elem, fh, output, counter) |
| 198 | + fh, counter, new_file = write_xml_file(elem, fh, output, counter) |
| 199 | + if zip and new_file: |
| 200 | + file = str(counter - 1) + '.xml' |
| 201 | + utils.zip_archive(output, file) |
| 202 | + utils.delete_file(output, file) |
191 | 203 | else: |
192 | 204 | data = [el.getchildren() for el in elem if el.tag == 'revision'] |
193 | 205 | data = flatten_xml_elements(data, page) |
— | — | @@ -196,9 +208,9 @@ |
197 | 209 | f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding) |
198 | 210 | f.write(cElementTree.tostring(elem)) |
199 | 211 | f.close() |
200 | | - finally: |
201 | | - fh.close() |
202 | 212 | |
| 213 | + fh.close() |
| 214 | + |
203 | 215 | if __name__ == "__main__": |
204 | 216 | kwargs = {'output': settings.input_location, |
205 | 217 | 'input': settings.input_filename, |
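The three-value return of create_file_handle is what enables the optional zip step: new_file is True exactly when a full chunk was closed and a fresh handle opened, so the caller can compress the finished chunk and delete the original. A condensed sketch of the rotation contract; pages and output are placeholders, not values from the patch:

    from etl import chunker
    from utils import utils

    output = 'chunks'   # hypothetical chunk directory
    pages = []          # placeholder for a stream of parsed <page> elements

    fh, counter = None, 0
    for elem in pages:
        fh, counter, new_file = chunker.write_xml_file(elem, fh, output, counter)
        if new_file:    # chunk counter-1 was just closed and can be compressed
            finished = '%s.xml' % (counter - 1)
            utils.zip_archive(output, finished)
            utils.delete_file(output, finished)
    if fh:
        fh.close()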
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -32,7 +32,8 @@ |
33 | 33 | |
34 | 34 | |
35 | 35 | |
36 | | -def store_editors(input, filename, dbname, collection): |
| 36 | +def store_editors(input, dbname, collection): |
| 37 | + filename = utils.retrieve_file_list(input, 'txt', mask=None)[0] |
37 | 38 | fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding) |
38 | 39 | mongo = db.init_mongo_db(dbname) |
39 | 40 | collection = mongo[collection] |
— | — | @@ -70,7 +71,7 @@ |
71 | 72 | utils.store_object(editors, settings.binary_location, 'editors') |
72 | 73 | |
73 | 74 | |
74 | | -def mergesort_external_launcher(dbname, input, output): |
| 75 | +def mergesort_external_launcher(dbname, input, intermediate_output, output): |
75 | 76 | files = utils.retrieve_file_list(input, 'txt', mask='') |
76 | 77 | x = 0 |
77 | 78 | maxval = 99999 |
— | — | @@ -79,11 +80,12 @@ |
80 | 81 | maxval = round(len(files) / x) |
81 | 82 | chunks = utils.split_list(files, int(x)) |
82 | 83 | '''1st iteration external mergesort''' |
| 84 | + if len(chunks) < 2: |
| 85 | + intermediate_output = output |
83 | 86 | for chunk in chunks: |
84 | 87 | filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]] |
85 | | - filename = sort.merge_sorted_files(output, filehandles, chunk) |
| 88 | + filename = sort.merge_sorted_files(intermediate_output, filehandles, chunk) |
86 | 89 | filehandles = [fh.close() for fh in filehandles] |
87 | | -# pass |
88 | 90 | '''2nd iteration external mergesort, if necessary''' |
89 | 91 | if len(chunks) > 1: |
90 | 92 | files = utils.retrieve_file_list(intermediate_output, 'txt', mask='[merged]') |
— | — | @@ -91,7 +93,7 @@ |
92 | 94 | filename = sort.merge_sorted_files(output, filehandles, 'final') |
93 | 95 | filehandles = [fh.close() for fh in filehandles] |
94 | 96 | filename = 'merged_final.txt' |
95 | | - return filename |
| 97 | + |
96 | 98 | |
97 | 99 | |
98 | 100 | def mergesort_feeder(task_queue, **kwargs): |
— | — | @@ -134,4 +136,6 @@ |
135 | 137 | output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted') |
136 | 138 | dbname = 'enwiki' |
137 | 139 | #mergesort_launcher(input, output) |
138 | | - mergesort_external_launcher(dbname, output, output) |
\ No newline at end of file |
| 140 | + final_output = os.path.join(settings.input_location, 'en', 'wiki', 'dbready') |
| 141 | + mergesort_external_launcher(dbname, output, output, final_output) |
| 142 | + store_editors(final_output, dbname, 'editors') |
\ No newline at end of file |
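Sorting is now a two-pass external mergesort: the first pass merges sorted chunk files into an intermediate directory, and only when more than one chunk exists does a second pass merge those into merged_final.txt in the output directory. An illustrative invocation of the four-argument signature, with assumed directories:

    import os
    from etl import loader

    base = os.path.join('data', 'en', 'wiki')   # hypothetical layout
    sorted_dir = os.path.join(base, 'sorted')
    dbready = os.path.join(base, 'dbready')

    # intermediate results stay in sorted_dir; the final file lands in dbready
    loader.mergesort_external_launcher('enwiki', sorted_dir, sorted_dir, dbready)
    loader.store_editors(dbready, 'enwiki', 'editors')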
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -22,29 +22,49 @@ |
23 | 23 | import ConfigParser |
24 | 24 | |
25 | 25 | from utils import utils |
| 26 | +import languages |
26 | 27 | |
27 | | - |
28 | 28 | def create_configuration(settings, args): |
29 | | - config = ConfigParser.RawConfigParser() |
| 29 | + force = getattr(args, 'force', False) |
| 30 | + if not os.path.exists('wiki.cfg') or force: |
| 31 | + config = ConfigParser.RawConfigParser() |
| 32 | + project = None |
| 33 | + language = None |
| 34 | + language_map = languages.language_map() |
| 35 | + working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
| 36 | + input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location) |
30 | 37 | |
31 | | - working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd()) |
32 | | - input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.input_location) |
33 | | - input_location = input_location if len(input_location) > 0 else settings.input_location |
34 | | - working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
35 | | - |
36 | | - config = ConfigParser.RawConfigParser() |
37 | | - config.add_section('file_locations') |
38 | | - config.set('file_locations', 'working_directory', working_directory) |
39 | | - config.set('file_locations', 'input_location', input_location) |
| 38 | + while project not in settings.projects.keys(): |
| 39 | + project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % settings.projects[args.project].capitalize()) |
| 40 | + project = project if len(project) > 0 else args.project |
| 41 | + if project not in settings.projects.keys(): |
| 42 | + print 'Valid choices for a project are: %s' % ','.join(settings.projects.keys()) |
40 | 43 | |
41 | | - fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
42 | | - config.write(fh) |
43 | | - fh.close() |
44 | | - |
45 | | - settings.working_directory = config.get('file_locations', 'working_directory') |
46 | | - settings.input_location = config.get('file_locations', 'input_location') |
47 | | - return settings |
| 44 | + while language not in languages.MAPPING: |
| 45 | + language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (settings.projects[project].capitalize(), language_map[args.language])) |
| 46 | + if len(language) == 0: |
| 47 | + language = language_map[args.language] |
| 48 | + language = language if language in languages.MAPPING else args.language |
48 | 49 | |
| 50 | + input_location = input_location if len(input_location) > 0 else settings.input_location |
| 51 | + working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
49 | 52 | |
| 53 | + config = ConfigParser.RawConfigParser() |
| 54 | + config.add_section('file_locations') |
| 55 | + config.set('file_locations', 'working_directory', working_directory) |
| 56 | + config.set('file_locations', 'input_location', input_location) |
| 57 | + config.add_section('wiki') |
| 58 | + config.set('wiki', 'project', project) |
| 59 | + config.set('wiki', 'language', language) |
| 60 | + |
| 61 | + fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
| 62 | + config.write(fh) |
| 63 | + fh.close() |
| 64 | + |
| 65 | + settings.working_directory = config.get('file_locations', 'working_directory') |
| 66 | + settings.input_location = config.get('file_locations', 'input_location') |
| 67 | + return settings |
| 68 | + |
| 69 | + |
50 | 70 | if __name__ == '__main__': |
51 | | - pass |
\ No newline at end of file |
| 71 | + pass |
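Because wiki.cfg now carries a wiki section next to file_locations, the chosen project and language survive between runs (load_configuration in configuration.py reads both sections). A short sketch of reading the file back, assuming it was written by create_configuration:

    import ConfigParser

    config = ConfigParser.RawConfigParser()
    config.read('wiki.cfg')
    print config.get('file_locations', 'input_location')
    print config.get('wiki', 'project'), config.get('wiki', 'language')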
Index: trunk/tools/editor_trends/languages.py |
— | — | @@ -24,6 +24,8 @@ |
25 | 25 | ''' |
26 | 26 | |
27 | 27 | from utils import ordered_dict as odict |
| 28 | +from utils import utils |
| 29 | + |
28 | 30 | MAPPING = odict.OrderedDict([ |
29 | 31 | (u'English','en'), |
30 | 32 | (u'German','de'), |
— | — | @@ -604,4 +606,7 @@ |
605 | 607 | (u'Muskogee','mus'), |
606 | 608 | (u'Kanuri','kr'), |
607 | 609 | (u'Otsiherero','hz'), |
608 | | -]) |
\ No newline at end of file |
| 610 | +]) |
| 611 | + |
| 612 | +def language_map(): |
| 613 | + return utils.invert_dict(MAPPING) |
\ No newline at end of file |
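language_map simply inverts MAPPING, so lookups work in both directions; for example, assuming the module is importable:

    import languages

    print languages.MAPPING[u'English']     # 'en'
    print languages.language_map()['en']    # u'English'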
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte |
52 | 52 | self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestampformat as generated by the MediaWiki dumps |
53 | 53 | |
54 | | - self.max_settings_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
| 54 | + self.max_xmlfile_size = 67108864 # 64 MB; see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for the rationale |
55 | 55 | self.number_of_processes = cpu_count() * process_multiplier |
56 | 56 | #Change this to match your computers configuration (RAM / CPU) |
57 | 57 | self.minimum_python_version = (2, 6) |
— | — | @@ -69,25 +69,27 @@ |
70 | 70 | self.file_locations = self.set_file_locations() |
71 | 71 | self.max_filehandles = self.determine_max_filehandles_open() |
72 | 72 | |
73 | | - self.windows_register = {'7zip': 'Software\\7-Zip', } |
| 73 | + self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
74 | 74 | self.load_configuration() |
75 | 75 | self.set_custom_settings(**kwargs) |
76 | | - self.projects = {'commons': 'commonswiki', |
77 | | - 'wikibooks': 'wikibooks', |
78 | | - 'wikinews': 'wikinews', |
79 | | - 'wikiquote': 'wikiquote', |
80 | | - 'wikisource': 'wikisource', |
81 | | - 'wikiversity': 'wikiversity', |
82 | | - 'wiktionary': 'wiktionary', |
83 | | - 'metawiki': 'metawiki', |
84 | | - 'wikispecies': 'specieswiki', |
85 | | - 'incubator': 'incubatorwiki', |
86 | | - 'foundation': 'foundationwiki', |
87 | | - 'mediawiki': 'mediawikiwiki', |
88 | | - 'outreach': 'outreachwiki', |
89 | | - 'strategic planning': 'strategywiki', |
90 | | - 'usability initiative': 'usabilitywiki', |
91 | | - 'multilingual wikisource': None |
| 76 | + self.path_ziptool = self.determine_path_ziptool() |
| 77 | + self.projects = {'wiki': 'wikipedia', |
| 78 | + 'commons': 'commonswiki', |
| 79 | + 'books': 'wikibooks', |
| 80 | + 'news': 'wikinews', |
| 81 | + 'quote': 'wikiquote', |
| 82 | + 'source': 'wikisource', |
| 83 | + 'versity': 'wikiversity', |
| 84 | + 'tionary': 'wiktionary', |
| 85 | + 'meta': 'metawiki', |
| 86 | + 'species': 'specieswiki', |
| 87 | + 'incubator': 'incubatorwiki', |
| 88 | + 'foundation': 'foundationwiki', |
| 89 | + 'mediawiki': 'mediawikiwiki', |
| 90 | + 'outreach': 'outreachwiki', |
| 91 | + 'strategic_planning': 'strategywiki', |
| 92 | + 'usability_initiative': 'usabilitywiki', |
| 93 | + 'multilingual_wikisource': None |
92 | 94 | } |
93 | 95 | |
94 | 96 | def set_custom_settings(self, **kwargs): |
— | — | @@ -100,6 +102,8 @@ |
101 | 103 | config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
102 | 104 | self.working_directory = config.get('file_locations', 'working_directory') |
103 | 105 | self.input_location = config.get('file_locations', 'input_location') |
| 106 | + self.default_project = config.get('wiki', 'project') |
| 107 | + self.default_language = config.get('wiki', 'language') |
104 | 108 | |
105 | 109 | def determine_working_directory(self): |
106 | 110 | cwd = os.getcwd() |
— | — | @@ -115,6 +119,10 @@ |
116 | 120 | else: |
117 | 121 | return os |
118 | 122 | |
| 123 | + def determine_path_ziptool(self): |
| 124 | + return self.detect_installed_program(self.determine_ziptool()) |
| 125 | + |
| 126 | + |
119 | 127 | def verify_environment(self, directories): |
120 | 128 | for dir in directories: |
121 | 129 | if not os.path.exists(dir): |
— | — | @@ -146,6 +154,7 @@ |
147 | 155 | return resource.getrlimit(resource.RLIMIT_NOFILE)[0] |
148 | 156 | else: |
149 | 157 | return 500 |
| 158 | + |
150 | 159 | def update_python_path(self): |
151 | 160 | IGNORE_DIRS = ['wikistats', 'zips'] |
152 | 161 | dirs = [name for name in os.listdir(self.working_directory) if |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -31,13 +31,14 @@ |
32 | 32 | import os |
33 | 33 | import ctypes |
34 | 34 | import time |
| 35 | +import subprocess |
| 36 | +import sys |
| 37 | +sys.path.append('..') |
35 | 38 | |
36 | 39 | import configuration |
37 | 40 | settings = configuration.Settings() |
38 | 41 | import exceptions |
39 | 42 | |
40 | | -settings = configuration.Settings() |
41 | | - |
42 | 43 | try: |
43 | 44 | import psyco |
44 | 45 | psyco.full() |
— | — | @@ -250,6 +251,11 @@ |
251 | 252 | return name |
252 | 253 | |
253 | 254 | |
| 255 | +def delete_file(location, filename): |
| 256 | + if check_file_exists(location, filename): |
| 257 | + os.remove(os.path.join(location, filename)) |
| 258 | + |
| 259 | + |
254 | 260 | def check_file_exists(location, filename): |
255 | 261 | if hasattr(filename, '__call__'): |
256 | 262 | filename = construct_filename(filename, '.bin') |
— | — | @@ -350,6 +356,41 @@ |
351 | 357 | return files |
352 | 358 | |
353 | 359 | |
| 360 | +def zip_archive(location, source, compression='7z'): |
| 361 | + ''' |
| 362 | + @location is the directory where the source file lives and where the |
| 363 | + compressed file is stored |
| 364 | + @source is the name of the file to compress; the path to the zip program |
| 365 | + is taken from settings.path_ziptool |
| 365 | + ''' |
| 366 | + output = source.rsplit('.', 1)[0] + '.7z' |
| 368 | + path = settings.path_ziptool |
| 369 | + if settings.platform == 'Windows': |
| 370 | + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'a', '-scsUTF-8', '-t%s' % compression, '%s\\%s' % (location,output), '%s\\%s' % (location,source)], shell=True).wait() |
| 371 | + elif settings.platform == 'Linux': |
| 372 | + raise NotImplementedError |
| 373 | + elif settings.platform == 'OSX': |
| 374 | + raise NotImplementedError |
| 375 | + else: |
| 376 | + raise exceptions.PlatformNotSupportedError |
| 377 | + |
| 378 | + |
| 379 | +def zip_extract(location, source, compression='7z'): |
| 380 | + ''' |
| 381 | + @location is the directory holding the zipfile; files are extracted there |
| 382 | + @source is the name of the zipfile; the path to the zip program is taken |
| 383 | + from settings.path_ziptool (@compression is accepted for symmetry with |
| 384 | + zip_archive, 7z detects the format on extraction) |
| 385 | + ''' |
| 386 | + path = settings.path_ziptool |
| 387 | + source = os.path.join(location, source) |
| 388 | + if settings.platform == 'Windows': |
| 389 | + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
| 387 | + elif settings.platform == 'Linux': |
| 388 | + raise NotImplementedError |
| 389 | + elif settings.platform == 'OSX': |
| 390 | + raise NotImplementedError |
| 391 | + else: |
| 392 | + raise exceptions.PlatformNotSupportedError |
| 393 | + |
| 394 | + |
354 | 395 | def merge_list(datalist): |
355 | 396 | merged = [] |
356 | 397 | for d in datalist: |
— | — | @@ -421,4 +462,8 @@ |
422 | 463 | |
423 | 464 | |
424 | 465 | if __name__ == '__main__': |
425 | | - debug() |
| 466 | + location = os.path.join(settings.input_location, 'en', 'wiki') |
| 467 | + source = 'enwiki-20100916-stub-meta-history.xml' |
| 468 | + zip_archive(location, source) |
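Together, zip_archive and zip_extract centralize the 7-Zip handling that launch_zip_extractor used to inline in manage.py, with the program path resolved once via settings.path_ziptool. A hedged usage sketch, Windows only for now per the NotImplementedError branches; the dump filename is illustrative:

    import os
    from utils import utils

    location = os.path.join(utils.settings.input_location, 'en', 'wiki')
    utils.zip_extract(location, 'enwiki-20100916-stub-meta-history.xml.7z')
    utils.zip_archive(location, 'enwiki-20100916-stub-meta-history.xml')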