Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -104,7 +104,6 @@ |
105 | 105 | ''' |
106 | 106 | print 'Start downloading' |
107 | 107 | stopwatch = timer.Timer() |
108 | | - #project, language, jobtype, task, timer, event = 'start' |
109 | 108 | log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='start') |
110 | 109 | res = downloader.launcher(properties, settings, logger) |
111 | 110 | stopwatch.elapsed() |
— | — | @@ -153,7 +152,7 @@ |
154 | 153 | print 'Start storing data in MongoDB' |
155 | 154 | stopwatch = timer.Timer() |
156 | 155 | log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start') |
157 | | - db.cleanup_database(properties.project.name, logger) |
| 156 | + db.cleanup_database(properties.dbname, logger) |
158 | 157 | # write_message_to_log(logger, settings, |
159 | 158 | # message=None, |
160 | 159 | # verb='Storing', |
— | — | @@ -164,7 +163,8 @@ |
165 | 164 | # collection=properties.collection) |
166 | 165 | # for key in properties: |
167 | 166 | # print key, getattr(properties, key) |
168 | | - store.launcher(properties.sorted, properties.project.name, properties.collection) |
| 167 | + store.launcher(properties.sorted, properties.dbname, properties.collection) |
| 168 | + |
169 | 169 | stopwatch.elapsed() |
170 | 170 | log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish') |
171 | 171 | |
— | — | @@ -173,13 +173,13 @@ |
174 | 174 | print 'Start transforming dataset' |
175 | 175 | stopwatch = timer.Timer() |
176 | 176 | log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start') |
177 | | - db.cleanup_database(properties.project.name, logger, 'dataset') |
| 177 | + db.cleanup_database(properties.dbname, logger, 'dataset') |
178 | 178 | # write_message_to_log(logger, settings, |
179 | 179 | # message=None, |
180 | 180 | # verb='Transforming', |
181 | 181 | # project=properties.project, |
182 | 182 | # collection=properties.collection) |
183 | | - transformer.transform_editors_single_launcher(properties.project.name, |
| 183 | + transformer.transform_editors_single_launcher(properties.dbname, |
184 | 184 | properties.collection) |
185 | 185 | stopwatch.elapsed() |
186 | 186 | log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, |
— | — | @@ -200,7 +200,7 @@ |
201 | 201 | # dbname=properties.full_project, |
202 | 202 | # collection=properties.collection) |
203 | 203 | |
204 | | - analyzer.generate_chart_data(properties.project.name, |
| 204 | + analyzer.generate_chart_data(properties.dbname, |
205 | 205 | collection, |
206 | 206 | properties.language.code, |
207 | 207 | target, |
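
Each pipeline stage in manage.py wraps its work in the same instrumentation bracket: log a start event, run the stage against properties.dbname, report elapsed time, then log a finish event. A minimal sketch of that bracket in isolation; Timer and log_event here are stand-ins for timer.Timer and log.log_to_mongo, whose real signatures are not shown in this diff.

import time

class Timer(object):
    # stand-in for timer.Timer: records a start time on creation
    def __init__(self):
        self.t0 = time.time()

    def elapsed(self):
        print 'Processing time: %.1f seconds' % (time.time() - self.t0)

def log_event(stage, event):
    # stand-in for log.log_to_mongo(properties, 'dataset', stage, ...)
    print 'dataset/%s: %s' % (stage, event)

stopwatch = Timer()
log_event('store', 'start')
# ... the stage itself would run here, keyed on properties.dbname ...
stopwatch.elapsed()
log_event('store', 'finish')
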
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py |
— | — | @@ -17,9 +17,12 @@ |
18 | 18 | __date__ = '2011-01-31' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import datetime |
| 22 | +from dateutil.relativedelta import relativedelta |
| 23 | +from utils import data_converter |
21 | 24 | |
22 | 25 | def histogram_by_backward_cohort(var, editor, **kwargs): |
23 | | - break_down=kwargs.pop('break_down', False) |
| 26 | + break_down = kwargs.pop('break_down', False) |
24 | 27 | new_wikipedian = editor['new_wikipedian'] |
25 | 28 | n = editor['edit_count'] |
26 | 29 | |
— | — | @@ -36,6 +39,10 @@ |
37 | 40 | if w >= editor_dt: |
38 | 41 | datum = datetime.datetime(int(year), 12, 31) |
39 | 42 | freq = editor['edits_by_year'][year] |
 | 43 | + # debugging probe for one specific observation
 | 44 | + if (datum == datetime.datetime(2003, 12, 31)
 | 45 | + and w == 24 and freq == 1.0):
 | 46 | + print 'break'
40 | 47 | var.add(datum, {w:{freq:1}}) |
41 | 48 | break |
42 | | - return var |
\ No newline at end of file |
| 49 | + return var |
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -167,15 +167,16 @@ |
168 | 168 | |
169 | 169 | |
170 | 170 | if __name__ == '__main__': |
171 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
172 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year') |
173 | | - generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year') |
174 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year') |
175 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0) |
176 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide') |
177 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
178 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0) |
179 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
180 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0) |
| 171 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
| 172 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
| 173 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year') |
| 174 | + #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year') |
| 175 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year') |
| 176 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0) |
| 177 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide') |
| 178 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
| 179 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0) |
| 180 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
| 181 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0) |
181 | 182 | |
182 | 183 | #available_analyses() |
Index: trunk/tools/editor_trends/wikitree/parser.py |
— | — | @@ -61,7 +61,11 @@ |
62 | 62 | for ns in namespaces: |
63 | 63 | key = ns.get('key') |
64 | 64 | d[key] = extract_text(ns) |
65 | | - print ns.get('key'), ns.text |
 | 65 | + text = ns.text if ns.text is not None else ''
| 66 | + try: |
| 67 | + print key, text.encode(settings.encoding) |
| 68 | + except UnicodeEncodeError: |
| 69 | + print key |
66 | 70 | return d |
67 | 71 | |
68 | 72 | |
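
The change above avoids UnicodeEncodeError crashes when a namespace title cannot be represented in the console encoding. A sketch of the same pattern under Python 2, where 'utf-8' is a stand-in for settings.encoding and the namespace dict is hypothetical:

# -*- coding: utf-8 -*-
encoding = 'utf-8'  # stand-in for settings.encoding
namespaces = {0: u'', 4: u'Wikip\xe9dia'}
for key, text in namespaces.iteritems():
    text = text if text is not None else ''
    try:
        print key, text.encode(encoding)
    except UnicodeEncodeError:
        # fall back to the key alone when the title cannot be encoded
        print key
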
Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -39,7 +39,6 @@ |
40 | 40 | success = True |
41 | 41 | chunk = 1024 * 4 |
42 | 42 | |
43 | | - |
44 | 43 | while True: |
45 | 44 | filename = task_queue.get(block=False) |
46 | 45 | task_queue.task_done() |
— | — | @@ -58,15 +57,16 @@ |
59 | 58 | filename) |
60 | 59 | mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.settings.timestamp_server) |
61 | 60 | if file_utils.check_file_exists(properties.location, filename): |
62 | | - #This can be activated as soon as bug 21575 is fixed. |
63 | | - properties.force = True |
64 | 61 | mod_loc = file_utils.get_modified_date(properties.location, filename) |
65 | | - if mod_loc != mod_date and properties.force == False: |
 | 62 | + if mod_loc == mod_date and not properties.force:
66 | 63 | print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name) |
67 | | - break |
| 64 | + continue |
68 | 65 | |
69 | 66 | if filemode == 'w': |
70 | | - fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding) |
| 67 | + fh = file_utils.create_txt_filehandle(properties.location, |
| 68 | + filename, |
| 69 | + filemode, |
| 70 | + properties.settings.encoding) |
71 | 71 | else: |
72 | 72 | fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb') |
73 | 73 | |
— | — | @@ -92,27 +92,23 @@ |
93 | 93 | |
94 | 94 | except urllib2.URLError, error: |
95 | 95 | print 'Reason: %s' % error |
96 | | - success = False |
97 | 96 | except urllib2.HTTPError, error: |
98 | 97 | print 'Error: %s' % error |
99 | | - success = False |
100 | 98 | finally: |
101 | 99 | fh.close() |
102 | 100 | file_utils.set_modified_data(mod_date, properties.location, filename) |
103 | 101 | |
104 | | - return success |
105 | 102 | |
106 | 103 | |
107 | 104 | def launcher(properties, settings, logger): |
108 | 105 | print 'Creating list of files to be downloaded...' |
109 | | - result = True |
110 | 106 | tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location, |
111 | 107 | properties.dump_relative_path, |
112 | 108 | properties.dump_filename) |
113 | 109 | #print tasks.qsize() |
114 | 110 | #if tasks.qsize() < properties.settings.number_of_processes: |
115 | 111 | # properties.settings.number_of_processes = tasks.qsize() |
116 | | - if tasks.qsize() > 1: |
| 112 | + if tasks.qsize() > 2: |
117 | 113 | consumers = [multiprocessing.Process(target=download_wiki_file, |
118 | 114 | args=(tasks, properties)) |
119 | 115 | for i in xrange(properties.settings.number_of_processes)] |
— | — | @@ -124,8 +120,7 @@ |
125 | 121 | w.start() |
126 | 122 | |
127 | 123 | tasks.join() |
128 | | - for consumer in consumers: |
129 | | - if consumer.exitcode != 0: |
130 | | - result = False |
| 124 | +# for consumer in consumers: |
| 125 | +# if consumer.exitcode != 0: |
| 126 | +# result = False |
131 | 127 | |
132 | | - return result |
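
The rewritten check above skips a download only when the local and remote modification timestamps agree and no re-download was forced, and it now continues to the next queued file instead of breaking out of the loop. A small sketch of that predicate, with hypothetical datetime values:

import datetime

def should_skip_download(mod_loc, mod_date, force):
    # Skip only when the local and remote timestamps agree and no
    # re-download was forced; both are naive datetime objects
    # (see the file_utils changes below).
    return mod_loc == mod_date and not force

stamp = datetime.datetime(2010, 3, 15, 7, 7, 30)
print should_skip_download(stamp, stamp, False)  # True: dump is current
print should_skip_download(stamp, stamp, True)   # False: forced re-download
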
Index: trunk/tools/editor_trends/wikilytics/api/models.py |
— | — | @@ -69,7 +69,6 @@ |
70 | 70 | def __unicode__(self): |
71 | 71 | return u'%s%s' % (self.language_code, self.project) |
72 | 72 | |
73 | | - |
74 | 73 | @permalink |
75 | 74 | def get_absolute_url(self): |
76 | 75 | if self.jobtype != 'dataset': |
Index: trunk/tools/editor_trends/wikilytics/templates/datasets.html |
— | — | @@ -33,7 +33,7 @@ |
34 | 34 | {% for job in jobs %} |
35 | 35 | <ul> |
36 | 36 | <li>Project: {{ job.project }}</li> |
37 | | - <li>Language: {{ job.language }}</li> |
| 37 | + <li>Language: {{ job.language_code }}</li> |
38 | 38 | <li>Created: {{ job.created }}</li> |
39 | 39 | <li>Finished: {{ job.finished }}</li> |
40 | 40 | <li>In progress: {{ job.in_progress }}</li> |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -56,7 +56,7 @@ |
57 | 57 | self.settings.input_location != None else self.get_value('location') |
58 | 58 | self.project = self.update_project_settings() |
59 | 59 | self.language = self.update_language_settings() |
60 | | - |
| 60 | + self.dbname = '%s%s' % (self.language.code, self.project.name) |
61 | 61 | self.targets = self.split_keywords(self.get_value('charts')) |
62 | 62 | self.keywords = self.split_keywords(self.get_value('keywords')) |
63 | 63 | self.function = self.get_value('func') |
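
The new dbname attribute is the language code concatenated with the project name, and the other files in this changeset now use it in place of properties.project.name. For illustration (the values are hypothetical; the real ones come from the language and project containers):

language_code = 'en'   # illustrative
project_name = 'wiki'  # illustrative
dbname = '%s%s' % (language_code, project_name)
print dbname  # enwiki
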
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -25,6 +25,8 @@ |
26 | 26 | |
27 | 27 | import re |
28 | 28 | import htmlentitydefs |
| 29 | +import time |
| 30 | +import datetime |
29 | 31 | import cPickle |
30 | 32 | import codecs |
31 | 33 | import os |
— | — | @@ -157,7 +159,7 @@ |
158 | 160 | lock.release() |
159 | 161 | |
160 | 162 | def write_dict_to_csv(data, fh, keys, write_key=True, format='long'): |
161 | | - assert format == 'long' or format == 'wide', 'Format should either be long or wide.' |
| 163 | + assert format == 'long' or format == 'wide', 'Format should either be long or wide.' |
162 | 164 | |
163 | 165 | if format == 'long': |
164 | 166 | for key in keys: |
— | — | @@ -231,18 +233,22 @@ |
232 | 234 | |
233 | 235 | def set_modified_data(mod_rem, location, filename): |
234 | 236 | ''' |
235 | | - Mod_rem is the modified date of the remote file (the Wikimedia dump file) |
| 237 | + Mod_rem is the modified date of the remote file (the Wikimedia dump file), |
236 | 238 | Mon, 15 Mar 2010 07:07:30 GMT Example server timestamp |
237 | 239 | ''' |
| 240 | + assert isinstance(mod_rem, datetime.datetime), '''The mod_rem variable should |
 | 241 | + be an instance of datetime.datetime.'''
238 | 242 | path = os.path.join(location, filename) |
239 | | - print mod_rem |
240 | | - #smod_rem = text_utils.convert_timestamp_to_datetime_naive(mod_rem, settings.timestamp_format) |
| 243 | + mod_rem = mod_rem.timetuple() |
 | 244 | + mod_rem = int(time.mktime(mod_rem))
241 | 245 | os.utime(path, (mod_rem, mod_rem)) |
242 | 246 | #sraise exceptions.NotYetImplementedError(set_modified_data) |
243 | 247 | |
244 | 248 | def get_modified_date(location, filename): |
245 | 249 | path = os.path.join(location, filename) |
246 | | - return os.stat(path).st_mtime |
| 250 | + mod_date = os.stat(path).st_mtime |
| 251 | + mod_date = datetime.datetime.fromtimestamp(mod_date) |
| 252 | + return mod_date |
247 | 253 | |
248 | 254 | |
249 | 255 | def check_file_exists(location, filename): |
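
With these changes set_modified_data and get_modified_date form a round-trip between naive datetime objects and a file's mtime. A minimal sketch under the same assumption (naive local timestamps; the file name is hypothetical):

import os
import time
import datetime

def roundtrip_mtime(path, mod_rem):
    # datetime -> epoch seconds; assumes mod_rem is a naive local timestamp
    epoch = int(time.mktime(mod_rem.timetuple()))
    os.utime(path, (epoch, epoch))
    # epoch seconds -> datetime, mirroring get_modified_date
    return datetime.datetime.fromtimestamp(os.stat(path).st_mtime)

open('example.txt', 'w').close()  # hypothetical file for the demonstration
stamp = datetime.datetime(2010, 3, 15, 7, 7, 30)
print roundtrip_mtime('example.txt', stamp) == stamp  # True, barring DST ambiguity
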
Index: trunk/tools/editor_trends/utils/http_utils.py |
— | — | @@ -74,10 +74,12 @@ |
75 | 75 | else: |
76 | 76 | print 'Added chunk to download: %s' % f |
77 | 77 | task_queue.put(f) |
78 | | -# if x < settings.number_of_processes: |
79 | | -# settings.number_of_processes = x |
80 | | - for x in xrange(settings.number_of_processes): |
81 | | - task_queue.put(None) |
| 78 | + if x == 1: |
 | 79 | + # a single chunk needs only one poison pill
 | 80 | + task_queue.put(None)
 | 81 | + else:
 | 82 | + for i in xrange(settings.number_of_processes):
 | 83 | + task_queue.put(None)
82 | 84 | return task_queue |
83 | 85 | |
84 | 86 | |
— | — | @@ -106,7 +108,6 @@ |
107 | 109 | |
108 | 110 | def determine_modified_date(domain, path, filename): |
109 | 111 | res = get_headers(domain, path, filename) |
110 | | - print res.__dict__ |
111 | 112 | if res != None and (res.status == 200 or res.status == 301): |
112 | 113 | return res.getheader('last-modified', -1) |
113 | 114 | else: |
— | — | @@ -129,11 +130,6 @@ |
130 | 131 | print mod_date |
131 | 132 | mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, '%a, %d %b %Y %H:%M:%S %Z') |
132 | 133 | print mod_date |
133 | | - #check_remote_path_exists(domain, path, filename) |
134 | | - #read_directory_contents(domain, path) |
135 | | -# download_wp_dump('http://download.wikimedia.org/enwiki/latest', |
136 | | -# 'enwiki-latest-page_props.sql.gz', |
137 | | -# settings.input_location) |
138 | 134 | |
139 | 135 | |
140 | 136 | if __name__ == '__main__': |
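
create_list_dumpfiles now ends by queueing one None sentinel per consumer (or a single sentinel when only one chunk was found), so each download worker can exit cleanly. A self-contained sketch of that poison-pill pattern; number_of_processes is a stand-in for settings.number_of_processes:

import multiprocessing

def worker(task_queue):
    while True:
        task = task_queue.get()
        if task is None:  # poison pill: this worker is done
            break
        print 'downloading %s' % task

if __name__ == '__main__':
    number_of_processes = 2  # stand-in for settings.number_of_processes
    tasks = multiprocessing.Queue()
    for f in ['dump-p1.xml.gz', 'dump-p2.xml.gz', 'dump-p3.xml.gz']:
        tasks.put(f)
    for _ in xrange(number_of_processes):
        tasks.put(None)
    consumers = [multiprocessing.Process(target=worker, args=(tasks,))
                 for _ in xrange(number_of_processes)]
    for c in consumers:
        c.start()
    for c in consumers:
        c.join()
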
Index: trunk/tools/editor_trends/utils/data_converter.py |
— | — | @@ -55,9 +55,9 @@ |
56 | 56 | for obs in var['obs'][date]['data']: |
57 | 57 | if ds.format == 'long': |
58 | 58 | if isinstance(var['obs'][date]['data'], dict): |
59 | | - for subdata in var['obs'][date]['data']: |
60 | | - for k,v in var['obs'][date]['data'][subdata]['data'].iteritems(): |
61 | | - o.append([datum, obs, k, v]) |
| 59 | + #for subdata in var['obs'][date]['data']: |
| 60 | + for k, v in var['obs'][date]['data'][obs]['data'].iteritems(): |
| 61 | + o.append([datum, obs, k, v]) |
62 | 62 | else: |
63 | 63 | o.append([datum, obs, var['obs'][date]['data'][obs]]) |
64 | 64 | data.extend(o) |
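
The corrected loop indexes the nested dict by the current observation instead of re-iterating every sub-key, which previously emitted duplicate rows. A toy version of the long-format flattening, using a hypothetical observation with the same shape ({window: {'data': {frequency: count}}}):

obs_data = {24: {'data': {1.0: 3, 2.0: 1}}}  # hypothetical observation
datum = '2003-12-31'
rows = []
for obs in obs_data:
    for k, v in obs_data[obs]['data'].iteritems():
        rows.append([datum, obs, k, v])
print rows  # two long-format rows; dict iteration order may vary
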
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -39,6 +39,9 @@ |
40 | 40 | project = pjc.get_project(task['project']) |
41 | 41 | lnc = languages.LanguageContainer() |
42 | 42 | language = lnc.get_language(task['language_code']) |
| 43 | + |
| 44 | + args.language = language.name |
| 45 | + args.project = project.name |
43 | 46 | rts = runtime_settings.RunTimeSettings(project, language, settings, args) |
44 | 47 | res = manager.all_launcher(rts, settings, None) |
45 | 48 | return res |
— | — | @@ -86,7 +89,6 @@ |
87 | 90 | tasks = [] |
88 | 91 | jobs = coll.find({'finished': False, 'in_progress': False, 'error': False}) |
89 | 92 | for job in jobs: |
90 | | - job['language_code'] = u'nl' |
91 | 93 | tasks.append(job) |
92 | 94 | |
93 | 95 | for task in tasks: |
— | — | @@ -113,8 +115,11 @@ |
114 | 116 | launcher() |
115 | 117 | |
116 | 118 | |
117 | | - |
118 | 119 | if __name__ == '__main__': |
| 120 | + x = 0 |
119 | 121 | while True: |
120 | 122 | launcher() |
121 | | - time.sleep(5 * 60) |
| 123 | + time.sleep(x * 60) |
 | 124 | + x += 1
| 125 | + if x > 30: |
| 126 | + x = 0 |
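
With the increment fixed to x += 1, the polling loop sleeps zero minutes on its first pass and then backs off by one extra minute per cycle, wrapping after 30 minutes. The cadence in isolation:

def poll_delays(cycles):
    # yield the sleep in seconds for each polling cycle: zero at first,
    # one extra minute per cycle, wrapping after 30 minutes
    x = 0
    for _ in xrange(cycles):
        yield x * 60
        x += 1
        if x > 30:
            x = 0

print list(poll_delays(5))  # [0, 60, 120, 180, 240]
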