Index: trunk/tools/editor_trends/manage.py
@@ -173,13 +173,13 @@
     print 'Start transforming dataset'
     stopwatch = timer.Timer()
     log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start')
-    db.cleanup_database(properties.project, logger, 'dataset')
+    db.cleanup_database(properties.project.name, logger, 'dataset')
     # write_message_to_log(logger, settings,
     #                      message=None,
     #                      verb='Transforming',
     #                      project=properties.project,
     #                      collection=properties.collection)
-    transformer.transform_editors_single_launcher(properties.project,
+    transformer.transform_editors_single_launcher(properties.project.name,
                                                   properties.collection)
     stopwatch.elapsed()
     log.log_to_mongo(properties, 'dataset', 'transform', stopwatch,
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -17,7 +17,11 @@
 __date__ = '2011-01-25'
 __version__ = '0.1'
 
+import datetime
+from dateutil.relativedelta import relativedelta
+from utils import data_converter
 
+
 def cohort_dataset_backward_bar(var, editor, **kwargs):
     #first_edit = editor['first_edit']
     '''
@@ -30,7 +34,7 @@
     n = editor['edit_count']
 
     if n >= var.cum_cutoff:
-        windows = create_windows(var, break_down_first_year=False)
+        windows = data_converter.create_windows(var, break_down_first_year=False)
         for year in xrange(new_wikipedian.year, var.max_year):
             year = str(year)
             if editor['edits_by_year'][year] >= var.cutoff:
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -94,8 +94,6 @@
                                   format=fmt)
     var = dataset.Variable('count', **kwargs)
 
-
-
     for editor in editors:
         editor = coll.find_one({'editor': editor})
         var = func(var, editor, dbname=dbname)
@@ -168,25 +166,14 @@
     return min_year, max_year
 
 
-def create_windows(var, break_down_first_year=True):
-    '''
-    This function creates a list of months. If break_down_first_year = True then
-    the first year will be split in 3, 6, 9 months as well.
-    '''
-    years = var.max_year - var.min_year
-    windows = [y * 12 for y in xrange(1, years)]
-    if break_down_first_year:
-        windows = [3, 6, 9] + windows
-    return windows
-
-
 if __name__ == '__main__':
+    generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
     #generate_chart_data('wiki', 'editors_dataset','en', 'total_number_of_new_wikipedians', time_unit='year')
     #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
     #generate_chart_data('wiki', 'editors_dataset','en', 'total_cumulative_edits', time_unit='year')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
     #generate_chart_data('wiki', 'editors_dataset','en', 'histogram_edits', time_unit='year', cutoff=0)
     #generate_chart_data('wiki', 'editors_dataset','en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
     #generate_chart_data('wiki', 'editors_dataset','en', 'new_editor_count', time_unit='month', cutoff=0)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -129,33 +129,26 @@
     def __getitem__(self, key):
         return getattr(self, key, [])
 
-    def next(self):
-        try:
-            return len(self.data.keys()) + 1
-        except IndexError:
-            return 0
-
     def add(self, value, update):
         '''
         If update == True then data[i] will be incremented else data[i] will be
         created, in that case make sure that i is unique. Update is useful for
         tallying a variable.
         '''
-        if hasattr(value, '__iter__') == False:
-            d = {}
-            d[0] = value
-            value = d
-        assert type(value) == type({})
-        x = self.next()
-        for i, v in value.iteritems():
-            self.data.setdefault(i, 0)
-            if update:
-                self.data[i] += v
-            else:
-                i += x
-                self.data[i] = v
+        assert isinstance(value, dict)
+        if update:
+            for k, v in value.iteritems():
+                self.data.setdefault(k, 0)
+                self.data[k] += v
+        else:
+            try:
+                i = max(self.data.keys()) + 1
+            except ValueError:
+                i = 0
+            self.data[i] = value
 
 
+
 class Variable(Data):
     '''
     This class constructs a time-based variable.
@@ -337,8 +330,8 @@
         data, all_keys = data_converter.convert_dataset_to_lists(self, 'manage')
         headers = data_converter.add_headers(self, all_keys)
         fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding)
-        file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format)
-        file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
+        file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
+        file_utils.write_list_to_csv(data, fh, recursive=False, newline=True)
         fh.close()
 
     def encode(self):
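
For context, a minimal standalone sketch (not part of the commit) of the two code paths the rewritten add() implements; DataSketch is an invented stand-in for the Data class and only mimics its self.data dict:

class DataSketch(object):
    '''Illustrative stand-in for Data; only the self.data dict matters here.'''
    def __init__(self):
        self.data = {}

    def add(self, value, update):
        assert isinstance(value, dict)
        if update:
            # tally mode: increment a counter per key
            for k, v in value.iteritems():
                self.data.setdefault(k, 0)
                self.data[k] += v
        else:
            # append mode: store the whole dict under the next free integer key
            try:
                i = max(self.data.keys()) + 1
            except ValueError:
                i = 0
            self.data[i] = value


if __name__ == '__main__':
    d = DataSketch()
    d.add({2009: 1, 2010: 3}, update=True)
    d.add({2010: 2}, update=True)
    print d.data  # counters: 2009 -> 1, 2010 -> 5

    e = DataSketch()
    e.add({'edits': 7}, update=False)
    e.add({'edits': 2}, update=False)
    print e.data  # observations stored under keys 0 and 1
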
Index: trunk/tools/editor_trends/etl/store.py
@@ -17,7 +17,7 @@
 __date__ = '2011-01-04'
 __version__ = '0.1'
 
-
+from Queue import Empty
 import multiprocessing
 import sys
 
@@ -41,11 +41,16 @@
     '''
     mongo = db.init_mongo_db(dbname)
     collection = mongo[collection]
+
     editor_cache = cache.EditorCache(collection)
     prev_contributor = -1
     edits = 0
     while True:
-        filename = tasks.get(block=False)
+        try:
+            filename = tasks.get(block=False)
+        except Empty:
+            break
+
         tasks.task_done()
         if filename == None:
             print 'Swallowing a poison pill.'
@@ -53,28 +58,29 @@
         print '%s files left in the queue.' % messages.show(tasks.qsize)
 
         fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding)
+        print fh
         for line in file_utils.read_raw_data(fh):
-            #print line
-            if len(line) == 0:
-                continue
-            contributor = line[0]
-            #print 'Parsing %s' % contributor
-            if prev_contributor != contributor:
-                if edits > 9:
-                    editor_cache.add(prev_contributor, 'NEXT')
-                    #print 'Stored %s' % prev_contributor
-                else:
-                    editor_cache.clear(prev_contributor)
-                edits = 0
-            edits += 1
-            date = text_utils.convert_timestamp_to_datetime_utc(line[1]) #+ datetime.timedelta(days=1)
-            article_id = int(line[2])
-            username = line[3].encode(settings.encoding)
-            value = {'date': date, 'article': article_id, 'username': username}
-            editor_cache.add(contributor, value)
-            prev_contributor = contributor
+            if len(line) > 1:
+                contributor = line[0]
+                #print 'Parsing %s' % contributor
+                if prev_contributor != contributor:
+                    if edits > 9:
+                        editor_cache.add(prev_contributor, 'NEXT')
+                        print 'Stored %s' % prev_contributor
+                    else:
+                        editor_cache.clear(prev_contributor)
+                    edits = 0
+                edits += 1
+                date = text_utils.convert_timestamp_to_datetime_utc(line[1])
+                article_id = int(line[2])
+                username = line[3].encode(settings.encoding)
+                value = {'date': date,
+                         'article': article_id,
+                         'username': username}
+                editor_cache.add(contributor, value)
+                prev_contributor = contributor
         fh.close()
-    print editor_cache.n
+    #print editor_cache.n
 
 
 def launcher(source, dbname, collection):
@@ -88,8 +94,8 @@
     coll.create_index('editor')
 
     files = file_utils.retrieve_file_list(source, 'csv')
-    print files
-    print source
+
+    print 'Input directory is: %s ' % source
     tasks = multiprocessing.JoinableQueue()
     consumers = [multiprocessing.Process(target=store_editors,
                                          args=(tasks, dbname, collection, source))
@@ -106,3 +112,4 @@
 
     tasks.join()
 
+
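
A minimal sketch (invented names, not from the repository) of the consumer pattern store_editors now follows: drain a JoinableQueue with non-blocking gets, treat Queue.Empty as the end of work, and treat None as a poison pill; it assumes the queue is filled before the consumer starts, as in the patched code:

import multiprocessing
from Queue import Empty

def consume(tasks):
    while True:
        try:
            filename = tasks.get(block=False)
        except Empty:
            break
        tasks.task_done()
        if filename is None:
            print 'Swallowing a poison pill.'
            break
        print 'Processing %s' % filename

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    for name in ['a.csv', 'b.csv']:
        tasks.put(name)
    tasks.put(None)  # one poison pill per consumer
    worker = multiprocessing.Process(target=consume, args=(tasks,))
    worker.start()
    tasks.join()
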
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -12,7 +12,6 @@
 http://www.fsf.org/licenses/gpl.html
 '''
 
-
 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 __email__ = 'dvanliere at gmail dot com'
 __date__ = '2011-01-21'
@@ -23,15 +22,15 @@
 import multiprocessing
 import sys
 
-sys.path.append('..')
-import configuration
-settings = configuration.Settings()
+#sys.path.append('..')
+#import configuration
+#settings = configuration.Settings()
 
 from utils import file_utils
 from utils import http_utils
 from utils import log
 
-def download_wiki_file(task_queue, config):
+def download_wiki_file(task_queue, properties):
     '''
     This is a very simple replacement for wget and curl because Windows does
     not have these tools installed by default
@@ -46,26 +45,39 @@
             break
         extension = file_utils.determine_file_extension(filename)
         filemode = file_utils.determine_file_mode(extension)
-        filesize = http_utils.determine_remote_filesize(settings.wp_dump_location, config.path, filename)
+        filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location,
+                                                        properties.dump_relative_path,
+                                                        filename)
+
+#        mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location,
+#                                                     properties.dump_relative_path,
+#                                                     filename)
+
+        if file_utils.check_file_exists(properties.location, filename):
+            #This can be activated as soon as bug 21575 is fixed.
+            #mod_loc = file_utils.get_modified_date(properties.location, filename)
+            #if mod_loc != mod_rem:
+            print 'Swallowed a poison pill'
+            break
+
         if filemode == 'w':
-            fh = file_utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
+            fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding)
+
         else:
-            fh = file_utils.create_binary_filehandle(config.location, filename, 'wb')
+            fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')
 
         if filesize != -1:
             widgets = log.init_progressbar_widgets(filename)
             pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
 
         try:
-            if filename.endswith('json'):
-                req = urllib2.Request(settings.wp_dump_location + config.path)
-            else:
-                req = urllib2.Request(settings.wp_dump_location + config.path + filename)
+            path = '%s%s' % (properties.dump_absolute_path, filename)
+            req = urllib2.Request(path)
             response = urllib2.urlopen(req)
             while True:
                 data = response.read(chunk)
                 if not data:
-                    print 'Finished downloading %s%s%s.' % (settings.wp_dump_location, config.path, filename)
+                    print 'Finished downloading %s.' % (path)
                     break
                 fh.write(data)
 
@@ -83,22 +95,30 @@
         finally:
             fh.close()
 
+        #file_utils.set_modified_data(mod_rem, properties.location, filename)
+
     return success
 
 
 def launcher(properties, settings, logger):
     print 'Creating list of files to be downloaded...'
-    tasks = http_utils.create_list_dumpfiles(settings.wp_dump_location,
-                                             properties.path,
-                                             properties.filename)
+    result = True
+    tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location,
                                              properties.dump_relative_path,
+                                             properties.dump_filename)
+    #print tasks.qsize()
+    #if tasks.qsize() < properties.settings.number_of_processes:
+    #    properties.settings.number_of_processes = tasks.qsize()
     consumers = [multiprocessing.Process(target=download_wiki_file,
                                          args=(tasks, properties))
-                 for i in xrange(settings.number_of_processes)]
-
+                 for i in xrange(properties.settings.number_of_processes + 1)]
     print 'Starting consumers to download files...'
    for w in consumers:
         w.start()
 
     tasks.join()
-    result = all([consumer.exitcode for consumer in consumers])
+    for consumer in consumers:
+        if consumer.exitcode != 0 and consumer.exitcode != None:
+            result = False
+
     return result
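
The download loop itself is a plain chunked urllib2 read. A self-contained sketch of that pattern, with a hypothetical URL and chunk size and without the project's helper functions:

import urllib2

def download(url, destination, chunk=1024 * 1024):
    '''Stream url to destination in fixed-size chunks, roughly what download_wiki_file does.'''
    fh = open(destination, 'wb')
    try:
        response = urllib2.urlopen(urllib2.Request(url))
        while True:
            data = response.read(chunk)
            if not data:
                print 'Finished downloading %s.' % url
                break
            fh.write(data)
    finally:
        fh.close()

if __name__ == '__main__':
    # hypothetical dump file name, for illustration only
    download('http://dumps.wikimedia.org/svwiki/latest/svwiki-latest-stub-meta-history.xml.gz',
             'svwiki-latest-stub-meta-history.xml.gz')
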
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -22,6 +22,7 @@
 import os
 import multiprocessing
 import progressbar
+from Queue import Empty
 
 sys.path.append('..')
 import configuration
@@ -204,7 +205,7 @@
             for function in tags[tag].keys():
                 f = tags[tag][function]
                 value = f(el, bots=bots)
-                if isinstance(value, list):
+                if isinstance(value, dict):
                     #if type(value) == type({}):
                     for kw in value:
                         vars[x][kw] = value[kw]
@@ -232,35 +233,51 @@
     location = os.path.join(settings.input_location, language_code, project)
     output = os.path.join(settings.input_location, language_code, project, 'txt')
     widgets = log.init_progressbar_widgets('Extracting data')
+    filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
+                                                    settings.encoding) for fh in xrange(settings.max_filehandles)]
 
     while True:
         total, processed = 0.0, 0.0
-        filename = tasks.get(block=False)
-        tasks.task_done()
+        try:
+            filename = tasks.get(block=False)
+        except Empty:
+            break
+        finally:
+            print tasks.qsize()
+            tasks.task_done()
+
         if filename == None:
-            print 'There are no more jobs in the queue left.'
+            print '\nThere are no more jobs in the queue left.'
             break
 
         filesize = file_utils.determine_filesize(location, filename)
-        fh = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
-        ns, xml_namespace = wikitree.parser.extract_meta_information(fh)
+        print 'Opening %s...' % (os.path.join(location, filename))
+        print 'Filesize: %s' % filesize
+        fh1 = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
+        fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', settings.encoding)
+        ns, xml_namespace = wikitree.parser.extract_meta_information(fh1)
         ns = build_namespaces_locale(ns, namespaces)
         settings.xml_namespace = xml_namespace
 
         pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
-        for page, article_size in wikitree.parser.read_input(fh):
+        for page, article_size in wikitree.parser.read_input(fh1):
             title = page.find('title')
             total += 1
             if verify_article_belongs_namespace(title, ns):
                 article_id = page.find('id').text
+                title = page.find('title').text
                 revisions = page.findall('revision')
                 revisions = parse_comments(revisions, remove_numeric_character_references)
                 output = output_editor_information(revisions, article_id, bot_ids)
                 write_output(output, filehandles, lock)
+                file_utils.write_list_to_csv([article_id, title], fh2)
                 processed += 1
             page.clear()
             pbar.update(pbar.currval + article_size)
-        fh.close()
+
+        fh1.close()
+        fh2.close()
+        print 'Closing %s...' % (os.path.join(location, filename))
         print 'Total pages: %s' % total
         print 'Pages processed: %s (%s)' % (processed, processed / total)
@@ -269,7 +286,8 @@
 
 
 def group_observations(obs):
     '''
-    mmm forgot the purpose of this function
+    This function groups observations by editor id; this way we have to make
+    fewer file-opening calls.
     '''
     d = {}
     for o in obs:
@@ -283,17 +301,19 @@
 def write_output(observations, filehandles, lock):
     observations = group_observations(observations)
     for obs in observations:
-        for i, o in enumerate(observations[obs]):
-            if i == 0:
-                fh = filehandles[hash(obs)]
-            try:
-                lock.acquire()
+        lock.acquire() #lock the write around all edits of an editor for a particular page
+        try:
+            for i, o in enumerate(observations[obs]):
+                if i == 0:
+                    fh = filehandles[hash(obs)]
                 file_utils.write_list_to_csv(o, fh)
-                lock.releas()
-            except Exception, error:
-                print error
 
+        except Exception, error:
+            print 'Encountered the following error while writing data to %s: %s' % (fh, error)
+        finally:
+            lock.release()
 
+
 def hash(id):
     '''
     A very simple hash function based on modulo. The except clause has been
@@ -307,8 +327,10 @@
 
 
 def prepare(output):
-    file_utils.delete_file(output, None, directory=True)
-    file_utils.create_directory(output)
+    res = file_utils.delete_file(output, None, directory=True)
+    if res:
+        res = file_utils.create_directory(output)
+    return res
 
 
 def unzip(properties):
@@ -336,7 +358,7 @@
             print 'There was an error while extracting %s, please make sure \
 that %s is valid archive.' % (fn, fn)
             return False
-
+    print tasks.qsize()
     return tasks
 
 def launcher(properties):
@@ -349,17 +371,22 @@
     '''
     result = True
     tasks = unzip(properties)
-    prepare(properties.language_code, properties.project)
+
+    output = os.path.join(settings.input_location, properties.language.code,
+                          properties.project.name, 'txt')
+    result = prepare(output)
+    if not result:
+        return result
+
     lock = multiprocessing.Lock()
-    filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
-                                                    settings.encoding) for fh in xrange(settings.max_filehandles)]
-    output = os.path.join(settings.input_location, properties.language_code,
-                          properties.project, 'txt')
+#    filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
+#                                                    settings.encoding) for fh in xrange(settings.max_filehandles)]
 
+    filehandles = []
     consumers = [multiprocessing.Process(target=parse_dumpfile,
                                          args=(tasks,
-                                               properties.project,
-                                               properties.language_code,
+                                               properties.project.name,
+                                               properties.language.code,
                                                filehandles,
                                                lock,
                                                properties.namespaces))
@@ -369,15 +396,12 @@
     tasks.put(None)
 
     for w in consumers:
+        print 'Launching process...'
         w.start()
 
     tasks.join()
     filehandles = [fh.close() for fh in filehandles]
-#    result = parse_dumpfile(properties.project, file_without_ext,
-#                            properties.language_code,
-#                            namespaces=['0'])
 
-
     result = all([consumer.exitcode for consumer in consumers])
     return result
 
@@ -386,8 +410,7 @@
     project = 'wiki'
     language_code = 'sv'
     filename = 'svwiki-latest-stub-meta-history.xml'
-    #parse_dumpfile(project, filename, language_code)
-    launcher()
+    parse_dumpfile(project, filename, language_code)
 
 if __name__ == '__main__':
     debug()
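
Two ideas from this file are worth making concrete: editor ids are mapped onto a fixed pool of output files with a simple modulo hash, and write_output now wraps the whole per-editor write in a single acquire/release. A rough standalone sketch with invented names and a small file pool; the real hash() and settings.max_filehandles may differ:

import multiprocessing

MAX_FILEHANDLES = 10  # stands in for settings.max_filehandles

def shard(editor_id):
    '''Modulo hash: editor ids that share a remainder share an output file (details invented).'''
    return int(editor_id) % MAX_FILEHANDLES

def write_grouped_output(grouped, filehandles, lock):
    '''grouped maps an editor id to the list of its observations for one page.'''
    for editor_id, rows in grouped.iteritems():
        lock.acquire()  # one acquire/release spans all edits of this editor
        try:
            fh = filehandles[shard(editor_id)]
            for row in rows:
                fh.write('\t'.join(str(field) for field in row) + '\n')
        finally:
            lock.release()

if __name__ == '__main__':
    lock = multiprocessing.Lock()
    filehandles = [open('%s.csv' % i, 'a') for i in xrange(MAX_FILEHANDLES)]
    write_grouped_output({'1234': [('1234', '2011-01-25', 42)]}, filehandles, lock)
    for fh in filehandles:
        fh.close()
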
Index: trunk/tools/editor_trends/etl/sort.py
@@ -83,8 +83,8 @@
     '''
     Merges smaller sorted files in one big file, no longer used.
     '''
-    fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration, 'w',
-                                          settings.encoding)
+    fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration,
+                                          'w', settings.encoding)
     lines = 0
     for line in heapq.merge(*[readline(filename) for filename in files]):
         file_utils.write_list_to_csv(line, fh)
@@ -98,7 +98,8 @@
     '''
     Writes the sorted file to target
     '''
-    fh = file_utils.create_txt_filehandle(target, filename, 'w', settings.encoding)
+    fh = file_utils.create_txt_filehandle(target, filename, 'w',
+                                          settings.encoding)
     file_utils.write_list_to_csv(sorted_data, fh)
     fh.close()
 
@@ -114,6 +115,7 @@
         tasks.task_done()
         if filename == None:
             print 'Swallowed a poison pill'
+            print tasks.qsize()
             break
 
         fh = file_utils.create_txt_filehandle(source,
@@ -129,8 +131,8 @@
             sorted_data = mergesort(data)
             write_sorted_file(sorted_data, filename, target)
             print filename, messages.show(tasks.qsize)
-        except UnicodeDecodeError:
-            continue
+        except UnicodeDecodeError, e:
+            print e
         except Empty:
             break
 
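
merge_sorted_files relies on heapq.merge to do a lazy n-way merge of inputs that are each already sorted. A minimal sketch of that idea, with in-memory lists standing in for the per-file CSV readers:

import heapq

def merge_sorted(*iterables):
    '''Lazily yield items from several sorted iterables in global sorted order.'''
    for item in heapq.merge(*iterables):
        yield item

if __name__ == '__main__':
    a = [1, 4, 9]
    b = [2, 3, 10]
    c = [5, 6, 7]
    print list(merge_sorted(a, b, c))  # [1, 2, 3, 4, 5, 6, 7, 9, 10]
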
Index: trunk/tools/editor_trends/classes/bots.py
@@ -87,7 +87,7 @@
         self.add_clock_data()
         self.active()
         self.data.append(self.dt)
-        file_utils.write_list_to_csv(self.data, fh, recursive=False, newline=True)
+        file_utils.write_list_to_csv(self.data, fh)
 
 
 
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -31,6 +31,8 @@
 import ctypes
 import sys
 import shutil
+import multiprocessing
+
 sys.path.append('..')
 
 
@@ -117,7 +119,7 @@
         return 'wb'
 
 
-def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
+def write_list_to_csv(data, fh, recursive=False, newline=True):
     '''
     @data is a list which can contain other lists that will be written as a
     single line to a textfile
@@ -126,22 +128,22 @@
     The calling function is responsible for:
     1) closing the filehandle
     '''
+    lock = multiprocessing.Lock()
     tab = False
     wrote_newline = None
     if recursive:
         recursive = False
+    lock.acquire()
     for x, d in enumerate(data):
         if tab:
             fh.write('\t')
         if isinstance(d, list):
-            #if type(d) == type([]):
             recursive = write_list_to_csv(d, fh, recursive=True, newline=False)
             #when there is a list of lists but no other elements in the first list
             #then write a newline.
             if len(d) == len(data[x]):
                 fh.write('\n')
         elif isinstance(d, dict):
-            #elif type(d) == type({}):
             tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
         else:
             fh.write('%s' % d)
@@ -152,8 +154,8 @@
         return True
     if newline:
         fh.write('\n')
+    lock.release()
 
-
 def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
     assert format == 'long' or format == 'wide'
 
@@ -162,13 +164,10 @@
         if write_key:
             fh.write('%s\t' % key)
         if isinstance(data[key], list):
-            #if type(data[key]) == type([]):
             for d in data[key]:
                 fh.write('%s\t%s\n' % (key, d))
         elif isinstance(data[key], dict):
-            #elif type(data[key]) == type({}):
             write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
-#        elif getattr(data[key], '__iter__', False):
 #            for d in data[key]:
 #                fh.write('%s\t%s\t%s\n' % (key, d, data[key][d]))
         else:
@@ -188,8 +187,6 @@
             fh.write('%s\t' % (data[key]))
         fh.write('\n')
 
-    #if type(data[key]) == type([]):
-    #    write_list_to_csv(data[key], fh, recursive=False, newline=True)
 
 def create_txt_filehandle(location, name, mode, encoding):
     filename = construct_filename(name, '.csv')
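
To make the write_list_to_csv contract from the docstring concrete, a small usage sketch with made-up data; it assumes it is run from trunk/tools/editor_trends so that utils is importable:

from utils import file_utils

fh = open('example.csv', 'w')
# a flat list becomes one tab-separated line
file_utils.write_list_to_csv(['editor_id', '2011-01-25', 42], fh)
# nested lists are handled by the recursive call, one line per inner list
file_utils.write_list_to_csv([['a', 1], ['b', 2]], fh)
fh.close()
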
Index: trunk/tools/editor_trends/utils/data_converter.py
@@ -20,6 +20,17 @@
 
 import datetime
 
+def create_windows(var, break_down_first_year=True):
+    '''
+    This function creates a list of months. If break_down_first_year = True then
+    the first year will be split in 3, 6, 9 months as well.
+    '''
+    years = var.max_year - var.min_year
+    windows = [y * 12 for y in xrange(1, years)]
+    if break_down_first_year:
+        windows = [3, 6, 9] + windows
+    return windows
+
 def convert_seconds_to_date(secs):
     #return time.gmtime(secs)
     return datetime.datetime.fromtimestamp(secs)
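
A quick illustration of what the relocated create_windows returns; VarStub is an invented stand-in for the dataset variable, which only needs min_year and max_year here, and the import assumes the script runs from trunk/tools/editor_trends:

from utils import data_converter

class VarStub(object):
    '''Stand-in for the dataset variable; create_windows only reads min_year/max_year.'''
    def __init__(self, min_year, max_year):
        self.min_year = min_year
        self.max_year = max_year

if __name__ == '__main__':
    var = VarStub(2004, 2011)
    print data_converter.create_windows(var)                               # [3, 6, 9, 12, 24, 36, 48, 60, 72]
    print data_converter.create_windows(var, break_down_first_year=False)  # [12, 24, 36, 48, 60, 72]
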