r81172 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r81172 (previous: r81171 | next: r81173)
Date: 22:30, 28 January 2011
Author: diederik
Status: deferred
Tags:
Comment:
A little bit of cleaning up.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py (modified) (history)
  • /trunk/tools/editor_trends/classes/bots.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/data_converter.py (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/manage.py
@@ -173,13 +173,13 @@
174174 print 'Start transforming dataset'
175175 stopwatch = timer.Timer()
176176 log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start')
177 - db.cleanup_database(properties.project, logger, 'dataset')
 177+ db.cleanup_database(properties.project.name, logger, 'dataset')
178178 # write_message_to_log(logger, settings,
179179 # message=None,
180180 # verb='Transforming',
181181 # project=properties.project,
182182 # collection=properties.collection)
183 - transformer.transform_editors_single_launcher(properties.project,
 183+ transformer.transform_editors_single_launcher(properties.project.name,
184184 properties.collection)
185185 stopwatch.elapsed()
186186 log.log_to_mongo(properties, 'dataset', 'transform', stopwatch,
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -17,7 +17,11 @@
1818 __date__ = '2011-01-25'
1919 __version__ = '0.1'
2020
 21+import datetime
 22+from dateutil.relativedelta import relativedelta
 23+from utils import data_converter
2124
 25+
2226 def cohort_dataset_backward_bar(var, editor, **kwargs):
2327 #first_edit = editor['first_edit']
2428 '''
@@ -30,7 +34,7 @@
3135 n = editor['edit_count']
3236
3337 if n >= var.cum_cutoff:
34 - windows = create_windows(var, break_down_first_year=False)
 38+ windows = data_converter.create_windows(var, break_down_first_year=False)
3539 for year in xrange(new_wikipedian.year, var.max_year):
3640 year = str(year)
3741 if editor['edits_by_year'][year] >= var.cutoff:
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -94,8 +94,6 @@
9595 format=fmt)
9696 var = dataset.Variable('count', **kwargs)
9797
98 -
99 -
10098 for editor in editors:
10199 editor = coll.find_one({'editor': editor})
102100 var = func(var, editor, dbname=dbname)
@@ -168,25 +166,14 @@
169167 return min_year, max_year
170168
171169
172 -def create_windows(var, break_down_first_year=True):
173 - '''
174 - This function creates a list of months. If break_down_first_year = True then
175 - the first year will be split in 3, 6, 9 months as well.
176 - '''
177 - years = var.max_year - var.min_year
178 - windows = [y * 12 for y in xrange(1, years)]
179 - if break_down_first_year:
180 - windows = [3, 6, 9] + windows
181 - return windows
182 -
183 -
184170 if __name__ == '__main__':
 171+ generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
185172 #generate_chart_data('wiki', 'editors_dataset','en', 'total_number_of_new_wikipedians', time_unit='year')
186173 #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
187174 #generate_chart_data('wiki', 'editors_dataset','en', 'total_cumulative_edits', time_unit='year')
188 - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
189 - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
190 - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
 175+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
 176+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
 177+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
191178 #generate_chart_data('wiki', 'editors_dataset','en', 'histogram_edits', time_unit='year', cutoff=0)
192179 #generate_chart_data('wiki', 'editors_dataset','en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
193180 #generate_chart_data('wiki', 'editors_dataset','en', 'new_editor_count', time_unit='month', cutoff=0)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -129,33 +129,26 @@
130130 def __getitem__(self, key):
131131 return getattr(self, key, [])
132132
133 - def next(self):
134 - try:
135 - return len(self.data.keys()) + 1
136 - except IndexError:
137 - return 0
138 -
139133 def add(self, value, update):
140134 '''
141135 If update == True then data[i] will be incremented else data[i] will be
142136 created, in that case make sure that i is unique. Update is useful for
143137 tallying a variable.
144138 '''
145 - if hasattr(value, '__iter__') == False:
146 - d = {}
147 - d[0] = value
148 - value = d
149 - assert type(value) == type({})
150 - x = self.next()
151 - for i, v in value.iteritems():
152 - self.data.setdefault(i, 0)
153 - if update:
154 - self.data[i] += v
155 - else:
156 - i += x
157 - self.data[i] = v
 139+ assert isinstance(value, dict)
 140+ if update:
 141+ for k, v in value:
 142+ self.data.setdefault(k, 0)
 143+ self.data[k] += v
 144+ else:
 145+ try:
 146+ i = max(self.data.keys()) + 1
 147+ except ValueError:
 148+ i = 0
 149+ self.data[i] = value
158150
159151
 152+
160153 class Variable(Data):
161154 '''
162155 This class constructs a time-based variable.
@@ -337,8 +330,8 @@
338331 data, all_keys = data_converter.convert_dataset_to_lists(self, 'manage')
339332 headers = data_converter.add_headers(self, all_keys)
340333 fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding)
341 - file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format)
342 - file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
 334+ file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
 335+ file_utils.write_list_to_csv(data, fh, recursive=False, newline=True)
343336 fh.close()
344337
345338 def encode(self):
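
The rewritten Data.add() above keeps two behaviours: when update is True it tallies values into existing keys, otherwise it appends the whole observation under the next free integer index. Below is a minimal standalone sketch of that pattern in the tool's Python 2 style; the TallyData class and the sample values are illustrative, not the real dataset.Data, and the sketch walks the dict with iteritems() to obtain key/value pairs.

class TallyData(object):
    def __init__(self):
        self.data = {}

    def add(self, value, update):
        assert isinstance(value, dict)
        if update:
            # tally: increment existing keys, creating them on first sight
            for k, v in value.iteritems():
                self.data.setdefault(k, 0)
                self.data[k] += v
        else:
            # append: store the observation under the next free integer index
            try:
                i = max(self.data.keys()) + 1
            except ValueError:  # no keys yet
                i = 0
            self.data[i] = value

if __name__ == '__main__':
    tally = TallyData()
    tally.add({'2009': 3, '2010': 1}, update=True)
    tally.add({'2009': 2}, update=True)
    print tally.data  # {'2009': 5, '2010': 1}

    log = TallyData()
    log.add({'article': 42, 'username': 'ExampleUser'}, update=False)
    log.add({'article': 7, 'username': 'AnotherUser'}, update=False)
    print log.data    # {0: {...}, 1: {...}}
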
Index: trunk/tools/editor_trends/etl/store.py
@@ -17,7 +17,7 @@
1818 __date__ = '2011-01-04'
1919 __version__ = '0.1'
2020
21 -
 21+from Queue import Empty
2222 import multiprocessing
2323 import sys
2424
@@ -41,11 +41,16 @@
4242 '''
4343 mongo = db.init_mongo_db(dbname)
4444 collection = mongo[collection]
 45+
4546 editor_cache = cache.EditorCache(collection)
4647 prev_contributor = -1
4748 edits = 0
4849 while True:
49 - filename = tasks.get(block=False)
 50+ try:
 51+ filename = tasks.get(block=False)
 52+ except Empty:
 53+ break
 54+
5055 tasks.task_done()
5156 if filename == None:
5257 print 'Swallowing a poison pill.'
@@ -53,28 +58,29 @@
5459 print '%s files left in the queue.' % messages.show(tasks.qsize)
5560
5661 fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding)
 62+ print fh
5763 for line in file_utils.read_raw_data(fh):
58 - #print line
59 - if len(line) == 0:
60 - continue
61 - contributor = line[0]
62 - #print 'Parsing %s' % contributor
63 - if prev_contributor != contributor:
64 - if edits > 9:
65 - editor_cache.add(prev_contributor, 'NEXT')
66 - #print 'Stored %s' % prev_contributor
67 - else:
68 - editor_cache.clear(prev_contributor)
69 - edits = 0
70 - edits += 1
71 - date = text_utils.convert_timestamp_to_datetime_utc(line[1]) #+ datetime.timedelta(days=1)
72 - article_id = int(line[2])
73 - username = line[3].encode(settings.encoding)
74 - value = {'date': date, 'article': article_id, 'username': username}
75 - editor_cache.add(contributor, value)
76 - prev_contributor = contributor
 64+ if len(line) > 1:
 65+ contributor = line[0]
 66+ #print 'Parsing %s' % contributor
 67+ if prev_contributor != contributor:
 68+ if edits > 9:
 69+ editor_cache.add(prev_contributor, 'NEXT')
 70+ print 'Stored %s' % prev_contributor
 71+ else:
 72+ editor_cache.clear(prev_contributor)
 73+ edits = 0
 74+ edits += 1
 75+ date = text_utils.convert_timestamp_to_datetime_utc(line[1])
 76+ article_id = int(line[2])
 77+ username = line[3].encode(settings.encoding)
 78+ value = {'date': date,
 79+ 'article': article_id,
 80+ 'username': username}
 81+ editor_cache.add(contributor, value)
 82+ prev_contributor = contributor
7783 fh.close()
78 - print editor_cache.n
 84+ #print editor_cache.n
7985
8086
8187 def launcher(source, dbname, collection):
@@ -88,8 +94,8 @@
8995 coll.create_index('editor')
9096
9197 files = file_utils.retrieve_file_list(source, 'csv')
92 - print files
93 - print source
 98+
 99+ print 'Input directory is: %s ' % source
94100 tasks = multiprocessing.JoinableQueue()
95101 consumers = [multiprocessing.Process(target=store_editors,
96102 args=(tasks, dbname, collection, source))
@@ -106,3 +112,4 @@
107113
108114 tasks.join()
109115
 116+
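
The store.py hunks wrap the non-blocking tasks.get() in a Queue.Empty guard on top of the existing None poison pill, so a worker exits cleanly whether the queue runs dry or it swallows a pill. A minimal standalone sketch of that consumer loop follows (Python 2; the worker body and file names are placeholders, and the sketch uses a short timeout where the tool uses block=False, purely to avoid a startup race in this toy setup).

from Queue import Empty
import multiprocessing

def consumer(tasks):
    while True:
        try:
            filename = tasks.get(block=True, timeout=5)
        except Empty:
            break
        tasks.task_done()
        if filename is None:
            print 'Swallowing a poison pill.'
            break
        print 'Would store editors from %s here.' % filename

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    for name in ['0.csv', '1.csv']:
        tasks.put(name)
    tasks.put(None)  # one poison pill per consumer
    worker = multiprocessing.Process(target=consumer, args=(tasks,))
    worker.start()
    tasks.join()     # returns once task_done() was called for every item
    worker.join()
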
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -12,7 +12,6 @@
1313 http://www.fsf.org/licenses/gpl.html
1414 '''
1515
16 -
1716 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
1817 __email__ = 'dvanliere at gmail dot com'
1918 __date__ = '2011-01-21'
@@ -23,15 +22,15 @@
2423 import multiprocessing
2524 import sys
2625
27 -sys.path.append('..')
28 -import configuration
29 -settings = configuration.Settings()
 26+#sys.path.append('..')
 27+#import configuration
 28+#settings = configuration.Settings()
3029
3130 from utils import file_utils
3231 from utils import http_utils
3332 from utils import log
3433
35 -def download_wiki_file(task_queue, config):
 34+def download_wiki_file(task_queue, properties):
3635 '''
3736 This is a very simple replacement for wget and curl because Windows does
3837 not have these tools installed by default
@@ -46,26 +45,39 @@
4746 break
4847 extension = file_utils.determine_file_extension(filename)
4948 filemode = file_utils.determine_file_mode(extension)
50 - filesize = http_utils.determine_remote_filesize(settings.wp_dump_location, config.path, filename)
 49+ filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location,
 50+ properties.dump_relative_path,
 51+ filename)
 52+
 53+# mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location,
 54+# properties.dump_relative_path,
 55+# filename)
 56+
 57+ if file_utils.check_file_exists(properties.location, filename):
 58+ #This can be activated as soon as bug 21575 is fixed.
 59+ #mod_loc = file_utils.get_modified_date(properties.location, filename)
 60+ #if mod_loc != mod_rem:
 61+ print 'Swallowed a poison pill'
 62+ break
 63+
5164 if filemode == 'w':
52 - fh = file_utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
 65+ fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding)
 66+
5367 else:
54 - fh = file_utils.create_binary_filehandle(config.location, filename, 'wb')
 68+ fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')
5569
5670 if filesize != -1:
5771 widgets = log.init_progressbar_widgets(filename)
5872 pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
5973
6074 try:
61 - if filename.endswith('json'):
62 - req = urllib2.Request(settings.wp_dump_location + config.path)
63 - else:
64 - req = urllib2.Request(settings.wp_dump_location + config.path + filename)
 75+ path = '%s%s' % (properties.dump_absolute_path, filename)
 76+ req = urllib2.Request(path)
6577 response = urllib2.urlopen(req)
6678 while True:
6779 data = response.read(chunk)
6880 if not data:
69 - print 'Finished downloading %s%s%s.' % (settings.wp_dump_location, config.path, filename)
 81+ print 'Finished downloading %s.' % (path)
7082 break
7183 fh.write(data)
7284
@@ -83,22 +95,30 @@
8496 finally:
8597 fh.close()
8698
 99+ #file_utils.set_modified_data(mod_rem, properties.location, filename)
 100+
87101 return success
88102
89103
90104 def launcher(properties, settings, logger):
91105 print 'Creating list of files to be downloaded...'
92 - tasks = http_utils.create_list_dumpfiles(settings.wp_dump_location,
93 - properties.path,
94 - properties.filename)
 106+ result = True
 107+ tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location,
 108+ properties.dump_relative_path,
 109+ properties.dump_filename)
 110+ #print tasks.qsize()
 111+ #if tasks.qsize() < properties.settings.number_of_processes:
 112+ # properties.settings.number_of_processes = tasks.qsize()
95113 consumers = [multiprocessing.Process(target=download_wiki_file,
96114 args=(tasks, properties))
97 - for i in xrange(settings.number_of_processes)]
98 -
 115+ for i in xrange(properties.settings.number_of_processes + 1)]
99116 print 'Starting consumers to download files...'
100117 for w in consumers:
101118 w.start()
102119
103120 tasks.join()
104 - result = all([consumer.exitcode for consumer in consumers])
 121+ for consumer in consumers:
 122+ if consumer.exitcode != 0 and consumer.exitcode != None:
 123+ result = False
 124+
105125 return result
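
download_wiki_file() above streams each dump in fixed-size chunks through urllib2 and now skips files that already exist locally, while launcher() treats any exit code other than 0 or None as a failure. A minimal sketch of the chunked download with the existence check (Python 2; the URL, chunk size and destination are placeholders, not values taken from the tool's properties object):

import os
import urllib2

def download_file(url, destination, chunk=1024 * 1024):
    '''Stream url into destination in fixed-size chunks.'''
    filename = os.path.join(destination, url.rsplit('/', 1)[-1])
    if os.path.exists(filename):
        # mirrors the new check_file_exists() guard: never fetch twice
        print 'Already have %s, skipping.' % filename
        return True
    fh = open(filename, 'wb')
    try:
        response = urllib2.urlopen(urllib2.Request(url))
        while True:
            data = response.read(chunk)
            if not data:
                print 'Finished downloading %s.' % url
                break
            fh.write(data)
    finally:
        fh.close()
    return True

if __name__ == '__main__':
    # placeholder URL; the tool assembles it from properties.dump_absolute_path
    download_file('http://download.wikimedia.org/svwiki/latest/'
                  'svwiki-latest-stub-meta-history.xml.gz', '.')
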
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -22,6 +22,7 @@
2323 import os
2424 import multiprocessing
2525 import progressbar
 26+from Queue import Empty
2627
2728 sys.path.append('..')
2829 import configuration
@@ -204,7 +205,7 @@
205206 for function in tags[tag].keys():
206207 f = tags[tag][function]
207208 value = f(el, bots=bots)
208 - if isinstance(value, list):
 209+ if isinstance(value, dict):
209210 #if type(value) == type({}):
210211 for kw in value:
211212 vars[x][kw] = value[kw]
@@ -232,35 +233,51 @@
233234 location = os.path.join(settings.input_location, language_code, project)
234235 output = os.path.join(settings.input_location, language_code, project, 'txt')
235236 widgets = log.init_progressbar_widgets('Extracting data')
 237+ filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
 238+ settings.encoding) for fh in xrange(settings.max_filehandles)]
236239
237240 while True:
238241 total, processed = 0.0, 0.0
239 - filename = tasks.get(block=False)
240 - tasks.task_done()
 242+ try:
 243+ filename = tasks.get(block=False)
 244+ except Empty:
 245+ break
 246+ finally:
 247+ print tasks.qsize()
 248+ tasks.task_done()
 249+
241250 if filename == None:
242 - print 'There are no more jobs in the queue left.'
 251+ print '\nThere are no more jobs in the queue left.'
243252 break
244253
245254 filesize = file_utils.determine_filesize(location, filename)
246 - fh = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
247 - ns, xml_namespace = wikitree.parser.extract_meta_information(fh)
 255+ print 'Opening %s...' % (os.path.join(location, filename))
 256+ print 'Filesize: %s' % filesize
 257+ fh1 = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
 258+ fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', settings.encoding)
 259+ ns, xml_namespace = wikitree.parser.extract_meta_information(fh1)
248260 ns = build_namespaces_locale(ns, namespaces)
249261 settings.xml_namespace = xml_namespace
250262
251263 pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
252 - for page, article_size in wikitree.parser.read_input(fh):
 264+ for page, article_size in wikitree.parser.read_input(fh1):
253265 title = page.find('title')
254266 total += 1
255267 if verify_article_belongs_namespace(title, ns):
256268 article_id = page.find('id').text
 269+ title = page.find('title').text
257270 revisions = page.findall('revision')
258271 revisions = parse_comments(revisions, remove_numeric_character_references)
259272 output = output_editor_information(revisions, article_id, bot_ids)
260273 write_output(output, filehandles, lock)
 274+ file_utils.write_list_to_csv([article_id, title], fh2)
261275 processed += 1
262276 page.clear()
263277 pbar.update(pbar.currval + article_size)
264 - fh.close()
 278+
 279+ fh1.close()
 280+ fh2.close()
 281+ print 'Closing %s...' % (os.path.join(location, filename))
265282 print 'Total pages: %s' % total
266283 print 'Pages processed: %s (%s)' % (processed, processed / total)
267284
@@ -269,7 +286,8 @@
270287
271288 def group_observations(obs):
272289 '''
273 - mmm forgot the purpose of this function
 290+ This function groups observation by editor id, this way we have to make
 291+ fewer fileopening calls.
274292 '''
275293 d = {}
276294 for o in obs:
@@ -283,17 +301,19 @@
284302 def write_output(observations, filehandles, lock):
285303 observations = group_observations(observations)
286304 for obs in observations:
287 - for i, o in enumerate(observations[obs]):
288 - if i == 0:
289 - fh = filehandles[hash(obs)]
290 - try:
291 - lock.acquire()
 305+ lock.acquire() #lock the write around all edits of an editor for a particular page
 306+ try:
 307+ for i, o in enumerate(observations[obs]):
 308+ if i == 0:
 309+ fh = filehandles[hash(obs)]
292310 file_utils.write_list_to_csv(o, fh)
293 - lock.releas()
294 - except Exception, error:
295 - print error
296311
 312+ except Exception, error:
 313+ print 'Encountered the following error while writing data to %s: %s' % (error, fh)
 314+ finally:
 315+ lock.release()
297316
 317+
298318 def hash(id):
299319 '''
300320 A very simple hash function based on modulo. The except clause has been
@@ -307,8 +327,10 @@
308328
309329
310330 def prepare(output):
311 - file_utils.delete_file(output, None, directory=True)
312 - file_utils.create_directory(output)
 331+ res = file_utils.delete_file(output, None, directory=True)
 332+ if res:
 333+ res = file_utils.create_directory(output)
 334+ return res
313335
314336
315337 def unzip(properties):
@@ -336,7 +358,7 @@
337359 print 'There was an error while extracting %s, please make sure \
338360 that %s is valid archive.' % (fn, fn)
339361 return False
340 -
 362+ print tasks.qsize()
341363 return tasks
342364
343365 def launcher(properties):
@@ -349,17 +371,22 @@
350372 '''
351373 result = True
352374 tasks = unzip(properties)
353 - prepare(properties.language_code, properties.project)
 375+
 376+ output = os.path.join(settings.input_location, properties.language.code,
 377+ properties.project.name, 'txt')
 378+ result = prepare(output)
 379+ if not result:
 380+ return result
 381+
354382 lock = multiprocessing.Lock()
355 - filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
356 - settings.encoding) for fh in xrange(settings.max_filehandles)]
357 - output = os.path.join(settings.input_location, properties.language_code,
358 - properties.project, 'txt')
 383+# filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
 384+# settings.encoding) for fh in xrange(settings.max_filehandles)]
359385
 386+ filehandles = []
360387 consumers = [multiprocessing.Process(target=parse_dumpfile,
361388 args=(tasks,
362 - properties.project,
363 - properties.language_code,
 389+ properties.project.name,
 390+ properties.language.code,
364391 filehandles,
365392 lock,
366393 properties.namespaces))
@@ -369,15 +396,12 @@
370397 tasks.put(None)
371398
372399 for w in consumers:
 400+ print 'Launching process...'
373401 w.start()
374402
375403 tasks.join()
376404 filehandles = [fh.close() for fh in filehandles]
377 -# result = parse_dumpfile(properties.project, file_without_ext,
378 -# properties.language_code,
379 -# namespaces=['0'])
380405
381 -
382406 result = all([consumer.exitcode for consumer in consumers])
383407 return result
384408
@@ -386,8 +410,7 @@
387411 project = 'wiki'
388412 language_code = 'sv'
389413 filename = 'svwiki-latest-stub-meta-history.xml'
390 - #parse_dumpfile(project, filename, language_code)
391 - launcher()
 414+ parse_dumpfile(project, filename, language_code)
392415
393416 if __name__ == '__main__':
394417 debug()
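
The reworked write_output() in extracter.py now takes the lock once around all of an editor's observations and releases it in a finally block, while the modulo-based hash() keeps every editor pinned to one of a fixed pool of output files. A minimal sketch of that combination (Python 2; the function names, the pool size and the sample data are illustrative, not the module's actual API):

import multiprocessing

MAX_FILEHANDLES = 10  # stand-in for settings.max_filehandles

def pick_filehandle(editor_id, filehandles):
    '''Route an editor to a fixed output file with a simple modulo hash.'''
    return filehandles[int(editor_id) % MAX_FILEHANDLES]

def write_editor(editor_id, observations, filehandles, lock):
    '''Write all observations of one editor under a single lock so rows
    from different worker processes cannot interleave.'''
    lock.acquire()
    try:
        fh = pick_filehandle(editor_id, filehandles)
        for obs in observations:
            fh.write('\t'.join(str(o) for o in obs) + '\n')
    finally:
        lock.release()

if __name__ == '__main__':
    lock = multiprocessing.Lock()
    filehandles = [open('%s.csv' % i, 'a') for i in xrange(MAX_FILEHANDLES)]
    write_editor('12345', [['12345', '2011-01-28T22:30:00Z', '42']],
                 filehandles, lock)
    for fh in filehandles:
        fh.close()
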
Index: trunk/tools/editor_trends/etl/sort.py
@@ -83,8 +83,8 @@
8484 '''
8585 Merges smaller sorted files in one big file, no longer used.
8686 '''
87 - fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration, 'w',
88 - settings.encoding)
 87+ fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration,
 88+ 'w', settings.encoding)
8989 lines = 0
9090 for line in heapq.merge(*[readline(filename) for filename in files]):
9191 file_utils.write_list_to_csv(line, fh)
@@ -98,7 +98,8 @@
9999 '''
100100 Writes the sorted file to target
101101 '''
102 - fh = file_utils.create_txt_filehandle(target, filename, 'w', settings.encoding)
 102+ fh = file_utils.create_txt_filehandle(target, filename, 'w',
 103+ settings.encoding)
103104 file_utils.write_list_to_csv(sorted_data, fh)
104105 fh.close()
105106
@@ -114,6 +115,7 @@
115116 tasks.task_done()
116117 if filename == None:
117118 print 'Swallowed a poison pill'
 119+ print tasks.qsize()
118120 break
119121
120122 fh = file_utils.create_txt_filehandle(source,
@@ -129,8 +131,8 @@
130132 sorted_data = mergesort(data)
131133 write_sorted_file(sorted_data, filename, target)
132134 print filename, messages.show(tasks.qsize)
133 - except UnicodeDecodeError:
134 - continue
 135+ except UnicodeDecodeError, e:
 136+ print e
135137 except Empty:
136138 break
137139
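
sort.py merges the individually sorted chunk files with heapq.merge over per-file line generators, which is what keeps the memory footprint flat during the external sort. A minimal standalone sketch of that merge step (Python 2; plain open() stands in for file_utils.create_txt_filehandle and the file names are placeholders):

import heapq

def readline(filename):
    '''Yield the lines of an already-sorted file one at a time.'''
    fh = open(filename, 'r')
    for line in fh:
        yield line
    fh.close()

def merge_sorted_files(filenames, target):
    '''Merge several individually sorted files into one sorted output file.'''
    fh = open(target, 'w')
    for line in heapq.merge(*[readline(f) for f in filenames]):
        fh.write(line)
    fh.close()

if __name__ == '__main__':
    merge_sorted_files(['sorted_0.csv', 'sorted_1.csv'], 'merged_0.txt')
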
Index: trunk/tools/editor_trends/classes/bots.py
@@ -87,7 +87,7 @@
8888 self.add_clock_data()
8989 self.active()
9090 self.data.append(self.dt)
91 - file_utils.write_list_to_csv(self.data, fh, recursive=False, newline=True)
 91+ file_utils.write_list_to_csv(self.data, fh)
9292
9393
9494
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -31,6 +31,8 @@
3232 import ctypes
3333 import sys
3434 import shutil
 35+import multiprocessing
 36+
3537 sys.path.append('..')
3638
3739
@@ -117,7 +119,7 @@
118120 return 'wb'
119121
120122
121 -def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
 123+def write_list_to_csv(data, fh, recursive=False, newline=True):
122124 '''
123125 @data is a list which can contain other lists that will be written as a
124126 single line to a textfile
@@ -126,22 +128,22 @@
127129 The calling function is responsible for:
128130 1) closing the filehandle
129131 '''
 132+ lock = multiprocessing.Lock()
130133 tab = False
131134 wrote_newline = None
132135 if recursive:
133136 recursive = False
 137+ lock.acquire()
134138 for x, d in enumerate(data):
135139 if tab:
136140 fh.write('\t')
137141 if isinstance(d, list):
138 - #if type(d) == type([]):
139142 recursive = write_list_to_csv(d, fh, recursive=True, newline=False)
140143 #when there is a list of lists but no other elements in the first list
141144 #then write a newline.
142145 if len(d) == len(data[x]):
143146 fh.write('\n')
144147 elif isinstance(d, dict):
145 - #elif type(d) == type({}):
146148 tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
147149 else:
148150 fh.write('%s' % d)
@@ -152,8 +154,8 @@
153155 return True
154156 if newline:
155157 fh.write('\n')
 158+ lock.release()
156159
157 -
158160 def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
159161 assert format == 'long' or format == 'wide'
160162
@@ -162,13 +164,10 @@
163165 if write_key:
164166 fh.write('%s\t' % key)
165167 if isinstance(data[key], list):
166 - #if type(data[key]) == type([]):
167168 for d in data[key]:
168169 fh.write('%s\t%s\n' % (key, d))
169170 elif isinstance(data[key], dict):
170 - #elif type(data[key]) == type({}):
171171 write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
172 -# elif getattr(data[key], '__iter__', False):
173172 # for d in data[key]:
174173 # fh.write('%s\t%s\t%s\n' % (key, d, data[key][d]))
175174 else:
@@ -188,8 +187,6 @@
189188 fh.write('%s\t' % (data[key]))
190189 fh.write('\n')
191190
192 - #if type(data[key]) == type([]):
193 - # write_list_to_csv(data[key], fh, recursive=False, newline=True)
194191
195192 def create_txt_filehandle(location, name, mode, encoding):
196193 filename = construct_filename(name, '.csv')
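
The new write_list_to_csv() acquires a multiprocessing.Lock around the write. A lock only serializes writers that hold the same Lock object, so it is normally created once in the parent and handed to every worker process, as extracter.py's launcher() does. A minimal sketch of that sharing pattern (Python 2; the worker and output file names are hypothetical):

import multiprocessing

def worker(lock, filename, label):
    # every process receives the same Lock object from the parent
    lock.acquire()
    try:
        fh = open(filename, 'a')
        fh.write('row written by worker %s\n' % label)
        fh.close()
    finally:
        lock.release()

if __name__ == '__main__':
    lock = multiprocessing.Lock()  # created once, in the parent
    processes = [multiprocessing.Process(target=worker,
                                         args=(lock, 'shared_output.csv', i))
                 for i in xrange(4)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
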
Index: trunk/tools/editor_trends/utils/data_converter.py
@@ -20,6 +20,17 @@
2121
2222 import datetime
2323
 24+def create_windows(var, break_down_first_year=True):
 25+ '''
 26+ This function creates a list of months. If break_down_first_year = True then
 27+ the first year will be split in 3, 6, 9 months as well.
 28+ '''
 29+ years = var.max_year - var.min_year
 30+ windows = [y * 12 for y in xrange(1, years)]
 31+ if break_down_first_year:
 32+ windows = [3, 6, 9] + windows
 33+ return windows
 34+
2435 def convert_seconds_to_date(secs):
2536 #return time.gmtime(secs)
2637 return datetime.datetime.fromtimestamp(secs)
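
The relocated create_windows() converts a variable's year span into month offsets, optionally splitting the first year into 3, 6 and 9 month windows. A small standalone check of that behaviour (Python 2; FakeVariable is a hypothetical stand-in exposing only the two attributes the helper reads, and the function body is copied from the hunk above so the sketch runs on its own):

def create_windows(var, break_down_first_year=True):
    '''Copy of the relocated helper, included so this sketch is standalone.'''
    years = var.max_year - var.min_year
    windows = [y * 12 for y in xrange(1, years)]
    if break_down_first_year:
        windows = [3, 6, 9] + windows
    return windows

class FakeVariable(object):
    '''Stand-in exposing only min_year and max_year.'''
    def __init__(self, min_year, max_year):
        self.min_year = min_year
        self.max_year = max_year

if __name__ == '__main__':
    var = FakeVariable(2004, 2008)
    print create_windows(var)                               # [3, 6, 9, 12, 24, 36]
    print create_windows(var, break_down_first_year=False)  # [12, 24, 36]
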
