r81374 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r81373 | r81374 | r81375 >
Date: 03:30, 2 February 2011
Author: diederik
Status: deferred
Tags:
Comment:
A bunch of small fixes, mostly corner cases on OS X, printing of Unicode strings, and datetime conversions.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/cronjobs.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/data_converter.py (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/http_utils.py (modified) (history)
  • /trunk/tools/editor_trends/wikilytics/api/models.py (modified) (history)
  • /trunk/tools/editor_trends/wikilytics/templates/datasets.html (modified) (history)
  • /trunk/tools/editor_trends/wikitree/parser.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/manage.py
@@ -104,7 +104,6 @@
     '''
     print 'Start downloading'
     stopwatch = timer.Timer()
-    #project, language, jobtype, task, timer, event = 'start'
     log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='start')
     res = downloader.launcher(properties, settings, logger)
     stopwatch.elapsed()
@@ -153,7 +152,7 @@
     print 'Start storing data in MongoDB'
     stopwatch = timer.Timer()
     log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start')
-    db.cleanup_database(properties.project.name, logger)
+    db.cleanup_database(properties.dbname, logger)
     # write_message_to_log(logger, settings,
     #                      message=None,
     #                      verb='Storing',
@@ -164,7 +163,8 @@
     #                      collection=properties.collection)
     # for key in properties:
     #     print key, getattr(properties, key)
-    store.launcher(properties.sorted, properties.project.name, properties.collection)
+    store.launcher(properties.sorted, properties.dbname, properties.collection)
+
     stopwatch.elapsed()
     log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish')

@@ -173,13 +173,13 @@
     print 'Start transforming dataset'
     stopwatch = timer.Timer()
     log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start')
-    db.cleanup_database(properties.project.name, logger, 'dataset')
+    db.cleanup_database(properties.dbname, logger, 'dataset')
     # write_message_to_log(logger, settings,
     #                      message=None,
     #                      verb='Transforming',
     #                      project=properties.project,
     #                      collection=properties.collection)
-    transformer.transform_editors_single_launcher(properties.project.name,
+    transformer.transform_editors_single_launcher(properties.dbname,
                                                   properties.collection)
     stopwatch.elapsed()
     log.log_to_mongo(properties, 'dataset', 'transform', stopwatch,
@@ -200,7 +200,7 @@
     # dbname=properties.full_project,
     # collection=properties.collection)

-    analyzer.generate_chart_data(properties.project.name,
+    analyzer.generate_chart_data(properties.dbname,
                                  collection,
                                  properties.language.code,
                                  target,
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
@@ -17,9 +17,12 @@
 __date__ = '2011-01-31'
 __version__ = '0.1'

+import datetime
+from dateutil.relativedelta import relativedelta
+from utils import data_converter

 def histogram_by_backward_cohort(var, editor, **kwargs):
-    break_down=kwargs.pop('break_down', False)
+    break_down = kwargs.pop('break_down', False)
     new_wikipedian = editor['new_wikipedian']
     n = editor['edit_count']

@@ -36,6 +39,10 @@
         if w >= editor_dt:
             datum = datetime.datetime(int(year), 12, 31)
             freq = editor['edits_by_year'][year]
+            if datum == datetime.datetime(2003, 12, 31):
+                if w == 24:
+                    if freq == 1.0:
+                        print 'break'
             var.add(datum, {w:{freq:1}})
             break
-    return var
\ No newline at end of file
+    return var
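The new dateutil import suggests the plugin works out an editor's age in whole months before assigning a backward cohort, while the triple-nested if that prints 'break' reads as a temporary debugging trap left in the commit. A minimal sketch of the kind of month-delta arithmetic relativedelta enables; the variable names and dates below are illustrative, not taken from the plugin:

    import datetime
    from dateutil.relativedelta import relativedelta

    new_wikipedian = datetime.datetime(2003, 5, 1)   # first qualifying edit
    cutoff = datetime.datetime(2003, 12, 31)         # end of a cohort year
    delta = relativedelta(cutoff, new_wikipedian)
    editor_dt = delta.years * 12 + delta.months      # editor age in months
    print editor_dt                                  # prints 7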
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -167,15 +167,16 @@


 if __name__ == '__main__':
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year')
-    generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0)
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0)
+    generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year')
+    #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0)

     #available_analyses()
Index: trunk/tools/editor_trends/wikitree/parser.py
@@ -61,7 +61,11 @@
 for ns in namespaces:
     key = ns.get('key')
     d[key] = extract_text(ns)
-    print ns.get('key'), ns.text
+    text = ns.text if ns.text != None else ''
+    try:
+        print key, text.encode(settings.encoding)
+    except UnicodeEncodeError:
+        print key
 return d


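This is the Unicode-printing fix called out in the commit message: namespace titles that the console encoding cannot represent no longer crash the parser. A standalone Python 2 sketch of the same guard; 'ascii' stands in for settings.encoding here so the fallback branch is actually exercised:

    encoding = 'ascii'                        # stand-in for settings.encoding
    key = '4'
    text = u'\u0412\u0438\u043a\u0438'        # a title outside the console charset
    try:
        print key, text.encode(encoding)
    except UnicodeEncodeError:
        print key                             # unprintable title: print the key only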
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -39,7 +39,6 @@
 success = True
 chunk = 1024 * 4

-
 while True:
     filename = task_queue.get(block=False)
     task_queue.task_done()
@@ -58,15 +57,16 @@
                 filename)
     mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.settings.timestamp_server)
     if file_utils.check_file_exists(properties.location, filename):
-        #This can be activated as soon as bug 21575 is fixed.
-        properties.force = True
         mod_loc = file_utils.get_modified_date(properties.location, filename)
-        if mod_loc != mod_date and properties.force == False:
+        if mod_loc == mod_date and (properties.force == False or properties.force == None):
             print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name)
-            break
+            continue

     if filemode == 'w':
-        fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding)
+        fh = file_utils.create_txt_filehandle(properties.location,
+                                              filename,
+                                              filemode,
+                                              properties.settings.encoding)
     else:
         fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')

@@ -92,27 +92,23 @@

 except urllib2.URLError, error:
     print 'Reason: %s' % error
-    success = False
 except urllib2.HTTPError, error:
     print 'Error: %s' % error
-    success = False
 finally:
     fh.close()
     file_utils.set_modified_data(mod_date, properties.location, filename)

-return success


 def launcher(properties, settings, logger):
     print 'Creating list of files to be downloaded...'
-    result = True
     tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location,
                                              properties.dump_relative_path,
                                              properties.dump_filename)
     #print tasks.qsize()
     #if tasks.qsize() < properties.settings.number_of_processes:
     #    properties.settings.number_of_processes = tasks.qsize()
-    if tasks.qsize() > 1:
+    if tasks.qsize() > 2:
         consumers = [multiprocessing.Process(target=download_wiki_file,
                                              args=(tasks, properties))
                      for i in xrange(properties.settings.number_of_processes)]
@@ -124,8 +120,7 @@
     w.start()

 tasks.join()
-for consumer in consumers:
-    if consumer.exitcode != 0:
-        result = False
+# for consumer in consumers:
+#     if consumer.exitcode != 0:
+#         result = False

-return result
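Two behavioural changes here are easy to miss: the date comparison is inverted, so a dump is now skipped only when the local and remote modification dates match and no re-download is forced, and continue replaces break, so the worker moves on to the next queued file instead of quitting. A sketch of the guard in isolation; the helper name should_skip is hypothetical:

    import datetime

    def should_skip(mod_loc, mod_date, force):
        # skip only when timestamps match and no forced re-download was requested
        return mod_loc == mod_date and (force == False or force == None)

    stamp = datetime.datetime(2010, 3, 15, 7, 7, 30)
    print should_skip(stamp, stamp, None)     # True  -> continue with next file
    print should_skip(stamp, stamp, True)     # False -> re-download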
Index: trunk/tools/editor_trends/wikilytics/api/models.py
@@ -69,7 +69,6 @@
 def __unicode__(self):
     return u'%s%s' % (self.language_code, self.project)

-
 @permalink
 def get_absolute_url(self):
     if self.jobtype != 'dataset':
Index: trunk/tools/editor_trends/wikilytics/templates/datasets.html
@@ -33,7 +33,7 @@
 {% for job in jobs %}
 <ul>
   <li>Project: {{ job.project }}</li>
-  <li>Language: {{ job.language }}</li>
+  <li>Language: {{ job.language_code }}</li>
   <li>Created: {{ job.created }}</li>
   <li>Finished: {{ job.finished }}</li>
   <li>In progress: {{ job.in_progress }}</li>
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -56,7 +56,7 @@
 self.settings.input_location != None else self.get_value('location')
 self.project = self.update_project_settings()
 self.language = self.update_language_settings()
-
+self.dbname = '%s%s' % (self.language.code, self.project.name)
 self.targets = self.split_keywords(self.get_value('charts'))
 self.keywords = self.split_keywords(self.get_value('keywords'))
 self.function = self.get_value('func')
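This new dbname attribute is what manage.py above now hands to the database helpers in place of properties.project.name; it is simply the language code concatenated with the project name. A quick illustration with example values for self.language.code and self.project.name:

    language_code = 'en'                      # example value of self.language.code
    project_name = 'wiki'                     # example value of self.project.name
    dbname = '%s%s' % (language_code, project_name)
    print dbname                              # prints 'enwiki'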
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -25,6 +25,8 @@

 import re
 import htmlentitydefs
+import time
+import datetime
 import cPickle
 import codecs
 import os
@@ -157,7 +159,7 @@
     lock.release()

 def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
-    assert format == 'long' or format == 'wide', 'Format should either be long or wide.'
+    assert format == 'long' or format == 'wide', 'Format should either be long or wide.'

     if format == 'long':
         for key in keys:
@@ -231,18 +233,22 @@

 def set_modified_data(mod_rem, location, filename):
     '''
-    Mod_rem is the modified date of the remote file (the Wikimedia dump file)
+    Mod_rem is the modified date of the remote file (the Wikimedia dump file),
     Mon, 15 Mar 2010 07:07:30 GMT Example server timestamp
     '''
+    assert isinstance(mod_rem, datetime.datetime), '''The mod_rem variable should
+    be an instane of datetime.datetime.'''
     path = os.path.join(location, filename)
-    print mod_rem
-    #smod_rem = text_utils.convert_timestamp_to_datetime_naive(mod_rem, settings.timestamp_format)
+    mod_rem = mod_rem.timetuple()
+    mod_rem = int(time.mktime(mod_rem.timetuple()))
     os.utime(path, (mod_rem, mod_rem))
     #sraise exceptions.NotYetImplementedError(set_modified_data)

 def get_modified_date(location, filename):
     path = os.path.join(location, filename)
-    return os.stat(path).st_mtime
+    mod_date = os.stat(path).st_mtime
+    mod_date = datetime.datetime.fromtimestamp(mod_date)
+    return mod_date


 def check_file_exists(location, filename):
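set_modified_data now accepts a datetime and converts it to the epoch seconds that os.utime expects, while get_modified_date turns the os.stat mtime back into a datetime, so the downloader compares like with like. As committed, the hunk calls timetuple() twice in a row, and a struct_time has no timetuple() method; a single call, as in this sketch of the round trip, appears to be what was intended (example.txt is a throwaway file):

    import datetime
    import os
    import time

    path = 'example.txt'                      # throwaway demo file
    open(path, 'w').close()

    # datetime -> epoch seconds: one timetuple() call is enough
    mod_rem = datetime.datetime(2010, 3, 15, 7, 7, 30)
    stamp = int(time.mktime(mod_rem.timetuple()))
    os.utime(path, (stamp, stamp))

    # epoch seconds -> datetime, mirroring get_modified_date
    mod_date = datetime.datetime.fromtimestamp(os.stat(path).st_mtime)
    print mod_date == mod_rem                 # prints True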
Index: trunk/tools/editor_trends/utils/http_utils.py
@@ -74,10 +74,12 @@
 else:
     print 'Added chunk to download: %s' % f
     task_queue.put(f)
-# if x < settings.number_of_processes:
-#     settings.number_of_processes = x
-for x in xrange(settings.number_of_processes):
-    task_queue.put(None)
+if x == 1:
+    for x in xrange(1):
+        task_queue.put(None)
+else:
+    for x in xrange(settings.number_of_processes):
+        task_queue.put(None)
 return task_queue


@@ -106,7 +108,6 @@

 def determine_modified_date(domain, path, filename):
     res = get_headers(domain, path, filename)
-    print res.__dict__
     if res != None and (res.status == 200 or res.status == 301):
         return res.getheader('last-modified', -1)
     else:
@@ -129,11 +130,6 @@
 print mod_date
 mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, '%a, %d %b %Y %H:%M:%S %Z')
 print mod_date
-#check_remote_path_exists(domain, path, filename)
-#read_directory_contents(domain, path)
-# download_wp_dump('http://download.wikimedia.org/enwiki/latest',
-#                  'enwiki-latest-page_props.sql.gz',
-#                  settings.input_location)


 if __name__ == '__main__':
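The sentinel counts matter because each download worker in downloader.py exits when it dequeues None: with a single file queued (x == 1) one poison pill suffices, otherwise every consumer process needs its own. A self-contained sketch of the pattern; number_of_processes and the file list are example values:

    import multiprocessing

    number_of_processes = 4                   # example settings value
    files = ['enwiki-latest-page_props.sql.gz']  # example download list

    task_queue = multiprocessing.JoinableQueue()
    for f in files:
        task_queue.put(f)

    # one sentinel per consumer that will read the queue, so every
    # worker eventually dequeues a None and terminates
    sentinels = 1 if len(files) == 1 else number_of_processes
    for _ in xrange(sentinels):
        task_queue.put(None)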
Index: trunk/tools/editor_trends/utils/data_converter.py
@@ -55,9 +55,9 @@
 for obs in var['obs'][date]['data']:
     if ds.format == 'long':
         if isinstance(var['obs'][date]['data'], dict):
-            for subdata in var['obs'][date]['data']:
-                for k,v in var['obs'][date]['data'][subdata]['data'].iteritems():
-                    o.append([datum, obs, k, v])
+            #for subdata in var['obs'][date]['data']:
+            for k, v in var['obs'][date]['data'][obs]['data'].iteritems():
+                o.append([datum, obs, k, v])
         else:
             o.append([datum, obs, var['obs'][date]['data'][obs]])
 data.extend(o)
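The corrected long-format branch looks up the inner dict for the current obs key instead of re-looping over every subdata key, which had repeated every row once per obs key; the result is one [datum, obs, k, v] row per inner key/value pair. A toy illustration of the nested shape the diff implies (the values are made up):

    datum = '2003-12-31'
    data = {24: {'data': {1.0: 3}},           # window -> {frequency: count}
            36: {'data': {2.0: 5}}}

    o = []
    for obs in data:
        for k, v in data[obs]['data'].iteritems():
            o.append([datum, obs, k, v])
    print o                                   # one row per inner pair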
Index: trunk/tools/editor_trends/cronjobs.py
@@ -39,6 +39,9 @@
 project = pjc.get_project(task['project'])
 lnc = languages.LanguageContainer()
 language = lnc.get_language(task['language_code'])
+
+args.language = language.name
+args.project = project.name
 rts = runtime_settings.RunTimeSettings(project, language, settings, args)
 res = manager.all_launcher(rts, settings, None)
 return res
@@ -86,7 +89,6 @@
 tasks = []
 jobs = coll.find({'finished': False, 'in_progress': False, 'error': False})
 for job in jobs:
-    job['language_code'] = u'nl'
     tasks.append(job)

 for task in tasks:
@@ -113,8 +115,11 @@
     launcher()


-
 if __name__ == '__main__':
+    x = 0
     while True:
         launcher()
-        time.sleep(5 * 60)
+        time.sleep(x * 60)
+        x = +1
+        if x > 30:
+            x = 0
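The daemon loop now scales its sleep by a counter instead of a fixed five minutes. As committed, x = +1 assigns positive one rather than incrementing, so after the first pass the loop sleeps a constant sixty seconds and the x > 30 cap never fires; the apparently intended ramp would use x += 1, as in this sketch (launcher is stubbed out and the loop bounded so the example terminates):

    import time

    def launcher():
        pass                                  # stand-in for cronjobs.launcher()

    x = 0
    for _ in xrange(3):                       # the real cronjob loops forever
        launcher()
        time.sleep(x * 60)
        x += 1                                # committed code has 'x = +1'
        if x > 30:
            x = 0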
