Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -104,7 +104,6 @@ |
105 | 105 | ''' |
106 | 106 | print 'Start downloading' |
107 | 107 | stopwatch = timer.Timer() |
108 | | - #project, language, jobtype, task, timer, event = 'start' |
109 | 108 | log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='start') |
110 | 109 | res = downloader.launcher(properties, settings, logger) |
111 | 110 | stopwatch.elapsed() |
— | — | @@ -153,7 +152,7 @@ |
154 | 153 | print 'Start storing data in MongoDB' |
155 | 154 | stopwatch = timer.Timer() |
156 | 155 | log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start') |
157 | | - db.cleanup_database(properties.project.name, logger) |
| 156 | + db.cleanup_database(properties.dbname, logger) |
158 | 157 | # write_message_to_log(logger, settings, |
159 | 158 | # message=None, |
160 | 159 | # verb='Storing', |
— | — | @@ -164,7 +163,8 @@ |
165 | 164 | # collection=properties.collection) |
166 | 165 | # for key in properties: |
167 | 166 | # print key, getattr(properties, key) |
168 | | - store.launcher(properties.sorted, properties.project.name, properties.collection) |
| 167 | + store.launcher(properties.sorted, properties.dbname, properties.collection) |
| 168 | + |
169 | 169 | stopwatch.elapsed() |
170 | 170 | log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish') |
171 | 171 | |
— | — | @@ -173,13 +173,13 @@ |
174 | 174 | print 'Start transforming dataset' |
175 | 175 | stopwatch = timer.Timer() |
176 | 176 | log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start') |
177 | | - db.cleanup_database(properties.project.name, logger, 'dataset') |
| 177 | + db.cleanup_database(properties.dbname, logger, 'dataset') |
178 | 178 | # write_message_to_log(logger, settings, |
179 | 179 | # message=None, |
180 | 180 | # verb='Transforming', |
181 | 181 | # project=properties.project, |
182 | 182 | # collection=properties.collection) |
183 | | - transformer.transform_editors_single_launcher(properties.project.name, |
| 183 | + transformer.transform_editors_single_launcher(properties.dbname, |
184 | 184 | properties.collection) |
185 | 185 | stopwatch.elapsed() |
186 | 186 | log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, |
— | — | @@ -200,7 +200,7 @@ |
201 | 201 | # dbname=properties.full_project, |
202 | 202 | # collection=properties.collection) |
203 | 203 | |
204 | | - analyzer.generate_chart_data(properties.project.name, |
| 204 | + analyzer.generate_chart_data(properties.dbname, |
205 | 205 | collection, |
206 | 206 | properties.language.code, |
207 | 207 | target, |
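
Each pipeline stage in manage.py wraps its work in the same instrumentation bracket: log a start event, run the stage against properties.dbname, report elapsed time, then log a finish event. A minimal sketch of that bracket in isolation; Timer and log_event here are stand-ins for timer.Timer and log.log_to_mongo, whose real signatures are not shown in this diff.

import time

class Timer(object):
    # stand-in for timer.Timer: records a start time on creation
    def __init__(self):
        self.t0 = time.time()

    def elapsed(self):
        print 'Processing time: %.1f seconds' % (time.time() - self.t0)

def log_event(stage, event):
    # stand-in for log.log_to_mongo(properties, 'dataset', stage, ...)
    print 'dataset/%s: %s' % (stage, event)

stopwatch = Timer()
log_event('store', 'start')
# ... the stage itself would run here, keyed on properties.dbname ...
stopwatch.elapsed()
log_event('store', 'finish')
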
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py |
— | — | @@ -17,9 +17,12 @@ |
18 | 18 | __date__ = '2011-01-31' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import datetime |
| 22 | +from dateutil.relativedelta import relativedelta |
| 23 | +from utils import data_converter |
21 | 24 | |
22 | 25 | def histogram_by_backward_cohort(var, editor, **kwargs): |
23 | | - break_down=kwargs.pop('break_down', False) |
| 26 | + break_down = kwargs.pop('break_down', False) |
24 | 27 | new_wikipedian = editor['new_wikipedian'] |
25 | 28 | n = editor['edit_count'] |
26 | 29 | |
— | — | @@ -36,6 +39,10 @@ |
37 | 40 | if w >= editor_dt: |
38 | 41 | datum = datetime.datetime(int(year), 12, 31) |
39 | 42 | freq = editor['edits_by_year'][year] |
 | 43 | + # debugging probe for one specific observation
 | 44 | + if (datum == datetime.datetime(2003, 12, 31)
 | 45 | + and w == 24 and freq == 1.0):
 | 46 | + print 'break'
40 | 47 | var.add(datum, {w:{freq:1}}) |
41 | 48 | break |
42 | | - return var |
\ No newline at end of file |
| 49 | + return var |
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -167,15 +167,16 @@ |
168 | 168 | |
169 | 169 | |
170 | 170 | if __name__ == '__main__': |
171 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
172 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year') |
173 | | - generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year') |
174 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year') |
175 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0) |
176 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide') |
177 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
178 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0) |
179 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
180 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0) |
| 171 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
| 172 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
| 173 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', time_unit='year') |
| 174 | + #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year') |
| 175 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', time_unit='year') |
| 176 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0) |
| 177 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide') |
| 178 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
| 179 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_edits', time_unit='year', cutoff=0) |
| 180 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
| 181 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'new_editor_count', time_unit='month', cutoff=0) |
181 | 182 | |
182 | 183 | #available_analyses() |
Index: trunk/tools/editor_trends/wikitree/parser.py |
— | — | @@ -61,7 +61,11 @@ |
62 | 62 | for ns in namespaces: |
63 | 63 | key = ns.get('key') |
64 | 64 | d[key] = extract_text(ns) |
65 | | - print ns.get('key'), ns.text |
 | 65 | + text = ns.text if ns.text is not None else ''
| 66 | + try: |
| 67 | + print key, text.encode(settings.encoding) |
| 68 | + except UnicodeEncodeError: |
| 69 | + print key |
66 | 70 | return d |
67 | 71 | |
68 | 72 | |
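
The change above avoids UnicodeEncodeError crashes when a namespace title cannot be represented in the console encoding. A sketch of the same pattern under Python 2, where 'utf-8' is a stand-in for settings.encoding and the namespace dict is hypothetical:

# -*- coding: utf-8 -*-
encoding = 'utf-8'  # stand-in for settings.encoding
namespaces = {0: u'', 4: u'Wikip\xe9dia'}
for key, text in namespaces.iteritems():
    text = text if text is not None else ''
    try:
        print key, text.encode(encoding)
    except UnicodeEncodeError:
        # fall back to the key alone when the title cannot be encoded
        print key
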
Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -39,7 +39,6 @@ |
40 | 40 | success = True |
41 | 41 | chunk = 1024 * 4 |
42 | 42 | |
43 | | - |
44 | 43 | while True: |
45 | 44 | filename = task_queue.get(block=False) |
46 | 45 | task_queue.task_done() |
— | — | @@ -58,15 +57,16 @@ |
59 | 58 | filename) |
60 | 59 | mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.settings.timestamp_server) |
61 | 60 | if file_utils.check_file_exists(properties.location, filename): |
62 | | - #This can be activated as soon as bug 21575 is fixed. |
63 | | - properties.force = True |
64 | 61 | mod_loc = file_utils.get_modified_date(properties.location, filename) |
65 | | - if mod_loc != mod_date and properties.force == False: |
 | 62 | + if mod_loc == mod_date and not properties.force:
66 | 63 | print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name) |
67 | | - break |
| 64 | + continue |
68 | 65 | |
69 | 66 | if filemode == 'w': |
70 | | - fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding) |
| 67 | + fh = file_utils.create_txt_filehandle(properties.location, |
| 68 | + filename, |
| 69 | + filemode, |
| 70 | + properties.settings.encoding) |
71 | 71 | else: |
72 | 72 | fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb') |
73 | 73 | |
— | — | @@ -92,27 +92,23 @@ |
93 | 93 | |
94 | 94 | except urllib2.URLError, error: |
95 | 95 | print 'Reason: %s' % error |
96 | | - success = False |
97 | 96 | except urllib2.HTTPError, error: |
98 | 97 | print 'Error: %s' % error |
99 | | - success = False |
100 | 98 | finally: |
101 | 99 | fh.close() |
102 | 100 | file_utils.set_modified_data(mod_date, properties.location, filename) |
103 | 101 | |
104 | | - return success |
105 | 102 | |
106 | 103 | |
107 | 104 | def launcher(properties, settings, logger): |
108 | 105 | print 'Creating list of files to be downloaded...' |
109 | | - result = True |
110 | 106 | tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location, |
111 | 107 | properties.dump_relative_path, |
112 | 108 | properties.dump_filename) |
113 | 109 | #print tasks.qsize() |
114 | 110 | #if tasks.qsize() < properties.settings.number_of_processes: |
115 | 111 | # properties.settings.number_of_processes = tasks.qsize() |
116 | | - if tasks.qsize() > 1: |
| 112 | + if tasks.qsize() > 2: |
117 | 113 | consumers = [multiprocessing.Process(target=download_wiki_file, |
118 | 114 | args=(tasks, properties)) |
119 | 115 | for i in xrange(properties.settings.number_of_processes)] |
— | — | @@ -124,8 +120,7 @@ |
125 | 121 | w.start() |
126 | 122 | |
127 | 123 | tasks.join() |
128 | | - for consumer in consumers: |
129 | | - if consumer.exitcode != 0: |
130 | | - result = False |
| 124 | +# for consumer in consumers: |
| 125 | +# if consumer.exitcode != 0: |
| 126 | +# result = False |
131 | 127 | |
132 | | - return result |
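
The rewritten check above skips a download only when the local and remote modification timestamps agree and no re-download was forced, and it now continues to the next queued file instead of breaking out of the loop. A small sketch of that predicate, with hypothetical datetime values:

import datetime

def should_skip_download(mod_loc, mod_date, force):
    # Skip only when the local and remote timestamps agree and no
    # re-download was forced; both are naive datetime objects
    # (see the file_utils changes below).
    return mod_loc == mod_date and not force

stamp = datetime.datetime(2010, 3, 15, 7, 7, 30)
print should_skip_download(stamp, stamp, False)  # True: dump is current
print should_skip_download(stamp, stamp, True)   # False: forced re-download
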
Index: trunk/tools/editor_trends/wikilytics/api/models.py |
— | — | @@ -69,7 +69,6 @@ |
70 | 70 | def __unicode__(self): |
71 | 71 | return u'%s%s' % (self.language_code, self.project) |
72 | 72 | |
73 | | - |
74 | 73 | @permalink |
75 | 74 | def get_absolute_url(self): |
76 | 75 | if self.jobtype != 'dataset': |
Index: trunk/tools/editor_trends/wikilytics/templates/datasets.html |
— | — | @@ -33,7 +33,7 @@ |
34 | 34 | {% for job in jobs %} |
35 | 35 | <ul> |
36 | 36 | <li>Project: {{ job.project }}</li> |
37 | | - <li>Language: {{ job.language }}</li> |
| 37 | + <li>Language: {{ job.language_code }}</li> |
38 | 38 | <li>Created: {{ job.created }}</li> |
39 | 39 | <li>Finished: {{ job.finished }}</li> |
40 | 40 | <li>In progress: {{ job.in_progress }}</li> |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -56,7 +56,7 @@ |
57 | 57 | self.settings.input_location != None else self.get_value('location') |
58 | 58 | self.project = self.update_project_settings() |
59 | 59 | self.language = self.update_language_settings() |
60 | | - |
| 60 | + self.dbname = '%s%s' % (self.language.code, self.project.name) |
61 | 61 | self.targets = self.split_keywords(self.get_value('charts')) |
62 | 62 | self.keywords = self.split_keywords(self.get_value('keywords')) |
63 | 63 | self.function = self.get_value('func') |
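
The new dbname attribute is the language code concatenated with the project name, and the other files in this changeset now use it in place of properties.project.name. For illustration (the values are hypothetical; the real ones come from the language and project containers):

language_code = 'en'   # illustrative
project_name = 'wiki'  # illustrative
dbname = '%s%s' % (language_code, project_name)
print dbname  # enwiki
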
Index: trunk/tools/editor_trends/utils/file_utils.py |
— | — | @@ -25,6 +25,8 @@ |
26 | 26 | |
27 | 27 | import re |
28 | 28 | import htmlentitydefs |
| 29 | +import time |
| 30 | +import datetime |
29 | 31 | import cPickle |
30 | 32 | import codecs |
31 | 33 | import os |
— | — | @@ -157,7 +159,7 @@ |
158 | 160 | lock.release() |
159 | 161 | |
160 | 162 | def write_dict_to_csv(data, fh, keys, write_key=True, format='long'): |
161 | | - assert format == 'long' or format == 'wide', 'Format should either be long or wide.' |
| 163 | + assert format == 'long' or format == 'wide', 'Format should either be long or wide.' |
162 | 164 | |
163 | 165 | if format == 'long': |
164 | 166 | for key in keys: |
— | — | @@ -231,18 +233,22 @@ |
232 | 234 | |
233 | 235 | def set_modified_data(mod_rem, location, filename): |
234 | 236 | ''' |
235 | | - Mod_rem is the modified date of the remote file (the Wikimedia dump file) |
| 237 | + Mod_rem is the modified date of the remote file (the Wikimedia dump file), |
236 | 238 | Mon, 15 Mar 2010 07:07:30 GMT Example server timestamp |
237 | 239 | ''' |
| 240 | + assert isinstance(mod_rem, datetime.datetime), '''The mod_rem variable should |
 | 241 | + be an instance of datetime.datetime.'''
238 | 242 | path = os.path.join(location, filename) |
239 | | - print mod_rem |
240 | | - #smod_rem = text_utils.convert_timestamp_to_datetime_naive(mod_rem, settings.timestamp_format) |
| 243 | + mod_rem = mod_rem.timetuple() |
 | 244 | + mod_rem = int(time.mktime(mod_rem))
241 | 245 | os.utime(path, (mod_rem, mod_rem)) |
242 | 246 | #sraise exceptions.NotYetImplementedError(set_modified_data) |
243 | 247 | |
244 | 248 | def get_modified_date(location, filename): |
245 | 249 | path = os.path.join(location, filename) |
246 | | - return os.stat(path).st_mtime |
| 250 | + mod_date = os.stat(path).st_mtime |
| 251 | + mod_date = datetime.datetime.fromtimestamp(mod_date) |
| 252 | + return mod_date |
247 | 253 | |
248 | 254 | |
249 | 255 | def check_file_exists(location, filename): |
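
With these changes set_modified_data and get_modified_date form a round-trip between naive datetime objects and a file's mtime. A minimal sketch under the same assumption (naive local timestamps; the file name is hypothetical):

import os
import time
import datetime

def roundtrip_mtime(path, mod_rem):
    # datetime -> epoch seconds; assumes mod_rem is a naive local timestamp
    epoch = int(time.mktime(mod_rem.timetuple()))
    os.utime(path, (epoch, epoch))
    # epoch seconds -> datetime, mirroring get_modified_date
    return datetime.datetime.fromtimestamp(os.stat(path).st_mtime)

open('example.txt', 'w').close()  # hypothetical file for the demonstration
stamp = datetime.datetime(2010, 3, 15, 7, 7, 30)
print roundtrip_mtime('example.txt', stamp) == stamp  # True, barring DST ambiguity
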
Index: trunk/tools/editor_trends/utils/http_utils.py |
— | — | @@ -74,10 +74,12 @@ |
75 | 75 | else: |
76 | 76 | print 'Added chunk to download: %s' % f |
77 | 77 | task_queue.put(f) |
78 | | -# if x < settings.number_of_processes: |
79 | | -# settings.number_of_processes = x |
80 | | - for x in xrange(settings.number_of_processes): |
81 | | - task_queue.put(None) |
| 78 | + if x == 1: |
 | 79 | + # a single chunk needs only one poison pill
 | 80 | + task_queue.put(None)
 | 81 | + else:
 | 82 | + for i in xrange(settings.number_of_processes):
 | 83 | + task_queue.put(None)
82 | 84 | return task_queue |
83 | 85 | |
84 | 86 | |
— | — | @@ -106,7 +108,6 @@ |
107 | 109 | |
108 | 110 | def determine_modified_date(domain, path, filename): |
109 | 111 | res = get_headers(domain, path, filename) |
110 | | - print res.__dict__ |
111 | 112 | if res != None and (res.status == 200 or res.status == 301): |
112 | 113 | return res.getheader('last-modified', -1) |
113 | 114 | else: |
— | — | @@ -129,11 +130,6 @@ |
130 | 131 | print mod_date |
131 | 132 | mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, '%a, %d %b %Y %H:%M:%S %Z') |
132 | 133 | print mod_date |
133 | | - #check_remote_path_exists(domain, path, filename) |
134 | | - #read_directory_contents(domain, path) |
135 | | -# download_wp_dump('http://download.wikimedia.org/enwiki/latest', |
136 | | -# 'enwiki-latest-page_props.sql.gz', |
137 | | -# settings.input_location) |
138 | 134 | |
139 | 135 | |
140 | 136 | if __name__ == '__main__': |
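
create_list_dumpfiles now ends by queueing one None sentinel per consumer (or a single sentinel when only one chunk was found), so each download worker can exit cleanly. A self-contained sketch of that poison-pill pattern; number_of_processes is a stand-in for settings.number_of_processes:

import multiprocessing

def worker(task_queue):
    while True:
        task = task_queue.get()
        if task is None:  # poison pill: this worker is done
            break
        print 'downloading %s' % task

if __name__ == '__main__':
    number_of_processes = 2  # stand-in for settings.number_of_processes
    tasks = multiprocessing.Queue()
    for f in ['dump-p1.xml.gz', 'dump-p2.xml.gz', 'dump-p3.xml.gz']:
        tasks.put(f)
    for _ in xrange(number_of_processes):
        tasks.put(None)
    consumers = [multiprocessing.Process(target=worker, args=(tasks,))
                 for _ in xrange(number_of_processes)]
    for c in consumers:
        c.start()
    for c in consumers:
        c.join()
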
Index: trunk/tools/editor_trends/utils/data_converter.py |
— | — | @@ -55,9 +55,9 @@ |
56 | 56 | for obs in var['obs'][date]['data']: |
57 | 57 | if ds.format == 'long': |
58 | 58 | if isinstance(var['obs'][date]['data'], dict): |
59 | | - for subdata in var['obs'][date]['data']: |
60 | | - for k,v in var['obs'][date]['data'][subdata]['data'].iteritems(): |
61 | | - o.append([datum, obs, k, v]) |
| 59 | + #for subdata in var['obs'][date]['data']: |
| 60 | + for k, v in var['obs'][date]['data'][obs]['data'].iteritems(): |
| 61 | + o.append([datum, obs, k, v]) |
62 | 62 | else: |
63 | 63 | o.append([datum, obs, var['obs'][date]['data'][obs]]) |
64 | 64 | data.extend(o) |
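
The corrected loop indexes the nested dict by the current observation instead of re-iterating every sub-key, which previously emitted duplicate rows. A toy version of the long-format flattening, using a hypothetical observation with the same shape ({window: {'data': {frequency: count}}}):

obs_data = {24: {'data': {1.0: 3, 2.0: 1}}}  # hypothetical observation
datum = '2003-12-31'
rows = []
for obs in obs_data:
    for k, v in obs_data[obs]['data'].iteritems():
        rows.append([datum, obs, k, v])
print rows  # two long-format rows; dict iteration order may vary
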
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -39,6 +39,9 @@ |
40 | 40 | project = pjc.get_project(task['project']) |
41 | 41 | lnc = languages.LanguageContainer() |
42 | 42 | language = lnc.get_language(task['language_code']) |
| 43 | + |
| 44 | + args.language = language.name |
| 45 | + args.project = project.name |
43 | 46 | rts = runtime_settings.RunTimeSettings(project, language, settings, args) |
44 | 47 | res = manager.all_launcher(rts, settings, None) |
45 | 48 | return res |
— | — | @@ -86,7 +89,6 @@ |
87 | 90 | tasks = [] |
88 | 91 | jobs = coll.find({'finished': False, 'in_progress': False, 'error': False}) |
89 | 92 | for job in jobs: |
90 | | - job['language_code'] = u'nl' |
91 | 93 | tasks.append(job) |
92 | 94 | |
93 | 95 | for task in tasks: |
— | — | @@ -113,8 +115,11 @@ |
114 | 116 | launcher() |
115 | 117 | |
116 | 118 | |
117 | | - |
118 | 119 | if __name__ == '__main__': |
| 120 | + x = 0 |
119 | 121 | while True: |
120 | 122 | launcher() |
121 | | - time.sleep(5 * 60) |
| 123 | + time.sleep(x * 60) |
 | 124 | + x += 1
| 125 | + if x > 30: |
| 126 | + x = 0 |
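
With the increment fixed to x += 1, the polling loop sleeps zero minutes on its first pass and then backs off by one extra minute per cycle, wrapping after 30 minutes. The cadence in isolation:

def poll_delays(cycles):
    # yield the sleep in seconds for each polling cycle: zero at first,
    # one extra minute per cycle, wrapping after 30 minutes
    x = 0
    for _ in xrange(cycles):
        yield x * 60
        x += 1
        if x > 30:
            x = 0

print list(poll_delays(5))  # [0, 60, 120, 180, 240]
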