Index: trunk/tools/editor_trends/manage.py
@@ -173,13 +173,13 @@
     print 'Start transforming dataset'
     stopwatch = timer.Timer()
     log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start')
-    db.cleanup_database(properties.project, logger, 'dataset')
+    db.cleanup_database(properties.project.name, logger, 'dataset')
     # write_message_to_log(logger, settings,
     #                      message=None,
     #                      verb='Transforming',
     #                      project=properties.project,
     #                      collection=properties.collection)
-    transformer.transform_editors_single_launcher(properties.project,
+    transformer.transform_editors_single_launcher(properties.project.name,
                                                   properties.collection)
     stopwatch.elapsed()
     log.log_to_mongo(properties, 'dataset', 'transform', stopwatch,
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -17,7 +17,11 @@
 __date__ = '2011-01-25'
 __version__ = '0.1'
 
+import datetime
+from dateutil.relativedelta import relativedelta
+from utils import data_converter
 
+
 def cohort_dataset_backward_bar(var, editor, **kwargs):
     #first_edit = editor['first_edit']
     '''
@@ -30,7 +34,7 @@
     n = editor['edit_count']
 
     if n >= var.cum_cutoff:
-        windows = create_windows(var, break_down_first_year=False)
+        windows = data_converter.create_windows(var, break_down_first_year=False)
         for year in xrange(new_wikipedian.year, var.max_year):
             year = str(year)
             if editor['edits_by_year'][year] >= var.cutoff:
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -94,8 +94,6 @@
                                   format=fmt)
     var = dataset.Variable('count', **kwargs)
 
-
-
     for editor in editors:
         editor = coll.find_one({'editor': editor})
         var = func(var, editor, dbname=dbname)
@@ -168,25 +166,14 @@
     return min_year, max_year
 
 
-def create_windows(var, break_down_first_year=True):
-    '''
-    This function creates a list of months. If break_down_first_year = True then
-    the first year will be split in 3, 6, 9 months as well.
-    '''
-    years = var.max_year - var.min_year
-    windows = [y * 12 for y in xrange(1, years)]
-    if break_down_first_year:
-        windows = [3, 6, 9] + windows
-    return windows
-
-
 if __name__ == '__main__':
+    generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
     #generate_chart_data('wiki', 'editors_dataset','en', 'total_number_of_new_wikipedians', time_unit='year')
     #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', time_unit='year')
     #generate_chart_data('wiki', 'editors_dataset','en', 'total_cumulative_edits', time_unit='year')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
-    generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_histogram', 'to_bar_json', time_unit='month', cutoff=5, cum_cutoff=0)
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_backward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=10, cum_cutoff=0, format='wide')
+    #generate_chart_data('wiki', 'editors_dataset', 'en', 'cohort_dataset_forward_bar', 'to_stacked_bar_json', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
     #generate_chart_data('wiki', 'editors_dataset','en', 'histogram_edits', time_unit='year', cutoff=0)
     #generate_chart_data('wiki', 'editors_dataset','en', 'time_to_new_wikipedian', time_unit='year', cutoff=0)
     #generate_chart_data('wiki', 'editors_dataset','en', 'new_editor_count', time_unit='month', cutoff=0)
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -129,33 +129,26 @@
     def __getitem__(self, key):
         return getattr(self, key, [])
 
-    def next(self):
-        try:
-            return len(self.data.keys()) + 1
-        except IndexError:
-            return 0
-
     def add(self, value, update):
         '''
         If update == True then data[i] will be incremented else data[i] will be
         created, in that case make sure that i is unique. Update is useful for
         tallying a variable.
         '''
-        if hasattr(value, '__iter__') == False:
-            d = {}
-            d[0] = value
-            value = d
-        assert type(value) == type({})
-        x = self.next()
-        for i, v in value.iteritems():
-            self.data.setdefault(i, 0)
-            if update:
-                self.data[i] += v
-            else:
-                i += x
-                self.data[i] = v
+        assert isinstance(value, dict)
+        if update:
+            for k, v in value.iteritems():
+                self.data.setdefault(k, 0)
+                self.data[k] += v
+        else:
+            try:
+                i = max(self.data.keys()) + 1
+            except ValueError:
+                i = 0
+            self.data[i] = value
 
 
+
 class Variable(Data):
     '''
     This class constructs a time-based variable.
@@ -337,8 +330,8 @@
         data, all_keys = data_converter.convert_dataset_to_lists(self, 'manage')
         headers = data_converter.add_headers(self, all_keys)
         fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding)
-        file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format)
-        file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
+        file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
+        file_utils.write_list_to_csv(data, fh, recursive=False, newline=True)
         fh.close()
 
     def encode(self):
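
For context, a minimal standalone sketch (not part of the commit) of the two code paths the rewritten add() implements; DataSketch is an invented stand-in for the Data class and only mimics its self.data dict:

class DataSketch(object):
    '''Illustrative stand-in for Data; only the self.data dict matters here.'''
    def __init__(self):
        self.data = {}

    def add(self, value, update):
        assert isinstance(value, dict)
        if update:
            # tally mode: increment a counter per key
            for k, v in value.iteritems():
                self.data.setdefault(k, 0)
                self.data[k] += v
        else:
            # append mode: store the whole dict under the next free integer key
            try:
                i = max(self.data.keys()) + 1
            except ValueError:
                i = 0
            self.data[i] = value


if __name__ == '__main__':
    d = DataSketch()
    d.add({2009: 1, 2010: 3}, update=True)
    d.add({2010: 2}, update=True)
    print d.data  # counters: 2009 -> 1, 2010 -> 5

    e = DataSketch()
    e.add({'edits': 7}, update=False)
    e.add({'edits': 2}, update=False)
    print e.data  # observations stored under keys 0 and 1
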
Index: trunk/tools/editor_trends/etl/store.py
@@ -17,7 +17,7 @@
 __date__ = '2011-01-04'
 __version__ = '0.1'
 
-
+from Queue import Empty
 import multiprocessing
 import sys
 
@@ -41,11 +41,16 @@
     '''
     mongo = db.init_mongo_db(dbname)
     collection = mongo[collection]
+
     editor_cache = cache.EditorCache(collection)
     prev_contributor = -1
     edits = 0
     while True:
-        filename = tasks.get(block=False)
+        try:
+            filename = tasks.get(block=False)
+        except Empty:
+            break
+
         tasks.task_done()
         if filename == None:
             print 'Swallowing a poison pill.'
@@ -53,28 +58,29 @@
         print '%s files left in the queue.' % messages.show(tasks.qsize)
 
         fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding)
+        print fh
         for line in file_utils.read_raw_data(fh):
-            #print line
-            if len(line) == 0:
-                continue
-            contributor = line[0]
-            #print 'Parsing %s' % contributor
-            if prev_contributor != contributor:
-                if edits > 9:
-                    editor_cache.add(prev_contributor, 'NEXT')
-                    #print 'Stored %s' % prev_contributor
-                else:
-                    editor_cache.clear(prev_contributor)
-                edits = 0
-            edits += 1
-            date = text_utils.convert_timestamp_to_datetime_utc(line[1]) #+ datetime.timedelta(days=1)
-            article_id = int(line[2])
-            username = line[3].encode(settings.encoding)
-            value = {'date': date, 'article': article_id, 'username': username}
-            editor_cache.add(contributor, value)
-            prev_contributor = contributor
+            if len(line) > 1:
+                contributor = line[0]
+                #print 'Parsing %s' % contributor
+                if prev_contributor != contributor:
+                    if edits > 9:
+                        editor_cache.add(prev_contributor, 'NEXT')
+                        print 'Stored %s' % prev_contributor
+                    else:
+                        editor_cache.clear(prev_contributor)
+                    edits = 0
+                edits += 1
+                date = text_utils.convert_timestamp_to_datetime_utc(line[1])
+                article_id = int(line[2])
+                username = line[3].encode(settings.encoding)
+                value = {'date': date,
+                         'article': article_id,
+                         'username': username}
+                editor_cache.add(contributor, value)
+                prev_contributor = contributor
         fh.close()
-    print editor_cache.n
+    #print editor_cache.n
 
 
 def launcher(source, dbname, collection):
@@ -88,8 +94,8 @@
     coll.create_index('editor')
 
     files = file_utils.retrieve_file_list(source, 'csv')
-    print files
-    print source
+
+    print 'Input directory is: %s ' % source
     tasks = multiprocessing.JoinableQueue()
     consumers = [multiprocessing.Process(target=store_editors,
                                          args=(tasks, dbname, collection, source))
@@ -106,3 +112,4 @@
 
     tasks.join()
 
+
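
A minimal sketch (invented names, not from the repository) of the consumer pattern store_editors now follows: drain a JoinableQueue with non-blocking gets, treat Queue.Empty as the end of work, and treat None as a poison pill; it assumes the queue is filled before the consumer starts, as in the patched code:

import multiprocessing
from Queue import Empty

def consume(tasks):
    while True:
        try:
            filename = tasks.get(block=False)
        except Empty:
            break
        tasks.task_done()
        if filename is None:
            print 'Swallowing a poison pill.'
            break
        print 'Processing %s' % filename

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    for name in ['a.csv', 'b.csv']:
        tasks.put(name)
    tasks.put(None)  # one poison pill per consumer
    worker = multiprocessing.Process(target=consume, args=(tasks,))
    worker.start()
    tasks.join()
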
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -12,7 +12,6 @@
 http://www.fsf.org/licenses/gpl.html
 '''
 
-
 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 __email__ = 'dvanliere at gmail dot com'
 __date__ = '2011-01-21'
@@ -23,15 +22,15 @@
 import multiprocessing
 import sys
 
-sys.path.append('..')
-import configuration
-settings = configuration.Settings()
+#sys.path.append('..')
+#import configuration
+#settings = configuration.Settings()
 
 from utils import file_utils
 from utils import http_utils
 from utils import log
 
-def download_wiki_file(task_queue, config):
+def download_wiki_file(task_queue, properties):
     '''
     This is a very simple replacement for wget and curl because Windows does
     not have these tools installed by default
@@ -46,26 +45,39 @@
             break
         extension = file_utils.determine_file_extension(filename)
         filemode = file_utils.determine_file_mode(extension)
-        filesize = http_utils.determine_remote_filesize(settings.wp_dump_location, config.path, filename)
+        filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location,
+                                                        properties.dump_relative_path,
+                                                        filename)
+
+#        mod_rem = http_utils.determine_modified_date(properties.settings.wp_dump_location,
+#                                                     properties.dump_relative_path,
+#                                                     filename)
+
+        if file_utils.check_file_exists(properties.location, filename):
+            #This can be activated as soon as bug 21575 is fixed.
+            #mod_loc = file_utils.get_modified_date(properties.location, filename)
+            #if mod_loc != mod_rem:
+            print 'Swallowed a poison pill'
+            break
+
         if filemode == 'w':
-            fh = file_utils.create_txt_filehandle(config.location, filename, filemode, settings.encoding)
+            fh = file_utils.create_txt_filehandle(properties.location, filename, filemode, properties.settings.encoding)
+
         else:
-            fh = file_utils.create_binary_filehandle(config.location, filename, 'wb')
+            fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')
 
         if filesize != -1:
             widgets = log.init_progressbar_widgets(filename)
             pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
 
         try:
-            if filename.endswith('json'):
-                req = urllib2.Request(settings.wp_dump_location + config.path)
-            else:
-                req = urllib2.Request(settings.wp_dump_location + config.path + filename)
+            path = '%s%s' % (properties.dump_absolute_path, filename)
+            req = urllib2.Request(path)
             response = urllib2.urlopen(req)
             while True:
                 data = response.read(chunk)
                 if not data:
-                    print 'Finished downloading %s%s%s.' % (settings.wp_dump_location, config.path, filename)
+                    print 'Finished downloading %s.' % (path)
                     break
                 fh.write(data)
 
@@ -83,22 +95,30 @@
         finally:
             fh.close()
 
+        #file_utils.set_modified_data(mod_rem, properties.location, filename)
+
     return success
 
 
 def launcher(properties, settings, logger):
     print 'Creating list of files to be downloaded...'
-    tasks = http_utils.create_list_dumpfiles(settings.wp_dump_location,
-                                             properties.path,
-                                             properties.filename)
+    result = True
+    tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location,
                                              properties.dump_relative_path,
+                                             properties.dump_filename)
+    #print tasks.qsize()
+    #if tasks.qsize() < properties.settings.number_of_processes:
+    #    properties.settings.number_of_processes = tasks.qsize()
     consumers = [multiprocessing.Process(target=download_wiki_file,
                                          args=(tasks, properties))
-                 for i in xrange(settings.number_of_processes)]
-
+                 for i in xrange(properties.settings.number_of_processes + 1)]
     print 'Starting consumers to download files...'
    for w in consumers:
         w.start()
 
     tasks.join()
-    result = all([consumer.exitcode for consumer in consumers])
+    for consumer in consumers:
+        if consumer.exitcode != 0 and consumer.exitcode != None:
+            result = False
+
     return result
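
The download loop itself is a plain chunked urllib2 read. A self-contained sketch of that pattern, with a hypothetical URL and chunk size and without the project's helper functions:

import urllib2

def download(url, destination, chunk=1024 * 1024):
    '''Stream url to destination in fixed-size chunks, roughly what download_wiki_file does.'''
    fh = open(destination, 'wb')
    try:
        response = urllib2.urlopen(urllib2.Request(url))
        while True:
            data = response.read(chunk)
            if not data:
                print 'Finished downloading %s.' % url
                break
            fh.write(data)
    finally:
        fh.close()

if __name__ == '__main__':
    # hypothetical dump file name, for illustration only
    download('http://dumps.wikimedia.org/svwiki/latest/svwiki-latest-stub-meta-history.xml.gz',
             'svwiki-latest-stub-meta-history.xml.gz')
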
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -22,6 +22,7 @@
 import os
 import multiprocessing
 import progressbar
+from Queue import Empty
 
 sys.path.append('..')
 import configuration
@@ -204,7 +205,7 @@
             for function in tags[tag].keys():
                 f = tags[tag][function]
                 value = f(el, bots=bots)
-                if isinstance(value, list):
+                if isinstance(value, dict):
                     #if type(value) == type({}):
                     for kw in value:
                         vars[x][kw] = value[kw]
@@ -232,35 +233,51 @@
     location = os.path.join(settings.input_location, language_code, project)
     output = os.path.join(settings.input_location, language_code, project, 'txt')
     widgets = log.init_progressbar_widgets('Extracting data')
+    filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
+                                                    settings.encoding) for fh in xrange(settings.max_filehandles)]
 
     while True:
         total, processed = 0.0, 0.0
-        filename = tasks.get(block=False)
-        tasks.task_done()
+        try:
+            filename = tasks.get(block=False)
+        except Empty:
+            break
+        finally:
+            print tasks.qsize()
+            tasks.task_done()
+
         if filename == None:
-            print 'There are no more jobs in the queue left.'
+            print '\nThere are no more jobs in the queue left.'
             break
 
         filesize = file_utils.determine_filesize(location, filename)
-        fh = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
-        ns, xml_namespace = wikitree.parser.extract_meta_information(fh)
+        print 'Opening %s...' % (os.path.join(location, filename))
+        print 'Filesize: %s' % filesize
+        fh1 = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
+        fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', settings.encoding)
+        ns, xml_namespace = wikitree.parser.extract_meta_information(fh1)
         ns = build_namespaces_locale(ns, namespaces)
         settings.xml_namespace = xml_namespace
 
         pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
-        for page, article_size in wikitree.parser.read_input(fh):
+        for page, article_size in wikitree.parser.read_input(fh1):
             title = page.find('title')
             total += 1
             if verify_article_belongs_namespace(title, ns):
                 article_id = page.find('id').text
+                title = page.find('title').text
                 revisions = page.findall('revision')
                 revisions = parse_comments(revisions, remove_numeric_character_references)
                 output = output_editor_information(revisions, article_id, bot_ids)
                 write_output(output, filehandles, lock)
+                file_utils.write_list_to_csv([article_id, title], fh2)
                 processed += 1
             page.clear()
             pbar.update(pbar.currval + article_size)
-        fh.close()
+
+        fh1.close()
+        fh2.close()
+        print 'Closing %s...' % (os.path.join(location, filename))
         print 'Total pages: %s' % total
         print 'Pages processed: %s (%s)' % (processed, processed / total)
@@ -269,7 +286,8 @@
 
 
 def group_observations(obs):
     '''
-    mmm forgot the purpose of this function
+    This function groups observations by editor id; this way we have to make
+    fewer file-opening calls.
     '''
     d = {}
     for o in obs:
@@ -283,17 +301,19 @@
 def write_output(observations, filehandles, lock):
     observations = group_observations(observations)
     for obs in observations:
-        for i, o in enumerate(observations[obs]):
-            if i == 0:
-                fh = filehandles[hash(obs)]
-            try:
-                lock.acquire()
+        lock.acquire() #lock the write around all edits of an editor for a particular page
+        try:
+            for i, o in enumerate(observations[obs]):
+                if i == 0:
+                    fh = filehandles[hash(obs)]
                 file_utils.write_list_to_csv(o, fh)
-                lock.releas()
-            except Exception, error:
-                print error
 
+        except Exception, error:
+            print 'Encountered the following error while writing data to %s: %s' % (fh, error)
+        finally:
+            lock.release()
 
+
 def hash(id):
     '''
     A very simple hash function based on modulo. The except clause has been
@@ -307,8 +327,10 @@
 
 
 def prepare(output):
-    file_utils.delete_file(output, None, directory=True)
-    file_utils.create_directory(output)
+    res = file_utils.delete_file(output, None, directory=True)
+    if res:
+        res = file_utils.create_directory(output)
+    return res
 
 
 def unzip(properties):
@@ -336,7 +358,7 @@
             print 'There was an error while extracting %s, please make sure \
 that %s is valid archive.' % (fn, fn)
             return False
-
+    print tasks.qsize()
     return tasks
 
 def launcher(properties):
@@ -349,17 +371,22 @@
     '''
     result = True
     tasks = unzip(properties)
-    prepare(properties.language_code, properties.project)
+
+    output = os.path.join(settings.input_location, properties.language.code,
+                          properties.project.name, 'txt')
+    result = prepare(output)
+    if not result:
+        return result
+
     lock = multiprocessing.Lock()
-    filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
-                                                    settings.encoding) for fh in xrange(settings.max_filehandles)]
-    output = os.path.join(settings.input_location, properties.language_code,
-                          properties.project, 'txt')
+#    filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
+#                                                    settings.encoding) for fh in xrange(settings.max_filehandles)]
 
+    filehandles = []
     consumers = [multiprocessing.Process(target=parse_dumpfile,
                                          args=(tasks,
-                                               properties.project,
-                                               properties.language_code,
+                                               properties.project.name,
+                                               properties.language.code,
                                                filehandles,
                                                lock,
                                                properties.namespaces))
@@ -369,15 +396,12 @@
     tasks.put(None)
 
     for w in consumers:
+        print 'Launching process...'
         w.start()
 
     tasks.join()
     filehandles = [fh.close() for fh in filehandles]
-#    result = parse_dumpfile(properties.project, file_without_ext,
-#                            properties.language_code,
-#                            namespaces=['0'])
 
-
     result = all([consumer.exitcode for consumer in consumers])
     return result
 
@@ -386,8 +410,7 @@
     project = 'wiki'
     language_code = 'sv'
     filename = 'svwiki-latest-stub-meta-history.xml'
-    #parse_dumpfile(project, filename, language_code)
-    launcher()
+    parse_dumpfile(project, filename, language_code)
 
 if __name__ == '__main__':
     debug()
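
Two ideas from this file are worth making concrete: editor ids are mapped onto a fixed pool of output files with a simple modulo hash, and write_output now wraps the whole per-editor write in a single acquire/release. A rough standalone sketch with invented names and a small file pool; the real hash() and settings.max_filehandles may differ:

import multiprocessing

MAX_FILEHANDLES = 10  # stands in for settings.max_filehandles

def shard(editor_id):
    '''Modulo hash: editor ids that share a remainder share an output file (details invented).'''
    return int(editor_id) % MAX_FILEHANDLES

def write_grouped_output(grouped, filehandles, lock):
    '''grouped maps an editor id to the list of its observations for one page.'''
    for editor_id, rows in grouped.iteritems():
        lock.acquire()  # one acquire/release spans all edits of this editor
        try:
            fh = filehandles[shard(editor_id)]
            for row in rows:
                fh.write('\t'.join(str(field) for field in row) + '\n')
        finally:
            lock.release()

if __name__ == '__main__':
    lock = multiprocessing.Lock()
    filehandles = [open('%s.csv' % i, 'a') for i in xrange(MAX_FILEHANDLES)]
    write_grouped_output({'1234': [('1234', '2011-01-25', 42)]}, filehandles, lock)
    for fh in filehandles:
        fh.close()
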
Index: trunk/tools/editor_trends/etl/sort.py
@@ -83,8 +83,8 @@
     '''
     Merges smaller sorted files in one big file, no longer used.
     '''
-    fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration, 'w',
-                                          settings.encoding)
+    fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration,
+                                          'w', settings.encoding)
     lines = 0
     for line in heapq.merge(*[readline(filename) for filename in files]):
         file_utils.write_list_to_csv(line, fh)
@@ -98,7 +98,8 @@
     '''
     Writes the sorted file to target
     '''
-    fh = file_utils.create_txt_filehandle(target, filename, 'w', settings.encoding)
+    fh = file_utils.create_txt_filehandle(target, filename, 'w',
+                                          settings.encoding)
     file_utils.write_list_to_csv(sorted_data, fh)
     fh.close()
 
@@ -114,6 +115,7 @@
         tasks.task_done()
         if filename == None:
             print 'Swallowed a poison pill'
+            print tasks.qsize()
             break
 
         fh = file_utils.create_txt_filehandle(source,
@@ -129,8 +131,8 @@
             sorted_data = mergesort(data)
             write_sorted_file(sorted_data, filename, target)
             print filename, messages.show(tasks.qsize)
-        except UnicodeDecodeError:
-            continue
+        except UnicodeDecodeError, e:
+            print e
         except Empty:
             break
 
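
merge_sorted_files relies on heapq.merge to do a lazy n-way merge of inputs that are each already sorted. A minimal sketch of that idea, with in-memory lists standing in for the per-file CSV readers:

import heapq

def merge_sorted(*iterables):
    '''Lazily yield items from several sorted iterables in global sorted order.'''
    for item in heapq.merge(*iterables):
        yield item

if __name__ == '__main__':
    a = [1, 4, 9]
    b = [2, 3, 10]
    c = [5, 6, 7]
    print list(merge_sorted(a, b, c))  # [1, 2, 3, 4, 5, 6, 7, 9, 10]
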
Index: trunk/tools/editor_trends/classes/bots.py
@@ -87,7 +87,7 @@
         self.add_clock_data()
         self.active()
         self.data.append(self.dt)
-        file_utils.write_list_to_csv(self.data, fh, recursive=False, newline=True)
+        file_utils.write_list_to_csv(self.data, fh)
 
 
 
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -31,6 +31,8 @@
 import ctypes
 import sys
 import shutil
+import multiprocessing
+
 sys.path.append('..')
 
 
@@ -117,7 +119,7 @@
         return 'wb'
 
 
-def write_list_to_csv(data, fh, recursive=False, newline=True, format='wide'):
+def write_list_to_csv(data, fh, recursive=False, newline=True):
     '''
     @data is a list which can contain other lists that will be written as a
     single line to a textfile
@@ -126,22 +128,22 @@
     The calling function is responsible for:
     1) closing the filehandle
     '''
+    lock = multiprocessing.Lock()
     tab = False
     wrote_newline = None
     if recursive:
         recursive = False
+    lock.acquire()
     for x, d in enumerate(data):
         if tab:
             fh.write('\t')
         if isinstance(d, list):
-            #if type(d) == type([]):
             recursive = write_list_to_csv(d, fh, recursive=True, newline=False)
             #when there is a list of lists but no other elements in the first list
             #then write a newline.
             if len(d) == len(data[x]):
                 fh.write('\n')
         elif isinstance(d, dict):
-            #elif type(d) == type({}):
             tab = write_dict_to_csv(d, fh, d.keys(), write_key=False, format=format)
         else:
             fh.write('%s' % d)
@@ -152,8 +154,8 @@
         return True
     if newline:
         fh.write('\n')
+    lock.release()
 
-
 def write_dict_to_csv(data, fh, keys, write_key=True, format='long'):
     assert format == 'long' or format == 'wide'
 
@@ -162,13 +164,10 @@
         if write_key:
             fh.write('%s\t' % key)
         if isinstance(data[key], list):
-            #if type(data[key]) == type([]):
             for d in data[key]:
                 fh.write('%s\t%s\n' % (key, d))
         elif isinstance(data[key], dict):
-            #elif type(data[key]) == type({}):
             write_dict_to_csv(data[key], fh, data[key].keys(), write_key=False, format=format)
-#        elif getattr(data[key], '__iter__', False):
 #            for d in data[key]:
 #                fh.write('%s\t%s\t%s\n' % (key, d, data[key][d]))
         else:
@@ -188,8 +187,6 @@
             fh.write('%s\t' % (data[key]))
         fh.write('\n')
 
-    #if type(data[key]) == type([]):
-    #    write_list_to_csv(data[key], fh, recursive=False, newline=True)
 
 def create_txt_filehandle(location, name, mode, encoding):
     filename = construct_filename(name, '.csv')
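
To make the write_list_to_csv contract from the docstring concrete, a small usage sketch with made-up data; it assumes it is run from trunk/tools/editor_trends so that utils is importable:

from utils import file_utils

fh = open('example.csv', 'w')
# a flat list becomes one tab-separated line
file_utils.write_list_to_csv(['editor_id', '2011-01-25', 42], fh)
# nested lists are handled by the recursive call, one line per inner list
file_utils.write_list_to_csv([['a', 1], ['b', 2]], fh)
fh.close()
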
Index: trunk/tools/editor_trends/utils/data_converter.py
@@ -20,6 +20,17 @@
 
 import datetime
 
+def create_windows(var, break_down_first_year=True):
+    '''
+    This function creates a list of months. If break_down_first_year = True then
+    the first year will be split in 3, 6, 9 months as well.
+    '''
+    years = var.max_year - var.min_year
+    windows = [y * 12 for y in xrange(1, years)]
+    if break_down_first_year:
+        windows = [3, 6, 9] + windows
+    return windows
+
 def convert_seconds_to_date(secs):
     #return time.gmtime(secs)
     return datetime.datetime.fromtimestamp(secs)
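
A quick illustration of what the relocated create_windows returns; VarStub is an invented stand-in for the dataset variable, which only needs min_year and max_year here, and the import assumes the script runs from trunk/tools/editor_trends:

from utils import data_converter

class VarStub(object):
    '''Stand-in for the dataset variable; create_windows only reads min_year/max_year.'''
    def __init__(self, min_year, max_year):
        self.min_year = min_year
        self.max_year = max_year

if __name__ == '__main__':
    var = VarStub(2004, 2011)
    print data_converter.create_windows(var)                               # [3, 6, 9, 12, 24, 36, 48, 60, 72]
    print data_converter.create_windows(var, break_down_first_year=False)  # [12, 24, 36, 48, 60, 72]
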