Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -38,7 +38,31 @@ |
39 | 39 | from utils import timer |
40 | 40 | from utils import log |
41 | 41 | |
| 42 | +class Replicator: |
| 43 | +    def __init__(self, rts, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs): |
| 44 | +        self.plugin = plugin |
| 45 | +        self.rts = rts |
| 46 | +        self.time_unit = time_unit |
| 47 | +        if cutoff is None: |
| 48 | +            self.cutoff = [1, 10, 50] |
| 49 | +        else: |
| 50 | +            self.cutoff = cutoff |
42 | 51 |   |
| 52 | +        if cum_cutoff is None: |
| 53 | +            self.cum_cutoff = [10] |
| 54 | +        else: |
| 55 | +            self.cum_cutoff = cum_cutoff |
| 56 | +        self.kwargs = kwargs |
| 57 | + |
| 58 | +    def __call__(self): |
| 59 | +        for cum_cutoff in self.cum_cutoff: |
| 60 | +            for cutoff in self.cutoff: |
| 61 | +                generate_chart_data(self.rts, self.plugin, |
| 62 | +                                    time_unit=self.time_unit, |
| 63 | +                                    cutoff=cutoff, cum_cutoff=cum_cutoff, |
| 64 | +                                    **self.kwargs) |
| 65 | + |
| 66 | + |
| 66 | + |
43 | 67 | class Analyzer(consumers.BaseConsumer): |
44 | 68 | def __init__(self, rts, tasks, result, var): |
45 | 69 | super(Analyzer, self).__init__(rts, tasks, result) |
— | — | @@ -109,9 +133,9 @@ |
110 | 134 | ds.filename) |
111 | 135 | ds.write(format='csv') |
112 | 136 | print 'Serializing dataset to %s_%s' % (rts.dbname, 'charts') |
113 | | - log.log_to_mongo(rts, 'chart', 'storing', stopwatch, event='start') |
114 | | - ds.write(format='mongo') |
115 | | - log.log_to_mongo(rts, 'chart', 'storing', stopwatch, event='finish') |
| 137 | + #log.log_to_mongo(rts, 'chart', 'storing', stopwatch, event='start') |
| 138 | + #ds.write(format='mongo') |
| 139 | + #log.log_to_mongo(rts, 'chart', 'storing', stopwatch, event='finish') |
116 | 140 | |
117 | 141 | |
118 | 142 | def generate_chart_data(rts, func, **kwargs): |
— | — | @@ -121,6 +145,8 @@ |
122 | 146 | ''' |
123 | 147 | stopwatch = timer.Timer() |
124 | 148 | plugin = retrieve_plugin(func) |
| 149 | +    if not plugin: |
| 150 | +        raise Exception('Plugin function %s is unknown, please make sure that you specify an existing plugin function.' % func) |
125 | 151 | feedback(plugin, rts) |
126 | 152 | |
127 | 153 | obs = dict() |
— | — | @@ -177,11 +203,11 @@ |
178 | 204 | tasks.join() |
179 | 205 | |
180 | 206 | reconstruct_observations(var) |
181 | | - ds = dataset.Dataset(plugin.func_name, rts, format=fmt) |
| 207 | + ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs) |
182 | 208 | ds.add_variable(var) |
183 | 209 | |
184 | 210 | stopwatch.elapsed() |
185 | | - #write_output(ds, rts, stopwatch) |
| 211 | + write_output(ds, rts, stopwatch) |
186 | 212 | |
187 | 213 | ds.summary() |
188 | 214 | #return True |
— | — | @@ -202,7 +228,7 @@ |
203 | 229 | return min_year, max_year |
204 | 230 | |
205 | 231 | |
206 | | -if __name__ == '__main__': |
| 232 | +def launcher(): |
207 | 233 | project, language, parser = manager.init_args_parser() |
208 | 234 | args = parser.parse_args(['django']) |
209 | 235 | rts = runtime_settings.init_environment('wiki', 'en', args) |
— | — | @@ -212,15 +238,25 @@ |
213 | 239 | rts.editors_dataset = 'editors_dataset' |
214 | 240 | #END TEMP FIX |
215 | 241 | |
216 | | - generate_chart_data(rts, 'histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10) |
| 242 | +# replicator = Replicator(rts, 'histogram_by_backward_cohort', time_unit='year') |
| 243 | +# replicator() |
| 244 | +# replicator = Replicator(rts, 'cohort_dataset_backward_bar', time_unit='year', format='wide') |
| 245 | +# replicator() |
| 246 | + |
| 247 | +# generate_chart_data(rts, 'histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10) |
217 | 248 | # generate_chart_data(rts, 'edit_patterns', time_unit='year', cutoff=5) |
218 | 249 | # generate_chart_data(rts, 'total_number_of_new_wikipedians', time_unit='year') |
219 | 250 | # generate_chart_data(rts, 'total_number_of_articles', time_unit='year') |
220 | 251 | # generate_chart_data(rts, 'total_cumulative_edits', time_unit='year') |
221 | | -# generate_chart_data(rts, 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10) |
222 | | - generate_chart_data(rts, 'cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide') |
| 252 | + generate_chart_data(rts, 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10) |
| 253 | +# generate_chart_data(rts, 'cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide') |
223 | 254 | # generate_chart_data(rts, 'cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
224 | 255 | # generate_chart_data(rts, 'histogram_edits', time_unit='year', cutoff=0) |
225 | 256 | # generate_chart_data(rts, 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
226 | 257 | # generate_chart_data(rts, 'new_editor_count', time_unit='month', cutoff=0) |
227 | 258 | # #available_analyses() |
| 259 | + |
| 260 | + |
| 261 | + |
| 262 | +if __name__ == '__main__': |
| 263 | + launcher() |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -26,7 +26,7 @@ |
27 | 27 | import cPickle |
28 | 28 | import hashlib |
29 | 29 | from pymongo.son_manipulator import SONManipulator |
30 | | -from multiprocessing import Manager |
| 30 | +from multiprocessing import Manager, RLock |
31 | 31 | from texttable import Texttable |
32 | 32 | from datetime import timedelta |
33 | 33 | |
— | — | @@ -91,13 +91,12 @@ |
92 | 92 | This is a generic hash function that expects a list of variables, used |
93 | 93 | to lookup an Observation or Variable. |
94 | 94 | ''' |
95 | | - id = '_'.join([str(var) for var in vars]) |
| 95 | +        return hash('_'.join([str(var) for var in vars]))  # NOTE(review): builtin hash() is not stable across 32/64-bit builds, unlike the md5 hexdigest it replaces — confirm these ids are never persisted or compared across machines |
96 | 96 | #return id |
97 | | - m = hashlib.md5() |
98 | | - m.update(id) |
| 97 | + #m = hashlib.md5() |
| 98 | + #m.update(id) |
99 | 99 | #print id, m.hexdigest() |
100 | | - return m.hexdigest() |
101 | | - #return ''.join([str(var) for var in vars]) |
| 100 | + #return m.hexdigest() |
102 | 101 | |
103 | 102 | def encode_to_bson(self, data=None): |
104 | 103 | ''' |
— | — | @@ -209,20 +208,8 @@ |
210 | 209 | else: |
211 | 210 | self.data += value |
212 | 211 | self.count += 1 |
213 | | -# self.lock.acquire() |
214 | | -# try: |
215 | | -# if isinstance(value, list): |
216 | | -# if self.count == 0: |
217 | | -# self.data = [] |
218 | | -# self.data.append(value) |
219 | | -# else: |
220 | | -# self.data += value |
221 | | -# finally: |
222 | | -# self.count += 1 |
223 | | -# self.lock.release() |
224 | 212 | |
225 | 213 | |
226 | | - |
227 | 214 | def get_date_range(self): |
228 | 215 | return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \ |
229 | 216 | self.t1.month, self.t1.day, self.t1.year) |
— | — | @@ -361,9 +348,9 @@ |
362 | 349 | ''' |
363 | 350 | |
364 | 351 | def __init__(self, chart, rts, vars=None, **kwargs): |
365 | | - self.encoder, chart, charts = json_encoders.get_json_encoder(chart) |
| 352 | + self.encoder, chart_type, charts = json_encoders.get_json_encoder(chart) |
366 | 353 | if self.encoder == None: |
367 | | - raise exceptions.UnknownChartError(chart, charts) |
| 354 | + raise exceptions.UnknownChartError(chart_type, charts) |
368 | 355 | self.chart = chart |
369 | 356 | self.name = 'Dataset to construct %s' % self.chart |
370 | 357 | self.project = rts.project.name |
— | — | @@ -427,7 +414,7 @@ |
428 | 415 | attrs = '_'.join(['%s=%s' % (k, getattr(var, k)) for k in keys]) |
429 | 416 | filename = '%s%s_%s_%s.csv' % (self.language_code, |
430 | 417 | self.project, |
431 | | - self.name, |
| 418 | + self.chart, |
432 | 419 | attrs) |
433 | 420 | self.filename = filename |
434 | 421 | |
— | — | @@ -467,9 +454,15 @@ |
468 | 455 | def to_csv(self): |
469 | 456 | data = data_converter.convert_dataset_to_lists(self, 'manage') |
470 | 457 | headers = data_converter.add_headers(self) |
471 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding) |
| 458 | + lock = RLock() |
| 459 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, |
| 460 | + self.filename, |
| 461 | + 'w', |
| 462 | + settings.encoding) |
472 | 463 | file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True) |
473 | | - file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format) |
| 464 | + file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, |
| 465 | + format=self.format, |
| 466 | + lock=lock) |
474 | 467 | fh.close() |
475 | 468 | |
476 | 469 | def encode(self): |