Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -61,8 +61,9 @@ |
62 | 62 | log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start') |
63 | 63 | ds.write(format='mongo') |
64 | 64 | stopwatch.elapsed() |
| 65 | + log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
65 | 66 | |
66 | | - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish') |
| 67 | + ds.summary() |
67 | 68 | return res |
68 | 69 | |
69 | 70 | |
Index: trunk/tools/editor_trends/analyses/dataset.py |
— | — | @@ -108,26 +108,22 @@ |
109 | 109 | The smallest unit, here the actual data is being stored. |
110 | 110 | Time_unit should either be 'year', 'month' or 'day'. |
111 | 111 | ''' |
112 | | - def __init__(self, date, time_unit): |
| 112 | + def __init__(self, date): |
113 | 113 | assert isinstance(date, datetime.datetime) |
114 | | - assert time_unit == 'year' or time_unit == 'month' or time_unit == 'day' |
115 | | - self.time_unit = time_unit |
116 | | - self.t0 = self.set_start_date(date) |
117 | | - self.t1 = self.set_end_date(date) |
118 | | - self.hash = self.__hash__(date) |
119 | 114 | self.date = date |
120 | 115 | self.data = {} |
121 | 116 | self._type = 'observation' |
122 | 117 | |
123 | 118 | def __repr__(self): |
124 | | - return '%s' % self.t1 |
| 119 | + return '%s' % self.date |
125 | 120 | |
126 | 121 | def __str__(self): |
127 | | - return 'range: %s:%s' % (self.t0, self.t1) |
| 122 | + return '%s' % self.date |
| 123 | + #return 'range: %s:%s' % (self.t0, self.t1) |
128 | 124 | |
129 | 125 | def __iter__(self): |
130 | | - for obs in self.obs: |
131 | | - yield self.obs[obs] |
| 126 | + for obs in self.data: |
| 127 | + yield self.data[obs] |
132 | 128 | |
133 | 129 | def __getitem__(self, key): |
134 | 130 | return getattr(self, key, []) |
— | — | @@ -138,22 +134,6 @@ |
139 | 135 | except IndexError: |
140 | 136 | return 0 |
141 | 137 | |
142 | | - def set_start_date(self, date): |
143 | | - if self.time_unit == 'year': |
144 | | - return datetime.datetime(date.year, 1, 1) |
145 | | - elif self.time_unit == 'month': |
146 | | - return datetime.datetime(date.year, date.month, 1) |
147 | | - else: |
148 | | - return datetime.datetime(date.year, date.month, date.day) |
149 | | - |
150 | | - def set_end_date(self, date): |
151 | | - if self.time_unit == 'year': |
152 | | - return datetime.datetime(date.year, 12, 31) |
153 | | - elif self.time_unit == 'month': |
154 | | - return datetime.datetime(date.year, date.month, calendar.monthrange(date.year, date.month)[1]) |
155 | | - else: |
156 | | - return datetime.datetime(date.year, date.month, date.day) |
157 | | - |
158 | 138 | def add(self, value, update): |
159 | 139 | ''' |
160 | 140 | If update == True then data[i] will be incremented else data[i] will be |
— | — | @@ -191,10 +171,10 @@ |
192 | 172 | self.props.append(kw) |
193 | 173 | |
194 | 174 | def __str__(self): |
195 | | - return self.name |
| 175 | + return '%s' % self.name |
196 | 176 | |
197 | 177 | def __repr__(self): |
198 | | - return self.name |
| 178 | + return '%s' % self.name |
199 | 179 | |
200 | 180 | def __getitem__(self, key): |
201 | 181 | return getattr(self, key, []) |
— | — | @@ -212,7 +192,7 @@ |
213 | 193 | for key in self.__dict__.keys(): |
214 | 194 | yield key, getattr(self, key) |
215 | 195 | |
216 | | - def obs(self): |
| 196 | + def itervalues(self): |
217 | 197 | for date in self: |
218 | 198 | for key in self.obs[date].data.keys(): |
219 | 199 | yield self.obs[date].data[key] |
— | — | @@ -222,14 +202,17 @@ |
223 | 203 | for value in self.obs[date].data.keys(): |
224 | 204 | yield (value, self.obs[date].data[value]) |
225 | 205 | |
226 | | - def get_observation(self, date): |
227 | | - key = self.__hash__(date) |
228 | | - return self.obs.get(key, Observation(date, self.time_unit)) |
| 206 | + def get_data(self): |
| 207 | + return [o for o in self.itervalues()] |
229 | 208 | |
| 209 | + def get_observation(self, key, date): |
| 210 | + return self.obs.get(key, Observation(date)) |
| 211 | + |
230 | 212 | def add(self, date, value, update=True): |
231 | | - data = self.get_observation(date) |
| 213 | + key = self.__hash__(self.set_end_date(date)) |
| 214 | + data = self.get_observation(key, date) |
232 | 215 | data.add(value, update) |
233 | | - self.obs[data.hash] = data |
| 216 | + self.obs[key] = data |
234 | 217 | |
235 | 218 | def encode(self): |
236 | 219 | bson = {} |
— | — | @@ -256,7 +239,22 @@ |
257 | 240 | setattr(self, prop, values[varname][prop]) |
258 | 241 | self.props.append(prop) |
259 | 242 | |
| 243 | + def set_start_date(self, date): |
| 244 | + if self.time_unit == 'year': |
| 245 | + return datetime.datetime(date.year, 1, 1) |
| 246 | + elif self.time_unit == 'month': |
| 247 | + return datetime.datetime(date.year, date.month, 1) |
| 248 | + else: |
| 249 | + return datetime.datetime(date.year, date.month, date.day) |
260 | 250 | |
| 251 | + def set_end_date(self, date): |
| 252 | + if self.time_unit == 'year': |
| 253 | + return datetime.datetime(date.year, 12, 31) |
| 254 | + elif self.time_unit == 'month': |
| 255 | + return datetime.datetime(date.year, date.month, calendar.monthrange(date.year, date.month)[1]) |
| 256 | + else: |
| 257 | + return datetime.datetime(date.year, date.month, date.day) |
| 258 | + |
261 | 259 | class Dataset: |
262 | 260 | ''' |
263 | 261 | This class acts as a container for the Variable class and has some methods |
— | — | @@ -272,6 +270,7 @@ |
273 | 271 | self._type = 'dataset' |
274 | 272 | self.filename = '%s_%s.csv' % (self.project, self.name) |
275 | 273 | self.created = datetime.datetime.now() |
| 274 | + self.format = 'long' |
276 | 275 | for kw in kwargs: |
277 | 276 | setattr(self, kw, kwargs[kw]) |
278 | 277 | self.props = self.__dict__.keys() |
— | — | @@ -335,7 +334,7 @@ |
336 | 335 | return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()]) |
337 | 336 | |
338 | 337 | def get_standard_deviation(self, number_list): |
339 | | - mean = get_mean(number_list) |
| 338 | + mean = self.get_mean(number_list) |
340 | 339 | std = 0 |
341 | 340 | n = len(number_list) |
342 | 341 | for i in number_list: |
— | — | @@ -363,13 +362,25 @@ |
364 | 363 | float_nums = [float(x) for x in number_list] |
365 | 364 | return sum(float_nums) / len(number_list) |
366 | 365 | |
| 366 | + def descriptives(self): |
| 367 | + for variable in self: |
| 368 | + data = variable.get_data() |
| 369 | + variable.mean = self.get_mean(data) |
| 370 | + variable.median = self.get_median(data) |
| 371 | + variable.sds = self.get_standard_deviation(data) |
| 372 | + variable.min = min(data) |
| 373 | + variable.max = max(data) |
| 374 | + variable.n = len(data) |
| 375 | + |
367 | 376 | def summary(self): |
368 | | - print 'Variable: %s' % self.name |
369 | | - print 'Mean: %s' % self.get_mean(self) |
370 | | - print 'Median: %s' % self.get_median(self) |
371 | | - print 'Standard Deviation: %s' % self.get_standard_deviation(self) |
372 | | - print 'Minimum: %s' % self.min() |
373 | | - print 'Maximum: %s' % self.max() |
| 377 | + self.descriptives() |
| 378 | + print '%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', 'Median', 'SD', |
| 379 | + 'Minimum', 'Maximum', 'Num Obs') |
| 380 | + for variable in self: |
| 381 | + print '%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, variable.mean, |
| 382 | + variable.median, variable.sds, |
| 383 | + variable.min, variable.max, |
| 384 | + variable.n) |
374 | 385 | |
375 | 386 | |
376 | 387 | def debug(): |
— | — | @@ -379,30 +390,30 @@ |
380 | 391 | |
381 | 392 | d1 = datetime.datetime.today() |
382 | 393 | d2 = datetime.datetime(2007, 6, 7) |
383 | | - ds = Dataset('test', 'enwiki', 'editors_dataset', [ |
| 394 | + ds = Dataset('test', 'wiki', 'editors_dataset', 'en', [ |
384 | 395 | {'name': 'count', 'time_unit': 'year'}, |
385 | 396 | {'name': 'testest', 'time_unit': 'year'} |
386 | 397 | ]) |
387 | 398 | ds.count.add(d1, 5) |
| 399 | + ds.count.add(d1, 135) |
388 | 400 | ds.count.add(d2, 514) |
389 | 401 | ds.testest.add(d1, 135) |
390 | 402 | ds.testest.add(d2, 535) |
391 | 403 | #ds.summary() |
392 | | - ds.write_to_csv() |
| 404 | + ds.write(format='csv') |
393 | 405 | v = Variable('test', 'year') |
| 406 | + ds.summary() |
394 | 407 | ds.encode() |
| 408 | + print ds |
| 409 | + |
395 | 410 | mongo.test.insert({'variables': ds}) |
396 | 411 | |
397 | | - v.add(date , 5) |
398 | | - o = v.get_observation(date) |
| 412 | + v.add(d2 , 5) |
| 413 | + #o = v.get_observation(d2) |
399 | 414 | ds = rawdata.find_one({'project': 'wiki', |
400 | 415 | 'language_code': 'en', |
401 | 416 | 'hash': 'cohort_dataset_backward_bar'}) |
402 | | - print ds |
403 | 417 | |
404 | 418 | |
405 | | - |
406 | | - |
407 | | - |
408 | 419 | if __name__ == '__main__': |
409 | 420 | debug() |