r81093 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r81092 | r81093 | r81094 >
Date: 16:52, 27 January 2011
Author: diederik
Status: deferred
Tags:
Comment:
Added simple descriptive statistics to describe the variables in a dataset.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -61,8 +61,9 @@
6262 log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='start')
6363 ds.write(format='mongo')
6464 stopwatch.elapsed()
 65+ log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
6566
66 - log.log_to_mongo(ds, 'chart', 'storing', stopwatch, event='finish')
 67+ ds.summary()
6768 return res
6869
6970
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -108,26 +108,22 @@
109109 The smallest unit, here the actual data is being stored.
110110 Time_unit should either be 'year', 'month' or 'day'.
111111 '''
112 - def __init__(self, date, time_unit):
 112+ def __init__(self, date):
113113 assert isinstance(date, datetime.datetime)
114 - assert time_unit == 'year' or time_unit == 'month' or time_unit == 'day'
115 - self.time_unit = time_unit
116 - self.t0 = self.set_start_date(date)
117 - self.t1 = self.set_end_date(date)
118 - self.hash = self.__hash__(date)
119114 self.date = date
120115 self.data = {}
121116 self._type = 'observation'
122117
123118 def __repr__(self):
124 - return '%s' % self.t1
 119+ return '%s' % self.date
125120
126121 def __str__(self):
127 - return 'range: %s:%s' % (self.t0, self.t1)
 122+ return '%s' % self.date
 123+ #return 'range: %s:%s' % (self.t0, self.t1)
128124
129125 def __iter__(self):
130 - for obs in self.obs:
131 - yield self.obs[obs]
 126+ for obs in self.data:
 127+ yield self.data[obs]
132128
133129 def __getitem__(self, key):
134130 return getattr(self, key, [])
@@ -138,22 +134,6 @@
139135 except IndexError:
140136 return 0
141137
142 - def set_start_date(self, date):
143 - if self.time_unit == 'year':
144 - return datetime.datetime(date.year, 1, 1)
145 - elif self.time_unit == 'month':
146 - return datetime.datetime(date.year, date.month, 1)
147 - else:
148 - return datetime.datetime(date.year, date.month, date.day)
149 -
150 - def set_end_date(self, date):
151 - if self.time_unit == 'year':
152 - return datetime.datetime(date.year, 12, 31)
153 - elif self.time_unit == 'month':
154 - return datetime.datetime(date.year, date.month, calendar.monthrange(date.year, date.month)[1])
155 - else:
156 - return datetime.datetime(date.year, date.month, date.day)
157 -
158138 def add(self, value, update):
159139 '''
160140 If update == True then data[i] will be incremented else data[i] will be
@@ -191,10 +171,10 @@
192172 self.props.append(kw)
193173
194174 def __str__(self):
195 - return self.name
 175+ return '%s' % self.name
196176
197177 def __repr__(self):
198 - return self.name
 178+ return '%s' % self.name
199179
200180 def __getitem__(self, key):
201181 return getattr(self, key, [])
@@ -212,7 +192,7 @@
213193 for key in self.__dict__.keys():
214194 yield key, getattr(self, key)
215195
216 - def obs(self):
 196+ def itervalues(self):
217197 for date in self:
218198 for key in self.obs[date].data.keys():
219199 yield self.obs[date].data[key]
@@ -222,14 +202,17 @@
223203 for value in self.obs[date].data.keys():
224204 yield (value, self.obs[date].data[value])
225205
226 - def get_observation(self, date):
227 - key = self.__hash__(date)
228 - return self.obs.get(key, Observation(date, self.time_unit))
 206+ def get_data(self):
 207+ return [o for o in self.itervalues()]
229208
 209+ def get_observation(self, key, date):
 210+ return self.obs.get(key, Observation(date))
 211+
230212 def add(self, date, value, update=True):
231 - data = self.get_observation(date)
 213+ key = self.__hash__(self.set_end_date(date))
 214+ data = self.get_observation(key, date)
232215 data.add(value, update)
233 - self.obs[data.hash] = data
 216+ self.obs[key] = data
234217
235218 def encode(self):
236219 bson = {}
@@ -256,7 +239,22 @@
257240 setattr(self, prop, values[varname][prop])
258241 self.props.append(prop)
259242
 243+ def set_start_date(self, date):
 244+ if self.time_unit == 'year':
 245+ return datetime.datetime(date.year, 1, 1)
 246+ elif self.time_unit == 'month':
 247+ return datetime.datetime(date.year, date.month, 1)
 248+ else:
 249+ return datetime.datetime(date.year, date.month, date.day)
260250
 251+ def set_end_date(self, date):
 252+ if self.time_unit == 'year':
 253+ return datetime.datetime(date.year, 12, 31)
 254+ elif self.time_unit == 'month':
 255+ return datetime.datetime(date.year, date.month, calendar.monthrange(date.year, date.month)[1])
 256+ else:
 257+ return datetime.datetime(date.year, date.month, date.day)
 258+
261259 class Dataset:
262260 '''
263261 This class acts as a container for the Variable class and has some methods
@@ -272,6 +270,7 @@
273271 self._type = 'dataset'
274272 self.filename = '%s_%s.csv' % (self.project, self.name)
275273 self.created = datetime.datetime.now()
 274+ self.format = 'long'
276275 for kw in kwargs:
277276 setattr(self, kw, kwargs[kw])
278277 self.props = self.__dict__.keys()
@@ -335,7 +334,7 @@
336335 return max([self.obs[date].data[k] for date in self.obs.keys() for k in self.obs[date].data.keys()])
337336
338337 def get_standard_deviation(self, number_list):
339 - mean = get_mean(number_list)
 338+ mean = self.get_mean(number_list)
340339 std = 0
341340 n = len(number_list)
342341 for i in number_list:
@@ -363,13 +362,25 @@
364363 float_nums = [float(x) for x in number_list]
365364 return sum(float_nums) / len(number_list)
366365
 366+ def descriptives(self):
 367+ for variable in self:
 368+ data = variable.get_data()
 369+ variable.mean = self.get_mean(data)
 370+ variable.median = self.get_median(data)
 371+ variable.sds = self.get_standard_deviation(data)
 372+ variable.min = min(data)
 373+ variable.max = max(data)
 374+ variable.n = len(data)
 375+
367376 def summary(self):
368 - print 'Variable: %s' % self.name
369 - print 'Mean: %s' % self.get_mean(self)
370 - print 'Median: %s' % self.get_median(self)
371 - print 'Standard Deviation: %s' % self.get_standard_deviation(self)
372 - print 'Minimum: %s' % self.min()
373 - print 'Maximum: %s' % self.max()
 377+ self.descriptives()
 378+ print '%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', 'Median', 'SD',
 379+ 'Minimum', 'Maximum', 'Num Obs')
 380+ for variable in self:
 381+ print '%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, variable.mean,
 382+ variable.median, variable.sds,
 383+ variable.min, variable.max,
 384+ variable.n)
374385
375386
376387 def debug():
@@ -379,30 +390,30 @@
380391
381392 d1 = datetime.datetime.today()
382393 d2 = datetime.datetime(2007, 6, 7)
383 - ds = Dataset('test', 'enwiki', 'editors_dataset', [
 394+ ds = Dataset('test', 'wiki', 'editors_dataset', 'en', [
384395 {'name': 'count', 'time_unit': 'year'},
385396 {'name': 'testest', 'time_unit': 'year'}
386397 ])
387398 ds.count.add(d1, 5)
 399+ ds.count.add(d1, 135)
388400 ds.count.add(d2, 514)
389401 ds.testest.add(d1, 135)
390402 ds.testest.add(d2, 535)
391403 #ds.summary()
392 - ds.write_to_csv()
 404+ ds.write(format='csv')
393405 v = Variable('test', 'year')
 406+ ds.summary()
394407 ds.encode()
 408+ print ds
 409+
395410 mongo.test.insert({'variables': ds})
396411
397 - v.add(date , 5)
398 - o = v.get_observation(date)
 412+ v.add(d2 , 5)
 413+ #o = v.get_observation(d2)
399414 ds = rawdata.find_one({'project': 'wiki',
400415 'language_code': 'en',
401416 'hash': 'cohort_dataset_backward_bar'})
402 - print ds
403417
404418
405 -
406 -
407 -
408419 if __name__ == '__main__':
409420 debug()

Status & tagging log