Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -0,0 +1,400 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-14'
+__version__ = '0.1'
+
+import calendar
+import datetime
+import time
+import math
+import sys
+from pymongo.son_manipulator import SONManipulator
+
+
+sys.path.append('..')
+import configuration
+settings = configuration.Settings()
+
+from utils import utils
+from database import db
+
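+#SON manipulator that encodes Dataset instances into plain dicts when documents
+#are saved to MongoDB and decodes '_type' == 'custom' documents when they are
+#read back.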
+class Transform(SONManipulator):
+    def transform_incoming(self, son, collection):
+        for (key, ds) in son.items():
+            if isinstance(ds, Dataset):
+                son[key] = ds.encode()
+            #elif isinstance(value, dict): # Make sure we recurse into sub-docs
+            #    son[key] = self.transform_incoming(value, collection)
+        return son
+
+    def transform_outgoing(self, son, collection):
+        for (key, value) in son.items():
+            if isinstance(value, dict):
+                if "_type" in value and value["_type"] == "custom":
+                    son[key] = decode_custom(value)
+                else: # Again, make sure to recurse into sub-docs
+                    son[key] = self.transform_outgoing(value, collection)
+        return son
+
+
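+#Shared helpers for Observation and Variable: hashing dates to epoch-based keys
+#and encoding instances to BSON-compatible dicts.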
+class Data:
+    def __hash__(self, date):
+        #return hash(self.convert_date_to_epoch(date))
+        return int(self.convert_date_to_epoch(date))
+
+    def encode_to_bson(self):
+        kwargs = dict([(str(key), value) for key, value in self.__dict__.iteritems()])
+        for key, value in kwargs.iteritems():
+            if isinstance(value, dict):
+                d = dict([(str(k), v) for k, v in value.iteritems()])
+                kwargs[key] = d
+
+        kwargs['_type'] = self._type
+        return kwargs
+        #return {'_type': 'c', 'x': var.x()}
+
+    def convert_seconds_to_date(self, secs):
+        #return time.gmtime(secs)
+        return datetime.datetime.fromtimestamp(secs)
+
+    def convert_date_to_epoch(self, date):
+        assert self.time_unit == 'year' or self.time_unit == 'month' \
+            or self.time_unit == 'day'
+
+        if self.time_unit == 'year':
+            datum = datetime.datetime(date.year, 1, 1)
+            return time.mktime(datum.timetuple())
+        elif self.time_unit == 'month':
+            datum = datetime.datetime(date.year, date.month, 1)
+            return time.mktime(datum.timetuple())
+        else:
+            return time.mktime(date.timetuple())
+
+
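+#A single time bucket (year, month or day, depending on time_unit) holding the
+#values recorded for that period.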
+class Observation(Data):
+    def __init__(self, date, time_unit):
+        assert isinstance(date, datetime.datetime)
+        self.time_unit = time_unit
+        self.t0 = self.set_start_date(date)
+        self.t1 = self.set_end_date(date)
+        self.hash = self.__hash__(date)
+        self.data = {}
+        self._type = 'observation'
+
+    def __repr__(self):
+        return '%s' % self.t1
+
+    def __str__(self):
+        return 'range: %s:%s' % (self.t0, self.t1)
+
+    def __iter__(self):
+        for key in self.data:
+            yield self.data[key]
+
+    def next(self):
+        return len(self.data) + 1
+
+    def set_start_date(self, date):
+        if self.time_unit == 'year':
+            return datetime.datetime(date.year, 1, 1)
+        elif self.time_unit == 'month':
+            return datetime.datetime(date.year, date.month, 1)
+        else:
+            return datetime.datetime(date.year, date.month, date.day)
+
+    def set_end_date(self, date):
+        if self.time_unit == 'year':
+            return datetime.datetime(date.year, 12, 31)
+        elif self.time_unit == 'month':
+            return datetime.datetime(date.year, date.month, calendar.monthrange(date.year, date.month)[1])
+        else:
+            return datetime.datetime(date.year, date.month, date.day)
+
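+    #update=True adds the value(s) to the running totals for this time bucket;
+    #update=False stores them under new keys instead of aggregating.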
+    def add(self, value, update):
+        if not hasattr(value, '__iter__'):
+            value = {0: value}
+        assert isinstance(value, dict)
+        x = self.next()
+        for i, v in value.iteritems():
+            self.data.setdefault(i, 0)
+            if update:
+                self.data[i] += v
+            else:
+                i += x
+                self.data[i] = v
+
+
+class Variable(Data):
+    '''
+    This class constructs a time-based variable and provides some simple
+    descriptive statistics.
+    '''
+    def __init__(self, name, time_unit, **kwargs):
+        self.name = name
+        self.obs = {}
+        self.time_unit = time_unit
+        self._type = 'variable'
+        #self.stats = stats
+        for kw in kwargs:
+            setattr(self, kw, kwargs[kw])
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return self.name
+
+    def __getitem__(self, key):
+        return self.obs[key]
+
+    def __iter__(self):
+        dates = self.obs.keys()
+        dates.sort()
+        for date in dates:
+            yield date
+
+
+    def __len__(self):
+        return len(self.obs)
+
+    def itervalues(self):
+        #yield every recorded value, across all dates
+        for date in self:
+            for key in self.obs[date].data.keys():
+                yield self.obs[date].data[key]
+
+    def iteritems(self):
+        for date in self:
+            for key in self.obs[date].data.keys():
+                yield (key, self.obs[date].data[key])
+
+    def get_observation(self, date):
+        key = self.__hash__(date)
+        return self.obs.get(key, Observation(date, self.time_unit))
+
+    def min(self):
+        return min(self.itervalues())
+
+    def max(self):
+        return max(self.itervalues())
+
+    def get_standard_deviation(self, number_list):
+        if len(number_list) < 2:
+            return '.'
+        mean = self.get_mean(number_list)
+        std = 0
+        n = len(number_list)
+        for i in number_list:
+            std = std + (i - mean) ** 2
+        return math.sqrt(std / float(n - 1))
+
+
+    def get_median(self, number_list):
+        if number_list == []:
+            return '.'
+        data = sorted(number_list)
+        data = [float(x) for x in data]
+        if len(data) % 2 == 1:
+            return data[(len(data) + 1) / 2 - 1]
+        else:
+            lower = data[len(data) / 2 - 1]
+            upper = data[len(data) / 2]
+            return (lower + upper) / 2
+
+
+    def get_mean(self, number_list):
+        if number_list == []:
+            return '.'
+        float_nums = [float(x) for x in number_list]
+        return sum(float_nums) / len(number_list)
+
+    def summary(self):
+        values = [v for v in self.itervalues()]
+        print 'Variable: %s' % self.name
+        print 'Mean: %s' % self.get_mean(values)
+        print 'Median: %s' % self.get_median(values)
+        print 'Standard Deviation: %s' % self.get_standard_deviation(values)
+        print 'Minimum: %s' % self.min()
+        print 'Maximum: %s' % self.max()
+
+
+    def add(self, date, value, update=True):
+        data = self.get_observation(date)
+        data.add(value, update)
+        self.obs[data.hash] = data
+
+
+class Dataset:
+    '''
+    This class acts as a container for Variable instances and provides methods
+    to write the dataset to a csv file.
+    '''
+    def __init__(self, name, vars=None):
+        self.name = '%s.csv' % name
+        self.vars = []
+        self.format = 'long'
+        self._type = 'dataset'
+        for kwargs in (vars or []):
+            name = kwargs.pop('name')
+            setattr(self, name, Variable(name, **kwargs))
+            self.vars.append(name)
+
+    def __repr__(self):
+        return 'Dataset contains %s variables' % (len(self.vars))
+
+    def __iter__(self):
+        for var in self.vars:
+            yield getattr(self, var)
+
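+    #collect every key used in any row; after sorting, the 'date' key ends up
+    #last and is moved to the front so it becomes the first column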
+    def get_all_keys(self, data):
+        all_keys = []
+        for d in data:
+            for key in d:
+                if key not in all_keys:
+                    all_keys.append(key)
+        all_keys.sort()
+        all_keys.insert(0, all_keys[-1])
+        del all_keys[-1]
+        return all_keys
+
+    def make_data_rectangular(self, data, all_keys):
+        for i, d in enumerate(data):
+            for key in all_keys:
+                if key not in d:
+                    d[key] = 0
+            data[i] = d
+        return data
+
+    def sort(self, data, all_keys):
+        dates = [date['date'] for date in data]
+        dates.sort()
+        cube = []
+        for date in dates:
+            for i, d in enumerate(data):
+                if d['date'] == date:
+                    raw_data = d
+                    del data[i]
+                    break
+            obs = []
+            for key in all_keys:
+                obs.append(raw_data[key])
+            cube.append(obs)
+        return cube
+
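+    #Flatten the variables into either 'long' rows ([date, key, value], one per
+    #observation) or 'wide' rows (one dict per date with a column per key).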
+    def convert_dataset_to_lists(self):
+        assert self.format == 'long' or self.format == 'wide'
+        data, all_keys = [], []
+        for var in self:
+            for date in var.obs.keys():
+                datum = var.convert_seconds_to_date(date)
+                if self.format == 'long':
+                    o = []
+                else:
+                    o = {}
+                    o['date'] = datum
+
+                for obs in var[date].data:
+                    if self.format == 'long':
+                        o.append([datum, obs, var.obs[date].data[obs]])
+                        data.extend(o)
+                        o = []
+                    else:
+                        o[obs] = var.obs[date].data[obs]
+                        #o.append({obs: var.obs[date].data[obs]})
+                if self.format == 'wide':
+                    data.append(o)
+        if self.format == 'wide':
+            #Make sure that each variable / observation combination exists.
+            all_keys = self.get_all_keys(data)
+            data = self.make_data_rectangular(data, all_keys)
+            data = self.sort(data, all_keys)
+        return data, all_keys
+
+    def write(self, format='csv'):
+        if format == 'csv':
+            self.to_csv()
+
+    def to_csv(self):
+        data, all_keys = self.convert_dataset_to_lists()
+        headers = self.add_headers(all_keys)
+        fh = utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding)
+        utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format)
+        utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
+        fh.close()
+
+    def add_headers(self, all_keys):
+        assert self.format == 'long' or self.format == 'wide'
+        headers = []
+        if self.format == 'long':
+            headers.append('date')
+        for var in self:
+            if self.format == 'long':
+                headers.extend([var.time_unit, var.name])
+            else:
+                for key in all_keys:
+                    header = '%s_%s' % (key, var.name)
+                    headers.append(header)
+        return headers
+
+    def encode(self):
+        bson = {}
+        for var in self:
+            dates = var.obs.keys()
+            dates.sort()
+            bson[var.name] = {}
+            for date in dates:
+                obs = var[date]
+                key = str(obs.hash)
+                bson[var.name][key] = obs.encode_to_bson()
+        return bson
+
+    def encode_to_bson(self, var):
+        return {'_type': 'dataset', 'x': var.x()}
+
+
+    def decode_from_bson(self, document):
+        assert document["_type"] == "custom"
+        return self(document["x"])
+
+def debug():
+    mongo = db.init_mongo_db('enwiki')
+    rawdata = mongo['test']
+    mongo.add_son_manipulator(Transform())
+    date = datetime.datetime.today()
+    ds = Dataset('test', [{'name': 'count', 'time_unit': 'year'}])
+    ds.count.add(date, 5)
+    #ds.summary()
+    #ds.write_to_csv()
+    v = Variable('test', 'year')
+    ds.encode()
+    mongo.test.insert({'dataset': ds})
+
+    #v.add(date, 5)
+    #o = v.get_observation(date)
+
+    #v.summary()
+    print ds
+
+
+if __name__ == '__main__':
+    debug()
Property changes on: trunk/tools/editor_trends/analyses/dataset.py
___________________________________________________________________
Added: svn:eol-style
   + native
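
For reference, a minimal usage sketch mirroring the debug() routine above. It assumes the module is importable as analyses.dataset, that settings.dataset_location points to a writable directory, and that the utils csv helpers called in to_csv are available; the dataset name 'demo' is just an example value.

    import datetime

    from analyses.dataset import Dataset, Transform
    from database import db

    #one variable, bucketed per year
    ds = Dataset('demo', [{'name': 'count', 'time_unit': 'year'}])
    ds.count.add(datetime.datetime(2010, 3, 1), 5)   #starts the 2010 bucket at 5
    ds.count.add(datetime.datetime(2010, 9, 1), 3)   #same yearly bucket, aggregated to 8
    ds.count.summary()                               #prints mean / median / stdev / min / max
    ds.write(format='csv')                           #writes demo.csv to settings.dataset_location

    #storing the dataset in MongoDB relies on the Transform SON manipulator
    mongo = db.init_mongo_db('enwiki')
    mongo.add_son_manipulator(Transform())
    mongo.test.insert({'dataset': ds})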