r80726 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r80725‎ | r80726 | r80727 >
Date:23:10, 21 January 2011
Author:diederik
Status:deferred
Tags:
Comment:
Dataset is a custom class to keep track of multiple time-varying variables (variables are used here in the social-science sense of the word: a variable has a different value depending on the time of observation). In addition, Dataset offers functionality to calculate simple statistics and to write output in different formats, including:
* csv (long & wide)
* mongodb
Modified paths:
  • /trunk/tools/editor_trends/analyses/dataset.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -0,0 +1,400 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-14'
 19+__version__ = '0.1'
 20+
 21+import calendar
 22+import datetime
 23+import time
 24+import math
 25+import sys
 26+from pymongo.son_manipulator import SONManipulator
 27+
 28+
 29+sys.path.append('..')
 30+import configuration
 31+settings = configuration.Settings()
 32+
 33+from utils import utils
 34+from database import db
 35+
class Transform(SONManipulator):
    '''
    SON manipulator that swaps Dataset instances for their encoded form when
    a document is saved, and hands tagged sub-documents to a decoder when a
    document is read back.
    '''

    def transform_incoming(self, son, collection):
        '''
        Replace every top-level Dataset value of `son` by the result of its
        encode() method and return the same (mutated) document. Nested
        documents are not inspected.
        '''
        for field, candidate in son.items():
            if not isinstance(candidate, Dataset):
                continue
            son[field] = candidate.encode()
        return son

    def transform_outgoing(self, son, collection):
        '''
        Walk the dict values of `son`: a sub-document whose "_type" equals
        "custom" is replaced by its decoded form, any other sub-document is
        processed recursively. Returns the same (mutated) document.
        '''
        for field, candidate in son.items():
            if not isinstance(candidate, dict):
                continue
            if candidate.get("_type") == "custom":
                # NOTE(review): decode_custom is not defined in the visible
                # part of this file - confirm where it is supposed to come
                # from before relying on this branch.
                son[field] = decode_custom(candidate)
            else:
                son[field] = self.transform_outgoing(candidate, collection)
        return son
 53+
 54+
class Data:
    '''
    Base class shared by Observation and Variable. It offers the conversion
    between dates and epoch seconds and a generic BSON-friendly encoder.
    Subclasses are expected to set `time_unit` ('year', 'month' or 'day')
    and `_type` before these methods are used.
    '''

    def __hash__(self, date):
        '''
        Return the integer key of the time bucket that `date` falls in.

        NOTE(review): this method requires a `date` argument and is called
        explicitly (self.__hash__(date)) elsewhere in this file, so the
        builtin hash(instance) cannot be used on these objects.
        '''
        return int(self.convert_date_to_epoch(date))

    def encode_to_bson(self):
        '''
        Return a copy of the instance attributes as a dict with string keys.
        Attribute values that are dicts are copied with their keys converted
        to strings as well (one level deep); all other values are passed
        through unchanged. The '_type' entry is always set from self._type.
        '''
        document = {}
        for key, value in self.__dict__.items():
            if isinstance(value, dict):
                value = dict((str(k), v) for k, v in value.items())
            document[str(key)] = value
        document['_type'] = self._type
        return document

    def convert_seconds_to_date(self, secs):
        '''
        Convert epoch seconds to a naive datetime in local time.
        '''
        return datetime.datetime.fromtimestamp(secs)

    def convert_date_to_epoch(self, date):
        '''
        Return the epoch seconds (local time, as produced by time.mktime) of
        the start of the year, month or day that `date` belongs to, depending
        on self.time_unit.
        '''
        assert self.time_unit in ('year', 'month', 'day')

        if self.time_unit == 'year':
            datum = datetime.datetime(date.year, 1, 1)
        elif self.time_unit == 'month':
            datum = datetime.datetime(date.year, date.month, 1)
        else:
            # Truncate to midnight so that every moment of the same day maps
            # to the same key, just like the year and month buckets do.
            datum = datetime.datetime(date.year, date.month, date.day)
        return time.mktime(datum.timetuple())
 88+
 89+
class Observation(Data):
    '''
    All values recorded for one time bucket (a year, month or day) of a
    variable. The values live in the dict `data`; `t0` and `t1` are the first
    and last day of the bucket and `hash` is the bucket key.
    '''

    def __init__(self, date, time_unit):
        '''
        date: datetime.datetime that falls inside the bucket.
        time_unit: 'year', 'month' or 'day'.
        '''
        assert isinstance(date, datetime.datetime)
        self.time_unit = time_unit
        self.t0 = self.set_start_date(date)
        self.t1 = self.set_end_date(date)
        self.hash = self.__hash__(date)
        self.data = {}
        self._type = 'observation'

    def __repr__(self):
        return '%s' % self.t1

    def __str__(self):
        return 'range: %s:%s' % (self.t0, self.t1)

    def __iter__(self):
        '''
        Yield the stored values. (The values are kept in self.data; an
        Observation has no `obs` attribute.)
        '''
        for key in self.data:
            yield self.data[key]

    def next(self):
        '''
        Return the offset that add() applies to incoming keys when values are
        appended instead of accumulated: the number of stored values plus one.
        '''
        return len(self.data) + 1

    def set_start_date(self, date):
        '''
        Return midnight of the first day of the bucket containing `date`.
        '''
        if self.time_unit == 'year':
            return datetime.datetime(date.year, 1, 1)
        elif self.time_unit == 'month':
            return datetime.datetime(date.year, date.month, 1)
        else:
            return datetime.datetime(date.year, date.month, date.day)

    def set_end_date(self, date):
        '''
        Return midnight of the last day of the bucket containing `date`.
        '''
        if self.time_unit == 'year':
            return datetime.datetime(date.year, 12, 31)
        elif self.time_unit == 'month':
            last_day = calendar.monthrange(date.year, date.month)[1]
            return datetime.datetime(date.year, date.month, last_day)
        else:
            return datetime.datetime(date.year, date.month, date.day)

    def add(self, value, update):
        '''
        Store `value` in this observation.

        value: a dict of key -> value pairs, or a single non-iterable value
            which is treated as {0: value}.
        update: when true, each value is added to whatever is already stored
            under the same key (missing keys start at 0). When false, the
            values are appended under new keys (key + self.next()) and the
            existing entries are left alone.
        '''
        if not hasattr(value, '__iter__'):
            value = {0: value}
        assert isinstance(value, dict)
        # The offset is computed once so that all values of a single call are
        # shifted by the same amount.
        offset = self.next()
        for key, v in value.items():
            if update:
                self.data.setdefault(key, 0)
                self.data[key] += v
            else:
                # Appending must not create a zero entry under the original
                # key; that would pollute the data and skew later offsets.
                self.data[key + offset] = v
 146+
 147+
class Variable(Data):
    '''
    This class constructs a time-based variable and has some associated simple
    statistical descriptives. Observations are kept in the dict `obs`, keyed
    by the bucket key of their date.
    '''
    def __init__(self, name, time_unit, **kwargs):
        '''
        name: name of the variable.
        time_unit: 'year', 'month' or 'day'.
        kwargs: any additional keyword is stored as an attribute.
        '''
        self.name = name
        self.obs = {}
        self.time_unit = time_unit
        self._type = 'variable'
        for kw in kwargs:
            setattr(self, kw, kwargs[kw])

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __getitem__(self, key):
        return self.obs[key]

    def __iter__(self):
        '''
        Yield the observation keys in ascending order.
        '''
        for date in sorted(self.obs):
            yield date

    def __len__(self):
        '''
        Return the total number of values stored over all observations.
        '''
        return sum(len(self.obs[date].data) for date in self.obs)

    def _values(self):
        '''
        Yield every stored value, observation by observation in key order.
        '''
        for date in self:
            observation = self.obs[date]
            for key in observation.data.keys():
                yield observation.data[key]

    def obs(self):
        '''
        Yield every stored value.

        NOTE(review): __init__ assigns an instance attribute with the same
        name, so on instances `obs` is the dict of observations and this
        method is only reachable as Variable.obs(instance).
        '''
        return self._values()

    def iteritems(self):
        '''
        Yield (key, value) pairs of every observation in key order.
        '''
        for date in self:
            for value in self.obs[date].data.keys():
                yield (value, self.obs[date].data[value])

    def get_observation(self, date):
        '''
        Return the observation for the bucket containing `date`; a new,
        not yet stored, Observation is returned if there is none.
        '''
        key = self.__hash__(date)
        if key in self.obs:
            return self.obs[key]
        return Observation(date, self.time_unit)

    def min(self):
        '''
        Return the smallest stored value; raises ValueError if there are no
        values.
        '''
        return min(self._values())

    def max(self):
        '''
        Return the largest stored value; raises ValueError if there are no
        values.
        '''
        return max(self._values())

    def get_standard_deviation(self, number_list):
        '''
        Return the sample standard deviation (n - 1 denominator) of
        number_list, or '.' when fewer than two numbers are given.
        '''
        n = len(number_list)
        if n < 2:
            return '.'
        mean = self.get_mean(number_list)
        squared = sum((float(x) - mean) ** 2 for x in number_list)
        return math.sqrt(squared / float(n - 1))

    def get_median(self, number_list):
        '''
        Return the median of number_list as a float, or '.' when the list is
        empty.
        '''
        if not number_list:
            return '.'
        data = [float(x) for x in sorted(number_list)]
        # Floor division keeps the index an integer on Python 2 and 3.
        middle = len(data) // 2
        if len(data) % 2 == 1:
            return data[middle]
        return (data[middle - 1] + data[middle]) / 2

    def get_mean(self, number_list):
        '''
        Return the arithmetic mean of number_list as a float, or '.' when the
        list is empty.
        '''
        if not number_list:
            return '.'
        float_nums = [float(x) for x in number_list]
        return sum(float_nums) / len(number_list)

    def summary(self):
        '''
        Print the descriptive statistics of all stored values; '.' is printed
        for a statistic that cannot be computed.
        '''
        # The statistics need the values themselves, iterating the variable
        # only yields the observation keys.
        values = list(self._values())
        print('Variable: %s' % self.name)
        print('Mean: %s' % self.get_mean(values))
        print('Median: %s' % self.get_median(values))
        print('Standard Deviation: %s' % self.get_standard_deviation(values))
        if values:
            print('Minimum: %s' % min(values))
            print('Maximum: %s' % max(values))
        else:
            print('Minimum: %s' % '.')
            print('Maximum: %s' % '.')

    def add(self, date, value, update=True):
        '''
        Add `value` to the observation of the bucket containing `date`,
        creating the observation if needed. See Observation.add for the
        meaning of `update`.
        '''
        data = self.get_observation(date)
        data.add(value, update)
        self.obs[data.hash] = data
 244+
 245+
class Dataset:
    '''
    This class acts as a container for the Variable class and has some methods
    to output the dataset to a csv file.
    '''
    def __init__(self, name, vars=None):
        '''
        name: name of the dataset; '.csv' is appended to form the filename.
        vars: optional list of dicts, each with a 'name' entry plus the
            keyword arguments for Variable (e.g. 'time_unit'). Every variable
            becomes an attribute of the dataset named after the variable.
            Empty dicts are ignored and the dicts of the caller are not
            modified.
        '''
        self.name = '%s.csv' % name
        self.vars = []
        self.format = 'long'
        self._type = 'dataset'
        for spec in (vars or []):
            if not spec:
                continue
            kwargs = dict(spec)
            var_name = kwargs.pop('name')
            setattr(self, var_name, Variable(var_name, **kwargs))
            self.vars.append(var_name)

    def __repr__(self):
        return 'Dataset contains %s variables' % (len(self.vars))

    def __iter__(self):
        '''
        Yield the Variable instances in the order in which they were added.
        '''
        for var in self.vars:
            yield getattr(self, var)

    def get_all_keys(self, data):
        '''
        Return the distinct keys used by the dicts in `data`: 'date' first
        (when present) followed by the other keys in sorted order. An empty
        `data` gives an empty list.
        '''
        seen = set()
        for row in data:
            seen.update(row)
        all_keys = sorted(key for key in seen if key != 'date')
        if 'date' in seen:
            all_keys.insert(0, 'date')
        return all_keys

    def make_data_rectangular(self, data, all_keys):
        '''
        Give every dict in `data` a value (0) for each key of all_keys that
        it lacks. The dicts are changed in place; `data` is returned.
        '''
        for row in data:
            for key in all_keys:
                row.setdefault(key, 0)
        return data

    def sort(self, data, all_keys):
        '''
        Return the rows of `data` ordered by their 'date' entry, each row
        converted to a list of its values in the order of all_keys. Rows
        with the same date keep their relative order and `data` itself is
        left untouched.
        '''
        ordered = sorted(data, key=lambda row: row['date'])
        return [[row[key] for key in all_keys] for row in ordered]

    def convert_dataset_to_lists(self):
        '''
        Return (data, all_keys).

        In 'long' format data holds one [date, key, value] list per stored
        value and all_keys is empty. In 'wide' format data holds one list per
        observation with the values in the order of all_keys, and missing
        values are set to 0. Observations are visited in key order.
        '''
        assert self.format == 'long' or self.format == 'wide'
        data, all_keys = [], []
        for var in self:
            for date in sorted(var.obs):
                datum = var.convert_seconds_to_date(date)
                observation = var[date]
                if self.format == 'long':
                    for key in observation.data:
                        data.append([datum, key, observation.data[key]])
                else:
                    row = {'date': datum}
                    row.update(observation.data)
                    data.append(row)
        if self.format == 'wide':
            #Make sure that each variable / observation combination exists.
            all_keys = self.get_all_keys(data)
            data = self.make_data_rectangular(data, all_keys)
            data = self.sort(data, all_keys)
        return data, all_keys

    def write(self, format='csv'):
        '''
        Write the dataset in the requested format. Only 'csv' is handled;
        any other value is ignored.
        '''
        if format == 'csv':
            self.to_csv()

    def to_csv(self):
        '''
        Write the headers and the data of the dataset to the csv file
        self.name in settings.dataset_location.
        '''
        data, all_keys = self.convert_dataset_to_lists()
        headers = self.add_headers(all_keys)
        # NOTE(review): file_utils is not imported in the visible part of
        # this file (only `from utils import utils` is) - confirm the module
        # that provides these helpers.
        fh = file_utils.create_txt_filehandle(settings.dataset_location, self.name, 'w', settings.encoding)
        try:
            file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True, format=self.format)
            file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
        finally:
            # Release the handle even when one of the writes fails.
            fh.close()

    def add_headers(self, all_keys):
        '''
        Return the list of column headers. In 'long' format: 'date' followed
        by the time unit and name of every variable. In 'wide' format: one
        '<key>_<variable name>' header per key of all_keys and variable.
        '''
        assert self.format == 'long' or self.format == 'wide'
        headers = []
        if self.format == 'long':
            headers.append('date')
        for var in self:
            if self.format == 'long':
                headers.extend([var.time_unit, var.name])
            else:
                for key in all_keys:
                    headers.append('%s_%s' % (key, var.name))
        return headers

    def encode(self):
        '''
        Return a dict that maps every variable name to a dict of its encoded
        observations, keyed by the string form of the observation hash.
        '''
        bson = {}
        for var in self:
            bson[var.name] = {}
            for date in sorted(var.obs):
                obs = var[date]
                bson[var.name][str(obs.hash)] = obs.encode_to_bson()
        # NOTE(review): debugging output kept from the original code;
        # consider removing it or switching to the logging module.
        print(bson)
        return bson

    def encode_to_bson(self, var):
        '''
        Return a document tagged as 'dataset' that wraps var.x().
        NOTE(review): looks like a placeholder - confirm what `var` is
        expected to be.
        '''
        return {'_type': 'dataset', 'x': var.x()}

    def decode_from_bson(self, document):
        '''
        Build a new dataset from a document tagged "custom", using its "x"
        entry as the dataset name.
        NOTE(review): encode_to_bson tags documents as 'dataset', not
        'custom' - confirm which tag is intended.
        '''
        assert document["_type"] == "custom"
        # An instance is not callable; construct through its class instead.
        return self.__class__(document["x"])
 379+
def debug():
    '''
    Manual smoke test: build a dataset with one yearly variable, add a value
    for today, encode it and insert it in the 'test' collection of the
    'enwiki' database through the Transform manipulator.
    '''
    mongo = db.init_mongo_db('enwiki')
    mongo.add_son_manipulator(Transform())
    date = datetime.datetime.today()
    ds = Dataset('test', [{'name': 'count', 'time_unit': 'year'}])
    ds.count.add(date, 5)
    ds.encode()
    mongo.test.insert({'dataset': ds})
    print(ds)


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/analyses/dataset.py
___________________________________________________________________
Added: svn:eol-style
1402 + native

Status & tagging log