r82003 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r82002‎ | r82003 | r82004 >
Date:01:06, 12 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
Another round of moving files to their appropriate places.
Modified paths:
  • /trunk/tools/editor_trends/classes/dataset.py (added) (history)
  • /trunk/tools/editor_trends/classes/settings.py (added) (history)
  • /trunk/tools/editor_trends/classes/singleton.py (added) (history)

Diff [purge]

Index: trunk/tools/editor_trends/classes/settings.py
@@ -0,0 +1,191 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
 20+
 21+'''
 22+This file contains settings that are used for constructing and analyzing
 23+the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
 24+'''
 25+
 26+from multiprocessing import cpu_count
 27+import ConfigParser
 28+import os
 29+import sys
 30+import platform
 31+import subprocess
 32+
 33+
 34+from classes import exceptions
 35+from classes import singleton
 36+
 37+try:
 38+ from _winreg import *
 39+ from pywin import win32file
 40+ '''increase the maximum number of open files on Windows to 1024'''
 41+ win32file._setmaxstdio(1024)
 42+except ImportError:
 43+ pass
 44+
 45+try:
 46+ import resource
 47+except ImportError:
 48+ pass
 49+
class Settings:
    '''
    Runtime configuration for the editor_trends tools: date and timestamp
    formats, dump locations, platform details and the directories that are
    used for input, logging and output.
    '''
    #__metaclass__ = singleton.Singleton

    def __init__(self, process_multiplier=1):
        '''
        Detect the environment and initialise every setting.

        process_multiplier scales the number of worker processes relative to
        the number of CPUs reported by multiprocessing.cpu_count().
        Raises exceptions.OutDatedPythonVersionError when the interpreter is
        older than minimum_python_version.
        '''
        self.minimum_python_version = (2, 6)
        self.detect_python_version()
        self.encoding = 'utf-8'

        #Date format as used by Erik Zachte
        self.date_format = '%Y-%m-%d'

        # Timestamp format as generated by the MediaWiki dumps
        self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
        self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z'
        # 67108864 == 64Mb, see
        # http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets
        # for the reason
        self.max_xmlfile_size = 4096 * 1024

        #Change this to match your computers configuration (RAM / CPU)
        self.number_of_processes = cpu_count() * process_multiplier

        self.wp_dump_location = 'http://dumps.wikimedia.org'
        self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
        #Extensions of ascii files, this is used to determine the filemode to use
        self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
        # Registry sub key (below HKEY_CURRENT_USER) per Windows executable
        self.windows_register = {'7z.exe': 'Software\\7-Zip', }
        self.platform = self.determine_platform()

        self.architecture = platform.machine()
        self.working_directory = self.determine_working_directory()
        # The module search path is printed before and after it is extended.
        print(sys.path)
        self.update_python_path()
        print(sys.path)

        self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\'
        self.max_filehandles = self.determine_max_filehandles_open()
        self.tab_width = 4 if self.platform == 'Windows' else 8

        # wiki.cfg, when present, supplies working_directory and
        # input_location; otherwise fall back to <root>/wikimedia.
        if not self.load_configuration():
            self.input_location = os.path.join(self.root, 'wikimedia')

        # Default Input file
        self.input_filename = os.path.join(self.input_location, 'en',
                                           'wiki',
                                           'enwiki-20100916-stub-meta-history.xml')
        # This is the place where error messages are stored for debugging purposes
        self.log_location = os.path.join(self.working_directory, 'logs')
        self.csv_location = os.path.join(self.working_directory, 'data', 'csv')
        self.dataset_location = os.path.join(self.working_directory, 'datasets')
        self.binary_location = os.path.join(self.working_directory,
                                            'data', 'objects')
        self.chart_location = os.path.join(self.working_directory, 'statistics',
                                           'charts')
        self.file_choices = ('stub-meta-history.xml.gz',
                             'stub-meta-current.xml.gz',
                             'pages-meta-history.xml.7z',
                             'pages-meta-current.xml.bz2',)

    def load_configuration(self):
        '''
        Read wiki.cfg from the working directory when it exists and copy its
        file locations and default project / language onto this instance.
        Returns True when the file was found and read, False otherwise.
        '''
        config_file = os.path.join(self.working_directory, 'wiki.cfg')
        if not os.path.exists(config_file):
            return False
        config = ConfigParser.RawConfigParser()
        config.read(config_file)
        self.working_directory = config.get('file_locations', 'working_directory')
        self.input_location = config.get('file_locations', 'input_location')
        self.default_project = config.get('wiki', 'project')
        self.default_language = config.get('wiki', 'language')
        return True

    def determine_working_directory(self):
        '''
        Return the current directory cut off just after the first
        'editor_trends' path component (including the separator that follows
        it, if any). When the current directory does not contain
        'editor_trends' it is returned unchanged.
        '''
        marker = 'editor_trends'
        cwd = os.getcwd()
        if cwd.endswith('%s%s' % (marker, os.sep)):
            return cwd
        pos = cwd.find(marker)
        if pos == -1:
            # Previously find() == -1 silently truncated the path to its
            # first 13 characters.
            return cwd
        return cwd[:pos + len(marker) + 1]

    def detect_python_version(self):
        '''
        Raise exceptions.OutDatedPythonVersionError when the running
        interpreter is older than self.minimum_python_version.
        '''
        version = sys.version_info[0:2]
        #logger.debug('Python version: %s' % '.'.join(str(version)))
        if version < self.minimum_python_version:
            raise exceptions.OutDatedPythonVersionError

    def determine_platform(self):
        '''Return platform.system(), reporting 'Darwin' as 'OSX'.'''
        system = platform.system()
        if system == 'Darwin':
            return 'OSX'
        return system

    def verify_environment(self, directories):
        '''
        Create every directory in directories that does not exist yet. A
        failure is reported on stdout and does not stop the loop.
        '''
        for directory in directories:
            if os.path.exists(directory):
                continue
            try:
                os.makedirs(directory)
            except OSError:
                # os.makedirs signals failure with OSError, not IOError.
                print('Configuration Error, could not create directory %s.' % directory)

    def detect_windows_program(self, program):
        '''
        Look up the 'Path' value of the registry key that is registered for
        program in self.windows_register. Returns None when the program is
        not registered or the key / value cannot be read.
        '''
        entry = self.windows_register.get(program, None)
        if entry is None:
            return None
        try:
            key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
            try:
                return QueryValueEx(key, 'Path')[0]
            finally:
                CloseKey(key)
        except WindowsError:
            return None

    def detect_linux_program(self, program):
        '''Return the stripped output of `which <program>`.'''
        path = subprocess.Popen(['which', '%s' % program],
                                stdout=subprocess.PIPE).communicate()[0]
        return path.strip()

    def detect_installed_program(self, program):
        '''
        Return the location of program, or None when it cannot be determined.
        Windows uses the registry (an '.exe' suffix is added when missing),
        Linux and OSX use `which`; other platforms yield None.
        '''
        path = None
        if self.platform == 'Windows':
            if not program.endswith('.exe'):
                program = program + '.exe'
            path = self.detect_windows_program(program)
            if path is not None:
                path = path + program
        elif self.platform in ('Linux', 'OSX'):
            path = self.detect_linux_program(program)
        return path

    def determine_max_filehandles_open(self):
        '''
        Return the number of file handles this process may keep open: the
        stdio maximum on 32-bit Windows, the soft RLIMIT_NOFILE minus 100 on
        non-Windows platforms and 500 otherwise.
        '''
        if self.platform == 'Windows' and self.architecture == 'i386':
            return win32file._getmaxstdio()
        elif self.platform != 'Windows':
            return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
        else:
            return 500

    def update_python_path(self):
        '''
        Append every sub directory of the working directory to sys.path,
        skipping hidden directories and the ones listed in IGNORE_DIRS.
        '''
        IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs',
                       'statistics', 'js_scripts', 'deployment',
                       'documentation', 'data', 'code-snippets']
        for subdirname in os.listdir(self.working_directory):
            full_path = os.path.join(self.working_directory, subdirname)
            if not os.path.isdir(full_path):
                continue
            if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
                sys.path.append(full_path)
Property changes on: trunk/tools/editor_trends/classes/settings.py
___________________________________________________________________
Added: svn:eol-style
1193 + native
Index: trunk/tools/editor_trends/classes/singleton.py
@@ -0,0 +1,34 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2011-02-11'
 19+__version__ = '0.1'
 20+
 21+
class Singleton(type):
    '''
    Metaclass that creates at most one instance per class and hands that same
    instance out on every subsequent call.
    Recipe: http://stackoverflow.com/questions/31875/is-there-a-simple-elegant-way-to-define-singletons-in-python
    '''
    def __init__(cls, name, bases, dict):
        super(Singleton, cls).__init__(name, bases, dict)
        # No instance exists until the class is called for the first time.
        cls.instance = None

    def __call__(cls, *args, **kw):
        # Arguments only matter on the first call; later calls return the
        # cached instance untouched.
        if cls.instance is not None:
            return cls.instance
        cls.instance = super(Singleton, cls).__call__(*args, **kw)
        return cls.instance
Property changes on: trunk/tools/editor_trends/classes/singleton.py
___________________________________________________________________
Added: svn:eol-style
136 + native
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -0,0 +1,480 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-14'
 19+__version__ = '0.1'
 20+
 21+import calendar
 22+import datetime
 23+import time
 24+import math
 25+import operator
 26+import sys
 27+from pymongo.son_manipulator import SONManipulator
 28+from multiprocessing import Lock
 29+
 30+
 31+sys.path.append('..')
 32+import configuration
 33+settings = configuration.Settings()
 34+
 35+from utils import file_utils
 36+from utils import data_converter
 37+from database import db
 38+import json_encoders
 39+
class Transform(SONManipulator):
    '''
    SON manipulator that converts between Dataset objects and the plain
    documents that are stored in MongoDB.
    To use it, initialize a mongo database instance and then add:
    mongo.add_son_manipulator(Transform())
    '''
    def transform_incoming(self, son, collection):
        '''
        Replace every dataset in son by a dict of its encoded variables and
        copy the dataset's own properties to the top level of son.
        '''
        for (key, dataset) in son.items():
            encoded = {}
            son[key] = encoded
            for variable in dataset:
                if isinstance(variable, Variable):
                    encoded[variable.name] = variable.encode()
            for prop in dataset.props:
                son[prop] = getattr(dataset, prop)
        return son

    def transform_outgoing(self, son, collection):
        '''
        Rebuild a Dataset from a stored document: dict values are decoded
        into Variable instances, the remaining keys become the Dataset's
        constructor arguments.
        '''
        for (key, value) in son.items():
            if not isinstance(value, dict):
                # Again, make sure to recurse into sub-docs
                son[key] = value
                continue
            for var_name in value.keys():
                decoded = Variable(var_name, None)
                decoded.decode(value)
                son['variables'][var_name] = decoded
        name = son.pop('name', None)
        project = son.pop('project', None)
        collection = son.pop('collection', None)
        language_code = son.pop('language_code', None)
        variables = son.pop('variables', [])
        # Whatever is left in son is passed on as keyword arguments.
        ds = Dataset(name, project, collection, language_code, **son)
        for var_name in variables:
            ds.add_variable(variables[var_name])
        return ds
 76+
 77+
class Data:
    '''
    Some generic functions that are required by the Observation, Variable, and
    Dataset classes.
    '''
    def __hash__(self, vars=None):
        '''
        Return the hash of the concatenated string forms of vars.

        Called without vars (as the builtin hash() does) it returns an
        identity based hash; previously hash(instance) raised a TypeError
        because vars was a required argument.
        '''
        if vars is None:
            return hash(id(self))
        return hash(''.join([str(var) for var in vars]))
        #return int(self.convert_date_to_epoch(date))

    def encode_to_bson(self, data=None):
        '''
        Return the instance attributes of data (or of self when data is None)
        as a dict with string keys. Dict valued attributes get string keys as
        well and Observation instances inside them are encoded recursively.
        '''
        source = self if data is None else data
        kwargs = dict([(str(key), value) for key, value in source.__dict__.items()])
        # Iterate over a snapshot because entries are replaced in the loop.
        for key, value in list(kwargs.items()):
            if isinstance(value, dict):
                encoded = {}
                for k, v in value.items():
                    if isinstance(v, Observation):
                        v = self.encode_to_bson(v)
                    encoded[str(k)] = v
                kwargs[key] = encoded
        return kwargs

    def convert_date_to_epoch(self, date):
        '''
        Return date as seconds since the epoch (local time, via time.mktime),
        truncated to the start of the year or month when self.time_unit is
        'year' or 'month'.
        '''
        assert self.time_unit in ('year', 'month', 'day'), \
            'Time unit should either be year, month or day.'

        if self.time_unit == 'year':
            datum = datetime.datetime(date.year, 1, 1)
            return int(time.mktime(datum.timetuple()))
        elif self.time_unit == 'month':
            datum = datetime.datetime(date.year, date.month, 1)
            return int(time.mktime(datum.timetuple()))
        elif self.time_unit == 'day':
            return int(time.mktime(date.timetuple()))
        # Only reachable when assertions are disabled.
        return date

    def set_date_range(self, date):
        '''
        Return a (last, first) tuple with the final and the first day of the
        year, month or day that contains date, depending on self.time_unit.
        Any time_unit other than 'year' or 'month' is treated as a day.
        '''
        if self.time_unit == 'year':
            return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1)
        elif self.time_unit == 'month':
            last_day = calendar.monthrange(date.year, date.month)[1]
            return (datetime.datetime(date.year, date.month, last_day),
                    datetime.datetime(date.year, date.month, 1))
        else:
            day = datetime.datetime(date.year, date.month, date.day)
            return day, datetime.datetime(date.year, date.month, date.day)
 126+
 127+
class Observation(Data):
    '''
    The smallest unit, here the actual data is being stored.
    Time_unit should either be 'year', 'month' or 'day'.
    '''
    # Class level lock: shared by every Observation instance.
    lock = Lock()

    def __init__(self, date, time_unit, id, meta):
        '''
        date has to be a datetime.datetime instance. Every key of meta is
        stored as an attribute of the observation and recorded in self.props.
        Raises Exception when a key of meta is a float.
        '''
        assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.'
        self.date = date
        self.data = 0
        self.time_unit = time_unit
        # set_date_range returns (last, first)
        self.t1, self.t0 = self.set_date_range(date)
        self.id = id
        self.props = []
        self.count = 0
        for mt in meta:
            if isinstance(mt, float):
                raise Exception('Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.')
            elif not isinstance(mt, list):
                setattr(self, mt, meta[mt])
                self.props.append(mt)
        self._type = 'observation'

    def __repr__(self):
        return '%s' % self.date

    def __str__(self):
        return 'range: %s:%s' % (self.t0, self.t1)

    def __iter__(self):
        '''
        Yield the stored data: the values of a dict, the elements of a list
        or, for a tallied observation, the single total. The previous
        implementation indexed self.data with its own elements and therefore
        failed for the int and list values that add() produces.
        '''
        if isinstance(self.data, dict):
            for key in self.data:
                yield self.data[key]
        elif isinstance(self.data, list):
            for item in self.data:
                yield item
        else:
            yield self.data

    def __getitem__(self, key):
        return getattr(self, key, [])

    def add(self, value):
        '''
        Store value in this observation. A list is appended as one element
        (the first list replaces the initial 0 by an empty list); any other
        value is added to the running total with +=. self.count is
        incremented on every call, even when storing the value fails.
        '''
        with self.lock:
            try:
                if isinstance(value, list):
                    if self.count == 0:
                        self.data = []
                    self.data.append(value)
                else:
                    self.data += value
            finally:
                self.count += 1

    def get_date_range(self):
        '''Return the covered range as 'month-day-year:month-day-year'.'''
        return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year,
                                      self.t1.month, self.t1.day, self.t1.year)
 185+
class Variable(Data):
    '''
    This class constructs a time-based variable.
    '''
    # Class level lock: shared by every Variable instance.
    lock = Lock()

    def __init__(self, name, time_unit, **kwargs):
        '''
        Create an empty variable. Every keyword argument becomes an attribute
        of the variable and is recorded in self.props.
        '''
        self.name = name
        self.obs = {}
        self.time_unit = time_unit
        self.groupbys = []
        self._type = 'variable'
        self.props = ['name', 'time_unit', '_type']
        for kw in kwargs:
            setattr(self, kw, kwargs[kw])
            self.props.append(kw)

    def __str__(self):
        return '%s' % self.name

    def __repr__(self):
        return '%s' % self.name

    def __getitem__(self, key):
        return getattr(self, key, [])

    def __iter__(self):
        '''Yield the ids of the observations present when iteration starts.'''
        for key in list(self.obs.keys()):
            yield key

    def __len__(self):
        '''Return the number of observations.'''
        return len(self.obs)

    def items(self):
        '''Yield (attribute name, value) pairs of this instance.'''
        for key in list(self.__dict__.keys()):
            yield key, getattr(self, key)

    def itervalues(self):
        '''Yield the data of every observation.'''
        for key in self:
            yield self.obs[key].data

    def iteritems(self):
        '''Yield (id, Observation) pairs.'''
        for key in self:
            yield (key, self.obs[key])

    def get_data(self):
        '''Return the data of all observations as a list.'''
        return [o for o in self.itervalues()]

    def get_observation(self, id, date, meta):
        '''
        Return the observation stored under id, or a new Observation when
        there is none yet. The new observation is not stored here; add()
        takes care of that.
        '''
        with self.lock:
            obs = self.obs.get(id)
            if obs is None:
                # Only build an Observation when it is actually needed.
                obs = Observation(date, self.time_unit, id, meta)
        return obs

    def add(self, date, value, meta=None):
        '''
        Add value to the observation identified by the date range that
        contains date and by the values of meta (a dict with the variables
        to group by, empty by default).
        '''
        if meta is None:
            meta = {}
        assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.'
        #id = self.convert_date_to_epoch(date)
        # set_date_range returns (last, first); the historic naming is kept
        # so the order of the hashed values does not change.
        start, end = self.set_date_range(date)
        values = list(meta.values())
        values.insert(0, end)
        values.insert(0, start)
        id = self.__hash__(values)

        obs = self.get_observation(id, date, meta)
        obs.add(value)
        self.obs[id] = obs

    def encode(self):
        '''
        Return a dict with the properties listed in self.props plus an 'obs'
        dict that maps str(id) to the encoded observation.
        '''
        bson = {}
        for prop in self.props:
            bson[prop] = getattr(self, prop)

        bson['obs'] = {}
        for key in self:
            bson['obs'][str(key)] = self.obs[key].encode_to_bson()
        return bson

    def decode(self, values):
        '''
        Restore state from values, a dict of encoded variables. Dict valued
        properties are read as observations and re-added; every other
        property is set as an attribute.
        NOTE(review): every entry of values is merged into this variable,
        regardless of its name - confirm that this is intended.
        '''
        for varname in values:
            encoded = values[varname]
            for prop in encoded:
                if isinstance(encoded[prop], dict):
                    data = encoded[prop]
                    for d in data:
                        self.add(data[d]['date'], data[d]['data'])
                else:
                    setattr(self, prop, encoded[prop])
                    # 'name', 'time_unit' and '_type' are already listed.
                    if prop not in self.props:
                        self.props.append(prop)

    def get_date_range(self):
        '''Return the (first, last) observation dates.'''
        dates = [self.obs[key].date for key in self]
        return min(dates), max(dates)
 286+
 287+
class Dataset:
    '''
    This class acts as a container for the Variable class and has some methods
    to output the dataset to a csv file, mongodb and display statistics.
    '''

    def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs):
        '''
        encoder has to be one of json_encoders.available_json_encoders().
        vars is an optional list of dicts, each holding the keyword arguments
        (including 'name') for one Variable. Remaining keyword arguments are
        stored as attributes of the dataset.
        '''
        encoders = json_encoders.available_json_encoders()
        if encoder not in encoders:
            # The module level name 'exception' was never imported, so this
            # branch used to fail with a NameError.
            from classes import exceptions
            raise exceptions.UnknownJSONEncoderError(encoder)
        self.encoder = encoder
        self.name = name
        self.project = project
        self.collection = collection
        self.language_code = language_code
        self.hash = self.name
        self._type = 'dataset'
        self.created = datetime.datetime.now()
        self.format = 'long'
        for kw in kwargs:
            setattr(self, kw, kwargs[kw])
        # Snapshot of the attribute names set so far.
        self.props = list(self.__dict__.keys())

        self.variables = []
        if vars is not None:
            for spec in vars:
                # Work on a copy so the caller's dict keeps its 'name'.
                spec = dict(spec)
                var_name = spec.pop('name')
                setattr(self, var_name, Variable(var_name, **spec))
                self.variables.append(var_name)
        #self.filename = self.create_filename()

    def __repr__(self):
        return 'Dataset contains %s variables' % (len(self.variables))

    def __iter__(self):
        for var in self.variables:
            yield getattr(self, var)

    def create_filename(self):
        '''
        This function creates a filename for the dataset by searching for shared
        properties among the different variables in the dataset. All shared
        properties will be used in the filename to make sure that one analysis
        that's run with different parameters gets stored in separate files.
        The result is stored in self.filename.
        '''
        per_variable = {}
        all_props = set()
        variable = None
        for var_name in self.variables:
            variable = getattr(self, var_name)
            own = set()
            for prop in variable.props:
                if prop not in ['name', 'time_unit', '_type']:
                    own.add(prop)
                    all_props.add(prop)
            per_variable[variable.name] = own

        # A property is shared only when every variable defines it.
        keys = [prop for prop in all_props
                if all(prop in own for own in per_variable.values())]
        keys.sort()
        # Values are read from the last variable of the dataset.
        attrs = '_'.join(['%s=%s' % (k, getattr(variable, k)) for k in keys])
        self.filename = '%s%s_%s_%s.csv' % (self.language_code,
                                            self.project,
                                            self.name,
                                            attrs)

    def add_variable(self, var):
        '''Attach var to the dataset; raises TypeError for non Variables.'''
        if isinstance(var, Variable):
            self.variables.append(var.name)
            setattr(self, var.name, var)
        else:
            raise TypeError('You can only add an instance of Variable to a dataset.')

    def write(self, format='csv'):
        '''
        Store the dataset as 'csv' or in 'mongo'; any other format is
        ignored. The filename is (re)created in all cases.
        '''
        self.create_filename()
        if format == 'csv':
            self.to_csv()
        elif format == 'mongo':
            self.to_mongo()

    def to_mongo(self):
        '''
        Replace the stored copy of this dataset in the
        <language_code><project>_charts collection.
        '''
        dbname = '%s%s' % (self.language_code, self.project)
        mongo = db.init_mongo_db(dbname)
        coll = mongo['%s_%s' % (dbname, 'charts')]
        mongo.add_son_manipulator(Transform())
        coll.remove({'hash': self.hash, 'project': self.project,
                     'language_code': self.language_code})
        coll.insert({'variables': self})

    def to_csv(self):
        '''Write the headers and the data to self.filename as csv.'''
        data = data_converter.convert_dataset_to_lists(self, 'manage')
        headers = data_converter.add_headers(self)
        fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                              self.filename, 'w',
                                              settings.encoding)
        try:
            file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
            file_utils.write_list_to_csv(data, fh, recursive=False, newline=True,
                                         format=self.format)
        finally:
            # Close the file even when writing fails.
            fh.close()

    def encode(self):
        '''Return a dict with the properties listed in self.props.'''
        props = {}
        for prop in self.props:
            props[prop] = getattr(self, prop)
        return props

    def get_standard_deviation(self, number_list):
        '''
        Return the sample standard deviation (n - 1 in the denominator) of
        number_list, or '.' when there are fewer than two numbers.
        '''
        n = len(number_list)
        if n < 2:
            return '.'
        mean = self.get_mean(number_list)
        squares = 0
        for i in number_list:
            squares = squares + (i - mean) ** 2
        return math.sqrt(squares / float(n - 1))

    def get_median(self, number_list):
        '''Return the median of number_list as a float, or '.' when empty.'''
        if number_list == []:
            return '.'
        data = [float(x) for x in sorted(number_list)]
        # Floor division keeps the indices integers on every Python version.
        middle = len(data) // 2
        if len(data) % 2 == 1:
            return data[middle]
        return (data[middle - 1] + data[middle]) / 2

    def get_mean(self, number_list):
        '''Return the mean of number_list as a float, or '.' when empty.'''
        if number_list == []:
            return '.'
        float_nums = [float(x) for x in number_list]
        return sum(float_nums) / len(number_list)

    def descriptives(self):
        '''
        Store mean, median, sds, min, max, n, first_obs and last_obs on every
        variable. Statistics that are undefined for a variable without
        observations are set to '.'.
        '''
        for variable in self:
            data = variable.get_data()
            variable.mean = self.get_mean(data)
            variable.median = self.get_median(data)
            variable.sds = self.get_standard_deviation(data)
            variable.n = len(data)
            if data:
                variable.min = min(data)
                variable.max = max(data)
                variable.first_obs, variable.last_obs = variable.get_date_range()
            else:
                variable.min = variable.max = '.'
                variable.first_obs = variable.last_obs = '.'

    def summary(self):
        '''Print a tab separated table with the descriptives per variable.'''
        self.descriptives()
        row = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'
        print(row % ('Variable', 'Mean',
                     'Median', 'SD', 'Minimum', 'Maximum',
                     'Num Obs', 'First Obs', 'Final Obs'))
        for variable in self:
            print(row % (variable.name,
                         variable.mean, variable.median,
                         variable.sds, variable.min,
                         variable.max, variable.n,
                         variable.first_obs, variable.last_obs))
 447+
 448+
def debug():
    '''
    Manual smoke test: build a small dataset with one yearly variable, print
    its summary, write it to csv and print the dataset. Needs the project's
    database and file helpers.
    '''
    mongo = db.init_mongo_db('enwiki')
    rawdata = mongo['enwiki_charts']
    mongo.add_son_manipulator(Transform())

    d1 = datetime.datetime.today()
    d2 = datetime.datetime(2007, 6, 7)
    ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [
        {'name': 'count', 'time_unit': 'year'},
    ])
    # Variable.add only accepts a dict as meta; the former list
    # ['exp', 'window'] failed its assertion. The values are arbitrary
    # sample values for the two group-by variables.
    meta = {'exp': 1, 'window': 1}
    ds.count.add(d1, 10, meta)
    ds.count.add(d1, 135, meta)
    ds.count.add(d2, 1, meta)
    ds.summary()
    ds.write(format='csv')
    ds.encode()
    print(ds)


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/classes/dataset.py
___________________________________________________________________
Added: svn:eol-style
1482 + native

Status & tagging log