Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -0,0 +1,191 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +''' |
| 22 | +This file contains settings that are used for constructing and analyzing |
| 23 | +the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
| 24 | +''' |
| 25 | + |
| 26 | +from multiprocessing import cpu_count |
| 27 | +import ConfigParser |
| 28 | +import os |
| 29 | +import sys |
| 30 | +import platform |
| 31 | +import subprocess |
| 32 | + |
| 33 | + |
| 34 | +from classes import exceptions |
| 35 | +from classes import singleton |
| 36 | + |
| 37 | +try: |
| 38 | + from _winreg import * |
| 39 | + from pywin import win32file |
| 40 | + '''increase the maximum number of open files on Windows to 1024''' |
| 41 | + win32file._setmaxstdio(1024) |
| 42 | +except ImportError: |
| 43 | + pass |
| 44 | + |
| 45 | +try: |
| 46 | + import resource |
| 47 | +except ImportError: |
| 48 | + pass |
| 49 | + |
| 50 | +class Settings: |
| 51 | + #__metaclass__ = singleton.Singleton |
| 52 | + |
| 53 | + def __init__(self, process_multiplier=1): |
| 54 | + self.minimum_python_version = (2, 6) |
| 55 | + self.detect_python_version() |
| 56 | + self.encoding = 'utf-8' |
| 57 | + |
| 58 | + #Date format as used by Erik Zachte |
| 59 | + self.date_format = '%Y-%m-%d' |
| 60 | + |
| 61 | + # Timestamp format as generated by the MediaWiki dumps |
| 62 | + self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
| 63 | + self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z' |
| 64 | + #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
| 65 | + self.max_xmlfile_size = 4096 * 1024 |
| 66 | + |
| 67 | + #Change this to match your computers configuration (RAM / CPU) |
| 68 | + self.number_of_processes = cpu_count() * process_multiplier |
| 69 | + |
| 70 | + self.wp_dump_location = 'http://dumps.wikimedia.org' |
| 71 | + self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
| 72 | + self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
| 73 | + self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
| 74 | + #Extensions of ascii files, this is used to determine the filemode to use |
| 75 | + self.platform = self.determine_platform() |
| 76 | + |
| 77 | + self.architecture = platform.machine() |
| 78 | + self.working_directory = self.determine_working_directory() |
| 79 | + print sys.path |
| 80 | + self.update_python_path() |
| 81 | + print sys.path |
| 82 | + |
| 83 | + self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
| 84 | + self.max_filehandles = self.determine_max_filehandles_open() |
| 85 | + self.tab_width = 4 if self.platform == 'Windows' else 8 |
| 86 | + |
| 87 | + |
| 88 | + result = self.load_configuration() |
| 89 | + if not result: |
| 90 | + self.input_location = os.path.join(self.root, 'wikimedia') |
| 91 | + |
| 92 | + # Default Input file |
| 93 | + self.input_filename = os.path.join(self.input_location, 'en', |
| 94 | + 'wiki', |
| 95 | + 'enwiki-20100916-stub-meta-history.xml') |
| 96 | + # This is the place where error messages are stored for debugging purposes |
| 97 | + self.log_location = os.path.join(self.working_directory, |
| 98 | + 'logs') |
| 99 | + self.csv_location = os.path.join(self.working_directory, |
| 100 | + 'data', 'csv') |
| 101 | + self.dataset_location = os.path.join(self.working_directory, 'datasets') |
| 102 | + self.binary_location = os.path.join(self.working_directory, |
| 103 | + 'data', 'objects') |
| 104 | + |
| 105 | + self.chart_location = os.path.join(self.working_directory, 'statistics', |
| 106 | + 'charts') |
| 107 | + self.file_choices = ('stub-meta-history.xml.gz', |
| 108 | + 'stub-meta-current.xml.gz', |
| 109 | + 'pages-meta-history.xml.7z', |
| 110 | + 'pages-meta-current.xml.bz2',) |
| 111 | + |
| 112 | + def load_configuration(self): |
| 113 | + if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
| 114 | + config = ConfigParser.RawConfigParser() |
| 115 | + config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
| 116 | + self.working_directory = config.get('file_locations', 'working_directory') |
| 117 | + self.input_location = config.get('file_locations', 'input_location') |
| 118 | + self.default_project = config.get('wiki', 'project') |
| 119 | + self.default_language = config.get('wiki', 'language') |
| 120 | + return True |
| 121 | + else: |
| 122 | + return False |
| 123 | + |
| 124 | + def determine_working_directory(self): |
| 125 | + cwd = os.getcwd() |
| 126 | + if not cwd.endswith('editor_trends%s' % os.sep): |
| 127 | + pos = cwd.find('editor_trends') + 14 |
| 128 | + cwd = cwd[:pos] |
| 129 | + return cwd |
| 130 | + |
| 131 | + def detect_python_version(self): |
| 132 | + version = sys.version_info[0:2] |
| 133 | + #logger.debug('Python version: %s' % '.'.join(str(version))) |
| 134 | + if version < self.minimum_python_version: |
| 135 | + raise exceptions.OutDatedPythonVersionError |
| 136 | + |
| 137 | + def determine_platform(self): |
| 138 | + if platform.system() == 'Darwin': |
| 139 | + return 'OSX' |
| 140 | + else: |
| 141 | + return platform.system() |
| 142 | + |
| 143 | + def verify_environment(self, directories): |
| 144 | + for directory in directories: |
| 145 | + if not os.path.exists(directory): |
| 146 | + try: |
| 147 | + os.makedirs(directory) |
| 148 | + except IOError: |
| 149 | + print 'Configuration Error, could not create directory %s.' % directory |
| 150 | + |
| 151 | + def detect_windows_program(self, program): |
| 152 | + entry = self.windows_register.get(program, None) |
| 153 | + try: |
| 154 | + key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
| 155 | + return QueryValueEx(key, 'Path')[0] |
| 156 | + except WindowsError: |
| 157 | + return None |
| 158 | + |
| 159 | + def detect_linux_program(self, program): |
| 160 | + path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0] |
| 161 | + return path.strip() |
| 162 | + |
| 163 | + def detect_installed_program(self, program): |
| 164 | + if self.platform == 'Windows': |
| 165 | + if not program.endswith('.exe'): |
| 166 | + program = program + '.exe' |
| 167 | + path = self.detect_windows_program(program) |
| 168 | + if path != None: |
| 169 | + path = path + program |
| 170 | + elif self.platform == 'Linux': |
| 171 | + path = self.detect_linux_program(program) |
| 172 | + |
| 173 | + return path |
| 174 | + |
| 175 | + def determine_max_filehandles_open(self): |
| 176 | + if self.platform == 'Windows' and self.architecture == 'i386': |
| 177 | + return win32file._getmaxstdio() |
| 178 | + elif self.platform != 'Windows': |
| 179 | + return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100 |
| 180 | + else: |
| 181 | + return 500 |
| 182 | + |
| 183 | + def update_python_path(self): |
| 184 | + IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs', |
| 185 | + 'statistics', 'js_scripts', 'deployment', |
| 186 | + 'documentation', 'data', 'code-snippets'] |
| 187 | + dirs = [name for name in os.listdir(self.working_directory) if |
| 188 | + os.path.isdir(os.path.join(self.working_directory, name))] |
| 189 | + for subdirname in dirs: |
| 190 | + if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
| 191 | + sys.path.append(os.path.join(self.working_directory, |
| 192 | + subdirname)) |
Property changes on: trunk/tools/editor_trends/classes/settings.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 193 | + native |
Index: trunk/tools/editor_trends/classes/singleton.py |
— | — | @@ -0,0 +1,34 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
# Renamed from the misspelled '__author__email' for consistency with the
# other modules in this package (settings.py, dataset.py use '__email__').
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-02-11'
__version__ = '0.1'
| 20 | + |
| 21 | + |
class Singleton(type):
    '''
    Metaclass that turns a class into a singleton: the first instantiation
    creates the instance, every later call returns that same object.
    Recipe: http://stackoverflow.com/questions/31875/is-there-a-simple-elegant-way-to-define-singletons-in-python
    '''
    def __init__(cls, name, bases, namespace):
        # Parameter renamed from 'dict' to avoid shadowing the builtin.
        super(Singleton, cls).__init__(name, bases, namespace)
        cls.instance = None

    def __call__(cls, *args, **kw):
        # Lazily create the single instance on first instantiation; the
        # duplicated return branches of the original are collapsed.
        if cls.instance is None:
            cls.instance = super(Singleton, cls).__call__(*args, **kw)
        return cls.instance
Property changes on: trunk/tools/editor_trends/classes/singleton.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 36 | + native |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -0,0 +1,480 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-14' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import calendar |
| 22 | +import datetime |
| 23 | +import time |
| 24 | +import math |
| 25 | +import operator |
| 26 | +import sys |
| 27 | +from pymongo.son_manipulator import SONManipulator |
| 28 | +from multiprocessing import Lock |
| 29 | + |
| 30 | + |
| 31 | +sys.path.append('..') |
| 32 | +import configuration |
| 33 | +settings = configuration.Settings() |
| 34 | + |
| 35 | +from utils import file_utils |
| 36 | +from utils import data_converter |
| 37 | +from database import db |
| 38 | +import json_encoders |
| 39 | + |
class Transform(SONManipulator):
    '''
    This encoder transforms a Dataset to a MongoDB bson document.
    To use this encoder initalize a mongo database instance and then add:
    mongo.add_son_manipulator(Transform())
    '''
    def transform_incoming(self, son, collection):
        # Called by pymongo before a document is written. Each value in the
        # incoming son is expected to be a Dataset: its Variables are encoded
        # into plain dicts under the original key, and the dataset-level
        # properties (ds.props) are copied onto the top level of the document.
        for (key, ds) in son.items():
            son[key] = {}
            for x, var in enumerate(ds):
                if isinstance(var, Variable):
                    son[key][var.name] = var.encode()
            for prop in ds.props:
                son[prop] = getattr(ds, prop)
        return son

    def transform_outgoing(self, son, collection):
        # Called by pymongo for every document read; rebuilds a Dataset
        # (with its Variables) from the stored document.
        for (key, value) in son.items():
            if isinstance(value, dict):
                # NOTE(review): this stores decoded Variables into
                # son['variables'] -- it assumes such a dict entry already
                # exists in the retrieved document; confirm against the
                # documents produced by transform_incoming.
                names = value.keys()
                for name in names:
                    var = Variable(name, None)
                    var.decode(value)
                    son['variables'][name] = var
            else: # Again, make sure to recurse into sub-docs
                son[key] = value
        # Dataset's positional arguments are popped out; everything left in
        # son (including 'encoder') is passed through as keyword arguments.
        name = son.pop('name', None)
        project = son.pop('project', None)
        collection = son.pop('collection', None)
        language_code = son.pop('language_code', None)
        variables = son.pop('variables', [])
        ds = Dataset(name, project, collection, language_code, **son)
        for var in variables:
            var = variables[var]
            ds.add_variable(var)
        return ds
| 76 | + |
| 77 | + |
class Data:
    '''
    Shared helper methods used by the Observation, Variable and Dataset
    classes: value hashing, bson encoding and date-range handling.
    '''
    def __hash__(self, vars):
        # Concatenate the string form of every value to obtain a stable id.
        return hash(''.join(str(var) for var in vars))
        #return int(self.convert_date_to_epoch(date))

    def encode_to_bson(self, data=None):
        '''
        Turn this instance (or *data*, when given) into a plain dict that is
        suitable for MongoDB storage: keys are coerced to str and nested
        Observation objects are encoded recursively.
        '''
        source = data.__dict__ if data else self.__dict__
        encoded = dict((str(key), value) for key, value in source.items())
        for key, value in list(encoded.items()):
            if not isinstance(value, dict):
                continue
            nested = {}
            for k, v in value.items():
                if isinstance(v, Observation):
                    v = self.encode_to_bson(v)
                nested[str(k)] = v
            encoded[key] = nested
        return encoded

    def convert_date_to_epoch(self, date):
        '''
        Floor *date* to the start of its year/month/day (depending on
        self.time_unit) and return the result as a unix timestamp.
        '''
        assert self.time_unit == 'year' or self.time_unit == 'month' \
            or self.time_unit == 'day', 'Time unit should either be year, month or day.'

        if self.time_unit == 'year':
            floored = datetime.datetime(date.year, 1, 1)
        elif self.time_unit == 'month':
            floored = datetime.datetime(date.year, date.month, 1)
        elif self.time_unit == 'day':
            floored = date
        else:
            return date
        return int(time.mktime(floored.timetuple()))

    def set_date_range(self, date):
        '''
        Return a (last, first) pair of datetimes bounding the period that
        contains *date*; the period length is given by self.time_unit.
        '''
        year, month = date.year, date.month
        if self.time_unit == 'year':
            return datetime.datetime(year, 12, 31), datetime.datetime(year, 1, 1)
        if self.time_unit == 'month':
            last_day = calendar.monthrange(year, month)[1]
            return datetime.datetime(year, month, last_day), datetime.datetime(year, month, 1)
        point = datetime.datetime(year, month, date.day)
        return point, point
| 126 | + |
| 127 | + |
class Observation(Data):
    # Class-level lock shared by all observations; guards mutation of
    # self.data / self.count in add().
    lock = Lock()
    '''
    The smallest unit, here the actual data is being stored.
    Time_unit should either be 'year', 'month' or 'day'.
    '''
    def __init__(self, date, time_unit, id, meta):
        # :param date: datetime.datetime the observation belongs to.
        # :param time_unit: 'year', 'month' or 'day'; used by set_date_range.
        # :param id: unique identifier for this observation (hash of period
        #     and meta values, computed by Variable.add).
        # :param meta: dict of group-by variables; each non-list key becomes
        #     an attribute and is recorded in self.props.
        assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.'
        self.date = date
        # data starts as the scalar 0; add() may replace it with a list on
        # the first call when a list value is supplied.
        self.data = 0
        self.time_unit = time_unit
        # t1 is the end of the containing period, t0 its start.
        self.t1, self.t0 = self.set_date_range(date)
        self.id = id
        self.props = []
        self.count = 0
        for mt in meta:
            if isinstance(mt, float):
                # Mongo keys must not contain a dot; floats always would.
                raise Exception, 'Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.'
            elif not isinstance(mt, list):
                setattr(self, mt, meta[mt])
                self.props.append(mt)
        self._type = 'observation'

    def __repr__(self):
        return '%s' % self.date

    def __str__(self):
        return 'range: %s:%s' % (self.t0, self.t1)

    def __iter__(self):
        # NOTE(review): iterates self.data as a mapping, but add() only ever
        # makes data a number or a list -- confirm this path is exercised.
        for obs in self.data:
            yield self.data[obs]

    def __getitem__(self, key):
        return getattr(self, key, [])

    def add(self, value):
        '''
        Fold value into this observation under the shared lock: list values
        are appended (initializing data to a list on the first call),
        non-list values are added to the running total. count tracks the
        number of add() calls.
        '''
        self.lock.acquire()
        try:
            if isinstance(value, list):
                if self.count == 0:
                    self.data = []
                self.data.append(value)
            else:
                self.data += value
        finally:
            # Executed even when the update raised, so the lock is always
            # released; count then still advances.
            self.count += 1
            self.lock.release()

    def get_date_range(self):
        # Human-readable 'm-d-Y:m-d-Y' rendering of the period bounds.
        return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \
                                  self.t1.month, self.t1.day, self.t1.year)
| 185 | + |
class Variable(Data):
    '''
    This class constructs a time-based variable: a named collection of
    Observation instances, keyed by a hash of their time period and
    group-by metadata.
    '''
    # Class-level lock shared by all variables; guards get_observation().
    lock = Lock()

    def __init__(self, name, time_unit, **kwargs):
        '''
        :param name: name of the variable; also used as the attribute name
            when attached to a Dataset.
        :param time_unit: 'year', 'month' or 'day'; controls how
            observations are binned.
        :param kwargs: extra properties; each becomes an attribute and is
            recorded in self.props so it survives encode()/decode().
        '''
        self.name = name
        self.obs = {}
        self.time_unit = time_unit
        self.groupbys = []
        self._type = 'variable'
        self.props = ['name', 'time_unit', '_type']
        for kw in kwargs:
            setattr(self, kw, kwargs[kw])
            self.props.append(kw)

    def __str__(self):
        return '%s' % self.name

    def __repr__(self):
        return '%s' % self.name

    def __getitem__(self, key):
        return getattr(self, key, [])

    def __iter__(self):
        # Yields the observation ids.
        keys = self.obs.keys()
        for key in keys:
            yield key

    def __len__(self):
        # BUG FIX: the old implementation did self.obs() -- calling a dict,
        # a TypeError -- and would have returned a list rather than an int.
        return len(self.obs)

    def items(self):
        # Yields every attribute (not just observations) as (name, value).
        for key in self.__dict__.keys():
            yield key, getattr(self, key)

    def itervalues(self):
        # Yields the raw data of every observation.
        for key in self:
            yield self.obs[key].data

    def iteritems(self):
        # Yields (id, Observation) pairs.
        for key in self:
            yield (key, self.obs[key])

    def get_data(self):
        '''Return a list with the raw data of every observation.'''
        return [o for o in self.itervalues()]

    def get_observation(self, id, date, meta):
        '''Return the observation stored under id, or a fresh Observation
        for (date, meta) when none exists yet. Lookup happens under the
        class lock.'''
        self.lock.acquire()
        try:
            obs = self.obs.get(id, Observation(date, self.time_unit, id, meta))
        finally:
            self.lock.release()
        return obs

    def add(self, date, value, meta=None):
        '''
        Fold value into the observation identified by date's time period and
        the group-by values in meta.

        BUG FIX: meta used to default to a mutable {} shared across calls;
        it now defaults to None and a fresh dict is created per call.
        '''
        if meta is None:
            meta = {}
        assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.'
        #id = self.convert_date_to_epoch(date)
        start, end = self.set_date_range(date)
        values = meta.values()
        values.insert(0, end)
        values.insert(0, start)
        id = self.__hash__(values)

        obs = self.get_observation(id, date, meta)
        obs.add(value)
        self.obs[id] = obs

    def encode(self):
        '''Encode this variable and its observations into a dict suitable
        for bson storage.'''
        bson = {}
        for prop in self.props:
            bson[prop] = getattr(self, prop)

        bson['obs'] = {}
        for obs in self:
            data = self.obs[obs]
            obs = str(obs)
            bson['obs'][obs] = data.encode_to_bson()
        return bson

    def decode(self, values):
        '''Rebuild variable state from a decoded bson dict: nested dicts are
        re-added as observations, scalars become properties.'''
        for varname in values:
            for prop in values[varname]:
                if isinstance(values[varname][prop], dict):
                    data = values[varname][prop]
                    for d in data:
                        date = data[d]['date']
                        obs = data[d]['data']
                        self.add(date, obs)
                else:
                    setattr(self, prop, values[varname][prop])
                    self.props.append(prop)

    def get_date_range(self):
        '''Return the (earliest, latest) observation dates.'''
        dates = [self.obs[key].date for key in self]
        first = min(dates)
        last = max(dates)
        return first, last
| 286 | + |
| 287 | + |
| 288 | +class Dataset: |
| 289 | + ''' |
| 290 | + This class acts as a container for the Variable class and has some methods |
| 291 | + to output the dataset to a csv file, mongodb and display statistics. |
| 292 | + ''' |
| 293 | + |
| 294 | + def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs): |
| 295 | + encoders = json_encoders.available_json_encoders() |
| 296 | + if encoder not in encoders: |
| 297 | + raise exception.UnknownJSONEncoderError(encoder) |
| 298 | + else: |
| 299 | + self.encoder = encoder |
| 300 | + self.name = name |
| 301 | + self.project = project |
| 302 | + self.collection = collection |
| 303 | + self.language_code = language_code |
| 304 | + self.hash = self.name |
| 305 | + self._type = 'dataset' |
| 306 | + self.created = datetime.datetime.now() |
| 307 | + self.format = 'long' |
| 308 | + for kw in kwargs: |
| 309 | + setattr(self, kw, kwargs[kw]) |
| 310 | + self.props = self.__dict__.keys() |
| 311 | + |
| 312 | + self.variables = [] |
| 313 | + if vars != None: |
| 314 | + for kwargs in vars: |
| 315 | + name = kwargs.pop('name') |
| 316 | + setattr(self, name, Variable(name, **kwargs)) |
| 317 | + self.variables.append(name) |
| 318 | + #self.filename = self.create_filename() |
| 319 | + |
| 320 | + def __repr__(self): |
| 321 | + return 'Dataset contains %s variables' % (len(self.variables)) |
| 322 | + |
| 323 | + def __iter__(self): |
| 324 | + for var in self.variables: |
| 325 | + yield getattr(self, var) |
| 326 | + |
| 327 | + |
| 328 | + def create_filename(self): |
| 329 | + ''' |
| 330 | + This function creates a filename for the dataset by searching for shared |
| 331 | + properties among the different variables in the dataset. All shared |
| 332 | + properties will be used in the filename to make sure that one analysis |
| 333 | + that's run with different parameters gets stored in separate files. |
| 334 | + ''' |
| 335 | + common = {} |
| 336 | + props = set() |
| 337 | + for var in self.variables: |
| 338 | + s = set() |
| 339 | + var = getattr(self, var) |
| 340 | + for prop in var.props: |
| 341 | + if prop not in ['name', 'time_unit', '_type']: |
| 342 | + s.add(prop) |
| 343 | + props.add(prop) |
| 344 | + common[var.name] = s |
| 345 | + |
| 346 | + keys = [] |
| 347 | + for prop in props: |
| 348 | + attrs = [] |
| 349 | + for s in common.values(): |
| 350 | + attrs.append(prop) |
| 351 | + if len(attrs) == len(common.values()): |
| 352 | + keys.append(prop) |
| 353 | + keys.sort() |
| 354 | + attrs = '_'.join(['%s=%s' % (k, getattr(var, k)) for k in keys]) |
| 355 | + filename = '%s%s_%s_%s.csv' % (self.language_code, |
| 356 | + self.project, |
| 357 | + self.name, |
| 358 | + attrs) |
| 359 | + self.filename = filename |
| 360 | + |
| 361 | + |
| 362 | + def add_variable(self, var): |
| 363 | + if isinstance(var, Variable): |
| 364 | + self.variables.append(var.name) |
| 365 | + setattr(self, var.name, var) |
| 366 | + else: |
| 367 | + raise TypeError('You can only instance of Variable to a dataset.') |
| 368 | + |
| 369 | + def write(self, format='csv'): |
| 370 | + self.create_filename() |
| 371 | + if format == 'csv': |
| 372 | + self.to_csv() |
| 373 | + elif format == 'mongo': |
| 374 | + self.to_mongo() |
| 375 | + |
| 376 | + def to_mongo(self): |
| 377 | + dbname = '%s%s' % (self.language_code, self.project) |
| 378 | + mongo = db.init_mongo_db(dbname) |
| 379 | + coll = mongo['%s_%s' % (dbname, 'charts')] |
| 380 | + mongo.add_son_manipulator(Transform()) |
| 381 | + coll.remove({'hash':self.hash, 'project':self.project, |
| 382 | + 'language_code':self.language_code}) |
| 383 | + coll.insert({'variables': self}) |
| 384 | + |
| 385 | + def to_csv(self): |
| 386 | + data = data_converter.convert_dataset_to_lists(self, 'manage') |
| 387 | + headers = data_converter.add_headers(self) |
| 388 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding) |
| 389 | + file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True) |
| 390 | + file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format) |
| 391 | + fh.close() |
| 392 | + |
| 393 | + def encode(self): |
| 394 | + props = {} |
| 395 | + for prop in self.props: |
| 396 | + props[prop] = getattr(self, prop) |
| 397 | + return props |
| 398 | + |
| 399 | + def get_standard_deviation(self, number_list): |
| 400 | + mean = self.get_mean(number_list) |
| 401 | + std = 0 |
| 402 | + n = len(number_list) |
| 403 | + for i in number_list: |
| 404 | + std = std + (i - mean) ** 2 |
| 405 | + return math.sqrt(std / float(n - 1)) |
| 406 | + |
| 407 | + def get_median(self, number_list): |
| 408 | + if number_list == []: |
| 409 | + return '.' |
| 410 | + data = sorted(number_list) |
| 411 | + data = [float(x) for x in data] |
| 412 | + if len(data) % 2 == 1: |
| 413 | + return data[(len(data) + 1) / 2 - 1] |
| 414 | + else: |
| 415 | + lower = data[len(data) / 2 - 1] |
| 416 | + upper = data[len(data) / 2] |
| 417 | + return (lower + upper) / 2 |
| 418 | + |
| 419 | + def get_mean(self, number_list): |
| 420 | + if number_list == []: |
| 421 | + return '.' |
| 422 | + float_nums = [float(x) for x in number_list] |
| 423 | + return sum(float_nums) / len(number_list) |
| 424 | + |
| 425 | + def descriptives(self): |
| 426 | + for variable in self: |
| 427 | + data = variable.get_data() |
| 428 | + variable.mean = self.get_mean(data) |
| 429 | + variable.median = self.get_median(data) |
| 430 | + variable.sds = self.get_standard_deviation(data) |
| 431 | + variable.min = min(data) |
| 432 | + variable.max = max(data) |
| 433 | + variable.n = len(data) |
| 434 | + variable.first_obs, variable.last_obs = variable.get_date_range() |
| 435 | + |
| 436 | + def summary(self): |
| 437 | + self.descriptives() |
| 438 | + print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', |
| 439 | + 'Median', 'SD', 'Minimum', 'Maximum', |
| 440 | + 'Num Obs', 'First Obs', 'Final Obs') |
| 441 | + for variable in self: |
| 442 | + print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, |
| 443 | + variable.mean, variable.median, |
| 444 | + variable.sds, variable.min, |
| 445 | + variable.max, variable.n, |
| 446 | + variable.first_obs, variable.last_obs) |
| 447 | + |
| 448 | + |
def debug():
    # Manual smoke test: builds a small Dataset with one 'count' variable,
    # prints summary statistics, writes the dataset to csv and encodes it.
    # Requires a running MongoDB instance with an 'enwiki' database.
    mongo = db.init_mongo_db('enwiki')
    rawdata = mongo['enwiki_charts']
    mongo.add_son_manipulator(Transform())

    d1 = datetime.datetime.today()
    d2 = datetime.datetime(2007, 6, 7)
    ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [
        {'name': 'count', 'time_unit': 'year'},
   # {'name': 'testest', 'time_unit': 'year'}
    ])
    # NOTE(review): Variable.add asserts that meta is a dict, but these
    # calls pass a list -- confirm whether this debug helper is current.
    ds.count.add(d1, 10, ['exp', 'window'])
    ds.count.add(d1, 135, ['exp', 'window'])
    ds.count.add(d2, 1, ['exp', 'window'])
    #ds.testest.add(d1, 135)
    #ds.testest.add(d2, 535)
    ds.summary()
    ds.write(format='csv')
#    v = Variable('test', 'year')
    ds.encode()
    print ds

    # mongo.test.insert({'variables': ds})

    # v.add(d2 , 5)
    #o = v.get_observation(d2)
#    ds = rawdata.find_one({'project': 'wiki',
#                          'language_code': 'en',
#                          'hash': 'cohort_dataset_backward_bar'})


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/classes/dataset.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 482 | + native |