Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -0,0 +1,191 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +''' |
| 22 | +This file contains settings that are used for constructing and analyzing |
| 23 | +the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
| 24 | +''' |
| 25 | + |
| 26 | +from multiprocessing import cpu_count |
| 27 | +import ConfigParser |
| 28 | +import os |
| 29 | +import sys |
| 30 | +import platform |
| 31 | +import subprocess |
| 32 | + |
| 33 | + |
| 34 | +from classes import exceptions |
| 35 | +from classes import singleton |
| 36 | + |
| 37 | +try: |
| 38 | + from _winreg import * |
| 39 | + from pywin import win32file |
| 40 | + '''increase the maximum number of open files on Windows to 1024''' |
| 41 | + win32file._setmaxstdio(1024) |
| 42 | +except ImportError: |
| 43 | + pass |
| 44 | + |
| 45 | +try: |
| 46 | + import resource |
| 47 | +except ImportError: |
| 48 | + pass |
| 49 | + |
| 50 | +class Settings: |
| 51 | + #__metaclass__ = singleton.Singleton |
| 52 | + |
| 53 | + def __init__(self, process_multiplier=1): |
| 54 | + self.minimum_python_version = (2, 6) |
| 55 | + self.detect_python_version() |
| 56 | + self.encoding = 'utf-8' |
| 57 | + |
| 58 | + #Date format as used by Erik Zachte |
| 59 | + self.date_format = '%Y-%m-%d' |
| 60 | + |
| 61 | + # Timestamp format as generated by the MediaWiki dumps |
| 62 | + self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
| 63 | + self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z' |
| 64 | + #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
| 65 | + self.max_xmlfile_size = 4096 * 1024 |
| 66 | + |
| 67 | + #Change this to match your computers configuration (RAM / CPU) |
| 68 | + self.number_of_processes = cpu_count() * process_multiplier |
| 69 | + |
| 70 | + self.wp_dump_location = 'http://dumps.wikimedia.org' |
| 71 | + self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
| 72 | + self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
| 73 | + self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
| 74 | + #Extensions of ascii files, this is used to determine the filemode to use |
| 75 | + self.platform = self.determine_platform() |
| 76 | + |
| 77 | + self.architecture = platform.machine() |
| 78 | + self.working_directory = self.determine_working_directory() |
| 79 | + print sys.path |
| 80 | + self.update_python_path() |
| 81 | + print sys.path |
| 82 | + |
| 83 | + self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
| 84 | + self.max_filehandles = self.determine_max_filehandles_open() |
| 85 | + self.tab_width = 4 if self.platform == 'Windows' else 8 |
| 86 | + |
| 87 | + |
| 88 | + result = self.load_configuration() |
| 89 | + if not result: |
| 90 | + self.input_location = os.path.join(self.root, 'wikimedia') |
| 91 | + |
| 92 | + # Default Input file |
| 93 | + self.input_filename = os.path.join(self.input_location, 'en', |
| 94 | + 'wiki', |
| 95 | + 'enwiki-20100916-stub-meta-history.xml') |
| 96 | + # This is the place where error messages are stored for debugging purposes |
| 97 | + self.log_location = os.path.join(self.working_directory, |
| 98 | + 'logs') |
| 99 | + self.csv_location = os.path.join(self.working_directory, |
| 100 | + 'data', 'csv') |
| 101 | + self.dataset_location = os.path.join(self.working_directory, 'datasets') |
| 102 | + self.binary_location = os.path.join(self.working_directory, |
| 103 | + 'data', 'objects') |
| 104 | + |
| 105 | + self.chart_location = os.path.join(self.working_directory, 'statistics', |
| 106 | + 'charts') |
| 107 | + self.file_choices = ('stub-meta-history.xml.gz', |
| 108 | + 'stub-meta-current.xml.gz', |
| 109 | + 'pages-meta-history.xml.7z', |
| 110 | + 'pages-meta-current.xml.bz2',) |
| 111 | + |
| 112 | + def load_configuration(self): |
| 113 | + if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
| 114 | + config = ConfigParser.RawConfigParser() |
| 115 | + config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
| 116 | + self.working_directory = config.get('file_locations', 'working_directory') |
| 117 | + self.input_location = config.get('file_locations', 'input_location') |
| 118 | + self.default_project = config.get('wiki', 'project') |
| 119 | + self.default_language = config.get('wiki', 'language') |
| 120 | + return True |
| 121 | + else: |
| 122 | + return False |
| 123 | + |
| 124 | + def determine_working_directory(self): |
| 125 | + cwd = os.getcwd() |
| 126 | + if not cwd.endswith('editor_trends%s' % os.sep): |
| 127 | + pos = cwd.find('editor_trends') + 14 |
| 128 | + cwd = cwd[:pos] |
| 129 | + return cwd |
| 130 | + |
| 131 | + def detect_python_version(self): |
| 132 | + version = sys.version_info[0:2] |
| 133 | + #logger.debug('Python version: %s' % '.'.join(str(version))) |
| 134 | + if version < self.minimum_python_version: |
| 135 | + raise exceptions.OutDatedPythonVersionError |
| 136 | + |
| 137 | + def determine_platform(self): |
| 138 | + if platform.system() == 'Darwin': |
| 139 | + return 'OSX' |
| 140 | + else: |
| 141 | + return platform.system() |
| 142 | + |
| 143 | + def verify_environment(self, directories): |
| 144 | + for directory in directories: |
| 145 | + if not os.path.exists(directory): |
| 146 | + try: |
| 147 | + os.makedirs(directory) |
| 148 | + except IOError: |
| 149 | + print 'Configuration Error, could not create directory %s.' % directory |
| 150 | + |
| 151 | + def detect_windows_program(self, program): |
| 152 | + entry = self.windows_register.get(program, None) |
| 153 | + try: |
| 154 | + key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
| 155 | + return QueryValueEx(key, 'Path')[0] |
| 156 | + except WindowsError: |
| 157 | + return None |
| 158 | + |
| 159 | + def detect_linux_program(self, program): |
| 160 | + path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0] |
| 161 | + return path.strip() |
| 162 | + |
| 163 | + def detect_installed_program(self, program): |
| 164 | + if self.platform == 'Windows': |
| 165 | + if not program.endswith('.exe'): |
| 166 | + program = program + '.exe' |
| 167 | + path = self.detect_windows_program(program) |
| 168 | + if path != None: |
| 169 | + path = path + program |
| 170 | + elif self.platform == 'Linux': |
| 171 | + path = self.detect_linux_program(program) |
| 172 | + |
| 173 | + return path |
| 174 | + |
| 175 | + def determine_max_filehandles_open(self): |
| 176 | + if self.platform == 'Windows' and self.architecture == 'i386': |
| 177 | + return win32file._getmaxstdio() |
| 178 | + elif self.platform != 'Windows': |
| 179 | + return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100 |
| 180 | + else: |
| 181 | + return 500 |
| 182 | + |
| 183 | + def update_python_path(self): |
| 184 | + IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs', |
| 185 | + 'statistics', 'js_scripts', 'deployment', |
| 186 | + 'documentation', 'data', 'code-snippets'] |
| 187 | + dirs = [name for name in os.listdir(self.working_directory) if |
| 188 | + os.path.isdir(os.path.join(self.working_directory, name))] |
| 189 | + for subdirname in dirs: |
| 190 | + if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
| 191 | + sys.path.append(os.path.join(self.working_directory, |
| 192 | + subdirname)) |
Property changes on: trunk/tools/editor_trends/classes/settings.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 193 | + native |
Index: trunk/tools/editor_trends/classes/singleton.py |
— | — | @@ -0,0 +1,34 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
# Renamed from the misspelled '__author__email' for consistency with the
# other modules in this package (settings.py, dataset.py use '__email__').
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-02-11'
__version__ = '0.1'
| 20 | + |
| 21 | + |
class Singleton(type):
    '''
    Metaclass that turns a class into a singleton: the first instantiation
    creates the instance, every later call returns that same object.
    Recipe: http://stackoverflow.com/questions/31875/is-there-a-simple-elegant-way-to-define-singletons-in-python
    '''
    def __init__(cls, name, bases, namespace):
        # Parameter renamed from 'dict' to avoid shadowing the builtin.
        super(Singleton, cls).__init__(name, bases, namespace)
        cls.instance = None

    def __call__(cls, *args, **kw):
        # Lazily create the single instance on first instantiation; the
        # duplicated return branches of the original are collapsed.
        if cls.instance is None:
            cls.instance = super(Singleton, cls).__call__(*args, **kw)
        return cls.instance
Property changes on: trunk/tools/editor_trends/classes/singleton.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 36 | + native |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -0,0 +1,480 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-14' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import calendar |
| 22 | +import datetime |
| 23 | +import time |
| 24 | +import math |
| 25 | +import operator |
| 26 | +import sys |
| 27 | +from pymongo.son_manipulator import SONManipulator |
| 28 | +from multiprocessing import Lock |
| 29 | + |
| 30 | + |
| 31 | +sys.path.append('..') |
| 32 | +import configuration |
| 33 | +settings = configuration.Settings() |
| 34 | + |
| 35 | +from utils import file_utils |
| 36 | +from utils import data_converter |
| 37 | +from database import db |
| 38 | +import json_encoders |
| 39 | + |
class Transform(SONManipulator):
    '''
    This encoder transforms a Dataset to a MongoDB bson document.
    To use this encoder initalize a mongo database instance and then add:
    mongo.add_son_manipulator(Transform())
    '''
    def transform_incoming(self, son, collection):
        # Called by pymongo before a document is written. Each value in the
        # incoming son is expected to be a Dataset: its Variables are encoded
        # into plain dicts under the original key, and the dataset-level
        # properties (ds.props) are copied onto the top level of the document.
        for (key, ds) in son.items():
            son[key] = {}
            for x, var in enumerate(ds):
                if isinstance(var, Variable):
                    son[key][var.name] = var.encode()
            for prop in ds.props:
                son[prop] = getattr(ds, prop)
        return son

    def transform_outgoing(self, son, collection):
        # Called by pymongo for every document read; rebuilds a Dataset
        # (with its Variables) from the stored document.
        for (key, value) in son.items():
            if isinstance(value, dict):
                # NOTE(review): this stores decoded Variables into
                # son['variables'] -- it assumes such a dict entry already
                # exists in the retrieved document; confirm against the
                # documents produced by transform_incoming.
                names = value.keys()
                for name in names:
                    var = Variable(name, None)
                    var.decode(value)
                    son['variables'][name] = var
            else: # Again, make sure to recurse into sub-docs
                son[key] = value
        # Dataset's positional arguments are popped out; everything left in
        # son (including 'encoder') is passed through as keyword arguments.
        name = son.pop('name', None)
        project = son.pop('project', None)
        collection = son.pop('collection', None)
        language_code = son.pop('language_code', None)
        variables = son.pop('variables', [])
        ds = Dataset(name, project, collection, language_code, **son)
        for var in variables:
            var = variables[var]
            ds.add_variable(var)
        return ds
| 76 | + |
| 77 | + |
class Data:
    '''
    Shared helper methods used by the Observation, Variable and Dataset
    classes: value hashing, bson encoding and date-range handling.
    '''
    def __hash__(self, vars):
        # Concatenate the string form of every value to obtain a stable id.
        return hash(''.join(str(var) for var in vars))
        #return int(self.convert_date_to_epoch(date))

    def encode_to_bson(self, data=None):
        '''
        Turn this instance (or *data*, when given) into a plain dict that is
        suitable for MongoDB storage: keys are coerced to str and nested
        Observation objects are encoded recursively.
        '''
        source = data.__dict__ if data else self.__dict__
        encoded = dict((str(key), value) for key, value in source.items())
        for key, value in list(encoded.items()):
            if not isinstance(value, dict):
                continue
            nested = {}
            for k, v in value.items():
                if isinstance(v, Observation):
                    v = self.encode_to_bson(v)
                nested[str(k)] = v
            encoded[key] = nested
        return encoded

    def convert_date_to_epoch(self, date):
        '''
        Floor *date* to the start of its year/month/day (depending on
        self.time_unit) and return the result as a unix timestamp.
        '''
        assert self.time_unit == 'year' or self.time_unit == 'month' \
            or self.time_unit == 'day', 'Time unit should either be year, month or day.'

        if self.time_unit == 'year':
            floored = datetime.datetime(date.year, 1, 1)
        elif self.time_unit == 'month':
            floored = datetime.datetime(date.year, date.month, 1)
        elif self.time_unit == 'day':
            floored = date
        else:
            return date
        return int(time.mktime(floored.timetuple()))

    def set_date_range(self, date):
        '''
        Return a (last, first) pair of datetimes bounding the period that
        contains *date*; the period length is given by self.time_unit.
        '''
        year, month = date.year, date.month
        if self.time_unit == 'year':
            return datetime.datetime(year, 12, 31), datetime.datetime(year, 1, 1)
        if self.time_unit == 'month':
            last_day = calendar.monthrange(year, month)[1]
            return datetime.datetime(year, month, last_day), datetime.datetime(year, month, 1)
        point = datetime.datetime(year, month, date.day)
        return point, point
| 126 | + |
| 127 | + |
class Observation(Data):
    # Class-level lock shared by all observations; guards mutation of
    # self.data / self.count in add().
    lock = Lock()
    '''
    The smallest unit, here the actual data is being stored.
    Time_unit should either be 'year', 'month' or 'day'.
    '''
    def __init__(self, date, time_unit, id, meta):
        # :param date: datetime.datetime the observation belongs to.
        # :param time_unit: 'year', 'month' or 'day'; used by set_date_range.
        # :param id: unique identifier for this observation (hash of period
        #     and meta values, computed by Variable.add).
        # :param meta: dict of group-by variables; each non-list key becomes
        #     an attribute and is recorded in self.props.
        assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.'
        self.date = date
        # data starts as the scalar 0; add() may replace it with a list on
        # the first call when a list value is supplied.
        self.data = 0
        self.time_unit = time_unit
        # t1 is the end of the containing period, t0 its start.
        self.t1, self.t0 = self.set_date_range(date)
        self.id = id
        self.props = []
        self.count = 0
        for mt in meta:
            if isinstance(mt, float):
                # Mongo keys must not contain a dot; floats always would.
                raise Exception, 'Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.'
            elif not isinstance(mt, list):
                setattr(self, mt, meta[mt])
                self.props.append(mt)
        self._type = 'observation'

    def __repr__(self):
        return '%s' % self.date

    def __str__(self):
        return 'range: %s:%s' % (self.t0, self.t1)

    def __iter__(self):
        # NOTE(review): iterates self.data as a mapping, but add() only ever
        # makes data a number or a list -- confirm this path is exercised.
        for obs in self.data:
            yield self.data[obs]

    def __getitem__(self, key):
        return getattr(self, key, [])

    def add(self, value):
        '''
        Fold value into this observation under the shared lock: list values
        are appended (initializing data to a list on the first call),
        non-list values are added to the running total. count tracks the
        number of add() calls.
        '''
        self.lock.acquire()
        try:
            if isinstance(value, list):
                if self.count == 0:
                    self.data = []
                self.data.append(value)
            else:
                self.data += value
        finally:
            # Executed even when the update raised, so the lock is always
            # released; count then still advances.
            self.count += 1
            self.lock.release()

    def get_date_range(self):
        # Human-readable 'm-d-Y:m-d-Y' rendering of the period bounds.
        return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \
                                  self.t1.month, self.t1.day, self.t1.year)
| 185 | + |
class Variable(Data):
    '''
    This class constructs a time-based variable: a named collection of
    Observation instances, keyed by a hash of their time period and
    group-by metadata.
    '''
    # Class-level lock shared by all variables; guards get_observation().
    lock = Lock()

    def __init__(self, name, time_unit, **kwargs):
        '''
        :param name: name of the variable; also used as the attribute name
            when attached to a Dataset.
        :param time_unit: 'year', 'month' or 'day'; controls how
            observations are binned.
        :param kwargs: extra properties; each becomes an attribute and is
            recorded in self.props so it survives encode()/decode().
        '''
        self.name = name
        self.obs = {}
        self.time_unit = time_unit
        self.groupbys = []
        self._type = 'variable'
        self.props = ['name', 'time_unit', '_type']
        for kw in kwargs:
            setattr(self, kw, kwargs[kw])
            self.props.append(kw)

    def __str__(self):
        return '%s' % self.name

    def __repr__(self):
        return '%s' % self.name

    def __getitem__(self, key):
        return getattr(self, key, [])

    def __iter__(self):
        # Yields the observation ids.
        keys = self.obs.keys()
        for key in keys:
            yield key

    def __len__(self):
        # BUG FIX: the old implementation did self.obs() -- calling a dict,
        # a TypeError -- and would have returned a list rather than an int.
        return len(self.obs)

    def items(self):
        # Yields every attribute (not just observations) as (name, value).
        for key in self.__dict__.keys():
            yield key, getattr(self, key)

    def itervalues(self):
        # Yields the raw data of every observation.
        for key in self:
            yield self.obs[key].data

    def iteritems(self):
        # Yields (id, Observation) pairs.
        for key in self:
            yield (key, self.obs[key])

    def get_data(self):
        '''Return a list with the raw data of every observation.'''
        return [o for o in self.itervalues()]

    def get_observation(self, id, date, meta):
        '''Return the observation stored under id, or a fresh Observation
        for (date, meta) when none exists yet. Lookup happens under the
        class lock.'''
        self.lock.acquire()
        try:
            obs = self.obs.get(id, Observation(date, self.time_unit, id, meta))
        finally:
            self.lock.release()
        return obs

    def add(self, date, value, meta=None):
        '''
        Fold value into the observation identified by date's time period and
        the group-by values in meta.

        BUG FIX: meta used to default to a mutable {} shared across calls;
        it now defaults to None and a fresh dict is created per call.
        '''
        if meta is None:
            meta = {}
        assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.'
        #id = self.convert_date_to_epoch(date)
        start, end = self.set_date_range(date)
        values = meta.values()
        values.insert(0, end)
        values.insert(0, start)
        id = self.__hash__(values)

        obs = self.get_observation(id, date, meta)
        obs.add(value)
        self.obs[id] = obs

    def encode(self):
        '''Encode this variable and its observations into a dict suitable
        for bson storage.'''
        bson = {}
        for prop in self.props:
            bson[prop] = getattr(self, prop)

        bson['obs'] = {}
        for obs in self:
            data = self.obs[obs]
            obs = str(obs)
            bson['obs'][obs] = data.encode_to_bson()
        return bson

    def decode(self, values):
        '''Rebuild variable state from a decoded bson dict: nested dicts are
        re-added as observations, scalars become properties.'''
        for varname in values:
            for prop in values[varname]:
                if isinstance(values[varname][prop], dict):
                    data = values[varname][prop]
                    for d in data:
                        date = data[d]['date']
                        obs = data[d]['data']
                        self.add(date, obs)
                else:
                    setattr(self, prop, values[varname][prop])
                    self.props.append(prop)

    def get_date_range(self):
        '''Return the (earliest, latest) observation dates.'''
        dates = [self.obs[key].date for key in self]
        first = min(dates)
        last = max(dates)
        return first, last
| 286 | + |
| 287 | + |
| 288 | +class Dataset: |
| 289 | + ''' |
| 290 | + This class acts as a container for the Variable class and has some methods |
| 291 | + to output the dataset to a csv file, mongodb and display statistics. |
| 292 | + ''' |
| 293 | + |
| 294 | + def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs): |
| 295 | + encoders = json_encoders.available_json_encoders() |
| 296 | + if encoder not in encoders: |
| 297 | + raise exception.UnknownJSONEncoderError(encoder) |
| 298 | + else: |
| 299 | + self.encoder = encoder |
| 300 | + self.name = name |
| 301 | + self.project = project |
| 302 | + self.collection = collection |
| 303 | + self.language_code = language_code |
| 304 | + self.hash = self.name |
| 305 | + self._type = 'dataset' |
| 306 | + self.created = datetime.datetime.now() |
| 307 | + self.format = 'long' |
| 308 | + for kw in kwargs: |
| 309 | + setattr(self, kw, kwargs[kw]) |
| 310 | + self.props = self.__dict__.keys() |
| 311 | + |
| 312 | + self.variables = [] |
| 313 | + if vars != None: |
| 314 | + for kwargs in vars: |
| 315 | + name = kwargs.pop('name') |
| 316 | + setattr(self, name, Variable(name, **kwargs)) |
| 317 | + self.variables.append(name) |
| 318 | + #self.filename = self.create_filename() |
| 319 | + |
| 320 | + def __repr__(self): |
| 321 | + return 'Dataset contains %s variables' % (len(self.variables)) |
| 322 | + |
| 323 | + def __iter__(self): |
| 324 | + for var in self.variables: |
| 325 | + yield getattr(self, var) |
| 326 | + |
| 327 | + |
| 328 | + def create_filename(self): |
| 329 | + ''' |
| 330 | + This function creates a filename for the dataset by searching for shared |
| 331 | + properties among the different variables in the dataset. All shared |
| 332 | + properties will be used in the filename to make sure that one analysis |
| 333 | + that's run with different parameters gets stored in separate files. |
| 334 | + ''' |
| 335 | + common = {} |
| 336 | + props = set() |
| 337 | + for var in self.variables: |
| 338 | + s = set() |
| 339 | + var = getattr(self, var) |
| 340 | + for prop in var.props: |
| 341 | + if prop not in ['name', 'time_unit', '_type']: |
| 342 | + s.add(prop) |
| 343 | + props.add(prop) |
| 344 | + common[var.name] = s |
| 345 | + |
| 346 | + keys = [] |
| 347 | + for prop in props: |
| 348 | + attrs = [] |
| 349 | + for s in common.values(): |
| 350 | + attrs.append(prop) |
| 351 | + if len(attrs) == len(common.values()): |
| 352 | + keys.append(prop) |
| 353 | + keys.sort() |
| 354 | + attrs = '_'.join(['%s=%s' % (k, getattr(var, k)) for k in keys]) |
| 355 | + filename = '%s%s_%s_%s.csv' % (self.language_code, |
| 356 | + self.project, |
| 357 | + self.name, |
| 358 | + attrs) |
| 359 | + self.filename = filename |
| 360 | + |
| 361 | + |
| 362 | + def add_variable(self, var): |
| 363 | + if isinstance(var, Variable): |
| 364 | + self.variables.append(var.name) |
| 365 | + setattr(self, var.name, var) |
| 366 | + else: |
| 367 | + raise TypeError('You can only instance of Variable to a dataset.') |
| 368 | + |
| 369 | + def write(self, format='csv'): |
| 370 | + self.create_filename() |
| 371 | + if format == 'csv': |
| 372 | + self.to_csv() |
| 373 | + elif format == 'mongo': |
| 374 | + self.to_mongo() |
| 375 | + |
| 376 | + def to_mongo(self): |
| 377 | + dbname = '%s%s' % (self.language_code, self.project) |
| 378 | + mongo = db.init_mongo_db(dbname) |
| 379 | + coll = mongo['%s_%s' % (dbname, 'charts')] |
| 380 | + mongo.add_son_manipulator(Transform()) |
| 381 | + coll.remove({'hash':self.hash, 'project':self.project, |
| 382 | + 'language_code':self.language_code}) |
| 383 | + coll.insert({'variables': self}) |
| 384 | + |
| 385 | + def to_csv(self): |
| 386 | + data = data_converter.convert_dataset_to_lists(self, 'manage') |
| 387 | + headers = data_converter.add_headers(self) |
| 388 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding) |
| 389 | + file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True) |
| 390 | + file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format) |
| 391 | + fh.close() |
| 392 | + |
| 393 | + def encode(self): |
| 394 | + props = {} |
| 395 | + for prop in self.props: |
| 396 | + props[prop] = getattr(self, prop) |
| 397 | + return props |
| 398 | + |
| 399 | + def get_standard_deviation(self, number_list): |
| 400 | + mean = self.get_mean(number_list) |
| 401 | + std = 0 |
| 402 | + n = len(number_list) |
| 403 | + for i in number_list: |
| 404 | + std = std + (i - mean) ** 2 |
| 405 | + return math.sqrt(std / float(n - 1)) |
| 406 | + |
| 407 | + def get_median(self, number_list): |
| 408 | + if number_list == []: |
| 409 | + return '.' |
| 410 | + data = sorted(number_list) |
| 411 | + data = [float(x) for x in data] |
| 412 | + if len(data) % 2 == 1: |
| 413 | + return data[(len(data) + 1) / 2 - 1] |
| 414 | + else: |
| 415 | + lower = data[len(data) / 2 - 1] |
| 416 | + upper = data[len(data) / 2] |
| 417 | + return (lower + upper) / 2 |
| 418 | + |
| 419 | + def get_mean(self, number_list): |
| 420 | + if number_list == []: |
| 421 | + return '.' |
| 422 | + float_nums = [float(x) for x in number_list] |
| 423 | + return sum(float_nums) / len(number_list) |
| 424 | + |
| 425 | + def descriptives(self): |
| 426 | + for variable in self: |
| 427 | + data = variable.get_data() |
| 428 | + variable.mean = self.get_mean(data) |
| 429 | + variable.median = self.get_median(data) |
| 430 | + variable.sds = self.get_standard_deviation(data) |
| 431 | + variable.min = min(data) |
| 432 | + variable.max = max(data) |
| 433 | + variable.n = len(data) |
| 434 | + variable.first_obs, variable.last_obs = variable.get_date_range() |
| 435 | + |
| 436 | + def summary(self): |
| 437 | + self.descriptives() |
| 438 | + print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', |
| 439 | + 'Median', 'SD', 'Minimum', 'Maximum', |
| 440 | + 'Num Obs', 'First Obs', 'Final Obs') |
| 441 | + for variable in self: |
| 442 | + print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, |
| 443 | + variable.mean, variable.median, |
| 444 | + variable.sds, variable.min, |
| 445 | + variable.max, variable.n, |
| 446 | + variable.first_obs, variable.last_obs) |
| 447 | + |
| 448 | + |
def debug():
    # Manual smoke test: builds a small Dataset with one 'count' variable,
    # prints summary statistics, writes the dataset to csv and encodes it.
    # Requires a running MongoDB instance with an 'enwiki' database.
    mongo = db.init_mongo_db('enwiki')
    rawdata = mongo['enwiki_charts']
    mongo.add_son_manipulator(Transform())

    d1 = datetime.datetime.today()
    d2 = datetime.datetime(2007, 6, 7)
    ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [
        {'name': 'count', 'time_unit': 'year'},
   # {'name': 'testest', 'time_unit': 'year'}
    ])
    # NOTE(review): Variable.add asserts that meta is a dict, but these
    # calls pass a list -- confirm whether this debug helper is current.
    ds.count.add(d1, 10, ['exp', 'window'])
    ds.count.add(d1, 135, ['exp', 'window'])
    ds.count.add(d2, 1, ['exp', 'window'])
    #ds.testest.add(d1, 135)
    #ds.testest.add(d2, 535)
    ds.summary()
    ds.write(format='csv')
#    v = Variable('test', 'year')
    ds.encode()
    print ds

    # mongo.test.insert({'variables': ds})

    # v.add(d2 , 5)
    #o = v.get_observation(d2)
#    ds = rawdata.find_one({'project': 'wiki',
#                          'language_code': 'en',
#                          'hash': 'cohort_dataset_backward_bar'})


if __name__ == '__main__':
    debug()
Property changes on: trunk/tools/editor_trends/classes/dataset.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 482 | + native |