Index: trunk/tools/editor_trends/configuration.py
— | — | @@ -1,206 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -''' |
22 | | -This file contains settings that are used for constructing and analyzing |
23 | | -the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
24 | | -''' |
25 | | - |
26 | | -from multiprocessing import cpu_count |
27 | | -import ConfigParser |
28 | | -import os |
29 | | -import sys |
30 | | -import platform |
31 | | -import subprocess |
32 | | - |
33 | | - |
34 | | -from classes import exceptions |
35 | | - |
36 | | -try: |
37 | | - from _winreg import * |
38 | | - from pywin import win32file |
39 | | - '''increase the maximum number of open files on Windows to 1024''' |
40 | | - win32file._setmaxstdio(1024) |
41 | | -except ImportError: |
42 | | - pass |
43 | | - |
44 | | -try: |
45 | | - import resource |
46 | | -except ImportError: |
47 | | - pass |
48 | | - |
49 | | - |
50 | | -class Singleton(type): |
51 | | - ''' |
52 | | - Recipe: http://stackoverflow.com/questions/31875/is-there-a-simple-elegant-way-to-define-singletons-in-python |
53 | | - ''' |
54 | | - def __init__(cls, name, bases, dict): |
55 | | - super(Singleton, cls).__init__(name, bases, dict) |
56 | | - cls.instance = None |
57 | | - |
58 | | - def __call__(cls, *args, **kw): |
59 | | - if cls.instance is None: |
60 | | - cls.instance = super(Singleton, cls).__call__(*args, **kw) |
61 | | - return cls.instance |
62 | | - else: |
63 | | - return cls.instance |
64 | | - |
65 | | -class Settings: |
66 | | - __metaclass__ = Singleton |
67 | | - |
68 | | - def __init__(self, process_multiplier=1): |
69 | | - self.minimum_python_version = (2, 6) |
70 | | - self.detect_python_version() |
71 | | - self.encoding = 'utf-8' |
72 | | - |
73 | | - #Date format as used by Erik Zachte |
74 | | - self.date_format = '%Y-%m-%d' |
75 | | - |
76 | | - # Timestamp format as generated by the MediaWiki dumps |
77 | | - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
78 | | - self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z' |
79 | | - #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
80 | | - self.max_xmlfile_size = 4096 * 1024 |
81 | | - |
82 | | - #Change this to match your computers configuration (RAM / CPU) |
83 | | - self.number_of_processes = cpu_count() * process_multiplier |
84 | | - |
85 | | - self.wp_dump_location = 'http://dumps.wikimedia.org' |
86 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
87 | | - self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
88 | | - self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
89 | | - #Extensions of ascii files, this is used to determine the filemode to use |
90 | | - self.platform = self.determine_platform() |
91 | | - |
92 | | - self.architecture = platform.machine() |
93 | | - self.working_directory = self.determine_working_directory() |
94 | | - print sys.path |
95 | | - self.update_python_path() |
96 | | - print sys.path |
97 | | - |
98 | | - self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
99 | | - self.max_filehandles = self.determine_max_filehandles_open() |
100 | | - self.tab_width = 4 if self.platform == 'Windows' else 8 |
101 | | - |
102 | | - |
103 | | - result = self.load_configuration() |
104 | | - if not result: |
105 | | - self.input_location = os.path.join(self.root, 'wikimedia') |
106 | | - |
107 | | - # Default Input file |
108 | | - self.input_filename = os.path.join(self.input_location, 'en', |
109 | | - 'wiki', |
110 | | - 'enwiki-20100916-stub-meta-history.xml') |
111 | | - # This is the place where error messages are stored for debugging purposes |
112 | | - self.log_location = os.path.join(self.working_directory, |
113 | | - 'logs') |
114 | | - self.csv_location = os.path.join(self.working_directory, |
115 | | - 'data', 'csv') |
116 | | - self.dataset_location = os.path.join(self.working_directory, 'datasets') |
117 | | - self.binary_location = os.path.join(self.working_directory, |
118 | | - 'data', 'objects') |
119 | | - |
120 | | - self.chart_location = os.path.join(self.working_directory, 'statistics', |
121 | | - 'charts') |
122 | | - self.file_choices = ('stub-meta-history.xml.gz', |
123 | | - 'stub-meta-current.xml.gz', |
124 | | - 'pages-meta-history.xml.7z', |
125 | | - 'pages-meta-current.xml.bz2',) |
126 | | - |
127 | | - def load_configuration(self): |
128 | | - if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
129 | | - config = ConfigParser.RawConfigParser() |
130 | | - config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
131 | | - self.working_directory = config.get('file_locations', 'working_directory') |
132 | | - self.input_location = config.get('file_locations', 'input_location') |
133 | | - self.default_project = config.get('wiki', 'project') |
134 | | - self.default_language = config.get('wiki', 'language') |
135 | | - return True |
136 | | - else: |
137 | | - return False |
138 | | - |
139 | | - def determine_working_directory(self): |
140 | | - cwd = os.getcwd() |
141 | | - if not cwd.endswith('editor_trends%s' % os.sep): |
142 | | - pos = cwd.find('editor_trends') + 14 |
143 | | - cwd = cwd[:pos] |
144 | | - return cwd |
145 | | - |
146 | | - def detect_python_version(self): |
147 | | - version = sys.version_info[0:2] |
148 | | - #logger.debug('Python version: %s' % '.'.join(str(version))) |
149 | | - if version < self.minimum_python_version: |
150 | | - raise exceptions.OutDatedPythonVersionError |
151 | | - |
152 | | - def determine_platform(self): |
153 | | - if platform.system() == 'Darwin': |
154 | | - return 'OSX' |
155 | | - else: |
156 | | - return platform.system() |
157 | | - |
158 | | - def verify_environment(self, directories): |
159 | | - for directory in directories: |
160 | | - if not os.path.exists(directory): |
161 | | - try: |
162 | | - os.makedirs(directory) |
163 | | - except IOError: |
164 | | - print 'Configuration Error, could not create directory %s.' % directory |
165 | | - |
166 | | - def detect_windows_program(self, program): |
167 | | - entry = self.windows_register.get(program, None) |
168 | | - try: |
169 | | - key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
170 | | - return QueryValueEx(key, 'Path')[0] |
171 | | - except WindowsError: |
172 | | - return None |
173 | | - |
174 | | - def detect_linux_program(self, program): |
175 | | - path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0] |
176 | | - return path.strip() |
177 | | - |
178 | | - def detect_installed_program(self, program): |
179 | | - if self.platform == 'Windows': |
180 | | - if not program.endswith('.exe'): |
181 | | - program = program + '.exe' |
182 | | - path = self.detect_windows_program(program) |
183 | | - if path != None: |
184 | | - path = path + program |
185 | | - elif self.platform == 'Linux': |
186 | | - path = self.detect_linux_program(program) |
187 | | - |
188 | | - return path |
189 | | - |
190 | | - def determine_max_filehandles_open(self): |
191 | | - if self.platform == 'Windows' and self.architecture == 'i386': |
192 | | - return win32file._getmaxstdio() |
193 | | - elif self.platform != 'Windows': |
194 | | - return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100 |
195 | | - else: |
196 | | - return 500 |
197 | | - |
198 | | - def update_python_path(self): |
199 | | - IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs', |
200 | | - 'statistics', 'js_scripts', 'deployment', |
201 | | - 'documentation', 'data', 'code-snippets'] |
202 | | - dirs = [name for name in os.listdir(self.working_directory) if |
203 | | - os.path.isdir(os.path.join(self.working_directory, name))] |
204 | | - for subdirname in dirs: |
205 | | - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
206 | | - sys.path.append(os.path.join(self.working_directory, |
207 | | - subdirname)) |
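
The Settings class and its Singleton metaclass disappear with this file; the diffs below import a replacement from classes.settings, which is outside this changeset. A minimal sketch of the singleton recipe the deleted module used, assuming the replacement keeps the same pattern:

    class Singleton(type):
        '''Metaclass that caches the first instance of each class using it.'''
        def __init__(cls, name, bases, namespace):
            super(Singleton, cls).__init__(name, bases, namespace)
            cls.instance = None

        def __call__(cls, *args, **kw):
            if cls.instance is None:
                # First call: build and cache the instance.
                cls.instance = super(Singleton, cls).__call__(*args, **kw)
            return cls.instance


    class Settings(object):
        __metaclass__ = Singleton   # Python 2 metaclass hook

        def __init__(self, process_multiplier=1):
            self.process_multiplier = process_multiplier


    assert Settings(1) is Settings(2)   # second call returns the cached instance
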
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py |
— | — | @@ -39,10 +39,6 @@ |
40 | 40 | if w >= editor_dt: |
41 | 41 | datum = datetime.datetime(int(year), 12, 31) |
42 | 42 | freq = int(editor['edits_by_year'][year]) |
43 | | - #if datum == datetime.datetime(2003, 12, 31): |
44 | | -# if w == 24: |
45 | | -# if freq == 1.0: |
46 | | -# print 'break' |
47 | | - var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}}) |
| 43 | + var.add(datum, 1, {'window': w, 'frequency': freq}) |
48 | 44 | break |
49 | 45 | return var |
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py |
— | — | @@ -23,7 +23,6 @@ |
24 | 24 | |
25 | 25 | |
26 | 26 | def cohort_dataset_backward_bar(var, editor, **kwargs): |
27 | | - #first_edit = editor['first_edit'] |
28 | 27 | ''' |
29 | 28 | The backward looking bar chart looks for every year that an editor |
30 | 29 | was part of the Wikimedia community whether this person made at least cutoff |
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -26,6 +26,7 @@ |
27 | 27 | if '..' not in sys.path: |
28 | 28 | sys.path.append('..') |
29 | 29 | |
| 30 | +import inventory |
30 | 31 | from classes import dataset |
31 | 32 | from classes import settings |
32 | 33 | settings = settings.Settings() |
— | — | @@ -42,7 +43,7 @@ |
43 | 44 | stopwatch = timer.Timer() |
44 | 45 | res = True |
45 | 46 | dbname = '%s%s' % (language_code, project) |
46 | | - functions = available_analyses() |
| 47 | + functions = inventory.available_analyses() |
47 | 48 | try: |
48 | 49 | func = functions[func] |
49 | 50 | except KeyError: |
— | — | @@ -119,8 +120,8 @@ |
120 | 121 | |
121 | 122 | |
122 | 123 | if __name__ == '__main__': |
123 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
124 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
| 124 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
| 125 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
125 | 126 | #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year') |
126 | 127 | #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year') |
127 | 128 | #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year') |
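
available_analyses() now comes from analyses/inventory.py, a module this diff does not show. A plausible sketch of such an inventory, assuming it builds a name-to-function map over the plugin modules (the plugin list here is illustrative):

    import inspect

    from analyses.plugins import histogram_by_backward_cohort
    from analyses.plugins import cohort_dataset_backward_bar


    def available_analyses():
        '''Return a {name: function} map of the analysis plugins.'''
        plugins = [histogram_by_backward_cohort, cohort_dataset_backward_bar]
        functions = {}
        for plugin in plugins:
            # Every top-level function in a plugin module becomes an analysis.
            for name, func in inspect.getmembers(plugin, inspect.isfunction):
                functions[name] = func
        return functions
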
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -229,12 +229,12 @@ |
230 | 230 | file_utils.delete_file(settings.binary_location, filename) |
231 | 231 | |
232 | 232 | |
233 | | -def all_launcher(properties, logger): |
| 233 | +def all_launcher(rts, logger): |
234 | 234 | print 'The entire data processing chain has been called, this will take a \ |
235 | 235 | couple of hours (at least) to complete.' |
236 | 236 | stopwatch = timer.Timer() |
237 | | - log.log_to_mongo(properties, 'dataset', 'all', stopwatch, event='start') |
238 | | - print 'Start of building %s %s dataset.' % (properties.language.name, properties.project) |
| 237 | + log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='start') |
| 238 | + print 'Start of building %s %s dataset.' % (rts.language.name, rts.project) |
239 | 239 | |
240 | 240 | # write_message_to_log(logger, settings, |
241 | 241 | # message=message, |
— | — | @@ -242,8 +242,8 @@ |
243 | 243 | # full_project=properties.full_project, |
244 | 244 | # ignore=properties.ignore, |
245 | 245 | # clean=properties.clean) |
246 | | - if properties.clean: |
247 | | - cleanup(properties, settings, logger) |
| 246 | + if rts.clean: |
| 247 | + cleanup(rts, logger) |
248 | 248 | |
249 | 249 | functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'), |
250 | 250 | (extract_launcher, 'extract'), |
— | — | @@ -253,16 +253,16 @@ |
254 | 254 | (dataset_launcher, 'dataset'))) |
255 | 255 | |
256 | 256 | for function, callname in functions.iteritems(): |
257 | | - if callname not in properties.ignore: |
| 257 | + if callname not in rts.ignore: |
258 | 258 | print 'Starting %s' % function.func_name |
259 | | - res = function(properties, logger) |
| 259 | + res = function(rts, logger) |
260 | 260 | if res == False: |
261 | 261 | sys.exit(False) |
262 | 262 | elif res == None: |
263 | 263 | print 'Function %s does not return a status, \ |
264 | 264 | implement NOW' % function.func_name |
265 | 265 | stopwatch.elapsed() |
266 | | - log.log_to_mongo(properties, 'dataset', 'all', stopwatch, event='finish') |
| 266 | + log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='finish') |
267 | 267 | |
268 | 268 | |
269 | 269 | |
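
The properties-to-rts rename runs through the whole driver without changing control flow. Condensed, with a stand-in stage, the loop this hunk edits behaves like this:

    import sys


    def download(rts, logger):
        return True   # stand-in for downloader_launcher


    def run_stages(rts, logger, stages):
        '''Run each (function, name) stage in order, honoring rts.ignore.'''
        for function, callname in stages.iteritems():
            if callname in rts.ignore:
                continue
            res = function(rts, logger)
            if res == False:
                sys.exit(False)   # a failing stage aborts the whole chain
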
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -382,6 +382,8 @@ |
383 | 383 | ''' |
384 | 384 | result = True |
385 | 385 | tasks = unzip(rts) |
| 386 | + if not tasks: |
| 387 | + return False |
386 | 388 | |
387 | 389 | output = os.path.join(rts.input_location, rts.language.code, |
388 | 390 | rts.project.name, 'txt') |
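
The new guard makes the launcher fail fast when unzip() yields no work. A minimal sketch, with a stub standing in for the real unzip():

    def unzip(rts):
        '''Stub: the real function returns the list of unpacked dump files.'''
        return []


    def launcher(rts):
        tasks = unzip(rts)
        if not tasks:      # empty list or None: nothing to extract
            return False   # the pipeline driver treats False as a hard stop
        return True


    assert launcher(None) == False
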
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -17,6 +17,7 @@ |
18 | 18 | __date__ = '2010-11-02' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import progressbar |
21 | 22 | import multiprocessing |
22 | 23 | from Queue import Empty |
23 | 24 | from operator import itemgetter |
— | — | @@ -192,10 +193,11 @@ |
193 | 194 | def transform_editors_single_launcher(rts): |
194 | 195 | ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
195 | 196 | input_db, output_db = setup_database(rts) |
| 197 | + pbar = progressbar.ProgressBar(maxval=len(ids)).start() |
196 | 198 | for x, id in enumerate(ids): |
197 | | - print '%s editors to go...' % (len(ids) - x) |
198 | 199 | editor = Editor(id, input_db, output_db) |
199 | 200 | editor() |
| 201 | + pbar.update(pbar.currval + 1) |
200 | 202 | |
201 | 203 | |
202 | 204 | if __name__ == '__main__': |
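
The countdown print gives way to the progressbar package. Note that the loop above never calls finish(), which is the call that completes the bar once the loop ends. A self-contained usage sketch of the same API:

    import time

    import progressbar

    ids = range(50)
    pbar = progressbar.ProgressBar(maxval=len(ids)).start()
    for x, id in enumerate(ids):
        time.sleep(0.05)       # stand-in for Editor(id, input_db, output_db)()
        pbar.update(x + 1)     # same effect as pbar.update(pbar.currval + 1)
    pbar.finish()              # draws the bar at 100% and ends its line
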
Index: trunk/tools/editor_trends/wikilytics/api/views.py |
— | — | @@ -42,7 +42,7 @@ |
43 | 43 | if created: |
44 | 44 | job.save() |
45 | 45 | jobs = Job.objects.filter(jobtype='dataset', finished=False, in_progress=False) |
46 | | - ds = Dataset.objects.using('enwiki').filter(project=project, language_code=language) |
| 46 | + ds = Dataset.objects.filter(project=project, language_code=language) |
47 | 47 | print ds |
48 | 48 | return render_to_response('datasets.html', {'datasets': ds, 'jobs': jobs}) |
49 | 49 | |
— | — | @@ -59,7 +59,7 @@ |
60 | 60 | c = {} |
61 | 61 | print project, language, chart |
62 | 62 | try: |
63 | | - ds = Dataset.objects.using('enwiki').get(project=project, language_code=language, name=chart) |
| 63 | + ds = Dataset.objects.get(project=project, language_code=language, name=chart) |
64 | 64 | print ds |
65 | 65 | except: |
66 | 66 | hash = helpers.create_hash(project, language) |
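
With .using('enwiki') gone, both queries go through Django's regular database routing: the 'default' alias unless a router says otherwise. If per-wiki databases are still wanted without sprinkling .using() at every call site, a database router is the usual place for that decision. A sketch, assuming one connection alias per wiki in settings.DATABASES:

    class WikiRouter(object):
        '''Illustrative router: registered via settings.DATABASE_ROUTERS.'''

        def db_for_read(self, model, **hints):
            if model._meta.db_table == 'charts':
                return 'enwiki'   # route chart reads to the enwiki alias
            return None           # fall through to the 'default' alias

        def db_for_write(self, model, **hints):
            return self.db_for_read(model, **hints)
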
Index: trunk/tools/editor_trends/wikilytics/api/models.py |
— | — | @@ -19,7 +19,7 @@ |
20 | 20 | variables = DictField() |
21 | 21 | |
22 | 22 | class Meta: |
23 | | - db_table = 'enwiki_charts' |
| 23 | + db_table = 'charts' |
24 | 24 | |
25 | 25 | def __iter__(self): |
26 | 26 | for key, value in self.variables.items(): |
— | — | @@ -34,24 +34,24 @@ |
35 | 35 | return reverse('chart_generator', args=[self.project, self.language_code, self.name]) |
36 | 36 | |
37 | 37 | |
38 | | -class Editor(models.Model): |
39 | | - username = models.CharField(max_length=64) |
40 | | - editor = models.IntegerField() |
41 | | - first_edit = models.DateField() |
42 | | - final_edit = models.DateField() |
43 | | - new_wikipedian = models.DateField() |
44 | | - monthly_edits = DictField() |
45 | | - edit_count = models.IntegerField() |
46 | | - articles_by_year = DictField() |
47 | | - edits_by_year = DictField() |
48 | | - edits = ListField() |
| 38 | +#class Editor(models.Model, language_code, project): |
| 39 | +# username = models.CharField(max_length=64) |
| 40 | +# editor = models.IntegerField() |
| 41 | +# first_edit = models.DateField() |
| 42 | +# final_edit = models.DateField() |
| 43 | +# new_wikipedian = models.DateField() |
| 44 | +# monthly_edits = DictField() |
| 45 | +# edit_count = models.IntegerField() |
| 46 | +# articles_by_year = DictField() |
| 47 | +# edits_by_year = DictField() |
| 48 | +# edits = ListField() |
| 49 | +# |
| 50 | +# class Meta: |
| 51 | +# db_table = '%s%s_editors_dataset' % (language_code, project) |
| 52 | +# |
| 53 | +# def __unicode__(self): |
| 54 | +# return u'%s, total edits: %s' % (self.username, self.edit_count) |
49 | 55 | |
50 | | - class Meta: |
51 | | - db_table = 'editors_dataset' |
52 | | - |
53 | | - def __unicode__(self): |
54 | | - return u'%s, total edits: %s' % (self.username, self.edit_count) |
55 | | - |
56 | 56 | class EditorAdmin(admin.ModelAdmin): |
57 | 57 | pass |
58 | 58 | |
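
The commented-out header class Editor(models.Model, language_code, project) is not valid Python; a model class cannot take parameters through its base-class list. The usual route to a per-wiki table name is a factory that builds the model class at runtime. A sketch with a trimmed field list:

    from django.db import models


    def make_editor_model(language_code, project):
        '''Build an Editor model bound to e.g. the enwiki_editors_dataset table.'''
        class Meta:
            db_table = '%s%s_editors_dataset' % (language_code, project)

        attrs = {
            '__module__': __name__,   # required by Django's model metaclass
            'Meta': Meta,
            'username': models.CharField(max_length=64),
            'edit_count': models.IntegerField(),
        }
        return type('Editor', (models.Model,), attrs)


    Editor = make_editor_model('en', 'wiki')   # table: enwiki_editors_dataset
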
Index: trunk/tools/editor_trends/wikilytics/api/forms.py |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | |
5 | 5 | from wikilytics.api.widgets import MonthYearWidget |
6 | 6 | from editor_trends.classes import projects |
7 | | -from editor_trends.analyses.analyzer import available_analyses |
| 7 | +from editor_trends.analyses.inventory import available_analyses |
8 | 8 | |
9 | 9 | |
10 | 10 | |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -29,7 +29,6 @@ |
30 | 30 | import datetime |
31 | 31 | import time |
32 | 32 | import re |
33 | | -#sys.path.append('..') |
34 | 33 | |
35 | 34 | from settings import Settings |
36 | 35 | from utils import text_utils |
— | — | @@ -60,7 +59,6 @@ |
61 | 60 | self.input_location != None else self.get_value('location') |
62 | 61 | self.project = self.update_project_settings() |
63 | 62 | self.language = self.update_language_settings() |
64 | | - #self.dbname = '%s%s' % (self.language.code, self.project.name) |
65 | 63 | self.targets = self.split_keywords(self.get_value('charts')) |
66 | 64 | self.keywords = self.split_keywords(self.get_value('keywords')) |
67 | 65 | self.function = self.get_value('func') |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -25,8 +25,8 @@ |
26 | 26 | import sys |
27 | 27 | from pymongo.son_manipulator import SONManipulator |
28 | 28 | from multiprocessing import Lock |
| 29 | +from texttable import Texttable |
29 | 30 | |
30 | | - |
31 | 31 | sys.path.append('..') |
32 | 32 | import configuration |
33 | 33 | settings = configuration.Settings() |
— | — | @@ -80,11 +80,18 @@ |
81 | 81 | Dataset classes. |
82 | 82 | ''' |
83 | 83 | def __hash__(self, vars): |
| 84 | + ''' |
| 85 | + This is a generic hash function that expects a list of variables, used |
| 86 | + to look up an observation or Variable. |
| 87 | + ''' |
84 | 88 | id = ''.join([str(var) for var in vars]) |
85 | 89 | return hash(id) |
86 | | - #return int(self.convert_date_to_epoch(date)) |
87 | 90 | |
88 | 91 | def encode_to_bson(self, data=None): |
| 92 | + ''' |
| 93 | + This function converts a Variable or Observation to a dictionary that |
| 94 | + can be stored in Mongo. |
| 95 | + ''' |
89 | 96 | if data: |
90 | 97 | kwargs = dict([(str(key), value) for key, value in data.__dict__.iteritems()]) |
91 | 98 | else: |
— | — | @@ -100,6 +107,10 @@ |
101 | 108 | return kwargs |
102 | 109 | |
103 | 110 | def convert_date_to_epoch(self, date): |
| 111 | + ''' |
| 112 | + Calculate the number of seconds since epoch depending on the time_unit |
| 113 | + of the date |
| 114 | + ''' |
104 | 115 | assert self.time_unit == 'year' or self.time_unit == 'month' \ |
105 | 116 | or self.time_unit == 'day', 'Time unit should either be year, month or day.' |
106 | 117 | |
— | — | @@ -115,6 +126,9 @@ |
116 | 127 | return date |
117 | 128 | |
118 | 129 | def set_date_range(self, date): |
| 130 | + ''' |
| 131 | + Determine the width of a date range for an observation. |
| 132 | + ''' |
119 | 133 | if self.time_unit == 'year': |
120 | 134 | return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1) |
121 | 135 | elif self.time_unit == 'month': |
— | — | @@ -213,7 +227,7 @@ |
214 | 228 | yield key |
215 | 229 | |
216 | 230 | def __len__(self): |
217 | | - return [x for x in xrange(self.obs())] |
| 231 | + return len(self.obs.keys()) |
218 | 232 | |
219 | 233 | def items(self): |
220 | 234 | for key in self.__dict__.keys(): |
— | — | @@ -240,8 +254,25 @@ |
241 | 255 | return obs |
242 | 256 | |
243 | 257 | def add(self, date, value, meta={}): |
| 258 | + ''' |
| 259 | + The add function is used to add an observation to a variable. An |
| 260 | + observation is always grouped by the combination of the date and time_unit. |
| 261 | + Time_unit is a property of a Variable and indicates how granularly the |
| 262 | + observations should be grouped. For example, if time_unit == year then |
| 263 | + all observations in a given year will be grouped. |
| 264 | + When calling add you should supply at least two arguments: |
| 265 | + 1) date: when did the observation happen |
| 266 | + 2) value: an integer or float that was observed on that date |
| 267 | + Optionally you can supply a dictionary for extra groupings. The key is |
| 268 | + the name of the extra grouping. |
| 269 | + For example, if you add {'experience': 3} as the meta dict when calling |
| 270 | + add then you will create an extra grouping called experience and all |
| 271 | + future observations that fall in the same date range and the same |
| 272 | + experience level will be grouped with that particular observation. You |
| 273 | + can use as many extra groupings as you want but usually one extra grouping |
| 274 | + should be enough. |
| 275 | + ''' |
244 | 276 | assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.' |
245 | | - #id = self.convert_date_to_epoch(date) |
246 | 277 | start, end = self.set_date_range(date) |
247 | 278 | values = meta.values() |
248 | 279 | values.insert(0, end) |
— | — | @@ -252,6 +283,12 @@ |
253 | 284 | obs.add(value) |
254 | 285 | self.obs[id] = obs |
255 | 286 | |
| 287 | + def number_of_obs(self): |
| 288 | + n = 0 |
| 289 | + for obs in self.obs: |
| 290 | + n += self.obs[obs].count |
| 291 | + return n |
| 292 | + |
256 | 293 | def encode(self): |
257 | 294 | bson = {} |
258 | 295 | for prop in self.props: |
— | — | @@ -323,6 +360,12 @@ |
324 | 361 | for var in self.variables: |
325 | 362 | yield getattr(self, var) |
326 | 363 | |
| 364 | + def details(self): |
| 365 | + print 'Project: %s%s' % (self.language_code, self.project) |
| 366 | + print 'JSON encoder: %s' % self.encoder |
| 367 | + print 'Raw data was retrieved from: %s%s/%s' % (self.language_code, |
| 368 | + self.project, |
| 369 | + self.collection) |
327 | 370 | |
328 | 371 | def create_filename(self): |
329 | 372 | ''' |
— | — | @@ -359,6 +402,9 @@ |
360 | 403 | |
361 | 404 | |
362 | 405 | def add_variable(self, var): |
| 406 | + ''' |
| 407 | + Call this function to add a Variable to a dataset. |
| 408 | + ''' |
363 | 409 | if isinstance(var, Variable): |
364 | 410 | self.variables.append(var.name) |
365 | 411 | setattr(self, var.name, var) |
— | — | @@ -366,6 +412,9 @@ |
367 | 413 | raise TypeError('You can only instance of Variable to a dataset.') |
368 | 414 | |
369 | 415 | def write(self, format='csv'): |
| 416 | + ''' |
| 417 | + This is the entry point for outputting data, either to csv or mongo. |
| 418 | + ''' |
370 | 419 | self.create_filename() |
371 | 420 | if format == 'csv': |
372 | 421 | self.to_csv() |
— | — | @@ -429,22 +478,29 @@ |
430 | 479 | variable.sds = self.get_standard_deviation(data) |
431 | 480 | variable.min = min(data) |
432 | 481 | variable.max = max(data) |
433 | | - variable.n = len(data) |
| 482 | + variable.num_obs = variable.number_of_obs() |
| 483 | + variable.num_dates = len(variable) |
434 | 484 | variable.first_obs, variable.last_obs = variable.get_date_range() |
435 | 485 | |
436 | 486 | def summary(self): |
437 | 487 | self.descriptives() |
438 | | - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', |
439 | | - 'Median', 'SD', 'Minimum', 'Maximum', |
440 | | - 'Num Obs', 'First Obs', 'Final Obs') |
441 | | - for variable in self: |
442 | | - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, |
443 | | - variable.mean, variable.median, |
444 | | - variable.sds, variable.min, |
445 | | - variable.max, variable.n, |
446 | | - variable.first_obs, variable.last_obs) |
| 488 | + table = Texttable(max_width=0) |
| 489 | + vars = ['Variable', 'Mean', 'Median', 'SD', 'Minimum', 'Maximum', |
| 490 | + 'Num Obs', 'Num of\nUnique Dates', 'First Obs', 'Final Obs'] |
| 491 | + table.add_row([var for var in vars]) |
| 492 | + table.set_cols_align(['r' for v in vars]) |
| 493 | + table.set_cols_valign(['m' for v in vars]) |
447 | 494 | |
| 495 | + for x, variable in enumerate(self): |
| 496 | + table.add_row([variable.name, variable.mean, variable.median, |
| 497 | + variable.sds, variable.min, variable.max, |
| 498 | + variable.num_obs, variable.num_dates, |
| 499 | + variable.first_obs, variable.last_obs]) |
| 500 | + print table.draw() |
| 501 | + print self |
| 502 | + print self.details() |
448 | 503 | |
| 504 | + |
449 | 505 | def debug(): |
450 | 506 | mongo = db.init_mongo_db('enwiki') |
451 | 507 | rawdata = mongo['enwiki_charts'] |
— | — | @@ -456,17 +512,17 @@ |
457 | 513 | {'name': 'count', 'time_unit': 'year'}, |
458 | 514 | # {'name': 'testest', 'time_unit': 'year'} |
459 | 515 | ]) |
460 | | - ds.count.add(d1, 10, ['exp', 'window']) |
461 | | - ds.count.add(d1, 135, ['exp', 'window']) |
462 | | - ds.count.add(d2, 1, ['exp', 'window']) |
| 516 | + ds.count.add(d1, 10, {'exp': 3}) |
| 517 | + ds.count.add(d1, 135, {'exp': 3}) |
| 518 | + ds.count.add(d2, 1, {'exp': 4}) |
463 | 519 | #ds.testest.add(d1, 135) |
464 | 520 | #ds.testest.add(d2, 535) |
465 | 521 | ds.summary() |
466 | 522 | ds.write(format='csv') |
467 | 523 | # v = Variable('test', 'year') |
468 | 524 | ds.encode() |
469 | | - print ds |
470 | 525 | |
| 526 | + |
471 | 527 | # mongo.test.insert({'variables': ds}) |
472 | 528 | |
473 | 529 | # v.add(d2 , 5) |
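
The new add() docstring is easiest to check against a small example. This sketch assumes the Variable(name, time_unit) signature visible in debug() above, an importable classes.dataset module path, and that Observation.count tallies every value added:

    import datetime

    from classes.dataset import Variable   # assumed module path

    var = Variable('count', 'year')
    d1 = datetime.datetime(2008, 3, 1)
    d2 = datetime.datetime(2008, 11, 9)

    var.add(d1, 10, {'exp': 3})   # opens the group (2008, exp=3)
    var.add(d2, 5, {'exp': 3})    # same year and meta: joins that group
    var.add(d1, 1, {'exp': 4})    # same year, new exp value: its own group

    print len(var)                # 2 distinct (date range, meta) groups
    print var.number_of_obs()     # 3 values were added in total
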
Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -89,8 +89,8 @@ |
90 | 90 | if self.program_installed == None: |
91 | 91 | raise exceptions.CompressionNotSupportedError |
92 | 92 | |
93 | | - print self.location |
94 | | - print self.file |
| 93 | + #print self.location |
| 94 | + #print self.file |
95 | 95 | if not file_utils.check_file_exists(self.location, self.file): |
96 | 96 | raise exceptions.FileNotFoundException(self.location, self.file) |
97 | 97 | |
— | — | @@ -103,7 +103,7 @@ |
104 | 104 | commands = args.get(self.name, None) |
105 | 105 | #print commands |
106 | 106 | if commands != None: |
107 | | - p = subprocess.call(commands) |
| 107 | + p = subprocess.call(commands, shell=True) |
108 | 108 | #p = subprocess.Popen(commands, shell=True).wait() |
109 | 109 | else: |
110 | 110 | raise exceptions.CompressionNotSupportedError |
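
The switch to shell=True deserves a caveat: commands is a list, and subprocess treats a list combined with shell=True differently per platform. On Windows the list is joined into a single command line; on POSIX only commands[0] reaches the shell and the rest become shell positional parameters. The two portable forms, sketched with illustrative arguments:

    import subprocess

    commands = ['7z', 'x', 'dump.xml.7z']   # illustrative only

    subprocess.call(commands)                        # list without a shell
    subprocess.call(' '.join(commands), shell=True)  # shell wants one string
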
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -29,11 +29,8 @@ |
30 | 30 | from analyses import analyzer |
31 | 31 | |
32 | 32 | |
33 | | -def launch_editor_trends_toolkit(task): |
34 | | - ''' |
35 | | - This function should only be called as a cronjob and not directly. |
36 | | - ''' |
37 | | - project, language, parser, settings = manager.init_args_parser() |
| 33 | +def init_environment(task): |
| 34 | + project, language, parser = manager.init_args_parser() |
38 | 35 | args = parser.parse_args(['django']) |
39 | 36 | pjc = projects.ProjectContainer() |
40 | 37 | project = pjc.get_project(task['project']) |
— | — | @@ -42,8 +39,16 @@ |
43 | 40 | |
44 | 41 | args.language = language.name |
45 | 42 | args.project = project.name |
46 | | - rts = runtime_settings.RunTimeSettings(project, language, settings, args) |
47 | | - res = manager.all_launcher(rts, settings, None) |
| 43 | + rts = runtime_settings.RunTimeSettings(project, language, args) |
| 44 | + return rts |
| 45 | + |
| 46 | + |
| 47 | +def launch_editor_trends_toolkit(task): |
| 48 | + ''' |
| 49 | + This function should only be called as a cronjob and not directly. |
| 50 | + ''' |
| 51 | + rts = init_environment(task) |
| 52 | + res = manager.all_launcher(rts, None) |
48 | 53 | return res |
49 | 54 | |
50 | 55 | |
— | — | @@ -53,22 +58,19 @@ |
54 | 59 | ''' |
55 | 60 | res = True |
56 | 61 | try: |
57 | | - project = task['project'] |
58 | | - language_code = task['language_code'] |
| 62 | + rts = init_environment(task) |
59 | 63 | func = task['jobtype'] |
60 | | - |
61 | | - collection = 'editors_dataset' #FIXME hardcoded string |
62 | 64 | time_unit = 'month' #FIXME hardcoded string |
63 | 65 | cutoff = 1 #FIXME hardcoded string |
64 | 66 | cum_cutoff = 50 #FIXME hardcoded string |
65 | 67 | |
66 | | - analyzer.generate_chart_data(project, |
67 | | - collection, |
68 | | - language_code, |
69 | | - func, |
70 | | - time_unit=time_unit, |
71 | | - cutoff=cutoff, |
72 | | - cum_cutoff=cum_cutoff) |
| 68 | + analyzer.generate_chart_data(rts.project.name, |
| 69 | + rts.collection, |
| 70 | + rts.language.code, |
| 71 | + func, |
| 72 | + time_unit=time_unit, |
| 73 | + cutoff=cutoff, |
| 74 | + cum_cutoff=cum_cutoff) |
73 | 75 | except AttributeError, e: |
74 | 76 | res = False |
75 | 77 | print e #need to capture more fine grained errors but not quite what errors are going to happen. |
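
Both cronjob entry points now funnel through init_environment(), which turns a queued task document into a RunTimeSettings object. A hypothetical task payload, for illustration only:

    task = {'project': 'wiki',                     # hypothetical queue entry
            'language_code': 'en',
            'jobtype': 'histogram_by_backward_cohort'}

    res = launch_chart_data(task)   # init_environment(task) runs inside
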