r82149 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r82148‎ | r82149 | r82150 >
Date:22:40, 14 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
Today's batch of fixes:
1) Dataset descriptives are nicely formatted in a table
2) Extract phase waits for 7z to finish extracting
3) Removed some hard-coded strings
4) Fixed some references.
5) Deleted an old file
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py (modified) (history)
  • /trunk/tools/editor_trends/classes/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (deleted) (history)
  • /trunk/tools/editor_trends/cronjobs.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/compression.py (modified) (history)
  • /trunk/tools/editor_trends/wikilytics/api/forms.py (modified) (history)
  • /trunk/tools/editor_trends/wikilytics/api/models.py (modified) (history)
  • /trunk/tools/editor_trends/wikilytics/api/views.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/configuration.py
@@ -1,206 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2010-10-21'
19 -__version__ = '0.1'
20 -
21 -'''
22 -This file contains settings that are used for constructing and analyzing
23 -the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
24 -'''
25 -
26 -from multiprocessing import cpu_count
27 -import ConfigParser
28 -import os
29 -import sys
30 -import platform
31 -import subprocess
32 -
33 -
34 -from classes import exceptions
35 -
36 -try:
37 - from _winreg import *
38 - from pywin import win32file
39 - '''increase the maximum number of open files on Windows to 1024'''
40 - win32file._setmaxstdio(1024)
41 -except ImportError:
42 - pass
43 -
44 -try:
45 - import resource
46 -except ImportError:
47 - pass
48 -
49 -
50 -class Singleton(type):
51 - '''
52 - Recipe: http://stackoverflow.com/questions/31875/is-there-a-simple-elegant-way-to-define-singletons-in-python
53 - '''
54 - def __init__(cls, name, bases, dict):
55 - super(Singleton, cls).__init__(name, bases, dict)
56 - cls.instance = None
57 -
58 - def __call__(cls, *args, **kw):
59 - if cls.instance is None:
60 - cls.instance = super(Singleton, cls).__call__(*args, **kw)
61 - return cls.instance
62 - else:
63 - return cls.instance
64 -
65 -class Settings:
66 - __metaclass__ = Singleton
67 -
68 - def __init__(self, process_multiplier=1):
69 - self.minimum_python_version = (2, 6)
70 - self.detect_python_version()
71 - self.encoding = 'utf-8'
72 -
73 - #Date format as used by Erik Zachte
74 - self.date_format = '%Y-%m-%d'
75 -
76 - # Timestamp format as generated by the MediaWiki dumps
77 - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
78 - self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z'
79 - #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
80 - self.max_xmlfile_size = 4096 * 1024
81 -
82 - #Change this to match your computers configuration (RAM / CPU)
83 - self.number_of_processes = cpu_count() * process_multiplier
84 -
85 - self.wp_dump_location = 'http://dumps.wikimedia.org'
86 - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
87 - self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
88 - self.windows_register = {'7z.exe': 'Software\\7-Zip', }
89 - #Extensions of ascii files, this is used to determine the filemode to use
90 - self.platform = self.determine_platform()
91 -
92 - self.architecture = platform.machine()
93 - self.working_directory = self.determine_working_directory()
94 - print sys.path
95 - self.update_python_path()
96 - print sys.path
97 -
98 - self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\'
99 - self.max_filehandles = self.determine_max_filehandles_open()
100 - self.tab_width = 4 if self.platform == 'Windows' else 8
101 -
102 -
103 - result = self.load_configuration()
104 - if not result:
105 - self.input_location = os.path.join(self.root, 'wikimedia')
106 -
107 - # Default Input file
108 - self.input_filename = os.path.join(self.input_location, 'en',
109 - 'wiki',
110 - 'enwiki-20100916-stub-meta-history.xml')
111 - # This is the place where error messages are stored for debugging purposes
112 - self.log_location = os.path.join(self.working_directory,
113 - 'logs')
114 - self.csv_location = os.path.join(self.working_directory,
115 - 'data', 'csv')
116 - self.dataset_location = os.path.join(self.working_directory, 'datasets')
117 - self.binary_location = os.path.join(self.working_directory,
118 - 'data', 'objects')
119 -
120 - self.chart_location = os.path.join(self.working_directory, 'statistics',
121 - 'charts')
122 - self.file_choices = ('stub-meta-history.xml.gz',
123 - 'stub-meta-current.xml.gz',
124 - 'pages-meta-history.xml.7z',
125 - 'pages-meta-current.xml.bz2',)
126 -
127 - def load_configuration(self):
128 - if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')):
129 - config = ConfigParser.RawConfigParser()
130 - config.read(os.path.join(self.working_directory, 'wiki.cfg'))
131 - self.working_directory = config.get('file_locations', 'working_directory')
132 - self.input_location = config.get('file_locations', 'input_location')
133 - self.default_project = config.get('wiki', 'project')
134 - self.default_language = config.get('wiki', 'language')
135 - return True
136 - else:
137 - return False
138 -
139 - def determine_working_directory(self):
140 - cwd = os.getcwd()
141 - if not cwd.endswith('editor_trends%s' % os.sep):
142 - pos = cwd.find('editor_trends') + 14
143 - cwd = cwd[:pos]
144 - return cwd
145 -
146 - def detect_python_version(self):
147 - version = sys.version_info[0:2]
148 - #logger.debug('Python version: %s' % '.'.join(str(version)))
149 - if version < self.minimum_python_version:
150 - raise exceptions.OutDatedPythonVersionError
151 -
152 - def determine_platform(self):
153 - if platform.system() == 'Darwin':
154 - return 'OSX'
155 - else:
156 - return platform.system()
157 -
158 - def verify_environment(self, directories):
159 - for directory in directories:
160 - if not os.path.exists(directory):
161 - try:
162 - os.makedirs(directory)
163 - except IOError:
164 - print 'Configuration Error, could not create directory %s.' % directory
165 -
166 - def detect_windows_program(self, program):
167 - entry = self.windows_register.get(program, None)
168 - try:
169 - key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
170 - return QueryValueEx(key, 'Path')[0]
171 - except WindowsError:
172 - return None
173 -
174 - def detect_linux_program(self, program):
175 - path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0]
176 - return path.strip()
177 -
178 - def detect_installed_program(self, program):
179 - if self.platform == 'Windows':
180 - if not program.endswith('.exe'):
181 - program = program + '.exe'
182 - path = self.detect_windows_program(program)
183 - if path != None:
184 - path = path + program
185 - elif self.platform == 'Linux':
186 - path = self.detect_linux_program(program)
187 -
188 - return path
189 -
190 - def determine_max_filehandles_open(self):
191 - if self.platform == 'Windows' and self.architecture == 'i386':
192 - return win32file._getmaxstdio()
193 - elif self.platform != 'Windows':
194 - return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
195 - else:
196 - return 500
197 -
198 - def update_python_path(self):
199 - IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs',
200 - 'statistics', 'js_scripts', 'deployment',
201 - 'documentation', 'data', 'code-snippets']
202 - dirs = [name for name in os.listdir(self.working_directory) if
203 - os.path.isdir(os.path.join(self.working_directory, name))]
204 - for subdirname in dirs:
205 - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
206 - sys.path.append(os.path.join(self.working_directory,
207 - subdirname))
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
@@ -39,10 +39,6 @@
4040 if w >= editor_dt:
4141 datum = datetime.datetime(int(year), 12, 31)
4242 freq = int(editor['edits_by_year'][year])
43 - #if datum == datetime.datetime(2003, 12, 31):
44 -# if w == 24:
45 -# if freq == 1.0:
46 -# print 'break'
47 - var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}})
 43+ var.add(datum, 1, {'window': w, 'frequency': freq})
4844 break
4945 return var
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py
@@ -23,7 +23,6 @@
2424
2525
2626 def cohort_dataset_backward_bar(var, editor, **kwargs):
27 - #first_edit = editor['first_edit']
2827 '''
2928 The backward looking bar chart looks for every year that an editor
3029 was part of the Wikimedia community whether this person made at least cutoff
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -26,6 +26,7 @@
2727 if '..' not in sys.path:
2828 sys.path.append('..')
2929
 30+import inventory
3031 from classes import dataset
3132 from classes import settings
3233 settings = settings.Settings()
@@ -42,7 +43,7 @@
4344 stopwatch = timer.Timer()
4445 res = True
4546 dbname = '%s%s' % (language_code, project)
46 - functions = available_analyses()
 47+ functions = inventory.available_analyses()
4748 try:
4849 func = functions[func]
4950 except KeyError:
@@ -119,8 +120,8 @@
120121
121122
122123 if __name__ == '__main__':
123 - #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
124 - generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
 124+ generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
 125+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
125126 #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year')
126127 #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year')
127128 #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year')
Index: trunk/tools/editor_trends/manage.py
@@ -229,12 +229,12 @@
230230 file_utils.delete_file(settings.binary_location, filename)
231231
232232
233 -def all_launcher(properties, logger):
 233+def all_launcher(rts, logger):
234234 print 'The entire data processing chain has been called, this will take a \
235235 couple of hours (at least) to complete.'
236236 stopwatch = timer.Timer()
237 - log.log_to_mongo(properties, 'dataset', 'all', stopwatch, event='start')
238 - print 'Start of building %s %s dataset.' % (properties.language.name, properties.project)
 237+ log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='start')
 238+ print 'Start of building %s %s dataset.' % (rts.language.name, rts.project)
239239
240240 # write_message_to_log(logger, settings,
241241 # message=message,
@@ -242,8 +242,8 @@
243243 # full_project=properties.full_project,
244244 # ignore=properties.ignore,
245245 # clean=properties.clean)
246 - if properties.clean:
247 - cleanup(properties, settings, logger)
 246+ if rts.clean:
 247+ cleanup(rts, logger)
248248
249249 functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'),
250250 (extract_launcher, 'extract'),
@@ -253,16 +253,16 @@
254254 (dataset_launcher, 'dataset')))
255255
256256 for function, callname in functions.iteritems():
257 - if callname not in properties.ignore:
 257+ if callname not in rts.ignore:
258258 print 'Starting %s' % function.func_name
259 - res = function(properties, logger)
 259+ res = function(rts, logger)
260260 if res == False:
261261 sys.exit(False)
262262 elif res == None:
263263 print 'Function %s does not return a status, \
264264 implement NOW' % function.func_name
265265 stopwatch.elapsed()
266 - log.log_to_mongo(properties, 'dataset', 'all', stopwatch, event='finish')
 266+ log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='finish')
267267
268268
269269
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -382,6 +382,8 @@
383383 '''
384384 result = True
385385 tasks = unzip(rts)
 386+ if not tasks:
 387+ return False
386388
387389 output = os.path.join(rts.input_location, rts.language.code,
388390 rts.project.name, 'txt')
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -17,6 +17,7 @@
1818 __date__ = '2010-11-02'
1919 __version__ = '0.1'
2020
 21+import progressbar
2122 import multiprocessing
2223 from Queue import Empty
2324 from operator import itemgetter
@@ -192,10 +193,11 @@
193194 def transform_editors_single_launcher(rts):
194195 ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
195196 input_db, output_db = setup_database(rts)
 197+ pbar = progressbar.ProgressBar(maxval=len(ids)).start()
196198 for x, id in enumerate(ids):
197 - print '%s editors to go...' % (len(ids) - x)
198199 editor = Editor(id, input_db, output_db)
199200 editor()
 201+ pbar.update(pbar.currval + 1)
200202
201203
202204 if __name__ == '__main__':
Index: trunk/tools/editor_trends/wikilytics/api/views.py
@@ -42,7 +42,7 @@
4343 if created:
4444 job.save()
4545 jobs = Job.objects.filter(jobtype='dataset', finished=False, in_progress=False)
46 - ds = Dataset.objects.using('enwiki').filter(project=project, language_code=language)
 46+ ds = Dataset.objects.filter(project=project, language_code=language)
4747 print ds
4848 return render_to_response('datasets.html', {'datasets': ds, 'jobs': jobs})
4949
@@ -59,7 +59,7 @@
6060 c = {}
6161 print project, language, chart
6262 try:
63 - ds = Dataset.objects.using('enwiki').get(project=project, language_code=language, name=chart)
 63+ ds = Dataset.objects.get(project=project, language_code=language, name=chart)
6464 print ds
6565 except:
6666 hash = helpers.create_hash(project, language)
Index: trunk/tools/editor_trends/wikilytics/api/models.py
@@ -19,7 +19,7 @@
2020 variables = DictField()
2121
2222 class Meta:
23 - db_table = 'enwiki_charts'
 23+ db_table = 'charts'
2424
2525 def __iter__(self):
2626 for key, value in self.variables.items():
@@ -34,24 +34,24 @@
3535 return reverse('chart_generator', args=[self.project, self.language_code, self.name])
3636
3737
38 -class Editor(models.Model):
39 - username = models.CharField(max_length=64)
40 - editor = models.IntegerField()
41 - first_edit = models.DateField()
42 - final_edit = models.DateField()
43 - new_wikipedian = models.DateField()
44 - monthly_edits = DictField()
45 - edit_count = models.IntegerField()
46 - articles_by_year = DictField()
47 - edits_by_year = DictField()
48 - edits = ListField()
 38+#class Editor(models.Model, language_code, project):
 39+# username = models.CharField(max_length=64)
 40+# editor = models.IntegerField()
 41+# first_edit = models.DateField()
 42+# final_edit = models.DateField()
 43+# new_wikipedian = models.DateField()
 44+# monthly_edits = DictField()
 45+# edit_count = models.IntegerField()
 46+# articles_by_year = DictField()
 47+# edits_by_year = DictField()
 48+# edits = ListField()
 49+#
 50+# class Meta:
 51+# db_table = '%s%s_editors_dataset' % (language_code, project)
 52+#
 53+# def __unicode__(self):
 54+# return u'%s, total edits: %s' % (self.username, self.edit_count)
4955
50 - class Meta:
51 - db_table = 'editors_dataset'
52 -
53 - def __unicode__(self):
54 - return u'%s, total edits: %s' % (self.username, self.edit_count)
55 -
5656 class EditorAdmin(admin.ModelAdmin):
5757 pass
5858
Index: trunk/tools/editor_trends/wikilytics/api/forms.py
@@ -3,7 +3,7 @@
44
55 from wikilytics.api.widgets import MonthYearWidget
66 from editor_trends.classes import projects
7 -from editor_trends.analyses.analyzer import available_analyses
 7+from editor_trends.analyses.inventory import available_analyses
88
99
1010
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -29,7 +29,6 @@
3030 import datetime
3131 import time
3232 import re
33 -#sys.path.append('..')
3433
3534 from settings import Settings
3635 from utils import text_utils
@@ -60,7 +59,6 @@
6160 self.input_location != None else self.get_value('location')
6261 self.project = self.update_project_settings()
6362 self.language = self.update_language_settings()
64 - #self.dbname = '%s%s' % (self.language.code, self.project.name)
6563 self.targets = self.split_keywords(self.get_value('charts'))
6664 self.keywords = self.split_keywords(self.get_value('keywords'))
6765 self.function = self.get_value('func')
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -25,8 +25,8 @@
2626 import sys
2727 from pymongo.son_manipulator import SONManipulator
2828 from multiprocessing import Lock
 29+from texttable import Texttable
2930
30 -
3131 sys.path.append('..')
3232 import configuration
3333 settings = configuration.Settings()
@@ -80,11 +80,18 @@
8181 Dataset classes.
8282 '''
8383 def __hash__(self, vars):
 84+ '''
 85+ This is a generic hash function that expects a list of variables, used
 86+ to lookup an observation or Variable.
 87+ '''
8488 id = ''.join([str(var) for var in vars])
8589 return hash(id)
86 - #return int(self.convert_date_to_epoch(date))
8790
8891 def encode_to_bson(self, data=None):
 92+ '''
 93+ This function converts a Variable or Observation to a dictionary that
 94+ can be stored in Mongo.
 95+ '''
8996 if data:
9097 kwargs = dict([(str(key), value) for key, value in data.__dict__.iteritems()])
9198 else:
@@ -100,6 +107,10 @@
101108 return kwargs
102109
103110 def convert_date_to_epoch(self, date):
 111+ '''
 112+ Calculate the number of seconds since epoch depending on the time_unit
 113+ of the date
 114+ '''
104115 assert self.time_unit == 'year' or self.time_unit == 'month' \
105116 or self.time_unit == 'day', 'Time unit should either be year, month or day.'
106117
@@ -115,6 +126,9 @@
116127 return date
117128
118129 def set_date_range(self, date):
 130+ '''
 131+ Determine the width of a date range for an observation.
 132+ '''
119133 if self.time_unit == 'year':
120134 return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1)
121135 elif self.time_unit == 'month':
@@ -213,7 +227,7 @@
214228 yield key
215229
216230 def __len__(self):
217 - return [x for x in xrange(self.obs())]
 231+ return len(self.obs.keys())
218232
219233 def items(self):
220234 for key in self.__dict__.keys():
@@ -240,8 +254,25 @@
241255 return obs
242256
243257 def add(self, date, value, meta={}):
 258+ '''
 259+ The add function is used to add an observation to a variable. An
 260+ observation is always grouped by the combination of the date and time_unit.
 261+ Time_unit is a property of a Variable and indicates how granular the
 262+ observations should be grouped. For example, if time_unit == year then
 263+ all observations in a given year will be grouped.
 264+ When calling add you should supply at least two variables:
 265+ 1) date: when did the observation happen
 266+ 2) value: an integer or float that was observed on that date
 267+ Optionally you can supply a dictionary for extra groupings. The key is
 268+ the name of the extra grouping.
 269+ For example, if you add {'experience': 3} as the meta dict when calling
 270+ add then you will create an extra grouping called experience and all
 271+ future observations who fall in the same date range and the same
 272+ exerience level will be grouped by that particular observation. You
 273+ can use as many extra groupings as you want but usually one extra grouping
 274+ should be enough.
 275+ '''
244276 assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.'
245 - #id = self.convert_date_to_epoch(date)
246277 start, end = self.set_date_range(date)
247278 values = meta.values()
248279 values.insert(0, end)
@@ -252,6 +283,12 @@
253284 obs.add(value)
254285 self.obs[id] = obs
255286
 287+ def number_of_obs(self):
 288+ n = 0
 289+ for obs in self.obs:
 290+ n += self.obs[obs].count
 291+ return n
 292+
256293 def encode(self):
257294 bson = {}
258295 for prop in self.props:
@@ -323,6 +360,12 @@
324361 for var in self.variables:
325362 yield getattr(self, var)
326363
 364+ def details(self):
 365+ print 'Project: %s%s' % (self.language_code, self.project)
 366+ print 'JSON encoder: %s' % self.encoder
 367+ print 'Raw data was retrieved from: %s%s/%s' % (self.language_code,
 368+ self.project,
 369+ self.collection)
327370
328371 def create_filename(self):
329372 '''
@@ -359,6 +402,9 @@
360403
361404
362405 def add_variable(self, var):
 406+ '''
 407+ Call this function to add a Variable to a dataset.
 408+ '''
363409 if isinstance(var, Variable):
364410 self.variables.append(var.name)
365411 setattr(self, var.name, var)
@@ -366,6 +412,9 @@
367413 raise TypeError('You can only instance of Variable to a dataset.')
368414
369415 def write(self, format='csv'):
 416+ '''
 417+ This is the entry point for outputting data, either to csv or mongo.
 418+ '''
370419 self.create_filename()
371420 if format == 'csv':
372421 self.to_csv()
@@ -429,22 +478,29 @@
430479 variable.sds = self.get_standard_deviation(data)
431480 variable.min = min(data)
432481 variable.max = max(data)
433 - variable.n = len(data)
 482+ variable.num_obs = variable.number_of_obs()
 483+ variable.num_dates = len(variable)
434484 variable.first_obs, variable.last_obs = variable.get_date_range()
435485
436486 def summary(self):
437487 self.descriptives()
438 - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean',
439 - 'Median', 'SD', 'Minimum', 'Maximum',
440 - 'Num Obs', 'First Obs', 'Final Obs')
441 - for variable in self:
442 - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name,
443 - variable.mean, variable.median,
444 - variable.sds, variable.min,
445 - variable.max, variable.n,
446 - variable.first_obs, variable.last_obs)
 488+ table = Texttable(max_width=0)
 489+ vars = ['Variable', 'Mean', 'Median', 'SD', 'Minimum', 'Maximum',
 490+ 'Num Obs', 'Num of\nUnique Dates', 'First Obs', 'Final Obs']
 491+ table.add_row([var for var in vars])
 492+ table.set_cols_align(['r' for v in vars])
 493+ table.set_cols_valign(['m' for v in vars])
447494
 495+ for x, variable in enumerate(self):
 496+ table.add_row([variable.name, variable.mean, variable.median,
 497+ variable.sds, variable.min, variable.max,
 498+ variable.num_obs, variable.num_dates,
 499+ variable.first_obs, variable.last_obs])
 500+ print table.draw()
 501+ print self
 502+ print self.details()
448503
 504+
449505 def debug():
450506 mongo = db.init_mongo_db('enwiki')
451507 rawdata = mongo['enwiki_charts']
@@ -456,17 +512,17 @@
457513 {'name': 'count', 'time_unit': 'year'},
458514 # {'name': 'testest', 'time_unit': 'year'}
459515 ])
460 - ds.count.add(d1, 10, ['exp', 'window'])
461 - ds.count.add(d1, 135, ['exp', 'window'])
462 - ds.count.add(d2, 1, ['exp', 'window'])
 516+ ds.count.add(d1, 10, {'exp': 3})
 517+ ds.count.add(d1, 135, {'exp': 3})
 518+ ds.count.add(d2, 1, {'exp': 4})
463519 #ds.testest.add(d1, 135)
464520 #ds.testest.add(d2, 535)
465521 ds.summary()
466522 ds.write(format='csv')
467523 # v = Variable('test', 'year')
468524 ds.encode()
469 - print ds
470525
 526+
471527 # mongo.test.insert({'variables': ds})
472528
473529 # v.add(d2 , 5)
Index: trunk/tools/editor_trends/utils/compression.py
@@ -89,8 +89,8 @@
9090 if self.program_installed == None:
9191 raise exceptions.CompressionNotSupportedError
9292
93 - print self.location
94 - print self.file
 93+ #print self.location
 94+ #print self.file
9595 if not file_utils.check_file_exists(self.location, self.file):
9696 raise exceptions.FileNotFoundException(self.location, self.file)
9797
@@ -103,7 +103,7 @@
104104 commands = args.get(self.name, None)
105105 #print commands
106106 if commands != None:
107 - p = subprocess.call(commands)
 107+ p = subprocess.call(commands, shell=True)
108108 #p = subprocess.Popen(commands, shell=True).wait()
109109 else:
110110 raise exceptions.CompressionNotSupportedError
Index: trunk/tools/editor_trends/cronjobs.py
@@ -29,11 +29,8 @@
3030 from analyses import analyzer
3131
3232
33 -def launch_editor_trends_toolkit(task):
34 - '''
35 - This function should only be called as a cronjob and not directly.
36 - '''
37 - project, language, parser, settings = manager.init_args_parser()
 33+def init_environment(task):
 34+ project, language, parser = manager.init_args_parser()
3835 args = parser.parse_args(['django'])
3936 pjc = projects.ProjectContainer()
4037 project = pjc.get_project(task['project'])
@@ -42,8 +39,16 @@
4340
4441 args.language = language.name
4542 args.project = project.name
46 - rts = runtime_settings.RunTimeSettings(project, language, settings, args)
47 - res = manager.all_launcher(rts, settings, None)
 43+ rts = runtime_settings.RunTimeSettings(project, language, args)
 44+ return rts
 45+
 46+
 47+def launch_editor_trends_toolkit(task):
 48+ '''
 49+ This function should only be called as a cronjob and not directly.
 50+ '''
 51+ rts = init_environment(task)
 52+ res = manager.all_launcher(rts, None)
4853 return res
4954
5055
@@ -53,22 +58,19 @@
5459 '''
5560 res = True
5661 try:
57 - project = task['project']
58 - language_code = task['language_code']
 62+ rts = init_environment(task)
5963 func = task['jobtype']
60 -
61 - collection = 'editors_dataset' #FIXME hardcoded string
6264 time_unit = 'month' #FIXME hardcoded string
6365 cutoff = 1 #FIXME hardcoded string
6466 cum_cutoff = 50 #FIXME hardcoded string
6567
66 - analyzer.generate_chart_data(project,
67 - collection,
68 - language_code,
69 - func,
70 - time_unit=time_unit,
71 - cutoff=cutoff,
72 - cum_cutoff=cum_cutoff)
 68+ analyzer.generate_chart_data(rts.project.name,
 69+ rts.collection,
 70+ rts.language.code,
 71+ func,
 72+ time_unit=time_unit,
 73+ cutoff=cutoff,
 74+ cum_cutoff=cum_cutoff)
7375 except AttributeError, e:
7476 res = False
7577 print e #need to capture more fine grained errors but not quite what errors are going to happen.

Follow-up revisions

RevisionCommit summaryAuthorDate
r87957* Fix up bone-headed mistake from r82149...mah09:36, 13 May 2011

Status & tagging log