Index: trunk/tools/editor_trends/configuration.py
— | — | @@ -1,206 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-10-21' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -''' |
22 | | -This file contains settings that are used for constructing and analyzing |
23 | | -the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
24 | | -''' |
25 | | - |
26 | | -from multiprocessing import cpu_count |
27 | | -import ConfigParser |
28 | | -import os |
29 | | -import sys |
30 | | -import platform |
31 | | -import subprocess |
32 | | - |
33 | | - |
34 | | -from classes import exceptions |
35 | | - |
36 | | -try: |
37 | | - from _winreg import * |
38 | | - from pywin import win32file |
39 | | - '''increase the maximum number of open files on Windows to 1024''' |
40 | | - win32file._setmaxstdio(1024) |
41 | | -except ImportError: |
42 | | - pass |
43 | | - |
44 | | -try: |
45 | | - import resource |
46 | | -except ImportError: |
47 | | - pass |
48 | | - |
49 | | - |
50 | | -class Singleton(type): |
51 | | - ''' |
52 | | - Recipe: http://stackoverflow.com/questions/31875/is-there-a-simple-elegant-way-to-define-singletons-in-python |
53 | | - ''' |
54 | | - def __init__(cls, name, bases, dict): |
55 | | - super(Singleton, cls).__init__(name, bases, dict) |
56 | | - cls.instance = None |
57 | | - |
58 | | - def __call__(cls, *args, **kw): |
59 | | - if cls.instance is None: |
60 | | - cls.instance = super(Singleton, cls).__call__(*args, **kw) |
61 | | - return cls.instance |
62 | | - else: |
63 | | - return cls.instance |
64 | | - |
65 | | -class Settings: |
66 | | - __metaclass__ = Singleton |
67 | | - |
68 | | - def __init__(self, process_multiplier=1): |
69 | | - self.minimum_python_version = (2, 6) |
70 | | - self.detect_python_version() |
71 | | - self.encoding = 'utf-8' |
72 | | - |
73 | | - #Date format as used by Erik Zachte |
74 | | - self.date_format = '%Y-%m-%d' |
75 | | - |
76 | | - # Timestamp format as generated by the MediaWiki dumps |
77 | | - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' |
78 | | - self.timestamp_server = '%a, %d %b %Y %H:%M:%S %Z' |
79 | | - #67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
80 | | - self.max_xmlfile_size = 4096 * 1024 |
81 | | - |
82 | | - #Change this to match your computers configuration (RAM / CPU) |
83 | | - self.number_of_processes = cpu_count() * process_multiplier |
84 | | - |
85 | | - self.wp_dump_location = 'http://dumps.wikimedia.org' |
86 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
87 | | - self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
88 | | - self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
89 | | - #Extensions of ascii files, this is used to determine the filemode to use |
90 | | - self.platform = self.determine_platform() |
91 | | - |
92 | | - self.architecture = platform.machine() |
93 | | - self.working_directory = self.determine_working_directory() |
94 | | - print sys.path |
95 | | - self.update_python_path() |
96 | | - print sys.path |
97 | | - |
98 | | - self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
99 | | - self.max_filehandles = self.determine_max_filehandles_open() |
100 | | - self.tab_width = 4 if self.platform == 'Windows' else 8 |
101 | | - |
102 | | - |
103 | | - result = self.load_configuration() |
104 | | - if not result: |
105 | | - self.input_location = os.path.join(self.root, 'wikimedia') |
106 | | - |
107 | | - # Default Input file |
108 | | - self.input_filename = os.path.join(self.input_location, 'en', |
109 | | - 'wiki', |
110 | | - 'enwiki-20100916-stub-meta-history.xml') |
111 | | - # This is the place where error messages are stored for debugging purposes |
112 | | - self.log_location = os.path.join(self.working_directory, |
113 | | - 'logs') |
114 | | - self.csv_location = os.path.join(self.working_directory, |
115 | | - 'data', 'csv') |
116 | | - self.dataset_location = os.path.join(self.working_directory, 'datasets') |
117 | | - self.binary_location = os.path.join(self.working_directory, |
118 | | - 'data', 'objects') |
119 | | - |
120 | | - self.chart_location = os.path.join(self.working_directory, 'statistics', |
121 | | - 'charts') |
122 | | - self.file_choices = ('stub-meta-history.xml.gz', |
123 | | - 'stub-meta-current.xml.gz', |
124 | | - 'pages-meta-history.xml.7z', |
125 | | - 'pages-meta-current.xml.bz2',) |
126 | | - |
127 | | - def load_configuration(self): |
128 | | - if os.path.exists(os.path.join(self.working_directory, 'wiki.cfg')): |
129 | | - config = ConfigParser.RawConfigParser() |
130 | | - config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
131 | | - self.working_directory = config.get('file_locations', 'working_directory') |
132 | | - self.input_location = config.get('file_locations', 'input_location') |
133 | | - self.default_project = config.get('wiki', 'project') |
134 | | - self.default_language = config.get('wiki', 'language') |
135 | | - return True |
136 | | - else: |
137 | | - return False |
138 | | - |
139 | | - def determine_working_directory(self): |
140 | | - cwd = os.getcwd() |
141 | | - if not cwd.endswith('editor_trends%s' % os.sep): |
142 | | - pos = cwd.find('editor_trends') + 14 |
143 | | - cwd = cwd[:pos] |
144 | | - return cwd |
145 | | - |
146 | | - def detect_python_version(self): |
147 | | - version = sys.version_info[0:2] |
148 | | - #logger.debug('Python version: %s' % '.'.join(str(version))) |
149 | | - if version < self.minimum_python_version: |
150 | | - raise exceptions.OutDatedPythonVersionError |
151 | | - |
152 | | - def determine_platform(self): |
153 | | - if platform.system() == 'Darwin': |
154 | | - return 'OSX' |
155 | | - else: |
156 | | - return platform.system() |
157 | | - |
158 | | - def verify_environment(self, directories): |
159 | | - for directory in directories: |
160 | | - if not os.path.exists(directory): |
161 | | - try: |
162 | | - os.makedirs(directory) |
163 | | - except IOError: |
164 | | - print 'Configuration Error, could not create directory %s.' % directory |
165 | | - |
166 | | - def detect_windows_program(self, program): |
167 | | - entry = self.windows_register.get(program, None) |
168 | | - try: |
169 | | - key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
170 | | - return QueryValueEx(key, 'Path')[0] |
171 | | - except WindowsError: |
172 | | - return None |
173 | | - |
174 | | - def detect_linux_program(self, program): |
175 | | - path = subprocess.Popen(['which', '%s' % program], stdout=subprocess.PIPE).communicate()[0] |
176 | | - return path.strip() |
177 | | - |
178 | | - def detect_installed_program(self, program): |
179 | | - if self.platform == 'Windows': |
180 | | - if not program.endswith('.exe'): |
181 | | - program = program + '.exe' |
182 | | - path = self.detect_windows_program(program) |
183 | | - if path != None: |
184 | | - path = path + program |
185 | | - elif self.platform == 'Linux': |
186 | | - path = self.detect_linux_program(program) |
187 | | - |
188 | | - return path |
189 | | - |
190 | | - def determine_max_filehandles_open(self): |
191 | | - if self.platform == 'Windows' and self.architecture == 'i386': |
192 | | - return win32file._getmaxstdio() |
193 | | - elif self.platform != 'Windows': |
194 | | - return resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100 |
195 | | - else: |
196 | | - return 500 |
197 | | - |
198 | | - def update_python_path(self): |
199 | | - IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs', |
200 | | - 'statistics', 'js_scripts', 'deployment', |
201 | | - 'documentation', 'data', 'code-snippets'] |
202 | | - dirs = [name for name in os.listdir(self.working_directory) if |
203 | | - os.path.isdir(os.path.join(self.working_directory, name))] |
204 | | - for subdirname in dirs: |
205 | | - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
206 | | - sys.path.append(os.path.join(self.working_directory, |
207 | | - subdirname)) |
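
The Settings class and its Singleton metaclass disappear with this file; the diffs below import a replacement from classes.settings, which is outside this changeset. A minimal sketch of the singleton recipe the deleted module used, assuming the replacement keeps the same pattern:

    class Singleton(type):
        '''Metaclass that caches the first instance of each class using it.'''
        def __init__(cls, name, bases, namespace):
            super(Singleton, cls).__init__(name, bases, namespace)
            cls.instance = None

        def __call__(cls, *args, **kw):
            if cls.instance is None:
                # First call: build and cache the instance.
                cls.instance = super(Singleton, cls).__call__(*args, **kw)
            return cls.instance


    class Settings(object):
        __metaclass__ = Singleton   # Python 2 metaclass hook

        def __init__(self, process_multiplier=1):
            self.process_multiplier = process_multiplier


    assert Settings(1) is Settings(2)   # second call returns the cached instance
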
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py |
— | — | @@ -39,10 +39,6 @@ |
40 | 40 | if w >= editor_dt: |
41 | 41 | datum = datetime.datetime(int(year), 12, 31) |
42 | 42 | freq = int(editor['edits_by_year'][year]) |
43 | | - #if datum == datetime.datetime(2003, 12, 31): |
44 | | -# if w == 24: |
45 | | -# if freq == 1.0: |
46 | | -# print 'break' |
47 | | - var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}}) |
| 43 | + var.add(datum, 1, {'window': w, 'frequency': freq}) |
48 | 44 | break |
49 | 45 | return var |
Index: trunk/tools/editor_trends/analyses/plugins/cohort_dataset_backward_bar.py |
— | — | @@ -23,7 +23,6 @@ |
24 | 24 | |
25 | 25 | |
26 | 26 | def cohort_dataset_backward_bar(var, editor, **kwargs): |
27 | | - #first_edit = editor['first_edit'] |
28 | 27 | ''' |
29 | 28 | The backward looking bar chart looks for every year that an editor |
30 | 29 | was part of the Wikimedia community whether this person made at least cutoff |
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -26,6 +26,7 @@ |
27 | 27 | if '..' not in sys.path: |
28 | 28 | sys.path.append('..') |
29 | 29 | |
| 30 | +import inventory |
30 | 31 | from classes import dataset |
31 | 32 | from classes import settings |
32 | 33 | settings = settings.Settings() |
— | — | @@ -42,7 +43,7 @@ |
43 | 44 | stopwatch = timer.Timer() |
44 | 45 | res = True |
45 | 46 | dbname = '%s%s' % (language_code, project) |
46 | | - functions = available_analyses() |
| 47 | + functions = inventory.available_analyses() |
47 | 48 | try: |
48 | 49 | func = functions[func] |
49 | 50 | except KeyError: |
— | — | @@ -119,8 +120,8 @@ |
120 | 121 | |
121 | 122 | |
122 | 123 | if __name__ == '__main__': |
123 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
124 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
| 124 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
| 125 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
125 | 126 | #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year') |
126 | 127 | #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year') |
127 | 128 | #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year') |
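
available_analyses() now comes from analyses/inventory.py, a module this diff does not show. A plausible sketch of such an inventory, assuming it builds a name-to-function map over the plugin modules (the plugin list here is illustrative):

    import inspect

    from analyses.plugins import histogram_by_backward_cohort
    from analyses.plugins import cohort_dataset_backward_bar


    def available_analyses():
        '''Return a {name: function} map of the analysis plugins.'''
        plugins = [histogram_by_backward_cohort, cohort_dataset_backward_bar]
        functions = {}
        for plugin in plugins:
            # Every top-level function in a plugin module becomes an analysis.
            for name, func in inspect.getmembers(plugin, inspect.isfunction):
                functions[name] = func
        return functions
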
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -229,12 +229,12 @@ |
230 | 230 | file_utils.delete_file(settings.binary_location, filename) |
231 | 231 | |
232 | 232 | |
233 | | -def all_launcher(properties, logger): |
| 233 | +def all_launcher(rts, logger): |
234 | 234 | print 'The entire data processing chain has been called, this will take a \ |
235 | 235 | couple of hours (at least) to complete.' |
236 | 236 | stopwatch = timer.Timer() |
237 | | - log.log_to_mongo(properties, 'dataset', 'all', stopwatch, event='start') |
238 | | - print 'Start of building %s %s dataset.' % (properties.language.name, properties.project) |
| 237 | + log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='start') |
| 238 | + print 'Start of building %s %s dataset.' % (rts.language.name, rts.project) |
239 | 239 | |
240 | 240 | # write_message_to_log(logger, settings, |
241 | 241 | # message=message, |
— | — | @@ -242,8 +242,8 @@ |
243 | 243 | # full_project=properties.full_project, |
244 | 244 | # ignore=properties.ignore, |
245 | 245 | # clean=properties.clean) |
246 | | - if properties.clean: |
247 | | - cleanup(properties, settings, logger) |
| 246 | + if rts.clean: |
| 247 | + cleanup(rts, logger) |
248 | 248 | |
249 | 249 | functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'), |
250 | 250 | (extract_launcher, 'extract'), |
— | — | @@ -253,16 +253,16 @@ |
254 | 254 | (dataset_launcher, 'dataset'))) |
255 | 255 | |
256 | 256 | for function, callname in functions.iteritems(): |
257 | | - if callname not in properties.ignore: |
| 257 | + if callname not in rts.ignore: |
258 | 258 | print 'Starting %s' % function.func_name |
259 | | - res = function(properties, logger) |
| 259 | + res = function(rts, logger) |
260 | 260 | if res == False: |
261 | 261 | sys.exit(False) |
262 | 262 | elif res == None: |
263 | 263 | print 'Function %s does not return a status, \ |
264 | 264 | implement NOW' % function.func_name |
265 | 265 | stopwatch.elapsed() |
266 | | - log.log_to_mongo(properties, 'dataset', 'all', stopwatch, event='finish') |
| 266 | + log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='finish') |
267 | 267 | |
268 | 268 | |
269 | 269 | |
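
The properties-to-rts rename runs through the whole driver without changing control flow. Condensed, with a stand-in stage, the loop this hunk edits behaves like this:

    import sys


    def download(rts, logger):
        return True   # stand-in for downloader_launcher


    def run_stages(rts, logger, stages):
        '''Run each (function, name) stage in order, honoring rts.ignore.'''
        for function, callname in stages.iteritems():
            if callname in rts.ignore:
                continue
            res = function(rts, logger)
            if res == False:
                sys.exit(False)   # a failing stage aborts the whole chain
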
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -382,6 +382,8 @@ |
383 | 383 | ''' |
384 | 384 | result = True |
385 | 385 | tasks = unzip(rts) |
| 386 | + if not tasks: |
| 387 | + return False |
386 | 388 | |
387 | 389 | output = os.path.join(rts.input_location, rts.language.code, |
388 | 390 | rts.project.name, 'txt') |
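
The new guard makes the launcher fail fast when unzip() yields no work. A minimal sketch, with a stub standing in for the real unzip():

    def unzip(rts):
        '''Stub: the real function returns the list of unpacked dump files.'''
        return []


    def launcher(rts):
        tasks = unzip(rts)
        if not tasks:      # empty list or None: nothing to extract
            return False   # the pipeline driver treats False as a hard stop
        return True


    assert launcher(None) == False
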
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -17,6 +17,7 @@ |
18 | 18 | __date__ = '2010-11-02' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import progressbar |
21 | 22 | import multiprocessing |
22 | 23 | from Queue import Empty |
23 | 24 | from operator import itemgetter |
— | — | @@ -192,10 +193,11 @@ |
193 | 194 | def transform_editors_single_launcher(rts): |
194 | 195 | ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
195 | 196 | input_db, output_db = setup_database(rts) |
| 197 | + pbar = progressbar.ProgressBar(maxval=len(ids)).start() |
196 | 198 | for x, id in enumerate(ids): |
197 | | - print '%s editors to go...' % (len(ids) - x) |
198 | 199 | editor = Editor(id, input_db, output_db) |
199 | 200 | editor() |
| 201 | + pbar.update(pbar.currval + 1) |
200 | 202 | |
201 | 203 | |
202 | 204 | if __name__ == '__main__': |
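
The countdown print gives way to the progressbar package. Note that the loop above never calls finish(), which is the call that completes the bar once the loop ends. A self-contained usage sketch of the same API:

    import time

    import progressbar

    ids = range(50)
    pbar = progressbar.ProgressBar(maxval=len(ids)).start()
    for x, id in enumerate(ids):
        time.sleep(0.05)       # stand-in for Editor(id, input_db, output_db)()
        pbar.update(x + 1)     # same effect as pbar.update(pbar.currval + 1)
    pbar.finish()              # draws the bar at 100% and ends its line
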
Index: trunk/tools/editor_trends/wikilytics/api/views.py |
— | — | @@ -42,7 +42,7 @@ |
43 | 43 | if created: |
44 | 44 | job.save() |
45 | 45 | jobs = Job.objects.filter(jobtype='dataset', finished=False, in_progress=False) |
46 | | - ds = Dataset.objects.using('enwiki').filter(project=project, language_code=language) |
| 46 | + ds = Dataset.objects.filter(project=project, language_code=language) |
47 | 47 | print ds |
48 | 48 | return render_to_response('datasets.html', {'datasets': ds, 'jobs': jobs}) |
49 | 49 | |
— | — | @@ -59,7 +59,7 @@ |
60 | 60 | c = {} |
61 | 61 | print project, language, chart |
62 | 62 | try: |
63 | | - ds = Dataset.objects.using('enwiki').get(project=project, language_code=language, name=chart) |
| 63 | + ds = Dataset.objects.get(project=project, language_code=language, name=chart) |
64 | 64 | print ds |
65 | 65 | except: |
66 | 66 | hash = helpers.create_hash(project, language) |
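
With .using('enwiki') gone, both queries go through Django's regular database routing: the 'default' alias unless a router says otherwise. If per-wiki databases are still wanted without sprinkling .using() at every call site, a database router is the usual place for that decision. A sketch, assuming one connection alias per wiki in settings.DATABASES:

    class WikiRouter(object):
        '''Illustrative router: registered via settings.DATABASE_ROUTERS.'''

        def db_for_read(self, model, **hints):
            if model._meta.db_table == 'charts':
                return 'enwiki'   # route chart reads to the enwiki alias
            return None           # fall through to the 'default' alias

        def db_for_write(self, model, **hints):
            return self.db_for_read(model, **hints)
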
Index: trunk/tools/editor_trends/wikilytics/api/models.py |
— | — | @@ -19,7 +19,7 @@ |
20 | 20 | variables = DictField() |
21 | 21 | |
22 | 22 | class Meta: |
23 | | - db_table = 'enwiki_charts' |
| 23 | + db_table = 'charts' |
24 | 24 | |
25 | 25 | def __iter__(self): |
26 | 26 | for key, value in self.variables.items(): |
— | — | @@ -34,24 +34,24 @@ |
35 | 35 | return reverse('chart_generator', args=[self.project, self.language_code, self.name]) |
36 | 36 | |
37 | 37 | |
38 | | -class Editor(models.Model): |
39 | | - username = models.CharField(max_length=64) |
40 | | - editor = models.IntegerField() |
41 | | - first_edit = models.DateField() |
42 | | - final_edit = models.DateField() |
43 | | - new_wikipedian = models.DateField() |
44 | | - monthly_edits = DictField() |
45 | | - edit_count = models.IntegerField() |
46 | | - articles_by_year = DictField() |
47 | | - edits_by_year = DictField() |
48 | | - edits = ListField() |
| 38 | +#class Editor(models.Model, language_code, project): |
| 39 | +# username = models.CharField(max_length=64) |
| 40 | +# editor = models.IntegerField() |
| 41 | +# first_edit = models.DateField() |
| 42 | +# final_edit = models.DateField() |
| 43 | +# new_wikipedian = models.DateField() |
| 44 | +# monthly_edits = DictField() |
| 45 | +# edit_count = models.IntegerField() |
| 46 | +# articles_by_year = DictField() |
| 47 | +# edits_by_year = DictField() |
| 48 | +# edits = ListField() |
| 49 | +# |
| 50 | +# class Meta: |
| 51 | +# db_table = '%s%s_editors_dataset' % (language_code, project) |
| 52 | +# |
| 53 | +# def __unicode__(self): |
| 54 | +# return u'%s, total edits: %s' % (self.username, self.edit_count) |
49 | 55 | |
50 | | - class Meta: |
51 | | - db_table = 'editors_dataset' |
52 | | - |
53 | | - def __unicode__(self): |
54 | | - return u'%s, total edits: %s' % (self.username, self.edit_count) |
55 | | - |
56 | 56 | class EditorAdmin(admin.ModelAdmin): |
57 | 57 | pass |
58 | 58 | |
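
The commented-out header class Editor(models.Model, language_code, project) is not valid Python; a model class cannot take parameters through its base-class list. The usual route to a per-wiki table name is a factory that builds the model class at runtime. A sketch with a trimmed field list:

    from django.db import models


    def make_editor_model(language_code, project):
        '''Build an Editor model bound to e.g. the enwiki_editors_dataset table.'''
        class Meta:
            db_table = '%s%s_editors_dataset' % (language_code, project)

        attrs = {
            '__module__': __name__,   # required by Django's model metaclass
            'Meta': Meta,
            'username': models.CharField(max_length=64),
            'edit_count': models.IntegerField(),
        }
        return type('Editor', (models.Model,), attrs)


    Editor = make_editor_model('en', 'wiki')   # table: enwiki_editors_dataset
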
Index: trunk/tools/editor_trends/wikilytics/api/forms.py |
— | — | @@ -3,7 +3,7 @@ |
4 | 4 | |
5 | 5 | from wikilytics.api.widgets import MonthYearWidget |
6 | 6 | from editor_trends.classes import projects |
7 | | -from editor_trends.analyses.analyzer import available_analyses |
| 7 | +from editor_trends.analyses.inventory import available_analyses |
8 | 8 | |
9 | 9 | |
10 | 10 | |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -29,7 +29,6 @@ |
30 | 30 | import datetime |
31 | 31 | import time |
32 | 32 | import re |
33 | | -#sys.path.append('..') |
34 | 33 | |
35 | 34 | from settings import Settings |
36 | 35 | from utils import text_utils |
— | — | @@ -60,7 +59,6 @@ |
61 | 60 | self.input_location != None else self.get_value('location') |
62 | 61 | self.project = self.update_project_settings() |
63 | 62 | self.language = self.update_language_settings() |
64 | | - #self.dbname = '%s%s' % (self.language.code, self.project.name) |
65 | 63 | self.targets = self.split_keywords(self.get_value('charts')) |
66 | 64 | self.keywords = self.split_keywords(self.get_value('keywords')) |
67 | 65 | self.function = self.get_value('func') |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -25,8 +25,8 @@ |
26 | 26 | import sys |
27 | 27 | from pymongo.son_manipulator import SONManipulator |
28 | 28 | from multiprocessing import Lock |
| 29 | +from texttable import Texttable |
29 | 30 | |
30 | | - |
31 | 31 | sys.path.append('..') |
32 | 32 | import configuration |
33 | 33 | settings = configuration.Settings() |
— | — | @@ -80,11 +80,18 @@ |
81 | 81 | Dataset classes. |
82 | 82 | ''' |
83 | 83 | def __hash__(self, vars): |
| 84 | + ''' |
| 85 | + This is a generic hash function that expects a list of variables, used |
| 86 | + to look up an observation or Variable. |
| 87 | + ''' |
84 | 88 | id = ''.join([str(var) for var in vars]) |
85 | 89 | return hash(id) |
86 | | - #return int(self.convert_date_to_epoch(date)) |
87 | 90 | |
88 | 91 | def encode_to_bson(self, data=None): |
| 92 | + ''' |
| 93 | + This function converts a Variable or Observation to a dictionary that |
| 94 | + can be stored in Mongo. |
| 95 | + ''' |
89 | 96 | if data: |
90 | 97 | kwargs = dict([(str(key), value) for key, value in data.__dict__.iteritems()]) |
91 | 98 | else: |
— | — | @@ -100,6 +107,10 @@ |
101 | 108 | return kwargs |
102 | 109 | |
103 | 110 | def convert_date_to_epoch(self, date): |
| 111 | + ''' |
| 112 | + Calculate the number of seconds since epoch depending on the time_unit |
| 113 | + of the date |
| 114 | + ''' |
104 | 115 | assert self.time_unit == 'year' or self.time_unit == 'month' \ |
105 | 116 | or self.time_unit == 'day', 'Time unit should either be year, month or day.' |
106 | 117 | |
— | — | @@ -115,6 +126,9 @@ |
116 | 127 | return date |
117 | 128 | |
118 | 129 | def set_date_range(self, date): |
| 130 | + ''' |
| 131 | + Determine the width of a date range for an observation. |
| 132 | + ''' |
119 | 133 | if self.time_unit == 'year': |
120 | 134 | return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1) |
121 | 135 | elif self.time_unit == 'month': |
— | — | @@ -213,7 +227,7 @@ |
214 | 228 | yield key |
215 | 229 | |
216 | 230 | def __len__(self): |
217 | | - return [x for x in xrange(self.obs())] |
| 231 | + return len(self.obs.keys()) |
218 | 232 | |
219 | 233 | def items(self): |
220 | 234 | for key in self.__dict__.keys(): |
— | — | @@ -240,8 +254,25 @@ |
241 | 255 | return obs |
242 | 256 | |
243 | 257 | def add(self, date, value, meta={}): |
| 258 | + ''' |
| 259 | + The add function is used to add an observation to a variable. An |
| 260 | + observation is always grouped by the combination of the date and time_unit. |
| 261 | + Time_unit is a property of a Variable and indicates how granularly the |
| 262 | + observations should be grouped. For example, if time_unit == year then |
| 263 | + all observations in a given year will be grouped. |
| 264 | + When calling add you should supply at least two arguments: |
| 265 | + 1) date: when did the observation happen |
| 266 | + 2) value: an integer or float that was observed on that date |
| 267 | + Optionally you can supply a dictionary for extra groupings. The key is |
| 268 | + the name of the extra grouping. |
| 269 | + For example, if you add {'experience': 3} as the meta dict when calling |
| 270 | + add then you will create an extra grouping called experience and all |
| 271 | + future observations that fall in the same date range and the same |
| 272 | + experience level will be grouped with that particular observation. You |
| 273 | + can use as many extra groupings as you want but usually one extra grouping |
| 274 | + should be enough. |
| 275 | + ''' |
244 | 276 | assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.' |
245 | | - #id = self.convert_date_to_epoch(date) |
246 | 277 | start, end = self.set_date_range(date) |
247 | 278 | values = meta.values() |
248 | 279 | values.insert(0, end) |
— | — | @@ -252,6 +283,12 @@ |
253 | 284 | obs.add(value) |
254 | 285 | self.obs[id] = obs |
255 | 286 | |
| 287 | + def number_of_obs(self): |
| 288 | + n = 0 |
| 289 | + for obs in self.obs: |
| 290 | + n += self.obs[obs].count |
| 291 | + return n |
| 292 | + |
256 | 293 | def encode(self): |
257 | 294 | bson = {} |
258 | 295 | for prop in self.props: |
— | — | @@ -323,6 +360,12 @@ |
324 | 361 | for var in self.variables: |
325 | 362 | yield getattr(self, var) |
326 | 363 | |
| 364 | + def details(self): |
| 365 | + print 'Project: %s%s' % (self.language_code, self.project) |
| 366 | + print 'JSON encoder: %s' % self.encoder |
| 367 | + print 'Raw data was retrieved from: %s%s/%s' % (self.language_code, |
| 368 | + self.project, |
| 369 | + self.collection) |
327 | 370 | |
328 | 371 | def create_filename(self): |
329 | 372 | ''' |
— | — | @@ -359,6 +402,9 @@ |
360 | 403 | |
361 | 404 | |
362 | 405 | def add_variable(self, var): |
| 406 | + ''' |
| 407 | + Call this function to add a Variable to a dataset. |
| 408 | + ''' |
363 | 409 | if isinstance(var, Variable): |
364 | 410 | self.variables.append(var.name) |
365 | 411 | setattr(self, var.name, var) |
— | — | @@ -366,6 +412,9 @@ |
367 | 413 | raise TypeError('You can only instance of Variable to a dataset.') |
368 | 414 | |
369 | 415 | def write(self, format='csv'): |
| 416 | + ''' |
| 417 | + This is the entry point for outputting data, either to csv or mongo. |
| 418 | + ''' |
370 | 419 | self.create_filename() |
371 | 420 | if format == 'csv': |
372 | 421 | self.to_csv() |
— | — | @@ -429,22 +478,29 @@ |
430 | 479 | variable.sds = self.get_standard_deviation(data) |
431 | 480 | variable.min = min(data) |
432 | 481 | variable.max = max(data) |
433 | | - variable.n = len(data) |
| 482 | + variable.num_obs = variable.number_of_obs() |
| 483 | + variable.num_dates = len(variable) |
434 | 484 | variable.first_obs, variable.last_obs = variable.get_date_range() |
435 | 485 | |
436 | 486 | def summary(self): |
437 | 487 | self.descriptives() |
438 | | - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', |
439 | | - 'Median', 'SD', 'Minimum', 'Maximum', |
440 | | - 'Num Obs', 'First Obs', 'Final Obs') |
441 | | - for variable in self: |
442 | | - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, |
443 | | - variable.mean, variable.median, |
444 | | - variable.sds, variable.min, |
445 | | - variable.max, variable.n, |
446 | | - variable.first_obs, variable.last_obs) |
| 488 | + table = Texttable(max_width=0) |
| 489 | + vars = ['Variable', 'Mean', 'Median', 'SD', 'Minimum', 'Maximum', |
| 490 | + 'Num Obs', 'Num of\nUnique Dates', 'First Obs', 'Final Obs'] |
| 491 | + table.add_row([var for var in vars]) |
| 492 | + table.set_cols_align(['r' for v in vars]) |
| 493 | + table.set_cols_valign(['m' for v in vars]) |
447 | 494 | |
| 495 | + for x, variable in enumerate(self): |
| 496 | + table.add_row([variable.name, variable.mean, variable.median, |
| 497 | + variable.sds, variable.min, variable.max, |
| 498 | + variable.num_obs, variable.num_dates, |
| 499 | + variable.first_obs, variable.last_obs]) |
| 500 | + print table.draw() |
| 501 | + print self |
| 502 | + print self.details() |
448 | 503 | |
| 504 | + |
449 | 505 | def debug(): |
450 | 506 | mongo = db.init_mongo_db('enwiki') |
451 | 507 | rawdata = mongo['enwiki_charts'] |
— | — | @@ -456,17 +512,17 @@ |
457 | 513 | {'name': 'count', 'time_unit': 'year'}, |
458 | 514 | # {'name': 'testest', 'time_unit': 'year'} |
459 | 515 | ]) |
460 | | - ds.count.add(d1, 10, ['exp', 'window']) |
461 | | - ds.count.add(d1, 135, ['exp', 'window']) |
462 | | - ds.count.add(d2, 1, ['exp', 'window']) |
| 516 | + ds.count.add(d1, 10, {'exp': 3}) |
| 517 | + ds.count.add(d1, 135, {'exp': 3}) |
| 518 | + ds.count.add(d2, 1, {'exp': 4}) |
463 | 519 | #ds.testest.add(d1, 135) |
464 | 520 | #ds.testest.add(d2, 535) |
465 | 521 | ds.summary() |
466 | 522 | ds.write(format='csv') |
467 | 523 | # v = Variable('test', 'year') |
468 | 524 | ds.encode() |
469 | | - print ds |
470 | 525 | |
| 526 | + |
471 | 527 | # mongo.test.insert({'variables': ds}) |
472 | 528 | |
473 | 529 | # v.add(d2 , 5) |
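
The new add() docstring is easiest to check against a small example. This sketch assumes the Variable(name, time_unit) signature visible in debug() above, an importable classes.dataset module path, and that Observation.count tallies every value added:

    import datetime

    from classes.dataset import Variable   # assumed module path

    var = Variable('count', 'year')
    d1 = datetime.datetime(2008, 3, 1)
    d2 = datetime.datetime(2008, 11, 9)

    var.add(d1, 10, {'exp': 3})   # opens the group (2008, exp=3)
    var.add(d2, 5, {'exp': 3})    # same year and meta: joins that group
    var.add(d1, 1, {'exp': 4})    # same year, new exp value: its own group

    print len(var)                # 2 distinct (date range, meta) groups
    print var.number_of_obs()     # 3 values were added in total
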
Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -89,8 +89,8 @@ |
90 | 90 | if self.program_installed == None: |
91 | 91 | raise exceptions.CompressionNotSupportedError |
92 | 92 | |
93 | | - print self.location |
94 | | - print self.file |
| 93 | + #print self.location |
| 94 | + #print self.file |
95 | 95 | if not file_utils.check_file_exists(self.location, self.file): |
96 | 96 | raise exceptions.FileNotFoundException(self.location, self.file) |
97 | 97 | |
— | — | @@ -103,7 +103,7 @@ |
104 | 104 | commands = args.get(self.name, None) |
105 | 105 | #print commands |
106 | 106 | if commands != None: |
107 | | - p = subprocess.call(commands) |
| 107 | + p = subprocess.call(commands, shell=True) |
108 | 108 | #p = subprocess.Popen(commands, shell=True).wait() |
109 | 109 | else: |
110 | 110 | raise exceptions.CompressionNotSupportedError |
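
The switch to shell=True deserves a caveat: commands is a list, and subprocess treats a list combined with shell=True differently per platform. On Windows the list is joined into a single command line; on POSIX only commands[0] reaches the shell and the rest become shell positional parameters. The two portable forms, sketched with illustrative arguments:

    import subprocess

    commands = ['7z', 'x', 'dump.xml.7z']   # illustrative only

    subprocess.call(commands)                        # list without a shell
    subprocess.call(' '.join(commands), shell=True)  # shell wants one string
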
Index: trunk/tools/editor_trends/cronjobs.py |
— | — | @@ -29,11 +29,8 @@ |
30 | 30 | from analyses import analyzer |
31 | 31 | |
32 | 32 | |
33 | | -def launch_editor_trends_toolkit(task): |
34 | | - ''' |
35 | | - This function should only be called as a cronjob and not directly. |
36 | | - ''' |
37 | | - project, language, parser, settings = manager.init_args_parser() |
| 33 | +def init_environment(task): |
| 34 | + project, language, parser = manager.init_args_parser() |
38 | 35 | args = parser.parse_args(['django']) |
39 | 36 | pjc = projects.ProjectContainer() |
40 | 37 | project = pjc.get_project(task['project']) |
— | — | @@ -42,8 +39,16 @@ |
43 | 40 | |
44 | 41 | args.language = language.name |
45 | 42 | args.project = project.name |
46 | | - rts = runtime_settings.RunTimeSettings(project, language, settings, args) |
47 | | - res = manager.all_launcher(rts, settings, None) |
| 43 | + rts = runtime_settings.RunTimeSettings(project, language, args) |
| 44 | + return rts |
| 45 | + |
| 46 | + |
| 47 | +def launch_editor_trends_toolkit(task): |
| 48 | + ''' |
| 49 | + This function should only be called as a cronjob and not directly. |
| 50 | + ''' |
| 51 | + rts = init_environment(task) |
| 52 | + res = manager.all_launcher(rts, None) |
48 | 53 | return res |
49 | 54 | |
50 | 55 | |
— | — | @@ -53,22 +58,19 @@ |
54 | 59 | ''' |
55 | 60 | res = True |
56 | 61 | try: |
57 | | - project = task['project'] |
58 | | - language_code = task['language_code'] |
| 62 | + rts = init_environment(task) |
59 | 63 | func = task['jobtype'] |
60 | | - |
61 | | - collection = 'editors_dataset' #FIXME hardcoded string |
62 | 64 | time_unit = 'month' #FIXME hardcoded string |
63 | 65 | cutoff = 1 #FIXME hardcoded string |
64 | 66 | cum_cutoff = 50 #FIXME hardcoded string |
65 | 67 | |
66 | | - analyzer.generate_chart_data(project, |
67 | | - collection, |
68 | | - language_code, |
69 | | - func, |
70 | | - time_unit=time_unit, |
71 | | - cutoff=cutoff, |
72 | | - cum_cutoff=cum_cutoff) |
| 68 | + analyzer.generate_chart_data(rts.project.name, |
| 69 | + rts.collection, |
| 70 | + rts.language.code, |
| 71 | + func, |
| 72 | + time_unit=time_unit, |
| 73 | + cutoff=cutoff, |
| 74 | + cum_cutoff=cum_cutoff) |
73 | 75 | except AttributeError, e: |
74 | 76 | res = False |
75 | 77 | print e #need to capture more fine grained errors but not quite what errors are going to happen. |
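
Both cronjob entry points now funnel through init_environment(), which turns a queued task document into a RunTimeSettings object. A hypothetical task payload, for illustration only:

    task = {'project': 'wiki',                     # hypothetical queue entry
            'language_code': 'en',
            'jobtype': 'histogram_by_backward_cohort'}

    res = launch_chart_data(task)   # init_environment(task) runs inside
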