r82005 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r82004 | r82005 | r82006 >
Date:01:44, 12 February 2011
Author:diederik
Status:deferred
Tags:
Comment:
A lot of changes:
1) Using one database for all projects instead of a separate database for each project
2) RunTimeSettings inherits from Settings
3) Major code cleanup
Modified paths:
  • /trunk/tools/editor_trends/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/community_graph.py (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py (added) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py (added) (history)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/community_graph.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/dataset.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/file_size_reduction.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/inventory.py (added) (history)
  • /trunk/tools/editor_trends/analyses/json_encoders.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/match_talkpage_article.py (deleted) (history)
  • /trunk/tools/editor_trends/analyses/plugins/edit_patterns.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py (modified) (history)
  • /trunk/tools/editor_trends/bots/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/classes/dataset.py (modified) (history)
  • /trunk/tools/editor_trends/classes/languages.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/settings.py (modified) (history)
  • /trunk/tools/editor_trends/code-snippets/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/etl/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/etl/downloader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/__init__.py (modified) (history)
  • /trunk/tools/editor_trends/utils/compression.py (modified) (history)
  • /trunk/tools/editor_trends/utils/log.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -26,7 +26,7 @@
2727 from argparse import RawTextHelpFormatter
2828 import ConfigParser
2929
30 -import configuration
 30+#import configuration
3131 from utils import file_utils
3232 from utils import ordered_dict
3333 from utils import log
@@ -40,7 +40,7 @@
4141 from etl import store
4242 from etl import sort
4343 from etl import transformer
44 -from analyses import analyzer
 44+from analyses import inventory
4545
4646
4747 def show_choices(settings, attr):
@@ -50,8 +50,7 @@
5151 return choices
5252
5353
54 -
55 -def config_launcher(properties, settings, logger):
 54+def config_launcher(properties, logger):
5655 '''
5756 Config launcher is used to reconfigure editor trends toolkit.
5857 '''
@@ -98,20 +97,20 @@
9998
10099
101100
102 -def downloader_launcher(properties, settings, logger):
 101+def downloader_launcher(properties, logger):
103102 '''
104103 This launcher calls the dump downloader to download a Wikimedia dump file.
105104 '''
106105 print 'Start downloading'
107106 stopwatch = timer.Timer()
108107 log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='start')
109 - res = downloader.launcher(properties, settings, logger)
 108+ res = downloader.launcher(properties, logger)
110109 stopwatch.elapsed()
111110 log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='finish')
112111 return res
113112
114113
115 -def extract_launcher(properties, settings, logger):
 114+def extract_launcher(properties, logger):
116115 '''
117116 The extract launcher is used to extract the required variables from a dump
118117 file. If the zip file is a known archive then it will first launch the
@@ -125,34 +124,34 @@
126125 log.log_to_mongo(properties, 'dataset', 'extract', stopwatch, event='finish')
127126
128127
129 -def sort_launcher(properties, settings, logger):
 128+def sort_launcher(rts, logger):
130129 '''
131130 After the extracter has finished then the created output files need to be
132131 sorted. This function takes care of that.
133132 '''
134133 print 'Start sorting data'
135134 stopwatch = timer.Timer()
136 - log.log_to_mongo(properties, 'dataset', 'sort', stopwatch, event='start')
 135+ log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='start')
137136 # write_message_to_log(logger, settings,
138137 # message=None,
139138 # verb=None,
140139 # location=properties.location,
141140 # input=properties.txt,
142141 # output=properties.sorted)
143 - sort.mergesort_launcher(properties.txt, properties.sorted)
 142+ sort.launcher(rts)
144143 stopwatch.elapsed()
145 - log.log_to_mongo(properties, 'dataset', 'sort', stopwatch, event='finish')
 144+ log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='finish')
146145
147146
148 -def store_launcher(properties, settings, logger):
 147+def store_launcher(rts, logger):
149148 '''
150149 The data is ready to be stored once the sorted function has completed. This
151150 function starts storing data in MongoDB.
152151 '''
153152 print 'Start storing data in MongoDB'
154153 stopwatch = timer.Timer()
155 - log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start')
156 - db.cleanup_database(properties.dbname, logger)
 154+ log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='start')
 155+ db.cleanup_database(rts.dbname, logger)
157156 # write_message_to_log(logger, settings,
158157 # message=None,
159158 # verb='Storing',
@@ -163,36 +162,34 @@
164163 # collection=properties.collection)
165164 # for key in properties:
166165 # print key, getattr(properties, key)
167 - store.launcher(properties.sorted, properties.dbname, properties.collection)
168 -
 166+ store.launcher(rts)
169167 stopwatch.elapsed()
170 - log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish')
 168+ log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='finish')
171169
172170
173 -def transformer_launcher(properties, settings, logger):
 171+def transformer_launcher(rts, logger):
174172 print 'Start transforming dataset'
175173 stopwatch = timer.Timer()
176 - log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start')
177 - db.cleanup_database(properties.dbname, logger, 'dataset')
 174+ log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='start')
 175+ db.cleanup_database(rts.dbname, logger, 'dataset')
178176 # write_message_to_log(logger, settings,
179177 # message=None,
180178 # verb='Transforming',
181179 # project=properties.project,
182180 # collection=properties.collection)
183 - transformer.transform_editors_single_launcher(properties.dbname,
184 - properties.collection)
 181+ transformer.transform_editors_single_launcher(rts)
185182 stopwatch.elapsed()
186 - log.log_to_mongo(properties, 'dataset', 'transform', stopwatch,
 183+ log.log_to_mongo(rts, 'dataset', 'transform', stopwatch,
187184 event='finish')
188185
189186
190 -def dataset_launcher(properties, settings, logger):
 187+def dataset_launcher(rts, logger):
191188 print 'Start exporting dataset'
192189 stopwatch = timer.Timer()
193 - log.log_to_mongo(properties, 'dataset', 'export', stopwatch, event='start')
 190+ log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start')
194191
195 - collection = '%s_%s' % (properties.collection, 'dataset')
196 - for target in properties.targets:
 192+ #collection = '%s_%s' % (rts.collection, 'dataset')
 193+ for target in rts.targets:
197194 # write_message_to_log(logger, settings,
198195 # message=None,
199196 # verb='Exporting',
@@ -200,16 +197,16 @@
201198 # dbname=properties.full_project,
202199 # collection=properties.collection)
203200
204 - analyzer.generate_chart_data(properties.dbname,
205 - collection,
206 - properties.language.code,
 201+ analyzer.generate_chart_data(rts.dbname,
 202+ rts.editors_dataset,
 203+ rts.language.code,
207204 target,
208 - **properties.keywords)
 205+ **rts.keywords)
209206 stopwatch.elapsed()
210207 log.log_to_mongo(properties, 'dataset', 'export', stopwatch, event='finish')
211208
212209
213 -def cleanup(properties, settings, logger):
 210+def cleanup(rts, logger):
214211 directories = properties.directories[1:]
215212 for directory in directories:
216213 write_message_to_log(logger, setting,
@@ -232,7 +229,7 @@
233230 file_utils.delete_file(settings.binary_location, filename)
234231
235232
236 -def all_launcher(properties, settings, logger):
 233+def all_launcher(properties, logger):
237234 print 'The entire data processing chain has been called, this will take a \
238235 couple of hours (at least) to complete.'
239236 stopwatch = timer.Timer()
@@ -258,7 +255,7 @@
259256 for function, callname in functions.iteritems():
260257 if callname not in properties.ignore:
261258 print 'Starting %s' % function.func_name
262 - res = function(properties, settings, logger)
 259+ res = function(properties, logger)
263260 if res == False:
264261 sys.exit(False)
265262 elif res == None:
@@ -284,11 +281,11 @@
285282 '''
286283 Entry point for parsing command line and launching the needed function(s).
287284 '''
288 - settings = configuration.Settings()
 285+ #settings = configuration.Settings()
289286 language = languages.init()
290287 project = projects.init()
291288 pjc = projects.ProjectContainer()
292 - rts = runtime_settings.RunTimeSettings(project, language, settings)
 289+ rts = runtime_settings.RunTimeSettings(project, language)
293290
294291 #Init Argument Parser
295292 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
@@ -301,7 +298,7 @@
302299 action='store',
303300 help='Enter the first letter of a language to see which languages are \
304301 available.')
305 - parser_languages.set_defaults(func=language.show_languages, args=[settings, project])
 302+ parser_languages.set_defaults(func=language.show_languages, args=[project])
306303
307304 #CONFIG
308305 parser_config = subparsers.add_parser('config',
@@ -350,7 +347,7 @@
351348 parser_dataset.add_argument('-c', '--charts',
352349 action='store',
353350 help='Should be a valid function name that matches one of the plugin functions',
354 - default=analyzer.available_analyses()['new_editor_count'])
 351+ default=inventory.available_analyses()['new_editor_count'])
355352
356353 parser_dataset.add_argument('-k', '--keywords',
357354 action='store',
@@ -399,12 +396,13 @@
400397 parser.add_argument('-c', '--collection',
401398 action='store',
402399 help='Name of MongoDB collection',
403 - default='editors')
 400+ default='editors_raw')
404401
405402 parser.add_argument('-o', '--location',
406403 action='store',
407404 help='Indicate where you want to store the downloaded file.',
408 - default=settings.input_location)
 405+ #default=settings.input_location)
 406+ default=rts.input_location)
409407
410408 parser.add_argument('-ns', '--namespace',
411409 action='store',
@@ -413,41 +411,41 @@
414412
415413 parser.add_argument('-f', '--file',
416414 action='store',
417 - choices=settings.file_choices,
 415+ choices=rts.file_choices,
418416 help='Indicate which dump you want to download. Valid choices are:\n \
419 - %s' % ''.join([f + ',\n' for f in settings.file_choices]),
 417+ %s' % ''.join([f + ',\n' for f in rts.file_choices]),
420418 default='stub-meta-history.xml.gz')
421419
422420
423 - return project, language, parser, settings
 421+ return project, language, parser
424422
425423 def main():
426 - project, language, parser, settings = init_args_parser()
 424+ project, language, parser, = init_args_parser()
427425 args = parser.parse_args()
428 - properties = runtime_settings.RunTimeSettings(project, language, settings, args)
 426+ rts = runtime_settings.RunTimeSettings(project, language, args)
429427 #initialize logger
430428 logger = logging.getLogger('manager')
431429 logger.setLevel(logging.DEBUG)
432430
433431 # Add the log message handler to the logger
434432 today = datetime.datetime.today()
435 - log_filename = os.path.join(settings.log_location, '%s%s_%s-%s-%s.log' \
436 - % (properties.language.code, properties.project.name,
 433+ log_filename = os.path.join(rts.log_location, '%s%s_%s-%s-%s.log' \
 434+ % (rts.language.code, rts.project.name,
437435 today.day, today.month, today.year))
438436 handler = logging.handlers.RotatingFileHandler(log_filename,
439437 maxBytes=1024 * 1024,
440438 backupCount=3)
441439
442440 logger.addHandler(handler)
443 - logger.debug('Chosen language: \t%s' % properties.language)
 441+ logger.debug('Chosen language: \t%s' % rts.language)
444442
445443 #start manager
446444 #detect_python_version(logger)
447445 about_statement()
448446 #config.create_configuration(settings, args)
449447
450 - properties.show_settings()
451 - args.func(properties, settings, logger)
 448+ rts.show_settings()
 449+ args.func(rts, logger)
452450
453451
454452 if __name__ == '__main__':
Index: trunk/tools/editor_trends/analyses/community_graph.py
@@ -1,62 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-10'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -sys.path.append('..')
23 -
24 -import configuration
25 -settings = configuration.Settings()
26 -
27 -from database import db
28 -from utils import file_utils
29 -
30 -try:
31 - import psyco
32 - psyco.full()
33 -except ImportError:
34 - pass
35 -
36 -def create_articles_set(edits):
37 - s = set()
38 - years = edits.keys()
39 - for year in years:
40 - for edit in edits[year]:
41 - s.add(edit['article'])
42 - return s
43 -
44 -
45 -def create_edgelist(project, collection):
46 - ids = db.retrieve_distinct_keys(project, collection, 'editor')
47 - conn = db.init_mongo_db(project)
48 - ids.sort()
49 - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding)
50 - for i in ids:
51 - author_i = conn[collection].find_one({'editor': i})
52 - article_i = create_articles_set(author_i['edits'])
53 - for j in ids:
54 - if i > j:
55 - author_j = conn[collection].find_one({'editor': j})
56 - article_j = create_articles_set(author_j['edits'])
57 - common = article_i.intersection(article_j)
58 - if len(common) > 0:
59 - file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
60 - fh.close()
61 -
62 -if __name__ == '__main__':
63 - create_edgelist('enwiki', 'editors')
Index: trunk/tools/editor_trends/analyses/dataset.py
@@ -1,473 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-14'
19 -__version__ = '0.1'
20 -
21 -import calendar
22 -import datetime
23 -import time
24 -import math
25 -import operator
26 -import sys
27 -from pymongo.son_manipulator import SONManipulator
28 -from multiprocessing import Lock
29 -
30 -
31 -sys.path.append('..')
32 -import configuration
33 -settings = configuration.Settings()
34 -
35 -from utils import file_utils
36 -from utils import data_converter
37 -from database import db
38 -import json_encoders
39 -
40 -class Transform(SONManipulator):
41 - '''
42 - This encoder transforms a Dataset to a MongoDB bson document.
43 - To use this encoder initalize a mongo database instance and then add:
44 - mongo.add_son_manipulator(Transform())
45 - '''
46 - def transform_incoming(self, son, collection):
47 - for (key, ds) in son.items():
48 - son[key] = {}
49 - for x, var in enumerate(ds):
50 - if isinstance(var, Variable):
51 - son[key][var.name] = var.encode()
52 - for prop in ds.props:
53 - son[prop] = getattr(ds, prop)
54 - return son
55 -
56 - def transform_outgoing(self, son, collection):
57 - for (key, value) in son.items():
58 - if isinstance(value, dict):
59 - names = value.keys()
60 - for name in names:
61 - var = Variable(name, None)
62 - var.decode(value)
63 - son['variables'][name] = var
64 - else: # Again, make sure to recurse into sub-docs
65 - son[key] = value
66 - name = son.pop('name', None)
67 - project = son.pop('project', None)
68 - collection = son.pop('collection', None)
69 - language_code = son.pop('language_code', None)
70 - variables = son.pop('variables', [])
71 - ds = Dataset(name, project, collection, language_code, **son)
72 - for var in variables:
73 - var = variables[var]
74 - ds.add_variable(var)
75 - return ds
76 -
77 -
78 -class Data:
79 - '''
80 - Some generic functions that are required by the Observation, Variable, and
81 - Dataset classes.
82 - '''
83 - def __hash__(self, vars):
84 - id = ''.join([str(var) for var in vars])
85 - return hash(id)
86 - #return int(self.convert_date_to_epoch(date))
87 -
88 - def encode_to_bson(self, data=None):
89 - if data:
90 - kwargs = dict([(str(key), value) for key, value in data.__dict__.iteritems()])
91 - else:
92 - kwargs = dict([(str(key), value) for key, value in self.__dict__.iteritems()])
93 - for key, value in kwargs.iteritems():
94 - if isinstance(value, dict):
95 - d = {}
96 - for k, v in value.iteritems():
97 - if isinstance(v, Observation):
98 - v = self.encode_to_bson(v)
99 - d[str(k)] = v
100 - kwargs[key] = d
101 - return kwargs
102 -
103 - def convert_date_to_epoch(self, date):
104 - assert self.time_unit == 'year' or self.time_unit == 'month' \
105 - or self.time_unit == 'day', 'Time unit should either be year, month or day.'
106 -
107 - if self.time_unit == 'year':
108 - datum = datetime.datetime(date.year, 1, 1)
109 - return int(time.mktime(datum.timetuple()))
110 - elif self.time_unit == 'month':
111 - datum = datetime.datetime(date.year, date.month, 1)
112 - return int(time.mktime(datum.timetuple()))
113 - elif self.time_unit == 'day':
114 - return int(time.mktime(date.timetuple()))
115 - else:
116 - return date
117 -
118 - def set_date_range(self, date):
119 - if self.time_unit == 'year':
120 - return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1)
121 - elif self.time_unit == 'month':
122 - day = calendar.monthrange(date.year, date.month)[1]
123 - return datetime.datetime(date.year, date.month, day), datetime.datetime(date.year, date.month, 1)
124 - else:
125 - return datetime.datetime(date.year, date.month, date.day), datetime.datetime(date.year, date.month, date.day)
126 -
127 -
128 -class Observation(Data):
129 - lock = Lock()
130 - '''
131 - The smallest unit, here the actual data is being stored.
132 - Time_unit should either be 'year', 'month' or 'day'.
133 - '''
134 - def __init__(self, date, time_unit, id, meta):
135 - assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.'
136 - self.date = date
137 - self.data = 0
138 - self.time_unit = time_unit
139 - self.t1, self.t0 = self.set_date_range(date)
140 - self.id = id
141 - self.props = []
142 - for mt in meta:
143 - if isinstance(mt, float):
144 - raise Exception, 'Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.'
145 - elif not isinstance(mt, list):
146 - setattr(self, mt, meta[mt])
147 - self.props.append(mt)
148 - self._type = 'observation'
149 -
150 - def __repr__(self):
151 - return '%s' % self.date
152 -
153 - def __str__(self):
154 - return 'range: %s:%s' % (self.t0, self.t1)
155 -
156 - def __iter__(self):
157 - for obs in self.data:
158 - yield self.data[obs]
159 -
160 - def __getitem__(self, key):
161 - return getattr(self, key, [])
162 -
163 - def add(self, value):
164 - '''
165 - If update == True then data[i] will be incremented else data[i] will be
166 - created, in that case make sure that i is unique. Update is useful for
167 - tallying a variable.
168 - '''
169 - self.lock.acquire()
170 - try:
171 - self.data += value
172 - finally:
173 - self.lock.release()
174 -
175 - def get_date_range(self):
176 - return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \
177 - self.t1.month, self.t1.day, self.t1.year)
178 -
179 -class Variable(Data):
180 - '''
181 - This class constructs a time-based variable.
182 - '''
183 - lock = Lock()
184 - def __init__(self, name, time_unit, **kwargs):
185 - self.name = name
186 - self.obs = {}
187 - self.time_unit = time_unit
188 - self.groupbys = []
189 - self._type = 'variable'
190 - self.props = ['name', 'time_unit', '_type']
191 - for kw in kwargs:
192 - setattr(self, kw, kwargs[kw])
193 - self.props.append(kw)
194 -
195 - def __str__(self):
196 - return '%s' % self.name
197 -
198 - def __repr__(self):
199 - return '%s' % self.name
200 -
201 - def __getitem__(self, key):
202 - return getattr(self, key, [])
203 -
204 - def __iter__(self):
205 - keys = self.obs.keys()
206 - for key in keys:
207 - yield key
208 -
209 - def __len__(self):
210 - return [x for x in xrange(self.obs())]
211 -
212 - def items(self):
213 - for key in self.__dict__.keys():
214 - yield key, getattr(self, key)
215 -
216 - def itervalues(self):
217 - for key in self:
218 - yield self.obs[key].data
219 -
220 - def iteritems(self):
221 - for key in self:
222 - yield (key, self.obs[key])
223 -
224 -
225 - def get_data(self):
226 - return [o for o in self.itervalues()]
227 -
228 - def get_observation(self, id, date, meta):
229 - self.lock.acquire()
230 - try:
231 - obs = self.obs.get(id, Observation(date, self.time_unit, id, meta))
232 - finally:
233 - self.lock.release()
234 - return obs
235 -
236 - def add(self, date, value, meta={}):
237 - assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.'
238 - #id = self.convert_date_to_epoch(date)
239 - start, end = self.set_date_range(date)
240 - values = meta.values()
241 - values.insert(0, end)
242 - values.insert(0, start)
243 - id = self.__hash__(values)
244 -
245 - obs = self.get_observation(id, date, meta)
246 - obs.add(value)
247 - self.obs[id] = obs
248 -
249 - def encode(self):
250 - bson = {}
251 - for prop in self.props:
252 - bson[prop] = getattr(self, prop)
253 -
254 - bson['obs'] = {}
255 - for obs in self:
256 - data = self.obs[obs]
257 - obs = str(obs)
258 - bson['obs'][obs] = data.encode_to_bson()
259 - return bson
260 -
261 - def decode(self, values):
262 - for varname in values:
263 - for prop in values[varname]:
264 - if isinstance(values[varname][prop], dict):
265 - data = values[varname][prop]
266 - for d in data:
267 - date = data[d]['date']
268 - obs = data[d]['data']
269 - self.add(date, obs)
270 - else:
271 - setattr(self, prop, values[varname][prop])
272 - self.props.append(prop)
273 -
274 - def get_date_range(self):
275 - dates = [self.obs[key].date for key in self]
276 - first = min(dates)
277 - last = max(dates)
278 - return first, last
279 -
280 -
281 -class Dataset:
282 - '''
283 - This class acts as a container for the Variable class and has some methods
284 - to output the dataset to a csv file, mongodb and display statistics.
285 - '''
286 -
287 - def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs):
288 - encoders = json_encoders.available_json_encoders()
289 - if encoder not in encoders:
290 - raise exception.UnknownJSONEncoderError(encoder)
291 - else:
292 - self.encoder = encoder
293 - self.name = name
294 - self.project = project
295 - self.collection = collection
296 - self.language_code = language_code
297 - self.hash = self.name
298 - self._type = 'dataset'
299 - self.created = datetime.datetime.now()
300 - self.format = 'long'
301 - for kw in kwargs:
302 - setattr(self, kw, kwargs[kw])
303 - self.props = self.__dict__.keys()
304 -
305 - self.variables = []
306 - if vars != None:
307 - for kwargs in vars:
308 - name = kwargs.pop('name')
309 - setattr(self, name, Variable(name, **kwargs))
310 - self.variables.append(name)
311 - #self.filename = self.create_filename()
312 -
313 - def __repr__(self):
314 - return 'Dataset contains %s variables' % (len(self.variables))
315 -
316 - def __iter__(self):
317 - for var in self.variables:
318 - yield getattr(self, var)
319 -
320 -
321 - def create_filename(self):
322 - '''
323 - This function creates a filename for the dataset by searching for shared
324 - properties among the different variables in the dataset. All shared
325 - properties will be used in the filename to make sure that one analysis
326 - that's run with different parameters gets stored in separate files.
327 - '''
328 - common = {}
329 - props = set()
330 - for var in self.variables:
331 - s = set()
332 - var = getattr(self, var)
333 - for prop in var.props:
334 - if prop not in ['name', 'time_unit', '_type']:
335 - s.add(prop)
336 - props.add(prop)
337 - common[var.name] = s
338 -
339 - keys = []
340 - for prop in props:
341 - attrs = []
342 - for s in common.values():
343 - attrs.append(prop)
344 - if len(attrs) == len(common.values()):
345 - keys.append(prop)
346 - keys.sort()
347 - attrs = '_'.join(['%s=%s' % (k, getattr(var, k)) for k in keys])
348 - filename = '%s%s_%s_%s.csv' % (self.language_code,
349 - self.project,
350 - self.name,
351 - attrs)
352 - self.filename = filename
353 -
354 -
355 - def add_variable(self, var):
356 - if isinstance(var, Variable):
357 - self.variables.append(var.name)
358 - setattr(self, var.name, var)
359 - else:
360 - raise TypeError('You can only instance of Variable to a dataset.')
361 -
362 - def write(self, format='csv'):
363 - self.create_filename()
364 - if format == 'csv':
365 - self.to_csv()
366 - elif format == 'mongo':
367 - self.to_mongo()
368 -
369 - def to_mongo(self):
370 - dbname = '%s%s' % (self.language_code, self.project)
371 - mongo = db.init_mongo_db(dbname)
372 - coll = mongo['%s_%s' % (dbname, 'charts')]
373 - mongo.add_son_manipulator(Transform())
374 - coll.remove({'hash':self.hash, 'project':self.project,
375 - 'language_code':self.language_code})
376 - coll.insert({'variables': self})
377 -
378 - def to_csv(self):
379 - data = data_converter.convert_dataset_to_lists(self, 'manage')
380 - headers = data_converter.add_headers(self)
381 - fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding)
382 - file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
383 - file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format)
384 - fh.close()
385 -
386 - def encode(self):
387 - props = {}
388 - for prop in self.props:
389 - props[prop] = getattr(self, prop)
390 - return props
391 -
392 - def get_standard_deviation(self, number_list):
393 - mean = self.get_mean(number_list)
394 - std = 0
395 - n = len(number_list)
396 - for i in number_list:
397 - std = std + (i - mean) ** 2
398 - return math.sqrt(std / float(n - 1))
399 -
400 - def get_median(self, number_list):
401 - if number_list == []:
402 - return '.'
403 - data = sorted(number_list)
404 - data = [float(x) for x in data]
405 - if len(data) % 2 == 1:
406 - return data[(len(data) + 1) / 2 - 1]
407 - else:
408 - lower = data[len(data) / 2 - 1]
409 - upper = data[len(data) / 2]
410 - return (lower + upper) / 2
411 -
412 - def get_mean(self, number_list):
413 - if number_list == []:
414 - return '.'
415 - float_nums = [float(x) for x in number_list]
416 - return sum(float_nums) / len(number_list)
417 -
418 - def descriptives(self):
419 - for variable in self:
420 - data = variable.get_data()
421 - variable.mean = self.get_mean(data)
422 - variable.median = self.get_median(data)
423 - variable.sds = self.get_standard_deviation(data)
424 - variable.min = min(data)
425 - variable.max = max(data)
426 - variable.n = len(data)
427 - variable.first_obs, variable.last_obs = variable.get_date_range()
428 -
429 - def summary(self):
430 - self.descriptives()
431 - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean',
432 - 'Median', 'SD', 'Minimum', 'Maximum',
433 - 'Num Obs', 'First Obs', 'Final Obs')
434 - for variable in self:
435 - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name,
436 - variable.mean, variable.median,
437 - variable.sds, variable.min,
438 - variable.max, variable.n,
439 - variable.first_obs, variable.last_obs)
440 -
441 -
442 -def debug():
443 - mongo = db.init_mongo_db('enwiki')
444 - rawdata = mongo['enwiki_charts']
445 - mongo.add_son_manipulator(Transform())
446 -
447 - d1 = datetime.datetime.today()
448 - d2 = datetime.datetime(2007, 6, 7)
449 - ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [
450 - {'name': 'count', 'time_unit': 'year'},
451 - # {'name': 'testest', 'time_unit': 'year'}
452 - ])
453 - ds.count.add(d1, 10, ['exp', 'window'])
454 - ds.count.add(d1, 135, ['exp', 'window'])
455 - ds.count.add(d2, 1, ['exp', 'window'])
456 - #ds.testest.add(d1, 135)
457 - #ds.testest.add(d2, 535)
458 - ds.summary()
459 - ds.write(format='csv')
460 -# v = Variable('test', 'year')
461 - ds.encode()
462 - print ds
463 -
464 - # mongo.test.insert({'variables': ds})
465 -
466 - # v.add(d2 , 5)
467 - #o = v.get_observation(d2)
468 -# ds = rawdata.find_one({'project': 'wiki',
469 -# 'language_code': 'en',
470 -# 'hash': 'cohort_dataset_backward_bar'})
471 -
472 -
473 -if __name__ == '__main__':
474 - debug()
Index: trunk/tools/editor_trends/analyses/file_size_reduction.py
@@ -1,100 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-15'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -sys.path.append('..')
23 -
24 -import os
25 -import xml.etree.cElementTree as cElementTree
26 -
27 -import configuration
28 -from utils import file_utils
29 -settings = configuration.Settings()
30 -
31 -
32 -class DumpStatistics(object):
33 - ''' Simple class to keep track of XML tags, how often they occur,
34 - and the length of strings they contain. This is used to calculate the
35 - overhead.
36 - '''
37 - def __init__(self):
38 - self.tags = {}
39 -
40 - def add_tag(self, kwargs):
41 - for kw in kwargs:
42 - if kw not in self.tags:
43 - self.tags[kw] = {}
44 - self.tags[kw]['n'] = 0
45 - self.tags[kw]['size'] = 0
46 - self.tags[kw]['n'] += 1
47 - self.tags[kw]['size'] += self.determine_length(kwargs[kw])
48 -
49 - def average_size_text(self):
50 - avg = {}
51 - for kw in self.tags:
52 - avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n']
53 - return avg
54 -
55 - def total_size_text(self):
56 - return sum([self.tags[kw]['size'] for kw in self.tags])
57 -
58 - def total_size_xml(self):
59 - # the x2 is for the opening and closing tag
60 - # the +5 is for 2x <, 2x > and 1x /
61 - return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags])
62 -
63 - def determine_length(self, text):
64 - if text == None:
65 - return 0
66 - else:
67 - return len(text)
68 -
69 -
70 -def calculate_filesize_overhead(location, filename):
71 - counter = None
72 - ds = DumpStatistics()
73 - filename = os.path.join(location, filename)
74 - context = cElementTree.iterparse(filename, events=('start', 'end'))
75 - context = iter(context)
76 - event, root = context.next() #get the root element of the XML doc
77 -
78 - try:
79 - for event, elem in context:
80 - if event == 'end':
81 - ds.add_tag({elem.tag:elem.text})
82 - root.clear() # when done parsing a section clear the tree to release memory
83 - except SyntaxError:
84 - pass
85 - file_utils.store_object(ds, settings.binary_location, 'ds')
86 - xml_size = ds.total_size_xml()
87 - text_size = ds.total_size_text()
88 - print text_size, xml_size
89 - print ds.tags
90 -
91 -
92 -def output_dumpstatistics():
93 - ds = file_utils.load_object(settings.binary_location, 'ds.bin')
94 -
95 - for key in ds.tags:
96 - print '%s\t%s' % (key, ds.tags[key])
97 -
98 -if __name__ == '__main__':
99 - input = os.path.join(settings.input_location, 'en', 'wiki')
100 - calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml')
101 - output_dumpstatistics()
Index: trunk/tools/editor_trends/analyses/match_talkpage_article.py
@@ -1,72 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-01-07'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -import os
23 -sys.path.append('..')
24 -
25 -import configuration
26 -settings = configuration.Settings()
27 -
28 -from etl import extracter
29 -from utils import file_utils
30 -import wikitree
31 -
32 -try:
33 - import psyco
34 - psyco.full()
35 -except ImportError:
36 - pass
37 -
38 -class Article:
39 - def __init__(self, title, id, talk_id=None):
40 - self.title = title
41 - self.id = id
42 - self.talk_id = talk_id
43 -
44 -
45 -def parse_dumpfile(project, language_code, namespaces=['0', '1']):
46 - articles = {}
47 - ns = extracter.load_namespace(language_code)
48 - non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
49 -
50 -
51 - location = os.path.join(settings.input_location, language_code, project)
52 - fh = file_utils.create_txt_filehandle(location,
53 - '%s%s-latest-stub-meta-history.xml' % (language_code, project),
54 - 'r', settings.encoding)
55 -
56 - for page, article_size in wikitree.parser.read_input(fh):
57 - title = page.find('title')
58 - if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
59 - article_id = page.find('id').text
60 - title = title.text
61 - if title.startswith(ns['1'].get('canonical')):
62 - namespace = 'Talk'
63 - article = articles.get(article_id, Article(None, None, article_id))
64 - article.talk_id = article_id
65 - else:
66 - namespace = 'Main'
67 - article = articles.get(article_id, Article(title, article_id))
68 - articles[article_id] = article
69 -
70 - file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
71 -
72 -if __name__ == '__main__':
73 - parse_dumpfile('wiki', 'en')
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py
@@ -39,10 +39,10 @@
4040 if w >= editor_dt:
4141 datum = datetime.datetime(int(year), 12, 31)
4242 freq = int(editor['edits_by_year'][year])
43 - if datum == datetime.datetime(2003, 12, 31):
 43+ #if datum == datetime.datetime(2003, 12, 31):
4444 # if w == 24:
4545 # if freq == 1.0:
4646 # print 'break'
47 - var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}})
48 - break
 47+ var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}})
 48+ break
4949 return var
Index: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py
@@ -27,23 +27,12 @@
2828 if dt.days < 366:
2929 return var
3030
31 - m = 0
32 - obs = {}
3331 for year in xrange(new_wikipedian.year, new_wikipedian.year + 2):
34 - if m == 12:
35 - break
 32+ obs = [False for x in xrange(13)]
3633 for month in xrange(new_wikipedian.month, 13):
3734 n = monthly[str(year)][str(month)]
3835 date = datetime.datetime(year, month, 1)
3936 if n >= var.cutoff:
40 - var.add(date, True, {'month':m})
41 - #obs[m] = True
42 - else:
43 - var.add(date, False, {'month':m})
44 - #obs[m] = False
45 - m += 1
46 - if m == 12:
47 - break
48 -# if m == 12:
49 -# var.add(date, obs)
 37+ obs[month] = True
 38+ var.add(date, obs)
5039 return var
Index: trunk/tools/editor_trends/analyses/json_encoders.py
@@ -17,9 +17,13 @@
1818 __date__ = '2011-01-27'
1919 __version__ = '0.1'
2020
 21+import sys
2122 import types
22 -import analyzer
2323
 24+if '..' not in sys.path:
 25+ sys.path.append('..')
 26+
 27+import inventory
2428 from classes import exceptions
2529 from utils import data_converter
2630
@@ -67,6 +71,7 @@
6872 options['series']['bars']['align'] = 'center'
6973 return options
7074
 75+
7176 def to_bar_json(ds):
7277 data = {}
7378
@@ -95,6 +100,7 @@
96101 print json
97102 return json
98103
 104+
99105 def to_stacked_bar_json(ds):
100106 '''
101107 This function outputs data in a format that is understood by jquery
Index: trunk/tools/editor_trends/analyses/inventory.py
@@ -0,0 +1,70 @@
 2+#!/usr/bin/python
 3+# coding=utf-8
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
  13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
  17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-02-11'
 19+__version__ = '0.1'
 20+
 21+
 22+import os
 23+import sys
 24+import types
 25+
 26+def available_analyses(caller='manage'):
 27+ '''
 28+ Generates a dictionary:
 29+ key: name of analysis
 30+ value: function that generates the dataset
 31+ ignore: a list of functions that should never be called from manage.py,
 32+ they are not valid entry points.
 33+ '''
 34+ assert caller == 'django' or caller == 'manage'
 35+ ignore = ['__init__']
 36+ functions = {}
 37+
 38+ fn = os.path.realpath(__file__)
 39+ pos = fn.rfind(os.sep)
 40+ loc = fn[:pos]
 41+ path = os.path.join(loc , 'plugins')
 42+ plugins = import_libs(path)
 43+
 44+ for plugin in plugins:
 45+ if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
 46+ functions[plugin.func_name] = plugin
 47+ if caller == 'manage':
 48+ return functions
 49+ elif caller == 'django':
 50+ django_functions = []
 51+ for function in functions:
 52+ fancy_name = function.replace('_', ' ').title()
 53+ django_functions.append((function, fancy_name))
 54+
 55+ return django_functions
 56+
 57+
 58+def import_libs(path):
 59+ '''
 60+ Dynamically importing functions from the plugins directory.
 61+ '''
 62+ library_list = []
 63+ sys.path.append(path)
 64+ for f in os.listdir(os.path.abspath(path)):
 65+ module_name, ext = os.path.splitext(f)
 66+ if ext == '.py':
 67+ module = __import__(module_name)
 68+ func = getattr(module, module_name)
 69+ library_list.append(func)
 70+
 71+ return library_list
Index: trunk/tools/editor_trends/analyses/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -21,19 +21,20 @@
2222 import sys
2323 import os
2424 import progressbar
25 -import types
2625 import datetime
2726
28 -sys.path.append('..')
 27+if '..' not in sys.path:
 28+ sys.path.append('..')
2929
30 -import configuration
31 -settings = configuration.Settings()
 30+from classes import dataset
 31+from classes import settings
 32+settings = settings.Settings()
3233 from database import db
3334 from utils import timer
3435 from utils import log
35 -import dataset
3636
3737
 38+
3839 def generate_chart_data(project, collection, language_code, func, encoder, **kwargs):
3940 '''
4041 This is the entry function to be called to generate data for creating charts.
@@ -102,54 +103,6 @@
103104 return ds
104105
105106
106 -def available_analyses(caller='manage'):
107 - '''
108 - Generates a dictionary:
109 - key: name of analysis
110 - value: function that generates the dataset
111 - ignore: a list of functions that should never be called from manage.py,
112 - they are not valid entry points.
113 - '''
114 - assert caller == 'django' or caller == 'manage'
115 - ignore = ['__init__']
116 - functions = {}
117 -
118 - fn = os.path.realpath(__file__)
119 - pos = fn.rfind(os.sep)
120 - loc = fn[:pos]
121 - path = os.path.join(loc , 'plugins')
122 - plugins = import_libs(path)
123 -
124 - for plugin in plugins:
125 - if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
126 - functions[plugin.func_name] = plugin
127 - if caller == 'manage':
128 - return functions
129 - elif caller == 'django':
130 - django_functions = []
131 - for function in functions:
132 - fancy_name = function.replace('_', ' ').title()
133 - django_functions.append((function, fancy_name))
134 -
135 - return django_functions
136 -
137 -
138 -def import_libs(path):
139 - '''
140 - Dynamically importing functions from the plugins directory.
141 - '''
142 - library_list = []
143 - sys.path.append(path)
144 - for f in os.listdir(os.path.abspath(path)):
145 - module_name, ext = os.path.splitext(f)
146 - if ext == '.py':
147 - module = __import__(module_name)
148 - func = getattr(module, module_name)
149 - library_list.append(func)
150 -
151 - return library_list
152 -
153 -
154107 def determine_project_year_range(dbname, collection, var):
155108 '''
156109 Determine the first and final year for the observed data
@@ -166,8 +119,8 @@
167120
168121
169122 if __name__ == '__main__':
170 - generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
171 - #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
 123+ #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50)
 124+ generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5)
172125 #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year')
173126 #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year')
174127 #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year')
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
@@ -0,0 +1,62 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-10'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import configuration
 25+settings = configuration.Settings()
 26+
 27+from database import db
 28+from utils import file_utils
 29+
 30+try:
 31+ import psyco
 32+ psyco.full()
 33+except ImportError:
 34+ pass
 35+
 36+def create_articles_set(edits):
 37+ s = set()
 38+ years = edits.keys()
 39+ for year in years:
 40+ for edit in edits[year]:
 41+ s.add(edit['article'])
 42+ return s
 43+
 44+
 45+def create_edgelist(project, collection):
 46+ ids = db.retrieve_distinct_keys(project, collection, 'editor')
 47+ conn = db.init_mongo_db(project)
 48+ ids.sort()
 49+ fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding)
 50+ for i in ids:
 51+ author_i = conn[collection].find_one({'editor': i})
 52+ article_i = create_articles_set(author_i['edits'])
 53+ for j in ids:
 54+ if i > j:
 55+ author_j = conn[collection].find_one({'editor': j})
 56+ article_j = create_articles_set(author_j['edits'])
 57+ common = article_i.intersection(article_j)
 58+ if len(common) > 0:
 59+ file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True)
 60+ fh.close()
 61+
 62+if __name__ == '__main__':
 63+ create_edgelist('enwiki', 'editors')
Property changes on: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
___________________________________________________________________
Added: svn:eol-style
164 + native
Index: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py
@@ -0,0 +1,100 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-15'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+sys.path.append('..')
 23+
 24+import os
 25+import xml.etree.cElementTree as cElementTree
 26+
 27+import configuration
 28+from utils import file_utils
 29+settings = configuration.Settings()
 30+
 31+
 32+class DumpStatistics(object):
 33+ ''' Simple class to keep track of XML tags, how often they occur,
 34+ and the length of strings they contain. This is used to calculate the
 35+ overhead.
 36+ '''
 37+ def __init__(self):
 38+ self.tags = {}
 39+
 40+ def add_tag(self, kwargs):
 41+ for kw in kwargs:
 42+ if kw not in self.tags:
 43+ self.tags[kw] = {}
 44+ self.tags[kw]['n'] = 0
 45+ self.tags[kw]['size'] = 0
 46+ self.tags[kw]['n'] += 1
 47+ self.tags[kw]['size'] += self.determine_length(kwargs[kw])
 48+
 49+ def average_size_text(self):
 50+ avg = {}
 51+ for kw in self.tags:
 52+ avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n']
 53+ return avg
 54+
 55+ def total_size_text(self):
 56+ return sum([self.tags[kw]['size'] for kw in self.tags])
 57+
 58+ def total_size_xml(self):
 59+ # the x2 is for the opening and closing tag
 60+ # the +5 is for 2x <, 2x > and 1x /
 61+ return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags])
 62+
 63+ def determine_length(self, text):
 64+ if text == None:
 65+ return 0
 66+ else:
 67+ return len(text)
 68+
 69+
 70+def calculate_filesize_overhead(location, filename):
 71+ counter = None
 72+ ds = DumpStatistics()
 73+ filename = os.path.join(location, filename)
 74+ context = cElementTree.iterparse(filename, events=('start', 'end'))
 75+ context = iter(context)
 76+ event, root = context.next() #get the root element of the XML doc
 77+
 78+ try:
 79+ for event, elem in context:
 80+ if event == 'end':
 81+ ds.add_tag({elem.tag:elem.text})
 82+ root.clear() # when done parsing a section clear the tree to release memory
 83+ except SyntaxError:
 84+ pass
 85+ file_utils.store_object(ds, settings.binary_location, 'ds')
 86+ xml_size = ds.total_size_xml()
 87+ text_size = ds.total_size_text()
 88+ print text_size, xml_size
 89+ print ds.tags
 90+
 91+
 92+def output_dumpstatistics():
 93+ ds = file_utils.load_object(settings.binary_location, 'ds.bin')
 94+
 95+ for key in ds.tags:
 96+ print '%s\t%s' % (key, ds.tags[key])
 97+
 98+if __name__ == '__main__':
 99+ input = os.path.join(settings.input_location, 'en', 'wiki')
 100+ calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml')
 101+ output_dumpstatistics()
Property changes on: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py
___________________________________________________________________
Added: svn:eol-style
1102 + native
Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
@@ -0,0 +1,72 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__email__ = 'dvanliere at gmail dot com'
 18+__date__ = '2011-01-07'
 19+__version__ = '0.1'
 20+
 21+import sys
 22+import os
 23+sys.path.append('..')
 24+
 25+import configuration
 26+settings = configuration.Settings()
 27+
 28+from etl import extracter
 29+from utils import file_utils
 30+import wikitree
 31+
 32+try:
 33+ import psyco
 34+ psyco.full()
 35+except ImportError:
 36+ pass
 37+
 38+class Article:
 39+ def __init__(self, title, id, talk_id=None):
 40+ self.title = title
 41+ self.id = id
 42+ self.talk_id = talk_id
 43+
 44+
 45+def parse_dumpfile(project, language_code, namespaces=['0', '1']):
 46+ articles = {}
 47+ ns = extracter.load_namespace(language_code)
 48+ non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces)
 49+
 50+
 51+ location = os.path.join(settings.input_location, language_code, project)
 52+ fh = file_utils.create_txt_filehandle(location,
 53+ '%s%s-latest-stub-meta-history.xml' % (language_code, project),
 54+ 'r', settings.encoding)
 55+
 56+ for page, article_size in wikitree.parser.read_input(fh):
 57+ title = page.find('title')
 58+ if extracter.verify_article_belongs_namespace(title, non_valid_namespaces):
 59+ article_id = page.find('id').text
 60+ title = title.text
 61+ if title.startswith(ns['1'].get('canonical')):
 62+ namespace = 'Talk'
 63+ article = articles.get(article_id, Article(None, None, article_id))
 64+ article.talk_id = article_id
 65+ else:
 66+ namespace = 'Main'
 67+ article = articles.get(article_id, Article(title, article_id))
 68+ articles[article_id] = article
 69+
 70+ file_utils.store_object(articles, settings.binary_location, 'talk2article.bin')
 71+
 72+if __name__ == '__main__':
 73+ parse_dumpfile('wiki', 'en')
Property changes on: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py
___________________________________________________________________
Added: svn:eol-style
174 + native
Index: trunk/tools/editor_trends/etl/store.py
@@ -22,27 +22,22 @@
2323 import sys
2424 import os
2525
26 -sys.path.append('..')
27 -import configuration
28 -settings = configuration.Settings()
2926 from utils import file_utils
3027 from utils import text_utils
31 -from utils import messages
3228 from database import cache
 29+from utils import messages
3330 from database import db
3431
3532
36 -def store_articles(project, language_code):
37 - location = os.path.join(settings.input_location, language_code, project)
38 - fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', settings.encoding)
 33+def store_articles(rts):
 34+ location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
 35+ fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding)
3936 headers = ['id', 'title']
4037 data = file_utils.read_unicode_text(fh)
4138 fh.close()
4239
43 - dbname = '%s%s' % (language_code, project)
44 - collection = '%s_%s' % (dbname, 'articles')
45 - mongo = db.init_mongo_db(dbname)
46 - collection = mongo[collection]
 40+ mongo = db.init_mongo_db(rts.dbname)
 41+ collection = mongo[rts.articles_raw]
4742
4843 articles = {}
4944 for x, d in enumerate(data):
@@ -55,7 +50,7 @@
5651 collection.insert(articles)
5752
5853
59 -def store_editors(tasks, dbname, collection, source):
 54+def store_editors(tasks, rts):
6055 '''
6156 This function is called by multiple consumers who each take a sorted file
6257 and create a cache object. If the number of edits made by an editor is above
@@ -63,8 +58,8 @@
6459 is discarded.
6560 The treshold is currently more than 9 edits and is not yet configurable.
6661 '''
67 - mongo = db.init_mongo_db(dbname)
68 - collection = mongo[collection]
 62+ mongo = db.init_mongo_db(rts.dbname)
 63+ collection = mongo[rts.editors_raw]
6964
7065 editor_cache = cache.EditorCache(collection)
7166 prev_contributor = -1
@@ -80,7 +75,7 @@
8176 break
8277 print '%s files left in the queue.' % messages.show(tasks.qsize)
8378
84 - fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding)
 79+ fh = file_utils.create_txt_filehandle(rts.sorted, filename, 'r', rts.encoding)
8580 for line in file_utils.read_raw_data(fh):
8681 if len(line) > 1:
8782 contributor = line[0]
@@ -89,7 +84,7 @@
9085 editor_cache.add(prev_contributor, 'NEXT')
9186 date = text_utils.convert_timestamp_to_datetime_utc(line[1])
9287 article_id = int(line[2])
93 - username = line[3].encode(settings.encoding)
 88+ username = line[3].encode(rts.encoding)
9489 ns = int(line[4])
9590 value = {'date': date,
9691 'article': article_id,
@@ -101,37 +96,41 @@
10297 #print editor_cache.n
10398
10499
105 -def launcher(source, dbname, collection):
 100+def launcher(rts):
106101 '''
107102 This is the main entry point and creates a number of workers and launches
108103 them.
109104 '''
110 - mongo = db.init_mongo_db(dbname)
111 - coll = mongo[collection]
 105+ #rts.sorted, rts.dbname, rts.collection
 106+ mongo = db.init_mongo_db(rts.dbname)
 107+ coll = mongo[rts.editors_raw]
112108 coll.ensure_index('editor')
113109 coll.create_index('editor')
114110
115 - files = file_utils.retrieve_file_list(source, 'csv')
 111+ files = file_utils.retrieve_file_list(rts.sorted, 'csv')
116112
117 - print 'Input directory is: %s ' % source
 113+ print 'Input directory is: %s ' % rts.sorted
118114 tasks = multiprocessing.JoinableQueue()
119115 consumers = [multiprocessing.Process(target=store_editors,
120 - args=(tasks, dbname, collection, source))
121 - for i in xrange(settings.number_of_processes)]
 116+ args=(tasks, rts))
 117+ for i in xrange(rts.number_of_processes)]
122118
123119 for filename in files:
124120 tasks.put(filename)
125121
126 - for x in xrange(settings.number_of_processes):
 122+ for x in xrange(rts.number_of_processes):
127123 tasks.put(None)
128124
129125 for w in consumers:
130126 w.start()
131127
132128 tasks.join()
 129+ store_articles(rts)
133130
134131
135132 def debug():
136133 store_articles('wiki', 'cs')
 134+
 135+
137136 if __name__ == '__main__':
138137 debug()
Index: trunk/tools/editor_trends/etl/downloader.py
@@ -48,14 +48,14 @@
4949 widgets = log.init_progressbar_widgets(filename)
5050 extension = file_utils.determine_file_extension(filename)
5151 filemode = file_utils.determine_file_mode(extension)
52 - filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location,
 52+ filesize = http_utils.determine_remote_filesize(properties.wp_dump_location,
5353 properties.dump_relative_path,
5454 filename)
5555
56 - mod_date = http_utils.determine_modified_date(properties.settings.wp_dump_location,
 56+ mod_date = http_utils.determine_modified_date(properties.wp_dump_location,
5757 properties.dump_relative_path,
5858 filename)
59 - mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.settings.timestamp_server)
 59+ mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.timestamp_server)
6060 if file_utils.check_file_exists(properties.location, filename):
6161 mod_loc = file_utils.get_modified_date(properties.location, filename)
6262 if mod_loc == mod_date and (properties.force == False or properties.force == None):
@@ -66,7 +66,7 @@
6767 fh = file_utils.create_txt_filehandle(properties.location,
6868 filename,
6969 filemode,
70 - properties.settings.encoding)
 70+ properties.encoding)
7171 else:
7272 fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb')
7373
@@ -100,18 +100,18 @@
101101
102102
103103
104 -def launcher(properties, settings, logger):
 104+def launcher(properties, logger):
105105 print 'Creating list of files to be downloaded...'
106 - tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location,
 106+ tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location,
107107 properties.dump_relative_path,
108108 properties.dump_filename)
109109 #print tasks.qsize()
110110 #if tasks.qsize() < properties.settings.number_of_processes:
111 - # properties.settings.number_of_processes = tasks.qsize()
  111+ # properties.number_of_processes = tasks.qsize()
112112 if tasks.qsize() > 2:
113113 consumers = [multiprocessing.Process(target=download_wiki_file,
114114 args=(tasks, properties))
115 - for i in xrange(properties.settings.number_of_processes)]
 115+ for i in xrange(properties.number_of_processes)]
116116 else: consumers = [multiprocessing.Process(target=download_wiki_file,
117117 args=(tasks, properties))
118118 for i in xrange(1)]
Index: trunk/tools/editor_trends/etl/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -24,10 +24,6 @@
2525 import progressbar
2626 from Queue import Empty
2727
28 -sys.path.append('..')
29 -import configuration
30 -settings = configuration.Settings()
31 -
3228 import wikitree.parser
3329 from bots import detector
3430 from utils import file_utils
@@ -44,8 +40,8 @@
4541 RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
4642
4743
48 -def remove_numeric_character_references(text):
49 - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(settings.encoding)
 44+def remove_numeric_character_references(rts, text):
 45+ return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(rts.encoding)
5046
5147
5248 def lenient_deccharref(m):
@@ -75,9 +71,9 @@
7672 return ns
7773
7874
79 -def parse_comments(revisions, function):
 75+def parse_comments(rts, revisions, function):
8076 for revision in revisions:
81 - comment = revision.find('{%s}comment' % settings.xml_namespace)
 77+ comment = revision.find('{%s}comment' % rts.xml_namespace)
8278 if comment != None and comment.text != None:
8379 comment.text = function(comment.text)
8480 return revisions
@@ -101,12 +97,7 @@
10298 else:
10399 return False
104100
105 -# for namespace in namespaces:
106 -# if title.startswith(namespace):
107 -# return False
108 -# return True
109101
110 -
111102 def validate_hostname(address):
112103 '''
113104 This is not a foolproof solution at all. The problem is that it's really
@@ -183,7 +174,7 @@
184175 return None
185176
186177
187 -def output_editor_information(revisions, page, bots):
 178+def output_editor_information(revisions, page, bots, rts):
188179 '''
189180 @elem is an XML element containing 1 revision from a page
190181 @output is where to store the data, a filehandle
@@ -237,6 +228,7 @@
238229 flat.append(f)
239230 return flat
240231
 232+
241233 def add_namespace_to_output(output, namespace):
242234 for x, o in enumerate(output):
243235 o.append(namespace['id'])
@@ -244,13 +236,13 @@
245237 return output
246238
247239
248 -def parse_dumpfile(tasks, project, language_code, filehandles, lock, namespaces=['0']):
249 - bot_ids = detector.retrieve_bots(language_code)
250 - location = os.path.join(settings.input_location, language_code, project)
251 - output = os.path.join(settings.input_location, language_code, project, 'txt')
 240+def parse_dumpfile(tasks, rts, filehandles, lock):
 241+ bot_ids = detector.retrieve_bots(rts.language.code)
 242+ location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
 243+ output = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt')
252244 widgets = log.init_progressbar_widgets('Extracting data')
253245 filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
254 - settings.encoding) for fh in xrange(settings.max_filehandles)]
 246+ rts.encoding) for fh in xrange(rts.max_filehandles)]
255247
256248 while True:
257249 total, processed = 0.0, 0.0
@@ -269,11 +261,11 @@
270262 filesize = file_utils.determine_filesize(location, filename)
271263 print 'Opening %s...' % (os.path.join(location, filename))
272264 print 'Filesize: %s' % filesize
273 - fh1 = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding)
274 - fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', settings.encoding)
 265+ fh1 = file_utils.create_txt_filehandle(location, filename, 'r', rts.encoding)
 266+ fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', rts.encoding)
275267 ns, xml_namespace = wikitree.parser.extract_meta_information(fh1)
276 - ns = build_namespaces_locale(ns, namespaces)
277 - settings.xml_namespace = xml_namespace
 268+ ns = build_namespaces_locale(ns, rts.namespaces)
 269+ rts.xml_namespace = xml_namespace
278270
279271 pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start()
280272 for page, article_size in wikitree.parser.read_input(fh1):
@@ -281,14 +273,13 @@
282274 total += 1
283275 namespace = parse_article(title, ns)
284276 if namespace != False:
285 - #if verify_article_belongs_namespace(title, ns):
286277 article_id = page.find('id').text
287278 title = page.find('title').text
288279 revisions = page.findall('revision')
289 - revisions = parse_comments(revisions, remove_numeric_character_references)
290 - output = output_editor_information(revisions, article_id, bot_ids)
 280+ revisions = parse_comments(rts, revisions, remove_numeric_character_references)
 281+ output = output_editor_information(revisions, article_id, bot_ids, rts)
291282 output = add_namespace_to_output(output, namespace)
292 - write_output(output, filehandles, lock)
 283+ write_output(output, filehandles, lock, rts)
293284 file_utils.write_list_to_csv([article_id, title], fh2)
294285 processed += 1
295286 page.clear()
@@ -317,14 +308,14 @@
318309 return d
319310
320311
321 -def write_output(observations, filehandles, lock):
 312+def write_output(observations, filehandles, lock, rts):
322313 observations = group_observations(observations)
323314 for obs in observations:
324315 lock.acquire() #lock the write around all edits of an editor for a particular page
325316 try:
326317 for i, o in enumerate(observations[obs]):
327318 if i == 0:
328 - fh = filehandles[hash(obs)]
 319+ fh = filehandles[hash(rts, obs)]
329320 file_utils.write_list_to_csv(o, fh)
330321
331322 except Exception, error:
@@ -333,16 +324,16 @@
334325 lock.release()
335326
336327
337 -def hash(id):
 328+def hash(rts, id):
338329 '''
339330 A very simple hash function based on modulo. The except clause has been
340331 added because there are instances where the username is stored in userid
341332 tag and hence that's a string and not an integer.
342333 '''
343334 try:
344 - return int(id) % settings.max_filehandles
 335+ return int(id) % rts.max_filehandles
345336 except ValueError:
346 - return sum([ord(i) for i in id]) % settings.max_filehandles
 337+ return sum([ord(i) for i in id]) % rts.max_filehandles
347338
348339
349340 def prepare(output):
@@ -380,7 +371,8 @@
381372 print tasks.qsize()
382373 return tasks
383374
384 -def launcher(properties):
 375+
 376+def launcher(rts):
385377 '''
386378 This is the main entry point for the extact phase of the data processing
387379 chain. First, it will put a the files that need to be extracted in a queue
@@ -389,10 +381,10 @@
390382 the variables from the different dump files.
391383 '''
392384 result = True
393 - tasks = unzip(properties)
 385+ tasks = unzip(rts)
394386
395 - output = os.path.join(settings.input_location, properties.language.code,
396 - properties.project.name, 'txt')
 387+ output = os.path.join(rts.input_location, rts.language.code,
 388+ rts.project.name, 'txt')
397389 result = prepare(output)
398390 if not result:
399391 return result
@@ -404,14 +396,12 @@
405397 filehandles = []
406398 consumers = [multiprocessing.Process(target=parse_dumpfile,
407399 args=(tasks,
408 - properties.project.name,
409 - properties.language.code,
 400+ rts,
410401 filehandles,
411 - lock,
412 - properties.namespaces))
413 - for x in xrange(settings.number_of_processes)]
 402+ lock))
 403+ for x in xrange(rts.number_of_processes)]
414404
415 - for x in xrange(settings.number_of_processes):
 405+ for x in xrange(rts.number_of_processes):
416406 tasks.put(None)
417407
418408 for w in consumers:
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -23,9 +23,6 @@
2424 import datetime
2525 import sys
2626
27 -sys.path.append('..')
28 -import configuration
29 -settings = configuration.Settings()
3027 from database import db
3128 from utils import file_utils
3229 from utils import messages
@@ -101,6 +98,7 @@
10299 'username': username
103100 }, safe=True)
104101
 102+
105103 def determine_year_range(edits):
106104 years = [year for year in edits if edits[year] != []]
107105 first_year = int(min(years))
@@ -119,8 +117,6 @@
120118 return dc
121119
122120
123 -
124 -
125121 def determine_edits_by_month(edits, first_year, final_year):
126122 dc = shaper.create_datacontainer(first_year, final_year)
127123 dc = shaper.add_months_to_datacontainer(dc, 0.0)
@@ -161,17 +157,17 @@
162158 return sorted(edits, key=itemgetter('date'))
163159
164160
165 -def transform_editors_multi_launcher(dbname, collection):
166 - ids = db.retrieve_distinct_keys(dbname, collection, 'editor')
167 - kwargs = {'definition': 'traditional',
168 - 'pbar': True,
169 - }
 161+def transform_editors_multi_launcher(rts):
 162+ ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
 163+# kwargs = {'definition': 'traditional',
 164+# 'pbar': True,
 165+# }
170166 tasks = multiprocessing.JoinableQueue()
171 - consumers = [EditorConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
 167+ consumers = [EditorConsumer(tasks, None) for i in xrange(rts.number_of_processes)]
172168
173169 for id in ids:
174 - tasks.put(Editor(dbname, collection, id))
175 - for x in xrange(settings.number_of_processes):
 170+ tasks.put(Editor(rts.dbname, rts.editors_raw, id))
 171+ for x in xrange(rts.number_of_processes):
176172 tasks.put(None)
177173
178174 print messages.show(tasks.qsize)
@@ -181,10 +177,10 @@
182178 tasks.join()
183179
184180
185 -def setup_database(dbname, collection):
186 - mongo = db.init_mongo_db(dbname)
187 - input_db = mongo[collection]
188 - output_db = mongo['%s_dataset' % collection]
 181+def setup_database(rts):
 182+ mongo = db.init_mongo_db(rts.dbname)
 183+ input_db = mongo[rts.editors_raw]
 184+ output_db = mongo[rts.editors_dataset]
189185
190186 output_db.ensure_index('editor')
191187 output_db.create_index('editor')
@@ -193,9 +189,9 @@
194190 return input_db, output_db
195191
196192
197 -def transform_editors_single_launcher(dbname, collection):
198 - ids = db.retrieve_distinct_keys(dbname, collection, 'editor')
199 - input_db, output_db = setup_database(dbname, collection)
 193+def transform_editors_single_launcher(rts):
 194+ ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
 195+ input_db, output_db = setup_database(rts)
200196 for x, id in enumerate(ids):
201197 print '%s editors to go...' % (len(ids) - x)
202198 editor = Editor(id, input_db, output_db)
Index: trunk/tools/editor_trends/etl/sort.py
@@ -24,14 +24,10 @@
2525 import multiprocessing
2626 from Queue import Empty
2727
28 -sys.path.append('..')
29 -import configuration
30 -settings = configuration.Settings()
31 -
3228 from utils import file_utils
3329 from utils import messages
34 -#import wikitree.parser
3530
 31+
3632 def quick_sort(obs):
3733 '''
3834 Quicksort is a sorting algorithm developed by C. A. R. Hoare that, on \
@@ -79,12 +75,15 @@
8076
8177
8278
83 -def merge_sorted_files(target, files, iteration):
 79+def merge_sorted_files(target, files, iteration, rts):
8480 '''
85 - Merges smaller sorted files in one big file, no longer used.
 81+ Merges smaller sorted files in one big file, Only used for creating
 82+ data competition file.
8683 '''
87 - fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration,
88 - 'w', settings.encoding)
 84+ fh = file_utils.create_txt_filehandle(target,
 85+ 'merged_%s.txt' % iteration,
 86+ 'w',
 87+ rts.encoding)
8988 lines = 0
9089 for line in heapq.merge(*[readline(filename) for filename in files]):
9190 file_utils.write_list_to_csv(line, fh)
@@ -94,17 +93,19 @@
9594 return fh.name
9695
9796
98 -def write_sorted_file(sorted_data, filename, target):
 97+def write_sorted_file(sorted_data, filename, rts):
9998 '''
10099 Writes the sorted file to target
101100 '''
102 - fh = file_utils.create_txt_filehandle(target, filename, 'w',
103 - settings.encoding)
 101+ fh = file_utils.create_txt_filehandle(rts.sorted,
 102+ filename,
 103+ 'w',
 104+ rts.encoding)
104105 file_utils.write_list_to_csv(sorted_data, fh)
105106 fh.close()
106107
107108
108 -def mergesort_feeder(tasks, source, target):
 109+def mergesort_feeder(tasks, rts):
109110 '''
110111 The feeder function is called by the launcher and gives it a task to
111112 complete.
@@ -118,10 +119,10 @@
119120 print tasks.qsize()
120121 break
121122
122 - fh = file_utils.create_txt_filehandle(source,
123 - filename,
124 - 'r',
125 - settings.encoding)
 123+ fh = file_utils.create_txt_filehandle(rts.txt,
 124+ filename,
 125+ 'r',
 126+ rts.encoding)
126127 #print fh
127128 #data = fh.readlines()
128129 data = file_utils.read_unicode_text(fh)
@@ -129,7 +130,7 @@
130131 data = [d.strip() for d in data]
131132 data = [d.split('\t') for d in data]
132133 sorted_data = mergesort(data)
133 - write_sorted_file(sorted_data, filename, target)
 134+ write_sorted_file(sorted_data, filename, rts)
134135 print filename, messages.show(tasks.qsize)
135136 except UnicodeDecodeError, e:
136137 print e
@@ -137,19 +138,19 @@
138139 pass
139140
140141
141 -def mergesort_launcher(source, target):
142 - settings.verify_environment([source, target])
143 - files = file_utils.retrieve_file_list(source, 'csv')
144 - #print files
145 - print source
 142+def launcher(rts):
 143+ '''
 144+ rts is an instance of RunTimeSettings
 145+ '''
 146+ files = file_utils.retrieve_file_list(rts.txt, 'csv')
146147 tasks = multiprocessing.JoinableQueue()
147148 consumers = [multiprocessing.Process(target=mergesort_feeder,
148 - args=(tasks, source, target))
149 - for x in xrange(settings.number_of_processes)]
 149+ args=(tasks, rts))
 150+ for x in xrange(rts.number_of_processes)]
150151 for filename in files:
151152 tasks.put(filename)
152153
153 - for x in xrange(settings.number_of_processes):
 154+ for x in xrange(rts.number_of_processes):
154155 tasks.put(None)
155156
156157 for w in consumers:
@@ -157,6 +158,7 @@
158159
159160 tasks.join()
160161
 162+
161163 def debug():
162164 '''
163165 Simple test function
Index: trunk/tools/editor_trends/__init__.py
@@ -1,14 +1,30 @@
22 import os
33 import sys
44
5 -WORKING_DIRECTORY = os.getcwd()#[:-9]
6 -IGNORE_DIRS = ['wikistats', 'zips']
 5+from classes import singleton
76
8 -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
9 - os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
 7+class Path:
 8+ __metaclass__ = singleton.Singleton
109
 10+ def __init__(self):
 11+ self.cwd = self.determine_working_directory()
 12+ self.update_python_path()
1113
12 -for subdirname in dirs:
13 - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
14 - sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
15 - #print os.path.join(WORKING_DIRECTORY, subdirname)
 14+ def determine_working_directory(self):
 15+ cwd = os.getcwd()
 16+ if not cwd.endswith('editor_trends%s' % os.sep):
 17+ pos = cwd.find('editor_trends') + 14
 18+ cwd = cwd[:pos]
 19+ return cwd
 20+
 21+ def update_python_path(self):
 22+ IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs',
 23+ 'statistics', 'js_scripts', 'deployment',
 24+ 'documentation', 'data', 'code-snippets']
 25+ dirs = [name for name in os.listdir(self.cwd) if
 26+ os.path.isdir(os.path.join(self.cwd, name))]
 27+ for subdirname in dirs:
 28+ if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
 29+ sys.path.append(os.path.join(self.cwd, subdirname))
 30+
 31+Path()
Index: trunk/tools/editor_trends/classes/settings.py
@@ -75,9 +75,7 @@
7676
7777 self.architecture = platform.machine()
7878 self.working_directory = self.determine_working_directory()
79 - print sys.path
8079 self.update_python_path()
81 - print sys.path
8280
8381 self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\'
8482 self.max_filehandles = self.determine_max_filehandles_open()
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -29,14 +29,15 @@
3030 import datetime
3131 import time
3232 import re
33 -sys.path.append('..')
 33+#sys.path.append('..')
3434
 35+from settings import Settings
3536 from utils import text_utils
3637 from utils import ordered_dict as odict
3738 from classes import languages
3839
3940
40 -class RunTimeSettings:
 41+class RunTimeSettings(Settings):
4142 '''
4243 This class keeps track of the commands issued by the user and is used to
4344 feed the different etl functions. Difference with configuration class is
@@ -44,25 +45,26 @@
4546 same for a user while these settings can change depending on the kind of
4647 analysis requested.
4748 '''
48 - def __init__(self, project, language, settings, args=None):
 49+ def __init__(self, project, language, args=None):
 50+ Settings.__init__(self)
4951 self.project = project
5052 self.language = language
51 - self.settings = settings
 53+ self.dbname = 'wikilytics'
5254
5355 if args:
5456 self.args = args
5557 self.hash = self.secs_since_epoch()
56 - print self.settings.input_location
57 - print self.get_value('location')
58 - self.base_location = self.settings.input_location if \
59 - self.settings.input_location != None else self.get_value('location')
 58+ #print self.settings.input_location
 59+ #print self.get_value('location')
 60+ self.input_location = self.input_location if \
 61+ self.input_location != None else self.get_value('location')
6062 self.project = self.update_project_settings()
6163 self.language = self.update_language_settings()
62 - self.dbname = '%s%s' % (self.language.code, self.project.name)
 64+ #self.dbname = '%s%s' % (self.language.code, self.project.name)
6365 self.targets = self.split_keywords(self.get_value('charts'))
6466 self.keywords = self.split_keywords(self.get_value('keywords'))
6567 self.function = self.get_value('func')
66 - self.collection = self.get_value('collection')
 68+
6769 self.ignore = self.get_value('except')
6870 self.clean = self.get_value('new')
6971 self.force = self.get_value('force')
@@ -70,9 +72,9 @@
7173 self.filename = self.generate_wikidump_filename()
7274 self.namespaces = self.get_namespaces()
7375
74 - self.dataset = os.path.join(settings.dataset_location,
 76+ self.dataset = os.path.join(self.dataset_location,
7577 self.project.name)
76 - self.charts = os.path.join(settings.chart_location,
 78+ self.charts = os.path.join(self.chart_location,
7779 self.project.name)
7880
7981 self.txt = os.path.join(self.location, 'txt')
@@ -86,8 +88,11 @@
8789 self.dump_filename = self.generate_wikidump_filename()
8890 self.dump_relative_path = self.set_dump_path()
8991 self.dump_absolute_path = self.set_dump_path(absolute=True)
90 - print self.directories
91 - settings.verify_environment(self.directories)
 92+ self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
 93+ self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
 94+ self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
 95+ self.analyzer_collection = self.get_value('collection')
 96+ self.verify_environment(self.directories)
9297
9398 def __str__(self):
9499 return 'Runtime Settings for project %s%s' % (self.language.name,
@@ -126,7 +131,7 @@
127132 '''
128133 Construct the full project location
129134 '''
130 - return os.path.join(self.base_location, self.language.code, self.project.name)
 135+ return os.path.join(self.input_location, self.language.code, self.project.name)
131136
132137 def show_settings(self):
133138 '''
@@ -141,7 +146,7 @@
142147 max_length_key = max([len(key) for key in about.keys()])
143148 print 'Final settings after parsing command line arguments:'
144149 for ab in about:
145 - print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.settings.encoding))
 150+ print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.encoding))
146151
147152
148153 def get_value(self, key):
@@ -152,7 +157,7 @@
153158
154159 def set_dump_path(self, absolute=False):
155160 if absolute:
156 - return '%s/%s%s/latest/' % (self.settings.wp_dump_location, self.language.code, self.project.name)
 161+ return '%s/%s%s/latest/' % (self.wp_dump_location, self.language.code, self.project.name)
157162 else:
158163 return '/%s%s/latest/' % (self.language.code, self.project.name)
159164
Index: trunk/tools/editor_trends/classes/dataset.py
@@ -34,7 +34,7 @@
3535 from utils import file_utils
3636 from utils import data_converter
3737 from database import db
38 -import json_encoders
 38+from analyses import json_encoders
3939
4040 class Transform(SONManipulator):
4141 '''
Index: trunk/tools/editor_trends/classes/languages.py
@@ -31,20 +31,20 @@
3232 def __repr__(self):
3333 return u'%s - %s' % (self.code, self.name)
3434
35 - def show_languages(self, settings, project, startswith=None):
 35+ def show_languages(self, project, startswith=None):
3636 if startswith != None:
3737 startswith = startswith.title()
3838 project.valid_languages.sort()
3939 for language in project.valid_languages:
4040 try:
4141 if startswith != None and language.startswith(first):
42 - print '%s' % language.decode(settings.encoding)
 42+ print '%s' % language.decode('utf-8')
4343 elif startswith == None:
44 - print '%s' % language.decode(settings.encoding)
 44+ print '%s' % language.decode('utf-8')
4545 except UnicodeEncodeError:
4646 print '%s' % language
47 -
4847
 48+
4949 class LanguageContainer:
5050 def __init__(self):
5151 self.init_languages = odict.OrderedDict([
Index: trunk/tools/editor_trends/configuration.py
@@ -90,7 +90,9 @@
9191
9292 self.architecture = platform.machine()
9393 self.working_directory = self.determine_working_directory()
 94+ print sys.path
9495 self.update_python_path()
 96+ print sys.path
9597
9698 self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\'
9799 self.max_filehandles = self.determine_max_filehandles_open()
Index: trunk/tools/editor_trends/utils/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/utils/compression.py
@@ -22,10 +22,12 @@
2323 import os
2424 sys.path.append('..')
2525
26 -import configuration
27 -settings = configuration.Settings()
 26+#import configuration
 27+#settings = configuration.Settings()
 28+from classes import settings
 29+settings = settings.Settings()
 30+from classes import exceptions
2831 import file_utils
29 -from classes import exceptions
3032 import timer
3133 import log
3234
@@ -128,6 +130,7 @@
129131 self.name = p
130132 self.program_installed = path
131133
 134+
132135 def launch_zip_extractor(location, filename, properties):
133136 '''
134137
@@ -141,6 +144,7 @@
142145 log.log_to_mongo(properties, 'dataset', 'unpack', stopwatch, event='finish')
143146 return retcode
144147
 148+
145149 if __name__ == '__main__':
146 - c = Compressor('C:\Users\diederik.vanliere\Documents', 'django.zip')
 150+ c = Compressor('C:\Users\diederik.vanliere\Documents', 'test.zip')
147151 c.extract()
Index: trunk/tools/editor_trends/utils/log.py
@@ -27,10 +27,10 @@
2828
2929 from database import db
3030
31 -def log_to_mongo(properties, jobtype, task, timer, event='start'):
32 - conn = db.init_mongo_db('wikilytics')
 31+def log_to_mongo(rts, jobtype, task, timer, event='start'):
 32+ conn = db.init_mongo_db(rts.dbname)
3333 created = datetime.datetime.now()
34 - hash = '%s_%s' % (properties.project, properties.hash)
 34+ hash = '%s_%s' % (rts.project, rts.hash)
3535 coll = conn['jobs']
3636
3737 job = coll.find_one({'hash': hash})
@@ -38,8 +38,8 @@
3939 if job == None:
4040 if jobtype == 'dataset':
4141 _id = coll.save({'hash': hash, 'created': created, 'finished': False,
42 - 'language_code': properties.language.code,
43 - 'project': properties.project.name,
 42+ 'language_code': rts.language.code,
 43+ 'project': rts.project.name,
4444 'in_progress': True, 'jobtype': jobtype,
4545 'tasks': {}})
4646
@@ -47,8 +47,8 @@
4848 elif jobtype == 'chart':
4949 _id = coll.save({'hash': hash, 'created': created,
5050 'jobtype': jobtype,
51 - 'project': properties.project,
52 - 'language_code': properties.language_code,
 51+ 'project': rts.project,
 52+ 'language_code': rts.language_code,
5353 'tasks': {}})
5454
5555 job = coll.find_one({'_id': _id})
Index: trunk/tools/editor_trends/bots/__init__.py
@@ -0,0 +1 @@
 2+
Index: trunk/tools/editor_trends/code-snippets/__init__.py
@@ -0,0 +1,8 @@
 2+import os
 3+
 4+cwd = os.getcwd()
 5+pos = cwd.rfind(os.sep)
 6+cwd = cwd[:pos]
 7+
 8+from __init__ import Path
 9+Path()

Status & tagging log