r84920 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r84919 | r84920 | r84921
Date: 20:52, 28 March 2011
Author: diederik
Status: deferred
Tags:
Comment: Fixed command line support for generating datasets.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified)
  • /trunk/tools/editor_trends/analyses/inventory.py (modified)
  • /trunk/tools/editor_trends/analyses/plugins/new_editor_count.py (modified)
  • /trunk/tools/editor_trends/classes/analytics.py (added)
  • /trunk/tools/editor_trends/classes/exceptions.py (modified)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified)
  • /trunk/tools/editor_trends/database/db.py (modified)
  • /trunk/tools/editor_trends/etl/enricher.py (modified)
  • /trunk/tools/editor_trends/etl/extracter.py (modified)
  • /trunk/tools/editor_trends/etl/store.py (modified)
  • /trunk/tools/editor_trends/etl/transformer.py (modified)
  • /trunk/tools/editor_trends/manage.py (modified)

Diff

Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
@@ -26,6 +26,7 @@
     stats.download.org to make sure that we are using the same numbers.
     '''
 # headers = ['year', 'month', 'count']
-    new_wikipedian = editor['new_wikipedian']
-    var.add(new_wikipedian, 1)
+    if editor['new_wikipedian'] != False:
+        new_wikipedian = editor['new_wikipedian']
+        var.add(new_wikipedian, 1)
     return var
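
The change above skips editors whose 'new_wikipedian' field holds the sentinel value False (editors who never crossed the new-wikipedian threshold) instead of feeding the sentinel into the dataset. A minimal stand-alone sketch of that guard, with a plain dict standing in for the toolkit's dataset.Variable (names here are illustrative only):

from collections import defaultdict

def new_editor_count(var, editor):
    # The field is either a date-like value or the sentinel False,
    # so an explicit comparison is needed before counting.
    if editor['new_wikipedian'] != False:
        var[editor['new_wikipedian']] += 1
    return var

counts = defaultdict(int)
for editor in [{'new_wikipedian': '2011-03'}, {'new_wikipedian': False}]:
    counts = new_editor_count(counts, editor)
print(counts)  # only the first editor is counted
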
Index: trunk/tools/editor_trends/analyses/inventory.py
@@ -32,7 +32,7 @@
     '''
     assert caller == 'django' or caller == 'manage'
     ignore = ['__init__']
-    functions = {}
+    charts = {}
 
     fn = os.path.realpath(__file__)
     pos = fn.rfind(os.sep)
@@ -42,14 +42,14 @@
 
     for plugin in plugins:
         if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
-            functions[plugin.func_name] = plugin
+            charts[plugin.func_name] = plugin
     if caller == 'manage':
-        return functions
+        return charts
     elif caller == 'django':
         django_functions = []
-        for function in functions:
-            fancy_name = function.replace('_', ' ').title()
-            django_functions.append((function, fancy_name))
+        for chart in charts:
+            fancy_name = chart.replace('_', ' ').title()
+            django_functions.append((chart, fancy_name))
 
         return django_functions
 
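
Renaming functions to charts also documents what available_analyses() hands back: a name-to-callable mapping when called from manage.py, and (value, label) tuples for a Django select widget. A rough sketch of that dual-mode lookup, with two inline dummy plugins instead of the analyses/plugins package:

import types

def _new_editor_count(var, editor, **kwargs):
    return var

def _edit_patterns(var, editor, **kwargs):
    return var

def available_analyses(caller='manage'):
    # Collect plugin functions by name; the toolkit walks the plugins
    # package instead of scanning the current module like this sketch does.
    charts = {}
    for name, obj in list(globals().items()):
        if isinstance(obj, types.FunctionType) and name.startswith('_'):
            charts[name.lstrip('_')] = obj
    if caller == 'manage':
        return charts
    # Django wants (value, label) pairs for its choice fields.
    return [(name, name.replace('_', ' ').title()) for name in charts]

print(sorted(available_analyses(caller='django')))
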
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -19,7 +19,7 @@
 
 from multiprocessing import JoinableQueue, Manager, RLock, Process
 from multiprocessing.managers import BaseManager
-from Queue import Empty
+
 import sys
 import cPickle
 import os
@@ -30,84 +30,16 @@
 sys.path.append('..')
 
 import inventory
-import manage as manager
 from classes import dataset
 from classes import runtime_settings
 from classes import consumers
+from classes import exceptions
+from classes import analytics
 from database import db
 from utils import timer
 from utils import log
 
-class Replicator:
-    def __init__(self, args, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs):
-        self.args = args
-        self.plugin = plugin
-        self.time_unit = time_unit
-        languages = kwargs.pop('languages', False)
-        if languages:
-            self.languages = ['de', 'fr', 'es', 'ja', 'ru']
-        else:
-            self.languages = ['en']
-        if cutoff == None:
-            self.cutoff = [1, 10, 50]
-        else:
-            self.cutoff = cutoff
 
-        if cutoff == None:
-            self.cum_cutoff = [10]
-        else:
-            self.cum_cutoff = cum_cutoff
-        self.kwargs = kwargs
-
-    def __call__(self):
-        project = 'wiki'
-        for lang in self.languages:
-            self.rts = runtime_settings.init_environment(project, lang, self.args)
-            #TEMP FIX, REMOVE
-            #rts.dbname = 'enwiki'
-            self.rts.editors_dataset = 'editors_dataset'
-
-            self.rts.dbname = '%s%s' % (lang, project)
-            for cum_cutoff in self.cum_cutoff:
-                for cutoff in self.cutoff:
-                    generate_chart_data(self.rts, self.plugin,
-                                        time_unit=self.time_unit,
-                                        cutoff=cutoff, cum_cutoff=cum_cutoff,
-                                        **self.kwargs)
-
-
-class Analyzer(consumers.BaseConsumer):
-    def __init__(self, rts, tasks, result, var):
-        super(Analyzer, self).__init__(rts, tasks, result)
-        self.var = var
-
-    def run(self):
-        '''
-        Generic loop function that loops over all the editors of a Wikipedia
-        project and then calls the plugin that does the actual mapping.
-        '''
-        mongo = db.init_mongo_db(self.rts.dbname)
-        coll = mongo[self.rts.editors_dataset]
-        while True:
-            try:
-                task = self.tasks.get(block=False)
-                self.tasks.task_done()
-                if task == None:
-                    self.result.put(self.var)
-                    break
-                editor = coll.find_one({'editor': task.editor})
-
-                task.plugin(self.var, editor, dbname=self.rts.dbname)
-                self.result.put(True)
-            except Empty:
-                pass
-
-class Task:
-    def __init__(self, plugin, editor):
-        self.plugin = plugin
-        self.editor = editor
-
-
 def reconstruct_observations(var):
     '''
     When the Task queue is empty then the Variable instance is returned. However,
@@ -158,8 +90,9 @@
     '''
     stopwatch = timer.Timer()
     plugin = retrieve_plugin(func)
+    available_plugins = inventory.available_analyses()
     if not plugin:
-        raise 'Plugin function %s is unknown, please make sure that you specify an existing plugin function.' % func
+        raise exceptions.UnknownPluginError(plugin, available_plugins)
     feedback(plugin, rts)
 
     obs = dict()
@@ -183,9 +116,9 @@
     var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)
 
     for editor in editors:
-        tasks.put(Task(plugin, editor))
+        tasks.put(analytics.Task(plugin, editor))
 
-    consumers = [Analyzer(rts, tasks, result, var) for
+    consumers = [analytics.Analyzer(rts, tasks, result, var) for
                  x in xrange(rts.number_of_processes)]
 
 
@@ -228,6 +161,7 @@
     '''
     Determine the first and final year for the observed data
     '''
+    print dbname, collection, var
     try:
         max_year = db.run_query(dbname, collection, var, 'max')
         max_year = max_year[var].year + 1
@@ -240,31 +174,31 @@
 
 
 def launcher():
-    project, language, parser = manager.init_args_parser()
-    args = parser.parse_args(['django'])
-    rts = runtime_settings.init_environment('wiki', 'en', args)
+# project, language, parser = manage.init_args_parser()
+# args = parser.parse_args(['django'])
+# rts = runtime_settings.init_environment('wiki', 'en', args)
 
 #TEMP FIX, REMOVE
-    rts.dbname = 'enwiki'
-    rts.editors_dataset = 'editors_dataset'
+# rts.dbname = 'enwiki'
+# rts.editors_dataset = 'editors_dataset'
 #END TEMP FIX
 
-# replicator = Replicator(rts, 'histogram_by_backward_cohort', time_unit='year')
+# replicator = analytics.Replicator('histogram_by_backward_cohort', time_unit='year')
 # replicator()
-    replicator = Replicator(args, 'cohort_dataset_backward_bar', time_unit='year', format='wide', languages=True)
+    replicator = analytics.Replicator('cohort_dataset_backward_bar', time_unit='year', format='wide', languages=True)
     replicator()
 
-# generate_chart_data(rts, 'histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10)
-# generate_chart_data(rts, 'edit_patterns', time_unit='year', cutoff=5)
-# generate_chart_data(rts, 'total_number_of_new_wikipedians', time_unit='year')
-# generate_chart_data(rts, 'total_number_of_articles', time_unit='year')
-# generate_chart_data(rts, 'total_cumulative_edits', time_unit='year')
-# generate_chart_data(rts, 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10)
-# generate_chart_data(rts, 'cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide')
-# generate_chart_data(rts, 'cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
-# generate_chart_data(rts, 'histogram_edits', time_unit='year', cutoff=0)
-# generate_chart_data(rts, 'time_to_new_wikipedian', time_unit='year', cutoff=0)
-# generate_chart_data(rts, 'new_editor_count', time_unit='month', cutoff=0)
+# generate_chart_data('histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10)
+# generate_chart_data('edit_patterns', time_unit='year', cutoff=5)
+# generate_chart_data('total_number_of_new_wikipedians', time_unit='year')
+# generate_chart_data('total_number_of_articles', time_unit='year')
+# generate_chart_data('total_cumulative_edits', time_unit='year')
+# generate_chart_data('cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10)
+# generate_chart_data('cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide')
+# generate_chart_data('cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
+# generate_chart_data('histogram_edits', time_unit='year', cutoff=0)
+# generate_chart_data('time_to_new_wikipedian', time_unit='year', cutoff=0)
+# generate_chart_data('new_editor_count', time_unit='month', cutoff=0)
 # #available_analyses()
 
 
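
The old error path raised a bare string ('Plugin function %s is unknown ...'), which Python 2.6+ rejects; the replacement raises exceptions.UnknownPluginError so the caller gets both the offending name and the list of valid plugins (note that the new call passes plugin, which is empty in that branch, rather than func). A minimal sketch of the raise-an-object pattern, with illustrative names rather than the toolkit's exact classes:

class UnknownPluginError(Exception):
    def __init__(self, plugin, plugins):
        Exception.__init__(self)
        self.plugin = plugin
        self.plugins = plugins

    def __str__(self):
        return 'Plugin %s is unknown. Valid plugins: %s' % (
            self.plugin, ', '.join(sorted(self.plugins)))

available_plugins = {'new_editor_count': None, 'edit_patterns': None}

def retrieve_plugin(func):
    return available_plugins.get(func)

try:
    plugin = retrieve_plugin('no_such_plugin')
    if not plugin:
        raise UnknownPluginError('no_such_plugin', available_plugins)
except UnknownPluginError as e:
    print(e)
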
Index: trunk/tools/editor_trends/manage.py
@@ -22,23 +22,23 @@
 import logging.handlers
 import sys
 import datetime
-from argparse import ArgumentParser
-from argparse import RawTextHelpFormatter
 import ConfigParser
+from argparse import ArgumentParser, RawTextHelpFormatter
 
+from classes import languages
+from classes import projects
+from classes import runtime_settings
 from utils import file_utils
 from utils import ordered_dict
 from utils import log
 from utils import timer
-from classes import projects
-from classes import languages
-from classes import runtime_settings
 from database import db
 from etl import downloader
 from etl import extracter
 from etl import store
 from etl import sort
 from etl import transformer
+from analyses import analyzer
 from analyses import inventory
 
 
@@ -49,6 +49,148 @@
     return choices
 
 
+def init_args_parser():
+    '''
+    Entry point for parsing command line and launching the needed function(s).
+    '''
+    language = languages.init()
+    project = projects.init()
+    pjc = projects.ProjectContainer()
+    rts = runtime_settings.RunTimeSettings(project, language)
+
+    #Init Argument Parser
+    parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
+    subparsers = parser.add_subparsers(help='sub - command help')
+
+    #SHOW LANGUAGES
+    parser_languages = subparsers.add_parser('show_languages',
+        help='Overview of all valid languages.')
+    parser_languages.add_argument('-s', '--startswith',
+        action='store',
+        help='Enter the first letter of a language to see which languages are \
+        available.')
+    parser_languages.set_defaults(func=language.show_languages, args=[project])
+
+    #CONFIG
+    parser_config = subparsers.add_parser('config',
+        help='The config sub command allows you set the data location of where \
+        to store files.')
+    parser_config.set_defaults(func=config_launcher)
+    parser_config.add_argument('-f', '--force',
+        action='store_true',
+        help='Reconfigure Editor Toolkit (this will replace wiki.cfg')
+
+    #DOWNLOAD
+    parser_download = subparsers.add_parser('download',
+        help='The download sub command allows you to download a Wikipedia dump\
+        file.')
+    parser_download.set_defaults(func=downloader_launcher)
+
+    #EXTRACT
+    parser_create = subparsers.add_parser('extract',
+        help='The store sub command parsers the XML chunk files, extracts the \
+        information and stores it in a MongoDB.')
+    parser_create.set_defaults(func=extract_launcher)
+
+
+    #SORT
+    parser_sort = subparsers.add_parser('sort',
+        help='By presorting the data, significant processing time reductions \
+        are achieved.')
+    parser_sort.set_defaults(func=sort_launcher)
+
+    #STORE
+    parser_store = subparsers.add_parser('store',
+        help='The store sub command parsers the XML chunk files, extracts the \
+        information and stores it in a MongoDB.')
+    parser_store.set_defaults(func=store_launcher)
+
+    #TRANSFORM
+    parser_transform = subparsers.add_parser('transform',
+        help='Transform the raw datatable to an enriched dataset that can be \
+        exported.')
+    parser_transform.set_defaults(func=transformer_launcher)
+
+    #DATASET
+    parser_dataset = subparsers.add_parser('dataset',
+        help='Create a dataset from the MongoDB and write it to a csv file.')
+    parser_dataset.set_defaults(func=dataset_launcher)
+    parser_dataset.add_argument('-c', '--charts',
+        action='store',
+        help='Should be a valid function name that matches one of the plugin functions',
+        default=inventory.available_analyses()['new_editor_count'])
+
+    parser_dataset.add_argument('-k', '--keywords',
+        action='store',
+        help='Add additional keywords in the format keyword1=value1,keyword2=value2',
+        default='')
+
+    #ALL
+    parser_all = subparsers.add_parser('all',
+        help='The all sub command runs the download, split, store and dataset \
+        commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE \
+        CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.')
+    parser_all.set_defaults(func=all_launcher)
+    parser_all.add_argument('-e', '--except',
+        action='store',
+        help='Should be a list of functions that are to be ignored when \
+        executing all.',
+        default=[])
+
+    parser_all.add_argument('-n', '--new',
+        action='store_true',
+        help='This will delete all previous output and starts from scratch. \
+        Mostly useful for debugging purposes.',
+        default=False)
+
+    #DJANGO
+    parser_django = subparsers.add_parser('django')
+    parser_django.add_argument('-e', '--except',
+        action='store',
+        help='Should be a list of functions that are to be ignored when \
+        executing all.',
+        default=[])
+
+    parser.add_argument('-l', '--language',
+        action='store',
+        help='Example of valid languages.',
+        choices=project.supported_languages(),
+        default=unicode(language.name)
+        )
+
+    parser.add_argument('-p', '--project',
+        action='store',
+        help='Specify the Wikimedia project that you would like to download',
+        choices=pjc.supported_projects(),
+        default='wiki')
+
+    parser.add_argument('-c', '--collection',
+        action='store',
+        help='Name of MongoDB collection',
+        default='editors_raw')
+
+    parser.add_argument('-o', '--location',
+        action='store',
+        help='Indicate where you want to store the downloaded file.',
+        #default=settings.input_location)
+        default=rts.input_location)
+
+    parser.add_argument('-ns', '--namespace',
+        action='store',
+        help='A list of namespaces to include for analysis.',
+        default='0')
+
+    parser.add_argument('-f', '--file',
+        action='store',
+        choices=rts.file_choices,
+        help='Indicate which dump you want to download. Valid choices are:\n \
+        %s' % ''.join([f + ',\n' for f in rts.file_choices]),
+        default='stub-meta-history.xml.gz')
+
+
+    return project, language, parser
+
+
 def config_launcher(rts, logger):
     '''
     Config launcher is used to reconfigure editor trends toolkit.
@@ -178,8 +320,7 @@
 # collection=properties.collection)
     transformer.transform_editors_single_launcher(rts)
     stopwatch.elapsed()
-    log.log_to_mongo(rts, 'dataset', 'transform', stopwatch,
-                     event='finish')
+    log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='finish')
 
 
 def dataset_launcher(rts, logger):
@@ -187,20 +328,14 @@
     stopwatch = timer.Timer()
     log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start')
 
-    #collection = '%s_%s' % (rts.collection, 'dataset')
-    for target in rts.targets:
+    for chart in rts.charts:
+        analyzer.generate_chart_data(rts, chart, **rts.keywords)
 # write_message_to_log(logger, settings,
 # message=None,
 # verb='Exporting',
 # target=target,
 # dbname=properties.full_project,
 # collection=properties.collection)
-
-        analyzer.generate_chart_data(rts.dbname,
-                                     rts.editors_dataset,
-                                     rts.language.code,
-                                     target,
-                                     **rts.keywords)
     stopwatch.elapsed()
     log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='finish')
 
@@ -414,8 +549,6 @@
         default='stub-meta-history.xml.gz')
 
 
-    return project, language, parser
-
 def main():
     project, language, parser, = init_args_parser()
     args = parser.parse_args()
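
init_args_parser() wires every sub-command to its launcher through set_defaults(func=...), so main() can parse once and dispatch with args.func(...). A cut-down, runnable sketch of that argparse layout (only two placeholder sub-commands and launchers, not the toolkit's full set):

from argparse import ArgumentParser, RawTextHelpFormatter

def downloader_launcher(args):
    print('would download a dump for project %s' % args.project)

def dataset_launcher(args):
    print('would export charts: %s' % args.charts)

def init_args_parser():
    parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-p', '--project', action='store', default='wiki')
    subparsers = parser.add_subparsers(help='sub-command help')

    parser_download = subparsers.add_parser('download')
    parser_download.set_defaults(func=downloader_launcher)

    parser_dataset = subparsers.add_parser('dataset')
    parser_dataset.add_argument('-c', '--charts', action='store',
                                default='new_editor_count')
    parser_dataset.set_defaults(func=dataset_launcher)
    return parser

if __name__ == '__main__':
    args = init_args_parser().parse_args(['dataset', '-c', 'new_editor_count'])
    args.func(args)  # dispatch to the launcher selected by the sub-command
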
Index: trunk/tools/editor_trends/etl/store.py
@@ -86,9 +86,13 @@
     location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
     fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding)
     print 'Storing article titles...'
+    print location
     for line in fh:
         line = line.strip()
-        id, title = line.split('\t')
+        try:
+            id, title = line.split('\t')
+        except ValueError:
+            print line.encode('utf-8')
         collection.insert({'id':id, 'title':title})
     fh.close()
     print 'Done...'
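
Wrapping line.split('\t') in try/except keeps the article import running when a row does not contain exactly one tab (an embedded tab or a truncated line); the offending row is printed rather than aborting the whole run. A small stand-alone sketch of the same defensive parse (the input lines are made up, and here the malformed row is skipped explicitly):

# -*- coding: utf-8 -*-
# Hypothetical articles.csv content; the real file is written by the extracter.
lines = [u'12\tMain Page', u'broken line without a tab', u'15\tAnne Frank']

records = []
for line in lines:
    line = line.strip()
    try:
        id, title = line.split('\t')
    except ValueError:
        # Report and skip malformed rows instead of crashing the import.
        print(line.encode('utf-8'))
        continue
    records.append({'id': id, 'title': title})

print(records)
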
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -421,7 +421,7 @@
 
 
 def parse_xml(fh):
-    context = iterparse(fh, events=('start', 'end'))
+    context = iterparse(fh, events=('end',))
     context = iter(context)
 
     article = {}
@@ -439,11 +439,14 @@
             id = True
         elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
             yield article
+            elem.clear()
             for elem in article.values():
                 elem.clear()
             article = {}
             article['revisions'] = []
             id = False
+        elif event == 'end':
+            elem.clear()
 
 
 def stream_raw_xml(input_queue, storage, id, function, dataset):
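
Requesting only 'end' events and calling elem.clear() on every element is what keeps iterparse() memory-bounded on a multi-gigabyte dump: an element is complete when it closes, and its subtree can be thrown away as soon as its data has been copied out. A self-contained sketch of the pattern with cElementTree and a tiny in-memory document (the real dump is namespaced; that detail is omitted here):

from xml.etree.cElementTree import iterparse
from io import BytesIO

xml = b"""<mediawiki>
  <page><title>A</title><revision><id>1</id></revision></page>
  <page><title>B</title><revision><id>2</id></revision></page>
</mediawiki>"""

def parse_pages(fh):
    article = {}
    for event, elem in iterparse(fh, events=('end',)):
        if elem.tag == 'title':
            article['title'] = elem.text  # copy the data out first
        elif elem.tag == 'page':
            yield article
            article = {}
        # Clear every finished element so the tree never grows with the file.
        elem.clear()

for article in parse_pages(BytesIO(xml)):
    print(article)
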
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -345,6 +345,7 @@
 
 
 def prepare(output):
+    res = file_utils.delete_file(output, 'articles.csv')
     res = file_utils.delete_file(output, None, directory=True)
     if res:
         res = file_utils.create_directory(output)
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -191,7 +191,9 @@
 
 
 def transform_editors_single_launcher(rts):
+    print rts.dbname, rts.editors_raw
     ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
+    print len(ids)
     input_db, output_db = setup_database(rts)
     pbar = progressbar.ProgressBar(maxval=len(ids)).start()
     for x, id in enumerate(ids):
Index: trunk/tools/editor_trends/classes/exceptions.py
@@ -71,6 +71,16 @@
         return 'Currently, chart type %s is not supported. Please choose one of \
         the following charts: %s' % (self.chart, self.charts)
 
+class UnknownPluginError(Error):
+    def __init__(self, plugin, plugins):
+        self.plugin = plugin
+        self.plugins = plugins
+
+    def __str__(self):
+        return 'Plugin %s is an unknown plugin. Please choose one of the \
+        the following plugins: %s' % (self.plugin, self.plugins)
+
+
 class NotYetImplementedError(Error):
     def __init__(self, func):
         self.func = func
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -32,6 +32,7 @@
 from settings import Settings
 from utils import text_utils
 from utils import ordered_dict as odict
+from analyses import inventory
 import languages
 import projects
 
@@ -59,7 +60,7 @@
             self.input_location != None else self.get_value('location')
         self.project = self.update_project_settings()
         self.language = self.update_language_settings()
-        self.targets = self.split_keywords(self.get_value('charts'))
+        self.charts = self.determine_chart(self.get_value('charts'))
         self.keywords = self.split_keywords(self.get_value('keywords'))
         self.function = self.get_value('func')
 
@@ -72,8 +73,6 @@
 
         self.dataset = os.path.join(self.dataset_location,
                                     self.project.name)
-        self.charts = os.path.join(self.chart_location,
-                                   self.project.name)
 
         self.txt = os.path.join(self.location, 'txt')
         self.sorted = os.path.join(self.location, 'sorted')
@@ -81,8 +80,7 @@
         self.directories = [self.location,
                             self.txt,
                             self.sorted,
-                            self.dataset,
-                            self.charts]
+                            self.dataset]
         self.dump_filename = self.generate_wikidump_filename()
         self.dump_relative_path = self.set_dump_path()
         self.dump_absolute_path = self.set_dump_path(absolute=True)
@@ -121,10 +119,21 @@
             except ValueError:
                 pass
             d[key] = value
-        else:
-            return keywords
         return d
 
+    def determine_chart(self, chart):
+        requested_charts = []
+        if chart != None:
+            charts = chart.split(',')
+            available_charts = inventory.available_analyses()
+            for chart in charts:
+                if chart not in available_charts:
+                    raise exception.UnknownChartError(chart, available_charts)
+                    sys.exit(-1)
+                else:
+                    requested_charts.append(chart)
+        return requested_charts
+
     def get_project_location(self):
         '''
         Construct the full project location
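
determine_chart() turns the comma-separated value of -c/--charts into a validated list, failing fast when a requested name is not among the plugins reported by inventory.available_analyses(). (Note that the committed check references exception.UnknownChartError rather than the exceptions module used elsewhere in this revision, and the sys.exit(-1) after the raise can never run.) A rough stand-alone sketch of the validation, with a hard-coded set of available charts:

class UnknownChartError(Exception):
    def __init__(self, chart, charts):
        Exception.__init__(self)
        self.chart = chart
        self.charts = charts

    def __str__(self):
        return 'Unknown chart %s; valid charts: %s' % (
            self.chart, ', '.join(sorted(self.charts)))

AVAILABLE_CHARTS = {'new_editor_count': None, 'edit_patterns': None}

def determine_chart(chart):
    requested_charts = []
    if chart is not None:
        for name in chart.split(','):
            if name not in AVAILABLE_CHARTS:
                raise UnknownChartError(name, AVAILABLE_CHARTS)
            requested_charts.append(name)
    return requested_charts

print(determine_chart('new_editor_count,edit_patterns'))
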
Index: trunk/tools/editor_trends/classes/analytics.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = '2011-03-28'
+__version__ = '0.1'
+
+import sys
+from Queue import Empty
+
+if '..' not in sys.path:
+    sys.path.append('..')
+
+from classes import consumers
+from database import db
+
+class Replicator:
+    def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs):
+        #this is an ugly hack to prevent a circular import problem
+        #this needs a better fix.
+        import manage
+
+        project, language, parser = manage.init_args_parser()
+        self.project = project
+        self.language = language
+        self.args = parser.parse_args(['django'])
+        self.plugin = plugin
+        self.time_unit = time_unit
+        languages = kwargs.pop('languages', False)
+        if languages:
+            self.languages = ['de', 'fr', 'es', 'ja', 'ru']
+        else:
+            self.languages = ['en']
+        if cutoff == None:
+            self.cutoff = [1, 10, 50]
+        else:
+            self.cutoff = cutoff
+
+        if cutoff == None:
+            self.cum_cutoff = [10]
+        else:
+            self.cum_cutoff = cum_cutoff
+        self.kwargs = kwargs
+
+    def __call__(self):
+        project = 'wiki'
+
+        #rts = runtime_settings.init_environment('wiki', 'en', args)
+
+        for lang in self.languages:
+            self.rts = runtime_settings.init_environment(project, lang, self.args)
+            #TEMP FIX, REMOVE
+            #rts.dbname = 'enwiki'
+            self.rts.editors_dataset = 'editors_dataset'
+
+            self.rts.dbname = '%s%s' % (lang, project)
+            for cum_cutoff in self.cum_cutoff:
+                for cutoff in self.cutoff:
+                    generate_chart_data(self.rts, self.plugin,
+                                        time_unit=self.time_unit,
+                                        cutoff=cutoff, cum_cutoff=cum_cutoff,
+                                        **self.kwargs)
+
+
+class Analyzer(consumers.BaseConsumer):
+    def __init__(self, rts, tasks, result, var):
+        super(Analyzer, self).__init__(rts, tasks, result)
+        self.var = var
+
+    def run(self):
+        '''
+        Generic loop function that loops over all the editors of a Wikipedia
+        project and then calls the plugin that does the actual mapping.
+        '''
+        mongo = db.init_mongo_db(self.rts.dbname)
+        coll = mongo[self.rts.editors_dataset]
+        while True:
+            try:
+                task = self.tasks.get(block=False)
+                self.tasks.task_done()
+                if task == None:
+                    self.result.put(self.var)
+                    break
+                editor = coll.find_one({'editor': task.editor})
+
+                task.plugin(self.var, editor, dbname=self.rts.dbname)
+                self.result.put(True)
+            except Empty:
+                pass
+
+class Task:
+    def __init__(self, plugin, editor):
+        self.plugin = plugin
+        self.editor = editor
Property changes on: trunk/tools/editor_trends/classes/analytics.py
___________________________________________________________________
Added: svn:eol-style
   + native
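
Moving Analyzer and Task into classes/analytics.py breaks the analyzer/manage import cycle and isolates the multiprocessing plumbing: Task objects go onto a JoinableQueue, each worker stops when it pulls a None sentinel, and results come back on a second queue. A condensed, runnable sketch of that producer/consumer layout (the plugin here just counts edits instead of querying MongoDB):

from multiprocessing import JoinableQueue, Process
from Queue import Empty  # Python 2 name; 'queue' on Python 3

class Task(object):
    def __init__(self, plugin, editor):
        self.plugin = plugin
        self.editor = editor

class Analyzer(Process):
    def __init__(self, tasks, result):
        Process.__init__(self)
        self.tasks = tasks
        self.result = result

    def run(self):
        while True:
            try:
                task = self.tasks.get(block=False)
                self.tasks.task_done()
                if task is None:  # sentinel: no more work for this worker
                    break
                self.result.put(task.plugin(task.editor))
            except Empty:
                pass

def count_edits(editor):
    return len(editor.get('edits', []))

if __name__ == '__main__':
    tasks, result = JoinableQueue(), JoinableQueue()
    editors = [{'edits': [1, 2]}, {'edits': [3]}]
    for editor in editors:
        tasks.put(Task(count_edits, editor))
    workers = [Analyzer(tasks, result) for _ in range(2)]
    for worker in workers:
        tasks.put(None)  # one sentinel per worker
        worker.start()
    for worker in workers:
        worker.join()
    print([result.get() for _ in editors])
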
Index: trunk/tools/editor_trends/database/db.py
@@ -80,9 +80,9 @@
     mongo = init_mongo_db(dbname)
     collection = mongo[collection]
     if qualifier == 'min':
-        return collection.find().sort(var, pymongo.ASCENDING).limit(1)[0]
+        return collection.find({var : {'$ne' : False}}).sort(var, pymongo.ASCENDING).limit(1)[0]
     elif qualifier == 'max':
-        return collection.find().sort(var, pymongo.DESCENDING).limit(1)[0]
+        return collection.find({var : {'$ne' : False}}).sort(var, pymongo.DESCENDING).limit(1)[0]
     else:
         return collection.find({var: 1})
 
@@ -120,9 +120,6 @@
     return data
 
 
-def retrieve_max_value(dbname, collection, var):
-    pass
-
 def retrieve_distinct_keys(dbname, collection, field, force_new=False):
 #mongo = init_mongo_db(dbname)
 #editors = mongo[collection]
@@ -132,8 +129,8 @@
     < 4mb just do a distinct query, index > 4mb do a map reduce.
     '''
     if force_new == False and file_utils.check_file_exists(settings.binary_location,
-                                                            '%s_%s.bin' % (dbname, field)):
-        ids = file_utils.load_object(settings.binary_location, '%s_%s.bin' % (dbname, field))
+                                                            '%s_%s_%s.bin' % (dbname, collection, field)):
+        ids = file_utils.load_object(settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field))
     else:
         mongo = init_mongo_db(dbname)
         editors = mongo[collection]
@@ -145,7 +142,7 @@
 #params['size'] = 'size'
 #size = editors.find_one({'size': 1})
         ids = retrieve_distinct_keys_mapreduce(editors, field)
-        file_utils.store_object(ids, settings.binary_location, '%s_%s.bin' % (dbname, field))
+        file_utils.store_object(ids, settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field))
     return ids
 
 
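
The min/max queries now exclude documents where the field holds the sentinel False; without the filter, False sorts before every real date, so the 'min' query would always return a sentinel row. The cached id files likewise gain the collection name, so two collections in the same database no longer overwrite each other's .bin file. A sketch of the query side with pymongo (database and collection names are examples, and a local mongod is assumed):

import pymongo

def run_query(dbname, collection_name, var, qualifier):
    # Filter out the False sentinel so min/max range over real values only.
    mongo = pymongo.MongoClient()[dbname]  # pymongo >= 2.4; the 2011 code used Connection()
    collection = mongo[collection_name]
    if qualifier == 'min':
        return collection.find({var: {'$ne': False}}).sort(var, pymongo.ASCENDING).limit(1)[0]
    elif qualifier == 'max':
        return collection.find({var: {'$ne': False}}).sort(var, pymongo.DESCENDING).limit(1)[0]
    return collection.find({var: 1})

# Example call (requires a running mongod with the data loaded):
# print(run_query('enwiki', 'editors_dataset', 'new_wikipedian', 'min'))
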