r84920 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r84919 | r84920 | r84921
Date: 20:52, 28 March 2011
Author: diederik
Status: deferred
Tags:
Comment: Fixed command line support for generating datasets.
Modified paths:
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified)
  • /trunk/tools/editor_trends/analyses/inventory.py (modified)
  • /trunk/tools/editor_trends/analyses/plugins/new_editor_count.py (modified)
  • /trunk/tools/editor_trends/classes/analytics.py (added)
  • /trunk/tools/editor_trends/classes/exceptions.py (modified)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified)
  • /trunk/tools/editor_trends/database/db.py (modified)
  • /trunk/tools/editor_trends/etl/enricher.py (modified)
  • /trunk/tools/editor_trends/etl/extracter.py (modified)
  • /trunk/tools/editor_trends/etl/store.py (modified)
  • /trunk/tools/editor_trends/etl/transformer.py (modified)
  • /trunk/tools/editor_trends/manage.py (modified)

Diff

Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py
@@ -26,6 +26,7 @@
     stats.download.org to make sure that we are using the same numbers.
     '''
 # headers = ['year', 'month', 'count']
-    new_wikipedian = editor['new_wikipedian']
-    var.add(new_wikipedian, 1)
+    if editor['new_wikipedian'] != False:
+        new_wikipedian = editor['new_wikipedian']
+        var.add(new_wikipedian, 1)
     return var
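
The change above skips editors whose 'new_wikipedian' field holds the sentinel value False (editors who never crossed the new-wikipedian threshold) instead of feeding the sentinel into the dataset. A minimal stand-alone sketch of that guard, with a plain dict standing in for the toolkit's dataset.Variable (names here are illustrative only):

from collections import defaultdict

def new_editor_count(var, editor):
    # The field is either a date-like value or the sentinel False,
    # so an explicit comparison is needed before counting.
    if editor['new_wikipedian'] != False:
        var[editor['new_wikipedian']] += 1
    return var

counts = defaultdict(int)
for editor in [{'new_wikipedian': '2011-03'}, {'new_wikipedian': False}]:
    counts = new_editor_count(counts, editor)
print(counts)  # only the first editor is counted
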
Index: trunk/tools/editor_trends/analyses/inventory.py
@@ -32,7 +32,7 @@
     '''
     assert caller == 'django' or caller == 'manage'
     ignore = ['__init__']
-    functions = {}
+    charts = {}
 
     fn = os.path.realpath(__file__)
     pos = fn.rfind(os.sep)
@@ -42,14 +42,14 @@
 
     for plugin in plugins:
         if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore:
-            functions[plugin.func_name] = plugin
+            charts[plugin.func_name] = plugin
     if caller == 'manage':
-        return functions
+        return charts
     elif caller == 'django':
         django_functions = []
-        for function in functions:
-            fancy_name = function.replace('_', ' ').title()
-            django_functions.append((function, fancy_name))
+        for chart in charts:
+            fancy_name = chart.replace('_', ' ').title()
+            django_functions.append((chart, fancy_name))
 
         return django_functions
 
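
Renaming functions to charts also documents what available_analyses() hands back: a name-to-callable mapping when called from manage.py, and (value, label) tuples for a Django select widget. A rough sketch of that dual-mode lookup, with two inline dummy plugins instead of the analyses/plugins package:

import types

def _new_editor_count(var, editor, **kwargs):
    return var

def _edit_patterns(var, editor, **kwargs):
    return var

def available_analyses(caller='manage'):
    # Collect plugin functions by name; the toolkit walks the plugins
    # package instead of scanning the current module like this sketch does.
    charts = {}
    for name, obj in list(globals().items()):
        if isinstance(obj, types.FunctionType) and name.startswith('_'):
            charts[name.lstrip('_')] = obj
    if caller == 'manage':
        return charts
    # Django wants (value, label) pairs for its choice fields.
    return [(name, name.replace('_', ' ').title()) for name in charts]

print(sorted(available_analyses(caller='django')))
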
Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -19,7 +19,7 @@
 
 from multiprocessing import JoinableQueue, Manager, RLock, Process
 from multiprocessing.managers import BaseManager
-from Queue import Empty
+
 import sys
 import cPickle
 import os
@@ -30,84 +30,16 @@
 sys.path.append('..')
 
 import inventory
-import manage as manager
 from classes import dataset
 from classes import runtime_settings
 from classes import consumers
+from classes import exceptions
+from classes import analytics
 from database import db
 from utils import timer
 from utils import log
 
-class Replicator:
-    def __init__(self, args, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs):
-        self.args = args
-        self.plugin = plugin
-        self.time_unit = time_unit
-        languages = kwargs.pop('languages', False)
-        if languages:
-            self.languages = ['de', 'fr', 'es', 'ja', 'ru']
-        else:
-            self.languages = ['en']
-        if cutoff == None:
-            self.cutoff = [1, 10, 50]
-        else:
-            self.cutoff = cutoff
 
-        if cutoff == None:
-            self.cum_cutoff = [10]
-        else:
-            self.cum_cutoff = cum_cutoff
-        self.kwargs = kwargs
-
-    def __call__(self):
-        project = 'wiki'
-        for lang in self.languages:
-            self.rts = runtime_settings.init_environment(project, lang, self.args)
-            #TEMP FIX, REMOVE
-            #rts.dbname = 'enwiki'
-            self.rts.editors_dataset = 'editors_dataset'
-
-            self.rts.dbname = '%s%s' % (lang, project)
-            for cum_cutoff in self.cum_cutoff:
-                for cutoff in self.cutoff:
-                    generate_chart_data(self.rts, self.plugin,
-                                        time_unit=self.time_unit,
-                                        cutoff=cutoff, cum_cutoff=cum_cutoff,
-                                        **self.kwargs)
-
-
-class Analyzer(consumers.BaseConsumer):
-    def __init__(self, rts, tasks, result, var):
-        super(Analyzer, self).__init__(rts, tasks, result)
-        self.var = var
-
-    def run(self):
-        '''
-        Generic loop function that loops over all the editors of a Wikipedia
-        project and then calls the plugin that does the actual mapping.
-        '''
-        mongo = db.init_mongo_db(self.rts.dbname)
-        coll = mongo[self.rts.editors_dataset]
-        while True:
-            try:
-                task = self.tasks.get(block=False)
-                self.tasks.task_done()
-                if task == None:
-                    self.result.put(self.var)
-                    break
-                editor = coll.find_one({'editor': task.editor})
-
-                task.plugin(self.var, editor, dbname=self.rts.dbname)
-                self.result.put(True)
-            except Empty:
-                pass
-
-class Task:
-    def __init__(self, plugin, editor):
-        self.plugin = plugin
-        self.editor = editor
-
-
 def reconstruct_observations(var):
     '''
     When the Task queue is empty then the Variable instance is returned. However,
@@ -158,8 +90,9 @@
     '''
     stopwatch = timer.Timer()
     plugin = retrieve_plugin(func)
+    available_plugins = inventory.available_analyses()
     if not plugin:
-        raise 'Plugin function %s is unknown, please make sure that you specify an existing plugin function.' % func
+        raise exceptions.UnknownPluginError(plugin, available_plugins)
     feedback(plugin, rts)
 
     obs = dict()
@@ -183,9 +116,9 @@
     var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)
 
     for editor in editors:
-        tasks.put(Task(plugin, editor))
+        tasks.put(analytics.Task(plugin, editor))
 
-    consumers = [Analyzer(rts, tasks, result, var) for
+    consumers = [analytics.Analyzer(rts, tasks, result, var) for
                  x in xrange(rts.number_of_processes)]
 
 
@@ -228,6 +161,7 @@
     '''
     Determine the first and final year for the observed data
     '''
+    print dbname, collection, var
     try:
         max_year = db.run_query(dbname, collection, var, 'max')
         max_year = max_year[var].year + 1
@@ -240,31 +174,31 @@
 
 
 def launcher():
-    project, language, parser = manager.init_args_parser()
-    args = parser.parse_args(['django'])
-    rts = runtime_settings.init_environment('wiki', 'en', args)
+# project, language, parser = manage.init_args_parser()
+# args = parser.parse_args(['django'])
+# rts = runtime_settings.init_environment('wiki', 'en', args)
 
 #TEMP FIX, REMOVE
-    rts.dbname = 'enwiki'
-    rts.editors_dataset = 'editors_dataset'
+# rts.dbname = 'enwiki'
+# rts.editors_dataset = 'editors_dataset'
 #END TEMP FIX
 
-# replicator = Replicator(rts, 'histogram_by_backward_cohort', time_unit='year')
+# replicator = analytics.Replicator('histogram_by_backward_cohort', time_unit='year')
 # replicator()
-    replicator = Replicator(args, 'cohort_dataset_backward_bar', time_unit='year', format='wide', languages=True)
+    replicator = analytics.Replicator('cohort_dataset_backward_bar', time_unit='year', format='wide', languages=True)
     replicator()
 
-# generate_chart_data(rts, 'histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10)
-# generate_chart_data(rts, 'edit_patterns', time_unit='year', cutoff=5)
-# generate_chart_data(rts, 'total_number_of_new_wikipedians', time_unit='year')
-# generate_chart_data(rts, 'total_number_of_articles', time_unit='year')
-# generate_chart_data(rts, 'total_cumulative_edits', time_unit='year')
-# generate_chart_data(rts, 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10)
-# generate_chart_data(rts, 'cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide')
-# generate_chart_data(rts, 'cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
-# generate_chart_data(rts, 'histogram_edits', time_unit='year', cutoff=0)
-# generate_chart_data(rts, 'time_to_new_wikipedian', time_unit='year', cutoff=0)
-# generate_chart_data(rts, 'new_editor_count', time_unit='month', cutoff=0)
+# generate_chart_data('histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10)
+# generate_chart_data('edit_patterns', time_unit='year', cutoff=5)
+# generate_chart_data('total_number_of_new_wikipedians', time_unit='year')
+# generate_chart_data('total_number_of_articles', time_unit='year')
+# generate_chart_data('total_cumulative_edits', time_unit='year')
+# generate_chart_data('cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10)
+# generate_chart_data('cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide')
+# generate_chart_data('cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide')
+# generate_chart_data('histogram_edits', time_unit='year', cutoff=0)
+# generate_chart_data('time_to_new_wikipedian', time_unit='year', cutoff=0)
+# generate_chart_data('new_editor_count', time_unit='month', cutoff=0)
 # #available_analyses()
 
 
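
The old error path raised a bare string ('Plugin function %s is unknown ...'), which Python 2.6+ rejects; the replacement raises exceptions.UnknownPluginError so the caller gets both the offending name and the list of valid plugins (note that the new call passes plugin, which is empty in that branch, rather than func). A minimal sketch of the raise-an-object pattern, with illustrative names rather than the toolkit's exact classes:

class UnknownPluginError(Exception):
    def __init__(self, plugin, plugins):
        Exception.__init__(self)
        self.plugin = plugin
        self.plugins = plugins

    def __str__(self):
        return 'Plugin %s is unknown. Valid plugins: %s' % (
            self.plugin, ', '.join(sorted(self.plugins)))

available_plugins = {'new_editor_count': None, 'edit_patterns': None}

def retrieve_plugin(func):
    return available_plugins.get(func)

try:
    plugin = retrieve_plugin('no_such_plugin')
    if not plugin:
        raise UnknownPluginError('no_such_plugin', available_plugins)
except UnknownPluginError as e:
    print(e)
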
Index: trunk/tools/editor_trends/manage.py
@@ -22,23 +22,23 @@
 import logging.handlers
 import sys
 import datetime
-from argparse import ArgumentParser
-from argparse import RawTextHelpFormatter
 import ConfigParser
+from argparse import ArgumentParser, RawTextHelpFormatter
 
+from classes import languages
+from classes import projects
+from classes import runtime_settings
 from utils import file_utils
 from utils import ordered_dict
 from utils import log
 from utils import timer
-from classes import projects
-from classes import languages
-from classes import runtime_settings
 from database import db
 from etl import downloader
 from etl import extracter
 from etl import store
 from etl import sort
 from etl import transformer
+from analyses import analyzer
 from analyses import inventory
 
 
@@ -49,6 +49,148 @@
     return choices
 
 
+def init_args_parser():
+    '''
+    Entry point for parsing command line and launching the needed function(s).
+    '''
+    language = languages.init()
+    project = projects.init()
+    pjc = projects.ProjectContainer()
+    rts = runtime_settings.RunTimeSettings(project, language)
+
+    #Init Argument Parser
+    parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
+    subparsers = parser.add_subparsers(help='sub - command help')
+
+    #SHOW LANGUAGES
+    parser_languages = subparsers.add_parser('show_languages',
+        help='Overview of all valid languages.')
+    parser_languages.add_argument('-s', '--startswith',
+        action='store',
+        help='Enter the first letter of a language to see which languages are \
+        available.')
+    parser_languages.set_defaults(func=language.show_languages, args=[project])
+
+    #CONFIG
+    parser_config = subparsers.add_parser('config',
+        help='The config sub command allows you set the data location of where \
+        to store files.')
+    parser_config.set_defaults(func=config_launcher)
+    parser_config.add_argument('-f', '--force',
+        action='store_true',
+        help='Reconfigure Editor Toolkit (this will replace wiki.cfg')
+
+    #DOWNLOAD
+    parser_download = subparsers.add_parser('download',
+        help='The download sub command allows you to download a Wikipedia dump\
+        file.')
+    parser_download.set_defaults(func=downloader_launcher)
+
+    #EXTRACT
+    parser_create = subparsers.add_parser('extract',
+        help='The store sub command parsers the XML chunk files, extracts the \
+        information and stores it in a MongoDB.')
+    parser_create.set_defaults(func=extract_launcher)
+
+
+    #SORT
+    parser_sort = subparsers.add_parser('sort',
+        help='By presorting the data, significant processing time reductions \
+        are achieved.')
+    parser_sort.set_defaults(func=sort_launcher)
+
+    #STORE
+    parser_store = subparsers.add_parser('store',
+        help='The store sub command parsers the XML chunk files, extracts the \
+        information and stores it in a MongoDB.')
+    parser_store.set_defaults(func=store_launcher)
+
+    #TRANSFORM
+    parser_transform = subparsers.add_parser('transform',
+        help='Transform the raw datatable to an enriched dataset that can be \
+        exported.')
+    parser_transform.set_defaults(func=transformer_launcher)
+
+    #DATASET
+    parser_dataset = subparsers.add_parser('dataset',
+        help='Create a dataset from the MongoDB and write it to a csv file.')
+    parser_dataset.set_defaults(func=dataset_launcher)
+    parser_dataset.add_argument('-c', '--charts',
+        action='store',
+        help='Should be a valid function name that matches one of the plugin functions',
+        default=inventory.available_analyses()['new_editor_count'])
+
+    parser_dataset.add_argument('-k', '--keywords',
+        action='store',
+        help='Add additional keywords in the format keyword1=value1,keyword2=value2',
+        default='')
+
+    #ALL
+    parser_all = subparsers.add_parser('all',
+        help='The all sub command runs the download, split, store and dataset \
+        commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE \
+        CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.')
+    parser_all.set_defaults(func=all_launcher)
+    parser_all.add_argument('-e', '--except',
+        action='store',
+        help='Should be a list of functions that are to be ignored when \
+        executing all.',
+        default=[])
+
+    parser_all.add_argument('-n', '--new',
+        action='store_true',
+        help='This will delete all previous output and starts from scratch. \
+        Mostly useful for debugging purposes.',
+        default=False)
+
+    #DJANGO
+    parser_django = subparsers.add_parser('django')
+    parser_django.add_argument('-e', '--except',
+        action='store',
+        help='Should be a list of functions that are to be ignored when \
+        executing all.',
+        default=[])
+
+    parser.add_argument('-l', '--language',
+        action='store',
+        help='Example of valid languages.',
+        choices=project.supported_languages(),
+        default=unicode(language.name)
+        )
+
+    parser.add_argument('-p', '--project',
+        action='store',
+        help='Specify the Wikimedia project that you would like to download',
+        choices=pjc.supported_projects(),
+        default='wiki')
+
+    parser.add_argument('-c', '--collection',
+        action='store',
+        help='Name of MongoDB collection',
+        default='editors_raw')
+
+    parser.add_argument('-o', '--location',
+        action='store',
+        help='Indicate where you want to store the downloaded file.',
+        #default=settings.input_location)
+        default=rts.input_location)
+
+    parser.add_argument('-ns', '--namespace',
+        action='store',
+        help='A list of namespaces to include for analysis.',
+        default='0')
+
+    parser.add_argument('-f', '--file',
+        action='store',
+        choices=rts.file_choices,
+        help='Indicate which dump you want to download. Valid choices are:\n \
+        %s' % ''.join([f + ',\n' for f in rts.file_choices]),
+        default='stub-meta-history.xml.gz')
+
+
+    return project, language, parser
+
+
 def config_launcher(rts, logger):
     '''
     Config launcher is used to reconfigure editor trends toolkit.
@@ -178,8 +320,7 @@
 # collection=properties.collection)
     transformer.transform_editors_single_launcher(rts)
     stopwatch.elapsed()
-    log.log_to_mongo(rts, 'dataset', 'transform', stopwatch,
-                     event='finish')
+    log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='finish')
 
 
 def dataset_launcher(rts, logger):
@@ -187,20 +328,14 @@
     stopwatch = timer.Timer()
     log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start')
 
-    #collection = '%s_%s' % (rts.collection, 'dataset')
-    for target in rts.targets:
+    for chart in rts.charts:
+        analyzer.generate_chart_data(rts, chart, **rts.keywords)
 # write_message_to_log(logger, settings,
 # message=None,
 # verb='Exporting',
 # target=target,
 # dbname=properties.full_project,
 # collection=properties.collection)
-
-        analyzer.generate_chart_data(rts.dbname,
-                                     rts.editors_dataset,
-                                     rts.language.code,
-                                     target,
-                                     **rts.keywords)
     stopwatch.elapsed()
     log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='finish')
 
@@ -414,8 +549,6 @@
         default='stub-meta-history.xml.gz')
 
 
-    return project, language, parser
-
 def main():
     project, language, parser, = init_args_parser()
     args = parser.parse_args()
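
init_args_parser() wires every sub-command to its launcher through set_defaults(func=...), so main() can parse once and dispatch with args.func(...). A cut-down, runnable sketch of that argparse layout (only two placeholder sub-commands and launchers, not the toolkit's full set):

from argparse import ArgumentParser, RawTextHelpFormatter

def downloader_launcher(args):
    print('would download a dump for project %s' % args.project)

def dataset_launcher(args):
    print('would export charts: %s' % args.charts)

def init_args_parser():
    parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-p', '--project', action='store', default='wiki')
    subparsers = parser.add_subparsers(help='sub-command help')

    parser_download = subparsers.add_parser('download')
    parser_download.set_defaults(func=downloader_launcher)

    parser_dataset = subparsers.add_parser('dataset')
    parser_dataset.add_argument('-c', '--charts', action='store',
                                default='new_editor_count')
    parser_dataset.set_defaults(func=dataset_launcher)
    return parser

if __name__ == '__main__':
    args = init_args_parser().parse_args(['dataset', '-c', 'new_editor_count'])
    args.func(args)  # dispatch to the launcher selected by the sub-command
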
Index: trunk/tools/editor_trends/etl/store.py
@@ -86,9 +86,13 @@
     location = os.path.join(rts.input_location, rts.language.code, rts.project.name)
     fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding)
     print 'Storing article titles...'
+    print location
     for line in fh:
         line = line.strip()
-        id, title = line.split('\t')
+        try:
+            id, title = line.split('\t')
+        except ValueError:
+            print line.encode('utf-8')
         collection.insert({'id':id, 'title':title})
     fh.close()
     print 'Done...'
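
Wrapping line.split('\t') in try/except keeps the article import running when a row does not contain exactly one tab (an embedded tab or a truncated line); the offending row is printed rather than aborting the whole run. A small stand-alone sketch of the same defensive parse (the input lines are made up, and here the malformed row is skipped explicitly):

# -*- coding: utf-8 -*-
# Hypothetical articles.csv content; the real file is written by the extracter.
lines = [u'12\tMain Page', u'broken line without a tab', u'15\tAnne Frank']

records = []
for line in lines:
    line = line.strip()
    try:
        id, title = line.split('\t')
    except ValueError:
        # Report and skip malformed rows instead of crashing the import.
        print(line.encode('utf-8'))
        continue
    records.append({'id': id, 'title': title})

print(records)
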
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -421,7 +421,7 @@
 
 
 def parse_xml(fh):
-    context = iterparse(fh, events=('start', 'end'))
+    context = iterparse(fh, events=('end',))
     context = iter(context)
 
     article = {}
@@ -439,11 +439,14 @@
             id = True
         elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'):
             yield article
+            elem.clear()
             for elem in article.values():
                 elem.clear()
             article = {}
             article['revisions'] = []
             id = False
+        elif event == 'end':
+            elem.clear()
 
 
 def stream_raw_xml(input_queue, storage, id, function, dataset):
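
Requesting only 'end' events and calling elem.clear() on every element is what keeps iterparse() memory-bounded on a multi-gigabyte dump: an element is complete when it closes, and its subtree can be thrown away as soon as its data has been copied out. A self-contained sketch of the pattern with cElementTree and a tiny in-memory document (the real dump is namespaced; that detail is omitted here):

from xml.etree.cElementTree import iterparse
from io import BytesIO

xml = b"""<mediawiki>
  <page><title>A</title><revision><id>1</id></revision></page>
  <page><title>B</title><revision><id>2</id></revision></page>
</mediawiki>"""

def parse_pages(fh):
    article = {}
    for event, elem in iterparse(fh, events=('end',)):
        if elem.tag == 'title':
            article['title'] = elem.text  # copy the data out first
        elif elem.tag == 'page':
            yield article
            article = {}
        # Clear every finished element so the tree never grows with the file.
        elem.clear()

for article in parse_pages(BytesIO(xml)):
    print(article)
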
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -345,6 +345,7 @@
 
 
 def prepare(output):
+    res = file_utils.delete_file(output, 'articles.csv')
     res = file_utils.delete_file(output, None, directory=True)
     if res:
         res = file_utils.create_directory(output)
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -191,7 +191,9 @@
 
 
 def transform_editors_single_launcher(rts):
+    print rts.dbname, rts.editors_raw
     ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor')
+    print len(ids)
     input_db, output_db = setup_database(rts)
     pbar = progressbar.ProgressBar(maxval=len(ids)).start()
     for x, id in enumerate(ids):
Index: trunk/tools/editor_trends/classes/exceptions.py
@@ -71,6 +71,16 @@
         return 'Currently, chart type %s is not supported. Please choose one of \
         the following charts: %s' % (self.chart, self.charts)
 
+class UnknownPluginError(Error):
+    def __init__(self, plugin, plugins):
+        self.plugin = plugin
+        self.plugins = plugins
+
+    def __str__(self):
+        return 'Plugin %s is an unknown plugin. Please choose one of the \
+        the following plugins: %s' % (self.plugin, self.plugins)
+
+
 class NotYetImplementedError(Error):
     def __init__(self, func):
         self.func = func
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -32,6 +32,7 @@
 from settings import Settings
 from utils import text_utils
 from utils import ordered_dict as odict
+from analyses import inventory
 import languages
 import projects
 
@@ -59,7 +60,7 @@
             self.input_location != None else self.get_value('location')
         self.project = self.update_project_settings()
         self.language = self.update_language_settings()
-        self.targets = self.split_keywords(self.get_value('charts'))
+        self.charts = self.determine_chart(self.get_value('charts'))
         self.keywords = self.split_keywords(self.get_value('keywords'))
         self.function = self.get_value('func')
 
@@ -72,8 +73,6 @@
 
         self.dataset = os.path.join(self.dataset_location,
                                     self.project.name)
-        self.charts = os.path.join(self.chart_location,
-                                   self.project.name)
 
         self.txt = os.path.join(self.location, 'txt')
         self.sorted = os.path.join(self.location, 'sorted')
@@ -81,8 +80,7 @@
         self.directories = [self.location,
                             self.txt,
                             self.sorted,
-                            self.dataset,
-                            self.charts]
+                            self.dataset]
         self.dump_filename = self.generate_wikidump_filename()
         self.dump_relative_path = self.set_dump_path()
         self.dump_absolute_path = self.set_dump_path(absolute=True)
@@ -121,10 +119,21 @@
             except ValueError:
                 pass
             d[key] = value
-        else:
-            return keywords
         return d
 
+    def determine_chart(self, chart):
+        requested_charts = []
+        if chart != None:
+            charts = chart.split(',')
+            available_charts = inventory.available_analyses()
+            for chart in charts:
+                if chart not in available_charts:
+                    raise exception.UnknownChartError(chart, available_charts)
+                    sys.exit(-1)
+                else:
+                    requested_charts.append(chart)
+        return requested_charts
+
     def get_project_location(self):
         '''
         Construct the full project location
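
determine_chart() turns the comma-separated value of -c/--charts into a validated list, failing fast when a requested name is not among the plugins reported by inventory.available_analyses(). (Note that the committed check references exception.UnknownChartError rather than the exceptions module used elsewhere in this revision, and the sys.exit(-1) after the raise can never run.) A rough stand-alone sketch of the validation, with a hard-coded set of available charts:

class UnknownChartError(Exception):
    def __init__(self, chart, charts):
        Exception.__init__(self)
        self.chart = chart
        self.charts = charts

    def __str__(self):
        return 'Unknown chart %s; valid charts: %s' % (
            self.chart, ', '.join(sorted(self.charts)))

AVAILABLE_CHARTS = {'new_editor_count': None, 'edit_patterns': None}

def determine_chart(chart):
    requested_charts = []
    if chart is not None:
        for name in chart.split(','):
            if name not in AVAILABLE_CHARTS:
                raise UnknownChartError(name, AVAILABLE_CHARTS)
            requested_charts.append(name)
    return requested_charts

print(determine_chart('new_editor_count,edit_patterns'))
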
Index: trunk/tools/editor_trends/classes/analytics.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__author__email = 'dvanliere at gmail dot com'
+__date__ = '2011-03-28'
+__version__ = '0.1'
+
+import sys
+from Queue import Empty
+
+if '..' not in sys.path:
+    sys.path.append('..')
+
+from classes import consumers
+from database import db
+
+class Replicator:
+    def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs):
+        #this is an ugly hack to prevent a circular import problem
+        #this needs a better fix.
+        import manage
+
+        project, language, parser = manage.init_args_parser()
+        self.project = project
+        self.language = language
+        self.args = parser.parse_args(['django'])
+        self.plugin = plugin
+        self.time_unit = time_unit
+        languages = kwargs.pop('languages', False)
+        if languages:
+            self.languages = ['de', 'fr', 'es', 'ja', 'ru']
+        else:
+            self.languages = ['en']
+        if cutoff == None:
+            self.cutoff = [1, 10, 50]
+        else:
+            self.cutoff = cutoff
+
+        if cutoff == None:
+            self.cum_cutoff = [10]
+        else:
+            self.cum_cutoff = cum_cutoff
+        self.kwargs = kwargs
+
+    def __call__(self):
+        project = 'wiki'
+
+        #rts = runtime_settings.init_environment('wiki', 'en', args)
+
+        for lang in self.languages:
+            self.rts = runtime_settings.init_environment(project, lang, self.args)
+            #TEMP FIX, REMOVE
+            #rts.dbname = 'enwiki'
+            self.rts.editors_dataset = 'editors_dataset'
+
+            self.rts.dbname = '%s%s' % (lang, project)
+            for cum_cutoff in self.cum_cutoff:
+                for cutoff in self.cutoff:
+                    generate_chart_data(self.rts, self.plugin,
+                                        time_unit=self.time_unit,
+                                        cutoff=cutoff, cum_cutoff=cum_cutoff,
+                                        **self.kwargs)
+
+
+class Analyzer(consumers.BaseConsumer):
+    def __init__(self, rts, tasks, result, var):
+        super(Analyzer, self).__init__(rts, tasks, result)
+        self.var = var
+
+    def run(self):
+        '''
+        Generic loop function that loops over all the editors of a Wikipedia
+        project and then calls the plugin that does the actual mapping.
+        '''
+        mongo = db.init_mongo_db(self.rts.dbname)
+        coll = mongo[self.rts.editors_dataset]
+        while True:
+            try:
+                task = self.tasks.get(block=False)
+                self.tasks.task_done()
+                if task == None:
+                    self.result.put(self.var)
+                    break
+                editor = coll.find_one({'editor': task.editor})
+
+                task.plugin(self.var, editor, dbname=self.rts.dbname)
+                self.result.put(True)
+            except Empty:
+                pass
+
+class Task:
+    def __init__(self, plugin, editor):
+        self.plugin = plugin
+        self.editor = editor
Property changes on: trunk/tools/editor_trends/classes/analytics.py
___________________________________________________________________
Added: svn:eol-style
   + native
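
Moving Analyzer and Task into classes/analytics.py breaks the analyzer/manage import cycle and isolates the multiprocessing plumbing: Task objects go onto a JoinableQueue, each worker stops when it pulls a None sentinel, and results come back on a second queue. A condensed, runnable sketch of that producer/consumer layout (the plugin here just counts edits instead of querying MongoDB):

from multiprocessing import JoinableQueue, Process
from Queue import Empty  # Python 2 name; 'queue' on Python 3

class Task(object):
    def __init__(self, plugin, editor):
        self.plugin = plugin
        self.editor = editor

class Analyzer(Process):
    def __init__(self, tasks, result):
        Process.__init__(self)
        self.tasks = tasks
        self.result = result

    def run(self):
        while True:
            try:
                task = self.tasks.get(block=False)
                self.tasks.task_done()
                if task is None:  # sentinel: no more work for this worker
                    break
                self.result.put(task.plugin(task.editor))
            except Empty:
                pass

def count_edits(editor):
    return len(editor.get('edits', []))

if __name__ == '__main__':
    tasks, result = JoinableQueue(), JoinableQueue()
    editors = [{'edits': [1, 2]}, {'edits': [3]}]
    for editor in editors:
        tasks.put(Task(count_edits, editor))
    workers = [Analyzer(tasks, result) for _ in range(2)]
    for worker in workers:
        tasks.put(None)  # one sentinel per worker
        worker.start()
    for worker in workers:
        worker.join()
    print([result.get() for _ in editors])
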
Index: trunk/tools/editor_trends/database/db.py
@@ -80,9 +80,9 @@
     mongo = init_mongo_db(dbname)
     collection = mongo[collection]
     if qualifier == 'min':
-        return collection.find().sort(var, pymongo.ASCENDING).limit(1)[0]
+        return collection.find({var : {'$ne' : False}}).sort(var, pymongo.ASCENDING).limit(1)[0]
     elif qualifier == 'max':
-        return collection.find().sort(var, pymongo.DESCENDING).limit(1)[0]
+        return collection.find({var : {'$ne' : False}}).sort(var, pymongo.DESCENDING).limit(1)[0]
     else:
         return collection.find({var: 1})
 
@@ -120,9 +120,6 @@
     return data
 
 
-def retrieve_max_value(dbname, collection, var):
-    pass
-
 def retrieve_distinct_keys(dbname, collection, field, force_new=False):
 #mongo = init_mongo_db(dbname)
 #editors = mongo[collection]
@@ -132,8 +129,8 @@
     < 4mb just do a distinct query, index > 4mb do a map reduce.
     '''
     if force_new == False and file_utils.check_file_exists(settings.binary_location,
-                                                            '%s_%s.bin' % (dbname, field)):
-        ids = file_utils.load_object(settings.binary_location, '%s_%s.bin' % (dbname, field))
+                                                            '%s_%s_%s.bin' % (dbname, collection, field)):
+        ids = file_utils.load_object(settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field))
     else:
         mongo = init_mongo_db(dbname)
         editors = mongo[collection]
@@ -145,7 +142,7 @@
 #params['size'] = 'size'
 #size = editors.find_one({'size': 1})
         ids = retrieve_distinct_keys_mapreduce(editors, field)
-        file_utils.store_object(ids, settings.binary_location, '%s_%s.bin' % (dbname, field))
+        file_utils.store_object(ids, settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field))
     return ids
 
 
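
The min/max queries now exclude documents where the field holds the sentinel False; without the filter, False sorts before every real date, so the 'min' query would always return a sentinel row. The cached id files likewise gain the collection name, so two collections in the same database no longer overwrite each other's .bin file. A sketch of the query side with pymongo (database and collection names are examples, and a local mongod is assumed):

import pymongo

def run_query(dbname, collection_name, var, qualifier):
    # Filter out the False sentinel so min/max range over real values only.
    mongo = pymongo.MongoClient()[dbname]  # pymongo >= 2.4; the 2011 code used Connection()
    collection = mongo[collection_name]
    if qualifier == 'min':
        return collection.find({var: {'$ne': False}}).sort(var, pymongo.ASCENDING).limit(1)[0]
    elif qualifier == 'max':
        return collection.find({var: {'$ne': False}}).sort(var, pymongo.DESCENDING).limit(1)[0]
    return collection.find({var: 1})

# Example call (requires a running mongod with the data loaded):
# print(run_query('enwiki', 'editors_dataset', 'new_wikipedian', 'min'))
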