Index: trunk/tools/editor_trends/analyses/plugins/new_editor_count.py |
— | — | @@ -26,6 +26,7 @@ |
27 | 27 | stats.download.org to make sure that we are using the same numbers. |
28 | 28 | ''' |
29 | 29 | # headers = ['year', 'month', 'count'] |
30 | | - new_wikipedian = editor['new_wikipedian'] |
31 | | - var.add(new_wikipedian, 1) |
| 30 | + if editor['new_wikipedian'] != False: |
| 31 | + new_wikipedian = editor['new_wikipedian'] |
| 32 | + var.add(new_wikipedian, 1) |
32 | 33 | return var |
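
Note: the new guard matters because 'new_wikipedian' holds False for editors who never crossed the new-wikipedian threshold, and the unguarded plugin passed that boolean to var.add() as if it were a date. A self-contained sketch of the effect, with an illustrative stand-in for dataset.Variable (not the toolkit's real class):

    import datetime

    class Variable(object):
        '''Stand-in for dataset.Variable: tallies observations per date.'''
        def __init__(self):
            self.obs = {}

        def add(self, date, count):
            self.obs[date] = self.obs.get(date, 0) + count

    editors = [
        {'editor': 'a', 'new_wikipedian': datetime.datetime(2009, 5, 1)},
        {'editor': 'b', 'new_wikipedian': False},   # threshold never reached
    ]
    var = Variable()
    for editor in editors:
        if editor['new_wikipedian'] != False:   # the guard added above
            var.add(editor['new_wikipedian'], 1)
    print(var.obs)   # counts only editor 'a'
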
Index: trunk/tools/editor_trends/analyses/inventory.py |
— | — | @@ -32,7 +32,7 @@ |
33 | 33 | ''' |
34 | 34 | assert caller == 'django' or caller == 'manage' |
35 | 35 | ignore = ['__init__'] |
36 | | - functions = {} |
| 36 | + charts = {} |
37 | 37 | |
38 | 38 | fn = os.path.realpath(__file__) |
39 | 39 | pos = fn.rfind(os.sep) |
— | — | @@ -42,14 +42,14 @@ |
43 | 43 | |
44 | 44 | for plugin in plugins: |
45 | 45 | if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore: |
46 | | - functions[plugin.func_name] = plugin |
| 46 | + charts[plugin.func_name] = plugin |
47 | 47 | if caller == 'manage': |
48 | | - return functions |
| 48 | + return charts |
49 | 49 | elif caller == 'django': |
50 | 50 | django_functions = [] |
51 | | - for function in functions: |
52 | | - fancy_name = function.replace('_', ' ').title() |
53 | | - django_functions.append((function, fancy_name)) |
| 51 | + for chart in charts: |
| 52 | + fancy_name = chart.replace('_', ' ').title() |
| 53 | + django_functions.append((chart, fancy_name)) |
54 | 54 | |
55 | 55 | return django_functions |
56 | 56 | |
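
Note: the functions -> charts rename lines this module up with the new -c/--charts flag in manage.py and with runtime_settings.determine_chart below: the dict maps chart names to plugin callables. A condensed sketch of the discovery pattern, hedged in that it takes the module as an argument instead of re-importing plugins from the file path the way the real code does:

    import types

    def available_analyses(module, ignore=('__init__',)):
        '''Map chart names to the plugin functions defined in module.'''
        charts = {}
        for obj in vars(module).values():
            if isinstance(obj, types.FunctionType) and obj.__name__ not in ignore:
                charts[obj.__name__] = obj
        # Django consumes (value, label) tuples for form choices:
        django_choices = [(name, name.replace('_', ' ').title())
                          for name in charts]
        return charts, django_choices
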
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -19,7 +19,7 @@ |
20 | 20 | |
21 | 21 | from multiprocessing import JoinableQueue, Manager, RLock, Process |
22 | 22 | from multiprocessing.managers import BaseManager |
23 | | -from Queue import Empty |
| 23 | + |
24 | 24 | import sys |
25 | 25 | import cPickle |
26 | 26 | import os |
— | — | @@ -30,84 +30,16 @@ |
31 | 31 | sys.path.append('..') |
32 | 32 | |
33 | 33 | import inventory |
34 | | -import manage as manager |
35 | 34 | from classes import dataset |
36 | 35 | from classes import runtime_settings |
37 | 36 | from classes import consumers |
| 37 | +from classes import exceptions |
| 38 | +from classes import analytics |
38 | 39 | from database import db |
39 | 40 | from utils import timer |
40 | 41 | from utils import log |
41 | 42 | |
42 | | -class Replicator: |
43 | | - def __init__(self, args, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs): |
44 | | - self.args = args |
45 | | - self.plugin = plugin |
46 | | - self.time_unit = time_unit |
47 | | - languages = kwargs.pop('languages', False) |
48 | | - if languages: |
49 | | - self.languages = ['de', 'fr', 'es', 'ja', 'ru'] |
50 | | - else: |
51 | | - self.languages = ['en'] |
52 | | - if cutoff == None: |
53 | | - self.cutoff = [1, 10, 50] |
54 | | - else: |
55 | | - self.cutoff = cutoff |
56 | 43 | |
57 | | - if cutoff == None: |
58 | | - self.cum_cutoff = [10] |
59 | | - else: |
60 | | - self.cum_cutoff = cum_cutoff |
61 | | - self.kwargs = kwargs |
62 | | - |
63 | | - def __call__(self): |
64 | | - project = 'wiki' |
65 | | - for lang in self.languages: |
66 | | - self.rts = runtime_settings.init_environment(project, lang, self.args) |
67 | | - #TEMP FIX, REMOVE |
68 | | - #rts.dbname = 'enwiki' |
69 | | - self.rts.editors_dataset = 'editors_dataset' |
70 | | - |
71 | | - self.rts.dbname = '%s%s' % (lang, project) |
72 | | - for cum_cutoff in self.cum_cutoff: |
73 | | - for cutoff in self.cutoff: |
74 | | - generate_chart_data(self.rts, self.plugin, |
75 | | - time_unit=self.time_unit, |
76 | | - cutoff=cutoff, cum_cutoff=cum_cutoff, |
77 | | - **self.kwargs) |
78 | | - |
79 | | - |
80 | | -class Analyzer(consumers.BaseConsumer): |
81 | | - def __init__(self, rts, tasks, result, var): |
82 | | - super(Analyzer, self).__init__(rts, tasks, result) |
83 | | - self.var = var |
84 | | - |
85 | | - def run(self): |
86 | | - ''' |
87 | | - Generic loop function that loops over all the editors of a Wikipedia |
88 | | - project and then calls the plugin that does the actual mapping. |
89 | | - ''' |
90 | | - mongo = db.init_mongo_db(self.rts.dbname) |
91 | | - coll = mongo[self.rts.editors_dataset] |
92 | | - while True: |
93 | | - try: |
94 | | - task = self.tasks.get(block=False) |
95 | | - self.tasks.task_done() |
96 | | - if task == None: |
97 | | - self.result.put(self.var) |
98 | | - break |
99 | | - editor = coll.find_one({'editor': task.editor}) |
100 | | - |
101 | | - task.plugin(self.var, editor, dbname=self.rts.dbname) |
102 | | - self.result.put(True) |
103 | | - except Empty: |
104 | | - pass |
105 | | - |
106 | | -class Task: |
107 | | - def __init__(self, plugin, editor): |
108 | | - self.plugin = plugin |
109 | | - self.editor = editor |
110 | | - |
111 | | - |
112 | 44 | def reconstruct_observations(var): |
113 | 45 | ''' |
114 | 46 | When the Task queue is empty then the Variable instance is returned. However, |
— | — | @@ -158,8 +90,9 @@ |
159 | 91 | ''' |
160 | 92 | stopwatch = timer.Timer() |
161 | 93 | plugin = retrieve_plugin(func) |
| 94 | + available_plugins = inventory.available_analyses() |
162 | 95 | if not plugin: |
163 | | - raise 'Plugin function %s is unknown, please make sure that you specify an existing plugin function.' % func |
  | 96 | + raise exceptions.UnknownPluginError(func, available_plugins) |
164 | 97 | feedback(plugin, rts) |
165 | 98 | |
166 | 99 | obs = dict() |
— | — | @@ -183,9 +116,9 @@ |
184 | 117 | var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs) |
185 | 118 | |
186 | 119 | for editor in editors: |
187 | | - tasks.put(Task(plugin, editor)) |
| 120 | + tasks.put(analytics.Task(plugin, editor)) |
188 | 121 | |
189 | | - consumers = [Analyzer(rts, tasks, result, var) for |
| 122 | + consumers = [analytics.Analyzer(rts, tasks, result, var) for |
190 | 123 | x in xrange(rts.number_of_processes)] |
191 | 124 | |
192 | 125 | |
— | — | @@ -228,6 +161,7 @@ |
229 | 162 | ''' |
230 | 163 | Determine the first and final year for the observed data |
231 | 164 | ''' |
| 165 | + print dbname, collection, var |
232 | 166 | try: |
233 | 167 | max_year = db.run_query(dbname, collection, var, 'max') |
234 | 168 | max_year = max_year[var].year + 1 |
— | — | @@ -240,31 +174,31 @@ |
241 | 175 | |
242 | 176 | |
243 | 177 | def launcher(): |
244 | | - project, language, parser = manager.init_args_parser() |
245 | | - args = parser.parse_args(['django']) |
246 | | - rts = runtime_settings.init_environment('wiki', 'en', args) |
| 178 | +# project, language, parser = manage.init_args_parser() |
| 179 | +# args = parser.parse_args(['django']) |
| 180 | +# rts = runtime_settings.init_environment('wiki', 'en', args) |
247 | 181 | |
248 | 182 | #TEMP FIX, REMOVE |
249 | | - rts.dbname = 'enwiki' |
250 | | - rts.editors_dataset = 'editors_dataset' |
| 183 | +# rts.dbname = 'enwiki' |
| 184 | +# rts.editors_dataset = 'editors_dataset' |
251 | 185 | #END TEMP FIX |
252 | 186 | |
253 | | -# replicator = Replicator(rts, 'histogram_by_backward_cohort', time_unit='year') |
| 187 | +# replicator = analytics.Replicator('histogram_by_backward_cohort', time_unit='year') |
254 | 188 | # replicator() |
255 | | - replicator = Replicator(args, 'cohort_dataset_backward_bar', time_unit='year', format='wide', languages=True) |
| 189 | + replicator = analytics.Replicator('cohort_dataset_backward_bar', time_unit='year', format='wide', languages=True) |
256 | 190 | replicator() |
257 | 191 | |
258 | | -# generate_chart_data(rts, 'histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10) |
259 | | -# generate_chart_data(rts, 'edit_patterns', time_unit='year', cutoff=5) |
260 | | -# generate_chart_data(rts, 'total_number_of_new_wikipedians', time_unit='year') |
261 | | -# generate_chart_data(rts, 'total_number_of_articles', time_unit='year') |
262 | | -# generate_chart_data(rts, 'total_cumulative_edits', time_unit='year') |
263 | | -# generate_chart_data(rts, 'cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10) |
264 | | -# generate_chart_data(rts, 'cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide') |
265 | | -# generate_chart_data(rts, 'cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
266 | | -# generate_chart_data(rts, 'histogram_edits', time_unit='year', cutoff=0) |
267 | | -# generate_chart_data(rts, 'time_to_new_wikipedian', time_unit='year', cutoff=0) |
268 | | -# generate_chart_data(rts, 'new_editor_count', time_unit='month', cutoff=0) |
| 192 | +# generate_chart_data('histogram_by_backward_cohort', time_unit='year', cutoff=1, cum_cutoff=10) |
| 193 | +# generate_chart_data('edit_patterns', time_unit='year', cutoff=5) |
| 194 | +# generate_chart_data('total_number_of_new_wikipedians', time_unit='year') |
| 195 | +# generate_chart_data('total_number_of_articles', time_unit='year') |
| 196 | +# generate_chart_data('total_cumulative_edits', time_unit='year') |
| 197 | +# generate_chart_data('cohort_dataset_forward_histogram', time_unit='month', cutoff=1, cum_cutoff=10) |
| 198 | +# generate_chart_data('cohort_dataset_backward_bar', time_unit='year', cutoff=1, cum_cutoff=10, format='wide') |
| 199 | +# generate_chart_data('cohort_dataset_forward_bar', time_unit='year', cutoff=5, cum_cutoff=0, format='wide') |
| 200 | +# generate_chart_data('histogram_edits', time_unit='year', cutoff=0) |
| 201 | +# generate_chart_data('time_to_new_wikipedian', time_unit='year', cutoff=0) |
| 202 | +# generate_chart_data('new_editor_count', time_unit='month', cutoff=0) |
269 | 203 | # #available_analyses() |
270 | 204 | |
271 | 205 | |
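
Note: moving Replicator, Analyzer and Task out to classes/analytics.py breaks the import cycle between analyzer.py and manage.py, and the old string raise (itself a TypeError since Python 2.6) becomes a proper exception class. Both this call and the one in manage.py invoke available_analyses() without arguments, so the caller parameter checked by inventory's assert presumably has a default. The bare print added to the min/max-year helper reads as leftover debugging. A self-contained sketch of the new error path, with stand-ins for the toolkit's retrieve_plugin and classes/exceptions.py:

    class UnknownPluginError(Exception):
        def __init__(self, plugin, plugins):
            self.plugin, self.plugins = plugin, plugins

        def __str__(self):
            return 'Plugin %s is unknown. Please choose one of: %s' % (
                self.plugin, ', '.join(self.plugins))

    available = {'new_editor_count': lambda var, editor, **kw: var}

    def retrieve_plugin(func):
        return available.get(func)

    func = 'no_such_chart'
    plugin = retrieve_plugin(func)
    if not plugin:
        # pass the requested name, not the failed (None) lookup result
        raise UnknownPluginError(func, available.keys())
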
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -22,23 +22,23 @@ |
23 | 23 | import logging.handlers |
24 | 24 | import sys |
25 | 25 | import datetime |
26 | | -from argparse import ArgumentParser |
27 | | -from argparse import RawTextHelpFormatter |
28 | 26 | import ConfigParser |
| 27 | +from argparse import ArgumentParser, RawTextHelpFormatter |
29 | 28 | |
| 29 | +from classes import languages |
| 30 | +from classes import projects |
| 31 | +from classes import runtime_settings |
30 | 32 | from utils import file_utils |
31 | 33 | from utils import ordered_dict |
32 | 34 | from utils import log |
33 | 35 | from utils import timer |
34 | | -from classes import projects |
35 | | -from classes import languages |
36 | | -from classes import runtime_settings |
37 | 36 | from database import db |
38 | 37 | from etl import downloader |
39 | 38 | from etl import extracter |
40 | 39 | from etl import store |
41 | 40 | from etl import sort |
42 | 41 | from etl import transformer |
| 42 | +from analyses import analyzer |
43 | 43 | from analyses import inventory |
44 | 44 | |
45 | 45 | |
— | — | @@ -49,6 +49,148 @@ |
50 | 50 | return choices |
51 | 51 | |
52 | 52 | |
| 53 | +def init_args_parser(): |
| 54 | + ''' |
| 55 | + Entry point for parsing command line and launching the needed function(s). |
| 56 | + ''' |
| 57 | + language = languages.init() |
| 58 | + project = projects.init() |
| 59 | + pjc = projects.ProjectContainer() |
| 60 | + rts = runtime_settings.RunTimeSettings(project, language) |
| 61 | + |
| 62 | + #Init Argument Parser |
| 63 | + parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
| 64 | + subparsers = parser.add_subparsers(help='sub - command help') |
| 65 | + |
| 66 | + #SHOW LANGUAGES |
| 67 | + parser_languages = subparsers.add_parser('show_languages', |
| 68 | + help='Overview of all valid languages.') |
| 69 | + parser_languages.add_argument('-s', '--startswith', |
| 70 | + action='store', |
| 71 | + help='Enter the first letter of a language to see which languages are \ |
| 72 | + available.') |
| 73 | + parser_languages.set_defaults(func=language.show_languages, args=[project]) |
| 74 | + |
| 75 | + #CONFIG |
| 76 | + parser_config = subparsers.add_parser('config', |
| 77 | + help='The config sub command allows you set the data location of where \ |
| 78 | + to store files.') |
| 79 | + parser_config.set_defaults(func=config_launcher) |
| 80 | + parser_config.add_argument('-f', '--force', |
| 81 | + action='store_true', |
  | 82 | + help='Reconfigure Editor Toolkit (this will replace wiki.cfg)' |
| 83 | + |
| 84 | + #DOWNLOAD |
| 85 | + parser_download = subparsers.add_parser('download', |
| 86 | + help='The download sub command allows you to download a Wikipedia dump\ |
| 87 | + file.') |
| 88 | + parser_download.set_defaults(func=downloader_launcher) |
| 89 | + |
| 90 | + #EXTRACT |
| 91 | + parser_create = subparsers.add_parser('extract', |
  | 92 | + help='The extract sub command parses the XML chunk files, extracts the \ |
| 93 | + information and stores it in a MongoDB.') |
| 94 | + parser_create.set_defaults(func=extract_launcher) |
| 95 | + |
| 96 | + |
| 97 | + #SORT |
| 98 | + parser_sort = subparsers.add_parser('sort', |
| 99 | + help='By presorting the data, significant processing time reductions \ |
| 100 | + are achieved.') |
| 101 | + parser_sort.set_defaults(func=sort_launcher) |
| 102 | + |
| 103 | + #STORE |
| 104 | + parser_store = subparsers.add_parser('store', |
  | 105 | + help='The store sub command parses the XML chunk files, extracts the \ |
| 106 | + information and stores it in a MongoDB.') |
| 107 | + parser_store.set_defaults(func=store_launcher) |
| 108 | + |
| 109 | + #TRANSFORM |
| 110 | + parser_transform = subparsers.add_parser('transform', |
| 111 | + help='Transform the raw datatable to an enriched dataset that can be \ |
| 112 | + exported.') |
| 113 | + parser_transform.set_defaults(func=transformer_launcher) |
| 114 | + |
| 115 | + #DATASET |
| 116 | + parser_dataset = subparsers.add_parser('dataset', |
| 117 | + help='Create a dataset from the MongoDB and write it to a csv file.') |
| 118 | + parser_dataset.set_defaults(func=dataset_launcher) |
| 119 | + parser_dataset.add_argument('-c', '--charts', |
| 120 | + action='store', |
| 121 | + help='Should be a valid function name that matches one of the plugin functions', |
  | 122 | + default='new_editor_count') |
| 123 | + |
| 124 | + parser_dataset.add_argument('-k', '--keywords', |
| 125 | + action='store', |
| 126 | + help='Add additional keywords in the format keyword1=value1,keyword2=value2', |
| 127 | + default='') |
| 128 | + |
| 129 | + #ALL |
| 130 | + parser_all = subparsers.add_parser('all', |
| 131 | + help='The all sub command runs the download, split, store and dataset \ |
| 132 | + commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE \ |
| 133 | + CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
| 134 | + parser_all.set_defaults(func=all_launcher) |
| 135 | + parser_all.add_argument('-e', '--except', |
| 136 | + action='store', |
| 137 | + help='Should be a list of functions that are to be ignored when \ |
| 138 | + executing all.', |
| 139 | + default=[]) |
| 140 | + |
| 141 | + parser_all.add_argument('-n', '--new', |
| 142 | + action='store_true', |
| 143 | + help='This will delete all previous output and starts from scratch. \ |
| 144 | + Mostly useful for debugging purposes.', |
| 145 | + default=False) |
| 146 | + |
| 147 | + #DJANGO |
| 148 | + parser_django = subparsers.add_parser('django') |
| 149 | + parser_django.add_argument('-e', '--except', |
| 150 | + action='store', |
| 151 | + help='Should be a list of functions that are to be ignored when \ |
| 152 | + executing all.', |
| 153 | + default=[]) |
| 154 | + |
| 155 | + parser.add_argument('-l', '--language', |
| 156 | + action='store', |
| 157 | + help='Example of valid languages.', |
| 158 | + choices=project.supported_languages(), |
| 159 | + default=unicode(language.name) |
| 160 | + ) |
| 161 | + |
| 162 | + parser.add_argument('-p', '--project', |
| 163 | + action='store', |
| 164 | + help='Specify the Wikimedia project that you would like to download', |
| 165 | + choices=pjc.supported_projects(), |
| 166 | + default='wiki') |
| 167 | + |
| 168 | + parser.add_argument('-c', '--collection', |
| 169 | + action='store', |
| 170 | + help='Name of MongoDB collection', |
| 171 | + default='editors_raw') |
| 172 | + |
| 173 | + parser.add_argument('-o', '--location', |
| 174 | + action='store', |
| 175 | + help='Indicate where you want to store the downloaded file.', |
| 176 | + #default=settings.input_location) |
| 177 | + default=rts.input_location) |
| 178 | + |
| 179 | + parser.add_argument('-ns', '--namespace', |
| 180 | + action='store', |
| 181 | + help='A list of namespaces to include for analysis.', |
| 182 | + default='0') |
| 183 | + |
| 184 | + parser.add_argument('-f', '--file', |
| 185 | + action='store', |
| 186 | + choices=rts.file_choices, |
| 187 | + help='Indicate which dump you want to download. Valid choices are:\n \ |
| 188 | + %s' % ''.join([f + ',\n' for f in rts.file_choices]), |
| 189 | + default='stub-meta-history.xml.gz') |
| 190 | + |
| 191 | + |
| 192 | + return project, language, parser |
| 193 | + |
| 194 | + |
53 | 195 | def config_launcher(rts, logger): |
54 | 196 | ''' |
55 | 197 | Config launcher is used to reconfigure editor trends toolkit. |
— | — | @@ -178,8 +320,7 @@ |
179 | 321 | # collection=properties.collection) |
180 | 322 | transformer.transform_editors_single_launcher(rts) |
181 | 323 | stopwatch.elapsed() |
182 | | - log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, |
183 | | - event='finish') |
| 324 | + log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='finish') |
184 | 325 | |
185 | 326 | |
186 | 327 | def dataset_launcher(rts, logger): |
— | — | @@ -187,20 +328,14 @@ |
188 | 329 | stopwatch = timer.Timer() |
189 | 330 | log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start') |
190 | 331 | |
191 | | - #collection = '%s_%s' % (rts.collection, 'dataset') |
192 | | - for target in rts.targets: |
| 332 | + for chart in rts.charts: |
| 333 | + analyzer.generate_chart_data(rts, chart, **rts.keywords) |
193 | 334 | # write_message_to_log(logger, settings, |
194 | 335 | # message=None, |
195 | 336 | # verb='Exporting', |
196 | 337 | # target=target, |
197 | 338 | # dbname=properties.full_project, |
198 | 339 | # collection=properties.collection) |
199 | | - |
200 | | - analyzer.generate_chart_data(rts.dbname, |
201 | | - rts.editors_dataset, |
202 | | - rts.language.code, |
203 | | - target, |
204 | | - **rts.keywords) |
205 | 340 | stopwatch.elapsed() |
206 | 341 | log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='finish') |
207 | 342 | |
— | — | @@ -414,8 +549,6 @@ |
415 | 550 | default='stub-meta-history.xml.gz') |
416 | 551 | |
417 | 552 | |
418 | | - return project, language, parser |
419 | | - |
420 | 553 | def main(): |
421 | 554 | project, language, parser, = init_args_parser() |
422 | 555 | args = parser.parse_args() |
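
Note: init_args_parser now lives in manage.py and every subcommand dispatches through set_defaults(func=...). One caveat: --except parses into an attribute literally named 'except', a Python keyword, so it is only reachable via getattr(args, 'except') unless a dest= is supplied. A minimal, runnable sketch of the dispatch pattern (dataset_launcher here is a stub, not the real launcher):

    from argparse import ArgumentParser, RawTextHelpFormatter

    def dataset_launcher(args):
        print('exporting charts: %s' % args.charts)

    parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
    subparsers = parser.add_subparsers(help='sub-command help')

    parser_dataset = subparsers.add_parser('dataset',
        help='Create a dataset from MongoDB and write it to a csv file.')
    parser_dataset.add_argument('-c', '--charts', default='new_editor_count')
    parser_dataset.set_defaults(func=dataset_launcher)

    args = parser.parse_args(['dataset', '-c', 'new_editor_count'])
    args.func(args)   # dispatch to the launcher the subcommand selected
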
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -86,9 +86,13 @@ |
87 | 87 | location = os.path.join(rts.input_location, rts.language.code, rts.project.name) |
88 | 88 | fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding) |
89 | 89 | print 'Storing article titles...' |
| 90 | + print location |
90 | 91 | for line in fh: |
91 | 92 | line = line.strip() |
92 | | - id, title = line.split('\t') |
| 93 | + try: |
| 94 | + id, title = line.split('\t') |
| 95 | + except ValueError: |
| 96 | + print line.encode('utf-8') |
93 | 97 | collection.insert({'id':id, 'title':title}) |
94 | 98 | fh.close() |
95 | 99 | print 'Done...' |
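
Note: the try/except keeps one malformed articles.csv row from aborting the whole import, but as committed the insert still runs after a ValueError, reusing id/title from the previous iteration (or hitting a NameError on the very first row). A continue in the except branch would close that hole; a self-contained sketch:

    rows = ['12\tMain Page', 'a malformed row without a tab', '34\tSandbox']
    parsed = []
    for line in rows:
        line = line.strip()
        try:
            id, title = line.split('\t')
        except ValueError:
            print('skipping: %s' % line)
            continue   # otherwise the previous row's id/title are reused
        parsed.append({'id': id, 'title': title})
    print(parsed)   # two rows survive; the malformed one is skipped
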
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -421,7 +421,7 @@ |
422 | 422 | |
423 | 423 | |
424 | 424 | def parse_xml(fh): |
425 | | - context = iterparse(fh, events=('start', 'end')) |
| 425 | + context = iterparse(fh, events=('end',)) |
426 | 426 | context = iter(context) |
427 | 427 | |
428 | 428 | article = {} |
— | — | @@ -439,11 +439,14 @@ |
440 | 440 | id = True |
441 | 441 | elif event == 'end' and elem.tag == '%s%s' % (namespace, 'page'): |
442 | 442 | yield article |
| 443 | + elem.clear() |
443 | 444 | for elem in article.values(): |
444 | 445 | elem.clear() |
445 | 446 | article = {} |
446 | 447 | article['revisions'] = [] |
447 | 448 | id = False |
| 449 | + elif event == 'end': |
| 450 | + elem.clear() |
448 | 451 | |
449 | 452 | |
450 | 453 | def stream_raw_xml(input_queue, storage, id, function, dataset): |
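
Note: dropping the 'start' events halves the iterparse callback volume, and the added elem.clear() calls keep the element tree bounded: iterparse leaves every parsed element attached to the root until it is cleared, so without this the tree grows with the size of the dump. A minimal sketch of the pattern (namespaces omitted for brevity):

    from io import BytesIO
    from xml.etree.ElementTree import iterparse

    xml = b'<pages><page><id>1</id></page><page><id>2</id></page></pages>'
    for event, elem in iterparse(BytesIO(xml), events=('end',)):
        if elem.tag == 'page':
            print(elem.findtext('id'))
            elem.clear()   # free the subtree once the page is handled
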
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -345,6 +345,7 @@ |
346 | 346 | |
347 | 347 | |
348 | 348 | def prepare(output): |
| 349 | + res = file_utils.delete_file(output, 'articles.csv') |
349 | 350 | res = file_utils.delete_file(output, None, directory=True) |
350 | 351 | if res: |
351 | 352 | res = file_utils.create_directory(output) |
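
Note: prepare() now deletes any stale articles.csv explicitly as well, so a rerun starts from a clean file. A hedged sketch of the delete-if-present idiom; the real file_utils.delete_file signature is assumed from its call sites here, and only the plain-file variant is shown:

    import errno
    import os

    def delete_file(location, filename):
        '''Remove a file if it exists; stay silent when it is already gone.'''
        try:
            os.remove(os.path.join(location, filename))
            return True
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            return False
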
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -191,7 +191,9 @@ |
192 | 192 | |
193 | 193 | |
194 | 194 | def transform_editors_single_launcher(rts): |
| 195 | + print rts.dbname, rts.editors_raw |
195 | 196 | ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
| 197 | + print len(ids) |
196 | 198 | input_db, output_db = setup_database(rts) |
197 | 199 | pbar = progressbar.ProgressBar(maxval=len(ids)).start() |
198 | 200 | for x, id in enumerate(ids): |
Index: trunk/tools/editor_trends/classes/exceptions.py |
— | — | @@ -71,6 +71,16 @@ |
72 | 72 | return 'Currently, chart type %s is not supported. Please choose one of \ |
73 | 73 | the following charts: %s' % (self.chart, self.charts) |
74 | 74 | |
| 75 | +class UnknownPluginError(Error): |
| 76 | + def __init__(self, plugin, plugins): |
| 77 | + self.plugin = plugin |
| 78 | + self.plugins = plugins |
| 79 | + |
| 80 | + def __str__(self): |
  | 81 | + return 'Plugin %s is an unknown plugin. Please choose one of \ |
| 82 | + the following plugins: %s' % (self.plugin, self.plugins) |
| 83 | + |
| 84 | + |
75 | 85 | class NotYetImplementedError(Error): |
76 | 86 | def __init__(self, func): |
77 | 87 | self.func = func |
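
Note: one formatting pitfall shared by these messages (and by the help strings in manage.py): a backslash continuation inside a string literal keeps the next line's leading indentation inside the text, so the rendered message contains a long run of spaces. Implicit string concatenation avoids that; a runnable comparison:

    # Backslash continuation embeds the next line's indentation:
    a = 'Please choose one of \
        the following: x, y'
    # Implicit concatenation keeps the literal clean:
    b = ('Please choose one of '
         'the following: x, y')
    print(repr(a))
    print(repr(b))
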
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -32,6 +32,7 @@ |
33 | 33 | from settings import Settings |
34 | 34 | from utils import text_utils |
35 | 35 | from utils import ordered_dict as odict |
| 36 | +from analyses import inventory |
36 | 37 | import languages |
37 | 38 | import projects |
38 | 39 | |
— | — | @@ -59,7 +60,7 @@ |
60 | 61 | self.input_location != None else self.get_value('location') |
61 | 62 | self.project = self.update_project_settings() |
62 | 63 | self.language = self.update_language_settings() |
63 | | - self.targets = self.split_keywords(self.get_value('charts')) |
| 64 | + self.charts = self.determine_chart(self.get_value('charts')) |
64 | 65 | self.keywords = self.split_keywords(self.get_value('keywords')) |
65 | 66 | self.function = self.get_value('func') |
66 | 67 | |
— | — | @@ -72,8 +73,6 @@ |
73 | 74 | |
74 | 75 | self.dataset = os.path.join(self.dataset_location, |
75 | 76 | self.project.name) |
76 | | - self.charts = os.path.join(self.chart_location, |
77 | | - self.project.name) |
78 | 77 | |
79 | 78 | self.txt = os.path.join(self.location, 'txt') |
80 | 79 | self.sorted = os.path.join(self.location, 'sorted') |
— | — | @@ -81,8 +80,7 @@ |
82 | 81 | self.directories = [self.location, |
83 | 82 | self.txt, |
84 | 83 | self.sorted, |
85 | | - self.dataset, |
86 | | - self.charts] |
| 84 | + self.dataset] |
87 | 85 | self.dump_filename = self.generate_wikidump_filename() |
88 | 86 | self.dump_relative_path = self.set_dump_path() |
89 | 87 | self.dump_absolute_path = self.set_dump_path(absolute=True) |
— | — | @@ -121,10 +119,21 @@ |
122 | 120 | except ValueError: |
123 | 121 | pass |
124 | 122 | d[key] = value |
125 | | - else: |
126 | | - return keywords |
127 | 123 | return d |
128 | 124 | |
| 125 | + def determine_chart(self, chart): |
| 126 | + requested_charts = [] |
| 127 | + if chart != None: |
| 128 | + charts = chart.split(',') |
| 129 | + available_charts = inventory.available_analyses() |
| 130 | + for chart in charts: |
| 131 | + if chart not in available_charts: |
  | 132 | + raise exceptions.UnknownChartError(chart, available_charts) |
| 133 | + sys.exit(-1) |
| 134 | + else: |
| 135 | + requested_charts.append(chart) |
| 136 | + return requested_charts |
| 137 | + |
129 | 138 | def get_project_location(self): |
130 | 139 | ''' |
131 | 140 | Construct the full project location |
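
Note: two review points on determine_chart as committed: the sys.exit(-1) after the raise can never execute, and the loop variable chart shadows the parameter of the same name; runtime_settings.py also needs classes.exceptions imported if it is not already. A corrected, self-contained sketch of the apparent intent, with ValueError standing in for exceptions.UnknownChartError:

    def determine_chart(charts_value, available_charts):
        '''Split a comma-separated chart spec and validate every name.'''
        requested = []
        if charts_value is not None:
            for chart in charts_value.split(','):
                if chart not in available_charts:
                    raise ValueError('Unknown chart %r; choose from: %s'
                                     % (chart, ', '.join(available_charts)))
                requested.append(chart)
        return requested

    print(determine_chart('new_editor_count', ['new_editor_count']))
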
Index: trunk/tools/editor_trends/classes/analytics.py |
— | — | @@ -0,0 +1,106 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author__email = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-03-28' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +from Queue import Empty |
| 23 | + |
| 24 | +if '..' not in sys.path: |
| 25 | + sys.path.append('..') |
| 26 | + |
| 27 | +from classes import consumers |
| 28 | +from database import db |
| 29 | + |
| 30 | +class Replicator: |
| 31 | + def __init__(self, plugin, time_unit, cutoff=None, cum_cutoff=None, **kwargs): |
| 32 | + #this is an ugly hack to prevent a circular import problem |
| 33 | + #this needs a better fix. |
| 34 | + import manage |
| 35 | + |
| 36 | + project, language, parser = manage.init_args_parser() |
| 37 | + self.project = project |
| 38 | + self.language = language |
| 39 | + self.args = parser.parse_args(['django']) |
| 40 | + self.plugin = plugin |
| 41 | + self.time_unit = time_unit |
| 42 | + languages = kwargs.pop('languages', False) |
| 43 | + if languages: |
| 44 | + self.languages = ['de', 'fr', 'es', 'ja', 'ru'] |
| 45 | + else: |
| 46 | + self.languages = ['en'] |
| 47 | + if cutoff == None: |
| 48 | + self.cutoff = [1, 10, 50] |
| 49 | + else: |
| 50 | + self.cutoff = cutoff |
| 51 | + |
  | 52 | + if cum_cutoff == None: |
| 53 | + self.cum_cutoff = [10] |
| 54 | + else: |
| 55 | + self.cum_cutoff = cum_cutoff |
| 56 | + self.kwargs = kwargs |
| 57 | + |
| 58 | + def __call__(self): |
| 59 | + project = 'wiki' |
| 60 | + |
| 61 | + #rts = runtime_settings.init_environment('wiki', 'en', args) |
| 62 | + |
| 63 | + for lang in self.languages: |
| 64 | + self.rts = runtime_settings.init_environment(project, lang, self.args) |
| 65 | + #TEMP FIX, REMOVE |
| 66 | + #rts.dbname = 'enwiki' |
| 67 | + self.rts.editors_dataset = 'editors_dataset' |
| 68 | + |
| 69 | + self.rts.dbname = '%s%s' % (lang, project) |
| 70 | + for cum_cutoff in self.cum_cutoff: |
| 71 | + for cutoff in self.cutoff: |
| 72 | + generate_chart_data(self.rts, self.plugin, |
| 73 | + time_unit=self.time_unit, |
| 74 | + cutoff=cutoff, cum_cutoff=cum_cutoff, |
| 75 | + **self.kwargs) |
| 76 | + |
| 77 | + |
| 78 | +class Analyzer(consumers.BaseConsumer): |
| 79 | + def __init__(self, rts, tasks, result, var): |
| 80 | + super(Analyzer, self).__init__(rts, tasks, result) |
| 81 | + self.var = var |
| 82 | + |
| 83 | + def run(self): |
| 84 | + ''' |
| 85 | + Generic loop function that loops over all the editors of a Wikipedia |
| 86 | + project and then calls the plugin that does the actual mapping. |
| 87 | + ''' |
| 88 | + mongo = db.init_mongo_db(self.rts.dbname) |
| 89 | + coll = mongo[self.rts.editors_dataset] |
| 90 | + while True: |
| 91 | + try: |
| 92 | + task = self.tasks.get(block=False) |
| 93 | + self.tasks.task_done() |
| 94 | + if task == None: |
| 95 | + self.result.put(self.var) |
| 96 | + break |
| 97 | + editor = coll.find_one({'editor': task.editor}) |
| 98 | + |
| 99 | + task.plugin(self.var, editor, dbname=self.rts.dbname) |
| 100 | + self.result.put(True) |
| 101 | + except Empty: |
| 102 | + pass |
| 103 | + |
| 104 | +class Task: |
| 105 | + def __init__(self, plugin, editor): |
| 106 | + self.plugin = plugin |
| 107 | + self.editor = editor |
Property changes on: trunk/tools/editor_trends/classes/analytics.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 108 | + native |
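
Note: as committed, Replicator.__call__ references runtime_settings and generate_chart_data, but analytics.py imports neither (only sys, Queue.Empty, consumers and db, plus the deferred import of manage), so invoking a Replicator stops on a NameError until those imports join the module. Separately, Analyzer.run() polls with get(block=False) and an empty except Empty: pass, which spins the CPU while the queue is empty; a blocking get with a timeout would be gentler. A self-contained sketch of the sentinel-terminated consumer loop it implements, with the single-process Queue standing in for the toolkit's JoinableQueue:

    from Queue import Empty, Queue

    tasks, results = Queue(), Queue()
    for editor in ['a', 'b']:
        tasks.put(editor)
    tasks.put(None)   # sentinel: no more work

    while True:
        try:
            task = tasks.get(block=False)
            tasks.task_done()
            if task is None:
                results.put('done')
                break
            results.put('processed %s' % task)
        except Empty:
            pass
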
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -80,9 +80,9 @@ |
81 | 81 | mongo = init_mongo_db(dbname) |
82 | 82 | collection = mongo[collection] |
83 | 83 | if qualifier == 'min': |
84 | | - return collection.find().sort(var, pymongo.ASCENDING).limit(1)[0] |
| 84 | + return collection.find({var : {'$ne' : False}}).sort(var, pymongo.ASCENDING).limit(1)[0] |
85 | 85 | elif qualifier == 'max': |
86 | | - return collection.find().sort(var, pymongo.DESCENDING).limit(1)[0] |
| 86 | + return collection.find({var : {'$ne' : False}}).sort(var, pymongo.DESCENDING).limit(1)[0] |
87 | 87 | else: |
88 | 88 | return collection.find({var: 1}) |
89 | 89 | |
— | — | @@ -120,9 +120,6 @@ |
121 | 121 | return data |
122 | 122 | |
123 | 123 | |
124 | | -def retrieve_max_value(dbname, collection, var): |
125 | | - pass |
126 | | - |
127 | 124 | def retrieve_distinct_keys(dbname, collection, field, force_new=False): |
128 | 125 | #mongo = init_mongo_db(dbname) |
129 | 126 | #editors = mongo[collection] |
— | — | @@ -132,8 +129,8 @@ |
133 | 130 | < 4mb just do a distinct query, index > 4mb do a map reduce. |
134 | 131 | ''' |
135 | 132 | if force_new == False and file_utils.check_file_exists(settings.binary_location, |
136 | | - '%s_%s.bin' % (dbname, field)): |
137 | | - ids = file_utils.load_object(settings.binary_location, '%s_%s.bin' % (dbname, field)) |
| 133 | + '%s_%s_%s.bin' % (dbname, collection, field)): |
| 134 | + ids = file_utils.load_object(settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field)) |
138 | 135 | else: |
139 | 136 | mongo = init_mongo_db(dbname) |
140 | 137 | editors = mongo[collection] |
— | — | @@ -145,7 +142,7 @@ |
146 | 143 | #params['size'] = 'size' |
147 | 144 | #size = editors.find_one({'size': 1}) |
148 | 145 | ids = retrieve_distinct_keys_mapreduce(editors, field) |
149 | | - file_utils.store_object(ids, settings.binary_location, '%s_%s.bin' % (dbname, field)) |
| 146 | + file_utils.store_object(ids, settings.binary_location, '%s_%s_%s.bin' % (dbname, collection, field)) |
150 | 147 | return ids |
151 | 148 | |
152 | 149 | |
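
Note: the $ne filter matters because BSON's type ordering sorts Booleans before Dates, so an unfiltered ascending sort on a field like 'new_wikipedian' hands back a False placeholder as the 'minimum'. The cache-file rename from dbname_field.bin to dbname_collection_field.bin likewise stops two collections in the same database from sharing one distinct-keys cache. A hedged demo of the filter, written against the modern pymongo API rather than the toolkit's vintage (needs a local mongod; database and collection names are made up):

    import datetime

    import pymongo

    coll = pymongo.MongoClient()['demo_enwiki']['editors_dataset']
    coll.drop()
    coll.insert_one({'editor': 'a', 'new_wikipedian': False})
    coll.insert_one({'editor': 'b',
                     'new_wikipedian': datetime.datetime(2009, 5, 1)})

    first = coll.find({'new_wikipedian': {'$ne': False}}) \
                .sort('new_wikipedian', pymongo.ASCENDING).limit(1)[0]
    print(first['editor'])   # 'b'; without the filter it would be 'a'
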