Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -26,7 +26,7 @@ |
27 | 27 | from argparse import RawTextHelpFormatter |
28 | 28 | import ConfigParser |
29 | 29 | |
30 | | -import configuration |
| 30 | +#import configuration |
31 | 31 | from utils import file_utils |
32 | 32 | from utils import ordered_dict |
33 | 33 | from utils import log |
— | — | @@ -40,7 +40,7 @@ |
41 | 41 | from etl import store |
42 | 42 | from etl import sort |
43 | 43 | from etl import transformer |
44 | | -from analyses import analyzer |
| 44 | +from analyses import analyzer, inventory |
45 | 45 | |
46 | 46 | |
47 | 47 | def show_choices(settings, attr): |
— | — | @@ -50,8 +50,7 @@ |
51 | 51 | return choices |
52 | 52 | |
53 | 53 | |
54 | | - |
55 | | -def config_launcher(properties, settings, logger): |
| 54 | +def config_launcher(properties, logger): |
56 | 55 | ''' |
57 | 56 | Config launcher is used to reconfigure the editor trends toolkit. |
58 | 57 | ''' |
— | — | @@ -98,20 +97,20 @@ |
99 | 98 | |
100 | 99 | |
101 | 100 | |
102 | | -def downloader_launcher(properties, settings, logger): |
| 101 | +def downloader_launcher(properties, logger): |
103 | 102 | ''' |
104 | 103 | This launcher calls the dump downloader to download a Wikimedia dump file. |
105 | 104 | ''' |
106 | 105 | print 'Start downloading' |
107 | 106 | stopwatch = timer.Timer() |
108 | 107 | log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='start') |
109 | | - res = downloader.launcher(properties, settings, logger) |
| 108 | + res = downloader.launcher(properties, logger) |
110 | 109 | stopwatch.elapsed() |
111 | 110 | log.log_to_mongo(properties, 'dataset', 'download', stopwatch, event='finish') |
112 | 111 | return res |
113 | 112 | |
114 | 113 | |
115 | | -def extract_launcher(properties, settings, logger): |
| 114 | +def extract_launcher(properties, logger): |
116 | 115 | ''' |
117 | 116 | The extract launcher is used to extract the required variables from a dump |
118 | 117 | file. If the zip file is a known archive then it will first launch the |
— | — | @@ -125,34 +124,34 @@ |
126 | 125 | log.log_to_mongo(properties, 'dataset', 'extract', stopwatch, event='finish') |
127 | 126 | |
128 | 127 | |
129 | | -def sort_launcher(properties, settings, logger): |
| 128 | +def sort_launcher(rts, logger): |
130 | 129 | ''' |
131 | 130 | After the extracter has finished, the created output files need to be |
132 | 131 | sorted. This function takes care of that. |
133 | 132 | ''' |
134 | 133 | print 'Start sorting data' |
135 | 134 | stopwatch = timer.Timer() |
136 | | - log.log_to_mongo(properties, 'dataset', 'sort', stopwatch, event='start') |
| 135 | + log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='start') |
137 | 136 | # write_message_to_log(logger, settings, |
138 | 137 | # message=None, |
139 | 138 | # verb=None, |
140 | 139 | # location=properties.location, |
141 | 140 | # input=properties.txt, |
142 | 141 | # output=properties.sorted) |
143 | | - sort.mergesort_launcher(properties.txt, properties.sorted) |
| 142 | + sort.launcher(rts) |
144 | 143 | stopwatch.elapsed() |
145 | | - log.log_to_mongo(properties, 'dataset', 'sort', stopwatch, event='finish') |
| 144 | + log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='finish') |
146 | 145 | |
147 | 146 | |
148 | | -def store_launcher(properties, settings, logger): |
| 147 | +def store_launcher(rts, logger): |
149 | 148 | ''' |
150 | 149 | The data is ready to be stored once sorting has completed. This |
151 | 150 | function starts storing data in MongoDB. |
152 | 151 | ''' |
153 | 152 | print 'Start storing data in MongoDB' |
154 | 153 | stopwatch = timer.Timer() |
155 | | - log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='start') |
156 | | - db.cleanup_database(properties.dbname, logger) |
| 154 | + log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='start') |
| 155 | + db.cleanup_database(rts.dbname, logger) |
157 | 156 | # write_message_to_log(logger, settings, |
158 | 157 | # message=None, |
159 | 158 | # verb='Storing', |
— | — | @@ -163,36 +162,34 @@ |
164 | 163 | # collection=properties.collection) |
165 | 164 | # for key in properties: |
166 | 165 | # print key, getattr(properties, key) |
167 | | - store.launcher(properties.sorted, properties.dbname, properties.collection) |
168 | | - |
| 166 | + store.launcher(rts) |
169 | 167 | stopwatch.elapsed() |
170 | | - log.log_to_mongo(properties, 'dataset', 'store', stopwatch, event='finish') |
| 168 | + log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='finish') |
171 | 169 | |
172 | 170 | |
173 | | -def transformer_launcher(properties, settings, logger): |
| 171 | +def transformer_launcher(rts, logger): |
174 | 172 | print 'Start transforming dataset' |
175 | 173 | stopwatch = timer.Timer() |
176 | | - log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, event='start') |
177 | | - db.cleanup_database(properties.dbname, logger, 'dataset') |
| 174 | + log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='start') |
| 175 | + db.cleanup_database(rts.dbname, logger, 'dataset') |
178 | 176 | # write_message_to_log(logger, settings, |
179 | 177 | # message=None, |
180 | 178 | # verb='Transforming', |
181 | 179 | # project=properties.project, |
182 | 180 | # collection=properties.collection) |
183 | | - transformer.transform_editors_single_launcher(properties.dbname, |
184 | | - properties.collection) |
| 181 | + transformer.transform_editors_single_launcher(rts) |
185 | 182 | stopwatch.elapsed() |
186 | | - log.log_to_mongo(properties, 'dataset', 'transform', stopwatch, |
| 183 | + log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, |
187 | 184 | event='finish') |
188 | 185 | |
189 | 186 | |
190 | | -def dataset_launcher(properties, settings, logger): |
| 187 | +def dataset_launcher(rts, logger): |
191 | 188 | print 'Start exporting dataset' |
192 | 189 | stopwatch = timer.Timer() |
193 | | - log.log_to_mongo(properties, 'dataset', 'export', stopwatch, event='start') |
| 190 | + log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start') |
194 | 191 | |
195 | | - collection = '%s_%s' % (properties.collection, 'dataset') |
196 | | - for target in properties.targets: |
| 192 | + #collection = '%s_%s' % (rts.collection, 'dataset') |
| 193 | + for target in rts.targets: |
197 | 194 | # write_message_to_log(logger, settings, |
198 | 195 | # message=None, |
199 | 196 | # verb='Exporting', |
— | — | @@ -200,16 +197,16 @@ |
201 | 198 | # dbname=properties.full_project, |
202 | 199 | # collection=properties.collection) |
203 | 200 | |
204 | | - analyzer.generate_chart_data(properties.dbname, |
205 | | - collection, |
206 | | - properties.language.code, |
| 201 | + analyzer.generate_chart_data(rts.dbname, |
| 202 | + rts.editors_dataset, |
| 203 | + rts.language.code, |
207 | 204 | target, |
208 | | - **properties.keywords) |
| 205 | + **rts.keywords) |
209 | 206 | stopwatch.elapsed() |
210 | | - log.log_to_mongo(properties, 'dataset', 'export', stopwatch, event='finish') |
| 207 | + log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='finish') |
211 | 208 | |
212 | 209 | |
213 | | -def cleanup(properties, settings, logger): |
| 210 | +def cleanup(rts, logger): |
214 | | - directories = properties.directories[1:] |
| 211 | + directories = rts.directories[1:] |
215 | 212 | for directory in directories: |
216 | | - write_message_to_log(logger, setting, |
| 213 | + write_message_to_log(logger, rts, |
— | — | @@ -232,7 +229,7 @@ |
233 | | - file_utils.delete_file(settings.binary_location, filename) |
| 230 | + file_utils.delete_file(rts.binary_location, filename) |
234 | 231 | |
235 | 232 | |
236 | | -def all_launcher(properties, settings, logger): |
| 233 | +def all_launcher(properties, logger): |
237 | 234 | print 'The entire data processing chain has been called, this will take a \ |
238 | 235 | couple of hours (at least) to complete.' |
239 | 236 | stopwatch = timer.Timer() |
— | — | @@ -258,7 +255,7 @@ |
259 | 256 | for function, callname in functions.iteritems(): |
260 | 257 | if callname not in properties.ignore: |
261 | 258 | print 'Starting %s' % function.func_name |
262 | | - res = function(properties, settings, logger) |
| 259 | + res = function(properties, logger) |
263 | 260 | if res == False: |
264 | 261 | sys.exit(False) |
265 | 262 | elif res == None: |
— | — | @@ -284,11 +281,11 @@ |
285 | 282 | ''' |
286 | 283 | Entry point for parsing the command line and launching the needed function(s). |
287 | 284 | ''' |
288 | | - settings = configuration.Settings() |
| 285 | + #settings = configuration.Settings() |
289 | 286 | language = languages.init() |
290 | 287 | project = projects.init() |
291 | 288 | pjc = projects.ProjectContainer() |
292 | | - rts = runtime_settings.RunTimeSettings(project, language, settings) |
| 289 | + rts = runtime_settings.RunTimeSettings(project, language) |
293 | 290 | |
294 | 291 | #Init Argument Parser |
295 | 292 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
— | — | @@ -301,7 +298,7 @@ |
302 | 299 | action='store', |
303 | 300 | help='Enter the first letter of a language to see which languages are \ |
304 | 301 | available.') |
305 | | - parser_languages.set_defaults(func=language.show_languages, args=[settings, project]) |
| 302 | + parser_languages.set_defaults(func=language.show_languages, args=[project]) |
306 | 303 | |
307 | 304 | #CONFIG |
308 | 305 | parser_config = subparsers.add_parser('config', |
— | — | @@ -350,7 +347,7 @@ |
351 | 348 | parser_dataset.add_argument('-c', '--charts', |
352 | 349 | action='store', |
353 | 350 | help='Should be a valid function name that matches one of the plugin functions', |
354 | | - default=analyzer.available_analyses()['new_editor_count']) |
| 351 | + default=inventory.available_analyses()['new_editor_count']) |
355 | 352 | |
356 | 353 | parser_dataset.add_argument('-k', '--keywords', |
357 | 354 | action='store', |
— | — | @@ -399,12 +396,13 @@ |
400 | 397 | parser.add_argument('-c', '--collection', |
401 | 398 | action='store', |
402 | 399 | help='Name of MongoDB collection', |
403 | | - default='editors') |
| 400 | + default='editors_raw') |
404 | 401 | |
405 | 402 | parser.add_argument('-o', '--location', |
406 | 403 | action='store', |
407 | 404 | help='Indicate where you want to store the downloaded file.', |
408 | | - default=settings.input_location) |
| 405 | + #default=settings.input_location) |
| 406 | + default=rts.input_location) |
409 | 407 | |
410 | 408 | parser.add_argument('-ns', '--namespace', |
411 | 409 | action='store', |
— | — | @@ -413,41 +411,41 @@ |
414 | 412 | |
415 | 413 | parser.add_argument('-f', '--file', |
416 | 414 | action='store', |
417 | | - choices=settings.file_choices, |
| 415 | + choices=rts.file_choices, |
418 | 416 | help='Indicate which dump you want to download. Valid choices are:\n \ |
419 | | - %s' % ''.join([f + ',\n' for f in settings.file_choices]), |
| 417 | + %s' % ''.join([f + ',\n' for f in rts.file_choices]), |
420 | 418 | default='stub-meta-history.xml.gz') |
421 | 419 | |
422 | 420 | |
423 | | - return project, language, parser, settings |
| 421 | + return project, language, parser |
424 | 422 | |
425 | 423 | def main(): |
426 | | - project, language, parser, settings = init_args_parser() |
| 424 | + project, language, parser = init_args_parser() |
427 | 425 | args = parser.parse_args() |
428 | | - properties = runtime_settings.RunTimeSettings(project, language, settings, args) |
| 426 | + rts = runtime_settings.RunTimeSettings(project, language, args) |
429 | 427 | #initialize logger |
430 | 428 | logger = logging.getLogger('manager') |
431 | 429 | logger.setLevel(logging.DEBUG) |
432 | 430 | |
433 | 431 | # Add the log message handler to the logger |
434 | 432 | today = datetime.datetime.today() |
435 | | - log_filename = os.path.join(settings.log_location, '%s%s_%s-%s-%s.log' \ |
436 | | - % (properties.language.code, properties.project.name, |
| 433 | + log_filename = os.path.join(rts.log_location, '%s%s_%s-%s-%s.log' \ |
| 434 | + % (rts.language.code, rts.project.name, |
437 | 435 | today.day, today.month, today.year)) |
438 | 436 | handler = logging.handlers.RotatingFileHandler(log_filename, |
439 | 437 | maxBytes=1024 * 1024, |
440 | 438 | backupCount=3) |
441 | 439 | |
442 | 440 | logger.addHandler(handler) |
443 | | - logger.debug('Chosen language: \t%s' % properties.language) |
| 441 | + logger.debug('Chosen language: \t%s' % rts.language) |
444 | 442 | |
445 | 443 | #start manager |
446 | 444 | #detect_python_version(logger) |
447 | 445 | about_statement() |
448 | 446 | #config.create_configuration(settings, args) |
449 | 447 | |
450 | | - properties.show_settings() |
451 | | - args.func(properties, settings, logger) |
| 448 | + rts.show_settings() |
| 449 | + args.func(rts, logger) |
452 | 450 | |
453 | 451 | |
454 | 452 | if __name__ == '__main__': |
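The recurring change in manage.py is the calling convention: launchers used to take (properties, settings, logger), where settings was a global configuration.Settings() instance; RunTimeSettings now absorbs those attributes so launchers take (rts, logger). A minimal sketch of the pattern, with illustrative attribute names only (the real runtime_settings.RunTimeSettings carries many more fields):

    import logging

    class RunTimeSettings(object):
        # Stand-in for runtime_settings.RunTimeSettings: one object merging
        # project/language choices with what used to live in Settings.
        def __init__(self, project, language):
            self.project = project
            self.language = language
            self.dbname = '%s%s' % (language, project)   # e.g. 'enwiki'
            self.input_location = '/tmp/wikimedia'       # was settings.input_location

    def sort_launcher(rts, logger):
        # New-style launcher: everything it needs hangs off rts.
        logger.info('sorting data for %s' % rts.dbname)

    logging.basicConfig(level=logging.INFO)
    sort_launcher(RunTimeSettings('wiki', 'en'), logging.getLogger('manager'))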
Index: trunk/tools/editor_trends/analyses/community_graph.py |
— | — | @@ -1,62 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-01-10' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -sys.path.append('..') |
23 | | - |
24 | | -import configuration |
25 | | -settings = configuration.Settings() |
26 | | - |
27 | | -from database import db |
28 | | -from utils import file_utils |
29 | | - |
30 | | -try: |
31 | | - import psyco |
32 | | - psyco.full() |
33 | | -except ImportError: |
34 | | - pass |
35 | | - |
36 | | -def create_articles_set(edits): |
37 | | - s = set() |
38 | | - years = edits.keys() |
39 | | - for year in years: |
40 | | - for edit in edits[year]: |
41 | | - s.add(edit['article']) |
42 | | - return s |
43 | | - |
44 | | - |
45 | | -def create_edgelist(project, collection): |
46 | | - ids = db.retrieve_distinct_keys(project, collection, 'editor') |
47 | | - conn = db.init_mongo_db(project) |
48 | | - ids.sort() |
49 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding) |
50 | | - for i in ids: |
51 | | - author_i = conn[collection].find_one({'editor': i}) |
52 | | - article_i = create_articles_set(author_i['edits']) |
53 | | - for j in ids: |
54 | | - if i > j: |
55 | | - author_j = conn[collection].find_one({'editor': j}) |
56 | | - article_j = create_articles_set(author_j['edits']) |
57 | | - common = article_i.intersection(article_j) |
58 | | - if len(common) > 0: |
59 | | - file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True) |
60 | | - fh.close() |
61 | | - |
62 | | -if __name__ == '__main__': |
63 | | - create_edgelist('enwiki', 'editors') |
Index: trunk/tools/editor_trends/analyses/dataset.py |
— | — | @@ -1,473 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-01-14' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import calendar |
22 | | -import datetime |
23 | | -import time |
24 | | -import math |
25 | | -import operator |
26 | | -import sys |
27 | | -from pymongo.son_manipulator import SONManipulator |
28 | | -from multiprocessing import Lock |
29 | | - |
30 | | - |
31 | | -sys.path.append('..') |
32 | | -import configuration |
33 | | -settings = configuration.Settings() |
34 | | - |
35 | | -from utils import file_utils |
36 | | -from utils import data_converter |
37 | | -from database import db |
38 | | -import json_encoders |
39 | | - |
40 | | -class Transform(SONManipulator): |
41 | | - ''' |
42 | | - This encoder transforms a Dataset to a MongoDB bson document. |
43 | | - To use this encoder initalize a mongo database instance and then add: |
44 | | - mongo.add_son_manipulator(Transform()) |
45 | | - ''' |
46 | | - def transform_incoming(self, son, collection): |
47 | | - for (key, ds) in son.items(): |
48 | | - son[key] = {} |
49 | | - for x, var in enumerate(ds): |
50 | | - if isinstance(var, Variable): |
51 | | - son[key][var.name] = var.encode() |
52 | | - for prop in ds.props: |
53 | | - son[prop] = getattr(ds, prop) |
54 | | - return son |
55 | | - |
56 | | - def transform_outgoing(self, son, collection): |
57 | | - for (key, value) in son.items(): |
58 | | - if isinstance(value, dict): |
59 | | - names = value.keys() |
60 | | - for name in names: |
61 | | - var = Variable(name, None) |
62 | | - var.decode(value) |
63 | | - son['variables'][name] = var |
64 | | - else: # Again, make sure to recurse into sub-docs |
65 | | - son[key] = value |
66 | | - name = son.pop('name', None) |
67 | | - project = son.pop('project', None) |
68 | | - collection = son.pop('collection', None) |
69 | | - language_code = son.pop('language_code', None) |
70 | | - variables = son.pop('variables', []) |
71 | | - ds = Dataset(name, project, collection, language_code, **son) |
72 | | - for var in variables: |
73 | | - var = variables[var] |
74 | | - ds.add_variable(var) |
75 | | - return ds |
76 | | - |
77 | | - |
78 | | -class Data: |
79 | | - ''' |
80 | | - Some generic functions that are required by the Observation, Variable, and |
81 | | - Dataset classes. |
82 | | - ''' |
83 | | - def __hash__(self, vars): |
84 | | - id = ''.join([str(var) for var in vars]) |
85 | | - return hash(id) |
86 | | - #return int(self.convert_date_to_epoch(date)) |
87 | | - |
88 | | - def encode_to_bson(self, data=None): |
89 | | - if data: |
90 | | - kwargs = dict([(str(key), value) for key, value in data.__dict__.iteritems()]) |
91 | | - else: |
92 | | - kwargs = dict([(str(key), value) for key, value in self.__dict__.iteritems()]) |
93 | | - for key, value in kwargs.iteritems(): |
94 | | - if isinstance(value, dict): |
95 | | - d = {} |
96 | | - for k, v in value.iteritems(): |
97 | | - if isinstance(v, Observation): |
98 | | - v = self.encode_to_bson(v) |
99 | | - d[str(k)] = v |
100 | | - kwargs[key] = d |
101 | | - return kwargs |
102 | | - |
103 | | - def convert_date_to_epoch(self, date): |
104 | | - assert self.time_unit == 'year' or self.time_unit == 'month' \ |
105 | | - or self.time_unit == 'day', 'Time unit should either be year, month or day.' |
106 | | - |
107 | | - if self.time_unit == 'year': |
108 | | - datum = datetime.datetime(date.year, 1, 1) |
109 | | - return int(time.mktime(datum.timetuple())) |
110 | | - elif self.time_unit == 'month': |
111 | | - datum = datetime.datetime(date.year, date.month, 1) |
112 | | - return int(time.mktime(datum.timetuple())) |
113 | | - elif self.time_unit == 'day': |
114 | | - return int(time.mktime(date.timetuple())) |
115 | | - else: |
116 | | - return date |
117 | | - |
118 | | - def set_date_range(self, date): |
119 | | - if self.time_unit == 'year': |
120 | | - return datetime.datetime(date.year, 12, 31), datetime.datetime(date.year, 1, 1) |
121 | | - elif self.time_unit == 'month': |
122 | | - day = calendar.monthrange(date.year, date.month)[1] |
123 | | - return datetime.datetime(date.year, date.month, day), datetime.datetime(date.year, date.month, 1) |
124 | | - else: |
125 | | - return datetime.datetime(date.year, date.month, date.day), datetime.datetime(date.year, date.month, date.day) |
126 | | - |
127 | | - |
128 | | -class Observation(Data): |
129 | | - lock = Lock() |
130 | | - ''' |
131 | | - The smallest unit, here the actual data is being stored. |
132 | | - Time_unit should either be 'year', 'month' or 'day'. |
133 | | - ''' |
134 | | - def __init__(self, date, time_unit, id, meta): |
135 | | - assert isinstance(date, datetime.datetime), 'Date variable should be a datetime.datetime instance.' |
136 | | - self.date = date |
137 | | - self.data = 0 |
138 | | - self.time_unit = time_unit |
139 | | - self.t1, self.t0 = self.set_date_range(date) |
140 | | - self.id = id |
141 | | - self.props = [] |
142 | | - for mt in meta: |
143 | | - if isinstance(mt, float): |
144 | | - raise Exception, 'Mongo does not allow a dot "." in the name of a key, please use an integer or string as key.' |
145 | | - elif not isinstance(mt, list): |
146 | | - setattr(self, mt, meta[mt]) |
147 | | - self.props.append(mt) |
148 | | - self._type = 'observation' |
149 | | - |
150 | | - def __repr__(self): |
151 | | - return '%s' % self.date |
152 | | - |
153 | | - def __str__(self): |
154 | | - return 'range: %s:%s' % (self.t0, self.t1) |
155 | | - |
156 | | - def __iter__(self): |
157 | | - for obs in self.data: |
158 | | - yield self.data[obs] |
159 | | - |
160 | | - def __getitem__(self, key): |
161 | | - return getattr(self, key, []) |
162 | | - |
163 | | - def add(self, value): |
164 | | - ''' |
165 | | - If update == True then data[i] will be incremented else data[i] will be |
166 | | - created, in that case make sure that i is unique. Update is useful for |
167 | | - tallying a variable. |
168 | | - ''' |
169 | | - self.lock.acquire() |
170 | | - try: |
171 | | - self.data += value |
172 | | - finally: |
173 | | - self.lock.release() |
174 | | - |
175 | | - def get_date_range(self): |
176 | | - return '%s-%s-%s:%s-%s-%s' % (self.t0.month, self.t0.day, self.t0.year, \ |
177 | | - self.t1.month, self.t1.day, self.t1.year) |
178 | | - |
179 | | -class Variable(Data): |
180 | | - ''' |
181 | | - This class constructs a time-based variable. |
182 | | - ''' |
183 | | - lock = Lock() |
184 | | - def __init__(self, name, time_unit, **kwargs): |
185 | | - self.name = name |
186 | | - self.obs = {} |
187 | | - self.time_unit = time_unit |
188 | | - self.groupbys = [] |
189 | | - self._type = 'variable' |
190 | | - self.props = ['name', 'time_unit', '_type'] |
191 | | - for kw in kwargs: |
192 | | - setattr(self, kw, kwargs[kw]) |
193 | | - self.props.append(kw) |
194 | | - |
195 | | - def __str__(self): |
196 | | - return '%s' % self.name |
197 | | - |
198 | | - def __repr__(self): |
199 | | - return '%s' % self.name |
200 | | - |
201 | | - def __getitem__(self, key): |
202 | | - return getattr(self, key, []) |
203 | | - |
204 | | - def __iter__(self): |
205 | | - keys = self.obs.keys() |
206 | | - for key in keys: |
207 | | - yield key |
208 | | - |
209 | | - def __len__(self): |
210 | | - return [x for x in xrange(self.obs())] |
211 | | - |
212 | | - def items(self): |
213 | | - for key in self.__dict__.keys(): |
214 | | - yield key, getattr(self, key) |
215 | | - |
216 | | - def itervalues(self): |
217 | | - for key in self: |
218 | | - yield self.obs[key].data |
219 | | - |
220 | | - def iteritems(self): |
221 | | - for key in self: |
222 | | - yield (key, self.obs[key]) |
223 | | - |
224 | | - |
225 | | - def get_data(self): |
226 | | - return [o for o in self.itervalues()] |
227 | | - |
228 | | - def get_observation(self, id, date, meta): |
229 | | - self.lock.acquire() |
230 | | - try: |
231 | | - obs = self.obs.get(id, Observation(date, self.time_unit, id, meta)) |
232 | | - finally: |
233 | | - self.lock.release() |
234 | | - return obs |
235 | | - |
236 | | - def add(self, date, value, meta={}): |
237 | | - assert isinstance(meta, dict), 'The meta variable should be a dict (either empty or with variables to group by.' |
238 | | - #id = self.convert_date_to_epoch(date) |
239 | | - start, end = self.set_date_range(date) |
240 | | - values = meta.values() |
241 | | - values.insert(0, end) |
242 | | - values.insert(0, start) |
243 | | - id = self.__hash__(values) |
244 | | - |
245 | | - obs = self.get_observation(id, date, meta) |
246 | | - obs.add(value) |
247 | | - self.obs[id] = obs |
248 | | - |
249 | | - def encode(self): |
250 | | - bson = {} |
251 | | - for prop in self.props: |
252 | | - bson[prop] = getattr(self, prop) |
253 | | - |
254 | | - bson['obs'] = {} |
255 | | - for obs in self: |
256 | | - data = self.obs[obs] |
257 | | - obs = str(obs) |
258 | | - bson['obs'][obs] = data.encode_to_bson() |
259 | | - return bson |
260 | | - |
261 | | - def decode(self, values): |
262 | | - for varname in values: |
263 | | - for prop in values[varname]: |
264 | | - if isinstance(values[varname][prop], dict): |
265 | | - data = values[varname][prop] |
266 | | - for d in data: |
267 | | - date = data[d]['date'] |
268 | | - obs = data[d]['data'] |
269 | | - self.add(date, obs) |
270 | | - else: |
271 | | - setattr(self, prop, values[varname][prop]) |
272 | | - self.props.append(prop) |
273 | | - |
274 | | - def get_date_range(self): |
275 | | - dates = [self.obs[key].date for key in self] |
276 | | - first = min(dates) |
277 | | - last = max(dates) |
278 | | - return first, last |
279 | | - |
280 | | - |
281 | | -class Dataset: |
282 | | - ''' |
283 | | - This class acts as a container for the Variable class and has some methods |
284 | | - to output the dataset to a csv file, mongodb and display statistics. |
285 | | - ''' |
286 | | - |
287 | | - def __init__(self, name, project, collection, language_code, encoder, vars=None, **kwargs): |
288 | | - encoders = json_encoders.available_json_encoders() |
289 | | - if encoder not in encoders: |
290 | | - raise exception.UnknownJSONEncoderError(encoder) |
291 | | - else: |
292 | | - self.encoder = encoder |
293 | | - self.name = name |
294 | | - self.project = project |
295 | | - self.collection = collection |
296 | | - self.language_code = language_code |
297 | | - self.hash = self.name |
298 | | - self._type = 'dataset' |
299 | | - self.created = datetime.datetime.now() |
300 | | - self.format = 'long' |
301 | | - for kw in kwargs: |
302 | | - setattr(self, kw, kwargs[kw]) |
303 | | - self.props = self.__dict__.keys() |
304 | | - |
305 | | - self.variables = [] |
306 | | - if vars != None: |
307 | | - for kwargs in vars: |
308 | | - name = kwargs.pop('name') |
309 | | - setattr(self, name, Variable(name, **kwargs)) |
310 | | - self.variables.append(name) |
311 | | - #self.filename = self.create_filename() |
312 | | - |
313 | | - def __repr__(self): |
314 | | - return 'Dataset contains %s variables' % (len(self.variables)) |
315 | | - |
316 | | - def __iter__(self): |
317 | | - for var in self.variables: |
318 | | - yield getattr(self, var) |
319 | | - |
320 | | - |
321 | | - def create_filename(self): |
322 | | - ''' |
323 | | - This function creates a filename for the dataset by searching for shared |
324 | | - properties among the different variables in the dataset. All shared |
325 | | - properties will be used in the filename to make sure that one analysis |
326 | | - that's run with different parameters gets stored in separate files. |
327 | | - ''' |
328 | | - common = {} |
329 | | - props = set() |
330 | | - for var in self.variables: |
331 | | - s = set() |
332 | | - var = getattr(self, var) |
333 | | - for prop in var.props: |
334 | | - if prop not in ['name', 'time_unit', '_type']: |
335 | | - s.add(prop) |
336 | | - props.add(prop) |
337 | | - common[var.name] = s |
338 | | - |
339 | | - keys = [] |
340 | | - for prop in props: |
341 | | - attrs = [] |
342 | | - for s in common.values(): |
343 | | - attrs.append(prop) |
344 | | - if len(attrs) == len(common.values()): |
345 | | - keys.append(prop) |
346 | | - keys.sort() |
347 | | - attrs = '_'.join(['%s=%s' % (k, getattr(var, k)) for k in keys]) |
348 | | - filename = '%s%s_%s_%s.csv' % (self.language_code, |
349 | | - self.project, |
350 | | - self.name, |
351 | | - attrs) |
352 | | - self.filename = filename |
353 | | - |
354 | | - |
355 | | - def add_variable(self, var): |
356 | | - if isinstance(var, Variable): |
357 | | - self.variables.append(var.name) |
358 | | - setattr(self, var.name, var) |
359 | | - else: |
360 | | - raise TypeError('You can only instance of Variable to a dataset.') |
361 | | - |
362 | | - def write(self, format='csv'): |
363 | | - self.create_filename() |
364 | | - if format == 'csv': |
365 | | - self.to_csv() |
366 | | - elif format == 'mongo': |
367 | | - self.to_mongo() |
368 | | - |
369 | | - def to_mongo(self): |
370 | | - dbname = '%s%s' % (self.language_code, self.project) |
371 | | - mongo = db.init_mongo_db(dbname) |
372 | | - coll = mongo['%s_%s' % (dbname, 'charts')] |
373 | | - mongo.add_son_manipulator(Transform()) |
374 | | - coll.remove({'hash':self.hash, 'project':self.project, |
375 | | - 'language_code':self.language_code}) |
376 | | - coll.insert({'variables': self}) |
377 | | - |
378 | | - def to_csv(self): |
379 | | - data = data_converter.convert_dataset_to_lists(self, 'manage') |
380 | | - headers = data_converter.add_headers(self) |
381 | | - fh = file_utils.create_txt_filehandle(settings.dataset_location, self.filename, 'w', settings.encoding) |
382 | | - file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True) |
383 | | - file_utils.write_list_to_csv(data, fh, recursive=False, newline=True, format=self.format) |
384 | | - fh.close() |
385 | | - |
386 | | - def encode(self): |
387 | | - props = {} |
388 | | - for prop in self.props: |
389 | | - props[prop] = getattr(self, prop) |
390 | | - return props |
391 | | - |
392 | | - def get_standard_deviation(self, number_list): |
393 | | - mean = self.get_mean(number_list) |
394 | | - std = 0 |
395 | | - n = len(number_list) |
396 | | - for i in number_list: |
397 | | - std = std + (i - mean) ** 2 |
398 | | - return math.sqrt(std / float(n - 1)) |
399 | | - |
400 | | - def get_median(self, number_list): |
401 | | - if number_list == []: |
402 | | - return '.' |
403 | | - data = sorted(number_list) |
404 | | - data = [float(x) for x in data] |
405 | | - if len(data) % 2 == 1: |
406 | | - return data[(len(data) + 1) / 2 - 1] |
407 | | - else: |
408 | | - lower = data[len(data) / 2 - 1] |
409 | | - upper = data[len(data) / 2] |
410 | | - return (lower + upper) / 2 |
411 | | - |
412 | | - def get_mean(self, number_list): |
413 | | - if number_list == []: |
414 | | - return '.' |
415 | | - float_nums = [float(x) for x in number_list] |
416 | | - return sum(float_nums) / len(number_list) |
417 | | - |
418 | | - def descriptives(self): |
419 | | - for variable in self: |
420 | | - data = variable.get_data() |
421 | | - variable.mean = self.get_mean(data) |
422 | | - variable.median = self.get_median(data) |
423 | | - variable.sds = self.get_standard_deviation(data) |
424 | | - variable.min = min(data) |
425 | | - variable.max = max(data) |
426 | | - variable.n = len(data) |
427 | | - variable.first_obs, variable.last_obs = variable.get_date_range() |
428 | | - |
429 | | - def summary(self): |
430 | | - self.descriptives() |
431 | | - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Variable', 'Mean', |
432 | | - 'Median', 'SD', 'Minimum', 'Maximum', |
433 | | - 'Num Obs', 'First Obs', 'Final Obs') |
434 | | - for variable in self: |
435 | | - print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (variable.name, |
436 | | - variable.mean, variable.median, |
437 | | - variable.sds, variable.min, |
438 | | - variable.max, variable.n, |
439 | | - variable.first_obs, variable.last_obs) |
440 | | - |
441 | | - |
442 | | -def debug(): |
443 | | - mongo = db.init_mongo_db('enwiki') |
444 | | - rawdata = mongo['enwiki_charts'] |
445 | | - mongo.add_son_manipulator(Transform()) |
446 | | - |
447 | | - d1 = datetime.datetime.today() |
448 | | - d2 = datetime.datetime(2007, 6, 7) |
449 | | - ds = Dataset('test', 'wiki', 'editors_dataset', 'en', 'to_bar_json', [ |
450 | | - {'name': 'count', 'time_unit': 'year'}, |
451 | | - # {'name': 'testest', 'time_unit': 'year'} |
452 | | - ]) |
453 | | - ds.count.add(d1, 10, ['exp', 'window']) |
454 | | - ds.count.add(d1, 135, ['exp', 'window']) |
455 | | - ds.count.add(d2, 1, ['exp', 'window']) |
456 | | - #ds.testest.add(d1, 135) |
457 | | - #ds.testest.add(d2, 535) |
458 | | - ds.summary() |
459 | | - ds.write(format='csv') |
460 | | -# v = Variable('test', 'year') |
461 | | - ds.encode() |
462 | | - print ds |
463 | | - |
464 | | - # mongo.test.insert({'variables': ds}) |
465 | | - |
466 | | - # v.add(d2 , 5) |
467 | | - #o = v.get_observation(d2) |
468 | | -# ds = rawdata.find_one({'project': 'wiki', |
469 | | -# 'language_code': 'en', |
470 | | -# 'hash': 'cohort_dataset_backward_bar'}) |
471 | | - |
472 | | - |
473 | | -if __name__ == '__main__': |
474 | | - debug() |
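dataset.py is removed here; its classes reappear as classes/dataset.py, which the rewritten analyzer.py imports below. One piece worth restating is the date bucketing every plugin relies on: Data.set_date_range collapses any date to the boundaries of its year, month, or day bucket. A freestanding sketch of that logic:

    import calendar
    import datetime

    def set_date_range(date, time_unit):
        # Return the last and first moment of the bucket containing 'date'.
        if time_unit == 'year':
            return (datetime.datetime(date.year, 12, 31),
                    datetime.datetime(date.year, 1, 1))
        elif time_unit == 'month':
            day = calendar.monthrange(date.year, date.month)[1]
            return (datetime.datetime(date.year, date.month, day),
                    datetime.datetime(date.year, date.month, 1))
        else:
            dt = datetime.datetime(date.year, date.month, date.day)
            return dt, dt

    t1, t0 = set_date_range(datetime.datetime(2007, 6, 7), 'month')
    print t0, t1    # 2007-06-01 00:00:00 2007-06-30 00:00:00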
Index: trunk/tools/editor_trends/analyses/file_size_reduction.py |
— | — | @@ -1,100 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-15' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -sys.path.append('..') |
23 | | - |
24 | | -import os |
25 | | -import xml.etree.cElementTree as cElementTree |
26 | | - |
27 | | -import configuration |
28 | | -from utils import file_utils |
29 | | -settings = configuration.Settings() |
30 | | - |
31 | | - |
32 | | -class DumpStatistics(object): |
33 | | - ''' Simple class to keep track of XML tags, how often they occur, |
34 | | - and the length of strings they contain. This is used to calculate the |
35 | | - overhead. |
36 | | - ''' |
37 | | - def __init__(self): |
38 | | - self.tags = {} |
39 | | - |
40 | | - def add_tag(self, kwargs): |
41 | | - for kw in kwargs: |
42 | | - if kw not in self.tags: |
43 | | - self.tags[kw] = {} |
44 | | - self.tags[kw]['n'] = 0 |
45 | | - self.tags[kw]['size'] = 0 |
46 | | - self.tags[kw]['n'] += 1 |
47 | | - self.tags[kw]['size'] += self.determine_length(kwargs[kw]) |
48 | | - |
49 | | - def average_size_text(self): |
50 | | - avg = {} |
51 | | - for kw in self.tags: |
52 | | - avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n'] |
53 | | - return avg |
54 | | - |
55 | | - def total_size_text(self): |
56 | | - return sum([self.tags[kw]['size'] for kw in self.tags]) |
57 | | - |
58 | | - def total_size_xml(self): |
59 | | - # the x2 is for the opening and closing tag |
60 | | - # the +5 is for 2x <, 2x > and 1x / |
61 | | - return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags]) |
62 | | - |
63 | | - def determine_length(self, text): |
64 | | - if text == None: |
65 | | - return 0 |
66 | | - else: |
67 | | - return len(text) |
68 | | - |
69 | | - |
70 | | -def calculate_filesize_overhead(location, filename): |
71 | | - counter = None |
72 | | - ds = DumpStatistics() |
73 | | - filename = os.path.join(location, filename) |
74 | | - context = cElementTree.iterparse(filename, events=('start', 'end')) |
75 | | - context = iter(context) |
76 | | - event, root = context.next() #get the root element of the XML doc |
77 | | - |
78 | | - try: |
79 | | - for event, elem in context: |
80 | | - if event == 'end': |
81 | | - ds.add_tag({elem.tag:elem.text}) |
82 | | - root.clear() # when done parsing a section clear the tree to release memory |
83 | | - except SyntaxError: |
84 | | - pass |
85 | | - file_utils.store_object(ds, settings.binary_location, 'ds') |
86 | | - xml_size = ds.total_size_xml() |
87 | | - text_size = ds.total_size_text() |
88 | | - print text_size, xml_size |
89 | | - print ds.tags |
90 | | - |
91 | | - |
92 | | -def output_dumpstatistics(): |
93 | | - ds = file_utils.load_object(settings.binary_location, 'ds.bin') |
94 | | - |
95 | | - for key in ds.tags: |
96 | | - print '%s\t%s' % (key, ds.tags[key]) |
97 | | - |
98 | | -if __name__ == '__main__': |
99 | | - input = os.path.join(settings.input_location, 'en', 'wiki') |
100 | | - calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml') |
101 | | - output_dumpstatistics() |
Index: trunk/tools/editor_trends/analyses/match_talkpage_article.py |
— | — | @@ -1,72 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__email__ = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2011-01-07' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | -import sys |
22 | | -import os |
23 | | -sys.path.append('..') |
24 | | - |
25 | | -import configuration |
26 | | -settings = configuration.Settings() |
27 | | - |
28 | | -from etl import extracter |
29 | | -from utils import file_utils |
30 | | -import wikitree |
31 | | - |
32 | | -try: |
33 | | - import psyco |
34 | | - psyco.full() |
35 | | -except ImportError: |
36 | | - pass |
37 | | - |
38 | | -class Article: |
39 | | - def __init__(self, title, id, talk_id=None): |
40 | | - self.title = title |
41 | | - self.id = id |
42 | | - self.talk_id = talk_id |
43 | | - |
44 | | - |
45 | | -def parse_dumpfile(project, language_code, namespaces=['0', '1']): |
46 | | - articles = {} |
47 | | - ns = extracter.load_namespace(language_code) |
48 | | - non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces) |
49 | | - |
50 | | - |
51 | | - location = os.path.join(settings.input_location, language_code, project) |
52 | | - fh = file_utils.create_txt_filehandle(location, |
53 | | - '%s%s-latest-stub-meta-history.xml' % (language_code, project), |
54 | | - 'r', settings.encoding) |
55 | | - |
56 | | - for page, article_size in wikitree.parser.read_input(fh): |
57 | | - title = page.find('title') |
58 | | - if extracter.verify_article_belongs_namespace(title, non_valid_namespaces): |
59 | | - article_id = page.find('id').text |
60 | | - title = title.text |
61 | | - if title.startswith(ns['1'].get('canonical')): |
62 | | - namespace = 'Talk' |
63 | | - article = articles.get(article_id, Article(None, None, article_id)) |
64 | | - article.talk_id = article_id |
65 | | - else: |
66 | | - namespace = 'Main' |
67 | | - article = articles.get(article_id, Article(title, article_id)) |
68 | | - articles[article_id] = article |
69 | | - |
70 | | - file_utils.store_object(articles, settings.binary_location, 'talk2article.bin') |
71 | | - |
72 | | -if __name__ == '__main__': |
73 | | - parse_dumpfile('wiki', 'en') |
Index: trunk/tools/editor_trends/analyses/plugins/histogram_by_backward_cohort.py |
— | — | @@ -39,10 +39,10 @@ |
40 | 40 | if w >= editor_dt: |
41 | 41 | datum = datetime.datetime(int(year), 12, 31) |
42 | 42 | freq = int(editor['edits_by_year'][year]) |
43 | | - if datum == datetime.datetime(2003, 12, 31): |
| 43 | + #if datum == datetime.datetime(2003, 12, 31): |
44 | 44 | # if w == 24: |
45 | 45 | # if freq == 1.0: |
46 | 46 | # print 'break' |
47 | | - var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}}) |
48 | | - break |
| 47 | + var.add(datum, 1, {'window': w, 'frequency': freq}) #{w:{freq:1}}) |
| 48 | + break |
49 | 49 | return var |
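The commented-out datum check above was debug scaffolding; the plugin keeps tallying through var.add(datum, 1, {'window': w, 'frequency': freq}). Conceptually each call increments a counter keyed on the date bucket plus the meta values, which Variable hashes into an Observation id. A toy restatement with a plain dict:

    import datetime
    from collections import defaultdict

    counts = defaultdict(int)
    datum = datetime.datetime(2004, 12, 31)
    for window, freq in [(12, 3), (12, 3), (24, 1)]:
        counts[(datum, window, freq)] += 1   # analogue of var.add(datum, 1, meta)
    print counts[(datum, 12, 3)]             # 2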
Index: trunk/tools/editor_trends/analyses/plugins/edit_patterns.py |
— | — | @@ -27,23 +27,12 @@ |
28 | 28 | if dt.days < 366: |
29 | 29 | return var |
30 | 30 | |
31 | | - m = 0 |
32 | | - obs = {} |
33 | 31 | for year in xrange(new_wikipedian.year, new_wikipedian.year + 2): |
34 | | - if m == 12: |
35 | | - break |
| 32 | + obs = [False for x in xrange(13)] |
36 | 33 | for month in xrange(new_wikipedian.month, 13): |
37 | 34 | n = monthly[str(year)][str(month)] |
38 | 35 | date = datetime.datetime(year, month, 1) |
39 | 36 | if n >= var.cutoff: |
40 | | - var.add(date, True, {'month':m}) |
41 | | - #obs[m] = True |
42 | | - else: |
43 | | - var.add(date, False, {'month':m}) |
44 | | - #obs[m] = False |
45 | | - m += 1 |
46 | | - if m == 12: |
47 | | - break |
48 | | -# if m == 12: |
49 | | -# var.add(date, obs) |
| 37 | + obs[month] = True |
| 38 | + var.add(date, obs) |
50 | 39 | return var |
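The rewrite replaces twelve separate var.add calls with one observation per year: a 13-slot boolean list indexed directly by calendar month (slot 0 is unused), flipped to True for months whose edit count reaches the cutoff. A self-contained sketch of that layout, with fabricated counts:

    # Fabricated monthly edit counts: month m had m edits in 2004.
    monthly = {'2004': dict((str(m), m) for m in xrange(1, 13))}
    cutoff = 5

    obs = [False for x in xrange(13)]        # slot 0 stays unused
    for month in xrange(1, 13):
        if monthly['2004'][str(month)] >= cutoff:
            obs[month] = True
    print obs[1:]    # months 1-4 False, months 5-12 True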
Index: trunk/tools/editor_trends/analyses/json_encoders.py |
— | — | @@ -17,9 +17,13 @@ |
18 | 18 | __date__ = '2011-01-27' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
| 21 | +import sys |
21 | 22 | import types |
22 | | -import analyzer |
23 | 23 | |
| 24 | +if '..' not in sys.path: |
| 25 | + sys.path.append('..') |
| 26 | + |
| 27 | +import inventory |
24 | 28 | from classes import exceptions |
25 | 29 | from utils import data_converter |
26 | 30 | |
— | — | @@ -67,6 +71,7 @@ |
68 | 72 | options['series']['bars']['align'] = 'center' |
69 | 73 | return options |
70 | 74 | |
| 75 | + |
71 | 76 | def to_bar_json(ds): |
72 | 77 | data = {} |
73 | 78 | |
— | — | @@ -95,6 +100,7 @@ |
96 | 101 | print json |
97 | 102 | return json |
98 | 103 | |
| 104 | + |
99 | 105 | def to_stacked_bar_json(ds): |
100 | 106 | ''' |
101 | 107 | This function outputs data in a format that is understood by jquery |
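The import change above also makes the sys.path manipulation idempotent: the membership test ensures repeated imports or reloads no longer append '..' again and again. A minimal demonstration:

    import sys

    for attempt in xrange(3):            # simulate repeated module loads
        if '..' not in sys.path:
            sys.path.append('..')
    print sys.path.count('..')           # 1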
Index: trunk/tools/editor_trends/analyses/inventory.py |
— | — | @@ -0,0 +1,70 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# coding=utf-8 |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-02-11' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import os |
| 23 | +import sys |
| 24 | +import types |
| 25 | + |
| 26 | +def available_analyses(caller='manage'): |
| 27 | +    ''' |
| 28 | +    Generates a dictionary: |
| 29 | +    key: name of analysis |
| 30 | +    value: function that generates the dataset |
| 31 | +    ignore: a list of functions that should never be called from manage.py, |
| 32 | +    they are not valid entry points. |
| 33 | +    ''' |
| 34 | +    assert caller == 'django' or caller == 'manage' |
| 35 | +    ignore = ['__init__'] |
| 36 | +    functions = {} |
| 37 | + |
| 38 | +    fn = os.path.realpath(__file__) |
| 39 | +    pos = fn.rfind(os.sep) |
| 40 | +    loc = fn[:pos] |
| 41 | +    path = os.path.join(loc, 'plugins') |
| 42 | +    plugins = import_libs(path) |
| 43 | + |
| 44 | +    for plugin in plugins: |
| 45 | +        if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore: |
| 46 | +            functions[plugin.func_name] = plugin |
| 47 | +    if caller == 'manage': |
| 48 | +        return functions |
| 49 | +    elif caller == 'django': |
| 50 | +        django_functions = [] |
| 51 | +        for function in functions: |
| 52 | +            fancy_name = function.replace('_', ' ').title() |
| 53 | +            django_functions.append((function, fancy_name)) |
| 54 | + |
| 55 | +        return django_functions |
| 56 | + |
| 57 | + |
| 58 | +def import_libs(path): |
| 59 | +    ''' |
| 60 | +    Dynamically importing functions from the plugins directory. |
| 61 | +    ''' |
| 62 | +    library_list = [] |
| 63 | +    sys.path.append(path) |
| 64 | +    for f in os.listdir(os.path.abspath(path)): |
| 65 | +        module_name, ext = os.path.splitext(f) |
| 66 | +        if ext == '.py': |
| 67 | +            module = __import__(module_name) |
| 68 | +            func = getattr(module, module_name) |
| 69 | +            library_list.append(func) |
| 70 | + |
| 71 | +    return library_list |
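import_libs assumes each module in analyses/plugins defines a function named after the module itself (func = getattr(module, module_name)). Assuming the analyses package is importable (e.g. the working directory is trunk/tools/editor_trends), the registry is consumed like this; 'new_editor_count' is the plugin name manage.py uses for its --charts default:

    from analyses import inventory

    # Name -> function mapping built from analyses/plugins/*.py.
    charts = inventory.available_analyses(caller='manage')
    func = charts['new_editor_count']
    print func.func_name                     # 'new_editor_count'

    # The Django variant turns the same registry into (value, label) choices.
    print inventory.available_analyses(caller='django')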
Index: trunk/tools/editor_trends/analyses/__init__.py |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -21,19 +21,20 @@ |
22 | 22 | import sys |
23 | 23 | import os |
24 | 24 | import progressbar |
25 | | -import types |
26 | 25 | import datetime |
27 | 26 | |
28 | | -sys.path.append('..') |
| 27 | +if '..' not in sys.path: |
| 28 | + sys.path.append('..') |
29 | 29 | |
30 | | -import configuration |
31 | | -settings = configuration.Settings() |
| 30 | +from classes import dataset |
| 31 | +from classes import settings |
| 32 | +settings = settings.Settings() |
32 | 33 | from database import db |
33 | 34 | from utils import timer |
34 | 35 | from utils import log |
35 | | -import dataset |
36 | 36 | |
37 | 37 | |
| 38 | + |
38 | 39 | def generate_chart_data(project, collection, language_code, func, encoder, **kwargs): |
39 | 40 | ''' |
40 | 41 | This is the entry function to be called to generate data for creating charts. |
— | — | @@ -102,54 +103,6 @@ |
103 | 104 | return ds |
104 | 105 | |
105 | 106 | |
106 | | -def available_analyses(caller='manage'): |
107 | | - ''' |
108 | | - Generates a dictionary: |
109 | | - key: name of analysis |
110 | | - value: function that generates the dataset |
111 | | - ignore: a list of functions that should never be called from manage.py, |
112 | | - they are not valid entry points. |
113 | | - ''' |
114 | | - assert caller == 'django' or caller == 'manage' |
115 | | - ignore = ['__init__'] |
116 | | - functions = {} |
117 | | - |
118 | | - fn = os.path.realpath(__file__) |
119 | | - pos = fn.rfind(os.sep) |
120 | | - loc = fn[:pos] |
121 | | - path = os.path.join(loc , 'plugins') |
122 | | - plugins = import_libs(path) |
123 | | - |
124 | | - for plugin in plugins: |
125 | | - if isinstance(plugin, types.FunctionType) and plugin.func_name not in ignore: |
126 | | - functions[plugin.func_name] = plugin |
127 | | - if caller == 'manage': |
128 | | - return functions |
129 | | - elif caller == 'django': |
130 | | - django_functions = [] |
131 | | - for function in functions: |
132 | | - fancy_name = function.replace('_', ' ').title() |
133 | | - django_functions.append((function, fancy_name)) |
134 | | - |
135 | | - return django_functions |
136 | | - |
137 | | - |
138 | | -def import_libs(path): |
139 | | - ''' |
140 | | - Dynamically importing functions from the plugins directory. |
141 | | - ''' |
142 | | - library_list = [] |
143 | | - sys.path.append(path) |
144 | | - for f in os.listdir(os.path.abspath(path)): |
145 | | - module_name, ext = os.path.splitext(f) |
146 | | - if ext == '.py': |
147 | | - module = __import__(module_name) |
148 | | - func = getattr(module, module_name) |
149 | | - library_list.append(func) |
150 | | - |
151 | | - return library_list |
152 | | - |
153 | | - |
154 | 107 | def determine_project_year_range(dbname, collection, var): |
155 | 108 | ''' |
156 | 109 | Determine the first and final year for the observed data |
— | — | @@ -166,8 +119,8 @@ |
167 | 120 | |
168 | 121 | |
169 | 122 | if __name__ == '__main__': |
170 | | - generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
171 | | - #generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
| 123 | + #generate_chart_data('wiki', 'editors_dataset', 'en', 'histogram_by_backward_cohort', 'to_bar_json', time_unit='year', cutoff=0, cum_cutoff=50) |
| 124 | + generate_chart_data('wiki', 'editors_dataset', 'en', 'edit_patterns', 'to_bar_json', time_unit='year', cutoff=5) |
172 | 125 | #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_number_of_new_wikipedians', 'to_bar_json', time_unit='year') |
173 | 126 | #generate_chart_data('wiki', 'editors', 'en', 'total_number_of_articles', 'to_bar_json', time_unit='year') |
174 | 127 | #generate_chart_data('wiki', 'editors_dataset', 'en', 'total_cumulative_edits', 'to_bar_json', time_unit='year') |
Index: trunk/tools/editor_trends/analyses/adhoc/community_graph.py |
— | — | @@ -0,0 +1,62 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-10' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +sys.path.append('..') |
| 23 | + |
| 24 | +import configuration |
| 25 | +settings = configuration.Settings() |
| 26 | + |
| 27 | +from database import db |
| 28 | +from utils import file_utils |
| 29 | + |
| 30 | +try: |
| 31 | + import psyco |
| 32 | + psyco.full() |
| 33 | +except ImportError: |
| 34 | + pass |
| 35 | + |
| 36 | +def create_articles_set(edits): |
| 37 | + s = set() |
| 38 | + years = edits.keys() |
| 39 | + for year in years: |
| 40 | + for edit in edits[year]: |
| 41 | + s.add(edit['article']) |
| 42 | + return s |
| 43 | + |
| 44 | + |
| 45 | +def create_edgelist(project, collection): |
| 46 | + ids = db.retrieve_distinct_keys(project, collection, 'editor') |
| 47 | + conn = db.init_mongo_db(project) |
| 48 | + ids.sort() |
| 49 | + fh = file_utils.create_txt_filehandle(settings.dataset_location, '%s_edgelist.csv' % project, 'w', settings.encoding) |
| 50 | + for i in ids: |
| 51 | + author_i = conn[collection].find_one({'editor': i}) |
| 52 | + article_i = create_articles_set(author_i['edits']) |
| 53 | + for j in ids: |
| 54 | + if i > j: |
| 55 | + author_j = conn[collection].find_one({'editor': j}) |
| 56 | + article_j = create_articles_set(author_j['edits']) |
| 57 | + common = article_i.intersection(article_j) |
| 58 | + if len(common) > 0: |
| 59 | + file_utils.write_list_to_csv([i, j, len(common)], fh, recursive=False, newline=True) |
| 60 | + fh.close() |
| 61 | + |
| 62 | +if __name__ == '__main__': |
| 63 | + create_edgelist('enwiki', 'editors') |
Property changes on: trunk/tools/editor_trends/analyses/adhoc/community_graph.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 64 | + native |
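create_edgelist, moved here unchanged, builds an undirected co-editing graph: the i > j guard visits each unordered editor pair once, and the edge weight is the size of the intersection of the two editors' article sets. The same idea on in-memory data:

    edits = {'alice': set(['A', 'B']),
             'bob': set(['B', 'C']),
             'carol': set(['D'])}
    ids = sorted(edits.keys())
    for i in ids:
        for j in ids:
            if i > j:                        # each unordered pair exactly once
                common = edits[i] & edits[j]
                if common:
                    print '%s,%s,%s' % (i, j, len(common))   # bob,alice,1

Note that the real code issues a find_one against MongoDB inside the inner loop, so the pass is quadratic in the number of editors.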
Index: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py |
— | — | @@ -0,0 +1,100 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-11-15' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +sys.path.append('..') |
| 23 | + |
| 24 | +import os |
| 25 | +import xml.etree.cElementTree as cElementTree |
| 26 | + |
| 27 | +import configuration |
| 28 | +from utils import file_utils |
| 29 | +settings = configuration.Settings() |
| 30 | + |
| 31 | + |
| 32 | +class DumpStatistics(object): |
| 33 | + ''' Simple class to keep track of XML tags, how often they occur, |
| 34 | + and the length of strings they contain. This is used to calculate the |
| 35 | + overhead. |
| 36 | + ''' |
| 37 | + def __init__(self): |
| 38 | + self.tags = {} |
| 39 | + |
| 40 | + def add_tag(self, kwargs): |
| 41 | + for kw in kwargs: |
| 42 | + if kw not in self.tags: |
| 43 | + self.tags[kw] = {} |
| 44 | + self.tags[kw]['n'] = 0 |
| 45 | + self.tags[kw]['size'] = 0 |
| 46 | + self.tags[kw]['n'] += 1 |
| 47 | + self.tags[kw]['size'] += self.determine_length(kwargs[kw]) |
| 48 | + |
| 49 | + def average_size_text(self): |
| 50 | + avg = {} |
| 51 | + for kw in self.tags: |
| 52 | + avg[kw] = self.tags[kw]['size'] / self.tags[kw]['n'] |
| 53 | + return avg |
| 54 | + |
| 55 | + def total_size_text(self): |
| 56 | + return sum([self.tags[kw]['size'] for kw in self.tags]) |
| 57 | + |
| 58 | + def total_size_xml(self): |
| 59 | + # the x2 is for the opening and closing tag |
| 60 | + # the +5 is for 2x <, 2x > and 1x / |
| 61 | + return sum([(len(kw) * (self.tags[kw]['n'] * 2) + 5) for kw in self.tags]) |
| 62 | + |
| 63 | + def determine_length(self, text): |
| 64 | + if text == None: |
| 65 | + return 0 |
| 66 | + else: |
| 67 | + return len(text) |
| 68 | + |
| 69 | + |
| 70 | +def calculate_filesize_overhead(location, filename): |
| 71 | + counter = None |
| 72 | + ds = DumpStatistics() |
| 73 | + filename = os.path.join(location, filename) |
| 74 | + context = cElementTree.iterparse(filename, events=('start', 'end')) |
| 75 | + context = iter(context) |
| 76 | + event, root = context.next() #get the root element of the XML doc |
| 77 | + |
| 78 | + try: |
| 79 | + for event, elem in context: |
| 80 | + if event == 'end': |
| 81 | + ds.add_tag({elem.tag:elem.text}) |
| 82 | + root.clear() # when done parsing a section clear the tree to release memory |
| 83 | + except SyntaxError: |
| 84 | + pass |
| 85 | + file_utils.store_object(ds, settings.binary_location, 'ds') |
| 86 | + xml_size = ds.total_size_xml() |
| 87 | + text_size = ds.total_size_text() |
| 88 | + print text_size, xml_size |
| 89 | + print ds.tags |
| 90 | + |
| 91 | + |
| 92 | +def output_dumpstatistics(): |
| 93 | + ds = file_utils.load_object(settings.binary_location, 'ds.bin') |
| 94 | + |
| 95 | + for key in ds.tags: |
| 96 | + print '%s\t%s' % (key, ds.tags[key]) |
| 97 | + |
| 98 | +if __name__ == '__main__': |
| 99 | + input = os.path.join(settings.input_location, 'en', 'wiki') |
| 100 | + calculate_filesize_overhead(input, 'enwiki-latest-stub-meta-history.xml') |
| 101 | + output_dumpstatistics() |
Property changes on: trunk/tools/editor_trends/analyses/adhoc/file_size_reduction.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 102 | + native |
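total_size_xml in the file above estimates the markup overhead of the dump: per its own comment, each occurrence of a tag should cost two copies of the tag name plus five punctuation characters, although the expression as written adds the 5 only once per distinct tag name, which looks like an undercount. The per-occurrence arithmetic the comment describes:

    tag, n = 'id', 3
    # '<id>' plus '</id>' is 2 * len(tag) + 5 characters per occurrence.
    per_occurrence = 2 * len(tag) + 5        # 9
    print per_occurrence * n                 # 27
    # The committed expression, by contrast:
    print len(tag) * (n * 2) + 5             # 17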
Index: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py |
— | — | @@ -0,0 +1,72 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-01-07' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | +import sys |
| 22 | +import os |
| 23 | +sys.path.append('..') |
| 24 | + |
| 25 | +import configuration |
| 26 | +settings = configuration.Settings() |
| 27 | + |
| 28 | +from etl import extracter |
| 29 | +from utils import file_utils |
| 30 | +import wikitree |
| 31 | + |
| 32 | +try: |
| 33 | + import psyco |
| 34 | + psyco.full() |
| 35 | +except ImportError: |
| 36 | + pass |
| 37 | + |
| 38 | +class Article: |
| 39 | + def __init__(self, title, id, talk_id=None): |
| 40 | + self.title = title |
| 41 | + self.id = id |
| 42 | + self.talk_id = talk_id |
| 43 | + |
| 44 | + |
| 45 | +def parse_dumpfile(project, language_code, namespaces=['0', '1']): |
| 46 | + articles = {} |
| 47 | + ns = extracter.load_namespace(language_code) |
| 48 | + non_valid_namespaces = extracter.build_namespaces_locale(ns, namespaces) |
| 49 | + |
| 50 | + |
| 51 | + location = os.path.join(settings.input_location, language_code, project) |
| 52 | + fh = file_utils.create_txt_filehandle(location, |
| 53 | + '%s%s-latest-stub-meta-history.xml' % (language_code, project), |
| 54 | + 'r', settings.encoding) |
| 55 | + |
| 56 | + for page, article_size in wikitree.parser.read_input(fh): |
| 57 | + title = page.find('title') |
| 58 | + if extracter.verify_article_belongs_namespace(title, non_valid_namespaces): |
| 59 | + article_id = page.find('id').text |
| 60 | + title = title.text |
| 61 | + if title.startswith(ns['1'].get('canonical')): |
| 62 | + namespace = 'Talk' |
| 63 | + article = articles.get(article_id, Article(None, None, article_id)) |
| 64 | + article.talk_id = article_id |
| 65 | + else: |
| 66 | + namespace = 'Main' |
| 67 | + article = articles.get(article_id, Article(title, article_id)) |
| 68 | + articles[article_id] = article |
| 69 | + |
| 70 | + file_utils.store_object(articles, settings.binary_location, 'talk2article.bin') |
| 71 | + |
| 72 | +if __name__ == '__main__': |
| 73 | + parse_dumpfile('wiki', 'en') |
Property changes on: trunk/tools/editor_trends/analyses/adhoc/match_talkpage_article.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 74 | + native |
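
Editorial note: parse_dumpfile only persists the id-to-Article mapping; pairing talk pages with their articles happens downstream. A sketch of consuming the pickle, assuming file_utils.load_object is the counterpart of the store_object call above:

    # assumes load_object mirrors the store_object call in parse_dumpfile
    articles = file_utils.load_object(settings.binary_location, 'talk2article.bin')
    for id, article in articles.iteritems():
        # entries that picked up both ids represent a matched pair
        if article.id is not None and article.talk_id is not None:
            print '%s: article %s <-> talk %s' % (article.title, article.id, article.talk_id)
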
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -22,27 +22,22 @@ |
23 | 23 | import sys |
24 | 24 | import os |
25 | 25 | |
26 | | -sys.path.append('..') |
27 | | -import configuration |
28 | | -settings = configuration.Settings() |
29 | 26 | from utils import file_utils |
30 | 27 | from utils import text_utils |
31 | | -from utils import messages |
32 | 28 | from database import cache |
| 29 | +from utils import messages |
33 | 30 | from database import db |
34 | 31 | |
35 | 32 | |
36 | | -def store_articles(project, language_code): |
37 | | - location = os.path.join(settings.input_location, language_code, project) |
38 | | - fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', settings.encoding) |
| 33 | +def store_articles(rts): |
| 34 | + location = os.path.join(rts.input_location, rts.language.code, rts.project.name) |
| 35 | + fh = file_utils.create_txt_filehandle(location, 'articles.csv', 'r', rts.encoding) |
39 | 36 | headers = ['id', 'title'] |
40 | 37 | data = file_utils.read_unicode_text(fh) |
41 | 38 | fh.close() |
42 | 39 | |
43 | | - dbname = '%s%s' % (language_code, project) |
44 | | - collection = '%s_%s' % (dbname, 'articles') |
45 | | - mongo = db.init_mongo_db(dbname) |
46 | | - collection = mongo[collection] |
| 40 | + mongo = db.init_mongo_db(rts.dbname) |
| 41 | + collection = mongo[rts.articles_raw] |
47 | 42 | |
48 | 43 | articles = {} |
49 | 44 | for x, d in enumerate(data): |
— | — | @@ -55,7 +50,7 @@ |
56 | 51 | collection.insert(articles) |
57 | 52 | |
58 | 53 | |
59 | | -def store_editors(tasks, dbname, collection, source): |
| 54 | +def store_editors(tasks, rts): |
60 | 55 | ''' |
61 | 56 | This function is called by multiple consumers who each take a sorted file |
62 | 57 | and create a cache object. If the number of edits made by an editor is above |
— | — | @@ -63,8 +58,8 @@ |
64 | 59 | is discarded. |
65 | 60 | The threshold is currently more than 9 edits and is not yet configurable.
66 | 61 | ''' |
67 | | - mongo = db.init_mongo_db(dbname) |
68 | | - collection = mongo[collection] |
| 62 | + mongo = db.init_mongo_db(rts.dbname) |
| 63 | + collection = mongo[rts.editors_raw] |
69 | 64 | |
70 | 65 | editor_cache = cache.EditorCache(collection) |
71 | 66 | prev_contributor = -1 |
— | — | @@ -80,7 +75,7 @@ |
81 | 76 | break |
82 | 77 | print '%s files left in the queue.' % messages.show(tasks.qsize) |
83 | 78 | |
84 | | - fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding) |
| 79 | + fh = file_utils.create_txt_filehandle(rts.sorted, filename, 'r', rts.encoding) |
85 | 80 | for line in file_utils.read_raw_data(fh): |
86 | 81 | if len(line) > 1: |
87 | 82 | contributor = line[0] |
— | — | @@ -89,7 +84,7 @@ |
90 | 85 | editor_cache.add(prev_contributor, 'NEXT') |
91 | 86 | date = text_utils.convert_timestamp_to_datetime_utc(line[1]) |
92 | 87 | article_id = int(line[2]) |
93 | | - username = line[3].encode(settings.encoding) |
| 88 | + username = line[3].encode(rts.encoding) |
94 | 89 | ns = int(line[4]) |
95 | 90 | value = {'date': date, |
96 | 91 | 'article': article_id, |
— | — | @@ -101,37 +96,41 @@ |
102 | 97 | #print editor_cache.n |
103 | 98 | |
104 | 99 | |
105 | | -def launcher(source, dbname, collection): |
| 100 | +def launcher(rts): |
106 | 101 | ''' |
107 | 102 | This is the main entry point and creates a number of workers and launches |
108 | 103 | them. |
109 | 104 | ''' |
110 | | - mongo = db.init_mongo_db(dbname) |
111 | | - coll = mongo[collection] |
| 105 | + #rts.sorted, rts.dbname, rts.collection |
| 106 | + mongo = db.init_mongo_db(rts.dbname) |
| 107 | + coll = mongo[rts.editors_raw] |
112 | 108 | coll.ensure_index('editor') |
113 | 109 | coll.create_index('editor') |
114 | 110 | |
115 | | - files = file_utils.retrieve_file_list(source, 'csv') |
| 111 | + files = file_utils.retrieve_file_list(rts.sorted, 'csv') |
116 | 112 | |
117 | | - print 'Input directory is: %s ' % source |
| 113 | + print 'Input directory is: %s ' % rts.sorted |
118 | 114 | tasks = multiprocessing.JoinableQueue() |
119 | 115 | consumers = [multiprocessing.Process(target=store_editors, |
120 | | - args=(tasks, dbname, collection, source)) |
121 | | - for i in xrange(settings.number_of_processes)] |
| 116 | + args=(tasks, rts)) |
| 117 | + for i in xrange(rts.number_of_processes)] |
122 | 118 | |
123 | 119 | for filename in files: |
124 | 120 | tasks.put(filename) |
125 | 121 | |
126 | | - for x in xrange(settings.number_of_processes): |
| 122 | + for x in xrange(rts.number_of_processes): |
127 | 123 | tasks.put(None) |
128 | 124 | |
129 | 125 | for w in consumers: |
130 | 126 | w.start() |
131 | 127 | |
132 | 128 | tasks.join() |
| 129 | + store_articles(rts) |
133 | 130 | |
134 | 131 | |
135 | 132 | def debug(): |
136 | 133 | store_articles('wiki', 'cs') |
| 134 | + |
| 135 | + |
137 | 136 | if __name__ == '__main__': |
138 | 137 | debug() |
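
Editorial note: launcher() uses the toolkit's recurring fan-out idiom: fill a JoinableQueue with filenames, append one None sentinel per worker, and let each worker exit when it pops a sentinel. A stripped-down sketch of the idiom (the worker body is a stand-in, not the real store_editors logic):

    import multiprocessing

    def worker(tasks):
        while True:
            filename = tasks.get()
            tasks.task_done()
            if filename is None:  # sentinel: the queue is drained
                break
            print 'processing %s' % filename  # stand-in for store_editors

    tasks = multiprocessing.JoinableQueue()
    number_of_processes = 4  # rts.number_of_processes in the real code
    for filename in ['a.csv', 'b.csv', 'c.csv']:
        tasks.put(filename)
    for x in xrange(number_of_processes):
        tasks.put(None)
    consumers = [multiprocessing.Process(target=worker, args=(tasks,))
                 for x in xrange(number_of_processes)]
    for w in consumers:
        w.start()
    tasks.join()
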
Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -48,14 +48,14 @@ |
49 | 49 | widgets = log.init_progressbar_widgets(filename) |
50 | 50 | extension = file_utils.determine_file_extension(filename) |
51 | 51 | filemode = file_utils.determine_file_mode(extension) |
52 | | - filesize = http_utils.determine_remote_filesize(properties.settings.wp_dump_location, |
| 52 | + filesize = http_utils.determine_remote_filesize(properties.wp_dump_location, |
53 | 53 | properties.dump_relative_path, |
54 | 54 | filename) |
55 | 55 | |
56 | | - mod_date = http_utils.determine_modified_date(properties.settings.wp_dump_location, |
| 56 | + mod_date = http_utils.determine_modified_date(properties.wp_dump_location, |
57 | 57 | properties.dump_relative_path, |
58 | 58 | filename) |
59 | | - mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.settings.timestamp_server) |
| 59 | + mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.timestamp_server) |
60 | 60 | if file_utils.check_file_exists(properties.location, filename): |
61 | 61 | mod_loc = file_utils.get_modified_date(properties.location, filename) |
62 | 62 | if mod_loc == mod_date and (properties.force == False or properties.force == None): |
— | — | @@ -66,7 +66,7 @@ |
67 | 67 | fh = file_utils.create_txt_filehandle(properties.location, |
68 | 68 | filename, |
69 | 69 | filemode, |
70 | | - properties.settings.encoding) |
| 70 | + properties.encoding) |
71 | 71 | else: |
72 | 72 | fh = file_utils.create_binary_filehandle(properties.location, filename, 'wb') |
73 | 73 | |
— | — | @@ -100,18 +100,18 @@ |
101 | 101 | |
102 | 102 | |
103 | 103 | |
104 | | -def launcher(properties, settings, logger): |
| 104 | +def launcher(properties, logger): |
105 | 105 | print 'Creating list of files to be downloaded...' |
106 | | - tasks = http_utils.create_list_dumpfiles(properties.settings.wp_dump_location, |
| 106 | + tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location, |
107 | 107 | properties.dump_relative_path, |
108 | 108 | properties.dump_filename) |
109 | 109 | #print tasks.qsize() |
110 | 110 | #if tasks.qsize() < properties.settings.number_of_processes: |
111 | | - # properties.settings.number_of_processes = tasks.qsize() |
| 111 | + # properties.number_of_processes = tasks.qsize()
112 | 112 | if tasks.qsize() > 2: |
113 | 113 | consumers = [multiprocessing.Process(target=download_wiki_file, |
114 | 114 | args=(tasks, properties)) |
115 | | - for i in xrange(properties.settings.number_of_processes)] |
| 115 | + for i in xrange(properties.number_of_processes)] |
116 | 116 | else: consumers = [multiprocessing.Process(target=download_wiki_file, |
117 | 117 | args=(tasks, properties)) |
118 | 118 | for i in xrange(1)] |
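
Editorial note: flattening properties.settings into properties leaves the download-skip logic untouched: a dump is fetched only when the remote Last-Modified differs from the local copy or when force is set. Reduced to a predicate (a hypothetical helper, not part of the changeset):

    def needs_download(mod_local, mod_remote, force):
        # hypothetical helper: re-download when forced or when the remote dump changed
        return bool(force) or mod_local != mod_remote

    print needs_download('2011-01-07', '2011-01-07', False)  # False: local copy is current
    print needs_download('2011-01-01', '2011-01-07', False)  # True: remote is newer
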
Index: trunk/tools/editor_trends/etl/__init__.py |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -24,10 +24,6 @@ |
25 | 25 | import progressbar |
26 | 26 | from Queue import Empty |
27 | 27 | |
28 | | -sys.path.append('..') |
29 | | -import configuration |
30 | | -settings = configuration.Settings() |
31 | | - |
32 | 28 | import wikitree.parser |
33 | 29 | from bots import detector |
34 | 30 | from utils import file_utils |
— | — | @@ -44,8 +40,8 @@ |
45 | 41 | RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
46 | 42 | |
47 | 43 | |
48 | | -def remove_numeric_character_references(text): |
49 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(settings.encoding) |
| 44 | +def remove_numeric_character_references(rts, text): |
| 45 | + return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(rts.encoding) |
50 | 46 | |
51 | 47 | |
52 | 48 | def lenient_deccharref(m): |
— | — | @@ -75,9 +71,9 @@ |
76 | 72 | return ns |
77 | 73 | |
78 | 74 | |
79 | | -def parse_comments(revisions, function): |
| 75 | +def parse_comments(rts, revisions, function): |
80 | 76 | for revision in revisions: |
81 | | - comment = revision.find('{%s}comment' % settings.xml_namespace) |
| 77 | + comment = revision.find('{%s}comment' % rts.xml_namespace) |
82 | 78 | if comment != None and comment.text != None: |
83 | 79 | comment.text = function(comment.text) |
84 | 80 | return revisions |
— | — | @@ -101,12 +97,7 @@ |
102 | 98 | else: |
103 | 99 | return False |
104 | 100 | |
105 | | -# for namespace in namespaces: |
106 | | -# if title.startswith(namespace): |
107 | | -# return False |
108 | | -# return True |
109 | 101 | |
110 | | - |
111 | 102 | def validate_hostname(address): |
112 | 103 | ''' |
113 | 104 | This is not a foolproof solution at all. The problem is that it's really |
— | — | @@ -183,7 +174,7 @@ |
184 | 175 | return None |
185 | 176 | |
186 | 177 | |
187 | | -def output_editor_information(revisions, page, bots): |
| 178 | +def output_editor_information(revisions, page, bots, rts): |
188 | 179 | ''' |
189 | 180 | @elem is an XML element containing 1 revision from a page |
190 | 181 | @output is where to store the data, a filehandle |
— | — | @@ -237,6 +228,7 @@ |
238 | 229 | flat.append(f) |
239 | 230 | return flat |
240 | 231 | |
| 232 | + |
241 | 233 | def add_namespace_to_output(output, namespace): |
242 | 234 | for x, o in enumerate(output): |
243 | 235 | o.append(namespace['id']) |
— | — | @@ -244,13 +236,13 @@ |
245 | 237 | return output |
246 | 238 | |
247 | 239 | |
248 | | -def parse_dumpfile(tasks, project, language_code, filehandles, lock, namespaces=['0']): |
249 | | - bot_ids = detector.retrieve_bots(language_code) |
250 | | - location = os.path.join(settings.input_location, language_code, project) |
251 | | - output = os.path.join(settings.input_location, language_code, project, 'txt') |
| 240 | +def parse_dumpfile(tasks, rts, filehandles, lock): |
| 241 | + bot_ids = detector.retrieve_bots(rts.language.code) |
| 242 | + location = os.path.join(rts.input_location, rts.language.code, rts.project.name) |
| 243 | + output = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
252 | 244 | widgets = log.init_progressbar_widgets('Extracting data') |
253 | 245 | filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a', |
254 | | - settings.encoding) for fh in xrange(settings.max_filehandles)] |
| 246 | + rts.encoding) for fh in xrange(rts.max_filehandles)] |
255 | 247 | |
256 | 248 | while True: |
257 | 249 | total, processed = 0.0, 0.0 |
— | — | @@ -269,11 +261,11 @@ |
270 | 262 | filesize = file_utils.determine_filesize(location, filename) |
271 | 263 | print 'Opening %s...' % (os.path.join(location, filename)) |
272 | 264 | print 'Filesize: %s' % filesize |
273 | | - fh1 = file_utils.create_txt_filehandle(location, filename, 'r', settings.encoding) |
274 | | - fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', settings.encoding) |
| 265 | + fh1 = file_utils.create_txt_filehandle(location, filename, 'r', rts.encoding) |
| 266 | + fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', rts.encoding) |
275 | 267 | ns, xml_namespace = wikitree.parser.extract_meta_information(fh1) |
276 | | - ns = build_namespaces_locale(ns, namespaces) |
277 | | - settings.xml_namespace = xml_namespace |
| 268 | + ns = build_namespaces_locale(ns, rts.namespaces) |
| 269 | + rts.xml_namespace = xml_namespace |
278 | 270 | |
279 | 271 | pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start() |
280 | 272 | for page, article_size in wikitree.parser.read_input(fh1): |
— | — | @@ -281,14 +273,13 @@ |
282 | 274 | total += 1 |
283 | 275 | namespace = parse_article(title, ns) |
284 | 276 | if namespace != False: |
285 | | - #if verify_article_belongs_namespace(title, ns): |
286 | 277 | article_id = page.find('id').text |
287 | 278 | title = page.find('title').text |
288 | 279 | revisions = page.findall('revision') |
289 | | - revisions = parse_comments(revisions, remove_numeric_character_references) |
290 | | - output = output_editor_information(revisions, article_id, bot_ids) |
| 280 | + revisions = parse_comments(rts, revisions, remove_numeric_character_references) |
| 281 | + output = output_editor_information(revisions, article_id, bot_ids, rts) |
291 | 282 | output = add_namespace_to_output(output, namespace) |
292 | | - write_output(output, filehandles, lock) |
| 283 | + write_output(output, filehandles, lock, rts) |
293 | 284 | file_utils.write_list_to_csv([article_id, title], fh2) |
294 | 285 | processed += 1 |
295 | 286 | page.clear() |
— | — | @@ -317,14 +308,14 @@ |
318 | 309 | return d |
319 | 310 | |
320 | 311 | |
321 | | -def write_output(observations, filehandles, lock): |
| 312 | +def write_output(observations, filehandles, lock, rts): |
322 | 313 | observations = group_observations(observations) |
323 | 314 | for obs in observations: |
324 | 315 | lock.acquire() #lock the write around all edits of an editor for a particular page |
325 | 316 | try: |
326 | 317 | for i, o in enumerate(observations[obs]): |
327 | 318 | if i == 0: |
328 | | - fh = filehandles[hash(obs)] |
| 319 | + fh = filehandles[hash(rts, obs)] |
329 | 320 | file_utils.write_list_to_csv(o, fh) |
330 | 321 | |
331 | 322 | except Exception, error: |
— | — | @@ -333,16 +324,16 @@ |
334 | 325 | lock.release() |
335 | 326 | |
336 | 327 | |
337 | | -def hash(id): |
| 328 | +def hash(rts, id): |
338 | 329 | ''' |
339 | 330 | A very simple hash function based on modulo. The except clause has been |
340 | 331 | added because there are instances where the username is stored in the
341 | 332 | userid tag and hence is a string rather than an integer.
342 | 333 | ''' |
343 | 334 | try: |
344 | | - return int(id) % settings.max_filehandles |
| 335 | + return int(id) % rts.max_filehandles |
345 | 336 | except ValueError: |
346 | | - return sum([ord(i) for i in id]) % settings.max_filehandles |
| 337 | + return sum([ord(i) for i in id]) % rts.max_filehandles |
347 | 338 | |
348 | 339 | |
349 | 340 | def prepare(output): |
— | — | @@ -380,7 +371,8 @@ |
381 | 372 | print tasks.qsize() |
382 | 373 | return tasks |
383 | 374 | |
384 | | -def launcher(properties): |
| 375 | + |
| 376 | +def launcher(rts): |
385 | 377 | ''' |
386 | 378 | This is the main entry point for the extract phase of the data processing
387 | 379 | chain. First, it will put the files that need to be extracted in a queue
— | — | @@ -389,10 +381,10 @@ |
390 | 382 | the variables from the different dump files. |
391 | 383 | ''' |
392 | 384 | result = True |
393 | | - tasks = unzip(properties) |
| 385 | + tasks = unzip(rts) |
394 | 386 | |
395 | | - output = os.path.join(settings.input_location, properties.language.code, |
396 | | - properties.project.name, 'txt') |
| 387 | + output = os.path.join(rts.input_location, rts.language.code, |
| 388 | + rts.project.name, 'txt') |
397 | 389 | result = prepare(output) |
398 | 390 | if not result: |
399 | 391 | return result |
— | — | @@ -404,14 +396,12 @@ |
405 | 397 | filehandles = [] |
406 | 398 | consumers = [multiprocessing.Process(target=parse_dumpfile, |
407 | 399 | args=(tasks, |
408 | | - properties.project.name, |
409 | | - properties.language.code, |
| 400 | + rts, |
410 | 401 | filehandles, |
411 | | - lock, |
412 | | - properties.namespaces)) |
413 | | - for x in xrange(settings.number_of_processes)] |
| 402 | + lock)) |
| 403 | + for x in xrange(rts.number_of_processes)] |
414 | 404 | |
415 | | - for x in xrange(settings.number_of_processes): |
| 405 | + for x in xrange(rts.number_of_processes): |
416 | 406 | tasks.put(None) |
417 | 407 | |
418 | 408 | for w in consumers: |
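
Editorial note: hash() now receives rts explicitly, but its job is unchanged: bucket a contributor id onto one of max_filehandles CSV files so that all edits by one editor land in the same file. The same logic as a self-contained function, with max_filehandles as a plain argument instead of an rts attribute:

    def bucket(id, max_filehandles):
        # same logic as extracter.hash(), renamed for this sketch
        try:
            # regular numeric user ids: plain modulo
            return int(id) % max_filehandles
        except ValueError:
            # some entries carry a username in the id field; hash its characters
            return sum(ord(c) for c in id) % max_filehandles

    print bucket('12345', 100)    # 45
    print bucket('SomeBot', 100)  # a deterministic bucket for a string id
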
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -23,9 +23,6 @@ |
24 | 24 | import datetime |
25 | 25 | import sys |
26 | 26 | |
27 | | -sys.path.append('..') |
28 | | -import configuration |
29 | | -settings = configuration.Settings() |
30 | 27 | from database import db |
31 | 28 | from utils import file_utils |
32 | 29 | from utils import messages |
— | — | @@ -101,6 +98,7 @@ |
102 | 99 | 'username': username |
103 | 100 | }, safe=True) |
104 | 101 | |
| 102 | + |
105 | 103 | def determine_year_range(edits): |
106 | 104 | years = [year for year in edits if edits[year] != []] |
107 | 105 | first_year = int(min(years)) |
— | — | @@ -119,8 +117,6 @@ |
120 | 118 | return dc |
121 | 119 | |
122 | 120 | |
123 | | - |
124 | | - |
125 | 121 | def determine_edits_by_month(edits, first_year, final_year): |
126 | 122 | dc = shaper.create_datacontainer(first_year, final_year) |
127 | 123 | dc = shaper.add_months_to_datacontainer(dc, 0.0) |
— | — | @@ -161,17 +157,17 @@ |
162 | 158 | return sorted(edits, key=itemgetter('date')) |
163 | 159 | |
164 | 160 | |
165 | | -def transform_editors_multi_launcher(dbname, collection): |
166 | | - ids = db.retrieve_distinct_keys(dbname, collection, 'editor') |
167 | | - kwargs = {'definition': 'traditional', |
168 | | - 'pbar': True, |
169 | | - } |
| 161 | +def transform_editors_multi_launcher(rts): |
| 162 | + ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
| 163 | +# kwargs = {'definition': 'traditional', |
| 164 | +# 'pbar': True, |
| 165 | +# } |
170 | 166 | tasks = multiprocessing.JoinableQueue() |
171 | | - consumers = [EditorConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
| 167 | + consumers = [EditorConsumer(tasks, None) for i in xrange(rts.number_of_processes)] |
172 | 168 | |
173 | 169 | for id in ids: |
174 | | - tasks.put(Editor(dbname, collection, id)) |
175 | | - for x in xrange(settings.number_of_processes): |
| 170 | + tasks.put(Editor(rts.dbname, rts.editors_raw, id)) |
| 171 | + for x in xrange(rts.number_of_processes): |
176 | 172 | tasks.put(None) |
177 | 173 | |
178 | 174 | print messages.show(tasks.qsize) |
— | — | @@ -181,10 +177,10 @@ |
182 | 178 | tasks.join() |
183 | 179 | |
184 | 180 | |
185 | | -def setup_database(dbname, collection): |
186 | | - mongo = db.init_mongo_db(dbname) |
187 | | - input_db = mongo[collection] |
188 | | - output_db = mongo['%s_dataset' % collection] |
| 181 | +def setup_database(rts): |
| 182 | + mongo = db.init_mongo_db(rts.dbname) |
| 183 | + input_db = mongo[rts.editors_raw] |
| 184 | + output_db = mongo[rts.editors_dataset] |
189 | 185 | |
190 | 186 | output_db.ensure_index('editor') |
191 | 187 | output_db.create_index('editor') |
— | — | @@ -193,9 +189,9 @@ |
194 | 190 | return input_db, output_db |
195 | 191 | |
196 | 192 | |
197 | | -def transform_editors_single_launcher(dbname, collection): |
198 | | - ids = db.retrieve_distinct_keys(dbname, collection, 'editor') |
199 | | - input_db, output_db = setup_database(dbname, collection) |
| 193 | +def transform_editors_single_launcher(rts): |
| 194 | + ids = db.retrieve_distinct_keys(rts.dbname, rts.editors_raw, 'editor') |
| 195 | + input_db, output_db = setup_database(rts) |
200 | 196 | for x, id in enumerate(ids): |
201 | 197 | print '%s editors to go...' % (len(ids) - x) |
202 | 198 | editor = Editor(id, input_db, output_db) |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -24,14 +24,10 @@ |
25 | 25 | import multiprocessing |
26 | 26 | from Queue import Empty |
27 | 27 | |
28 | | -sys.path.append('..') |
29 | | -import configuration |
30 | | -settings = configuration.Settings() |
31 | | - |
32 | 28 | from utils import file_utils |
33 | 29 | from utils import messages |
34 | | -#import wikitree.parser |
35 | 30 | |
| 31 | + |
36 | 32 | def quick_sort(obs): |
37 | 33 | ''' |
38 | 34 | Quicksort is a sorting algorithm developed by C. A. R. Hoare that, on \ |
— | — | @@ -79,12 +75,15 @@ |
80 | 76 | |
81 | 77 | |
82 | 78 | |
83 | | -def merge_sorted_files(target, files, iteration): |
| 79 | +def merge_sorted_files(target, files, iteration, rts): |
84 | 80 | ''' |
85 | | - Merges smaller sorted files in one big file, no longer used. |
| 81 | + Merges smaller sorted files into one big file. Only used for creating
| 82 | + the data competition file.
86 | 83 | ''' |
87 | | - fh = file_utils.create_txt_filehandle(target, 'merged_%s.txt' % iteration, |
88 | | - 'w', settings.encoding) |
| 84 | + fh = file_utils.create_txt_filehandle(target, |
| 85 | + 'merged_%s.txt' % iteration, |
| 86 | + 'w', |
| 87 | + rts.encoding) |
89 | 88 | lines = 0 |
90 | 89 | for line in heapq.merge(*[readline(filename) for filename in files]): |
91 | 90 | file_utils.write_list_to_csv(line, fh) |
— | — | @@ -94,17 +93,19 @@ |
95 | 94 | return fh.name |
96 | 95 | |
97 | 96 | |
98 | | -def write_sorted_file(sorted_data, filename, target): |
| 97 | +def write_sorted_file(sorted_data, filename, rts): |
99 | 98 | ''' |
100 | 99 | Writes the sorted data to the rts.sorted directory
101 | 100 | ''' |
102 | | - fh = file_utils.create_txt_filehandle(target, filename, 'w', |
103 | | - settings.encoding) |
| 101 | + fh = file_utils.create_txt_filehandle(rts.sorted, |
| 102 | + filename, |
| 103 | + 'w', |
| 104 | + rts.encoding) |
104 | 105 | file_utils.write_list_to_csv(sorted_data, fh) |
105 | 106 | fh.close() |
106 | 107 | |
107 | 108 | |
108 | | -def mergesort_feeder(tasks, source, target): |
| 109 | +def mergesort_feeder(tasks, rts): |
109 | 110 | ''' |
110 | 111 | The feeder function is called by the launcher and gives it a task to |
111 | 112 | complete. |
— | — | @@ -118,10 +119,10 @@ |
119 | 120 | print tasks.qsize() |
120 | 121 | break |
121 | 122 | |
122 | | - fh = file_utils.create_txt_filehandle(source, |
123 | | - filename, |
124 | | - 'r', |
125 | | - settings.encoding) |
| 123 | + fh = file_utils.create_txt_filehandle(rts.txt, |
| 124 | + filename, |
| 125 | + 'r', |
| 126 | + rts.encoding) |
126 | 127 | #print fh |
127 | 128 | #data = fh.readlines() |
128 | 129 | data = file_utils.read_unicode_text(fh) |
— | — | @@ -129,7 +130,7 @@ |
130 | 131 | data = [d.strip() for d in data] |
131 | 132 | data = [d.split('\t') for d in data] |
132 | 133 | sorted_data = mergesort(data) |
133 | | - write_sorted_file(sorted_data, filename, target) |
| 134 | + write_sorted_file(sorted_data, filename, rts) |
134 | 135 | print filename, messages.show(tasks.qsize) |
135 | 136 | except UnicodeDecodeError, e: |
136 | 137 | print e |
— | — | @@ -137,19 +138,19 @@ |
138 | 139 | pass |
139 | 140 | |
140 | 141 | |
141 | | -def mergesort_launcher(source, target): |
142 | | - settings.verify_environment([source, target]) |
143 | | - files = file_utils.retrieve_file_list(source, 'csv') |
144 | | - #print files |
145 | | - print source |
| 142 | +def launcher(rts): |
| 143 | + ''' |
| 144 | + rts is an instance of RunTimeSettings |
| 145 | + ''' |
| 146 | + files = file_utils.retrieve_file_list(rts.txt, 'csv') |
146 | 147 | tasks = multiprocessing.JoinableQueue() |
147 | 148 | consumers = [multiprocessing.Process(target=mergesort_feeder, |
148 | | - args=(tasks, source, target)) |
149 | | - for x in xrange(settings.number_of_processes)] |
| 149 | + args=(tasks, rts)) |
| 150 | + for x in xrange(rts.number_of_processes)] |
150 | 151 | for filename in files: |
151 | 152 | tasks.put(filename) |
152 | 153 | |
153 | | - for x in xrange(settings.number_of_processes): |
| 154 | + for x in xrange(rts.number_of_processes): |
154 | 155 | tasks.put(None) |
155 | 156 | |
156 | 157 | for w in consumers: |
— | — | @@ -157,6 +158,7 @@ |
158 | 159 | |
159 | 160 | tasks.join() |
160 | 161 | |
| 162 | + |
161 | 163 | def debug(): |
162 | 164 | ''' |
163 | 165 | Simple test function |
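
Editorial note: merge_sorted_files rests on heapq.merge, which lazily interleaves already-sorted iterables in a k-way merge without loading them into memory. A minimal illustration of that building block:

    import heapq

    a = [1, 4, 7]
    b = [2, 5, 8]
    c = [3, 6, 9]
    print list(heapq.merge(a, b, c))  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
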
Index: trunk/tools/editor_trends/__init__.py |
— | — | @@ -1,14 +1,30 @@ |
2 | 2 | import os |
3 | 3 | import sys |
4 | 4 | |
5 | | -WORKING_DIRECTORY = os.getcwd()#[:-9] |
6 | | -IGNORE_DIRS = ['wikistats', 'zips'] |
| 5 | +from classes import singleton |
7 | 6 | |
8 | | -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if |
9 | | - os.path.isdir(os.path.join(WORKING_DIRECTORY, name))] |
| 7 | +class Path: |
| 8 | + __metaclass__ = singleton.Singleton |
10 | 9 | |
| 10 | + def __init__(self): |
| 11 | + self.cwd = self.determine_working_directory() |
| 12 | + self.update_python_path() |
11 | 13 | |
12 | | -for subdirname in dirs: |
13 | | - if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
14 | | - sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname)) |
15 | | - #print os.path.join(WORKING_DIRECTORY, subdirname) |
| 14 | + def determine_working_directory(self): |
| 15 | + cwd = os.getcwd() |
| 16 | + if not cwd.endswith('editor_trends%s' % os.sep): |
| 17 | + pos = cwd.find('editor_trends') + len('editor_trends') + 1 # keep the trailing separator
| 18 | + cwd = cwd[:pos] |
| 19 | + return cwd |
| 20 | + |
| 21 | + def update_python_path(self): |
| 22 | + IGNORE_DIRS = ['wikistats', 'zips', 'datasets', 'mapreduce', 'logs', |
| 23 | + 'statistics', 'js_scripts', 'deployment', |
| 24 | + 'documentation', 'data', 'code-snippets'] |
| 25 | + dirs = [name for name in os.listdir(self.cwd) if |
| 26 | + os.path.isdir(os.path.join(self.cwd, name))] |
| 27 | + for subdirname in dirs: |
| 28 | + if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS: |
| 29 | + sys.path.append(os.path.join(self.cwd, subdirname)) |
| 30 | + |
| 31 | +Path() |
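
Editorial note: classes/singleton is not included in this changeset; all Path requires is a metaclass that hands out one shared instance per class, so the sys.path manipulation runs only once no matter how often Path() is instantiated. A plausible, hypothetical implementation consistent with that usage:

    class Singleton(type):
        # hypothetical: the real classes/singleton module is not shown here
        def __init__(cls, name, bases, namespace):
            super(Singleton, cls).__init__(name, bases, namespace)
            cls._instance = None

        def __call__(cls, *args, **kwargs):
            # create on first call, hand back the cached instance afterwards
            if cls._instance is None:
                cls._instance = super(Singleton, cls).__call__(*args, **kwargs)
            return cls._instance
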
Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -75,9 +75,7 @@ |
76 | 76 | |
77 | 77 | self.architecture = platform.machine() |
78 | 78 | self.working_directory = self.determine_working_directory() |
79 | | - print sys.path |
80 | 79 | self.update_python_path() |
81 | | - print sys.path |
82 | 80 | |
83 | 81 | self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
84 | 82 | self.max_filehandles = self.determine_max_filehandles_open() |
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -29,14 +29,15 @@ |
30 | 30 | import datetime |
31 | 31 | import time |
32 | 32 | import re |
33 | | -sys.path.append('..') |
| 33 | +#sys.path.append('..') |
34 | 34 | |
| 35 | +from settings import Settings |
35 | 36 | from utils import text_utils |
36 | 37 | from utils import ordered_dict as odict |
37 | 38 | from classes import languages |
38 | 39 | |
39 | 40 | |
40 | | -class RunTimeSettings: |
| 41 | +class RunTimeSettings(Settings): |
41 | 42 | ''' |
42 | 43 | This class keeps track of the commands issued by the user and is used to |
43 | 44 | feed the different etl functions. Difference with configuration class is |
— | — | @@ -44,25 +45,26 @@ |
45 | 46 | same for a user while these settings can change depending on the kind of |
46 | 47 | analysis requested. |
47 | 48 | ''' |
48 | | - def __init__(self, project, language, settings, args=None): |
| 49 | + def __init__(self, project, language, args=None): |
| 50 | + Settings.__init__(self) |
49 | 51 | self.project = project |
50 | 52 | self.language = language |
51 | | - self.settings = settings |
| 53 | + self.dbname = 'wikilytics' |
52 | 54 | |
53 | 55 | if args: |
54 | 56 | self.args = args |
55 | 57 | self.hash = self.secs_since_epoch() |
56 | | - print self.settings.input_location |
57 | | - print self.get_value('location') |
58 | | - self.base_location = self.settings.input_location if \ |
59 | | - self.settings.input_location != None else self.get_value('location') |
| 58 | + #print self.settings.input_location |
| 59 | + #print self.get_value('location') |
| 60 | + self.input_location = self.input_location if \ |
| 61 | + self.input_location != None else self.get_value('location') |
60 | 62 | self.project = self.update_project_settings() |
61 | 63 | self.language = self.update_language_settings() |
62 | | - self.dbname = '%s%s' % (self.language.code, self.project.name) |
| 64 | + #self.dbname = '%s%s' % (self.language.code, self.project.name) |
63 | 65 | self.targets = self.split_keywords(self.get_value('charts')) |
64 | 66 | self.keywords = self.split_keywords(self.get_value('keywords')) |
65 | 67 | self.function = self.get_value('func') |
66 | | - self.collection = self.get_value('collection') |
| 68 | + |
67 | 69 | self.ignore = self.get_value('except') |
68 | 70 | self.clean = self.get_value('new') |
69 | 71 | self.force = self.get_value('force') |
— | — | @@ -70,9 +72,9 @@ |
71 | 73 | self.filename = self.generate_wikidump_filename() |
72 | 74 | self.namespaces = self.get_namespaces() |
73 | 75 | |
74 | | - self.dataset = os.path.join(settings.dataset_location, |
| 76 | + self.dataset = os.path.join(self.dataset_location, |
75 | 77 | self.project.name) |
76 | | - self.charts = os.path.join(settings.chart_location, |
| 78 | + self.charts = os.path.join(self.chart_location, |
77 | 79 | self.project.name) |
78 | 80 | |
79 | 81 | self.txt = os.path.join(self.location, 'txt') |
— | — | @@ -86,8 +88,11 @@ |
87 | 89 | self.dump_filename = self.generate_wikidump_filename() |
88 | 90 | self.dump_relative_path = self.set_dump_path() |
89 | 91 | self.dump_absolute_path = self.set_dump_path(absolute=True) |
90 | | - print self.directories |
91 | | - settings.verify_environment(self.directories) |
| 92 | + self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
| 93 | + self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
| 94 | + self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
| 95 | + self.analyzer_collection = self.get_value('collection') |
| 96 | + self.verify_environment(self.directories) |
92 | 97 | |
93 | 98 | def __str__(self): |
94 | 99 | return 'Runtime Settings for project %s%s' % (self.language.name, |
— | — | @@ -126,7 +131,7 @@ |
127 | 132 | ''' |
128 | 133 | Construct the full project location |
129 | 134 | ''' |
130 | | - return os.path.join(self.base_location, self.language.code, self.project.name) |
| 135 | + return os.path.join(self.input_location, self.language.code, self.project.name) |
131 | 136 | |
132 | 137 | def show_settings(self): |
133 | 138 | ''' |
— | — | @@ -141,7 +146,7 @@ |
142 | 147 | max_length_key = max([len(key) for key in about.keys()]) |
143 | 148 | print 'Final settings after parsing command line arguments:' |
144 | 149 | for ab in about: |
145 | | - print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.settings.encoding)) |
| 150 | + print '%s: %s' % (ab.rjust(max_length_key), about[ab].encode(self.encoding)) |
146 | 151 | |
147 | 152 | |
148 | 153 | def get_value(self, key): |
— | — | @@ -152,7 +157,7 @@ |
153 | 158 | |
154 | 159 | def set_dump_path(self, absolute=False): |
155 | 160 | if absolute: |
156 | | - return '%s/%s%s/latest/' % (self.settings.wp_dump_location, self.language.code, self.project.name) |
| 161 | + return '%s/%s%s/latest/' % (self.wp_dump_location, self.language.code, self.project.name) |
157 | 162 | else: |
158 | 163 | return '/%s%s/latest/' % (self.language.code, self.project.name) |
159 | 164 | |
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -34,7 +34,7 @@ |
35 | 35 | from utils import file_utils |
36 | 36 | from utils import data_converter |
37 | 37 | from database import db |
38 | | -import json_encoders |
| 38 | +from analyses import json_encoders |
39 | 39 | |
40 | 40 | class Transform(SONManipulator): |
41 | 41 | ''' |
Index: trunk/tools/editor_trends/classes/languages.py |
— | — | @@ -31,20 +31,20 @@ |
32 | 32 | def __repr__(self): |
33 | 33 | return u'%s - %s' % (self.code, self.name) |
34 | 34 | |
35 | | - def show_languages(self, settings, project, startswith=None): |
| 35 | + def show_languages(self, project, startswith=None): |
36 | 36 | if startswith != None: |
37 | 37 | startswith = startswith.title() |
38 | 38 | project.valid_languages.sort() |
39 | 39 | for language in project.valid_languages: |
40 | 40 | try: |
41 | 41 | if startswith != None and language.startswith(startswith):
42 | | - print '%s' % language.decode(settings.encoding) |
| 42 | + print '%s' % language.decode('utf-8') |
43 | 43 | elif startswith == None: |
44 | | - print '%s' % language.decode(settings.encoding) |
| 44 | + print '%s' % language.decode('utf-8') |
45 | 45 | except UnicodeEncodeError: |
46 | 46 | print '%s' % language |
47 | | - |
48 | 47 | |
| 48 | + |
49 | 49 | class LanguageContainer: |
50 | 50 | def __init__(self): |
51 | 51 | self.init_languages = odict.OrderedDict([ |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -90,7 +90,9 @@ |
91 | 91 | |
92 | 92 | self.architecture = platform.machine() |
93 | 93 | self.working_directory = self.determine_working_directory() |
| 94 | + print sys.path |
94 | 95 | self.update_python_path() |
| 96 | + print sys.path |
95 | 97 | |
96 | 98 | self.root = os.path.expanduser('~') if self.platform != 'Windows' else 'c:\\' |
97 | 99 | self.max_filehandles = self.determine_max_filehandles_open() |
Index: trunk/tools/editor_trends/utils/__init__.py |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -22,10 +22,12 @@ |
23 | 23 | import os |
24 | 24 | sys.path.append('..') |
25 | 25 | |
26 | | -import configuration |
27 | | -settings = configuration.Settings() |
| 26 | +#import configuration |
| 27 | +#settings = configuration.Settings() |
| 28 | +from classes import settings |
| 29 | +settings = settings.Settings() |
| 30 | +from classes import exceptions |
28 | 31 | import file_utils |
29 | | -from classes import exceptions |
30 | 32 | import timer |
31 | 33 | import log |
32 | 34 | |
— | — | @@ -128,6 +130,7 @@ |
129 | 131 | self.name = p |
130 | 132 | self.program_installed = path |
131 | 133 | |
| 134 | + |
132 | 135 | def launch_zip_extractor(location, filename, properties): |
133 | 136 | ''' |
134 | 137 | |
— | — | @@ -141,6 +144,7 @@ |
142 | 145 | log.log_to_mongo(properties, 'dataset', 'unpack', stopwatch, event='finish') |
143 | 146 | return retcode |
144 | 147 | |
| 148 | + |
145 | 149 | if __name__ == '__main__': |
146 | | - c = Compressor('C:\Users\diederik.vanliere\Documents', 'django.zip') |
| 150 | + c = Compressor('C:\Users\diederik.vanliere\Documents', 'test.zip') |
147 | 151 | c.extract() |
Index: trunk/tools/editor_trends/utils/log.py |
— | — | @@ -27,10 +27,10 @@ |
28 | 28 | |
29 | 29 | from database import db |
30 | 30 | |
31 | | -def log_to_mongo(properties, jobtype, task, timer, event='start'): |
32 | | - conn = db.init_mongo_db('wikilytics') |
| 31 | +def log_to_mongo(rts, jobtype, task, timer, event='start'): |
| 32 | + conn = db.init_mongo_db(rts.dbname) |
33 | 33 | created = datetime.datetime.now() |
34 | | - hash = '%s_%s' % (properties.project, properties.hash) |
| 34 | + hash = '%s_%s' % (rts.project, rts.hash) |
35 | 35 | coll = conn['jobs'] |
36 | 36 | |
37 | 37 | job = coll.find_one({'hash': hash}) |
— | — | @@ -38,8 +38,8 @@ |
39 | 39 | if job == None: |
40 | 40 | if jobtype == 'dataset': |
41 | 41 | _id = coll.save({'hash': hash, 'created': created, 'finished': False, |
42 | | - 'language_code': properties.language.code, |
43 | | - 'project': properties.project.name, |
| 42 | + 'language_code': rts.language.code, |
| 43 | + 'project': rts.project.name, |
44 | 44 | 'in_progress': True, 'jobtype': jobtype, |
45 | 45 | 'tasks': {}}) |
46 | 46 | |
— | — | @@ -47,8 +47,8 @@ |
48 | 48 | elif jobtype == 'chart': |
49 | 49 | _id = coll.save({'hash': hash, 'created': created, |
50 | 50 | 'jobtype': jobtype, |
51 | | - 'project': properties.project, |
52 | | - 'language_code': properties.language_code, |
| 51 | + 'project': rts.project, |
| 52 | + 'language_code': rts.language_code, |
53 | 53 | 'tasks': {}}) |
54 | 54 | |
55 | 55 | job = coll.find_one({'_id': _id}) |
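
Editorial note: log_to_mongo now writes into rts.dbname ('wikilytics' by default, see runtime_settings.py), so job progress can be inspected with a few lines of pymongo. A sketch assuming a local MongoDB and the pymongo 1.x API this codebase targets:

    import pymongo

    conn = pymongo.Connection()  # assumption: pre-2.x pymongo connection class
    jobs = conn['wikilytics']['jobs']
    for job in jobs.find({'finished': False}):
        print job['project'], job['jobtype'], job['created']
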
Index: trunk/tools/editor_trends/bots/__init__.py |
— | — | @@ -0,0 +1 @@ |
| 2 | + |
Index: trunk/tools/editor_trends/code-snippets/__init__.py |
— | — | @@ -0,0 +1,9 @@ |
| 2 | +import os
| 3 | +import sys
| 4 | +
| 5 | +cwd = os.getcwd()
| 6 | +pos = cwd.rfind(os.sep)
| 7 | +sys.path.append(cwd[:pos]) # make the parent editor_trends package importable
| 8 | +
| 9 | +from __init__ import Path
| 10 | +Path()