Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -18,6 +18,7 @@ |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | 21 | import os |
| 22 | +import logging |
22 | 23 | import sys |
23 | 24 | import datetime |
24 | 25 | from argparse import ArgumentParser |
— | — | @@ -32,6 +33,7 @@ |
33 | 34 | import languages |
34 | 35 | from utils import utils |
35 | 36 | from utils import dump_downloader |
| 37 | +from utils import compression |
36 | 38 | from etl import chunker |
37 | 39 | from etl import extract |
38 | 40 | from etl import loader |
— | — | @@ -65,12 +67,12 @@ |
66 | 68 | return language_code.split('_')[0] |
67 | 69 | |
68 | 70 | |
69 | | -def retrieve_projectname(args): |
70 | | - language_code = retrieve_language(args) |
| 71 | +def get_projectname(args): |
| 72 | + language_code = get_language(args) |
71 | 73 | if language_code == None: |
72 | 74 | print 'Entered language: %s is not a valid Wikimedia language' % get_value(args, 'language') |
73 | 75 | sys.exit(-1) |
74 | | - project = retrieve_project(args) |
| 76 | + project = get_project(args) |
75 | 77 | |
76 | 78 | if project == None: |
77 | 79 | print 'Entered project: %s is not a valid Wikimedia Foundation project.' % get_value(args, 'project') |
— | — | @@ -81,59 +83,87 @@ |
82 | 84 | return '%s%s' % (language_code, project) |
83 | 85 | |
84 | 86 | |
85 | | -def retrieve_language(args): |
| 87 | +def get_language(args): |
86 | 88 | language = get_value(args, 'language') |
87 | 89 | language = language.title() |
88 | 90 | return languages.MAPPING.get(language, 'en') |
89 | 91 | |
90 | 92 | |
91 | | -def retrieve_project(args): |
| 93 | +def get_namespaces(args): |
| 94 | + namespaces = get_value(args, 'namespace') |
| 95 | + if namespaces != None: |
| 96 | + return namespaces.split(',') |
| 97 | + else: |
| 98 | + return namespaces |
| 99 | + |
| 100 | +def write_message_to_log(logger, args, message=None, verb=None, **kwargs): |
| 101 | + function = get_value(args, 'func') |
| 102 | + logger.debug('Starting %s task' % function.func_name) |
| 103 | + if message: |
| 104 | + logger.debug(message) |
| 105 | + for kw in kwargs: |
| 106 | + if verb: |
| 107 | + logger.debug('Action: %s\tSetting: %s' % (verb, kwargs[kw])) |
| 108 | + else: |
| 109 | + logger.debug('Key: %s\tSetting: %s' % (kw, kwargs[kw])) |
| 110 | + |
| 111 | + |
| 112 | + |
| 113 | +def get_project(args): |
92 | 114 | project = get_value(args, 'project') |
93 | 115 | if project != 'wiki': |
94 | 116 | project = settings.projects.get(project, None) |
95 | 117 | return project |
96 | 118 | |
97 | 119 | |
98 | | -def generate_wikidump_filename(project, args): |
99 | | - return '%s-%s-%s' % (project, 'latest', get_value(args, 'file')) |
| 120 | +def generate_wikidump_filename(language_code, project, args): |
| 121 | + return '%s%s-%s-%s' % (language_code, project, 'latest', get_value(args, 'file')) |
100 | 122 | |
101 | 123 | |
102 | | -def determine_file_locations(args): |
103 | | - locations = {} |
| 124 | +def determine_file_locations(args, logger): |
| 125 | + config = {} |
104 | 126 | location = get_value(args, 'location') if get_value(args, 'location') != None else settings.input_location |
105 | | - project = retrieve_project(args) |
106 | | - language_code = retrieve_language(args) |
107 | | - locations['language_code'] = language_code |
108 | | - locations['language'] = get_value(args, 'language') |
109 | | - locations['location'] = os.path.join(location, language_code, project) |
110 | | - locations['chunks'] = os.path.join(locations['location'], 'chunks') |
111 | | - locations['txt'] = os.path.join(locations['location'], 'txt') |
112 | | - locations['sorted'] = os.path.join(locations['location'], 'sorted') |
113 | | - locations['dbready'] = os.path.join(locations['location'], 'dbready') |
114 | | - locations['project'] = project |
115 | | - locations['full_project'] = retrieve_projectname(args) |
116 | | - locations['filename'] = generate_wikidump_filename(project, args) |
117 | | - locations['collection'] = get_value(args, 'collection') |
118 | | - locations['directories'] = [locations['chunks'], locations['location'], locations['txt'], locations['sorted'], locations['dbready']] |
119 | | - return locations |
| 127 | + project = get_project(args) |
| 128 | + language_code = get_language(args) |
| 129 | + config['language_code'] = language_code |
| 130 | + config['language'] = get_value(args, 'language') |
| 131 | + config['location'] = os.path.join(location, language_code, project) |
| 132 | + config['chunks'] = os.path.join(config['location'], 'chunks') |
| 133 | + config['txt'] = os.path.join(config['location'], 'txt') |
| 134 | + config['sorted'] = os.path.join(config['location'], 'sorted') |
| 135 | + config['dbready'] = os.path.join(config['location'], 'dbready') |
| 136 | + config['project'] = project |
| 137 | + config['full_project'] = get_projectname(args) |
| 138 | + config['filename'] = generate_wikidump_filename(language_code, project, args) |
| 139 | + config['collection'] = get_value(args, 'collection') |
| 140 | + config['namespaces'] = get_namespaces(args) |
| 141 | + config['directories'] = [config['location'], config['chunks'], config['txt'], config['sorted'], config['dbready']] |
120 | 142 | |
| 143 | + message = 'Settings as generated from the configuration module.' |
| 144 | + write_message_to_log(logger, args, message, None, **config) |
| 145 | + #for c in config: |
| 146 | + # logger.debug('Key: %s - Setting: %s' % (c, config[c])) |
| 147 | + return config |
121 | 148 | |
122 | | -def show_settings(args, **kwargs): |
123 | | - project = settings.projects.get(kwargs.pop('project'), 'wiki') |
| 149 | + |
| 150 | +def show_settings(args, logger, **kwargs): |
| 151 | + language_map = languages.language_map() |
| 152 | + language = kwargs.pop('language') |
124 | 153 | language_code = kwargs.pop('language_code') |
125 | | - language = kwargs.pop('language') |
126 | | - location = kwargs.pop('location') |
127 | | - project = project.title() |
128 | | - language_map = languages.language_map() |
129 | | - print 'Project: %s' % (project) |
130 | | - print 'Language: %s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding)) |
131 | | - print 'Input directory: %s' % location |
132 | | - print 'Output directory: %s and subdirectories' % location |
| 154 | + config = {} |
| 155 | + config['Project'] = settings.projects.get(kwargs.pop('project'), 'wiki').title() |
| 156 | + config['Language'] = '%s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding)) |
| 157 | + config['Input directory'] = kwargs.get('location') |
| 158 | + config['Output directory'] = '%s and subdirectories' % kwargs.get('location') |
133 | 159 | |
| 160 | + message = 'Final settings after parsing command line arguments:' |
| 161 | + write_message_to_log(logger, args, message, None, **config) |
134 | 162 | |
135 | | -def dump_downloader_launcher(args, **kwargs): |
| 163 | + |
| 164 | +def dump_downloader_launcher(args, logger, **kwargs): |
136 | 165 | print 'dump downloader' |
137 | 166 | timer = Timer() |
| 167 | + write_message_to_log(logger, args, **kwargs) |
138 | 168 | filename = kwargs.get('filename') |
139 | 169 | extension = kwargs.get('extension') |
140 | 170 | location = kwargs.get('location') |
— | — | @@ -146,37 +176,42 @@ |
147 | 177 | timer.elapsed() |
148 | 178 | |
149 | 179 | |
150 | | -def chunker_launcher(args, **kwargs): |
| 180 | +def chunker_launcher(args, logger, **kwargs): |
151 | 181 | print 'split_settings.input_filename_launcher' |
152 | 182 | timer = Timer() |
| 183 | + write_message_to_log(logger, args, **kwargs) |
153 | 184 | filename = kwargs.pop('filename') |
154 | | - filename = 'en-latest-pages-meta-history.xml.bz2' |
155 | 185 | location = kwargs.pop('location') |
156 | 186 | project = kwargs.pop('project') |
157 | 187 | language = kwargs.pop('language') |
158 | 188 | language_code = kwargs.pop('language_code') |
| 189 | + namespaces = kwargs.pop('namespaces') |
| 190 | + |
159 | 191 | ext = utils.determine_file_extension(filename) |
160 | | - if ext in settings.compression.keys(): |
161 | | - file = filename.replace('.'+ ext, '') |
| 192 | + file = filename.replace('.' + ext, '') |
162 | 193 | result = utils.check_file_exists(location, file) |
163 | 194 | if not result: |
164 | | - retcode = launch_zip_extractor(args, location, filename, ext) |
| 195 | + retcode = launch_zip_extractor(args, logger, location, filename) |
165 | 196 | else: |
166 | 197 | retcode = 0 |
167 | 198 | if retcode != 0: |
168 | 199 | sys.exit(retcode) |
169 | | - chunker.split_file(location, file, project, language_code, language) |
| 200 | + |
| 201 | + chunker.split_file(location, file, project, language_code, namespaces, format='xml', zip=False) |
170 | 202 | timer.elapsed() |
171 | 203 | |
172 | 204 | |
173 | | -def launch_zip_extractor(args, location, file, ext): |
| 205 | +def launch_zip_extractor(args, logger, location, file): |
174 | 206 | timer = Timer() |
175 | | - utils.zip_extract(location, file, ext) |
| 207 | + write_message_to_log(logger, args, location=location, file=file) |
| 208 | + compressor = compression.Compressor(location, file) |
| 209 | + compressor.extract() |
176 | 210 | timer.elapsed() |
177 | 211 | |
178 | 212 | |
179 | | -def extract_launcher(args, **kwargs): |
| 213 | +def extract_launcher(args, logger, **kwargs): |
180 | 214 | timer = Timer() |
| 215 | + write_message_to_log(logger, args, **kwargs) |
181 | 216 | location = kwargs.pop('location') |
182 | 217 | language_code = kwargs.pop('language_code') |
183 | 218 | project = kwargs.pop('project') |
— | — | @@ -184,8 +219,9 @@ |
185 | 220 | timer.elapsed() |
186 | 221 | |
187 | 222 | |
188 | | -def sort_launcher(args, **kwargs): |
| 223 | +def sort_launcher(args, logger, **kwargs): |
189 | 224 | timer = Timer() |
| 225 | + write_message_to_log(logger, args, **kwargs) |
190 | 226 | location = kwargs.pop('location') |
191 | 227 | input = os.path.join(location, 'txt') |
192 | 228 | output = os.path.join(location, 'sorted') |
— | — | @@ -196,8 +232,9 @@ |
197 | 233 | timer.elapsed() |
198 | 234 | |
199 | 235 | |
200 | | -def store_launcher(args, **kwargs): |
| 236 | +def store_launcher(args, logger, **kwargs): |
201 | 237 | timer = Timer() |
| 238 | + write_message_to_log(logger, args, **kwargs) |
202 | 239 | location = kwargs.pop('location') |
203 | 240 | input = os.path.join(location, 'dbready') |
204 | 241 | dbname = kwargs.pop('full_project') |
— | — | @@ -206,37 +243,47 @@ |
207 | 244 | timer.elapsed() |
208 | 245 | |
209 | 246 | |
210 | | -def transformer_launcher(args, **kwargs): |
| 247 | +def transformer_launcher(args, logger, **kwargs): |
211 | 248 | print 'dataset launcher' |
212 | 249 | timer = Timer() |
| 250 | + write_message_to_log(logger, args, **kwargs) |
213 | 251 | project = kwargs.pop('full_project') |
214 | 252 | collection = kwargs.pop('collection') |
215 | 253 | transformer.run_optimize_editors(project, collection) |
216 | 254 | timer.elapsed() |
217 | 255 | |
218 | 256 | |
219 | | -def exporter_launcher(args, **kwargs): |
| 257 | +def exporter_launcher(args, logger, **kwargs): |
220 | 258 | timer = Timer() |
| 259 | + write_message_to_log(logger, args, **kwargs) |
221 | 260 | project = kwargs.pop('full_project') |
222 | 261 | exporter.generate_editor_dataset_launcher(project) |
223 | 262 | timer.elapsed() |
224 | 263 | |
225 | 264 | |
226 | | -def all_launcher(args, **kwargs): |
| 265 | +def all_launcher(args, logger, **kwargs): |
227 | 266 | print 'all_launcher' |
228 | 267 | timer = Timer() |
| 268 | + message = 'Starting all tasks.' |
| 269 | + write_message_to_log(logger, args, message, **kwargs) |
229 | 270 | ignore = get_value(args, 'except') |
| 271 | + clean = get_value(args, 'new') |
| 272 | + if clean: |
| 273 | + dirs = kwargs.get('directories')[1:] |
| 274 | + for dir in dirs: |
| 275 | + write_message_to_log(logger, args, verb='Deleting', dir=dir) |
| 276 | + utils.delete_file(dir, '') |
230 | 277 | functions = {dump_downloader_launcher: 'download', |
231 | 278 | chunker_launcher: 'split', |
232 | 279 | extract_launcher: 'extract', |
233 | 280 | sort_launcher: 'sort', |
234 | | - transformer_launcher: 'transform', |
| 281 | + transformer_launcher: 'transform', |
235 | 282 | exporter_launcher: 'export' |
236 | 283 | } |
237 | 284 | for function, callname in functions.iteritems(): |
238 | 285 | if callname not in ignore: |
238 | | - function(args, **kwargs) |
| 285 | + function(args, logger, **kwargs) |
240 | | - |
| 287 | + |
241 | 288 | timer.elapsed() |
242 | 289 | |
243 | 290 | |
— | — | @@ -265,8 +312,9 @@ |
266 | 313 | print '%s' % language |
267 | 314 | |
268 | 315 | |
269 | | -def detect_python_version(): |
| 316 | +def detect_python_version(logger): |
270 | 317 | version = sys.version_info[0:2] |
| 318 | + logger.debug('Python version: %s' % '.'.join(str(v) for v in version)) |
271 | 319 | if version < settings.minimum_python_version: |
272 | 320 | raise Exception('Please upgrade to Python 2.6 or higher (but not Python 3.x).') |
273 | 321 | |
— | — | @@ -279,6 +327,9 @@ |
280 | 328 | |
281 | 329 | |
282 | 330 | def main(): |
| 331 | + logger = logging.getLogger('manager') |
| 332 | + logger.setLevel(logging.DEBUG) |
| 333 | + logging.basicConfig() # attach a default handler; without one the debug output is never emitted |
| 333 | + |
283 | 334 | default_language = determine_default_language() |
284 | 335 | file_choices = ('stub-meta-history.xml.gz', |
285 | 336 | 'stub-meta-current.xml.gz', |
— | — | @@ -303,6 +354,7 @@ |
304 | 355 | parser_download.set_defaults(func=dump_downloader_launcher) |
305 | 356 | |
306 | 357 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
| 358 | + |
307 | 359 | parser_split.set_defaults(func=chunker_launcher) |
308 | 360 | |
309 | 361 | parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') |
— | — | @@ -317,7 +369,7 @@ |
318 | 370 | help='Name of MongoDB collection', |
319 | 371 | default='editors') |
320 | 372 | |
321 | | - parser_transform = subparsers.add_parser('transform', help='Transform the raw datatabe to an enriched dataset that can be exported.') |
| 373 | + parser_transform = subparsers.add_parser('transform', help='Transform the raw datatable to an enriched dataset that can be exported.') |
322 | 374 | parser_transform.set_defaults(func=transformer_launcher) |
323 | 375 | parser_transform.add_argument('-c', '--collection', action='store', |
324 | 376 | help='Name of MongoDB collection', |
— | — | @@ -331,8 +383,11 @@ |
332 | 384 | parser_all.add_argument('-e', '--except', action='store', |
333 | 385 | help='Should be a list of functions that are to be ignored when executing \'all\'.', |
334 | 386 | default=[]) |
335 | | - |
| 387 | + parser_all.add_argument('-n', '--new', action='store_true', |
| 388 | + help='This will delete all previous output and start from scratch. Mostly useful for debugging purposes.', |
| 389 | + default=False) |
336 | 390 | |
| 391 | + |
337 | 392 | parser.add_argument('-l', '--language', action='store', |
338 | 393 | help='Example of valid languages.', |
339 | 394 | choices=supported_languages(), |
— | — | @@ -346,25 +401,29 @@ |
347 | 402 | parser.add_argument('-o', '--location', action='store', |
348 | 403 | help='Indicate where you want to store the downloaded file.', |
349 | 404 | ) |
350 | | - #default=settings.input_location) |
351 | 405 | |
| 406 | + parser.add_argument('-n', '--namespace', action='store', |
| 407 | + help='A list of namespaces to include for analysis.', |
| 408 | + default='0') |
| 409 | + |
| 410 | + |
352 | 411 | parser.add_argument('-f', '--file', action='store', |
353 | 412 | choices=file_choices, |
354 | 413 | help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]), |
355 | | - default='stub-meta-current.xml.gz') |
| 414 | + default='stub-meta-history.xml.gz') |
356 | 415 | |
357 | 416 | parser.add_argument('-prog', '--progress', action='store_true', default=True, |
358 | 417 | help='Indicate whether you want to have a progressbar.') |
359 | 418 | |
360 | | - detect_python_version() |
| 419 | + detect_python_version(logger) |
361 | 420 | about() |
362 | 421 | args = parser.parse_args() |
363 | 422 | config.create_configuration(settings, args) |
364 | | - locations = determine_file_locations(args) |
| 423 | + locations = determine_file_locations(args, logger) |
365 | 424 | settings.verify_environment(locations['directories']) |
366 | | - show_settings(args, **locations) |
| 425 | + show_settings(args, logger, **locations) |
367 | 426 | #locations['settings'] = settings |
368 | | - args.func(args, **locations) |
| 427 | + args.func(args, logger, **locations) |
369 | 428 | t1 = datetime.datetime.now() |
370 | 429 | |
371 | 430 | |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -21,7 +21,7 @@ |
22 | 22 | import datetime |
23 | 23 | from dateutil.relativedelta import * |
24 | 24 | import calendar |
25 | | -from multiprocessing import Queue |
| 25 | +import multiprocessing |
26 | 26 | from Queue import Empty |
27 | 27 | |
28 | 28 | |
— | — | @@ -32,8 +32,6 @@ |
33 | 33 | from utils import models, utils |
34 | 34 | from database import db |
35 | 35 | from etl import shaper |
36 | | -from utils import process_constructor as pc |
37 | | -import progressbar |
38 | 36 | |
39 | 37 | try: |
40 | 38 | import psyco |
— | — | @@ -181,10 +179,10 @@ |
182 | 180 | return headers |
183 | 181 | |
184 | 182 | |
185 | | -def generate_long_editor_dataset(input_queue, vars, **kwargs): |
186 | | - dbname = kwargs.pop('dbname') |
| 183 | +def generate_long_editor_dataset(tasks, dbname, collection, **kwargs): |
187 | 184 | mongo = db.init_mongo_db(dbname) |
188 | | - editors = mongo['dataset'] |
| 185 | + vars = ['monthly_edits'] |
| 186 | + editors = mongo[collection + 'dataset'] |
189 | 187 | name = dbname + '_long_editors.csv' |
190 | 188 | #fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding) |
191 | 189 | vars_to_expand = [] |
— | — | @@ -202,11 +200,9 @@ |
203 | 201 | ld.write_longitudinal_data() |
204 | 202 | |
205 | 203 | |
206 | | -def generate_cohort_analysis(input_queue, **kwargs): |
207 | | - dbname = kwargs.get('dbname') |
208 | | - pbar = kwargs.get('pbar') |
| 204 | +def generate_cohort_dataset(tasks, dbname, collection, **kwargs): |
209 | 205 | mongo = db.init_mongo_db(dbname) |
210 | | - editors = mongo['dataset'] |
| 206 | + editors = mongo[collection + 'dataset'] |
211 | 207 | year = datetime.datetime.now().year + 1 |
212 | 208 | begin = year - 2001 |
213 | 209 | p = [3, 6, 9] |
— | — | @@ -215,7 +211,7 @@ |
216 | 212 | data = {} |
217 | 213 | while True: |
218 | 214 | try: |
219 | | - id = input_queue.get(block=False) |
| 215 | + id = tasks.get(block=False) |
220 | 216 | obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
221 | 217 | first_edit = obs['first_edit'] |
222 | 218 | last_edit = obs['final_edit'] |
— | — | @@ -244,7 +240,6 @@ |
245 | 241 | p = min(edits) |
246 | 242 | data[y]['n'] += 1 |
247 | 243 | data[y][p] += 1 |
248 | | - #pbar.update(+1) |
249 | 244 | except Empty: |
250 | 245 | break |
251 | 246 | utils.store_object(data, settings.binary_location, 'cohort_data') |
— | — | @@ -257,20 +252,16 @@ |
258 | 253 | return False |
259 | 254 | |
260 | 255 | |
261 | | -def generate_wide_editor_dataset(input_queue, **kwargs): |
262 | | - dbname = kwargs.pop('dbname') |
| 256 | +def generate_wide_editor_dataset(tasks, dbname, collection, **kwargs): |
263 | 257 | mongo = db.init_mongo_db(dbname) |
264 | | - editors = mongo['dataset'] |
| 258 | + editors = mongo[collection + 'dataset'] |
265 | 259 | name = dbname + '_wide_editors.csv' |
266 | 260 | fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding) |
267 | 261 | x = 0 |
268 | 262 | vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year'] |
269 | 263 | while True: |
270 | 264 | try: |
271 | | - if debug: |
272 | | - id = u'99797' |
273 | | - else: |
274 | | - id = input_queue.get(block=False) |
| 265 | + id = tasks.get(block=False) |
275 | | - print input_queue.qsize() |
| 266 | + print tasks.qsize() |
276 | 267 | obs = editors.find_one({'editor': id}) |
277 | 268 | obs = expand_observations(obs, vars_to_expand) |
— | — | @@ -292,49 +283,25 @@ |
293 | 284 | fh.close() |
294 | 285 | |
295 | 286 | |
296 | | -def retrieve_edits_by_contributor_launcher(): |
297 | | - pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors') |
| 287 | +def dataset_launcher(dbname, collection, target): |
| 288 | + editors = retrieve_editor_ids_mongo(dbname, collection) |
| 289 | + tasks = multiprocessing.JoinableQueue() |
| 290 | + consumers = [multiprocessing.Process(target=target, args=(tasks, dbname, collection)) for i in xrange(settings.number_of_processes)] |
| 291 | + for editor in editors: |
| 292 | + tasks.put(editor) |
| 293 | + print 'The queue contains %s editors.' % tasks.qsize() |
| 294 | + for x in xrange(settings.number_of_processes): |
| 295 | + tasks.put(None) |
298 | 296 | |
| 297 | + for w in consumers: |
| 298 | + w.start() |
299 | 299 | |
300 | | -def debug_retrieve_edits_by_contributor_launcher(dbname): |
301 | | - kwargs = {'debug': False, |
302 | | - 'dbname': dbname, |
303 | | - } |
304 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
305 | | - input_queue = pc.load_queue(ids) |
306 | | - q = Queue() |
307 | | - generate_editor_dataset(input_queue, q, False, kwargs) |
| 300 | + tasks.join() |
308 | 301 | |
309 | 302 | |
310 | | -def generate_editor_dataset_launcher(dbname): |
311 | | - kwargs = {'nr_input_processors': 1, |
312 | | - 'nr_output_processors': 1, |
313 | | - 'debug': False, |
314 | | - 'dbname': dbname, |
315 | | - 'poison_pill':False, |
316 | | - 'pbar': True |
317 | | - } |
318 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
319 | | - ids = list(ids) |
320 | | - chunks = dict({0: ids}) |
321 | | - pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs) |
322 | | - |
323 | | - |
324 | | -def generate_editor_dataset_debug(dbname): |
325 | | - ids = retrieve_editor_ids_mongo(dbname, 'editors') |
326 | | - #ids = list(ids)[:1000] |
327 | | - input_queue = pc.load_queue(ids) |
328 | | - kwargs = {'nr_input_processors': 1, |
329 | | - 'nr_output_processors': 1, |
330 | | - 'debug': True, |
331 | | - 'dbname': dbname, |
332 | | - } |
333 | | - #generate_editor_dataset(input_queue, False, False, kwargs) |
334 | | - vars = ['monthly_edits'] |
335 | | - generate_long_editor_dataset(input_queue, vars, **kwargs) |
336 | | - |
337 | 303 | if __name__ == '__main__': |
338 | | - #generate_editor_dataset_debug('test') |
339 | | - #generate_editor_dataset_launcher('enwiki') |
340 | | - generate_editor_dataset_debug('enwiki') |
341 | | - #debug_retrieve_edits_by_contributor_launcher() |
| 304 | + dbname = 'enwiki' |
| 305 | + collection = 'editors' |
| 306 | + dataset_launcher(dbname, collection, generate_cohort_dataset) |
| 307 | + dataset_launcher(dbname, collection, generate_long_editor_dataset) |
| 308 | + dataset_launcher(dbname, collection, generate_wide_editor_dataset) |
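Note: dataset_launcher hands work to the consumers through a multiprocessing.JoinableQueue, but join() only returns once task_done() has been called for every put(); consumers that merely get() (as the dataset generators above do) will leave join() blocked. A minimal sketch of the complete pattern, with illustrative names:

    import multiprocessing

    def consumer(tasks):
        while True:
            editor = tasks.get()
            if editor is None:  # poison pill: this worker is done
                tasks.task_done()
                break
            # ... process one editor id ...
            tasks.task_done()  # acknowledge the item, so that join() can return

    def launcher(editors, number_of_processes=4):
        tasks = multiprocessing.JoinableQueue()
        consumers = [multiprocessing.Process(target=consumer, args=(tasks,))
                     for _ in range(number_of_processes)]
        for editor in editors:
            tasks.put(editor)
        for _ in range(number_of_processes):
            tasks.put(None)  # one pill per worker
        for w in consumers:
            w.start()
        tasks.join()

    if __name__ == '__main__':
        launcher(list(range(100)))
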
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -87,7 +87,7 @@ |
88 | 88 | ''' |
89 | 89 | ns = [] |
90 | 90 | for namespace in namespaces: |
91 | | - if int(namespace) not in include: |
| 91 | + if namespace not in include: |
92 | 92 | value = namespaces[namespace].get(u'*', None) |
93 | 93 | ns.append(value) |
94 | 94 | return ns |
— | — | @@ -165,8 +165,14 @@ |
166 | 166 | return flat |
167 | 167 | |
168 | 168 | |
169 | | -def split_file(location, file, project, language_code, include, format='xml', zip=False): |
170 | | - '''Reads xml file and splits it in N chunks''' |
| 169 | +def split_file(location, file, project, language_code, namespaces=['0'], format='xml', zip=False): |
| 170 | + ''' |
| 171 | + Reads xml file and splits it in N chunks |
| 172 | + |
| 173 | + @namespaces is a list indicating which namespaces should be included, default |
| 174 | + is to include namespace 0 (main namespace) |
| 175 | + @zip indicates whether to compress the chunk or not |
| 176 | + ''' |
171 | 177 | #location = os.path.join(settings.input_location, language) |
172 | 178 | input = os.path.join(location, file) |
173 | 179 | output = os.path.join(location, 'chunks') |
— | — | @@ -178,7 +184,7 @@ |
179 | 185 | fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding) |
180 | 186 | |
181 | 187 | ns = load_namespace(language_code) |
182 | | - ns = build_namespaces_locale(ns, include) |
| 188 | + ns = build_namespaces_locale(ns, namespaces) |
183 | 189 | |
184 | 190 | counter = 0 |
185 | 191 | tag = '{%s}page' % settings.xml_namespace |
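Note: the int() coercion was dropped because the namespaces now arrive as a list of strings, split from manage.py's comma-separated -n/--namespace value. A sketch of the comparison, assuming a namespace map with string keys and a u'*' entry per namespace, as build_namespaces_locale expects:

    namespaces = '0,2'.split(',')  # as produced by '-n 0,2' -> ['0', '2']
    namespace_map = {'0': {u'*': None}, '1': {u'*': u'Talk'}, '2': {u'*': u'User'}}
    # collect the localized names of the namespaces that are *not* included,
    # so pages in those namespaces can be filtered out by title prefix
    excluded = [namespace_map[ns].get(u'*', None)
                for ns in namespace_map if ns not in namespaces]
    print(excluded)  # -> [u'Talk']
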
Index: trunk/tools/editor_trends/etl/models.py |
— | — | @@ -52,19 +52,19 @@ |
53 | 53 | self.lock = None |
54 | 54 | for kw in kwargs: |
55 | 55 | setattr(self, kw, kwargs[kw]) |
56 | | - |
| 56 | + |
57 | 57 | def create_file_handle(self): |
58 | 58 | self.mode = 'a' |
59 | 59 | if self.output_file == None: |
60 | 60 | self.mode = 'w' |
61 | 61 | self.output_file = self.file[:-4] + '.txt' |
62 | | - |
| 62 | + |
63 | 63 | self.fh = utils.create_txt_filehandle(self.output, self.output_file, self.mode, settings.encoding) |
64 | 64 | |
65 | 65 | def __str__(self): |
66 | 66 | return '%s' % (self.file) |
67 | 67 | |
68 | | - def __call__(self): |
| 68 | + def __call__(self, bots=None): |
69 | 69 | if settings.debug: |
70 | 70 | messages = {} |
71 | 71 | vars = {} |
— | — | @@ -104,5 +104,5 @@ |
105 | 105 | |
106 | 106 | if settings.debug: |
107 | 107 | utils.report_error_messages(messages, self.target) |
108 | | - |
109 | | - return self.bots |
\ No newline at end of file |
| 108 | + |
| 109 | + return self.bots |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -19,19 +19,21 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import sys |
| 23 | +import multiprocessing |
23 | 24 | from Queue import Empty |
24 | 25 | |
25 | 26 | sys.path.append('..') |
26 | | -import configuration |
| 27 | +import configuration |
27 | 28 | settings = configuration.Settings() |
28 | 29 | from database import db |
29 | 30 | from database import cache |
30 | 31 | from utils import utils |
31 | 32 | from utils import sort |
32 | | -import process_constructor as pc |
33 | 33 | |
| 34 | +#import process_constructor as pc |
34 | 35 | |
35 | 36 | |
| 37 | + |
36 | 38 | def store_editors(input, dbname, collection): |
37 | 39 | filename = utils.retrieve_file_list(input, 'txt', mask=None)[0] |
38 | 40 | fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding) |
— | — | @@ -47,7 +49,7 @@ |
48 | 50 | for line in sort.readline(fh): |
49 | 51 | if len(line) == 0: |
50 | 52 | continue |
51 | | - contributor = int(line[0]) |
| 53 | + contributor = int(line[0]) |
52 | 54 | if prev_contributor != contributor: |
53 | 55 | if edits >= 10: |
54 | 56 | result = editor_cache.add(prev_contributor, 'NEXT') |
— | — | @@ -94,31 +96,47 @@ |
95 | 97 | filehandles = [fh.close() for fh in filehandles] |
96 | 98 | filename = 'merged_final.txt' |
97 | 99 | for r in to_remove: |
98 | | - utils.delete_file(output ,r) |
99 | | - |
100 | | - |
| 100 | + utils.delete_file(output , r) |
101 | 101 | |
102 | 102 | |
103 | | -def mergesort_feeder(task_queue, **kwargs): |
104 | | - input = kwargs.get('input', None) |
105 | | - output = kwargs.get('output', None) |
106 | | - #while True: |
107 | | - # try: |
108 | | - #file = input_queue.get(block=False) |
109 | | - for file in task_queue: |
110 | | - fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
111 | | - data = fh.readlines() |
112 | | - fh.close() |
113 | | - data = [d.replace('\n', '') for d in data] |
114 | | - data = [d.split('\t') for d in data] |
115 | | - sorted_data = sort.mergesort(data) |
116 | | - sort.write_sorted_file(sorted_data, file, output) |
117 | 103 | |
118 | 104 | |
| 105 | +def mergesort_feeder(tasks, input, output): |
| 106 | + while True: |
| 107 | + try: |
| 108 | + file = tasks.get(block=False) |
| 109 | + if file == None: |
| 110 | + tasks.task_done() |
| 111 | + break |
| 112 | + fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding) |
| 113 | + data = fh.readlines() |
| 114 | + fh.close() |
| 115 | + data = [d.replace('\n', '') for d in data] |
| 116 | + data = [d.split('\t') for d in data] |
| 117 | + sorted_data = sort.mergesort(data) |
| 118 | + sort.write_sorted_file(sorted_data, file, output) |
| 119 | + tasks.task_done() |
| 120 | + except Empty: |
| 121 | + break |
| 121 | + |
| 122 | + |
119 | 123 | def mergesort_launcher(input, output): |
120 | 124 | settings.verify_environment([input, output]) |
121 | 125 | files = utils.retrieve_file_list(input, 'txt') |
122 | | - mergesort_feeder(files, input=input, output=output) |
| 126 | + tasks = multiprocessing.JoinableQueue() |
| 127 | + consumers = [multiprocessing.Process(target=mergesort_feeder, args=(tasks, input, output)) for i in xrange(settings.number_of_processes)] |
| 128 | + for file in files: |
| 129 | + tasks.put(file) |
| 130 | + |
| 131 | + for x in xrange(settings.number_of_processes): |
| 132 | + tasks.put(None) |
| 133 | + |
| 134 | + for w in consumers: |
| 135 | + w.start() |
| 136 | + |
| 137 | + tasks.join() |
| 138 | + #mergesort_feeder(file, input=input, output=output) |
| 139 | + |
| 140 | + |
123 | 141 | #chunks = utils.split_list(files, settings.number_of_processes) |
124 | 142 | #pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs) |
125 | 143 | |
— | — | @@ -138,7 +156,7 @@ |
139 | 157 | input = os.path.join(settings.input_location, 'en', 'wiki', 'txt') |
140 | 158 | output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted') |
141 | 159 | dbname = 'enwiki' |
| 160 | + collection = 'editors' |
142 | | - #mergesort_launcher(input, output) |
| 160 | + mergesort_launcher(input, output) |
143 | 161 | final_output = os.path.join(settings.input_location, 'en', 'wiki', 'dbready') |
144 | 162 | mergesort_external_launcher(dbname, output, final_output) |
145 | | - store_editors(input, dbname, collection) |
\ No newline at end of file |
| 163 | + store_editors(input, dbname, collection) |
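Note: mergesort_launcher parallelizes the first phase of an external sort: each worker sorts whole chunk files in memory, and mergesort_external_launcher later merges the sorted results. A single-process sketch of the per-file step, with illustrative helper names:

    import os

    def sort_chunk(input_dir, output_dir, filename):
        # read one tab-separated chunk, sort it in memory, write it back sorted
        with open(os.path.join(input_dir, filename)) as fh:
            rows = [line.rstrip('\n').split('\t') for line in fh]
        rows.sort()  # stand-in for sort.mergesort()
        with open(os.path.join(output_dir, filename), 'w') as fh:
            for row in rows:
                fh.write('\t'.join(row) + '\n')
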
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -152,13 +152,13 @@ |
153 | 153 | if not program.endswith('.exe'): |
154 | 154 | program = program + '.exe' |
155 | 155 | path = self.detect_windows_program(program) |
| 156 | + if path != None: |
| 157 | + path = path + program |
156 | 158 | elif self.platform == 'Linux': |
157 | 159 | path = self.detect_linux_program(program) |
158 | | - if path != None: |
159 | | - return path + program |
160 | | - else: |
161 | | - return path |
162 | 160 | |
| 161 | + return path |
| 162 | + |
163 | 163 | def determine_max_filehandles_open(self): |
164 | 164 | if self.platform == 'Windows' and self.architecture == 'i386': |
165 | 165 | return win32file._getmaxstdio() |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -220,10 +220,10 @@ |
221 | 221 | keys.sort() |
222 | 222 | for key in keys: |
223 | 223 | if write_key: |
224 | | - fh.write('%s' % key) |
| 224 | + fh.write('%s\t' % key) |
225 | 225 | if getattr(data[key], '__iter__', False): |
226 | 226 | for d in data[key]: |
227 | | - fh.write('\t%s' % d) |
| 227 | + fh.write('%s\t' % d) |
228 | 228 | else: |
229 | 229 | fh.write('%s\t' % (data[key])) |
230 | 230 | if newline: |
Index: trunk/tools/editor_trends/utils/compression.py |
— | — | @@ -18,8 +18,9 @@ |
19 | 19 | |
20 | 20 | def __init__(self, location, file, output=None): |
21 | 21 | self.extension = utils.determine_file_extension(file) |
22 | | - self.file = os.path.join(location, file) |
| 22 | + self.file = file |
23 | 23 | self.location = location |
| 24 | + self.path = os.path.join(self.location, self.file) |
24 | 25 | self.output = None |
25 | 26 | self.name = None |
26 | 27 | self.program = [] |
— | — | @@ -42,20 +43,20 @@ |
43 | 44 | ''' |
44 | 45 | if self.program == []: |
45 | 46 | self.init_compression_tool(self.extension, 'compress') |
46 | | - |
| 47 | + |
47 | 48 | if self.program_installed == None: |
48 | 49 | raise exceptions.CompressionNotSupportedError |
49 | | - |
| 50 | + |
50 | 51 | args = {'7z': ['%s' % self.program_installed, 'a', '-scsUTF-8', '-t%s' % self.compression, '%s' % self.output, '%s' % self.input], |
51 | 52 | } |
52 | | - |
| 53 | + |
53 | 54 | commands = args.get(self.name, None) |
54 | 55 | if commands != None: |
55 | 56 | p = subprocess.Popen(commands, shell=True).wait() |
56 | 57 | else: |
57 | 58 | raise exceptions.CompressionNotSupportedError |
58 | | - |
59 | 59 | |
| 60 | + |
60 | 61 | def extract(self): |
61 | 62 | ''' |
62 | 63 | @location is the directory where to store the compressed file |
— | — | @@ -68,18 +69,25 @@ |
69 | 70 | if self.program_installed == None: |
70 | 71 | raise exceptions.CompressionNotSupportedError |
71 | 72 | |
72 | | - args = {'7z': ['%s' % self.program_installed,'e', '-o%s' % self.location, '%s' % self.file], |
73 | | - 'bunzip2': ['%s' % self.program_installed, '-k', '%s' % self.file], |
74 | | - 'zip': ['%s' % self.program_installed, '-o', '%s' % self.file], |
75 | | - 'gz': ['%s' % self.program_installed, '-xzvf', '%s' % self.file], |
76 | | - 'tar': ['%s' % self.program_installed, '-xvf', '%s' % self.file] |
| 73 | + if not utils.check_file_exists(self.location, self.file): |
| 74 | + raise exceptions.FileNotFoundException(self.file, self.location) |
| 77 | + |
| 78 | + |
| 79 | + |
| 80 | + args = {'7z': ['%s' % self.program_installed, 'e', '-y', '-o%s' % self.location, '%s' % self.path], |
| 81 | + 'bunzip2': ['%s' % self.program_installed, '-k', '%s' % self.path], |
| 82 | + 'zip': ['%s' % self.program_installed, '-o', '%s' % self.path], |
| 83 | + 'gz': ['%s' % self.program_installed, '-xzvf', '%s' % self.path], |
| 84 | + 'tar': ['%s' % self.program_installed, '-xvf', '%s' % self.path] |
77 | 85 | } |
78 | 86 | commands = args.get(self.name, None) |
79 | 87 | if commands != None: |
80 | 88 | p = subprocess.Popen(commands, shell=True).wait() |
81 | 89 | else: |
82 | 90 | raise exceptions.CompressionNotSupportedError |
83 | | - |
| 91 | + |
84 | 92 | # if self.name == '7z': |
85 | 93 | # p = subprocess.Popen(['%s' % tool.extract_installed, 'e', '-o%s' % location, '%s' % input], shell=True).wait() |
86 | 94 | # elif tool_extract_installed.endswith('bunzip2'): |
— | — | @@ -112,7 +120,7 @@ |
113 | 121 | path = settings.detect_installed_program(p) |
114 | 122 | if path != None: |
115 | 123 | self.name = p |
116 | | - self.program_installed = path |
| 124 | + self.program_installed = path |
117 | 125 | |
118 | 126 | |
119 | 127 | if __name__ == '__main__': |
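Note: with the corrected path handling, the Compressor wrapper is driven as in manage.py's launch_zip_extractor. A minimal usage sketch; the location and filename are illustrative:

    from utils import compression

    location = '/data/wikipedia/en/wiki'
    filename = 'enwiki-latest-stub-meta-history.xml.gz'
    compressor = compression.Compressor(location, filename)
    compressor.extract()  # shells out to the detected tool (7z, gunzip, ...)
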
Index: trunk/tools/editor_trends/utils/exceptions.py |
— | — | @@ -23,12 +23,13 @@ |
24 | 24 | pass |
25 | 25 | |
26 | 26 | class FileNotFoundException(Error): |
27 | | - def __init__(self, file): |
28 | | - self.file = file |
| 27 | + def __init__(self, file, path): |
| 28 | + self.file = file |
| 29 | + self.path = path |
29 | 30 | |
30 | 31 | def __str__(self): |
31 | | - print 'The file %s was not found. Please make sure your path is up-to-date.' % self.file |
32 | | - |
| 32 | + return 'The file %s was not found. Please make sure that the file exists and the path is correct.' % os.path.join(self.path, self.file) # note: requires an 'import os' at the top of this module |
| 33 | + |
33 | 34 | class PlatformNotSupportedError(Error): |
34 | 35 | def __init__(self, platform): |
35 | 36 | self.platform = platform |
— | — | @@ -39,6 +40,6 @@ |
40 | 41 | class CompressionNotSupportedError(Error): |
41 | 42 | def __init__(self, extension): |
42 | 43 | self.extension = extension |
43 | | - |
| 44 | + |
44 | 45 | def __str__(self): |
45 | | - print 'You have not installed a program to extract %s archives.' % self.extension |
\ No newline at end of file |
| 46 | + return 'You have not installed a program to extract %s archives.' % self.extension |
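Note: __str__ must return the message rather than print it; a print-based __str__ returns None, so str(exception) raises a TypeError. A minimal illustration, subclassing Exception directly with illustrative arguments:

    class FileNotFoundException(Exception):
        def __init__(self, file, path):
            self.file = file
            self.path = path

        def __str__(self):
            # return, don't print: str() uses the return value
            return 'The file %s/%s was not found.' % (self.path, self.file)

    try:
        raise FileNotFoundException('dump.xml.gz', '/data')
    except FileNotFoundException as error:
        print(error)  # The file /data/dump.xml.gz was not found.
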
Index: trunk/tools/editor_trends/bots/models.py |
— | — | @@ -31,12 +31,17 @@ |
32 | 32 | |
33 | 33 | class Bot(object): |
34 | 34 | |
35 | | - def __init__(self, name): |
| 35 | + def __init__(self, name, **kwargs): |
36 | 36 | self.name = name |
37 | 37 | self.projects = [] |
38 | 38 | self.time = shaper.create_datacontainer(datatype='list') |
39 | 39 | self.verified = True |
| 40 | + for kw in kwargs: |
| 41 | + setattr(self, kw, kwargs[kw]) |
40 | 42 | |
| 43 | + def __repr__(self): |
| 44 | + return self.name |
| 45 | + |
41 | 46 | def hours_active(self): |
42 | 47 | self.clock = shaper.create_clock() |
43 | 48 | years = self.time.keys() |
— | — | @@ -49,38 +54,42 @@ |
50 | 55 | hours.sort() |
51 | 56 | for hour in hours: |
52 | 57 | self.data.append(self.clock[hour]) |
53 | | - |
54 | | - |
55 | 58 | |
| 59 | + |
| 60 | + def active(self): |
| 61 | + return float(sum(self.clock.values())) / 24.0 |
| 62 | + |
| 63 | + |
56 | 64 | def avg_lag_between_edits(self): |
57 | 65 | years = self.time.keys() |
58 | 66 | edits = [] |
59 | 67 | for year in years: |
60 | 68 | for x in self.time[year]: |
61 | 69 | edits.append(x) |
62 | | - |
| 70 | + |
63 | 71 | if edits != []: |
64 | 72 | edits.sort() |
65 | 73 | dt = datetime.timedelta() |
66 | 74 | for i, edit in enumerate(edits): |
67 | | - if i == len(edits) -1: |
| 75 | + if i == len(edits) - 1: |
68 | 76 | break |
69 | 77 | dt += edits[i + 1] - edits[i] |
70 | 78 | dt = dt / len(edits) |
71 | 79 | self.dt = dt |
72 | | - self.n = i |
| 80 | + self.n = i |
73 | 81 | else: |
74 | 82 | self.dt = None |
75 | | - |
| 83 | + |
76 | 84 | def write_training_dataset(self, fh): |
77 | 85 | self.data = [] |
78 | 86 | self.data.append(self.name) |
79 | 87 | self.data.append(self.verified) |
80 | 88 | self.add_clock_data() |
| 89 | + self.data.append(self.active()) |
81 | 90 | self.data.append(self.dt) |
82 | 91 | utils.write_list_to_csv(self.data, fh, recursive=False, newline=True) |
83 | | - |
84 | 92 | |
85 | 93 | |
| 94 | + |
86 | 95 | if __name__ == '__main__': |
87 | 96 | pass |
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -44,13 +44,16 @@ |
45 | 45 | pass |
46 | 46 | |
47 | 47 | |
48 | | -def read_bots_csv_file(manager, location, filename, encoding): |
| 48 | +def read_bots_csv_file(location, filename, encoding, manager=False): |
49 | 49 | ''' |
50 | 50 | Constructs a dictionary from Bots.csv |
51 | 51 | key is language |
52 | 52 | value is a list of bot names |
53 | 53 | ''' |
54 | | - bot_dict = manager.dict() |
| 54 | + if manager: |
| 55 | + bot_dict = manager.dict() |
| 56 | + else: |
| 57 | + bot_dict = dict() |
55 | 58 | for line in utils.read_data_from_csv(location, filename, encoding): |
56 | 59 | line = utils.clean_string(line) |
57 | 60 | language, bots = line.split(',') |
— | — | @@ -84,7 +87,7 @@ |
85 | 88 | This file reads the results from the lookup_bot_userid function and stores |
86 | 89 | it in a MongoDB collection. |
87 | 90 | ''' |
88 | | - bots = utils.read_dict_from_csv(settings.csv_location, 'bots_ids.csv', settings.encoding) |
| 91 | + #bots = utils.read_dict_from_csv(settings.csv_location, 'bots_ids.csv', settings.encoding) |
89 | 92 | mongo = db.init_mongo_db('bots') |
90 | 93 | collection = mongo['ids'] |
91 | 94 | db.remove_documents_from_mongo_db(collection, None) |
— | — | @@ -124,42 +127,47 @@ |
125 | 128 | if username == None or username.text == None: |
126 | 129 | continue |
127 | 130 | else: |
128 | | - username = username.text |
| 131 | + username = username.text #encode(settings.encoding) |
| 132 | + name = username.lower() |
| 133 | + |
129 | 134 | #print username.encode('utf-8') |
130 | 135 | if username in bots and bots[username].verified == True: |
131 | 136 | id = contributor.find('id').text |
132 | 137 | bot = bots[username] |
133 | 138 | |
134 | 139 | if not hasattr(bot, 'written'): |
135 | | - bot_dict = convert_object_to_dict(bot, exclude=['time', 'name', 'written']) |
136 | | - bot_dict['_username'] = username |
137 | | - bot_dict['id'] = id |
| 142 | + bot.id = id |
| 143 | + bot.name = username |
| 144 | + bot_dict = convert_object_to_dict(bot, exclude=['time', 'written']) |
| 145 | + #bot_dict['_username'] = username |
| 146 | + #bot_dict['id'] = id |
138 | 147 | lock.acquire() |
139 | 148 | utils.write_dict_to_csv(bot_dict, fh, write_key=False) |
140 | 149 | lock.release() |
141 | | - bot.written = True |
| 150 | + bot.written = True |
| 151 | + bots[username] = bot |
142 | 152 | #bots.pop(username) |
143 | 153 | #if bots == {}: |
144 | 154 | # print 'Found id numbers for all bots.' |
145 | 155 | # return 'break' |
146 | 156 | #print username.encode('utf-8') |
147 | | - name = username.lower() |
| 157 | + |
148 | 158 | if name.find('bot') > -1: |
149 | | - bot = bots.get(username, botmodels.Bot(username)) |
150 | | - if bot not in bots: |
151 | | - bot.verified = False |
| 159 | + bot = bots.get(username, botmodels.Bot(username, verified=False)) |
152 | 160 | timestamp = revision.find('timestamp').text |
153 | 161 | if timestamp != None: |
154 | 162 | timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
155 | 163 | bot.time[str(timestamp.year)].append(timestamp) |
156 | 164 | bots[username] = bot |
157 | | - |
| 165 | + return bots |
158 | 166 | #bot = bots.get('PseudoBot') |
159 | 167 | #bot.hours_active() |
160 | 168 | #bot.avg_lag_between_edits() |
161 | 169 | |
162 | 170 | |
163 | | -def bot_launcher(language_code, project, single=False): |
| 171 | +def bot_launcher(language_code, project, single=False, manager=False): |
164 | 172 | ''' |
165 | 173 | This function sets the stage to launch bot id detection and collecting data |
166 | 174 | to discover new bots. |
— | — | @@ -171,25 +179,28 @@ |
172 | 180 | files = utils.retrieve_file_list(input, 'xml', mask=None) |
173 | 181 | input_queue = pc.load_queue(files, poison_pill=True) |
174 | 182 | tasks = multiprocessing.JoinableQueue() |
175 | | - manager = multiprocessing.Manager() |
176 | | - bots = manager.dict() |
177 | | - lock = manager.Lock() |
178 | | - bots = read_bots_csv_file(manager, settings.csv_location, 'Bots.csv', settings.encoding) |
| 183 | + mgr = multiprocessing.Manager() |
| 184 | + bots = mgr.dict() |
| 185 | + lock = mgr.Lock() |
| 186 | + if manager: |
| 187 | + manager = mgr |
| 188 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
179 | 189 | |
180 | 190 | for file in files: |
181 | 191 | tasks.put(models.XMLFile(input, settings.csv_location, file, bots, lookup_bot_userid, 'bots_ids.csv', lock=lock)) |
182 | 192 | |
| 193 | + tracker = {} |
183 | 194 | if single: |
184 | 195 | while True: |
185 | 196 | try: |
186 | 197 | print '%s files left in the queue...' % tasks.qsize() |
187 | 198 | task = tasks.get(block=False) |
188 | | - task() |
| 199 | + bots = task(bots) |
189 | 200 | except Empty: |
190 | 201 | break |
191 | 202 | else: |
192 | 203 | bot_launcher_multi(tasks) |
193 | | - |
| 204 | + |
194 | 205 | utils.store_object(bots, settings.binary_location, 'bots.bin') |
195 | 206 | bot_training_dataset(bots) |
196 | 207 | store_bots() |
— | — | @@ -201,10 +212,10 @@ |
202 | 213 | print '%s' % key.encode(settings.encoding) |
203 | 214 | except: |
204 | 215 | pass |
205 | | - |
206 | | - |
207 | 216 | |
208 | 217 | |
| 218 | + |
| 219 | + |
209 | 220 | def bot_training_dataset(bots): |
210 | 221 | fh = utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', settings.encoding) |
211 | 222 | keys = bots.keys() |
— | — | @@ -213,10 +224,10 @@ |
214 | 225 | bot.hours_active() |
215 | 226 | bot.avg_lag_between_edits() |
216 | 227 | bot.write_training_dataset(fh) |
217 | | - |
| 228 | + |
218 | 229 | fh.close() |
219 | | - |
220 | | - |
| 230 | + |
| 231 | + |
221 | 232 | def bot_launcher_multi(tasks): |
222 | 233 | ''' |
223 | 234 | This is the launcher that uses multiprocesses. |
— | — | @@ -230,9 +241,14 @@ |
231 | 242 | |
232 | 243 | tasks.join() |
233 | 244 | |
| 245 | +def debug_bots_dict(): |
| 246 | + bots = utils.load_object(settings.binary_location, 'bots.bin') |
| 247 | + print 'done' |
234 | 248 | |
| 249 | + |
235 | 250 | if __name__ == '__main__': |
236 | 251 | language_code = 'en' |
237 | 252 | project = 'wiki' |
238 | | - #bot_launcher(language_code, project, single=True) |
239 | | - cProfile.run(bot_launcher(language_code, project, single=True), 'profile') |
| 253 | + #debug_bots_dict() |
| 254 | + bot_launcher(language_code, project, single=True) |
| 255 | + #cProfile.run(bot_launcher(language_code, project, single=True), 'profile') |