Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -43,6 +43,10 @@ |
44 | 44 | |
45 | 45 | |
46 | 46 | def show_choices(settings, attr): |
| 47 | + ''' |
| 48 | + Show possible choices in the console, for example, output valid languages or |
| 49 | + valid projects. |
| 50 | + ''' |
47 | 51 | choices = getattr(settings, attr).items() |
48 | 52 | choices.sort() |
49 | 53 | choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices] |
— | — | @@ -194,13 +198,12 @@ |
195 | 199 | ''' |
196 | 200 | Config launcher is used to reconfigure editor trends toolkit. |
197 | 201 | ''' |
198 | | -# settings.load_configuration() |
| 202 | + |
199 | 203 | pc = projects.ProjectContainer() |
200 | 204 | if not os.path.exists('wiki.cfg') or rts.force: |
201 | 205 | config = ConfigParser.RawConfigParser() |
202 | 206 | project = None |
203 | 207 | language = None |
204 | | - #language_map = languages.language_map() |
205 | 208 | working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
206 | 209 | input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % rts.input_location) |
207 | 210 | |
— | — | @@ -234,7 +237,14 @@ |
235 | 238 | rts.working_directory = config.get('file_locations', 'working_directory') |
236 | 239 | rts.input_location = config.get('file_locations', 'input_location') |
237 | 240 | |
| 241 | + log.log_to_csv(logger, rts, 'New configuration', 'Creating', |
| 242 | + config_launcher, |
| 243 | + working_directory=working_directory, |
| 244 | + input_location=input_location, |
| 245 | + project=project, |
| 246 | + language=language,) |
238 | 247 | |
| 248 | + |
239 | 249 | def downloader_launcher(rts, logger): |
240 | 250 | ''' |
241 | 251 | This launcher calls the dump downloader to download a Wikimedia dump file. |
— | — | @@ -242,10 +252,9 @@ |
243 | 253 | print 'Start downloading' |
244 | 254 | stopwatch = timer.Timer() |
245 | 255 | log.log_to_mongo(rts, 'dataset', 'download', stopwatch, event='start') |
246 | | - res = downloader.launcher(rts, logger) |
| 256 | + downloader.launcher(rts, logger) |
247 | 257 | stopwatch.elapsed() |
248 | 258 | log.log_to_mongo(rts, 'dataset', 'download', stopwatch, event='finish') |
249 | | - return res |
250 | 259 | |
251 | 260 | |
252 | 261 | def extract_launcher(rts, logger): |
— | — | @@ -257,9 +266,11 @@ |
258 | 267 | print 'Extracting data from XML' |
259 | 268 | stopwatch = timer.Timer() |
260 | 269 | log.log_to_mongo(rts, 'dataset', 'extract', stopwatch, event='start') |
| 270 | + log.log_to_csv(logger, rts, 'Start', 'Extract', extract_launcher) |
261 | 271 | enricher.launcher(rts) |
262 | 272 | stopwatch.elapsed() |
263 | 273 | log.log_to_mongo(rts, 'dataset', 'extract', stopwatch, event='finish') |
| 274 | + log.log_to_csv(logger, rts, 'Finish', 'Extract', extract_launcher) |
264 | 275 | |
265 | 276 | |
266 | 277 | def sort_launcher(rts, logger): |
— | — | @@ -270,15 +281,11 @@ |
271 | 282 | print 'Start sorting data' |
272 | 283 | stopwatch = timer.Timer() |
273 | 284 | log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='start') |
274 | | -# write_message_to_log(logger, settings, |
275 | | -# message=None, |
276 | | -# verb=None, |
277 | | -# location=properties.location, |
278 | | -# input=properties.txt, |
279 | | -# output=properties.sorted) |
| 285 | + log.log_to_csv(logger, rts, 'Start', 'Sort', sort_launcher) |
280 | 286 | sort.launcher(rts) |
281 | 287 | stopwatch.elapsed() |
282 | 288 | log.log_to_mongo(rts, 'dataset', 'sort', stopwatch, event='finish') |
| 289 | + log.log_to_csv(logger, rts, 'Finish', 'Sort', sort_launcher) |
283 | 290 | |
284 | 291 | |
285 | 292 | def store_launcher(rts, logger): |
— | — | @@ -289,99 +296,97 @@ |
290 | 297 | print 'Start storing data in MongoDB' |
291 | 298 | stopwatch = timer.Timer() |
292 | 299 | log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='start') |
| 300 | + log.log_to_csv(logger, rts, 'Start', 'Store', store_launcher) |
293 | 301 | db.cleanup_database(rts.dbname, logger) |
294 | | -# write_message_to_log(logger, settings, |
295 | | -# message=None, |
296 | | -# verb='Storing', |
297 | | -# function=properties.function, |
298 | | -# location=properties.location, |
299 | | -# input=properties.sorted, |
300 | | -# project=properties.full_project, |
301 | | -# collection=properties.collection) |
302 | | -# for key in properties: |
303 | | -# print key, getattr(properties, key) |
304 | 302 | store.launcher(rts) |
305 | 303 | stopwatch.elapsed() |
306 | 304 | log.log_to_mongo(rts, 'dataset', 'store', stopwatch, event='finish') |
| 305 | + log.log_to_csv(logger, rts, 'Finish', 'Store', store_launcher) |
307 | 306 | |
308 | 307 | |
309 | 308 | def transformer_launcher(rts, logger): |
| 309 | + ''' |
| 310 | + This function derives a number of variables from the editors_raw |
| 311 | + collection; this will significantly improve processing speed. |
| 312 | + ''' |
310 | 313 | print 'Start transforming dataset' |
311 | 314 | stopwatch = timer.Timer() |
312 | 315 | log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='start') |
| 316 | + log.log_to_csv(logger, rts, 'Start', 'Transform', transformer_launcher) |
313 | 317 | db.cleanup_database(rts.dbname, logger, 'dataset') |
314 | | -# write_message_to_log(logger, settings, |
315 | | -# message=None, |
316 | | -# verb='Transforming', |
317 | | -# project=properties.project, |
318 | | -# collection=properties.collection) |
319 | 318 | transformer.transform_editors_single_launcher(rts) |
320 | 319 | stopwatch.elapsed() |
321 | 320 | log.log_to_mongo(rts, 'dataset', 'transform', stopwatch, event='finish') |
| 321 | + log.log_to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher) |
322 | 322 | |
323 | 323 | |
324 | 324 | def dataset_launcher(rts, logger): |
325 | | - print 'Start exporting dataset' |
| 325 | + ''' |
| 326 | + Dataset launcher is the entry point to generate datasets from the command |
| 327 | + line. |
| 328 | + ''' |
| 329 | + print 'Start generating dataset' |
326 | 330 | stopwatch = timer.Timer() |
327 | 331 | log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='start') |
328 | 332 | |
329 | 333 | for chart in rts.charts: |
330 | 334 | analyzer.generate_chart_data(rts, chart, **rts.keywords) |
331 | | -# write_message_to_log(logger, settings, |
332 | | -# message=None, |
333 | | -# verb='Exporting', |
334 | | -# target=target, |
335 | | -# dbname=properties.full_project, |
336 | | -# collection=properties.collection) |
| 335 | + log.log_to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher, |
| 336 | + chart=chart, |
| 337 | + dbname=rts.dbname, |
| 338 | + collection=rts.editors_dataset) |
337 | 339 | stopwatch.elapsed() |
338 | 340 | log.log_to_mongo(rts, 'dataset', 'export', stopwatch, event='finish') |
| 341 | + log.log_to_csv(logger, rts, 'Finish', 'Dataset', dataset_launcher) |
339 | 342 | |
340 | 343 | |
341 | 344 | def cleanup(rts, logger): |
342 | | - directories = properties.directories[1:] |
| 345 | + ''' |
| 346 | + This function deletes all files of a previous Wikilytics run. |
| 347 | + ''' |
| 348 | + directories = rts.directories[1:] |
| 349 | + |
| 350 | + #remove directories |
343 | 351 | for directory in directories: |
344 | | - write_message_to_log(logger, setting, |
345 | | - message=None, |
346 | | - verb='Deleting', |
347 | | - dir=directory) |
348 | 352 | file_utils.delete_file(directory, '', directory=True) |
| 353 | + log.log_to_csv(logger, rts, |
| 354 | + message='Deleting %s' % directory, |
| 355 | + verb='Deleting', |
| 356 | + function=cleanup) |
349 | 357 | |
350 | | - write_message_to_log(logger, settings, |
351 | | - message=None, |
352 | | - verb='Creating', |
353 | | - dir=directories) |
354 | | - settings.verify_environment(directories) |
| 358 | + #create directories |
| 359 | + rts.verify_environment(directories) |
| 360 | + log.log_to_csv(logger, rts, message='Deleting %s' % directory, |
| 361 | + verb='Creating', function=rts.verify_environment) |
355 | 362 | |
356 | | - filename = '%s%s' % (properties.full_project, '_editor.bin') |
357 | | - write_message_to_log(logger, settings, |
358 | | - message=None, |
359 | | - verb='Deleting', |
360 | | - filename=filename) |
361 | | - file_utils.delete_file(settings.binary_location, filename) |
| 363 | + #remove binary files |
| 364 | + filename = '%s%s' % (rts.full_project, '_editor.bin') |
| 365 | + file_utils.delete_file(rts.binary_location, filename) |
| 366 | + log.log_to_csv(logger, rts, message='Deleting %s' % filename, |
| 367 | + verb='Deleting', |
| 368 | + function=file_utils.delete_file) |
362 | 369 | |
363 | 370 | |
| 371 | + |
364 | 372 | def all_launcher(rts, logger): |
365 | | - print 'The entire data processing chain has been called, this will take a \ |
366 | | - couple of hours (at least) to complete.' |
| 373 | + ''' |
| 374 | + The entire data processing chain has been called; this will take a |
| 375 | + couple of hours (at least) to complete. |
| 376 | + ''' |
| 377 | + |
367 | 378 | stopwatch = timer.Timer() |
368 | 379 | log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='start') |
369 | 380 | print 'Start of building %s %s dataset.' % (rts.language.name, rts.project) |
370 | 381 | |
371 | | -# write_message_to_log(logger, settings, |
372 | | -# message=message, |
373 | | -# verb=None, |
374 | | -# full_project=properties.full_project, |
375 | | -# ignore=properties.ignore, |
376 | | -# clean=properties.clean) |
377 | 382 | if rts.clean: |
| 383 | + print 'Removing previous datasets...' |
378 | 384 | cleanup(rts, logger) |
379 | 385 | |
380 | 386 | functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'), |
381 | 387 | (extract_launcher, 'extract'), |
382 | 388 | (sort_launcher, 'sort'), |
383 | 389 | (store_launcher, 'store'), |
384 | | - (transformer_launcher, 'transform'), |
385 | | - (dataset_launcher, 'dataset'))) |
| 390 | + (transformer_launcher, 'transform'))) |
386 | 391 | |
387 | 392 | for function, callname in functions.iteritems(): |
388 | 393 | if callname not in rts.ignore: |
— | — | @@ -390,164 +395,29 @@ |
391 | 396 | if res == False: |
392 | 397 | sys.exit(False) |
393 | 398 | elif res == None: |
394 | | - print 'Function %s does not return a status, \ |
395 | | - implement NOW' % function.func_name |
| 399 | + pass |
396 | 400 | stopwatch.elapsed() |
397 | 401 | log.log_to_mongo(rts, 'dataset', 'all', stopwatch, event='finish') |
398 | 402 | |
399 | 403 | |
400 | 404 | |
401 | 405 | def about_statement(): |
| 406 | + ''' |
| 407 | + Print generic version information. |
| 408 | + ''' |
402 | 409 | print '' |
403 | 410 | print 'Wikilytics is (c) 2010-2011 by the Wikimedia Foundation.' |
404 | 411 | print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
405 | | - print '''This software comes with ABSOLUTELY NO WARRANTY. This is |
406 | | - free software, and you are welcome to distribute it under certain |
407 | | - conditions.''' |
| 412 | + print '''This software comes with ABSOLUTELY NO WARRANTY. This is free |
| 413 | + software, and you are welcome to distribute it under certain conditions.''' |
408 | 414 | print 'See the README.1ST file for more information.' |
409 | 415 | print '' |
410 | 416 | |
411 | 417 | |
412 | | -def init_args_parser(): |
| 418 | +def main(): |
413 | 419 | ''' |
414 | | - Entry point for parsing command line and launching the needed function(s). |
| 420 | + This function initializes the command line parser. |
415 | 421 | ''' |
416 | | - language = languages.init() |
417 | | - project = projects.init() |
418 | | - pjc = projects.ProjectContainer() |
419 | | - rts = runtime_settings.RunTimeSettings(project, language) |
420 | | - |
421 | | - #Init Argument Parser |
422 | | - parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
423 | | - subparsers = parser.add_subparsers(help='sub - command help') |
424 | | - |
425 | | - #SHOW LANGUAGES |
426 | | - parser_languages = subparsers.add_parser('show_languages', |
427 | | - help='Overview of all valid languages.') |
428 | | - parser_languages.add_argument('-s', '--startswith', |
429 | | - action='store', |
430 | | - help='Enter the first letter of a language to see which languages are \ |
431 | | - available.') |
432 | | - parser_languages.set_defaults(func=language.show_languages, args=[project]) |
433 | | - |
434 | | - #CONFIG |
435 | | - parser_config = subparsers.add_parser('config', |
436 | | - help='The config sub command allows you set the data location of where \ |
437 | | - to store files.') |
438 | | - parser_config.set_defaults(func=config_launcher) |
439 | | - parser_config.add_argument('-f', '--force', |
440 | | - action='store_true', |
441 | | - help='Reconfigure Editor Toolkit (this will replace wiki.cfg') |
442 | | - |
443 | | - #DOWNLOAD |
444 | | - parser_download = subparsers.add_parser('download', |
445 | | - help='The download sub command allows you to download a Wikipedia dump\ |
446 | | - file.') |
447 | | - parser_download.set_defaults(func=downloader_launcher) |
448 | | - |
449 | | - #EXTRACT |
450 | | - parser_create = subparsers.add_parser('extract', |
451 | | - help='The store sub command parsers the XML chunk files, extracts the \ |
452 | | - information and stores it in a MongoDB.') |
453 | | - parser_create.set_defaults(func=extract_launcher) |
454 | | - |
455 | | - #SORT |
456 | | - parser_sort = subparsers.add_parser('sort', |
457 | | - help='By presorting the data, significant processing time reductions \ |
458 | | - are achieved.') |
459 | | - parser_sort.set_defaults(func=sort_launcher) |
460 | | - |
461 | | - #STORE |
462 | | - parser_store = subparsers.add_parser('store', |
463 | | - help='The store sub command parsers the XML chunk files, extracts the \ |
464 | | - information and stores it in a MongoDB.') |
465 | | - parser_store.set_defaults(func=store_launcher) |
466 | | - |
467 | | - #TRANSFORM |
468 | | - parser_transform = subparsers.add_parser('transform', |
469 | | - help='Transform the raw datatable to an enriched dataset that can be \ |
470 | | - exported.') |
471 | | - parser_transform.set_defaults(func=transformer_launcher) |
472 | | - |
473 | | - #DATASET |
474 | | - parser_dataset = subparsers.add_parser('dataset', |
475 | | - help='Create a dataset from the MongoDB and write it to a csv file.') |
476 | | - parser_dataset.set_defaults(func=dataset_launcher) |
477 | | - parser_dataset.add_argument('-c', '--charts', |
478 | | - action='store', |
479 | | - help='Should be a valid function name that matches one of the plugin functions', |
480 | | - default=inventory.available_analyses()['new_editor_count']) |
481 | | - |
482 | | - parser_dataset.add_argument('-k', '--keywords', |
483 | | - action='store', |
484 | | - help='Add additional keywords in the format keyword1=value1,keyword2=value2', |
485 | | - default='') |
486 | | - |
487 | | - #ALL |
488 | | - parser_all = subparsers.add_parser('all', |
489 | | - help='The all sub command runs the download, split, store and dataset \ |
490 | | - commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE \ |
491 | | - CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
492 | | - parser_all.set_defaults(func=all_launcher) |
493 | | - parser_all.add_argument('-e', '--except', |
494 | | - action='store', |
495 | | - help='Should be a list of functions that are to be ignored when \ |
496 | | - executing all.', |
497 | | - default=[]) |
498 | | - |
499 | | - parser_all.add_argument('-n', '--new', |
500 | | - action='store_true', |
501 | | - help='This will delete all previous output and starts from scratch. \ |
502 | | - Mostly useful for debugging purposes.', |
503 | | - default=False) |
504 | | - |
505 | | - #DJANGO |
506 | | - parser_django = subparsers.add_parser('django') |
507 | | - parser_django.add_argument('-e', '--except', |
508 | | - action='store', |
509 | | - help='Should be a list of functions that are to be ignored when \ |
510 | | - executing all.', |
511 | | - default=[]) |
512 | | - |
513 | | - parser.add_argument('-l', '--language', |
514 | | - action='store', |
515 | | - help='Example of valid languages.', |
516 | | - choices=project.supported_languages(), |
517 | | - default=unicode(language.name) |
518 | | - ) |
519 | | - |
520 | | - parser.add_argument('-p', '--project', |
521 | | - action='store', |
522 | | - help='Specify the Wikimedia project that you would like to download', |
523 | | - choices=pjc.supported_projects(), |
524 | | - default='wiki') |
525 | | - |
526 | | - parser.add_argument('-c', '--collection', |
527 | | - action='store', |
528 | | - help='Name of MongoDB collection', |
529 | | - default='editors_raw') |
530 | | - |
531 | | - parser.add_argument('-o', '--location', |
532 | | - action='store', |
533 | | - help='Indicate where you want to store the downloaded file.', |
534 | | - #default=settings.input_location) |
535 | | - default=rts.input_location) |
536 | | - |
537 | | - parser.add_argument('-ns', '--namespace', |
538 | | - action='store', |
539 | | - help='A list of namespaces to include for analysis.', |
540 | | - default='0') |
541 | | - |
542 | | - parser.add_argument('-f', '--file', |
543 | | - action='store', |
544 | | - choices=rts.file_choices, |
545 | | - help='Indicate which dump you want to download. Valid choices are:\n \ |
546 | | - %s' % ''.join([f + ',\n' for f in rts.file_choices]), |
547 | | - default='stub-meta-history.xml.gz') |
548 | | - |
549 | | - return project, language, parser |
550 | | - |
551 | | -def main(): |
552 | 422 | project, language, parser, = init_args_parser() |
553 | 423 | args = parser.parse_args() |
554 | 424 | rts = runtime_settings.RunTimeSettings(project, language, args) |