r77589 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r77588‎ | r77589 | r77590 >
Date:17:12, 2 December 2010
Author:diederik
Status:deferred
Tags:
Comment:
* Handle more edge cases
* Pipeline works
Modified paths:
  • /trunk/tools/editor_trends/config.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -31,17 +31,20 @@
3232 settings = configuration.Settings()
3333
3434 import languages
 35+import config
3536 from utils import utils
3637 from utils import dump_downloader
3738 from utils import compression
 39+from utils import ordered_dict
 40+from database import db
3841 from etl import chunker
3942 from etl import extract
4043 from etl import loader
4144 from etl import transformer
4245 from etl import exporter
43 -import config
4446
4547
 48+
4649 class Timer(object):
4750 def __init__(self):
4851 self.t0 = datetime.datetime.now()
@@ -58,7 +61,7 @@
5962 return getattr(args, key, None)
6063
6164
62 -def config_launcher(args, **kwargs):
 65+def config_launcher(args, logger, **kwargs):
6366 settings.load_configuration()
6467
6568
@@ -151,13 +154,15 @@
152155 language = kwargs.pop('language')
153156 language_code = kwargs.pop('language_code')
154157 config = {}
155 - config['Project'] = settings.projects.get(kwargs.pop('project'), 'wiki').title()
156 - config['Language'] = '%s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding))
157 - config['Input directory'] = kwargs.get('location')
 158+ config['Project'] = '\t\t%s' % settings.projects.get(kwargs.pop('project'), 'wiki').title()
 159+ config['Language'] = '\t%s / %s' % (language_map[language_code], language) #.decode(settings.encoding)
 160+ config['Input directory'] = '\t%s' % kwargs.get('location')
158161 config['Output directory'] = '%s and subdirectories' % kwargs.get('location')
159162
160163 message = 'Final settings after parsing command line arguments:'
161164 write_message_to_log(logger, args, message, None, **config)
 165+ for c in config:
 166+ print '%s\t%s' % (c, config[c])
162167
163168
164169 def dump_downloader_launcher(args, logger, **kwargs):
@@ -177,7 +182,7 @@
178183
179184
180185 def chunker_launcher(args, logger, **kwargs):
181 - print 'split_settings.input_filename_launcher'
 186+ print 'chunker_launcher'
182187 timer = Timer()
183188 write_message_to_log(logger, args, **kwargs)
184189 filename = kwargs.pop('filename')
@@ -191,7 +196,7 @@
192197 file = filename.replace('.' + ext, '')
193198 result = utils.check_file_exists(location, file)
194199 if not result:
195 - retcode = launch_zip_extractor(args, location, filename)
 200+ retcode = launch_zip_extractor(args, logger, location, filename)
196201 else:
197202 retcode = 0
198203 if retcode != 0:
@@ -201,7 +206,7 @@
202207 timer.elapsed()
203208
204209
205 -def launch_zip_extractor(args, location, file):
 210+def launch_zip_extractor(args, logger, location, file):
206211 timer = Timer()
207212 write_message_to_log(logger, args, location=location, file=file)
208213 compressor = compression.Compressor(location, file)
@@ -226,9 +231,8 @@
227232 input = os.path.join(location, 'txt')
228233 output = os.path.join(location, 'sorted')
229234 final_output = os.path.join(location, 'dbready')
230 - dbname = kwargs.pop('full_project')
231235 loader.mergesort_launcher(input, output)
232 - loader.mergesort_external_launcher(dbname, output, final_output)
 236+ loader.mergesort_external_launcher(output, final_output)
233237 timer.elapsed()
234238
235239
@@ -249,40 +253,48 @@
250254 write_message_to_log(logger, args, **kwargs)
251255 project = kwargs.pop('full_project')
252256 collection = kwargs.pop('collection')
253 - transformer.run_optimize_editors(project, collection)
 257+ transformer.transform_editors_single_launcher(project, collection)
254258 timer.elapsed()
255259
256260
257261 def exporter_launcher(args, logger, **kwargs):
258262 timer = Timer()
259263 write_message_to_log(logger, args, **kwargs)
260 - project = kwargs.pop('full_project')
261 - exporter.generate_editor_dataset_launcher(project)
 264+ collection = get_value(args, 'collection')
 265+ dbname = kwargs.pop('full_project')
 266+ targets = get_value(args, 'datasets')
 267+ targets = targets.split(',')
 268+ for target in targets:
 269+ exporter.dataset_launcher(dbname, collection, target)
262270 timer.elapsed()
263271
264272
265273 def all_launcher(args, logger, **kwargs):
266274 print 'all_launcher'
267275 timer = Timer()
268 - message = 'Starting '
 276+ full_project = kwargs.get('full_project', None)
 277+ message = 'Start of building %s dataset.' % full_project
 278+ db.cleanup_database(full_project)
269279 write_message_to_log(logger, args, message, **kwargs)
270280 ignore = get_value(args, 'except')
271 - clean = get_value(args, 'clean')
 281+ clean = get_value(args, 'new')
272282 if clean:
273283 dirs = kwargs.get('directories')[1:]
274284 for dir in dirs:
275285 write_message_to_log(logger, args, verb='Deleting', **kwargs)
276286 utils.delete_file(dir, '')
277 - functions = {dump_downloader_launcher: 'download',
278 - chunker_launcher: 'split',
279 - extract_launcher: 'extract',
280 - sort_launcher: 'sort',
281 - transformer_launcher: 'transform',
282 - exporter_launcher: 'export'
283 - }
 287+
 288+ functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'),
 289+ (chunker_launcher, 'split'),
 290+ (extract_launcher, 'extract'),
 291+ (sort_launcher, 'sort'),
 292+ (store_launcher, 'store'),
 293+ (transformer_launcher, 'transform'),
 294+ (exporter_launcher, 'export')))
 295+
284296 for function, callname in functions.iteritems():
285297 if callname not in ignore:
286 - function(args, **kwargs)
 298+ function(args, logger, **kwargs)
287299
288300 timer.elapsed()
289301
@@ -293,7 +305,7 @@
294306 return tuple(choices)
295307
296308
297 -def show_languages(args, location, filename, project, full_project, language_code, language):
 309+def show_languages(args, logger, **kwargs):
298310 first = get_value(args, 'startswith')
299311 if first != None:
300312 first = first.title()
@@ -331,13 +343,20 @@
332344 logger.setLevel(logging.DEBUG)
333345
334346 default_language = determine_default_language()
 347+
 348+ datasets = {'cohort': 'generate_cohort_dataset',
 349+ 'long': 'generate_long_editor_dataset',
 350+ 'wide': 'generate_wide_editor_dataset',
 351+ }
 352+
335353 file_choices = ('stub-meta-history.xml.gz',
336354 'stub-meta-current.xml.gz',
337355 'pages-meta-history.xml.7z',
338356 'pages-meta-current.xml.bz2')
339357
 358+
340359 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
341 - subparsers = parser.add_subparsers(help='sub-command help')
 360+ subparsers = parser.add_subparsers(help='sub - command help')
342361
343362 parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.')
344363 parser_languages.add_argument('-s', '--startswith',
@@ -365,15 +384,9 @@
366385
367386 parser_store = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
368387 parser_store.set_defaults(func=store_launcher)
369 - parser_store.add_argument('-c', '--collection', action='store',
370 - help='Name of MongoDB collection',
371 - default='editors')
372388
373389 parser_transform = subparsers.add_parser('transform', help='Transform the raw datatable to an enriched dataset that can be exported.')
374390 parser_transform.set_defaults(func=transformer_launcher)
375 - parser_transform.add_argument('-c', '--collection', action='store',
376 - help='Name of MongoDB collection',
377 - default='editors')
378391
379392 parser_dataset = subparsers.add_parser('export', help='Create a dataset from the MongoDB and write it to a csv file.')
380393 parser_dataset.set_defaults(func=exporter_launcher)
@@ -383,11 +396,11 @@
384397 parser_all.add_argument('-e', '--except', action='store',
385398 help='Should be a list of functions that are to be ignored when executing \'all\'.',
386399 default=[])
 400+
387401 parser_all.add_argument('-n', '--new', action='store_false',
388402 help='This will delete all previous output and starts from scratch. Mostly useful for debugging purposes.',
389403 default=False)
390404
391 -
392405 parser.add_argument('-l', '--language', action='store',
393406 help='Example of valid languages.',
394407 choices=supported_languages(),
@@ -398,8 +411,14 @@
399412 choices=settings.projects.keys(),
400413 default='wiki')
401414
 415+ parser.add_argument('-c', '--collection', action='store',
 416+ help='Name of MongoDB collection',
 417+ default='editors')
 418+
 419+
402420 parser.add_argument('-o', '--location', action='store',
403421 help='Indicate where you want to store the downloaded file.',
 422+ default=settings.input_location
404423 )
405424
406425 parser.add_argument('-n', '--namespace', action='store',
@@ -412,6 +431,16 @@
413432 help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
414433 default='stub-meta-history.xml.gz')
415434
 435+ parser.add_argument('-dv', '--dumpversion', action='store',
 436+ choices=settings.dumpversions.keys(),
 437+ help='Indicate the Wikidump version that you are parsing.',
 438+ default=settings.dumpversions['0'])
 439+
 440+ parser.add_argument('-d', '--datasets', action='store',
 441+ choices=datasets.keys(),
 442+ help='Indicate what type of data should be exported.',
 443+ default=datasets['cohort'])
 444+
416445 parser.add_argument('-prog', '--progress', action='store_true', default=True,
417446 help='Indicate whether you want to have a progressbar.')
418447
@@ -422,9 +451,7 @@
423452 locations = determine_file_locations(args, logger)
424453 settings.verify_environment(locations['directories'])
425454 show_settings(args, logger, **locations)
426 - #locations['settings'] = settings
427455 args.func(args, logger, **locations)
428 - t1 = datetime.datetime.now()
429456
430457
431458 if __name__ == '__main__':
Index: trunk/tools/editor_trends/config.py
@@ -24,12 +24,22 @@
2525 from utils import utils
2626 import languages
2727
 28+def show_choices(settings, attr):
 29+ choices = getattr(settings, attr).items()
 30+ choices.sort()
 31+ choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices]
 32+ #print '\n'.join(choices)
 33+ return choices
 34+ #for choice in choices:
 35+ # print '%s\t%s' % (choice[0], choice[1])
 36+
2837 def create_configuration(settings, args):
2938 force = getattr(args, 'force', False)
3039 if not os.path.exists('wiki.cfg') or force:
3140 config = ConfigParser.RawConfigParser()
3241 project = None
3342 language = None
 43+ dumpversion = None
3444 language_map = languages.language_map()
3545 working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd())
3646 input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location)
@@ -46,6 +56,16 @@
4757 language = language_map[args.language]
4858 language = language if language in languages.MAPPING else args.language
4959
 60+ while dumpversion not in settings.dumpversions.keys():
 61+ choices = '\n'.join(show_choices(settings, 'dumpversions'))
 62+ dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0']))
 63+ if len(dumpversion) == 0:
 64+ dumpversion = settings.dumpversions['0']
 65+
 66+
 67+ #dumpversion = dumpversion if dumpversion in settings.dumpversions.keys() else args.dumpversion
 68+
 69+ dumpversion = settings.dumpversions[dumpversion]
5070 input_location = input_location if len(input_location) > 0 else settings.input_location
5171 working_directory = working_directory if len(working_directory) > 0 else os.getcwd()
5272
@@ -56,6 +76,7 @@
5777 config.add_section('wiki')
5878 config.set('wiki', 'project', project)
5979 config.set('wiki', 'language', language)
 80+ config.set('wiki', 'dumpversion', dumpversion)
6081
6182 fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
6283 config.write(fh)
@@ -63,6 +84,7 @@
6485
6586 settings.working_directory = config.get('file_locations', 'working_directory')
6687 settings.input_location = config.get('file_locations', 'input_location')
 88+ settings.xml_namespace = config.get('wiki', 'dumpversion')
6789 return settings
6890
6991
Index: trunk/tools/editor_trends/configuration.py
@@ -57,12 +57,11 @@
5858 #Change this to match your computers configuration (RAM / CPU)
5959 self.minimum_python_version = (2, 6)
6060 self.wp_dump_location = 'http://download.wikimedia.org'
61 - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
 61+ self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/'
6262 self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
6363 self.windows_register = {'7z.exe': 'Software\\7-Zip', }
6464 #Extensions of ascii files, this is used to determine the filemode to use
6565 self.platform = self.determine_platform()
66 - #self.compression = self.init_compression_tools()
6766
6867 self.architecture = platform.machine()
6968 self.working_directory = self.determine_working_directory()
@@ -75,6 +74,9 @@
7675
7776 self.load_configuration()
7877 self.set_custom_settings(**kwargs)
 78+ self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/',
 79+ '1': 'http://www.mediawiki.org/xml/export-0.3/',
 80+ }
7981 self.projects = {'wiki': 'wikipedia',
8082 'commons': 'commonswiki',
8183 'books': 'wikibooks',

Status & tagging log