Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -31,17 +31,20 @@ |
32 | 32 | settings = configuration.Settings() |
33 | 33 | |
34 | 34 | import languages |
| 35 | +import config |
35 | 36 | from utils import utils |
36 | 37 | from utils import dump_downloader |
37 | 38 | from utils import compression |
| 39 | +from utils import ordered_dict |
| 40 | +from database import db |
38 | 41 | from etl import chunker |
39 | 42 | from etl import extract |
40 | 43 | from etl import loader |
41 | 44 | from etl import transformer |
42 | 45 | from etl import exporter |
43 | | -import config |
44 | 46 | |
45 | 47 | |
| 48 | + |
46 | 49 | class Timer(object): |
47 | 50 | def __init__(self): |
48 | 51 | self.t0 = datetime.datetime.now() |
— | — | @@ -58,7 +61,7 @@ |
59 | 62 | return getattr(args, key, None) |
60 | 63 | |
61 | 64 | |
62 | | -def config_launcher(args, **kwargs): |
| 65 | +def config_launcher(args, logger, **kwargs): |
63 | 66 | settings.load_configuration() |
64 | 67 | |
65 | 68 | |
— | — | @@ -151,13 +154,15 @@ |
152 | 155 | language = kwargs.pop('language') |
153 | 156 | language_code = kwargs.pop('language_code') |
154 | 157 | config = {} |
155 | | - config['Project'] = settings.projects.get(kwargs.pop('project'), 'wiki').title() |
156 | | - config['Language'] = '%s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding)) |
157 | | - config['Input directory'] = kwargs.get('location') |
| 158 | + config['Project'] = '\t\t%s' % settings.projects.get(kwargs.pop('project'), 'wiki').title() |
| 159 | + config['Language'] = '\t%s / %s' % (language_map[language_code], language) #.decode(settings.encoding) |
| 160 | + config['Input directory'] = '\t%s' % kwargs.get('location') |
158 | 161 | config['Output directory'] = '%s and subdirectories' % kwargs.get('location') |
159 | 162 | |
160 | 163 | message = 'Final settings after parsing command line arguments:' |
161 | 164 | write_message_to_log(logger, args, message, None, **config) |
| 165 | + for c in config: |
| 166 | + print '%s\t%s' % (c, config[c]) |
162 | 167 | |
163 | 168 | |
164 | 169 | def dump_downloader_launcher(args, logger, **kwargs): |
— | — | @@ -177,7 +182,7 @@ |
178 | 183 | |
179 | 184 | |
180 | 185 | def chunker_launcher(args, logger, **kwargs): |
181 | | - print 'split_settings.input_filename_launcher' |
| 186 | + print 'chunker_launcher' |
182 | 187 | timer = Timer() |
183 | 188 | write_message_to_log(logger, args, **kwargs) |
184 | 189 | filename = kwargs.pop('filename') |
— | — | @@ -191,7 +196,7 @@ |
192 | 197 | file = filename.replace('.' + ext, '') |
193 | 198 | result = utils.check_file_exists(location, file) |
194 | 199 | if not result: |
195 | | - retcode = launch_zip_extractor(args, location, filename) |
| 200 | + retcode = launch_zip_extractor(args, logger, location, filename) |
196 | 201 | else: |
197 | 202 | retcode = 0 |
198 | 203 | if retcode != 0: |
— | — | @@ -201,7 +206,7 @@ |
202 | 207 | timer.elapsed() |
203 | 208 | |
204 | 209 | |
205 | | -def launch_zip_extractor(args, location, file): |
| 210 | +def launch_zip_extractor(args, logger, location, file): |
206 | 211 | timer = Timer() |
207 | 212 | write_message_to_log(logger, args, location=location, file=file) |
208 | 213 | compressor = compression.Compressor(location, file) |
— | — | @@ -226,9 +231,8 @@ |
227 | 232 | input = os.path.join(location, 'txt') |
228 | 233 | output = os.path.join(location, 'sorted') |
229 | 234 | final_output = os.path.join(location, 'dbready') |
230 | | - dbname = kwargs.pop('full_project') |
231 | 235 | loader.mergesort_launcher(input, output) |
232 | | - loader.mergesort_external_launcher(dbname, output, final_output) |
| 236 | + loader.mergesort_external_launcher(output, final_output) |
233 | 237 | timer.elapsed() |
234 | 238 | |
235 | 239 | |
— | — | @@ -249,40 +253,48 @@ |
250 | 254 | write_message_to_log(logger, args, **kwargs) |
251 | 255 | project = kwargs.pop('full_project') |
252 | 256 | collection = kwargs.pop('collection') |
253 | | - transformer.run_optimize_editors(project, collection) |
| 257 | + transformer.transform_editors_single_launcher(project, collection) |
254 | 258 | timer.elapsed() |
255 | 259 | |
256 | 260 | |
257 | 261 | def exporter_launcher(args, logger, **kwargs): |
258 | 262 | timer = Timer() |
259 | 263 | write_message_to_log(logger, args, **kwargs) |
260 | | - project = kwargs.pop('full_project') |
261 | | - exporter.generate_editor_dataset_launcher(project) |
| 264 | + collection = get_value(args, 'collection') |
| 265 | + dbname = kwargs.pop('full_project') |
| 266 | + targets = get_value(args, 'datasets') |
| 267 | + targets = targets.split(',') |
| 268 | + for target in targets: |
| 269 | + exporter.dataset_launcher(dbname, collection, target) |
262 | 270 | timer.elapsed() |
263 | 271 | |
264 | 272 | |
265 | 273 | def all_launcher(args, logger, **kwargs): |
266 | 274 | print 'all_launcher' |
267 | 275 | timer = Timer() |
268 | | - message = 'Starting ' |
| 276 | + full_project = kwargs.get('full_project', None) |
| 277 | + message = 'Start of building %s dataset.' % full_project |
| 278 | + db.cleanup_database(full_project) |
269 | 279 | write_message_to_log(logger, args, message, **kwargs) |
270 | 280 | ignore = get_value(args, 'except') |
271 | | - clean = get_value(args, 'clean') |
| 281 | + clean = get_value(args, 'new') |
272 | 282 | if clean: |
273 | 283 | dirs = kwargs.get('directories')[1:] |
274 | 284 | for dir in dirs: |
275 | 285 | write_message_to_log(logger, args, verb='Deleting', **kwargs) |
276 | 286 | utils.delete_file(dir, '') |
277 | | - functions = {dump_downloader_launcher: 'download', |
278 | | - chunker_launcher: 'split', |
279 | | - extract_launcher: 'extract', |
280 | | - sort_launcher: 'sort', |
281 | | - transformer_launcher: 'transform', |
282 | | - exporter_launcher: 'export' |
283 | | - } |
| 287 | + |
| 288 | + functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'), |
| 289 | + (chunker_launcher, 'split'), |
| 290 | + (extract_launcher, 'extract'), |
| 291 | + (sort_launcher, 'sort'), |
| 292 | + (store_launcher, 'store'), |
| 293 | + (transformer_launcher, 'transform'), |
| 294 | + (exporter_launcher, 'export'))) |
| 295 | + |
284 | 296 | for function, callname in functions.iteritems(): |
285 | 297 | if callname not in ignore: |
286 | | - function(args, **kwargs) |
| 298 | + function(args, logger, **kwargs) |
287 | 299 | |
288 | 300 | timer.elapsed() |
289 | 301 | |
— | — | @@ -293,7 +305,7 @@ |
294 | 306 | return tuple(choices) |
295 | 307 | |
296 | 308 | |
297 | | -def show_languages(args, location, filename, project, full_project, language_code, language): |
| 309 | +def show_languages(args, logger, **kwargs): |
298 | 310 | first = get_value(args, 'startswith') |
299 | 311 | if first != None: |
300 | 312 | first = first.title() |
— | — | @@ -331,13 +343,20 @@ |
332 | 344 | logger.setLevel(logging.DEBUG) |
333 | 345 | |
334 | 346 | default_language = determine_default_language() |
| 347 | + |
| 348 | + datasets = {'cohort': 'generate_cohort_dataset', |
| 349 | + 'long': 'generate_long_editor_dataset', |
| 350 | + 'wide': 'generate_wide_editor_dataset', |
| 351 | + } |
| 352 | + |
335 | 353 | file_choices = ('stub-meta-history.xml.gz', |
336 | 354 | 'stub-meta-current.xml.gz', |
337 | 355 | 'pages-meta-history.xml.7z', |
338 | 356 | 'pages-meta-current.xml.bz2') |
339 | 357 | |
| 358 | + |
340 | 359 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
341 | | - subparsers = parser.add_subparsers(help='sub-command help') |
| 360 | + subparsers = parser.add_subparsers(help='sub - command help') |
342 | 361 | |
343 | 362 | parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.') |
344 | 363 | parser_languages.add_argument('-s', '--startswith', |
— | — | @@ -365,15 +384,9 @@ |
366 | 385 | |
367 | 386 | parser_store = subparsers.add_parser('store', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') |
368 | 387 | parser_store.set_defaults(func=store_launcher) |
369 | | - parser_store.add_argument('-c', '--collection', action='store', |
370 | | - help='Name of MongoDB collection', |
371 | | - default='editors') |
372 | 388 | |
373 | 389 | parser_transform = subparsers.add_parser('transform', help='Transform the raw datatable to an enriched dataset that can be exported.') |
374 | 390 | parser_transform.set_defaults(func=transformer_launcher) |
375 | | - parser_transform.add_argument('-c', '--collection', action='store', |
376 | | - help='Name of MongoDB collection', |
377 | | - default='editors') |
378 | 391 | |
379 | 392 | parser_dataset = subparsers.add_parser('export', help='Create a dataset from the MongoDB and write it to a csv file.') |
380 | 393 | parser_dataset.set_defaults(func=exporter_launcher) |
— | — | @@ -383,11 +396,11 @@ |
384 | 397 | parser_all.add_argument('-e', '--except', action='store', |
385 | 398 | help='Should be a list of functions that are to be ignored when executing \'all\'.', |
386 | 399 | default=[]) |
| 400 | + |
387 | 401 | parser_all.add_argument('-n', '--new', action='store_false', |
388 | 402 | help='This will delete all previous output and starts from scratch. Mostly useful for debugging purposes.', |
389 | 403 | default=False) |
390 | 404 | |
391 | | - |
392 | 405 | parser.add_argument('-l', '--language', action='store', |
393 | 406 | help='Example of valid languages.', |
394 | 407 | choices=supported_languages(), |
— | — | @@ -398,8 +411,14 @@ |
399 | 412 | choices=settings.projects.keys(), |
400 | 413 | default='wiki') |
401 | 414 | |
| 415 | + parser.add_argument('-c', '--collection', action='store', |
| 416 | + help='Name of MongoDB collection', |
| 417 | + default='editors') |
| 418 | + |
| 419 | + |
402 | 420 | parser.add_argument('-o', '--location', action='store', |
403 | 421 | help='Indicate where you want to store the downloaded file.', |
| 422 | + default=settings.input_location |
404 | 423 | ) |
405 | 424 | |
406 | 425 | parser.add_argument('-n', '--namespace', action='store', |
— | — | @@ -412,6 +431,16 @@ |
413 | 432 | help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]), |
414 | 433 | default='stub-meta-history.xml.gz') |
415 | 434 | |
| 435 | + parser.add_argument('-dv', '--dumpversion', action='store', |
| 436 | + choices=settings.dumpversions.keys(), |
| 437 | + help='Indicate the Wikidump version that you are parsing.', |
| 438 | + default=settings.dumpversions['0']) |
| 439 | + |
| 440 | + parser.add_argument('-d', '--datasets', action='store', |
| 441 | + choices=datasets.keys(), |
| 442 | + help='Indicate what type of data should be exported.', |
| 443 | + default=datasets['cohort']) |
| 444 | + |
416 | 445 | parser.add_argument('-prog', '--progress', action='store_true', default=True, |
417 | 446 | help='Indicate whether you want to have a progressbar.') |
418 | 447 | |
— | — | @@ -422,9 +451,7 @@ |
423 | 452 | locations = determine_file_locations(args, logger) |
424 | 453 | settings.verify_environment(locations['directories']) |
425 | 454 | show_settings(args, logger, **locations) |
426 | | - #locations['settings'] = settings |
427 | 455 | args.func(args, logger, **locations) |
428 | | - t1 = datetime.datetime.now() |
429 | 456 | |
430 | 457 | |
431 | 458 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -24,12 +24,22 @@ |
25 | 25 | from utils import utils |
26 | 26 | import languages |
27 | 27 | |
| 28 | +def show_choices(settings, attr): |
| 29 | + choices = getattr(settings, attr).items() |
| 30 | + choices.sort() |
| 31 | + choices = ['%s\t%s' % (choice[0], choice[1]) for choice in choices] |
| 32 | + #print '\n'.join(choices) |
| 33 | + return choices |
| 34 | + #for choice in choices: |
| 35 | + # print '%s\t%s' % (choice[0], choice[1]) |
| 36 | + |
28 | 37 | def create_configuration(settings, args): |
29 | 38 | force = getattr(args, 'force', False) |
30 | 39 | if not os.path.exists('wiki.cfg') or force: |
31 | 40 | config = ConfigParser.RawConfigParser() |
32 | 41 | project = None |
33 | 42 | language = None |
| 43 | + dumpversion = None |
34 | 44 | language_map = languages.language_map() |
35 | 45 | working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
36 | 46 | input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location) |
— | — | @@ -46,6 +56,16 @@ |
47 | 57 | language = language_map[args.language] |
48 | 58 | language = language if language in languages.MAPPING else args.language |
49 | 59 | |
| 60 | + while dumpversion not in settings.dumpversions.keys(): |
| 61 | + choices = '\n'.join(show_choices(settings, 'dumpversions')) |
| 62 | + dumpversion = raw_input('Please indicate the version of the Wikipedia project you are analyzing.\nValid choices are:\n%s\nDefault is: 0 (%s)\nPress Enter to accept default.\n' % (choices, settings.dumpversions['0'])) |
| 63 | + if len(dumpversion) == 0: |
| 64 | + dumpversion = settings.dumpversions['0'] |
| 65 | + |
| 66 | + |
| 67 | + #dumpversion = dumpversion if dumpversion in settings.dumpversions.keys() else args.dumpversion |
| 68 | + |
| 69 | + dumpversion = settings.dumpversions[dumpversion] |
50 | 70 | input_location = input_location if len(input_location) > 0 else settings.input_location |
51 | 71 | working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
52 | 72 | |
— | — | @@ -56,6 +76,7 @@ |
57 | 77 | config.add_section('wiki') |
58 | 78 | config.set('wiki', 'project', project) |
59 | 79 | config.set('wiki', 'language', language) |
| 80 | + config.set('wiki', 'dumpversion', dumpversion) |
60 | 81 | |
61 | 82 | fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
62 | 83 | config.write(fh) |
— | — | @@ -63,6 +84,7 @@ |
64 | 85 | |
65 | 86 | settings.working_directory = config.get('file_locations', 'working_directory') |
66 | 87 | settings.input_location = config.get('file_locations', 'input_location') |
| 88 | + settings.xml_namespace = config.get('wiki', 'dumpversion') |
67 | 89 | return settings |
68 | 90 | |
69 | 91 | |
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -57,12 +57,11 @@ |
58 | 58 | #Change this to match your computers configuration (RAM / CPU) |
59 | 59 | self.minimum_python_version = (2, 6) |
60 | 60 | self.wp_dump_location = 'http://download.wikimedia.org' |
61 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
| 61 | + self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/' |
62 | 62 | self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
63 | 63 | self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
64 | 64 | #Extensions of ascii files, this is used to determine the filemode to use |
65 | 65 | self.platform = self.determine_platform() |
66 | | - #self.compression = self.init_compression_tools() |
67 | 66 | |
68 | 67 | self.architecture = platform.machine() |
69 | 68 | self.working_directory = self.determine_working_directory() |
— | — | @@ -75,6 +74,9 @@ |
76 | 75 | |
77 | 76 | self.load_configuration() |
78 | 77 | self.set_custom_settings(**kwargs) |
| 78 | + self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/', |
| 79 | + '1': 'http://www.mediawiki.org/xml/export-0.3/', |
| 80 | + } |
79 | 81 | self.projects = {'wiki': 'wikipedia', |
80 | 82 | 'commons': 'commonswiki', |
81 | 83 | 'books': 'wikibooks', |