Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -155,6 +155,12 @@ |
156 | 156 | executing all.', |
157 | 157 | default=[]) |
158 | 158 | |
| 159 | + |
| 160 | + parser.add_argument('-k', '--kaggle', |
| 161 | + action='store', |
| 162 | + help='Indicate whether the output is for Kaggle or not', |
| 163 | + default=False) |
| 164 | + |
159 | 165 | parser.add_argument('-l', '--language', |
160 | 166 | action='store', |
161 | 167 | help='Example of valid languages.', |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -55,7 +55,7 @@ |
56 | 56 | break |
57 | 57 | |
58 | 58 | fh = file_utils.create_txt_filehandle(self.rts.sorted, filename, |
59 | | - 'r', self.rts.encoding) |
| 59 | + 'r', 'utf-8') |
60 | 60 | for line in file_utils.read_raw_data(fh): |
61 | 61 | if len(line) == 12: |
62 | 62 | editor = line[2] |
— | — | @@ -72,7 +72,7 @@ |
73 | 73 | |
74 | 74 | def prepare_data(self, line): |
75 | 75 | article_id = int(line[1]) |
76 | | - username = line[3].encode(self.rts.encoding) |
| 76 | + username = line[3].encode('utf-8') |
77 | 77 | ns = int(line[4]) |
78 | 78 | date = text_utils.convert_timestamp_to_datetime_utc(line[6]) |
79 | 79 | md5 = line[7] |
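This hunk hardcodes 'utf-8' where rts.encoding used to be read. In Python 2 a unicode username must be encoded to a byte string before it can be written to a plain file handle; a minimal illustration (the username value is made up):

    # -*- coding: utf-8 -*-
    username = u'Müller'                 # unicode, as the XML parser delivers it
    encoded = username.encode('utf-8')   # byte string, safe to write to a csv file
    print type(username), type(encoded)  # <type 'unicode'> <type 'str'>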
— | — | @@ -115,7 +115,7 @@ |
116 | 116 | collection.ensure_index('category') |
117 | 117 | |
118 | 118 | location = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
119 | | - fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', rts.encoding) |
| 119 | + fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', 'utf-8') |
120 | 120 | print 'Storing article titles...' |
121 | 121 | for line in fh: |
122 | 122 | line = line.strip() |
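file_utils.create_txt_filehandle itself is not part of this changeset; a minimal sketch of such a helper, assuming from its call signature that it simply wraps codecs.open (an assumption, not confirmed by the source):

    import codecs
    import os

    def create_txt_filehandle(location, filename, mode, encoding):
        # hypothetical stand-in for file_utils.create_txt_filehandle:
        # open a text file with an explicit character encoding
        return codecs.open(os.path.join(location, filename), mode,
                           encoding=encoding)

    # fh = create_txt_filehandle(location, 'titles.csv', 'r', 'utf-8')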
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -464,19 +464,21 @@ |
465 | 465 | return editor |
466 | 466 | |
467 | 467 | |
468 | | -def determine_namespace(title, include_ns, exclude_ns): |
| 468 | +def determine_namespace(title, namespaces, include_ns, exclude_ns): |
469 | 469 | ''' |
470 | 470 | You can only determine whether an article belongs to the Main Namespace |
471 | 471 | by ruling out that it belongs to any other namespace
472 | 472 | ''' |
473 | 473 | ns = {} |
474 | 474 | if title != None: |
475 | | - for namespace in include_ns: |
476 | | - if title.startswith(namespace): |
477 | | - ns['namespace'] = include_ns[namespace] |
| 475 | + for key in include_ns: |
| 476 | + namespace = namespaces.get(key) |
| 477 | + if namespace and title.startswith(namespace): |
| 478 | + ns['namespace'] = key |
478 | 479 | if ns == {}: |
479 | | - for namespace in exclude_ns.values(): |
480 | | - if title.startswith(namespace): |
| 480 | + for key in exclude_ns: |
| 481 | + namespace = namespaces.get(key) |
| 482 | + if namespace and title.startswith(namespace): |
481 | 483 | '''article does not belong to any of the include_ns |
482 | 484 | namespaces''' |
483 | 485 | ns = False |
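The rewritten lookup iterates over canonical namespace keys and matches the title against the localized name taken from the dump, instead of against hardcoded English prefixes. A self-contained sketch of just the branch visible in this hunk (the localized names below are illustrative, and the fall-through behaviour when nothing matches is not shown here):

    def determine_namespace(title, namespaces, include_ns, exclude_ns):
        ns = {}
        if title is not None:
            for key in include_ns:
                namespace = namespaces.get(key)
                if namespace and title.startswith(namespace):
                    ns['namespace'] = key
            if ns == {}:
                for key in exclude_ns:
                    namespace = namespaces.get(key)
                    if namespace and title.startswith(namespace):
                        ns = False
        return ns

    # German Wikipedia: key 1 is localized as 'Diskussion'
    namespaces = {1: 'Diskussion', 2: 'Benutzer', 3: 'Benutzer Diskussion'}
    include_ns = {1: 'Talk', 2: 'User', 3: 'User Talk'}
    print determine_namespace('Diskussion:Beispiel', namespaces, include_ns, {})
    # {'namespace': 1}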
— | — | @@ -519,6 +521,37 @@ |
520 | 522 | return comment |
521 | 523 | |
522 | 524 | |
| 525 | +def create_namespace_dict(siteinfo, xml_namespace): |
| 526 | + ''' |
| 527 | + This function determines the local names of the different namespaces. |
| 528 | + ''' |
| 529 | + namespaces = {} |
| 530 | + print 'Constructing namespace dictionary' |
| 531 | + |
| 532 | + elements = siteinfo.find('%s%s' % (xml_namespace, 'namespaces')) |
| 533 | + for elem in elements: |
| 534 | + key = int(elem.get('key')) |
 | 535 | + namespaces[key] = elem.text
 | 536 | + text = elem.text if elem.text is not None else ''
| 537 | + try: |
| 538 | + print key, text.encode('utf-8') |
| 539 | + except UnicodeEncodeError: |
| 540 | + print key |
| 541 | + return namespaces |
| 542 | + |
| 543 | + |
| 544 | +def determine_xml_namespace(siteinfo): |
| 545 | + ''' |
 | 546 | + This function determines the XML namespace prefix of the dump, e.g. '{http://www.mediawiki.org/xml/export-0.4/}'.
| 547 | + ''' |
 | 548 | + for elem in siteinfo:
| 549 | + if elem.tag.endswith('sitename'): |
| 550 | + xml_namespace = elem.tag |
| 551 | + pos = xml_namespace.find('sitename') |
| 552 | + xml_namespace = xml_namespace[0:pos] |
| 553 | + return xml_namespace |
| 554 | + |
| 555 | + |
523 | 556 | def count_edits(article, counts, bots, xml_namespace): |
524 | 557 | title = parse_title(article['title']) |
525 | 558 | namespace = determine_namespace(title, {}, COUNT_EXCLUDE_NAMESPACE) |
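The two new helpers can be exercised against a stub of the siteinfo header that every MediaWiki dump starts with; a minimal sketch, with determine_xml_namespace and create_namespace_dict as defined above and the stub abbreviated (real dumps list all namespaces):

    from xml.etree.cElementTree import fromstring

    SITEINFO = '''<siteinfo xmlns="http://www.mediawiki.org/xml/export-0.4/">
      <sitename>Wikipedia</sitename>
      <namespaces>
        <namespace key="-1">Special</namespace>
        <namespace key="0" />
        <namespace key="1">Talk</namespace>
      </namespaces>
    </siteinfo>'''

    siteinfo = fromstring(SITEINFO)
    xml_namespace = determine_xml_namespace(siteinfo)
    print xml_namespace   # {http://www.mediawiki.org/xml/export-0.4/}
    namespaces = create_namespace_dict(siteinfo, xml_namespace)
    # namespaces now maps -1 -> 'Special', 0 -> None, 1 -> 'Talk'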
— | — | @@ -542,14 +575,15 @@ |
543 | 576 | |
544 | 577 | |
545 | 578 | def create_variables(article, cache, bots, xml_namespace, comments=False): |
546 | | - include_ns = {'User Talk': 3, |
547 | | - 'Wikipedia Talk': 5, |
548 | | - 'Talk': 1, |
549 | | - 'User': 2, |
550 | | - 'Wikipedia': 4, |
| 579 | + include_ns = {3: 'User Talk', |
| 580 | + 5: 'Wikipedia Talk', |
| 581 | + 1: 'Talk', |
| 582 | + 2: 'User', |
| 583 | + 4: 'Wikipedia' |
551 | 584 | } |
552 | 585 | title = parse_title(article['title']) |
553 | | - namespace = determine_namespace(title, include_ns, EXCLUDE_NAMESPACE) |
| 586 | + namespaces = article['namespaces'] |
| 587 | + namespace = determine_namespace(title, namespaces, include_ns, EXCLUDE_NAMESPACE) |
554 | 588 | title_meta = parse_title_meta_data(title, namespace) |
555 | 589 | if namespace != False: |
556 | 590 | cache.stats.count_articles += 1 |
— | — | @@ -600,39 +634,43 @@ |
601 | 635 | revision.clear() |
602 | 636 | |
603 | 637 | |
604 | | -def parse_xml(fh, xml_namespace, wikilytics=True): |
| 638 | +def parse_xml(fh, rts): |
605 | 639 | context = iterparse(fh, events=('end',)) |
606 | 640 | context = iter(context) |
607 | 641 | |
608 | 642 | article = {} |
609 | 643 | article['revisions'] = [] |
610 | 644 | id = False |
611 | | - |
612 | 645 | for event, elem in context: |
613 | | - if event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'title'): |
| 646 | + if event == 'end' and elem.tag.endswith('siteinfo'): |
| 647 | + xml_namespace = determine_xml_namespace(elem) |
| 648 | + namespaces = create_namespace_dict(elem, xml_namespace) |
| 649 | + article['namespaces'] = namespaces |
| 650 | + elif event == 'end' and elem.tag.endswith('title'): |
614 | 651 | article['title'] = elem |
615 | | - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'revision'): |
| 652 | + elif event == 'end' and elem.tag.endswith('revision'): |
616 | 653 | article['revisions'].append(elem) |
617 | | - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'id') and id == False: |
| 654 | + elif event == 'end' and elem.tag.endswith('id') and id == False: |
618 | 655 | article['id'] = elem |
619 | 656 | id = True |
620 | | - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'page'): |
621 | | - yield article |
| 657 | + elif event == 'end' and elem.tag.endswith('page'): |
| 658 | + yield article, xml_namespace |
622 | 659 | elem.clear() |
623 | 660 | article = {} |
624 | 661 | article['revisions'] = [] |
| 662 | + article['namespaces'] = namespaces |
625 | 663 | id = False |
626 | | - elif event == 'end': |
 | 664 | + elif rts.kaggle and event == 'end':
627 | 666 | elem.clear() |
628 | 667 | |
629 | 668 | |
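The parser now keys on elem.tag.endswith(...) instead of a prefix built from the hardcoded export-0.4 schema, so the same code handles dumps of any schema version. A self-contained sketch of the technique against a toy dump (Python 2 stdlib only):

    from StringIO import StringIO
    from xml.etree.cElementTree import iterparse

    DUMP = '''<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.4/">
      <page>
        <title>Talk:Example</title>
        <id>12</id>
        <revision><id>34</id></revision>
      </page>
    </mediawiki>'''

    for event, elem in iterparse(StringIO(DUMP), events=('end',)):
        # tags come back as '{http://www.mediawiki.org/xml/export-0.4/}title',
        # so endswith() matches regardless of the schema version in the xmlns
        if elem.tag.endswith('title'):
            print elem.text        # Talk:Example
        elif elem.tag.endswith('page'):
            elem.clear()           # drop the finished subtree to keep memory flat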
630 | 669 | def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts): |
631 | 670 | bots = detector.retrieve_bots('en') |
632 | | - xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}' |
633 | 671 | path = os.path.join(rts.location, 'txt') |
634 | 672 | |
635 | 673 | filehandles = [file_utils.create_txt_filehandle(path, '%s.csv' % fh, 'a', |
636 | | - rts.encoding) for fh in xrange(rts.max_filehandles)] |
| 674 | + 'utf-8') for fh in xrange(rts.max_filehandles)] |
637 | 675 | |
638 | 676 | title_file = os.path.join(path, 'titles.csv') |
639 | 677 | comment_file = os.path.join(path, 'comments.csv') |
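The pool of max_filehandles output files is opened once per worker and rows are fanned out across it. How a row picks its shard is not visible in this hunk; a sketch assuming the usual approach of hashing the editor id, which keeps all edits of one editor in the same csv shard:

    import codecs

    MAX_FILEHANDLES = 4   # stand-in for rts.max_filehandles

    filehandles = [codecs.open('%s.csv' % fh, 'a', encoding='utf-8')
                   for fh in xrange(MAX_FILEHANDLES)]

    def pick_filehandle(editor_id):
        # assumption: shard by a hash of the editor id so later per-editor
        # sorting and merging only has to look inside one shard
        return filehandles[hash(editor_id) % MAX_FILEHANDLES]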
— | — | @@ -659,9 +697,9 @@ |
660 | 698 | fh = file_utils.create_streaming_buffer(filename) |
661 | 699 | filename = os.path.split(filename)[1] |
662 | 700 | filename = os.path.splitext(filename)[0] |
663 | | - for article in parse_xml(fh, xml_namespace): |
| 701 | + for article, xml_namespace in parse_xml(fh, rts): |
664 | 702 | if dataset == 'training': |
665 | | - function(article, cache, bots, xml_namespace, wikilytics) |
| 703 | + function(article, cache, bots, xml_namespace) |
666 | 704 | elif dataset == 'prediction': |
667 | 705 | counts = function(article, counts, bots, xml_namespace) |
668 | 706 | i += 1 |
— | — | @@ -710,16 +748,19 @@ |
711 | 749 | res = file_utils.create_directory(output_txt) |
712 | 750 | |
713 | 751 | |
714 | | -def multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts): |
| 752 | +def multiprocessor_launcher(function, dataset, storage, locks, rts): |
715 | 753 | input_queue = JoinableQueue() |
716 | | - #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2'] |
717 | | - #files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2'] |
718 | 754 | |
719 | | - files = file_utils.retrieve_file_list(rts.input_location, extension) |
| 755 | + files = file_utils.retrieve_file_list(rts.location) |
| 756 | + if len(files) > cpu_count(): |
| 757 | + processors = cpu_count() - 1 |
| 758 | + else: |
| 759 | + processors = len(files) |
| 760 | + |
720 | 761 | #files = files[0:1] |
721 | | - |
| 762 | + print rts.input_location, rts.location |
722 | 763 | for filename in files: |
723 | | - filename = os.path.join(path, filename) |
| 764 | + filename = os.path.join(rts.location, filename) |
724 | 765 | print filename |
725 | 766 | input_queue.put(filename) |
726 | 767 | |
— | — | @@ -747,11 +788,9 @@ |
748 | 789 | function = create_variables |
749 | 790 | storage = 'csv' |
750 | 791 | dataset = 'training' |
751 | | - processors = 7 |
752 | | - extension = 'bz2' |
753 | 792 | rts = DummyRTS(path) |
754 | 793 | locks = [] |
755 | | - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
| 794 | + multiprocessor_launcher(function, dataset, storage, locks, rts) |
756 | 795 | |
757 | 796 | |
758 | 797 | def launcher_prediction(): |
— | — | @@ -762,11 +801,9 @@ |
763 | 802 | function = count_edits |
764 | 803 | storage = 'csv' |
765 | 804 | dataset = 'prediction' |
766 | | - processors = 7 |
767 | | - extension = 'bz2' |
768 | 805 | rts = DummyRTS(path) |
769 | 806 | locks = [] |
770 | | - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
| 807 | + multiprocessor_launcher(function, dataset, storage, locks, rts) |
771 | 808 | |
772 | 809 | |
773 | 810 | def launcher(rts): |
— | — | @@ -778,14 +815,12 @@ |
779 | 816 | function = create_variables |
780 | 817 | storage = 'csv' |
781 | 818 | dataset = 'training' |
782 | | - processors = 1 |
783 | | - extension = 'gz' |
784 | 819 | lock1 = RLock() |
785 | 820 | lock2 = RLock() |
786 | 821 | lock3 = RLock() |
787 | 822 | locks = [lock1, lock2, lock3] |
788 | 823 | setup(storage, rts) |
789 | | - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
| 824 | + multiprocessor_launcher(function, dataset, storage, locks, rts) |
790 | 825 | |
791 | 826 | |
792 | 827 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -44,7 +44,7 @@ |
45 | 45 | |
46 | 46 | |
47 | 47 | def remove_numeric_character_references(rts, text): |
48 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(rts.encoding) |
| 48 | + return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
49 | 49 | |
50 | 50 | |
51 | 51 | def lenient_deccharref(m): |
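lenient_deccharref is defined just below this hunk but its body is not shown; a sketch of the whole pipeline, assuming RE_NUMERIC_CHARACTER matches decimal character references and the helper wraps unichr (assumptions based on the names alone):

    import re

    RE_NUMERIC_CHARACTER = re.compile(r'&#(\d+);')

    def lenient_deccharref(m):
        # turn one decimal reference such as '&#233;' into u'\xe9'
        return unichr(int(m.group(1)))

    def remove_numeric_character_references(text):
        return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')

    print remove_numeric_character_references(u'caf&#233;')  # the utf-8 bytes of u'caf\xe9'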
— | — | @@ -278,7 +278,7 @@ |
279 | 279 | output = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
280 | 280 | widgets = log.init_progressbar_widgets('Extracting data') |
281 | 281 | filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a', |
282 | | - rts.encoding) for fh in xrange(rts.max_filehandles)] |
| 282 | + 'utf-8') for fh in xrange(rts.max_filehandles)] |
283 | 283 | while True: |
284 | 284 | total, processed = 0.0, 0.0 |
285 | 285 | try: |
— | — | @@ -297,7 +297,7 @@ |
298 | 298 | print 'Opening %s...' % (os.path.join(location, filename)) |
299 | 299 | print 'Filesize: %s' % filesize |
300 | 300 | fh1 = file_utils.create_txt_filehandle(location, filename, 'r', 'ascii') |
301 | | - fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', rts.encoding) |
| 301 | + fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', 'utf-8') |
302 | 302 | ns, xml_namespace = wikitree.parser.extract_meta_information(fh1) |
303 | 303 | ns = build_namespaces_locale(ns, rts.namespaces) |
304 | 304 | rts.xml_namespace = xml_namespace |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | fh = file_utils.create_txt_filehandle(self.rts.txt, |
52 | 52 | filename, |
53 | 53 | 'r', |
54 | | - self.rts.encoding) |
| 54 | + 'utf-8') |
55 | 55 | data = file_utils.read_unicode_text(fh) |
56 | 56 | fh.close() |
57 | 57 | for x, d in enumerate(data): |
— | — | @@ -121,7 +121,7 @@ |
122 | 122 | fh = file_utils.create_txt_filehandle(target, |
123 | 123 | 'merged_%s.txt' % iteration, |
124 | 124 | 'w', |
125 | | - rts.encoding) |
| 125 | + 'utf-8') |
126 | 126 | lines = 0 |
127 | 127 | for line in heapq.merge(*[readline(filename) for filename in files]): |
128 | 128 | file_utils.write_list_to_csv(line, fh) |
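The merge step relies on heapq.merge, which lazily combines iterators that are already individually sorted, so the merged output never has to fit in memory. A toy sketch with in-memory lists standing in for the pre-sorted shard files:

    import heapq

    shards = [[1, 4, 9], [2, 3, 10], [5, 6, 7]]   # each shard already sorted
    for value in heapq.merge(*[iter(shard) for shard in shards]):
        print value   # 1 2 3 4 5 6 7 9 10, one per line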
— | — | @@ -138,7 +138,7 @@ |
139 | 139 | fh = file_utils.create_txt_filehandle(rts.sorted, |
140 | 140 | filename, |
141 | 141 | 'w', |
142 | | - rts.encoding) |
| 142 | + 'utf-8') |
143 | 143 | file_utils.write_list_to_csv(sorted_data, fh) |
144 | 144 | fh.close() |
145 | 145 | |
Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -52,7 +52,6 @@ |
53 | 53 | def __init__(self): |
54 | 54 | self.minimum_python_version = (2, 6) |
55 | 55 | self.detect_python_version() |
56 | | - self.encoding = 'utf-8' |
57 | 56 | |
58 | 57 | #Date format as used by Erik Zachte |
59 | 58 | self.date_format = '%Y-%m-%d' |
— | — | @@ -67,7 +66,6 @@ |
68 | 67 | self.number_of_processes = cpu_count() |
69 | 68 | |
70 | 69 | self.wp_dump_location = 'http://dumps.wikimedia.org' |
71 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
72 | 70 | self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
73 | 71 | self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
74 | 72 | #Extensions of ascii files; this is used to determine the filemode to use
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -62,6 +62,7 @@ |
63 | 63 | self.language = self.update_language_settings() |
64 | 64 | self.charts = self.determine_chart(self.get_value('charts')) |
65 | 65 | self.keywords = self.split_keywords(self.get_value('keywords')) |
| 66 | + self.kaggle = self.get_value('kaggle') |
66 | 67 | self.function = self.get_value('func') |
67 | 68 | |
68 | 69 | self.ignore = self.get_value('except') |
— | — | @@ -140,8 +141,12 @@ |
141 | 142 | ''' |
142 | 143 | Construct the full project location |
143 | 144 | ''' |
144 | | - return os.path.join(self.output_location, self.language.code, |
| 145 | + if self.kaggle: |
| 146 | + return os.path.join(self.input_location, self.language.code, |
145 | 147 | self.project.name) |
| 148 | + else: |
| 149 | + return os.path.join(self.input_location, self.language.code, |
| 150 | + self.project.name) |
146 | 151 | |
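As committed, both branches of get_project_location return the same path, which makes the new if/else a no-op. A minimal sketch of the presumably intended logic, assuming the non-Kaggle case was meant to keep using output_location as it did before this change (an assumption, not confirmed by the changeset):

    import os

    def get_project_location(self):
        '''
        Construct the full project location
        '''
        # assumption: Kaggle runs read from input_location, everything else
        # keeps writing under output_location as before
        base = self.input_location if self.kaggle else self.output_location
        return os.path.join(base, self.language.code, self.project.name)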
147 | 152 | def show_settings(self): |
148 | 153 | ''' |