r85395 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r85394 | r85395 | r85396 >
Date:23:16, 4 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Removed encoding variable and replaced it with a string.
Modified paths:
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/settings.py (modified) (history)
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -155,6 +155,12 @@
156156 executing all.',
157157 default=[])
158158
 159+
 160+ parser.add_argument('-k', '--kaggle',
 161+ action='store',
 162+ help='Indicate whether the output is for Kaggle or not',
 163+ default=False)
 164+
159165 parser.add_argument('-l', '--language',
160166 action='store',
161167 help='Example of valid languages.',
Index: trunk/tools/editor_trends/etl/store.py
@@ -55,7 +55,7 @@
5656 break
5757
5858 fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
59 - 'r', self.rts.encoding)
 59+ 'r', 'utf-8')
6060 for line in file_utils.read_raw_data(fh):
6161 if len(line) == 12:
6262 editor = line[2]
@@ -72,7 +72,7 @@
7373
7474 def prepare_data(self, line):
7575 article_id = int(line[1])
76 - username = line[3].encode(self.rts.encoding)
 76+ username = line[3].encode('utf-8')
7777 ns = int(line[4])
7878 date = text_utils.convert_timestamp_to_datetime_utc(line[6])
7979 md5 = line[7]
@@ -115,7 +115,7 @@
116116 collection.ensure_index('category')
117117
118118 location = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt')
119 - fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', rts.encoding)
 119+ fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', 'utf-8')
120120 print 'Storing article titles...'
121121 for line in fh:
122122 line = line.strip()
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -464,19 +464,21 @@
465465 return editor
466466
467467
468 -def determine_namespace(title, include_ns, exclude_ns):
 468+def determine_namespace(title, namespaces, include_ns, exclude_ns):
469469 '''
470470 You can only determine whether an article belongs to the Main Namespace
471471 by ruling out that it does not belong to any other namepace
472472 '''
473473 ns = {}
474474 if title != None:
475 - for namespace in include_ns:
476 - if title.startswith(namespace):
477 - ns['namespace'] = include_ns[namespace]
 475+ for key in include_ns:
 476+ namespace = namespaces.get(key)
 477+ if namespace and title.startswith(namespace):
 478+ ns['namespace'] = key
478479 if ns == {}:
479 - for namespace in exclude_ns.values():
480 - if title.startswith(namespace):
 480+ for key in exclude_ns:
 481+ namespace = namespaces.get(key)
 482+ if namespace and title.startswith(namespace):
481483 '''article does not belong to any of the include_ns
482484 namespaces'''
483485 ns = False
@@ -519,6 +521,37 @@
520522 return comment
521523
522524
 525+def create_namespace_dict(siteinfo, xml_namespace):
 526+ '''
 527+ This function determines the local names of the different namespaces.
 528+ '''
 529+ namespaces = {}
 530+ print 'Constructing namespace dictionary'
 531+
 532+ elements = siteinfo.find('%s%s' % (xml_namespace, 'namespaces'))
 533+ for elem in elements:
 534+ key = int(elem.get('key'))
 535+ namespaces[key] = elem.text #extract_text(ns)
 536+ text = elem.text if elem.text != None else ''
 537+ try:
 538+ print key, text.encode('utf-8')
 539+ except UnicodeEncodeError:
 540+ print key
 541+ return namespaces
 542+
 543+
 544+def determine_xml_namespace(siteinfo):
 545+ '''
 546+ This function determines the xml_namespace version
 547+ '''
 548+ for elem in siteinfo :
 549+ if elem.tag.endswith('sitename'):
 550+ xml_namespace = elem.tag
 551+ pos = xml_namespace.find('sitename')
 552+ xml_namespace = xml_namespace[0:pos]
 553+ return xml_namespace
 554+
 555+
523556 def count_edits(article, counts, bots, xml_namespace):
524557 title = parse_title(article['title'])
525558 namespace = determine_namespace(title, {}, COUNT_EXCLUDE_NAMESPACE)
@@ -542,14 +575,15 @@
543576
544577
545578 def create_variables(article, cache, bots, xml_namespace, comments=False):
546 - include_ns = {'User Talk': 3,
547 - 'Wikipedia Talk': 5,
548 - 'Talk': 1,
549 - 'User': 2,
550 - 'Wikipedia': 4,
 579+ include_ns = {3: 'User Talk',
 580+ 5: 'Wikipedia Talk',
 581+ 1: 'Talk',
 582+ 2: 'User',
 583+ 4: 'Wikipedia'
551584 }
552585 title = parse_title(article['title'])
553 - namespace = determine_namespace(title, include_ns, EXCLUDE_NAMESPACE)
 586+ namespaces = article['namespaces']
 587+ namespace = determine_namespace(title, namespaces, include_ns, EXCLUDE_NAMESPACE)
554588 title_meta = parse_title_meta_data(title, namespace)
555589 if namespace != False:
556590 cache.stats.count_articles += 1
@@ -600,39 +634,43 @@
601635 revision.clear()
602636
603637
604 -def parse_xml(fh, xml_namespace, wikilytics=True):
 638+def parse_xml(fh, rts):
605639 context = iterparse(fh, events=('end',))
606640 context = iter(context)
607641
608642 article = {}
609643 article['revisions'] = []
610644 id = False
611 -
612645 for event, elem in context:
613 - if event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'title'):
 646+ if event == 'end' and elem.tag.endswith('siteinfo'):
 647+ xml_namespace = determine_xml_namespace(elem)
 648+ namespaces = create_namespace_dict(elem, xml_namespace)
 649+ article['namespaces'] = namespaces
 650+ elif event == 'end' and elem.tag.endswith('title'):
614651 article['title'] = elem
615 - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'revision'):
 652+ elif event == 'end' and elem.tag.endswith('revision'):
616653 article['revisions'].append(elem)
617 - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'id') and id == False:
 654+ elif event == 'end' and elem.tag.endswith('id') and id == False:
618655 article['id'] = elem
619656 id = True
620 - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'page'):
621 - yield article
 657+ elif event == 'end' and elem.tag.endswith('page'):
 658+ yield article, xml_namespace
622659 elem.clear()
623660 article = {}
624661 article['revisions'] = []
 662+ article['namespaces'] = namespaces
625663 id = False
626 - elif event == 'end':
 664+ elif rts.kaggle == True and event == 'end':
 665+ print 'I am cleaning up'
627666 elem.clear()
628667
629668
630669 def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts):
631670 bots = detector.retrieve_bots('en')
632 - xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}'
633671 path = os.path.join(rts.location, 'txt')
634672
635673 filehandles = [file_utils.create_txt_filehandle(path, '%s.csv' % fh, 'a',
636 - rts.encoding) for fh in xrange(rts.max_filehandles)]
 674+ 'utf-8') for fh in xrange(rts.max_filehandles)]
637675
638676 title_file = os.path.join(path, 'titles.csv')
639677 comment_file = os.path.join(path, 'comments.csv')
@@ -659,9 +697,9 @@
660698 fh = file_utils.create_streaming_buffer(filename)
661699 filename = os.path.split(filename)[1]
662700 filename = os.path.splitext(filename)[0]
663 - for article in parse_xml(fh, xml_namespace):
 701+ for article, xml_namespace in parse_xml(fh, rts):
664702 if dataset == 'training':
665 - function(article, cache, bots, xml_namespace, wikilytics)
 703+ function(article, cache, bots, xml_namespace)
666704 elif dataset == 'prediction':
667705 counts = function(article, counts, bots, xml_namespace)
668706 i += 1
@@ -710,16 +748,19 @@
711749 res = file_utils.create_directory(output_txt)
712750
713751
714 -def multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts):
 752+def multiprocessor_launcher(function, dataset, storage, locks, rts):
715753 input_queue = JoinableQueue()
716 - #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
717 - #files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']
718754
719 - files = file_utils.retrieve_file_list(rts.input_location, extension)
 755+ files = file_utils.retrieve_file_list(rts.location)
 756+ if len(files) > cpu_count():
 757+ processors = cpu_count() - 1
 758+ else:
 759+ processors = len(files)
 760+
720761 #files = files[0:1]
721 -
 762+ print rts.input_location, rts.location
722763 for filename in files:
723 - filename = os.path.join(path, filename)
 764+ filename = os.path.join(rts.location, filename)
724765 print filename
725766 input_queue.put(filename)
726767
@@ -747,11 +788,9 @@
748789 function = create_variables
749790 storage = 'csv'
750791 dataset = 'training'
751 - processors = 7
752 - extension = 'bz2'
753792 rts = DummyRTS(path)
754793 locks = []
755 - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts)
 794+ multiprocessor_launcher(function, dataset, storage, locks, rts)
756795
757796
758797 def launcher_prediction():
@@ -762,11 +801,9 @@
763802 function = count_edits
764803 storage = 'csv'
765804 dataset = 'prediction'
766 - processors = 7
767 - extension = 'bz2'
768805 rts = DummyRTS(path)
769806 locks = []
770 - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts)
 807+ multiprocessor_launcher(function, dataset, storage, locks, rts)
771808
772809
773810 def launcher(rts):
@@ -778,14 +815,12 @@
779816 function = create_variables
780817 storage = 'csv'
781818 dataset = 'training'
782 - processors = 1
783 - extension = 'gz'
784819 lock1 = RLock()
785820 lock2 = RLock()
786821 lock3 = RLock()
787822 locks = [lock1, lock2, lock3]
788823 setup(storage, rts)
789 - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts)
 824+ multiprocessor_launcher(function, dataset, storage, locks, rts)
790825
791826
792827 if __name__ == '__main__':
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -44,7 +44,7 @@
4545
4646
4747 def remove_numeric_character_references(rts, text):
48 - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(rts.encoding)
 48+ return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
4949
5050
5151 def lenient_deccharref(m):
@@ -278,7 +278,7 @@
279279 output = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt')
280280 widgets = log.init_progressbar_widgets('Extracting data')
281281 filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a',
282 - rts.encoding) for fh in xrange(rts.max_filehandles)]
 282+ 'utf-8') for fh in xrange(rts.max_filehandles)]
283283 while True:
284284 total, processed = 0.0, 0.0
285285 try:
@@ -297,7 +297,7 @@
298298 print 'Opening %s...' % (os.path.join(location, filename))
299299 print 'Filesize: %s' % filesize
300300 fh1 = file_utils.create_txt_filehandle(location, filename, 'r', 'ascii')
301 - fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', rts.encoding)
 301+ fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', 'utf-8')
302302 ns, xml_namespace = wikitree.parser.extract_meta_information(fh1)
303303 ns = build_namespaces_locale(ns, rts.namespaces)
304304 rts.xml_namespace = xml_namespace
Index: trunk/tools/editor_trends/etl/sort.py
@@ -50,7 +50,7 @@
5151 fh = file_utils.create_txt_filehandle(self.rts.txt,
5252 filename,
5353 'r',
54 - self.rts.encoding)
 54+ 'utf-8')
5555 data = file_utils.read_unicode_text(fh)
5656 fh.close()
5757 for x, d in enumerate(data):
@@ -121,7 +121,7 @@
122122 fh = file_utils.create_txt_filehandle(target,
123123 'merged_%s.txt' % iteration,
124124 'w',
125 - rts.encoding)
 125+ 'utf-8')
126126 lines = 0
127127 for line in heapq.merge(*[readline(filename) for filename in files]):
128128 file_utils.write_list_to_csv(line, fh)
@@ -138,7 +138,7 @@
139139 fh = file_utils.create_txt_filehandle(rts.sorted,
140140 filename,
141141 'w',
142 - rts.encoding)
 142+ 'utf-8')
143143 file_utils.write_list_to_csv(sorted_data, fh)
144144 fh.close()
145145
Index: trunk/tools/editor_trends/classes/settings.py
@@ -52,7 +52,6 @@
5353 def __init__(self):
5454 self.minimum_python_version = (2, 6)
5555 self.detect_python_version()
56 - self.encoding = 'utf-8'
5756
5857 #Date format as used by Erik Zachte
5958 self.date_format = '%Y-%m-%d'
@@ -67,7 +66,6 @@
6867 self.number_of_processes = cpu_count()
6968
7069 self.wp_dump_location = 'http://dumps.wikimedia.org'
71 - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/'
7270 self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json']
7371 self.windows_register = {'7z.exe': 'Software\\7-Zip', }
7472 #Extensions of ascii files, this is used to determine the filemode to use
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -62,6 +62,7 @@
6363 self.language = self.update_language_settings()
6464 self.charts = self.determine_chart(self.get_value('charts'))
6565 self.keywords = self.split_keywords(self.get_value('keywords'))
 66+ self.kaggle = self.get_value('kaggle')
6667 self.function = self.get_value('func')
6768
6869 self.ignore = self.get_value('except')
@@ -140,8 +141,12 @@
141142 '''
142143 Construct the full project location
143144 '''
144 - return os.path.join(self.output_location, self.language.code,
 145+ if self.kaggle:
 146+ return os.path.join(self.input_location, self.language.code,
145147 self.project.name)
 148+ else:
 149+ return os.path.join(self.input_location, self.language.code,
 150+ self.project.name)
146151
147152 def show_settings(self):
148153 '''