Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -155,6 +155,12 @@ |
156 | 156 | executing all.', |
157 | 157 | default=[]) |
158 | 158 | |
| 159 | + |
| 160 | + parser.add_argument('-k', '--kaggle', |
| 161 | + action='store', |
| 162 | + help='Indicate whether the output is for Kaggle or not', |
| 163 | + default=False) |
| 164 | + |
159 | 165 | parser.add_argument('-l', '--language', |
160 | 166 | action='store', |
161 | 167 | help='Example of valid languages.', |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -55,7 +55,7 @@ |
56 | 56 | break |
57 | 57 | |
58 | 58 | fh = file_utils.create_txt_filehandle(self.rts.sorted, filename, |
59 | | - 'r', self.rts.encoding) |
| 59 | + 'r', 'utf-8') |
60 | 60 | for line in file_utils.read_raw_data(fh): |
61 | 61 | if len(line) == 12: |
62 | 62 | editor = line[2] |
— | — | @@ -72,7 +72,7 @@ |
73 | 73 | |
74 | 74 | def prepare_data(self, line): |
75 | 75 | article_id = int(line[1]) |
76 | | - username = line[3].encode(self.rts.encoding) |
| 76 | + username = line[3].encode('utf-8') |
77 | 77 | ns = int(line[4]) |
78 | 78 | date = text_utils.convert_timestamp_to_datetime_utc(line[6]) |
79 | 79 | md5 = line[7] |
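This hunk hardcodes 'utf-8' where rts.encoding used to be read. In Python 2 a unicode username must be encoded to a byte string before it can be written to a plain file handle; a minimal illustration (the username value is made up):

    # -*- coding: utf-8 -*-
    username = u'Müller'                 # unicode, as the XML parser delivers it
    encoded = username.encode('utf-8')   # byte string, safe to write to a csv file
    print type(username), type(encoded)  # <type 'unicode'> <type 'str'>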
— | — | @@ -115,7 +115,7 @@ |
116 | 116 | collection.ensure_index('category') |
117 | 117 | |
118 | 118 | location = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
119 | | - fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', rts.encoding) |
| 119 | + fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', 'utf-8') |
120 | 120 | print 'Storing article titles...' |
121 | 121 | for line in fh: |
122 | 122 | line = line.strip() |
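file_utils.create_txt_filehandle itself is not part of this changeset; a minimal sketch of such a helper, assuming from its call signature that it simply wraps codecs.open (an assumption, not confirmed by the source):

    import codecs
    import os

    def create_txt_filehandle(location, filename, mode, encoding):
        # hypothetical stand-in for file_utils.create_txt_filehandle:
        # open a text file with an explicit character encoding
        return codecs.open(os.path.join(location, filename), mode,
                           encoding=encoding)

    # fh = create_txt_filehandle(location, 'titles.csv', 'r', 'utf-8')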
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -464,19 +464,21 @@ |
465 | 465 | return editor |
466 | 466 | |
467 | 467 | |
468 | | -def determine_namespace(title, include_ns, exclude_ns): |
| 468 | +def determine_namespace(title, namespaces, include_ns, exclude_ns): |
469 | 469 | ''' |
470 | 470 | You can only determine whether an article belongs to the Main Namespace |
471 | 471 | by ruling out that it belongs to any other namespace
472 | 472 | ''' |
473 | 473 | ns = {} |
474 | 474 | if title != None: |
475 | | - for namespace in include_ns: |
476 | | - if title.startswith(namespace): |
477 | | - ns['namespace'] = include_ns[namespace] |
| 475 | + for key in include_ns: |
| 476 | + namespace = namespaces.get(key) |
| 477 | + if namespace and title.startswith(namespace): |
| 478 | + ns['namespace'] = key |
478 | 479 | if ns == {}: |
479 | | - for namespace in exclude_ns.values(): |
480 | | - if title.startswith(namespace): |
| 480 | + for key in exclude_ns: |
| 481 | + namespace = namespaces.get(key) |
| 482 | + if namespace and title.startswith(namespace): |
481 | 483 | '''article does not belong to any of the include_ns |
482 | 484 | namespaces''' |
483 | 485 | ns = False |
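The rewritten lookup iterates over canonical namespace keys and matches the title against the localized name taken from the dump, instead of against hardcoded English prefixes. A self-contained sketch of just the branch visible in this hunk (the localized names below are illustrative, and the fall-through behaviour when nothing matches is not shown here):

    def determine_namespace(title, namespaces, include_ns, exclude_ns):
        ns = {}
        if title is not None:
            for key in include_ns:
                namespace = namespaces.get(key)
                if namespace and title.startswith(namespace):
                    ns['namespace'] = key
            if ns == {}:
                for key in exclude_ns:
                    namespace = namespaces.get(key)
                    if namespace and title.startswith(namespace):
                        ns = False
        return ns

    # German Wikipedia: key 1 is localized as 'Diskussion'
    namespaces = {1: 'Diskussion', 2: 'Benutzer', 3: 'Benutzer Diskussion'}
    include_ns = {1: 'Talk', 2: 'User', 3: 'User Talk'}
    print determine_namespace('Diskussion:Beispiel', namespaces, include_ns, {})
    # {'namespace': 1}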
— | — | @@ -519,6 +521,37 @@ |
520 | 522 | return comment |
521 | 523 | |
522 | 524 | |
| 525 | +def create_namespace_dict(siteinfo, xml_namespace): |
| 526 | + ''' |
| 527 | + This function determines the local names of the different namespaces. |
| 528 | + ''' |
| 529 | + namespaces = {} |
| 530 | + print 'Constructing namespace dictionary' |
| 531 | + |
| 532 | + elements = siteinfo.find('%s%s' % (xml_namespace, 'namespaces')) |
| 533 | + for elem in elements: |
| 534 | + key = int(elem.get('key')) |
 | 535 | + namespaces[key] = elem.text
 | 536 | + text = elem.text if elem.text is not None else ''
| 537 | + try: |
| 538 | + print key, text.encode('utf-8') |
| 539 | + except UnicodeEncodeError: |
| 540 | + print key |
| 541 | + return namespaces |
| 542 | + |
| 543 | + |
| 544 | +def determine_xml_namespace(siteinfo): |
| 545 | + ''' |
 | 546 | + This function determines the XML namespace prefix of the dump, e.g. '{http://www.mediawiki.org/xml/export-0.4/}'.
| 547 | + ''' |
 | 548 | + for elem in siteinfo:
| 549 | + if elem.tag.endswith('sitename'): |
| 550 | + xml_namespace = elem.tag |
| 551 | + pos = xml_namespace.find('sitename') |
| 552 | + xml_namespace = xml_namespace[0:pos] |
| 553 | + return xml_namespace |
| 554 | + |
| 555 | + |
523 | 556 | def count_edits(article, counts, bots, xml_namespace): |
524 | 557 | title = parse_title(article['title']) |
525 | 558 | namespace = determine_namespace(title, {}, COUNT_EXCLUDE_NAMESPACE) |
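The two new helpers can be exercised against a stub of the siteinfo header that every MediaWiki dump starts with; a minimal sketch, with determine_xml_namespace and create_namespace_dict as defined above and the stub abbreviated (real dumps list all namespaces):

    from xml.etree.cElementTree import fromstring

    SITEINFO = '''<siteinfo xmlns="http://www.mediawiki.org/xml/export-0.4/">
      <sitename>Wikipedia</sitename>
      <namespaces>
        <namespace key="-1">Special</namespace>
        <namespace key="0" />
        <namespace key="1">Talk</namespace>
      </namespaces>
    </siteinfo>'''

    siteinfo = fromstring(SITEINFO)
    xml_namespace = determine_xml_namespace(siteinfo)
    print xml_namespace   # {http://www.mediawiki.org/xml/export-0.4/}
    namespaces = create_namespace_dict(siteinfo, xml_namespace)
    # namespaces now maps -1 -> 'Special', 0 -> None, 1 -> 'Talk'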
— | — | @@ -542,14 +575,15 @@ |
543 | 576 | |
544 | 577 | |
545 | 578 | def create_variables(article, cache, bots, xml_namespace, comments=False): |
546 | | - include_ns = {'User Talk': 3, |
547 | | - 'Wikipedia Talk': 5, |
548 | | - 'Talk': 1, |
549 | | - 'User': 2, |
550 | | - 'Wikipedia': 4, |
| 579 | + include_ns = {3: 'User Talk', |
| 580 | + 5: 'Wikipedia Talk', |
| 581 | + 1: 'Talk', |
| 582 | + 2: 'User', |
| 583 | + 4: 'Wikipedia' |
551 | 584 | } |
552 | 585 | title = parse_title(article['title']) |
553 | | - namespace = determine_namespace(title, include_ns, EXCLUDE_NAMESPACE) |
| 586 | + namespaces = article['namespaces'] |
| 587 | + namespace = determine_namespace(title, namespaces, include_ns, EXCLUDE_NAMESPACE) |
554 | 588 | title_meta = parse_title_meta_data(title, namespace) |
555 | 589 | if namespace != False: |
556 | 590 | cache.stats.count_articles += 1 |
— | — | @@ -600,39 +634,43 @@ |
601 | 635 | revision.clear() |
602 | 636 | |
603 | 637 | |
604 | | -def parse_xml(fh, xml_namespace, wikilytics=True): |
| 638 | +def parse_xml(fh, rts): |
605 | 639 | context = iterparse(fh, events=('end',)) |
606 | 640 | context = iter(context) |
607 | 641 | |
608 | 642 | article = {} |
609 | 643 | article['revisions'] = [] |
610 | 644 | id = False |
611 | | - |
612 | 645 | for event, elem in context: |
613 | | - if event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'title'): |
| 646 | + if event == 'end' and elem.tag.endswith('siteinfo'): |
| 647 | + xml_namespace = determine_xml_namespace(elem) |
| 648 | + namespaces = create_namespace_dict(elem, xml_namespace) |
| 649 | + article['namespaces'] = namespaces |
| 650 | + elif event == 'end' and elem.tag.endswith('title'): |
614 | 651 | article['title'] = elem |
615 | | - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'revision'): |
| 652 | + elif event == 'end' and elem.tag.endswith('revision'): |
616 | 653 | article['revisions'].append(elem) |
617 | | - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'id') and id == False: |
| 654 | + elif event == 'end' and elem.tag.endswith('id') and id == False: |
618 | 655 | article['id'] = elem |
619 | 656 | id = True |
620 | | - elif event == 'end' and elem.tag == '%s%s' % (xml_namespace, 'page'): |
621 | | - yield article |
| 657 | + elif event == 'end' and elem.tag.endswith('page'): |
| 658 | + yield article, xml_namespace |
622 | 659 | elem.clear() |
623 | 660 | article = {} |
624 | 661 | article['revisions'] = [] |
| 662 | + article['namespaces'] = namespaces |
625 | 663 | id = False |
626 | | - elif event == 'end': |
 | 664 | + elif rts.kaggle and event == 'end':
627 | 666 | elem.clear() |
628 | 667 | |
629 | 668 | |
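The parser now keys on elem.tag.endswith(...) instead of a prefix built from the hardcoded export-0.4 schema, so the same code handles dumps of any schema version. A self-contained sketch of the technique against a toy dump (Python 2 stdlib only):

    from StringIO import StringIO
    from xml.etree.cElementTree import iterparse

    DUMP = '''<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.4/">
      <page>
        <title>Talk:Example</title>
        <id>12</id>
        <revision><id>34</id></revision>
      </page>
    </mediawiki>'''

    for event, elem in iterparse(StringIO(DUMP), events=('end',)):
        # tags come back as '{http://www.mediawiki.org/xml/export-0.4/}title',
        # so endswith() matches regardless of the schema version in the xmlns
        if elem.tag.endswith('title'):
            print elem.text        # Talk:Example
        elif elem.tag.endswith('page'):
            elem.clear()           # drop the finished subtree to keep memory flat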
630 | 669 | def stream_raw_xml(input_queue, storage, process_id, function, dataset, locks, rts): |
631 | 670 | bots = detector.retrieve_bots('en') |
632 | | - xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}' |
633 | 671 | path = os.path.join(rts.location, 'txt') |
634 | 672 | |
635 | 673 | filehandles = [file_utils.create_txt_filehandle(path, '%s.csv' % fh, 'a', |
636 | | - rts.encoding) for fh in xrange(rts.max_filehandles)] |
| 674 | + 'utf-8') for fh in xrange(rts.max_filehandles)] |
637 | 675 | |
638 | 676 | title_file = os.path.join(path, 'titles.csv') |
639 | 677 | comment_file = os.path.join(path, 'comments.csv') |
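The pool of max_filehandles output files is opened once per worker and rows are fanned out across it. How a row picks its shard is not visible in this hunk; a sketch assuming the usual approach of hashing the editor id, which keeps all edits of one editor in the same csv shard:

    import codecs

    MAX_FILEHANDLES = 4   # stand-in for rts.max_filehandles

    filehandles = [codecs.open('%s.csv' % fh, 'a', encoding='utf-8')
                   for fh in xrange(MAX_FILEHANDLES)]

    def pick_filehandle(editor_id):
        # assumption: shard by a hash of the editor id so later per-editor
        # sorting and merging only has to look inside one shard
        return filehandles[hash(editor_id) % MAX_FILEHANDLES]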
— | — | @@ -659,9 +697,9 @@ |
660 | 698 | fh = file_utils.create_streaming_buffer(filename) |
661 | 699 | filename = os.path.split(filename)[1] |
662 | 700 | filename = os.path.splitext(filename)[0] |
663 | | - for article in parse_xml(fh, xml_namespace): |
| 701 | + for article, xml_namespace in parse_xml(fh, rts): |
664 | 702 | if dataset == 'training': |
665 | | - function(article, cache, bots, xml_namespace, wikilytics) |
| 703 | + function(article, cache, bots, xml_namespace) |
666 | 704 | elif dataset == 'prediction': |
667 | 705 | counts = function(article, counts, bots, xml_namespace) |
668 | 706 | i += 1 |
— | — | @@ -710,16 +748,19 @@ |
711 | 749 | res = file_utils.create_directory(output_txt) |
712 | 750 | |
713 | 751 | |
714 | | -def multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts): |
| 752 | +def multiprocessor_launcher(function, dataset, storage, locks, rts): |
715 | 753 | input_queue = JoinableQueue() |
716 | | - #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2'] |
717 | | - #files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2'] |
718 | 754 | |
719 | | - files = file_utils.retrieve_file_list(rts.input_location, extension) |
| 755 | + files = file_utils.retrieve_file_list(rts.location) |
| 756 | + if len(files) > cpu_count(): |
| 757 | + processors = cpu_count() - 1 |
| 758 | + else: |
| 759 | + processors = len(files) |
| 760 | + |
720 | 761 | #files = files[0:1] |
721 | | - |
| 762 | + print rts.input_location, rts.location |
722 | 763 | for filename in files: |
723 | | - filename = os.path.join(path, filename) |
| 764 | + filename = os.path.join(rts.location, filename) |
724 | 765 | print filename |
725 | 766 | input_queue.put(filename) |
726 | 767 | |
— | — | @@ -747,11 +788,9 @@ |
748 | 789 | function = create_variables |
749 | 790 | storage = 'csv' |
750 | 791 | dataset = 'training' |
751 | | - processors = 7 |
752 | | - extension = 'bz2' |
753 | 792 | rts = DummyRTS(path) |
754 | 793 | locks = [] |
755 | | - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
| 794 | + multiprocessor_launcher(function, dataset, storage, locks, rts) |
756 | 795 | |
757 | 796 | |
758 | 797 | def launcher_prediction(): |
— | — | @@ -762,11 +801,9 @@ |
763 | 802 | function = count_edits |
764 | 803 | storage = 'csv' |
765 | 804 | dataset = 'prediction' |
766 | | - processors = 7 |
767 | | - extension = 'bz2' |
768 | 805 | rts = DummyRTS(path) |
769 | 806 | locks = [] |
770 | | - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
| 807 | + multiprocessor_launcher(function, dataset, storage, locks, rts) |
771 | 808 | |
772 | 809 | |
773 | 810 | def launcher(rts): |
— | — | @@ -778,14 +815,12 @@ |
779 | 816 | function = create_variables |
780 | 817 | storage = 'csv' |
781 | 818 | dataset = 'training' |
782 | | - processors = 1 |
783 | | - extension = 'gz' |
784 | 819 | lock1 = RLock() |
785 | 820 | lock2 = RLock() |
786 | 821 | lock3 = RLock() |
787 | 822 | locks = [lock1, lock2, lock3] |
788 | 823 | setup(storage, rts) |
789 | | - multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
| 824 | + multiprocessor_launcher(function, dataset, storage, locks, rts) |
790 | 825 | |
791 | 826 | |
792 | 827 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -44,7 +44,7 @@ |
45 | 45 | |
46 | 46 | |
47 | 47 | def remove_numeric_character_references(rts, text): |
48 | | - return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode(rts.encoding) |
| 48 | + return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
49 | 49 | |
50 | 50 | |
51 | 51 | def lenient_deccharref(m): |
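lenient_deccharref is defined just below this hunk but its body is not shown; a sketch of the whole pipeline, assuming RE_NUMERIC_CHARACTER matches decimal character references and the helper wraps unichr (assumptions based on the names alone):

    import re

    RE_NUMERIC_CHARACTER = re.compile(r'&#(\d+);')

    def lenient_deccharref(m):
        # turn one decimal reference such as '&#233;' into u'\xe9'
        return unichr(int(m.group(1)))

    def remove_numeric_character_references(text):
        return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')

    print remove_numeric_character_references(u'caf&#233;')  # the utf-8 bytes of u'caf\xe9'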
— | — | @@ -278,7 +278,7 @@ |
279 | 279 | output = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
280 | 280 | widgets = log.init_progressbar_widgets('Extracting data') |
281 | 281 | filehandles = [file_utils.create_txt_filehandle(output, '%s.csv' % fh, 'a', |
282 | | - rts.encoding) for fh in xrange(rts.max_filehandles)] |
| 282 | + 'utf-8') for fh in xrange(rts.max_filehandles)] |
283 | 283 | while True: |
284 | 284 | total, processed = 0.0, 0.0 |
285 | 285 | try: |
— | — | @@ -297,7 +297,7 @@ |
298 | 298 | print 'Opening %s...' % (os.path.join(location, filename)) |
299 | 299 | print 'Filesize: %s' % filesize |
300 | 300 | fh1 = file_utils.create_txt_filehandle(location, filename, 'r', 'ascii') |
301 | | - fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', rts.encoding) |
| 301 | + fh2 = file_utils.create_txt_filehandle(location, 'articles.csv', 'a', 'utf-8') |
302 | 302 | ns, xml_namespace = wikitree.parser.extract_meta_information(fh1) |
303 | 303 | ns = build_namespaces_locale(ns, rts.namespaces) |
304 | 304 | rts.xml_namespace = xml_namespace |
Index: trunk/tools/editor_trends/etl/sort.py |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | fh = file_utils.create_txt_filehandle(self.rts.txt, |
52 | 52 | filename, |
53 | 53 | 'r', |
54 | | - self.rts.encoding) |
| 54 | + 'utf-8') |
55 | 55 | data = file_utils.read_unicode_text(fh) |
56 | 56 | fh.close() |
57 | 57 | for x, d in enumerate(data): |
— | — | @@ -121,7 +121,7 @@ |
122 | 122 | fh = file_utils.create_txt_filehandle(target, |
123 | 123 | 'merged_%s.txt' % iteration, |
124 | 124 | 'w', |
125 | | - rts.encoding) |
| 125 | + 'utf-8') |
126 | 126 | lines = 0 |
127 | 127 | for line in heapq.merge(*[readline(filename) for filename in files]): |
128 | 128 | file_utils.write_list_to_csv(line, fh) |
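The merge step relies on heapq.merge, which lazily combines iterators that are already individually sorted, so the merged output never has to fit in memory. A toy sketch with in-memory lists standing in for the pre-sorted shard files:

    import heapq

    shards = [[1, 4, 9], [2, 3, 10], [5, 6, 7]]   # each shard already sorted
    for value in heapq.merge(*[iter(shard) for shard in shards]):
        print value   # 1 2 3 4 5 6 7 9 10, one per line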
— | — | @@ -138,7 +138,7 @@ |
139 | 139 | fh = file_utils.create_txt_filehandle(rts.sorted, |
140 | 140 | filename, |
141 | 141 | 'w', |
142 | | - rts.encoding) |
| 142 | + 'utf-8') |
143 | 143 | file_utils.write_list_to_csv(sorted_data, fh) |
144 | 144 | fh.close() |
145 | 145 | |
Index: trunk/tools/editor_trends/classes/settings.py |
— | — | @@ -52,7 +52,6 @@ |
53 | 53 | def __init__(self): |
54 | 54 | self.minimum_python_version = (2, 6) |
55 | 55 | self.detect_python_version() |
56 | | - self.encoding = 'utf-8' |
57 | 56 | |
58 | 57 | #Date format as used by Erik Zachte |
59 | 58 | self.date_format = '%Y-%m-%d' |
— | — | @@ -67,7 +66,6 @@ |
68 | 67 | self.number_of_processes = cpu_count() |
69 | 68 | |
70 | 69 | self.wp_dump_location = 'http://dumps.wikimedia.org' |
71 | | - self.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
72 | 70 | self.ascii_extensions = ['txt', 'csv', 'xml', 'sql', 'json'] |
73 | 71 | self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
74 | 72 | #Extensions of ascii files; this is used to determine the filemode to use
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -62,6 +62,7 @@ |
63 | 63 | self.language = self.update_language_settings() |
64 | 64 | self.charts = self.determine_chart(self.get_value('charts')) |
65 | 65 | self.keywords = self.split_keywords(self.get_value('keywords')) |
| 66 | + self.kaggle = self.get_value('kaggle') |
66 | 67 | self.function = self.get_value('func') |
67 | 68 | |
68 | 69 | self.ignore = self.get_value('except') |
— | — | @@ -140,8 +141,12 @@ |
141 | 142 | ''' |
142 | 143 | Construct the full project location |
143 | 144 | ''' |
144 | | - return os.path.join(self.output_location, self.language.code, |
| 145 | + if self.kaggle: |
| 146 | + return os.path.join(self.input_location, self.language.code, |
145 | 147 | self.project.name) |
| 148 | + else: |
| 149 | + return os.path.join(self.input_location, self.language.code, |
| 150 | + self.project.name) |
146 | 151 | |
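As committed, both branches of get_project_location return the same path, which makes the new if/else a no-op. A minimal sketch of the presumably intended logic, assuming the non-Kaggle case was meant to keep using output_location as it did before this change (an assumption, not confirmed by the changeset):

    import os

    def get_project_location(self):
        '''
        Construct the full project location
        '''
        # assumption: Kaggle runs read from input_location, everything else
        # keeps writing under output_location as before
        base = self.input_location if self.kaggle else self.output_location
        return os.path.join(base, self.language.code, self.project.name)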
147 | 152 | def show_settings(self): |
148 | 153 | ''' |