r75936 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r75935 | r75936 | r75937 >
Date:	17:01, 3 November 2010
Author:	diederik
Status:	deferred
Tags:
Comment:	PEP8 compliant.
Modified paths:	/trunk/tools/editor_trends/split_xml_file.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/split_xml_file.py
—	—	@@ -36,7 +36,6 @@
37	37	pass
38	38
39	39
40		-
41	40	RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
42	41
43	42
—	—	@@ -49,7 +48,7 @@
50	49	return unichr(int(m.group(1)))
51	50	except ValueError:
52	51	'''
53		~~- There are a few articles that raise a Value Error here, the reason is~~
	52	+ There are a few articles that raise a Value Error here, the reason is
54	53	that I am using a narrow Python build (UCS2) instead of a wide build
55	54	(UCS4). The quick fix is to return an empty string...
56	55	Real solution is to rebuild Python with UCS4 support.....
—	—	@@ -58,7 +57,7 @@
59	58
60	59
61	60	def remove_namespace(element, namespace):
62		~~- '''Remove namespace from the document.'''~~
	61	+ '''Remove namespace from the XML document.'''
63	62	ns = u'{%s}' % namespace
64	63	nsl = len(ns)
65	64	for elem in element.getiterator():
—	—	@@ -66,6 +65,7 @@
67	66	elem.tag = elem.tag[nsl:]
68	67	return element
69	68
	69	+
70	70	def load_namespace(language):
71	71	file = '%s_ns.json' % language
72	72	fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING)
—	—	@@ -76,6 +76,9 @@
77	77
78	78
79	79	def build_namespaces_locale(namespaces):
	80	+ '''
	81	+ Construct a list of all the non-main namespaces
	82	+ '''
80	83	ns = []
81	84	for namespace in namespaces:
82	85	value = namespaces[namespace].get(u'*', None)
—	—	@@ -89,15 +92,15 @@
90	93	for revision in revisions:
91	94	comment = revision.find('comment')
92	95	timestamp = revision.find('timestamp').text
93		~~-# text1 = remove_ascii_control_characters(text)~~
94		~~-# text2 = remove_numeric_character_references(text)~~
95		~~-# text3 = convert_html_entities(text)~~
96	96	if comment != None and comment.text != None:
97	97	comment.text = function(comment.text)
98	98	return xml
99	99
100	100
101	101	def is_article_main_namespace(elem, namespace):
	102	+ '''
	103	+ checks whether the article belongs to the main namespace
	104	+ '''
102	105	title = elem.find('title').text
103	106	for ns in namespace:
104	107	if title.startswith(ns):
—	—	@@ -105,7 +108,6 @@
106	109	return True
107	110
108	111
109		-
110	112	def write_xml_file(element, fh, counter, language):
111	113	'''Get file handle and write xml element to file'''
112	114	size = len(cElementTree.tostring(element))
—	—	@@ -120,7 +122,7 @@
121	123
122	124	def create_xml_file_handle(fh, counter, size, language):
123	125	'''Create file handle if none is supplied or if file size > max file size.'''
124		~~- path = os.path.join(settings.XML_FILE_LOCATION , language, '%s.xml' % counter)~~
	126	+ path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
125	127	if not fh:
126	128	counter = 0
127	129	fh = codecs.open(path, 'w', encoding=settings.ENCODING)
—	—	@@ -147,14 +149,13 @@
148	150	ns = load_namespace(language)
149	151	ns = build_namespaces_locale(ns)
150	152
151		-
152	153	fh = None
153	154	counter = None
154	155	tag = '{%s}page' % settings.NAME_SPACE
155		-
	156	+
156	157	context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
157	158	context = iter(context)
158		~~- event, root = context.next() # get the root element of the XML doc~~
	159	+ event, root = context.next() #get the root element of the XML doc
159	160
160	161	for event, elem in context:
161	162	if event == 'end':
—	—	@@ -163,10 +164,8 @@
164	165	elem = parse_comments(elem, remove_numeric_character_references)
165	166
166	167	if is_article_main_namespace(elem, ns):
167		~~- #fh, counter = write_xml_file(elem, fh, counter, language)~~
168		~~- pass~~
	168	+ fh, counter = write_xml_file(elem, fh, counter, language)
169	169	root.clear() # when done parsing a section clear the tree to safe memory
170		-
171	170	#elem = parse_comments(elem, convert_html_entities)
172	171	#elem = parse_comments(elem, remove_ascii_control_characters)
173	172	#print cElementTree.tostring(elem)

Status & tagging log

10:07, 3 December 2010 Reedy (talk | contribs) changed the status of r75936 [removed: new; added: deferred]