Index: trunk/tools/editor_trends/etl/chunker.py
@@ -21,9 +21,7 @@
 import sys
 import codecs
 import re
-import json
 import os
-import random
 
 import progressbar
 
@@ -31,11 +29,7 @@
 sys.path.append('..')
 import configuration
 settings = configuration.Settings()
-
 from utils import utils
-import extract
-import wikitree.parser
-from bots import bots
 
 
 try:
@@ -75,28 +69,28 @@
     return element
 
 
-def load_namespace(language):
-    file = '%s_ns.json' % language
-    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
-    ns = json.load(fh)
-    fh.close()
-    ns = ns['query']['namespaces']
-    return ns
+#def load_namespace(language):
+#    file = '%s_ns.json' % language
+#    fh = utils.create_txt_filehandle(settings.namespace_location, file, 'r', settings.encoding)
+#    ns = json.load(fh)
+#    fh.close()
+#    ns = ns['query']['namespaces']
+#    return ns
+#
+#
+#def build_namespaces_locale(namespaces, include=[0]):
+#    '''
+#    @include is a list of namespace keys that should not be ignored, the default
+#    setting is to ignore all namespaces except the main namespace.
+#    '''
+#    ns = []
+#    for namespace in namespaces:
+#        if namespace not in include:
+#            value = namespaces[namespace].get(u'*', None)
+#            ns.append(value)
+#    return ns
 
 
-def build_namespaces_locale(namespaces, include=[0]):
-    '''
-    @include is a list of namespace keys that should not be ignored, the default
-    setting is to ignore all namespaces except the main namespace.
-    '''
-    ns = []
-    for namespace in namespaces:
-        if namespace not in include:
-            value = namespaces[namespace].get(u'*', None)
-            ns.append(value)
-    return ns
-
-
 def parse_comments(xml, function):
     revisions = xml.findall('revision')
     for revision in revisions:
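
A note on `build_namespaces_locale`, retired above: `include=[0]` is a mutable default argument, and since the namespace keys in the API JSON are strings ('0', '14', ...), `namespace not in include` compares a string against the integer 0 and is always true, so even the main namespace lands in the ignore list. Should the helper ever be revived, a minimal sketch that sidesteps both issues (assuming the same dict shape `load_namespace` returned):

    def build_namespaces_locale(namespaces, include=None):
        '''
        @include lists namespace keys to keep; by default only the main
        namespace ('0') is kept. API JSON keys are strings, so normalize.
        '''
        include = ['0'] if include is None else [str(k) for k in include]
        ns = []
        for key, namespace in namespaces.items():
            if key not in include:
                value = namespace.get(u'*')
                if value is not None:  # some entries carry no localized name
                    ns.append(value)
        return ns
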
@@ -107,15 +101,15 @@
     return xml
 
 
-def is_article_main_namespace(elem, namespace):
-    '''
-    checks whether the article belongs to the main namespace
-    '''
-    title = elem.find('title').text
-    for ns in namespace:
-        if title.startswith(ns):
-            return False
-    return True
+#def is_article_main_namespace(elem, namespace):
+#    '''
+#    checks whether the article belongs to the main namespace
+#    '''
+#    title = elem.find('title').text
+#    for ns in namespace:
+#        if title.startswith(ns):
+#            return False
+#    return True
 
 
 def write_xml_file(element, fh, output, counter, format):
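
Similarly for `is_article_main_namespace`: `build_namespaces_locale` could append None to the prefix list, which makes `title.startswith(ns)` raise a TypeError, and a bare prefix like u'Talk' also matches titles such as 'Talking'. A hedged sketch that guards both cases, assuming titles use the usual 'Namespace:Title' form:

    def is_article_main_namespace(elem, namespace):
        '''
        Checks whether the article belongs to the main namespace, i.e. its
        title starts with none of the localized namespace prefixes.
        '''
        title = elem.find('title').text or ''
        prefixes = tuple('%s:' % ns for ns in namespace if ns)
        return not title.startswith(prefixes)
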
@@ -125,15 +119,18 @@
         size = len(xml_string)
         fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
         fh.write(xml_string)
-    except MemoryError:
-        print 'Add error capturing logic'
-    except UnicodeEncodeError, error:
+    except Exception, error:
         print error
-        n = random.randrange(0, 10000)
-        f = '%s%s.bin' % ('element', n)
-        new_file = False
-        #if element != None:
-        #    utils.store_object(element, settings.binary_location, f)
+
+#    MemoryError:
+#        print 'Add error capturing logic'
+#    except UnicodeEncodeError, error:
+#        print error
+#        n = random.randrange(0, 10000)
+#        f = '%s%s.bin' % ('element', n)
+#        new_file = False
+#        #if element != None:
+#        #    utils.store_object(element, settings.binary_location, f)
     fh.write('\n')
     return fh, counter, new_file
 
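
The blanket `except Exception` introduced here keeps the splitter alive but also swallows conditions the old code told apart, and on the failure path `new_file` is only defined if `create_file_handle` succeeded. A sketch that keeps the narrower handlers without the dead code, assuming a skipped element is acceptable:

    def write_xml_file(element, fh, output, counter, format):
        new_file = False  # defined up front so the except paths can return it
        try:
            xml_string = cElementTree.tostring(element)
            fh, counter, new_file = create_file_handle(fh, output, counter,
                                                       len(xml_string), format)
            fh.write(xml_string)
        except MemoryError:
            print 'element too large to serialize, skipping it'
        except UnicodeEncodeError, error:
            print error  # element not representable in the target encoding
        if fh is not None:  # fh stays None if the very first handle failed
            fh.write('\n')
        return fh, counter, new_file

The bare `print` mirrors the module's existing style; the `logging` module would be the more durable choice.
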
@@ -162,44 +159,44 @@
     return fh, counter, False
 
 
-def flatten_xml_elements(data, page, bots):
-    headers = ['id', 'date', 'article', 'username']
-    tags = {'contributor': {'id': extract.extract_contributor_id,
-                            'bot': extract.determine_username_is_bot,
-                            'username': extract.extract_username,
-                            },
-            'timestamp': {'date': xml.extract_text},
-            }
-    vars = {}
-    flat = []
+#def flatten_xml_elements(data, page, bots):
+#    headers = ['id', 'date', 'article', 'username']
+#    tags = {'contributor': {'id': extract.extract_contributor_id,
+#                            'bot': extract.determine_username_is_bot,
+#                            'username': extract.extract_username,
+#                            },
+#            'timestamp': {'date': xml.extract_text},
+#            }
+#    vars = {}
+#    flat = []
+#
+#    for x, elems in enumerate(data):
+#        vars[x] = {}
+#        vars[x]['article'] = page
+#        for tag in tags:
+#            el = xml.retrieve_xml_node(elems, tag)
+#            for function in tags[tag].keys():
+#                f = tags[tag][function]
+#                value = f(el, bots=bots)
+#                if type(value) == type({}):
+#                    for kw in value:
+#                        vars[x][kw] = value[kw]
+#                else:
+#                    vars[x][function] = value
+#
+#    for x, var in enumerate(vars):
+#        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
+#            continue
+#        else:
+#            f = []
+#            for head in headers:
+#                f.append(vars[x][head])
+#            flat.append(f)
+#
+#    return flat
 
-    for x, elems in enumerate(data):
-        vars[x] = {}
-        vars[x]['article'] = page
-        for tag in tags:
-            el = xml.retrieve_xml_node(elems, tag)
-            for function in tags[tag].keys():
-                f = tags[tag][function]
-                value = f(el, bots=bots)
-                if type(value) == type({}):
-                    for kw in value:
-                        vars[x][kw] = value[kw]
-                else:
-                    vars[x][function] = value
 
-    for x, var in enumerate(vars):
-        if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None:
-            continue
-        else:
-            f = []
-            for head in headers:
-                f.append(vars[x][head])
-            flat.append(f)
-
-    return flat
-
-
-def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False):
+def split_file(location, file, project, language_code, zip=False):
     '''
     Reads xml file and splits it in N chunks
     @namespaces is a list indicating which namespaces should be included, default
@@ -207,11 +204,11 @@
     @zip indicates whether to compress the chunk or not
     '''
     input = os.path.join(location, file)
-    if format == 'xml':
-        output = os.path.join(location, 'chunks')
-    else:
-        output = os.path.join(location, 'txt')
-    bot_ids = bots.retrieve_bots(language_code)
+    #if format == 'xml':
+    output = os.path.join(location, 'chunks')
+    #else:
+    #    output = os.path.join(location, 'txt')
+    #    bot_ids = bots.retrieve_bots(language_code)
     settings.verify_environment([output])
 
     fh = None
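
With `namespaces` and `format` gone from the signature, the docstring above is stale and the hard-coded chunks directory is undocumented. A possible header rewrite, keeping the file's @-style docstring convention:

    def split_file(location, file, project, language_code, zip=False):
        '''
        Reads an xml dump and splits it into chunks under <location>/chunks.
        @zip indicates whether to compress each finished chunk
        '''
        input = os.path.join(location, file)
        output = os.path.join(location, 'chunks')
        settings.verify_environment([output])
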
@@ -230,25 +227,26 @@
             if event == 'end':
                 if elem.tag == tag:
                     elem = remove_namespace(elem, settings.xml_namespace)
-                    if is_article_main_namespace(elem, ns):
-                        page = elem.find('id').text
-                        elem = parse_comments(elem, remove_numeric_character_references)
+                    #if is_article_main_namespace(elem, ns):
+                    #    page = elem.find('id').text
+                    elem = parse_comments(elem, remove_numeric_character_references)
 
-                        if format == 'xml':
-                            fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
-                        else:
-                            data = [el.getchildren() for el in elem if el.tag == 'revision']
-                            data = flatten_xml_elements(data, page, bot_ids)
-                            if data != None:
-                                size = 64 * len(data)
-                                fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
-                                utils.write_list_to_csv(data, fh, recursive=False, newline=True)
+                    # if format == 'xml':
+                    fh, counter, new_file = write_xml_file(elem, fh, output, counter, format)
+#                    else:
+#                        data = [el.getchildren() for el in elem if el.tag == 'revision']
+#                        data = flatten_xml_elements(data, page, bot_ids)
+#                        if data != None:
+#                            size = 64 * len(data)
+#                            fh, counter, new_file = create_file_handle(fh, output, counter, size, format)
+#                            utils.write_list_to_csv(data, fh, recursive=False, newline=True)
 
-                        if zip and new_file:
-                            file = str(counter - 1) + format
-                            utils.zip_archive(settings.path_ziptool, output, file)
-                            utils.delete_file(output, file)
+                    if zip and new_file:
+                        file = str(counter - 1) + format
+                        utils.zip_archive(settings.path_ziptool, output, file)
+                        utils.delete_file(output, file)
                 root.clear() # when done parsing a section clear the tree to safe memory
+
     except SyntaxError:
         f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
         f.write(cElementTree.tostring(elem))
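
One thing to flag in the last hunk: `format` was removed from `split_file`'s signature but is still passed to `write_xml_file` and used to rebuild the chunk filename, so the bare name now resolves to the `format()` builtin on Python 2.6+ (and raises NameError on older interpreters). Pinning the extension locally restores the intent. The filename join below also assumes `create_file_handle` names chunks '<n>.<ext>', a guess since that helper is not shown; the `zip_finished_chunk` wrapper is hypothetical and only illustrates the fix:

    def zip_finished_chunk(output, counter, format='xml'):
        # 'format' is pinned here because the commit dropped it from
        # split_file's signature; the bare name would otherwise hit the
        # format() builtin rather than an extension string.
        file = '%s.%s' % (counter - 1, format)  # assumes '<n>.<ext>' naming
        utils.zip_archive(settings.path_ziptool, output, file)
        utils.delete_file(output, file)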