Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -35,6 +35,7 @@ |
36 | 36 | from database import db_settings |
37 | 37 | from database import db |
38 | 38 | from wikitree import xml |
| 39 | +from statistics import dataset |
39 | 40 | from utils import process_constructor as pc |
40 | 41 | |
41 | 42 | |
— | — | @@ -44,15 +45,10 @@ |
45 | 46 | except ImportError: |
46 | 47 | pass |
47 | 48 | |
48 | | -contributors = {} |
| 49 | +#contributors = {} |
49 | 50 | |
50 | 51 | RE_BOT = re.compile('bot', re.IGNORECASE) |
51 | 52 | RE_SCRIPT = re.compile('script', re.IGNORECASE) |
52 | | -#RE_NUMERIC_CHARACTER = re.compile('&#[\d{1,5}]+;') |
53 | | -# |
54 | | -#def remove_numeric_character_references(text): |
55 | | -# return re.sub(RE_NUMERIC_CHARACTER, '', text) |
56 | | -# |
57 | 53 | |
58 | 54 | |
59 | 55 | def determine_username_is_bot(username, kwargs): |
— | — | @@ -108,7 +104,7 @@ |
109 | 105 | data_queue.put(vars) |
110 | 106 | vars = {} |
111 | 107 | |
112 | | -def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'): |
| 108 | +def parse_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'): |
113 | 109 | if settings.DEBUG: |
114 | 110 | messages = {} |
115 | 111 | vars = {} |
— | — | @@ -118,14 +114,12 @@ |
119 | 115 | file = xml_queue |
120 | 116 | else: |
121 | 117 | file = xml_queue.get(block=False) |
122 | | - #print 'parsing %s' % file |
123 | 118 | if file == None: |
124 | 119 | print 'Swallowed a poison pill' |
125 | 120 | break |
126 | 121 | data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION, |
127 | 122 | file, 'r', |
128 | 123 | encoding=settings.ENCODING)) |
129 | | - #data = read_input(sys.stdin) |
130 | 124 | for raw_data in data: |
131 | 125 | xml_buffer = cStringIO.StringIO() |
132 | 126 | raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n') |
— | — | @@ -155,16 +149,15 @@ |
156 | 150 | sure which one yet. This happens when raw_data = |
157 | 151 | ''.join(raw_data) is called. 18-22 |
158 | 152 | ''' |
159 | | - print error |
| 153 | + print file, error |
160 | 154 | print raw_data[:12] |
161 | 155 | print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
162 | 156 | if settings.DEBUG: |
163 | 157 | utils.track_errors(xml_buffer, error, file, messages) |
164 | 158 | |
165 | | - |
166 | 159 | if pbar: |
167 | | - print xml_queue.qsize() |
168 | | - #utils.update_progressbar(pbar, xml_queue) |
| 160 | + #print xml_queue.qsize() |
| 161 | + utils.update_progressbar(pbar, xml_queue) |
169 | 162 | if debug: |
170 | 163 | break |
171 | 164 | |
— | — | @@ -175,25 +168,17 @@ |
176 | 169 | utils.report_error_messages(messages, lookup_new_editors) |
177 | 170 | |
178 | 171 | |
179 | | -def store_data_mongo(data_queue, pids, dbname): |
| 172 | +def store_editors(data_queue, pids, dbname): |
180 | 173 | mongo = db.init_mongo_db(dbname) |
181 | 174 | collection = mongo['editors'] |
182 | 175 | mongo.collection.ensure_index('editor') |
183 | | - contributors = {} |
184 | 176 | while True: |
185 | 177 | try: |
186 | 178 | edit = data_queue.get(block=False) |
187 | 179 | contributor = edit['editor'] |
188 | | - if contributor not in contributors: |
189 | | - collection.insert({'editor': contributor, 'edit_count': 0, }) |
190 | | - contributors[contributor] = 1 |
191 | | - |
192 | | - key = str(contributors[contributor]) |
193 | 180 | value = {'date':edit['date'], 'article': edit['article']} |
194 | 181 | collection.update({'editor': contributor}, {'$inc': {'edit_count': 1}, |
195 | | - '$push': {'edits': value}}) |
196 | | - contributors[contributor] += 1 |
197 | | - |
| 182 | + '$push': {'edits': value}}, True) |
198 | 183 | except Empty: |
199 | 184 | ''' |
200 | 185 | This checks whether the Queue is empty because the preprocessors are |
— | — | @@ -202,16 +187,32 @@ |
203 | 188 | are finished and this Queue is empty than break, else wait for the |
204 | 189 | Queue to fill. |
205 | 190 | ''' |
206 | | - |
207 | 191 | if all([utils.check_if_process_is_running(pid) for pid in pids]): |
208 | 192 | pass |
209 | 193 | #print 'Empty queue or not %s?' % data_queue.qsize() |
210 | 194 | else: |
211 | 195 | break |
212 | | - except Exception, error: |
213 | | - print error |
214 | 196 | |
215 | 197 | |
| 198 | +def optimize_editors(dbname, input_queue, **kwargs): |
| 199 | + mongo = db.init_mongo_db(dbname) |
| 200 | + collection = mongo['editors'] |
| 201 | + definition = kwargs.pop('definition') |
| 202 | + while True: |
| 203 | + try: |
| 204 | + id = input_queue.get(block=False) |
| 205 | + #id = '94033' |
| 206 | + editor = collection.find_one({'editor': id}) |
| 207 | + edits = editor['edits'] |
 | 208 | +        edits.sort(key=lambda edit: edit['date']) |
| 209 | + year = edits[0]['date'].year |
 | 210 | +            new_wikipedian = dataset.determine_editor_is_new_wikipedian(edits, definition) |
| 211 | + collection.update({'editor': id}, {'$set': {'edits': edits, 'year_joined': year, 'new_wikipedian': new_wikipedian}}) |
| 212 | + |
| 213 | + except Empty: |
| 214 | + break |
| 215 | + |
| 216 | + |
216 | 217 | def store_data_db(data_queue, pids): |
217 | 218 | connection = db.init_database() |
218 | 219 | cursor = connection.cursor() |
— | — | @@ -243,43 +244,38 @@ |
244 | 245 | connection.close() |
245 | 246 | |
246 | 247 | |
247 | | -def run_stand_alone(): |
| 248 | +def run_stand_alone(dbname): |
248 | 249 | files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml') |
249 | 250 | #files = files[:2] |
250 | 251 | mongo = db.init_mongo_db('bots') |
251 | 252 | bots = mongo['ids'] |
252 | 253 | ids = {} |
253 | 254 | cursor = bots.find() |
254 | | - |
255 | | -    kwargs = {'bots': ids, |
256 | | -              'dbname': 'enwiki', |
257 | | -              'pbar': True} |
 | 255 | +    kwargs = {'bots': ids, |
 | 256 | +              'dbname': dbname, |
 | 257 | +              'pbar': True, |
 | 258 | +              'definition': 'traditional'} |
 | 259 | + |
258 | | - |
259 | 260 | for bot in cursor: |
260 | 261 | ids[bot['id']] = bot['name'] |
261 | | - pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, **kwargs) |
262 | | - keys = ['editor'] |
263 | | - for key in keys: |
264 | | - db.add_index_to_collection('enwiki', 'editors', key) |
| 262 | + |
| 263 | + pc.build_scaffolding(pc.load_queue, parse_editors, files, store_editors, True, **kwargs) |
| 264 | + ids = retrieve_ids_mongo_new(dbname, 'editors') |
| 265 | + pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs) |
265 | 266 | |
266 | 267 | def debug_lookup_new_editors(): |
267 | 268 | q = Queue() |
268 | 269 | import progressbar |
269 | 270 | pbar = progressbar.ProgressBar().start() |
270 | 271 | #edits = db.init_mongo_db('editors') |
271 | | - lookup_new_editors('464.xml', q, None, None, True) |
| 272 | + parse_editors('464.xml', q, None, None, True) |
272 | 273 | store_data_mongo(q, [], 'test') |
273 | 274 | #keys = ['editor'] |
274 | 275 | #for key in keys: |
275 | 276 | # db.add_index_to_collection('editors', 'editors', key) |
276 | 277 | |
277 | | - |
278 | | - |
279 | | -def run_hadoop(): |
280 | | - pass |
281 | | - |
282 | | - |
283 | 278 | if __name__ == "__main__": |
| 279 | + #optimize_editors('enwiki') |
284 | 280 | #debug_lookup_new_editors() |
285 | 281 | |
286 | 282 | if settings.RUN_MODE == 'stand_alone': |
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -79,6 +79,8 @@ |
80 | 80 | |
81 | 81 | DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/' |
82 | 82 | |
| 83 | +TXT_FILE_LOCATION = WORKING_DIRECTORY + '/csv/' |
| 84 | + |
83 | 85 | #This section contains configuration variables for parsing / encoding and |
84 | 86 | #working with the XML files. |
85 | 87 | |
— | — | @@ -99,8 +101,3 @@ |
100 | 102 | |
101 | 103 | WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
102 | 104 | |
103 | | -LANGUAGE_MAPPING = { |
104 | | -'English': '/enwiki/latest/', |
105 | | -'Russian': '/ruwiki/latest/', |
106 | | -'German': '/dewiki/latest', |
107 | | -} |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -256,7 +256,7 @@ |
257 | 257 | ValueError: I/O operation on closed file |
258 | 258 | Not sure how to fix this, that's why the line is commented. |
259 | 259 | ''' |
260 | | - #pbar.update(x) |
| 260 | + pbar.update(pbar.currval + x) |
261 | 261 | |
262 | 262 | |
263 | 263 | def humanize_time_difference(seconds_elapsed): |
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -66,18 +66,16 @@ |
67 | 67 | |
68 | 68 | |
69 | 69 | input_processes = [models.ProcessInputQueue(main, input_queue, result_queue, |
70 | | - **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES)] |
| 70 | + **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES -1)] |
71 | 71 | |
72 | 72 | for input_process in input_processes: |
73 | 73 | input_process.start() |
74 | 74 | pids = [p.pid for p in input_processes] |
75 | 75 | kwargs['pids'] = pids |
76 | 76 | |
77 | | - |
78 | | - |
79 | 77 | if result_queue: |
80 | 78 | result_processes = [models.ProcessResultQueue(result_processor, |
81 | | - result_queue, **kwargs) for i in xrange(1)] |
| 79 | + result_queue, **kwargs) for i in xrange(24)] |
82 | 80 | for result_process in result_processes: |
83 | 81 | result_process.start() |
84 | 82 | |
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -13,8 +13,10 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
 | 17 | +__author_email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | | - |
19 | 21 | import xml.etree.cElementTree as cElementTree |
20 | 22 | import codecs |
21 | 23 | import utils |
— | — | @@ -30,10 +32,7 @@ |
31 | 33 | |
32 | 34 | RE_NUMERIC_CHARACTER = re.compile('&#(\d+);') |
33 | 35 | |
34 | | -#def convert_html_entities(text): |
35 | | -# return utils.unescape(text) |
36 | 36 | |
37 | | - |
38 | 37 | def remove_numeric_character_references(text): |
39 | 38 | return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8') |
40 | 39 | |
— | — | @@ -57,26 +56,17 @@ |
58 | 57 | for revision in revisions: |
59 | 58 | comment = revision.find('comment') |
60 | 59 | timestamp = revision.find('timestamp').text |
61 | | - #if timestamp == '2007-11-25T09:21:11Z': |
62 | | - # print 'debug' |
63 | | - # text = comment.text |
64 | | - #test2 = text.encode('utf-8') |
65 | | - #test = text.decode('utf-8') |
66 | 60 | |
67 | 61 | # text1 = remove_ascii_control_characters(text) |
68 | 62 | # text2 = remove_numeric_character_references(text) |
69 | 63 | # text3 = convert_html_entities(text) |
70 | 64 | |
71 | 65 | if comment != None and comment.text != None: |
72 | | - #print comment.text.encode('utf-8') |
73 | | - |
74 | 66 | comment.text = function(comment.text) |
75 | | - #text = comment.text |
76 | | - #print text |
77 | 67 | return xml |
78 | 68 | |
79 | 69 | |
80 | | -def write_xml_file(element, fh, counter): |
| 70 | +def write_xml_file(element, fh, counter, language): |
81 | 71 | '''Get file handle and write xml element to file''' |
82 | 72 | size = len(cElementTree.tostring(element)) |
83 | 73 | fh, counter = create_xml_file_handle(fh, counter, size) |
— | — | @@ -89,20 +79,24 @@ |
90 | 80 | '''Create file handle if none is supplied or if file size > max file size.''' |
91 | 81 | if not fh: |
92 | 82 | counter = 0 |
93 | | - fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 83 | + fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
94 | 84 | return fh, counter |
95 | 85 | elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
96 | 86 | print 'Created chunk %s' % counter |
97 | 87 | fh.close |
98 | 88 | counter += 1 |
99 | | - fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
| 89 | + fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING) |
100 | 90 | return fh, counter |
101 | 91 | else: |
102 | 92 | return fh, counter |
103 | 93 | |
104 | 94 | |
105 | | -def split_xml(): |
| 95 | +def split_xml(language): |
106 | 96 | '''Reads xml file and splits it in N chunks''' |
| 97 | + result = utils.create_directory(language) |
| 98 | + if not result: |
| 99 | + return |
| 100 | + |
107 | 101 | fh = None |
108 | 102 | counter = None |
109 | 103 | tag = '{%s}page' % settings.NAME_SPACE |
— | — | @@ -118,10 +112,10 @@ |
119 | 113 | elem = parse_comments(elem, remove_numeric_character_references) |
120 | 114 | #elem = parse_comments(elem, convert_html_entities) |
121 | 115 | #elem = parse_comments(elem, remove_ascii_control_characters) |
122 | | - fh, counter = write_xml_file(elem, fh, counter) |
| 116 | + fh, counter = write_xml_file(elem, fh, counter, language) |
123 | 117 | #print cElementTree.tostring(elem) |
124 | 118 | root.clear() # when done parsing a section clear the tree to safe memory |
125 | 119 | |
126 | 120 | |
127 | 121 | if __name__ == "__main__": |
128 | | - split_xml() |
| 122 | + split_xml('enwiki') |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -36,26 +36,38 @@ |
37 | 37 | |
38 | 38 | |
39 | 39 | def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True): |
40 | | - if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
 | 40 | +    raise DeprecationWarning |
| 41 | +# if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
| 42 | +# retrieve_editor_ids_mongo): |
| 43 | +# contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
| 44 | +# retrieve_editor_ids_mongo) |
| 45 | +# else: |
| 46 | +# mongo = db.init_mongo_db('editors') |
| 47 | +# editors = mongo['editors'] |
| 48 | +# contributors = set() |
| 49 | +# #ids = editors.find().distinct('editor') |
| 50 | +# ids = editors.find() |
| 51 | +# for x, id in enumerate(ids): |
| 52 | +# contributors.add(id['editor']) |
| 53 | +# if len(contributors) == 100000: |
| 54 | +# if RANDOM_SAMPLE: |
| 55 | +# break |
| 56 | +# if contributors != set(): |
| 57 | +# utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 58 | +# return contributors |
| 59 | + |
| 60 | +def retrieve_ids_mongo_new(dbname, collection): |
| 61 | + if utils.check_file_exists(settings.TXT_FILE_LOCATION, |
41 | 62 | retrieve_editor_ids_mongo): |
42 | | - contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
| 63 | + ids = utils.load_object(settings.TXT_FILE_LOCATION, |
43 | 64 | retrieve_editor_ids_mongo) |
44 | 65 | else: |
45 | | - mongo = db.init_mongo_db('editors') |
46 | | - editors = mongo['editors'] |
47 | | - contributors = set() |
48 | | - #ids = editors.find().distinct('editor') |
49 | | - ids = editors.find() |
50 | | - for x, id in enumerate(ids): |
51 | | - contributors.add(id['editor']) |
52 | | - if len(contributors) == 100000: |
53 | | - if RANDOM_SAMPLE: |
54 | | - break |
55 | | - if contributors != set(): |
56 | | - utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
57 | | - return contributors |
| 66 | + mongo = db.init_mongo_db(dbname) |
| 67 | + editors = mongo[collection] |
 | 68 | +        ids = editors.distinct('editor') |
 | 69 | +        utils.store_object(ids, settings.TXT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 70 | + return ids |
58 | 71 | |
59 | | - |
60 | 72 | def generate_editor_dataset(input_queue, data_queue, pbar, kwargs): |
61 | 73 | definition = kwargs.pop('definition') |
62 | 74 | limit = kwargs.pop('limit') |
— | — | @@ -72,7 +84,7 @@ |
73 | 85 | |
74 | 86 | print input_queue.qsize() |
75 | 87 | if definition == 'Traditional': |
76 | | - |
| 88 | + |
77 | 89 | obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit) |
78 | 90 | contributors = [] |
79 | 91 | for ob in obs: |