r93774 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r93773 | r93774 | r93775 >
Date:	18:23, 2 August 2011
Author:	rfaulk
Status:	deferred
Tags:
Comment:	implemented a non-recursive way to process the category linkages
Modified paths:	/trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
—	—	@@ -81,6 +81,8 @@
82	82
83	83	def __init__(self):
84	84
	85	+ self.__DEBUG__ = True
	86	+
85	87	self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'"
86	88	self._query_names_['drop_subcat_tbl'] = "drop table if exists rfaulk.categorylinks_cp;"
87	89	self._query_names_['get_first_rec'] = "select cl_from from categorylinks_cp limit 1"
—	—	@@ -89,11 +91,31 @@
90	92	self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s"
91	93	self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s"
92	94	self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1"
	95	+ self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 100"
93	96
94	97	WSORSlaveDataLoader.__init__(self)
95	98	logging.info('Creating CategoryLoader')
96	99
97	100	"""
	101	+
	102	+ """
	103	+ def get_category_links(self):
	104	+
	105	+ try:
	106	+ sql = self._query_names_['get_category_links']
	107	+ logging.info('Executing: ' + sql)
	108	+ results = self.execute_SQL(sql)
	109	+
	110	+ except:
	111	+
	112	+ logging.error('Could not retrieve page_id.')
	113	+ return -1
	114	+
	115	+ return results
	116	+
	117	+
	118	+
	119	+ """
98	120	Retrieves the integer page id
99	121	"""
100	122	def get_page_id(self, page_title):
—	—	@@ -209,6 +231,8 @@
210	232
211	233	"""
212	234	Execution entry point of the class - builds a full category hierarchy from categorylinks
	235	+
	236	+ CURRENTLY THE EDGES ARE PROCESSED IN A NON-RECURSIVE WAY; this is much faster
213	237	"""
214	238	def extract_hierarchy(self):
215	239
—	—	@@ -220,12 +244,31 @@
221	245	directed_graph = nx.DiGraph()
222	246
223	247	""" while there are rows left in categorylinks_cp """
	248	+
	249	+ """
224	250	while(not self.is_empty()):
225		-
	251	+
226	252	category_title = self.get_first_record_from_category_links()
227	253	self.build_category_tree(directed_graph, category_title)
228	254	directed_graph.add_weighted_edges_from([('ALL', category_title, 1)])
	255	+ """
229	256
	257	+ links = self.get_category_links()
	258	+ count = 0
	259	+
	260	+ for row in links:
	261	+
	262	+ cl_from = int(row[0])
	263	+ cl_to = str(row[1])
	264	+ cl_from = self.get_page_title(cl_from)
	265	+
	266	+ directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
	267	+
	268	+ if self.__DEBUG__:
	269	+
	270	+ logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
	271	+ count = count + 1
	272	+
230	273	logging.info('Category links finished processing.')
231	274
232	275	return directed_graph
—	—	@@ -293,33 +336,8 @@
294	337	else:
295	338	return True
296	339
297		~~- """~~
298		~~- The cl_from key is formatted in uppercase with non-uniform whitespace~~
299	340
300		~~- def normalize_field_cl_from(self, category):~~
301	341
302		~~- category = category.lower()~~
303		~~- words = category.split('\n')[0] # only keep text before the a carraige return~~
304		~~- words = words.split()~~
305		~~- len_words = len(words)~~
306		-
307		~~- category = ''~~
308		~~- category_camel = ''~~
309		-
310		-
311		~~- for i in range(len_words - 1):~~
312		~~- category = category + words[i] + ' '~~
313		~~- category_camel = category_camel + words[i][0].upper() + words[i][1:] + ' '~~
314		-
315		~~- category = category + words[len_words - 1]~~
316		~~- category_camel = category_camel + words[len_words - 1][0].upper() + words[len_words - 1][1:]~~
317		-
318		~~- category_upper = category.upper()~~
319		~~- category_lower = category.lower()~~
320		-
321		~~- return category_upper, category_lower, category_camel~~
322		~~- """~~
323		-
324	342	"""
325	343	Inherits WSORSlaveDataLoader
326	344

Status & tagging log

00:22, 3 August 2011 Reedy (talk | contribs) changed the status of r93774 [removed: new added: deferred]