r93774 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r93773‎ | r93774 | r93775 >
Date:18:23, 2 August 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
implemented a non-recursive way to process the category linkages
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -81,6 +81,8 @@
8282
8383 def __init__(self):
8484
 85+ self.__DEBUG__ = True
 86+
8587 self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'"
8688 self._query_names_['drop_subcat_tbl'] = "drop table if exists rfaulk.categorylinks_cp;"
8789 self._query_names_['get_first_rec'] = "select cl_from from categorylinks_cp limit 1"
@@ -89,11 +91,31 @@
9092 self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s"
9193 self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s"
9294 self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1"
 95+ self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 100"
9396
9497 WSORSlaveDataLoader.__init__(self)
9598 logging.info('Creating CategoryLoader')
9699
97100 """
 101+
 102+ """
 103+ def get_category_links(self):
 104+
 105+ try:
 106+ sql = self._query_names_['get_category_links']
 107+ logging.info('Executing: ' + sql)
 108+ results = self.execute_SQL(sql)
 109+
 110+ except:
 111+
 112+ logging.error('Could not retrieve page_id.')
 113+ return -1
 114+
 115+ return results
 116+
 117+
 118+
 119+ """
98120 Retrieves the integer page id
99121 """
100122 def get_page_id(self, page_title):
@@ -209,6 +231,8 @@
210232
211233 """
212234 Execution entry point of the class - builds a full category hierarchy from categorylinks
 235+
 236+ CURRENTLY THE EDGES ARE PROCESSED IN A NON-RECURSIVE WAY, this is much faster
213237 """
214238 def extract_hierarchy(self):
215239
@@ -220,12 +244,31 @@
221245 directed_graph = nx.DiGraph()
222246
223247 """ while there are rows left in categorylinks_cp """
 248+
 249+ """
224250 while(not self.is_empty()):
225 -
 251+
226252 category_title = self.get_first_record_from_category_links()
227253 self.build_category_tree(directed_graph, category_title)
228254 directed_graph.add_weighted_edges_from([('ALL', category_title, 1)])
 255+ """
229256
 257+ links = self.get_category_links()
 258+ count = 0
 259+
 260+ for row in links:
 261+
 262+ cl_from = int(row[0])
 263+ cl_to = str(row[1])
 264+ cl_from = self.get_page_title(cl_from)
 265+
 266+ directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
 267+
 268+ if self.__DEBUG__:
 269+
 270+ logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
 271+ count = count + 1
 272+
230273 logging.info('Category links finished processing.')
231274
232275 return directed_graph
@@ -293,33 +336,8 @@
294337 else:
295338 return True
296339
297 - """
298 - The cl_from key is formatted in uppercase with non-uniform whitespace
299340
300 - def normalize_field_cl_from(self, category):
301341
302 - category = category.lower()
303 - words = category.split('\n')[0] # only keep text before a carriage return
304 - words = words.split()
305 - len_words = len(words)
306 -
307 - category = ''
308 - category_camel = ''
309 -
310 -
311 - for i in range(len_words - 1):
312 - category = category + words[i] + ' '
313 - category_camel = category_camel + words[i][0].upper() + words[i][1:] + ' '
314 -
315 - category = category + words[len_words - 1]
316 - category_camel = category_camel + words[len_words - 1][0].upper() + words[len_words - 1][1:]
317 -
318 - category_upper = category.upper()
319 - category_lower = category.lower()
320 -
321 - return category_upper, category_lower, category_camel
322 - """
323 -
324342 """
325343 Inherits WSORSlaveDataLoader
326344

Status & tagging log