r93800 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r93799‎ | r93800 | r93801 >
Date:06:42, 3 August 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Category Loader now collects data for in-degree, out-degree, subcategories, and the nodes that have exclusively incoming or exclusively outgoing edges
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -12,7 +12,7 @@
1313
1414
1515 """ Import python base modules """
16 -import sys, getopt, re, datetime, logging, MySQLdb, settings
 16+import sys, getopt, re, datetime, logging, MySQLdb, settings, operator
1717 import networkx as nx
1818
1919 """ Import Analytics modules """
@@ -91,13 +91,13 @@
9292 self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s"
9393 self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s"
9494 self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1"
95 - self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 100"
 95+ self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 10000"
9696
9797 WSORSlaveDataLoader.__init__(self)
9898 logging.info('Creating CategoryLoader')
9999
100100 """
101 -
 101+ Retrieves all rows out of the category links table
102102 """
103103 def get_category_links(self):
104104
@@ -236,8 +236,8 @@
237237 """
238238 def extract_hierarchy(self):
239239
240 - #self.drop_category_links_cp_table()
241 - #self.create_category_links_cp_table()
 240+ self.drop_category_links_cp_table()
 241+ self.create_category_links_cp_table()
242242
243243 """ Create graph """
244244 logging.info('Initializing directed graph...')
@@ -256,24 +256,83 @@
257257 links = self.get_category_links()
258258 count = 0
259259
 260+ out_degrees = dict()
 261+ in_degrees = dict()
 262+ subcategories = dict()
 263+
 264+ """ Process subcategory links """
260265 for row in links:
261266
262267 cl_from = int(row[0])
263268 cl_to = str(row[1])
264269 cl_from = self.get_page_title(cl_from)
 270+
 271+ try:
 272+ subcategories[cl_from].append(cl_to)
265273
 274+ except KeyError:
 275+ subcategories[cl_from] = list()
 276+ subcategories[cl_from].append(cl_to)
 277+
 278+ try:
 279+ out_degrees[cl_from] = out_degrees[cl_from] + 1
 280+ except KeyError:
 281+ out_degrees[cl_from] = 1
 282+
 283+ try:
 284+ in_degrees[cl_to] = in_degrees[cl_to] + 1
 285+ except KeyError:
 286+ in_degrees[cl_to] = 1
 287+
266288 directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
267289
268 - if self.__DEBUG__:
 290+ if self.__DEBUG__ and count % 1000 == 0:
269291
270292 logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
271 - count = count + 1
 293+
 294+ count = count + 1
272295
 296+ logging.info('Sorting in degree list.')
 297+ sorted_in_degrees = sorted(in_degrees.iteritems(), key=operator.itemgetter(1), reverse=True)
 298+ logging.info('Sorting out degree list.')
 299+ sorted_out_degrees = sorted(out_degrees.iteritems(), key=operator.itemgetter(1), reverse=True)
 300+
 301+ in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees)
 302+
273303 logging.info('Category links finished processing.')
274304
275 - return directed_graph
 305+ return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only
276306
277307
 308+ """
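 309+ Returns two lists: the entries of in_degrees whose key does not appear in out_degrees, and the entries of out_degrees whose key does not appear in in_degrees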
 310+ """
 311+ def get_uni_directionally_linked_categories(self, in_degrees, out_degrees):
 312+
 313+ logging.info('Generating lists of categories have either only in degrees or out degrees.')
 314+
 315+ in_keys = list()
 316+ for i in in_degrees:
 317+ in_keys.append(i[0])
 318+
 319+ out_keys = list()
 320+ for i in out_degrees:
 321+ out_keys.append(i[0])
 322+
 323+ in_only = list()
 324+ out_only = list()
 325+
 326+ for i in in_degrees:
 327+ if not(i[0] in out_keys):
 328+ in_only.append(i)
 329+
 330+ for i in out_degrees:
 331+ if not(i[0] in in_keys):
 332+ out_only.append(i)
 333+
 334+ return in_only, out_only
 335+
 336+
278337 """ drop rfaulk.categorylinks_cp """
279338 def drop_category_links_cp_table(self):
280339

Status & tagging log