r94667 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r94666 | r94667 | r94668 >
Date: 17:47, 16 August 2011
Author: rfaulk
Status: deferred
Tags:
Comment:
modified construct_topic_tree() to build trees from all top level categories
added find_top_level_category() to determine categories from article page ids
added determine_all_page_categories() to categorize all pages
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified)

Diff

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -12,18 +12,21 @@
1313
1414
1515 """ Import python base modules """
16 -import sys, getopt, re, datetime, logging, MySQLdb, settings, operator, pickle
 16+import sys, getopt, re, datetime, logging, MySQLdb, operator, pickle, shelve, random
1717 import networkx as nx
1818
1919 """ Import Analytics modules """
2020 from Fundraiser_Tools.classes.DataLoader import DataLoader
 21+import WSOR.scripts.classes.settings as settings
2122
2223 """ Configure the logger """
2324 LOGGING_STREAM = sys.stderr
2425 logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
 26+# logging.basicConfig(level=logging.DEBUG, filename="categories.log", filemode='w', format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
2527
2628
2729
 30+
2831 """
2932 Inherits DataLoader
3033
@@ -102,16 +105,31 @@
103106 self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'"
104107 self._query_names_['drop_subcat_tbl'] = "drop table if exists rfaulk.categorylinks_cp;"
105108 self._query_names_['get_first_rec'] = "select cl_from from categorylinks_cp limit 1"
106 - self._query_names_['get_category_page_title'] = "select page_title from enwiki.page where page_id = %s"
 109+ self._query_names_['get_category_page_title'] = "select page_id, page_title from enwiki.page where %s"
107110 self._query_names_['get_category_page_id'] = "select page_id from enwiki.page where page_title = '%s' and page_namespace = 14"
108111 self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s"
109112 self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s"
110113 self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1"
111114 self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp"
 115+ self._query_names_['get_page_categories'] = "select cl_from, cl_to from enwiki.categorylinks where %s order by 1"
 116+ self._query_names_['get_all_page_ids'] = "select page_id from enwiki.page where page_namespace = 0 and page_len > 1000"
112117
 118+ self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));"
 119+ self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;"
 120+ self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;"
 121+
 122+
113123 WSORSlaveDataLoader.__init__(self)
114124 logging.info('Creating CategoryLoader')
115 -
 125+
 126+ self._max_depth_ = 100
 127+ self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
 128+ # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports']
 129+ self._block_words_ = ['categories', 'Categories', 'topic', 'Topic']
 130+ self._block_cats_ = ['']
 131+ self._topic_trees_ = dict()
 132+
 133+
116134 """
117135 Retrieves all rows out of the category links table
118136 """
@@ -129,8 +147,45 @@
130148
131149 return results
132150
 151+ """
 152+ Extract the categories for a given article
 153+ """
 154+ def get_page_categories(self, page_id_list):
 155+
 156+ categories = dict()
 157+ where_clause = ''
 158+
 159+ """ Initialize category lists for each page """
 160+ for id in page_id_list:
 161+ categories[id] = list()
 162+
 163+ try:
 164+
 165+ for id in page_id_list:
 166+ where_clause = where_clause + 'cl_from = %s or ' % str(id)
 167+ where_clause = where_clause[:-4]
 168+
 169+ sql = self._query_names_['get_page_categories'] % where_clause
 170+
 171+ logging.info('Retrieving page categories ...')
 172+ results = self.execute_SQL(sql)
 173+
 174+ """ walk through results and add to category lists """
 175+ for row in results:
 176+ id = int(row[0])
 177+ categories[id].append(row[1])
 178+
 179+ except Exception as inst:
 180+
 181+ logging.error('Could not retrieve page categories.')
 182+ logging.error(str(type(inst))) # the exception instance
 183+ logging.error(str(inst.args)) # arguments stored in .args
 184+ logging.error(inst.__str__()) # __str__ allows args to printed directly
 185+
 186+ return {}
 187+
 188+ return categories
133189
134 -
135190 """
136191 Retrieves the integer page id
137192 """
@@ -151,21 +206,38 @@
152207
153208 """
154209 Retrieves the string page title
 210+
 211+ This either manages a list of ids or a single id
155212 """
156213 def get_page_title(self, page_id):
157214
 215+ logging.info('Getting page titles ...')
 216+ is_list = isinstance(page_id, (list))
 217+
158218 try:
159 - sql = self._query_names_['get_category_page_title'] % page_id
160 - #logging.info('Executing: ' + sql)
 219+ if not(is_list):
 220+ where_clause = 'page_id = %s' % str(page_id)
 221+ else:
 222+ where_clause = ''
 223+ for id in page_id:
 224+ where_clause = where_clause + 'page_id = %s or ' % str(id)
 225+ where_clause = where_clause[:-4]
 226+
 227+ sql = self._query_names_['get_category_page_title'] % where_clause
161228 results = self.execute_SQL(sql)
162 - title = str(results[0][0])
163 -
 229+
 230+ if not(is_list):
 231+ title = str(results[0][1])
 232+ else:
 233+ title = dict()
 234+
 235+ for row in results:
 236+ title[int(row[0])] = str(row[1])
 237+
164238 except Exception as inst:
165239
166 - logging.error('Could not retrieve page_title.')
167 - logging.error(str(type(inst))) # the exception instance
168 - logging.error(str(inst.args)) # arguments stored in .args
169 - logging.error(inst.__str__()) # __str__ allows args to printed directly
 240+ logging.error('Could not retrieve page_title for page_id = %s.' % page_id)
 241+ self._log_file.write('Could not retrieve page_title for page_id = %s.\n' % (page_id))
170242
171243 return ''
172244
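
The reworked get_page_title() branches on isinstance() in several places; the same single-id-or-list convention can be normalized once up front, as in this sketch (the helper name is an assumption):

    def normalize_ids(page_id):
        """ Return a list whether a single id or a list of ids was passed """
        return page_id if isinstance(page_id, list) else [page_id]

    normalize_ids(42)         # -> [42]
    normalize_ids([1, 2, 3])  # -> [1, 2, 3]
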
@@ -181,7 +253,9 @@
182254
183255 #self.drop_category_links_cp_table()
184256 #self.create_category_links_cp_table()
185 -
 257+
 258+ self._log_file = open('category_miner.log', 'w')
 259+
186260 """ Create graph """
187261
188262 logging.info('Initializing directed graph...')
@@ -207,9 +281,9 @@
208282 """ Process subcategory links """
209283 for row in links:
210284
211 - cl_from = int(row[0])
212 - cl_to = str(row[1])
213 - cl_from = self.get_page_title(cl_from)
 285+ cl_from = str(row[1])
 286+ cl_to = int(row[0])
 287+ cl_to = self.get_page_title(cl_to)
214288
215289 try:
216290 subcategories[cl_from].append(cl_to)
@@ -230,10 +304,12 @@
231305
232306 directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
233307
234 - if self.__DEBUG__ and count % 1000 == 0:
 308+ if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'):
 309+ #if self.__DEBUG__ and count % 1000 == 0 :
235310
236311 logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
237 -
 312+ self._log_file.write('%s: %s -> %s\n' % (str(count), cl_from, cl_to))
 313+
238314 count = count + 1
239315
240316 logging.info('Sorting in degree list.')
@@ -244,12 +320,13 @@
245321 in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees, in_degrees, out_degrees)
246322
247323 logging.info('Category links finished processing.')
 324+ self._log_file.close()
248325
249326 return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only
250327
251328
252329 """
253 - Returns
 330+ Looks at the in and out degrees and constructs lists of nodes having only edges out and edges in
254331 """
255332 def get_uni_directionally_linked_categories(self, in_degrees, out_degrees, in_degrees_by_key, out_degrees_by_key ):
256333
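
The method body is unchanged by this revision; the idea named in the new docstring can be sketched with networkx degree queries on a toy graph (node names hypothetical):

    import networkx as nx

    g = nx.DiGraph([('Science', 'Physics'), ('Physics', 'Mechanics')])
    in_only = [n for n in g if g.out_degree(n) == 0]   # edges in only (leaves)
    out_only = [n for n in g if g.in_degree(n) == 0]   # edges out only (roots)
    # in_only == ['Mechanics'], out_only == ['Science']
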
@@ -307,9 +384,7 @@
308385 logging.error(str(type(inst))) # the exception instance
309386 logging.error(str(inst.args)) # arguments stored in .args
310387 logging.error(inst.__str__()) # __str__ allows args to printed directly
311 -
312 -
313 -
 388+
314389 """
315390 Are there any records remaining in rfaulk.categorylinks_cp ??
316391 """
@@ -334,64 +409,276 @@
335410 return False
336411 else:
337412 return True
338 -
339 -
 413+
340414 """
341415 Are there any records remaining in rfaulk.categorylinks_cp ??
342416
343417 Use a trace to detect any loops
344418 """
345 - def construct_topic_tree(self, topic, subcategories):
 419+ def construct_topic_tree(self, subcategories):
346420
347421 """ Create graph """
348422
349423 logging.info('Initializing directed graph...')
350 - directed_graph = nx.DiGraph()
351 - trace = [topic]
 424+ graph = nx.Graph()
 425+ self._count_ = 1
352426
353 - topic_counts = self._recursive_construct_topic_tree(directed_graph, topic, subcategories, trace)
 427+ subcategories['top_level_categories'] = self._top_level_cats_
 428+ topic = 'top_level_categories'
354429
355 - return directed_graph, topic_counts
 430+ depth = 0
 431+ logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
 432+ shortest_paths, topic_counts = self._recursive_construct_topic_tree(graph, topic, subcategories, depth)
356433
 434+
 435+ """ Pickle the result """
 436+ #logging.info('Pickling the shortest paths ...')
 437+ #self.pickle_var(shortest_paths, 'shortest_paths.p')
 438+
 439+ """ Shelve the result """
 440+ logging.info('Shelve the shortest paths ...')
 441+ d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
 442+ d['shortest_paths'] = shortest_paths
 443+ d.close()
 444+
 445+ return graph, shortest_paths
 446+
357447 """
358 - Are there any records remaining in rfaulk.categorylinks_cp ??
 448+ Recursively build the graph structure for categories based on the subcategory list
 449+
 450+ @param graph: NetworkX graph structure to store category linkage
 451+ @param topic: String topic name on which to build a recursive structure
 452+ @param subcategories: Dictionary of subcategory lists
 453+ @param depth: integer depth of the call within the recursion
 454+
359455 """
360 - def _recursive_construct_topic_tree(self, directed_graph, topic, subcategories, trace):
 456+ def _recursive_construct_topic_tree(self, graph, topic, subcategories, depth):
 457+
 458+ topic_counts = 1
 459+ depth = depth + 1
 460+ self._count_ = self._count_ + 1
361461
362 - topic_counts = 0
 462+ if self._count_ % 10000 == 0:
 463+ logging.info('Processed %s nodes. Graph size = %s.' % (str(self._count_), str(graph.number_of_nodes())))
363464
364465 """ Extract the subtopics of topic """
365466 try:
366 - topic_subcategories = subcategories[topic]
367 -
 467+ topic_subcategories = subcategories[topic]
 468+ new_subcategories = topic_subcategories[:]
 469+
 470+ """ Filter meta categories based on block words """
 471+ for sub_topic in topic_subcategories:
 472+ for block_word in self._block_words_:
 473+ if re.search(block_word, sub_topic):
 474+ new_subcategories.remove(sub_topic)
 475+ for block_cat in self._block_cats_:
 476+ if block_cat == sub_topic:
 477+ new_subcategories.remove(sub_topic)
 478+
 479+ topic_subcategories = new_subcategories
 480+
368481 except KeyError:
369482 """ There are no subcategories for this topic """
370483 return 1 # there is a topic count of 1
371484
372 - """ Recursively build linkages for each """
373 - # logging.info(str(trace))
 485+ """ Recursively build linkages for each .
 486+ DFS determining topic tree - this provides """
374487 for sub_topic in topic_subcategories:
375488
376 - if not(sub_topic in trace):
 489+ if depth == 1:
 490+ logging.info('Processing top level category: %s' % sub_topic)
 491+
 492+ if not(graph.has_node(sub_topic)):
377493
378 - logging.info(topic + ' --> ' + sub_topic)
 494+ graph.add_edge(topic, sub_topic)
 495+
 496+ """ Only go deeper if the maximum recursive depth has not been reached """
 497+ if depth < self._max_depth_:
 498+ sub_topic_counts = self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth)
 499+ else:
 500+ sub_topic_counts = 1
379501
380 - copy_trace = trace[:]
381 - copy_trace.append(sub_topic)
382 -
383 - directed_graph.add_weighted_edges_from([(topic, sub_topic, 1)])
384 - sub_topic_counts = self._recursive_construct_topic_tree(directed_graph, sub_topic, subcategories, copy_trace)
385 -
386502 topic_counts = topic_counts + sub_topic_counts
387503
388504 else:
389505
390 - logging.info('LOOP: ' + topic + ' --> ' + sub_topic)
 506+ """ Add the 'loop' edge if and only if it is not a top level catagory """
 507+ if not(sub_topic in self._top_level_cats_):
 508+ graph.add_edge(topic, sub_topic)
 509+ topic_counts = topic_counts + 1
 510+
 511+ """ After the recursion is complete compute the shortest paths """
 512+ if depth == 1:
 513+
 514+ shortest_paths = dict()
 515+
 516+ for sub_topic in self._top_level_cats_:
 517+ logging.info('Computing shortest paths for %s ...' % sub_topic)
 518+ shortest_paths[sub_topic] = nx.single_source_dijkstra_path(graph, sub_topic)
391519
392 - directed_graph.add_weighted_edges_from([(topic, 'LOOP TO: ' + sub_topic, 1)])
393 -
 520+ """ Store the lengths rather than the paths """
 521+ for target in shortest_paths[sub_topic]:
 522+ shortest_paths[sub_topic][target] = len(shortest_paths[sub_topic][target])
 523+
 524+ return shortest_paths, topic_counts
 525+
394526 return topic_counts
 527+
 528+ """
 529+ Pickles variables that store the state of the category graph
 530+ """
 531+ def pickle_all(self, directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only):
 532+
 533+ self.pickle_var(directed_graph, 'full_topic_graph.p')
 534+ self.pickle_var(in_degrees, 'in_degrees_dict.p')
 535+ self.pickle_var(out_degrees, 'out_degrees_dict.p')
 536+ self.pickle_var(sorted_out_degrees, 'sorted_out_degrees_dict.p')
 537+ self.pickle_var(sorted_in_degrees, 'sorted_in_degrees_dict.p')
 538+ self.pickle_var(subcategories, 'subcategories.p')
 539+ self.pickle_var(in_only, 'in_only.p')
 540+ self.pickle_var(out_only, 'out_only.p')
 541+
 542+ """
 543+ @param page_ids: a list of pages to classify
 544+ """
 545+ def find_top_level_category(self, page_ids, shortest_paths):
 546+
 547+ # self._topic_trees_ = dict()
 548+ titles = dict()
 549+ depths = dict()
 550+ page_categories = dict()
 551+ page_tl_cat = dict()
 552+ cat_winner = dict()
 553+ win_count = dict()
 554+
 555+ """ Get categories for pages - Initialize depth dictionaries for top level categories """
 556+ logging.info('Initializing data structures ...')
 557+ page_categories = self.get_page_categories(page_ids)
 558+ titles = self.get_page_title(page_ids)
 559+
 560+ for page_id in page_ids:
 561+ # page_categories[page_id] = self.get_page_categories(page_id)
 562+ title = titles[page_id]
 563+ depths[title] = dict()
 564+
 565+ """ Initialize dictionaries to store the depth scores for top level categories """
 566+ for category in page_categories[page_id]:
 567+ depths[title][category] = dict()
 568+
 569+ """ Iterate through each page, category, and top level category
 570+ Perform a breadth first search for the node to determine the depth """
 571+ logging.info('Finding category depths in each topic tree ...')
 572+
 573+ for page_id in page_ids:
 574+
 575+ # logging.info('For %s classifying categories: %s...' % (title, str(page_categories[page_id])))
 576+ title = titles[page_id]
 577+ cat_winner[title] = dict()
 578+
 579+ """ Initialize the number of top level categorizations for each top level category """
 580+ win_count[title] = dict()
 581+ for tl_cat in self._top_level_cats_:
 582+ win_count[title][tl_cat] = 0
 583+
 584+ """ Go through each category for a page and find out which top level cat is closest """
 585+ for category in page_categories[page_id]:
395586
 587+ cat_winner[title][category] = list()
 588+ min_depth = self._max_depth_
 589+ for tl_cat in self._top_level_cats_:
 590+
 591+ """ Use shortest paths """
 592+ try:
 593+ depths[title][category][tl_cat] = shortest_paths[tl_cat][category]
 594+ except KeyError:
 595+ depths[title][category][tl_cat] = 99
 596+
 597+ if depths[title][category][tl_cat] < min_depth:
 598+ cat_winner[title][category].append(tl_cat)
 599+ min_depth = depths[title][category][tl_cat]
 600+ elif depths[title][category][tl_cat] == min_depth:
 601+ cat_winner[title][category].append(tl_cat) # a tie - more than one winner is possible
 602+
 603+ """ Randomly choose to tie breakers """
 604+ if len(cat_winner[title][category]) > 0:
 605+ random.shuffle(cat_winner[title][category])
 606+ cat_winner[title][category] = cat_winner[title][category][0]
 607+ else:
 608+ cat_winner[title][category] = None
 609+
 610+ winner = cat_winner[title][category] # this a top level category
 611+ if not(winner == None):
 612+ win_count[title][winner] = win_count[title][winner] + 1
 613+
 614+ """ Classify the top level categories for each page """
 615+ page_tl_cat[title] = None
 616+ best_count = 0
 617+ for tl_cat in self._top_level_cats_:
 618+ if win_count[title][tl_cat] > best_count:
 619+ page_tl_cat[title] = tl_cat
 620+ best_count = win_count[title][tl_cat]
 621+ elif win_count[title][tl_cat] == best_count and best_count > 0:
 622+ page_tl_cat[title] = page_tl_cat[title] + ' / ' + tl_cat
 623+
 624+ return titles, page_tl_cat # , depths, cat_winner
 625+
 626+ """
 627+ Builds a table containing all main namespace pages and their chosen categories
 628+ """
 629+ def determine_all_page_categories(self):
 630+
 631+ sql_create = self._query_names_['create_page_category']
 632+ sql_drop = self._query_names_['drop_page_category']
 633+ sql_insert = self._query_names_['insert_page_category']
 634+
 635+ logging.info('CATEGORIZING PAGES: Initializing tables ... ')
 636+ self.execute_SQL(sql_drop)
 637+ self.execute_SQL(sql_create)
 638+
 639+ logging.info('CATEGORIZING PAGES: Getting all pages ... ')
 640+ sql_get_page_ids = self._query_names_['get_all_page_ids']
 641+ results = self.execute_SQL(sql_get_page_ids)
 642+
 643+ page_ids = list()
 644+ for row in results:
 645+ page_ids.append(int(row[0]))
 646+
 647+ logging.info('CATEGORIZING PAGES: Unshelving shortest paths ... ')
 648+ # shortest_paths = self.unpickle_var('shortest_paths.p')
 649+
 650+ d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
 651+ shortest_paths = d['shortest_paths']
 652+
 653+ logging.info('CATEGORIZING PAGES: Computing categories ... ')
 654+ titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths)
 655+ ids = dict((i,v) for v,i in titles.iteritems())
 656+
 657+ logging.info('CATEGORIZING PAGES: Performing inserts ... ')
 658+ page_id_str = ''
 659+ for title in page_tl_cat:
 660+ id = ids[title]
 661+ category = page_tl_cat[title]
 662+
 663+ parts = title.split("'")
 664+ new_title = parts[0]
 665+ parts = parts[1:]
 666+ for part in parts:
 667+ new_title = new_title + " " + part
 668+
 669+ page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
 670+ try:
 671+ self.execute_SQL(sql_insert % page_id_str)
 672+ except:
 673+ logging.info('Could not insert: %s ... ' % new_title)
 674+ pass
 675+ # page_ids.append(str(row[0]))
 676+ # page_id_str = page_id_str[:-1]
 677+
 678+ #logging.info('CATEGORIZING PAGES: Inserting page ids into rfaulk.page_category ... ')
 679+ #self.execute_SQL(sql_insert % page_id_str)
 680+
 681+ d.close()
 682+
396683 """
397684 Inherits WSORSlaveDataLoader
398685

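Taken together, construct_topic_tree() shelves shortest path lengths from each top level category and find_top_level_category() votes over them per page. A condensed, self-contained sketch of that scheme on a hypothetical mini-graph (the real graph is built from the enwiki category links, and ties in the diff produce a joint 'A / B' label rather than a single winner):

    import random
    import networkx as nx

    top_level = ['Mathematics', 'Music']  # stand-ins for self._top_level_cats_
    g = nx.Graph([('Mathematics', 'Probability'), ('Probability', 'Markov_chains'),
                  ('Music', 'Jazz')])

    # depth of every reachable category below each top level category
    paths = dict((t, nx.single_source_shortest_path_length(g, t)) for t in top_level)

    def classify(page_categories):
        votes = dict((t, 0) for t in top_level)
        for cat in page_categories:
            depths = [(paths[t].get(cat, 99), t) for t in top_level]  # 99 = unreachable
            min_depth = min(depths)[0]
            winners = [t for d, t in depths if d == min_depth]
            votes[random.choice(winners)] += 1   # random tie break, as in the diff
        return max(votes, key=votes.get)         # majority vote across the page's categories

    classify(['Markov_chains', 'Probability'])   # -> 'Mathematics'
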
Status & tagging log