r95296 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r95295 | r95296 | r95297 >
Date: 01:38, 23 August 2011
Author: rfaulk
Status: deferred
Tags:
Comment:
added new way of computing category classifications
retain both shortest paths and topic counts from category structure
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified) (history)

Diff

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -13,7 +13,7 @@
1414
1515 """ Import python base modules """
1616 import sys, getopt, re, datetime, logging, MySQLdb, operator, pickle, shelve, random
17 -import networkx as nx
 17+import networkx as nx, numpy as np, scipy.stats as ss
1818
1919 """ Import Analytics modules """
2020 from Fundraiser_Tools.classes.DataLoader import DataLoader
@@ -98,8 +98,11 @@
9999 """
100100 class CategoryLoader(WSORSlaveDataLoader):
101101
102 - def __init__(self):
 102+ def __init__(self, subcategories):
103103
 104+ logging.info('Creating CategoryLoader')
 105+ WSORSlaveDataLoader.__init__(self)
 106+
104107 self.__DEBUG__ = True
105108
106109 self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'"
@@ -116,19 +119,26 @@
117120
118121 self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));"
119122 self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;"
 123+ self._query_names_['index1_page_category'] = "create index idx_page_id on rfaulk.page_category (page_id);"
 124+ self._query_names_['index2_page_category'] = "create index idx_page_title on rfaulk.page_category (page_title);"
 125+ self._query_names_['index3_page_category'] = "create index idx_category on rfaulk.page_category (category);"
120126 self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;"
121127
 128+ self._regexp_list_ = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]', '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]', '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', \
 129+ '^[Ss]', '^[Tt]', '^[Uu]', '^[Vv]', '^[Ww]', '^[Xx]', '^[Yy]', '^[Zz]', '^[^A-Za-z]']
122130
123 - WSORSlaveDataLoader.__init__(self)
124 - logging.info('Creating CategoryLoader')
 131+ self._max_depth_ = 50
 132+ self._main_topic_ = 'Main_topic_classifications'
 133+ self._top_level_cats_ = subcategories[self._main_topic_][:]
 134+ self._top_level_cats_.remove('Chronology')
125135
126 - self._max_depth_ = 100
127 - self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
 136+
 137+ #self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'People', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
128138 # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports']
129139 self._block_words_ = ['categories', 'Categories', 'topic', 'Topic']
130140 self._block_cats_ = ['']
131141 self._topic_trees_ = dict()
132 -
 142+
133143
134144 """
135145 Retrieves all rows out of the category links table
@@ -150,30 +160,23 @@
151161 """
152162 Extract the categories for a given article
153163 """
154 - def get_page_categories(self, page_id_list):
 164+ def get_page_categories(self, page_id):
155165
156 - categories = dict()
 166+ categories = list()
157167 where_clause = ''
158168
159 - """ Initialize category lists for each page """
160 - for id in page_id_list:
161 - categories[id] = list()
162 -
 169+ """ Execute SQL query to retrieve categories for page of page id """
163170 try:
164171
165 - for id in page_id_list:
166 - where_clause = where_clause + 'cl_from = %s or ' % str(id)
167 - where_clause = where_clause[:-4]
168 -
 172+ where_clause = where_clause + 'cl_from = %s' % str(page_id)
169173 sql = self._query_names_['get_page_categories'] % where_clause
170 -
171 - logging.info('Retrieving page categories ...')
 174+
172175 results = self.execute_SQL(sql)
173176
174177 """ walk through results and add to category lists """
175178 for row in results:
176179 id = int(row[0])
177 - categories[id].append(row[1])
 180+ categories.append(row[1])
178181
179182 except Exception as inst:
180183
@@ -182,7 +185,7 @@
183186 logging.error(str(inst.args)) # arguments stored in .args
184187 logging.error(inst.__str__()) # __str__ allows args to printed directly
185188
186 - return {}
 189+ return []
187190
188191 return categories
189192
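The reworked get_page_categories above takes a single page_id and returns a flat list of categories. The SQL behind the 'get_page_categories' template is not shown in this revision, so the following standalone sketch assumes a direct query against enwiki.categorylinks:

    import MySQLdb

    def get_page_categories(conn, page_id):
        """ Return the categories of one page as a flat list ([] on no rows);
            the table and column names are assumptions based on the queries above. """
        cursor = conn.cursor()
        cursor.execute(
            "select cl_from, cl_to from enwiki.categorylinks where cl_from = %s",
            (int(page_id),))
        return [row[1] for row in cursor.fetchall()]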
@@ -211,7 +214,7 @@
212215 """
213216 def get_page_title(self, page_id):
214217
215 - logging.info('Getting page titles ...')
 218+ # logging.info('Getting page titles ...')
216219 is_list = isinstance(page_id, (list))
217220
218221 try:
@@ -232,13 +235,18 @@
233236 title = dict()
234237
235238 for row in results:
236 - title[int(row[0])] = str(row[1])
237 -
 239+
 240+ try:
 241+ title[int(row[0])] = str(row[1])
 242+
 243+ except:
 244+ logging.error('Could not retrieve page_title for %s.' % row)
 245+ pass
 246+
238247 except Exception as inst:
239248
240 - logging.error('Could not retrieve page_title for page_id = %s.' % page_id)
241 - self._log_file.write('Could not retrieve page_title for page_id = %s.\n' % (page_id))
242 -
 249+ logging.error('Could not retrieve page_title for page_id.')
 250+
243251 return ''
244252
245253 return title
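The revised loop above guards each row conversion so that one malformed row no longer aborts the whole title mapping. A minimal sketch of that pattern, with toy rows assumed:

    import logging

    results = [(1, 'Mathematics'), ('bad_id', 'Broken_row')]

    title = {}
    for row in results:
        try:
            title[int(row[0])] = str(row[1])
        except (ValueError, TypeError):
            # log and skip the bad row instead of failing the batch
            logging.error('Could not retrieve page_title for %s.' % (row,))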
@@ -254,8 +262,6 @@
255263 #self.drop_category_links_cp_table()
256264 #self.create_category_links_cp_table()
257265
258 - self._log_file = open('category_miner.log', 'w')
259 -
260266 """ Create graph """
261267
262268 logging.info('Initializing directed graph...')
@@ -304,11 +310,10 @@
305311
306312 directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
307313
308 - if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'):
 314+ #if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'):
309315 #if self.__DEBUG__ and count % 1000 == 0 :
310316
311 - logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
312 - self._log_file.write('%s: %s -> %s\n' % (str(count), cl_from, cl_to))
 317+ # logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
313318
314319 count = count + 1
315320
@@ -320,7 +325,6 @@
321326 in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees, in_degrees, out_degrees)
322327
323328 logging.info('Category links finished processing.')
324 - self._log_file.close()
325329
326330 return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only
327331
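For reference, a minimal sketch of the graph build this hunk trims the logging from, assuming (cl_from, cl_to) pairs already fetched from the links table; the toy rows and the DiGraph choice are illustrative:

    import networkx as nx

    # toy category links: (subcategory, parent category)
    rows = [('Probability', 'Mathematics'), ('Statistics', 'Mathematics'),
            ('Mathematics', 'Main_topic_classifications')]

    directed_graph = nx.DiGraph()
    for cl_from, cl_to in rows:
        directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])  # unit weights

    in_degrees = dict(directed_graph.in_degree())
    out_degrees = dict(directed_graph.out_degree())
    sorted_in_degrees = sorted(in_degrees.items(), key=lambda kv: kv[1], reverse=True)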
@@ -419,26 +423,30 @@
420424
421425 """ Create graph """
422426
423 - logging.info('Initializing directed graph...')
 427+ logging.info('Initializing directed graph and topic counts ...')
424428 graph = nx.Graph()
 429+ topic_counts = dict()
425430 self._count_ = 1
426431
427 - subcategories['top_level_categories'] = self._top_level_cats_
428 - topic = 'top_level_categories'
429 -
430432 depth = 0
431 - logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
432 - shortest_paths, topic_counts = self._recursive_construct_topic_tree(graph, topic, subcategories, depth)
 433+ logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
 434+ shortest_paths = self._recursive_construct_topic_tree(graph, self._main_topic_, subcategories, depth)
433435
434 -
435 - """ Pickle the result """
436 - #logging.info('Pickling the shortest paths ...')
437 - #self.pickle_var(shortest_paths, 'shortest_paths.p')
438 -
 436+ max_depth = 5
 437+ logging.info('Computing recursive sub-category counts, MAX DEPTH = %s ...' % max_depth)
 438+ count = 0
 439+ for title in shortest_paths[shortest_paths.keys()[0]].keys():
 440+
 441+ # logging.info('topic counts processed for %s ...' % title)
 442+ topic_counts[title] = self.get_subcategory_count(title, subcategories, 0, max_depth)
 443+ count = count + 1
 444+
439445 """ Shelve the result """
440446 logging.info('Shelve the shortest paths ...')
441 - d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
 447+ d = shelve.open( settings.__data_file_dir__ + 'topic_tree.s')
442448 d['shortest_paths'] = shortest_paths
 449+ d['topic_counts'] = topic_counts
 450+
443451 d.close()
444452
445453 return graph, shortest_paths
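The shelf now stores both artifacts under the new 'topic_tree.s' name. A small sketch of the round trip, with the path and values assumed:

    import shelve

    shortest_paths = {'Mathematics': {'Probability': 2}}  # path lengths, not paths
    topic_counts = {'Probability': 12}                    # bounded fan-out counts

    d = shelve.open('/tmp/topic_tree.s')
    d['shortest_paths'] = shortest_paths
    d['topic_counts'] = topic_counts
    d.close()

    # consumers such as determine_all_page_categories() read both keys back
    d = shelve.open('/tmp/topic_tree.s')
    shortest_paths, topic_counts = d['shortest_paths'], d['topic_counts']
    d.close()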
@@ -454,7 +462,6 @@
455463 """
456464 def _recursive_construct_topic_tree(self, graph, topic, subcategories, depth):
457465
458 - topic_counts = 1
459466 depth = depth + 1
460467 self._count_ = self._count_ + 1
461468
@@ -483,30 +490,25 @@
484491
485492 """ Recursively build linkages for each .
486493 DFS determining topic tree - this provides """
487 - for sub_topic in topic_subcategories:
488 -
489 - if depth == 1:
490 - logging.info('Processing top level catgory: %s' % sub_topic)
491 -
492 - if not(graph.has_node(sub_topic)):
493 -
494 - graph.add_edge(topic, sub_topic)
 494+ """ Only go deeper if the maximum recursive depth has not been reached """
 495+ if depth < self._max_depth_:
 496+ for sub_topic in topic_subcategories:
495497
496 - """ Only go deeper if the maximum recursive depth has not been reached """
497 - if depth < self._max_depth_:
498 - sub_topic_counts = self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth)
 498+ if depth == 1:
 499+ logging.info('Processing top level category: %s' % sub_topic)
 500+
 501+ """ Check if the subtopic node is already in the graph - if so add a loop edge """
 502+ if not(graph.has_node(sub_topic)):
 503+
 504+ graph.add_edge(topic, sub_topic)
 505+ self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth)
 506+
499507 else:
500 - sub_topic_counts = 1
501508
502 - topic_counts = topic_counts + sub_topic_counts
503 -
504 - else:
505 -
506 - """ Add the 'loop' edge if and only if it is not a top level catagory """
507 - if not(sub_topic in self._top_level_cats_):
508 - graph.add_edge(topic, sub_topic)
509 - topic_counts = topic_counts + 1
510 -
 509+ """ Add the 'loop' edge if and only if it is not a top level category """
 510+ if not(sub_topic in self._top_level_cats_):
 511+ graph.add_edge(topic, sub_topic)
 512+
511513 """ After the recursion is complete compute the shortest paths """
512514 if depth == 1:
513515
@@ -519,10 +521,14 @@
520522 """ Store the lengths rather than the paths """
521523 for target in shortest_paths[sub_topic]:
522524 shortest_paths[sub_topic][target] = len(shortest_paths[sub_topic][target])
 525+
 526+ """ Get the shortest path lengths for the main topic also """
 527+ shortest_paths[self._main_topic_] = nx.single_source_dijkstra_path(graph, self._main_topic_)
 528+ for target in shortest_paths[self._main_topic_]:
 529+ shortest_paths[self._main_topic_][target] = len(shortest_paths[self._main_topic_][target])
523530
524 - return shortest_paths, topic_counts
 531+ return shortest_paths
525532
526 - return topic_counts
527533
528534 """
529535 Pickles variables that store the state of the category graph
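The depth == 1 block above replaces each stored path with its length; note that len(path) counts nodes, so a direct neighbor gets length 2. A sketch of the conversion on a toy graph:

    import networkx as nx

    g = nx.Graph()
    g.add_edges_from([('Mathematics', 'Probability'),
                      ('Probability', 'Stochastic_processes')])

    shortest_paths = {'Mathematics': nx.single_source_dijkstra_path(g, 'Mathematics')}
    for target in shortest_paths['Mathematics']:
        shortest_paths['Mathematics'][target] = len(shortest_paths['Mathematics'][target])

    # shortest_paths['Mathematics'] is now
    # {'Mathematics': 1, 'Probability': 2, 'Stochastic_processes': 3}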
@@ -539,90 +545,191 @@
540546 self.pickle_var(out_only, 'out_only.p')
541547
542548 """
 549+ Given a set of article ids, determine their corresponding representation in category space
 550+
543551 @param page_ids: a list of pages to classify
 552+ @param shortest_paths: a dictionary of the shortest paths from all top level categories to sub-categories
544553 """
545 - def find_top_level_category(self, page_ids, shortest_paths):
 554+ def find_top_level_category(self, page_ids, shortest_paths, topic_counts):
546555
547 - # self._topic_trees_ = dict()
548556 titles = dict()
549 - depths = dict()
550557 page_categories = dict()
551558 page_tl_cat = dict()
552 - cat_winner = dict()
553 - win_count = dict()
 559+ # win_count = dict()
554560
 561+ self._num_tl_cats_ = len(self._top_level_cats_)
 562+ tl_cat_vectors = dict()
 563+
555564 """ Get categories for pages - Initialize depth dictionaries for top level categories """
556565 logging.info('Initializing data structures ...')
557 - page_categories = self.get_page_categories(page_ids)
558 - titles = self.get_page_title(page_ids)
559566
560 - for page_id in page_ids:
561 - # page_categories[page_id] = self.get_page_categories(page_id)
562 - title = titles[page_id]
563 - depths[title] = dict()
 567+
 568+ logging.info('Getting page titles and categories ...')
 569+ counter = 0
 570+ for id in page_ids:
 571+ titles[id] = self.get_page_title(id)
 572+ page_categories[id] = self.get_page_categories(id)
564573
565 - """ Initialize dictionaries to store the depth scores for top level categories """
566 - for category in page_categories[page_id]:
567 - depths[title][category] = dict()
568 -
 574+ counter = counter + 1
 575+
 576+ if counter % 100000 == 0:
 577+ logging.info('%s page titles and categories processed ...' % counter)
 578+
569579 """ Iterate through each page, category, and top level category
570580 Perform a breadth first search for the node to determine the depth """
571581 logging.info('Finding category depths in each topic tree ...')
572582
573583 for page_id in page_ids:
574 -
575 - # logging.info('For %s classifying categories: %s...' % (title, str(page_categories[page_id])))
 584+
 585+ """
 586+ Retrieve page categories for each page using one of the classification methods available
 587+ rank_categories_M1()
 588+ rank_categories_M2()
 589+ """
576590 title = titles[page_id]
577 - cat_winner[title] = dict()
 591+ page_tl_cat[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts)
 592+
578593
579 - """ Initialize the number of top level categorizations for each top level category """
580 - win_count[title] = dict()
581 - for tl_cat in self._top_level_cats_:
582 - win_count[title][tl_cat] = 0
 594+ return titles, page_tl_cat #, tl_cat_vectors # , depths, cat_winner
 595+
 596+ """
 597+ Method for determining top level category for a set of categories
 598+
 599+ This method looks at the closest top level categories for each category and chooses the category or categories that appear most often.
 600+
 601+ @param categories: String list of categories to classify
 602+ @param shortest_paths: dictionary of shortest paths indexed by categories
 603+
 604+ @return: String indicating the top level category(ies)
 605+
 606+ """
 607+ def rank_categories_M1(self, categories, shortest_paths):
 608+
 609+ """ Initialize the number of top level categorizations for each top level category """
 610+ win_count = dict()
 611+ for tl_cat in self._top_level_cats_:
 612+ win_count[tl_cat] = 0
 613+
 614+ for category in categories:
583615
584 - """ Go through each category for a page and find out which top level cat is closest """
585 - for category in page_categories[page_id]:
 616+ cat_winner = list()
 617+ min_depth = self._max_depth_
 618+
 619+ for index, tl_cat in enumerate(self._top_level_cats_):
 620+ # for tl_cat in self._top_level_cats_:
586621
587 - cat_winner[title][category] = list()
588 - min_depth = self._max_depth_
589 - for tl_cat in self._top_level_cats_:
590 -
591 - """ Use shortest paths """
592 - try:
593 - depths[title][category][tl_cat] = shortest_paths[tl_cat][category]
594 - except KeyError:
595 - depths[title][category][tl_cat] = 99
596 -
597 - if depths[title][category][tl_cat] < min_depth:
598 - cat_winner[title][category].append(tl_cat)
599 - min_depth = depths[title][category][tl_cat]
600 - elif depths[title][category][tl_cat] == min_depth:
601 - cat_winner[title][category].append(tl_cat) # there can only be one winner
 622+ """ Use shortest paths """
 623+ try:
 624+ path_length = shortest_paths[tl_cat][category]
602625
603 - """ Randomly choose to tie breakers """
604 - if len(cat_winner[title][category]) > 0:
605 - random.shuffle(cat_winner[title][category])
606 - cat_winner[title][category] = cat_winner[title][category][0]
607 - else:
608 - cat_winner[title][category] = None
609 -
610 - winner = cat_winner[title][category] # this a top level category
611 - if not(winner == None):
612 - win_count[title][winner] = win_count[title][winner] + 1
 626+ except KeyError:
 627+ path_length = self._max_depth_
 628+
 629+ if path_length < min_depth:
 630+ cat_winner = [tl_cat]
 631+ min_depth = path_length
 632+ elif path_length == min_depth:
 633+ cat_winner.append(tl_cat) # ties are collected and broken randomly below
613634
614 - """ Classify the top level categories for each page """
615 - page_tl_cat[title] = None
616 - best_count = 0
617 - for tl_cat in self._top_level_cats_:
618 - if win_count[title][tl_cat] > best_count:
619 - page_tl_cat[title] = tl_cat
620 - best_count = win_count[title][tl_cat]
621 - elif win_count[title][tl_cat] == best_count and best_count > 0:
622 - page_tl_cat[title] = page_tl_cat[title] + ' / ' + tl_cat
623 -
624 - return titles, page_tl_cat # , depths, cat_winner
 635+ """ Randomly choose to tie breakers """
 636+ if len(cat_winner) > 0:
 637+ random.shuffle(cat_winner)
 638+ cat_winner = cat_winner[0]
 639+ else:
 640+ cat_winner = None
 641+
 642+ winner = cat_winner # this is a top level category
 643+ if not(winner == None):
 644+ win_count[winner] = win_count[winner] + 1
 645+
 646+
 647+ """ Classify the top level categories for each page """
625648
 649+ best_count = 0
 650+
 651+ for tl_cat in self._top_level_cats_:
 652+ if win_count[tl_cat] > best_count:
 653+ page_tl_cat = tl_cat
 654+ best_count = win_count[tl_cat]
 655+ elif win_count[tl_cat] == best_count and best_count > 0:
 656+ page_tl_cat = page_tl_cat + ' / ' + tl_cat
 657+
 658+ return page_tl_cat
 659+
626660 """
 661+ Method for determining top level category for a set of categories
 662+
 663+ This method looks at the closest top level categories for each category and constructs vector representations based on path lengths. The dimensions of the
 664+ vector space are the top level categories. The value for a given dimension is determined by the sum of the path lengths from each category to the top-level
 665+ category, where each summand is weighted by the distance of the category from the main topic.
 666+
 667+ @param categories: String list of categories to classify
 668+ @param shortest_paths: dictionary of shortest paths indexed by categories
 669+
 670+ @return: String indicating the top level category(ies)
 671+ """
 672+ def rank_categories_M2(self, categories, shortest_paths, topic_counts):
 673+
 674+ """ Go through each category for a page and find out which top level cat is closest """
 675+
 676+ tl_cat_vectors = np.zeros(len(self._top_level_cats_)) # initialize the vector in top-level category space
 677+ page_tl_cat = [0] * len(self._top_level_cats_) # the top level cats
 678+
 679+ for category in categories:
 680+
 681+ """ Compute the weight of this category """
 682+ try:
 683+ path_length_from_main = shortest_paths[self._main_topic_][category]
 684+ except:
 685+ path_length_from_main = self._max_depth_
 686+
 687+ """ The fanout weight is based on the fanout of a category for a fixed depth .. if there is no fanout for this topic it probably has vary few or no
 688+ subtopics so assign a fanout of 1.0 """
 689+ try:
 690+ fanout_weight = float(topic_counts[category])
 691+ except:
 692+ fanout_weight = 1.0
 693+ pass
 694+
 695+ """ The total weight of this category depends on the product of how far it lies from the root and the inverse of its fanout
 696+ this makes more specialized categories worth more """
 697+ category_weight = float(path_length_from_main) * fanout_weight
 698+
 699+ # category_weight = 1 / float(path_length_from_main) * self._max_depth_
 700+
 701+ cat_winner = list()
 702+ min_depth = self._max_depth_
 703+
 704+ for index, tl_cat in enumerate(self._top_level_cats_):
 705+
 706+ """ Use shortest paths """
 707+ try:
 708+ path_length = shortest_paths[tl_cat][category]
 709+ except KeyError:
 710+ path_length = 100
 711+
 712+ tl_cat_vectors[index] = tl_cat_vectors[index] + category_weight * np.power(path_length, 0.5)
 713+
 714+ """ Normalize the vector representation """
 715+ ranks = np.floor(ss.rankdata(tl_cat_vectors)) - 1
 716+ vec = max(tl_cat_vectors) - tl_cat_vectors
 717+ tl_cat_vectors = vec / float(sum(vec))
 718+
 719+ """ Choose the top categories """
 720+ for i in range(self._num_tl_cats_):
 721+ index = np.argmin(ranks)
 722+ ranks[index] = self._num_tl_cats_ + 1
 723+ page_tl_cat[i] = self._top_level_cats_[index]
 724+
 725+ top_five_cats = ''
 726+ for i in range(5):
 727+ top_five_cats = top_five_cats + page_tl_cat[i] + ', '
 728+ top_five_cats = top_five_cats[:-2]
 729+ page_tl_cat = top_five_cats
 730+
 731+ return page_tl_cat
 732+
 733+ """
627734 Builds a table containing all main namespace pages and their chosen categories
628735 """
629736 def determine_all_page_categories(self):
@@ -634,51 +741,84 @@
635742 logging.info('CATEGORIZING PAGES: Initializing tables ... ')
636743 self.execute_SQL(sql_drop)
637744 self.execute_SQL(sql_create)
 745+ self.execute_SQL(self._query_names_['index1_page_category'])
 746+ self.execute_SQL(self._query_names_['index2_page_category'])
 747+ self.execute_SQL(self._query_names_['index3_page_category'])
638748
639 - logging.info('CATEGORIZING PAGES: Getting all pages ... ')
640 - sql_get_page_ids = self._query_names_['get_all_page_ids']
641 - results = self.execute_SQL(sql_get_page_ids)
642 -
643 - page_ids = list()
644 - for row in results:
645 - page_ids.append(int(row[0]))
646 -
647749 logging.info('CATEGORIZING PAGES: Unshelving shortest paths ... ')
648 - # shortest_paths = self.unpickle_var('shortest_paths.p')
 750+ d = shelve.open( settings.__data_file_dir__ + 'topic_tree.s')
649751
650 - d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
651752 shortest_paths = d['shortest_paths']
 753+ topic_counts = d['topic_counts']
652754
653 - logging.info('CATEGORIZING PAGES: Computing categories ... ')
654 - titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths)
655 - ids = dict((i,v) for v,i in titles.iteritems())
656 -
657 - logging.info('CATEGORIZING PAGES: Performing inserts ... ')
658 - page_id_str = ''
659 - for title in page_tl_cat:
660 - id = ids[title]
661 - category = page_tl_cat[title]
 755+ """
 756+ Break up processing to handle records with page titles matching the regular expressions in _regexp_list_
 757+ """
 758+ for regexp in self._regexp_list_:
662759
663 - parts = title.split("'")
664 - new_title = parts[0]
665 - parts = parts[1:]
666 - for part in parts:
667 - new_title = new_title + " " + part
668 -
669 - page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
670 - try:
671 - self.execute_SQL(sql_insert % page_id_str)
672 - except:
673 - logging.info('Could not insert: %s ... ' % new_title)
674 - pass
675 - # page_ids.append(str(row[0]))
676 - # page_id_str = page_id_str[:-1]
 760+ logging.info('CATEGORIZING PAGES: Getting pages for %s ... ' % regexp)
 761+
 762+ sql_get_page_ids = self._query_names_['get_all_page_ids'] + " and page_title regexp '%s';" % regexp
 763+ results = self.execute_SQL(sql_get_page_ids)
 764+
 765+ page_ids = list()
 766+ for row in results:
 767+ page_ids.append(int(row[0]))
 768+
 769+ logging.info('CATEGORIZING PAGES: Computing categories ... ')
 770+ titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths, topic_counts)
 771+ ids = dict((i,v) for v,i in titles.iteritems())
 772+
 773+ logging.info('CATEGORIZING PAGES: Performing inserts ... ')
 774+ page_id_str = ''
 775+ for title in page_tl_cat:
 776+ id = ids[title]
 777+ category = page_tl_cat[title]
 778+
 779+ parts = title.split("'")
 780+ new_title = parts[0]
 781+ parts = parts[1:]
 782+ for part in parts:
 783+ new_title = new_title + " " + part
 784+
 785+ page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
 786+ try:
 787+ self.execute_SQL(sql_insert % page_id_str)
 788+ except:
 789+ logging.info('Could not insert: %s ... ' % new_title)
 790+ pass
 791+
 792+
 793+ d.close()
 794+
 795+
 796+ """
 797+ Gets a subcategory count for a fixed depth of the category graph structure starting at a specified node;
 798+ loops are ignored.
677799
678 - #logging.info('CATEGORIZING PAGES: Inserting page ids into rfaulk.page_category ... ')
679 - #self.execute_SQL(sql_insert % page_id_str)
 800+ @param topic: The topic to produce a sub topic count for
 801+ @param subcategories: dictionary keyed on categories, values are subcategories
 802+ @param depth: the current depth of the recursion
 803+ @param max_depth: The maximum depth of the recursion
680804
681 - d.close()
 805+ """
 806+ def get_subcategory_count(self, topic, subcategories, depth, max_depth):
682807
 808+ topic_count = 1
 809+
 810+ try:
 811+ topic_subcategories = subcategories[topic]
 812+ new_depth = depth + 1
 813+
 814+ if depth < max_depth:
 815+ for sub_topic in topic_subcategories:
 816+ topic_count = topic_count + self.get_subcategory_count(sub_topic, subcategories, new_depth, max_depth)
 817+ except:
 818+ # logging.info('No subcategories of %s' % topic)
 819+ pass
 820+
 821+ return topic_count
 822+
683823 """
684824 Inherits WSORSlaveDataLoader
685825
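Taken together, rank_categories_M2 weights each article category by its distance from the main topic times its fanout, accumulates weight * sqrt(path length) in every top-level dimension, and orders the dimensions by ascending score. A self-contained sketch with toy inputs, where np.argsort stands in for the rankdata/argmin loop (minus its tie handling):

    import numpy as np

    TOP_LEVEL_CATS = ['Mathematics', 'Culture', 'Sports']
    MAIN_TOPIC, MAX_DEPTH = 'Main_topic_classifications', 50

    def rank_categories_m2(categories, shortest_paths, topic_counts):
        scores = np.zeros(len(TOP_LEVEL_CATS))
        for category in categories:
            dist_from_main = shortest_paths.get(MAIN_TOPIC, {}).get(category, MAX_DEPTH)
            fanout = float(topic_counts.get(category, 1.0))
            weight = dist_from_main * fanout
            for i, tl_cat in enumerate(TOP_LEVEL_CATS):
                path_length = shortest_paths.get(tl_cat, {}).get(category, 100)
                scores[i] += weight * np.sqrt(path_length)
        # smallest accumulated score ranks first
        return [TOP_LEVEL_CATS[i] for i in np.argsort(scores)]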

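And a compact equivalent of get_subcategory_count, where a topic missing from the subcategories dict is treated as a leaf (the role of the bare except above) and max_depth bounds any cycles:

    def get_subcategory_count(topic, subcategories, depth=0, max_depth=5):
        topic_count = 1
        if depth < max_depth:
            for sub_topic in subcategories.get(topic, []):
                topic_count += get_subcategory_count(sub_topic, subcategories,
                                                     depth + 1, max_depth)
        return topic_count

    subcats = {'Mathematics': ['Algebra', 'Geometry'], 'Algebra': ['Group_theory']}
    assert get_subcategory_count('Mathematics', subcats) == 4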
Status & tagging log