Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py |
— | — | @@ -13,7 +13,7 @@ |
14 | 14 | |
15 | 15 | """ Import python base modules """ |
16 | 16 | import sys, getopt, re, datetime, logging, MySQLdb, operator, pickle, shelve, random |
17 | | -import networkx as nx |
| 17 | +import networkx as nx, numpy as np, scipy.stats as ss |
18 | 18 | |
19 | 19 | """ Import Analytics modules """ |
20 | 20 | from Fundraiser_Tools.classes.DataLoader import DataLoader |
— | — | @@ -98,8 +98,11 @@ |
99 | 99 | """ |
100 | 100 | class CategoryLoader(WSORSlaveDataLoader): |
101 | 101 | |
102 | | - def __init__(self): |
| 102 | + def __init__(self, subcategories): |
103 | 103 | |
| 104 | + logging.info('Creating CategoryLoader') |
| 105 | + WSORSlaveDataLoader.__init__(self) |
| 106 | + |
104 | 107 | self.__DEBUG__ = True |
105 | 108 | |
106 | 109 | self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'" |
— | — | @@ -116,19 +119,26 @@ |
117 | 120 | |
118 | 121 | self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));" |
119 | 122 | self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;" |
| 123 | + self._query_names_['index1_page_category'] = "create index idx_page_id on rfaulk.page_category (page_id);" |
| 124 | + self._query_names_['index2_page_category'] = "create index idx_page_title on rfaulk.page_category (page_title);" |
| 125 | + self._query_names_['index3_page_category'] = "create index idx_category on rfaulk.page_category (category);" |
120 | 126 | self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;" |
121 | 127 | |
| 128 | + self._regexp_list_ = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]', '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]', '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', \ |
| 129 | + '^[Ss]', '^[Tt]', '^[Uu]', '^[Vv]', '^[Ww]', '^[Xx]', '^[Yy]', '^[Zz]', '^[^A-Za-z]'] |
122 | 130 | |
123 | | - WSORSlaveDataLoader.__init__(self) |
124 | | - logging.info('Creating CategoryLoader') |
| 131 | + self._max_depth_ = 50 |
| 132 | + self._main_topic_ = 'Main_topic_classifications' |
| 133 | + self._top_level_cats_ = subcategories[self._main_topic_][:] |
| 134 | + self._top_level_cats_.remove('Chronology') |
125 | 135 | |
126 | | - self._max_depth_ = 100 |
127 | | - self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places'] |
| 136 | + |
| 137 | + #self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'People', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places'] |
128 | 138 | # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports'] |
129 | 139 | self._block_words_ = ['categories', 'Categories', 'topic', 'Topic'] |
130 | 140 | self._block_cats_ = [''] |
131 | 141 | self._topic_trees_ = dict() |
132 | | - |
| 142 | + |
133 | 143 | |
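The `_regexp_list_` added above partitions page titles by their first character so the categorization pass (`determine_all_page_categories`, below) can run in batches. A minimal sketch, not part of the patch, of the invariant the list relies on: each title matches exactly one pattern.

```python
# Illustrative check only (not part of the patch): every page title should
# fall into exactly one batch defined by _regexp_list_.
import re

regexp_list = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]',
               '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]',
               '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', '^[Ss]', '^[Tt]', '^[Uu]',
               '^[Vv]', '^[Ww]', '^[Xx]', '^[Yy]', '^[Zz]', '^[^A-Za-z]']

for title in ['Probability', 'zebra', '2001_in_film']:
    matches = [r for r in regexp_list if re.search(r, title)]
    assert len(matches) == 1   # each title belongs to exactly one batch
```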
134 | 144 | """ |
135 | 145 | Retrieves all rows out of the category links table |
— | — | @@ -150,30 +160,23 @@ |
151 | 161 | """ |
152 | 162 | Extract the categories for a given article |
153 | 163 | """ |
154 | | - def get_page_categories(self, page_id_list): |
| 164 | + def get_page_categories(self, page_id): |
155 | 165 | |
156 | | - categories = dict() |
| 166 | + categories = list() |
157 | 167 | where_clause = '' |
158 | 168 | |
159 | | - """ Initialize category lists for each page """ |
160 | | - for id in page_id_list: |
161 | | - categories[id] = list() |
162 | | - |
| 169 | + """ Execute SQL query to retrieve categories for page of page id """ |
163 | 170 | try: |
164 | 171 | |
165 | | - for id in page_id_list: |
166 | | - where_clause = where_clause + 'cl_from = %s or ' % str(id) |
167 | | - where_clause = where_clause[:-4] |
168 | | - |
| 172 | + where_clause = where_clause + 'cl_from = %s' % str(page_id) |
169 | 173 | sql = self._query_names_['get_page_categories'] % where_clause |
170 | | - |
171 | | - logging.info('Retrieving page categories ...') |
| 174 | + |
172 | 175 | results = self.execute_SQL(sql) |
173 | 176 | |
174 | 177 | """ walk through results and add to category lists """ |
175 | 178 | for row in results: |
176 | 179 | id = int(row[0]) |
177 | | - categories[id].append(row[1]) |
| 180 | + categories.append(row[1]) |
178 | 181 | |
179 | 182 | except Exception as inst: |
180 | 183 | |
— | — | @@ -182,7 +185,7 @@ |
183 | 186 | logging.error(str(inst.args)) # arguments stored in .args |
184 | 187 | logging.error(inst.__str__()) # __str__ allows args to printed directly |
185 | 188 | |
186 | | - return {} |
| 189 | + return [] |
187 | 190 | |
188 | 191 | return categories |
189 | 192 | |
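`get_page_categories` now takes a single page id and returns a flat list instead of a per-page dictionary. A sketch of the clause it builds; the SELECT template here is assumed for illustration (the real one lives in `_query_names_['get_page_categories']`).

```python
# Sketch of the per-page WHERE clause built above; the SELECT template is an
# assumption for illustration purposes.
page_id = 12345
where_clause = 'cl_from = %s' % str(page_id)
sql = "select cl_from, cl_to from enwiki.categorylinks where %s" % where_clause
# -> select cl_from, cl_to from enwiki.categorylinks where cl_from = 12345
```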
— | — | @@ -211,7 +214,7 @@ |
212 | 215 | """ |
213 | 216 | def get_page_title(self, page_id): |
214 | 217 | |
215 | | - logging.info('Getting page titles ...') |
| 218 | + # logging.info('Getting page titles ...') |
216 | 219 | is_list = isinstance(page_id, (list)) |
217 | 220 | |
218 | 221 | try: |
— | — | @@ -232,13 +235,18 @@ |
233 | 236 | title = dict() |
234 | 237 | |
235 | 238 | for row in results: |
236 | | - title[int(row[0])] = str(row[1]) |
237 | | - |
| 239 | + |
| 240 | + try: |
| 241 | + title[int(row[0])] = str(row[1]) |
| 242 | + |
| 243 | + except: |
| 244 | + logging.error('Could not retrieve page_title for %s.' % row) |
| 245 | + pass |
| 246 | + |
238 | 247 | except Exception as inst: |
239 | 248 | |
240 | | - logging.error('Could not retrieve page_title for page_id = %s.' % page_id) |
241 | | - self._log_file.write('Could not retrieve page_title for page_id = %s.\n' % (page_id)) |
242 | | - |
| 249 | + logging.error('Could not retrieve page_title for page_id = %s.' % str(page_id)) |
| 250 | + |
243 | 251 | return '' |
244 | 252 | |
245 | 253 | return title |
— | — | @@ -254,8 +262,6 @@ |
255 | 263 | #self.drop_category_links_cp_table() |
256 | 264 | #self.create_category_links_cp_table() |
257 | 265 | |
258 | | - self._log_file = open('category_miner.log', 'w') |
259 | | - |
260 | 266 | """ Create graph """ |
261 | 267 | |
262 | 268 | logging.info('Initializing directed graph...') |
— | — | @@ -304,11 +310,10 @@ |
305 | 311 | |
306 | 312 | directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)]) |
307 | 313 | |
308 | | - if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'): |
| 314 | + #if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'): |
309 | 315 | #if self.__DEBUG__ and count % 1000 == 0 : |
310 | 316 | |
311 | | - logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to)) |
312 | | - self._log_file.write('%s: %s -> %s\n' % (str(count), cl_from, cl_to)) |
| 317 | + # logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to)) |
313 | 318 | |
314 | 319 | count = count + 1 |
315 | 320 | |
— | — | @@ -320,7 +325,6 @@ |
321 | 326 | in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees, in_degrees, out_degrees) |
322 | 327 | |
323 | 328 | logging.info('Category links finished processing.') |
324 | | - self._log_file.close() |
325 | 329 | |
326 | 330 | return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only |
327 | 331 | |
— | — | @@ -419,26 +423,30 @@ |
420 | 424 | |
421 | 425 | """ Create graph """ |
422 | 426 | |
423 | | - logging.info('Initializing directed graph...') |
| 427 | + logging.info('Initializing directed graph and topic counts ...') |
424 | 428 | graph = nx.Graph() |
| 429 | + topic_counts = dict() |
425 | 430 | self._count_ = 1 |
426 | 431 | |
427 | | - subcategories['top_level_categories'] = self._top_level_cats_ |
428 | | - topic = 'top_level_categories' |
429 | | - |
430 | 432 | depth = 0 |
431 | | - logging.info('Recursively contructing graph, MAX DEPTH = %s ...' % self._max_depth_) |
432 | | - shortest_paths, topic_counts = self._recursive_construct_topic_tree(graph, topic, subcategories, depth) |
| 433 | + logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_) |
| 434 | + shortest_paths = self._recursive_construct_topic_tree(graph, self._main_topic_, subcategories, depth) |
433 | 435 | |
434 | | - |
435 | | - """ Pickle the result """ |
436 | | - #logging.info('Pickling the shortest paths ...') |
437 | | - #self.pickle_var(shortest_paths, 'shortest_paths.p') |
438 | | - |
| 436 | + max_depth = 5 |
| 437 | + logging.info('Computing recursive sub-category counts, MAX DEPTH = %s ...' % max_depth) |
| 438 | + count = 0 |
| 439 | + for title in shortest_paths[shortest_paths.keys()[0]].keys(): |
| 440 | + |
| 441 | + # logging.info('topic counts processed for %s ...' % title) |
| 442 | + topic_counts[title] = self.get_subcategory_count(title, subcategories, 0, max_depth) |
| 443 | + count = count + 1 |
| 444 | + |
439 | 445 | """ Shelve the result """ |
440 | 446 | logging.info('Shelve the shortest paths ...') |
441 | | - d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s') |
| 447 | + d = shelve.open( settings.__data_file_dir__ + 'topic_tree.s') |
442 | 448 | d['shortest_paths'] = shortest_paths |
| 449 | + d['topic_counts'] = topic_counts |
| 450 | + |
443 | 451 | d.close() |
444 | 452 | |
445 | 453 | return graph, shortest_paths |
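`construct_topic_tree` now shelves both the shortest paths and the per-category fanout counts under `topic_tree.s`. A self-contained sketch of that round-trip; the file name matches the patch, the data is invented.

```python
# Toy round-trip of the shelve persistence used above; data is invented.
import shelve

d = shelve.open('topic_tree.s')
d['shortest_paths'] = {'Main_topic_classifications': {'Science': 2}}
d['topic_counts'] = {'Science': 42}
d.close()

d = shelve.open('topic_tree.s')
print(d['shortest_paths']['Main_topic_classifications'])   # {'Science': 2}
print(d['topic_counts']['Science'])                        # 42
d.close()
```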
— | — | @@ -454,7 +462,6 @@ |
455 | 463 | """ |
456 | 464 | def _recursive_construct_topic_tree(self, graph, topic, subcategories, depth): |
457 | 465 | |
458 | | - topic_counts = 1 |
459 | 466 | depth = depth + 1 |
460 | 467 | self._count_ = self._count_ + 1 |
461 | 468 | |
— | — | @@ -483,30 +490,25 @@ |
484 | 491 | |
485 | 492 | """ Recursively build linkages for each subcategory. |
486 | 493 | DFS determines the topic tree """ |
487 | | - for sub_topic in topic_subcategories: |
488 | | - |
489 | | - if depth == 1: |
490 | | - logging.info('Processing top level catgory: %s' % sub_topic) |
491 | | - |
492 | | - if not(graph.has_node(sub_topic)): |
493 | | - |
494 | | - graph.add_edge(topic, sub_topic) |
| 494 | + """ Only go deeper if the maximum recursive depth has not been reached """ |
| 495 | + if depth < self._max_depth_: |
| 496 | + for sub_topic in topic_subcategories: |
495 | 497 | |
496 | | - """ Only go deeper if the maximum recursive depth has not been reached """ |
497 | | - if depth < self._max_depth_: |
498 | | - sub_topic_counts = self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth) |
| 498 | + if depth == 1: |
| 499 | + logging.info('Processing top level category: %s' % sub_topic) |
| 500 | + |
| 501 | + """ Check if the subtopic node is already in the graph - if so add a loop edge """ |
| 502 | + if not(graph.has_node(sub_topic)): |
| 503 | + |
| 504 | + graph.add_edge(topic, sub_topic) |
| 505 | + self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth) |
| 506 | + |
499 | 507 | else: |
500 | | - sub_topic_counts = 1 |
501 | 508 | |
502 | | - topic_counts = topic_counts + sub_topic_counts |
503 | | - |
504 | | - else: |
505 | | - |
506 | | - """ Add the 'loop' edge if and only if it is not a top level catagory """ |
507 | | - if not(sub_topic in self._top_level_cats_): |
508 | | - graph.add_edge(topic, sub_topic) |
509 | | - topic_counts = topic_counts + 1 |
510 | | - |
| 509 | + """ Add the 'loop' edge if and only if it is not a top level category """ |
| 510 | + if not(sub_topic in self._top_level_cats_): |
| 511 | + graph.add_edge(topic, sub_topic) |
| 512 | + |
511 | 513 | """ After the recursion is complete compute the shortest paths """ |
512 | 514 | if depth == 1: |
513 | 515 | |
— | — | @@ -519,10 +521,14 @@ |
520 | 522 | """ Store the lengths rather than the paths """ |
521 | 523 | for target in shortest_paths[sub_topic]: |
522 | 524 | shortest_paths[sub_topic][target] = len(shortest_paths[sub_topic][target]) |
| 525 | + |
| 526 | + """ Get the shortest path lengths for the main topic also """ |
| 527 | + shortest_paths[self._main_topic_] = nx.single_source_dijkstra_path(graph, self._main_topic_) |
| 528 | + for target in shortest_paths[self._main_topic_]: |
| 529 | + shortest_paths[self._main_topic_][target] = len(shortest_paths[self._main_topic_][target]) |
523 | 530 | |
524 | | - return shortest_paths, topic_counts |
| 531 | + return shortest_paths |
525 | 532 | |
526 | | - return topic_counts |
527 | 533 | |
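Note that the lengths stored above are `len(path)`, which counts nodes, so the source maps to 1 and a direct neighbour to 2; `nx.single_source_dijkstra_path_length` would give edge counts instead. A toy illustration:

```python
# Toy illustration of the node-count convention used above.
import networkx as nx

g = nx.Graph()
g.add_edge('Main_topic_classifications', 'Science')
g.add_edge('Science', 'Physics')

paths = nx.single_source_dijkstra_path(g, 'Main_topic_classifications')
node_counts = dict((t, len(p)) for t, p in paths.items())
# {'Main_topic_classifications': 1, 'Science': 2, 'Physics': 3}

edge_counts = nx.single_source_dijkstra_path_length(g, 'Main_topic_classifications')
# {'Main_topic_classifications': 0, 'Science': 1, 'Physics': 2}
```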
528 | 534 | """ |
529 | 535 | Pickles variables that store the state of the category graph |
— | — | @@ -539,90 +545,191 @@ |
540 | 546 | self.pickle_var(out_only, 'out_only.p') |
541 | 547 | |
542 | 548 | """ |
| 549 | + Given a set of article ids determine their corresponding representation in category space |
| 550 | + |
543 | 551 | @param page_ids: a list of pages to classify |
| 552 | + @param shortest_paths: a dictionary of the shortest paths from each top level category to its subcategories |
544 | 553 | """ |
545 | | - def find_top_level_category(self, page_ids, shortest_paths): |
| 554 | + def find_top_level_category(self, page_ids, shortest_paths, topic_counts): |
546 | 555 | |
547 | | - # self._topic_trees_ = dict() |
548 | 556 | titles = dict() |
549 | | - depths = dict() |
550 | 557 | page_categories = dict() |
551 | 558 | page_tl_cat = dict() |
552 | | - cat_winner = dict() |
553 | | - win_count = dict() |
| 559 | + # win_count = dict() |
554 | 560 | |
| 561 | + self._num_tl_cats_ = len(self._top_level_cats_) |
| 562 | + tl_cat_vectors = dict() |
| 563 | + |
555 | 564 | """ Get categories for pages - Initialize depth dictionaries for top level categories """ |
556 | 565 | logging.info('Initializing data structures ...') |
557 | | - page_categories = self.get_page_categories(page_ids) |
558 | | - titles = self.get_page_title(page_ids) |
559 | 566 | |
560 | | - for page_id in page_ids: |
561 | | - # page_categories[page_id] = self.get_page_categories(page_id) |
562 | | - title = titles[page_id] |
563 | | - depths[title] = dict() |
| 567 | + |
| 568 | + logging.info('Getting page titles and categories ...') |
| 569 | + counter = 0 |
| 570 | + for id in page_ids: |
| 571 | + titles[id] = self.get_page_title(id) |
| 572 | + page_categories[id] = self.get_page_categories(id) |
564 | 573 | |
565 | | - """ Initialize dictionaries to store the depth scores for top level categories """ |
566 | | - for category in page_categories[page_id]: |
567 | | - depths[title][category] = dict() |
568 | | - |
| 574 | + counter = counter + 1 |
| 575 | + |
| 576 | + if counter % 100000 == 0: |
| 577 | + logging.info('%s page titles and categories processed ...' % counter) |
| 578 | + |
569 | 579 | """ Iterate through each page, category, and top level category |
570 | 580 | Perform a breadth first search for the node to determine the dept """ |
571 | 581 | logging.info('Finding category depths in each topic tree ...') |
572 | 582 | |
573 | 583 | for page_id in page_ids: |
574 | | - |
575 | | - # logging.info('For %s classifying categories: %s...' % (title, str(page_categories[page_id]))) |
| 584 | + |
| 585 | + """ |
| 586 | + Classify each page's categories using one of the available classification methods |
| 587 | + rank_categories_M1() |
| 588 | + rank_categories_M2() |
| 589 | + """ |
576 | 590 | title = titles[page_id] |
577 | | - cat_winner[title] = dict() |
| 591 | + page_tl_cat[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts) |
| 592 | + |
578 | 593 | |
579 | | - """ Initialize the number of top level categorizations for each top level category """ |
580 | | - win_count[title] = dict() |
581 | | - for tl_cat in self._top_level_cats_: |
582 | | - win_count[title][tl_cat] = 0 |
| 594 | + return titles, page_tl_cat #, tl_cat_vectors # , depths, cat_winner |
| 595 | + |
| 596 | + """ |
| 597 | + Method for determining top level category for a set of categories |
| 598 | + |
| 599 | + This method finds the closest top level category for each category and chooses the top level category or categories that appear most often. |
| 600 | + |
| 601 | + @param categories: String list of categories to classify |
| 602 | + @param shortest_paths: dictionary of shortest paths indexed by categories |
| 603 | + |
| 604 | + @return: String indicating the top level category(ies) |
| 605 | + |
| 606 | + """ |
| 607 | + def rank_categories_M1(self, categories, shortest_paths): |
| 608 | + |
| 609 | + """ Initialize the number of top level categorizations for each top level category """ |
| 610 | + win_count = dict() |
| 611 | + for tl_cat in self._top_level_cats_: |
| 612 | + win_count[tl_cat] = 0 |
| 613 | + |
| 614 | + for category in categories: |
583 | 615 | |
584 | | - """ Go through each category for a page and find out which top level cat is closest """ |
585 | | - for category in page_categories[page_id]: |
| 616 | + cat_winner = list() |
| 617 | + min_depth = self._max_depth_ |
| 618 | + |
| 619 | + for index, tl_cat in enumerate(self._top_level_cats_): |
| 620 | + # for tl_cat in self._top_level_cats_: |
586 | 621 | |
587 | | - cat_winner[title][category] = list() |
588 | | - min_depth = self._max_depth_ |
589 | | - for tl_cat in self._top_level_cats_: |
590 | | - |
591 | | - """ Use shortest paths """ |
592 | | - try: |
593 | | - depths[title][category][tl_cat] = shortest_paths[tl_cat][category] |
594 | | - except KeyError: |
595 | | - depths[title][category][tl_cat] = 99 |
596 | | - |
597 | | - if depths[title][category][tl_cat] < min_depth: |
598 | | - cat_winner[title][category].append(tl_cat) |
599 | | - min_depth = depths[title][category][tl_cat] |
600 | | - elif depths[title][category][tl_cat] == min_depth: |
601 | | - cat_winner[title][category].append(tl_cat) # there can only be one winner |
| 622 | + """ Use shortest paths """ |
| 623 | + try: |
| 624 | + path_length = shortest_paths[tl_cat][category] |
602 | 625 | |
603 | | - """ Randomly choose to tie breakers """ |
604 | | - if len(cat_winner[title][category]) > 0: |
605 | | - random.shuffle(cat_winner[title][category]) |
606 | | - cat_winner[title][category] = cat_winner[title][category][0] |
607 | | - else: |
608 | | - cat_winner[title][category] = None |
609 | | - |
610 | | - winner = cat_winner[title][category] # this a top level category |
611 | | - if not(winner == None): |
612 | | - win_count[title][winner] = win_count[title][winner] + 1 |
| 626 | + except KeyError: |
| 627 | + path_length = self._max_depth_ |
| 628 | + |
| 629 | + if path_length < min_depth: |
| 630 | + cat_winner = [tl_cat] |
| 631 | + min_depth = path_length |
| 632 | + elif path_length == min_depth: |
| 633 | + cat_winner.append(tl_cat) # collect ties - broken randomly below |
613 | 634 | |
614 | | - """ Classify the top level categories for each page """ |
615 | | - page_tl_cat[title] = None |
616 | | - best_count = 0 |
617 | | - for tl_cat in self._top_level_cats_: |
618 | | - if win_count[title][tl_cat] > best_count: |
619 | | - page_tl_cat[title] = tl_cat |
620 | | - best_count = win_count[title][tl_cat] |
621 | | - elif win_count[title][tl_cat] == best_count and best_count > 0: |
622 | | - page_tl_cat[title] = page_tl_cat[title] + ' / ' + tl_cat |
623 | | - |
624 | | - return titles, page_tl_cat # , depths, cat_winner |
| 635 | + """ Randomly choose to tie breakers """ |
| 636 | + if len(cat_winner) > 0: |
| 637 | + random.shuffle(cat_winner) |
| 638 | + cat_winner = cat_winner[0] |
| 639 | + else: |
| 640 | + cat_winner = None |
| 641 | + |
| 642 | + winner = cat_winner # this is a top level category |
| 643 | + if not(winner == None): |
| 644 | + win_count[winner] = win_count[winner] + 1 |
| 645 | + |
| 646 | + |
| 647 | + """ Classify the top level categories for each page """ |
625 | 648 | |
| 649 | + page_tl_cat, best_count = None, 0 |
| 650 | + |
| 651 | + for tl_cat in self._top_level_cats_: |
| 652 | + if win_count[tl_cat] > best_count: |
| 653 | + page_tl_cat = tl_cat |
| 654 | + best_count = win_count[tl_cat] |
| 655 | + elif win_count[tl_cat] == best_count and best_count > 0: |
| 656 | + page_tl_cat = page_tl_cat + ' / ' + tl_cat |
| 657 | + |
| 658 | + return page_tl_cat |
| 659 | + |
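In short, M1 gives each page category one vote for its closest top level category (ties broken at random) and the most-voted top level category wins. A toy version of that vote with invented data:

```python
# Toy majority vote in the spirit of rank_categories_M1; data is invented.
top_level_cats = ['Science', 'Culture', 'Sports']
# closest top level category per page category, after random tie breaking
winners = ['Science', 'Science', 'Culture']

win_count = dict((c, winners.count(c)) for c in top_level_cats)
page_tl_cat = max(win_count, key=win_count.get)
print(page_tl_cat)   # Science
```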
626 | 660 | """ |
| 661 | + Method for determining top level category for a set of categories |
| 662 | + |
| 663 | + This method looks at the closest top level categories for each category and constructs vector representations based on path lengths. The dimensions of the |
| 664 | + vector space are the top level categories. The value for a given dimension is determined by the sum of the path lengths from each category to the top-level |
| 665 | + category, where each summand is weighted by the distance of the category from the main topic and by its subcategory fanout. |
| 666 | + |
| 667 | + @param categories: String list of categories to classify |
| 668 | + @param shortest_paths: dictionary of shortest paths indexed by categories |
| 669 | + |
| 670 | + @return: String indicating the top level category(ies) |
| 671 | + """ |
| 672 | + def rank_categories_M2(self, categories, shortest_paths, topic_counts): |
| 673 | + |
| 674 | + """ Go through each category for a page and find out which top level cat is closest """ |
| 675 | + |
| 676 | + tl_cat_vectors = np.zeros(len(self._top_level_cats_)) # initialize the vector in top-level category space |
| 677 | + page_tl_cat = [0] * len(self._top_level_cats_) # the top level cats |
| 678 | + |
| 679 | + for category in categories: |
| 680 | + |
| 681 | + """ Compute the weight of this category """ |
| 682 | + try: |
| 683 | + path_length_from_main = shortest_paths[self._main_topic_][category] |
| 684 | + except: |
| 685 | + path_length_from_main = self._max_depth_ |
| 686 | + |
| 687 | + """ The fanout weight is based on the fanout of a category for a fixed depth .. if there is no fanout for this topic it probably has vary few or no |
| 688 | + subtopics so assign a fanout of 1.0 """ |
| 689 | + try: |
| 690 | + fanout_weight = float(topic_counts[category]) |
| 691 | + except: |
| 692 | + fanout_weight = 1.0 |
| 693 | + pass |
| 694 | + |
| 695 | + """ The total weight of this category depends on the product of how far it lies from the root and the inverse of its fanout |
| 696 | + this makes more specialized categories worth more """ |
| 697 | + category_weight = float(path_length_from_main) * fanout_weight |
| 698 | + |
| 699 | + # category_weight = 1 / float(path_length_from_main) * self._max_depth_ |
| 700 | + |
| 701 | + cat_winner = list() |
| 702 | + min_depth = self._max_depth_ |
| 703 | + |
| 704 | + for index, tl_cat in enumerate(self._top_level_cats_): |
| 705 | + |
| 706 | + """ Use shortest paths """ |
| 707 | + try: |
| 708 | + path_length = shortest_paths[tl_cat][category] |
| 709 | + except KeyError: |
| 710 | + path_length = 100 |
| 711 | + |
| 712 | + tl_cat_vectors[index] = tl_cat_vectors[index] + category_weight * np.power(path_length, 0.5) |
| 713 | + |
| 714 | + """ Normalize the vector representation """ |
| 715 | + ranks = np.floor(ss.rankdata(tl_cat_vectors)) - 1 |
| 716 | + vec = max(tl_cat_vectors) - tl_cat_vectors |
| 717 | + tl_cat_vectors = vec / float(sum(vec)) |
| 718 | + |
| 719 | + """ Choose the top categories """ |
| 720 | + for i in range(self._num_tl_cats_): |
| 721 | + index = np.argmin(ranks) |
| 722 | + ranks[index] = self._num_tl_cats_ + 1 |
| 723 | + page_tl_cat[i] = self._top_level_cats_[index] |
| 724 | + |
| 725 | + top_five_cats = '' |
| 726 | + for i in range(5): |
| 727 | + top_five_cats = top_five_cats + page_tl_cat[i] + ', ' |
| 728 | + top_five_cats = top_five_cats[:-2] |
| 729 | + page_tl_cat = top_five_cats |
| 730 | + |
| 731 | + return page_tl_cat |
| 732 | + |
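A worked toy run of the normalization and ranking step in `rank_categories_M2`, with three hypothetical top level categories and invented distance sums (smaller means closer):

```python
# Worked toy example of the M2 ranking step; all values are invented.
import numpy as np
import scipy.stats as ss

top_level_cats = ['Science', 'Culture', 'Sports']
tl_cat_vectors = np.array([4.0, 9.0, 25.0])         # summed weighted distances

ranks = np.floor(ss.rankdata(tl_cat_vectors)) - 1   # [0., 1., 2.]
vec = max(tl_cat_vectors) - tl_cat_vectors          # [21., 16., 0.]
normalized = vec / float(sum(vec))                  # [0.57, 0.43, 0.], larger = closer

order = []
for i in range(len(top_level_cats)):                # argmin walk, as in the patch
    index = np.argmin(ranks)
    ranks[index] = len(top_level_cats) + 1
    order.append(top_level_cats[index])

print(order)   # ['Science', 'Culture', 'Sports']
```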
| 733 | + """ |
627 | 734 | Builds a table containing all main namespace pages and their chosen categories |
628 | 735 | """ |
629 | 736 | def determine_all_page_categories(self): |
— | — | @@ -634,51 +741,84 @@ |
635 | 742 | logging.info('CATEGORIZING PAGES: Initializing tables ... ') |
636 | 743 | self.execute_SQL(sql_drop) |
637 | 744 | self.execute_SQL(sql_create) |
| 745 | + self.execute_SQL(self._query_names_['index1_page_category']) |
| 746 | + self.execute_SQL(self._query_names_['index2_page_category']) |
| 747 | + self.execute_SQL(self._query_names_['index3_page_category']) |
638 | 748 | |
639 | | - logging.info('CATEGORIZING PAGES: Getting all pages ... ') |
640 | | - sql_get_page_ids = self._query_names_['get_all_page_ids'] |
641 | | - results = self.execute_SQL(sql_get_page_ids) |
642 | | - |
643 | | - page_ids = list() |
644 | | - for row in results: |
645 | | - page_ids.append(int(row[0])) |
646 | | - |
647 | 749 | logging.info('CATEGORIZING PAGES: Unshelving shortest paths ... ') |
648 | | - # shortest_paths = self.unpickle_var('shortest_paths.p') |
| 750 | + d = shelve.open( settings.__data_file_dir__ + 'topic_tree.s') |
649 | 751 | |
650 | | - d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s') |
651 | 752 | shortest_paths = d['shortest_paths'] |
| 753 | + topic_counts = d['topic_counts'] |
652 | 754 | |
653 | | - logging.info('CATEGORIZING PAGES: Computing categories ... ') |
654 | | - titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths) |
655 | | - ids = dict((i,v) for v,i in titles.iteritems()) |
656 | | - |
657 | | - logging.info('CATEGORIZING PAGES: Performing inserts ... ') |
658 | | - page_id_str = '' |
659 | | - for title in page_tl_cat: |
660 | | - id = ids[title] |
661 | | - category = page_tl_cat[title] |
| 755 | + """ |
| 756 | + Break up processing to handle records with page titles matching the regular expressions in _regexp_list_ |
| 757 | + """ |
| 758 | + for regexp in self._regexp_list_: |
662 | 759 | |
663 | | - parts = title.split("'") |
664 | | - new_title = parts[0] |
665 | | - parts = parts[1:] |
666 | | - for part in parts: |
667 | | - new_title = new_title + " " + part |
668 | | - |
669 | | - page_id_str = "(%s,'%s','%s')" % (id, new_title, category) |
670 | | - try: |
671 | | - self.execute_SQL(sql_insert % page_id_str) |
672 | | - except: |
673 | | - logging.info('Could not insert: %s ... ' % new_title) |
674 | | - pass |
675 | | - # page_ids.append(str(row[0])) |
676 | | - # page_id_str = page_id_str[:-1] |
| 760 | + logging.info('CATEGORIZING PAGES: Getting pages for %s ... ' % regexp) |
| 761 | + |
| 762 | + sql_get_page_ids = self._query_names_['get_all_page_ids'] + " and page_title regexp '%s';" % regexp |
| 763 | + results = self.execute_SQL(sql_get_page_ids) |
| 764 | + |
| 765 | + page_ids = list() |
| 766 | + for row in results: |
| 767 | + page_ids.append(int(row[0])) |
| 768 | + |
| 769 | + logging.info('CATEGORIZING PAGES: Computing categories ... ') |
| 770 | + titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths, topic_counts) |
| 771 | + ids = dict((i,v) for v,i in titles.iteritems()) |
| 772 | + |
| 773 | + logging.info('CATEGORIZING PAGES: Performing inserts ... ') |
| 774 | + page_id_str = '' |
| 775 | + for title in page_tl_cat: |
| 776 | + id = ids[title] |
| 777 | + category = page_tl_cat[title] |
| 778 | + |
| 779 | + parts = title.split("'") |
| 780 | + new_title = parts[0] |
| 781 | + parts = parts[1:] |
| 782 | + for part in parts: |
| 783 | + new_title = new_title + " " + part |
| 784 | + |
| 785 | + page_id_str = "(%s,'%s','%s')" % (id, new_title, category) |
| 786 | + try: |
| 787 | + self.execute_SQL(sql_insert % page_id_str) |
| 788 | + except: |
| 789 | + logging.info('Could not insert: %s ... ' % new_title) |
| 790 | + pass |
| 791 | + |
| 792 | + |
| 793 | + d.close() |
| 794 | + |
| 795 | + |
| 796 | + """ |
| 797 | + Gets a subcategory count for a fixed depth of the category graph structure starting at a specified node; |
| 798 | + the recursion is capped at max_depth, so category loops cannot cause infinite descent |
677 | 799 | |
678 | | - #logging.info('CATEGORIZING PAGES: Inserting page ids into rfaulk.page_category ... ') |
679 | | - #self.execute_SQL(sql_insert % page_id_str) |
| 800 | + @param topic: The topic to produce a sub topic count for |
| 801 | + @param subcategories: dictionary keyed on categories, values are subcategories |
| 802 | + @param depth: the current depth of the recursion |
| 803 | + @param max_depth: The maximum depth of the recursion |
680 | 804 | |
681 | | - d.close() |
| 805 | + """ |
| 806 | + def get_subcategory_count(self, topic, subcategories, depth, max_depth): |
682 | 807 | |
| 808 | + topic_count = 1 |
| 809 | + |
| 810 | + try: |
| 811 | + topic_subcategories = subcategories[topic] |
| 812 | + new_depth = depth + 1 |
| 813 | + |
| 814 | + if depth < max_depth: |
| 815 | + for sub_topic in topic_subcategories: |
| 816 | + topic_count = topic_count + self.get_subcategory_count(sub_topic, subcategories, new_depth, max_depth) |
| 817 | + except: |
| 818 | + # logging.info('No subcategories of %s' % topic) |
| 819 | + pass |
| 820 | + |
| 821 | + return topic_count |
| 822 | + |
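A quick standalone check of the fanout count; the method body is restated without the class so the example runs without a database, and the subcategory data is invented:

```python
# Toy check of the recursive fanout count above; data is invented.
subcategories = {
    'Science': ['Physics', 'Biology'],
    'Physics': ['Optics'],
}

def get_subcategory_count(topic, subcategories, depth, max_depth):
    topic_count = 1
    if depth < max_depth:
        for sub_topic in subcategories.get(topic, []):
            topic_count += get_subcategory_count(sub_topic, subcategories, depth + 1, max_depth)
    return topic_count

print(get_subcategory_count('Science', subcategories, 0, 5))   # 1+1+1+1 = 4
```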
683 | 823 | """ |
684 | 824 | Inherits WSORSlaveDataLoader |
685 | 825 | |