Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py |
— | — | @@ -12,18 +12,21 @@ |
13 | 13 | |
14 | 14 | |
15 | 15 | """ Import python base modules """ |
16 | | -import sys, getopt, re, datetime, logging, MySQLdb, settings, operator, pickle |
| 16 | +import sys, getopt, re, datetime, logging, MySQLdb, operator, pickle, shelve, random |
17 | 17 | import networkx as nx |
18 | 18 | |
19 | 19 | """ Import Analytics modules """ |
20 | 20 | from Fundraiser_Tools.classes.DataLoader import DataLoader |
| 21 | +import WSOR.scripts.classes.settings as settings |
21 | 22 | |
22 | 23 | """ Configure the logger """ |
23 | 24 | LOGGING_STREAM = sys.stderr |
24 | 25 | logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S') |
| 26 | +# logging.basicConfig(level=logging.DEBUG, filename="categories.log", filemode='w', format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S') |
25 | 27 | |
26 | 28 | |
27 | 29 | |
| 30 | + |
28 | 31 | """ |
29 | 32 | Inherits DataLoader |
30 | 33 | |
— | — | @@ -102,16 +105,31 @@ |
103 | 106 | self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'" |
104 | 107 | self._query_names_['drop_subcat_tbl'] = "drop table if exists rfaulk.categorylinks_cp;" |
105 | 108 | self._query_names_['get_first_rec'] = "select cl_from from categorylinks_cp limit 1" |
106 | | - self._query_names_['get_category_page_title'] = "select page_title from enwiki.page where page_id = %s" |
| 109 | + self._query_names_['get_category_page_title'] = "select page_id, page_title from enwiki.page where %s" |
107 | 110 | self._query_names_['get_category_page_id'] = "select page_id from enwiki.page where page_title = '%s' and page_namespace = 14" |
108 | 111 | self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s" |
109 | 112 | self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s" |
110 | 113 | self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1" |
111 | 114 | self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp" |
| 115 | + self._query_names_['get_page_categories'] = "select cl_from, cl_to from enwiki.categorylinks where %s order by 1" |
| 116 | + self._query_names_['get_all_page_ids'] = "select page_id from enwiki.page where page_namespace = 0 and page_len > 1000" |
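 | | +		# candidate pages for categorization: main (ns 0) namespace pages longer than 1000 bytes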
112 | 117 | |
| 118 | + self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));" |
| 119 | + self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;" |
| 120 | + self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;" |
| 121 | + |
| 122 | + |
113 | 123 | WSORSlaveDataLoader.__init__(self) |
114 | 124 | logging.info('Creating CategoryLoader') |
115 | | - |
| 125 | + |
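 | | +		# categorization settings: cap the recursion depth, seed the tree with these top level categories,
 | | +		# and filter out meta categories whose titles match the block words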
| 126 | + self._max_depth_ = 100 |
| 127 | + self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places'] |
| 128 | + # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports'] |
| 129 | + self._block_words_ = ['categories', 'Categories', 'topic', 'Topic'] |
| 130 | + self._block_cats_ = [''] |
| 131 | + self._topic_trees_ = dict() |
| 132 | + |
| 133 | + |
116 | 134 | """ |
117 | 135 | Retrieves all rows out of the category links table |
118 | 136 | """ |
— | — | @@ -129,8 +147,45 @@ |
130 | 148 | |
131 | 149 | return results |
132 | 150 | |
| 151 | + """ |
 | 152 | +	Extract the categories for each page in a list of page ids
| 153 | + """ |
| 154 | + def get_page_categories(self, page_id_list): |
| 155 | + |
| 156 | + categories = dict() |
| 157 | + where_clause = '' |
| 158 | + |
| 159 | + """ Initialize category lists for each page """ |
| 160 | + for id in page_id_list: |
| 161 | + categories[id] = list() |
| 162 | + |
| 163 | + try: |
| 164 | + |
| 165 | + for id in page_id_list: |
| 166 | + where_clause = where_clause + 'cl_from = %s or ' % str(id) |
| 167 | + where_clause = where_clause[:-4] |
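 | | +			# e.g. page ids [1325, 4091] yield "cl_from = 1325 or cl_from = 4091" (the trailing " or " is trimmed)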
| 168 | + |
| 169 | + sql = self._query_names_['get_page_categories'] % where_clause |
| 170 | + |
| 171 | + logging.info('Retrieving page categories ...') |
| 172 | + results = self.execute_SQL(sql) |
| 173 | + |
| 174 | + """ walk through results and add to category lists """ |
| 175 | + for row in results: |
| 176 | + id = int(row[0]) |
| 177 | + categories[id].append(row[1]) |
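 | | +				# categories ends up shaped like {page_id: [category titles]}, e.g. {1325: ['Probability', 'Statistics']} (illustrative)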
| 178 | + |
| 179 | + except Exception as inst: |
| 180 | + |
| 181 | + logging.error('Could not retrieve page categories.') |
| 182 | + logging.error(str(type(inst))) # the exception instance |
| 183 | + logging.error(str(inst.args)) # arguments stored in .args |
| 184 | + logging.error(inst.__str__()) # __str__ allows args to printed directly |
| 185 | + |
| 186 | + return {} |
| 187 | + |
| 188 | + return categories |
133 | 189 | |
134 | | - |
135 | 190 | """ |
136 | 191 | 	Retrieves the integer page id
137 | 192 | """ |
— | — | @@ -151,21 +206,38 @@ |
152 | 207 | |
153 | 208 | """ |
154 | 209 | 	Retrieves the string page title
| 210 | + |
 | 211 | +	Accepts either a single page id or a list of page ids
155 | 212 | """ |
156 | 213 | def get_page_title(self, page_id): |
157 | 214 | |
| 215 | + logging.info('Getting page titles ...') |
| 216 | + is_list = isinstance(page_id, (list)) |
| 217 | + |
158 | 218 | try: |
159 | | - sql = self._query_names_['get_category_page_title'] % page_id |
160 | | - #logging.info('Executing: ' + sql) |
| 219 | + if not(is_list): |
| 220 | + where_clause = 'page_id = %s' % str(page_id) |
| 221 | + else: |
| 222 | + where_clause = '' |
| 223 | + for id in page_id: |
| 224 | + where_clause = where_clause + 'page_id = %s or ' % str(id) |
| 225 | + where_clause = where_clause[:-4] |
| 226 | + |
| 227 | + sql = self._query_names_['get_category_page_title'] % where_clause |
161 | 228 | results = self.execute_SQL(sql) |
162 | | - title = str(results[0][0]) |
163 | | - |
| 229 | + |
| 230 | + if not(is_list): |
| 231 | + title = str(results[0][1]) |
| 232 | + else: |
| 233 | + title = dict() |
| 234 | + |
| 235 | + for row in results: |
| 236 | + title[int(row[0])] = str(row[1]) |
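 | | +					# when a list is passed in, title is returned as a dict keyed by page_id instead of a single string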
| 237 | + |
164 | 238 | except Exception as inst: |
165 | 239 | |
166 | | - logging.error('Could not retrieve page_title.') |
167 | | - logging.error(str(type(inst))) # the exception instance |
168 | | - logging.error(str(inst.args)) # arguments stored in .args |
169 | | - logging.error(inst.__str__()) # __str__ allows args to printed directly |
| 240 | + logging.error('Could not retrieve page_title for page_id = %s.' % page_id) |
 | 241 | +			getattr(self, '_log_file', LOGGING_STREAM).write('Could not retrieve page_title for page_id = %s.\n' % (page_id)) # fall back to stderr when the log file is not open
170 | 242 | |
171 | 243 | return '' |
172 | 244 | |
— | — | @@ -181,7 +253,9 @@ |
182 | 254 | |
183 | 255 | #self.drop_category_links_cp_table() |
184 | 256 | #self.create_category_links_cp_table() |
185 | | - |
| 257 | + |
| 258 | + self._log_file = open('category_miner.log', 'w') |
| 259 | + |
186 | 260 | """ Create graph """ |
187 | 261 | |
188 | 262 | logging.info('Initializing directed graph...') |
— | — | @@ -207,9 +281,9 @@ |
208 | 282 | """ Process subcategory links """ |
209 | 283 | for row in links: |
210 | 284 | |
211 | | - cl_from = int(row[0]) |
212 | | - cl_to = str(row[1]) |
213 | | - cl_from = self.get_page_title(cl_from) |
| 285 | + cl_from = str(row[1]) |
| 286 | + cl_to = int(row[0]) |
| 287 | + cl_to = self.get_page_title(cl_to) |
214 | 288 | |
215 | 289 | try: |
216 | 290 | subcategories[cl_from].append(cl_to) |
— | — | @@ -230,10 +304,12 @@ |
231 | 305 | |
232 | 306 | directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)]) |
233 | 307 | |
234 | | - if self.__DEBUG__ and count % 1000 == 0: |
| 308 | + if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'): |
| 309 | + #if self.__DEBUG__ and count % 1000 == 0 : |
235 | 310 | |
236 | 311 | logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to)) |
237 | | - |
| 312 | + self._log_file.write('%s: %s -> %s\n' % (str(count), cl_from, cl_to)) |
| 313 | + |
238 | 314 | count = count + 1 |
239 | 315 | |
240 | 316 | logging.info('Sorting in degree list.') |
— | — | @@ -244,12 +320,13 @@ |
245 | 321 | in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees, in_degrees, out_degrees) |
246 | 322 | |
247 | 323 | logging.info('Category links finished processing.') |
| 324 | + self._log_file.close() |
248 | 325 | |
249 | 326 | return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only |
250 | 327 | |
251 | 328 | |
252 | 329 | """ |
253 | | - Returns |
 | 330 | +	Examines the in and out degrees and builds lists of nodes that have only outgoing edges or only incoming edges
254 | 331 | """ |
255 | 332 | def get_uni_directionally_linked_categories(self, in_degrees, out_degrees, in_degrees_by_key, out_degrees_by_key ): |
256 | 333 | |
— | — | @@ -307,9 +384,7 @@ |
308 | 385 | logging.error(str(type(inst))) # the exception instance |
309 | 386 | logging.error(str(inst.args)) # arguments stored in .args |
310 | 387 | logging.error(inst.__str__()) # __str__ allows args to printed directly |
311 | | - |
312 | | - |
313 | | - |
| 388 | + |
314 | 389 | """ |
315 | 390 | Are there any records remaining in rfaulk.categorylinks_cp ?? |
316 | 391 | """ |
— | — | @@ -334,64 +409,276 @@ |
335 | 410 | return False |
336 | 411 | else: |
337 | 412 | return True |
338 | | - |
339 | | - |
| 413 | + |
340 | 414 | """ |
341 | 415 | 	Builds the category graph from the subcategory lists.
342 | 416 | 
343 | 417 | 	Shortest path lengths from each top level category are computed and shelved.
344 | 418 | """ |
345 | | - def construct_topic_tree(self, topic, subcategories): |
| 419 | + def construct_topic_tree(self, subcategories): |
346 | 420 | |
347 | 421 | """ Create graph """ |
348 | 422 | |
349 | 423 | logging.info('Initializing directed graph...') |
350 | | - directed_graph = nx.DiGraph() |
351 | | - trace = [topic] |
| 424 | + graph = nx.Graph() |
| 425 | + self._count_ = 1 |
352 | 426 | |
353 | | - topic_counts = self._recursive_construct_topic_tree(directed_graph, topic, subcategories, trace) |
| 427 | + subcategories['top_level_categories'] = self._top_level_cats_ |
| 428 | + topic = 'top_level_categories' |
354 | 429 | |
355 | | - return directed_graph, topic_counts |
| 430 | + depth = 0 |
 | 431 | +		logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
| 432 | + shortest_paths, topic_counts = self._recursive_construct_topic_tree(graph, topic, subcategories, depth) |
356 | 433 | |
| 434 | + |
| 435 | + """ Pickle the result """ |
| 436 | + #logging.info('Pickling the shortest paths ...') |
| 437 | + #self.pickle_var(shortest_paths, 'shortest_paths.p') |
| 438 | + |
| 439 | + """ Shelve the result """ |
 | 440 | +		logging.info('Shelving the shortest paths ...')
| 441 | + d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s') |
| 442 | + d['shortest_paths'] = shortest_paths |
| 443 | + d.close() |
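 | | +		# the shelved paths are reloaded later by determine_all_page_categories(), avoiding a recompute of the graph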
| 444 | + |
| 445 | + return graph, shortest_paths |
| 446 | + |
357 | 447 | """ |
358 | | - Are there any records remaining in rfaulk.categorylinks_cp ?? |
| 448 | + Recursively build the graph structure for categories based on the subcategory list |
| 449 | + |
| 450 | + @param graph: NetworkX graph structure to store category linkage |
| 451 | + @param topic: String topic name on which to build a recursive structure |
 | 452 | +	@param subcategories: Dictionary of subcategory lists
| 453 | + @param depth: integer depth of the call within the recursion |
| 454 | + |
359 | 455 | """ |
360 | | - def _recursive_construct_topic_tree(self, directed_graph, topic, subcategories, trace): |
| 456 | + def _recursive_construct_topic_tree(self, graph, topic, subcategories, depth): |
| 457 | + |
| 458 | + topic_counts = 1 |
| 459 | + depth = depth + 1 |
| 460 | + self._count_ = self._count_ + 1 |
361 | 461 | |
362 | | - topic_counts = 0 |
| 462 | + if self._count_ % 10000 == 0: |
| 463 | + logging.info('Processed %s nodes. Graph size = %s.' % (str(self._count_), str(graph.number_of_nodes()))) |
363 | 464 | |
364 | 465 | """ Extract the subtopics of topic """ |
365 | 466 | try: |
366 | | - topic_subcategories = subcategories[topic] |
367 | | - |
| 467 | + topic_subcategories = subcategories[topic] |
| 468 | + new_subcategories = topic_subcategories[:] |
| 469 | + |
| 470 | + """ Filter meta categories based on block words """ |
| 471 | + for sub_topic in topic_subcategories: |
| 472 | + for block_word in self._block_words_: |
 | 473 | +				if re.search(block_word, sub_topic) and sub_topic in new_subcategories:
 | 474 | +					new_subcategories.remove(sub_topic) # membership guard avoids removing the same entry twice
 | 475 | +			for block_cat in self._block_cats_:
 | 476 | +				if block_cat == sub_topic and sub_topic in new_subcategories:
 | 477 | +					new_subcategories.remove(sub_topic)
| 478 | + |
| 479 | + topic_subcategories = new_subcategories |
| 480 | + |
368 | 481 | except KeyError: |
369 | 482 | """ There are no subcategories for this topic """ |
370 | 483 | return 1 # there is a topic count of 1 |
371 | 484 | |
372 | | - """ Recursively build linkages for each """ |
373 | | - # logging.info(str(trace)) |
 | 485 | +		""" Recursively build linkages for each subtopic.
 | 486 | +			A depth-first traversal determines the topic tree """
374 | 487 | for sub_topic in topic_subcategories: |
375 | 488 | |
376 | | - if not(sub_topic in trace): |
| 489 | + if depth == 1: |
 | 490 | +				logging.info('Processing top level category: %s' % sub_topic)
| 491 | + |
| 492 | + if not(graph.has_node(sub_topic)): |
377 | 493 | |
378 | | - logging.info(topic + ' --> ' + sub_topic) |
| 494 | + graph.add_edge(topic, sub_topic) |
| 495 | + |
| 496 | + """ Only go deeper if the maximum recursive depth has not been reached """ |
| 497 | + if depth < self._max_depth_: |
| 498 | + sub_topic_counts = self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth) |
| 499 | + else: |
| 500 | + sub_topic_counts = 1 |
379 | 501 | |
380 | | - copy_trace = trace[:] |
381 | | - copy_trace.append(sub_topic) |
382 | | - |
383 | | - directed_graph.add_weighted_edges_from([(topic, sub_topic, 1)]) |
384 | | - sub_topic_counts = self._recursive_construct_topic_tree(directed_graph, sub_topic, subcategories, copy_trace) |
385 | | - |
386 | 502 | topic_counts = topic_counts + sub_topic_counts |
387 | 503 | |
388 | 504 | else: |
389 | 505 | |
390 | | - logging.info('LOOP: ' + topic + ' --> ' + sub_topic) |
| 506 | + """ Add the 'loop' edge if and only if it is not a top level catagory """ |
| 507 | + if not(sub_topic in self._top_level_cats_): |
| 508 | + graph.add_edge(topic, sub_topic) |
| 509 | + topic_counts = topic_counts + 1 |
| 510 | + |
 | 511 | +		""" After the recursion is complete, compute the shortest paths """
| 512 | + if depth == 1: |
| 513 | + |
| 514 | + shortest_paths = dict() |
| 515 | + |
| 516 | + for sub_topic in self._top_level_cats_: |
| 517 | + logging.info('Computing shortest paths for %s ...' % sub_topic) |
| 518 | + shortest_paths[sub_topic] = nx.single_source_dijkstra_path(graph, sub_topic) |
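 | | +				# single_source_dijkstra_path returns, for every node reachable from sub_topic, the list of nodes on the shortest path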
391 | 519 | |
392 | | - directed_graph.add_weighted_edges_from([(topic, 'LOOP TO: ' + sub_topic, 1)]) |
393 | | - |
| 520 | + """ Store the lengths rather than the paths """ |
| 521 | + for target in shortest_paths[sub_topic]: |
| 522 | + shortest_paths[sub_topic][target] = len(shortest_paths[sub_topic][target]) |
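 | | +					# the stored value is the node count of the path (edge count + 1); find_top_level_category() treats these as depth scores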
| 523 | + |
| 524 | + return shortest_paths, topic_counts |
| 525 | + |
394 | 526 | return topic_counts |
| 527 | + |
| 528 | + """ |
| 529 | + Pickles variables that store the state of the category graph |
| 530 | + """ |
| 531 | + def pickle_all(self, directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only): |
| 532 | + |
| 533 | + self.pickle_var(directed_graph, 'full_topic_graph.p') |
| 534 | + self.pickle_var(in_degrees, 'in_degrees_dict.p') |
| 535 | + self.pickle_var(out_degrees, 'out_degrees_dict.p') |
| 536 | + self.pickle_var(sorted_out_degrees, 'sorted_out_degrees_dict.p') |
| 537 | + self.pickle_var(sorted_in_degrees, 'sorted_in_degrees_dict.p') |
| 538 | + self.pickle_var(subcategories, 'subcategories.p') |
| 539 | + self.pickle_var(in_only, 'in_only.p') |
| 540 | + self.pickle_var(out_only, 'out_only.p') |
| 541 | + |
| 542 | + """ |
 | 543 | +	@param page_ids: a list of page ids to classify
 | | +	@param shortest_paths: per top level category shortest path lengths, as returned by construct_topic_tree()
| 544 | + """ |
| 545 | + def find_top_level_category(self, page_ids, shortest_paths): |
| 546 | + |
| 547 | + # self._topic_trees_ = dict() |
| 548 | + titles = dict() |
| 549 | + depths = dict() |
| 550 | + page_categories = dict() |
| 551 | + page_tl_cat = dict() |
| 552 | + cat_winner = dict() |
| 553 | + win_count = dict() |
| 554 | + |
| 555 | + """ Get categories for pages - Initialize depth dictionaries for top level categories """ |
| 556 | + logging.info('Initializing data structures ...') |
| 557 | + page_categories = self.get_page_categories(page_ids) |
| 558 | + titles = self.get_page_title(page_ids) |
| 559 | + |
| 560 | + for page_id in page_ids: |
| 561 | + # page_categories[page_id] = self.get_page_categories(page_id) |
| 562 | + title = titles[page_id] |
| 563 | + depths[title] = dict() |
| 564 | + |
| 565 | + """ Initialize dictionaries to store the depth scores for top level categories """ |
| 566 | + for category in page_categories[page_id]: |
| 567 | + depths[title][category] = dict() |
| 568 | + |
 | 569 | +		""" Iterate through each page, category, and top level category.
 | 570 | +			Use the precomputed shortest path lengths to determine the depth """
| 571 | + logging.info('Finding category depths in each topic tree ...') |
| 572 | + |
| 573 | + for page_id in page_ids: |
| 574 | + |
| 575 | + # logging.info('For %s classifying categories: %s...' % (title, str(page_categories[page_id]))) |
| 576 | + title = titles[page_id] |
| 577 | + cat_winner[title] = dict() |
| 578 | + |
| 579 | + """ Initialize the number of top level categorizations for each top level category """ |
| 580 | + win_count[title] = dict() |
| 581 | + for tl_cat in self._top_level_cats_: |
| 582 | + win_count[title][tl_cat] = 0 |
| 583 | + |
| 584 | + """ Go through each category for a page and find out which top level cat is closest """ |
| 585 | + for category in page_categories[page_id]: |
395 | 586 | |
| 587 | + cat_winner[title][category] = list() |
| 588 | + min_depth = self._max_depth_ |
| 589 | + for tl_cat in self._top_level_cats_: |
| 590 | + |
| 591 | + """ Use shortest paths """ |
| 592 | + try: |
| 593 | + depths[title][category][tl_cat] = shortest_paths[tl_cat][category] |
| 594 | + except KeyError: |
| 595 | + depths[title][category][tl_cat] = 99 |
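 | | +						# categories with no shortest path entry for this top level category get a sentinel depth of 99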
| 596 | + |
 | 597 | +					if depths[title][category][tl_cat] < min_depth:
 | 598 | +						cat_winner[title][category] = [tl_cat] # a new strict minimum resets the candidate list
 | 599 | +						min_depth = depths[title][category][tl_cat]
 | 600 | +					elif depths[title][category][tl_cat] == min_depth:
 | 601 | +						cat_winner[title][category].append(tl_cat) # tie at the minimum depth; broken randomly below
| 602 | + |
 | 603 | +				""" Randomly break any ties """
| 604 | + if len(cat_winner[title][category]) > 0: |
| 605 | + random.shuffle(cat_winner[title][category]) |
| 606 | + cat_winner[title][category] = cat_winner[title][category][0] |
| 607 | + else: |
| 608 | + cat_winner[title][category] = None |
| 609 | + |
| 610 | + winner = cat_winner[title][category] # this a top level category |
| 611 | + if not(winner == None): |
| 612 | + win_count[title][winner] = win_count[title][winner] + 1 |
| 613 | + |
| 614 | + """ Classify the top level categories for each page """ |
| 615 | + page_tl_cat[title] = None |
| 616 | + best_count = 0 |
| 617 | + for tl_cat in self._top_level_cats_: |
| 618 | + if win_count[title][tl_cat] > best_count: |
| 619 | + page_tl_cat[title] = tl_cat |
| 620 | + best_count = win_count[title][tl_cat] |
| 621 | + elif win_count[title][tl_cat] == best_count and best_count > 0: |
| 622 | + page_tl_cat[title] = page_tl_cat[title] + ' / ' + tl_cat |
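 | | +				# ties between top level categories are kept as a combined label, e.g. 'Music / Film' (illustrative)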
| 623 | + |
| 624 | + return titles, page_tl_cat # , depths, cat_winner |
| 625 | + |
| 626 | + """ |
| 627 | + Builds a table containing all main namespace pages and their chosen categories |
| 628 | + """ |
| 629 | + def determine_all_page_categories(self): |
| 630 | + |
| 631 | + sql_create = self._query_names_['create_page_category'] |
| 632 | + sql_drop = self._query_names_['drop_page_category'] |
| 633 | + sql_insert = self._query_names_['insert_page_category'] |
| 634 | + |
| 635 | + logging.info('CATEGORIZING PAGES: Initializing tables ... ') |
| 636 | + self.execute_SQL(sql_drop) |
| 637 | + self.execute_SQL(sql_create) |
| 638 | + |
| 639 | + logging.info('CATEGORIZING PAGES: Getting all pages ... ') |
| 640 | + sql_get_page_ids = self._query_names_['get_all_page_ids'] |
| 641 | + results = self.execute_SQL(sql_get_page_ids) |
| 642 | + |
| 643 | + page_ids = list() |
| 644 | + for row in results: |
| 645 | + page_ids.append(int(row[0])) |
| 646 | + |
| 647 | + logging.info('CATEGORIZING PAGES: Unshelving shortest paths ... ') |
| 648 | + # shortest_paths = self.unpickle_var('shortest_paths.p') |
| 649 | + |
| 650 | + d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s') |
| 651 | + shortest_paths = d['shortest_paths'] |
| 652 | + |
| 653 | + logging.info('CATEGORIZING PAGES: Computing categories ... ') |
| 654 | + titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths) |
| 655 | + ids = dict((i,v) for v,i in titles.iteritems()) |
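 | | +		# invert the page_id -> title map into title -> page_id; duplicate titles would collapse onto a single id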
| 656 | + |
| 657 | + logging.info('CATEGORIZING PAGES: Performing inserts ... ') |
| 658 | + page_id_str = '' |
| 659 | + for title in page_tl_cat: |
| 660 | + id = ids[title] |
| 661 | + category = page_tl_cat[title] |
| 662 | + |
| 663 | + parts = title.split("'") |
| 664 | + new_title = parts[0] |
| 665 | + parts = parts[1:] |
| 666 | + for part in parts: |
| 667 | + new_title = new_title + " " + part |
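 | | +			# apostrophes in the title are replaced with spaces as a crude way to keep the INSERT literal valid;
 | | +			# a parameterized insert (if the SQL wrapper supports it) would be a safer alternative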
| 668 | + |
| 669 | + page_id_str = "(%s,'%s','%s')" % (id, new_title, category) |
| 670 | + try: |
| 671 | + self.execute_SQL(sql_insert % page_id_str) |
| 672 | + except: |
| 673 | + logging.info('Could not insert: %s ... ' % new_title) |
| 674 | + pass |
| 675 | + # page_ids.append(str(row[0])) |
| 676 | + # page_id_str = page_id_str[:-1] |
| 677 | + |
| 678 | + #logging.info('CATEGORIZING PAGES: Inserting page ids into rfaulk.page_category ... ') |
| 679 | + #self.execute_SQL(sql_insert % page_id_str) |
| 680 | + |
| 681 | + d.close() |
| 682 | + |
396 | 683 | """ |
397 | 684 | Inherits WSORSlaveDataLoader |
398 | 685 | |