r95296 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r95295 | r95296 | r95297 >
Date: 01:38, 23 August 2011
Author: rfaulk
Status: deferred
Tags:
Comment:
added new way of computing category classifications
retain both shortest paths and topic counts from category structure
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified) (history)

Diff

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -13,7 +13,7 @@
1414
1515 """ Import python base modules """
1616 import sys, getopt, re, datetime, logging, MySQLdb, operator, pickle, shelve, random
17 -import networkx as nx
 17+import networkx as nx, numpy as np, scipy.stats as ss
1818
1919 """ Import Analytics modules """
2020 from Fundraiser_Tools.classes.DataLoader import DataLoader
@@ -98,8 +98,11 @@
9999 """
100100 class CategoryLoader(WSORSlaveDataLoader):
101101
102 - def __init__(self):
 102+ def __init__(self, subcategories):
103103
 104+ logging.info('Creating CategoryLoader')
 105+ WSORSlaveDataLoader.__init__(self)
 106+
104107 self.__DEBUG__ = True
105108
106109 self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'"
@@ -116,19 +119,26 @@
117120
118121 self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));"
119122 self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;"
 123+ self._query_names_['index1_page_category'] = "create index idx_page_id on rfaulk.page_category (page_id);"
 124+ self._query_names_['index2_page_category'] = "create index idx_page_title on rfaulk.page_category (page_title);"
 125+ self._query_names_['index3_page_category'] = "create index idx_category on rfaulk.page_category (category);"
120126 self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;"
121127
 128+ self._regexp_list_ = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]', '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]', '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', \
 129+ '^[Ss]', '^[Tt]', '^[Uu]', '^[Vv]', '^[Ww]', '^[Xx]', '^[Yy]', '^[Zz]', '^[^A-Za-z]']
122130
123 - WSORSlaveDataLoader.__init__(self)
124 - logging.info('Creating CategoryLoader')
 131+ self._max_depth_ = 50
 132+ self._main_topic_ = 'Main_topic_classifications'
 133+ self._top_level_cats_ = subcategories[self._main_topic_][:]
 134+ self._top_level_cats_.remove('Chronology')
125135
126 - self._max_depth_ = 100
127 - self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
 136+
 137+ #self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'People', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
128138 # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports']
129139 self._block_words_ = ['categories', 'Categories', 'topic', 'Topic']
130140 self._block_cats_ = ['']
131141 self._topic_trees_ = dict()
132 -
 142+
133143
134144 """
135145 Retrieves all rows out of the category links table
@@ -150,30 +160,23 @@
151161 """
152162 Extract the categories for a given article
153163 """
154 - def get_page_categories(self, page_id_list):
 164+ def get_page_categories(self, page_id):
155165
156 - categories = dict()
 166+ categories = list()
157167 where_clause = ''
158168
159 - """ Initialize category lists for each page """
160 - for id in page_id_list:
161 - categories[id] = list()
162 -
 169+ """ Execute SQL query to retrieve categories for page of page id """
163170 try:
164171
165 - for id in page_id_list:
166 - where_clause = where_clause + 'cl_from = %s or ' % str(id)
167 - where_clause = where_clause[:-4]
168 -
 172+ where_clause = where_clause + 'cl_from = %s' % str(page_id)
169173 sql = self._query_names_['get_page_categories'] % where_clause
170 -
171 - logging.info('Retrieving page categories ...')
 174+
172175 results = self.execute_SQL(sql)
173176
174177 """ walk through results and add to category lists """
175178 for row in results:
176179 id = int(row[0])
177 - categories[id].append(row[1])
 180+ categories.append(row[1])
178181
179182 except Exception as inst:
180183
@@ -182,7 +185,7 @@
183186 logging.error(str(inst.args)) # arguments stored in .args
184187 logging.error(inst.__str__()) # __str__ allows args to printed directly
185188
186 - return {}
 189+ return []
187190
188191 return categories
189192
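The reworked get_page_categories above takes a single page_id and returns a flat list of categories. The SQL behind the 'get_page_categories' template is not shown in this revision, so the following standalone sketch assumes a direct query against enwiki.categorylinks:

    import MySQLdb

    def get_page_categories(conn, page_id):
        """ Return the categories of one page as a flat list ([] on no rows);
            the table and column names are assumptions based on the queries above. """
        cursor = conn.cursor()
        cursor.execute(
            "select cl_from, cl_to from enwiki.categorylinks where cl_from = %s",
            (int(page_id),))
        return [row[1] for row in cursor.fetchall()]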
@@ -211,7 +214,7 @@
212215 """
213216 def get_page_title(self, page_id):
214217
215 - logging.info('Getting page titles ...')
 218+ # logging.info('Getting page titles ...')
216219 is_list = isinstance(page_id, (list))
217220
218221 try:
@@ -232,13 +235,18 @@
233236 title = dict()
234237
235238 for row in results:
236 - title[int(row[0])] = str(row[1])
237 -
 239+
 240+ try:
 241+ title[int(row[0])] = str(row[1])
 242+
 243+ except:
 244+ logging.error('Could not retrieve page_title for %s.' % row)
 245+ pass
 246+
238247 except Exception as inst:
239248
240 - logging.error('Could not retrieve page_title for page_id = %s.' % page_id)
241 - self._log_file.write('Could not retrieve page_title for page_id = %s.\n' % (page_id))
242 -
 249+ logging.error('Could not retrieve page_title for page_id.')
 250+
243251 return ''
244252
245253 return title
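The revised loop above guards each row conversion so that one malformed row no longer aborts the whole title mapping. A minimal sketch of that pattern, with toy rows assumed:

    import logging

    results = [(1, 'Mathematics'), ('bad_id', 'Broken_row')]

    title = {}
    for row in results:
        try:
            title[int(row[0])] = str(row[1])
        except (ValueError, TypeError):
            # log and skip the bad row instead of failing the batch
            logging.error('Could not retrieve page_title for %s.' % (row,))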
@@ -254,8 +262,6 @@
255263 #self.drop_category_links_cp_table()
256264 #self.create_category_links_cp_table()
257265
258 - self._log_file = open('category_miner.log', 'w')
259 -
260266 """ Create graph """
261267
262268 logging.info('Initializing directed graph...')
@@ -304,11 +310,10 @@
305311
306312 directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
307313
308 - if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'):
 314+ #if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'):
309315 #if self.__DEBUG__ and count % 1000 == 0 :
310316
311 - logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
312 - self._log_file.write('%s: %s -> %s\n' % (str(count), cl_from, cl_to))
 317+ # logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
313318
314319 count = count + 1
315320
@@ -320,7 +325,6 @@
321326 in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees, in_degrees, out_degrees)
322327
323328 logging.info('Category links finished processing.')
324 - self._log_file.close()
325329
326330 return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only
327331
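For reference, a minimal sketch of the graph build this hunk trims the logging from, assuming (cl_from, cl_to) pairs already fetched from the links table; the toy rows and the DiGraph choice are illustrative:

    import networkx as nx

    # toy category links: (subcategory, parent category)
    rows = [('Probability', 'Mathematics'), ('Statistics', 'Mathematics'),
            ('Mathematics', 'Main_topic_classifications')]

    directed_graph = nx.DiGraph()
    for cl_from, cl_to in rows:
        directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])  # unit weights

    in_degrees = dict(directed_graph.in_degree())
    out_degrees = dict(directed_graph.out_degree())
    sorted_in_degrees = sorted(in_degrees.items(), key=lambda kv: kv[1], reverse=True)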
@@ -419,26 +423,30 @@
420424
421425 """ Create graph """
422426
423 - logging.info('Initializing directed graph...')
 427+ logging.info('Initializing directed graph and topic counts ...')
424428 graph = nx.Graph()
 429+ topic_counts = dict()
425430 self._count_ = 1
426431
427 - subcategories['top_level_categories'] = self._top_level_cats_
428 - topic = 'top_level_categories'
429 -
430432 depth = 0
431 - logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
432 - shortest_paths, topic_counts = self._recursive_construct_topic_tree(graph, topic, subcategories, depth)
 433+ logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
 434+ shortest_paths = self._recursive_construct_topic_tree(graph, self._main_topic_, subcategories, depth)
433435
434 -
435 - """ Pickle the result """
436 - #logging.info('Pickling the shortest paths ...')
437 - #self.pickle_var(shortest_paths, 'shortest_paths.p')
438 -
 436+ max_depth = 5
 437+ logging.info('Computing recursive sub-category counts, MAX DEPTH = %s ...' % max_depth)
 438+ count = 0
 439+ for title in shortest_paths[shortest_paths.keys()[0]].keys():
 440+
 441+ # logging.info('topic counts processed for %s ...' % title)
 442+ topic_counts[title] = self.get_subcategory_count(title, subcategories, 0, max_depth)
 443+ count = count + 1
 444+
439445 """ Shelve the result """
440446 logging.info('Shelve the shortest paths ...')
441 - d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
 447+ d = shelve.open( settings.__data_file_dir__ + 'topic_tree.s')
442448 d['shortest_paths'] = shortest_paths
 449+ d['topic_counts'] = topic_counts
 450+
443451 d.close()
444452
445453 return graph, shortest_paths
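The shelf now stores both artifacts under the new 'topic_tree.s' name. A small sketch of the round trip, with the path and values assumed:

    import shelve

    shortest_paths = {'Mathematics': {'Probability': 2}}  # path lengths, not paths
    topic_counts = {'Probability': 12}                    # bounded fan-out counts

    d = shelve.open('/tmp/topic_tree.s')
    d['shortest_paths'] = shortest_paths
    d['topic_counts'] = topic_counts
    d.close()

    # consumers such as determine_all_page_categories() read both keys back
    d = shelve.open('/tmp/topic_tree.s')
    shortest_paths, topic_counts = d['shortest_paths'], d['topic_counts']
    d.close()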
@@ -454,7 +462,6 @@
455463 """
456464 def _recursive_construct_topic_tree(self, graph, topic, subcategories, depth):
457465
458 - topic_counts = 1
459466 depth = depth + 1
460467 self._count_ = self._count_ + 1
461468
@@ -483,30 +490,25 @@
484491
485492 """ Recursively build linkages for each .
486493 DFS determining topic tree - this provides """
487 - for sub_topic in topic_subcategories:
488 -
489 - if depth == 1:
490 - logging.info('Processing top level catgory: %s' % sub_topic)
491 -
492 - if not(graph.has_node(sub_topic)):
493 -
494 - graph.add_edge(topic, sub_topic)
 494+ """ Only go deeper if the maximum recursive depth has not been reached """
 495+ if depth < self._max_depth_:
 496+ for sub_topic in topic_subcategories:
495497
496 - """ Only go deeper if the maximum recursive depth has not been reached """
497 - if depth < self._max_depth_:
498 - sub_topic_counts = self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth)
 498+ if depth == 1:
 499+ logging.info('Processing top level category: %s' % sub_topic)
 500+
 501+ """ Check if the subtopic node is already in the graph - if so add a loop edge """
 502+ if not(graph.has_node(sub_topic)):
 503+
 504+ graph.add_edge(topic, sub_topic)
 505+ self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth)
 506+
499507 else:
500 - sub_topic_counts = 1
501508
502 - topic_counts = topic_counts + sub_topic_counts
503 -
504 - else:
505 -
506 - """ Add the 'loop' edge if and only if it is not a top level catagory """
507 - if not(sub_topic in self._top_level_cats_):
508 - graph.add_edge(topic, sub_topic)
509 - topic_counts = topic_counts + 1
510 -
 509+ """ Add the 'loop' edge if and only if it is not a top level category """
 510+ if not(sub_topic in self._top_level_cats_):
 511+ graph.add_edge(topic, sub_topic)
 512+
511513 """ After the recursion is complete compute the shortest paths """
512514 if depth == 1:
513515
@@ -519,10 +521,14 @@
520522 """ Store the lengths rather than the paths """
521523 for target in shortest_paths[sub_topic]:
522524 shortest_paths[sub_topic][target] = len(shortest_paths[sub_topic][target])
 525+
 526+ """ Get the shortest path lengths for the main topic also """
 527+ shortest_paths[self._main_topic_] = nx.single_source_dijkstra_path(graph, self._main_topic_)
 528+ for target in shortest_paths[self._main_topic_]:
 529+ shortest_paths[self._main_topic_][target] = len(shortest_paths[self._main_topic_][target])
523530
524 - return shortest_paths, topic_counts
 531+ return shortest_paths
525532
526 - return topic_counts
527533
528534 """
529535 Pickles variables that store the state of the category graph
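The depth == 1 block above replaces each stored path with its length; note that len(path) counts nodes, so a direct neighbor gets length 2. A sketch of the conversion on a toy graph:

    import networkx as nx

    g = nx.Graph()
    g.add_edges_from([('Mathematics', 'Probability'),
                      ('Probability', 'Stochastic_processes')])

    shortest_paths = {'Mathematics': nx.single_source_dijkstra_path(g, 'Mathematics')}
    for target in shortest_paths['Mathematics']:
        shortest_paths['Mathematics'][target] = len(shortest_paths['Mathematics'][target])

    # shortest_paths['Mathematics'] is now
    # {'Mathematics': 1, 'Probability': 2, 'Stochastic_processes': 3}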
@@ -539,90 +545,191 @@
540546 self.pickle_var(out_only, 'out_only.p')
541547
542548 """
 549+ Given a set of article ids, determine their corresponding representation in category space
 550+
543551 @param page_ids: a list of pages to classify
 552+ @param shortest_paths: a dictionary of the shortest paths from all top level categories to sub-categories
544553 """
545 - def find_top_level_category(self, page_ids, shortest_paths):
 554+ def find_top_level_category(self, page_ids, shortest_paths, topic_counts):
546555
547 - # self._topic_trees_ = dict()
548556 titles = dict()
549 - depths = dict()
550557 page_categories = dict()
551558 page_tl_cat = dict()
552 - cat_winner = dict()
553 - win_count = dict()
 559+ # win_count = dict()
554560
 561+ self._num_tl_cats_ = len(self._top_level_cats_)
 562+ tl_cat_vectors = dict()
 563+
555564 """ Get categories for pages - Initialize depth dictionaries for top level categories """
556565 logging.info('Initializing data structures ...')
557 - page_categories = self.get_page_categories(page_ids)
558 - titles = self.get_page_title(page_ids)
559566
560 - for page_id in page_ids:
561 - # page_categories[page_id] = self.get_page_categories(page_id)
562 - title = titles[page_id]
563 - depths[title] = dict()
 567+
 568+ logging.info('Getting page titles and categories ...')
 569+ counter = 0
 570+ for id in page_ids:
 571+ titles[id] = self.get_page_title(id)
 572+ page_categories[id] = self.get_page_categories(id)
564573
565 - """ Initialize dictionaries to store the depth scores for top level categories """
566 - for category in page_categories[page_id]:
567 - depths[title][category] = dict()
568 -
 574+ counter = counter + 1
 575+
 576+ if counter % 100000 == 0:
 577+ logging.info('%s page titles and categories processed ...' % counter)
 578+
569579 """ Iterate through each page, category, and top level category
570580 Perform a breadth first search for the node to determine the depth """
571581 logging.info('Finding category depths in each topic tree ...')
572582
573583 for page_id in page_ids:
574 -
575 - # logging.info('For %s classifying categories: %s...' % (title, str(page_categories[page_id])))
 584+
 585+ """
 586+ Retrieve page categories for each page using one of the classification methods available
 587+ rank_categories_M1()
 588+ rank_categories_M2()
 589+ """
576590 title = titles[page_id]
577 - cat_winner[title] = dict()
 591+ page_tl_cat[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts)
 592+
578593
579 - """ Initialize the number of top level categorizations for each top level category """
580 - win_count[title] = dict()
581 - for tl_cat in self._top_level_cats_:
582 - win_count[title][tl_cat] = 0
 594+ return titles, page_tl_cat #, tl_cat_vectors # , depths, cat_winner
 595+
 596+ """
 597+ Method for determining top level category for a set of categories
 598+
 599+ This method looks at the closest top level categories for each category and chooses the category or categories that appear most often.
 600+
 601+ @param categories: String list of categories to classify
 602+ @param shortest_paths: dictionary of shortest paths indexed by categories
 603+
 604+ @return: String indicating the top level category(ies)
 605+
 606+ """
 607+ def rank_categories_M1(self, categories, shortest_paths):
 608+
 609+ """ Initialize the number of top level categorizations for each top level category """
 610+ win_count = dict()
 611+ for tl_cat in self._top_level_cats_:
 612+ win_count[tl_cat] = 0
 613+
 614+ for category in categories:
583615
584 - """ Go through each category for a page and find out which top level cat is closest """
585 - for category in page_categories[page_id]:
 616+ cat_winner = list()
 617+ min_depth = self._max_depth_
 618+
 619+ for index, tl_cat in enumerate(self._top_level_cats_):
 620+ # for tl_cat in self._top_level_cats_:
586621
587 - cat_winner[title][category] = list()
588 - min_depth = self._max_depth_
589 - for tl_cat in self._top_level_cats_:
590 -
591 - """ Use shortest paths """
592 - try:
593 - depths[title][category][tl_cat] = shortest_paths[tl_cat][category]
594 - except KeyError:
595 - depths[title][category][tl_cat] = 99
596 -
597 - if depths[title][category][tl_cat] < min_depth:
598 - cat_winner[title][category].append(tl_cat)
599 - min_depth = depths[title][category][tl_cat]
600 - elif depths[title][category][tl_cat] == min_depth:
601 - cat_winner[title][category].append(tl_cat) # there can only be one winner
 622+ """ Use shortest paths """
 623+ try:
 624+ path_length = shortest_paths[tl_cat][category]
602625
603 - """ Randomly choose to tie breakers """
604 - if len(cat_winner[title][category]) > 0:
605 - random.shuffle(cat_winner[title][category])
606 - cat_winner[title][category] = cat_winner[title][category][0]
607 - else:
608 - cat_winner[title][category] = None
609 -
610 - winner = cat_winner[title][category] # this a top level category
611 - if not(winner == None):
612 - win_count[title][winner] = win_count[title][winner] + 1
 626+ except KeyError:
 627+ path_length = self._max_depth_
 628+
 629+ if path_length < min_depth:
 630+ cat_winner = [tl_cat]
 631+ min_depth = path_length
 632+ elif path_length == min_depth:
 633+ cat_winner.append(tl_cat) # ties are collected and broken randomly below
613634
614 - """ Classify the top level categories for each page """
615 - page_tl_cat[title] = None
616 - best_count = 0
617 - for tl_cat in self._top_level_cats_:
618 - if win_count[title][tl_cat] > best_count:
619 - page_tl_cat[title] = tl_cat
620 - best_count = win_count[title][tl_cat]
621 - elif win_count[title][tl_cat] == best_count and best_count > 0:
622 - page_tl_cat[title] = page_tl_cat[title] + ' / ' + tl_cat
623 -
624 - return titles, page_tl_cat # , depths, cat_winner
 635+ """ Randomly choose to tie breakers """
 636+ if len(cat_winner) > 0:
 637+ random.shuffle(cat_winner)
 638+ cat_winner = cat_winner[0]
 639+ else:
 640+ cat_winner = None
 641+
 642+ winner = cat_winner # this is a top level category
 643+ if not(winner == None):
 644+ win_count[winner] = win_count[winner] + 1
 645+
 646+
 647+ """ Classify the top level categories for each page """
625648
 649+ best_count = 0
 650+
 651+ for tl_cat in self._top_level_cats_:
 652+ if win_count[tl_cat] > best_count:
 653+ page_tl_cat = tl_cat
 654+ best_count = win_count[tl_cat]
 655+ elif win_count[tl_cat] == best_count and best_count > 0:
 656+ page_tl_cat = page_tl_cat + ' / ' + tl_cat
 657+
 658+ return page_tl_cat
 659+
626660 """
 661+ Method for determining top level category for a set of categories
 662+
 663+ This method looks at the closest top level categories for each category and constructs vector representations based on path lengths. The dimensions of the
 664+ vector space are the top level categories. The value for a given dimension is determined by the sum of the path lengths from each category to the top-level
 665+ category, where each summand is weighted by the distance of the category from the main topic.
 666+
 667+ @param categories: String list of categories to classify
 668+ @param shortest_paths: dictionary of shortest paths indexed by categories
 669+
 670+ @return: String indicating the top level category(ies)
 671+ """
 672+ def rank_categories_M2(self, categories, shortest_paths, topic_counts):
 673+
 674+ """ Go through each category for a page and find out which top level cat is closest """
 675+
 676+ tl_cat_vectors = np.zeros(len(self._top_level_cats_)) # initialize the vector in top-level category space
 677+ page_tl_cat = [0] * len(self._top_level_cats_) # the top level cats
 678+
 679+ for category in categories:
 680+
 681+ """ Compute the weight of this category """
 682+ try:
 683+ path_length_from_main = shortest_paths[self._main_topic_][category]
 684+ except:
 685+ path_length_from_main = self._max_depth_
 686+
 687+ """ The fanout weight is based on the fanout of a category for a fixed depth .. if there is no fanout for this topic it probably has vary few or no
 688+ subtopics so assign a fanout of 1.0 """
 689+ try:
 690+ fanout_weight = float(topic_counts[category])
 691+ except:
 692+ fanout_weight = 1.0
 693+ pass
 694+
 695+ """ The total weight of this category depends on the product of how far it lies from the root and the inverse of its fanout
 696+ this makes more specialized categories worth more """
 697+ category_weight = float(path_length_from_main) * fanout_weight
 698+
 699+ # category_weight = 1 / float(path_length_from_main) * self._max_depth_
 700+
 701+ cat_winner = list()
 702+ min_depth = self._max_depth_
 703+
 704+ for index, tl_cat in enumerate(self._top_level_cats_):
 705+
 706+ """ Use shortest paths """
 707+ try:
 708+ path_length = shortest_paths[tl_cat][category]
 709+ except KeyError:
 710+ path_length = 100
 711+
 712+ tl_cat_vectors[index] = tl_cat_vectors[index] + category_weight * np.power(path_length, 0.5)
 713+
 714+ """ Normalize the vector representation """
 715+ ranks = np.floor(ss.rankdata(tl_cat_vectors)) - 1
 716+ vec = max(tl_cat_vectors) - tl_cat_vectors
 717+ tl_cat_vectors = vec / float(sum(vec))
 718+
 719+ """ Choose the top categories """
 720+ for i in range(self._num_tl_cats_):
 721+ index = np.argmin(ranks)
 722+ ranks[index] = self._num_tl_cats_ + 1
 723+ page_tl_cat[i] = self._top_level_cats_[index]
 724+
 725+ top_five_cats = ''
 726+ for i in range(5):
 727+ top_five_cats = top_five_cats + page_tl_cat[i] + ', '
 728+ top_five_cats = top_five_cats[:-2]
 729+ page_tl_cat = top_five_cats
 730+
 731+ return page_tl_cat
 732+
 733+ """
627734 Builds a table containing all main namespace pages and their chosen categories
628735 """
629736 def determine_all_page_categories(self):
@@ -634,51 +741,84 @@
635742 logging.info('CATEGORIZING PAGES: Initializing tables ... ')
636743 self.execute_SQL(sql_drop)
637744 self.execute_SQL(sql_create)
 745+ self.execute_SQL(self._query_names_['index1_page_category'])
 746+ self.execute_SQL(self._query_names_['index2_page_category'])
 747+ self.execute_SQL(self._query_names_['index3_page_category'])
638748
639 - logging.info('CATEGORIZING PAGES: Getting all pages ... ')
640 - sql_get_page_ids = self._query_names_['get_all_page_ids']
641 - results = self.execute_SQL(sql_get_page_ids)
642 -
643 - page_ids = list()
644 - for row in results:
645 - page_ids.append(int(row[0]))
646 -
647749 logging.info('CATEGORIZING PAGES: Unshelving shortest paths ... ')
648 - # shortest_paths = self.unpickle_var('shortest_paths.p')
 750+ d = shelve.open( settings.__data_file_dir__ + 'topic_tree.s')
649751
650 - d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
651752 shortest_paths = d['shortest_paths']
 753+ topic_counts = d['topic_counts']
652754
653 - logging.info('CATEGORIZING PAGES: Computing categories ... ')
654 - titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths)
655 - ids = dict((i,v) for v,i in titles.iteritems())
656 -
657 - logging.info('CATEGORIZING PAGES: Performing inserts ... ')
658 - page_id_str = ''
659 - for title in page_tl_cat:
660 - id = ids[title]
661 - category = page_tl_cat[title]
 755+ """
 756+ Break up processing to handle records with page titles matching the regular expressions in _regexp_list_
 757+ """
 758+ for regexp in self._regexp_list_:
662759
663 - parts = title.split("'")
664 - new_title = parts[0]
665 - parts = parts[1:]
666 - for part in parts:
667 - new_title = new_title + " " + part
668 -
669 - page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
670 - try:
671 - self.execute_SQL(sql_insert % page_id_str)
672 - except:
673 - logging.info('Could not insert: %s ... ' % new_title)
674 - pass
675 - # page_ids.append(str(row[0]))
676 - # page_id_str = page_id_str[:-1]
 760+ logging.info('CATEGORIZING PAGES: Getting pages for %s ... ' % regexp)
 761+
 762+ sql_get_page_ids = self._query_names_['get_all_page_ids'] + " and page_title regexp '%s';" % regexp
 763+ results = self.execute_SQL(sql_get_page_ids)
 764+
 765+ page_ids = list()
 766+ for row in results:
 767+ page_ids.append(int(row[0]))
 768+
 769+ logging.info('CATEGORIZING PAGES: Computing categories ... ')
 770+ titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths, topic_counts)
 771+ ids = dict((i,v) for v,i in titles.iteritems())
 772+
 773+ logging.info('CATEGORIZING PAGES: Performing inserts ... ')
 774+ page_id_str = ''
 775+ for title in page_tl_cat:
 776+ id = ids[title]
 777+ category = page_tl_cat[title]
 778+
 779+ parts = title.split("'")
 780+ new_title = parts[0]
 781+ parts = parts[1:]
 782+ for part in parts:
 783+ new_title = new_title + " " + part
 784+
 785+ page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
 786+ try:
 787+ self.execute_SQL(sql_insert % page_id_str)
 788+ except:
 789+ logging.info('Could not insert: %s ... ' % new_title)
 790+ pass
 791+
 792+
 793+ d.close()
 794+
 795+
 796+ """
 797+ Gets a subcategory count for a fixed depth of the category graph structure starting at a specified node;
 798+ loops are ignored.
677799
678 - #logging.info('CATEGORIZING PAGES: Inserting page ids into rfaulk.page_category ... ')
679 - #self.execute_SQL(sql_insert % page_id_str)
 800+ @param topic: The topic to produce a sub topic count for
 801+ @param subcategories: dictionary keyed on categories, values are subcategories
 802+ @param depth: the current depth of the recursion
 803+ @param max_depth: The maximum depth of the recursion
680804
681 - d.close()
 805+ """
 806+ def get_subcategory_count(self, topic, subcategories, depth, max_depth):
682807
 808+ topic_count = 1
 809+
 810+ try:
 811+ topic_subcategories = subcategories[topic]
 812+ new_depth = depth + 1
 813+
 814+ if depth < max_depth:
 815+ for sub_topic in topic_subcategories:
 816+ topic_count = topic_count + self.get_subcategory_count(sub_topic, subcategories, new_depth, max_depth)
 817+ except:
 818+ # logging.info('No subcategories of %s' % topic)
 819+ pass
 820+
 821+ return topic_count
 822+
683823 """
684824 Inherits WSORSlaveDataLoader
685825
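Taken together, rank_categories_M2 weights each article category by its distance from the main topic times its fanout, accumulates weight * sqrt(path length) in every top-level dimension, and orders the dimensions by ascending score. A self-contained sketch with toy inputs, where np.argsort stands in for the rankdata/argmin loop (minus its tie handling):

    import numpy as np

    TOP_LEVEL_CATS = ['Mathematics', 'Culture', 'Sports']
    MAIN_TOPIC, MAX_DEPTH = 'Main_topic_classifications', 50

    def rank_categories_m2(categories, shortest_paths, topic_counts):
        scores = np.zeros(len(TOP_LEVEL_CATS))
        for category in categories:
            dist_from_main = shortest_paths.get(MAIN_TOPIC, {}).get(category, MAX_DEPTH)
            fanout = float(topic_counts.get(category, 1.0))
            weight = dist_from_main * fanout
            for i, tl_cat in enumerate(TOP_LEVEL_CATS):
                path_length = shortest_paths.get(tl_cat, {}).get(category, 100)
                scores[i] += weight * np.sqrt(path_length)
        # smallest accumulated score ranks first
        return [TOP_LEVEL_CATS[i] for i in np.argsort(scores)]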

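And a compact equivalent of get_subcategory_count, where a topic missing from the subcategories dict is treated as a leaf (the role of the bare except above) and max_depth bounds any cycles:

    def get_subcategory_count(topic, subcategories, depth=0, max_depth=5):
        topic_count = 1
        if depth < max_depth:
            for sub_topic in subcategories.get(topic, []):
                topic_count += get_subcategory_count(sub_topic, subcategories,
                                                     depth + 1, max_depth)
        return topic_count

    subcats = {'Mathematics': ['Algebra', 'Geometry'], 'Algebra': ['Group_theory']}
    assert get_subcategory_count('Mathematics', subcats) == 4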
Status & tagging log