r94667 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r94666 | r94667 | r94668 >
Date: 17:47, 16 August 2011
Author: rfaulk
Status: deferred
Tags:
Comment:
modified construct_topic_tree() to build trees from all top level categories
added find_top_level_category() to determine categories from article page ids
added determine_all_page_categories() to categorize all pages
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified)

Diff

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -12,18 +12,21 @@
1313
1414
1515 """ Import python base modules """
16 -import sys, getopt, re, datetime, logging, MySQLdb, settings, operator, pickle
 16+import sys, getopt, re, datetime, logging, MySQLdb, operator, pickle, shelve, random
1717 import networkx as nx
1818
1919 """ Import Analytics modules """
2020 from Fundraiser_Tools.classes.DataLoader import DataLoader
 21+import WSOR.scripts.classes.settings as settings
2122
2223 """ Configure the logger """
2324 LOGGING_STREAM = sys.stderr
2425 logging.basicConfig(level=logging.DEBUG, stream=LOGGING_STREAM, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
 26+# logging.basicConfig(level=logging.DEBUG, filename="categories.log", filemode='w', format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%b-%d %H:%M:%S')
2527
2628
2729
 30+
2831 """
2932 Inherits DataLoader
3033
@@ -102,16 +105,31 @@
103106 self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'"
104107 self._query_names_['drop_subcat_tbl'] = "drop table if exists rfaulk.categorylinks_cp;"
105108 self._query_names_['get_first_rec'] = "select cl_from from categorylinks_cp limit 1"
106 - self._query_names_['get_category_page_title'] = "select page_title from enwiki.page where page_id = %s"
 109+ self._query_names_['get_category_page_title'] = "select page_id, page_title from enwiki.page where %s"
107110 self._query_names_['get_category_page_id'] = "select page_id from enwiki.page where page_title = '%s' and page_namespace = 14"
108111 self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s"
109112 self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s"
110113 self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1"
111114 self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp"
 115+ self._query_names_['get_page_categories'] = "select cl_from, cl_to from enwiki.categorylinks where %s order by 1"
 116+ self._query_names_['get_all_page_ids'] = "select page_id from enwiki.page where page_namespace = 0 and page_len > 1000"
112117
 118+ self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));"
 119+ self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;"
 120+ self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;"
 121+
 122+
113123 WSORSlaveDataLoader.__init__(self)
114124 logging.info('Creating CategoryLoader')
115 -
 125+
 126+ self._max_depth_ = 100
 127+ self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
 128+ # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports']
 129+ self._block_words_ = ['categories', 'Categories', 'topic', 'Topic']
 130+ self._block_cats_ = ['']
 131+ self._topic_trees_ = dict()
 132+
 133+
116134 """
117135 Retrieves all rows out of the category links table
118136 """
@@ -129,8 +147,45 @@
130148
131149 return results
132150
 151+ """
 152+ Extract the categories for a given article
 153+ """
 154+ def get_page_categories(self, page_id_list):
 155+
 156+ categories = dict()
 157+ where_clause = ''
 158+
 159+ """ Initialize category lists for each page """
 160+ for id in page_id_list:
 161+ categories[id] = list()
 162+
 163+ try:
 164+
 165+ for id in page_id_list:
 166+ where_clause = where_clause + 'cl_from = %s or ' % str(id)
 167+ where_clause = where_clause[:-4]
 168+
 169+ sql = self._query_names_['get_page_categories'] % where_clause
 170+
 171+ logging.info('Retrieving page categories ...')
 172+ results = self.execute_SQL(sql)
 173+
 174+ """ walk through results and add to category lists """
 175+ for row in results:
 176+ id = int(row[0])
 177+ categories[id].append(row[1])
 178+
 179+ except Exception as inst:
 180+
 181+ logging.error('Could not retrieve page categories.')
 182+ logging.error(str(type(inst))) # the exception instance
 183+ logging.error(str(inst.args)) # arguments stored in .args
 184+ logging.error(inst.__str__()) # __str__ allows args to printed directly
 185+
 186+ return {}
 187+
 188+ return categories
133189
134 -
135190 """
136191 Retrieves the integer page id
137192 """
@@ -151,21 +206,38 @@
152207
153208 """
154209 Retrieves the string page title
 210+
 211+ This either manages a list of ids or a single id
155212 """
156213 def get_page_title(self, page_id):
157214
 215+ logging.info('Getting page titles ...')
 216+ is_list = isinstance(page_id, (list))
 217+
158218 try:
159 - sql = self._query_names_['get_category_page_title'] % page_id
160 - #logging.info('Executing: ' + sql)
 219+ if not(is_list):
 220+ where_clause = 'page_id = %s' % str(page_id)
 221+ else:
 222+ where_clause = ''
 223+ for id in page_id:
 224+ where_clause = where_clause + 'page_id = %s or ' % str(id)
 225+ where_clause = where_clause[:-4]
 226+
 227+ sql = self._query_names_['get_category_page_title'] % where_clause
161228 results = self.execute_SQL(sql)
162 - title = str(results[0][0])
163 -
 229+
 230+ if not(is_list):
 231+ title = str(results[0][1])
 232+ else:
 233+ title = dict()
 234+
 235+ for row in results:
 236+ title[int(row[0])] = str(row[1])
 237+
164238 except Exception as inst:
165239
166 - logging.error('Could not retrieve page_title.')
167 - logging.error(str(type(inst))) # the exception instance
168 - logging.error(str(inst.args)) # arguments stored in .args
169 - logging.error(inst.__str__()) # __str__ allows args to printed directly
 240+ logging.error('Could not retrieve page_title for page_id = %s.' % page_id)
 241+ self._log_file.write('Could not retrieve page_title for page_id = %s.\n' % (page_id))
170242
171243 return ''
172244
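
The reworked get_page_title() branches on isinstance() in several places; the same single-id-or-list convention can be normalized once up front, as in this sketch (the helper name is an assumption):

    def normalize_ids(page_id):
        """ Return a list whether a single id or a list of ids was passed """
        return page_id if isinstance(page_id, list) else [page_id]

    normalize_ids(42)         # -> [42]
    normalize_ids([1, 2, 3])  # -> [1, 2, 3]
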
@@ -181,7 +253,9 @@
182254
183255 #self.drop_category_links_cp_table()
184256 #self.create_category_links_cp_table()
185 -
 257+
 258+ self._log_file = open('category_miner.log', 'w')
 259+
186260 """ Create graph """
187261
188262 logging.info('Initializing directed graph...')
@@ -207,9 +281,9 @@
208282 """ Process subcategory links """
209283 for row in links:
210284
211 - cl_from = int(row[0])
212 - cl_to = str(row[1])
213 - cl_from = self.get_page_title(cl_from)
 285+ cl_from = str(row[1])
 286+ cl_to = int(row[0])
 287+ cl_to = self.get_page_title(cl_to)
214288
215289 try:
216290 subcategories[cl_from].append(cl_to)
@@ -230,10 +304,12 @@
231305
232306 directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)])
233307
234 - if self.__DEBUG__ and count % 1000 == 0:
 308+ if self.__DEBUG__ and (cl_from == 'Probability' or cl_from == 'Mathematics' or cl_from == 'Science' or cl_from == 'Arts'):
 309+ #if self.__DEBUG__ and count % 1000 == 0 :
235310
236311 logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to))
237 -
 312+ self._log_file.write('%s: %s -> %s\n' % (str(count), cl_from, cl_to))
 313+
238314 count = count + 1
239315
240316 logging.info('Sorting in degree list.')
@@ -244,12 +320,13 @@
245321 in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees, in_degrees, out_degrees)
246322
247323 logging.info('Category links finished processing.')
 324+ self._log_file.close()
248325
249326 return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only
250327
251328
252329 """
253 - Returns
 330+ Looks at the in and out degrees and constructs lists of nodes having only edges out and edges in
254331 """
255332 def get_uni_directionally_linked_categories(self, in_degrees, out_degrees, in_degrees_by_key, out_degrees_by_key ):
256333
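
The method body is unchanged by this revision; the idea named in the new docstring can be sketched with networkx degree queries on a toy graph (node names hypothetical):

    import networkx as nx

    g = nx.DiGraph([('Science', 'Physics'), ('Physics', 'Mechanics')])
    in_only = [n for n in g if g.out_degree(n) == 0]   # edges in only (leaves)
    out_only = [n for n in g if g.in_degree(n) == 0]   # edges out only (roots)
    # in_only == ['Mechanics'], out_only == ['Science']
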
@@ -307,9 +384,7 @@
308385 logging.error(str(type(inst))) # the exception instance
309386 logging.error(str(inst.args)) # arguments stored in .args
310387 logging.error(inst.__str__()) # __str__ allows args to printed directly
311 -
312 -
313 -
 388+
314389 """
315390 Are there any records remaining in rfaulk.categorylinks_cp ??
316391 """
@@ -334,64 +409,276 @@
335410 return False
336411 else:
337412 return True
338 -
339 -
 413+
340414 """
341415 Are there any records remaining in rfaulk.categorylinks_cp ??
342416
343417 Use a trace to detect any loops
344418 """
345 - def construct_topic_tree(self, topic, subcategories):
 419+ def construct_topic_tree(self, subcategories):
346420
347421 """ Create graph """
348422
349423 logging.info('Initializing directed graph...')
350 - directed_graph = nx.DiGraph()
351 - trace = [topic]
 424+ graph = nx.Graph()
 425+ self._count_ = 1
352426
353 - topic_counts = self._recursive_construct_topic_tree(directed_graph, topic, subcategories, trace)
 427+ subcategories['top_level_categories'] = self._top_level_cats_
 428+ topic = 'top_level_categories'
354429
355 - return directed_graph, topic_counts
 430+ depth = 0
 431+ logging.info('Recursively constructing graph, MAX DEPTH = %s ...' % self._max_depth_)
 432+ shortest_paths, topic_counts = self._recursive_construct_topic_tree(graph, topic, subcategories, depth)
356433
 434+
 435+ """ Pickle the result """
 436+ #logging.info('Pickling the shortest paths ...')
 437+ #self.pickle_var(shortest_paths, 'shortest_paths.p')
 438+
 439+ """ Shelve the result """
 440+ logging.info('Shelve the shortest paths ...')
 441+ d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
 442+ d['shortest_paths'] = shortest_paths
 443+ d.close()
 444+
 445+ return graph, shortest_paths
 446+
357447 """
358 - Are there any records remaining in rfaulk.categorylinks_cp ??
 448+ Recursively build the graph structure for categories based on the subcategory list
 449+
 450+ @param graph: NetworkX graph structure to store category linkage
 451+ @param topic: String topic name on which to build a recursive structure
 452+ @param subcategories: Dictionary of subcategory lists
 453+ @param depth: integer depth of the call within the recursion
 454+
359455 """
360 - def _recursive_construct_topic_tree(self, directed_graph, topic, subcategories, trace):
 456+ def _recursive_construct_topic_tree(self, graph, topic, subcategories, depth):
 457+
 458+ topic_counts = 1
 459+ depth = depth + 1
 460+ self._count_ = self._count_ + 1
361461
362 - topic_counts = 0
 462+ if self._count_ % 10000 == 0:
 463+ logging.info('Processed %s nodes. Graph size = %s.' % (str(self._count_), str(graph.number_of_nodes())))
363464
364465 """ Extract the subtopics of topic """
365466 try:
366 - topic_subcategories = subcategories[topic]
367 -
 467+ topic_subcategories = subcategories[topic]
 468+ new_subcategories = topic_subcategories[:]
 469+
 470+ """ Filter meta categories based on block words """
 471+ for sub_topic in topic_subcategories:
 472+ for block_word in self._block_words_:
 473+ if re.search(block_word, sub_topic):
 474+ new_subcategories.remove(sub_topic)
 475+ for block_cat in self._block_cats_:
 476+ if block_cat == sub_topic:
 477+ new_subcategories.remove(sub_topic)
 478+
 479+ topic_subcategories = new_subcategories
 480+
368481 except KeyError:
369482 """ There are no subcategories for this topic """
370483 return 1 # there is a topic count of 1
371484
372 - """ Recursively build linkages for each """
373 - # logging.info(str(trace))
 485+ """ Recursively build linkages for each .
 486+ DFS determining topic tree - this provides """
374487 for sub_topic in topic_subcategories:
375488
376 - if not(sub_topic in trace):
 489+ if depth == 1:
 490+ logging.info('Processing top level category: %s' % sub_topic)
 491+
 492+ if not(graph.has_node(sub_topic)):
377493
378 - logging.info(topic + ' --> ' + sub_topic)
 494+ graph.add_edge(topic, sub_topic)
 495+
 496+ """ Only go deeper if the maximum recursive depth has not been reached """
 497+ if depth < self._max_depth_:
 498+ sub_topic_counts = self._recursive_construct_topic_tree(graph, sub_topic, subcategories, depth)
 499+ else:
 500+ sub_topic_counts = 1
379501
380 - copy_trace = trace[:]
381 - copy_trace.append(sub_topic)
382 -
383 - directed_graph.add_weighted_edges_from([(topic, sub_topic, 1)])
384 - sub_topic_counts = self._recursive_construct_topic_tree(directed_graph, sub_topic, subcategories, copy_trace)
385 -
386502 topic_counts = topic_counts + sub_topic_counts
387503
388504 else:
389505
390 - logging.info('LOOP: ' + topic + ' --> ' + sub_topic)
 506+ """ Add the 'loop' edge if and only if it is not a top level catagory """
 507+ if not(sub_topic in self._top_level_cats_):
 508+ graph.add_edge(topic, sub_topic)
 509+ topic_counts = topic_counts + 1
 510+
 511+ """ After the recursion is complete compute the shortest paths """
 512+ if depth == 1:
 513+
 514+ shortest_paths = dict()
 515+
 516+ for sub_topic in self._top_level_cats_:
 517+ logging.info('Computing shortest paths for %s ...' % sub_topic)
 518+ shortest_paths[sub_topic] = nx.single_source_dijkstra_path(graph, sub_topic)
391519
392 - directed_graph.add_weighted_edges_from([(topic, 'LOOP TO: ' + sub_topic, 1)])
393 -
 520+ """ Store the lengths rather than the paths """
 521+ for target in shortest_paths[sub_topic]:
 522+ shortest_paths[sub_topic][target] = len(shortest_paths[sub_topic][target])
 523+
 524+ return shortest_paths, topic_counts
 525+
394526 return topic_counts
 527+
 528+ """
 529+ Pickles variables that store the state of the category graph
 530+ """
 531+ def pickle_all(self, directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only):
 532+
 533+ self.pickle_var(directed_graph, 'full_topic_graph.p')
 534+ self.pickle_var(in_degrees, 'in_degrees_dict.p')
 535+ self.pickle_var(out_degrees, 'out_degrees_dict.p')
 536+ self.pickle_var(sorted_out_degrees, 'sorted_out_degrees_dict.p')
 537+ self.pickle_var(sorted_in_degrees, 'sorted_in_degrees_dict.p')
 538+ self.pickle_var(subcategories, 'subcategories.p')
 539+ self.pickle_var(in_only, 'in_only.p')
 540+ self.pickle_var(out_only, 'out_only.p')
 541+
 542+ """
 543+ @param page_ids: a list of pages to classify
 544+ """
 545+ def find_top_level_category(self, page_ids, shortest_paths):
 546+
 547+ # self._topic_trees_ = dict()
 548+ titles = dict()
 549+ depths = dict()
 550+ page_categories = dict()
 551+ page_tl_cat = dict()
 552+ cat_winner = dict()
 553+ win_count = dict()
 554+
 555+ """ Get categories for pages - Initialize depth dictionaries for top level categories """
 556+ logging.info('Initializing data structures ...')
 557+ page_categories = self.get_page_categories(page_ids)
 558+ titles = self.get_page_title(page_ids)
 559+
 560+ for page_id in page_ids:
 561+ # page_categories[page_id] = self.get_page_categories(page_id)
 562+ title = titles[page_id]
 563+ depths[title] = dict()
 564+
 565+ """ Initialize dictionaries to store the depth scores for top level categories """
 566+ for category in page_categories[page_id]:
 567+ depths[title][category] = dict()
 568+
 569+ """ Iterate through each page, category, and top level category
 570+ Perform a breadth first search for the node to determine the depth """
 571+ logging.info('Finding category depths in each topic tree ...')
 572+
 573+ for page_id in page_ids:
 574+
 575+ # logging.info('For %s classifying categories: %s...' % (title, str(page_categories[page_id])))
 576+ title = titles[page_id]
 577+ cat_winner[title] = dict()
 578+
 579+ """ Initialize the number of top level categorizations for each top level category """
 580+ win_count[title] = dict()
 581+ for tl_cat in self._top_level_cats_:
 582+ win_count[title][tl_cat] = 0
 583+
 584+ """ Go through each category for a page and find out which top level cat is closest """
 585+ for category in page_categories[page_id]:
395586
 587+ cat_winner[title][category] = list()
 588+ min_depth = self._max_depth_
 589+ for tl_cat in self._top_level_cats_:
 590+
 591+ """ Use shortest paths """
 592+ try:
 593+ depths[title][category][tl_cat] = shortest_paths[tl_cat][category]
 594+ except KeyError:
 595+ depths[title][category][tl_cat] = 99
 596+
 597+ if depths[title][category][tl_cat] < min_depth:
 598+ cat_winner[title][category].append(tl_cat)
 599+ min_depth = depths[title][category][tl_cat]
 600+ elif depths[title][category][tl_cat] == min_depth:
 601+ cat_winner[title][category].append(tl_cat) # a tie - more than one winner is possible
 602+
 603+ """ Randomly choose to tie breakers """
 604+ if len(cat_winner[title][category]) > 0:
 605+ random.shuffle(cat_winner[title][category])
 606+ cat_winner[title][category] = cat_winner[title][category][0]
 607+ else:
 608+ cat_winner[title][category] = None
 609+
 610+ winner = cat_winner[title][category] # this a top level category
 611+ if not(winner == None):
 612+ win_count[title][winner] = win_count[title][winner] + 1
 613+
 614+ """ Classify the top level categories for each page """
 615+ page_tl_cat[title] = None
 616+ best_count = 0
 617+ for tl_cat in self._top_level_cats_:
 618+ if win_count[title][tl_cat] > best_count:
 619+ page_tl_cat[title] = tl_cat
 620+ best_count = win_count[title][tl_cat]
 621+ elif win_count[title][tl_cat] == best_count and best_count > 0:
 622+ page_tl_cat[title] = page_tl_cat[title] + ' / ' + tl_cat
 623+
 624+ return titles, page_tl_cat # , depths, cat_winner
 625+
 626+ """
 627+ Builds a table containing all main namespace pages and their chosen categories
 628+ """
 629+ def determine_all_page_categories(self):
 630+
 631+ sql_create = self._query_names_['create_page_category']
 632+ sql_drop = self._query_names_['drop_page_category']
 633+ sql_insert = self._query_names_['insert_page_category']
 634+
 635+ logging.info('CATEGORIZING PAGES: Initializing tables ... ')
 636+ self.execute_SQL(sql_drop)
 637+ self.execute_SQL(sql_create)
 638+
 639+ logging.info('CATEGORIZING PAGES: Getting all pages ... ')
 640+ sql_get_page_ids = self._query_names_['get_all_page_ids']
 641+ results = self.execute_SQL(sql_get_page_ids)
 642+
 643+ page_ids = list()
 644+ for row in results:
 645+ page_ids.append(int(row[0]))
 646+
 647+ logging.info('CATEGORIZING PAGES: Unshelving shortest paths ... ')
 648+ # shortest_paths = self.unpickle_var('shortest_paths.p')
 649+
 650+ d = shelve.open( settings.__data_file_dir__ + 'shortest_paths.s')
 651+ shortest_paths = d['shortest_paths']
 652+
 653+ logging.info('CATEGORIZING PAGES: Computing categories ... ')
 654+ titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths)
 655+ ids = dict((i,v) for v,i in titles.iteritems())
 656+
 657+ logging.info('CATEGORIZING PAGES: Performing inserts ... ')
 658+ page_id_str = ''
 659+ for title in page_tl_cat:
 660+ id = ids[title]
 661+ category = page_tl_cat[title]
 662+
 663+ parts = title.split("'")
 664+ new_title = parts[0]
 665+ parts = parts[1:]
 666+ for part in parts:
 667+ new_title = new_title + " " + part
 668+
 669+ page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
 670+ try:
 671+ self.execute_SQL(sql_insert % page_id_str)
 672+ except:
 673+ logging.info('Could not insert: %s ... ' % new_title)
 674+ pass
 675+ # page_ids.append(str(row[0]))
 676+ # page_id_str = page_id_str[:-1]
 677+
 678+ #logging.info('CATEGORIZING PAGES: Inserting page ids into rfaulk.page_category ... ')
 679+ #self.execute_SQL(sql_insert % page_id_str)
 680+
 681+ d.close()
 682+
396683 """
397684 Inherits WSORSlaveDataLoader
398685

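Taken together, construct_topic_tree() shelves shortest path lengths from each top level category and find_top_level_category() votes over them per page. A condensed, self-contained sketch of that scheme on a hypothetical mini-graph (the real graph is built from the enwiki category links, and ties in the diff produce a joint 'A / B' label rather than a single winner):

    import random
    import networkx as nx

    top_level = ['Mathematics', 'Music']  # stand-ins for self._top_level_cats_
    g = nx.Graph([('Mathematics', 'Probability'), ('Probability', 'Markov_chains'),
                  ('Music', 'Jazz')])

    # depth of every reachable category below each top level category
    paths = dict((t, nx.single_source_shortest_path_length(g, t)) for t in top_level)

    def classify(page_categories):
        votes = dict((t, 0) for t in top_level)
        for cat in page_categories:
            depths = [(paths[t].get(cat, 99), t) for t in top_level]  # 99 = unreachable
            min_depth = min(depths)[0]
            winners = [t for d, t in depths if d == min_depth]
            votes[random.choice(winners)] += 1   # random tie break, as in the diff
        return max(votes, key=votes.get)         # majority vote across the page's categories

    classify(['Markov_chains', 'Probability'])   # -> 'Mathematics'
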
Status & tagging log