Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py |
— | — | @@ -117,11 +117,13 @@ |
118 | 118 | self._query_names_['get_page_categories'] = "select cl_from, cl_to from enwiki.categorylinks where %s order by 1" |
119 | 119 | self._query_names_['get_all_page_ids'] = "select page_id from enwiki.page where page_namespace = 0 and page_len > 1000" |
120 | 120 | |
121 | | - self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));" |
| 121 | + self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255)," + \ |
| 122 | + "category varbinary(255), category_value varbinary(255));" |
122 | 123 | self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;" |
123 | 124 | self._query_names_['index1_page_category'] = "create index idx_page_id on rfaulk.page_category (page_id);" |
124 | 125 | self._query_names_['index2_page_category'] = "create index idx_page_title on rfaulk.page_category (page_title);" |
125 | 126 | self._query_names_['index3_page_category'] = "create index idx_category on rfaulk.page_category (category);" |
| 127 | + self._query_names_['index4_page_category'] = "create index idx_category_value on rfaulk.page_category (category_value);" |
126 | 128 | self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;" |
127 | 129 | |
128 | 130 | self._regexp_list_ = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]', '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]', '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', \ |
— | — | @@ -129,10 +131,17 @@ |
130 | 132 | |
131 | 133 | self._max_depth_ = 50 |
132 | 134 | self._main_topic_ = 'Main_topic_classifications' |
133 | | - self._top_level_cats_ = subcategories[self._main_topic_][:] |
| 135 | + self._top_level_cats_ = subcategories[self._main_topic_][:]
| 136 | + |
134 | 137 | self._top_level_cats_.remove('Chronology') |
| 138 | + self._top_level_cats_.remove('Geography') |
| 139 | + self._top_level_cats_.remove('Nature') |
| 140 | + self._top_level_cats_.remove('Agriculture') |
| 141 | + self._top_level_cats_.remove('Applied_sciences') |
135 | 142 | |
| 143 | + self._top_level_cats_.append('Places') |
136 | 144 | |
| 145 | + |
137 | 146 | #self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'People', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places'] |
138 | 147 | # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports'] |
139 | 148 | self._block_words_ = ['categories', 'Categories', 'topic', 'Topic'] |
— | — | @@ -587,10 +596,10 @@ |
588 | 597 | rank_categories_M2() |
589 | 598 | """ |
590 | 599 | title = titles[page_id] |
591 | | - page_tl_cat[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts) |
| 600 | + page_tl_cat[title], tl_cat_vectors[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts) |
592 | 601 | |
593 | 602 | |
594 | | - return titles, page_tl_cat #, tl_cat_vectors # , depths, cat_winner |
| 603 | + return titles, page_tl_cat, tl_cat_vectors # , depths, cat_winner |
595 | 604 | |
596 | 605 | """ |
597 | 606 | Method for determining top level category for a set of categories |
— | — | @@ -693,7 +702,7 @@ |
694 | 703 | |
695 | 704 | """ The total weight of this category depends on the product of how far it lies from the root and the inverse of its fanout |
696 | 705 | this makes more specialized categories worth more """ |
697 | | - category_weight = float(path_length_from_main) * fanout_weight |
| 706 | + category_weight = float(path_length_from_main) # * fanout_weight |
698 | 707 | |
699 | 708 | # category_weight = 1 / float(path_length_from_main) * self._max_depth_ |
700 | 709 | |
— | — | @@ -727,7 +736,7 @@ |
728 | 737 | top_five_cats = top_five_cats[:-2] |
729 | 738 | page_tl_cat = top_five_cats |
730 | 739 | |
731 | | - return page_tl_cat |
| 740 | + return page_tl_cat, tl_cat_vectors |
732 | 741 | |
733 | 742 | """ |
734 | 743 | Builds a table containing all main namespace pages and their chosen categories |
— | — | @@ -766,7 +775,7 @@ |
767 | 776 | page_ids.append(int(row[0])) |
768 | 777 | |
769 | 778 | logging.info('CATEGORIZING PAGES: Computing categories ... ') |
770 | | - titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths, topic_counts) |
| 779 | + titles, page_tl_cat, cat_vector = self.find_top_level_category(page_ids, shortest_paths, topic_counts) |
771 | 780 | ids = dict((i,v) for v,i in titles.iteritems()) |
772 | 781 | |
773 | 782 | logging.info('CATEGORIZING PAGES: Performing inserts ... ') |
— | — | @@ -774,14 +783,20 @@ |
775 | 784 | for title in page_tl_cat: |
776 | 785 | id = ids[title] |
777 | 786 | category = page_tl_cat[title] |
| 787 | + vector = cat_vector[title] |
778 | 788 | |
779 | 789 | parts = title.split("'") |
780 | 790 | new_title = parts[0] |
781 | 791 | parts = parts[1:] |
782 | 792 | for part in parts: |
783 | 793 | new_title = new_title + " " + part |
784 | | - |
785 | | - page_id_str = "(%s,'%s','%s')" % (id, new_title, category) |
| 794 | + |
| 795 | + vector_str = '' |
| 796 | + for elem in vector: |
| 797 | + vector_str = vector_str + '%1.7s ' % elem |
| 798 | + vector_str = vector_str[:-1] |
| 799 | + |
| 800 | + page_id_str = "(%s,'%s','%s', '%s')" % (id, new_title, category, vector_str) |
786 | 801 | try: |
787 | 802 | self.execute_SQL(sql_insert % page_id_str) |
788 | 803 | except: |