r107362 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107361‎ | r107362 | r107363 >
Date:06:36, 27 December 2011
Author:rfaulk
Status:deferred
Tags:
Comment:
Modified the top-level category definition.
Return the category vectors.
Modified paths:
  • /trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py (modified) (history)

Diff [purge]

Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py
@@ -117,11 +117,13 @@
118118 self._query_names_['get_page_categories'] = "select cl_from, cl_to from enwiki.categorylinks where %s order by 1"
119119 self._query_names_['get_all_page_ids'] = "select page_id from enwiki.page where page_namespace = 0 and page_len > 1000"
120120
121 - self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));"
 121+ self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255)," + \
 122+ "category varbinary(255), category_value varbinary(255));"
122123 self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;"
123124 self._query_names_['index1_page_category'] = "create index idx_page_id on rfaulk.page_category (page_id);"
124125 self._query_names_['index2_page_category'] = "create index idx_page_title on rfaulk.page_category (page_title);"
125126 self._query_names_['index3_page_category'] = "create index idx_category on rfaulk.page_category (category);"
 127+ self._query_names_['index3_page_category'] = "create index idx_category_value on rfaulk.page_category (category_value);"
126128 self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;"
127129
128130 self._regexp_list_ = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]', '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]', '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', \
@@ -129,10 +131,17 @@
130132
131133 self._max_depth_ = 50
132134 self._main_topic_ = 'Main_topic_classifications'
133 - self._top_level_cats_ = subcategories[self._main_topic_][:]
 135+ self._top_level_cats_ = subcategories[self._main_topic_]
 136+
134137 self._top_level_cats_.remove('Chronology')
 138+ self._top_level_cats_.remove('Geography')
 139+ self._top_level_cats_.remove('Nature')
 140+ self._top_level_cats_.remove('Agriculture')
 141+ self._top_level_cats_.remove('Applied_sciences')
135142
 143+ self._top_level_cats_.append('Places')
136144
 145+
137146 #self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'People', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places']
138147 # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports']
139148 self._block_words_ = ['categories', 'Categories', 'topic', 'Topic']
@@ -587,10 +596,10 @@
588597 rank_categories_M2()
589598 """
590599 title = titles[page_id]
591 - page_tl_cat[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts)
 600+ page_tl_cat[title], tl_cat_vectors[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts)
592601
593602
594 - return titles, page_tl_cat #, tl_cat_vectors # , depths, cat_winner
 603+ return titles, page_tl_cat, tl_cat_vectors # , depths, cat_winner
595604
596605 """
597606 Method for determining top level category for a set of categories
@@ -693,7 +702,7 @@
694703
695704 """ The total weight of this category depends on the product of how far it lies from the root and the inverse of its fanout
696705 this makes more specialized categories worth more """
697 - category_weight = float(path_length_from_main) * fanout_weight
 706+ category_weight = float(path_length_from_main) # * fanout_weight
698707
699708 # category_weight = 1 / float(path_length_from_main) * self._max_depth_
700709
@@ -727,7 +736,7 @@
728737 top_five_cats = top_five_cats[:-2]
729738 page_tl_cat = top_five_cats
730739
731 - return page_tl_cat
 740+ return page_tl_cat, tl_cat_vectors
732741
733742 """
734743 Builds a table containing all main namespace pages and their chosen categories
@@ -766,7 +775,7 @@
767776 page_ids.append(int(row[0]))
768777
769778 logging.info('CATEGORIZING PAGES: Computing categories ... ')
770 - titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths, topic_counts)
 779+ titles, page_tl_cat, cat_vector = self.find_top_level_category(page_ids, shortest_paths, topic_counts)
771780 ids = dict((i,v) for v,i in titles.iteritems())
772781
773782 logging.info('CATEGORIZING PAGES: Performing inserts ... ')
@@ -774,14 +783,20 @@
775784 for title in page_tl_cat:
776785 id = ids[title]
777786 category = page_tl_cat[title]
 787+ vector = cat_vector[title]
778788
779789 parts = title.split("'")
780790 new_title = parts[0]
781791 parts = parts[1:]
782792 for part in parts:
783793 new_title = new_title + " " + part
784 -
785 - page_id_str = "(%s,'%s','%s')" % (id, new_title, category)
 794+
 795+ vector_str = ''
 796+ for elem in vector:
 797+ vector_str = vector_str + '%1.7s ' % elem
 798+ vector_str = vector_str[:-1]
 799+
 800+ page_id_str = "(%s,'%s','%s', '%s')" % (id, new_title, category, vector_str)
786801 try:
787802 self.execute_SQL(sql_insert % page_id_str)
788803 except:

Status & tagging log