Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py |
— | — | @@ -117,11 +117,13 @@ |
118 | 118 | self._query_names_['get_page_categories'] = "select cl_from, cl_to from enwiki.categorylinks where %s order by 1" |
119 | 119 | self._query_names_['get_all_page_ids'] = "select page_id from enwiki.page where page_namespace = 0 and page_len > 1000" |
120 | 120 | |
121 | | - self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255), category varbinary(255));" |
| 121 | + self._query_names_['create_page_category'] = "create table rfaulk.page_category (page_id int(8) unsigned, page_title varbinary(255)," + \ |
| 122 | + "category varbinary(255), category_value varbinary(255));" |
122 | 123 | self._query_names_['drop_page_category'] = "drop table if exists rfaulk.page_category;" |
123 | 124 | self._query_names_['index1_page_category'] = "create index idx_page_id on rfaulk.page_category (page_id);" |
124 | 125 | self._query_names_['index2_page_category'] = "create index idx_page_title on rfaulk.page_category (page_title);" |
125 | 126 | self._query_names_['index3_page_category'] = "create index idx_category on rfaulk.page_category (category);" |
| 127 | + self._query_names_['index4_page_category'] = "create index idx_category_value on rfaulk.page_category (category_value);" |
126 | 128 | self._query_names_['insert_page_category'] = "insert into rfaulk.page_category values %s;" |
127 | 129 | |
128 | 130 | self._regexp_list_ = ['^[Aa]', '^[Bb]', '^[Cc]', '^[Dd]', '^[Ee]', '^[Ff]', '^[Gg]', '^[Hh]', '^[Ii]', '^[Jj]', '^[Kk]', '^[Ll]', '^[Mm]', '^[Nn]', '^[Oo]', '^[Pp]', '^[Qq]', '^[Rr]', \ |
— | — | @@ -129,10 +131,17 @@ |
130 | 132 | |
131 | 133 | self._max_depth_ = 50 |
132 | 134 | self._main_topic_ = 'Main_topic_classifications' |
133 | | - self._top_level_cats_ = subcategories[self._main_topic_][:] |
| 135 | + self._top_level_cats_ = subcategories[self._main_topic_][:]
| 136 | + |
134 | 137 | self._top_level_cats_.remove('Chronology') |
| 138 | + self._top_level_cats_.remove('Geography') |
| 139 | + self._top_level_cats_.remove('Nature') |
| 140 | + self._top_level_cats_.remove('Agriculture') |
| 141 | + self._top_level_cats_.remove('Applied_sciences') |
135 | 142 | |
| 143 | + self._top_level_cats_.append('Places') |
136 | 144 | |
| 145 | + |
137 | 146 | #self._top_level_cats_ = ['Natural_sciences', 'Applied_sciences', 'Mathematics', 'Literature', 'Visual_arts', 'Social_sciences', 'Film', 'Music', 'Television', 'People', 'Religion', 'Culture', 'Philosophy', 'Sports', 'Places'] |
138 | 147 | # self._top_level_cats_ = ['Natural_sciences', 'Mathematics', 'Arts', 'Social_sciences', 'Entertainment', 'Biography', 'Religion', 'Culture', 'Philosophy', 'Sports'] |
139 | 148 | self._block_words_ = ['categories', 'Categories', 'topic', 'Topic'] |
— | — | @@ -587,10 +596,10 @@ |
588 | 597 | rank_categories_M2() |
589 | 598 | """ |
590 | 599 | title = titles[page_id] |
591 | | - page_tl_cat[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts) |
| 600 | + page_tl_cat[title], tl_cat_vectors[title] = self.rank_categories_M2(page_categories[page_id], shortest_paths, topic_counts) |
592 | 601 | |
593 | 602 | |
594 | | - return titles, page_tl_cat #, tl_cat_vectors # , depths, cat_winner |
| 603 | + return titles, page_tl_cat, tl_cat_vectors # , depths, cat_winner |
595 | 604 | |
596 | 605 | """ |
597 | 606 | Method for determining top level category for a set of categories |
— | — | @@ -693,7 +702,7 @@ |
694 | 703 | |
695 | 704 | """ The total weight of this category depends on the product of how far it lies from the root and the inverse of its fanout |
696 | 705 | this makes more specialized categories worth more """ |
697 | | - category_weight = float(path_length_from_main) * fanout_weight |
| 706 | + category_weight = float(path_length_from_main) # * fanout_weight |
698 | 707 | |
699 | 708 | # category_weight = 1 / float(path_length_from_main) * self._max_depth_ |
700 | 709 | |
— | — | @@ -727,7 +736,7 @@ |
728 | 737 | top_five_cats = top_five_cats[:-2] |
729 | 738 | page_tl_cat = top_five_cats |
730 | 739 | |
731 | | - return page_tl_cat |
| 740 | + return page_tl_cat, tl_cat_vectors |
732 | 741 | |
733 | 742 | """ |
734 | 743 | Builds a table containing all main namespace pages and their chosen categories |
— | — | @@ -766,7 +775,7 @@ |
767 | 776 | page_ids.append(int(row[0])) |
768 | 777 | |
769 | 778 | logging.info('CATEGORIZING PAGES: Computing categories ... ') |
770 | | - titles, page_tl_cat = self.find_top_level_category(page_ids, shortest_paths, topic_counts) |
| 779 | + titles, page_tl_cat, cat_vector = self.find_top_level_category(page_ids, shortest_paths, topic_counts) |
771 | 780 | ids = dict((i,v) for v,i in titles.iteritems()) |
772 | 781 | |
773 | 782 | logging.info('CATEGORIZING PAGES: Performing inserts ... ') |
— | — | @@ -774,14 +783,20 @@ |
775 | 784 | for title in page_tl_cat: |
776 | 785 | id = ids[title] |
777 | 786 | category = page_tl_cat[title] |
| 787 | + vector = cat_vector[title] |
778 | 788 | |
779 | 789 | parts = title.split("'") |
780 | 790 | new_title = parts[0] |
781 | 791 | parts = parts[1:] |
782 | 792 | for part in parts: |
783 | 793 | new_title = new_title + " " + part |
784 | | - |
785 | | - page_id_str = "(%s,'%s','%s')" % (id, new_title, category) |
| 794 | + |
| 795 | + vector_str = '' |
| 796 | + for elem in vector: |
| 797 | + vector_str = vector_str + '%1.7s ' % elem |
| 798 | + vector_str = vector_str[:-1] |
| 799 | + |
| 800 | + page_id_str = "(%s,'%s','%s', '%s')" % (id, new_title, category, vector_str) |
786 | 801 | try: |
787 | 802 | self.execute_SQL(sql_insert % page_id_str) |
788 | 803 | except: |