Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py |
— | — | @@ -81,6 +81,8 @@ |
82 | 82 | |
83 | 83 | def __init__(self): |
84 | 84 | |
| 85 | + self.__DEBUG__ = True |
| 86 | + |
85 | 87 | self._query_names_['build_subcat_tbl'] = "CREATE TABLE rfaulk.categorylinks_cp select * from enwiki.categorylinks where cl_type = 'subcat'" |
86 | 88 | self._query_names_['drop_subcat_tbl'] = "drop table if exists rfaulk.categorylinks_cp;" |
87 | 89 | self._query_names_['get_first_rec'] = "select cl_from from categorylinks_cp limit 1" |
— | — | @@ -89,11 +91,31 @@ |
90 | 92 | self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s" |
91 | 93 | self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s" |
92 | 94 | self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1" |
| 95 | + self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 100" |
93 | 96 | |
94 | 97 | WSORSlaveDataLoader.__init__(self) |
95 | 98 | logging.info('Creating CategoryLoader') |
96 | 99 | |
97 | 100 | """ |
| 101 | + |
| 102 | + """ |
| 103 | + def get_category_links(self): |
| 104 | + |
| 105 | + try: |
| 106 | + sql = self._query_names_['get_category_links'] |
| 107 | + logging.info('Executing: ' + sql) |
| 108 | + results = self.execute_SQL(sql) |
| 109 | + |
| 110 | + except: |
| 111 | + |
| 112 | + logging.error('Could not retrieve page_id.') |
| 113 | + return -1 |
| 114 | + |
| 115 | + return results |
| 116 | + |
| 117 | + |
| 118 | + |
| 119 | + """ |
98 | 120 | Retrieves the integer page id |
99 | 121 | """ |
100 | 122 | def get_page_id(self, page_title): |
— | — | @@ -209,6 +231,8 @@ |
210 | 232 | |
211 | 233 | """ |
212 | 234 | Execution entry point of the class - builds a full category hierarchy from categorylinks |
| 235 | + |
| 236 | + CURRENTLY THE EDGES ARE PROCESSED IN A NON-RECURSIVE WAY, this is much faster |
213 | 237 | """ |
214 | 238 | def extract_hierarchy(self): |
215 | 239 | |
— | — | @@ -220,12 +244,31 @@ |
221 | 245 | directed_graph = nx.DiGraph() |
222 | 246 | |
223 | 247 | """ while there are rows left in categorylinks_cp """ |
| 248 | + |
| 249 | + """ |
224 | 250 | while(not self.is_empty()): |
225 | | - |
| 251 | + |
226 | 252 | category_title = self.get_first_record_from_category_links() |
227 | 253 | self.build_category_tree(directed_graph, category_title) |
228 | 254 | directed_graph.add_weighted_edges_from([('ALL', category_title, 1)]) |
| 255 | + """ |
229 | 256 | |
| 257 | + links = self.get_category_links() |
| 258 | + count = 0 |
| 259 | + |
| 260 | + for row in links: |
| 261 | + |
| 262 | + cl_from = int(row[0]) |
| 263 | + cl_to = str(row[1]) |
| 264 | + cl_from = self.get_page_title(cl_from) |
| 265 | + |
| 266 | + directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)]) |
| 267 | + |
| 268 | + if self.__DEBUG__: |
| 269 | + |
| 270 | + logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to)) |
| 271 | + count = count + 1 |
| 272 | + |
230 | 273 | logging.info('Category links finished processing.') |
231 | 274 | |
232 | 275 | return directed_graph |
— | — | @@ -293,33 +336,8 @@ |
294 | 337 | else: |
295 | 338 | return True |
296 | 339 | |
297 | | - """ |
298 | | - The cl_from key is formatted in uppercase with non-uniform whitespace |
299 | 340 | |
300 | | - def normalize_field_cl_from(self, category): |
301 | 341 | |
302 | | - category = category.lower() |
303 | | - words = category.split('\n')[0] # only keep text before the a carraige return |
304 | | - words = words.split() |
305 | | - len_words = len(words) |
306 | | - |
307 | | - category = '' |
308 | | - category_camel = '' |
309 | | - |
310 | | - |
311 | | - for i in range(len_words - 1): |
312 | | - category = category + words[i] + ' ' |
313 | | - category_camel = category_camel + words[i][0].upper() + words[i][1:] + ' ' |
314 | | - |
315 | | - category = category + words[len_words - 1] |
316 | | - category_camel = category_camel + words[len_words - 1][0].upper() + words[len_words - 1][1:] |
317 | | - |
318 | | - category_upper = category.upper() |
319 | | - category_lower = category.lower() |
320 | | - |
321 | | - return category_upper, category_lower, category_camel |
322 | | - """ |
323 | | - |
324 | 342 | """ |
325 | 343 | Inherits WSORSlaveDataLoader |
326 | 344 | |