Index: trunk/tools/wsor/scripts/classes/WSORSlaveDataLoader.py |
— | — | @@ -12,7 +12,7 @@ |
13 | 13 | |
14 | 14 | |
15 | 15 | """ Import python base modules """ |
16 | | -import sys, getopt, re, datetime, logging, MySQLdb, settings |
| 16 | +import sys, getopt, re, datetime, logging, MySQLdb, settings, operator |
17 | 17 | import networkx as nx |
18 | 18 | |
19 | 19 | """ Import Analytics modules """ |
— | — | @@ -91,13 +91,13 @@ |
92 | 92 | self._query_names_['get_subcategories'] = "select cl_to from categorylinks_cp where cl_from = %s" |
93 | 93 | self._query_names_['delete_from_recs'] = "delete from rfaulk.categorylinks_cp where cl_from = %s" |
94 | 94 | self._query_names_['is_empty'] = "select * from rfaulk.categorylinks_cp limit 1" |
95 | | - self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 100" |
| 95 | + self._query_names_['get_category_links'] = "select cl_from, cl_to from categorylinks_cp limit 10000" |
96 | 96 | |
97 | 97 | WSORSlaveDataLoader.__init__(self) |
98 | 98 | logging.info('Creating CategoryLoader') |
99 | 99 | |
100 | 100 | """ |
101 | | - |
| 101 | + Retrieves rows from the category links table (the 'get_category_links' query is capped at 10000 rows) |
102 | 102 | """ |
103 | 103 | def get_category_links(self): |
104 | 104 | |
— | — | @@ -236,8 +236,8 @@ |
237 | 237 | """ |
238 | 238 | def extract_hierarchy(self): |
239 | 239 | |
240 | | - #self.drop_category_links_cp_table() |
241 | | - #self.create_category_links_cp_table() |
| 240 | + self.drop_category_links_cp_table() |
| 241 | + self.create_category_links_cp_table() |
242 | 242 | |
243 | 243 | """ Create graph """ |
244 | 244 | logging.info('Initializing directed graph...') |
— | — | @@ -256,24 +256,83 @@ |
257 | 257 | links = self.get_category_links() |
258 | 258 | count = 0 |
259 | 259 | |
| 260 | + out_degrees = dict() |
| 261 | + in_degrees = dict() |
| 262 | + subcategories = dict() |
| 263 | + |
| 264 | + """ Process subcategory links """ |
260 | 265 | for row in links: |
261 | 266 | |
262 | 267 | cl_from = int(row[0]) |
263 | 268 | cl_to = str(row[1]) |
264 | 269 | cl_from = self.get_page_title(cl_from) |
| 270 | + |
| 271 | + try: |
| 272 | + subcategories[cl_from].append(cl_to) |
265 | 273 | |
| 274 | + except KeyError: |
| 275 | + subcategories[cl_from] = list() |
| 276 | + subcategories[cl_from].append(cl_to) |
| 277 | + |
| 278 | + try: |
| 279 | + out_degrees[cl_from] = out_degrees[cl_from] + 1 |
| 280 | + except KeyError: |
| 281 | + out_degrees[cl_from] = 1 |
| 282 | + |
| 283 | + try: |
| 284 | + in_degrees[cl_to] = in_degrees[cl_to] + 1 |
| 285 | + except KeyError: |
| 286 | + in_degrees[cl_to] = 1 |
| 287 | + |
266 | 288 | directed_graph.add_weighted_edges_from([(cl_from, cl_to, 1)]) |
267 | 289 | |
268 | | - if self.__DEBUG__: |
| 290 | + if self.__DEBUG__ and count % 1000 == 0: |
269 | 291 | |
270 | 292 | logging.debug('%s: %s -> %s' % (str(count), cl_from, cl_to)) |
271 | | - count = count + 1 |
| 293 | + |
| 294 | + count = count + 1 |
272 | 295 | |
| 296 | + logging.info('Sorting in degree list.') |
| 297 | + sorted_in_degrees = sorted(in_degrees.iteritems(), key=operator.itemgetter(1), reverse=True) |
| 298 | + logging.info('Sorting out degree list.') |
| 299 | + sorted_out_degrees = sorted(out_degrees.iteritems(), key=operator.itemgetter(1), reverse=True) |
| 300 | + |
| 301 | + in_only, out_only = self.get_uni_directionally_linked_categories(sorted_in_degrees, sorted_out_degrees) |
| 302 | + |
273 | 303 | logging.info('Category links finished processing.') |
274 | 304 | |
275 | | - return directed_graph |
| 305 | + return directed_graph, in_degrees, out_degrees, sorted_in_degrees, sorted_out_degrees, subcategories, in_only, out_only |
276 | 306 | |
277 | 307 | |
def get_uni_directionally_linked_categories(self, in_degrees, out_degrees):
    """
    Find the categories that appear on only one side of the link data.

    Arguments:
        in_degrees  -- sequence of (category, degree) pairs; only element 0
                       (the category key) is inspected here
        out_degrees -- sequence of (category, degree) pairs, same layout

    Returns a tuple (in_only, out_only):
        in_only  -- the pairs from in_degrees whose category never occurs
                    as a key in out_degrees
        out_only -- the pairs from out_degrees whose category never occurs
                    as a key in in_degrees

    Both result lists keep the pairs in the order they had in the input.
    """

    logging.info('Generating lists of categories have either only in degrees or out degrees.')

    # Collect the keys into sets so each membership test below is O(1);
    # scanning a plain list per element makes the whole pass quadratic.
    # NOTE(review): this requires the category keys to be hashable -
    # presumably they are, since the caller builds these pairs from dicts.
    in_keys = set(pair[0] for pair in in_degrees)
    out_keys = set(pair[0] for pair in out_degrees)

    in_only = [pair for pair in in_degrees if pair[0] not in out_keys]
    out_only = [pair for pair in out_degrees if pair[0] not in in_keys]

    return in_only, out_only
| 335 | + |
| 336 | + |
278 | 337 | """ drop rfaulk.categorylinks_cp """ |
279 | 338 | def drop_category_links_cp_table(self): |
280 | 339 | |