Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -216,4 +216,4 @@ |
217 | 217 | if namespaces != None: |
218 | 218 | return namespaces.split(',') |
219 | 219 | else: |
220 | | - return namespaces |
| 220 | + return ['0'] #Assume that the mainspace is of interest |
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -66,11 +66,12 @@ |
67 | 67 | @include is a list of namespace keys that should not be ignored; the default |
68 | 68 | setting is to ignore all namespaces except the main namespace. |
69 | 69 | ''' |
70 | | - ns = [] |
| 70 | + ns = {} |
71 | 71 | for key, value in namespaces.iteritems(): |
72 | | - if key not in include: |
| 72 | + if key in include: |
73 | 73 | #value = namespaces[namespace].get(u'*', None) |
74 | | - ns.append(value) |
| 74 | + #ns.append(value) |
| 75 | + ns[key] = value |
75 | 76 | return ns |
76 | 77 | |
77 | 78 | |
— | — | @@ -82,19 +83,28 @@ |
83 | 84 | return revisions |
84 | 85 | |
85 | 86 | |
86 | | -def verify_article_belongs_namespace(elem, namespaces): |
| 87 | +def parse_article(elem, namespaces): |
87 | 88 | ''' |
88 | | - @namespaces is a list of namespaces that should be ignored, hence if the |
89 | | - title of article starts with the namespace then return False else return |
90 | | - True |
| 89 | + @namespaces is a list of valid namespaces that should be included in the analysis. |
| 90 | + If the article should be ignored then this function returns False; otherwise it returns |
| 91 | + a dictionary holding the namespace identifier ('id') and namespace name ('name'). |
91 | 92 | ''' |
92 | 93 | title = elem.text |
93 | 94 | if title == None: |
94 | 95 | return False |
95 | | - for namespace in namespaces: |
96 | | - if title.startswith(namespace): |
| 96 | + ns = title.split(':') |
| 97 | + if len(ns) ==1 and '0' in namespaces: |
| 98 | + return {'id': 0, 'name': 'main namespace'} |
| 99 | + else: |
| 100 | + if ns[0] in namespaces: |
| 101 | + return {'id': ns[0], 'name': ns[1]} |
| 102 | + else: |
97 | 103 | return False |
98 | | - return True |
| 104 | + |
| 105 | +# for namespace in namespaces: |
| 106 | +# if title.startswith(namespace): |
| 107 | +# return False |
| 108 | +# return True |
99 | 109 | |
100 | 110 | |
101 | 111 | def validate_hostname(address): |
— | — | @@ -263,12 +273,15 @@ |
264 | 274 | for page, article_size in wikitree.parser.read_input(fh1): |
265 | 275 | title = page.find('title') |
266 | 276 | total += 1 |
267 | | - if verify_article_belongs_namespace(title, ns): |
| 277 | + namespace = parse_article(title, ns) |
| 278 | + if namespace != False: |
| 279 | + #if verify_article_belongs_namespace(title, ns): |
268 | 280 | article_id = page.find('id').text |
269 | 281 | title = page.find('title').text |
270 | 282 | revisions = page.findall('revision') |
271 | 283 | revisions = parse_comments(revisions, remove_numeric_character_references) |
272 | 284 | output = output_editor_information(revisions, article_id, bot_ids) |
| 285 | + output = [o.append(namespace['id'] for o in output)] |
273 | 286 | write_output(output, filehandles, lock) |
274 | 287 | file_utils.write_list_to_csv([article_id, title], fh2) |
275 | 288 | processed += 1 |