r81316 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r81315 | r81316 | r81317 >
Date: 19:57, 1 February 2011
Author: diederik
Status: deferred
Tags:
Comment:
The extract phase is now also storing the namespace identifier.
Modified paths:
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -216,4 +216,4 @@
217217 if namespaces != None:
218218 return namespaces.split(',')
219219 else:
220 - return namespaces
 220+ return ['0'] #Assume that the mainspace is of interest
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -66,11 +66,12 @@
6767 @include is a list of namespace keys that should not be ignored, the default
6868 setting is to ignore all namespaces except the main namespace.
6969 '''
70 - ns = []
 70+ ns = {}
7171 for key, value in namespaces.iteritems():
72 - if key not in include:
 72+ if key in include:
7373 #value = namespaces[namespace].get(u'*', None)
74 - ns.append(value)
 74+ #ns.append(value)
 75+ ns[key] = value
7576 return ns
7677
7778
@@ -82,19 +83,28 @@
8384 return revisions
8485
8586
86 -def verify_article_belongs_namespace(elem, namespaces):
 87+def parse_article(elem, namespaces):
8788 '''
88 - @namespaces is a list of namespaces that should be ignored, hence if the
89 - title of article starts with the namespace then return False else return
90 - True
 89+ @namespaces is a list of valid namespaces that should be included in the analysis
 90+ if the article should be ignored then this function returns false, else it returns
 91+ the namespace identifier and namespace name.
9192 '''
9293 title = elem.text
9394 if title == None:
9495 return False
95 - for namespace in namespaces:
96 - if title.startswith(namespace):
 96+ ns = title.split(':')
 97+ if len(ns) ==1 and '0' in namespaces:
 98+ return {'id': 0, 'name': 'main namespace'}
 99+ else:
 100+ if ns[0] in namespaces:
 101+ return {'id': ns[0], 'name': ns[1]}
 102+ else:
97103 return False
98 - return True
 104+
 105+# for namespace in namespaces:
 106+# if title.startswith(namespace):
 107+# return False
 108+# return True
99109
100110
101111 def validate_hostname(address):
@@ -263,12 +273,15 @@
264274 for page, article_size in wikitree.parser.read_input(fh1):
265275 title = page.find('title')
266276 total += 1
267 - if verify_article_belongs_namespace(title, ns):
 277+ namespace = parse_article(title, ns)
 278+ if namespace != False:
 279+ #if verify_article_belongs_namespace(title, ns):
268280 article_id = page.find('id').text
269281 title = page.find('title').text
270282 revisions = page.findall('revision')
271283 revisions = parse_comments(revisions, remove_numeric_character_references)
272284 output = output_editor_information(revisions, article_id, bot_ids)
 285+ output = [o.append(namespace['id'] for o in output)]
273286 write_output(output, filehandles, lock)
274287 file_utils.write_list_to_csv([article_id, title], fh2)
275288 processed += 1

Status & tagging log