r84872 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84871 | r84872 | r84873 >
Date:18:46, 27 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Removed code dependencies.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -193,7 +193,7 @@
194194
195195
196196 def extract_revision_text(revision):
197 - rev = revision.find('text')
 197+ rev = revision.find('ns0:text')
198198 if rev != None:
199199 if rev.text == None:
200200 rev = fix_revision_text(revision)
@@ -202,6 +202,53 @@
203203 return ''
204204
205205
 206+def extract_username(contributor):
 207+ contributor = contributor.find('ns0:username')
 208+ if contributor != None:
 209+ return contributor.text
 210+ else:
 211+ return None
 212+
 213+
 214+def determine_username_is_bot(contributor, bots):
 215+ '''
 216+ #contributor is an xml element containing the id of the contributor
 217+ @bots should have a dict with all the bot ids and bot names
 218+ @Return False if username id is not in bot dict id or True if username id
 219+ is a bot id.
 220+ '''
 221+ username = contributor.find('ns0:username')
 222+ if username == None:
 223+ return 0
 224+ else:
 225+ if username.text in bots:
 226+ return 1
 227+ else:
 228+ return 0
 229+
 230+
 231+def extract_contributor_id(contributor):
 232+ '''
 233+ @contributor is the xml contributor node containing a number of attributes
 234+ Currently, we are only interested in registered contributors, hence we
 235+ ignore anonymous editors.
 236+ '''
 237+ if contributor.get('deleted'):
 238+ # ASK: Not sure if this is the best way to code deleted contributors.
 239+ return None
 240+ elem = contributor.find('ns0:id')
 241+ if elem != None:
 242+ return {'id':elem.text}
 243+ else:
 244+ elem = contributor.find('ns0:ip')
 245+ if elem != None and elem.text != None \
 246+ and validate_ip(elem.text) == False \
 247+ and validate_hostname(elem.text) == False:
 248+ return {'username':elem.text, 'id': elem.text}
 249+ else:
 250+ return None
 251+
 252+
206253 def fix_revision_text(revision):
207254 if revision.text == None:
208255 revision.text = ''
@@ -234,9 +281,9 @@
235282
236283
237284 def parse_contributor(contributor, bots):
238 - username = extracter.extract_username(contributor)
239 - user_id = extracter.extract_contributor_id(contributor)
240 - bot = extracter.determine_username_is_bot(contributor, bots=bots)
 285+ username = extract_username(contributor)
 286+ user_id = extract_contributor_id(contributor)
 287+ bot = determine_username_is_bot(contributor, bots)
241288 contributor.clear()
242289 editor = {}
243290 editor['username'] = username
@@ -310,7 +357,7 @@
311358 #the entire revision is empty, weird.
312359 continue
313360 dump(revision)
314 - contributor = revision.find('contributor')
 361+ contributor = revision.find('ns0:contributor')
315362 contributor = parse_contributor(contributor, bots)
316363 if not contributor:
317364 #editor is anonymous, ignore