Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -193,7 +193,7 @@ |
194 | 194 | |
195 | 195 | |
196 | 196 | def extract_revision_text(revision): |
197 | | - rev = revision.find('text') |
| 197 | + rev = revision.find('ns0:text') |
198 | 198 | if rev != None: |
199 | 199 | if rev.text == None: |
200 | 200 | rev = fix_revision_text(revision) |
— | — | @@ -202,6 +202,53 @@ |
203 | 203 | return '' |
204 | 204 | |
205 | 205 | |
def extract_username(contributor):
    '''
    Return the text of the ns0:username child of a contributor element.

    @contributor is the xml contributor node, or None when the revision has
    no contributor element.
    @Return the username text, or None when @contributor is None or has no
    ns0:username child.
    '''
    if contributor is None:
        return None
    username = contributor.find('ns0:username')
    # Identity test on purpose: ElementTree elements without children
    # evaluate as false, so a plain truth test would drop valid usernames.
    if username is None:
        return None
    return username.text
| 212 | + |
| 213 | + |
def determine_username_is_bot(contributor, bots):
    '''
    Flag whether the username of a contributor is listed in @bots.

    @contributor is the xml contributor node; only its ns0:username child is
    inspected.
    @bots is a container supporting the `in` operator. Membership is tested
    with the username text (presumably a dict keyed by bot name -- verify
    against the caller).
    @Return 1 if the username text is in @bots, otherwise 0. Also 0 when
    @contributor is None or has no ns0:username child.
    '''
    if contributor is None:
        return 0
    username = contributor.find('ns0:username')
    if username is None:
        return 0
    # The result stays an int (1/0) rather than a bool so that existing
    # callers that store or serialise the flag keep seeing the same values.
    return 1 if username.text in bots else 0
| 229 | + |
| 230 | + |
def extract_contributor_id(contributor):
    '''
    Determine the identifier of a contributor.

    @contributor is the xml contributor node, or None when the revision has
    no contributor element.
    @Return
      - None if @contributor is None or carries a truthy 'deleted' attribute;
      - {'id': <text>} if an ns0:id child exists;
      - {'username': <text>, 'id': <text>} if there is no ns0:id child but
        an ns0:ip child whose text is rejected by both validate_ip and
        validate_hostname (i.e. the ip field does not hold an address);
      - None in every other case, which includes anonymous editors whose
        ns0:ip text is a valid ip address or hostname.
    '''
    if contributor is None:
        return None
    if contributor.get('deleted'):
        # ASK: Not sure if this is the best way to code deleted contributors.
        return None

    elem = contributor.find('ns0:id')
    if elem is not None:
        return {'id': elem.text}

    elem = contributor.find('ns0:ip')
    if elem is None or elem.text is None:
        return None
    # NOTE(review): `== False` is kept deliberately. The validators are not
    # visible here; if they can return None, `not validate_ip(...)` would
    # change which contributors are accepted.
    if validate_ip(elem.text) == False \
            and validate_hostname(elem.text) == False:
        return {'username': elem.text, 'id': elem.text}
    return None
| 251 | + |
| 252 | + |
206 | 253 | def fix_revision_text(revision): |
207 | 254 | if revision.text == None: |
208 | 255 | revision.text = '' |
— | — | @@ -234,9 +281,9 @@ |
235 | 282 | |
236 | 283 | |
237 | 284 | def parse_contributor(contributor, bots): |
238 | | - username = extracter.extract_username(contributor) |
239 | | - user_id = extracter.extract_contributor_id(contributor) |
240 | | - bot = extracter.determine_username_is_bot(contributor, bots=bots) |
| 285 | + username = extract_username(contributor) |
| 286 | + user_id = extract_contributor_id(contributor) |
| 287 | + bot = determine_username_is_bot(contributor, bots) |
241 | 288 | contributor.clear() |
242 | 289 | editor = {} |
243 | 290 | editor['username'] = username |
— | — | @@ -310,7 +357,7 @@ |
311 | 358 | #the entire revision is empty, weird. |
312 | 359 | continue |
313 | 360 | dump(revision) |
314 | | - contributor = revision.find('contributor') |
| 361 | + contributor = revision.find('ns0:contributor') |
315 | 362 | contributor = parse_contributor(contributor, bots) |
316 | 363 | if not contributor: |
317 | 364 | #editor is anonymous, ignore |