r77346 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r77345 | r77346 | r77347 >
Date: 22:14, 26 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
Expanded the bots module with functionality to generate a training dataset to predict whether a Wikipedia user is a bot or not.
Modified paths:
  • /trunk/tools/editor_trends/bots/bots.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/bots/bots.py
@@ -34,6 +34,8 @@
3535 from etl import models
3636 import models as botmodels
3737
 38+import cProfile
 39+
3840 try:
3941 import psyco
4042 psyco.full()
@@ -113,30 +115,28 @@
114116 '''
115117 lock = kwargs.get('lock')
116118 bots = kwargs.get('bots')
117 - if settings.debug:
118 - messages = {}
119119
120120 revisions = xml_nodes.findall('revision')
121121 for revision in revisions:
122122 contributor = xml.retrieve_xml_node(revision, 'contributor')
123123 username = contributor.find('username')
124 - if username == None:
 124+ if username == None or username.text == None:
125125 continue
126 - username = xml.extract_text(username, None)
 126+ else:
 127+ username = username.text
127128 #print username.encode('utf-8')
128129 if username in bots and bots[username].verified == True:
129 - id = contributor.find('id')
130 - id = xml.extract_text(id, None)
 130+ id = contributor.find('id').text
131131 bot = bots[username]
132 - bot_dict = convert_object_to_dict(bot, exclude=['time', 'name', 'written'])
133 - bot_dict['_username'] = username
134 - bot_dict['id'] = id
135 -
 132+
136133 if not hasattr(bot, 'written'):
 134+ bot_dict = convert_object_to_dict(bot, exclude=['time', 'name', 'written'])
 135+ bot_dict['_username'] = username
 136+ bot_dict['id'] = id
137137 lock.acquire()
138138 utils.write_dict_to_csv(bot_dict, fh, write_key=False)
139139 lock.release()
140 - bot.written = True
 140+ bot.written = True
141141 #bots.pop(username)
142142 #if bots == {}:
143143 # print 'Found id numbers for all bots.'
@@ -156,12 +156,8 @@
157157 #bot = bots.get('PseudoBot')
158158 #bot.hours_active()
159159 #bot.avg_lag_between_edits()
160 - if settings.debug:
161 - utils.report_error_messages(messages, lookup_bot_userid)
162160
163161
164 -
165 -
166162 def bot_launcher(language_code, project, single=False):
167163 '''
168164 This function sets the stage to launch bot id detection and collecting data
@@ -190,13 +186,17 @@
191187
192188 utils.store_object(bots, settings.binary_location, 'bots.bin')
193189 bot_training_dataset(bots)
 190+ store_bots()
194191 if bots != {}:
195192 print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots)
196193 keys = bots.keys()
197194 for key in keys:
198 - print '%s' % key
 195+ try:
 196+ print '%s' % key.encode(settings.encoding)
 197+ except:
 198+ pass
199199
200 - store_bots()
 200+
201201
202202
203203 def bot_training_dataset(bots):
@@ -225,12 +225,8 @@
226226 tasks.join()
227227
228228
229 -def bot_detector_launcher():
230 - bots = retrieve_bots()
231 -
232 -
233 -
234229 if __name__ == '__main__':
235230 language_code = 'en'
236231 project = 'wiki'
237 - bot_launcher(language_code, project, single=False)
 232+ #bot_launcher(language_code, project, single=True)
 233+ cProfile.run(bot_launcher(language_code, project, single=False), 'profile')

Status & tagging log