r90274 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r90273 | r90274 | r90275 >
Date:15:54, 17 June 2011
Author:diederik
Status:deferred
Tags:
Comment:
Simplified find functionality.
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/bot_detector.py (modified) (history)
  • /trunk/tools/editor_trends/classes/storage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
@@ -31,11 +31,9 @@
3232 from classes import settings
3333 settings = settings.Settings()
3434
35 -
36 -from classes import storage
3735 from utils import file_utils
3836 from utils import messages
39 -
 37+from classes import storage
4038 from classes import consumers
4139 from classes import bots
4240
Index: trunk/tools/editor_trends/classes/storage.py
@@ -135,8 +135,7 @@
136136 return self.db[self.collection].save(data)
137137
138138 def insert(self, data, qualifiers=None, safe=False):
139 - assert isinstance(data, dict), 'You need to feed me dictionaries.'
140 - data = self.stringify_keys(data)
 139+ assert isinstance(data, dict) or isinstance(data, list), 'You need to feed me dictionaries.'
141140 try:
142141 if qualifiers:
143142 self.db[self.collection].insert(data, qualifiers, safe=safe)
@@ -144,8 +143,6 @@
145144 self.db[self.collection].insert(data, safe=safe)
146145 except bson.errors.InvalidDocument, error:
147146 print error
148 - print 'BSON document too large, unable to store %s' % \
149 - (data.keys()[0])
150147 except OperationFailure, error:
151148 print 'It seems that you are running out of disk space. \
152149 Error message: %s' % error
@@ -160,7 +157,7 @@
161158 assert isinstance(data, dict), 'You need to feed me dictionaries.'
162159 self.db[self.collection].update({key: value}, {'$set': data})
163160
164 - def find(self, conditions, vars=None):
 161+ def find(self, conditions=None, vars=None):
165162 if conditions:
166163 return self.db[self.collection].find(conditions, fields=vars)
167164 else:
@@ -188,15 +185,6 @@
189186 def count(self):
190187 return self.db[self.collection].count()
191188
192 - def retrieve_editors(self):
193 - q = queue.JoinableRetryQueue()
194 - cursor = self.find('editor')
195 - print 'Loading editors...'
196 - for editor in cursor:
197 - q.put(editor['editor'])
198 - print 'Finished loading editors...'
199 - return q
200 -
201189 def retrieve_distinct_keys(self, key, force_new=False):
202190 '''
203191 TODO: figure out how big the index is and then take appropriate action,
@@ -216,9 +204,6 @@
217205 ids = self.db[self.collection].distinct(key)
218206 else:
219207 ids = self.retrieve_distinct_keys_mapreduce(key)
220 - file_utils.store_object(ids, settings.binary_location, \
221 - '%s_%s_%s.bin' % (self.dbname,
222 - self.collection, key))
223208 return ids
224209
225210 def retrieve_distinct_keys_mapreduce(self, key):
@@ -226,17 +211,23 @@
227212 This is to work around a Mongo limitation, if the index is too large
228213 then the distinct() function does not work. You need to do a map/reduce.
229214 '''
230 - emit = 'function () { emit(this.%s, 1)};' % key
231 - mapper = Code(emit)
232 - reducer = Code("function()")
 215+ q = queue.JoinableRetryQueue()
 216+ collection = '%s_%s_%s' % (self.dbname, 'mapreduce', key)
233217
234 - ids = []
235 - collection = '%s_%s' % (self.dbname, 'mapreduce_editors')
236 - cursor = self.db[self.collection].map_reduce(mapper, reducer, collection)
237 - for c in cursor.find():
238 - ids.append(c['_id'])
239 - return ids
 218+ if self.db[collection].count() == 0:
 219+ emit = 'function () { emit(this.%s, 1)};' % key
 220+ mapper = Code(emit)
 221+ reducer = Code("function()")
 222+ result = self.db[self.collection].map_reduce(mapper, reducer, collection)
 223+ else:
 224+ result = self.db[collection]
240225
 226+ print 'Loading %s keys in queue...' % key
 227+ for res in result.find():
 228+ q.put(res['_id'])
 229+ print 'Finished loading %s keys...' % key
 230+ return q
 231+
241232 def stringify_keys(self, data):
242233 '''
243234 @data should be a dictionary where the keys are not yet strings. This