Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py |
— | — | @@ -31,11 +31,9 @@ |
32 | 32 | from classes import settings |
33 | 33 | settings = settings.Settings() |
34 | 34 | |
35 | | - |
36 | | -from classes import storage |
37 | 35 | from utils import file_utils |
38 | 36 | from utils import messages |
39 | | - |
| 37 | +from classes import storage |
40 | 38 | from classes import consumers |
41 | 39 | from classes import bots |
42 | 40 | |
Index: trunk/tools/editor_trends/classes/storage.py |
— | — | @@ -135,8 +135,7 @@ |
136 | 136 | return self.db[self.collection].save(data) |
137 | 137 | |
138 | 138 | def insert(self, data, qualifiers=None, safe=False): |
139 | | - assert isinstance(data, dict), 'You need to feed me dictionaries.' |
140 | | - data = self.stringify_keys(data) |
| 139 | + assert isinstance(data, dict) or isinstance(data, list), 'You need to feed me dictionaries.' |
141 | 140 | try: |
142 | 141 | if qualifiers: |
143 | 142 | self.db[self.collection].insert(data, qualifiers, safe=safe) |
— | — | @@ -144,8 +143,6 @@ |
145 | 144 | self.db[self.collection].insert(data, safe=safe) |
146 | 145 | except bson.errors.InvalidDocument, error: |
147 | 146 | print error |
148 | | - print 'BSON document too large, unable to store %s' % \ |
149 | | - (data.keys()[0]) |
150 | 147 | except OperationFailure, error: |
151 | 148 | print 'It seems that you are running out of disk space. \ |
152 | 149 | Error message: %s' % error |
— | — | @@ -160,7 +157,7 @@ |
161 | 158 | assert isinstance(data, dict), 'You need to feed me dictionaries.' |
162 | 159 | self.db[self.collection].update({key: value}, {'$set': data}) |
163 | 160 | |
164 | | - def find(self, conditions, vars=None): |
| 161 | + def find(self, conditions=None, vars=None): |
165 | 162 | if conditions: |
166 | 163 | return self.db[self.collection].find(conditions, fields=vars) |
167 | 164 | else: |
— | — | @@ -188,15 +185,6 @@ |
189 | 186 | def count(self): |
190 | 187 | return self.db[self.collection].count() |
191 | 188 | |
192 | | - def retrieve_editors(self): |
193 | | - q = queue.JoinableRetryQueue() |
194 | | - cursor = self.find('editor') |
195 | | - print 'Loading editors...' |
196 | | - for editor in cursor: |
197 | | - q.put(editor['editor']) |
198 | | - print 'Finished loading editors...' |
199 | | - return q |
200 | | - |
201 | 189 | def retrieve_distinct_keys(self, key, force_new=False): |
202 | 190 | ''' |
203 | 191 | TODO: figure out how big the index is and then take appropriate action, |
— | — | @@ -216,9 +204,6 @@ |
217 | 205 | ids = self.db[self.collection].distinct(key) |
218 | 206 | else: |
219 | 207 | ids = self.retrieve_distinct_keys_mapreduce(key) |
220 | | - file_utils.store_object(ids, settings.binary_location, \ |
221 | | - '%s_%s_%s.bin' % (self.dbname, |
222 | | - self.collection, key)) |
223 | 208 | return ids |
224 | 209 | |
225 | 210 | def retrieve_distinct_keys_mapreduce(self, key): |
— | — | @@ -226,17 +211,23 @@ |
227 | 212 | This is to work around a Mongo limitation, if the index is too large |
228 | 213 | then the distinct() function does not work. You need to do a map/reduce. |
229 | 214 | ''' |
230 | | - emit = 'function () { emit(this.%s, 1)};' % key |
231 | | - mapper = Code(emit) |
232 | | - reducer = Code("function()") |
| 215 | + q = queue.JoinableRetryQueue() |
| 216 | + collection = '%s_%s_%s' % (self.dbname, 'mapreduce', key) |
233 | 217 | |
234 | | - ids = [] |
235 | | - collection = '%s_%s' % (self.dbname, 'mapreduce_editors') |
236 | | - cursor = self.db[self.collection].map_reduce(mapper, reducer, collection) |
237 | | - for c in cursor.find(): |
238 | | - ids.append(c['_id']) |
239 | | - return ids |
| 218 | + if self.db[collection].count() == 0: |
| 219 | + emit = 'function () { emit(this.%s, 1)};' % key |
| 220 | + mapper = Code(emit) |
| 221 | + reducer = Code("function()") |
| 222 | + result = self.db[self.collection].map_reduce(mapper, reducer, collection) |
| 223 | + else: |
| 224 | + result = self.db[collection] |
240 | 225 | |
| 226 | + print 'Loading %s keys in queue...' % key |
| 227 | + for res in result.find(): |
| 228 | + q.put(res['_id']) |
| 229 | + print 'Finished loading %s keys...' % key |
| 230 | + return q |
| 231 | + |
241 | 232 | def stringify_keys(self, data): |
242 | 233 | ''' |
243 | 234 | @data should be a dictionary where the keys are not yet strings. This |