Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -260,7 +260,9 @@ |
261 | 261 | db.cleanup_database(project, logger) |
262 | 262 | |
263 | 263 | write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection) |
264 | | - loader.store_editors(input, project, collection) |
| 264 | + num_editors = loader.store_editors(input, project, collection) |
| 265 | + cnt_editors = db.count_records(project, collection) |
| 266 | + assert num_editors == cnt_editors |
265 | 267 | timer.elapsed() |
266 | 268 | |
267 | 269 | |
— | — | @@ -297,7 +299,8 @@ |
298 | 300 | write_message_to_log(logger, args, verb='Creating', dir=dirs) |
299 | 301 | settings.verify_environment(dirs) |
300 | 302 | |
301 | | - file = full_project + '_editors.bin' |
| 303 | + |
| 304 | + file = kwargs.get('full_project') + '_editors.bin' |
302 | 305 | write_message_to_log(logger, args, verb='Deleting', file=file) |
303 | 306 | utils.delete_file(settings.binary_location, file) |
304 | 307 | |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -1,98 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-19' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | - |
22 | | -from Queue import Empty |
23 | | -import datetime |
24 | | -import sys |
25 | | -sys.path.append('..') |
26 | | - |
27 | | -import configuration |
28 | | -settings = configuration.Settings() |
29 | | - |
30 | | -from database import cache |
31 | | - |
32 | | - |
33 | | -#def store_editors(data_queue, **kwargs): |
34 | | -# ''' |
35 | | -# @data_queue is an instance of Queue containing information extracted by |
36 | | -# parse_editors() |
37 | | -# kwargs should contain: |
38 | | -# @dbname is the name of the MongoDB database where to store the information. |
39 | | -# @collection is the name of the MongoDB collection. |
40 | | -# ''' |
41 | | -# dbname = kwargs.get('dbname', None) |
42 | | -# collection = kwargs.pop('collection') |
43 | | -# mongo = db.init_mongo_db(dbname) |
44 | | -# collection = mongo[collection] |
45 | | -# mongo[collection].ensure_index('editor') |
46 | | -# editor_cache = cache.EditorCache(collection) |
47 | | -# |
48 | | -# while True: |
49 | | -# try: |
50 | | -# edit = data_queue.get(block=False) |
51 | | -# data_queue.task_done() |
52 | | -# if edit == None: |
53 | | -# print 'Swallowing poison pill' |
54 | | -# break |
55 | | -# elif edit == 'NEXT': |
56 | | -# editor_cache.add('NEXT', '') |
57 | | -# else: |
58 | | -# contributor = edit['editor'] |
59 | | -# value = {'date': edit['date'], 'article': edit['article']} |
60 | | -# editor_cache.add(contributor, value) |
61 | | -# #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True) |
62 | | -# #'$inc': {'edit_count': 1}, |
63 | | -# |
64 | | -# except Empty: |
65 | | -# ''' |
66 | | -# This checks whether the Queue is empty because the preprocessors are |
67 | | -# finished or because this function is faster in emptying the Queue |
68 | | -# then the preprocessors are able to fill it. If the preprocessors |
69 | | -# are finished and this Queue is empty than break, else wait for the |
70 | | -# Queue to fill. |
71 | | -# ''' |
72 | | -# pass |
73 | | -# |
74 | | -# print 'Emptying entire cache.' |
75 | | -# editor_cache.store() |
76 | | -# print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
77 | | - |
78 | | - |
79 | | -def load_cache_objects(): |
80 | | - cache = {} |
81 | | - files = utils.retrieve_file_list(settings.binary_location, '.bin') |
82 | | - for x, file in enumerate(files): |
83 | | - cache[x] = utils.load_object(settings.binary_location, file) |
84 | | - return cache |
85 | | - |
86 | | - |
87 | | -def search_cache_for_missed_editors(dbname, collection): |
88 | | - mongo = db.init_mongo_db(dbname) |
89 | | - collection = mongo[collection] |
90 | | - editor_cache = cache.EditorCache(collection) |
91 | | - cache = load_cache_objects() |
92 | | - for c in cache: |
93 | | - for editor in cache[c]: |
94 | | - editor_cache.add(editor, cache[c][editor]) |
95 | | - cache[c] = {} |
96 | | - editor_cache.add('NEXT', '') |
97 | | - cache = {} |
98 | | - |
99 | | - |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -207,11 +207,11 @@ |
208 | 208 | if id == None: |
209 | 209 | break |
210 | 210 | obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
| 211 | + if obs == None: |
| 212 | + continue |
211 | 213 | first_edit = obs['first_edit'] |
212 | 214 | last_edit = obs['final_edit'] |
213 | 215 | for y in xrange(2001, year): |
214 | | -# if y == 2010 and first_edit > datetime.datetime(2010, 1, 1): |
215 | | -# print 'debug' |
216 | 216 | if y not in data: |
217 | 217 | data[y] = {} |
218 | 218 | data[y]['n'] = 0 |
— | — | @@ -226,23 +226,26 @@ |
227 | 227 | if period not in data[y]: |
228 | 228 | data[y][period] = 0 |
229 | 229 | window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
| 230 | + if first_edit.year > y or last_edit.year < y: |
| 231 | + continue |
230 | 232 | if window_start < datetime.datetime(2001, 1, 1): |
231 | 233 | window_start = datetime.datetime(2001, 1, 1) |
232 | 234 | if date_falls_in_window(window_start, window_end, first_edit): |
233 | 235 | edits.append(period) |
234 | 236 | if edits != []: |
235 | 237 | p = min(edits) |
236 | | - data[y]['n'] += 1 |
237 | 238 | data[y][p] += 1 |
| 239 | + data[y]['n'] += 1 |
| 240 | + |
238 | 241 | except Empty: |
239 | 242 | break |
240 | 243 | print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin') |
241 | | - utils.store_object(data, settings.binary_location, dbname + '_cohort_data') |
| 244 | + utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin') |
242 | 245 | cohort_charts.prepare_cohort_dataset(dbname) |
243 | 246 | |
244 | 247 | |
245 | 248 | def date_falls_in_window(window_start, window_end, first_edit): |
246 | | - if first_edit >= window_start and first_edit <= window_end: |
| 249 | + if first_edit >= window_start and first_edit <= window_end: |
247 | 250 | return True |
248 | 251 | else: |
249 | 252 | return False |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -162,12 +162,14 @@ |
163 | 163 | |
164 | 164 | |
165 | 165 | def run_parse_editors(location, **kwargs): |
166 | | - bot_ids = bots.retrieve_bots() |
| 166 | + |
167 | 167 | input = os.path.join(location, 'chunks') |
168 | 168 | output = os.path.join(location, 'txt') |
| 169 | + language_code = kwargs.get('language_code') |
169 | 170 | settings.verify_environment([input, output]) |
170 | 171 | files = utils.retrieve_file_list(input, 'xml') |
171 | 172 | |
| 173 | + bot_ids = bots.retrieve_bots(language_code) |
172 | 174 | tasks = multiprocessing.JoinableQueue() |
173 | 175 | consumers = [models.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
174 | 176 | for file in files: |
— | — | @@ -183,7 +185,8 @@ |
184 | 186 | |
185 | 187 | |
186 | 188 | def debug_parse_editors(location): |
187 | | - bot_ids = bots.retrieve_bots() |
| 189 | + language_code = 'en' |
| 190 | + bot_ids = bots.retrieve_bots(language_code) |
188 | 191 | input = os.path.join(location, 'chunks') |
189 | 192 | output = os.path.join(location, 'txt') |
190 | 193 | xml_file = models.XMLFile(input, output, 'pages_full_en.xml', bot_ids, output_editor_information) |
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -211,7 +211,7 @@ |
212 | 212 | output = os.path.join(location, 'chunks') |
213 | 213 | else: |
214 | 214 | output = os.path.join(location, 'txt') |
215 | | - bot_ids = bots.retrieve_bots() |
| 215 | + bot_ids = bots.retrieve_bots(language_code) |
216 | 216 | settings.verify_environment([output]) |
217 | 217 | |
218 | 218 | fh = None |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -38,14 +38,6 @@ |
39 | 39 | year = datetime.datetime.now().year + 1 |
40 | 40 | for x in xrange(2001, year): |
41 | 41 | data[str(x)] = add_datatype(datatype) |
42 | | -# if datatype == 'dict': |
43 | | -# data[str(x)] = dict() |
44 | | -# elif datatype == 'list': |
45 | | -# data[str(x)] = list() |
46 | | -# elif datatype == 'set': |
47 | | -# data[str(x)] = set() |
48 | | -# else: |
49 | | -# data[str(x)] = 0.0 |
50 | 42 | return data |
51 | 43 | |
52 | 44 | |
— | — | @@ -54,16 +46,7 @@ |
55 | 47 | datacontainer[dc] = {} |
56 | 48 | for x in xrange(1, 13): |
57 | 49 | datacontainer[dc][str(x)] = add_datatype(datatype) |
58 | | -# if datatype == 'dict': |
59 | | -# datacontainer[dc][str(x)] = dict() |
60 | | -# elif datatype == 'list': |
61 | | -# datacontainer[dc][str(x)] = list() |
62 | | -# elif datatype == 'set': |
63 | | -# datacontainer[dc][str(x)] = set() |
64 | | -# else: |
65 | | -# datacontainer[dc][str(x)] = 0.0 |
66 | | -# #else: |
67 | | - # datacontainer[dc][str(x)] = 0.0 |
| 50 | + |
68 | 51 | return datacontainer |
69 | 52 | |
70 | 53 | |
Index: trunk/tools/editor_trends/etl/models.py |
— | — | @@ -73,14 +73,12 @@ |
74 | 74 | |
75 | 75 | |
76 | 76 | class XMLFile(object): |
77 | | - def __init__(self, input, output, xml_file, bots, target, output_file=None, **kwargs): |
78 | | - self.file = xml_file |
79 | | - self.input = input |
| 77 | + def __init__(self, file, location, output, output_file, target, ** kwargs): |
| 78 | + self.file = file |
| 79 | + self.location = location |
80 | 80 | self.output = output |
81 | | - self.bots = bots |
82 | 81 | self.target = target |
83 | 82 | self.output_file = output_file |
84 | | - self.lock = None |
85 | 83 | for kw in kwargs: |
86 | 84 | setattr(self, kw, kwargs[kw]) |
87 | 85 | |
— | — | @@ -96,11 +94,13 @@ |
97 | 95 | return '%s' % (self.file) |
98 | 96 | |
99 | 97 | def __call__(self, bots=None): |
| 98 | + if bots is not None and bots != {}: |
| 99 | + self.bots = bots |
100 | 100 | if settings.debug: |
101 | 101 | messages = {} |
102 | 102 | vars = {} |
103 | 103 | |
104 | | - data = xml.read_input(utils.create_txt_filehandle(self.input, |
| 104 | + data = xml.read_input(utils.create_txt_filehandle(self.location, |
105 | 105 | self.file, 'r', |
106 | 106 | encoding=settings.encoding)) |
107 | 107 | self.create_file_handle() |
— | — | @@ -111,10 +111,6 @@ |
112 | 112 | raw_data = ''.join(raw_data) |
113 | 113 | xml_buffer.write(raw_data) |
114 | 114 | elem = cElementTree.XML(xml_buffer.getvalue()) |
115 | | - except Exception, error: |
116 | | - print error |
117 | | - continue |
118 | | - try: |
119 | 115 | bots = self.target(elem, fh=self.fh, bots=self.bots) |
120 | 116 | except SyntaxError, error: |
121 | 117 | print error |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -40,21 +40,16 @@ |
41 | 41 | collection.create_index('editor') |
42 | 42 | editor_cache = cache.EditorCache(collection) |
43 | 43 | prev_contributor = -1 |
44 | | - x = 0 |
45 | 44 | edits = 0 |
46 | | - editors = set() |
47 | 45 | for line in sort.readline(fh): |
48 | 46 | if len(line) == 0: |
49 | 47 | continue |
50 | 48 | contributor = line[0] |
| 49 | + #print 'Parsing %s' % contributor |
51 | 50 | if prev_contributor != contributor: |
52 | 51 | if edits > 9: |
53 | | - result = editor_cache.add(prev_contributor, 'NEXT') |
54 | | - if result: |
55 | | - editors.add(prev_contributor) |
56 | | - result = None |
57 | | - x += 1 |
58 | | - print 'Stored %s editors' % x |
| 52 | + editor_cache.add(prev_contributor, 'NEXT') |
| 53 | + print 'Stored %s' % prev_contributor |
59 | 54 | else: |
60 | 55 | editor_cache.clear(prev_contributor) |
61 | 56 | edits = 0 |
— | — | @@ -66,8 +61,9 @@ |
67 | 62 | editor_cache.add(contributor, value) |
68 | 63 | prev_contributor = contributor |
69 | 64 | fh.close() |
| 65 | + print editor_cache.n |
| 66 | + return editor_cache.n |
70 | 67 | |
71 | | - |
72 | 68 | def mergesort_external_launcher(input, output): |
73 | 69 | files = utils.retrieve_file_list(input, 'txt', mask='') |
74 | 70 | x = 0 |
— | — | @@ -154,4 +150,4 @@ |
155 | 151 | collection = 'editors' |
156 | 152 | #mergesort_launcher(input, intermediate_output) |
157 | 153 | #mergesort_external_launcher(intermediate_output, output) |
158 | | - store_editors(output, dbname, collection) |
| 154 | + num_editors = store_editors(output, dbname, collection) |
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | self.n = 0 |
37 | 37 | |
38 | 38 | def __repr__(self): |
39 | | - return '%s' % 'Editor Cache' |
| 39 | + return '%s' % self.editors |
40 | 40 | |
41 | 41 | def clear(self, key): |
42 | 42 | if key in self.editors: |
— | — | @@ -44,9 +44,8 @@ |
45 | 45 | def add(self, key, value): |
46 | 46 | if value == 'NEXT': |
47 | 47 | self.n += 1 |
48 | | - result = self.insert(key, self.editors[key]['edits'], self.editors[key]['username']) |
| 48 | + self.insert(key, self.editors[key]['edits'], self.editors[key]['username']) |
49 | 49 | del self.editors[key] |
50 | | - return result |
51 | 50 | else: |
52 | 51 | if key not in self.editors: |
53 | 52 | self.editors[key] = {} |
— | — | @@ -65,11 +64,13 @@ |
66 | 65 | self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
67 | 66 | |
68 | 67 | def insert(self, editor, values, username): |
69 | | - try: |
70 | | - self.collection.insert({'editor': editor, 'edits': values, 'username': username}) |
71 | | - return True |
72 | | - except: |
73 | | - return False |
| 68 | + ''' |
| 69 | + Adding the safe=True statement slows down the insert process but this assures that all data |
| 70 | + will be written. |
| 71 | + ''' |
| 72 | + self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
| 73 | + #except: |
| 74 | + # return False |
74 | 75 | |
75 | 76 | def store(self): |
76 | 77 | utils.store_object(self, settings.binary_location, self.__repr__()) |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -105,7 +105,7 @@ |
106 | 106 | ids = [] |
107 | 107 | cursor = collection.map_reduce(map, reduce) |
108 | 108 | for c in cursor.find(): |
109 | | - ids.append(int(c['_id'])) |
| 109 | + ids.append(c['_id']) |
110 | 110 | return ids |
111 | 111 | #def init_database(db=None): |
112 | 112 | # ''' |
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -70,7 +70,7 @@ |
71 | 71 | return bot_dict |
72 | 72 | |
73 | 73 | |
74 | | -def retrieve_bots(): |
| 74 | +def retrieve_bots(language_code): |
75 | 75 | ''' |
76 | 76 | Loader function to retrieve list of id's of known Wikipedia bots. |
77 | 77 | ''' |
— | — | @@ -79,7 +79,7 @@ |
80 | 80 | bots = mongo['ids'] |
81 | 81 | cursor = bots.find() |
82 | 82 | for bot in cursor: |
83 | | - if bot['verified'] == 'True': |
| 83 | + if bot['verified'] == 'True' and language_code in bot['projects']: |
84 | 84 | ids[bot['id']] = bot['name'] |
85 | 85 | return ids |
86 | 86 | |
— | — | @@ -143,18 +143,25 @@ |
144 | 144 | return bots |
145 | 145 | |
146 | 146 | |
147 | | -def create_bot_validation_dataset(data, fh, bots, keys): |
148 | | - username = data[3].lower() |
149 | | - #print username.encode('utf-8') |
150 | | - if username.find('bot') > -1 or username.find('script') > -1: |
151 | | - bot = bots.get(username, botmodels.Bot(username, verified=False)) |
152 | | - setattr(bot, 'id', data[0]) |
| 147 | +def create_bot_validation_dataset(xml_nodes, fh, bots): |
| 148 | + revisions = xml_nodes.findall('revision') |
| 149 | + for revision in revisions: |
| 150 | + contributor = xml.retrieve_xml_node(revision, 'contributor') |
| 151 | + username = contributor.find('username') |
| 152 | + if username == None or username.text == None: |
| 153 | + continue |
| 154 | + else: |
| 155 | + username = username.text.lower() |
153 | 156 | |
154 | | - timestamp = data[1] |
155 | | - if timestamp != None: |
156 | | - timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
157 | | - bot.time[str(timestamp.year)].append(timestamp) |
158 | | - bots[username] = bot |
| 157 | + #print username.encode('utf-8') |
| 158 | + if username.find('bot') > -1 or username.find('script') > -1: |
| 159 | + bot = bots.get(username, botmodels.Bot(username, verified=False)) |
| 160 | + bot.id = contributor.find('id').text |
| 161 | + timestamp = revision.find('timestamp').text |
| 162 | + if timestamp != None: |
| 163 | + timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
| 164 | + bot.time[str(timestamp.year)].append(timestamp) |
| 165 | + bots[username] = bot |
159 | 166 | |
160 | 167 | return bots |
161 | 168 | |
— | — | @@ -172,26 +179,33 @@ |
173 | 180 | location = os.path.join(settings.input_location, language_code, project) |
174 | 181 | input_xml = os.path.join(location, 'chunks') |
175 | 182 | input_txt = os.path.join(location, 'txt') |
176 | | - files = utils.retrieve_file_list(input_txt, 'txt', mask=None) |
177 | | - input_queue = pc.load_queue(files, poison_pill=True) |
| 183 | + |
| 184 | + |
178 | 185 | tasks = multiprocessing.JoinableQueue() |
179 | 186 | mgr = multiprocessing.Manager() |
180 | 187 | keys = ['id', 'name', 'verified', 'projects'] |
181 | 188 | |
182 | 189 | if action == 'lookup': |
183 | 190 | output_file = 'bots_ids.csv' |
| 191 | + files = utils.retrieve_file_list(input_txt, 'txt', mask=None) |
| 192 | + input_queue = pc.load_queue(files, poison_pill=True) |
184 | 193 | bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
| 194 | + for file in files: |
| 195 | + tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
| 196 | + |
185 | 197 | else: |
186 | 198 | output_file = 'bots_predictionset.csv' |
| 199 | + files = utils.retrieve_file_list(input_xml, 'xml', mask=None) |
| 200 | + input_queue = pc.load_queue(files, poison_pill=True) |
187 | 201 | bots = {} |
| 202 | + for file in files: |
| 203 | + tasks.put(models.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
188 | 204 | |
189 | 205 | #lock = mgr.Lock() |
190 | 206 | if manager: |
191 | 207 | manager = mgr |
192 | 208 | |
193 | 209 | |
194 | | - for file in files: |
195 | | - tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
196 | 210 | |
197 | 211 | tracker = {} |
198 | 212 | if single: |