r78098 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r78097 | r78098 | r78099 >
Date: 22:19, 8 December 2010
Author: diederik
Status: deferred
Tags:
Comment:
Preparing for v1.1
Modified paths:
  • /trunk/tools/editor_trends/bots/bots.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/database/db.py (modified) (history)
  • /trunk/tools/editor_trends/etl/chunker.py (modified) (history)
  • /trunk/tools/editor_trends/etl/exporter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extract.py (modified) (history)
  • /trunk/tools/editor_trends/etl/loader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/models.py (modified) (history)
  • /trunk/tools/editor_trends/etl/shaper.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (deleted) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -260,7 +260,9 @@
261261 db.cleanup_database(project, logger)
262262
263263 write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection)
264 - loader.store_editors(input, project, collection)
 264+ num_editors = loader.store_editors(input, project, collection)
 265+ cnt_editors = db.count_records(project, collection)
 266+ assert num_editors == cnt_editors
265267 timer.elapsed()
266268
267269
@@ -297,7 +299,8 @@
298300 write_message_to_log(logger, args, verb='Creating', dir=dirs)
299301 settings.verify_environment(dirs)
300302
301 - file = full_project + '_editors.bin'
 303+
 304+ file = kwargs.get('full_project') + '_editor.bin'
302305 write_message_to_log(logger, args, verb='Deleting', file=file)
303306 utils.delete_file(settings.binary_location, file)
304307
Index: trunk/tools/editor_trends/etl/store.py
@@ -1,98 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__author__email = 'dvanliere at gmail dot com'
18 -__date__ = '2010-11-19'
19 -__version__ = '0.1'
20 -
21 -
22 -from Queue import Empty
23 -import datetime
24 -import sys
25 -sys.path.append('..')
26 -
27 -import configuration
28 -settings = configuration.Settings()
29 -
30 -from database import cache
31 -
32 -
33 -#def store_editors(data_queue, **kwargs):
34 -# '''
35 -# @data_queue is an instance of Queue containing information extracted by
36 -# parse_editors()
37 -# kwargs should contain:
38 -# @dbname is the name of the MongoDB database where to store the information.
39 -# @collection is the name of the MongoDB collection.
40 -# '''
41 -# dbname = kwargs.get('dbname', None)
42 -# collection = kwargs.pop('collection')
43 -# mongo = db.init_mongo_db(dbname)
44 -# collection = mongo[collection]
45 -# mongo[collection].ensure_index('editor')
46 -# editor_cache = cache.EditorCache(collection)
47 -#
48 -# while True:
49 -# try:
50 -# edit = data_queue.get(block=False)
51 -# data_queue.task_done()
52 -# if edit == None:
53 -# print 'Swallowing poison pill'
54 -# break
55 -# elif edit == 'NEXT':
56 -# editor_cache.add('NEXT', '')
57 -# else:
58 -# contributor = edit['editor']
59 -# value = {'date': edit['date'], 'article': edit['article']}
60 -# editor_cache.add(contributor, value)
61 -# #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True)
62 -# #'$inc': {'edit_count': 1},
63 -#
64 -# except Empty:
65 -# '''
66 -# This checks whether the Queue is empty because the preprocessors are
67 -# finished or because this function is faster in emptying the Queue
68 -# then the preprocessors are able to fill it. If the preprocessors
69 -# are finished and this Queue is empty than break, else wait for the
70 -# Queue to fill.
71 -# '''
72 -# pass
73 -#
74 -# print 'Emptying entire cache.'
75 -# editor_cache.store()
76 -# print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
77 -
78 -
79 -def load_cache_objects():
80 - cache = {}
81 - files = utils.retrieve_file_list(settings.binary_location, '.bin')
82 - for x, file in enumerate(files):
83 - cache[x] = utils.load_object(settings.binary_location, file)
84 - return cache
85 -
86 -
87 -def search_cache_for_missed_editors(dbname, collection):
88 - mongo = db.init_mongo_db(dbname)
89 - collection = mongo[collection]
90 - editor_cache = cache.EditorCache(collection)
91 - cache = load_cache_objects()
92 - for c in cache:
93 - for editor in cache[c]:
94 - editor_cache.add(editor, cache[c][editor])
95 - cache[c] = {}
96 - editor_cache.add('NEXT', '')
97 - cache = {}
98 -
99 -
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -207,11 +207,11 @@
208208 if id == None:
209209 break
210210 obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
 211+ if obs == None:
 212+ continue
211213 first_edit = obs['first_edit']
212214 last_edit = obs['final_edit']
213215 for y in xrange(2001, year):
214 -# if y == 2010 and first_edit > datetime.datetime(2010, 1, 1):
215 -# print 'debug'
216216 if y not in data:
217217 data[y] = {}
218218 data[y]['n'] = 0
@@ -226,23 +226,26 @@
227227 if period not in data[y]:
228228 data[y][period] = 0
229229 window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period)
 230+ if first_edit.year > y or last_edit.year < y:
 231+ continue
230232 if window_start < datetime.datetime(2001, 1, 1):
231233 window_start = datetime.datetime(2001, 1, 1)
232234 if date_falls_in_window(window_start, window_end, first_edit):
233235 edits.append(period)
234236 if edits != []:
235237 p = min(edits)
236 - data[y]['n'] += 1
237238 data[y][p] += 1
 239+ data[y]['n'] += 1
 240+
238241 except Empty:
239242 break
240243 print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin')
241 - utils.store_object(data, settings.binary_location, dbname + '_cohort_data')
 244+ utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin')
242245 cohort_charts.prepare_cohort_dataset(dbname)
243246
244247
245248 def date_falls_in_window(window_start, window_end, first_edit):
246 - if first_edit >= window_start and first_edit <= window_end:
 249+ if first_edit >= window_start and first_edit <= window_end:
247250 return True
248251 else:
249252 return False
Index: trunk/tools/editor_trends/etl/extract.py
@@ -162,12 +162,14 @@
163163
164164
165165 def run_parse_editors(location, **kwargs):
166 - bot_ids = bots.retrieve_bots()
 166+
167167 input = os.path.join(location, 'chunks')
168168 output = os.path.join(location, 'txt')
 169+ language_code = kwargs.get('language_code')
169170 settings.verify_environment([input, output])
170171 files = utils.retrieve_file_list(input, 'xml')
171172
 173+ bot_ids = bots.retrieve_bots(language_code)
172174 tasks = multiprocessing.JoinableQueue()
173175 consumers = [models.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
174176 for file in files:
@@ -183,7 +185,8 @@
184186
185187
186188 def debug_parse_editors(location):
187 - bot_ids = bots.retrieve_bots()
 189+ language_code = 'en'
 190+ bot_ids = bots.retrieve_bots(language_code)
188191 input = os.path.join(location, 'chunks')
189192 output = os.path.join(location, 'txt')
190193 xml_file = models.XMLFile(input, output, 'pages_full_en.xml', bot_ids, output_editor_information)
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -211,7 +211,7 @@
212212 output = os.path.join(location, 'chunks')
213213 else:
214214 output = os.path.join(location, 'txt')
215 - bot_ids = bots.retrieve_bots()
 215+ bot_ids = bots.retrieve_bots(language_code)
216216 settings.verify_environment([output])
217217
218218 fh = None
Index: trunk/tools/editor_trends/etl/shaper.py
@@ -38,14 +38,6 @@
3939 year = datetime.datetime.now().year + 1
4040 for x in xrange(2001, year):
4141 data[str(x)] = add_datatype(datatype)
42 -# if datatype == 'dict':
43 -# data[str(x)] = dict()
44 -# elif datatype == 'list':
45 -# data[str(x)] = list()
46 -# elif datatype == 'set':
47 -# data[str(x)] = set()
48 -# else:
49 -# data[str(x)] = 0.0
5042 return data
5143
5244
@@ -54,16 +46,7 @@
5547 datacontainer[dc] = {}
5648 for x in xrange(1, 13):
5749 datacontainer[dc][str(x)] = add_datatype(datatype)
58 -# if datatype == 'dict':
59 -# datacontainer[dc][str(x)] = dict()
60 -# elif datatype == 'list':
61 -# datacontainer[dc][str(x)] = list()
62 -# elif datatype == 'set':
63 -# datacontainer[dc][str(x)] = set()
64 -# else:
65 -# datacontainer[dc][str(x)] = 0.0
66 -# #else:
67 - # datacontainer[dc][str(x)] = 0.0
 50+
6851 return datacontainer
6952
7053
Index: trunk/tools/editor_trends/etl/models.py
@@ -73,14 +73,12 @@
7474
7575
7676 class XMLFile(object):
77 - def __init__(self, input, output, xml_file, bots, target, output_file=None, **kwargs):
78 - self.file = xml_file
79 - self.input = input
 77+ def __init__(self, file, location, output, output_file, target, ** kwargs):
 78+ self.file = file
 79+ self.location = location
8080 self.output = output
81 - self.bots = bots
8281 self.target = target
8382 self.output_file = output_file
84 - self.lock = None
8583 for kw in kwargs:
8684 setattr(self, kw, kwargs[kw])
8785
@@ -96,11 +94,13 @@
9795 return '%s' % (self.file)
9896
9997 def __call__(self, bots=None):
 98+ if bots != {} and bots != None:
 99+ self.bots = bots
100100 if settings.debug:
101101 messages = {}
102102 vars = {}
103103
104 - data = xml.read_input(utils.create_txt_filehandle(self.input,
 104+ data = xml.read_input(utils.create_txt_filehandle(self.location,
105105 self.file, 'r',
106106 encoding=settings.encoding))
107107 self.create_file_handle()
@@ -111,10 +111,6 @@
112112 raw_data = ''.join(raw_data)
113113 xml_buffer.write(raw_data)
114114 elem = cElementTree.XML(xml_buffer.getvalue())
115 - except Exception, error:
116 - print error
117 - continue
118 - try:
119115 bots = self.target(elem, fh=self.fh, bots=self.bots)
120116 except SyntaxError, error:
121117 print error
Index: trunk/tools/editor_trends/etl/loader.py
@@ -40,21 +40,16 @@
4141 collection.create_index('editor')
4242 editor_cache = cache.EditorCache(collection)
4343 prev_contributor = -1
44 - x = 0
4544 edits = 0
46 - editors = set()
4745 for line in sort.readline(fh):
4846 if len(line) == 0:
4947 continue
5048 contributor = line[0]
 49+ #print 'Parsing %s' % contributor
5150 if prev_contributor != contributor:
5251 if edits > 9:
53 - result = editor_cache.add(prev_contributor, 'NEXT')
54 - if result:
55 - editors.add(prev_contributor)
56 - result = None
57 - x += 1
58 - print 'Stored %s editors' % x
 52+ editor_cache.add(prev_contributor, 'NEXT')
 53+ print 'Stored %s' % prev_contributor
5954 else:
6055 editor_cache.clear(prev_contributor)
6156 edits = 0
@@ -66,8 +61,9 @@
6762 editor_cache.add(contributor, value)
6863 prev_contributor = contributor
6964 fh.close()
 65+ print editor_cache.n
 66+ return editor_cache.n
7067
71 -
7268 def mergesort_external_launcher(input, output):
7369 files = utils.retrieve_file_list(input, 'txt', mask='')
7470 x = 0
@@ -154,4 +150,4 @@
155151 collection = 'editors'
156152 #mergesort_launcher(input, intermediate_output)
157153 #mergesort_external_launcher(intermediate_output, output)
158 - store_editors(output, dbname, collection)
 154+ num_editors = store_editors(output, dbname, collection)
Index: trunk/tools/editor_trends/database/cache.py
@@ -35,7 +35,7 @@
3636 self.n = 0
3737
3838 def __repr__(self):
39 - return '%s' % 'Editor Cache'
 39+ return self.editors
4040
4141 def clear(self, key):
4242 if key in self.editors:
@@ -44,9 +44,8 @@
4545 def add(self, key, value):
4646 if value == 'NEXT':
4747 self.n += 1
48 - result = self.insert(key, self.editors[key]['edits'], self.editors[key]['username'])
 48+ self.insert(key, self.editors[key]['edits'], self.editors[key]['username'])
4949 del self.editors[key]
50 - return result
5150 else:
5251 if key not in self.editors:
5352 self.editors[key] = {}
@@ -65,11 +64,13 @@
6665 self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
6766
6867 def insert(self, editor, values, username):
69 - try:
70 - self.collection.insert({'editor': editor, 'edits': values, 'username': username})
71 - return True
72 - except:
73 - return False
 68+ '''
 69+ Adding the safe=True statement slows down the insert process but this assures that all data
 70+ will be written.
 71+ '''
 72+ self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True)
 73+ #except:
 74+ # return False
7475
7576 def store(self):
7677 utils.store_object(self, settings.binary_location, self.__repr__())
Index: trunk/tools/editor_trends/database/db.py
@@ -105,7 +105,7 @@
106106 ids = []
107107 cursor = collection.map_reduce(map, reduce)
108108 for c in cursor.find():
109 - ids.append(int(c['_id']))
 109+ ids.append(c['_id'])
110110 return ids
111111 #def init_database(db=None):
112112 # '''
Index: trunk/tools/editor_trends/bots/bots.py
@@ -70,7 +70,7 @@
7171 return bot_dict
7272
7373
74 -def retrieve_bots():
 74+def retrieve_bots(language_code):
7575 '''
7676 Loader function to retrieve list of id's of known Wikipedia bots.
7777 '''
@@ -79,7 +79,7 @@
8080 bots = mongo['ids']
8181 cursor = bots.find()
8282 for bot in cursor:
83 - if bot['verified'] == 'True':
 83+ if bot['verified'] == 'True' and language_code in bot['projects']:
8484 ids[bot['id']] = bot['name']
8585 return ids
8686
@@ -143,18 +143,25 @@
144144 return bots
145145
146146
147 -def create_bot_validation_dataset(data, fh, bots, keys):
148 - username = data[3].lower()
149 - #print username.encode('utf-8')
150 - if username.find('bot') > -1 or username.find('script') > -1:
151 - bot = bots.get(username, botmodels.Bot(username, verified=False))
152 - setattr(bot, 'id', data[0])
 147+def create_bot_validation_dataset(xml_nodes, fh, bots):
 148+ revisions = xml_nodes.findall('revision')
 149+ for revision in revisions:
 150+ contributor = xml.retrieve_xml_node(revision, 'contributor')
 151+ username = contributor.find('username')
 152+ if username == None or username.text == None:
 153+ continue
 154+ else:
 155+ username = username.text.lower()
153156
154 - timestamp = data[1]
155 - if timestamp != None:
156 - timestamp = utils.convert_timestamp_to_datetime_naive(timestamp)
157 - bot.time[str(timestamp.year)].append(timestamp)
158 - bots[username] = bot
 157+ #print username.encode('utf-8')
 158+ if username.find('bot') > -1 or username.find('script') > -1:
 159+ bot = bots.get(username, botmodels.Bot(username, verified=False))
 160+ bot.id = contributor.find('id').text
 161+ timestamp = revision.find('timestamp').text
 162+ if timestamp != None:
 163+ timestamp = utils.convert_timestamp_to_datetime_naive(timestamp)
 164+ bot.time[str(timestamp.year)].append(timestamp)
 165+ bots[username] = bot
159166
160167 return bots
161168
@@ -172,26 +179,33 @@
173180 location = os.path.join(settings.input_location, language_code, project)
174181 input_xml = os.path.join(location, 'chunks')
175182 input_txt = os.path.join(location, 'txt')
176 - files = utils.retrieve_file_list(input_txt, 'txt', mask=None)
177 - input_queue = pc.load_queue(files, poison_pill=True)
 183+
 184+
178185 tasks = multiprocessing.JoinableQueue()
179186 mgr = multiprocessing.Manager()
180187 keys = ['id', 'name', 'verified', 'projects']
181188
182189 if action == 'lookup':
183190 output_file = 'bots_ids.csv'
 191+ files = utils.retrieve_file_list(input_txt, 'txt', mask=None)
 192+ input_queue = pc.load_queue(files, poison_pill=True)
184193 bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager)
 194+ for file in files:
 195+ tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys))
 196+
185197 else:
186198 output_file = 'bots_predictionset.csv'
 199+ files = utils.retrieve_file_list(input_xml, 'xml', mask=None)
 200+ input_queue = pc.load_queue(files, poison_pill=True)
187201 bots = {}
 202+ for file in files:
 203+ tasks.put(models.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys))
188204
189205 #lock = mgr.Lock()
190206 if manager:
191207 manager = mgr
192208
193209
194 - for file in files:
195 - tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys))
196210
197211 tracker = {}
198212 if single:

Status & tagging log