Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -260,7 +260,9 @@ |
261 | 261 | db.cleanup_database(project, logger) |
262 | 262 | |
263 | 263 | write_message_to_log(logger, args, verb='Storing', location=location, input=input, project=project, collection=collection) |
264 | | - loader.store_editors(input, project, collection) |
| 264 | + num_editors = loader.store_editors(input, project, collection) |
| 265 | + cnt_editors = db.count_records(project, collection) |
| 266 | + assert num_editors == cnt_editors |
265 | 267 | timer.elapsed() |
266 | 268 | |
267 | 269 | |
— | — | @@ -297,7 +299,8 @@ |
298 | 300 | write_message_to_log(logger, args, verb='Creating', dir=dirs) |
299 | 301 | settings.verify_environment(dirs) |
300 | 302 | |
301 | | - file = full_project + '_editors.bin' |
| 303 | + |
| 304 | + file = kwargs.get('full_project') + '_editors.bin' |
302 | 305 | write_message_to_log(logger, args, verb='Deleting', file=file) |
303 | 306 | utils.delete_file(settings.binary_location, file) |
304 | 307 | |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -1,98 +0,0 @@ |
2 | | -#!/usr/bin/python |
3 | | -# -*- coding: utf-8 -*- |
4 | | -''' |
5 | | -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
6 | | -This program is free software; you can redistribute it and/or |
7 | | -modify it under the terms of the GNU General Public License version 2 |
8 | | -as published by the Free Software Foundation. |
9 | | -This program is distributed in the hope that it will be useful, |
10 | | -but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
12 | | -See the GNU General Public License for more details, at |
13 | | -http://www.fsf.org/licenses/gpl.html |
14 | | -''' |
15 | | - |
16 | | -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
17 | | -__author__email = 'dvanliere at gmail dot com' |
18 | | -__date__ = '2010-11-19' |
19 | | -__version__ = '0.1' |
20 | | - |
21 | | - |
22 | | -from Queue import Empty |
23 | | -import datetime |
24 | | -import sys |
25 | | -sys.path.append('..') |
26 | | - |
27 | | -import configuration |
28 | | -settings = configuration.Settings() |
29 | | - |
30 | | -from database import cache |
31 | | - |
32 | | - |
33 | | -#def store_editors(data_queue, **kwargs): |
34 | | -# ''' |
35 | | -# @data_queue is an instance of Queue containing information extracted by |
36 | | -# parse_editors() |
37 | | -# kwargs should contain: |
38 | | -# @dbname is the name of the MongoDB database where to store the information. |
39 | | -# @collection is the name of the MongoDB collection. |
40 | | -# ''' |
41 | | -# dbname = kwargs.get('dbname', None) |
42 | | -# collection = kwargs.pop('collection') |
43 | | -# mongo = db.init_mongo_db(dbname) |
44 | | -# collection = mongo[collection] |
45 | | -# mongo[collection].ensure_index('editor') |
46 | | -# editor_cache = cache.EditorCache(collection) |
47 | | -# |
48 | | -# while True: |
49 | | -# try: |
50 | | -# edit = data_queue.get(block=False) |
51 | | -# data_queue.task_done() |
52 | | -# if edit == None: |
53 | | -# print 'Swallowing poison pill' |
54 | | -# break |
55 | | -# elif edit == 'NEXT': |
56 | | -# editor_cache.add('NEXT', '') |
57 | | -# else: |
58 | | -# contributor = edit['editor'] |
59 | | -# value = {'date': edit['date'], 'article': edit['article']} |
60 | | -# editor_cache.add(contributor, value) |
61 | | -# #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True) |
62 | | -# #'$inc': {'edit_count': 1}, |
63 | | -# |
64 | | -# except Empty: |
65 | | -# ''' |
66 | | -# This checks whether the Queue is empty because the preprocessors are |
67 | | -# finished or because this function is faster in emptying the Queue |
68 | | -# then the preprocessors are able to fill it. If the preprocessors |
69 | | -# are finished and this Queue is empty than break, else wait for the |
70 | | -# Queue to fill. |
71 | | -# ''' |
72 | | -# pass |
73 | | -# |
74 | | -# print 'Emptying entire cache.' |
75 | | -# editor_cache.store() |
76 | | -# print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n) |
77 | | - |
78 | | - |
79 | | -def load_cache_objects(): |
80 | | - cache = {} |
81 | | - files = utils.retrieve_file_list(settings.binary_location, '.bin') |
82 | | - for x, file in enumerate(files): |
83 | | - cache[x] = utils.load_object(settings.binary_location, file) |
84 | | - return cache |
85 | | - |
86 | | - |
87 | | -def search_cache_for_missed_editors(dbname, collection): |
88 | | - mongo = db.init_mongo_db(dbname) |
89 | | - collection = mongo[collection] |
90 | | - editor_cache = cache.EditorCache(collection) |
91 | | - cache = load_cache_objects() |
92 | | - for c in cache: |
93 | | - for editor in cache[c]: |
94 | | - editor_cache.add(editor, cache[c][editor]) |
95 | | - cache[c] = {} |
96 | | - editor_cache.add('NEXT', '') |
97 | | - cache = {} |
98 | | - |
99 | | - |
Index: trunk/tools/editor_trends/etl/exporter.py |
— | — | @@ -207,11 +207,11 @@ |
208 | 208 | if id == None: |
209 | 209 | break |
210 | 210 | obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1}) |
| 211 | + if obs == None: |
| 212 | + continue |
211 | 213 | first_edit = obs['first_edit'] |
212 | 214 | last_edit = obs['final_edit'] |
213 | 215 | for y in xrange(2001, year): |
214 | | -# if y == 2010 and first_edit > datetime.datetime(2010, 1, 1): |
215 | | -# print 'debug' |
216 | 216 | if y not in data: |
217 | 217 | data[y] = {} |
218 | 218 | data[y]['n'] = 0 |
— | — | @@ -226,23 +226,26 @@ |
227 | 227 | if period not in data[y]: |
228 | 228 | data[y][period] = 0 |
229 | 229 | window_start = datetime.datetime(y, 12, 31) - relativedelta(months=period) |
| 230 | + if first_edit.year > y or last_edit.year < y: |
| 231 | + continue |
230 | 232 | if window_start < datetime.datetime(2001, 1, 1): |
231 | 233 | window_start = datetime.datetime(2001, 1, 1) |
232 | 234 | if date_falls_in_window(window_start, window_end, first_edit): |
233 | 235 | edits.append(period) |
234 | 236 | if edits != []: |
235 | 237 | p = min(edits) |
236 | | - data[y]['n'] += 1 |
237 | 238 | data[y][p] += 1 |
| 239 | + data[y]['n'] += 1 |
| 240 | + |
238 | 241 | except Empty: |
239 | 242 | break |
240 | 243 | print 'Storing data as %s' % os.path.join(settings.binary_location, dbname + '_cohort_data.bin') |
241 | | - utils.store_object(data, settings.binary_location, dbname + '_cohort_data') |
| 244 | + utils.store_object(data, settings.binary_location, dbname + '_cohort_data.bin') |
242 | 245 | cohort_charts.prepare_cohort_dataset(dbname) |
243 | 246 | |
244 | 247 | |
245 | 248 | def date_falls_in_window(window_start, window_end, first_edit): |
246 | | - if first_edit >= window_start and first_edit <= window_end: |
| 249 | + if first_edit >= window_start and first_edit <= window_end: |
247 | 250 | return True |
248 | 251 | else: |
249 | 252 | return False |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -162,12 +162,14 @@ |
163 | 163 | |
164 | 164 | |
165 | 165 | def run_parse_editors(location, **kwargs): |
166 | | - bot_ids = bots.retrieve_bots() |
| 166 | + |
167 | 167 | input = os.path.join(location, 'chunks') |
168 | 168 | output = os.path.join(location, 'txt') |
| 169 | + language_code = kwargs.get('language_code') |
169 | 170 | settings.verify_environment([input, output]) |
170 | 171 | files = utils.retrieve_file_list(input, 'xml') |
171 | 172 | |
| 173 | + bot_ids = bots.retrieve_bots(language_code) |
172 | 174 | tasks = multiprocessing.JoinableQueue() |
173 | 175 | consumers = [models.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
174 | 176 | for file in files: |
— | — | @@ -183,7 +185,8 @@ |
184 | 186 | |
185 | 187 | |
186 | 188 | def debug_parse_editors(location): |
187 | | - bot_ids = bots.retrieve_bots() |
| 189 | + language_code = 'en' |
| 190 | + bot_ids = bots.retrieve_bots(language_code) |
188 | 191 | input = os.path.join(location, 'chunks') |
189 | 192 | output = os.path.join(location, 'txt') |
190 | 193 | xml_file = models.XMLFile(input, output, 'pages_full_en.xml', bot_ids, output_editor_information) |
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -211,7 +211,7 @@ |
212 | 212 | output = os.path.join(location, 'chunks') |
213 | 213 | else: |
214 | 214 | output = os.path.join(location, 'txt') |
215 | | - bot_ids = bots.retrieve_bots() |
| 215 | + bot_ids = bots.retrieve_bots(language_code) |
216 | 216 | settings.verify_environment([output]) |
217 | 217 | |
218 | 218 | fh = None |
Index: trunk/tools/editor_trends/etl/shaper.py |
— | — | @@ -38,14 +38,6 @@ |
39 | 39 | year = datetime.datetime.now().year + 1 |
40 | 40 | for x in xrange(2001, year): |
41 | 41 | data[str(x)] = add_datatype(datatype) |
42 | | -# if datatype == 'dict': |
43 | | -# data[str(x)] = dict() |
44 | | -# elif datatype == 'list': |
45 | | -# data[str(x)] = list() |
46 | | -# elif datatype == 'set': |
47 | | -# data[str(x)] = set() |
48 | | -# else: |
49 | | -# data[str(x)] = 0.0 |
50 | 42 | return data |
51 | 43 | |
52 | 44 | |
— | — | @@ -54,16 +46,7 @@ |
55 | 47 | datacontainer[dc] = {} |
56 | 48 | for x in xrange(1, 13): |
57 | 49 | datacontainer[dc][str(x)] = add_datatype(datatype) |
58 | | -# if datatype == 'dict': |
59 | | -# datacontainer[dc][str(x)] = dict() |
60 | | -# elif datatype == 'list': |
61 | | -# datacontainer[dc][str(x)] = list() |
62 | | -# elif datatype == 'set': |
63 | | -# datacontainer[dc][str(x)] = set() |
64 | | -# else: |
65 | | -# datacontainer[dc][str(x)] = 0.0 |
66 | | -# #else: |
67 | | - # datacontainer[dc][str(x)] = 0.0 |
| 50 | + |
68 | 51 | return datacontainer |
69 | 52 | |
70 | 53 | |
Index: trunk/tools/editor_trends/etl/models.py |
— | — | @@ -73,14 +73,12 @@ |
74 | 74 | |
75 | 75 | |
76 | 76 | class XMLFile(object): |
77 | | - def __init__(self, input, output, xml_file, bots, target, output_file=None, **kwargs): |
78 | | - self.file = xml_file |
79 | | - self.input = input |
| 77 | + def __init__(self, file, location, output, output_file, target, ** kwargs): |
| 78 | + self.file = file |
| 79 | + self.location = location |
80 | 80 | self.output = output |
81 | | - self.bots = bots |
82 | 81 | self.target = target |
83 | 82 | self.output_file = output_file |
84 | | - self.lock = None |
85 | 83 | for kw in kwargs: |
86 | 84 | setattr(self, kw, kwargs[kw]) |
87 | 85 | |
— | — | @@ -96,11 +94,13 @@ |
97 | 95 | return '%s' % (self.file) |
98 | 96 | |
99 | 97 | def __call__(self, bots=None): |
| 98 | + if bots is not None and bots != {}: |
| 99 | + self.bots = bots |
100 | 100 | if settings.debug: |
101 | 101 | messages = {} |
102 | 102 | vars = {} |
103 | 103 | |
104 | | - data = xml.read_input(utils.create_txt_filehandle(self.input, |
| 104 | + data = xml.read_input(utils.create_txt_filehandle(self.location, |
105 | 105 | self.file, 'r', |
106 | 106 | encoding=settings.encoding)) |
107 | 107 | self.create_file_handle() |
— | — | @@ -111,10 +111,6 @@ |
112 | 112 | raw_data = ''.join(raw_data) |
113 | 113 | xml_buffer.write(raw_data) |
114 | 114 | elem = cElementTree.XML(xml_buffer.getvalue()) |
115 | | - except Exception, error: |
116 | | - print error |
117 | | - continue |
118 | | - try: |
119 | 115 | bots = self.target(elem, fh=self.fh, bots=self.bots) |
120 | 116 | except SyntaxError, error: |
121 | 117 | print error |
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -40,21 +40,16 @@ |
41 | 41 | collection.create_index('editor') |
42 | 42 | editor_cache = cache.EditorCache(collection) |
43 | 43 | prev_contributor = -1 |
44 | | - x = 0 |
45 | 44 | edits = 0 |
46 | | - editors = set() |
47 | 45 | for line in sort.readline(fh): |
48 | 46 | if len(line) == 0: |
49 | 47 | continue |
50 | 48 | contributor = line[0] |
| 49 | + #print 'Parsing %s' % contributor |
51 | 50 | if prev_contributor != contributor: |
52 | 51 | if edits > 9: |
53 | | - result = editor_cache.add(prev_contributor, 'NEXT') |
54 | | - if result: |
55 | | - editors.add(prev_contributor) |
56 | | - result = None |
57 | | - x += 1 |
58 | | - print 'Stored %s editors' % x |
| 52 | + editor_cache.add(prev_contributor, 'NEXT') |
| 53 | + print 'Stored %s' % prev_contributor |
59 | 54 | else: |
60 | 55 | editor_cache.clear(prev_contributor) |
61 | 56 | edits = 0 |
— | — | @@ -66,8 +61,9 @@ |
67 | 62 | editor_cache.add(contributor, value) |
68 | 63 | prev_contributor = contributor |
69 | 64 | fh.close() |
| 65 | + print editor_cache.n |
| 66 | + return editor_cache.n |
70 | 67 | |
71 | | - |
72 | 68 | def mergesort_external_launcher(input, output): |
73 | 69 | files = utils.retrieve_file_list(input, 'txt', mask='') |
74 | 70 | x = 0 |
— | — | @@ -154,4 +150,4 @@ |
155 | 151 | collection = 'editors' |
156 | 152 | #mergesort_launcher(input, intermediate_output) |
157 | 153 | #mergesort_external_launcher(intermediate_output, output) |
158 | | - store_editors(output, dbname, collection) |
| 154 | + num_editors = store_editors(output, dbname, collection) |
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -35,7 +35,7 @@ |
36 | 36 | self.n = 0 |
37 | 37 | |
38 | 38 | def __repr__(self): |
39 | | - return '%s' % 'Editor Cache' |
| 39 | + return '%s' % self.editors |
40 | 40 | |
41 | 41 | def clear(self, key): |
42 | 42 | if key in self.editors: |
— | — | @@ -44,9 +44,8 @@ |
45 | 45 | def add(self, key, value): |
46 | 46 | if value == 'NEXT': |
47 | 47 | self.n += 1 |
48 | | - result = self.insert(key, self.editors[key]['edits'], self.editors[key]['username']) |
| 48 | + self.insert(key, self.editors[key]['edits'], self.editors[key]['username']) |
49 | 49 | del self.editors[key] |
50 | | - return result |
51 | 50 | else: |
52 | 51 | if key not in self.editors: |
53 | 52 | self.editors[key] = {} |
— | — | @@ -65,11 +64,13 @@ |
66 | 65 | self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
67 | 66 | |
68 | 67 | def insert(self, editor, values, username): |
69 | | - try: |
70 | | - self.collection.insert({'editor': editor, 'edits': values, 'username': username}) |
71 | | - return True |
72 | | - except: |
73 | | - return False |
| 68 | + ''' |
| 69 | + Adding the safe=True statement slows down the insert process but this assures that all data |
| 70 | + will be written. |
| 71 | + ''' |
| 72 | + self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
| 73 | + #except: |
| 74 | + # return False |
74 | 75 | |
75 | 76 | def store(self): |
76 | 77 | utils.store_object(self, settings.binary_location, self.__repr__()) |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -105,7 +105,7 @@ |
106 | 106 | ids = [] |
107 | 107 | cursor = collection.map_reduce(map, reduce) |
108 | 108 | for c in cursor.find(): |
109 | | - ids.append(int(c['_id'])) |
| 109 | + ids.append(c['_id']) |
110 | 110 | return ids |
111 | 111 | #def init_database(db=None): |
112 | 112 | # ''' |
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -70,7 +70,7 @@ |
71 | 71 | return bot_dict |
72 | 72 | |
73 | 73 | |
74 | | -def retrieve_bots(): |
| 74 | +def retrieve_bots(language_code): |
75 | 75 | ''' |
76 | 76 | Loader function to retrieve list of id's of known Wikipedia bots. |
77 | 77 | ''' |
— | — | @@ -79,7 +79,7 @@ |
80 | 80 | bots = mongo['ids'] |
81 | 81 | cursor = bots.find() |
82 | 82 | for bot in cursor: |
83 | | - if bot['verified'] == 'True': |
| 83 | + if bot['verified'] == 'True' and language_code in bot['projects']: |
84 | 84 | ids[bot['id']] = bot['name'] |
85 | 85 | return ids |
86 | 86 | |
— | — | @@ -143,18 +143,25 @@ |
144 | 144 | return bots |
145 | 145 | |
146 | 146 | |
147 | | -def create_bot_validation_dataset(data, fh, bots, keys): |
148 | | - username = data[3].lower() |
149 | | - #print username.encode('utf-8') |
150 | | - if username.find('bot') > -1 or username.find('script') > -1: |
151 | | - bot = bots.get(username, botmodels.Bot(username, verified=False)) |
152 | | - setattr(bot, 'id', data[0]) |
| 147 | +def create_bot_validation_dataset(xml_nodes, fh, bots): |
| 148 | + revisions = xml_nodes.findall('revision') |
| 149 | + for revision in revisions: |
| 150 | + contributor = xml.retrieve_xml_node(revision, 'contributor') |
| 151 | + username = contributor.find('username') |
| 152 | + if username == None or username.text == None: |
| 153 | + continue |
| 154 | + else: |
| 155 | + username = username.text.lower() |
153 | 156 | |
154 | | - timestamp = data[1] |
155 | | - if timestamp != None: |
156 | | - timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
157 | | - bot.time[str(timestamp.year)].append(timestamp) |
158 | | - bots[username] = bot |
| 157 | + #print username.encode('utf-8') |
| 158 | + if username.find('bot') > -1 or username.find('script') > -1: |
| 159 | + bot = bots.get(username, botmodels.Bot(username, verified=False)) |
| 160 | + bot.id = contributor.find('id').text |
| 161 | + timestamp = revision.find('timestamp').text |
| 162 | + if timestamp != None: |
| 163 | + timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
| 164 | + bot.time[str(timestamp.year)].append(timestamp) |
| 165 | + bots[username] = bot |
159 | 166 | |
160 | 167 | return bots |
161 | 168 | |
— | — | @@ -172,26 +179,33 @@ |
173 | 180 | location = os.path.join(settings.input_location, language_code, project) |
174 | 181 | input_xml = os.path.join(location, 'chunks') |
175 | 182 | input_txt = os.path.join(location, 'txt') |
176 | | - files = utils.retrieve_file_list(input_txt, 'txt', mask=None) |
177 | | - input_queue = pc.load_queue(files, poison_pill=True) |
| 183 | + |
| 184 | + |
178 | 185 | tasks = multiprocessing.JoinableQueue() |
179 | 186 | mgr = multiprocessing.Manager() |
180 | 187 | keys = ['id', 'name', 'verified', 'projects'] |
181 | 188 | |
182 | 189 | if action == 'lookup': |
183 | 190 | output_file = 'bots_ids.csv' |
| 191 | + files = utils.retrieve_file_list(input_txt, 'txt', mask=None) |
| 192 | + input_queue = pc.load_queue(files, poison_pill=True) |
184 | 193 | bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
| 194 | + for file in files: |
| 195 | + tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
| 196 | + |
185 | 197 | else: |
186 | 198 | output_file = 'bots_predictionset.csv' |
| 199 | + files = utils.retrieve_file_list(input_xml, 'xml', mask=None) |
| 200 | + input_queue = pc.load_queue(files, poison_pill=True) |
187 | 201 | bots = {} |
| 202 | + for file in files: |
| 203 | + tasks.put(models.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
188 | 204 | |
189 | 205 | #lock = mgr.Lock() |
190 | 206 | if manager: |
191 | 207 | manager = mgr |
192 | 208 | |
193 | 209 | |
194 | | - for file in files: |
195 | | - tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
196 | 210 | |
197 | 211 | tracker = {} |
198 | 212 | if single: |