Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | #Default Python libraries (Python => 2.6) |
19 | 22 | import sys |
— | — | @@ -189,7 +192,7 @@ |
190 | 193 | values = [] |
191 | 194 | #print data_queue.qsize() |
192 | 195 | |
193 | | - |
| 196 | + |
194 | 197 | except Empty: |
195 | 198 | # The queue is empty but store the remaining values if present |
196 | 199 | if values != []: |
— | — | @@ -204,7 +207,7 @@ |
205 | 208 | are finished and this Queue is empty than break, else wait for the |
206 | 209 | Queue to fill. |
207 | 210 | ''' |
208 | | - |
| 211 | + |
209 | 212 | if all([utils.check_if_process_is_running(pid) for pid in pids]): |
210 | 213 | pass |
211 | 214 | #print 'Empty queue or not %s?' % data_queue.qsize() |
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | from utils import utils |
19 | 22 | import settings |
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -13,8 +13,10 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | | - |
19 | 21 | ''' |
20 | 22 | This file contains settings that are used for constructing and analyzing |
21 | 23 | the datasets as part of the Editor Dynamics and Anti-Vandalism projects. |
— | — | @@ -91,3 +93,14 @@ |
92 | 94 | #Multiprocess settings used to parallelize workload |
93 | 95 | #Change this to match your computers configuration (RAM / CPU) |
94 | 96 | NUMBER_OF_PROCESSES = cpu_count() * 1 |
| 97 | + |
| 98 | +#Extensions of ascii files, this is used to determine the filemode to use |
| 99 | +ASCII = ['txt', 'csv', 'xml', 'sql'] |
| 100 | + |
| 101 | +WP_DUMP_LOCATION = 'http://download.wikimedia.org' |
| 102 | + |
| 103 | +LANGUAGE_MAPPING = { |
| 104 | +'English': '/enwiki/latest/', |
| 105 | +'Russian': '/ruwiki/latest/', |
| 106 | +'German': '/dewiki/latest/',
| 107 | +} |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | ''' |
19 | 22 | The utils module contains helper functions that will be needed throughout. |
— | — | @@ -138,6 +141,18 @@ |
139 | 142 | fh.close() |
140 | 143 | |
141 | 144 | |
| 145 | +def determine_file_extension(filename): |
| 146 | + pos = filename.rfind('.') + 1 |
| 147 | + return filename[pos:] |
| 148 | + |
| 149 | + |
| 150 | +def determine_file_mode(extension): |
| 151 | + if extension in settings.ASCII: |
| 152 | + return 'w' |
| 153 | + else: |
| 154 | + return 'wb' |
| 155 | + |
| 156 | + |
142 | 157 | def write_data_to_csv(data, location, function, encoding): |
143 | 158 | filename = construct_filename_from_function(function, '.csv') |
144 | 159 | fh = open_txt_file(location, filename, 'a', encoding=encoding) |
— | — | @@ -155,14 +170,19 @@ |
156 | 171 | |
157 | 172 | |
158 | 173 | def open_txt_file(location, filename, mode, encoding): |
159 | | - return codecs.open(location+filename, mode, encoding=encoding) |
| 174 | + return codecs.open(location + filename, mode, encoding=encoding) |
160 | 175 | |
| 176 | + |
| 177 | +def open_binary_file(location, filename, mode): |
| 178 | + return open(location + filename, mode) |
| 179 | + |
161 | 180 | def construct_filename_from_function(function, extension): |
162 | 181 | return function.func_name + extension |
163 | 182 | |
| 183 | + |
164 | 184 | def check_file_exists(location, filename): |
165 | 185 | if hasattr(filename, '__call__'): |
166 | | - filename = construct_filename_from_function(filename, '.bin') |
| 186 | + filename = construct_filename_from_function(filename, '.bin') |
167 | 187 | if os.path.exists(location + filename): |
168 | 188 | return True |
169 | 189 | else: |
— | — | @@ -181,7 +201,7 @@ |
182 | 202 | |
183 | 203 | def load_object(location, filename): |
184 | 204 | if hasattr(filename, '__call__'): |
185 | | - filename = construct_filename_from_function(filename, '.bin') |
| 205 | + filename = construct_filename_from_function(filename, '.bin') |
186 | 206 | if not filename.endswith('.bin'): |
187 | 207 | filename = filename + '.bin' |
188 | 208 | fh = open(location + filename, 'rb') |
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -13,8 +13,10 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | | - |
19 | 21 | import os |
20 | 22 | import sys |
21 | 23 | import urllib2 |
— | — | @@ -22,56 +24,78 @@ |
23 | 25 | |
24 | 26 | import progressbar |
25 | 27 | |
| 28 | +import settings |
26 | 29 | import utils |
27 | | -import settings |
28 | 30 | |
29 | 31 | |
| 32 | + |
30 | 33 | def determine_remote_filesize(url, filename): |
31 | 34 | ''' |
32 | 35 | @url is the full path of the file to be downloaded |
33 | 36 | @filename is the name of the file to be downloaded |
34 | 37 | ''' |
35 | | - conn = httplib.HTTPConnection(url) |
| 38 | + if url.startswith('http://'): |
| 39 | + url = url[7:] |
| 40 | + conn = httplib.HTTPConnection(url, 80) |
36 | 41 | conn.request('HEAD', filename) |
37 | 42 | res = conn.getresponse() |
| 43 | + conn.close() |
38 | 44 | if res.status == 200: |
39 | | - return res.getheader('content-length', -1) |
| 45 | + return int(res.getheader('content-length', -1)) |
40 | 46 | else: |
41 | 47 | return - 1 |
42 | 48 | |
43 | 49 | |
44 | | -def download_wp_dump(url, filename, location, pbar): |
| 50 | +def download_wp_dump(domain, path, filename, location, filemode, pbar): |
45 | 51 | ''' |
46 | 52 | This is a very simple replacement for wget and curl because Windows does |
47 | 53 | support these tools. |
48 | 54 | @url location of the file to be downloaded |
49 | 55 | @filename name of the file to be downloaded |
50 | 56 | @location indicates where to store the file locally |
| 57 | + @filemode indicates whether we are downloading a binary or ascii file. |
51 | 58 | @pbar is an instance of progressbar.ProgressBar() |
52 | 59 | ''' |
53 | 60 | chunk = 4096 |
54 | | - fh = utils.open_txt_file(location, filename, 'w', settings.ENCODING) |
55 | | - req = urllib2.Request(url + filename) |
56 | | - filesize = determine_remote_filesize(url, filename) |
57 | | - if filesize != -1: |
58 | | - pbar(maxval=filesize).start() |
| 61 | + if filemode == 'w': |
| 62 | + fh = utils.open_txt_file(location, filename, filemode, settings.ENCODING) |
| 63 | + else: |
| 64 | + fh = utils.open_binary_file(location, filename, filemode) |
| 65 | + |
| 66 | + filesize = determine_remote_filesize(domain, path + filename) |
| 67 | + |
| 68 | + |
| 69 | + if filesize != -1 and pbar: |
| 70 | + widgets = ['%s: ' % filename, progressbar.Percentage(), ' ', |
| 71 | + progressbar.Bar(marker=progressbar.RotatingMarker()),' ', |
| 72 | + progressbar.ETA(), ' ', progressbar.FileTransferSpeed()] |
| 73 | + |
| 74 | + pbar = progressbar.ProgressBar(widgets=widgets,maxval=filesize).start() |
| 75 | + else: |
| 76 | + pbar = False |
| 77 | + |
| 78 | + req = urllib2.Request(domain + path + filename) |
59 | 79 | try: |
60 | 80 | response = urllib2.urlopen(req) |
61 | | - i = 0 |
62 | 81 | while True: |
63 | 82 | data = response.read(chunk) |
64 | 83 | if not data: |
65 | | - print 'Finished downloading %s%s.' % (url, filename) |
| 84 | + print 'Finished downloading %s%s%s.' % (domain, path, filename) |
66 | 85 | break |
67 | | - f.write(data) |
| 86 | + fh.write(data) |
68 | 87 | |
69 | 88 | if pbar: |
70 | | - pbar.update(i * chunk) |
71 | | - i += 1 |
72 | | - except URLError, error: |
| 89 | + filesize -= chunk |
| 90 | + if filesize < 0: |
| 91 | + chunk = chunk + filesize |
| 92 | + pbar.update(pbar.currval + chunk) |
| 93 | + |
| 94 | + except urllib2.URLError, error: |
73 | 95 | print 'Reason: %s' % error.reason |
74 | | - except HTTPError, error: |
| 96 | + except urllib2.HTTPError, error: |
75 | 97 | print 'Error: %s' % error.code |
| 98 | + finally: |
| 99 | + fh.close() |
76 | 100 | |
77 | 101 | |
78 | 102 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | from multiprocessing import Process, Queue |
19 | 22 | from Queue import Empty |
— | — | @@ -106,7 +109,7 @@ |
107 | 110 | data = obj |
108 | 111 | for d in data: |
109 | 112 | input_queue.put(d) |
110 | | - |
| 113 | + |
111 | 114 | if poison_pill: |
112 | 115 | for p in xrange(settings.NUMBER_OF_PROCESSES): |
113 | 116 | input_queue.put(None) |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | from multiprocessing import Queue |
19 | 22 | from Queue import Empty |
— | — | @@ -32,10 +35,10 @@ |
33 | 36 | pass |
34 | 37 | |
35 | 38 | |
36 | | -def retrieve_editor_ids_mongo(): |
| 39 | +def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True): |
37 | 40 | if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION, |
38 | 41 | retrieve_editor_ids_mongo): |
39 | | - ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
| 42 | + contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, |
40 | 43 | retrieve_editor_ids_mongo) |
41 | 44 | else: |
42 | 45 | mongo = db.init_mongo_db('editors') |
— | — | @@ -43,13 +46,14 @@ |
44 | 47 | contributors = set() |
45 | 48 | #ids = editors.find().distinct('editor') |
46 | 49 | ids = editors.find() |
47 | | - for x,id in enumerate(ids): |
| 50 | + for x, id in enumerate(ids): |
48 | 51 | contributors.add(id['editor']) |
49 | | - if len(contributors) % 25000 == 0: |
50 | | - print x, len(contributors) |
51 | | - if ids != set(): |
52 | | - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
53 | | - return ids |
| 52 | + if len(contributors) == 100000: |
| 53 | + if RANDOM_SAMPLE: |
| 54 | + break |
| 55 | + if contributors != set(): |
| 56 | + utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 57 | + return contributors |
54 | 58 | |
55 | 59 | |
56 | 60 | def generate_editor_dataset(input_queue, data_queue, pbar, kwargs): |
— | — | @@ -58,6 +62,7 @@ |
59 | 63 | debug = kwargs.pop('debug') |
60 | 64 | mongo = db.init_mongo_db('editors') |
61 | 65 | editors = mongo['editors'] |
| 66 | + data = {} |
62 | 67 | while True: |
63 | 68 | try: |
64 | 69 | if debug: |
— | — | @@ -65,29 +70,32 @@ |
66 | 71 | else: |
67 | 72 | id = input_queue.get(block=False) |
68 | 73 | |
69 | | - |
| 74 | + print input_queue.qsize() |
70 | 75 | if definition == 'Traditional': |
71 | | - obs = editors.find({'editor': id}).sort('date').limit(limit) |
| 76 | + |
| 77 | + obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit) |
72 | 78 | contributors = [] |
73 | 79 | for ob in obs: |
74 | 80 | contributors.append(ob['date']) |
| 81 | + obs = '' |
75 | 82 | else: |
76 | | - obs = editors.find({'editor': id}).sort('date') |
| 83 | + obs = editors.find({'editor': id}, {'date':1}).sort('date') |
77 | 84 | contributors = set() |
78 | 85 | for ob in obs: |
79 | 86 | if len(contributors) == limit: |
80 | 87 | break |
81 | 88 | else: |
82 | 89 | contributors.add(ob['date']) |
83 | | - |
| 90 | + obs.close() |
84 | 91 | if len(contributors) < limit: |
85 | 92 | new_wikipedian = False |
86 | 93 | else: |
87 | 94 | new_wikipedian = True |
88 | | - data = {id: [contributors, new_wikipedian]} |
89 | | - utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
| 95 | + data[id] = [contributors, new_wikipedian] |
90 | 96 | |
| 97 | + |
91 | 98 | except Empty: |
| 99 | + utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
92 | 100 | break |
93 | 101 | |
94 | 102 | |
— | — | @@ -152,21 +160,26 @@ |
153 | 161 | |
154 | 162 | |
155 | 163 | def debug_retrieve_edits_by_contributor_launcher(): |
156 | | - input_queue = Queue() |
| 164 | + q = Queue() |
157 | 165 | kwargs = {'definition':'Traditional', |
158 | 166 | 'limit': 10, |
159 | | - 'debug': True |
| 167 | + 'debug': False |
160 | 168 | } |
| 169 | + ids = retrieve_editor_ids_mongo() |
| 170 | + input_queue = pc.load_queue(q, ids) |
161 | 171 | generate_editor_dataset(input_queue, False, False, kwargs) |
162 | 172 | #generate_editor_dataset_launcher() |
163 | 173 | #retrieve_list_contributors() |
164 | 174 | #retrieve_edits_by_contributor() |
165 | 175 | |
166 | 176 | def generate_editor_dataset_launcher(): |
167 | | - ids = retrieve_editor_ids_mongo() |
168 | | - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, definition='Traditional', limit=10) |
| 177 | + kwargs = {'definition':'Traditional', |
| 178 | + 'limit': 10, |
| 179 | + 'debug': False |
| 180 | + } |
| 181 | +    pc.build_scaffolding(pc.load_queue, generate_editor_dataset, retrieve_editor_ids_mongo(), False, False, kwargs)
169 | 182 | |
170 | 183 | |
171 | 184 | if __name__ == '__main__': |
172 | | - generate_editor_dataset_launcher() |
173 | | - #debug_retrieve_edits_by_contributor_launcher() |
| 185 | + #generate_editor_dataset_launcher() |
| 186 | + debug_retrieve_edits_by_contributor_launcher() |
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -13,6 +13,9 @@ |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__author_email__ = 'dvanliere at gmail dot com'
| 18 | +__date__ = '2010-10-21' |
| 19 | +__version__ = '0.1' |
17 | 20 | |
18 | 21 | import sqlite3 as sqlite |
19 | 22 | from pymongo import Connection |
— | — | @@ -38,7 +41,7 @@ |
39 | 42 | @collection is the name of the 'table' in mongodb |
40 | 43 | @key name of the field to create the index |
41 | 44 | ''' |
42 | | - |
| 45 | + |
43 | 46 | mongo = init_mongo_db(db) |
44 | 47 | collection = mongo[collection] |
45 | 48 | mongo.collection.create_index(key) |