r75884 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r75883 | r75884 | r75885 >
Date: 22:08, 2 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
Significant code refactoring, debugging and performance improvements. In particular, the cache object is starting to pay off.
Modified paths:
  • /trunk/tools/editor_trends (modified) (history)
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/optimize_editors.py (added) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (modified) (history)
  • /trunk/tools/editor_trends/utils/namespace_downloader.py (added) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/optimize_editors.py
@@ -0,0 +1,120 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-11-02'
 19+__version__ = '0.1'
 20+
 21+
 22+
 23+import datetime
 24+from operator import itemgetter
 25+from Queue import Empty
 26+
 27+import settings
 28+import construct_datasets
 29+from database import db
 30+from utils import process_constructor as pc
 26+
 27+
 28+def create_datacontainer(init_value=0):
 29+ '''
 30+ This function initializes a dictionary with the year as key (starting in
 31+ 2001 and running through the current year) and @init_value as value. In
 32+ most cases this will be zero, so the dictionary acts as a running tally,
 33+ but @init_value can also be a list, [], a dictionary, {}, or a set, set().
 34+ '''
 35+ data = {}
 36+ year = datetime.datetime.now().year + 1
 37+ for x in xrange(2001, year):
 38+ data[str(x)] = init_value
 39+ return data
 40+
 41+
 42+def determine_edits_by_year(dates):
 43+ '''
 44+ This function counts the number of edits by year made by a particular editor.
 45+ '''
 46+ edits = create_datacontainer()
 47+ for date in dates:
 48+ year = str(date['date'].year)
 49+ edits[year] += 1
 50+ return edits
 51+
 52+
 53+def determine_articles_by_year(dates):
 54+ '''
 55+ This function counts the number of unique articles by year edited by a
 56+ particular editor.
 57+ '''
 58+ articles = create_datacontainer(set())
 59+ for date in dates:
 60+ year = str(date['date'].year)
 61+ articles[year].add(date['article'])
 62+ for year in articles:
 63+ articles[year] = len(articles[year])
 64+ return articles
 65+
 66+
 67+def optimize_editors(input_queue, result_queue, pbar, kwargs):
 68+ dbname = kwargs.pop('dbname')
 69+ mongo = db.init_mongo_db(dbname)
 70+ input = mongo['editors']
 71+ output = mongo['dataset']
 72+ output.ensure_index('editor')
 73+ output.ensure_index('year_joined')
 74+ definition = kwargs.pop('definition')
 75+ while True:
 76+ try:
 77+ id = input_queue.get(block=False)
 78+ editor = input.find_one({'editor': id})
 79+ edits = editor['edits']
 80+ edits = sorted(edits, key=itemgetter('date'))
 81+ edit_count = len(edits)
 82+ new_wikipedian = edits[9]['date'].year
 83+ first_edit = edits[0]['date']
 84+ final_edit = edits[-1]['date']
 85+ edits_by_year = determine_edits_by_year(edits)
 86+ articles_by_year = determine_articles_by_year(edits)
 87+ edits = edits[:10]
 88+
 89+ output.insert({'editor': id, 'edits': edits,
 90+ 'edits_by_year': edits_by_year,
 91+ 'year_joined': new_wikipedian,
 92+ 'edit_count': edit_count,
 93+ 'final_edit': final_edit,
 94+ 'first_edit': first_edit,
 95+ 'articles_by_year': articles_by_year})
 96+ print 'Items left: %s' % input_queue.qsize()
 97+ except Empty:
 98+ break
 99+
 100+def run_optimize_editors(dbname):
 101+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 102+ kwargs = {'definition': 'traditional',
 103+ 'pbar': True,
 104+ 'dbname': dbname,
 105+ 'nr_input_processors': 2,
 106+ 'nr_output_processors': 0,
 107+ }
 108+ pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
 109+
 110+
 111+def debug_optimize_editors(dbname):
 112+ ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
 113+ q = pc.load_queue(ids)
 114+ kwargs = {'definition': 'traditional',
 115+ 'dbname': dbname
 116+ }
 117+ optimize_editors(q, False, True, kwargs)
 118+
 119+
 120+if __name__ == '__main__':
 121+ run_optimize_editors('enwiki')
\ No newline at end of file
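One caveat on the new create_datacontainer(): the same @init_value object is assigned to every year key, so calling it with a mutable value such as set() (as determine_articles_by_year() does) makes all years share a single set. A minimal sketch of a safer variant, passing a factory instead of an instance; this variant is not part of the commit:

import datetime

def create_datacontainer(init_factory=int):
    '''
    Initialize a {year: value} dictionary from 2001 through the current
    year. Taking a factory (int, list, dict, set) instead of a value
    guarantees that every year gets its own fresh object.
    '''
    data = {}
    for x in xrange(2001, datetime.datetime.now().year + 1):
        data[str(x)] = init_factory()  # int() == 0, set() == set([])
    return data

articles = create_datacontainer(set)
articles['2001'].add('Amsterdam')
print articles['2002']  # set([]), no longer shared with 2001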
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -21,11 +21,14 @@
2222 import sys
2323 import os
2424 import time
 25+import datetime
2526 import codecs
 27+import math
2628 import cStringIO
2729 import re
 30+from operator import itemgetter
2831 import xml.etree.cElementTree as cElementTree
29 -from multiprocessing import Queue
 32+from multiprocessing import Queue, JoinableQueue
3033 from Queue import Empty
3134 import pymongo
3235
@@ -34,6 +37,7 @@
3538 from utils import utils, models
3639 from database import db_settings
3740 from database import db
 41+from database import cache
3842 from wikitree import xml
3943 from statistics import dataset
4044 from utils import process_constructor as pc
@@ -45,13 +49,15 @@
4650 except ImportError:
4751 pass
4852
49 -#contributors = {}
5053
51 -RE_BOT = re.compile('bot', re.IGNORECASE)
52 -RE_SCRIPT = re.compile('script', re.IGNORECASE)
 54+def determine_username_is_bot(username, kwargs):
 55+ '''
 56+ @username is the xml element containing the id of the user
 57+ @kwargs should have a list with all the bot ids
5358
54 -
55 -def determine_username_is_bot(username, kwargs):
 59+ @Return False if the username id is not in the list of bot ids, True if
 60+ it is a bot id.
 61+ '''
5662 ids = kwargs.get('bots', [])
5763 if ids == None:
5864 ids = []
@@ -66,14 +72,14 @@
6773 def extract_contributor_id(contributor, kwargs):
6874 '''
6975 @contributor is the xml contributor node containing a number of attributes
70 -
 76+
7177 Currently, we are only interested in registered contributors, hence we
7278 ignore anonymous editors. If you are interested in collecting data on
7379 anonymous editors then add the string 'ip' to the tags variable.
7480 '''
7581 tags = ['id']
7682 if contributor.get('deleted'):
77 - return - 1 #Not sure if this is the best way to code deleted contributors.
 83+ return - 1 # ASK: Not sure if this is the best way to code deleted contributors.
7884 for elem in contributor:
7985 if elem.tag in tags:
8086 if elem.text != None:
@@ -83,6 +89,14 @@
8490
8591
8692 def output_editor_information(elem, data_queue, **kwargs):
 93+ '''
 94+ @elem is an XML element containing 1 revision from a page
 95+ @data_queue is where to store the data
 96+ @**kwargs contains extra information
 97+
 98+ The variable @tags determines which attributes are parsed; the values in
 99+ this dictionary are the functions used to extract the data.
 100+ '''
87101 tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot},
88102 'timestamp': {'date': xml.extract_text},
89103 }
@@ -104,10 +118,24 @@
105119 data_queue.put(vars)
106120 vars = {}
107121
108 -def parse_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
 122+
 123+def parse_editors(xml_queue, data_queue, pbar, bots, **kwargs):
 124+ '''
 125+ @xml_queue contains the filenames of the files to be parsed
 126+ @data_queue is an instance of Queue where the extracted data is stored for
 127+ further processing
 128+ @pbar is an instance of progressbar to display the progress
 129+ @bots is a list of ids of known Wikipedia bots
 130+ @debug is a flag to indicate whether the function is called for debugging.
 131+
 132+ Output is the data_queue that will be used by store_editors()
 133+ '''
 134+ file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'))
 135+ debug = kwargs.get('debug', None)
109136 if settings.DEBUG:
110137 messages = {}
111138 vars = {}
 139+
112140 while True:
113141 try:
114142 if debug:
@@ -117,12 +145,13 @@
118146 if file == None:
119147 print 'Swallowed a poison pill'
120148 break
121 - data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION,
 149+ data = xml.read_input(utils.create_txt_filehandle(file_location,
122150 file, 'r',
123151 encoding=settings.ENCODING))
124152 for raw_data in data:
125153 xml_buffer = cStringIO.StringIO()
126154 raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
 155+
127156 try:
128157 raw_data = ''.join(raw_data)
129158 xml_buffer.write(raw_data)
@@ -144,142 +173,122 @@
145174 if settings.DEBUG:
146175 utils.track_errors(xml_buffer, error, file, messages)
147176 except MemoryError, error:
148 - '''
149 - There is one xml file causing an out of memory file, not
150 - sure which one yet. This happens when raw_data =
151 - ''.join(raw_data) is called. 18-22
152 - '''
153177 print file, error
154178 print raw_data[:12]
155179 print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
156 - if settings.DEBUG:
157 - utils.track_errors(xml_buffer, error, file, messages)
158180
 181+ data_queue.put('NEXT')
159182 if pbar:
160 - #print xml_queue.qsize()
161 - utils.update_progressbar(pbar, xml_queue)
 183+ print file, xml_queue.qsize(), data_queue.qsize()
 184+ #utils.update_progressbar(pbar, xml_queue)
162185 if debug:
163186 break
164187
 188+ while True:
 189+ if data_queue.qsize() < 100000:
 190+ break
 191+ else:
 192+ time.sleep(10)
 193+ print 'Still sleeping, queue is %s items long' % data_queue.qsize()
 194+
165195 except Empty:
166196 break
167197
 198+ #for x in xrange(4):
 199+ data_queue.put(None)
 200+
168201 if settings.DEBUG:
169 - utils.report_error_messages(messages, lookup_new_editors)
 202+ utils.report_error_messages(messages, parse_editors)
170203
171204
172205 def store_editors(data_queue, pids, dbname):
 206+ '''
 207+ @data_queue is an instance of Queue containing information extracted by
 208+ parse_editors()
 209+ @pids is a list of PIDs used to check if other processes are finished
 210+ running
 211+ @dbname is the name of the MongoDB collection where to store the information.
 212+ '''
173213 mongo = db.init_mongo_db(dbname)
174214 collection = mongo['editors']
175215 mongo.collection.ensure_index('editor')
 216+ editor_cache = cache.EditorCache(collection)
176217 while True:
177218 try:
178219 edit = data_queue.get(block=False)
179 - contributor = edit['editor']
180 - value = {'date':edit['date'], 'article': edit['article']}
181 - collection.update({'editor': contributor}, {'$inc': {'edit_count': 1},
182 - '$push': {'edits': value}}, True)
 220+ data_queue.task_done()
 221+ if edit == None:
 222+ print 'Swallowing poison pill'
 223+ break
 224+ elif edit == 'NEXT':
 225+ editor_cache.add('NEXT', '')
 226+ else:
 227+ contributor = edit['editor']
 228+ value = {'date': edit['date'], 'article': edit['article']}
 229+ editor_cache.add(contributor, value)
 230+ #collection.update({'editor': contributor}, {'$push': {'edits': value}}, True)
 231+ #'$inc': {'edit_count': 1},
 232+
183233 except Empty:
184234 '''
185235 This checks whether the Queue is empty because the preprocessors are
186236 finished or because this function is faster in emptying the Queue
187 - then the preprocessors are able to fill it. If this preprocessors
 237+ than the preprocessors are able to fill it. If the preprocessors
188238 are finished and this Queue is empty then break, else wait for the
189239 Queue to fill.
190240 '''
191 - if all([utils.check_if_process_is_running(pid) for pid in pids]):
192 - pass
193 - #print 'Empty queue or not %s?' % data_queue.qsize()
194 - else:
195 - break
 241+ pass
196242
 243+ print 'Emptying entire cache.'
 244+ editor_cache.store()
 245+ print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
197246
198 -def optimize_editors(dbname, input_queue, **kwargs):
199 - mongo = db.init_mongo_db(dbname)
200 - collection = mongo['editors']
201 - definition = kwargs.pop('definition')
202 - while True:
203 - try:
204 - id = input_queue.get(block=False)
205 - #id = '94033'
206 - editor = collection.find_one({'editor': id})
207 - edits = editor['edits']
208 - edits.sort()
209 - year = edits[0]['date'].year
210 - new_wikipedian = dataset.determine_editor_is_new_wikipedian(edits, defintion)
211 - collection.update({'editor': id}, {'$set': {'edits': edits, 'year_joined': year, 'new_wikipedian': new_wikipedian}})
212 -
213 - except Empty:
214 - break
215247
 248+def load_bot_ids():
 249+ '''
 250+ Loader function to retrieve list of id's of known Wikipedia bots.
 251+ '''
 252+ ids = {}
 253+ mongo = db.init_mongo_db('bots')
 254+ bots = mongo['ids']
 255+ cursor = bots.find()
 256+ for bot in cursor:
 257+ ids[bot['id']] = bot['name']
 258+ return ids
216259
217 -def store_data_db(data_queue, pids):
218 - connection = db.init_database()
219 - cursor = connection.cursor()
220 - db.create_tables(cursor, db_settings.CONTRIBUTOR_TABLE)
221260
222 - empty = 0
223 -
224 - values = []
225 - while True:
226 - try:
227 - chunk = data_queue.get(block=False)
228 - contributor = chunk['contributor'].encode(settings.ENCODING)
229 - article = chunk['article']
230 - timestamp = chunk['timestamp'].encode(settings.ENCODING)
231 - bot = chunk['bot']
232 - values.append((contributor, article, timestamp, bot))
233 -
234 - if len(values) == 50000:
235 - cursor.executemany('INSERT INTO contributors VALUES (?,?,?,?)', values)
236 - connection.commit()
237 - #print 'Size of queue: %s' % data_queue.qsize()
238 - values = []
239 -
240 - except Empty:
241 - if all([utils.check_if_process_is_running(pid) for pid in pids]):
242 - pass
243 - else:
244 - break
245 - connection.close()
246 -
247 -
248 -def run_stand_alone(dbname):
249 - files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
250 - #files = files[:2]
 261+def run_parse_editors(dbname, language):
 262+ ids = load_bot_ids()
251263 kwargs = {'bots': ids,
252264 'dbname': dbname,
253265 'pbar': True,
254 - 'definition': 'traditional'}
 266+ 'nr_input_processors': 1,
 267+ 'nr_output_processors': 1,
 268+ 'language': language,
 269+ }
 270+ chunks = {}
 271+ file_location = os.path.join(settings.XML_FILE_LOCATION, language)
 272+ files = utils.retrieve_file_list(file_location, 'xml')
 273+ parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
 274+ a = 0
 275+ for x in xrange(settings.NUMBER_OF_PROCESSES):
 276+ b = a + parts
 277+ chunks[x] = files[a:b]
 278+ a = (x + 1) * parts
255279
256 - mongo = db.init_mongo_db('bots')
257 - bots = mongo['ids']
258 - ids = {}
259 - cursor = bots.find()
260 - for bot in cursor:
261 - ids[bot['id']] = bot['name']
262 -
263 - pc.build_scaffolding(pc.load_queue, parse_editors, files, store_editors, True, **kwargs)
264 - ids = retrieve_ids_mongo_new(dbname, 'editors')
265 - pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
266280
267 -def debug_lookup_new_editors():
268 - q = Queue()
269 - import progressbar
270 - pbar = progressbar.ProgressBar().start()
 281+ for x in xrange(settings.NUMBER_OF_PROCESSES):
 282+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks[x], store_editors, True, **kwargs)
 283+
 284+
 285+def debug_parse_editors(dbname):
 286+ q = JoinableQueue()
271287 #edits = db.init_mongo_db('editors')
272 - parse_editors('464.xml', q, None, None, True)
273 - store_data_mongo(q, [], 'test')
274 - #keys = ['editor']
275 - #for key in keys:
276 - # db.add_index_to_collection('editors', 'editors', key)
 288+ parse_editors('en\\522.xml', q, None, None, debug=True)
 289+ store_editors(q, [], dbname)
277290
 291+
278292 if __name__ == "__main__":
279 - #optimize_editors('enwiki')
280 - #debug_lookup_new_editors()
281 -
282 - if settings.RUN_MODE == 'stand_alone':
283 - run_stand_alone()
284 - print 'Finished processing XML files.'
285 - else:
286 - run_hadoop()
 293+ #debug_parse_editors('test')
 294+ run_parse_editors('test', 'en')
 295+ pass
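The cache object mentioned in the commit message lives in database/cache.py, which is not part of this diff. Judging from the calls above (add(), the 'NEXT' marker, store(), and the init_time and cumulative_n attributes), it batches edits in memory and flushes them to MongoDB in bulk. The class body below is a sketch built on those assumptions, not the committed implementation:

import datetime

class EditorCache(object):
    '''Sketch of a write-batching cache: collect edits per editor in
    memory and flush them to MongoDB in bulk instead of issuing one
    update per revision.'''
    def __init__(self, collection, threshold=10000):
        # threshold is an assumed knob, cf. settings.MAX_CACHE_SIZE
        self.collection = collection
        self.threshold = threshold
        self.editors = {}
        self.n = 0
        self.cumulative_n = 0
        self.init_time = datetime.datetime.now()

    def add(self, key, value):
        if key == 'NEXT':
            # A parser finished one chunk; a real implementation could
            # use this marker to flush editors that are complete.
            return
        self.editors.setdefault(key, []).append(value)
        self.n += 1
        if self.n >= self.threshold:
            self.store()

    def store(self):
        for editor, edits in self.editors.iteritems():
            self.collection.update({'editor': editor},
                                   {'$pushAll': {'edits': edits}}, True)
        self.cumulative_n += self.n
        self.n = 0
        self.editors = {}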
Index: trunk/tools/editor_trends/settings.py
@@ -30,21 +30,29 @@
3131
3232 #Setting up the environment
3333 ops = {platform.win32_ver: 'Windows',
34 - platform.linux_distribution: 'Linux',
35 - platform.mac_ver: 'OSX'}
 34+ platform.linux_distribution: 'Linux',
 35+ platform.mac_ver: 'OSX'}
 36+
3637 for op in ops:
3738 if op() != ('', '', '') and op() != ('', ('', '', ''), ''):
3839 OS = ops[op]
3940
40 -WORKING_DIRECTORY = os.getcwd()#[:-9]
 41+WORKING_DIRECTORY = os.getcwd()
4142 IGNORE_DIRS = ['wikistats', 'zips']
 43+ROOT = '/' if OS != 'Windows' else 'c:\\'
4244
43 -dirs = [name for name in os.listdir(WORKING_DIRECTORY) if os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
 45+
 46+dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
 47+ os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
4448 for subdirname in dirs:
4549 if not subdirname.startswith('.') and subdirname not in IGNORE_DIRS:
4650 sys.path.append(os.path.join(WORKING_DIRECTORY, subdirname))
4751
 52+WINDOWS_ZIP = ['7z.exe']
4853
 54+OSX_ZIP = []
 55+
 56+LINUX_ZIP = []
4957 #General settings
5058
5159 # Valid values are 'stand-alone' and 'hadoop'
@@ -65,22 +73,23 @@
6674 #This section contains configuration variables for the different file locations.
6775
6876 # Location where to write xml chunks
69 -XML_FILE_LOCATION = 'C:/wikimedia/'
 77+XML_FILE_LOCATION = os.path.join(ROOT, 'wikimedia')
7078
7179 # Input file
72 -XML_FILE = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml'
 80+XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml')
7381
7482 # This is the place where error messages are stored for debugging purposes
75 -ERROR_MESSAGE_FILE_LOCATION = WORKING_DIRECTORY + '/errors/'
 83+ERROR_MESSAGE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'errors')
7684
77 -DATABASE_FILE_LOCATION = WORKING_DIRECTORY + '/data/database/'
 85+DATABASE_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'database')
7886
79 -BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/'
 87+BINARY_OBJECT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'objects')
8088
81 -DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/'
 89+DATASETS_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'datasets')
8290
83 -TXT_FILE_LOCATION = WORKING_DIRECTORY + '/csv/'
 91+TXT_FILE_LOCATION = os.path.join(WORKING_DIRECTORY, 'data', 'csv')
8492
 93+NAMESPACE_LOCATION = os.path.join(WORKING_DIRECTORY, 'namespaces')
8594 #This section contains configuration variables for parsing / encoding and
8695 #working with the XML files.
8796
@@ -92,12 +101,32 @@
93102 # Name space, do not change as this works for Mediawiki wikis
94103 NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'
95104
 105+
 106+WIKIMEDIA_PROJECTS = {'commons': 'commonswiki',
 107+ 'wikibooks': 'wikibooks',
 108+ 'wikinews': 'wikinews',
 109+ 'wikiquote': 'wikiquote',
 110+ 'wikisource': 'wikisource',
 111+ 'wikiversity': 'wikiversity',
 112+ 'wiktionary': 'wiktionary',
 113+ 'metawiki': 'metawiki',
 114+ 'wikispecies': 'specieswiki',
 115+ 'incubator': 'incubatorwiki',
 116+ 'foundation': 'foundationwiki',
 117+ 'mediawiki': 'mediawikiwiki',
 118+ 'outreach': 'outreachwiki',
 119+ 'strategic planning': 'strategywiki',
 120+ 'usability initiative': 'usabilitywiki',
 121+ 'multilingual wikisource': None
 122+ }
 123+
96124 #Multiprocess settings used to parallelize workload
97125 #Change this to match your computers configuration (RAM / CPU)
98126 NUMBER_OF_PROCESSES = cpu_count() * 1
99127
100 -#Extensions of ascii files, this is used to determine the filemode to use
 128+#Extensions of ascii files, this is used to determine the filemode to use
101129 ASCII = ['txt', 'csv', 'xml', 'sql']
102130
103131 WP_DUMP_LOCATION = 'http://download.wikimedia.org'
104132
 133+MAX_CACHE_SIZE = 1024 * 1024
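The path settings now go through os.path.join() and the new ROOT constant instead of hard-coded 'C:/' strings, so the same settings file works on Windows, Linux and OSX. A short sketch of the difference, using the dump path from the diff:

import os

ROOT = '/' if os.name != 'nt' else 'c:\\'  # same idea as the OS check above
# Old style: separator and drive are baked in, Windows-only.
old = 'C:/Source_Files/enwiki-20100916-stub-meta-history.xml'
# New style: composed per platform at import time.
XML_FILE = os.path.join(ROOT, 'Source_Files', 'enwiki-20100916-stub-meta-history.xml')
print XML_FILE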
Index: trunk/tools/editor_trends/utils/namespace_downloader.py
@@ -0,0 +1,43 @@
 2+
 3+
 4+'''
 5+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 6+This program is free software; you can redistribute it and/or
 7+modify it under the terms of the GNU General Public License version 2
 8+as published by the Free Software Foundation.
 9+This program is distributed in the hope that it will be useful,
 10+but WITHOUT ANY WARRANTY; without even the implied warranty of
 11+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 12+See the GNU General Public License for more details, at
 13+http://www.fsf.org/licenses/gpl.html
 14+'''
 15+
 16+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = 'Oct 27, 2010'
 19+__version__ = '0.1'
 20+
 21+import languages
 22+import dump_downloader as dd
 23+import settings
 24+
 25+PATH = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json'
 26+LOCATION = settings.NAMESPACE_LOCATION
 27+
 28+def retrieve_json_namespace():
 29+ visited = set()
 30+ for language in languages.MAPPING:
 31+ language = languages.MAPPING[language]
 32+ filename = '%s_ns.json' % language
 33+ if language not in visited:
 34+ domain = 'http://%s.wikipedia.org' % language
 35+ dd.download_wiki_file(domain, PATH, filename, LOCATION, 'w', True)
 36+ visited.add(language)
 37+
 38+
 39+def launch_downloader():
 40+ retrieve_json_namespace()
 41+
 42+
 43+if __name__ == '__main__':
 44+ launch_downloader()
\ No newline at end of file
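dump_downloader.download_wiki_file() is not shown in this revision; for reference, a self-contained sketch of the same namespace fetch using only the standard library. The helper name and flow here are illustrative, not the committed API:

import json
import os
import urllib2

PATH = '/w/api.php?action=query&meta=siteinfo&siprop=namespaces|namespacealiases&format=json'

def download_namespace_json(language, location):
    '''Fetch the namespace definitions for one language and store them
    as <language>_ns.json, mirroring retrieve_json_namespace().'''
    domain = 'http://%s.wikipedia.org' % language
    data = urllib2.urlopen(domain + PATH).read()
    fh = open(os.path.join(location, '%s_ns.json' % language), 'w')
    fh.write(data)
    fh.close()
    return json.loads(data)['query']['namespaces']

# ns = download_namespace_json('en', '.')
# print ns['0']   # the main (article) namespace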
Index: trunk/tools/editor_trends/utils/utils.py
@@ -65,7 +65,7 @@
6666 return False
6767 else:
6868 os.kill(pid, 0)
69 - return Tru
 69+ return True
7070 except Exception, error:
7171 print error
7272 return False
@@ -132,7 +132,7 @@
133133 # read / write data related functions
134134 def read_data_from_csv(filename, encoding):
135135 if hasattr(filename, '__call__'):
136 - filename = construct_filename_from_function(filename)
 136+ filename = construct_filename(filename)
137137
138138 fh = open_txt_file(filename, 'r', encoding=encoding)
139139 for line in fh:
@@ -140,13 +140,15 @@
141141
142142 fh.close()
143143
144 -def create_directory(language):
 144+
 145+def create_directory(path):
145146 try:
146 - os.mkdir(settings.WORKING_DIRECTORY + '/' + language)
 147+ os.mkdir(path)
147148 return True
148 - except IOERROR:
 149+ except IOError:
149150 return False
150151
 152+
151153 def determine_file_extension(filename):
152154 pos = filename.rfind('.') + 1
153155 return filename[pos:]
@@ -158,10 +160,18 @@
159161 else:
160162 return 'wb'
161163
162 -
163 -def write_data_to_csv(data, location, function, encoding):
164 - filename = construct_filename_from_function(function, '.csv')
165 - fh = open_txt_file(location, filename, 'a', encoding=encoding)
 164+def write_list_to_csv(data, fh, recursive=False):
 165+ if recursive:
 166+ recursive = False
 167+ for d in data:
 168+ if isinstance(d, list):
 169+ recursive = write_list_to_csv(d, fh, True)
 170+ else:
 171+ fh.write('%s\t' % d)
 172+ if recursive:
 173+ return True
 174+
 175+def write_dict_to_csv(data, fh):
166176 keys = data.keys()
167177 for key in keys:
168178 fh.write('%s' % key)
@@ -172,45 +182,68 @@
173183 else:
174184 fh.write('\t%s' % (obs))
175185 fh.write('\n')
176 - fh.close()
177186
178187
179 -def open_txt_file(location, filename, mode, encoding):
180 - return codecs.open(location + filename, mode, encoding=encoding)
 188+def create_txt_filehandle(location, name, mode, encoding):
 189+ filename = construct_filename(name, '.csv')
 190+ path = os.path.join(location, filename)
 191+ return codecs.open(path, mode, encoding=encoding)
181192
182193
183 -def open_binary_file(location, filename, mode):
184 - return open(location + filename, mode)
 194+def create_binary_filehandle(location, filename, mode):
 195+ path = os.path.join(location, filename)
 196+ return open(path, mode)
185197
186 -def construct_filename_from_function(function, extension):
187 - return function.func_name + extension
188198
 199+def construct_filename(name, extension):
 200+ if hasattr(name, '__call__'):
 201+ return name.func_name + extension
 202+ else:
 203+ return name
189204
 205+
190206 def check_file_exists(location, filename):
191207 if hasattr(filename, '__call__'):
192 - filename = construct_filename_from_function(filename, '.bin')
193 - if os.path.exists(location + filename):
 208+ filename = construct_filename(filename, '.bin')
 209+ if os.path.exists(os.path.join(location, filename)):
194210 return True
195211 else:
196212 return False
197213
198214
 215+def which(program):
 216+ def is_exe(fpath):
 217+ return os.path.exists(fpath) and os.access(fpath, os.X_OK)
 218+
 219+ fpath, fname = os.path.split(program)
 220+ if fpath:
 221+ if is_exe(program):
 222+ return program
 223+ else:
 224+ for path in os.environ["PATH"].split(os.pathsep):
 225+ exe_file = os.path.join(path, program)
 226+ if is_exe(exe_file):
 227+ return exe_file
 228+
 229+ return None
 230+
 231+
199232 def store_object(object, location, filename):
200233 if hasattr(filename, '__call__'):
201 - filename = construct_filename_from_function(filename, '.bin')
 234+ filename = construct_filename(filename, '.bin')
202235 if not filename.endswith('.bin'):
203236 filename = filename + '.bin'
204 - fh = open(location + filename, 'wb')
 237+ fh = create_binary_filehandle(location, filename, 'wb')
205238 cPickle.dump(object, fh)
206239 fh.close()
207240
208241
209242 def load_object(location, filename):
210243 if hasattr(filename, '__call__'):
211 - filename = construct_filename_from_function(filename, '.bin')
 244+ filename = construct_filename(filename, '.bin')
212245 if not filename.endswith('.bin'):
213246 filename = filename + '.bin'
214 - fh = open(location + filename, 'rb')
 247+ fh = create_binary_filehandle(location, filename, 'rb')
215248 obj = cPickle.load(fh)
216249 fh.close()
217250 return obj
@@ -293,8 +326,8 @@
294327
295328
296329 def debug():
297 - dt = humanize_time_difference(64)
298 - print dt
299 -
 330+ #dt = humanize_time_difference(64)
 331+ #print dt
 332+ check_if_process_is_running(3012)
300333 if __name__ == '__main__':
301334 debug()
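The new which() helper mirrors the Unix command of the same name: it returns the full path of an executable if it is on the PATH (or is itself an executable path), and None otherwise. Presumably it will be used to locate the compressors listed in settings, e.g. the 7z.exe entry in WINDOWS_ZIP; a usage sketch:

from utils import utils

path = utils.which('7z.exe')
if path == None:
    print 'Could not find 7zip; is it installed and on your PATH?'
else:
    print 'Found compressor at %s' % path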
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -17,7 +17,7 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
21 -from multiprocessing import Process, Queue
 21+from multiprocessing import Process, Queue, JoinableQueue
2222 from Queue import Empty
2323
2424 import settings
@@ -53,29 +53,32 @@
5454 @kwargs is a dictionary with optional variables. Used to supply to main
5555 '''
5656
57 - input_queue = Queue()
 57+ nr_input_processors = kwargs.pop('nr_input_processors')
 58+ nr_output_processors = kwargs.pop('nr_output_processors')
 59+
5860 if result_queue:
59 - result_queue = Queue()
 61+ result_queue = JoinableQueue()
6062
61 - load_input_queue(input_queue, obj, poison_pill=True)
 63+ input_queue = load_input_queue(obj, poison_pill=True)
6264
6365 if settings.PROGRESS_BAR:
6466 pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start()
 67+ kwargs['pbar'] = pbar
6568 else:
6669 pbar = False
6770
6871
6972 input_processes = [models.ProcessInputQueue(main, input_queue, result_queue,
70 - **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES -1)]
 73+ **kwargs) for i in xrange(nr_input_processors)]
7174
7275 for input_process in input_processes:
7376 input_process.start()
7477 pids = [p.pid for p in input_processes]
7578 kwargs['pids'] = pids
76 -
 79+
7780 if result_queue:
7881 result_processes = [models.ProcessResultQueue(result_processor,
79 - result_queue, **kwargs) for i in xrange(24)]
 82+ result_queue, **kwargs) for i in xrange(nr_output_processors)]
8083 for result_process in result_processes:
8184 result_process.start()
8285
@@ -95,7 +98,7 @@
9699 print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
97100
98101
99 -def load_queue(input_queue, obj, poison_pill=False):
 102+def load_queue(obj, poison_pill=False):
100103 '''
101104 @input_queue should be an instance of multiprocessing.Queue
102105
@@ -103,7 +106,7 @@
104107
105108 @returns: queue with tasks
106109 '''
107 -
 110+ input_queue = Queue()
108111 if isinstance(obj, type(list)):
109112 data = utils.load_object(obj)
110113 else:
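load_queue() now builds and returns its own queue instead of filling one supplied by the caller; note also that isinstance(obj, type(list)) compares against type itself, so isinstance(obj, list) is presumably what is meant. A simplified sketch of the new calling convention (not the committed body, which also supports loading pickled objects):

from multiprocessing import Queue

def load_queue(obj, poison_pill=False):
    '''Build the input queue internally and return it.'''
    input_queue = Queue()
    if not isinstance(obj, list):
        obj = [obj]
    for item in obj:
        input_queue.put(item)
    if poison_pill:
        input_queue.put(None)  # consumers treat None as 'stop'
    return input_queue

# Usage, mirroring build_scaffolding():
q = load_queue(['0.xml', '1.xml'], poison_pill=True)
while True:
    item = q.get()
    if item == None:
        break
    print 'Task:', item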
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -21,6 +21,12 @@
2222 import codecs
2323 import utils
2424 import re
 25+import json
 26+import os
 27+
 28+import progressbar
 29+
 30+from utils import utils
2531 import settings
2632
2733 try:
@@ -30,6 +36,7 @@
3137 pass
3238
3339
 40+
3441 RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
3542
3643
@@ -38,7 +45,16 @@
3946
4047
4148 def lenient_deccharref(m):
42 - return unichr(int(m.group(1)))
 49+ try:
 50+ return unichr(int(m.group(1)))
 51+ except ValueError:
 52+ '''
 53+ There are a few articles that raise a ValueError here; the reason is
 54+ that this is a narrow Python build (UCS2) instead of a wide build
 55+ (UCS4). The quick fix is to return an empty string; the real solution
 56+ is to rebuild Python with UCS4 support.
 57+ '''
 58+ return ''
4359
4460
4561 def remove_namespace(element, namespace):
@@ -50,42 +66,70 @@
5167 elem.tag = elem.tag[nsl:]
5268 return element
5369
 70+def load_namespace(language):
 71+ file = '%s_ns.json' % language
 72+ fh = utils.create_txt_filehandle(settings.NAMESPACE_LOCATION, file, 'r', settings.ENCODING)
 73+ ns = json.load(fh)
 74+ fh.close()
 75+ ns = ns['query']['namespaces']
 76+ return ns
5477
 78+
 79+def build_namespaces_locale(namespaces):
 80+ ns = []
 81+ for namespace in namespaces:
 82+ value = namespaces[namespace].get(u'canonical', None)
 83+ if value != None and not value.endswith('talk'):
 84+ ns.append(value)
 85+ return ns
 86+
 87+
5588 def parse_comments(xml, function):
5689 revisions = xml.findall('revision')
5790 for revision in revisions:
5891 comment = revision.find('comment')
5992 timestamp = revision.find('timestamp').text
60 -
6193 # text1 = remove_ascii_control_characters(text)
6294 # text2 = remove_numeric_character_references(text)
6395 # text3 = convert_html_entities(text)
64 -
6596 if comment != None and comment.text != None:
6697 comment.text = function(comment.text)
6798 return xml
6899
69100
 101+def is_article_main_namespace(elem, namespace):
 102+ title = elem.find('title').text
 103+ for ns in namespace:
 104+ if title.startswith(ns):
 105+ return False
 106+ return True
 107+
 108+
 109+
70110 def write_xml_file(element, fh, counter, language):
71111 '''Get file handle and write xml element to file'''
72112 size = len(cElementTree.tostring(element))
73 - fh, counter = create_xml_file_handle(fh, counter, size)
74 - fh.write(cElementTree.tostring(element))
 113+ fh, counter = create_xml_file_handle(fh, counter, size, language)
 114+ try:
 115+ fh.write(cElementTree.tostring(element))
 116+ except MemoryError:
 117+ print 'Add error capturing logic'
75118 fh.write('\n')
76119 return fh, counter
77120
78121
79 -def create_xml_file_handle(fh, counter, size):
 122+def create_xml_file_handle(fh, counter, size, language):
80123 '''Create file handle if none is supplied or if file size > max file size.'''
 81125 if not fh:
 82126 counter = 0
 83 - fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
  124+ path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
  125+ fh = codecs.open(path, 'w', encoding=settings.ENCODING)
 84128 return fh, counter
 85129 elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
 86130 print 'Created chunk %s' % counter
 87131 fh.close()
 88132 counter += 1
 89 - fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
  133+ path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
  134+ fh = codecs.open(path, 'w', encoding=settings.ENCODING)
90134 return fh, counter
91135 else:
92136 return fh, counter
@@ -93,14 +137,21 @@
94138
95139 def split_xml(language):
96140 '''Reads xml file and splits it in N chunks'''
97 - result = utils.create_directory(language)
 141+ location = os.path.join(settings.XML_FILE_LOCATION, language)
 142+ result = utils.check_file_exists(location, '')
 143+ if result == False:
 144+ result = utils.create_directory(location)
98145 if not result:
99146 return
100147
 148+ ns = load_namespace(language)
 149+ ns = build_namespaces_locale(ns)
 150+
 151+
101152 fh = None
102153 counter = None
103154 tag = '{%s}page' % settings.NAME_SPACE
104 -
 155+
105156 context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
106157 context = iter(context)
107158 event, root = context.next() # get the root element of the XML doc
@@ -110,12 +161,16 @@
111162 if elem.tag == tag:
112163 elem = remove_namespace(elem, settings.NAME_SPACE)
113164 elem = parse_comments(elem, remove_numeric_character_references)
 165+
 166+ if is_article_main_namespace(elem, ns):
 167+ fh, counter = write_xml_file(elem, fh, counter, language)
 168+
 169+ root.clear() # when done parsing a section, clear the tree to save memory
 170+
114171 #elem = parse_comments(elem, convert_html_entities)
115172 #elem = parse_comments(elem, remove_ascii_control_characters)
116 - fh, counter = write_xml_file(elem, fh, counter, language)
117173 #print cElementTree.tostring(elem)
118 - root.clear() # when done parsing a section clear the tree to safe memory
119174
120175
121176 if __name__ == "__main__":
122 - split_xml('enwiki')
 177+ split_xml('en')
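split_xml() relies on cElementTree.iterparse() plus root.clear() to stream a multi-gigabyte dump with flat memory use, and on the namespace list to keep only main-namespace articles. A condensed sketch of the streaming pattern (the file name is illustrative):

import xml.etree.cElementTree as cElementTree

NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'

def iterate_pages(xml_file):
    '''Yield <page> elements one at a time, clearing the tree as we go
    so memory use stays constant regardless of dump size.'''
    tag = '{%s}page' % NAME_SPACE
    context = iter(cElementTree.iterparse(xml_file, events=('start', 'end')))
    event, root = context.next()  # grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield elem
            root.clear()  # drop pages we are done with

# for page in iterate_pages('enwiki-20100916-stub-meta-history.xml'):
#     print page.find('{%s}title' % NAME_SPACE).text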
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -19,7 +19,6 @@
2020
2121 from multiprocessing import Queue
2222 from Queue import Empty
23 -import sqlite3
2423
2524 import progressbar
2625
@@ -35,46 +34,63 @@
3635 pass
3736
3837
39 -def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True):
40 - raise DeprecatedError
41 -# if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
42 -# retrieve_editor_ids_mongo):
43 -# contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
44 -# retrieve_editor_ids_mongo)
45 -# else:
46 -# mongo = db.init_mongo_db('editors')
47 -# editors = mongo['editors']
48 -# contributors = set()
49 -# #ids = editors.find().distinct('editor')
50 -# ids = editors.find()
51 -# for x, id in enumerate(ids):
52 -# contributors.add(id['editor'])
53 -# if len(contributors) == 100000:
54 -# if RANDOM_SAMPLE:
55 -# break
56 -# if contributors != set():
57 -# utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
58 -# return contributors
59 -
60 -def retrieve_ids_mongo_new(dbname, collection):
61 - if utils.check_file_exists(settings.TXT_FILE_LOCATION,
 38+def retrieve_editor_ids_mongo(dbname, collection):
 39+ if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
6240 retrieve_editor_ids_mongo):
63 - ids = utils.load_object(settings.TXT_FILE_LOCATION,
 41+ ids = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
6442 retrieve_editor_ids_mongo)
6543 else:
6644 mongo = db.init_mongo_db(dbname)
6745 editors = mongo[collection]
68 - ids = editors.distinct()
69 - utils.store_object(contributors, settings.TXT_FILE_LOCATION, retrieve_editor_ids_mongo)
 46+ ids = editors.distinct('editor')
 47+ utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
7048 return ids
7149
 50+
 51+def expand_edits(edits):
 52+ data = []
 53+ for edit in edits:
 54+ data.append(edit['date'])
 55+ return data
 56+
 57+
 58+def expand_observations(obs, vars_to_expand):
 59+ for var in vars_to_expand:
 60+ if var == 'edits':
 61+ obs[var] = expand_edits(obs[var])
 62+ elif var == 'edits_by_year':
 63+ keys = obs[var].keys()
 64+ keys.sort()
 65+ edits = []
 66+ for key in keys:
 67+ edits.append(str(obs[var][key]))
 68+ obs[var] = edits
 69+ return obs
 70+
 71+
 72+def expand_headers(headers, vars_to_expand, obs):
 73+ for var in vars_to_expand:
 74+ l = len(obs[var])
 75+ pos = headers.index(var)
 76+ for i in xrange(l):
 77+ if var.endswith('year'):
 78+ suffix = 2001 + i
 79+ elif var.endswith('edits'):
 80+ suffix = 1 + i
 81+ headers.insert(pos+i, '%s_%s' % (var, suffix))
 82+ headers.remove(var)
 83+ return headers
 84+
 85+
7286 def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
73 - definition = kwargs.pop('definition')
74 - limit = kwargs.pop('limit')
7587 debug = kwargs.pop('debug')
76 - mongo = db.init_mongo_db('editors')
77 - editors = mongo['editors']
78 - data = {}
 88+ dbname = kwargs.pop('dbname')
 89+ mongo = db.init_mongo_db(dbname)
 90+ editors = mongo['dataset']
 91+ name = dbname + '_editors.csv'
 92+ fh = utils.create_txt_filehandle(settings.DATASETS_FILE_LOCATION, name, 'a', settings.ENCODING)
 93+ x = 0
 94+ vars_to_expand = ['edits', 'edits_by_year']
7995 while True:
8096 try:
8197 if debug:
@@ -83,115 +99,68 @@
84100 id = input_queue.get(block=False)
85101
86102 print input_queue.qsize()
87 - if definition == 'Traditional':
88103
89 - obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit)
90 - contributors = []
91 - for ob in obs:
92 - contributors.append(ob['date'])
93 - obs = ''
94 - else:
95 - obs = editors.find({'editor': id}, {'date':1}).sort('date')
96 - contributors = set()
97 - for ob in obs:
98 - if len(contributors) == limit:
99 - break
100 - else:
101 - contributors.add(ob['date'])
102 - obs.close()
103 - if len(contributors) < limit:
104 - new_wikipedian = False
105 - else:
106 - new_wikipedian = True
107 - data[id] = [contributors, new_wikipedian]
 104+ obs = editors.find_one({'editor': id})
 105+ obs = expand_observations(obs, vars_to_expand)
 106+ if x == 0:
 107+ headers = obs.keys()
 108+ headers.sort()
 109+ headers = expand_headers(headers, vars_to_expand, obs)
 110+ utils.write_list_to_csv(headers, fh)
 111+ fh.write('\n')
 112+ data = []
 113+ keys = obs.keys()
 114+ keys.sort()
 115+ for key in keys:
 116+ data.append(obs[key])
 117+ utils.write_list_to_csv(data, fh)
 118+ fh.write('\n')
108119
109 -
 120+ x += 1
110121 except Empty:
111 - utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING)
112122 break
 123+ fh.close()
113124
114125
115 -def retrieve_editor_ids_db():
116 - contributors = set()
117 - connection = db.init_database()
118 - cursor = connection.cursor()
119 - if settings.PROGRESS_BAR:
120 - cursor.execute('SELECT MAX(ROWID) FROM contributors')
121 - for id in cursor:
122 - pass
123 - pbar = progressbar.ProgressBar(maxval=id[0]).start()
124 -
125 - cursor.execute('SELECT contributor FROM contributors WHERE bot=0')
126 -
127 - print 'Retrieving contributors...'
128 - for x, contributor in enumerate(cursor):
129 - contributors.add(contributor[0])
130 - if x % 100000 == 0:
131 - pbar.update(x)
132 - print 'Serializing contributors...'
133 - utils.store_object(contributors, 'contributors')
134 - print 'Finished serializing contributors...'
135 -
136 - if pbar:
137 - pbar.finish()
138 - print 'Total elapsed time: %s.' % (utils.humanize_time_difference(pbar.seconds_elapsed))
139 -
140 - connection.close()
141 -
142 -
143 -def retrieve_edits_by_contributor(input_queue, result_queue, pbar):
144 - connection = db.init_database()
145 - cursor = connection.cursor()
146 -
147 - while True:
148 - try:
149 - contributor = input_queue.get(block=False)
150 - if contributor == None:
151 - break
152 -
153 - cursor.execute('SELECT contributor, timestamp, bot FROM contributors WHERE contributor=?', (contributor,))
154 - edits = {}
155 - edits[contributor] = set()
156 - for edit, timestamp, bot in cursor:
157 - date = utils.convert_timestamp_to_date(timestamp)
158 - edits[contributor].add(date)
159 - #print edit, timestamp, bot
160 -
161 - utils.write_data_to_csv(edits, retrieve_edits_by_contributor)
162 - if pbar:
163 - utils.update_progressbar(pbar, input_queue)
164 -
165 - except Empty:
166 - pass
167 -
168 - connection.close()
169 -
170 -
171126 def retrieve_edits_by_contributor_launcher():
172127 pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
173128
174129
175130 def debug_retrieve_edits_by_contributor_launcher():
176 - q = Queue()
177 - kwargs = {'definition':'Traditional',
178 - 'limit': 10,
179 - 'debug': False
 131+ kwargs = {'debug': False,
 132+ 'dbname': 'enwiki',
180133 }
181 - ids = retrieve_editor_ids_mongo()
182 - input_queue = pc.load_queue(q, ids)
183 - generate_editor_dataset(input_queue, False, False, kwargs)
 134+ ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 135+ input_queue = pc.load_queue(ids)
 136+ q = Queue()
 137+ generate_editor_dataset(input_queue, q, False, kwargs)
184138 #generate_editor_dataset_launcher()
185139 #retrieve_list_contributors()
186140 #retrieve_edits_by_contributor()
187141
188142 def generate_editor_dataset_launcher():
189 - kwargs = {'definition':'Traditional',
190 - 'limit': 10,
191 - 'debug': False
 143+ kwargs = {'nr_input_processors': 1,
 144+ 'nr_output_processors': 1,
 145+ 'debug': False,
 146+ 'dbname': 'enwiki',
192147 }
193 - pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, kwargs)
 148+ ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 149+ pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False, False, **kwargs)
194150
195151
 152+def generate_editor_dataset_debug():
 153+ ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 154+ input_queue = pc.load_queue(ids)
 155+ #write_dataset(input_queue, [], 'enwiki')
 156+ kwargs = {'nr_input_processors': 1,
 157+ 'nr_output_processors': 1,
 158+ 'debug': True,
 159+ 'dbname': 'enwiki',
 160+ }
 161+ generate_editor_dataset(input_queue, False, False, kwargs)
 162+
 163+
196164 if __name__ == '__main__':
197 - #generate_editor_dataset_launcher()
198 - debug_retrieve_edits_by_contributor_launcher()
 165+ #generate_editor_dataset_debug()
 166+ generate_editor_dataset_launcher()
 167+ #debug_retrieve_edits_by_contributor_launcher()
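expand_headers() turns each multi-valued variable into one column per year (or per edit) so the CSV stays rectangular. A small worked example of the expansion logic, with illustrative values:

headers = ['edit_count', 'edits_by_year', 'editor']
obs = {'edit_count': 42,
       'edits_by_year': ['1', '5', '36'],   # 2001, 2002, 2003
       'editor': '94033'}

for var in ['edits_by_year']:
    pos = headers.index(var)
    for i in xrange(len(obs[var])):
        headers.insert(pos + i, '%s_%s' % (var, 2001 + i))
    headers.remove(var)

print headers
# ['edit_count', 'edits_by_year_2001', 'edits_by_year_2002',
#  'edits_by_year_2003', 'editor']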
Property changes on: trunk/tools/editor_trends
___________________________________________________________________
Modified: svn:ignore
- wikistats
  zips
  notes.txt
  *.pyc
  datasets
  errors
  .settings
  .project
  .pydevproject
+ wikistats
  zips
  notes.txt
  *.pyc
  datasets
  errors
  .settings
  .project
  .pydevproject
  wiki.cfg
