r77457 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r77456 | r77457 | r77458 >
Date: 05:07, 30 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
* Started implementation of Python logging module
* Rewriting multiprocessing launchers
* Numerous small bugfixes
Modified paths:
  • /trunk/tools/editor_trends/bots/bots.py (modified)
  • /trunk/tools/editor_trends/bots/models.py (modified)
  • /trunk/tools/editor_trends/configuration.py (modified)
  • /trunk/tools/editor_trends/etl/chunker.py (modified)
  • /trunk/tools/editor_trends/etl/exporter.py (modified)
  • /trunk/tools/editor_trends/etl/loader.py (modified)
  • /trunk/tools/editor_trends/etl/models.py (modified)
  • /trunk/tools/editor_trends/manage.py (modified)
  • /trunk/tools/editor_trends/utils/compression.py (modified)
  • /trunk/tools/editor_trends/utils/exceptions.py (modified)
  • /trunk/tools/editor_trends/utils/utils.py (modified)

Diff

Index: trunk/tools/editor_trends/manage.py
@@ -18,6 +18,7 @@
 __version__ = '0.1'
 
 import os
+import logging
 import sys
 import datetime
 from argparse import ArgumentParser
@@ -32,6 +33,7 @@
 import languages
 from utils import utils
 from utils import dump_downloader
+from utils import compression
 from etl import chunker
 from etl import extract
 from etl import loader
@@ -65,12 +67,12 @@
     return language_code.split('_')[0]
 
 
-def retrieve_projectname(args):
-    language_code = retrieve_language(args)
+def get_projectname(args):
+    language_code = get_language(args)
     if language_code == None:
         print 'Entered language: %s is not a valid Wikimedia language' % get_value(args, 'language')
         sys.exit(-1)
-    project = retrieve_project(args)
+    project = get_project(args)
 
     if project == None:
         print 'Entered project: %s is not valid Wikimedia Foundation project.' % get_value(args, 'project')
@@ -81,59 +83,87 @@
     return '%s%s' % (language_code, project)
 
 
-def retrieve_language(args):
+def get_language(args):
     language = get_value(args, 'language')
     language = language.title()
     return languages.MAPPING.get(language, 'en')
 
 
-def retrieve_project(args):
+def get_namespaces(args):
+    namespaces = get_value(args, 'namespace')
+    if namespaces != None:
+        return namespaces.split(',')
+    else:
+        return namespaces
+
+def write_message_to_log(logger, args, message=None, verb=None, **kwargs):
+    function = get_value(args, 'func')
+    logger.debug('Starting %s task' % function.func_name)
+    if message:
+        logger.debug(message)
+    for kw in kwargs:
+        if verb:
+            logger.debug('Action: %s\tSetting: %s' % (verb, kwargs[kw]))
+        else:
+            logger.debug('Key: %s\tSetting: %s' % (kw, kwargs[kw]))
+
+
+
+def get_project(args):
     project = get_value(args, 'project')
     if project != 'wiki':
         project = settings.projects.get(project, None)
     return project
 
 
-def generate_wikidump_filename(project, args):
-    return '%s-%s-%s' % (project, 'latest', get_value(args, 'file'))
+def generate_wikidump_filename(language_code, project, args):
+    return '%s%s-%s-%s' % (language_code, project, 'latest', get_value(args, 'file'))
 
 
-def determine_file_locations(args):
-    locations = {}
+def determine_file_locations(args, logger):
+    config = {}
     location = get_value(args, 'location') if get_value(args, 'location') != None else settings.input_location
-    project = retrieve_project(args)
-    language_code = retrieve_language(args)
-    locations['language_code'] = language_code
-    locations['language'] = get_value(args, 'language')
-    locations['location'] = os.path.join(location, language_code, project)
-    locations['chunks'] = os.path.join(locations['location'], 'chunks')
-    locations['txt'] = os.path.join(locations['location'], 'txt')
-    locations['sorted'] = os.path.join(locations['location'], 'sorted')
-    locations['dbready'] = os.path.join(locations['location'], 'dbready')
-    locations['project'] = project
-    locations['full_project'] = retrieve_projectname(args)
-    locations['filename'] = generate_wikidump_filename(project, args)
-    locations['collection'] = get_value(args, 'collection')
-    locations['directories'] = [locations['chunks'], locations['location'], locations['txt'], locations['sorted'], locations['dbready']]
-    return locations
+    project = get_project(args)
+    language_code = get_language(args)
+    config['language_code'] = language_code
+    config['language'] = get_value(args, 'language')
+    config['location'] = os.path.join(location, language_code, project)
+    config['chunks'] = os.path.join(config['location'], 'chunks')
+    config['txt'] = os.path.join(config['location'], 'txt')
+    config['sorted'] = os.path.join(config['location'], 'sorted')
+    config['dbready'] = os.path.join(config['location'], 'dbready')
+    config['project'] = project
+    config['full_project'] = get_projectname(args)
+    config['filename'] = generate_wikidump_filename(language_code, project, args)
+    config['collection'] = get_value(args, 'collection')
+    config['namespaces'] = get_namespaces(args)
+    config['directories'] = [config['location'], config['chunks'], config['txt'], config['sorted'], config['dbready']]
 
+    message = 'Settings as generated from the configuration module.'
+    write_message_to_log(logger, args, message, None, **config)
+    #for c in config:
+    #    logger.debug('Key: %s - Setting: %s' % (c, config[c]))
+    return config
 
-def show_settings(args, **kwargs):
-    project = settings.projects.get(kwargs.pop('project'), 'wiki')
+
+def show_settings(args, logger, **kwargs):
+    language_map = languages.language_map()
+    language = kwargs.pop('language')
     language_code = kwargs.pop('language_code')
-    language = kwargs.pop('language')
-    location = kwargs.pop('location')
-    project = project.title()
-    language_map = languages.language_map()
-    print 'Project: %s' % (project)
-    print 'Language: %s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding))
-    print 'Input directory: %s' % location
-    print 'Output directory: %s and subdirectories' % location
+    config = {}
+    config['Project'] = settings.projects.get(kwargs.pop('project'), 'wiki').title()
+    config['Language'] = '%s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding))
+    config['Input directory'] = kwargs.get('location')
+    config['Output directory'] = '%s and subdirectories' % kwargs.get('location')
 
+    message = 'Final settings after parsing command line arguments:'
+    write_message_to_log(logger, args, message, None, **config)
 
-def dump_downloader_launcher(args, **kwargs):
+
+def dump_downloader_launcher(args, logger, **kwargs):
     print 'dump downloader'
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     filename = kwargs.get('filename')
     extension = kwargs.get('extension')
     location = kwargs.get('location')
@@ -146,37 +176,42 @@
     timer.elapsed()
 
 
-def chunker_launcher(args, **kwargs):
+def chunker_launcher(args, logger, **kwargs):
     print 'split_settings.input_filename_launcher'
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     filename = kwargs.pop('filename')
-    filename = 'en-latest-pages-meta-history.xml.bz2'
     location = kwargs.pop('location')
     project = kwargs.pop('project')
     language = kwargs.pop('language')
     language_code = kwargs.pop('language_code')
+    namespaces = kwargs.pop('namespaces')
+
     ext = utils.determine_file_extension(filename)
-    if ext in settings.compression.keys():
-        file = filename.replace('.'+ ext, '')
+    file = filename.replace('.' + ext, '')
     result = utils.check_file_exists(location, file)
     if not result:
-        retcode = launch_zip_extractor(args, location, filename, ext)
+        retcode = launch_zip_extractor(args, location, filename)
     else:
         retcode = 0
     if retcode != 0:
         sys.exit(retcode)
-    chunker.split_file(location, file, project, language_code, language)
+
+    chunker.split_file(location, file, project, language_code, namespaces, format='xml', zip=False)
     timer.elapsed()
 
 
-def launch_zip_extractor(args, location, file, ext):
+def launch_zip_extractor(args, location, file):
     timer = Timer()
-    utils.zip_extract(location, file, ext)
+    write_message_to_log(logger, args, location=location, file=file)
+    compressor = compression.Compressor(location, file)
+    compressor.extract()
     timer.elapsed()
 
 
-def extract_launcher(args, **kwargs):
+def extract_launcher(args, logger, **kwargs):
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     location = kwargs.pop('location')
     language_code = kwargs.pop('language_code')
     project = kwargs.pop('project')
@@ -184,8 +219,9 @@
     timer.elapsed()
 
 
-def sort_launcher(args, **kwargs):
+def sort_launcher(args, logger, **kwargs):
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     location = kwargs.pop('location')
     input = os.path.join(location, 'txt')
     output = os.path.join(location, 'sorted')
@@ -196,8 +232,9 @@
     timer.elapsed()
 
 
-def store_launcher(args, **kwargs):
+def store_launcher(args, logger, **kwargs):
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     location = kwargs.pop('location')
     input = os.path.join(location, 'dbready')
     dbname = kwargs.pop('full_project')
@@ -206,37 +243,47 @@
     timer.elapsed()
 
 
-def transformer_launcher(args, **kwargs):
+def transformer_launcher(args, logger, **kwargs):
     print 'dataset launcher'
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     project = kwargs.pop('full_project')
     collection = kwargs.pop('collection')
     transformer.run_optimize_editors(project, collection)
     timer.elapsed()
 
 
-def exporter_launcher(args, **kwargs):
+def exporter_launcher(args, logger, **kwargs):
     timer = Timer()
+    write_message_to_log(logger, args, **kwargs)
     project = kwargs.pop('full_project')
     exporter.generate_editor_dataset_launcher(project)
     timer.elapsed()
 
 
-def all_launcher(args, **kwargs):
+def all_launcher(args, logger, **kwargs):
     print 'all_launcher'
     timer = Timer()
+    message = 'Starting '
+    write_message_to_log(logger, args, message, **kwargs)
     ignore = get_value(args, 'except')
+    clean = get_value(args, 'clean')
+    if clean:
+        dirs = kwargs.get('directories')[1:]
+        for dir in dirs:
+            write_message_to_log(logger, args, verb='Deleting', **kwargs)
+            utils.delete_file(dir, '')
     functions = {dump_downloader_launcher: 'download',
                  chunker_launcher: 'split',
                  extract_launcher: 'extract',
                  sort_launcher: 'sort',
-                 transformer_launcher: 'transform',
+                 transformer_launcher: 'transform',
                  exporter_launcher: 'export'
                  }
     for function, callname in functions.iteritems():
         if callname not in ignore:
             function(args, **kwargs)
-
+
     timer.elapsed()
 
 
@@ -265,8 +312,9 @@
         print '%s' % language
 
 
-def detect_python_version():
+def detect_python_version(logger):
     version = sys.version_info[0:2]
+    logger.debug('Python version: %s' % '.'.join(str(version)))
     if version < settings.minimum_python_version:
         raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).'
 
@@ -279,6 +327,9 @@
 
 
 def main():
+    logger = logging.getLogger('manager')
+    logger.setLevel(logging.DEBUG)
+
     default_language = determine_default_language()
     file_choices = ('stub-meta-history.xml.gz',
                     'stub-meta-current.xml.gz',
@@ -303,6 +354,7 @@
     parser_download.set_defaults(func=dump_downloader_launcher)
 
     parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.')
+
     parser_split.set_defaults(func=chunker_launcher)
 
     parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
@@ -317,7 +369,7 @@
                                help='Name of MongoDB collection',
                                default='editors')
 
-    parser_transform = subparsers.add_parser('transform', help='Transform the raw datatabe to an enriched dataset that can be exported.')
+    parser_transform = subparsers.add_parser('transform', help='Transform the raw datatable to an enriched dataset that can be exported.')
     parser_transform.set_defaults(func=transformer_launcher)
     parser_transform.add_argument('-c', '--collection', action='store',
                                   help='Name of MongoDB collection',
@@ -331,8 +383,11 @@
     parser_all.add_argument('-e', '--except', action='store',
                             help='Should be a list of functions that are to be ignored when executing \'all\'.',
                             default=[])
-
+    parser_all.add_argument('-n', '--new', action='store_false',
+                            help='This will delete all previous output and starts from scratch. Mostly useful for debugging purposes.',
+                            default=False)
 
+
     parser.add_argument('-l', '--language', action='store',
                         help='Example of valid languages.',
                         choices=supported_languages(),
@@ -346,25 +401,29 @@
     parser.add_argument('-o', '--location', action='store',
                         help='Indicate where you want to store the downloaded file.',
                         )
-                        #default=settings.input_location)
 
+    parser.add_argument('-n', '--namespace', action='store',
+                        help='A list of namespaces to include for analysis.',
+                        default='0')
+
+
     parser.add_argument('-f', '--file', action='store',
                         choices=file_choices,
                         help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
-                        default='stub-meta-current.xml.gz')
+                        default='stub-meta-history.xml.gz')
 
     parser.add_argument('-prog', '--progress', action='store_true', default=True,
                         help='Indicate whether you want to have a progressbar.')
 
-    detect_python_version()
+    detect_python_version(logger)
     about()
     args = parser.parse_args()
    config.create_configuration(settings, args)
-    locations = determine_file_locations(args)
+    locations = determine_file_locations(args, logger)
     settings.verify_environment(locations['directories'])
-    show_settings(args, **locations)
+    show_settings(args, logger, **locations)
     #locations['settings'] = settings
-    args.func(args, **locations)
+    args.func(args, logger, **locations)
     t1 = datetime.datetime.now()
 
 
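A note on the new logging calls above: main() creates the 'manager' logger and sets it to DEBUG, but no handler is attached anywhere in this hunk, and the stdlib logging module discards records (and, on Python 2, prints a "No handlers could be found" warning) when a logger has none. A minimal, self-contained sketch of the setup that write_message_to_log() appears to rely on; the console handler and format string are assumptions, not part of this revision:

import logging

logger = logging.getLogger('manager')
logger.setLevel(logging.DEBUG)
# Assumed here: attach a console handler so the debug records are visible;
# r77457 itself does not show where (or whether) a handler is added.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
logger.addHandler(handler)

logger.debug('Starting %s task' % 'split')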
Index: trunk/tools/editor_trends/etl/exporter.py
@@ -21,7 +21,7 @@
 import datetime
 from dateutil.relativedelta import *
 import calendar
-from multiprocessing import Queue
+import multiprocessing
 from Queue import Empty
 
 
@@ -32,8 +32,6 @@
 from utils import models, utils
 from database import db
 from etl import shaper
-from utils import process_constructor as pc
-import progressbar
 
 try:
     import psyco
@@ -181,10 +179,10 @@
     return headers
 
 
-def generate_long_editor_dataset(input_queue, vars, **kwargs):
-    dbname = kwargs.pop('dbname')
+def generate_long_editor_dataset(tasks, dbname, collection, **kwargs):
     mongo = db.init_mongo_db(dbname)
-    editors = mongo['dataset']
+    vars = ['monthly_edits']
+    editors = mongo[collection + 'dataset']
     name = dbname + '_long_editors.csv'
     #fh = utils.create_txt_filehandle(settings.dataset_location, name, 'w', settings.encoding)
     vars_to_expand = []
@@ -202,11 +200,9 @@
     ld.write_longitudinal_data()
 
 
-def generate_cohort_analysis(input_queue, **kwargs):
-    dbname = kwargs.get('dbname')
-    pbar = kwargs.get('pbar')
+def generate_cohort_dataset(tasks, dbname, collection, **kwargs):
     mongo = db.init_mongo_db(dbname)
-    editors = mongo['dataset']
+    editors = mongo[collection + 'dataset']
     year = datetime.datetime.now().year + 1
     begin = year - 2001
     p = [3, 6, 9]
@@ -215,7 +211,7 @@
     data = {}
     while True:
         try:
-            id = input_queue.get(block=False)
+            id = tasks.get(block=False)
             obs = editors.find_one({'editor': id}, {'first_edit': 1, 'final_edit': 1})
             first_edit = obs['first_edit']
             last_edit = obs['final_edit']
@@ -244,7 +240,6 @@
                 p = min(edits)
                 data[y]['n'] += 1
                 data[y][p] += 1
-            #pbar.update(+1)
         except Empty:
             break
     utils.store_object(data, settings.binary_location, 'cohort_data')
@@ -257,20 +252,16 @@
         return False
 
 
-def generate_wide_editor_dataset(input_queue, **kwargs):
-    dbname = kwargs.pop('dbname')
+def generate_wide_editor_dataset(tasks, dbname, collection, **kwargs):
     mongo = db.init_mongo_db(dbname)
-    editors = mongo['dataset']
+    editors = mongo[collection + 'dataset']
     name = dbname + '_wide_editors.csv'
     fh = utils.create_txt_filehandle(settings.dataset_location, name, 'a', settings.encoding)
     x = 0
     vars_to_expand = ['edits', 'edits_by_year', 'articles_by_year']
     while True:
         try:
-            if debug:
-                id = u'99797'
-            else:
-                id = input_queue.get(block=False)
+            id = input_queue.get(block=False)
             print input_queue.qsize()
             obs = editors.find_one({'editor': id})
             obs = expand_observations(obs, vars_to_expand)
@@ -292,49 +283,25 @@
     fh.close()
 
 
-def retrieve_edits_by_contributor_launcher():
-    pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
+def dataset_launcher(dbname, collection, target):
+    editors = retrieve_editor_ids_mongo(dbname, collection)
+    tasks = multiprocessing.JoinableQueue()
+    consumers = [multiprocessing.Process(target=target, args=(tasks, dbname, collection)) for i in xrange(settings.number_of_processes)]
+    for editor in editors:
+        tasks.put(editor)
+    print 'The queue contains %s editors.' % tasks.qsize()
+    for x in xrange(settings.number_of_processes):
+        tasks.put(None)
 
+    for w in consumers:
+        w.start()
 
-def debug_retrieve_edits_by_contributor_launcher(dbname):
-    kwargs = {'debug': False,
-              'dbname': dbname,
-              }
-    ids = retrieve_editor_ids_mongo(dbname, 'editors')
-    input_queue = pc.load_queue(ids)
-    q = Queue()
-    generate_editor_dataset(input_queue, q, False, kwargs)
+    tasks.join()
 
 
-def generate_editor_dataset_launcher(dbname):
-    kwargs = {'nr_input_processors': 1,
-              'nr_output_processors': 1,
-              'debug': False,
-              'dbname': dbname,
-              'poison_pill':False,
-              'pbar': True
-              }
-    ids = retrieve_editor_ids_mongo(dbname, 'editors')
-    ids = list(ids)
-    chunks = dict({0: ids})
-    pc.build_scaffolding(pc.load_queue, generate_cohort_analysis, chunks, False, False, **kwargs)
-
-
-def generate_editor_dataset_debug(dbname):
-    ids = retrieve_editor_ids_mongo(dbname, 'editors')
-    #ids = list(ids)[:1000]
-    input_queue = pc.load_queue(ids)
-    kwargs = {'nr_input_processors': 1,
-              'nr_output_processors': 1,
-              'debug': True,
-              'dbname': dbname,
-              }
-    #generate_editor_dataset(input_queue, False, False, kwargs)
-    vars = ['monthly_edits']
-    generate_long_editor_dataset(input_queue, vars, **kwargs)
-
 if __name__ == '__main__':
-    #generate_editor_dataset_debug('test')
-    #generate_editor_dataset_launcher('enwiki')
-    generate_editor_dataset_debug('enwiki')
-    #debug_retrieve_edits_by_contributor_launcher()
+    dbname = 'enwiki'
+    collection = 'editors'
+    dataset_launcher(dbname, collection, generate_cohort_dataset)
+    dataset_launcher(dbname, collection, generate_long_editor_dataset)
+    dataset_launcher(dbname, collection, generate_wide_editor_dataset)
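The rewritten dataset_launcher() is a standard multiprocessing producer/consumer: fill a JoinableQueue, add one None sentinel per worker, start the workers, then join(). One stdlib detail worth keeping in mind when reading this revision: JoinableQueue.join() only returns once task_done() has been called for every put() item. A self-contained sketch of the pattern with the task_done() calls included; the worker body and queue contents are illustrative, not code from this revision:

import multiprocessing
from Queue import Empty

def worker(tasks):
    while True:
        try:
            task = tasks.get(block=False)
            if task == None:
                tasks.task_done()  # the sentinel counts as a task too
                break
            # ... process one editor id here ...
            tasks.task_done()      # join() returns only when every item is marked done
        except Empty:
            break

if __name__ == '__main__':
    number_of_processes = 4        # stands in for settings.number_of_processes
    tasks = multiprocessing.JoinableQueue()
    consumers = [multiprocessing.Process(target=worker, args=(tasks,))
                 for i in xrange(number_of_processes)]
    for editor in xrange(100):     # illustrative payload
        tasks.put(editor)
    for x in xrange(number_of_processes):
        tasks.put(None)            # one poison pill per consumer
    for w in consumers:
        w.start()
    tasks.join()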
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -87,7 +87,7 @@
     '''
     ns = []
     for namespace in namespaces:
-        if int(namespace) not in include:
+        if namespace not in include:
             value = namespaces[namespace].get(u'*', None)
             ns.append(value)
     return ns
@@ -165,8 +165,14 @@
     return flat
 
 
-def split_file(location, file, project, language_code, include, format='xml', zip=False):
-    '''Reads xml file and splits it in N chunks'''
+def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False):
+    '''
+    Reads xml file and splits it in N chunks
+
+    @namespaces is a list indicating which namespaces should be included, default
+    is to include namespace 0 (main namespace)
+    @zip indicates whether to compress the chunk or not
+    '''
     #location = os.path.join(settings.input_location, language)
     input = os.path.join(location, file)
     output = os.path.join(location, 'chunks')
@@ -178,7 +184,7 @@
     fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding)
 
     ns = load_namespace(language_code)
-    ns = build_namespaces_locale(ns, include)
+    ns = build_namespaces_locale(ns, namespaces)
 
     counter = 0
     tag = '{%s}page' % settings.xml_namespace
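With the int() cast gone, the membership test in build_namespaces_locale() compares the namespace keys directly against the include list; get_namespaces() in manage.py produces that list by splitting the command-line string on commas, so both sides are strings. A toy illustration of what the function computes, namely the localized names of the namespaces that are not included, using an invented namespaces mapping in the shape the code expects:

# Hypothetical data: namespace number -> {u'*': localized name}.
namespaces = {'0': {u'*': u''}, '1': {u'*': u'Talk'}, '2': {u'*': u'User'}}
include = ['0']   # as returned by get_namespaces(): strings, not ints

ns = []
for namespace in namespaces:
    if namespace not in include:        # plain string comparison after r77457
        ns.append(namespaces[namespace].get(u'*', None))

print ns   # [u'Talk', u'User'] (dict order may vary)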
Index: trunk/tools/editor_trends/etl/models.py
@@ -52,19 +52,19 @@
         self.lock = None
         for kw in kwargs:
             setattr(self, kw, kwargs[kw])
-
+
     def create_file_handle(self):
         self.mode = 'a'
         if self.output_file == None:
             self.mode = 'w'
             self.output_file = self.file[:-4] + '.txt'
-
+
         self.fh = utils.create_txt_filehandle(self.output, self.output_file, self.mode, settings.encoding)
 
     def __str__(self):
         return '%s' % (self.file)
 
-    def __call__(self):
+    def __call__(self, bots=None):
         if settings.debug:
             messages = {}
             vars = {}
@@ -104,5 +104,5 @@
 
         if settings.debug:
             utils.report_error_messages(messages, self.target)
-
-        return self.bots
\ No newline at end of file
+
+        return self.bots
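The new bots=None parameter makes XMLFile tasks callable with shared state threaded through them: the single-process consumer in bots/bots.py now runs bots = task(bots). A stripped-down sketch of that callable-task idiom; the class and file name are illustrative:

class Task(object):
    def __init__(self, file):
        self.file = file

    def __str__(self):
        return '%s' % self.file

    def __call__(self, bots=None):
        # Parse self.file here, updating the shared bots mapping,
        # then hand it back to the caller for the next task.
        bots = bots if bots != None else {}
        return bots

task = Task('chunk_0.xml')
bots = task({})   # mirrors 'bots = task(bots)' in bots/bots.py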
Index: trunk/tools/editor_trends/etl/loader.py
@@ -19,19 +19,21 @@
 
 import os
 import sys
+import multiprocessing
 from Queue import Empty
 
 sys.path.append('..')
-import configuration
+import configuration
 settings = configuration.Settings()
 from database import db
 from database import cache
 from utils import utils
 from utils import sort
-import process_constructor as pc
 
+#import process_constructor as pc
 
 
+
 def store_editors(input, dbname, collection):
     filename = utils.retrieve_file_list(input, 'txt', mask=None)[0]
     fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
@@ -47,7 +49,7 @@
     for line in sort.readline(fh):
         if len(line) == 0:
             continue
-        contributor = int(line[0])
+        contributor = int(line[0])
         if prev_contributor != contributor:
             if edits >= 10:
                 result = editor_cache.add(prev_contributor, 'NEXT')
@@ -94,31 +96,47 @@
     filehandles = [fh.close() for fh in filehandles]
     filename = 'merged_final.txt'
     for r in to_remove:
-        utils.delete_file(output ,r)
-
-
+        utils.delete_file(output , r)
 
 
-def mergesort_feeder(task_queue, **kwargs):
-    input = kwargs.get('input', None)
-    output = kwargs.get('output', None)
-    #while True:
-    #    try:
-    #file = input_queue.get(block=False)
-    for file in task_queue:
-        fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
-        data = fh.readlines()
-        fh.close()
-        data = [d.replace('\n', '') for d in data]
-        data = [d.split('\t') for d in data]
-        sorted_data = sort.mergesort(data)
-        sort.write_sorted_file(sorted_data, file, output)
 
 
+def mergesort_feeder(tasks, input, output):
+    while True:
+        try:
+            file = tasks.get(block=False)
+            print file
+            if file == None:
+                break
+            fh = utils.create_txt_filehandle(input, file, 'r', settings.encoding)
+            data = fh.readlines()
+            fh.close()
+            data = [d.replace('\n', '') for d in data]
+            data = [d.split('\t') for d in data]
+            sorted_data = sort.mergesort(data)
+            sort.write_sorted_file(sorted_data, file, output)
+        except Empty:
+            break
+
+
 def mergesort_launcher(input, output):
     settings.verify_environment([input, output])
     files = utils.retrieve_file_list(input, 'txt')
-    mergesort_feeder(files, input=input, output=output)
+    tasks = multiprocessing.JoinableQueue()
+    consumers = [multiprocessing.Process(target=mergesort_feeder, args=(tasks, input, output)) for i in xrange(settings.number_of_processes)]
+    for file in files:
+        tasks.put(file)
+
+    for x in xrange(settings.number_of_processes):
+        tasks.put(None)
+
+    for w in consumers:
+        w.start()
+
+    tasks.join()
+    #mergesort_feeder(file, input=input, output=output)
+
+
     #chunks = utils.split_list(files, settings.number_of_processes)
     #pc.build_scaffolding(pc.load_queue, mergesort_feeder, chunks, False, False, **kwargs)
 
@@ -138,7 +156,7 @@
     input = os.path.join(settings.input_location, 'en', 'wiki', 'txt')
     output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
     dbname = 'enwiki'
-    #mergesort_launcher(input, output)
+    mergesort_launcher(input, output)
     final_output = os.path.join(settings.input_location, 'en', 'wiki', 'dbready')
     mergesort_external_launcher(dbname, output, final_output)
-    store_editors(input, dbname, collection)
\ No newline at end of file
+    store_editors(input, dbname, collection)
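Each mergesort_feeder() call above sorts one tab-separated text file. A toy, dependency-free version of that per-file step, with the stdlib sort standing in for sort.mergesort() and invented file paths:

def sort_one_file(input_path, output_path):
    fh = open(input_path, 'r')
    data = fh.readlines()
    fh.close()
    data = [d.replace('\n', '') for d in data]
    data = [d.split('\t') for d in data]
    data.sort()                          # stands in for sort.mergesort(data)
    out = open(output_path, 'w')
    for row in data:
        out.write('\t'.join(row) + '\n')
    out.close()

sort_one_file('txt/chunk_0.txt', 'sorted/chunk_0.txt')   # illustrative paths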
Index: trunk/tools/editor_trends/configuration.py
@@ -152,13 +152,13 @@
             if not program.endswith('.exe'):
                 program = program + '.exe'
             path = self.detect_windows_program(program)
+            if path != None:
+                path = path + program
         elif self.platform == 'Linux':
             path = self.detect_linux_program(program)
-        if path != None:
-            return path + program
-        else:
-            return path
 
+        return path
+
     def determine_max_filehandles_open(self):
         if self.platform == 'Windows' and self.architecture == 'i386':
             return win32file._getmaxstdio()
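The relocated concatenation returns path + program only on Windows, which works when detect_windows_program() yields a directory ending in a separator; whether it does is not visible in this diff. A hypothetical equivalent using os.path.join, which inserts the separator only when one is missing:

import os

def join_program_path(path, program):
    # os.path.join('/usr/bin', '7z') -> '/usr/bin/7z';
    # plain 'path + program' relies on path ending in a separator.
    if path == None:
        return None
    return os.path.join(path, program)

print join_program_path('/usr/bin', '7z')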
Index: trunk/tools/editor_trends/utils/utils.py
@@ -220,10 +220,10 @@
     keys.sort()
     for key in keys:
         if write_key:
-            fh.write('%s' % key)
+            fh.write('%s\t' % key)
         if getattr(data[key], '__iter__', False):
             for d in data[key]:
-                fh.write('\t%s' % d)
+                fh.write('%s\t' % d)
         else:
             fh.write('%s\t' % (data[key]))
         if newline:
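After this change every field written by write_dict_to_csv, the key included, is followed by a tab instead of being preceded by one. A toy reproduction of just that layout; the row contents are invented:

import sys

def write_row(fh, key, values, write_key=True):
    if write_key:
        fh.write('%s\t' % key)
    for d in values:
        fh.write('%s\t' % d)
    fh.write('\n')

write_row(sys.stdout, 'ExampleBot', [12, 0.5])   # ExampleBot<TAB>12<TAB>0.5<TAB>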
Index: trunk/tools/editor_trends/utils/compression.py
@@ -18,8 +18,9 @@
 
     def __init__(self, location, file, output=None):
         self.extension = utils.determine_file_extension(file)
-        self.file = os.path.join(location, file)
+        self.file = file
         self.location = location
+        self.path = os.path.join(self.file, self.location)
         self.output = None
         self.name = None
         self.program = []
@@ -42,20 +43,20 @@
         '''
         if self.program == []:
             self.init_compression_tool(self.extension, 'compress')
-
+
         if self.program_installed == None:
             raise exceptions.CompressionNotSupportedError
-
+
         args = {'7z': ['%s' % self.program_installed, 'a', '-scsUTF-8', '-t%s' % self.compression, '%s' % self.output, '%s' % self.input],
                 }
-
+
         commands = args.get(self.name, None)
         if commands != None:
             p = subprocess.Popen(commands, shell=True).wait()
         else:
             raise exceptions.CompressionNotSupportedError
-
 
+
     def extract(self):
         '''
         @location is the directory where to store the compressed file
@@ -68,18 +69,25 @@
         if self.program_installed == None:
             raise exceptions.CompressionNotSupportedError
 
-        args = {'7z': ['%s' % self.program_installed,'e', '-o%s' % self.location, '%s' % self.file],
-                'bunzip2': ['%s' % self.program_installed, '-k', '%s' % self.file],
-                'zip': ['%s' % self.program_installed, '-o', '%s' % self.file],
-                'gz': ['%s' % self.program_installed, '-xzvf', '%s' % self.file],
-                'tar': ['%s' % self.program_installed, '-xvf', '%s' % self.file]
+        print self.location
+        print self.file
+        if not utils.check_file_exists(self.location, self.file):
+            raise exceptions.FileNotFoundException(self.location, self.file)
+
+
+
+        args = {'7z': ['%s' % self.program_installed, 'e', '-y', '-o%s' % self.location, '%s' % self.path],
+                'bunzip2': ['%s' % self.program_installed, '-k', '%s' % self.path],
+                'zip': ['%s' % self.program_installed, '-o', '%s' % self.path],
+                'gz': ['%s' % self.program_installed, '-xzvf', '%s' % self.path],
+                'tar': ['%s' % self.program_installed, '-xvf', '%s' % self.path]
                 }
         commands = args.get(self.name, None)
         if commands != None:
             p = subprocess.Popen(commands, shell=True).wait()
         else:
             raise exceptions.CompressionNotSupportedError
-
+
         # if self.name == '7z':
         #     p = subprocess.Popen(['%s' % tool.extract_installed, 'e', '-o%s' % location, '%s' % input], shell=True).wait()
         # elif tool_extract_installed.endswith('bunzip2'):
@@ -112,7 +120,7 @@
             path = settings.detect_installed_program(p)
             if path != None:
                 self.name = p
-                self.program_installed = path
+                self.program_installed = path
 
 
 if __name__ == '__main__':
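The constructor now keeps the bare file name and derives self.path with os.path.join(self.file, self.location); since os.path.join is order-sensitive, the two argument orders produce different strings, which is worth checking when extract() hands self.path to the external tool. A quick illustration with invented values:

import os

location = 'data/dumps'
file = 'enwiki-latest-stub-meta-history.xml.gz'

print os.path.join(file, location)   # enwiki-latest-stub-meta-history.xml.gz/data/dumps
print os.path.join(location, file)   # data/dumps/enwiki-latest-stub-meta-history.xml.gz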
Index: trunk/tools/editor_trends/utils/exceptions.py
@@ -23,12 +23,13 @@
     pass
 
 class FileNotFoundException(Error):
-    def __init__(self, file):
-        self.file = file
+    def __init__(self, file, path):
+        self.file = file
+        self.path = path
 
     def __str__(self):
-        print 'The file %s was not found. Please make sure your path is up-to-date.' % self.file
-
+        print 'The file %s was not found. Please make sure that the file exists and the path is correct.' % (os.path.join(self.path, self.file))
+
 class PlatformNotSupportedError(Error):
     def __init__(self, platform):
         self.platform = platform
@@ -39,6 +40,6 @@
 class CompressionNotSupportedError(Error):
     def __init__(self, extension):
         self.extension = extension
-
+
     def __str__(self):
-        print 'You have not installed a program to extract %s archives.' % self.extension
\ No newline at end of file
+        print 'You have not installed a program to extract %s archives.' % self.extension
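Two stdlib details around the updated FileNotFoundException: __str__ is expected to return a string (a print statement returns None), and os.path.join needs an os import that this hunk does not show. A minimal sketch of the exception as it appears intended to behave; the argument order here follows the call site in compression.py (location first, file second):

import os

class FileNotFoundException(Exception):
    def __init__(self, path, file):
        self.path = path
        self.file = file

    def __str__(self):
        return 'The file %s was not found. Please make sure that the file exists and the path is correct.' % os.path.join(self.path, self.file)

try:
    raise FileNotFoundException('data/dumps', 'dump.xml.gz')   # invented values
except FileNotFoundException, e:
    print e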
Index: trunk/tools/editor_trends/bots/models.py
@@ -31,12 +31,17 @@
 
 class Bot(object):
 
-    def __init__(self, name):
+    def __init__(self, name, **kwargs):
         self.name = name
         self.projects = []
         self.time = shaper.create_datacontainer(datatype='list')
         self.verified = True
+        for kw in kwargs:
+            setattr(self, kw, kwargs[kw])
 
+    def __repr__(self):
+        return self.name
+
     def hours_active(self):
         self.clock = shaper.create_clock()
         years = self.time.keys()
@@ -49,38 +54,42 @@
         hours.sort()
         for hour in hours:
             self.data.append(self.clock[hour])
-
-
 
+
+    def active(self):
+        return float(sum(self.clock.values())) / 24.0
+
+
     def avg_lag_between_edits(self):
         years = self.time.keys()
         edits = []
         for year in years:
             for x in self.time[year]:
                 edits.append(x)
-
+
         if edits != []:
             edits.sort()
             dt = datetime.timedelta()
             for i, edit in enumerate(edits):
-                if i == len(edits) -1:
+                if i == len(edits) - 1:
                     break
                 dt += edits[i + 1] - edits[i]
             dt = dt / len(edits)
             self.dt = dt
-            self.n = i
+            self.n = i
         else:
             self.dt = None
-
+
     def write_training_dataset(self, fh):
         self.data = []
         self.data.append(self.name)
         self.data.append(self.verified)
         self.add_clock_data()
+        self.active()
         self.data.append(self.dt)
         utils.write_list_to_csv(self.data, fh, recursive=False, newline=True)
-
 
 
+
 if __name__ == '__main__':
     pass
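A worked check of avg_lag_between_edits() with three invented timestamps: the loop accumulates the gaps between consecutive sorted edits, then divides by len(edits) (the number of edits, not the number of gaps):

import datetime

edits = [datetime.datetime(2010, 11, 30, 5, 0),
         datetime.datetime(2010, 11, 30, 5, 10),
         datetime.datetime(2010, 11, 30, 5, 40)]
edits.sort()
dt = datetime.timedelta()
for i, edit in enumerate(edits):
    if i == len(edits) - 1:
        break
    dt += edits[i + 1] - edits[i]
dt = dt / len(edits)
print dt   # 0:13:20 -- a 40 minute total gap divided by 3 edits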
Index: trunk/tools/editor_trends/bots/bots.py
@@ -44,13 +44,16 @@
     pass
 
 
-def read_bots_csv_file(manager, location, filename, encoding):
+def read_bots_csv_file(location, filename, encoding, manager=False):
     '''
     Constructs a dictionary from Bots.csv
     key is language
     value is a list of bot names
     '''
-    bot_dict = manager.dict()
+    if manager:
+        bot_dict = manager.dict()
+    else:
+        bot_dict = dict()
     for line in utils.read_data_from_csv(location, filename, encoding):
         line = utils.clean_string(line)
         language, bots = line.split(',')
@@ -84,7 +87,7 @@
     This file reads the results from the lookup_bot_userid function and stores
     it in a MongoDB collection.
     '''
-    bots = utils.read_dict_from_csv(settings.csv_location, 'bots_ids.csv', settings.encoding)
+    #bots = utils.read_dict_from_csv(settings.csv_location, 'bots_ids.csv', settings.encoding)
     mongo = db.init_mongo_db('bots')
     collection = mongo['ids']
     db.remove_documents_from_mongo_db(collection, None)
@@ -124,42 +127,47 @@
         if username == None or username.text == None:
             continue
         else:
-            username = username.text
+            username = username.text #encode(settings.encoding)
+            name = username.lower()
+
         #print username.encode('utf-8')
         if username in bots and bots[username].verified == True:
             id = contributor.find('id').text
             bot = bots[username]
 
             if not hasattr(bot, 'written'):
-                bot_dict = convert_object_to_dict(bot, exclude=['time', 'name', 'written'])
-                bot_dict['_username'] = username
-                bot_dict['id'] = id
+                if username == 'Robbot':
+                    print 'test'
+                bot.id = id
+                bot.name = username
+                bot_dict = convert_object_to_dict(bot, exclude=['time', 'written'])
+                #bot_dict['_username'] = username
+                #bot_dict['id'] = id
                 lock.acquire()
                 utils.write_dict_to_csv(bot_dict, fh, write_key=False)
                 lock.release()
-                bot.written = True
+                bot.written = True
+                bots[username] = bot
             #bots.pop(username)
             #if bots == {}:
             #    print 'Found id numbers for all bots.'
             #    return 'break'
         #print username.encode('utf-8')
-        name = username.lower()
+
         if name.find('bot') > -1:
-            bot = bots.get(username, botmodels.Bot(username))
-            if bot not in bots:
-                bot.verified = False
+            bot = bots.get(username, botmodels.Bot(username, verified=False))
             timestamp = revision.find('timestamp').text
             if timestamp != None:
                 timestamp = utils.convert_timestamp_to_datetime_naive(timestamp)
                 bot.time[str(timestamp.year)].append(timestamp)
             bots[username] = bot
-
+    return bots
     #bot = bots.get('PseudoBot')
     #bot.hours_active()
     #bot.avg_lag_between_edits()
 
 
-def bot_launcher(language_code, project, single=False):
+def bot_launcher(language_code, project, single=False, manager=False):
     '''
     This function sets the stage to launch bot id detection and collecting data
     to discover new bots.
@@ -171,25 +179,28 @@
     files = utils.retrieve_file_list(input, 'xml', mask=None)
     input_queue = pc.load_queue(files, poison_pill=True)
     tasks = multiprocessing.JoinableQueue()
-    manager = multiprocessing.Manager()
-    bots = manager.dict()
-    lock = manager.Lock()
-    bots = read_bots_csv_file(manager, settings.csv_location, 'Bots.csv', settings.encoding)
+    mgr = multiprocessing.Manager()
+    bots = mgr.dict()
+    lock = mgr.Lock()
+    if manager:
+        manager = mgr
+    bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager)
 
     for file in files:
         tasks.put(models.XMLFile(input, settings.csv_location, file, bots, lookup_bot_userid, 'bots_ids.csv', lock=lock))
 
+    tracker = {}
     if single:
         while True:
             try:
                 print '%s files left in the queue...' % tasks.qsize()
                 task = tasks.get(block=False)
-                task()
+                bots = task(bots)
             except Empty:
                 break
     else:
         bot_launcher_multi(tasks)
-
+
     utils.store_object(bots, settings.binary_location, 'bots.bin')
     bot_training_dataset(bots)
     store_bots()
@@ -201,10 +212,10 @@
             print '%s' % key.encode(settings.encoding)
         except:
             pass
-
-
 
 
+
+
 def bot_training_dataset(bots):
     fh = utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', settings.encoding)
     keys = bots.keys()
@@ -213,10 +224,10 @@
         bot.hours_active()
         bot.avg_lag_between_edits()
         bot.write_training_dataset(fh)
-
+
     fh.close()
-
-
+
+
 def bot_launcher_multi(tasks):
     '''
     This is the launcher that uses multiprocesses.
@@ -230,9 +241,14 @@
 
     tasks.join()
 
+def debug_bots_dict():
+    bots = utils.load_object(settings.binary_location, 'bots.bin')
+    print 'done'
 
+
 if __name__ == '__main__':
     language_code = 'en'
     project = 'wiki'
-    #bot_launcher(language_code, project, single=True)
-    cProfile.run(bot_launcher(language_code, project, single=True), 'profile')
+    #debug_bots_dict()
+    bot_launcher(language_code, project, single=True)
+    #cProfile.run(bot_launcher(language_code, project, single=True), 'profile')
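The manager switch added to read_bots_csv_file() decides whether the bot dictionary is shareable across processes: a multiprocessing.Manager dict is a proxy that worker processes can read and write, while a plain dict lives in one process only. A minimal sketch of the same switch; names and contents are illustrative:

import multiprocessing

def make_bot_dict(manager=False):
    if manager:
        return manager.dict()   # proxy object, usable from worker processes
    return dict()               # ordinary dict, single process only

if __name__ == '__main__':
    mgr = multiprocessing.Manager()
    shared = make_bot_dict(manager=mgr)
    local = make_bot_dict()
    shared['en'] = ['ExampleBot']   # hypothetical bot list for one language
    print shared['en']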
