Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import logging |
| 23 | +import logging.handlers |
23 | 24 | import sys |
24 | 25 | import datetime |
25 | 26 | from argparse import ArgumentParser |
— | — | @@ -36,6 +37,7 @@ |
37 | 38 | from utils import dump_downloader |
38 | 39 | from utils import compression |
39 | 40 | from utils import ordered_dict |
| 41 | +from utils import exceptions |
40 | 42 | from database import db |
41 | 43 | from etl import chunker |
42 | 44 | from etl import extract |
— | — | @@ -104,11 +106,18 @@ |
105 | 107 | logger.debug('Starting %s task' % function.func_name) |
106 | 108 | if message: |
107 | 109 | logger.debug(message) |
| 110 | + |
 | 111 | + max_length = max(len(kw) for kw in kwargs) if kwargs else 0 |
108 | 113 | for kw in kwargs: |
109 | 114 | if verb: |
110 | 115 | logger.debug('Action: %s\tSetting: %s' % (verb, kwargs[kw])) |
111 | 116 | else: |
112 | | - logger.debug('Key: %s\tSetting: %s' % (kw, kwargs[kw])) |
| 117 | + tabs = (max_length - len(kw)) / 4 |
| 118 | + if tabs == 0: |
| 119 | + tabs = 1 |
 | 120 | + tabs = '\t' * tabs |
| 121 | + logger.debug('\tKey: %s%sSetting: %s' % (kw, tabs, kwargs[kw])) |
113 | 122 | |
114 | 123 | |
115 | 124 | |
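Note: the padding logic above lines up the Setting column by inserting tabs proportional to the longest keyword. A standalone sketch of the same idea (Python 2; the function name and tab width are illustrative, not from the source):

    def format_kwargs(kwargs, tab_width=4):
        # Pad each key with tabs so the Setting column aligns.
        if not kwargs:
            return []
        max_length = max(len(kw) for kw in kwargs)
        lines = []
        for kw in kwargs:
            tabs = (max_length - len(kw)) / tab_width
            if tabs == 0:
                tabs = 1
            lines.append('\tKey: %s%sSetting: %s' % (kw, '\t' * tabs, kwargs[kw]))
        return lines

    for line in format_kwargs({'location': '/data', 'language_code': 'en'}):
        print line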
— | — | @@ -128,6 +137,7 @@ |
129 | 138 | location = get_value(args, 'location') if get_value(args, 'location') != None else settings.input_location |
130 | 139 | project = get_project(args) |
131 | 140 | language_code = get_language(args) |
| 141 | + config['format'] = get_value(args, 'format') |
132 | 142 | config['language_code'] = language_code |
133 | 143 | config['language'] = get_value(args, 'language') |
134 | 144 | config['location'] = os.path.join(location, language_code, project) |
— | — | @@ -191,7 +201,7 @@ |
192 | 202 | language = kwargs.pop('language') |
193 | 203 | language_code = kwargs.pop('language_code') |
194 | 204 | namespaces = kwargs.pop('namespaces') |
195 | | - |
| 205 | + format = kwargs.pop('format') |
196 | 206 | ext = utils.determine_file_extension(filename) |
197 | 207 | file = filename.replace('.' + ext, '') |
198 | 208 | result = utils.check_file_exists(location, file) |
— | — | @@ -202,11 +212,12 @@ |
203 | 213 | if retcode != 0: |
204 | 214 | sys.exit(retcode) |
205 | 215 | |
206 | | - chunker.split_file(location, file, project, language_code, namespaces, format='xml', zip=False) |
| 216 | + chunker.split_file(location, file, project, language_code, namespaces, format=format, zip=False) |
207 | 217 | timer.elapsed() |
208 | 218 | |
209 | 219 | |
210 | 220 | def launch_zip_extractor(args, logger, location, file): |
 | 221 | + print 'Extracting compressed file' |
211 | 222 | timer = Timer() |
212 | 223 | write_message_to_log(logger, args, location=location, file=file) |
213 | 224 | compressor = compression.Compressor(location, file) |
— | — | @@ -215,56 +226,60 @@ |
216 | 227 | |
217 | 228 | |
218 | 229 | def extract_launcher(args, logger, **kwargs): |
| 230 | + print 'Extracting data from XML' |
219 | 231 | timer = Timer() |
220 | | - write_message_to_log(logger, args, **kwargs) |
221 | 232 | location = kwargs.pop('location') |
222 | 233 | language_code = kwargs.pop('language_code') |
223 | 234 | project = kwargs.pop('project') |
| 235 | + write_message_to_log(logger, args, location=location, language_code=language_code, project=project) |
224 | 236 | extract.run_parse_editors(location, **kwargs) |
225 | 237 | timer.elapsed() |
226 | 238 | |
227 | 239 | |
228 | 240 | def sort_launcher(args, logger, **kwargs): |
| 241 | + print 'Start sorting data' |
229 | 242 | timer = Timer() |
230 | | - write_message_to_log(logger, args, **kwargs) |
231 | 243 | location = kwargs.pop('location') |
232 | 244 | input = os.path.join(location, 'txt') |
233 | 245 | output = os.path.join(location, 'sorted') |
234 | 246 | final_output = os.path.join(location, 'dbready') |
| 247 | + write_message_to_log(logger, args, location=location, input=input, output=output, final_output=final_output) |
235 | 248 | loader.mergesort_launcher(input, output) |
236 | 249 | loader.mergesort_external_launcher(output, final_output) |
237 | 250 | timer.elapsed() |
238 | 251 | |
239 | 252 | |
240 | 253 | def store_launcher(args, logger, **kwargs): |
| 254 | + print 'Start storing data in MongoDB' |
241 | 255 | timer = Timer() |
242 | | - write_message_to_log(logger, args, **kwargs) |
243 | 256 | location = kwargs.pop('location') |
244 | 257 | input = os.path.join(location, 'dbready') |
245 | 258 | dbname = kwargs.pop('full_project') |
246 | 259 | collection = kwargs.pop('collection') |
| 260 | + write_message_to_log(logger, args, verb='Storing', location=location, input=input, dbname=dbname, collection=collection) |
247 | 261 | loader.store_editors(input, dbname, collection) |
248 | 262 | timer.elapsed() |
249 | 263 | |
250 | 264 | |
251 | 265 | def transformer_launcher(args, logger, **kwargs): |
252 | | - print 'dataset launcher' |
| 266 | + print 'Start transforming dataset' |
253 | 267 | timer = Timer() |
254 | | - write_message_to_log(logger, args, **kwargs) |
255 | 268 | project = kwargs.pop('full_project') |
256 | 269 | collection = kwargs.pop('collection') |
| 270 | + write_message_to_log(logger, args, verb='Transforming', project=project, collection=collection) |
257 | 271 | transformer.transform_editors_single_launcher(project, collection) |
258 | 272 | timer.elapsed() |
259 | 273 | |
260 | 274 | |
261 | 275 | def exporter_launcher(args, logger, **kwargs): |
| 276 | + print 'Start exporting dataset' |
262 | 277 | timer = Timer() |
263 | | - write_message_to_log(logger, args, **kwargs) |
264 | 278 | collection = get_value(args, 'collection') |
265 | | - dbname = kwargs.pop('full_project') |
| 279 | + dbname = kwargs.get('full_project') |
266 | 280 | targets = get_value(args, 'datasets') |
267 | 281 | targets = targets.split(',') |
268 | 282 | for target in targets: |
| 283 | + write_message_to_log(logger, args, verb='Exporting', target=target, dbname=dbname, collection=collection) |
269 | 284 | exporter.dataset_launcher(dbname, collection, target) |
270 | 285 | timer.elapsed() |
271 | 286 | |
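Note: the launcher hunks above all follow the same pattern: pop the keyword arguments the launcher itself consumes, then log those values explicitly instead of dumping **kwargs. A minimal sketch of the pattern (downstream is a hypothetical stand-in for the next ETL stage):

    def downstream(location, **kwargs):
        pass   # stand-in for the next ETL stage

    def example_launcher(args, logger, **kwargs):
        location = kwargs.pop('location')   # consumed by this launcher
        project = kwargs.pop('project')     # consumed by this launcher
        write_message_to_log(logger, args, location=location, project=project)
        downstream(location, **kwargs)      # remaining kwargs pass through untouched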
— | — | @@ -274,16 +289,19 @@ |
275 | 290 | timer = Timer() |
276 | 291 | full_project = kwargs.get('full_project', None) |
277 | 292 | message = 'Start of building %s dataset.' % full_project |
278 | | - db.cleanup_database(full_project) |
279 | | - write_message_to_log(logger, args, message, **kwargs) |
| 293 | + |
| 294 | + db.cleanup_database(full_project, logger) |
280 | 295 | ignore = get_value(args, 'except') |
281 | 296 | clean = get_value(args, 'new') |
| 297 | + format = get_value(args, 'format') |
| 298 | + write_message_to_log(logger, args, message=message, full_project=full_project, ignore=ignore, clean=clean) |
282 | 299 | if clean: |
283 | 300 | dirs = kwargs.get('directories')[1:] |
284 | 301 | for dir in dirs: |
285 | | - write_message_to_log(logger, args, verb='Deleting', **kwargs) |
286 | | - utils.delete_file(dir, '') |
287 | | - |
| 302 | + write_message_to_log(logger, args, verb='Deleting', dir=dir) |
| 303 | + utils.delete_file(dir, '', directory=True) |
| 304 | + if format != 'xml': |
 | 305 | + ignore = (ignore + ',extract') if ignore else 'extract' |
288 | 306 | functions = ordered_dict.OrderedDict(((dump_downloader_launcher, 'download'), |
289 | 307 | (chunker_launcher, 'split'), |
290 | 308 | (extract_launcher, 'extract'), |
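Note: all_launcher keeps an ordered mapping of step function -> step name and, presumably, skips any step whose name appears in the comma-separated ignore string; appending ',extract' when format != 'xml' therefore disables the separate extract step, since the txt chunker already flattens the revisions. A sketch of that dispatch (assumed behavior, not shown in this hunk):

    def run_pipeline(functions, ignore):
        # functions: ordered mapping of callable -> step name, as built above
        skip = ignore.split(',') if ignore else []
        for func in functions:
            if functions[func] not in skip:
                func()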
— | — | @@ -328,8 +346,9 @@ |
329 | 347 | version = sys.version_info[0:2] |
330 | 348 | logger.debug('Python version: %s' % '.'.join(str(version))) |
331 | 349 | if version < settings.minimum_python_version: |
332 | | - raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
 | 350 | + raise exceptions.OutDatedPythonVersionError(version) |
333 | 351 | |
| 352 | + |
334 | 353 | def about(): |
335 | 354 | print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.' |
336 | 355 | print 'Written by Diederik van Liere (dvanliere@gmail.com).' |
— | — | @@ -339,9 +358,6 @@ |
340 | 359 | |
341 | 360 | |
342 | 361 | def main(): |
343 | | - logger = logging.getLogger('manager') |
344 | | - logger.setLevel(logging.DEBUG) |
345 | | - |
346 | 362 | default_language = determine_default_language() |
347 | 363 | |
348 | 364 | datasets = {'cohort': 'generate_cohort_dataset', |
— | — | @@ -397,7 +413,7 @@ |
398 | 414 | help='Should be a list of functions that are to be ignored when executing \'all\'.', |
399 | 415 | default=[]) |
400 | 416 | |
401 | | - parser_all.add_argument('-n', '--new', action='store_false', |
| 417 | + parser_all.add_argument('-n', '--new', action='store_true', |
402 | 418 | help='This will delete all previous output and start from scratch. Mostly useful for debugging purposes.', |
403 | 419 | default=False) |
404 | 420 | |
— | — | @@ -421,10 +437,13 @@ |
422 | 438 | default=settings.input_location |
423 | 439 | ) |
424 | 440 | |
425 | | - parser.add_argument('-n', '--namespace', action='store', |
| 441 | + parser.add_argument('-ns', '--namespace', action='store', |
426 | 442 | help='A list of namespaces to include for analysis.', |
427 | 443 | default='0') |
428 | 444 | |
| 445 | + parser.add_argument('-fo', '--format', action='store', |
 | 446 | + help='Indicate in which format the chunks should be stored. Valid options are xml and txt.', |
| 447 | + default='txt') |
429 | 448 | |
430 | 449 | parser.add_argument('-f', '--file', action='store', |
431 | 450 | choices=file_choices, |
— | — | @@ -444,9 +463,22 @@ |
445 | 464 | parser.add_argument('-prog', '--progress', action='store_true', default=True, |
446 | 465 | help='Indicate whether you want to have a progressbar.') |
447 | 466 | |
| 467 | + args = parser.parse_args() |
| 468 | + #initialize logger |
| 469 | + logger = logging.getLogger('manager') |
| 470 | + logger.setLevel(logging.DEBUG) |
| 471 | + |
| 472 | + # Add the log message handler to the logger |
| 473 | + today = datetime.datetime.today() |
| 474 | + log_filename = os.path.join(settings.log_location, '%s%s_%s-%s-%s.log' % (args.language, args.project, today.day, today.month, today.year)) |
| 475 | + handler = logging.handlers.RotatingFileHandler(log_filename, maxBytes=1024 * 1024, backupCount=3) |
| 476 | + |
| 477 | + logger.addHandler(handler) |
| 478 | + logger.debug('Default language: \t%s' % default_language) |
| 479 | + |
| 480 | + #start manager |
448 | 481 | detect_python_version(logger) |
449 | 482 | about() |
450 | | - args = parser.parse_args() |
451 | 483 | config.create_configuration(settings, args) |
452 | 484 | locations = determine_file_locations(args, logger) |
453 | 485 | settings.verify_environment(locations['directories']) |
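Note: the logger can only be configured after parse_args() because the log filename is derived from args.language and args.project. A self-contained sketch of the rotating-file setup introduced above (paths and names are placeholders):

    import os
    import datetime
    import logging
    import logging.handlers

    def init_logger(log_location, language, project):
        logger = logging.getLogger('manager')
        logger.setLevel(logging.DEBUG)
        today = datetime.datetime.today()
        filename = '%s%s_%s-%s-%s.log' % (language, project,
                                          today.day, today.month, today.year)
        handler = logging.handlers.RotatingFileHandler(
            os.path.join(log_location, filename),
            maxBytes=1024 * 1024,  # rotate after ~1MB
            backupCount=3)         # keep three rotated logs
        logger.addHandler(handler)
        return logger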
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -26,7 +26,7 @@ |
27 | 27 | return utils.unescape(text) |
28 | 28 | |
29 | 29 | |
30 | | -def extract_text(elem, kwargs): |
| 30 | +def extract_text(elem, **kwargs): |
31 | 31 | if elem != None and elem.text != None: |
32 | 32 | #try: |
33 | 33 | return elem.text #.decode(settings.encoding) |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -20,13 +20,12 @@ |
21 | 21 | #Default Python libraries (Python => 2.6) |
22 | 22 | import sys |
23 | 23 | import os |
24 | | -import time |
25 | | -import datetime |
26 | | -import codecs |
27 | | -import math |
| 24 | +#import time |
| 25 | +#import datetime |
| 26 | +#import codecs |
| 27 | +#import math |
28 | 28 | |
29 | | -import re |
30 | | -from operator import itemgetter |
| 29 | +#from operator import itemgetter |
31 | 30 | |
32 | 31 | import multiprocessing |
33 | 32 | from Queue import Empty |
— | — | @@ -43,7 +42,7 @@ |
44 | 43 | from wikitree import xml |
45 | 44 | from bots import bots |
46 | 45 | from etl import models |
47 | | -from utils import process_constructor as pc |
| 46 | +#from utils import process_constructor as pc |
48 | 47 | |
49 | 48 | try: |
50 | 49 | import psyco |
— | — | @@ -51,25 +50,49 @@ |
52 | 51 | except ImportError: |
53 | 52 | pass |
54 | 53 | |
| 54 | +def validate_hostname(address): |
| 55 | + ''' |
| 56 | + This is not a foolproof solution at all. The problem is that it's really hard |
| 57 | + to determine whether a string is a hostname or not **reliably**. This is a |
 | 58 | + very fast rule of thumb; it will lead to false positives, but that's life :) |
| 59 | + ''' |
| 60 | + parts = address.split(".") |
 | 61 | + return len(parts) > 2 |
55 | 65 | |
| 66 | +def validate_ip(address): |
| 67 | + parts = address.split(".") |
| 68 | + if len(parts) != 4: |
| 69 | + return False |
| 70 | + parts = parts[:3] |
| 71 | + for item in parts: |
| 72 | + try: |
| 73 | + if not 0 <= int(item) <= 255: |
| 74 | + return False |
| 75 | + except ValueError: |
| 76 | + return False |
| 77 | + return True |
56 | 78 | |
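Note: the two validators classify a contributor string; anything that is neither a plausible IP nor a plausible hostname is treated downstream as a username. Usage sketch (values illustrative); observe that validate_ip only range-checks the first three octets (parts = parts[:3]), so '1.2.3.999' still passes:

    def is_anonymous(address):
        return validate_ip(address) or validate_hostname(address)

    for value in ['145.97.39.155', 'foo.example.org', 'Jimbo Wales']:
        print value, is_anonymous(value)   # True, True, False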
57 | 79 | |
58 | | -def determine_username_is_bot(contributor, bots): |
| 80 | +def determine_username_is_bot(contributor, **kwargs): |
59 | 81 | ''' |
60 | 82 | #contributor is an xml element containing the id of the contributor |
61 | 83 | @bots should have a dict with all the bot ids and bot names |
62 | 84 | @Return 0 if the username id is not in the bot dict, or 1 if it |
63 | 85 | is a bot id. |
64 | 86 | ''' |
| 87 | + bots = kwargs.get('bots') |
65 | 88 | for elem in contributor: |
66 | 89 | if elem.tag == 'id': |
67 | | - if elem.text in bots['bots']: |
| 90 | + if elem.text in bots: |
68 | 91 | return 1 |
69 | 92 | else: |
70 | 93 | return 0 |
71 | 94 | |
72 | 95 | |
73 | | -def extract_username(contributor, kwargs): |
| 96 | +def extract_username(contributor, **kwargs): |
74 | 97 | for elem in contributor: |
75 | 98 | if elem.tag == 'username': |
76 | 99 | return elem.text |
— | — | @@ -77,41 +100,44 @@ |
78 | 101 | return None |
79 | 102 | |
80 | 103 | |
81 | | -def extract_contributor_id(contributor, kwargs): |
| 104 | +def extract_contributor_id(contributor, **kwargs): |
82 | 105 | ''' |
83 | 106 | @contributor is the xml contributor node containing a number of attributes |
84 | 107 | |
85 | 108 | Currently, we are only interested in registered contributors, hence we |
86 | 109 | ignore anonymous editors. |
87 | 110 | ''' |
88 | | - if contributor.get('deleted'): |
89 | | - return - 1 # ASK: Not sure if this is the best way to code deleted contributors. |
| 111 | + #if contributor.get('deleted'): |
| 112 | + # return None # ASK: Not sure if this is the best way to code deleted contributors. |
90 | 113 | for elem in contributor: |
91 | | - if elem.tag == 'id': |
92 | | - if elem.text != None: |
93 | | - return elem.text |
| 114 | + if elem.tag == 'id' and elem.text != None: |
| 115 | + return {'id':elem.text} |
| 116 | + |
| 117 | + elif elem.tag == 'ip' and elem.text != None: |
| 118 | + if validate_ip(elem.text) == False and validate_hostname(elem.text) == False: |
| 119 | + return {'username':elem.text, 'id': elem.text} |
94 | 120 | else: |
95 | | - return - 1 |
| 121 | + return None |
| 122 | + return None |
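Note: extract_contributor_id now returns a dict, or None, instead of a bare id: {'id': text} for a registered editor, {'username': text, 'id': text} when the <ip> tag holds something that is really a username, and None for genuine anonymous edits. Callers merge whichever keys come back, as output_editor_information does below; a minimal sketch:

    def merge_result(vars, value):
        # value is the return of extract_contributor_id (dict or None)
        if isinstance(value, dict):
            vars.update(value)
        return vars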
96 | 123 | |
97 | | - |
98 | 124 | def output_editor_information(elem, fh, **kwargs): |
99 | 125 | ''' |
100 | 126 | @elem is an XML element containing 1 revision from a page |
101 | | - @output is where to store the data, either a queue or a filehandle |
| 127 | + @output is where to store the data, a filehandle |
102 | 128 | @**kwargs contains extra information |
103 | 129 | |
104 | 130 | the variable tags determines which attributes are being parsed, the values in |
105 | 131 | this dictionary are the functions used to extract the data. |
106 | 132 | ''' |
107 | | - tags = {'contributor': {'editor': extract_contributor_id, |
| 133 | + tags = {'contributor': {'id': extract_contributor_id, |
108 | 134 | 'bot': determine_username_is_bot, |
109 | 135 | 'username': extract_username, |
110 | 136 | }, |
111 | 137 | 'timestamp': {'date': xml.extract_text}, |
112 | 138 | } |
113 | 139 | vars = {} |
114 | | - headers = ['editor', 'date', 'article', 'username'] |
115 | | - #destination = kwargs.pop('destination') |
| 140 | + #counter = kwargs.pop('counter') |
| 141 | + headers = ['id', 'date', 'article', 'username'] |
116 | 142 | revisions = elem.findall('revision') |
117 | 143 | for revision in revisions: |
118 | 144 | vars['article'] = elem.find('id').text.decode(settings.encoding) |
— | — | @@ -119,18 +145,27 @@ |
120 | 146 | for tag, functions in tags.iteritems(): |
121 | 147 | xml_node = xml.retrieve_xml_node(elements, tag) |
122 | 148 | for var, function in functions.iteritems(): |
123 | | - vars[var] = function(xml_node, kwargs) |
| 149 | + value = function(xml_node, **kwargs) |
 | 150 | + if isinstance(value, dict): |
| 151 | + for kw in value: |
| 152 | + vars[kw] = value[kw] |
| 153 | + #if vars['username'] not in counter: |
| 154 | + # counter['username'] = c |
| 155 | + # c += 1 |
| 156 | + #vars['id'] = counter[vars['username']] |
| 157 | + else: |
| 158 | + vars[var] = value |
124 | 159 | |
125 | 160 | #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot']) |
126 | | - if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None: |
| 161 | + if vars['bot'] != 1 and vars['id'] != None: |
127 | 162 | vars.pop('bot') |
128 | 163 | data = [] |
129 | 164 | for head in headers: |
130 | 165 | data.append(vars[head]) |
131 | 166 | utils.write_list_to_csv(data, fh) |
132 | 167 | vars = {} |
| 168 | + #return counter, c |
133 | 169 | |
134 | | - |
135 | 170 | def run_parse_editors(location, **kwargs): |
136 | 171 | bot_ids = bots.retrieve_bots() |
137 | 172 | input = os.path.join(location, 'chunks') |
— | — | @@ -157,7 +192,7 @@ |
158 | 193 | bot_ids = bots.retrieve_bots() |
159 | 194 | input = os.path.join(location, 'chunks') |
160 | 195 | output = os.path.join(location, 'txt') |
161 | | - xml_file = models.XMLFile(input, output, '1.xml', bot_ids, output_editor_information) |
| 196 | + xml_file = models.XMLFile(input, output, 'pages_full_en.xml', bot_ids, output_editor_information) |
162 | 197 | xml_file() |
163 | 198 | |
164 | 199 | if __name__ == '__main__': |
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -30,10 +30,14 @@ |
31 | 31 | |
32 | 32 | sys.path.append('..') |
33 | 33 | import configuration |
| 34 | +settings = configuration.Settings() |
| 35 | + |
34 | 36 | from utils import utils |
| 37 | +import extract |
35 | 38 | from wikitree import xml |
36 | | -settings = configuration.Settings() |
| 39 | +from bots import bots |
37 | 40 | |
| 41 | + |
38 | 42 | try: |
39 | 43 | import psyco |
40 | 44 | psyco.full() |
— | — | @@ -114,12 +118,12 @@ |
115 | 119 | return True |
116 | 120 | |
117 | 121 | |
118 | | -def write_xml_file(element, fh, output, counter): |
| 122 | +def write_xml_file(element, fh, output, counter, format): |
119 | 123 | '''Get file handle and write xml element to file''' |
120 | 124 | try: |
121 | 125 | xml_string = cElementTree.tostring(element) |
122 | 126 | size = len(xml_string) |
123 | | - fh, counter, new_file = create_file_handle(fh, output, counter, size) |
| 127 | + fh, counter, new_file = create_file_handle(fh, output, counter, size, format) |
124 | 128 | fh.write(xml_string) |
125 | 129 | except MemoryError: |
126 | 130 | print 'Add error capturing logic' |
— | — | @@ -134,7 +138,7 @@ |
135 | 139 | return fh, counter, new_file |
136 | 140 | |
137 | 141 | |
138 | | -def create_file_handle(fh, output, counter, size): |
| 142 | +def create_file_handle(fh, output, counter, size, format): |
139 | 143 | ''' |
140 | 144 | @fh is file handle, if none is supplied or if file size > max file size then |
141 | 145 | create a new file handle |
— | — | @@ -144,56 +148,79 @@ |
145 | 149 | ''' |
146 | 150 | if not fh: |
147 | 151 | counter = 0 |
148 | | - path = os.path.join(output, '%s.xml' % counter) |
| 152 | + path = os.path.join(output, '%s.%s' % (counter, format)) |
149 | 153 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
150 | 154 | return fh, counter, False |
151 | 155 | elif (fh.tell() + size) > settings.max_xmlfile_size: |
152 | 156 | print 'Created chunk %s' % (counter + 1) |
153 | 157 | fh.close |
154 | 158 | counter += 1 |
155 | | - path = os.path.join(output, '%s.xml' % counter) |
| 159 | + path = os.path.join(output, '%s.%s' % (counter, format)) |
156 | 160 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
157 | 161 | return fh, counter, True |
158 | 162 | else: |
159 | 163 | return fh, counter, False |
160 | 164 | |
161 | 165 | |
162 | | -def flatten_xml_elements(data, page): |
| 166 | +def flatten_xml_elements(data, page, bots): |
| 167 | + headers = ['id', 'date', 'article', 'username'] |
| 168 | + tags = {'contributor': {'id': extract.extract_contributor_id, |
| 169 | + 'bot': extract.determine_username_is_bot, |
| 170 | + 'username': extract.extract_username, |
| 171 | + }, |
| 172 | + 'timestamp': {'date': xml.extract_text}, |
| 173 | + } |
| 174 | + vars = {} |
163 | 175 | flat = [] |
| 176 | + |
164 | 177 | for x, elems in enumerate(data): |
165 | | - flat.append([page]) |
166 | | - for elem in elems: |
167 | | - if elem.tag != 'id': |
168 | | - if len(elem.getchildren()) > 0: |
169 | | - for el in elem.getchildren(): |
170 | | - flat[x].append(xml.extract_text(elem, None)) |
| 178 | + vars[x] = {} |
| 179 | + vars[x]['article'] = page |
| 180 | + for tag in tags: |
| 181 | + el = xml.retrieve_xml_node(elems, tag) |
 | 182 | + for function, f in tags[tag].iteritems(): |
 | 183 | + value = f(el, bots=bots) |
 | 185 | + if isinstance(value, dict): |
| 186 | + for kw in value: |
| 187 | + vars[x][kw] = value[kw] |
171 | 188 | else: |
172 | | - flat[x].append(xml.extract_text(elem, None)) |
| 189 | + vars[x][function] = value |
| 190 | + |
 | 191 | + for x in vars: |
| 192 | + if vars[x]['bot'] == 1 or vars[x]['id'] == None or vars[x]['username'] == None: |
| 193 | + continue |
| 194 | + else: |
| 195 | + f = [] |
| 196 | + for head in headers: |
| 197 | + f.append(vars[x][head]) |
| 198 | + flat.append(f) |
| 199 | + |
173 | 200 | return flat |
174 | 201 | |
175 | 202 | |
176 | 203 | def split_file(location, file, project, language_code, namespaces=[0], format='xml', zip=False): |
177 | 204 | ''' |
178 | 205 | Reads xml file and splits it in N chunks |
179 | | - |
180 | 206 | @namespaces is a list indicating which namespaces should be included, default |
181 | 207 | is to include namespace 0 (main namespace) |
182 | 208 | @zip indicates whether to compress the chunk or not |
183 | 209 | ''' |
184 | | - #location = os.path.join(settings.input_location, language) |
185 | 210 | input = os.path.join(location, file) |
186 | | - output = os.path.join(location, 'chunks') |
187 | | - settings.verify_environment([output]) |
188 | 211 | if format == 'xml': |
189 | | - fh = None |
| 212 | + output = os.path.join(location, 'chunks') |
190 | 213 | else: |
191 | | - f = input.replace('.xml', '') |
192 | | - fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding) |
| 214 | + output = os.path.join(location, 'txt') |
| 215 | + bot_ids = bots.retrieve_bots() |
| 216 | + settings.verify_environment([output]) |
193 | 217 | |
| 218 | + fh = None |
| 219 | + counter = 0 |
| 220 | + |
194 | 221 | ns = load_namespace(language_code) |
195 | 222 | ns = build_namespaces_locale(ns, namespaces) |
| 223 | + #settings.xml_namespace = 'http://www.mediawiki.org/xml/export-0.4/' |
196 | 224 | |
197 | | - counter = 0 |
198 | 225 | tag = '{%s}page' % settings.xml_namespace |
199 | 226 | context = cElementTree.iterparse(input, events=('start', 'end')) |
200 | 227 | context = iter(context) |
— | — | @@ -206,16 +233,21 @@ |
207 | 234 | if is_article_main_namespace(elem, ns): |
208 | 235 | page = elem.find('id').text |
209 | 236 | elem = parse_comments(elem, remove_numeric_character_references) |
| 237 | + |
210 | 238 | if format == 'xml': |
211 | | - fh, counter, new_file = write_xml_file(elem, fh, output, counter) |
212 | | - if zip and new_file: |
213 | | - file = str(counter - 1) + '.xml' |
214 | | - utils.zip_archive(settings.path_ziptool, output, file) |
215 | | - utils.delete_file(output, file) |
| 239 | + fh, counter, new_file = write_xml_file(elem, fh, output, counter, format) |
216 | 240 | else: |
217 | 241 | data = [el.getchildren() for el in elem if el.tag == 'revision'] |
218 | | - data = flatten_xml_elements(data, page) |
219 | | - utils.write_list_to_csv(data, fh, recursive=False, newline=True) |
| 242 | + data = flatten_xml_elements(data, page, bot_ids) |
| 243 | + if data != None: |
| 244 | + size = 64 * len(data) |
| 245 | + fh, counter, new_file = create_file_handle(fh, output, counter, size, format) |
| 246 | + utils.write_list_to_csv(data, fh, recursive=False, newline=True) |
| 247 | + |
| 248 | + if zip and new_file: |
 | 249 | + file = str(counter - 1) + '.' + format |
| 250 | + utils.zip_archive(settings.path_ziptool, output, file) |
| 251 | + utils.delete_file(output, file) |
220 | 252 | root.clear() # when done parsing a section clear the tree to save memory |
221 | 253 | except SyntaxError: |
222 | 254 | f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding) |
— | — | @@ -225,8 +257,8 @@ |
226 | 258 | fh.close() |
227 | 259 | |
228 | 260 | if __name__ == "__main__": |
229 | | - kwargs = {'output': settings.input_location, |
230 | | - 'input': settings.input_filename, |
| 261 | + kwargs = {'location': settings.input_location, |
| 262 | + 'file': settings.input_filename, |
231 | 263 | 'project':'wiki', |
232 | 264 | 'language_code':'en', |
233 | 265 | 'format': 'tsv' |
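Note: split_file now has two output modes: format='xml' writes numbered chunk files to <location>/chunks, while any other format flattens revisions straight to tab-separated files in <location>/txt (the 64-bytes-per-row size estimate only drives chunk rotation). Usage sketch with hypothetical paths:

    split_file('/data/en/wiki', 'pages_full_en.xml', 'wiki', 'en',
               namespaces=['0'], format='txt', zip=False)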
Index: trunk/tools/editor_trends/etl/models.py |
— | — | @@ -29,18 +29,49 @@ |
30 | 30 | from utils import utils |
31 | 31 | from wikitree import xml |
32 | 32 | |
| 33 | +class TXTFile(object): |
| 34 | + |
| 35 | + def __init__(self, file, location, output, output_file, target, **kwargs): |
| 36 | + self.file = file |
| 37 | + self.location = location |
| 38 | + self.target = target |
| 39 | + self.output = output |
| 40 | + self.output_file = output_file |
| 41 | + for kw in kwargs: |
| 42 | + setattr(self, kw, kwargs[kw]) |
| 43 | + |
| 44 | + def __str__(self): |
| 45 | + return '%s' % (self.file) |
| 46 | + |
| 47 | + def __call__(self, bots): |
| 48 | + self.bots = bots |
| 49 | + self.fr = utils.create_txt_filehandle(self.location, self.file, 'r', settings.encoding) |
| 50 | + self.fw = utils.create_txt_filehandle(self.output, self.output_file, 'w', settings.encoding) |
| 51 | + for line in self.fr: |
| 52 | + line = line.replace('\n', '') |
| 53 | + if line == '': |
| 54 | + continue |
| 55 | + line = line.split('\t') |
| 56 | + self.bots = self.target(line, self.fw, self.bots, self.keys) |
| 57 | + if self.bots == {}: |
| 58 | + break |
| 59 | + self.fr.close() |
| 60 | + self.fw.close() |
| 61 | + return self.bots |
| 62 | + |
33 | 63 | class XMLFileConsumer(models.BaseConsumer): |
34 | 64 | |
35 | 65 | def run(self): |
36 | 66 | while True: |
37 | | - print 'Queue is %s files long...' % (self.task_queue.qsize() - settings.number_of_processes) |
38 | 67 | new_xmlfile = self.task_queue.get() |
39 | 68 | self.task_queue.task_done() |
40 | 69 | if new_xmlfile == None: |
41 | 70 | print 'Swallowed a poison pill' |
42 | 71 | break |
| 72 | + print 'Queue is %s files long...' % self.task_queue.qsize() |
43 | 73 | new_xmlfile() |
44 | 74 | |
| 75 | + |
45 | 76 | class XMLFile(object): |
46 | 77 | def __init__(self, input, output, xml_file, bots, target, output_file=None, **kwargs): |
47 | 78 | self.file = xml_file |
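Note: the new TXTFile class drives line-oriented processing: target is called once per tab-split line and must return the updated bots dict, and an empty dict short-circuits the read loop. Sketch of how it is driven (mirrors its use in bots.py; names and paths are hypothetical):

    def copy_line(line, fw, bots, keys):   # a trivial target
        fw.write('\t'.join(line) + '\n')
        return bots

    txt = TXTFile('0.txt', '/data/en/wiki/txt', '/data/csv', 'out.csv',
                  copy_line, keys=['id', 'name'])
    remaining_bots = txt(bots={'SomeBot': None})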
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -50,9 +50,9 @@ |
51 | 51 | self.progressbar = True |
52 | 52 | self.encoding = 'utf-8' |
53 | 53 | self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte |
54 | | - self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestampformat as generated by the MediaWiki dumps |
| 54 | + self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestamp format as generated by the MediaWiki dumps |
55 | 55 | |
56 | | - self.max_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
 | 56 | + self.max_xmlfile_size = 4096 * 1024 # was 67108864 == 64MB, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for reason |
57 | 57 | self.number_of_processes = cpu_count() * process_multiplier |
58 | 58 | #Change this to match your computers configuration (RAM / CPU) |
59 | 59 | self.minimum_python_version = (2, 6) |
— | — | @@ -70,8 +70,8 @@ |
71 | 71 | self.root = '/' if self.platform != 'Windows' else 'c:\\' |
72 | 72 | self.file_locations = self.set_file_locations() |
73 | 73 | self.max_filehandles = self.determine_max_filehandles_open() |
| 74 | + self.tab_width = 4 if self.platform == 'Windows' else 8 |
74 | 75 | |
75 | | - |
76 | 76 | self.load_configuration() |
77 | 77 | self.set_custom_settings(**kwargs) |
78 | 78 | self.dumpversions = {'0': 'http://www.mediawiki.org/xml/export-0.4/', |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -31,8 +31,9 @@ |
32 | 32 | import os |
33 | 33 | import ctypes |
34 | 34 | import time |
35 | | -import subprocess |
| 35 | +#import subprocess |
36 | 36 | import sys |
| 37 | +import shutil |
37 | 38 | sys.path.append('..') |
38 | 39 | |
39 | 40 | import configuration |
— | — | @@ -248,9 +249,19 @@ |
249 | 250 | return name |
250 | 251 | |
251 | 252 | |
252 | | -def delete_file(location, filename): |
| 253 | +def delete_file(location, filename, directory=False): |
253 | 254 | if check_file_exists(location, filename): |
254 | | - os.remove(os.path.join(location, filename)) |
| 255 | + if not directory: |
| 256 | + try: |
| 257 | + path = os.path.join(location, filename) |
| 258 | + os.remove(path) |
 | 259 | + except OSError, error: |
| 260 | + print error |
| 261 | + else: |
| 262 | + try: |
| 263 | + shutil.rmtree(location) |
| 264 | + except Exception, error: |
| 265 | + print error |
255 | 266 | |
256 | 267 | |
257 | 268 | def check_file_exists(location, filename): |
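Note: with directory=True the function removes the whole location tree via shutil.rmtree and ignores filename; the caller in manage.py passes filename='' for exactly that reason. A stricter variant (a sketch, not the source's API) would join the two first:

    import os
    import shutil

    def delete_path(location, filename='', directory=False):
        path = os.path.join(location, filename)
        if not os.path.exists(path):
            return
        if directory:
            shutil.rmtree(path)   # remove the directory tree at path
        else:
            os.remove(path)       # remove a single file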
Index: trunk/tools/editor_trends/utils/exceptions.py |
— | — | @@ -43,3 +43,10 @@ |
44 | 44 | |
45 | 45 | def __str__(self): |
46 | 46 | print 'You have not installed a program to extract %s archives.' % self.extension |
| 47 | + |
| 48 | +class OutDatedPythonVersionError(Error): |
| 49 | + def __init__(self, version): |
| 50 | + self.version = version |
| 51 | + |
| 52 | + def __str__(self): |
 | 53 | + return 'Please upgrade to Python 2.6 or higher (but not Python 3.x).' |
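Note: the new error carries the offending version tuple, which is why the call site in manage.py passes version when raising it. Usage sketch:

    try:
        raise OutDatedPythonVersionError((2, 5))
    except OutDatedPythonVersionError, error:
        print error   # Please upgrade to Python 2.6 or higher ...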
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -17,18 +17,18 @@ |
18 | 18 | __date__ = '2010-10-21' |
19 | 19 | __version__ = '0.1' |
20 | 20 | |
21 | | -import sqlite3 as sqlite |
| 21 | +#import sqlite3 as sqlite |
22 | 22 | from pymongo import Connection |
23 | 23 | |
24 | 24 | |
25 | | -import configuration |
26 | | -settings = configuration.Settings() |
27 | | -from database import db_settings |
| 25 | +#import configuration |
| 26 | +#settings = configuration.Settings() |
| 27 | +#from database import db_settings |
28 | 28 | |
29 | 29 | |
30 | | -def init_mongo_db(db): |
| 30 | +def init_mongo_db(dbname): |
31 | 31 | connection = Connection() |
32 | | - db = connection[db] |
| 32 | + db = connection[dbname] |
33 | 33 | return db |
34 | 34 | |
35 | 35 | |
— | — | @@ -42,11 +42,12 @@ |
43 | 43 | return db.collection_names() |
44 | 44 | |
45 | 45 | |
46 | | -def cleanup_database(dbname): |
| 46 | +def cleanup_database(dbname, logger): |
47 | 47 | coll = get_collections(dbname) |
48 | 48 | for c in coll: |
49 | 49 | if not c.startswith('system'): |
50 | 50 | drop_collection(dbname, c) |
| 51 | + logger.debug('Deleting collection %s from database %s.' % (c, dbname)) |
51 | 52 | |
52 | 53 | |
53 | 54 | def remove_documents_from_mongo_db(collection, ids): |
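Note: cleanup_database now logs every dropped collection, so a caller must hand it a configured logger. Minimal call sketch (the database name is hypothetical):

    import logging
    logger = logging.getLogger('manager')
    cleanup_database('enwiki_editors', logger)   # drops all non-system collections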
Index: trunk/tools/editor_trends/bots/bots.py |
— | — | @@ -31,6 +31,7 @@ |
32 | 32 | from wikitree import xml |
33 | 33 | from database import db |
34 | 34 | from utils import utils |
| 35 | +#from etl import extract |
35 | 36 | from utils import process_constructor as pc |
36 | 37 | from etl import models |
37 | 38 | import models as botmodels |
— | — | @@ -110,54 +111,51 @@ |
111 | 112 | keys and values to ease writing to a csv file. |
112 | 113 | ''' |
113 | 114 | d = {} |
114 | | - for o in obj: |
115 | | - bot = obj[o] |
116 | | - d[o] = {} |
117 | | - for kw in bot.__dict__.keys(): |
118 | | - if kw not in exclude: |
119 | | - d[o][kw] = getattr(bot, kw) |
| 115 | + for kw in obj.__dict__.keys(): |
| 116 | + if kw not in exclude: |
| 117 | + d[kw] = getattr(obj, kw) |
120 | 118 | return d |
121 | 119 | |
122 | 120 | |
123 | | -def write_bot_list_to_csv(bots): |
| 121 | +def write_bot_list_to_csv(bots, keys): |
124 | 122 | fh = utils.create_txt_filehandle(settings.csv_location, 'bots_ids.csv', 'w', settings.encoding) |
125 | 123 | bot_dict = convert_object_to_dict(bots, exclude=['time', 'written']) |
126 | | - keys = ['id', 'name', 'verified', 'projects'] |
127 | 124 | for bot in bot_dict: |
128 | 125 | bot = bot_dict[bot] |
129 | 126 | utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True) |
130 | 127 | fh.close() |
131 | 128 | |
132 | 129 | |
133 | | -def lookup_bot_userid(xml_nodes, bots): |
| 130 | +def lookup_bot_userid(data, fh, bots, keys): |
134 | 131 | ''' |
135 | 132 | This function is used to find the id's belonging to the different bots that |
136 | 133 | are patrolling the Wikipedia sites. |
137 | 134 | @xml_nodes is a list of xml elements that need to be parsed |
138 | 135 | @bots is a dictionary containing the names of the bots to lookup |
139 | 136 | ''' |
140 | | - revisions = xml_nodes.findall('revision') |
141 | | - for revision in revisions: |
142 | | - contributor = xml.retrieve_xml_node(revision, 'contributor') |
143 | | - username = contributor.find('username') |
144 | | - if username == None or username.text == None: |
145 | | - continue |
146 | | - else: |
147 | | - username = username.text #encode(settings.encoding) |
148 | | - name = username.lower() |
| 137 | + username = data[3] |
| 138 | + if username in bots: |
| 139 | + bot = bots.pop(username) |
| 140 | + setattr(bot, 'id', data[0]) |
| 141 | + setattr(bot, 'verified', True) |
| 142 | + bot = convert_object_to_dict(bot, exclude=['time']) |
| 143 | + utils.write_dict_to_csv(bot, fh, keys, write_key=False, newline=True) |
| 144 | + return bots |
149 | 145 | |
150 | | - #print username.encode('utf-8') |
151 | | - if (username in bots and bots[username].verified == True) or name.find('bot') > -1: |
152 | | - bot = bots.get(username, botmodels.Bot(username, verified=False)) |
153 | | - id = contributor.find('id').text |
154 | | - bot.id = id |
155 | | - bot.name = username |
156 | | - timestamp = revision.find('timestamp').text |
157 | | - if timestamp != None: |
158 | | - timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
159 | | - bot.time[str(timestamp.year)].append(timestamp) |
160 | 146 | |
161 | | - bots[username] = bot |
| 147 | +def create_bot_validation_dataset(data, fh, bots, keys): |
| 148 | + username = data[3].lower() |
| 149 | + #print username.encode('utf-8') |
| 150 | + if username.find('bot') > -1 or username.find('script') > -1: |
| 151 | + bot = bots.get(username, botmodels.Bot(username, verified=False)) |
| 152 | + setattr(bot, 'id', data[0]) |
| 153 | + |
| 154 | + timestamp = data[1] |
| 155 | + if timestamp != None: |
| 156 | + timestamp = utils.convert_timestamp_to_datetime_naive(timestamp) |
| 157 | + bot.time[str(timestamp.year)].append(timestamp) |
| 158 | + bots[username] = bot |
| 159 | + |
162 | 160 | return bots |
163 | 161 | |
164 | 162 | #bot = bots.get('PseudoBot') |
— | — | @@ -165,26 +163,36 @@ |
166 | 164 | #bot.avg_lag_between_edits() |
167 | 165 | |
168 | 166 | |
169 | | -def bot_launcher(language_code, project, single=False, manager=False): |
| 167 | +def bot_launcher(language_code, project, target, action, single=False, manager=False): |
170 | 168 | ''' |
171 | 169 | This function sets the stage to launch bot id detection and collecting data |
172 | 170 | to discover new bots. |
173 | 171 | ''' |
174 | 172 | utils.delete_file(settings.csv_location, 'bots_ids.csv') |
175 | 173 | location = os.path.join(settings.input_location, language_code, project) |
176 | | - input = os.path.join(location, 'chunks') |
177 | | - |
178 | | - files = utils.retrieve_file_list(input, 'xml', mask=None) |
| 174 | + input_xml = os.path.join(location, 'chunks') |
| 175 | + input_txt = os.path.join(location, 'txt') |
| 176 | + files = utils.retrieve_file_list(input_txt, 'txt', mask=None) |
 | 177 | + #files = files[400:405] # debug slice; disabled so the full file list is processed |
179 | 178 | input_queue = pc.load_queue(files, poison_pill=True) |
180 | 179 | tasks = multiprocessing.JoinableQueue() |
181 | 180 | mgr = multiprocessing.Manager() |
| 181 | + keys = ['id', 'name', 'verified', 'projects'] |
| 182 | + |
| 183 | + if action == 'lookup': |
| 184 | + output_file = 'bots_ids.csv' |
| 185 | + bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
| 186 | + else: |
| 187 | + output_file = 'bots_predictionset.csv' |
| 188 | + bots = {} |
| 189 | + |
182 | 190 | #lock = mgr.Lock() |
183 | 191 | if manager: |
184 | 192 | manager = mgr |
185 | | - bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', settings.encoding, manager=manager) |
186 | 193 | |
| 194 | + |
187 | 195 | for file in files: |
188 | | - tasks.put(models.XMLFile(input, settings.csv_location, file, None, lookup_bot_userid)) |
| 196 | + tasks.put(models.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys)) |
189 | 197 | |
190 | 198 | tracker = {} |
191 | 199 | if single: |
— | — | @@ -199,21 +207,22 @@ |
200 | 208 | bot_launcher_multi(tasks) |
201 | 209 | |
202 | 210 | utils.store_object(bots, settings.binary_location, 'bots.bin') |
203 | | - write_bot_list_to_csv(bots) |
204 | | - bot_training_dataset(bots) |
205 | | - store_bots() |
206 | | - if bots != {}: |
207 | | - print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots) |
208 | | - keys = bots.keys() |
209 | | - for key in keys: |
210 | | - try: |
211 | | - print '%s' % key.encode(settings.encoding) |
212 | | - except: |
213 | | - pass |
| 211 | + if action == 'lookup': |
| 212 | + store_bots() |
| 213 | + if bots != {}: |
 | 214 | + print 'The script was unable to retrieve the user ids for the following %s bots:\n' % len(bots) |
| 215 | + keys = bots.keys() |
| 216 | + for key in keys: |
| 217 | + try: |
| 218 | + print '%s' % key.encode(settings.encoding) |
| 219 | + except: |
| 220 | + pass |
| 221 | + else: |
| 222 | + bot_training_dataset(bots) |
| 223 | + #write_bot_list_to_csv(bots, keys) |
214 | 224 | |
215 | 225 | |
216 | 226 | |
217 | | - |
218 | 227 | def bot_training_dataset(bots): |
219 | 228 | fh = utils.create_txt_filehandle(settings.csv_location, 'training_bots.csv', 'w', settings.encoding) |
220 | 229 | keys = bots.keys() |
— | — | @@ -251,8 +260,9 @@ |
252 | 261 | if __name__ == '__main__': |
253 | 262 | language_code = 'en' |
254 | 263 | project = 'wiki' |
255 | | - store_bots() |
| 264 | + #store_bots() |
256 | 265 | #bots = debug_bots_dict() |
257 | 266 | #write_bot_list_to_csv(bots) |
258 | | - #bot_launcher(language_code, project, single=True) |
| 267 | + #language_code, project, lookup_bot_userid, single = False, manager = False |
| 268 | + bot_launcher(language_code, project, create_bot_validation_dataset, action='training', single=True, manager=False) |
259 | 269 | #cProfile.run(bot_launcher(language_code, project, single=True), 'profile') |
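Note: bot_launcher now runs in two modes: action='lookup' resolves ids for the known bots in Bots.csv (writing bots_ids.csv), while any other action builds a prediction set of suspected bots (usernames containing 'bot' or 'script') for the training dataset. Call sketch (single-process):

    bot_launcher('en', 'wiki', lookup_bot_userid, action='lookup', single=True)
    bot_launcher('en', 'wiki', create_bot_validation_dataset,
                 action='training', single=True)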