r89189 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r89188 | r89189 | r89190
Date: 21:50, 30 May 2011
Author: diederik
Status: deferred
Tags:
Comment:
Preparing for Summer of Research
Modified paths:
  • /trunk/tools/editor_trends/analyses/adhoc/bot_detector.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py (modified) (history)
  • /trunk/tools/editor_trends/analyses/analyzer.py (modified) (history)
  • /trunk/tools/editor_trends/classes/languages.py (modified) (history)
  • /trunk/tools/editor_trends/classes/projects.py (modified) (history)
  • /trunk/tools/editor_trends/classes/runtime_settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/settings.py (modified) (history)
  • /trunk/tools/editor_trends/classes/storage.py (modified) (history)
  • /trunk/tools/editor_trends/etl/differ.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/kaggle.py (deleted) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)
  • /trunk/tools/editor_trends/etl/variables.py (modified) (history)
  • /trunk/tools/editor_trends/kaggle/training.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/statistics/stata/ppi.do (modified) (history)
  • /trunk/tools/editor_trends/utils/file_utils.py (modified) (history)
  • /trunk/tools/editor_trends/utils/log.py (modified) (history)
  • /trunk/tools/editor_trends/utils/text_utils.py (modified) (history)

Diff

Index: trunk/tools/editor_trends/analyses/analyzer.py
@@ -17,7 +17,7 @@
1818 __date__ = '2010-12-10'
1919 __version__ = '0.1'
2020
21 -from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process
 21+from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process, cpu_count
2222 from multiprocessing.managers import BaseManager
2323 from Queue import Empty
2424
@@ -141,10 +141,10 @@
142142 del editors
143143
144144 analyzers = [analytics.Analyzer(rts, tasks, result, var, data, plugin, func) for
145 - x in xrange(rts.number_of_processes)]
 145+ x in xrange(cpu_count())]
146146
147147
148 - for x in xrange(rts.number_of_processes):
 148+ for x in xrange(cpu_count()):
149149 tasks.put(None)
150150
151151 pbar = progressbar.ProgressBar(maxval=n).start()
@@ -152,7 +152,7 @@
153153 analyzer.start()
154154
155155
156 - ppills = rts.number_of_processes
 156+ ppills = cpu_count()
157157 while True:
158158 while ppills > 0:
159159 try:
@@ -216,7 +216,7 @@
217217
218218
219219 def launcher():
220 - project, language, parser = manage.init_args_parser()
 220+ project, language, parser = commandline.init_args_parser()
221221 args = parser.parse_args(['django'])
222222 rts = runtime_settings.init_environment('wiki', 'en', args)
223223 generate_chart_data(rts, 'taxonomy_burnout', time_unit='month')
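
These hunks size the worker pool and the number of poison pills with multiprocessing.cpu_count() instead of the configurable rts.number_of_processes. A minimal, self-contained sketch of that pattern, with placeholder task payloads:

    from multiprocessing import JoinableQueue, Process, cpu_count

    def worker(tasks):
        while True:
            task = tasks.get()
            tasks.task_done()
            if task is None:
                break  # poison pill: one per worker, as in the launchers above
            print 'processing %s' % task

    if __name__ == '__main__':
        tasks = JoinableQueue()
        for item in ['page1', 'page2', 'page3']:
            tasks.put(item)
        for x in xrange(cpu_count()):
            tasks.put(None)
        workers = [Process(target=worker, args=(tasks,)) for x in xrange(cpu_count())]
        for w in workers:
            w.start()
        tasks.join()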
Index: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
@@ -244,8 +244,8 @@
245245 '''
246246 This is the launcher that uses multiprocesses.
247247 '''
248 - consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
249 - for x in xrange(settings.number_of_processes):
 248+ consumers = [consumers.XMLFileConsumer(tasks, None) for i in xrange(multiprocessing.cpu_count())]
 249+ for x in xrange(multiprocessing.cpu_count()):
250250 tasks.put(None)
251251
252252 for w in consumers:
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
@@ -159,7 +159,7 @@
160160 min_d = min(data.keys())
161161 max_d = max(data.keys())
162162 match = data[max_d]
163 - matches.append((ppi_editor, match))
 163+ matches.append((ppi_editor, match, max_d))
164164 #remove match to make sure that every matched pair is unique
165165 for editor in distances:
166166 try:
@@ -177,11 +177,12 @@
178178 fh.write('_a\t'.join(vars))
179179 fh.write('\t%s\t' % ('editor_b'))
180180 fh.write('_b\t'.join(vars))
181 - fh.write('\tdelta registration days\tid\n')
 181+ fh.write('\tdelta registration days\tid\teuclid_dist\n')
182182 for i, match in enumerate(matches):
183183 line = []
184184 editor_a = match[0]
185185 editor_b = match[1]
 186+ dist = match[2]
186187 line.append(editor_a)
187188 values_a = [str(obs_a[editor_a][v]) for v in vars]
188189 values_b = [str(obs_b[editor_b][v]) for v in vars]
@@ -191,6 +192,7 @@
192193 dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
193194 line.append(str(dt.days))
194195 line.append(str(i))
 196+ line.append(dist)
195197 line.append('\n')
196198 print line
197199 #line = '\t'.join([str(l).decode('utf-8') for l in line])
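
The matching loop now carries the distance key of each matched pair (max_d, written under the new euclid_dist header) through to the output file. A hedged sketch of the resulting record layout; the editor names and distances here are invented:

    import codecs

    matches = [('alice', 'bob', 0.42), ('carol', 'dave', 1.07)]  # (ppi_editor, match, distance)
    fh = codecs.open('ppi_quality.tsv', 'w', 'utf-8')
    fh.write('editor_a\teditor_b\tid\teuclid_dist\n')
    for i, (editor_a, editor_b, dist) in enumerate(matches):
        fh.write('%s\t%s\t%s\t%s\n' % (editor_a, editor_b, i, dist))
    fh.close()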
Index: trunk/tools/editor_trends/manage.py
@@ -30,6 +30,7 @@
3131 from classes import projects
3232 from classes import runtime_settings
3333 from utils import file_utils
 34+from utils import text_utils
3435 from utils import ordered_dict
3536 from utils import log
3637 from utils import timer
@@ -43,14 +44,113 @@
4445 from analyses import inventory
4546
4647
47 -def init_args_parser():
 48+
 49+def config_launcher(rts, logger):
4850 '''
 51+ Config launcher is used to (re)configure Wikilytics.
 52+ '''
 53+
 54+ pc = projects.ProjectContainer()
 55+ if not os.path.exists('wiki.cfg') or rts.force:
 56+ config = ConfigParser.RawConfigParser()
 57+ project = None
 58+ language = None
 59+ db = None
 60+ valid_hostname = False
 61+ valid_storage = ['mongo', 'cassandra']
 62+ working_directory = raw_input('''Please indicate where you installed
 63+ Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n''' % os.getcwd())
 64+
 65+ input_location = raw_input('''Please indicate where the Wikipedia dump
 66+ files are or will be located.\nDefault is: %s\nPress Enter to
 67+ accept default.\n''' % rts.input_location)
 68+
 69+ base_location = raw_input('''Please indicate where to store all
 70+ Wikilytics project files.\nDefault is: %s\nPress Enter to accept
 71+ default.\n''' % rts.base_location)
 72+
 73+ while db not in valid_storage:
 74+ db = raw_input('''Please indicate what database you are using for storage.\nDefault is: Mongo\n''')
 75+ db = 'mongo' if len(db) == 0 else db.lower()
 76+ if db not in valid_storage:
 77+ print 'Valid choices are: %s' % ','.join(valid_storage)
 78+
 79+ while project not in pc.projects.keys():
 80+ project = raw_input('''Please indicate which project you would like
 81+ to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % rts.project.full_name)
 82+ project = project if len(project) > 0 else rts.project.name
 83+ if project not in pc.projects.keys():
 84+ print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys())
 85+
 86+ while language not in rts.project.valid_languages:
 87+ language = raw_input('''Please indicate which language of project
 88+ %s you would like to analyze.\nDefault is: %s\nPress Enter to accept
 89+ default.\n''' % (rts.project.full_name, rts.language))
 90+ if len(language) == 0:
 91+ language = rts.language.code
 92+ language = language if language in rts.project.valid_languages \
 93+ else rts.language.default
 94+
 95+ while valid_hostname == False:
 96+ master = raw_input('''Please indicate the hostname master of your database
 97+ cluster.\n Default is: %s\nPress Enter to accept default.\n''' % ('localhost'))
 98+ master = 'localhost' if len(master) == 0 else master
 99+ valid_hostname = text_utils.validate_hostname(master)
 100+
 101+ if master != 'localhost':
 102+ valid_hostname = False
 103+ while valid_hostname == False:
 104+ slaves = raw_input('''Please indicate the hostnames of your slaves
 105+ in your database cluster. Separate names using a comma.\n''')
 106+ slaves = slaves.split(',')
 107+ results = []
 108+ for slave in slaves:
 109+ results.append(text_utils.validate_hostname(slave))
 110+ valid_hostname = True if all(results) else False
 111+
 112+ slaves = ','.join(slaves)
 113+ input_location = input_location if len(input_location) > 0 else \
 114+ rts.input_location
 115+ base_location = base_location if len(base_location) > 0 else \
 116+ rts.base_location
 117+ working_directory = working_directory if len(working_directory) > 0 \
 118+ else os.getcwd()
 119+
 120+ config = ConfigParser.RawConfigParser()
 121+ config.add_section('file_locations')
 122+ config.set('file_locations', 'working_directory', working_directory)
 123+ config.set('file_locations', 'input_location', input_location)
 124+ config.set('file_locations', 'base_location', base_location)
 125+ config.add_section('wiki')
 126+ config.set('wiki', 'project', project)
 127+ config.set('wiki', 'language', language)
 128+ config.add_section('storage')
 129+ config.set('storage', 'db', db)
 130+ config.add_section('cluster')
 131+ config.set('cluster', 'master', master)
 132+ config.set('cluster', 'slaves', slaves)
 133+
 134+ fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
 135+ config.write(fh)
 136+ fh.close()
 137+
 138+ log.to_csv(logger, rts, 'New configuration', 'Creating',
 139+ config_launcher,
 140+ working_directory=working_directory,
 141+ input_location=input_location,
 142+ base_location=base_location,
 143+ project=project,
 144+ language=language,)
 145+
 146+
 147+def init_args_parser(language_code=None, project=None):
 148+ '''
49149 Entry point for parsing command line and launching the needed function(s).
50150 '''
51 - language = languages.init()
52 - project = projects.init()
 151+ language = languages.init(language_code)
 152+ project = projects.init(project)
53153 pjc = projects.ProjectContainer()
54 - rts = runtime_settings.RunTimeSettings(project, language)
 154+ #rts = runtime_settings.RunTimeSettings(project, language)
55155
56156 file_choices = {'meta-full': 'stub-meta-history.xml.gz',
57157 'meta-current': 'stub-meta-current.xml.gz',
@@ -78,7 +178,7 @@
79179 parser_config.set_defaults(func=config_launcher)
80180 parser_config.add_argument('-f', '--force',
81181 action='store_true',
82 - help='Reconfigure Editor Toolkit (this will replace wiki.cfg')
 182+ help='Reconfigure Wikilytics (this will replace wiki.cfg)')
83183
84184 #DOWNLOAD
85185 parser_download = subparsers.add_parser('download',
@@ -141,7 +241,7 @@
142242 parser_diff = subparsers.add_parser('diff',
143243 help='Create a Mongo collection containing the diffs between revisions.')
144244 parser_diff.set_defaults(func=diff_launcher)
145 -
 245+
146246 #DJANGO
147247 parser_django = subparsers.add_parser('django')
148248 parser_django.add_argument('-e', '--except',
@@ -192,85 +292,9 @@
193293 %s' % ''.join([f + ',\n' for f in file_choices]),
194294 default=file_choices['meta-full'])
195295
196 - return project, language, parser
 296+ return parser
197297
198298
199 -def config_launcher(rts, logger):
200 - '''
201 - Config launcher is used to reconfigure editor trends toolkit.
202 - '''
203 -
204 - pc = projects.ProjectContainer()
205 - if not os.path.exists('wiki.cfg') or rts.force:
206 - config = ConfigParser.RawConfigParser()
207 - project = None
208 - language = None
209 - db = None
210 - valid_storage = ['mongo', 'cassandra']
211 - working_directory = raw_input('''Please indicate where you installed
212 - Wikilytics.\nCurrent location is %s\nPress Enter to accept default.\n''' % os.getcwd())
213 -
214 - input_location = raw_input('''Please indicate where the Wikipedia dump
215 - files are or will be located.\nDefault is: %s\nPress Enter to
216 - accept default.\n''' % rts.input_location)
217 -
218 - base_location = raw_input('''Please indicate where to store all
219 - Wikilytics project files.\nDefault is: %s\nPress Enter to accept
220 - default.\n''' % rts.base_location)
221 -
222 - while db not in valid_storage:
223 - db = raw_input('Please indicate what database you are using for storage. \nDefault is: Mongo\n')
224 - db = 'mongo' if len(db) == 0 else db.lower()
225 - if db not in valid_storage:
226 - print 'Valid choices are: %s' % ','.join(valid_storage)
227 -
228 - while project not in pc.projects.keys():
229 - project = raw_input('''Please indicate which project you would like
230 - to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % rts.project.full_name)
231 - project = project if len(project) > 0 else rts.project.name
232 - if project not in pc.projects.keys():
233 - print 'Valid choices for a project are: %s' % ','.join(pc.projects.keys())
234 -
235 - while language not in rts.project.valid_languages:
236 - language = raw_input('''Please indicate which language of project
237 - %s you would like to analyze.\nDefault is: %s\nPress Enter to accept
238 - default.\n''' % (rts.project.full_name, rts.language))
239 - if len(language) == 0:
240 - language = rts.language.code
241 - language = language if language in rts.project.valid_languages \
242 - else rts.language.default
243 -
244 - input_location = input_location if len(input_location) > 0 else \
245 - rts.input_location
246 - base_location = base_location if len(base_location) > 0 else \
247 - rts.base_location
248 - working_directory = working_directory if len(working_directory) > 0 \
249 - else os.getcwd()
250 -
251 - config = ConfigParser.RawConfigParser()
252 - config.add_section('file_locations')
253 - config.set('file_locations', 'working_directory', working_directory)
254 - config.set('file_locations', 'input_location', input_location)
255 - config.set('file_locations', 'base_location', base_location)
256 - config.add_section('wiki')
257 - config.set('wiki', 'project', project)
258 - config.set('wiki', 'language', language)
259 - config.add_section('storage')
260 - config.set('storage', 'db', db)
261 -
262 - fh = file_utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
263 - config.write(fh)
264 - fh.close()
265 -
266 - log.to_csv(logger, rts, 'New configuration', 'Creating',
267 - config_launcher,
268 - working_directory=working_directory,
269 - input_location=input_location,
270 - base_location=base_location,
271 - project=project,
272 - language=language,)
273 -
274 -
275299 def downloader_launcher(rts, logger):
276300 '''
277301 This launcher calls the dump downloader to download a Wikimedia dump file.
@@ -343,7 +367,8 @@
344368 stopwatch = timer.Timer()
345369 log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
346370 log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
347 - transformer.transform_editors_multi_launcher(rts)
 371+ #transformer.transform_editors_multi_launcher(rts)
 372+ transformer.transform_editors_single_launcher(rts)
348373 stopwatch.elapsed()
349374 log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
350375 log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
@@ -359,8 +384,8 @@
360385 log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish')
361386 log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher)
362387
363 -
364388
 389+
365390 def dataset_launcher(rts, logger):
366391 '''
367392 Dataset launcher is the entry point to generate datasets from the command
@@ -414,8 +439,11 @@
415440 '''
416441 This function initializes the command line parser.
417442 '''
418 - project, language, parser, = init_args_parser()
 443+ parser = init_args_parser()
419444 args = parser.parse_args()
 445+ language = languages.init()
 446+ project = projects.init()
 447+
420448 rts = runtime_settings.RunTimeSettings(project, language, args)
421449 #initialize logger
422450 logger = logging.getLogger('manager')
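
config_launcher, now living above init_args_parser, persists the interview answers with ConfigParser and gains a cluster section for the master and slave hostnames. A minimal round-trip sketch; the paths and hostnames are placeholders:

    import ConfigParser

    config = ConfigParser.RawConfigParser()
    config.add_section('file_locations')
    config.set('file_locations', 'working_directory', '/opt/wikilytics')
    config.add_section('wiki')
    config.set('wiki', 'project', 'wiki')
    config.set('wiki', 'language', 'en')
    config.add_section('storage')
    config.set('storage', 'db', 'mongo')
    config.add_section('cluster')
    config.set('cluster', 'master', 'localhost')
    config.set('cluster', 'slaves', 'db1,db2')

    fh = open('wiki.cfg', 'wb')
    config.write(fh)
    fh.close()

    # settings.py reads the new section back:
    config = ConfigParser.RawConfigParser()
    config.read('wiki.cfg')
    print config.get('cluster', 'master')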
Index: trunk/tools/editor_trends/etl/kaggle.py
@@ -1,49 +0,0 @@
2 -#!/usr/bin/python
3 -# -*- coding: utf-8 -*-
4 -'''
5 -Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
6 -This program is free software; you can redistribute it and/or
7 -modify it under the terms of the GNU General Public License version 2
8 -as published by the Free Software Foundation.
9 -This program is distributed in the hope that it will be useful,
10 -but WITHOUT ANY WARRANTY; without even the implied warranty of
11 -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 -See the GNU General Public License for more details, at
13 -http://www.fsf.org/licenses/gpl.html
14 -'''
15 -
16 -__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
17 -__email__ = 'dvanliere at gmail dot com'
18 -__date__ = '2011-04-12'
19 -__version__ = '0.1'
20 -
21 -import sys
22 -
23 -if '..' not in sys.path:
24 - sys.path.append('..')
25 -
26 -from utils import file_utils
27 -
28 -
29 -def launcher():
30 - location = '/home/diederik/wikimedia/en/wiki/kaggle_training/'
31 - #location = 'C:\\wikimedia\\en\\wiki\\txt'
32 - files = file_utils.retrieve_file_list(location, extension='csv')
33 - files.sort()
34 - dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w', 'utf-8')
35 - for filename in files:
36 - if not filename.startswith('comments') and \
37 - not filename.startswith('articles') and not filename.startswith('dataset'):
38 - fh = file_utils.create_txt_filehandle(location, filename, 'r', 'utf-8')
39 - print fh
40 - for line in fh:
41 - data = line.split('\t')
42 - username = data[3].lower()
43 - if username.endswith('bot'):
44 - continue
45 - else:
46 - dataset.write(line)
47 - fh.close()
48 - dataset.close()
49 -
50 -launcher()
Index: trunk/tools/editor_trends/etl/variables.py
@@ -275,20 +275,20 @@
276276 Determine the id of a revision
277277 '''
278278 if revision_id != None:
279 - return revision_id.text
 279+ return int(revision_id.text)
280280 else:
281281 return None
282282
283283
284 -def extract_comment_text(revision_id, revision):
 284+def extract_comment_text(revision, xml_namespace):
285285 '''
286286 Extract the comment associated with an edit.
287287 '''
288 - comment = {}
289 - text = revision.find('comment')
290 - if text != None and text.text != None:
291 - comment[revision_id] = text.text.encode('utf-8')
292 - return comment
 288+ comment_text = revision.find('%s%s' % (xml_namespace, 'comment'))
 289+ if comment_text != None and comment_text.text != None:
 290+ return comment_text.text
 291+ else:
 292+ return None
293293
294294
295295 def create_namespace_dict(siteinfo, xml_namespace):
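
extract_comment_text now takes the revision element plus the XML namespace and returns the comment as plain text (or None) instead of a dict keyed by revision id. A self-contained sketch; the export namespace URI is assumed for illustration:

    from xml.etree.cElementTree import fromstring

    xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}'  # assumed version
    revision = fromstring(
        '<revision xmlns="http://www.mediawiki.org/xml/export-0.4/">'
        '<comment>fix typo</comment></revision>')

    comment_text = revision.find('%s%s' % (xml_namespace, 'comment'))
    if comment_text is not None and comment_text.text is not None:
        print comment_text.text  # fix typo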
Index: trunk/tools/editor_trends/etl/differ.py
@@ -17,23 +17,35 @@
1818 __date__ = '2011-04-10'
1919 __version__ = '0.1'
2020
 21+
 22+'''
 23+This script generates diffs of edits for the Talk, User Talk and Wikipedia Talk
 24+pages of a Wikipedia project. These diffs are stored in JSON files and then
 25+imported into MongoDB.
 26+'''
 27+import pprint
2128 import json
2229 import cStringIO
2330 import codecs
2431 import sys
2532 import os
2633 import difflib
 34+import bson
2735 from xml.etree.cElementTree import iterparse, dump
2836 from multiprocessing import JoinableQueue, Process, cpu_count
2937 from datetime import datetime
 38+from copy import deepcopy
3039
3140
3241 if '..' not in sys.path:
3342 sys.path.append('../')
3443
3544 from utils import file_utils
 45+from utils import text_utils
3646 from etl import variables
3747 from classes import exceptions
 48+from classes import storage
 49+from classes import runtime_settings
3850
3951
4052 def parse_xml(fh, format, process_id, location):
@@ -50,13 +62,13 @@
5163 context = iterparse(fh, events=(start, end))
5264 context = iter(context)
5365
54 - article = {}
 66+
 67+ revisions = []
5568 count_articles = 0
5669 id = False
5770 ns = False
5871 parse = False
59 - rev1 = None
60 - rev2 = None
 72+ prev_rev_text = None
6173 file_id, fh_output = None, None
6274
6375 try:
@@ -80,11 +92,11 @@
8193 parsing this article, else it will skip this article.
8294 '''
8395 title = variables.parse_title(elem)
84 - article['title'] = title
8596 current_namespace = variables.determine_namespace(title, namespaces, include_ns)
8697 if current_namespace == 1 or current_namespace == 3 or current_namespace == 5:
8798 parse = True
88 - article['namespace'] = current_namespace
 99+ #article['namespace'] = current_namespace
 100+ title = title.replace(namespaces[current_namespace], '')
89101 count_articles += 1
90102 if count_articles % 10000 == 0:
91103 print 'Worker %s parsed %s articles' % (process_id, count_articles)
@@ -105,23 +117,32 @@
106118 timestamp = elem.find('%s%s' % (xml_namespace, 'timestamp')).text
107119 contributor = elem.find('%s%s' % (xml_namespace, 'contributor'))
108120 editor = variables.parse_contributor(contributor, None, xml_namespace)
 121+ text = variables.extract_revision_text(elem, xml_namespace)
 122+ comment = variables.extract_comment_text(elem, xml_namespace)
109123 if editor:
110124 rev_id = variables.extract_revision_id(rev_id)
 125+ if prev_rev_text == None:
 126+ diff = text
 127+ prev_rev_text = deepcopy(text)
 128+ if prev_rev_text != None:
 129+ #print text[0:20], prev_rev_text[0:20]
 130+ diff = diff_revision(prev_rev_text, text)
111131
112 - if rev1 == None and rev2 == None:
113 - diff = variables.extract_revision_text(elem, xml_namespace)
114 - rev1 = elem
115 - if rev1 != None and rev2 != None:
116 - diff = diff_revision(rev1, rev2, xml_namespace)
 132+ if diff != None:
 133+ timestamp = text_utils.convert_timestamp_to_datetime_utc(timestamp)
 134+ timestamp = timestamp.isoformat()
 135+ revision = dict(rev_id=rev_id, title=title,
 136+ timestamp=timestamp,
 137+ diff=diff, comment=comment,
 138+ id=editor['id'],
 139+ username=editor['username'],
 140+ article_id=article_id,
 141+ ns=current_namespace)
 142+ revisions.append(revision)
117143
118 - article[rev_id] = {}
119 - article[rev_id].update(editor)
120 - article[rev_id]['timestamp'] = timestamp
121 - article[rev_id]['diff'] = diff
122 -
123144 clear = True
124145 if clear:
125 - rev2 = rev1
 146+ prev_rev_text = deepcopy(text)
126147 elem.clear()
127148 else:
128149 elem.clear()
@@ -130,7 +151,7 @@
131152 '''
132153 Determine id of article
133154 '''
134 - article['article_id'] = elem.text
 155+ article_id = int(elem.text)
135156 id = True
136157 elem.clear()
137158
@@ -140,17 +161,16 @@
141162 memory.
142163 '''
143164 elem.clear()
144 - #write diff of text to file
 165+
145166 if parse:
146 - #print article
147 - fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format)
148 - write_diff(fh_output, article, format)
 167+ #write diff of text to file
 168+ if len(revisions) > 0:
 169+ fh_output, file_id = assign_filehandle(fh_output, file_id, location, process_id, format)
 170+ write_diff(fh_output, revisions, format)
 171+
149172 #Reset all variables for next article
150 - article = {}
151 - if rev1 != None:
152 - rev1.clear()
153 - if rev2 != None:
154 - rev2.clear()
 173+ revisions = []
 174+ prev_rev_text = None
155175 id = False
156176 parse = False
157177
@@ -181,14 +201,47 @@
182202
183203 return fh, file_id
184204
 205+
185206 def write_xml_diff(fh, article):
186207 pass
187208
188209
189 -def write_json_diff(fh, article):
190 - json.dump(article, fh)
 210+def write_json_diff(fh, revisions):
 211+ fh.write('\nStart new JSON object\n')
 212+ json.dump(revisions, fh, indent=4, sort_keys=True)
191213
192214
 215+def store_json_diffs(rts):
 216+ files = os.listdir(rts.diffs)
 217+ print files, rts.diffs
 218+ db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
 219+ buffer = cStringIO.StringIO()
 220+
 221+ for filename in files:
 222+ fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
 223+ for line in fh:
 224+ if line.startswith('\n') or line.startswith('Start'):
 225+ obj = buffer.getvalue()
 226+ if obj != '':
 227+ obj = json.loads(obj)
 228+ obj[0]['article_id'] = int(obj[0]['article_id'])
 229+ for key, value in obj[0].iteritems():
 230+ if type(value) == type(dict()):
 231+ value['timestamp'] = datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
 232+ obj[0][key] = value
 233+ obj = obj[0]
 234+ #print obj
 235+ #print len(obj)
 236+ try:
 237+ db.save(obj)
 238+ except bson.errors.InvalidDocument, error:
 239+ print error
 240+ buffer = cStringIO.StringIO()
 241+ else:
 242+ buffer.write(line)
 243+ fh.close()
 244+
 245+
193246 def write_diff(fh, article, format):
194247 if format == 'xml':
195248 write_xml_diff(fh, article)
@@ -198,23 +251,47 @@
199252 raise exceptions.OutputNotSupported()
200253
201254
202 -def diff_revision(rev1, rev2, xml_namespace):
203 - buffer = cStringIO.StringIO()
204 - if rev1.text != None and rev2.text != None:
205 - diff = difflib.unified_diff(rev1.text, rev2.text, n=0, lineterm='')
 255+def diff_revision(rev1, rev2):
 256+ if rev1 == None:
 257+ rev1 = ''
 258+ if rev2 == None:
 259+ rev2 = ''
 260+ if len(rev1) != len(rev2):
 261+ buffer = cStringIO.StringIO()
 262+ rev1 = rev1.splitlines(1)
 263+ rev2 = rev2.splitlines(2)
 264+
 265+ diff = difflib.unified_diff(rev1, rev2, n=0, lineterm='')
206266 for line in diff:
207267 if len(line) > 3:
208 - print line
209 - buffer.write(line)
 268+ #print line
 269+ buffer.write(line.encode('utf-8'))
210270
211 - return buffer.getvalue()
 271+ diff = buffer.getvalue()
212272
 273+ if diff == '':
 274+ return None
 275+ else:
 276+ return diff
 277+ else:
 278+ return None
 279+
 280+
 281+def store_diffs_debug(rts):
 282+ db = storage.init_database(rts)
 283+ files = os.listdir(rts.diffs)
 284+ for filename in files:
 285+ fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
 286+ diffs = json.load(fh)
 287+ db.insert(diffs)
 288+ fh.close()
 289+
 290+
213291 def stream_raw_xml(input_queue, process_id, rts, format):
214292 '''
215293 This function fetches an XML file from the queue and launches the processor.
216294 '''
217295 t0 = datetime.now()
218 - file_id = 0
219296
220297 while True:
221298 filename = input_queue.get()
@@ -225,7 +302,7 @@
226303
227304 print filename
228305 fh = file_utils.create_streaming_buffer(filename)
229 - parse_xml(fh, format, process_id, rts.input_location)
 306+ parse_xml(fh, format, process_id, rts.diffs)
230307 fh.close()
231308
232309 t1 = datetime.now()
@@ -266,7 +343,14 @@
267344
268345 input_queue.join()
269346
 347+ store_json_diffs(rts)
 348+ db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
 349+ db.add_index('title')
 350+ db.add_index('timestamp')
 351+ db.add_index('username')
 352+ db.add_index('ns')
270353
 354+
271355 def launcher_simple():
272356 location = 'c:\\wikimedia\\nl\\wiki\\'
273357 output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\'
@@ -311,5 +395,6 @@
312396
313397
314398 if __name__ == '__main__':
 399+ #read_json_diffs()
315400 launcher_simple()
316401 #debug()
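
diff_revision now diffs the raw revision texts rather than the XML elements, buffering only the changed lines. A simplified sketch of the same idea; it drops the unified-diff headers by prefix where the code above filters on line length:

    import difflib
    import cStringIO

    def diff_revision(rev1, rev2):
        rev1 = (rev1 or '').splitlines(True)  # keep line endings
        rev2 = (rev2 or '').splitlines(True)
        buffer = cStringIO.StringIO()
        for line in difflib.unified_diff(rev1, rev2, n=0, lineterm=''):
            if line.startswith(('---', '+++', '@@')):
                continue
            buffer.write(line.encode('utf-8'))
        diff = buffer.getvalue()
        return diff if diff != '' else None

    print diff_revision('one\ntwo\n', 'one\nthree\n')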
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -22,8 +22,8 @@
2323 parsing the XML on the fly and extracting & constructing the variables that are
2424 need for subsequent analysis. The extract module is initialized using an
2525 instance of RunTimeSettings and the most important parameters are:
26 -The name of project\n
27 -The language of the project\n
 26+The name of project
 27+The language of the project
2828 The location where the dump files are stored
2929 '''
3030
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -338,11 +338,11 @@
339339 db_dataset = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
340340 db_dataset.drop_collection()
341341 editors = db_raw.retrieve_editors()
342 - return editors
 342+ return editors, db_raw, db_dataset
343343
344344
345345 def transform_editors_multi_launcher(rts):
346 - editors = setup_database(rts)
 346+ editors, db_raw, db_dataset = setup_database(rts)
347347 n = editors.size()
348348 result = queue.JoinableRetryQueue()
349349 pbar = progressbar.ProgressBar(maxval=n).start()
@@ -372,7 +372,7 @@
373373
374374 def transform_editors_single_launcher(rts):
375375 print rts.dbname, rts.editors_raw
376 - editors = setup_database(rts)
 376+ editors, db_raw, db_dataset = setup_database(rts)
377377 n = editors.size()
378378 pbar = progressbar.ProgressBar(maxval=n).start()
379379
@@ -384,7 +384,7 @@
385385 editors.task_done()
386386 if editor == None:
387387 break
388 - editor = Editor(rts, editor)
 388+ editor = Editor(rts, editor, db_raw, db_dataset)
389389 editor()
390390
391391 pbar.update(pbar.currval + 1)
Index: trunk/tools/editor_trends/statistics/stata/ppi.do
@@ -1,5 +1,11 @@
22 clear
33 insheet using "C:\Users\diederik.vanliere\Desktop\ppi_quality.csv"
 4+
 5+gen diff_character_count = character_count_a - character_count_b
 6+gen diff_cum_edit_count_main_ns = cum_edit_count_main_ns_a- cum_edit_count_main_ns_b
 7+gen diff_cum_edit_count_other_ns = cum_edit_count_other_ns_a- cum_edit_count_other_ns_b
 8+gen diff_article_count = article_count_a- article_count_b
 9+
410 label var character_count_a "PPI editor"
511 label var character_count_b "Regular editor"
612
Index: trunk/tools/editor_trends/kaggle/training.py
@@ -17,23 +17,35 @@
1818 __date__ = '2011-04-12'
1919 __version__ = '0.1'
2020
 21+import os
 22+import sys
 23+import cPickle
2124 import codecs
22 -import os
2325 from datetime import datetime
24 -import json
 26+sys.path.append('../')
2527
26 -location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction'
 28+from classes import storage
 29+
 30+location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
2731 files = os.listdir(location)
2832 files.reverse()
29 -dataset = codecs.open('training.tsv', 'w', 'utf-8')
 33+
 34+max_size = 2147483648
 35+max_size_reached = False
 36+
3037 t0 = datetime.now()
31 -max_size = 2147483648
3238 titles = {}
3339 ids = set()
 40+dates = {}
 41+edits = {}
 42+ignore_ids = set()
3443 size = 0
3544 cnt_obs = 0
36 -max_size_reached = False
 45+cutoff_date = datetime(2010, 8, 31)
3746
 47+print 'Constructing training dataset...'
 48+db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
 49+dataset = codecs.open('training.tsv', 'w', 'utf-8')
3850 for filename in files:
3951 if not filename.startswith('comments') and not filename.startswith('articles'):
4052 fh = codecs.open(os.path.join(location, filename))
@@ -46,13 +58,25 @@
4759 continue
4860 if line[10] == '1':
4961 continue
 62+ timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
 63+ if timestamp > cutoff_date:
 64+ continue
5065 username = line[3].lower()
51 - if username.endswith('bot'):
 66+ if username.endswith('bot') or username.find('script') > -1:
5267 #line[10] = '1'
5368 continue
 69+ id = line[2]
 70+ if id not in ids and id not in ignore_ids:
 71+ res = db.find_one('editor', id)
 72+ if res == None:
 73+ ignore_ids.add(id)
 74+ continue
5475 cnt_obs += 1
5576 title_id = line[1]
56 - ids.add(line[2])
 77+ ids.add(id)
 78+ simple_date = '%s-%s' % (timestamp.year, timestamp.month)
 79+ dates.setdefault(simple_date, 0)
 80+ dates[simple_date] += 1
5781 title = line.pop(5)
5882 titles[title_id] = title
5983 line.append('\n')
@@ -64,20 +88,54 @@
6589
6690 dataset.close()
6791
 92+print 'Constructing title dataset...'
6893 fh = codecs.open('titles.tsv', 'w', 'utf-8')
6994 for id, title in titles.iteritems():
7095 fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
7196 fh.close()
7297
73 -fh = codecs.open('ids.json', 'w', 'utf-8')
74 -json.dump(ids, fh)
75 -#for id in ids:
76 -#fh.write('%s\n' % (id.decode('utf-8')))
77 -#fh.write('%s\n' % (json.du)
 98+
 99+print 'Constructing solution dataset...'
 100+x = 0
 101+fh = codecs.open('solutions.tsv', 'w', 'utf-8')
 102+for id in ids:
 103+ if id not in ignore_ids:
 104+ obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
 105+ if obs != None:
 106+ x += 1
 107+ n = obs['cum_edit_count_main_ns']
 108+ fh.write('%s,%s\n' % (id.decode('utf-8'), n))
 109+ edits.setdefault(n, 0)
 110+ edits[n] += 1
 111+ else:
 112+ print id
78113 fh.close()
79114
 115+print 'Storing date histogram'
 116+fh = open('histogram_dates.bin', 'wb')
 117+cPickle.dump(dates, fh)
 118+fh.close()
 119+
 120+
 121+fh = open('histogram_dates.tsv', 'w')
 122+for date, n in dates.iteritems():
 123+ fh.write('%s\t%s\n' % (date, n))
 124+fh.close()
 125+
 126+
 127+print 'Storing edit histogram'
 128+fh = open('histogram_edits.bin', 'wb')
 129+cPickle.dump(edits, fh)
 130+fh.close()
 131+
 132+fh = open('histogram_edits.tsv', 'w')
 133+for edit, n in edits.iteritems():
 134+ fh.write('%s\t%s\n' % (edit, n))
 135+fh.close()
 136+
 137+
80138 t1 = datetime.now()
81 -print 'Descriptives:\n'
82 -print 'Number of editors: %s' % len(ids)
 139+print 'Descriptives:'
 140+print 'Number of editors: %s' % x
83141 print 'Number of edits: %s' % cnt_obs
84142 print 'It took %s to construct the Kaggle training set' % (t1 - t0)
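
training.py now skips edits after a cutoff date, checks each editor id against the enwiki_editors_dataset collection, and pickles per-month and per-edit-count histograms. A compact sketch of the date filter and histogram bookkeeping, with invented timestamps:

    import cPickle
    from datetime import datetime

    cutoff_date = datetime(2010, 8, 31)
    dates = {}
    for ts in ['2010-07-01T12:00:00Z', '2010-07-15T08:00:00Z', '2010-09-02T09:30:00Z']:
        timestamp = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ')
        if timestamp > cutoff_date:
            continue  # edits after the cutoff stay out of the training set
        simple_date = '%s-%s' % (timestamp.year, timestamp.month)
        dates.setdefault(simple_date, 0)
        dates[simple_date] += 1

    fh = open('histogram_dates.bin', 'wb')
    cPickle.dump(dates, fh)
    fh.close()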
Index: trunk/tools/editor_trends/classes/projects.py
@@ -107,9 +107,12 @@
108108 pc = ProjectContainer()
109109 pc.supported_projects()
110110
111 -def init():
 111+def init(project=None):
112112 pc = ProjectContainer()
113 - return pc.get_project('wiki')
 113+ if project:
 114+ return pc.get_project(project)
 115+ else:
 116+ return pc.get_project('wiki')
114117
115118 if __name__ == '__main__':
116119 debug()
Index: trunk/tools/editor_trends/classes/settings.py
@@ -17,11 +17,6 @@
1818 __date__ = '2010-10-21'
1919 __version__ = '0.1'
2020
21 -'''
22 -This file contains settings that are used for constructing and analyzing
23 -the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
24 -'''
25 -
2621 from multiprocessing import cpu_count
2722 import ConfigParser
2823 import os
@@ -73,7 +68,7 @@
7469 #Change this to match your computers configuration (RAM / CPU)
7570 # I want to get rid off these two variables.
7671 self.number_of_processes = cpu_count()
77 - self.windows_register = {'7z.exe': 'Software\\7-Zip'}
 72+ #self.windows_register = {'7z.exe': 'Software\\7-Zip'}
7873
7974 self.wp_dump_location = 'http://dumps.wikimedia.org'
8075
@@ -107,6 +102,8 @@
108103 self.default_project = config.get('wiki', 'project')
109104 self.default_language = config.get('wiki', 'language')
110105 self.storage = config.get('storage', 'db')
 106+ self.master = config.get('cluster', 'master')
 107+ self.slaves = config.get('cluster', 'slaves')
111108 return True
112109 except Exception, error:
113110 #raise exceptions.GenericMessage('corrupted_config')
Index: trunk/tools/editor_trends/classes/runtime_settings.py
@@ -27,6 +27,9 @@
2828 import datetime
2929 import time
3030
 31+if '..' not in sys.path:
 32+ sys.path.append('../')
 33+
3134 from settings import Settings
3235 from analyses import inventory
3336 from classes import exceptions
@@ -48,49 +51,49 @@
4952 self.language = language
5053 self.dbname = 'wikilytics'
5154
52 - if args:
53 - self.args = args
54 - self.hash = self.secs_since_epoch()
55 - #print self.settings.input_location
56 - #print self.get_value('location')
57 - self.project = self.update_project_settings()
58 - self.language = self.update_language_settings()
 55+ #if args:
 56+ self.args = args
 57+ self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month')
 58+ #print self.settings.input_location
 59+ #print self.get_value('location')
 60+ self.project = self.update_project_settings()
 61+ self.language = self.update_language_settings()
5962
60 - self.input_location = self.set_input_location()
61 - self.output_location = self.set_output_location()
 63+ self.input_location = self.set_input_location()
 64+ self.output_location = self.set_output_location()
6265
63 - self.plugins = self.set_plugin()
64 - self.keywords = self.split_keywords()
65 - self.namespaces = self.get_namespaces()
 66+ self.plugins = self.set_plugin()
 67+ self.keywords = self.split_keywords()
 68+ self.namespaces = self.get_namespaces()
6669
67 - self.kaggle = self.get_value('kaggle')
68 - self.function = self.get_value('func')
69 - self.ignore = self.get_value('except')
70 - self.force = self.get_value('force')
71 - self.analyzer_collection = self.get_value('collection')
 70+ #self.kaggle = self.get_value('kaggle')
 71+ self.function = self.get_value('func')
 72+ self.ignore = self.get_value('except')
 73+ self.force = self.get_value('force')
 74+ self.analyzer_collection = self.get_value('collection')
7275
73 - self.dataset = os.path.join(self.dataset_location, self.project.name)
74 - self.txt = os.path.join(self.output_location, 'txt')
75 - self.sorted = os.path.join(self.output_location, 'sorted')
76 - self.diffs = os.path.join(self.output_location, 'diffs')
 76+ self.dataset = os.path.join(self.dataset_location, self.project.name)
 77+ self.txt = os.path.join(self.output_location, 'txt')
 78+ self.sorted = os.path.join(self.output_location, 'sorted')
 79+ self.diffs = os.path.join(self.output_location, 'diffs')
7780
78 - self.directories = [self.output_location,
79 - self.txt,
80 - self.sorted,
81 - self.dataset,
82 - self.diffs]
83 - self.verify_environment(self.directories)
 81+ self.directories = [self.output_location,
 82+ self.txt,
 83+ self.sorted,
 84+ self.dataset,
 85+ self.diffs]
 86+ self.verify_environment(self.directories)
8487
85 - #Wikidump file related variables
86 - self.dump_filename = self.generate_wikidump_filename()
87 - self.dump_relative_path = self.set_dump_path()
88 - self.dump_absolute_path = self.set_dump_path(absolute=True)
 88+ #Wikidump file related variables
 89+ self.dump_filename = self.generate_wikidump_filename()
 90+ self.dump_relative_path = self.set_dump_path()
 91+ self.dump_absolute_path = self.set_dump_path(absolute=True)
8992
90 - #Collection names
91 - self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
92 - self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
93 - self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
94 - self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
 93+ #Collection names
 94+ self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name)
 95+ self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name)
 96+ self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name)
 97+ self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name)
9598
9699
97100
@@ -239,7 +242,7 @@
240243 '''
241244 default = self.project
242245 proj = self.get_value('project')
243 - if proj != 'wiki':
 246+ if proj != default:
244247 pc = projects.ProjectContainer()
245248 proj = pc.get_project(proj)
246249 return proj
@@ -281,7 +284,7 @@
282285 return ['0'] #Assume that the mainspace is of interest
283286
284287
285 -def init_environment(project, language_code, args):
 288+def init_environment(project, language_code):
286289 '''
287290 Initialize an instance of RuntimeSettings.
288291 '''
@@ -289,8 +292,9 @@
290293 project = pjc.get_project(project)
291294 lnc = languages.LanguageContainer()
292295 language = lnc.get_language(language_code)
293 -
294 - args.language = language.name
295 - args.project = project.name
 296+ parser = init_args_parser(language_code, project)
 297+ args = parser.parse_args(['django'])
 298+ #args.language = language.name
 299+ #args.project = project.name
296300 rts = RunTimeSettings(project, language, args)
297301 return rts
Index: trunk/tools/editor_trends/classes/storage.py
@@ -102,9 +102,14 @@
103103 This class provides the functionality to talk to a MongoDB backend including
104104 inserting, finding, and updating data.
105105 '''
106 - def __init__(self, dbname, collection):
 106+ def __init__(self, dbname, collection, master=None, slaves=[]):
 107+ if master == None:
 108+ self.master = 'localhost'
 109+ else:
 110+ self.master = master
 111+ self.slaves = slaves
 112+ self.port = 27017
107113 super(Mongo, self).__init__(dbname, collection)
108 - self.port = 27017
109114
110115 @classmethod
111116 def is_registrar_for(cls, storage):
@@ -114,8 +119,16 @@
115120 return storage == 'mongo'
116121
117122 def connect(self):
118 - db = pymongo.Connection()
119 - return db[self.dbname]
 123+ master = pymongo.Connection(host=self.master, port=self.port)
 124+ if self.master == 'localhost':
 125+ return master[self.dbname]
 126+ else:
 127+ slave_connections = []
 128+ for slave in self.slaves:
 129+ slave = pymongo.Connection(host=slave, port=self.port)
 130+ slave_connections.append(slave)
 131+ master_slave_connection = pymongo.MasterSlaveConnection(master, slave_connections)
 132+ return master_slave_connection[self.dbname]
120133
121134 def save(self, data):
122135 assert isinstance(data, dict), 'You need to feed me dictionaries.'
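
Mongo.connect now builds a master/slave connection when the configured master is not localhost. A hedged sketch; depending on the pymongo version, MasterSlaveConnection may need to be imported from pymongo.master_slave_connection rather than referenced off the package as the code above does, and the hostnames are placeholders:

    import pymongo
    from pymongo.master_slave_connection import MasterSlaveConnection

    master = pymongo.Connection(host='localhost', port=27017)
    slaves = [pymongo.Connection(host=h, port=27017) for h in ('db1', 'db2')]
    conn = MasterSlaveConnection(master, slaves)
    db = conn['wikilytics']  # reads may be spread over the slaves; writes go to the master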
Index: trunk/tools/editor_trends/classes/languages.py
@@ -679,9 +679,12 @@
680680 print abbr
681681 print len(abbr)
682682
683 -def init():
 683+def init(language_code=None):
684684 lnc = LanguageContainer()
685 - return lnc.languages[lnc.default]
 685+ if language_code:
 686+ return lnc.languages[language_code]
 687+ else:
 688+ return lnc.languages[lnc.default]
686689
687690 if __name__ == '__main__':
688691 init()
Index: trunk/tools/editor_trends/utils/file_utils.py
@@ -173,9 +173,10 @@
174174 '''Create a filehandle for text file with utf-8 encoding'''
175175 filename = str(filename)
176176 if not filename.endswith('.csv'):
177 - filename = construct_filename(filename, '.csv')
 177+ if filename.find('.') == -1:
 178+ filename = construct_filename(filename, '.csv')
178179 path = os.path.join(location, filename)
179 - return codecs.open(path, mode, encoding='utf-8')
 180+ return codecs.open(path, mode, encoding)
180181
181182
182183 def create_streaming_buffer(path):
@@ -189,7 +190,8 @@
190191 fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True,
191192 stdout=subprocess.PIPE, bufsize=65535).stdout
192193 elif extension == '.xml':
193 - fh = create_txt_filehandle(path, None, 'r', 'utf-8')
 194+ location, filename = os.path.split(path)
 195+ fh = create_txt_filehandle(location, filename, 'r', 'utf-8')
194196 else:
195197 raise exceptions.CompressedFileNotSupported(extension)
196198 return fh
@@ -247,6 +249,7 @@
248250 os.utime(path, (mod_rem, mod_rem))
249251 #sraise exceptions.NotYetImplementedError(set_modified_data)
250252
 253+
251254 def get_modified_date(location, filename):
252255 '''determine the date the file was originally created'''
253256 path = os.path.join(location, filename)
Index: trunk/tools/editor_trends/utils/log.py
@@ -31,11 +31,9 @@
3232 def to_db(rts, jobtype, task, timer, event='start'):
3333 db = storage.init_database(rts.storage, rts.dbname, 'jobs')
3434 created = datetime.datetime.now()
35 - hash = '%s_%s' % (rts.project, rts.hash)
 35+ job = db.find_one('hash', rts.id)
3636
37 - job = db.find_one('hash', hash)
38 -
39 - data = {'hash': hash,
 37+ data = {'hash': rts.id,
4038 'created': created,
4139 'jobtype': jobtype,
4240 'in_progress': True,
@@ -60,7 +58,7 @@
6159 t['start'] = timer.t0
6260 t['in_progress'] = True
6361 tasks[task] = t
64 - db.update('hash', hash, {'$set': {'tasks': tasks}})
 62+ db.update('hash', rts.id, {'$set': {'tasks': tasks}})
6563 #coll.update({'hash': hash}, {'$set': {'tasks': tasks}})
6664 elif event == 'finish':
6765 t['finish'] = timer.t1
@@ -68,11 +66,11 @@
6967 tasks[task] = t
7068 if task == 'transform' or jobtype == 'chart':
7169 #final task, set entire task to finished
72 - db.update('hash', hash, {'$set': {'tasks': tasks,
 70+ db.update('hash', rts.id, {'$set': {'tasks': tasks,
7371 'in_progress': False,
7472 'finished': True}})
7573 else:
76 - db.update('hash', hash, {'$set': {'tasks': tasks}})
 74+ db.update('hash', rts.id, {'$set': {'tasks': tasks}})
7775
7876
7977 def to_csv(logger, settings, message, verb, function, **kwargs):
Index: trunk/tools/editor_trends/utils/text_utils.py
@@ -20,6 +20,7 @@
2121 import datetime
2222 import time
2323 import sys
 24+import re
2425
2526 if '..' not in sys.path:
2627 sys.path.append('..')
@@ -52,6 +53,14 @@
5354 return dict([[v, k] for k, v in dictionary.items()])
5455
5556
 57+def validate_hostname(hostname):
 58+ regex_hostname = re.compile('^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$')
 59+ res = re.match(regex_hostname, hostname)
 60+ if res == None:
 61+ return False
 62+ else:
 63+ return True
 64+
5665 def get_max_width(table, index):
5766 '''
5867 Get the maximum width of the given column index
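
The validate_hostname helper added above backs the new cluster prompts in manage.py. A standalone sketch of the same check with a few illustrative inputs:

    import re

    regex_hostname = re.compile(
        r'^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?'
        r'(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$')

    def validate_hostname(hostname):
        return re.match(regex_hostname, hostname) is not None

    print validate_hostname('localhost')         # True
    print validate_hostname('db-1.example.org')  # True
    print validate_hostname('-bad.example')      # False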