Index: trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py |
— | — | @@ -71,4 +71,4 @@ |
72 | 72 | cursor = db.find('category', 'List') |
73 | 73 | for c in cursor: |
74 | 74 | data[c['id']] = 1 |
75 | | - return data |
| 75 | + return data, rts |
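Note: the list maker now returns the runtime settings alongside the data, so any caller receives a 2-tuple instead of a bare dict. The unchanged context line above also still uses the old key/qualifier calling convention for storage.find, which this same changeset replaces in classes/storage.py; under the new conditions-dict API the lookup would presumably need to become:

    # Sketch under the new storage.find(conditions, vars=None) signature
    # introduced in classes/storage.py below:
    cursor = db.find({'category': 'List'})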
Index: trunk/tools/editor_trends/analyses/analyzer.py |
— | — | @@ -24,6 +24,7 @@ |
25 | 25 | import types |
26 | 26 | import sys |
27 | 27 | import cPickle |
| 28 | +import pymongo |
28 | 29 | import gc |
29 | 30 | import os |
30 | 31 | import progressbar |
— | — | @@ -65,7 +66,7 @@ |
66 | 67 | def feedback(plugin, rts): |
67 | 68 | print 'Exporting data for chart: %s' % plugin |
68 | 69 | print 'Project: %s' % rts.dbname |
69 | | - print 'Dataset: %s' % rts.editors_dataset |
| 70 | + print 'Dataset: %s' % rts.collection |
70 | 71 | |
71 | 72 | |
72 | 73 | def write_output(ds, rts, stopwatch): |
— | — | @@ -97,7 +98,8 @@ |
98 | 99 | plugin = retrieve_plugin(func) |
99 | 100 | |
100 | 101 | if not plugin: |
101 | | - raise exceptions.UnknownPluginError(plugin, self.available_plugins) |
| 102 | + available_plugins = inventory.available_analyses() |
| 103 | + raise exceptions.UnknownPluginError(plugin, available_plugins) |
102 | 104 | plugin = getattr(plugin, func) |
103 | 105 | |
104 | 106 | feedback(func, rts) |
— | — | @@ -110,15 +112,16 @@ |
111 | 113 | obs = dict() |
112 | 114 | obs_proxy = mgr.dict(obs) |
113 | 115 | |
114 | | - db = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset) |
| 116 | + db = storage.init_database(rts.storage, rts.dbname, rts.collection) |
115 | 117 | editors = db.retrieve_distinct_keys('editor') |
116 | 118 | #editors = editors[:500] |
117 | | - min_year, max_year = determine_project_year_range(db, 'new_wikipedian') |
| 119 | + if rts.collection.find('editors_dataset') > -1: |
| 120 | + min_year, max_year = determine_project_year_range(db, 'new_wikipedian') |
| 121 | + kwargs['min_year'] = min_year |
| 122 | + kwargs['max_year'] = max_year |
118 | 123 | |
119 | 124 | fmt = kwargs.pop('format', 'long') |
120 | 125 | time_unit = kwargs.pop('time_unit', 'year') |
121 | | - kwargs['min_year'] = min_year |
122 | | - kwargs['max_year'] = max_year |
123 | 126 | |
124 | 127 | |
125 | 128 | var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs) |
— | — | @@ -153,24 +156,21 @@ |
154 | 157 | |
155 | 158 | |
156 | 159 | ppills = cpu_count() |
157 | | - while True: |
158 | | - while ppills > 0: |
159 | | - try: |
160 | | - res = result.get() |
161 | | - if res == True: |
162 | | - pbar.update(pbar.currval + 1) |
163 | | - else: |
164 | | - ppills -= 1 |
165 | | - var = res |
166 | | - print ppills |
167 | | - except Empty: |
168 | | - pass |
169 | | - break |
170 | | - print 'Waiting for tasks...' |
| 160 | + while ppills > 0: |
| 161 | + try: |
| 162 | + res = result.get() |
| 163 | + if res == True: |
| 164 | + pbar.update(pbar.currval + 1) |
| 165 | + else: |
| 166 | + ppills -= 1 |
| 167 | + var = res |
| 168 | + except Empty: |
| 169 | + pass |
| 170 | + |
171 | 171 | tasks.join() |
172 | 172 | |
173 | 173 | var = reconstruct_observations(var) |
174 | | - ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs) |
| 174 | + ds = dataset.Dataset(func, rts, format=fmt, **kwargs) |
175 | 175 | ds.add_variable(var) |
176 | 176 | |
177 | 177 | stopwatch.elapsed() |
— | — | @@ -178,8 +178,8 @@ |
179 | 179 | |
180 | 180 | ds.summary() |
181 | 181 | |
182 | | - for n, c in get_refcounts()[:100]: |
183 | | - print '%10d %s' % (n, c.__name__) |
| 182 | + #for n, c in get_refcounts()[:100]: |
| 183 | + # print '%10d %s' % (n, c.__name__) |
184 | 184 | |
185 | 185 | |
186 | 186 | def get_refcounts(): |
— | — | @@ -205,9 +205,12 @@ |
206 | 206 | Determine the first and final year for the observed data |
207 | 207 | ''' |
208 | 208 | try: |
209 | | - obs = db.find(var, qualifier='max') |
| 209 | + conditions = {var : {'$ne' : False}} |
| 210 | + |
| 211 | + obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0] |
210 | 212 | max_year = obs[var].year + 1 |
211 | | - obs = db.find(var, qualifier='min') |
| 213 | + |
| 214 | + obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0] |
212 | 215 | min_year = obs[var].year |
213 | 216 | except KeyError: |
214 | 217 | min_year = 2001 |
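Note: with the min/max qualifiers removed from storage.find, determine_project_year_range now builds the boundary queries itself. As applied, though, the sort directions look inverted relative to the old behaviour (qualifier 'max' used to sort DESCENDING, 'min' ASCENDING). A sketch of the presumably intended helper, assuming db.find returns a pymongo cursor:

    import pymongo

    def determine_project_year_range(db, var):
        '''Determine the first and final year for the observed data.'''
        conditions = {var: {'$ne': False}}
        # largest value first -> maximum year (the hunk above sorts
        # ASCENDING here, which would yield the minimum instead)
        obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0]
        max_year = obs[var].year + 1
        # smallest value first -> minimum year
        obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0]
        min_year = obs[var].year
        return min_year, max_year

The year range is also now only computed when the target collection name contains 'editors_dataset', presumably because other collections (such as the diffs dataset) lack the 'new_wikipedian' field.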
Index: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py |
— | — | @@ -57,7 +57,7 @@ |
58 | 58 | |
59 | 59 | |
60 | 60 | def retrieve_variables(obs, username, date): |
61 | | - data = db.find_one('username', username) |
| 61 | + data = db.find_one({'username': username}) |
62 | 62 | year = str(date.year) |
63 | 63 | month = str(date.month) |
64 | 64 | if data: |
Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -150,14 +150,8 @@ |
151 | 151 | language = languages.init(language_code) |
152 | 152 | project = projects.init(project) |
153 | 153 | pjc = projects.ProjectContainer() |
154 | | - #rts = runtime_settings.RunTimeSettings(project, language) |
| 154 | + rts = runtime_settings.RunTimeSettings(project, language) |
155 | 155 | |
156 | | - file_choices = {'meta-full': 'stub-meta-history.xml.gz', |
157 | | - 'meta-current': 'stub-meta-current.xml.gz', |
158 | | - 'history-full': 'pages-meta-history.xml.7z', |
159 | | - 'history-current': 'pages-meta-current.xml.bz2' |
160 | | - } |
161 | | - |
162 | 156 | #Init Argument Parser |
163 | 157 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
164 | 158 | subparsers = parser.add_subparsers(help='sub - command help') |
— | — | @@ -218,7 +212,7 @@ |
219 | 213 | parser_dataset.add_argument('-c', '--charts', |
220 | 214 | action='store', |
221 | 215 | help='Should be a valid function name that matches one of the plugin functions', |
222 | | - default=inventory.available_analyses()['new_editor_count']) |
| 216 | + default='new_editor_count') |
223 | 217 | |
224 | 218 | parser_dataset.add_argument('-k', '--keywords', |
225 | 219 | action='store', |
— | — | @@ -256,6 +250,13 @@ |
257 | 251 | help='Indicate whether the output is for Kaggle or not', |
258 | 252 | default=False) |
259 | 253 | |
| 254 | + |
| 255 | + parser.add_argument('-t', '--collection', |
| 256 | + action='store', |
| 257 | + help='Name of default collection', |
| 258 | + default='editors_dataset' |
| 259 | + ) |
| 260 | + |
260 | 261 | parser.add_argument('-l', '--language', |
261 | 262 | action='store', |
262 | 263 | help='Example of valid languages.', |
— | — | @@ -269,28 +270,17 @@ |
270 | 271 | choices=pjc.supported_projects(), |
271 | 272 | default='wiki') |
272 | 273 | |
273 | | - parser.add_argument('-c', '--collection', |
274 | | - action='store', |
275 | | - help='Name of MongoDB collection', |
276 | | - default='editors_raw') |
277 | | - |
278 | | - |
279 | 274 | parser.add_argument('-ns', '--namespace', |
280 | 275 | action='store', |
281 | 276 | help='A list of namespaces to include for analysis.', |
282 | 277 | default='0') |
283 | 278 | |
284 | | - parser.add_argument('-db', '--database', |
285 | | - action='store', |
286 | | - help='Specify the database that you want to use. Valid choices are mongo and cassandra.', |
287 | | - default='mongo') |
288 | | - |
289 | 279 | parser.add_argument('-f', '--file', |
290 | 280 | action='store', |
291 | | - choices=file_choices, |
| 281 | + choices=rts.file_choices, |
292 | 282 | help='Indicate which dump you want to download. Valid choices are:\n \ |
293 | | - %s' % ''.join([f + ',\n' for f in file_choices]), |
294 | | - default=file_choices['meta-full']) |
| 283 | + %s' % ''.join([f + ',\n' for f in rts.file_choices]), |
| 284 | + default='meta-full') |
295 | 285 | |
296 | 286 | return parser |
297 | 287 | |
— | — | @@ -353,6 +343,7 @@ |
354 | 344 | log.to_db(rts, 'dataset', 'store', stopwatch, event='start') |
355 | 345 | log.to_csv(logger, rts, 'Start', 'Store', store_launcher) |
356 | 346 | store.launcher(rts) |
| 347 | + #store.launcher_articles(rts) |
357 | 348 | stopwatch.elapsed() |
358 | 349 | log.to_db(rts, 'dataset', 'store', stopwatch, event='finish') |
359 | 350 | log.to_csv(logger, rts, 'Finish', 'Store', store_launcher) |
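Note: the collection selector moves from -c, which collided with -c/--charts on the dataset subparser, to -t on the top-level parser; the --charts default becomes the plain plugin name instead of a function object pulled from inventory.available_analyses(); and the file_choices map migrates into RunTimeSettings, with --file now taking the symbolic key. A hypothetical invocation under the new flags, assuming the subparser is registered as 'dataset' (argparse requires top-level options to precede the subcommand):

    python manage.py -t editors_dataset -l en dataset -c new_editor_count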
Index: trunk/tools/editor_trends/etl/variables.py |
— | — | @@ -68,21 +68,22 @@ |
69 | 69 | return title.text |
70 | 70 | |
71 | 71 | |
72 | | -def parse_title_meta_data(title, namespace): |
| 72 | +def parse_title_meta_data(title, ns, namespaces): |
73 | 73 | ''' |
74 | 74 | This function categorizes an article to assist the Wikimedia Taxonomy |
75 | 75 | project. See |
76 | 76 | http://meta.wikimedia.org/wiki/Contribution_Taxonomy_Project/Research_Questions |
77 | 77 | ''' |
78 | 78 | title_meta = {} |
79 | | - if not namespace: |
| 79 | + if not ns: |
80 | 80 | return title_meta |
81 | | - |
| 81 | + namespace = '%s:' % namespaces[ns] |
| 82 | + title = title.replace(namespace, '') |
82 | 83 | title_meta['title'] = title |
83 | | - title_meta['ns'] = namespace |
| 84 | + title_meta['ns'] = ns |
84 | 85 | if title.startswith('List of'): |
85 | 86 | title_meta['category'] = 'List' |
86 | | - elif namespace == 4 or namespace == 5: |
| 87 | + elif ns == 4 or ns == 5: |
87 | 88 | if title.find('Articles for deletion') > -1: |
88 | 89 | title_meta['category'] = 'Deletion' |
89 | 90 | elif title.find('Mediation Committee') > -1: |
— | — | @@ -105,6 +106,7 @@ |
106 | 107 | title_meta['category'] = 'Featured Topic' |
107 | 108 | elif title.find('Good Article') > -1: |
108 | 109 | title_meta['category'] = 'Good Article' |
| 110 | + #print title_meta |
109 | 111 | return title_meta |
110 | 112 | |
111 | 113 | |
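Note: parse_title_meta_data now receives the numeric namespace id plus the id-to-name lookup so it can strip the localized namespace prefix before categorizing the title. Worked through with a hypothetical lookup:

    namespaces = {4: 'Wikipedia'}     # hypothetical id -> name lookup
    ns = 4
    title = 'Wikipedia:Articles for deletion/Example'

    prefix = '%s:' % namespaces[ns]   # 'Wikipedia:'
    title = title.replace(prefix, '') # 'Articles for deletion/Example'
    # resulting metadata: {'title': 'Articles for deletion/Example',
    #                      'ns': 4, 'category': 'Deletion'}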
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -44,32 +44,31 @@ |
45 | 45 | while True: |
46 | 46 | try: |
47 | 47 | filename = self.tasks.get(block=False) |
48 | | - except Empty: |
49 | | - break |
| 48 | + self.tasks.task_done() |
| 49 | + if filename == None: |
| 50 | + self.result.put(None) |
| 51 | + break |
50 | 52 | |
51 | | - self.tasks.task_done() |
52 | | - if filename == None: |
53 | | - self.result.put(None) |
54 | | - break |
| 53 | + fh = file_utils.create_txt_filehandle(self.rts.sorted, filename, |
| 54 | + 'r', 'utf-8') |
| 55 | + for line in file_utils.read_raw_data(fh): |
| 56 | + if len(line) == 1 or len(line) == 4: |
| 57 | + continue |
| 58 | + editor = line[0] |
| 59 | + #print 'Parsing %s' % editor |
| 60 | + if prev_editor != editor and prev_editor != -1: |
| 61 | + editor_cache.add(prev_editor, 'NEXT') |
55 | 62 | |
56 | | - fh = file_utils.create_txt_filehandle(self.rts.sorted, filename, |
57 | | - 'r', 'utf-8') |
58 | | - for line in file_utils.read_raw_data(fh): |
59 | | - if len(line) == 1 or len(line) == 4: |
60 | | - continue |
61 | | - editor = line[0] |
62 | | - #print 'Parsing %s' % editor |
63 | | - if prev_editor != editor and prev_editor != -1: |
64 | | - editor_cache.add(prev_editor, 'NEXT') |
| 63 | + data = prepare_data(line) |
| 64 | + #print editor, data['username'] |
| 65 | + editor_cache.add(editor, data) |
| 66 | + prev_editor = editor |
| 67 | + fh.close() |
| 68 | + self.result.put(True) |
| 69 | + except Empty: |
| 70 | + pass |
65 | 71 | |
66 | | - data = prepare_data(line) |
67 | | - #print editor, data['username'] |
68 | | - editor_cache.add(editor, data) |
69 | | - prev_editor = editor |
70 | | - fh.close() |
71 | | - self.result.put(True) |
72 | 72 | |
73 | | - |
74 | 73 | def prepare_data(line): |
75 | 74 | ''' |
76 | 75 | Prepare a single line to store in the database, this entails converting |
— | — | @@ -103,34 +102,34 @@ |
104 | 103 | while True: |
105 | 104 | try: |
106 | 105 | filename = tasks.get(block=False) |
| 106 | + if filename == None: |
| 107 | + self.result.put(None) |
| 108 | + break |
| 109 | + print 'Processing %s...' % filename |
| 110 | + fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8') |
| 111 | + for line in fh: |
| 112 | + line = line.strip() |
| 113 | + line = line.split('\t') |
| 114 | + data = {} |
| 115 | + x, y = 0, 1 |
| 116 | + while y < len(line): |
| 117 | + key, value = line[x], line[y] |
| 118 | + if key == 'ns' or key == 'id': |
| 119 | + data[key] = int(value) |
| 120 | + else: |
| 121 | + data[key] = value |
| 122 | + x += 2 |
| 123 | + y += 2 |
| 124 | + db.insert(data) |
| 125 | + fh.close() |
107 | 126 | except Empty: |
108 | | - continue |
109 | | - |
110 | | - if filename == None: |
111 | | - break |
112 | | - print 'Processing %s...' % filename |
113 | | - fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8') |
114 | | - for line in fh: |
115 | | - line = line.strip() |
116 | | - line = line.split('\t') |
117 | | - data = {} |
118 | | - x, y = 0, 1 |
119 | | - while y < len(line): |
120 | | - key, value = line[x], line[y] |
121 | | - if key == 'ns' or key == 'id': |
122 | | - data[key] = int(value) |
123 | | - else: |
124 | | - data[key] = value |
125 | | - x += 2 |
126 | | - y += 2 |
127 | | - db.insert(data) |
128 | | - fh.close() |
| 127 | + pass |
129 | 128 | print 'Done storing articles...' |
130 | 129 | |
131 | 130 | |
132 | 131 | def launcher_articles(rts): |
133 | 132 | ''' |
134 | | - This function reads titles.csv and stores it in a separate collection. |
| 133 | + This function reads articles.csv and stores it in a separate collection. |
135 | 134 | Besides containing the title of an article, it also includes: |
136 | 135 | * namespace |
137 | 136 | * category (if any) |
— | — | @@ -172,7 +171,6 @@ |
173 | 172 | This is the main entry point and creates a number of workers and launches |
174 | 173 | them. |
175 | 174 | ''' |
176 | | - #launcher_articles(rts) |
177 | 175 | print 'Input directory is: %s ' % rts.sorted |
178 | 176 | db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw) |
179 | 177 | db.drop_collection() |
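Note: both worker loops in store.py are restructured around the same poison-pill pattern: acknowledge the task as soon as it is dequeued, break on None, and treat an empty queue as a reason to keep polling rather than to exit. Condensed to its skeleton (process() is a stand-in for the per-file body):

    from Queue import Empty  # Python 2; tasks is a multiprocessing.JoinableQueue

    def consume(tasks, result):
        while True:
            try:
                filename = tasks.get(block=False)
                tasks.task_done()
                if filename is None:      # poison pill: propagate and stop
                    result.put(None)
                    break
                process(filename)         # stand-in for the parsing body
                result.put(True)
            except Empty:
                pass                      # queue momentarily empty; poll again

Two caveats: get(block=False) combined with pass busy-waits whenever the producer is slower than the consumers (a blocking get with a timeout would avoid spinning), and the second hunk adds self.result.put(None) inside what appears to be a plain function with no self in scope, which would raise a NameError once the poison pill arrives.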
Index: trunk/tools/editor_trends/etl/downloader.py |
— | — | @@ -29,7 +29,7 @@ |
30 | 30 | from utils import log |
31 | 31 | |
32 | 32 | |
33 | | -def download_wiki_file(task_queue, properties): |
| 33 | +def download_wiki_file(task_queue, rts): |
34 | 34 | ''' |
35 | 35 | This is a very simple replacement for wget and curl because Windows does |
36 | 36 | not have these tools installed by default |
— | — | @@ -46,34 +46,34 @@ |
47 | 47 | widgets = log.init_progressbar_widgets(filename) |
48 | 48 | extension = os.path.splitext(filename)[1] |
49 | 49 | filemode = file_utils.determine_file_mode(extension) |
50 | | - filesize = http_utils.determine_remote_filesize(properties.wp_dump_location, |
51 | | - properties.dump_relative_path, |
| 50 | + filesize = http_utils.determine_remote_filesize(rts.wp_dump_location, |
| 51 | + rts.dump_relative_path, |
52 | 52 | filename) |
53 | 53 | |
54 | | - mod_date = http_utils.determine_modified_date(properties.wp_dump_location, |
55 | | - properties.dump_relative_path, |
| 54 | + mod_date = http_utils.determine_modified_date(rts.wp_dump_location, |
| 55 | + rts.dump_relative_path, |
56 | 56 | filename) |
57 | | - mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, properties.timestamp_server) |
58 | | - if file_utils.check_file_exists(properties.input_location, filename): |
59 | | - mod_loc = file_utils.get_modified_date(properties.input_location, filename) |
60 | | - if mod_loc == mod_date and (properties.force == False or properties.force == None): |
61 | | - print 'You already have downloaded the most recent %s%s dumpfile.' % (properties.language.code, properties.project.name) |
| 57 | + mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, rts.timestamp_server) |
| 58 | + if file_utils.check_file_exists(rts.input_location, filename): |
| 59 | + mod_loc = file_utils.get_modified_date(rts.input_location, filename) |
| 60 | + if mod_loc == mod_date and (rts.force == False or rts.force == None): |
| 61 | + print 'You already have downloaded the most recent %s%s dumpfile.' % (rts.language.code, rts.project.name) |
62 | 62 | continue |
63 | 63 | |
64 | 64 | if filemode == 'w': |
65 | | - fh = file_utils.create_txt_filehandle(properties.input_location, |
| 65 | + fh = file_utils.create_txt_filehandle(rts.input_location, |
66 | 66 | filename, |
67 | 67 | filemode, |
68 | | - properties.encoding) |
| 68 | + rts.encoding) |
69 | 69 | else: |
70 | | - fh = file_utils.create_binary_filehandle(properties.input_location, filename, 'wb') |
| 70 | + fh = file_utils.create_binary_filehandle(rts.input_location, filename, 'wb') |
71 | 71 | |
72 | 72 | if filesize != -1: |
73 | 73 | pbar = progressbar.ProgressBar(widgets=widgets, maxval=filesize).start() |
74 | 74 | else: |
75 | 75 | pbar = progressbar.ProgressBar(widgets=widgets).start() |
76 | 76 | try: |
77 | | - path = '%s%s' % (properties.dump_absolute_path, filename) |
| 77 | + path = '%s%s' % (rts.dump_absolute_path, filename) |
78 | 78 | req = urllib2.Request(path) |
79 | 79 | response = urllib2.urlopen(req) |
80 | 80 | while True: |
— | — | @@ -94,24 +94,24 @@ |
95 | 95 | print 'Error: %s' % error |
96 | 96 | finally: |
97 | 97 | fh.close() |
98 | | - file_utils.set_modified_data(mod_date, properties.input_location, filename) |
| 98 | + file_utils.set_modified_data(mod_date, rts.input_location, filename) |
99 | 99 | |
100 | 100 | |
101 | 101 | |
102 | | -def launcher(properties, logger): |
| 102 | +def launcher(rts, logger): |
103 | 103 | print 'Creating list of files to be downloaded...' |
104 | | - tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location, |
105 | | - properties.dump_relative_path, |
106 | | - properties.dump_filename) |
| 104 | + tasks = http_utils.create_list_dumpfiles(rts.wp_dump_location, |
| 105 | + rts.dump_relative_path, |
| 106 | + rts.dump_filename) |
107 | 107 | #print tasks.qsize() |
108 | | - #if tasks.qsize() < properties.settings.number_of_processes: |
109 | | - # properties..number_of_processes = tasks.qsize() |
| 108 | + #if tasks.qsize() < rts.settings.number_of_processes: |
| 109 | + # rts..number_of_processes = tasks.qsize() |
110 | 110 | if tasks.qsize() > 2: |
111 | 111 | consumers = [multiprocessing.Process(target=download_wiki_file, |
112 | | - args=(tasks, properties)) |
113 | | - for i in xrange(properties.number_of_processes)] |
| 112 | + args=(tasks, rts)) |
| 113 | + for i in xrange(rts.number_of_processes)] |
114 | 114 | else: consumers = [multiprocessing.Process(target=download_wiki_file, |
115 | | - args=(tasks, properties)) |
| 115 | + args=(tasks, rts)) |
116 | 116 | for i in xrange(1)] |
117 | 117 | print 'Starting consumers to download files...' |
118 | 118 | for w in consumers: |
Index: trunk/tools/editor_trends/etl/differ.py |
— | — | @@ -213,7 +213,7 @@ |
214 | 214 | |
215 | 215 | def store_json_diffs(rts): |
216 | 216 | files = os.listdir(rts.diffs) |
217 | | - print files, rts.diffs |
| 217 | + #print files, rts.diffs |
218 | 218 | db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
219 | 219 | buffer = cStringIO.StringIO() |
220 | 220 | |
— | — | @@ -226,12 +226,10 @@ |
227 | 227 | obj = json.loads(obj) |
228 | 228 | obj[0]['article_id'] = int(obj[0]['article_id']) |
229 | 229 | for key, value in obj[0].iteritems(): |
230 | | - if type(value) == type(dict()): |
231 | | - value['timestamp'] = datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S') |
| 230 | + if key == 'timestamp': |
| 231 | + value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S') |
232 | 232 | obj[0][key] = value |
233 | 233 | obj = obj[0] |
234 | | - #print obj |
235 | | - #print len(obj) |
236 | 234 | try: |
237 | 235 | db.save(obj) |
238 | 236 | except bson.errors.InvalidDocument, error: |
— | — | @@ -279,6 +277,7 @@ |
280 | 278 | |
281 | 279 | def store_diffs_debug(rts): |
282 | 280 | db = storage.init_database(rts) |
| 281 | + db.drop_collection() |
283 | 282 | files = os.listdir(rts.diffs) |
284 | 283 | for filename in files: |
285 | 284 | fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8') |
— | — | @@ -335,20 +334,22 @@ |
336 | 335 | print 'Inserting poison pill %s...' % x |
337 | 336 | input_queue.put(None) |
338 | 337 | |
339 | | - extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id, |
340 | | - rts, format]) |
341 | | - for process_id in xrange(processors)] |
342 | | - for extracter in extracters: |
343 | | - extracter.start() |
| 338 | +# extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id, |
| 339 | +# rts, format]) |
| 340 | +# for process_id in xrange(processors)] |
| 341 | +# for extracter in extracters: |
| 342 | +# extracter.start() |
| 343 | +# |
| 344 | +# input_queue.join() |
344 | 345 | |
345 | | - input_queue.join() |
346 | | - |
347 | 346 | store_json_diffs(rts) |
348 | 347 | db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset) |
| 348 | + |
349 | 349 | db.add_index('title') |
350 | 350 | db.add_index('timestamp') |
351 | 351 | db.add_index('username') |
352 | 352 | db.add_index('ns') |
| 353 | + db.add_index('editor') |
353 | 354 | |
354 | 355 | |
355 | 356 | def launcher_simple(): |
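Note: timestamps in the stored JSON diffs are now parsed at the top level of each revision object (keyed 'timestamp') instead of inside nested dict values, the extraction processes are temporarily commented out so a run starts directly at store_json_diffs, the debug loader drops its collection before refilling it, and an index on 'editor' is added. The new conversion, applied to a hypothetical record:

    from datetime import datetime

    obj = {'article_id': '12345',               # hypothetical JSON diff
           'timestamp': '2011-05-01T12:00:00',
           'username': 'Example'}
    obj['article_id'] = int(obj['article_id'])
    for key, value in obj.iteritems():
        if key == 'timestamp':
            value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
        obj[key] = value  # rebinds values of existing keys only, so
                          # mutating during iteritems() is safe here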
Index: trunk/tools/editor_trends/etl/extracter.py |
— | — | @@ -72,8 +72,8 @@ |
73 | 73 | text = variables.extract_revision_text(revision, xml_namespace) |
74 | 74 | article.update(contributor) |
75 | 75 | |
76 | | - comment = variables.extract_comment_text(revision_id, revision) |
77 | | - cache.comments.update(comment) |
| 76 | + #comment = variables.extract_comment_text(revision_id, revision) |
| 77 | + #cache.comments.update(comment) |
78 | 78 | |
79 | 79 | timestamp = revision.find('%s%s' % (xml_namespace, 'timestamp')).text |
80 | 80 | article['timestamp'] = timestamp |
— | — | @@ -139,7 +139,7 @@ |
140 | 140 | title = variables.parse_title(elem) |
141 | 141 | article['title'] = title |
142 | 142 | current_namespace = variables.determine_namespace(title, namespaces, include_ns) |
143 | | - title_meta = variables.parse_title_meta_data(title, current_namespace) |
| 143 | + title_meta = variables.parse_title_meta_data(title, current_namespace, namespaces) |
144 | 144 | if current_namespace < 6: |
145 | 145 | parse = True |
146 | 146 | article['namespace'] = current_namespace |
— | — | @@ -172,7 +172,7 @@ |
173 | 173 | Determine id of article |
174 | 174 | ''' |
175 | 175 | article['article_id'] = elem.text |
176 | | - if isinstance(current_namespace, int): |
| 176 | + if isinstance(current_namespace, int) and title_meta != {}: |
177 | 177 | cache.articles[article['article_id']] = title_meta |
178 | 178 | id = True |
179 | 179 | elem.clear() |
Index: trunk/tools/editor_trends/kaggle/training.py |
— | — | @@ -26,7 +26,7 @@ |
27 | 27 | |
28 | 28 | from classes import storage |
29 | 29 | |
30 | | -location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution' |
| 30 | +location = '/home/diederik/wikimedia/en/wiki/kaggle' |
31 | 31 | files = os.listdir(location) |
32 | 32 | files.reverse() |
33 | 33 | |
— | — | @@ -67,7 +67,7 @@ |
68 | 68 | continue |
69 | 69 | id = line[2] |
70 | 70 | if id not in ids and id not in ignore_ids: |
71 | | - res = db.find_one('editor', id) |
| 71 | + res = db.find_one({'editor': id}) |
72 | 72 | if res == None: |
73 | 73 | ignore_ids.add(id) |
74 | 74 | continue |
— | — | @@ -100,7 +100,7 @@ |
101 | 101 | fh = codecs.open('solutions.tsv', 'w', 'utf-8') |
102 | 102 | for id in ids: |
103 | 103 | if id not in ignore_ids: |
104 | | - obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns') |
| 104 | + obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns') |
105 | 105 | if obs != None: |
106 | 106 | x += 1 |
107 | 107 | n = obs['cum_edit_count_main_ns'] |
Index: trunk/tools/editor_trends/classes/buffer.py |
— | — | @@ -124,7 +124,9 @@ |
125 | 125 | def simplify(self, revision): |
126 | 126 | row = [] |
127 | 127 | for key in self.keys: |
128 | | - row.append(revision[key].decode('utf-8')) |
| 128 | + value = revision.get(key, None) |
| 129 | + if value != None: |
| 130 | + row.append(value.decode('utf-8')) |
129 | 131 | return row |
130 | 132 | |
131 | 133 | def stringify(self, revision): |
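Note: simplify() used to raise KeyError when a revision lacked one of the expected keys; it now skips missing values. Skipping does shift every subsequent column left, so consumers that index rows positionally could silently misalign; padding with an empty string would keep the row length stable. A sketch of that alternative, under the same assumptions (self.keys lists the expected fields, values are utf-8 byte strings):

    def simplify(self, revision):
        row = []
        for key in self.keys:
            value = revision.get(key)
            # pad instead of skip, so len(row) always equals len(self.keys)
            row.append(value.decode('utf-8') if value is not None else u'')
        return row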
Index: trunk/tools/editor_trends/classes/runtime_settings.py |
— | — | @@ -50,53 +50,57 @@ |
51 | 51 | self.project = project |
52 | 52 | self.language = language |
53 | 53 | self.dbname = 'wikilytics' |
| 54 | + self.file_choices = {'meta-full': 'stub-meta-history.xml.gz', |
| 55 | + 'meta-current': 'stub-meta-current.xml.gz', |
| 56 | + 'history-full': 'pages-meta-history.xml.7z', |
| 57 | + 'history-current': 'pages-meta-current.xml.bz2' |
| 58 | + } |
| 59 | + if args: |
| 60 | + self.args = args |
| 61 | + self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month') |
| 62 | + #print self.settings.input_location |
| 63 | + #print self.get_value('location') |
| 64 | + self.project = self.update_project_settings() |
| 65 | + self.language = self.update_language_settings() |
54 | 66 | |
55 | | - #if args: |
56 | | - self.args = args |
57 | | - self.id = '%s%s_%s' % (self.language.code, self.project.name, 'current_month') |
58 | | - #print self.settings.input_location |
59 | | - #print self.get_value('location') |
60 | | - self.project = self.update_project_settings() |
61 | | - self.language = self.update_language_settings() |
| 67 | + self.input_location = self.set_input_location() |
| 68 | + self.output_location = self.set_output_location() |
62 | 69 | |
63 | | - self.input_location = self.set_input_location() |
64 | | - self.output_location = self.set_output_location() |
| 70 | + self.plugins = self.set_plugin() |
| 71 | + self.keywords = self.split_keywords() |
| 72 | + self.namespaces = self.get_namespaces() |
65 | 73 | |
66 | | - self.plugins = self.set_plugin() |
67 | | - self.keywords = self.split_keywords() |
68 | | - self.namespaces = self.get_namespaces() |
| 74 | + #self.kaggle = self.get_value('kaggle') |
| 75 | + self.function = self.get_value('func') |
| 76 | + self.ignore = self.get_value('except') |
| 77 | + self.force = self.get_value('force') |
| 78 | + self.analyzer_collection = self.get_value('collection') |
69 | 79 | |
70 | | - #self.kaggle = self.get_value('kaggle') |
71 | | - self.function = self.get_value('func') |
72 | | - self.ignore = self.get_value('except') |
73 | | - self.force = self.get_value('force') |
74 | | - self.analyzer_collection = self.get_value('collection') |
| 80 | + self.dataset = os.path.join(self.dataset_location, self.project.name) |
| 81 | + self.txt = os.path.join(self.output_location, 'txt') |
| 82 | + self.sorted = os.path.join(self.output_location, 'sorted') |
| 83 | + self.diffs = os.path.join(self.output_location, 'diffs') |
75 | 84 | |
76 | | - self.dataset = os.path.join(self.dataset_location, self.project.name) |
77 | | - self.txt = os.path.join(self.output_location, 'txt') |
78 | | - self.sorted = os.path.join(self.output_location, 'sorted') |
79 | | - self.diffs = os.path.join(self.output_location, 'diffs') |
| 85 | + self.directories = [self.output_location, |
| 86 | + self.txt, |
| 87 | + self.sorted, |
| 88 | + self.dataset, |
| 89 | + self.diffs] |
| 90 | + self.verify_environment(self.directories) |
80 | 91 | |
81 | | - self.directories = [self.output_location, |
82 | | - self.txt, |
83 | | - self.sorted, |
84 | | - self.dataset, |
85 | | - self.diffs] |
86 | | - self.verify_environment(self.directories) |
| 92 | + #Wikidump file related variables |
| 93 | + self.dump_filename = self.generate_wikidump_filename() |
| 94 | + self.dump_relative_path = self.set_dump_path() |
| 95 | + self.dump_absolute_path = self.set_dump_path(absolute=True) |
87 | 96 | |
88 | | - #Wikidump file related variables |
89 | | - self.dump_filename = self.generate_wikidump_filename() |
90 | | - self.dump_relative_path = self.set_dump_path() |
91 | | - self.dump_absolute_path = self.set_dump_path(absolute=True) |
| 97 | + #Collection names |
| 98 | + self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
| 99 | + self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
| 100 | + self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
| 101 | + self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name) |
| 102 | + self.collection = self.set_collection() |
92 | 103 | |
93 | | - #Collection names |
94 | | - self.editors_raw = '%s%s_editors_raw' % (self.language.code, self.project.name) |
95 | | - self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, self.project.name) |
96 | | - self.articles_raw = '%s%s_articles_raw' % (self.language.code, self.project.name) |
97 | | - self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, self.project.name) |
98 | 104 | |
99 | | - |
100 | | - |
101 | 105 | def __str__(self): |
102 | 106 | return 'Runtime Settings for project %s %s' % (self.language.name, |
103 | 107 | self.project.full_name) |
— | — | @@ -105,14 +109,8 @@ |
106 | 110 | for item in self.__dict__: |
107 | 111 | yield item |
108 | 112 | |
109 | | - def dict(self): |
110 | | - ''' |
111 | | - Return a dictionary with all properties and their values |
112 | | - ''' |
113 | | - props = {} |
114 | | - for prop in self: |
115 | | - props[prop] = getattr(self, prop) |
116 | | - return props |
| 113 | + def set_collection(self): |
| 114 | + return getattr(self, self.get_value('collection')) |
117 | 115 | |
118 | 116 | def split_keywords(self): |
119 | 117 | ''' |
— | — | @@ -141,7 +139,7 @@ |
142 | 140 | ''' |
143 | 141 | plugin = self.get_value('charts') |
144 | 142 | requested_plugins = [] |
145 | | - if plugin != None and isinstance(plugin, type('module')) == False: |
| 143 | + if plugin != None: |
146 | 144 | plugins = plugin.split(',') |
147 | 145 | available_plugins = inventory.available_analyses() |
148 | 146 | for plugin in plugins: |
— | — | @@ -220,8 +218,9 @@ |
221 | 219 | ''' |
222 | 220 | Generate the main name of the wikidump file to be downloaded. |
223 | 221 | ''' |
| 222 | + choice = self.get_value('file') |
224 | 223 | return '%s%s-latest-%s' % (self.language.code, self.project.name, |
225 | | - self.get_value('file')) |
| 224 | + self.file_choices[choice]) |
226 | 225 | |
227 | 226 | def update_language_settings(self): |
228 | 227 | ''' |
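Note: the constructor now guards all argument-dependent setup behind `if args:`, absorbs the file_choices map from manage.py, and replaces the removed dict() helper with set_collection(), which resolves the --collection flag by attribute name. Worked through with hypothetical values for English Wikipedia:

    # --collection defaults to 'editors_dataset' (see manage.py above)
    rts.editors_dataset               # 'enwiki_editors_dataset'
    rts.collection                    # getattr(rts, 'editors_dataset')
                                      # -> 'enwiki_editors_dataset'

    # --file now carries a symbolic key, resolved on demand:
    rts.file_choices['meta-full']     # 'stub-meta-history.xml.gz'
    rts.generate_wikidump_filename()  # 'enwiki-latest-stub-meta-history.xml.gz'

set_plugin also no longer needs to special-case module objects now that --charts defaults to a string.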
Index: trunk/tools/editor_trends/classes/storage.py |
— | — | @@ -160,29 +160,22 @@ |
161 | 161 | assert isinstance(data, dict), 'You need to feed me dictionaries.' |
162 | 162 | self.db[self.collection].update({key: value}, {'$set': data}) |
163 | 163 | |
164 | | - def find(self, key=None, qualifier=None): |
165 | | - if qualifier == 'min': |
166 | | - return self.db[self.collection].find({ |
167 | | - key : {'$ne' : False}}).sort(key, pymongo.ASCENDING).limit(1)[0] |
168 | | - elif qualifier == 'max': |
169 | | - return self.db[self.collection].find({ |
170 | | - key : {'$ne' : False}}).sort(key, pymongo.DESCENDING).limit(1)[0] |
171 | | - elif qualifier: |
172 | | - return self.db[self.collection].find({key : qualifier}) |
173 | | - elif key != None: |
174 | | - return self.db[self.collection].find({}, fields=[key]) |
| 164 | + def find(self, conditions, vars=None): |
| 165 | + if conditions: |
| 166 | + return self.db[self.collection].find(conditions, fields=vars) |
175 | 167 | else: |
176 | 168 | return self.db[self.collection].find() |
177 | 169 | |
178 | | - def find_one(self, key, value, vars=None): |
| 170 | + def find_one(self, conditions, vars=None): |
179 | 171 | if vars: |
180 | 172 | #if you only want to retrieve a specific variable(s) then you need to |
181 | 173 | #specify vars, if vars is None then you will get the entire BSON object |
182 | 174 | vars = vars.split(',') |
183 | 175 | vars = dict([(var, 1) for var in vars]) |
184 | | - return self.db[self.collection].find_one({key: value}, vars) |
| 176 | + return self.db[self.collection].find_one(conditions, vars) |
185 | 177 | else: |
186 | | - return self.db[self.collection].find_one({key: value}) |
| 178 | + #conditions should be a dictionary |
| 179 | + return self.db[self.collection].find_one(conditions) |
187 | 180 | |
188 | 181 | |
189 | 182 | def drop_collection(self): |
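Note: find() and find_one() now accept a MongoDB-style conditions dict plus an optional projection instead of the old key/value/qualifier arguments; the min/max qualifiers are gone, and callers such as determine_project_year_range sort a cursor themselves. This is the contract change that every call-site edit in this changeset tracks (analyses, kaggle/training, classes/analytics, utils/log). Side by side:

    # old key/qualifier style:
    db.find_one('editor', '12345')
    db.find_one('editor', str(id), 'cum_edit_count_main_ns')
    db.find('new_wikipedian', qualifier='max')

    # new conditions-dict style:
    db.find_one({'editor': '12345'})
    db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
    db.find({'new_wikipedian': {'$ne': False}}).sort(
        'new_wikipedian', pymongo.DESCENDING).limit(1)[0]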
Index: trunk/tools/editor_trends/classes/dataset.py |
— | — | @@ -176,6 +176,7 @@ |
177 | 177 | #self.date = date |
178 | 178 | self.data = 0 |
179 | 179 | self.time_unit = time_unit |
| 180 | + self.date = date |
180 | 181 | self.t1, self.t0 = self.set_date_range(date) |
181 | 182 | self.id = id |
182 | 183 | self.props = [] |
— | — | @@ -515,7 +516,7 @@ |
516 | 517 | variable.max = get_max(data) |
517 | 518 | variable.num_obs = variable.number_of_obs() |
518 | 519 | variable.num_dates = len(variable) |
519 | | - #variable.first_obs, variable.last_obs = variable.get_date_range() |
| 520 | + variable.first_obs, variable.last_obs = variable.get_date_range() |
520 | 521 | |
521 | 522 | def summary(self): |
522 | 523 | ''' |
Index: trunk/tools/editor_trends/classes/analytics.py |
— | — | @@ -64,10 +64,8 @@ |
65 | 65 | |
66 | 66 | def __call__(self): |
67 | 67 | project = 'wiki' |
68 | | - #rts = runtime_settings.init_environment('wiki', 'en', args) |
69 | 68 | for lang in self.languages: |
70 | 69 | self.rts = runtime_settings.init_environment(project, lang, self.args) |
71 | | - #self.rts.editors_dataset = 'editors_dataset' |
72 | 70 | |
73 | 71 | self.rts.dbname = '%s%s' % (lang, project) |
74 | 72 | for cum_cutoff in self.cum_cutoff: |
— | — | @@ -91,15 +89,16 @@ |
92 | 90 | Generic loop function that loops over all the editors of a Wikipedia |
93 | 91 | project and then calls the plugin that does the actual mapping. |
94 | 92 | ''' |
95 | | - db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.editors_dataset) |
| 93 | + db = storage.init_database(self.rts.storage, self.rts.dbname, self.rts.collection) |
96 | 94 | while True: |
97 | 95 | try: |
98 | 96 | editor_id = self.tasks.get(block=False) |
| 97 | + self.tasks.task_done() |
99 | 98 | if editor_id == None: |
100 | 99 | self.result.put(self.var) |
101 | 100 | break |
102 | | - editor = db.find_one('editor', editor_id) |
103 | | - self.plugin(self.var, editor, dbname=self.rts.dbname, data=self.data) |
| 101 | + editor = db.find_one({'editor': editor_id}) |
| 102 | + self.plugin(self.var, editor, rts=self.rts, data=self.data) |
104 | 103 | self.result.put(True) |
105 | 104 | except Empty: |
106 | 105 | pass |
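Note: the generic mapper now resolves its collection from rts.collection, so --collection determines which dataset an analysis runs over, acknowledges each task as soon as it is dequeued, and hands plugins the full runtime settings rather than just the database name. The implied plugin contract, sketched with a hypothetical plugin body:

    def new_editor_count(var, editor, **kwargs):
        rts = kwargs.pop('rts')    # previously: dbname = kwargs.pop('dbname')
        data = kwargs.pop('data')
        # ... tally observations on var from the editor document ...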
Index: trunk/tools/editor_trends/utils/log.py |
— | — | @@ -31,8 +31,8 @@ |
32 | 32 | def to_db(rts, jobtype, task, timer, event='start'): |
33 | 33 | db = storage.init_database(rts.storage, rts.dbname, 'jobs') |
34 | 34 | created = datetime.datetime.now() |
35 | | - job = db.find_one('hash', rts.id) |
36 | | - |
| 35 | + job = db.find_one({'hash': rts.id}) |
| 36 | + #print job |
37 | 37 | data = {'hash': rts.id, |
38 | 38 | 'created': created, |
39 | 39 | 'jobtype': jobtype, |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | data['finished'] = True |
52 | 52 | _id = db.save(data) |
53 | 53 | |
54 | | - job = db.find_one('_id', _id) |
| 54 | + job = db.find_one({'_id': _id}) |
55 | 55 | |
56 | 56 | tasks = job['tasks'] |
57 | 57 | t = tasks.get(task, {}) |