Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -143,18 +143,25 @@ |
144 | 144 | elem = cElementTree.XML(data) |
145 | 145 | output_editor_information(elem) |
146 | 146 | ''' |
| 147 | + if settings.DEBUG: |
| 148 | + utils.track_errors(xml_buffer, error, file, messages) |
147 | 149 | except UnicodeEncodeError, error: |
148 | 150 | print error |
| 151 | + if settings.DEBUG: |
| 152 | + utils.track_errors(xml_buffer, error, file, messages) |
149 | 153 | except MemoryError, error: |
150 | 154 | ''' |
151 | 155 | There is one xml file causing an out of memory error, not |
152 | | - sure which one yet. |
| 156 | + sure which one yet. This happens when raw_data = |
| 157 | + ''.join(raw_data) is called. 18-22 |
153 | 158 | ''' |
154 | 159 | print error |
155 | | - finally: |
| 160 | + print raw_data[:12] |
| 161 | + print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data]) |
156 | 162 | if settings.DEBUG: |
157 | 163 | utils.track_errors(xml_buffer, error, file, messages) |
158 | 164 | |
| 165 | + |
159 | 166 | if pbar: |
160 | 167 | print xml_queue.qsize() |
161 | 168 | #utils.update_progressbar(pbar, xml_queue) |
— | — | @@ -248,17 +255,19 @@ |
249 | 256 | for bot in cursor: |
250 | 257 | ids[bot['id']] = bot['name'] |
251 | 258 | pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, bots=ids) |
252 | | - keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)] |
253 | | - db.add_index_to_collection('editors', 'editors', keys) |
| 259 | + keys = ['editor'] |
| 260 | + for key in keys: |
| 261 | + db.add_index_to_collection('editors', 'editors', key) |
254 | 262 | |
255 | 263 | def debug_lookup_new_editors(): |
256 | 264 | q = Queue() |
257 | 265 | import progressbar |
258 | 266 | pbar = progressbar.ProgressBar().start() |
259 | | - edits = db.init_mongo_db('editors') |
260 | | - lookup_new_editors('1.xml', q, None, None, True) |
261 | | - keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)] |
262 | | - db.add_index_to_collection('editors', 'editors', keys) |
| 267 | + #edits = db.init_mongo_db('editors') |
| 268 | + #lookup_new_editors('1.xml', q, None, None, True) |
| 269 | + keys = ['editor'] |
| 270 | + for key in keys: |
| 271 | + db.add_index_to_collection('editors', 'editors', key) |
263 | 272 | |
264 | 273 | |
265 | 274 | |
— | — | @@ -267,10 +276,10 @@ |
268 | 277 | |
269 | 278 | |
270 | 279 | if __name__ == "__main__": |
271 | | - #debug_lookup_new_editors() |
| 280 | + debug_lookup_new_editors() |
272 | 281 | |
273 | | - if settings.RUN_MODE == 'stand_alone': |
274 | | - run_stand_alone() |
275 | | - print 'Finished processing XML files.' |
276 | | - else: |
277 | | - run_hadoop() |
| 282 | +# if settings.RUN_MODE == 'stand_alone': |
| 283 | +# run_stand_alone() |
| 284 | +# print 'Finished processing XML files.' |
| 285 | +# else: |
| 286 | +# run_hadoop() |
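
The hunks above move utils.track_errors() out of the removed finally clause and into each except branch; in the finally version, `error` could be unbound whenever the try block succeeded, raising a NameError. A minimal sketch of the resulting pattern (surrounding names taken from the hunk; this is not the full function):

    try:
        elem = cElementTree.XML(data)
        output_editor_information(elem)
    except UnicodeEncodeError, error:
        print error
        if settings.DEBUG:
            utils.track_errors(xml_buffer, error, file, messages)
    except MemoryError, error:
        print error
        if settings.DEBUG:
            utils.track_errors(xml_buffer, error, file, messages)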
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -75,6 +75,8 @@ |
76 | 76 | |
77 | 77 | BINARY_OBJECT_FILE_LOCATION = WORKING_DIRECTORY + '/data/objects/' |
78 | 78 | |
| 79 | +DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/' |
| 80 | + |
79 | 81 | #This section contains configuration variables for parsing / encoding and |
80 | 82 | #working with the XML files. |
81 | 83 | |
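
The new DATASETS_FILE_LOCATION follows the same convention as BINARY_OBJECT_FILE_LOCATION above and is consumed by construct_datasets.py later in this commit:

    utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION,
                            generate_editor_dataset, settings.ENCODING)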
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -138,18 +138,24 @@ |
139 | 139 | fh.close() |
140 | 140 | |
141 | 141 | |
142 | | -def write_data_to_csv(data, function, encoding): |
| 142 | +def write_data_to_csv(data, location, function, encoding): |
143 | 143 | filename = construct_filename_from_function(function, '.csv') |
144 | | - fh = open_txt_file(filename, 'a', encoding=encoding) |
| 144 | + fh = open_txt_file(location, filename, 'a', encoding=encoding) |
145 | 145 | keys = data.keys() |
146 | 146 | for key in keys: |
147 | | - for value in data[key]: |
148 | | - fh.write('%s\t%s\n' % (key, value)) |
| 147 | + fh.write('%s' % key) |
| 148 | + for obs in data[key]: |
| 149 | + if getattr(obs, '__iter__', False): |
| 150 | + for o in obs: |
| 151 | + fh.write('\t%s' % o) |
| 152 | + else: |
| 153 | + fh.write('\t%s' % (obs)) |
| 154 | + fh.write('\n') |
149 | 155 | fh.close() |
150 | 156 | |
151 | 157 | |
152 | | -def open_txt_file(filename, mode, encoding): |
153 | | - return codecs.open(filename, mode, encoding=encoding) |
| 158 | +def open_txt_file(location, filename, mode, encoding): |
| 159 | + return codecs.open(location+filename, mode, encoding=encoding) |
154 | 160 | |
155 | 161 | def construct_filename_from_function(function, extension): |
156 | 162 | return function.func_name + extension |
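
The reworked write_data_to_csv() flattens one level of nesting, so a value list may mix iterables and scalars. A hypothetical input and the row it produces (editor name and dates invented for illustration):

    data = {'SomeEditor': [['2004-01-01', '2004-02-01'], True]}
    # writes one tab-separated line:
    # SomeEditor\t2004-01-01\t2004-02-01\tTrue

Note that getattr(obs, '__iter__', False) is used rather than iter(obs) because Python 2 strings have no __iter__ attribute, so string values are written whole instead of being split into characters.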
Index: trunk/tools/editor_trends/README.1ST |
— | — | @@ -58,6 +58,30 @@ |
59 | 59 | settings are self-explanatory but in case of any questions please drop me a |
60 | 60 | line. |
61 | 61 | |
| 62 | +PROCESSING TIMES: |
| 63 | + |
| 64 | +CONFIG NAMESPACE FILENAME CHUNKING STORING INDEXING RETRIEVING TOTAL |
| 65 | +1 0 stub-meta-history 7 3 1 ? 11 |
| 66 | + |
| 67 | + |
| 68 | +*CHUNKING == splitting the XML file into smaller pieces |
| 69 | +*STORING == parsing the XML files and storing them in MongoDB |
| 70 | +*INDEXING == creating an index in MongoDB |
| 71 | +*RETRIEVING == generating a dataset |
| 72 | +*TOTAL == sum of all parts |
| 73 | + |
| 74 | +MACHINE CONFIGURATIONS |
| 75 | + |
| 76 | +ID OS VERSION MEMORY PROCESSOR SPEED |
| 77 | +1 Windows 7 64-bit 4GB Duo Core 2.8GHZ |
| 78 | +Please add your processing times plus configuration to help improve performance. |
| 79 | + |
| 80 | +HARD DISK REQUIREMENTS |
| 81 | +You will need at least 3x the size of the XML dump file in free space on your |
| 82 | +hard disk if you want to create the databases and datasets to run your own |
| 83 | +analyses. The English stub-meta-history.xml is about 15GB, so you need about |
| 84 | +45GB of free disk space. |
| 85 | + |
62 | 86 | CODE: |
63 | 87 | The Python code adheres to PEP8. Function names are deliberately expressive to |
64 | 88 | ease understanding what's going on. If you find a bug please email me at dvanliere |
Index: trunk/tools/editor_trends/construct_datasets.py |
— | — | @@ -40,9 +40,14 @@ |
41 | 41 | else: |
42 | 42 | mongo = db.init_mongo_db('editors') |
43 | 43 | editors = mongo['editors'] |
44 | | - ids = editors.find().distinct('editor') |
45 | | - print ids |
46 | | - if ids != []: |
| 44 | + contributors = set() |
| 45 | + #ids = editors.find().distinct('editor') |
| 46 | + ids = editors.find() |
| 47 | + for x, id in enumerate(ids): |
| 48 | + contributors.add(id['editor']) |
| 49 | + if len(contributors) % 25000 == 0: |
| 50 | + print x, len(contributors) |
| 51 | + if contributors != set(): |
47 | | - utils.store_object(ids, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
| 52 | + utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo) |
48 | | - return ids |
| 53 | + return contributors |
49 | 54 | |
— | — | @@ -60,21 +65,28 @@ |
61 | 66 | else: |
62 | 67 | id = input_queue.get(block=False) |
63 | 68 | |
64 | | - contributors = set() |
| 69 | + |
65 | 70 | if definition == 'Traditional': |
66 | | - obs = editors.find({'editor': id}).limit(limit) #.sort({'date': 1}).limit(limit) |
| 71 | + obs = editors.find({'editor': id}).sort('date').limit(limit) |
| 72 | + contributors = [] |
67 | 73 | for ob in obs: |
68 | | - contributors.add(ob) |
| 74 | + contributors.append(ob['date']) |
69 | 75 | else: |
70 | | - obs = editors.find({'editor': id}).sort({'date': 1}) |
| 76 | + obs = editors.find({'editor': id}).sort('date') |
| 77 | + contributors = set() |
71 | 78 | for ob in obs: |
72 | | - if len(dates) > limit: |
| 79 | + if len(contributors) == limit: |
73 | 80 | break |
74 | 81 | else: |
75 | | - if edit.date not in dates: |
76 | | - set.add(edit) |
77 | | - utils.write_data_to_csv(contributors, generate_editor_dataset, settings.ENCODING) |
| 82 | + contributors.add(ob['date']) |
78 | 83 | |
| 84 | + if len(contributors) < limit: |
| 85 | + new_wikipedian = False |
| 86 | + else: |
| 87 | + new_wikipedian = True |
| 88 | + data = {id: [contributors, new_wikipedian]} |
| 89 | + utils.write_data_to_csv(data, settings.DATASETS_FILE_LOCATION, generate_editor_dataset, settings.ENCODING) |
| 90 | + |
79 | 91 | except Empty: |
80 | 92 | break |
81 | 93 | |
— | — | @@ -146,7 +158,7 @@ |
147 | 159 | 'debug': True |
148 | 160 | } |
149 | 161 | generate_editor_dataset(input_queue, False, False, kwargs) |
150 | | - generate_editor_dataset_launcher() |
| 162 | + #generate_editor_dataset_launcher() |
151 | 163 | #retrieve_list_contributors() |
152 | 164 | #retrieve_edits_by_contributor() |
153 | 165 | |
— | — | @@ -156,4 +168,5 @@ |
157 | 169 | |
158 | 170 | |
159 | 171 | if __name__ == '__main__': |
160 | | - debug_retrieve_edits_by_contributor_launcher() |
| 172 | + generate_editor_dataset_launcher() |
| 173 | + #debug_retrieve_edits_by_contributor_launcher() |
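
In generate_editor_dataset() above, an editor is flagged new_wikipedian only once the number of collected edit dates reaches limit; since both branches cap the collection at limit, the four-line if/else is equivalent to a single boolean assignment:

    new_wikipedian = len(contributors) >= limit

The flag is written alongside the dates via the data = {id: [contributors, new_wikipedian]} shape that the reworked write_data_to_csv() in utils.py understands.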
Index: trunk/tools/editor_trends/database/db.py |
— | — | @@ -32,17 +32,17 @@ |
33 | 33 | collection.remove(ids) |
34 | 34 | |
35 | 35 | |
36 | | -def add_index_to_collection(db, collection, keys): |
| 36 | +def add_index_to_collection(db, collection, key): |
37 | 37 | ''' |
38 | 38 | @db is the name of the mongodb |
39 | 39 | @collection is the name of the 'table' in mongodb |
40 | | - @keys should be a list of keys used to create the index |
| 40 | + @key is the name of the field on which to create the index |
41 | 41 | ''' |
42 | 42 | |
43 | 43 | mongo = init_mongo_db(db) |
44 | 44 | collection = mongo[collection] |
45 | | - mongo.collection.create_index(keys) |
46 | | - mongo.collection.ensure_index(keys) |
| 45 | + collection.create_index(key) |
| 46 | + collection.ensure_index(key) |
47 | 47 | |
48 | 48 | |
49 | 49 | def init_database(db=None): |
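
With the single-key signature, callers loop over a list of field names, as both call sites in map_wiki_editors.py now do:

    keys = ['editor']
    for key in keys:
        db.add_index_to_collection('editors', 'editors', key)

Note that pymongo's ensure_index() only skips work when the index name is already in the driver's client-side cache, so calling it immediately after create_index() appears redundant; either call on its own would build the index.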
Property changes on: trunk/tools/editor_trends |
___________________________________________________________________ |
Modified: svn:ignore |
50 | 50 | - wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
51 | 51 | + wikistats |
zips |
notes.txt |
*.pyc |
datasets |
errors |
.settings |
.project |
.pydevproject |