Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import sys |
| 23 | +import subprocess |
23 | 24 | from argparse import ArgumentParser |
24 | 25 | from argparse import RawTextHelpFormatter |
25 | 26 | |
— | — | @@ -30,6 +31,7 @@ |
31 | 32 | from utils import utils |
32 | 33 | from utils import dump_downloader |
33 | 34 | import split_xml_file |
| 35 | +import map_wiki_editors |
34 | 36 | import config |
35 | 37 | |
36 | 38 | |
— | — | @@ -37,15 +39,29 @@ |
38 | 40 | return getattr(args, key, None) |
39 | 41 | |
40 | 42 | |
41 | | -def config_launcher(args): |
| 43 | +def config_launcher(args, location, filename, project, language_code): |
42 | 44 | config.load_configuration(args) |
43 | 45 | |
44 | 46 | |
| 47 | +def retrieve_projectname(args): |
| 48 | + language_code = retrieve_language(args) |
| 49 | + if language_code is None: |
| 50 | + print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language') |
| 51 | + sys.exit(-1) |
| 52 | + project = retrieve_project(args) |
| 53 | + |
| 54 | + if project is None: |
| 55 | + print 'Entered project: %s is not a valid Wikipedia project.' % get_value(args, 'project') |
| 56 | + sys.exit(-1) |
| 57 | + |
| 58 | + return '%s%s' % (language_code, project) |
| 59 | + |
45 | 60 | def retrieve_language(args): |
46 | 61 | language = get_value(args, 'language') |
47 | 62 | language = language.title() |
48 | | - return languages.MAPPING.get(language, None) |
| 63 | + return languages.MAPPING.get(language, None) |
49 | 64 | |
| 65 | + |
50 | 66 | def retrieve_project(args): |
51 | 67 | project = get_value(args, 'project') |
52 | 68 | if project != 'wiki': |
— | — | @@ -53,51 +69,65 @@ |
54 | 70 | return project |
55 | 71 | |
56 | 72 | |
57 | | -def dump_downloader_launcher(args): |
| 73 | +def generate_wikidump_filename(args): |
| 74 | + return '%s-%s-%s' % (retrieve_projectname(args), 'latest', get_value(args, 'file')) |
| 75 | + |
| 76 | + |
| 77 | +def determine_file_locations(args): |
| 78 | + locations = {} |
| 79 | + locations['language_code'] = retrieve_language(args) |
| 80 | + locations['location'] = os.path.join(get_value(args, 'location'), locations['language_code']) |
| 81 | + locations['project'] = retrieve_projectname(args) |
| 82 | + locations['filename'] = generate_wikidump_filename(args) |
| 83 | + return locations |
| 84 | + |
| 85 | + |
| 86 | +def dump_downloader_launcher(args, location, filename, project, language_code): |
58 | 87 | print 'dump downloader' |
59 | | - config.load_configuration(args) |
60 | | - language_code = retrieve_language(args) |
61 | | - if language_code == None: |
62 | | - print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language') |
63 | | - sys.exit(-1) |
64 | | - project = retrieve_project(args) |
65 | | - if project == None: |
66 | | - print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project') |
67 | | - sys.exit(-1) |
68 | | - location = os.path.join(get_value(args, 'location'), language_code) |
69 | | - project = language_code + project |
70 | | - filename = '%s-%s-%s' % (project, 'latest', get_value(args, 'file')) |
71 | 88 | pbar = get_value(args, 'progress') |
72 | | - |
73 | 89 | domain = settings.WP_DUMP_LOCATION |
74 | | - path = '/%s/latest/' % language_code |
75 | | - |
| 90 | + path = '/%s/latest/' % project |
76 | 91 | extension = utils.determine_file_extension(filename) |
77 | 92 | filemode = utils.determine_file_mode(extension) |
78 | 93 | |
79 | | - dump_downloader.download_wp_dump(domain, path, filename, location, filemode, pbar) |
| 94 | + dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar) |
80 | 95 | |
81 | 96 | |
82 | | -def split_xml_file_launcher(args): |
| 97 | +def split_xml_file_launcher(args, location, filename, project, language_code): |
83 | 98 | print 'split_xml_file_launcher' |
84 | | - dbname = create_dbname(args) |
85 | | - split_xml_file.split_xml(dbname) |
| 99 | + ext = utils.determine_file_extension(filename) |
| 100 | + if ext in settings.COMPRESSION_EXTENSIONS: |
| 101 | + ext = '.%s' % ext |
| 102 | + xml_file = filename.replace(ext, '') |
| 103 | + result = utils.check_file_exists(location, xml_file) |
| 104 | + if not result: |
| 105 | + retcode = extract_xml_file(args, location, filename) |
| 106 | + else: |
| 107 | + retcode = 0 |
| 108 | + if retcode != 0: |
| 109 | + sys.exit(retcode) |
| 110 | + split_xml_file.split_xml(location, xml_file, project, language_code) |
86 | 111 | |
87 | 112 | |
88 | | -def mongodb_script_launcher(args): |
| 113 | +def extract_xml_file(args, location, filename): |
| 114 | + path = config.detect_installed_program('7zip') |
| 115 | + |
| 116 | + source = os.path.join(location, filename) |
| 117 | + retcode = subprocess.call([os.path.join(path, '7z.exe'), 'e', '-o%s\\' % location, source]) |
| 118 | + return retcode |
| 119 | + |
| 120 | + |
| 121 | +def mongodb_script_launcher(args, location, filename, project, language_code): |
89 | 122 | print 'mongodb_script_launcher' |
90 | | - config.load_configuration(args) |
91 | | - dbname = create_dbname(args) |
92 | | - #map_wiki_editors.run_stand_alone(dbname) |
| 123 | + map_wiki_editors.run_parse_editors(project, language_code, location) |
93 | 124 | #print args |
94 | 125 | |
95 | 126 | |
96 | | -def all_launcher(args): |
| 127 | +def all_launcher(args, location, filename, project, language_code): |
97 | 128 | print 'all_launcher' |
98 | | - config_launcher(args) |
99 | | - dump_downloader_launcher(args) |
100 | | - split_xml_file_launcher(args) |
101 | | - mongodb_script_launcher(args) |
| 129 | + dump_downloader_launcher(args, location, filename, project, language_code) |
| 130 | + split_xml_file_launcher(args, location, filename, project, language_code) |
| 131 | + mongodb_script_launcher(args, location, filename, project, language_code) |
102 | 132 | |
103 | 133 | |
104 | 134 | def supported_languages(): |
— | — | @@ -106,8 +136,8 @@ |
107 | 137 | return tuple(choices) |
108 | 138 | |
109 | 139 | |
110 | | -def show_languages(args): |
111 | | - first = get_value(args, 'first') |
| 140 | +def show_languages(args, location, filename, project, language_code): |
| 141 | + first = get_value(args, 'startswith') |
112 | 142 | if first != None: |
113 | 143 | first = first.title() |
114 | 144 | choices = supported_languages() |
— | — | @@ -129,32 +159,21 @@ |
130 | 160 | 'pages-meta-current.xml.bz2') |
131 | 161 | |
132 | 162 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
| 163 | + #group = parser.add_mutually_exclusive_group() |
| 164 | + #group.add_argument('show_languages', action='store') |
| 165 | + #group.add_argument('language', action='store') |
133 | 166 | subparsers = parser.add_subparsers(help='sub-command help') |
134 | | - parser.add_argument('-p', '--progress', action='store_true', default=True, |
135 | | - help='Indicate whether you want to have a progressbar.') |
136 | 167 | |
137 | 168 | parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.') |
| 169 | + parser_languages.add_argument('-s', '--startswith', |
| 170 | + action='store', |
| 171 | + help='Enter the first letter of a language to see which languages are available.') |
138 | 172 | parser_languages.set_defaults(func=show_languages) |
139 | | - parser_languages.add_argument('-f', '--first', action='store', help='Enter the first letter of a language to see which languages are available.') |
140 | 173 | |
141 | 174 | parser_config = subparsers.add_parser('config', help='The config sub command allows you set the data location of where to store files.') |
142 | 175 | parser_config.set_defaults(func=config_launcher) |
143 | 176 | |
144 | 177 | parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.') |
145 | | - parser_download.add_argument('language', action='store', |
146 | | - help='Example of valid languages.', |
147 | | - choices=supported_languages(), |
148 | | - default='Russian') |
149 | | - parser_download.add_argument('-p', '--project', action='store', help='Specify the Wikimedia project that you would like to download', |
150 | | - choices=settings.WIKIMEDIA_PROJECTS.keys(), |
151 | | - default='wiki') |
152 | | - parser_download.add_argument('-l', '--location', action='store', |
153 | | - help='Indicate where you want to store the downloaded file.', |
154 | | - default=settings.XML_FILE_LOCATION) |
155 | | - parser_download.add_argument('-f', '--file', action='store', |
156 | | - choices=file_choices, |
157 | | - help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]), |
158 | | - default='stub-meta-current.xml.gz') |
159 | 178 | parser_download.set_defaults(func=dump_downloader_launcher) |
160 | 179 | |
161 | 180 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
— | — | @@ -166,8 +185,32 @@ |
167 | 186 | parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
168 | 187 | parser_all.set_defaults(func=all_launcher) |
169 | 188 | |
| 189 | + parser.add_argument('-l', '--language', action='store', |
| 190 | + help='The language of the Wikipedia project to analyze.', |
| 191 | + choices=supported_languages(), |
| 192 | + default='Russian') |
| 193 | + |
| 194 | + parser.add_argument('-p', '--project', action='store', |
| 195 | + help='Specify the Wikimedia project that you would like to download', |
| 196 | + choices=settings.WIKIMEDIA_PROJECTS.keys(), |
| 197 | + default='wiki') |
| 198 | + |
| 199 | + parser.add_argument('-o', '--location', action='store', |
| 200 | + help='Indicate where you want to store the downloaded file.', |
| 201 | + default=settings.XML_FILE_LOCATION) |
| 202 | + |
| 203 | + parser.add_argument('-f', '--file', action='store', |
| 204 | + choices=file_choices, |
| 205 | + help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]), |
| 206 | + default='stub-meta-current.xml.gz') |
| 207 | + |
| 208 | + parser.add_argument('-prog', '--progress', action='store_true', default=True, |
| 209 | + help='Indicate whether you want to have a progressbar.') |
| 210 | + |
170 | 211 | args = parser.parse_args() |
171 | | - args.func(args) |
| 212 | + config.load_configuration(args) |
| 213 | + locations = determine_file_locations(args) |
| 214 | + args.func(args, **locations) |
172 | 215 | |
173 | 216 | |
174 | 217 | if __name__ == '__main__': |
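Note: the revision above moves all shared options to the top-level parser and gives every launcher the same signature, so main() can dispatch uniformly via args.func(args, **locations). A minimal sketch of that pattern, with illustrative option names and location values:

    from argparse import ArgumentParser

    def download(args, location, filename, project, language_code):
        # every launcher shares this signature, so dispatch is uniform
        print 'would download %s into %s' % (filename, location)

    parser = ArgumentParser(prog='manage')
    subparsers = parser.add_subparsers(help='sub-command help')
    parser_download = subparsers.add_parser('download')
    parser_download.set_defaults(func=download)
    # shared options live on the top-level parser so every launcher sees them
    parser.add_argument('-l', '--language', default='Russian')

    args = parser.parse_args(['download'])
    locations = {'location': '/tmp', 'filename': 'ruwiki-latest-stub-meta-current.xml.gz',
                 'project': 'ruwiki', 'language_code': 'ru'}
    args.func(args, **locations)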
Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -257,18 +257,18 @@ |
258 | 258 | return ids |
259 | 259 | |
260 | 260 | |
261 | | -def run_parse_editors(dbname, language): |
| 261 | +def run_parse_editors(dbname, language, location): |
262 | 262 | ids = load_bot_ids() |
263 | 263 | kwargs = {'bots': ids, |
264 | 264 | 'dbname': dbname, |
265 | 265 | 'pbar': True, |
266 | | - 'nr_input_processors': 1, |
267 | | - 'nr_output_processors': 1, |
| 266 | + 'nr_input_processors': 2, |
| 267 | + 'nr_output_processors': 2, |
268 | 268 | 'language': language, |
269 | 269 | } |
270 | 270 | chunks = {} |
271 | | - file_location = os.path.join(settings.XML_FILE_LOCATION, language) |
272 | | - files = utils.retrieve_file_list(file_location, 'xml') |
| 271 | + #file_location = os.path.join(settings.XML_FILE_LOCATION, language) |
| 272 | + files = utils.retrieve_file_list(location, 'xml') |
273 | 273 | parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0)) |
274 | 274 | a = 0 |
275 | 275 | for x in xrange(settings.NUMBER_OF_PROCESSES): |
— | — | @@ -276,14 +276,11 @@ |
277 | 277 | chunks[x] = files[a:b] |
278 | 278 | a = (x + 1) * parts |
279 | 279 | |
| 280 | + pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs) |
280 | 281 | |
281 | | - for x in xrange(settings.NUMBER_OF_PROCESSES): |
282 | | - pc.build_scaffolding(pc.load_queue, parse_editors, chunks[x], store_editors, True, **kwargs) |
283 | 282 | |
284 | | - |
285 | 283 | def debug_parse_editors(dbname): |
286 | 284 | q = JoinableQueue() |
287 | | - #edits = db.init_mongo_db('editors') |
288 | 285 | parse_editors('en\\522.xml', q, None, None, True) |
289 | 286 | store_editors(q, [], dbname) |
290 | 287 | |
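Note: run_parse_editors splits the XML file list evenly over settings.NUMBER_OF_PROCESSES before handing the chunks dict to build_scaffolding. A self-contained sketch of the same partitioning; the computation of b falls outside this hunk, so the tail-chunk guard below is an assumption:

    NUMBER_OF_PROCESSES = 4  # stand-in for settings.NUMBER_OF_PROCESSES
    files = ['%s.xml' % i for i in xrange(10)]
    parts = int(round(float(len(files)) / NUMBER_OF_PROCESSES, 0))
    chunks = {}
    a = 0
    for x in xrange(NUMBER_OF_PROCESSES):
        # give the last chunk whatever remains so no file is dropped
        b = a + parts if x < NUMBER_OF_PROCESSES - 1 else len(files)
        chunks[x] = files[a:b]
        a = (x + 1) * parts
    print chunks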
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -101,7 +101,12 @@ |
102 | 102 | # Name space, do not change as this works for Mediawiki wikis |
103 | 103 | NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/' |
104 | 104 | |
| 105 | +WINDOWS_REGISTER = {'7zip': 'Software\\7-Zip', |
| 106 | + } |
105 | 107 | |
| 108 | +COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z'] |
| 109 | + |
| 110 | + |
106 | 111 | WIKIMEDIA_PROJECTS = {'commons': 'commonswiki', |
107 | 112 | 'wikibooks': 'wikibooks', |
108 | 113 | 'wikinews': 'wikinews', |
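Note: the new COMPRESSION_EXTENSIONS constant is consumed by the split launcher in manage.py to derive the name of the extracted file from the archive name. A minimal sketch of that normalization; the helper name is hypothetical:

    COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z']

    def strip_compression_suffix(filename):
        # 'enwiki-latest-stub-meta-current.xml.gz' -> '...-current.xml'
        for ext in COMPRESSION_EXTENSIONS:
            if filename.endswith('.%s' % ext):
                return filename[:-(len(ext) + 1)]
        return filename

    print strip_compression_suffix('enwiki-latest-stub-meta-current.xml.gz')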
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -20,29 +20,48 @@ |
21 | 21 | |
22 | 22 | import os |
23 | 23 | import ConfigParser |
| 24 | +from _winreg import * |
24 | 25 | |
25 | 26 | import settings |
26 | 27 | from utils import utils |
27 | 28 | |
28 | 29 | |
| 30 | +def detect_windows_program(program): |
| 31 | + entry = settings.WINDOWS_REGISTER[program] |
| 32 | + try: |
| 33 | + key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
| 34 | + return QueryValueEx(key, 'Path')[0] |
| 35 | + except WindowsError: |
| 36 | + return None |
| 37 | + |
| 38 | + |
| 39 | +def detect_installed_program(program): |
| 40 | + platform = settings.OS |
| 41 | + if platform == 'Windows': |
| 42 | + path = detect_windows_program(program) |
| 43 | + return path |
| 44 | + else: |
| 45 | + raise NotImplementedError |
| 46 | + |
| 47 | + |
29 | 48 | def load_configuration(args): |
30 | 49 | config = ConfigParser.RawConfigParser() |
31 | 50 | if not utils.check_file_exists(settings.WORKING_DIRECTORY, 'wiki.cfg'): |
32 | 51 | working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd()) |
33 | 52 | if working_directory == '': |
34 | 53 | working_directory = os.getcwd() |
35 | | - |
| 54 | + |
36 | 55 | xml_file_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.XML_FILE_LOCATION) |
37 | 56 | if xml_file_location == '': |
38 | 57 | xml_file_location = settings.XML_FILE_LOCATION |
39 | | - |
| 58 | + |
40 | 59 | create_configuration(WORKING_DIRECTORY=working_directory, XML_FILE_LOCATION=xml_file_location) |
41 | 60 | |
42 | 61 | config.read('wiki.cfg') |
43 | 62 | settings.WORKING_DIRECTORY = config.get('file_locations', 'WORKING_DIRECTORY') |
44 | 63 | settings.XML_FILE_LOCATION = config.get('file_locations', 'XML_FILE_LOCATION') |
45 | | - |
46 | | - |
| 64 | + |
| 65 | + |
47 | 66 | def create_configuration(**kwargs): |
48 | 67 | working_directory = kwargs.get('WORKING_DIRECTORY', settings.WORKING_DIRECTORY) |
49 | 68 | config = ConfigParser.RawConfigParser() |
— | — | @@ -56,6 +75,8 @@ |
57 | 76 | |
58 | 77 | |
59 | 78 | if __name__ == '__main__': |
60 | | - load_configuration([]) |
| 79 | + p = detect_windows_program('7zip') |
| 80 | + print p |
| 81 | + #load_configuration([]) |
61 | 82 | |
62 | 83 | |
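Note: a hedged usage sketch tying the new registry-based detection helpers to the 7z invocation in manage.py; the paths and dump filename are illustrative. On non-Windows platforms detect_installed_program raises NotImplementedError, so this only applies to Windows:

    import os
    import subprocess

    import config  # assumes this module is importable from the working directory

    path = config.detect_installed_program('7zip')  # e.g. u'C:\\Program Files\\7-Zip\\'
    if path is None:
        print '7zip does not appear to be installed.'
    else:
        location = 'c:\\Source_files'
        source = os.path.join(location, 'enwiki-latest-stub-meta-current.xml.gz')
        retcode = subprocess.call([os.path.join(path, '7z.exe'), 'e',
                                   '-o%s\\' % location, source])
        print '7z exit code: %s' % retcode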
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -97,9 +97,9 @@ |
98 | 98 | pbar.update(pbar.currval + chunk) |
99 | 99 | |
100 | 100 | except urllib2.URLError, error: |
101 | | - print 'Reason: %s' % error.reason |
| 101 | + print 'Reason: %s' % error |
102 | 102 | except urllib2.HTTPError, error: |
103 | | - print 'Error: %s' % error.code |
| 103 | + print 'Error: %s' % error |
104 | 104 | finally: |
105 | 105 | fh.close() |
106 | 106 | |
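Note: printing the exception object is the safer choice here, since both error classes stringify to a readable message while URLError has no .code attribute. Because HTTPError is a subclass of URLError, it should ideally be caught first; the surrounding ordering leaves the HTTPError clause unreachable. A minimal illustration with the handlers in subclass-first order:

    import urllib2

    try:
        urllib2.urlopen('http://dumps.wikimedia.org/no-such-path/')
    except urllib2.HTTPError, error:
        print 'Error: %s' % error   # e.g. 'HTTP Error 404: Not Found'
    except urllib2.URLError, error:
        print 'Reason: %s' % error  # DNS or connection failures end up here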
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -55,20 +55,25 @@ |
56 | 56 | |
57 | 57 | nr_input_processors = kwargs.pop('nr_input_processors') |
58 | 58 | nr_output_processors = kwargs.pop('nr_output_processors') |
59 | | - |
| 59 | + input_queues = {} |
| 60 | + result_queues = {} |
| 61 | + assert len(obj) == nr_input_processors |
60 | 62 | if result_queue: |
61 | | - result_queue = JoinableQueue() |
| 63 | + assert len(obj) == nr_output_processors |
62 | 64 | |
63 | | - input_queue = load_input_queue(obj, poison_pill=True) |
| 65 | + for i, o in enumerate(obj): |
| 66 | + input_queues[i] = load_input_queue(obj[o], poison_pill=True) |
| 67 | + if result_queue: |
| 68 | + result_queues[i] = JoinableQueue() |
64 | 69 | |
65 | 70 | if settings.PROGRESS_BAR: |
66 | | - pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start() |
| 71 | + size = sum([input_queues[q].qsize() for q in input_queues]) |
| 72 | + pbar = progressbar.ProgressBar(maxval=size).start() |
67 | 73 | kwargs['pbar'] = pbar |
68 | 74 | else: |
69 | 75 | pbar = False |
70 | | - |
71 | | - |
72 | | - input_processes = [models.ProcessInputQueue(main, input_queue, result_queue, |
| 76 | + |
| 77 | + input_processes = [models.ProcessInputQueue(main, input_queues[i], result_queues.get(i), |
73 | 78 | **kwargs) for i in xrange(nr_input_processors)] |
74 | 79 | |
75 | 80 | for input_process in input_processes: |
— | — | @@ -78,7 +83,7 @@ |
79 | 84 | |
80 | 85 | if result_queue: |
81 | 86 | result_processes = [models.ProcessResultQueue(result_processor, |
82 | | - result_queue, **kwargs) for i in xrange(nr_output_processors)] |
| 87 | + result_queues[i], **kwargs) for i in xrange(nr_output_processors)] |
83 | 88 | for result_process in result_processes: |
84 | 89 | result_process.start() |
85 | 90 | |
— | — | @@ -115,6 +120,5 @@ |
116 | 121 | input_queue.put(d) |
117 | 122 | |
118 | 123 | if poison_pill: |
119 | | - for p in xrange(settings.NUMBER_OF_PROCESSES): |
120 | | - input_queue.put(None) |
| 124 | + input_queue.put(None) |
121 | 125 | return input_queue |
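Note: the rework gives every input process its own queue (and optional result queue) with a single poison pill each, instead of one shared queue carrying NUMBER_OF_PROCESSES pills. A condensed, self-contained sketch of that layout; the worker body is illustrative:

    from multiprocessing import JoinableQueue, Process

    def worker(q):
        while True:
            task = q.get()
            q.task_done()
            if task is None:  # one pill per dedicated queue is now enough
                break
            print 'processing %s' % task

    if __name__ == '__main__':
        chunks = {0: ['0.xml', '1.xml'], 1: ['2.xml']}
        queues = {}
        for i in chunks:
            queues[i] = JoinableQueue()
            for item in chunks[i]:
                queues[i].put(item)
            queues[i].put(None)  # poison pill terminates this worker

        processes = [Process(target=worker, args=(queues[i],)) for i in queues]
        for p in processes:
            p.start()
        for p in processes:
            p.join()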
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -82,7 +82,7 @@ |
83 | 83 | ns = [] |
84 | 84 | for namespace in namespaces: |
85 | 85 | value = namespaces[namespace].get(u'*', None) |
86 | | - if value != None and value != '' and not value.endswith('talk'): |
| 86 | + if value is not None and value != '': |
87 | 87 | ns.append(value) |
88 | 88 | return ns |
89 | 89 | |
— | — | @@ -122,9 +122,10 @@ |
123 | 123 | |
124 | 124 | def create_xml_file_handle(fh, counter, size, language): |
125 | 125 | '''Create file handle if none is supplied or if file size > max file size.''' |
| 126 | + if counter is None: |
| 127 | + counter = 0 |
126 | 128 | path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter) |
127 | 129 | if not fh: |
128 | | - counter = 0 |
129 | 130 | fh = codecs.open(path, 'w', encoding=settings.ENCODING) |
130 | 131 | return fh, counter |
131 | 132 | elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
— | — | @@ -137,39 +138,49 @@ |
138 | 139 | return fh, counter |
139 | 140 | |
140 | 141 | |
141 | | -def split_xml(language): |
| 142 | +def split_xml(location, filename, project, language_code): |
142 | 143 | '''Reads xml file and splits it in N chunks''' |
143 | | - location = os.path.join(settings.XML_FILE_LOCATION, language) |
| 144 | + #location = os.path.join(settings.XML_FILE_LOCATION, language) |
144 | 145 | result = utils.check_file_exists(location, '') |
145 | 146 | if result == False: |
146 | 147 | result = utils.create_directory(location) |
147 | 148 | if not result: |
148 | 149 | return |
149 | 150 | |
150 | | - ns = load_namespace(language) |
| 151 | + ns = load_namespace(language_code) |
151 | 152 | ns = build_namespaces_locale(ns) |
152 | 153 | |
153 | 154 | fh = None |
154 | 155 | counter = None |
| 156 | + source = os.path.join(location, filename) |
155 | 157 | tag = '{%s}page' % settings.NAME_SPACE |
156 | 158 | |
157 | | - context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end')) |
| 159 | + context = cElementTree.iterparse(source, events=('start', 'end')) |
158 | 160 | context = iter(context) |
159 | 161 | event, root = context.next() #get the root element of the XML doc |
160 | 162 | |
161 | | - for event, elem in context: |
162 | | - if event == 'end': |
163 | | - if elem.tag == tag: |
164 | | - elem = remove_namespace(elem, settings.NAME_SPACE) |
165 | | - elem = parse_comments(elem, remove_numeric_character_references) |
| 163 | + try: |
| 164 | + for event, elem in context: |
| 165 | + if event == 'end': |
| 166 | + if elem.tag == tag: |
| 167 | + elem = remove_namespace(elem, settings.NAME_SPACE) |
| 168 | + if is_article_main_namespace(elem, ns): |
| 169 | + elem = parse_comments(elem, remove_numeric_character_references) |
| 170 | + fh, counter = write_xml_file(elem, fh, counter, language_code) |
| 171 | + root.clear() # when done parsing a section, clear the tree to save memory |
| 172 | + #elem = parse_comments(elem, convert_html_entities) |
| 173 | + #elem = parse_comments(elem, remove_ascii_control_characters) |
| 174 | + #print cElementTree.tostring(elem) |
| 175 | + except SyntaxError: |
| 176 | + fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) |
| 177 | + fh.write(cElementTree.tostring(elem)) |
| 178 | + fh.close() |
166 | 179 | |
167 | | - if is_article_main_namespace(elem, ns): |
168 | | - fh, counter = write_xml_file(elem, fh, counter, language) |
169 | | - root.clear() # when done parsing a section clear the tree to safe memory |
170 | | - #elem = parse_comments(elem, convert_html_entities) |
171 | | - #elem = parse_comments(elem, remove_ascii_control_characters) |
172 | | - #print cElementTree.tostring(elem) |
173 | 180 | |
174 | | - |
175 | 181 | if __name__ == "__main__": |
176 | | - split_xml('en') |
| 182 | + kwargs = {'location': 'c:\\Source_files\\', |
| 183 | + 'filename': settings.XML_FILE, |
| 184 | + 'project': 'wiki', |
| 185 | + 'language_code': 'en' |
| 186 | + } |
| 187 | + split_xml(**kwargs) |
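Note: the splitter's memory behaviour hinges on clearing the root after each completed page element, and the new try/except keeps a malformed dump from killing the whole run. A minimal, self-contained illustration of the iterparse pattern; the tag name and document are illustrative:

    from xml.etree import cElementTree
    from StringIO import StringIO

    xml = StringIO('<mediawiki><page><title>A</title></page>'
                   '<page><title>B</title></page></mediawiki>')
    context = iter(cElementTree.iterparse(xml, events=('start', 'end')))
    event, root = context.next()  # grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag == 'page':
            print elem.find('title').text
            root.clear()  # drop processed children to keep memory flat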
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -20,109 +20,113 @@ |
21 | 21 | |
22 | 22 | ''' |
23 | 23 | This module provides a simple caching mechanism to speed-up the process of |
24 | | -inserting records to MongoDB. The caching bject works as follows: |
25 | | -1) Each edit from an author is added to a dictionary |
26 | | -2) Every 50000 edits, the object returns %x with the most edits, and these are |
27 | | -then stored in MongoDB. By packaging multiple edits in a single commit, |
28 | | -processing time is significantly reduced. |
| 24 | +inserting records to MongoDB. The caching object works as follows: |
| 25 | +1) Each edit from an author is added to a dictionary |
| 26 | +2) Once an author accumulates a threshold number of edits, the author is |
| 27 | +queued for flushing, and the queued edits are then stored in MongoDB. By |
| 28 | +packaging multiple edits in a single commit, processing time is significantly reduced. |
29 | 29 | |
30 | 30 | This caching mechanism does not create any benefits for authors with single or |
31 | | -very few edits. |
32 | | - |
| 31 | +very few edits. |
33 | 32 | ''' |
34 | 33 | |
35 | 34 | |
36 | 35 | import sys |
37 | 36 | import datetime |
| 37 | +import random |
38 | 38 | |
39 | 39 | import settings |
40 | 40 | import db |
| 41 | +from utils import utils |
41 | 42 | |
42 | 43 | |
43 | 44 | class EditorCache(object): |
44 | 45 | def __init__(self, collection): |
45 | 46 | self.collection = collection |
46 | 47 | self.editors = {} |
47 | | - self.size = self.__sizeof__() |
48 | 48 | self.cumulative_n = 0 |
| 49 | + self.init_time = datetime.datetime.now() |
49 | 50 | self.time_started = datetime.datetime.now() |
50 | | - self.n = self.current_cache_size() |
| 51 | + self.n = 0 |
51 | 52 | self.emptied = 1 |
| 53 | + self.number_editors = 0 |
| 54 | + self.threshold_editors = set() |
| 55 | + self.threshold = 10 |
52 | 56 | |
53 | | - |
54 | 57 | def __repr__(self): |
55 | | - pass |
| 58 | + return '%s_%s' % ('editor_cache', random.randint(0, 99999)) |
56 | 59 | |
57 | | - |
58 | 60 | def _store_editor(self, key, value): |
59 | 61 | editor = self.collection.insert({'editor': key, 'edits': {}}) |
60 | 62 | self.editors[key]['id'] = str(editor) |
61 | 63 | |
62 | | - |
63 | 64 | def current_cache_size(self): |
64 | 65 | return sum([self.editors[k].get('obs', 0) for k in self.editors]) |
65 | 66 | |
66 | | - |
67 | 67 | def add(self, key, value): |
68 | | - self.cumulative_n += 1 |
69 | | - if key not in self.editors: |
70 | | - self.editors[key] = {} |
71 | | - self.editors[key]['obs'] = 0 |
72 | | - self.editors[key]['edits'] = [] |
73 | | - |
| 68 | + if key == 'NEXT': |
| 69 | + for editor in self.threshold_editors: |
| 70 | + self.update(editor, self.editors[editor]['edits']) |
| 71 | + self.n -= self.editors[editor]['obs'] |
| 72 | + self.number_editors -= 1 |
| 73 | + del self.editors[editor] |
| 74 | + self.threshold_editors = set() |
74 | 75 | else: |
| 76 | + self.cumulative_n += 1 |
| 77 | + self.n += 1 |
| 78 | + if key not in self.editors: |
| 79 | + self.editors[key] = {} |
| 80 | + self.editors[key]['obs'] = 0 |
| 81 | + self.editors[key]['edits'] = [] |
| 82 | + self.number_editors += 1 |
| 83 | + |
75 | 84 | id = str(self.editors[key]['obs']) |
76 | 85 | self.editors[key]['edits'].append(value) |
77 | 86 | self.editors[key]['obs'] += 1 |
78 | 87 | |
| 88 | + if self.editors[key]['obs'] == self.threshold: |
| 89 | + self.threshold_editors.add(key) |
| 90 | +# self.update(key, self.editors[key]['edits']) |
| 91 | +# del self.editors[key] |
| 92 | +# self.n -= 10 |
| 93 | +# self.number_editors -= 1 |
79 | 94 | |
80 | | - if self.cumulative_n % 25000 == 0: |
81 | | - self.empty_all(5.0) |
| 95 | + def update(self, editor, values): |
| 96 | + #t = datetime.datetime.now() |
| 97 | + self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
| 98 | + #print 'It took %s to store editor %s;and the cache contains %s editors and %s items' % (datetime.datetime.now() - t, editor, self.number_editors, self.n) |
82 | 99 | |
| 100 | + def quick_sort(self, obs): |
| 101 | + if obs == []: |
| 102 | + return [] |
| 103 | + else: |
| 104 | + pivot = obs[0] |
| 105 | + lesser = self.quick_sort([x for x in obs[1:] if x < pivot]) |
| 106 | + greater = self.quick_sort([x for x in obs[1:] if x >= pivot]) |
| 107 | + return lesser + [pivot] + greater |
83 | 108 | |
84 | | - def retrieve_top_k_editors(self, percentage): |
85 | | - keys = self.editors.keys() |
86 | | - obs = [] |
87 | | - for k in keys: |
88 | | - weight = float(self.editors[k].get('obs', 0)) / self.n |
89 | | - obs.append((weight, k)) |
90 | | - obs.sort() |
91 | | - obs.reverse() |
92 | | - l = int((len(obs) / 100.0) * percentage) |
93 | | - if l == 0: |
94 | | - l = 1 |
95 | | - obs = obs[:l] |
96 | | - obs = [o[1] for o in obs] |
97 | | - return obs |
| 109 | + def store(self): |
| 110 | + utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__()) |
98 | 111 | |
| 112 | + def drop_n_observations(self, n=1): |
| 113 | + editors_to_remove = set() |
| 114 | + for editor in self.editors: |
| 115 | + if self.editors[editor]['obs'] <= n: |
| 116 | + editors_to_remove.add(editor) |
99 | 117 | |
100 | | - def update(self, editor, values): |
101 | | - self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
| 118 | + for editor in editors_to_remove: |
| 119 | + del self.editors[editor] |
102 | 120 | |
103 | 121 | |
104 | | - def empty_all(self, percentage): |
105 | | - self.n = self.current_cache_size() |
106 | | - if percentage < 100.0: |
107 | | - keys = self.retrieve_top_k_editors(percentage) |
108 | | - else: |
109 | | - keys = self.editors.keys() |
110 | | - print 'Emptying cache %s time' % self.emptied |
111 | | - self.emptied += 1 |
112 | | - for key in keys: |
113 | | - if self.editors[key]['edits'] != {}: |
114 | | - self.update(key, self.editors[key]['edits']) |
115 | | - self.editors[key]['edits'] = [] |
116 | | - self.editors[key]['obs'] = 0.0 |
117 | | - |
118 | | - |
119 | 122 | def debug(): |
120 | 123 | mongo = db.init_mongo_db('test') |
121 | 124 | collection = mongo['test'] |
122 | | - cache = EditorCache(collection) |
| 125 | + cache = EditorCache(collection) |
123 | 126 | import random |
124 | 127 | for i in xrange(100000): |
125 | 128 | cache.add(str(random.randrange(0, 5)), {'date': 'woensaag', 'article': '3252'}) |
126 | | - cache.empty_all(100) |
| 129 | + cache.add('NEXT', '') |
| 130 | + print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - cache.init_time, cache.cumulative_n) |
127 | 131 | |
128 | 132 | |
129 | 133 | if __name__ == '__main__': |
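Note: a hedged usage sketch of the reworked cache. Edits accumulate per editor; an editor that reaches the threshold is queued, and the sentinel key 'NEXT' flushes the queued editors in one batch. The MongoDB setup mirrors debug() above, and the import path is an assumption:

    import db  # assumes the same module used by debug()
    from database.cache import EditorCache

    mongo = db.init_mongo_db('test')
    cache = EditorCache(mongo['test'])
    for i in xrange(25):
        # the same editor crosses the threshold of 10 along the way
        cache.add('editor_1', {'date': 'today', 'article': '42'})
    cache.add('NEXT', '')  # flush every editor queued at the threshold
    print 'editors still cached: %s' % cache.number_editors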