r76048 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r76047 | r76048 | r76049 >
Date:21:57, 4 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Various bugfixes, preparing for 1st release.
Modified paths:
  • /trunk/tools/editor_trends/config.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (modified) (history)
  • /trunk/tools/editor_trends/utils/dump_downloader.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -19,6 +19,7 @@
2020
2121 import os
2222 import sys
 23+import subprocess
2324 from argparse import ArgumentParser
2425 from argparse import RawTextHelpFormatter
2526
@@ -30,6 +31,7 @@
3132 from utils import utils
3233 from utils import dump_downloader
3334 import split_xml_file
 35+import map_wiki_editors
3436 import config
3537
3638
@@ -37,15 +39,29 @@
3840 return getattr(args, key, None)
3941
4042
41 -def config_launcher(args):
 43+def config_launcher(args, location, filename, project, language_code):
4244 config.load_configuration(args)
4345
4446
 47+def retrieve_projectname(args):
 48+ language_code = retrieve_language(args)
 49+ if language_code == None:
 50+ print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language')
 51+ sys.exit(-1)
 52+ project = retrieve_project(args)
 53+
 54+ if project == None:
 55+ print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project')
 56+ sys.exit(-1)
 57+
 58+ return '%s%s' % (language_code, project)
 59+
4560 def retrieve_language(args):
4661 language = get_value(args, 'language')
4762 language = language.title()
48 - return languages.MAPPING.get(language, None)
 63+ return languages.MAPPING.get(language, None)
4964
 65+
5066 def retrieve_project(args):
5167 project = get_value(args, 'project')
5268 if project != 'wiki':
@@ -53,51 +69,65 @@
5470 return project
5571
5672
57 -def dump_downloader_launcher(args):
 73+def generate_wikidump_filename(args):
 74+ return '%s-%s-%s' % (retrieve_projectname(args), 'latest', get_value(args, 'file'))
 75+
 76+
 77+def determine_file_locations(args):
 78+ locations = {}
 79+ locations['language_code'] = retrieve_language(args)
 80+ locations['location'] = os.path.join(get_value(args, 'location'), retrieve_language(args))
 81+ locations['project'] = retrieve_projectname(args)
 82+ locations['filename'] = generate_wikidump_filename(args)
 83+ return locations
 84+
 85+
 86+def dump_downloader_launcher(args, location, filename, project, language_code):
5887 print 'dump downloader'
59 - config.load_configuration(args)
60 - language_code = retrieve_language(args)
61 - if language_code == None:
62 - print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language')
63 - sys.exit(-1)
64 - project = retrieve_project(args)
65 - if project == None:
66 - print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project')
67 - sys.exit(-1)
68 - location = os.path.join(get_value(args, 'location'), language_code)
69 - project = language_code + project
70 - filename = '%s-%s-%s' % (project, 'latest', get_value(args, 'file'))
7188 pbar = get_value(args, 'progress')
72 -
7389 domain = settings.WP_DUMP_LOCATION
74 - path = '/%s/latest/' % language_code
75 -
 90+ path = '/%s/latest/' % project
7691 extension = utils.determine_file_extension(filename)
7792 filemode = utils.determine_file_mode(extension)
7893
79 - dump_downloader.download_wp_dump(domain, path, filename, location, filemode, pbar)
 94+ dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar)
8095
8196
82 -def split_xml_file_launcher(args):
 97+def split_xml_file_launcher(args, location, filename, project, language_code):
8398 print 'split_xml_file_launcher'
84 - dbname = create_dbname(args)
85 - split_xml_file.split_xml(dbname)
 99+ ext = utils.determine_file_extension(filename)
 100+ if ext in settings.COMPRESSION_EXTENSIONS:
 101+ ext = '.%s' % ext
 102+ file = filename.replace(ext, '')
 103+ result = utils.check_file_exists(location, file)
 104+ if not result:
 105+ retcode = extract_xml_file(args, location, filename)
 106+ else:
 107+ retcode = 0
 108+ if retcode != 0:
 109+ sys.exit(retcode)
 110+ split_xml_file.split_xml(location, file, project, language_code)
86111
87112
88 -def mongodb_script_launcher(args):
 113+def extract_xml_file(args, location, file):
 114+ path = config.detect_installed_program('7zip')
 115+
 116+ source = os.path.join(location, file)
 117+ retcode = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)])
 118+ return retcode
 119+
 120+
 121+def mongodb_script_launcher(args, location, filename, project, language_code):
89122 print 'mongodb_script_launcher'
90 - config.load_configuration(args)
91 - dbname = create_dbname(args)
92 - #map_wiki_editors.run_stand_alone(dbname)
 123+ map_wiki_editors.run_parse_editors(project, language_code, location)
93124 #print args
94125
95126
96 -def all_launcher(args):
 127+def all_launcher(args, location, filename, project, language_code):
97128 print 'all_launcher'
98 - config_launcher(args)
99 - dump_downloader_launcher(args)
100 - split_xml_file_launcher(args)
101 - mongodb_script_launcher(args)
 129+ dump_downloader_launcher(args, location, filename, project, language_code)
 130+ split_xml_file_launcher(args, location, filename, project, language_code)
 131+ mongodb_script_launcher(args, location, filename, project, language_code)
102132
103133
104134 def supported_languages():
@@ -106,8 +136,8 @@
107137 return tuple(choices)
108138
109139
110 -def show_languages(args):
111 - first = get_value(args, 'first')
 140+def show_languages(args, location, filename, project, language_code):
 141+ first = get_value(args, 'startswith')
112142 if first != None:
113143 first = first.title()
114144 choices = supported_languages()
@@ -129,32 +159,21 @@
130160 'pages-meta-current.xml.bz2')
131161
132162 parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
 163+ #group = parser.add_mutually_exclusive_group()
 164+ #group.add_argument('show_languages', action='store')
 165+ #group.add_argument('language', action='store')
133166 subparsers = parser.add_subparsers(help='sub-command help')
134 - parser.add_argument('-p', '--progress', action='store_true', default=True,
135 - help='Indicate whether you want to have a progressbar.')
136167
137168 parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.')
 169+ parser_languages.add_argument('-s', '--startswith',
 170+ action='store',
 171+ help='Enter the first letter of a language to see which languages are available.')
138172 parser_languages.set_defaults(func=show_languages)
139 - parser_languages.add_argument('-f', '--first', action='store', help='Enter the first letter of a language to see which languages are available.')
140173
141174 parser_config = subparsers.add_parser('config', help='The config sub command allows you set the data location of where to store files.')
142175 parser_config.set_defaults(func=config_launcher)
143176
144177 parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.')
145 - parser_download.add_argument('language', action='store',
146 - help='Example of valid languages.',
147 - choices=supported_languages(),
148 - default='Russian')
149 - parser_download.add_argument('-p', '--project', action='store', help='Specify the Wikimedia project that you would like to download',
150 - choices=settings.WIKIMEDIA_PROJECTS.keys(),
151 - default='wiki')
152 - parser_download.add_argument('-l', '--location', action='store',
153 - help='Indicate where you want to store the downloaded file.',
154 - default=settings.XML_FILE_LOCATION)
155 - parser_download.add_argument('-f', '--file', action='store',
156 - choices=file_choices,
157 - help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
158 - default='stub-meta-current.xml.gz')
159178 parser_download.set_defaults(func=dump_downloader_launcher)
160179
161180 parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.')
@@ -166,8 +185,32 @@
167186 parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.')
168187 parser_all.set_defaults(func=all_launcher)
169188
 189+ parser.add_argument('-l', '--language', action='store',
 190+ help='Example of valid languages.',
 191+ choices=supported_languages(),
 192+ default='Russian')
 193+
 194+ parser.add_argument('-p', '--project', action='store',
 195+ help='Specify the Wikimedia project that you would like to download',
 196+ choices=settings.WIKIMEDIA_PROJECTS.keys(),
 197+ default='wiki')
 198+
 199+ parser.add_argument('-o', '--location', action='store',
 200+ help='Indicate where you want to store the downloaded file.',
 201+ default=settings.XML_FILE_LOCATION)
 202+
 203+ parser.add_argument('-f', '--file', action='store',
 204+ choices=file_choices,
 205+ help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]),
 206+ default='stub-meta-current.xml.gz')
 207+
 208+ parser.add_argument('-prog', '--progress', action='store_true', default=True,
 209+ help='Indicate whether you want to have a progressbar.')
 210+
170211 args = parser.parse_args()
171 - args.func(args)
 212+ config.load_configuration(args)
 213+ locations = determine_file_locations(args)
 214+ args.func(args, **locations)
172215
173216
174217 if __name__ == '__main__':
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -257,18 +257,18 @@
258258 return ids
259259
260260
261 -def run_parse_editors(dbname, language):
 261+def run_parse_editors(dbname, language, location):
262262 ids = load_bot_ids()
263263 kwargs = {'bots': ids,
264264 'dbname': dbname,
265265 'pbar': True,
266 - 'nr_input_processors': 1,
267 - 'nr_output_processors': 1,
 266+ 'nr_input_processors': 2,
 267+ 'nr_output_processors': 2,
268268 'language': language,
269269 }
270270 chunks = {}
271 - file_location = os.path.join(settings.XML_FILE_LOCATION, language)
272 - files = utils.retrieve_file_list(file_location, 'xml')
 271+ #file_location = os.path.join(settings.XML_FILE_LOCATION, language)
 272+ files = utils.retrieve_file_list(location, 'xml')
273273 parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
274274 a = 0
275275 for x in xrange(settings.NUMBER_OF_PROCESSES):
@@ -276,14 +276,11 @@
277277 chunks[x] = files[a:b]
278278 a = (x + 1) * parts
279279
 280+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs)
280281
281 - for x in xrange(settings.NUMBER_OF_PROCESSES):
282 - pc.build_scaffolding(pc.load_queue, parse_editors, chunks[x], store_editors, True, **kwargs)
283282
284 -
285283 def debug_parse_editors(dbname):
286284 q = JoinableQueue()
287 - #edits = db.init_mongo_db('editors')
288285 parse_editors('en\\522.xml', q, None, None, True)
289286 store_editors(q, [], dbname)
290287
Index: trunk/tools/editor_trends/settings.py
@@ -101,7 +101,12 @@
102102 # Name space, do not change as this works for Mediawiki wikis
103103 NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/'
104104
 105+WINDOWS_REGISTER = {'7zip': 'Software\\7-Zip',
 106+ }
105107
 108+COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z']
 109+
 110+
106111 WIKIMEDIA_PROJECTS = {'commons': 'commonswiki',
107112 'wikibooks': 'wikibooks',
108113 'wikinews': 'wikinews',
Index: trunk/tools/editor_trends/config.py
@@ -20,29 +20,48 @@
2121
2222 import os
2323 import ConfigParser
 24+from _winreg import *
2425
2526 import settings
2627 from utils import utils
2728
2829
 30+def detect_windows_program(program):
 31+ entry = settings.WINDOWS_REGISTER[program]
 32+ try:
 33+ key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
 34+ return QueryValueEx(key, 'Path')[0]
 35+ except WindowsError:
 36+ return None
 37+
 38+
 39+def detect_installed_program(program):
 40+ platform = settings.OS
 41+ if platform == 'Windows':
 42+ path = detect_windows_program(program)
 43+ return path
 44+ else:
 45+ raise NotImplementedError
 46+
 47+
2948 def load_configuration(args):
3049 config = ConfigParser.RawConfigParser()
3150 if not utils.check_file_exists(settings.WORKING_DIRECTORY, 'wiki.cfg'):
3251 working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd())
3352 if working_directory == '':
3453 working_directory = os.getcwd()
35 -
 54+
3655 xml_file_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.XML_FILE_LOCATION)
3756 if xml_file_location == '':
3857 xml_file_location = settings.XML_FILE_LOCATION
39 -
 58+
4059 create_configuration(WORKING_DIRECTORY=working_directory, XML_FILE_LOCATION=xml_file_location)
4160
4261 config.read('wiki.cfg')
4362 settings.WORKING_DIRECTORY = config.get('file_locations', 'WORKING_DIRECTORY')
4463 settings.XML_FILE_LOCATION = config.get('file_locations', 'XML_FILE_LOCATION')
45 -
46 -
 64+
 65+
4766 def create_configuration(**kwargs):
4867 working_directory = kwargs.get('WORKING_DIRECTORY', settings.WORKING_DIRECTORY)
4968 config = ConfigParser.RawConfigParser()
@@ -56,6 +75,8 @@
5776
5877
5978 if __name__ == '__main__':
60 - load_configuration([])
 79+ p =detect_windows_program('7zip')
 80+ print p
 81+ #load_configuration([])
6182
6283
Index: trunk/tools/editor_trends/utils/dump_downloader.py
@@ -97,9 +97,9 @@
9898 pbar.update(pbar.currval + chunk)
9999
100100 except urllib2.URLError, error:
101 - print 'Reason: %s' % error.reason
 101+ print 'Reason: %s' % error
102102 except urllib2.HTTPError, error:
103 - print 'Error: %s' % error.code
 103+ print 'Error: %s' % error
104104 finally:
105105 fh.close()
106106
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -55,20 +55,25 @@
5656
5757 nr_input_processors = kwargs.pop('nr_input_processors')
5858 nr_output_processors = kwargs.pop('nr_output_processors')
59 -
 59+ input_queues = {}
 60+ result_queues = {}
 61+ assert len(obj) == nr_input_processors
6062 if result_queue:
61 - result_queue = JoinableQueue()
 63+ assert len(obj)== nr_output_processors
6264
63 - input_queue = load_input_queue(obj, poison_pill=True)
 65+ for i, o in enumerate(obj):
 66+ input_queues[i] = load_input_queue(obj[o], poison_pill=True)
 67+ if result_queue:
 68+ result_queues[i] = JoinableQueue()
6469
6570 if settings.PROGRESS_BAR:
66 - pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start()
 71+ size = sum([input_queues[q].qsize() for q in input_queues])
 72+ pbar = progressbar.ProgressBar(maxval=size).start()
6773 kwargs['pbar'] = pbar
6874 else:
6975 pbar = False
70 -
71 -
72 - input_processes = [models.ProcessInputQueue(main, input_queue, result_queue,
 76+
 77+ input_processes = [models.ProcessInputQueue(main, input_queues[i], result_queues[i],
7378 **kwargs) for i in xrange(nr_input_processors)]
7479
7580 for input_process in input_processes:
@@ -78,7 +83,7 @@
7984
8085 if result_queue:
8186 result_processes = [models.ProcessResultQueue(result_processor,
82 - result_queue, **kwargs) for i in xrange(nr_output_processors)]
 87+ result_queues[i], **kwargs) for i in xrange(nr_output_processors)]
8388 for result_process in result_processes:
8489 result_process.start()
8590
@@ -115,6 +120,5 @@
116121 input_queue.put(d)
117122
118123 if poison_pill:
119 - for p in xrange(settings.NUMBER_OF_PROCESSES):
120 - input_queue.put(None)
 124+ input_queue.put(None)
121125 return input_queue
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -82,7 +82,7 @@
8383 ns = []
8484 for namespace in namespaces:
8585 value = namespaces[namespace].get(u'*', None)
86 - if value != None and value != '' and not value.endswith('talk'):
 86+ if value != None and value != '':
8787 ns.append(value)
8888 return ns
8989
@@ -122,9 +122,10 @@
123123
124124 def create_xml_file_handle(fh, counter, size, language):
125125 '''Create file handle if none is supplied or if file size > max file size.'''
 126+ if not counter:
 127+ counter = 0
126128 path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter)
127129 if not fh:
128 - counter = 0
129130 fh = codecs.open(path, 'w', encoding=settings.ENCODING)
130131 return fh, counter
131132 elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
@@ -137,39 +138,49 @@
138139 return fh, counter
139140
140141
141 -def split_xml(language):
 142+def split_xml(location, filename, project, language_code):
142143 '''Reads xml file and splits it in N chunks'''
143 - location = os.path.join(settings.XML_FILE_LOCATION, language)
 144+ #location = os.path.join(settings.XML_FILE_LOCATION, language)
144145 result = utils.check_file_exists(location, '')
145146 if result == False:
146147 result = utils.create_directory(location)
147148 if not result:
148149 return
149150
150 - ns = load_namespace(language)
 151+ ns = load_namespace(language_code)
151152 ns = build_namespaces_locale(ns)
152153
153154 fh = None
154155 counter = None
 156+ source = os.path.join(location, filename)
155157 tag = '{%s}page' % settings.NAME_SPACE
156158
157 - context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end'))
 159+ context = cElementTree.iterparse(source, events=('start', 'end'))
158160 context = iter(context)
159161 event, root = context.next() #get the root element of the XML doc
160162
161 - for event, elem in context:
162 - if event == 'end':
163 - if elem.tag == tag:
164 - elem = remove_namespace(elem, settings.NAME_SPACE)
165 - elem = parse_comments(elem, remove_numeric_character_references)
 163+ try:
 164+ for event, elem in context:
 165+ if event == 'end':
 166+ if elem.tag == tag:
 167+ elem = remove_namespace(elem, settings.NAME_SPACE)
 168+ if is_article_main_namespace(elem, ns):
 169+ elem = parse_comments(elem, remove_numeric_character_references)
 170+ fh, counter = write_xml_file(elem, fh, counter, language_code)
 171+ root.clear() # when done parsing a section clear the tree to safe memory
 172+ #elem = parse_comments(elem, convert_html_entities)
 173+ #elem = parse_comments(elem, remove_ascii_control_characters)
 174+ #print cElementTree.tostring(elem)
 175+ except SyntaxError:
 176+ fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING)
 177+ fh.write(cElementTree.tostring(elem))
 178+ fh.close()
166179
167 - if is_article_main_namespace(elem, ns):
168 - fh, counter = write_xml_file(elem, fh, counter, language)
169 - root.clear() # when done parsing a section clear the tree to safe memory
170 - #elem = parse_comments(elem, convert_html_entities)
171 - #elem = parse_comments(elem, remove_ascii_control_characters)
172 - #print cElementTree.tostring(elem)
173180
174 -
175181 if __name__ == "__main__":
176 - split_xml('en')
 182+ kwargs = {'location': 'c:\\Source_files\\',
 183+ 'filename': settings.XML_FILE,
 184+ 'project':'wiki',
 185+ 'language_code':'en'
 186+ }
 187+ split_xml(**kwargs)
Index: trunk/tools/editor_trends/database/cache.py
@@ -20,109 +20,113 @@
2121
2222 '''
2323 This module provides a simple caching mechanism to speed-up the process of
24 -inserting records to MongoDB. The caching bject works as follows:
25 -1) Each edit from an author is added to a dictionary
26 -2) Every 50000 edits, the object returns %x with the most edits, and these are
27 -then stored in MongoDB. By packaging multiple edits in a single commit,
28 -processing time is significantly reduced.
 24+inserting records to MongoDB. The caching object works as follows:
 25+1) Each edit from an author is added to a dictionary
 26+2) Every x seconds, the object returns %x with the least number of edits,
 27+and these are then stored in MongoDB. By packaging multiple edits in a single
 28+commit, processing time is significantly reduced.
2929
3030 This caching mechanism does not create any benefits for authors with single or
31 -very few edits.
32 -
 31+very few edits.
3332 '''
3433
3534
3635 import sys
3736 import datetime
 37+import random
3838
3939 import settings
4040 import db
 41+from utils import utils
4142
4243
4344 class EditorCache(object):
4445 def __init__(self, collection):
4546 self.collection = collection
4647 self.editors = {}
47 - self.size = self.__sizeof__()
4848 self.cumulative_n = 0
 49+ self.init_time = datetime.datetime.now()
4950 self.time_started = datetime.datetime.now()
50 - self.n = self.current_cache_size()
 51+ self.n = 0
5152 self.emptied = 1
 53+ self.number_editors = 0
 54+ self.treshold_editors = set()
 55+ self.treshold = 10
5256
53 -
5457 def __repr__(self):
55 - pass
 58+ return '%s_%s' % ('editor_cache', random.randint(0, 99999))
5659
57 -
5860 def _store_editor(self, key, value):
5961 editor = self.collection.insert({'editor': key, 'edits': {}})
6062 self.editors[key]['id'] = str(editor)
6163
62 -
6364 def current_cache_size(self):
6465 return sum([self.editors[k].get('obs', 0) for k in self.editors])
6566
66 -
6767 def add(self, key, value):
68 - self.cumulative_n += 1
69 - if key not in self.editors:
70 - self.editors[key] = {}
71 - self.editors[key]['obs'] = 0
72 - self.editors[key]['edits'] = []
73 -
 68+ if key == 'NEXT':
 69+ for editor in self.treshold_editors:
 70+ self.update(editor, self.editors[editor]['edits'])
 71+ self.n -= self.editors[editor]['obs']
 72+ self.number_editors -= 1
 73+ del self.editors[editor]
 74+ self.treshold_editors = set()
7475 else:
 76+ self.cumulative_n += 1
 77+ self.n += 1
 78+ if key not in self.editors:
 79+ self.editors[key] = {}
 80+ self.editors[key]['obs'] = 0
 81+ self.editors[key]['edits'] = []
 82+ self.number_editors += 1
 83+
7584 id = str(self.editors[key]['obs'])
7685 self.editors[key]['edits'].append(value)
7786 self.editors[key]['obs'] += 1
7887
 88+ if self.editors[key]['obs'] == self.treshold:
 89+ self.treshold_editors.add(key)
 90+# self.update(key, self.editors[key]['edits'])
 91+# del self.editors[key]
 92+# self.n -= 10
 93+# self.number_editors -= 1
7994
80 - if self.cumulative_n % 25000 == 0:
81 - self.empty_all(5.0)
 95+ def update(self, editor, values):
 96+ #t = datetime.datetime.now()
 97+ self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
 98+ #print 'It took %s to store editor %s;and the cache contains %s editors and %s items' % (datetime.datetime.now() - t, editor, self.number_editors, self.n)
8299
 100+ def quick_sort(self, obs):
 101+ if obs == []:
 102+ return []
 103+ else:
 104+ pivot = obs[0]
 105+ lesser = self.quick_sort([x for x in obs[1:] if x < pivot])
 106+ greater = self.quick_sort([x for x in obs[1:] if x >= pivot])
 107+ return lesser + [pivot] + greater
83108
84 - def retrieve_top_k_editors(self, percentage):
85 - keys = self.editors.keys()
86 - obs = []
87 - for k in keys:
88 - weight = float(self.editors[k].get('obs', 0)) / self.n
89 - obs.append((weight, k))
90 - obs.sort()
91 - obs.reverse()
92 - l = int((len(obs) / 100.0) * percentage)
93 - if l == 0:
94 - l = 1
95 - obs = obs[:l]
96 - obs = [o[1] for o in obs]
97 - return obs
 109+ def store(self):
 110+ utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__())
98111
 112+ def drop_n_observations(self, n=1):
 113+ editors_to_remove = set()
 114+ for editor in self.editors:
 115+ if editor['obs'] <= n:
 116+ editors_to_remove.add(editor)
99117
100 - def update(self, editor, values):
101 - self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
 118+ for editor in editors_to_remove:
 119+ del self.editors[editor]
102120
103121
104 - def empty_all(self, percentage):
105 - self.n = self.current_cache_size()
106 - if percentage < 100.0:
107 - keys = self.retrieve_top_k_editors(percentage)
108 - else:
109 - keys = self.editors.keys()
110 - print 'Emptying cache %s time' % self.emptied
111 - self.emptied += 1
112 - for key in keys:
113 - if self.editors[key]['edits'] != {}:
114 - self.update(key, self.editors[key]['edits'])
115 - self.editors[key]['edits'] = []
116 - self.editors[key]['obs'] = 0.0
117 -
118 -
119122 def debug():
120123 mongo = db.init_mongo_db('test')
121124 collection = mongo['test']
122 - cache = EditorCache(collection)
 125+ cache = EditorCache(collection, wait=2)
123126 import random
124127 for i in xrange(100000):
125128 cache.add(str(random.randrange(0, 5)), {'date': 'woensaag', 'article': '3252'})
126 - cache.empty_all(100)
 129+ cache.empty_all(-1)
 130+ print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - cache.init_time, cache.cumulative_n)
127131
128132
129133 if __name__ == '__main__':

Status & tagging log