Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -19,6 +19,7 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import sys |
| 23 | +import subprocess |
23 | 24 | from argparse import ArgumentParser |
24 | 25 | from argparse import RawTextHelpFormatter |
25 | 26 | |
— | — | @@ -30,6 +31,7 @@ |
31 | 32 | from utils import utils |
32 | 33 | from utils import dump_downloader |
33 | 34 | import split_xml_file |
| 35 | +import map_wiki_editors |
34 | 36 | import config |
35 | 37 | |
36 | 38 | |
— | — | @@ -37,15 +39,29 @@ |
38 | 40 | return getattr(args, key, None) |
39 | 41 | |
40 | 42 | |
41 | | -def config_launcher(args): |
| 43 | +def config_launcher(args, location, filename, project, language_code): |
42 | 44 | config.load_configuration(args) |
43 | 45 | |
44 | 46 | |
| 47 | +def retrieve_projectname(args): |
| 48 | + language_code = retrieve_language(args) |
| 49 | + if language_code is None: |
| 50 | + print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language') |
| 51 | + sys.exit(-1) |
| 52 | + project = retrieve_project(args) |
| 53 | + |
| 54 | + if project is None: |
| 55 | + print 'Entered project: %s is not a valid Wikipedia project.' % get_value(args, 'project') |
| 56 | + sys.exit(-1) |
| 57 | + |
| 58 | + return '%s%s' % (language_code, project) |
| 59 | + |
45 | 60 | def retrieve_language(args): |
46 | 61 | language = get_value(args, 'language') |
47 | 62 | language = language.title() |
48 | | - return languages.MAPPING.get(language, None) |
| 63 | + return languages.MAPPING.get(language, None) |
49 | 64 | |
| 65 | + |
50 | 66 | def retrieve_project(args): |
51 | 67 | project = get_value(args, 'project') |
52 | 68 | if project != 'wiki': |
— | — | @@ -53,51 +69,65 @@ |
54 | 70 | return project |
55 | 71 | |
56 | 72 | |
57 | | -def dump_downloader_launcher(args): |
| 73 | +def generate_wikidump_filename(args): |
| 74 | + return '%s-%s-%s' % (retrieve_projectname(args), 'latest', get_value(args, 'file')) |
| 75 | + |
| 76 | + |
| 77 | +def determine_file_locations(args): |
| 78 | + locations = {} |
| 79 | + locations['language_code'] = retrieve_language(args) |
| 80 | + locations['location'] = os.path.join(get_value(args, 'location'), locations['language_code']) |
| 81 | + locations['project'] = retrieve_projectname(args) |
| 82 | + locations['filename'] = generate_wikidump_filename(args) |
| 83 | + return locations |
| 84 | + |
| 85 | + |
| 86 | +def dump_downloader_launcher(args, location, filename, project, language_code): |
58 | 87 | print 'dump downloader' |
59 | | - config.load_configuration(args) |
60 | | - language_code = retrieve_language(args) |
61 | | - if language_code == None: |
62 | | - print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language') |
63 | | - sys.exit(-1) |
64 | | - project = retrieve_project(args) |
65 | | - if project == None: |
66 | | - print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project') |
67 | | - sys.exit(-1) |
68 | | - location = os.path.join(get_value(args, 'location'), language_code) |
69 | | - project = language_code + project |
70 | | - filename = '%s-%s-%s' % (project, 'latest', get_value(args, 'file')) |
71 | 88 | pbar = get_value(args, 'progress') |
72 | | - |
73 | 89 | domain = settings.WP_DUMP_LOCATION |
74 | | - path = '/%s/latest/' % language_code |
75 | | - |
| 90 | + path = '/%s/latest/' % project |
76 | 91 | extension = utils.determine_file_extension(filename) |
77 | 92 | filemode = utils.determine_file_mode(extension) |
78 | 93 | |
79 | | - dump_downloader.download_wp_dump(domain, path, filename, location, filemode, pbar) |
| 94 | + dump_downloader.download_wiki_file(domain, path, filename, location, filemode, pbar) |
80 | 95 | |
81 | 96 | |
82 | | -def split_xml_file_launcher(args): |
| 97 | +def split_xml_file_launcher(args, location, filename, project, language_code): |
83 | 98 | print 'split_xml_file_launcher' |
84 | | - dbname = create_dbname(args) |
85 | | - split_xml_file.split_xml(dbname) |
| 99 | + ext = utils.determine_file_extension(filename) |
| 100 | + if ext in settings.COMPRESSION_EXTENSIONS: |
| 101 | + ext = '.%s' % ext |
| 102 | + xml_file = filename.replace(ext, '') |
| 103 | + result = utils.check_file_exists(location, xml_file) |
| 104 | + if not result: |
| 105 | + retcode = extract_xml_file(args, location, filename) |
| 106 | + else: |
| 107 | + retcode = 0 |
| 108 | + if retcode != 0: |
| 109 | + sys.exit(retcode) |
| 110 | + split_xml_file.split_xml(location, xml_file, project, language_code) |
86 | 111 | |
87 | 112 | |
88 | | -def mongodb_script_launcher(args): |
| 113 | +def extract_xml_file(args, location, filename): |
| 114 | + path = config.detect_installed_program('7zip') |
| 115 | + |
| 116 | + source = os.path.join(location, filename) |
| 117 | + retcode = subprocess.call([os.path.join(path, '7z.exe'), 'e', '-o%s\\' % location, source]) |
| 118 | + return retcode |
| 119 | + |
| 120 | + |
| 121 | +def mongodb_script_launcher(args, location, filename, project, language_code): |
89 | 122 | print 'mongodb_script_launcher' |
90 | | - config.load_configuration(args) |
91 | | - dbname = create_dbname(args) |
92 | | - #map_wiki_editors.run_stand_alone(dbname) |
| 123 | + map_wiki_editors.run_parse_editors(project, language_code, location) |
93 | 124 | #print args |
94 | 125 | |
95 | 126 | |
96 | | -def all_launcher(args): |
| 127 | +def all_launcher(args, location, filename, project, language_code): |
97 | 128 | print 'all_launcher' |
98 | | - config_launcher(args) |
99 | | - dump_downloader_launcher(args) |
100 | | - split_xml_file_launcher(args) |
101 | | - mongodb_script_launcher(args) |
| 129 | + dump_downloader_launcher(args, location, filename, project, language_code) |
| 130 | + split_xml_file_launcher(args, location, filename, project, language_code) |
| 131 | + mongodb_script_launcher(args, location, filename, project, language_code) |
102 | 132 | |
103 | 133 | |
104 | 134 | def supported_languages(): |
— | — | @@ -106,8 +136,8 @@ |
107 | 137 | return tuple(choices) |
108 | 138 | |
109 | 139 | |
110 | | -def show_languages(args): |
111 | | - first = get_value(args, 'first') |
| 140 | +def show_languages(args, location, filename, project, language_code): |
| 141 | + first = get_value(args, 'startswith') |
112 | 142 | if first != None: |
113 | 143 | first = first.title() |
114 | 144 | choices = supported_languages() |
— | — | @@ -129,32 +159,21 @@ |
130 | 160 | 'pages-meta-current.xml.bz2') |
131 | 161 | |
132 | 162 | parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter) |
| 163 | + #group = parser.add_mutually_exclusive_group() |
| 164 | + #group.add_argument('show_languages', action='store') |
| 165 | + #group.add_argument('language', action='store') |
133 | 166 | subparsers = parser.add_subparsers(help='sub-command help') |
134 | | - parser.add_argument('-p', '--progress', action='store_true', default=True, |
135 | | - help='Indicate whether you want to have a progressbar.') |
136 | 167 | |
137 | 168 | parser_languages = subparsers.add_parser('show_languages', help='Overview of all valid languages.') |
| 169 | + parser_languages.add_argument('-s', '--startswith', |
| 170 | + action='store', |
| 171 | + help='Enter the first letter of a language to see which languages are available.') |
138 | 172 | parser_languages.set_defaults(func=show_languages) |
139 | | - parser_languages.add_argument('-f', '--first', action='store', help='Enter the first letter of a language to see which languages are available.') |
140 | 173 | |
141 | 174 | parser_config = subparsers.add_parser('config', help='The config sub command allows you set the data location of where to store files.') |
142 | 175 | parser_config.set_defaults(func=config_launcher) |
143 | 176 | |
144 | 177 | parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.') |
145 | | - parser_download.add_argument('language', action='store', |
146 | | - help='Example of valid languages.', |
147 | | - choices=supported_languages(), |
148 | | - default='Russian') |
149 | | - parser_download.add_argument('-p', '--project', action='store', help='Specify the Wikimedia project that you would like to download', |
150 | | - choices=settings.WIKIMEDIA_PROJECTS.keys(), |
151 | | - default='wiki') |
152 | | - parser_download.add_argument('-l', '--location', action='store', |
153 | | - help='Indicate where you want to store the downloaded file.', |
154 | | - default=settings.XML_FILE_LOCATION) |
155 | | - parser_download.add_argument('-f', '--file', action='store', |
156 | | - choices=file_choices, |
157 | | - help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]), |
158 | | - default='stub-meta-current.xml.gz') |
159 | 178 | parser_download.set_defaults(func=dump_downloader_launcher) |
160 | 179 | |
161 | 180 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.') |
— | — | @@ -166,8 +185,32 @@ |
167 | 186 | parser_all = subparsers.add_parser('all', help='The all sub command runs the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE WIKIMEDIA DUMP FILE.') |
168 | 187 | parser_all.set_defaults(func=all_launcher) |
169 | 188 | |
| 189 | + parser.add_argument('-l', '--language', action='store', |
| 190 | + help='The language of the Wikipedia project to analyze.', |
| 191 | + choices=supported_languages(), |
| 192 | + default='Russian') |
| 193 | + |
| 194 | + parser.add_argument('-p', '--project', action='store', |
| 195 | + help='Specify the Wikimedia project that you would like to download', |
| 196 | + choices=settings.WIKIMEDIA_PROJECTS.keys(), |
| 197 | + default='wiki') |
| 198 | + |
| 199 | + parser.add_argument('-o', '--location', action='store', |
| 200 | + help='Indicate where you want to store the downloaded file.', |
| 201 | + default=settings.XML_FILE_LOCATION) |
| 202 | + |
| 203 | + parser.add_argument('-f', '--file', action='store', |
| 204 | + choices=file_choices, |
| 205 | + help='Indicate which dump you want to download. Valid choices are:\n %s' % ''.join([f + ',\n' for f in file_choices]), |
| 206 | + default='stub-meta-current.xml.gz') |
| 207 | + |
| 208 | + parser.add_argument('-prog', '--progress', action='store_true', default=True, |
| 209 | + help='Indicate whether you want to have a progressbar.') |
| 210 | + |
170 | 211 | args = parser.parse_args() |
171 | | - args.func(args) |
| 212 | + config.load_configuration(args) |
| 213 | + locations = determine_file_locations(args) |
| 214 | + args.func(args, **locations) |
172 | 215 | |
173 | 216 | |
174 | 217 | if __name__ == '__main__': |
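Note: the revision above moves all shared options to the top-level parser and gives every launcher the same signature, so main() can dispatch uniformly via args.func(args, **locations). A minimal sketch of that pattern, with illustrative option names and location values:

    from argparse import ArgumentParser

    def download(args, location, filename, project, language_code):
        # every launcher shares this signature, so dispatch is uniform
        print 'would download %s into %s' % (filename, location)

    parser = ArgumentParser(prog='manage')
    subparsers = parser.add_subparsers(help='sub-command help')
    parser_download = subparsers.add_parser('download')
    parser_download.set_defaults(func=download)
    # shared options live on the top-level parser so every launcher sees them
    parser.add_argument('-l', '--language', default='Russian')

    args = parser.parse_args(['download'])
    locations = {'location': '/tmp', 'filename': 'ruwiki-latest-stub-meta-current.xml.gz',
                 'project': 'ruwiki', 'language_code': 'ru'}
    args.func(args, **locations)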
Index: trunk/tools/editor_trends/map_wiki_editors.py |
— | — | @@ -257,18 +257,18 @@ |
258 | 258 | return ids |
259 | 259 | |
260 | 260 | |
261 | | -def run_parse_editors(dbname, language): |
| 261 | +def run_parse_editors(dbname, language, location): |
262 | 262 | ids = load_bot_ids() |
263 | 263 | kwargs = {'bots': ids, |
264 | 264 | 'dbname': dbname, |
265 | 265 | 'pbar': True, |
266 | | - 'nr_input_processors': 1, |
267 | | - 'nr_output_processors': 1, |
| 266 | + 'nr_input_processors': 2, |
| 267 | + 'nr_output_processors': 2, |
268 | 268 | 'language': language, |
269 | 269 | } |
270 | 270 | chunks = {} |
271 | | - file_location = os.path.join(settings.XML_FILE_LOCATION, language) |
272 | | - files = utils.retrieve_file_list(file_location, 'xml') |
| 271 | + #file_location = os.path.join(settings.XML_FILE_LOCATION, language) |
| 272 | + files = utils.retrieve_file_list(location, 'xml') |
273 | 273 | parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0)) |
274 | 274 | a = 0 |
275 | 275 | for x in xrange(settings.NUMBER_OF_PROCESSES): |
— | — | @@ -276,14 +276,11 @@ |
277 | 277 | chunks[x] = files[a:b] |
278 | 278 | a = (x + 1) * parts |
279 | 279 | |
| 280 | + pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs) |
280 | 281 | |
281 | | - for x in xrange(settings.NUMBER_OF_PROCESSES): |
282 | | - pc.build_scaffolding(pc.load_queue, parse_editors, chunks[x], store_editors, True, **kwargs) |
283 | 282 | |
284 | | - |
285 | 283 | def debug_parse_editors(dbname): |
286 | 284 | q = JoinableQueue() |
287 | | - #edits = db.init_mongo_db('editors') |
288 | 285 | parse_editors('en\\522.xml', q, None, None, True) |
289 | 286 | store_editors(q, [], dbname) |
290 | 287 | |
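Note: run_parse_editors splits the XML file list evenly over settings.NUMBER_OF_PROCESSES before handing the chunks dict to build_scaffolding. A self-contained sketch of the same partitioning; the computation of b falls outside this hunk, so the tail-chunk guard below is an assumption:

    NUMBER_OF_PROCESSES = 4  # stand-in for settings.NUMBER_OF_PROCESSES
    files = ['%s.xml' % i for i in xrange(10)]
    parts = int(round(float(len(files)) / NUMBER_OF_PROCESSES, 0))
    chunks = {}
    a = 0
    for x in xrange(NUMBER_OF_PROCESSES):
        # give the last chunk whatever remains so no file is dropped
        b = a + parts if x < NUMBER_OF_PROCESSES - 1 else len(files)
        chunks[x] = files[a:b]
        a = (x + 1) * parts
    print chunks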
Index: trunk/tools/editor_trends/settings.py |
— | — | @@ -101,7 +101,12 @@ |
102 | 102 | # Name space, do not change as this works for Mediawiki wikis |
103 | 103 | NAME_SPACE = 'http://www.mediawiki.org/xml/export-0.4/' |
104 | 104 | |
| 105 | +WINDOWS_REGISTER = {'7zip': 'Software\\7-Zip', |
| 106 | + } |
105 | 107 | |
| 108 | +COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z'] |
| 109 | + |
| 110 | + |
106 | 111 | WIKIMEDIA_PROJECTS = {'commons': 'commonswiki', |
107 | 112 | 'wikibooks': 'wikibooks', |
108 | 113 | 'wikinews': 'wikinews', |
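Note: the new COMPRESSION_EXTENSIONS constant is consumed by the split launcher in manage.py to derive the name of the extracted file from the archive name. A minimal sketch of that normalization; the helper name is hypothetical:

    COMPRESSION_EXTENSIONS = ['gz', 'bz2', '7z']

    def strip_compression_suffix(filename):
        # 'enwiki-latest-stub-meta-current.xml.gz' -> '...-current.xml'
        for ext in COMPRESSION_EXTENSIONS:
            if filename.endswith('.%s' % ext):
                return filename[:-(len(ext) + 1)]
        return filename

    print strip_compression_suffix('enwiki-latest-stub-meta-current.xml.gz')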
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -20,29 +20,48 @@ |
21 | 21 | |
22 | 22 | import os |
23 | 23 | import ConfigParser |
| 24 | +from _winreg import * |
24 | 25 | |
25 | 26 | import settings |
26 | 27 | from utils import utils |
27 | 28 | |
28 | 29 | |
| 30 | +def detect_windows_program(program): |
| 31 | + entry = settings.WINDOWS_REGISTER[program] |
| 32 | + try: |
| 33 | + key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ) |
| 34 | + return QueryValueEx(key, 'Path')[0] |
| 35 | + except WindowsError: |
| 36 | + return None |
| 37 | + |
| 38 | + |
| 39 | +def detect_installed_program(program): |
| 40 | + platform = settings.OS |
| 41 | + if platform == 'Windows': |
| 42 | + path = detect_windows_program(program) |
| 43 | + return path |
| 44 | + else: |
| 45 | + raise NotImplementedError |
| 46 | + |
| 47 | + |
29 | 48 | def load_configuration(args): |
30 | 49 | config = ConfigParser.RawConfigParser() |
31 | 50 | if not utils.check_file_exists(settings.WORKING_DIRECTORY, 'wiki.cfg'): |
32 | 51 | working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd()) |
33 | 52 | if working_directory == '': |
34 | 53 | working_directory = os.getcwd() |
35 | | - |
| 54 | + |
36 | 55 | xml_file_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.XML_FILE_LOCATION) |
37 | 56 | if xml_file_location == '': |
38 | 57 | xml_file_location = settings.XML_FILE_LOCATION |
39 | | - |
| 58 | + |
40 | 59 | create_configuration(WORKING_DIRECTORY=working_directory, XML_FILE_LOCATION=xml_file_location) |
41 | 60 | |
42 | 61 | config.read('wiki.cfg') |
43 | 62 | settings.WORKING_DIRECTORY = config.get('file_locations', 'WORKING_DIRECTORY') |
44 | 63 | settings.XML_FILE_LOCATION = config.get('file_locations', 'XML_FILE_LOCATION') |
45 | | - |
46 | | - |
| 64 | + |
| 65 | + |
47 | 66 | def create_configuration(**kwargs): |
48 | 67 | working_directory = kwargs.get('WORKING_DIRECTORY', settings.WORKING_DIRECTORY) |
49 | 68 | config = ConfigParser.RawConfigParser() |
— | — | @@ -56,6 +75,8 @@ |
57 | 76 | |
58 | 77 | |
59 | 78 | if __name__ == '__main__': |
60 | | - load_configuration([]) |
| 79 | + p = detect_windows_program('7zip') |
| 80 | + print p |
| 81 | + #load_configuration([]) |
61 | 82 | |
62 | 83 | |
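Note: a hedged usage sketch tying the new registry-based detection helpers to the 7z invocation in manage.py; the paths and dump filename are illustrative. On non-Windows platforms detect_installed_program raises NotImplementedError, so this only applies to Windows:

    import os
    import subprocess

    import config  # assumes this module is importable from the working directory

    path = config.detect_installed_program('7zip')  # e.g. u'C:\\Program Files\\7-Zip\\'
    if path is None:
        print '7zip does not appear to be installed.'
    else:
        location = 'c:\\Source_files'
        source = os.path.join(location, 'enwiki-latest-stub-meta-current.xml.gz')
        retcode = subprocess.call([os.path.join(path, '7z.exe'), 'e',
                                   '-o%s\\' % location, source])
        print '7z exit code: %s' % retcode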
Index: trunk/tools/editor_trends/utils/dump_downloader.py |
— | — | @@ -97,9 +97,9 @@ |
98 | 98 | pbar.update(pbar.currval + chunk) |
99 | 99 | |
100 | 100 | except urllib2.URLError, error: |
101 | | - print 'Reason: %s' % error.reason |
| 101 | + print 'Reason: %s' % error |
102 | 102 | except urllib2.HTTPError, error: |
103 | | - print 'Error: %s' % error.code |
| 103 | + print 'Error: %s' % error |
104 | 104 | finally: |
105 | 105 | fh.close() |
106 | 106 | |
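Note: printing the exception object is the safer choice here, since both error classes stringify to a readable message while URLError has no .code attribute. Because HTTPError is a subclass of URLError, it should ideally be caught first; the surrounding ordering leaves the HTTPError clause unreachable. A minimal illustration with the handlers in subclass-first order:

    import urllib2

    try:
        urllib2.urlopen('http://dumps.wikimedia.org/no-such-path/')
    except urllib2.HTTPError, error:
        print 'Error: %s' % error   # e.g. 'HTTP Error 404: Not Found'
    except urllib2.URLError, error:
        print 'Reason: %s' % error  # DNS or connection failures end up here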
Index: trunk/tools/editor_trends/utils/process_constructor.py |
— | — | @@ -55,20 +55,25 @@ |
56 | 56 | |
57 | 57 | nr_input_processors = kwargs.pop('nr_input_processors') |
58 | 58 | nr_output_processors = kwargs.pop('nr_output_processors') |
59 | | - |
| 59 | + input_queues = {} |
| 60 | + result_queues = {} |
| 61 | + assert len(obj) == nr_input_processors |
60 | 62 | if result_queue: |
61 | | - result_queue = JoinableQueue() |
| 63 | + assert len(obj) == nr_output_processors |
62 | 64 | |
63 | | - input_queue = load_input_queue(obj, poison_pill=True) |
| 65 | + for i, o in enumerate(obj): |
| 66 | + input_queues[i] = load_input_queue(obj[o], poison_pill=True) |
| 67 | + if result_queue: |
| 68 | + result_queues[i] = JoinableQueue() |
64 | 69 | |
65 | 70 | if settings.PROGRESS_BAR: |
66 | | - pbar = progressbar.ProgressBar(maxval=input_queue.qsize()).start() |
| 71 | + size = sum([input_queues[q].qsize() for q in input_queues]) |
| 72 | + pbar = progressbar.ProgressBar(maxval=size).start() |
67 | 73 | kwargs['pbar'] = pbar |
68 | 74 | else: |
69 | 75 | pbar = False |
70 | | - |
71 | | - |
72 | | - input_processes = [models.ProcessInputQueue(main, input_queue, result_queue, |
| 76 | + |
| 77 | + input_processes = [models.ProcessInputQueue(main, input_queues[i], result_queues.get(i), |
73 | 78 | **kwargs) for i in xrange(nr_input_processors)] |
74 | 79 | |
75 | 80 | for input_process in input_processes: |
— | — | @@ -78,7 +83,7 @@ |
79 | 84 | |
80 | 85 | if result_queue: |
81 | 86 | result_processes = [models.ProcessResultQueue(result_processor, |
82 | | - result_queue, **kwargs) for i in xrange(nr_output_processors)] |
| 87 | + result_queues[i], **kwargs) for i in xrange(nr_output_processors)] |
83 | 88 | for result_process in result_processes: |
84 | 89 | result_process.start() |
85 | 90 | |
— | — | @@ -115,6 +120,5 @@ |
116 | 121 | input_queue.put(d) |
117 | 122 | |
118 | 123 | if poison_pill: |
119 | | - for p in xrange(settings.NUMBER_OF_PROCESSES): |
120 | | - input_queue.put(None) |
| 124 | + input_queue.put(None) |
121 | 125 | return input_queue |
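Note: the rework gives every input process its own queue (and optional result queue) with a single poison pill each, instead of one shared queue carrying NUMBER_OF_PROCESSES pills. A condensed, self-contained sketch of that layout; the worker body is illustrative:

    from multiprocessing import JoinableQueue, Process

    def worker(q):
        while True:
            task = q.get()
            q.task_done()
            if task is None:  # one pill per dedicated queue is now enough
                break
            print 'processing %s' % task

    if __name__ == '__main__':
        chunks = {0: ['0.xml', '1.xml'], 1: ['2.xml']}
        queues = {}
        for i in chunks:
            queues[i] = JoinableQueue()
            for item in chunks[i]:
                queues[i].put(item)
            queues[i].put(None)  # poison pill terminates this worker

        processes = [Process(target=worker, args=(queues[i],)) for i in queues]
        for p in processes:
            p.start()
        for p in processes:
            p.join()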
Index: trunk/tools/editor_trends/split_xml_file.py |
— | — | @@ -82,7 +82,7 @@ |
83 | 83 | ns = [] |
84 | 84 | for namespace in namespaces: |
85 | 85 | value = namespaces[namespace].get(u'*', None) |
86 | | - if value != None and value != '' and not value.endswith('talk'): |
| 86 | + if value is not None and value != '': |
87 | 87 | ns.append(value) |
88 | 88 | return ns |
89 | 89 | |
— | — | @@ -122,9 +122,10 @@ |
123 | 123 | |
124 | 124 | def create_xml_file_handle(fh, counter, size, language): |
125 | 125 | '''Create file handle if none is supplied or if file size > max file size.''' |
| 126 | + if counter is None: |
| 127 | + counter = 0 |
126 | 128 | path = os.path.join(settings.XML_FILE_LOCATION, language, '%s.xml' % counter) |
127 | 129 | if not fh: |
128 | | - counter = 0 |
129 | 130 | fh = codecs.open(path, 'w', encoding=settings.ENCODING) |
130 | 131 | return fh, counter |
131 | 132 | elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE: |
— | — | @@ -137,39 +138,49 @@ |
138 | 139 | return fh, counter |
139 | 140 | |
140 | 141 | |
141 | | -def split_xml(language): |
| 142 | +def split_xml(location, filename, project, language_code): |
142 | 143 | '''Reads xml file and splits it in N chunks''' |
143 | | - location = os.path.join(settings.XML_FILE_LOCATION, language) |
| 144 | + #location = os.path.join(settings.XML_FILE_LOCATION, language) |
144 | 145 | result = utils.check_file_exists(location, '') |
145 | 146 | if result == False: |
146 | 147 | result = utils.create_directory(location) |
147 | 148 | if not result: |
148 | 149 | return |
149 | 150 | |
150 | | - ns = load_namespace(language) |
| 151 | + ns = load_namespace(language_code) |
151 | 152 | ns = build_namespaces_locale(ns) |
152 | 153 | |
153 | 154 | fh = None |
154 | 155 | counter = None |
| 156 | + source = os.path.join(location, filename) |
155 | 157 | tag = '{%s}page' % settings.NAME_SPACE |
156 | 158 | |
157 | | - context = cElementTree.iterparse(settings.XML_FILE, events=('start', 'end')) |
| 159 | + context = cElementTree.iterparse(source, events=('start', 'end')) |
158 | 160 | context = iter(context) |
159 | 161 | event, root = context.next() #get the root element of the XML doc |
160 | 162 | |
161 | | - for event, elem in context: |
162 | | - if event == 'end': |
163 | | - if elem.tag == tag: |
164 | | - elem = remove_namespace(elem, settings.NAME_SPACE) |
165 | | - elem = parse_comments(elem, remove_numeric_character_references) |
| 163 | + try: |
| 164 | + for event, elem in context: |
| 165 | + if event == 'end': |
| 166 | + if elem.tag == tag: |
| 167 | + elem = remove_namespace(elem, settings.NAME_SPACE) |
| 168 | + if is_article_main_namespace(elem, ns): |
| 169 | + elem = parse_comments(elem, remove_numeric_character_references) |
| 170 | + fh, counter = write_xml_file(elem, fh, counter, language_code) |
| 171 | + root.clear() # when done parsing a section, clear the tree to save memory |
| 172 | + #elem = parse_comments(elem, convert_html_entities) |
| 173 | + #elem = parse_comments(elem, remove_ascii_control_characters) |
| 174 | + #print cElementTree.tostring(elem) |
| 175 | + except SyntaxError: |
| 176 | + fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING) |
| 177 | + fh.write(cElementTree.tostring(elem)) |
| 178 | + fh.close() |
166 | 179 | |
167 | | - if is_article_main_namespace(elem, ns): |
168 | | - fh, counter = write_xml_file(elem, fh, counter, language) |
169 | | - root.clear() # when done parsing a section clear the tree to safe memory |
170 | | - #elem = parse_comments(elem, convert_html_entities) |
171 | | - #elem = parse_comments(elem, remove_ascii_control_characters) |
172 | | - #print cElementTree.tostring(elem) |
173 | 180 | |
174 | | - |
175 | 181 | if __name__ == "__main__": |
176 | | - split_xml('en') |
| 182 | + kwargs = {'location': 'c:\\Source_files\\', |
| 183 | + 'filename': settings.XML_FILE, |
| 184 | + 'project': 'wiki', |
| 185 | + 'language_code': 'en' |
| 186 | + } |
| 187 | + split_xml(**kwargs) |
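Note: the splitter's memory behaviour hinges on clearing the root after each completed page element, and the new try/except keeps a malformed dump from killing the whole run. A minimal, self-contained illustration of the iterparse pattern; the tag name and document are illustrative:

    from xml.etree import cElementTree
    from StringIO import StringIO

    xml = StringIO('<mediawiki><page><title>A</title></page>'
                   '<page><title>B</title></page></mediawiki>')
    context = iter(cElementTree.iterparse(xml, events=('start', 'end')))
    event, root = context.next()  # grab the root element first
    for event, elem in context:
        if event == 'end' and elem.tag == 'page':
            print elem.find('title').text
            root.clear()  # drop processed children to keep memory flat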
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -20,109 +20,113 @@ |
21 | 21 | |
22 | 22 | ''' |
23 | 23 | This module provides a simple caching mechanism to speed-up the process of |
24 | | -inserting records to MongoDB. The caching bject works as follows: |
25 | | -1) Each edit from an author is added to a dictionary |
26 | | -2) Every 50000 edits, the object returns %x with the most edits, and these are |
27 | | -then stored in MongoDB. By packaging multiple edits in a single commit, |
28 | | -processing time is significantly reduced. |
| 24 | +inserting records to MongoDB. The caching object works as follows: |
| 25 | +1) Each edit from an author is added to a dictionary |
| 26 | +2) Once an author accumulates a threshold number of edits, the author is |
| 27 | +queued for flushing, and the queued edits are then stored in MongoDB. By |
| 28 | +packaging multiple edits in a single commit, processing time is significantly reduced. |
29 | 29 | |
30 | 30 | This caching mechanism does not create any benefits for authors with single or |
31 | | -very few edits. |
32 | | - |
| 31 | +very few edits. |
33 | 32 | ''' |
34 | 33 | |
35 | 34 | |
36 | 35 | import sys |
37 | 36 | import datetime |
| 37 | +import random |
38 | 38 | |
39 | 39 | import settings |
40 | 40 | import db |
| 41 | +from utils import utils |
41 | 42 | |
42 | 43 | |
43 | 44 | class EditorCache(object): |
44 | 45 | def __init__(self, collection): |
45 | 46 | self.collection = collection |
46 | 47 | self.editors = {} |
47 | | - self.size = self.__sizeof__() |
48 | 48 | self.cumulative_n = 0 |
| 49 | + self.init_time = datetime.datetime.now() |
49 | 50 | self.time_started = datetime.datetime.now() |
50 | | - self.n = self.current_cache_size() |
| 51 | + self.n = 0 |
51 | 52 | self.emptied = 1 |
| 53 | + self.number_editors = 0 |
| 54 | + self.threshold_editors = set() |
| 55 | + self.threshold = 10 |
52 | 56 | |
53 | | - |
54 | 57 | def __repr__(self): |
55 | | - pass |
| 58 | + return '%s_%s' % ('editor_cache', random.randint(0, 99999)) |
56 | 59 | |
57 | | - |
58 | 60 | def _store_editor(self, key, value): |
59 | 61 | editor = self.collection.insert({'editor': key, 'edits': {}}) |
60 | 62 | self.editors[key]['id'] = str(editor) |
61 | 63 | |
62 | | - |
63 | 64 | def current_cache_size(self): |
64 | 65 | return sum([self.editors[k].get('obs', 0) for k in self.editors]) |
65 | 66 | |
66 | | - |
67 | 67 | def add(self, key, value): |
68 | | - self.cumulative_n += 1 |
69 | | - if key not in self.editors: |
70 | | - self.editors[key] = {} |
71 | | - self.editors[key]['obs'] = 0 |
72 | | - self.editors[key]['edits'] = [] |
73 | | - |
| 68 | + if key == 'NEXT': |
| 69 | + for editor in self.threshold_editors: |
| 70 | + self.update(editor, self.editors[editor]['edits']) |
| 71 | + self.n -= self.editors[editor]['obs'] |
| 72 | + self.number_editors -= 1 |
| 73 | + del self.editors[editor] |
| 74 | + self.threshold_editors = set() |
74 | 75 | else: |
| 76 | + self.cumulative_n += 1 |
| 77 | + self.n += 1 |
| 78 | + if key not in self.editors: |
| 79 | + self.editors[key] = {} |
| 80 | + self.editors[key]['obs'] = 0 |
| 81 | + self.editors[key]['edits'] = [] |
| 82 | + self.number_editors += 1 |
| 83 | + |
75 | 84 | id = str(self.editors[key]['obs']) |
76 | 85 | self.editors[key]['edits'].append(value) |
77 | 86 | self.editors[key]['obs'] += 1 |
78 | 87 | |
| 88 | + if self.editors[key]['obs'] == self.threshold: |
| 89 | + self.threshold_editors.add(key) |
| 90 | +# self.update(key, self.editors[key]['edits']) |
| 91 | +# del self.editors[key] |
| 92 | +# self.n -= 10 |
| 93 | +# self.number_editors -= 1 |
79 | 94 | |
80 | | - if self.cumulative_n % 25000 == 0: |
81 | | - self.empty_all(5.0) |
| 95 | + def update(self, editor, values): |
| 96 | + #t = datetime.datetime.now() |
| 97 | + self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
| 98 | + #print 'It took %s to store editor %s;and the cache contains %s editors and %s items' % (datetime.datetime.now() - t, editor, self.number_editors, self.n) |
82 | 99 | |
| 100 | + def quick_sort(self, obs): |
| 101 | + if obs == []: |
| 102 | + return [] |
| 103 | + else: |
| 104 | + pivot = obs[0] |
| 105 | + lesser = self.quick_sort([x for x in obs[1:] if x < pivot]) |
| 106 | + greater = self.quick_sort([x for x in obs[1:] if x >= pivot]) |
| 107 | + return lesser + [pivot] + greater |
83 | 108 | |
84 | | - def retrieve_top_k_editors(self, percentage): |
85 | | - keys = self.editors.keys() |
86 | | - obs = [] |
87 | | - for k in keys: |
88 | | - weight = float(self.editors[k].get('obs', 0)) / self.n |
89 | | - obs.append((weight, k)) |
90 | | - obs.sort() |
91 | | - obs.reverse() |
92 | | - l = int((len(obs) / 100.0) * percentage) |
93 | | - if l == 0: |
94 | | - l = 1 |
95 | | - obs = obs[:l] |
96 | | - obs = [o[1] for o in obs] |
97 | | - return obs |
| 109 | + def store(self): |
| 110 | + utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__()) |
98 | 111 | |
| 112 | + def drop_n_observations(self, n=1): |
| 113 | + editors_to_remove = set() |
| 114 | + for editor in self.editors: |
| 115 | + if self.editors[editor]['obs'] <= n: |
| 116 | + editors_to_remove.add(editor) |
99 | 117 | |
100 | | - def update(self, editor, values): |
101 | | - self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True) |
| 118 | + for editor in editors_to_remove: |
| 119 | + del self.editors[editor] |
102 | 120 | |
103 | 121 | |
104 | | - def empty_all(self, percentage): |
105 | | - self.n = self.current_cache_size() |
106 | | - if percentage < 100.0: |
107 | | - keys = self.retrieve_top_k_editors(percentage) |
108 | | - else: |
109 | | - keys = self.editors.keys() |
110 | | - print 'Emptying cache %s time' % self.emptied |
111 | | - self.emptied += 1 |
112 | | - for key in keys: |
113 | | - if self.editors[key]['edits'] != {}: |
114 | | - self.update(key, self.editors[key]['edits']) |
115 | | - self.editors[key]['edits'] = [] |
116 | | - self.editors[key]['obs'] = 0.0 |
117 | | - |
118 | | - |
119 | 122 | def debug(): |
120 | 123 | mongo = db.init_mongo_db('test') |
121 | 124 | collection = mongo['test'] |
122 | | - cache = EditorCache(collection) |
| 125 | + cache = EditorCache(collection) |
123 | 126 | import random |
124 | 127 | for i in xrange(100000): |
125 | 128 | cache.add(str(random.randrange(0, 5)), {'date': 'woensaag', 'article': '3252'}) |
126 | | - cache.empty_all(100) |
| 129 | + cache.add('NEXT', '') |
| 130 | + print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - cache.init_time, cache.cumulative_n) |
127 | 131 | |
128 | 132 | |
129 | 133 | if __name__ == '__main__': |
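Note: a hedged usage sketch of the reworked cache. Edits accumulate per editor; an editor that reaches the threshold is queued, and the sentinel key 'NEXT' flushes the queued editors in one batch. The MongoDB setup mirrors debug() above, and the import path is an assumption:

    import db  # assumes the same module used by debug()
    from database.cache import EditorCache

    mongo = db.init_mongo_db('test')
    cache = EditorCache(mongo['test'])
    for i in xrange(25):
        # the same editor crosses the threshold of 10 along the way
        cache.add('editor_1', {'date': 'today', 'article': '42'})
    cache.add('NEXT', '')  # flush every editor queued at the threshold
    print 'editors still cached: %s' % cache.number_editors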