Index: trunk/tools/editor_trends/manage.py |
— | — | @@ -19,7 +19,6 @@ |
20 | 20 | |
21 | 21 | import os |
22 | 22 | import sys |
23 | | -import subprocess |
24 | 23 | import datetime |
25 | 24 | from argparse import ArgumentParser |
26 | 25 | from argparse import RawTextHelpFormatter |
— | — | @@ -44,15 +43,15 @@ |
45 | 44 | class Timer(object): |
46 | 45 | def __init__(self): |
47 | 46 | self.t0 = datetime.datetime.now() |
48 | | - |
| 47 | + |
49 | 48 | def stop(self): |
50 | 49 | self.t1 = datetime.datetime.now() |
51 | | - |
| 50 | + |
52 | 51 | def elapsed(self): |
53 | 52 | self.stop() |
54 | 53 | print 'Processing time: %s' % (self.t1 - self.t0) |
55 | | - |
56 | 54 | |
| 55 | + |
57 | 56 | def get_value(args, key): |
58 | 57 | return getattr(args, key, None) |
59 | 58 | |
— | — | @@ -69,12 +68,12 @@ |
70 | 69 | def retrieve_projectname(args): |
71 | 70 | language_code = retrieve_language(args) |
72 | 71 | if language_code == None: |
73 | | - print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language') |
| 72 | + print 'Entered language: %s is not a valid Wikimedia language' % get_value(args, 'language') |
74 | 73 | sys.exit(-1) |
75 | 74 | project = retrieve_project(args) |
76 | 75 | |
77 | 76 | if project == None: |
78 | | - print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project') |
| 77 | + print 'Entered project: %s is not a valid Wikimedia Foundation project.' % get_value(args, 'project') |
79 | 78 | sys.exit(-1) |
80 | 79 | if project == 'commonswiki': |
81 | 80 | return project |
— | — | @@ -91,7 +90,7 @@ |
92 | 91 | def retrieve_project(args): |
93 | 92 | project = get_value(args, 'project') |
94 | 93 | if project != 'wiki': |
95 | | - project = settings.WIKIMEDIA_PROJECTS.get(project, None) |
| 94 | + project = settings.projects.get(project, None) |
96 | 95 | return project |
97 | 96 | |
98 | 97 | |
— | — | @@ -107,9 +106,15 @@ |
108 | 107 | locations['language_code'] = language_code |
109 | 108 | locations['language'] = get_value(args, 'language') |
110 | 109 | locations['location'] = os.path.join(location, language_code, project) |
| 110 | + locations['chunks'] = os.path.join(locations['location'], 'chunks') |
| 111 | + locations['txt'] = os.path.join(locations['location'], 'txt') |
| 112 | + locations['sorted'] = os.path.join(locations['location'], 'sorted') |
| 113 | + locations['dbready'] = os.path.join(locations['location'], 'dbready') |
111 | 114 | locations['project'] = project |
112 | 115 | locations['full_project'] = retrieve_projectname(args) |
113 | 116 | locations['filename'] = generate_wikidump_filename(project, args) |
| 117 | + locations['collection'] = get_value(args, 'collection') |
| 118 | + locations['directories'] = [locations['chunks'], locations['location'], locations['txt'], locations['sorted'], locations['dbready']] |
114 | 119 | return locations |
115 | 120 | |
116 | 121 | |
— | — | @@ -119,7 +124,7 @@ |
120 | 125 | language = kwargs.pop('language') |
121 | 126 | location = kwargs.pop('location') |
122 | 127 | project = project.title() |
123 | | - language_map = utils.invert_dict(languages.MAPPING) |
| 128 | + language_map = languages.language_map() |
124 | 129 | print 'Project: %s' % (project) |
125 | 130 | print 'Language: %s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding)) |
126 | 131 | print 'Input directory: %s' % location |
— | — | @@ -163,29 +168,15 @@ |
164 | 169 | sys.exit(retcode) |
165 | 170 | chunker.split_file(location, file, project, language_code) |
166 | 171 | timer.elapsed() |
167 | | - #settings.set_custom_settings(xml_namespace='http://www.mediawiki.org/xml/export-0.3/') |
168 | 172 | |
169 | 173 | |
170 | 174 | def launch_zip_extractor(args, location, file): |
171 | 175 | timer = Timer() |
172 | | - path = settings.detect_installed_program('7zip') |
173 | | - source = os.path.join(location, file) |
174 | | - p = None |
175 | | - |
176 | | - if settings.platform == 'Windows': |
177 | | - p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
178 | | - elif settings.platform == 'Linux': |
179 | | - raise NotImplementedError |
180 | | - elif settings.platform == 'OSX': |
181 | | - raise NotImplementedError |
182 | | - else: |
183 | | - raise exceptions.PlatformNotSupportedError |
| 176 | + utils.zip_extract(location, file, compression='7z') |
184 | 177 | timer.elapsed() |
185 | | - return p |
186 | 178 | |
187 | 179 | |
188 | 180 | def extract_launcher(args, **kwargs): |
189 | | - print 'mongodb_script_launcher' |
190 | 181 | timer = Timer() |
191 | 182 | location = kwargs.pop('location') |
192 | 183 | language_code = kwargs.pop('language_code') |
— | — | @@ -199,13 +190,23 @@ |
200 | 191 | location = kwargs.pop('location') |
201 | 192 | input = os.path.join(location, 'txt') |
202 | 193 | output = os.path.join(location, 'sorted') |
| 194 | + final_output = os.path.join(location, 'dbready') |
203 | 195 | dbname = kwargs.pop('full_project') |
204 | 196 | loader.mergesort_launcher(input, output) |
205 | | - filename = loader.mergesort_external_launcher(dbname, output, output) |
206 | | - loader.store_editors(output, filename, dbname, 'editors') |
| 197 | + loader.mergesort_external_launcher(dbname, output, output, final_output) |
207 | 198 | timer.elapsed() |
208 | 199 | |
209 | 200 | |
| 201 | +def store_launcher(args, **kwargs): |
| 202 | + timer = Timer() |
| 203 | + location = kwargs.pop('location') |
| 204 | + input = os.path.join(location, 'dbready') |
| 205 | + dbname = kwargs.pop('full_project') |
| 206 | + collection = kwargs.pop('collection') |
| 207 | + loader.store_editors(input, dbname, collection) |
| 208 | + timer.elapsed() |
| 209 | + |
| 210 | + |
210 | 211 | def transformer_launcher(args, **kwargs): |
211 | 212 | print 'dataset launcher' |
212 | 213 | timer = Timer() |
— | — | @@ -289,6 +290,8 @@ |
290 | 291 | |
291 | 292 | parser_config = subparsers.add_parser('config', help='The config sub command allows you to set the location where files are stored.') |
292 | 293 | parser_config.set_defaults(func=config_launcher) |
| 294 | + parser_config.add_argument('-f', '--force', action='store_true', |
| 295 | + help='Reconfigure Editor Toolkit (this will replace wiki.cfg).') |
293 | 296 | |
294 | 297 | parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.') |
295 | 298 | parser_download.set_defaults(func=dump_downloader_launcher) |
— | — | @@ -296,11 +299,17 @@ |
297 | 300 | parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file into smaller chunks to parallelize information extraction.') |
298 | 301 | parser_split.set_defaults(func=chunker_launcher) |
299 | 302 | |
| 303 | + parser_create = subparsers.add_parser('extract', help='The extract sub command parses the XML chunk files and extracts the editor information.') |
| 304 | + parser_create.set_defaults(func=extract_launcher) |
| 305 | + |
300 | 306 | parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant processing time reductions are achieved.') |
301 | 307 | parser_sort.set_defaults(func=sort_launcher) |
302 | 308 | |
303 | | - parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.') |
304 | | - parser_create.set_defaults(func=extract_launcher) |
| 309 | + parser_store = subparsers.add_parser('store', help='The store sub command reads the sorted editor data and stores it in a MongoDB collection.') |
| 310 | + parser_store.set_defaults(func=store_launcher) |
| 311 | + parser_store.add_argument('-c', '--collection', action='store', |
| 312 | + help='Name of MongoDB collection', |
| 313 | + default='editors') |
305 | 314 | |
306 | 315 | parser_transform = subparsers.add_parser('transform', help='Transform the raw database to an enriched dataset that can be exported.') |
307 | 316 | parser_transform.set_defaults(func=transformer_launcher) |
— | — | @@ -337,10 +346,9 @@ |
338 | 347 | detect_python_version() |
339 | 348 | about() |
340 | 349 | args = parser.parse_args() |
341 | | - if not os.path.exists('wiki.cfg'): |
342 | | - config.create_configuration(settings, args) |
| 350 | + config.create_configuration(settings, args) |
343 | 351 | locations = determine_file_locations(args) |
344 | | - settings.verify_environment([locations['location']]) |
| 352 | + settings.verify_environment(locations['directories']) |
345 | 353 | show_settings(args, **locations) |
346 | 354 | #locations['settings'] = settings |
347 | 355 | args.func(args, **locations) |
— | — | @@ -348,5 +356,4 @@ |
349 | 357 | |
350 | 358 | |
351 | 359 | if __name__ == '__main__': |
352 | | - #args = ['download', '-l', 'Russian'] |
353 | 360 | main() |
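The net effect of the manage.py changes is a per-stage directory layout: determine_file_locations now registers chunks, txt, sorted and dbready directories plus the MongoDB collection name, and verify_environment creates all of them up front. A minimal sketch of the resulting dict, with an assumed base path that is not part of the patch:

    import os

    location = os.path.join('data', 'en', 'wiki')        # hypothetical base path
    locations = {
        'location': location,
        'chunks': os.path.join(location, 'chunks'),      # split XML chunks
        'txt': os.path.join(location, 'txt'),            # extracted editor data
        'sorted': os.path.join(location, 'sorted'),      # first mergesort pass
        'dbready': os.path.join(location, 'dbready'),    # final merged files
        'collection': 'editors',                         # default of the new -c flag
    }
    locations['directories'] = [locations['chunks'], locations['location'],
                                locations['txt'], locations['sorted'],
                                locations['dbready']]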
Index: trunk/tools/editor_trends/wikitree/xml.py |
— | — | @@ -28,8 +28,10 @@ |
29 | 29 | |
30 | 30 | def extract_text(elem, kwargs): |
31 | 31 | if elem != None and elem.text != None: |
32 | | - return elem.text.decode(settings.encoding) |
33 | | - return None |
| 32 | + return elem.text |
| 33 | + return None |
34 | 36 | |
35 | 37 | |
36 | 38 | def retrieve_xml_node(xml_nodes, name): |
Index: trunk/tools/editor_trends/etl/extract.py |
— | — | @@ -63,11 +63,12 @@ |
64 | 64 | new_xmlfile() |
65 | 65 | |
66 | 66 | class XMLFile(object): |
67 | | - def __init__(self, input, output, file, bots, **kwargs): |
| 67 | + def __init__(self, input, output, file, bots, target, **kwargs): |
68 | 68 | self.file = file |
69 | 69 | self.input = input |
70 | 70 | self.output = output |
71 | 71 | self.bots = bots |
| 72 | + self.target = target |
72 | 73 | for kw in kwargs: |
73 | 74 | setattr(self, kw, kwargs[kw]) |
74 | 75 | |
— | — | @@ -96,7 +97,7 @@ |
97 | 98 | raw_data = ''.join(raw_data) |
98 | 99 | xml_buffer.write(raw_data) |
99 | 100 | elem = cElementTree.XML(xml_buffer.getvalue()) |
100 | | - output_editor_information(elem, self.fh, bots=self.bots, destination=self.destination) |
| 101 | + self.target(elem, self.fh, bots=self.bots, destination=self.destination) |
101 | 102 | except SyntaxError, error: |
102 | 103 | print error |
103 | 104 | ''' |
— | — | @@ -160,6 +161,7 @@ |
161 | 162 | else: |
162 | 163 | return None |
163 | 164 | |
| 165 | + |
164 | 166 | def extract_contributor_id(contributor, kwargs): |
165 | 167 | ''' |
166 | 168 | @contributor is the xml contributor node containing a number of attributes |
— | — | @@ -339,20 +341,17 @@ |
340 | 342 | tasks = multiprocessing.JoinableQueue() |
341 | 343 | consumers = [XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)] |
342 | 344 | for file in files: |
343 | | - tasks.put(XMLFile(input, output, file, bots, **kwargs)) |
| 345 | + tasks.put(XMLFile(input, output, file, bots, output_editor_information, **kwargs)) |
| 346 | + print 'The queue contains %s files.' % tasks.qsize() |
344 | 347 | for x in xrange(settings.number_of_processes): |
345 | 348 | tasks.put(None) |
346 | 349 | |
347 | | - print tasks.qsize() |
348 | 350 | for w in consumers: |
349 | 351 | w.start() |
350 | 352 | |
351 | 353 | tasks.join() |
352 | 354 | |
353 | | - #chunks = utils.split_list(files, settings.number_of_processes) |
354 | | - #pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs) |
355 | 355 | |
356 | | - |
357 | 356 | def debug_parse_editors(dbname): |
358 | 357 | q = JoinableQueue() |
359 | 358 | parse_editors('522.xml', q, None, None, debug=True, destination='file') |
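Injecting target means XMLFile no longer hard-codes output_editor_information; any callable with the same signature can be queued. A hedged sketch of a drop-in replacement (the function below is illustrative, not part of the patch):

    def count_revisions(elem, fh, **kwargs):
        # same signature as output_editor_information
        print '%s revisions' % len(elem.findall('revision'))

    # queued exactly like the real target:
    # tasks.put(XMLFile(input, output, '0.xml', bots, count_revisions, **kwargs))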
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -38,8 +38,9 @@ |
39 | 39 | @dbname is the name of the MongoDB database in which to store the information. |
40 | 40 | ''' |
41 | 41 | dbname = kwargs.get('dbname', None) |
| 42 | + collection = kwargs.pop('collection') |
42 | 43 | mongo = db.init_mongo_db(dbname) |
43 | | - collection = mongo['editors'] |
| 44 | + collection = mongo[collection] |
44 | 45 | collection.ensure_index('editor') |
45 | 46 | editor_cache = cache.EditorCache(collection) |
46 | 47 | |
— | — | @@ -82,9 +83,9 @@ |
83 | 84 | return cache |
84 | 85 | |
85 | 86 | |
86 | | -def search_cache_for_missed_editors(dbname): |
| 87 | +def search_cache_for_missed_editors(dbname, collection): |
87 | 88 | mongo = db.init_mongo_db(dbname) |
88 | | - collection = mongo['editors'] |
| 89 | + collection = mongo[collection] |
89 | 90 | editor_cache = cache.EditorCache(collection) |
90 | 91 | cache = load_cache_objects() |
91 | 92 | for c in cache: |
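With the collection name parameterized, one database can hold several editor collections side by side. An illustrative call, assuming a running MongoDB instance and the enwiki database used elsewhere in the tree:

    from etl import store

    # 'editors' is the default collection created by the new store sub command
    store.search_cache_for_missed_editors('enwiki', 'editors')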
Index: trunk/tools/editor_trends/etl/chunker.py |
— | — | @@ -24,6 +24,7 @@ |
25 | 25 | import json |
26 | 26 | import os |
27 | 27 | |
| 28 | + |
28 | 29 | import progressbar |
29 | 30 | |
30 | 31 | |
— | — | @@ -79,14 +80,15 @@ |
80 | 81 | return ns |
81 | 82 | |
82 | 83 | |
83 | | -def build_namespaces_locale(namespaces): |
| 84 | +def build_namespaces_locale(namespaces, include=[0]): |
84 | 85 | ''' |
85 | | - Construct a list of all the non-main namespaces |
| 86 | + @include is a list of namespace keys that should not be ignored, the default |
| 87 | + setting is to ignore all namespaces except the main namespace. |
86 | 88 | ''' |
87 | 89 | ns = [] |
88 | 90 | for namespace in namespaces: |
89 | | - value = namespaces[namespace].get(u'*', None) |
90 | | - if value != None and value != '': |
| 91 | + if int(namespace) not in include: |
| 92 | + value = namespaces[namespace].get(u'*', None) |
91 | 93 | ns.append(value) |
92 | 94 | return ns |
93 | 95 | |
— | — | @@ -114,32 +116,39 @@ |
115 | 117 | |
116 | 118 | def write_xml_file(element, fh, output, counter): |
117 | 119 | '''Get file handle and write xml element to file''' |
118 | | - size = len(cElementTree.tostring(element)) |
119 | | - fh, counter = create_file_handle(fh, output, counter, size) |
| 120 | + xml_string = cElementTree.tostring(element) |
| 121 | + size = len(xml_string) |
| 122 | + fh, counter, new_file = create_file_handle(fh, output, counter, size) |
120 | 123 | try: |
121 | | - fh.write(cElementTree.tostring(element)) |
| 124 | + fh.write(xml_string) |
122 | 125 | except MemoryError: |
123 | 126 | print 'Add error capturing logic' |
124 | 127 | fh.write('\n') |
125 | | - return fh, counter |
| 128 | + return fh, counter, new_file |
126 | 129 | |
127 | 130 | |
128 | 131 | def create_file_handle(fh, output, counter, size): |
129 | | - '''Create file handle if none is supplied or if file size > max file size.''' |
130 | | - if not counter: |
| 132 | + ''' |
| 133 | + @fh is the file handle; if none is supplied, or if the current file |
| 134 | + exceeds the maximum file size, a new file handle is created |
| 135 | + @output is the location where to store the files |
| 136 | + @counter indicates which chunk it is |
| 137 | + @size is the length of the xml element about to be written to file. |
| 138 | + ''' |
| 139 | + if not fh: |
131 | 140 | counter = 0 |
132 | | - path = os.path.join(output, '%s.xml' % counter) |
133 | | - if not fh: |
| 141 | + path = os.path.join(output, '%s.xml' % counter) |
134 | 142 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
135 | | - return fh, counter |
136 | | - elif (fh.tell() + size) > settings.max_settings_xmlfile_size: |
137 | | - print 'Created chunk %s' % counter |
| 143 | + return fh, counter, False |
| 144 | + elif (fh.tell() + size) > settings.max_xmlfile_size: |
| 145 | + print 'Created chunk %s' % (counter + 1) |
138 | 146 | fh.close() |
139 | 147 | counter += 1 |
| 148 | + path = os.path.join(output, '%s.xml' % counter) |
140 | 149 | fh = codecs.open(path, 'w', encoding=settings.encoding) |
141 | | - return fh, counter |
| 150 | + return fh, counter, True |
142 | 151 | else: |
143 | | - return fh, counter |
| 152 | + return fh, counter, False |
144 | 153 | |
145 | 154 | |
146 | 155 | def flatten_xml_elements(data, page): |
— | — | @@ -154,9 +163,9 @@ |
155 | 164 | else: |
156 | 165 | flat[x].append(xml.extract_text(elem, None)) |
157 | 166 | return flat |
158 | | - |
159 | 167 | |
160 | | -def split_file(location, file, project, language_code, language, format='xml'): |
| 168 | + |
| 169 | +def split_file(location, file, project, language_code, include=[0], format='xml', zip=False): |
161 | 170 | '''Reads xml file and splits it in N chunks''' |
162 | 171 | #location = os.path.join(settings.input_location, language) |
163 | 172 | input = os.path.join(location, file) |
— | — | @@ -167,12 +176,11 @@ |
168 | 177 | else: |
169 | 178 | f = input.replace('.xml', '') |
170 | 179 | fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding) |
171 | | - |
| 180 | + |
172 | 181 | ns = load_namespace(language_code) |
173 | | - ns = build_namespaces_locale(ns) |
| 182 | + ns = build_namespaces_locale(ns, include) |
174 | 183 | |
175 | | - settings.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/' |
176 | | - counter = None |
| 184 | + counter = 0 |
177 | 185 | tag = '{%s}page' % settings.xml_namespace |
178 | 186 | context = cElementTree.iterparse(input, events=('start', 'end')) |
179 | 187 | context = iter(context) |
— | — | @@ -186,7 +194,11 @@ |
187 | 195 | page = elem.find('id').text |
188 | 196 | elem = parse_comments(elem, remove_numeric_character_references) |
189 | 197 | if format == 'xml': |
190 | | - fh, counter = write_xml_file(elem, fh, output, counter) |
| 198 | + fh, counter, new_file = write_xml_file(elem, fh, output, counter) |
| 199 | + if zip and new_file: |
| 200 | + file = str(counter - 1) + '.xml' |
| 201 | + utils.zip_archive(output, file) |
| 202 | + utils.delete_file(output, file) |
191 | 203 | else: |
192 | 204 | data = [el.getchildren() for el in elem if el.tag == 'revision'] |
193 | 205 | data = flatten_xml_elements(data, page) |
— | — | @@ -196,9 +208,9 @@ |
197 | 209 | f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding) |
198 | 210 | f.write(cElementTree.tostring(elem)) |
199 | 211 | f.close() |
200 | | - finally: |
201 | | - fh.close() |
202 | 212 | |
| 213 | + fh.close() |
| 214 | + |
203 | 215 | if __name__ == "__main__": |
204 | 216 | kwargs = {'output': settings.input_location, |
205 | 217 | 'input': settings.input_filename, |
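The three-value return of create_file_handle is what enables the optional zip step: new_file is True exactly when a full chunk was closed and a fresh handle opened, so the caller can compress the finished chunk and delete the original. A condensed sketch of the rotation contract; pages and output are placeholders, not values from the patch:

    from etl import chunker
    from utils import utils

    output = 'chunks'   # hypothetical chunk directory
    pages = []          # placeholder for a stream of parsed <page> elements

    fh, counter = None, 0
    for elem in pages:
        fh, counter, new_file = chunker.write_xml_file(elem, fh, output, counter)
        if new_file:    # chunk counter-1 was just closed and can be compressed
            finished = '%s.xml' % (counter - 1)
            utils.zip_archive(output, finished)
            utils.delete_file(output, finished)
    if fh:
        fh.close()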
Index: trunk/tools/editor_trends/etl/loader.py |
— | — | @@ -32,7 +32,8 @@ |
33 | 33 | |
34 | 34 | |
35 | 35 | |
36 | | -def store_editors(input, filename, dbname, collection): |
| 36 | +def store_editors(input, dbname, collection): |
| 37 | + filename = utils.retrieve_file_list(input, 'txt', mask=None)[0] |
37 | 38 | fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding) |
38 | 39 | mongo = db.init_mongo_db(dbname) |
39 | 40 | collection = mongo[collection] |
— | — | @@ -70,7 +71,7 @@ |
71 | 72 | utils.store_object(editors, settings.binary_location, 'editors') |
72 | 73 | |
73 | 74 | |
74 | | -def mergesort_external_launcher(dbname, input, output): |
| 75 | +def mergesort_external_launcher(dbname, input, intermediate_output, output): |
75 | 76 | files = utils.retrieve_file_list(input, 'txt', mask='') |
76 | 77 | x = 0 |
77 | 78 | maxval = 99999 |
— | — | @@ -79,11 +80,12 @@ |
80 | 81 | maxval = round(len(files) / x) |
81 | 82 | chunks = utils.split_list(files, int(x)) |
82 | 83 | '''1st iteration external mergesort''' |
| 84 | + if len(chunks) < 2: |
| 85 | + intermediate_output = output |
83 | 86 | for chunk in chunks: |
84 | 87 | filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]] |
85 | | - filename = sort.merge_sorted_files(output, filehandles, chunk) |
| 88 | + filename = sort.merge_sorted_files(intermediate_output, filehandles, chunk) |
86 | 89 | filehandles = [fh.close() for fh in filehandles] |
87 | | -# pass |
88 | 90 | '''2nd iteration external mergesort, if necessary''' |
89 | 91 | if len(chunks) > 1: |
90 | 92 | files = utils.retrieve_file_list(intermediate_output, 'txt', mask='[merged]') |
— | — | @@ -91,7 +93,7 @@ |
92 | 94 | filename = sort.merge_sorted_files(output, filehandles, 'final') |
93 | 95 | filehandles = [fh.close() for fh in filehandles] |
94 | 96 | filename = 'merged_final.txt' |
95 | | - return filename |
| 97 | + |
96 | 98 | |
97 | 99 | |
98 | 100 | def mergesort_feeder(task_queue, **kwargs): |
— | — | @@ -134,4 +136,6 @@ |
135 | 137 | output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted') |
136 | 138 | dbname = 'enwiki' |
137 | 139 | #mergesort_launcher(input, output) |
138 | | - mergesort_external_launcher(dbname, output, output) |
\ No newline at end of file |
| 140 | + final_output = os.path.join(settings.input_location, 'en', 'wiki', 'dbready') |
| 141 | + mergesort_external_launcher(dbname, output, output, final_output) |
| 142 | + store_editors(final_output, dbname, 'editors') |
\ No newline at end of file |
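Sorting is now a two-pass external mergesort: the first pass merges sorted chunk files into an intermediate directory, and only when more than one chunk exists does a second pass merge those into merged_final.txt in the output directory. An illustrative invocation of the four-argument signature, with assumed directories:

    import os
    from etl import loader

    base = os.path.join('data', 'en', 'wiki')   # hypothetical layout
    sorted_dir = os.path.join(base, 'sorted')
    dbready = os.path.join(base, 'dbready')

    # intermediate results stay in sorted_dir; the final file lands in dbready
    loader.mergesort_external_launcher('enwiki', sorted_dir, sorted_dir, dbready)
    loader.store_editors(dbready, 'enwiki', 'editors')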
Index: trunk/tools/editor_trends/config.py |
— | — | @@ -22,29 +22,49 @@ |
23 | 23 | import ConfigParser |
24 | 24 | |
25 | 25 | from utils import utils |
| 26 | +import languages |
26 | 27 | |
27 | | - |
28 | 28 | def create_configuration(settings, args): |
29 | | - config = ConfigParser.RawConfigParser() |
| 29 | + force = getattr(args, 'force', False) |
| 30 | + if not os.path.exists('wiki.cfg') or force: |
| 31 | + config = ConfigParser.RawConfigParser() |
| 32 | + project = None |
| 33 | + language = None |
| 34 | + language_map = languages.language_map() |
| 35 | + working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd()) |
| 36 | + input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location) |
30 | 37 | |
31 | | - working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd()) |
32 | | - input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.input_location) |
33 | | - input_location = input_location if len(input_location) > 0 else settings.input_location |
34 | | - working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
35 | | - |
36 | | - config = ConfigParser.RawConfigParser() |
37 | | - config.add_section('file_locations') |
38 | | - config.set('file_locations', 'working_directory', working_directory) |
39 | | - config.set('file_locations', 'input_location', input_location) |
| 38 | + while project not in settings.projects.keys(): |
| 39 | + project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % settings.projects[args.project].capitalize()) |
| 40 | + project = project if len(project) > 0 else args.project |
| 41 | + if project not in settings.projects.keys(): |
| 42 | + print 'Valid choices for a project are: %s' % ','.join(settings.projects.keys()) |
40 | 43 | |
41 | | - fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
42 | | - config.write(fh) |
43 | | - fh.close() |
44 | | - |
45 | | - settings.working_directory = config.get('file_locations', 'working_directory') |
46 | | - settings.input_location = config.get('file_locations', 'input_location') |
47 | | - return settings |
| 44 | + while language not in languages.MAPPING: |
| 45 | + language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (settings.projects[project].capitalize(), language_map[args.language])) |
| 46 | + if len(language) == 0: |
| 47 | + language = language_map[args.language] |
| 48 | + language = language if language in languages.MAPPING else args.language |
48 | 49 | |
| 50 | + input_location = input_location if len(input_location) > 0 else settings.input_location |
| 51 | + working_directory = working_directory if len(working_directory) > 0 else os.getcwd() |
49 | 52 | |
| 53 | + config = ConfigParser.RawConfigParser() |
| 54 | + config.add_section('file_locations') |
| 55 | + config.set('file_locations', 'working_directory', working_directory) |
| 56 | + config.set('file_locations', 'input_location', input_location) |
| 57 | + config.add_section('wiki') |
| 58 | + config.set('wiki', 'project', project) |
| 59 | + config.set('wiki', 'language', language) |
| 60 | + |
| 61 | + fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb') |
| 62 | + config.write(fh) |
| 63 | + fh.close() |
| 64 | + |
| 65 | + settings.working_directory = config.get('file_locations', 'working_directory') |
| 66 | + settings.input_location = config.get('file_locations', 'input_location') |
| 67 | + return settings |
| 68 | + |
| 69 | + |
50 | 70 | if __name__ == '__main__': |
51 | | - pass |
\ No newline at end of file |
| 71 | + pass |
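Because wiki.cfg now carries a wiki section next to file_locations, the chosen project and language survive between runs (load_configuration in configuration.py reads both sections). A short sketch of reading the file back, assuming it was written by create_configuration:

    import ConfigParser

    config = ConfigParser.RawConfigParser()
    config.read('wiki.cfg')
    print config.get('file_locations', 'input_location')
    print config.get('wiki', 'project'), config.get('wiki', 'language')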
Index: trunk/tools/editor_trends/languages.py |
— | — | @@ -24,6 +24,8 @@ |
25 | 25 | ''' |
26 | 26 | |
27 | 27 | from utils import ordered_dict as odict |
| 28 | +from utils import utils |
| 29 | + |
28 | 30 | MAPPING = odict.OrderedDict([ |
29 | 31 | (u'English','en'), |
30 | 32 | (u'German','de'), |
— | — | @@ -604,4 +606,7 @@ |
605 | 607 | (u'Muskogee','mus'), |
606 | 608 | (u'Kanuri','kr'), |
607 | 609 | (u'Otsiherero','hz'), |
608 | | -]) |
\ No newline at end of file |
| 610 | +]) |
| 611 | + |
| 612 | +def language_map(): |
| 613 | + return utils.invert_dict(MAPPING) |
\ No newline at end of file |
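language_map simply inverts MAPPING, so lookups work in both directions; for example, assuming the module is importable:

    import languages

    print languages.MAPPING[u'English']     # 'en'
    print languages.language_map()['en']    # u'English'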
Index: trunk/tools/editor_trends/configuration.py |
— | — | @@ -50,7 +50,7 @@ |
51 | 51 | self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte |
52 | 52 | self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestampformat as generated by the MediaWiki dumps |
53 | 53 | |
54 | | - self.max_settings_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason |
| 54 | + self.max_xmlfile_size = 67108864 # 64 MB; see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Sets for the rationale |
55 | 55 | self.number_of_processes = cpu_count() * process_multiplier |
56 | 56 | #Change this to match your computers configuration (RAM / CPU) |
57 | 57 | self.minimum_python_version = (2, 6) |
— | — | @@ -69,25 +69,27 @@ |
70 | 70 | self.file_locations = self.set_file_locations() |
71 | 71 | self.max_filehandles = self.determine_max_filehandles_open() |
72 | 72 | |
73 | | - self.windows_register = {'7zip': 'Software\\7-Zip', } |
| 73 | + self.windows_register = {'7z.exe': 'Software\\7-Zip', } |
74 | 74 | self.load_configuration() |
75 | 75 | self.set_custom_settings(**kwargs) |
76 | | - self.projects = {'commons': 'commonswiki', |
77 | | - 'wikibooks': 'wikibooks', |
78 | | - 'wikinews': 'wikinews', |
79 | | - 'wikiquote': 'wikiquote', |
80 | | - 'wikisource': 'wikisource', |
81 | | - 'wikiversity': 'wikiversity', |
82 | | - 'wiktionary': 'wiktionary', |
83 | | - 'metawiki': 'metawiki', |
84 | | - 'wikispecies': 'specieswiki', |
85 | | - 'incubator': 'incubatorwiki', |
86 | | - 'foundation': 'foundationwiki', |
87 | | - 'mediawiki': 'mediawikiwiki', |
88 | | - 'outreach': 'outreachwiki', |
89 | | - 'strategic planning': 'strategywiki', |
90 | | - 'usability initiative': 'usabilitywiki', |
91 | | - 'multilingual wikisource': None |
| 76 | + self.path_ziptool = self.determine_path_ziptool() |
| 77 | + self.projects = {'wiki': 'wikipedia', |
| 78 | + 'commons': 'commonswiki', |
| 79 | + 'books': 'wikibooks', |
| 80 | + 'news': 'wikinews', |
| 81 | + 'quote': 'wikiquote', |
| 82 | + 'source': 'wikisource', |
| 83 | + 'versity': 'wikiversity', |
| 84 | + 'tionary': 'wiktionary', |
| 85 | + 'meta': 'metawiki', |
| 86 | + 'species': 'specieswiki', |
| 87 | + 'incubator': 'incubatorwiki', |
| 88 | + 'foundation': 'foundationwiki', |
| 89 | + 'mediawiki': 'mediawikiwiki', |
| 90 | + 'outreach': 'outreachwiki', |
| 91 | + 'strategic_planning': 'strategywiki', |
| 92 | + 'usability_initiative': 'usabilitywiki', |
| 93 | + 'multilingual_wikisource': None |
92 | 94 | } |
93 | 95 | |
94 | 96 | def set_custom_settings(self, **kwargs): |
— | — | @@ -100,6 +102,8 @@ |
101 | 103 | config.read(os.path.join(self.working_directory, 'wiki.cfg')) |
102 | 104 | self.working_directory = config.get('file_locations', 'working_directory') |
103 | 105 | self.input_location = config.get('file_locations', 'input_location') |
| 106 | + self.default_project = config.get('wiki', 'project') |
| 107 | + self.default_language = config.get('wiki', 'language') |
104 | 108 | |
105 | 109 | def determine_working_directory(self): |
106 | 110 | cwd = os.getcwd() |
— | — | @@ -115,6 +119,10 @@ |
116 | 120 | else: |
117 | 121 | return os |
118 | 122 | |
| 123 | + def determine_path_ziptool(self): |
| 124 | + return self.detect_installed_program(self.determine_ziptool()) |
| 125 | + |
| 126 | + |
119 | 127 | def verify_environment(self, directories): |
120 | 128 | for dir in directories: |
121 | 129 | if not os.path.exists(dir): |
— | — | @@ -146,6 +154,7 @@ |
147 | 155 | return resource.getrlimit(resource.RLIMIT_NOFILE)[0] |
148 | 156 | else: |
149 | 157 | return 500 |
| 158 | + |
150 | 159 | def update_python_path(self): |
151 | 160 | IGNORE_DIRS = ['wikistats', 'zips'] |
152 | 161 | dirs = [name for name in os.listdir(self.working_directory) if |
Index: trunk/tools/editor_trends/utils/utils.py |
— | — | @@ -31,13 +31,14 @@ |
32 | 32 | import os |
33 | 33 | import ctypes |
34 | 34 | import time |
| 35 | +import subprocess |
| 36 | +import sys |
| 37 | +sys.path.append('..') |
35 | 38 | |
36 | 39 | import configuration |
37 | 40 | settings = configuration.Settings() |
38 | 41 | import exceptions |
39 | 42 | |
40 | | -settings = configuration.Settings() |
41 | | - |
42 | 43 | try: |
43 | 44 | import psyco |
44 | 45 | psyco.full() |
— | — | @@ -250,6 +251,11 @@ |
251 | 252 | return name |
252 | 253 | |
253 | 254 | |
| 255 | +def delete_file(location, filename): |
| 256 | + if check_file_exists(location, filename): |
| 257 | + os.remove(os.path.join(location, filename)) |
| 258 | + |
| 259 | + |
254 | 260 | def check_file_exists(location, filename): |
255 | 261 | if hasattr(filename, '__call__'): |
256 | 262 | filename = construct_filename(filename, '.bin') |
— | — | @@ -350,6 +356,41 @@ |
351 | 357 | return files |
352 | 358 | |
353 | 359 | |
| 360 | +def zip_archive(location, source, compression='7z'): |
| 361 | + ''' |
| 362 | + @location is the directory where the source file lives and where the |
| 363 | + compressed file is stored |
| 364 | + @source is the name of the file to compress; the path to the zip program |
| 365 | + is taken from settings.path_ziptool |
| 365 | + ''' |
| 366 | + output = source.rsplit('.', 1)[0] + '.7z' |
| 368 | + path = settings.path_ziptool |
| 369 | + if settings.platform == 'Windows': |
| 370 | + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'a', '-scsUTF-8', '-t%s' % compression, '%s\\%s' % (location,output), '%s\\%s' % (location,source)], shell=True).wait() |
| 371 | + elif settings.platform == 'Linux': |
| 372 | + raise NotImplementedError |
| 373 | + elif settings.platform == 'OSX': |
| 374 | + raise NotImplementedError |
| 375 | + else: |
| 376 | + raise exceptions.PlatformNotSupportedError |
| 377 | + |
| 378 | + |
| 379 | +def zip_extract(location, source, compression='7z'): |
| 380 | + ''' |
| 381 | + @location is the directory holding the zipfile; files are extracted there |
| 382 | + @source is the name of the zipfile; the path to the zip program is taken |
| 383 | + from settings.path_ziptool (@compression is accepted for symmetry with |
| 384 | + zip_archive, 7z detects the format on extraction) |
| 385 | + ''' |
| 386 | + path = settings.path_ziptool |
| 387 | + source = os.path.join(location, source) |
| 388 | + if settings.platform == 'Windows': |
| 389 | + p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait() |
| 387 | + elif settings.platform == 'Linux': |
| 388 | + raise NotImplementedError |
| 389 | + elif settings.platform == 'OSX': |
| 390 | + raise NotImplementedError |
| 391 | + else: |
| 392 | + raise exceptions.PlatformNotSupportedError |
| 393 | + |
| 394 | + |
354 | 395 | def merge_list(datalist): |
355 | 396 | merged = [] |
356 | 397 | for d in datalist: |
— | — | @@ -421,4 +462,8 @@ |
422 | 463 | |
423 | 464 | |
424 | 465 | if __name__ == '__main__': |
425 | | - debug() |
| 466 | + location = os.path.join(settings.input_location, 'en', 'wiki') |
| 467 | + source = 'enwiki-20100916-stub-meta-history.xml' |
| 468 | + zip_archive(location, source) |
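Together, zip_archive and zip_extract centralize the 7-Zip handling that launch_zip_extractor used to inline in manage.py, with the program path resolved once via settings.path_ziptool. A hedged usage sketch, Windows only for now per the NotImplementedError branches; the dump filename is illustrative:

    import os
    from utils import utils

    location = os.path.join(utils.settings.input_location, 'en', 'wiki')
    utils.zip_extract(location, 'enwiki-20100916-stub-meta-history.xml.7z')
    utils.zip_archive(location, 'enwiki-20100916-stub-meta-history.xml')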