r77195 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r77194‎ | r77195 | r77196 >
Date:22:23, 23 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
* Various performance improvements
Modified paths:
  • /trunk/tools/editor_trends/config.py (modified) (history)
  • /trunk/tools/editor_trends/configuration.py (modified) (history)
  • /trunk/tools/editor_trends/etl/chunker.py (modified) (history)
  • /trunk/tools/editor_trends/etl/extract.py (modified) (history)
  • /trunk/tools/editor_trends/etl/loader.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/languages.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)
  • /trunk/tools/editor_trends/wikitree/xml.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -19,7 +19,6 @@
2020
2121 import os
2222 import sys
23 -import subprocess
2423 import datetime
2524 from argparse import ArgumentParser
2625 from argparse import RawTextHelpFormatter
@@ -44,15 +43,15 @@
4544 class Timer(object):
4645 def __init__(self):
4746 self.t0 = datetime.datetime.now()
48 -
 47+
4948 def stop(self):
5049 self.t1 = datetime.datetime.now()
51 -
 50+
5251 def elapsed(self):
5352 self.stop()
5453 print 'Processing time: %s' % (self.t1 - self.t0)
55 -
5654
 55+
5756 def get_value(args, key):
5857 return getattr(args, key, None)
5958
@@ -69,12 +68,12 @@
7069 def retrieve_projectname(args):
7170 language_code = retrieve_language(args)
7271 if language_code == None:
73 - print 'Entered language: %s is not a valid Wikipedia language' % get_value(args, 'language')
 72+ print 'Entered language: %s is not a valid Wikimedia language' % get_value(args, 'language')
7473 sys.exit(-1)
7574 project = retrieve_project(args)
7675
7776 if project == None:
78 - print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project')
 77+        print 'Entered project: %s is not a valid Wikimedia Foundation project.' % get_value(args, 'project')
7978 sys.exit(-1)
8079 if project == 'commonswiki':
8180 return project
@@ -91,7 +90,7 @@
9291 def retrieve_project(args):
9392 project = get_value(args, 'project')
9493 if project != 'wiki':
95 - project = settings.WIKIMEDIA_PROJECTS.get(project, None)
 94+ project = settings.projects.get(project, None)
9695 return project
9796
9897
@@ -107,9 +106,15 @@
108107 locations['language_code'] = language_code
109108 locations['language'] = get_value(args, 'language')
110109 locations['location'] = os.path.join(location, language_code, project)
 110+ locations['chunks'] = os.path.join(locations['location'], 'chunks')
 111+ locations['txt'] = os.path.join(locations['location'], 'txt')
 112+ locations['sorted'] = os.path.join(locations['location'], 'sorted')
 113+ locations['dbready'] = os.path.join(locations['location'], 'dbready')
111114 locations['project'] = project
112115 locations['full_project'] = retrieve_projectname(args)
113116 locations['filename'] = generate_wikidump_filename(project, args)
 117+ locations['collection'] = get_value(args, 'collection')
 118+ locations['directories'] = [locations['chunks'], locations['location'], locations['txt'], locations['sorted'], locations['dbready']]
114119 return locations
115120
116121
@@ -119,7 +124,7 @@
120125 language = kwargs.pop('language')
121126 location = kwargs.pop('location')
122127 project = project.title()
123 - language_map = utils.invert_dict(languages.MAPPING)
 128+ language_map = languages.language_map()
124129 print 'Project: %s' % (project)
125130 print 'Language: %s / %s' % (language_map[language_code].decode(settings.encoding), language.decode(settings.encoding))
126131 print 'Input directory: %s' % location
@@ -163,29 +168,15 @@
164169 sys.exit(retcode)
165170 chunker.split_file(location, file, project, language_code, language)
166171 timer.elapsed()
167 - #settings.set_custom_settings(xml_namespace='http://www.mediawiki.org/xml/export-0.3/')
168172
169173
170174 def launch_zip_extractor(args, location, file):
171175 timer = Timer()
172 - path = settings.detect_installed_program('7zip')
173 - source = os.path.join(location, file)
174 - p = None
175 -
176 - if settings.platform == 'Windows':
177 - p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait()
178 - elif settings.platform == 'Linux':
179 - raise NotImplementedError
180 - elif settings.platform == 'OSX':
181 - raise NotImplementedError
182 - else:
183 - raise exceptions.PlatformNotSupportedError
 176+ utils.zip_extract(location, file, compression='7z')
184177 timer.elapsed()
185 - return p
186178
187179
188180 def extract_launcher(args, **kwargs):
189 - print 'mongodb_script_launcher'
190181 timer = Timer()
191182 location = kwargs.pop('location')
192183 language_code = kwargs.pop('language_code')
@@ -199,13 +190,23 @@
200191 location = kwargs.pop('location')
201192 input = os.path.join(location, 'txt')
202193 output = os.path.join(location, 'sorted')
 194+ final_output = os.path.join(location, 'dbready')
203195 dbname = kwargs.pop('full_project')
204196 loader.mergesort_launcher(input, output)
205 - filename = loader.mergesort_external_launcher(dbname, output, output)
206 - loader.store_editors(output, filename, dbname, 'editors')
 197+ loader.mergesort_external_launcher(dbname, output, final_output)
207198 timer.elapsed()
208199
209200
 201+def store_launcher(args, **kwargs):
 202+ timer = Timer()
 203+ location = kwargs.pop('location')
 204+ input = os.path.join(location, 'dbready')
 205+ dbname = kwargs.pop('full_project')
 206+ collection = kwargs.pop('collection')
 207+ loader.store_editors(input, dbname, collection)
 208+ timer.elapsed()
 209+
 210+
210211 def transformer_launcher(args, **kwargs):
211212 print 'dataset launcher'
212213 timer = Timer()
@@ -289,6 +290,8 @@
290291
291292 parser_config = subparsers.add_parser('config', help='The config sub command allows you set the data location of where to store files.')
292293 parser_config.set_defaults(func=config_launcher)
 294+ parser_config.add_argument('-f', '--force', action='store_true',
 295+    help='Reconfigure Editor Toolkit (this will replace wiki.cfg)')
293296
294297 parser_download = subparsers.add_parser('download', help='The download sub command allows you to download a Wikipedia dump file.')
295298 parser_download.set_defaults(func=dump_downloader_launcher)
@@ -296,11 +299,17 @@
297300 parser_split = subparsers.add_parser('split', help='The split sub command splits the downloaded file in smaller chunks to parallelize extracting information.')
298301 parser_split.set_defaults(func=chunker_launcher)
299302
 303+    parser_create = subparsers.add_parser('extract', help='The extract sub command parses the XML chunk files, extracts the information and stores it in a MongoDB.')
 304+ parser_create.set_defaults(func=extract_launcher)
 305+
300306 parser_sort = subparsers.add_parser('sort', help='By presorting the data, significant processing time reductions are achieved.')
301307 parser_sort.set_defaults(func=sort_launcher)
302308
303 - parser_create = subparsers.add_parser('extract', help='The store sub command parsers the XML chunk files, extracts the information and stores it in a MongoDB.')
304 - parser_create.set_defaults(func=extract_launcher)
 309+    parser_store = subparsers.add_parser('store', help='The store sub command parses the XML chunk files, extracts the information and stores it in a MongoDB.')
 310+ parser_store.set_defaults(func=store_launcher)
 311+ parser_store.add_argument('-c', '--collection', action='store',
 312+ help='Name of MongoDB collection',
 313+ default='editors')
305314
306315 parser_transform = subparsers.add_parser('transform', help='Transform the raw database into an enriched dataset that can be exported.')
307316 parser_transform.set_defaults(func=transformer_launcher)
@@ -337,10 +346,9 @@
338347 detect_python_version()
339348 about()
340349 args = parser.parse_args()
341 - if not os.path.exists('wiki.cfg'):
342 - config.create_configuration(settings, args)
 350+ config.create_configuration(settings, args)
343351 locations = determine_file_locations(args)
344 - settings.verify_environment([locations['location']])
 352+ settings.verify_environment(locations['directories'])
345353 show_settings(args, **locations)
346354 #locations['settings'] = settings
347355 args.func(args, **locations)
@@ -348,5 +356,4 @@
349357
350358
351359 if __name__ == '__main__':
352 - #args = ['download', '-l', 'Russian']
353360 main()
Index: trunk/tools/editor_trends/wikitree/xml.py
@@ -28,8 +28,10 @@
2929
3030 def extract_text(elem, kwargs):
3131 if elem != None and elem.text != None:
32 - return elem.text.decode(settings.encoding)
33 - return None
 32+ #try:
 33+ return elem.text #.decode(settings.encoding)
 34+ #except UnicodeDecodeError:
 35+ #return None
3436
3537
3638 def retrieve_xml_node(xml_nodes, name):
Index: trunk/tools/editor_trends/etl/extract.py
@@ -63,11 +63,12 @@
6464 new_xmlfile()
6565
6666 class XMLFile(object):
67 - def __init__(self, input, output, file, bots, **kwargs):
 67+ def __init__(self, input, output, file, bots, target, **kwargs):
6868 self.file = file
6969 self.input = input
7070 self.output = output
7171 self.bots = bots
 72+ self.target = target
7273 for kw in kwargs:
7374 setattr(self, kw, kwargs[kw])
7475
@@ -96,7 +97,7 @@
9798 raw_data = ''.join(raw_data)
9899 xml_buffer.write(raw_data)
99100 elem = cElementTree.XML(xml_buffer.getvalue())
100 - output_editor_information(elem, self.fh, bots=self.bots, destination=self.destination)
 101+ self.target(elem, self.fh, bots=self.bots, destination=self.destination)
101102 except SyntaxError, error:
102103 print error
103104 '''
@@ -160,6 +161,7 @@
161162 else:
162163 return None
163164
 165+
164166 def extract_contributor_id(contributor, kwargs):
165167 '''
166168 @contributor is the xml contributor node containing a number of attributes
@@ -339,20 +341,17 @@
340342 tasks = multiprocessing.JoinableQueue()
341343 consumers = [XMLFileConsumer(tasks, None) for i in xrange(settings.number_of_processes)]
342344 for file in files:
343 - tasks.put(XMLFile(input, output, file, bots, **kwargs))
 345+ tasks.put(XMLFile(input, output, file, bots, output_editor_information, **kwargs))
 346+ print 'The queue contains %s files.' % tasks.qsize()
344347 for x in xrange(settings.number_of_processes):
345348 tasks.put(None)
346349
347 - print tasks.qsize()
348350 for w in consumers:
349351 w.start()
350352
351353 tasks.join()
352354
353 - #chunks = utils.split_list(files, settings.number_of_processes)
354 - #pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
355355
356 -
357356 def debug_parse_editors(dbname):
358357 q = JoinableQueue()
359358 parse_editors('522.xml', q, None, None, debug=True, destination='file')
Index: trunk/tools/editor_trends/etl/store.py
@@ -38,8 +38,9 @@
3939 @dbname is the name of the MongoDB collection where to store the information.
4040 '''
4141 dbname = kwargs.get('dbname', None)
 42+    collection = kwargs.pop('collection')
4243 mongo = db.init_mongo_db(dbname)
43 - collection = mongo['editors']
 44+ collection = mongo[collection]
4445 mongo[collection].ensure_index('editor')
4546 editor_cache = cache.EditorCache(collection)
4647
@@ -82,9 +83,9 @@
8384 return cache
8485
8586
86 -def search_cache_for_missed_editors(dbname):
 87+def search_cache_for_missed_editors(dbname, collection):
8788 mongo = db.init_mongo_db(dbname)
88 - collection = mongo['editors']
 89+ collection = mongo[collection]
8990 editor_cache = cache.EditorCache(collection)
9091 cache = load_cache_objects()
9192 for c in cache:
Index: trunk/tools/editor_trends/etl/chunker.py
@@ -24,6 +24,7 @@
2525 import json
2626 import os
2727
 28+
2829 import progressbar
2930
3031
@@ -79,14 +80,15 @@
8081 return ns
8182
8283
83 -def build_namespaces_locale(namespaces):
 84+def build_namespaces_locale(namespaces, include=[0]):
8485 '''
85 - Construct a list of all the non-main namespaces
 86+ @include is a list of namespace keys that should not be ignored, the default
 87+ setting is to ignore all namespaces except the main namespace.
8688 '''
8789 ns = []
8890 for namespace in namespaces:
89 - value = namespaces[namespace].get(u'*', None)
90 - if value != None and value != '':
 91+ if int(namespace) not in include:
 92+ value = namespaces[namespace].get(u'*', None)
9193 ns.append(value)
9294 return ns
9395
@@ -114,32 +116,39 @@
115117
116118 def write_xml_file(element, fh, output, counter):
117119 '''Get file handle and write xml element to file'''
118 - size = len(cElementTree.tostring(element))
119 - fh, counter = create_file_handle(fh, output, counter, size)
 120+ xml_string = cElementTree.tostring(element)
 121+ size = len(xml_string)
 122+ fh, counter, new_file = create_file_handle(fh, output, counter, size)
120123 try:
121 - fh.write(cElementTree.tostring(element))
 124+ fh.write(xml_string)
122125 except MemoryError:
123126 print 'Add error capturing logic'
124127 fh.write('\n')
125 - return fh, counter
 128+ return fh, counter, new_file
126129
127130
128131 def create_file_handle(fh, output, counter, size):
129 - '''Create file handle if none is supplied or if file size > max file size.'''
130 - if not counter:
 132+ '''
 133+ @fh is file handle, if none is supplied or if file size > max file size then
 134+ create a new file handle
 135+ @output is the location where to store the files
 136+ @counter indicates which chunk it is
 137+ @size is the length of the xml element about to be written to file.
 138+ '''
 139+ if not fh:
131140 counter = 0
132 - path = os.path.join(output, '%s.xml' % counter)
133 - if not fh:
 141+ path = os.path.join(output, '%s.xml' % counter)
134142 fh = codecs.open(path, 'w', encoding=settings.encoding)
135 - return fh, counter
136 - elif (fh.tell() + size) > settings.max_settings_xmlfile_size:
137 - print 'Created chunk %s' % counter
 143+ return fh, counter, False
 144+ elif (fh.tell() + size) > settings.max_xmlfile_size:
 145+ print 'Created chunk %s' % (counter + 1)
138146 fh.close
139147 counter += 1
 148+ path = os.path.join(output, '%s.xml' % counter)
140149 fh = codecs.open(path, 'w', encoding=settings.encoding)
141 - return fh, counter
 150+ return fh, counter, True
142151 else:
143 - return fh, counter
 152+ return fh, counter, False
144153
145154
146155 def flatten_xml_elements(data, page):
@@ -154,9 +163,9 @@
155164 else:
156165 flat[x].append(xml.extract_text(elem, None))
157166 return flat
158 -
159167
160 -def split_file(location, file, project, language_code, language, format='xml'):
 168+
 169+def split_file(location, file, project, language_code, include, format='xml', zip=False):
161170 '''Reads xml file and splits it in N chunks'''
162171 #location = os.path.join(settings.input_location, language)
163172 input = os.path.join(location, file)
@@ -167,12 +176,11 @@
168177 else:
169178 f = input.replace('.xml', '')
170179 fh = utils.create_txt_filehandle(output, '%s.tsv' % f, 'w', settings.encoding)
171 -
 180+
172181 ns = load_namespace(language_code)
173 - ns = build_namespaces_locale(ns)
 182+ ns = build_namespaces_locale(ns, include)
174183
175 - settings.xml_namespace = 'http://www.mediawiki.org/xml/export-0.3/'
176 - counter = None
 184+ counter = 0
177185 tag = '{%s}page' % settings.xml_namespace
178186 context = cElementTree.iterparse(input, events=('start', 'end'))
179187 context = iter(context)
@@ -186,7 +194,11 @@
187195 page = elem.find('id').text
188196 elem = parse_comments(elem, remove_numeric_character_references)
189197 if format == 'xml':
190 - fh, counter = write_xml_file(elem, fh, output, counter)
 198+ fh, counter, new_file = write_xml_file(elem, fh, output, counter)
 199+ if zip and new_file:
 200+ file = str(counter - 1) + '.xml'
 201+ utils.zip_archive(settings.path_ziptool, output, file)
 202+ utils.delete_file(output, file)
191203 else:
192204 data = [el.getchildren() for el in elem if el.tag == 'revision']
193205 data = flatten_xml_elements(data, page)
@@ -196,9 +208,9 @@
197209 f = utils.create_txt_filehandle(settings.log_location, 'split_xml', 'w', settings.encoding)
198210 f.write(cElementTree.tostring(elem))
199211 f.close()
200 - finally:
201 - fh.close()
202212
 213+ fh.close()
 214+
203215 if __name__ == "__main__":
204216 kwargs = {'output': settings.input_location,
205217 'input': settings.input_filename,
Index: trunk/tools/editor_trends/etl/loader.py
@@ -32,7 +32,8 @@
3333
3434
3535
36 -def store_editors(input, filename, dbname, collection):
 36+def store_editors(input, dbname, collection):
 37+ filename = utils.retrieve_file_list(input, 'txt', mask=None)[0]
3738 fh = utils.create_txt_filehandle(input, filename, 'r', settings.encoding)
3839 mongo = db.init_mongo_db(dbname)
3940 collection = mongo[collection]
@@ -70,7 +71,7 @@
7172 utils.store_object(editors, settings.binary_location, 'editors')
7273
7374
74 -def mergesort_external_launcher(dbname, input, output):
 75+def mergesort_external_launcher(dbname, input, intermediate_output, output):
7576 files = utils.retrieve_file_list(input, 'txt', mask='')
7677 x = 0
7778 maxval = 99999
@@ -79,11 +80,12 @@
8081 maxval = round(len(files) / x)
8182 chunks = utils.split_list(files, int(x))
8283 '''1st iteration external mergesort'''
 84+ if len(chunks) < 2:
 85+ intermediate_output = output
8386 for chunk in chunks:
8487 filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.encoding) for file in chunks[chunk]]
85 - filename = sort.merge_sorted_files(output, filehandles, chunk)
 88+ filename = sort.merge_sorted_files(intermediate_output, filehandles, chunk)
8689 filehandles = [fh.close() for fh in filehandles]
87 -# pass
8890 '''2nd iteration external mergesort, if necessary'''
8991 if len(chunks) > 1:
9092 files = utils.retrieve_file_list(output, 'txt', mask='[merged]')
@@ -91,7 +93,7 @@
9294 filename = sort.merge_sorted_files(output, filehandles, 'final')
9395 filehandles = [fh.close() for fh in filehandles]
9496 filename = 'merged_final.txt'
95 - return filename
 97+
9698
9799
98100 def mergesort_feeder(task_queue, **kwargs):
@@ -134,4 +136,6 @@
135137 output = os.path.join(settings.input_location, 'en', 'wiki', 'sorted')
136138 dbname = 'enwiki'
137139 #mergesort_launcher(input, output)
138 - mergesort_external_launcher(dbname, output, output)
\ No newline at end of file
 140+ final_output = os.path.join(settings.input_location, 'en', 'wiki', 'dbready')
 141+ mergesort_external_launcher(dbname, output, final_output)
 142+ store_editors(input, dbname, collection)
\ No newline at end of file
Index: trunk/tools/editor_trends/config.py
@@ -22,29 +22,49 @@
2323 import ConfigParser
2424
2525 from utils import utils
 26+import languages
2627
27 -
2828 def create_configuration(settings, args):
29 - config = ConfigParser.RawConfigParser()
 29+ force = getattr(args, 'force', False)
 30+ if not os.path.exists('wiki.cfg') or force:
 31+ config = ConfigParser.RawConfigParser()
 32+ project = None
 33+ language = None
 34+ language_map = languages.language_map()
 35+ working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.\n' % os.getcwd())
 36+ input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.\n' % settings.input_location)
3037
31 - working_directory = raw_input('Please indicate where you installed Editor Trends Analytics.\nCurrent location is %s\nPress Enter to accept default.' % os.getcwd())
32 - input_location = raw_input('Please indicate where to store the Wikipedia dump files.\nDefault is: %s\nPress Enter to accept default.' % settings.input_location)
33 - input_location = input_location if len(input_location) > 0 else settings.input_location
34 - working_directory = working_directory if len(working_directory) > 0 else os.getcwd()
35 -
36 - config = ConfigParser.RawConfigParser()
37 - config.add_section('file_locations')
38 - config.set('file_locations', 'working_directory', working_directory)
39 - config.set('file_locations', 'input_location', input_location)
 38+ while project not in settings.projects.keys():
 39+ project = raw_input('Please indicate which project you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % settings.projects[args.project].capitalize())
 40+ project = project if len(project) > 0 else args.project
 41+ if project not in settings.projects.keys():
 42+ print 'Valid choices for a project are: %s' % ','.join(settings.projects.keys())
4043
41 - fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
42 - config.write(fh)
43 - fh.close()
44 -
45 - settings.working_directory = config.get('file_locations', 'working_directory')
46 - settings.input_location = config.get('file_locations', 'input_location')
47 - return settings
 44+ while language not in languages.MAPPING:
 45+ language = raw_input('Please indicate which language of project %s you would like to analyze.\nDefault is: %s\nPress Enter to accept default.\n' % (settings.projects[project].capitalize(), language_map[args.language]))
 46+ if len(language) == 0:
 47+ language = language_map[args.language]
 48+ language = language if language in languages.MAPPING else args.language
4849
 50+ input_location = input_location if len(input_location) > 0 else settings.input_location
 51+ working_directory = working_directory if len(working_directory) > 0 else os.getcwd()
4952
 53+ config = ConfigParser.RawConfigParser()
 54+ config.add_section('file_locations')
 55+ config.set('file_locations', 'working_directory', working_directory)
 56+ config.set('file_locations', 'input_location', input_location)
 57+ config.add_section('wiki')
 58+ config.set('wiki', 'project', project)
 59+ config.set('wiki', 'language', language)
 60+
 61+ fh = utils.create_binary_filehandle(working_directory, 'wiki.cfg', 'wb')
 62+ config.write(fh)
 63+ fh.close()
 64+
 65+ settings.working_directory = config.get('file_locations', 'working_directory')
 66+ settings.input_location = config.get('file_locations', 'input_location')
 67+ return settings
 68+
 69+
5070 if __name__ == '__main__':
51 - pass
\ No newline at end of file
 71+ pass
Index: trunk/tools/editor_trends/languages.py
@@ -24,6 +24,8 @@
2525 '''
2626
2727 from utils import ordered_dict as odict
 28+from utils import utils
 29+
2830 MAPPING = odict.OrderedDict([
2931 (u'English','en'),
3032 (u'German','de'),
@@ -604,4 +606,7 @@
605607 (u'Muskogee','mus'),
606608 (u'Kanuri','kr'),
607609 (u'Otsiherero','hz'),
608 -])
\ No newline at end of file
 610+])
 611+
 612+def language_map():
 613+ return utils.invert_dict(MAPPING)
\ No newline at end of file
Index: trunk/tools/editor_trends/configuration.py
@@ -50,7 +50,7 @@
5151 self.date_format = '%Y-%m-%d' #Date format as used by Erik Zachte
5252 self.timestamp_format = '%Y-%m-%dT%H:%M:%SZ' # Timestampformat as generated by the MediaWiki dumps
5353
54 - self.max_settings_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
 54+ self.max_xmlfile_size = 67108864 # ==64Mb, see http://hadoop.apache.org/common/docs/r0.20.0/hdfs_design.html#Large+Data+Setsfor reason
5555 self.number_of_processes = cpu_count() * process_multiplier
5656 #Change this to match your computers configuration (RAM / CPU)
5757 self.minimum_python_version = (2, 6)
@@ -69,25 +69,27 @@
7070 self.file_locations = self.set_file_locations()
7171 self.max_filehandles = self.determine_max_filehandles_open()
7272
73 - self.windows_register = {'7zip': 'Software\\7-Zip', }
 73+ self.windows_register = {'7z.exe': 'Software\\7-Zip', }
7474 self.load_configuration()
7575 self.set_custom_settings(**kwargs)
76 - self.projects = {'commons': 'commonswiki',
77 - 'wikibooks': 'wikibooks',
78 - 'wikinews': 'wikinews',
79 - 'wikiquote': 'wikiquote',
80 - 'wikisource': 'wikisource',
81 - 'wikiversity': 'wikiversity',
82 - 'wiktionary': 'wiktionary',
83 - 'metawiki': 'metawiki',
84 - 'wikispecies': 'specieswiki',
85 - 'incubator': 'incubatorwiki',
86 - 'foundation': 'foundationwiki',
87 - 'mediawiki': 'mediawikiwiki',
88 - 'outreach': 'outreachwiki',
89 - 'strategic planning': 'strategywiki',
90 - 'usability initiative': 'usabilitywiki',
91 - 'multilingual wikisource': None
 76+ self.path_ziptool = self.determine_path_ziptool()
 77+ self.projects = {'wiki': 'wikipedia',
 78+ 'commons': 'commonswiki',
 79+ 'books': 'wikibooks',
 80+ 'news': 'wikinews',
 81+ 'quote': 'wikiquote',
 82+ 'source': 'wikisource',
 83+ 'versity': 'wikiversity',
 84+ 'tionary': 'wiktionary',
 85+ 'meta': 'metawiki',
 86+ 'species': 'specieswiki',
 87+ 'incubator': 'incubatorwiki',
 88+ 'foundation': 'foundationwiki',
 89+ 'mediawiki': 'mediawikiwiki',
 90+ 'outreach': 'outreachwiki',
 91+ 'strategic_planning': 'strategywiki',
 92+ 'usability_initiative': 'usabilitywiki',
 93+ 'multilingual_wikisource': None
9294 }
9395
9496 def set_custom_settings(self, **kwargs):
@@ -100,6 +102,8 @@
101103 config.read(os.path.join(self.working_directory, 'wiki.cfg'))
102104 self.working_directory = config.get('file_locations', 'working_directory')
103105 self.input_location = config.get('file_locations', 'input_location')
 106+ self.default_project = config.get('wiki', 'project')
 107+ self.default_language = config.get('wiki', 'language')
104108
105109 def determine_working_directory(self):
106110 cwd = os.getcwd()
@@ -115,6 +119,10 @@
116120 else:
117121 return os
118122
 123+ def determine_path_ziptool(self):
 124+ return self.detect_installed_program(self.determine_ziptool())
 125+
 126+
119127 def verify_environment(self, directories):
120128 for dir in directories:
121129 if not os.path.exists(dir):
@@ -146,6 +154,7 @@
147155 return resource.getrlimit(resource.RLIMIT_NOFILE)[0]
148156 else:
149157 return 500
 158+
150159 def update_python_path(self):
151160 IGNORE_DIRS = ['wikistats', 'zips']
152161 dirs = [name for name in os.listdir(self.working_directory) if
Index: trunk/tools/editor_trends/utils/utils.py
@@ -31,13 +31,14 @@
3232 import os
3333 import ctypes
3434 import time
 35+import subprocess
 36+import sys
 37+sys.path.append('..')
3538
3639 import configuration
3740 settings = configuration.Settings()
3841 import exceptions
3942
40 -settings = configuration.Settings()
41 -
4243 try:
4344 import psyco
4445 psyco.full()
@@ -250,6 +251,11 @@
251252 return name
252253
253254
 255+def delete_file(location, filename):
 256+ if check_file_exists(location, filename):
 257+ os.remove(os.path.join(location, filename))
 258+
 259+
254260 def check_file_exists(location, filename):
255261 if hasattr(filename, '__call__'):
256262 filename = construct_filename(filename, '.bin')
@@ -350,6 +356,41 @@
351357 return files
352358
353359
 360+def zip_archive(location, source, compression='7z'):
 361+ '''
 362+    the absolute path to the zip program is taken from settings.path_ziptool
 363+ @location is the directory where to store the compressed file
 364+ @source is the name of the zipfile
 365+ '''
 366+ output, ext = source.split('.')
 367+ output = output + '.7z'
 368+ path = settings.path_ziptool
 369+ if settings.platform == 'Windows':
 370+ p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'a', '-scsUTF-8', '-t%s' % compression, '%s\\%s' % (location,output), '%s\\%s' % (location,source)], shell=True).wait()
 371+ elif settings.platform == 'Linux':
 372+ raise NotImplementedError
 373+ elif settings.platform == 'OSX':
 374+ raise NotImplementedError
 375+ else:
 376+ raise exceptions.PlatformNotSupportedError
 377+
 378+
 379+def zip_extract(path, location, source):
 380+ '''
 381+ @path is the absolute path to the zip program
 382+    @location is the directory where to extract the files
 383+ @source is the name of the zipfile
 384+ '''
 385+ if settings.platform == 'Windows':
 386+ p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)], shell=True).wait()
 387+ elif settings.platform == 'Linux':
 388+ raise NotImplementedError
 389+ elif settings.platform == 'OSX':
 390+ raise NotImplementedError
 391+ else:
 392+ raise exceptions.PlatformNotSupportedError
 393+
 394+
354395 def merge_list(datalist):
355396 merged = []
356397 for d in datalist:
@@ -421,4 +462,8 @@
422463
423464
424465 if __name__ == '__main__':
425 - debug()
 466+ tool = settings.determine_ziptool()
 467+ path = settings.detect_installed_program(tool)
 468+ location = os.path.join(settings.input_location, 'en', 'wiki')
 469+ source = 'enwiki-20100916-stub-meta-history.xml'
 470+ zip_archive(path, location, source)

Status & tagging log