r76201 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r76200 | r76201 | r76202 >
Date: 17:42, 6 November 2010
Author: diederik
Status: deferred
Tags:
Comment:
Various bugfixes
Modified paths:
  • /trunk/tools/editor_trends/config.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/optimize_editors.py (modified) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -22,8 +22,8 @@
2323 import subprocess
2424 from argparse import ArgumentParser
2525 from argparse import RawTextHelpFormatter
 26+import locale
2627
27 -
2828 import progressbar
2929
3030 import settings
@@ -43,6 +43,11 @@
4444 config.load_configuration(args)
4545
4646
 47+def determine_default_language():
 48+ language_code = locale.getdefaultlocale()[0]
 49+ return language_code.split('_')[0]
 50+
 51+
4752 def retrieve_projectname(args):
4853 language_code = retrieve_language(args)
4954 if language_code == None:
@@ -53,13 +58,16 @@
5459 if project == None:
5560 print 'Entered project: %s is not valid Wikipedia project.' % get_value(args, 'project')
5661 sys.exit(-1)
 62+ if project == 'commonswiki':
 63+ return project
 64+ else:
 65+ return '%s%s' % (language_code, project)
5766
58 - return '%s%s' % (language_code, project)
5967
6068 def retrieve_language(args):
6169 language = get_value(args, 'language')
6270 language = language.title()
63 - return languages.MAPPING.get(language, None)
 71+ return languages.MAPPING.get(language, 'en')
6472
6573
6674 def retrieve_project(args):
@@ -75,13 +83,24 @@
7684
7785 def determine_file_locations(args):
7886 locations = {}
 87+ location = get_value(args, 'location') if get_value(args, 'location') != None else settings.XML_FILE_LOCATION
7988 locations['language_code'] = retrieve_language(args)
80 - locations['location'] = os.path.join(get_value(args, 'location'), retrieve_language(args))
 89+ locations['location'] = os.path.join(location, retrieve_language(args))
8190 locations['project'] = retrieve_projectname(args)
8291 locations['filename'] = generate_wikidump_filename(args)
8392 return locations
8493
8594
 95+def show_settings(args, location, filename, project, language_code):
 96+ project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki')
 97+ project = project.title()
 98+ language_map = utils.invert_dict(languages.MAPPING)
 99+ print 'Project: %s' % (project)
 100+ print 'Language: %s' % language_map[language_code]
 101+ print 'Input directory: %s' % location
 102+ print 'Output directory: TODO'
 103+
 104+
86105 def dump_downloader_launcher(args, location, filename, project, language_code):
87106 print 'dump downloader'
88107 pbar = get_value(args, 'progress')
@@ -113,8 +132,8 @@
114133 path = config.detect_installed_program('7zip')
115134
116135 source = os.path.join(location, file)
117 - retcode = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)])
118 - return retcode
 136+ p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location, '%s' % (source,)])
 137+ return p
119138
120139
121140 def mongodb_script_launcher(args, location, filename, project, language_code):
@@ -153,6 +172,7 @@
154173
155174
156175 def main():
 176+ default_language = determine_default_language()
157177 file_choices = ('stub-meta-history.xml.gz',
158178 'stub-meta-current.xml.gz',
159179 'pages-meta-history.xml.7z',
@@ -188,7 +208,7 @@
189209 parser.add_argument('-l', '--language', action='store',
190210 help='Example of valid languages.',
191211 choices=supported_languages(),
192 - default='Russian')
 212+ default=default_language)
193213
194214 parser.add_argument('-p', '--project', action='store',
195215 help='Specify the Wikimedia project that you would like to download',
@@ -210,6 +230,7 @@
211231 args = parser.parse_args()
212232 config.load_configuration(args)
213233 locations = determine_file_locations(args)
 234+ show_settings(args, **locations)
214235 args.func(args, **locations)
215236
216237
Index: trunk/tools/editor_trends/optimize_editors.py
@@ -17,11 +17,15 @@
1818 __date__ = '2010-11-02'
1919 __version__ = '0.1'
2020
 21+from multiprocessing import Queue
 22+from Queue import Empty
 23+from operator import itemgetter
 24+import datetime
2125
22 -
2326 import settings
2427 from database import db
2528 from utils import process_constructor as pc
 29+import construct_datasets
2630
2731
2832 def create_datacontainer(init_value=0):
@@ -37,7 +41,7 @@
3842 data[str(x)] = init_value
3943 return data
4044
41 -
 45+
4246 def determine_edits_by_year(dates):
4347 '''
4448 This function counts the number of edits by year made by a particular editor.
@@ -87,7 +91,7 @@
8892
8993 output.insert({'editor': id, 'edits': edits,
9094 'edits_by_year': edits_by_year,
91 - 'year_joined': year,
 95+ 'year_joined': new_wikipedian,
9296 'edit_count': edit_count,
9397 'final_edit': final_edit,
9498 'first_edit': first_edit,
@@ -101,20 +105,31 @@
102106 kwargs = {'definition': 'traditional',
103107 'pbar': True,
104108 'dbname': 'enwiki',
105 - 'nr_input_processors': 2,
 109+ 'nr_input_processors': 1,
106110 'nr_output_processors': 0,
107111 }
108 - pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
 112+ chunks = {}
 113+ parts = int(round(float(len(ids)) / 1, 0))
 114+ a = 0
 115+ for x in xrange(settings.NUMBER_OF_PROCESSES):
 116+ b = a + parts
 117+ chunks[x] = ids[a:b]
 118+ a = (x + 1) * parts
 119+ if a >= len(ids):
 120+ break
109121
 122+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False, False, **kwargs)
110123
 124+
111125 def debug_optimize_editors(dbname):
112126 ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
113127 q = pc.load_queue(ids)
114128 kwargs = {'definition': 'traditional',
115 - 'dbname': 'enwiki'
 129+ 'dbname': dbname
116130 }
117131 optimize_editors(q, False, True, kwargs)
118132
119133
120134 if __name__ == '__main__':
121 - run_optimize_editors('enwiki')
\ No newline at end of file
 135+ debug_optimize_editors('test')
 136+ #run_optimize_editors('test')
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -244,6 +244,26 @@
245245 print 'Time elapsed: %s and processed %s items.' % (datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
246246
247247
 248+def load_cache_objects():
 249+ cache = {}
 250+ files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION, '.bin')
 251+ for x, file in enumerate(files):
 252+ cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION, file)
 253+ return cache
 254+
 255+
 256+def search_cache_for_missed_editors(dbname):
 257+ mongo = db.init_mongo_db(dbname)
 258+ collection = mongo['editors']
 259+ editor_cache = cache.EditorCache(collection)
 260+ cache = load_cache_objects()
 261+ for c in cache:
 262+ for editor in cache[c]:
 263+ editor_cache.add(editor, cache[c][editor])
 264+ cache[c] = {}
 265+ editor_cache.add('NEXT', '')
 266+
 267+
248268 def load_bot_ids():
249269 '''
250270 Loader function to retrieve list of id's of known Wikipedia bots.
@@ -267,7 +287,6 @@
268288 'language': language,
269289 }
270290 chunks = {}
271 - #file_location = os.path.join(settings.XML_FILE_LOCATION, language)
272291 files = utils.retrieve_file_list(location, 'xml')
273292 parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
274293 a = 0
@@ -277,12 +296,14 @@
278297 a = (x + 1) * parts
279298
280299 pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs)
 300+ search_cache_for_missed_editors(dbname)
281301
282302
283303 def debug_parse_editors(dbname):
284304 q = JoinableQueue()
285305 parse_editors('en\\522.xml', q, None, None, True)
286306 store_editors(q, [], dbname)
 307+ search_cache_for_missed_editors(dbname)
287308
288309
289310 if __name__ == "__main__":
Index: trunk/tools/editor_trends/config.py
@@ -20,13 +20,14 @@
2121
2222 import os
2323 import ConfigParser
24 -from _winreg import *
2524
 25+
2626 import settings
2727 from utils import utils
2828
2929
3030 def detect_windows_program(program):
 31+ from _winreg import *
3132 entry = settings.WINDOWS_REGISTER[program]
3233 try:
3334 key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
Index: trunk/tools/editor_trends/utils/utils.py
@@ -32,6 +32,7 @@
3333 import ctypes
3434
3535 import settings
 36+import exceptions
3637
3738
3839 try:
@@ -160,6 +161,7 @@
161162 else:
162163 return 'wb'
163164
 165+
164166 def write_list_to_csv(data, fh, recursive=False):
165167 if recursive:
166168 recursive = False
@@ -170,6 +172,7 @@
171173 fh.write('%s\t' % d)
172174 if recursive:
173175 return True
 176+
174177
175178 def write_dict_to_csv(data, fh):
176179 keys = data.keys()
@@ -225,7 +228,7 @@
226229 if is_exe(exe_file):
227230 return exe_file
228231
229 - return None
 232+ raise exceptions.FileNotFoundException(program)
230233
231234
232235 def store_object(object, location, filename):
@@ -254,6 +257,15 @@
255258 return string
256259
257260
 261+def invert_dict(dictionary):
 262+ '''
 263+ @dictionary is a simple dictionary containing simple values, ie. no lists,
 264+ or other dictionaries
 265+ output: dictionary where key and value are swapped.
 266+ '''
 267+ return dict([[v,k] for k,v in dictionary.items()])
 268+
 269+
258270 def create_dict_from_csv_file(filename, encoding):
259271 d = {}
260272 for line in read_data_from_csv(filename, encoding):
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -57,14 +57,16 @@
5858 nr_output_processors = kwargs.pop('nr_output_processors')
5959 input_queues = {}
6060 result_queues = {}
61 - assert len(obj) == nr_input_processors
62 - if result_queue:
63 - assert len(obj)== nr_output_processors
 61+ #assert len(obj) == nr_input_processors
 62+ #if result_queue:
 63+ # assert len(obj)== nr_output_processors
6464
6565 for i, o in enumerate(obj):
6666 input_queues[i] = load_input_queue(obj[o], poison_pill=True)
6767 if result_queue:
6868 result_queues[i] = JoinableQueue()
 69+ else:
 70+ result_queues[i] = False
6971
7072 if settings.PROGRESS_BAR:
7173 size = sum([input_queues[q].qsize() for q in input_queues])
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -172,7 +172,7 @@
173173 #elem = parse_comments(elem, remove_ascii_control_characters)
174174 #print cElementTree.tostring(elem)
175175 except SyntaxError:
176 - fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING)
 176+ fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION, 'split_xml', 'w', settings.ENCODING)
177177 fh.write(cElementTree.tostring(elem))
178178 fh.close()
179179

Status & tagging log