r76345 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r76344 | r76345 | r76346 >
Date:22:12, 8 November 2010
Author:diederik
Status:deferred
Tags:
Comment:
Comment:Added mergesort module. By presorting data, significant reductions in processing time are achieved.
Modified paths:
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/manage.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/sort.py (added) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/manage.py
@@ -79,17 +79,19 @@
8080 return project
8181
8282
83 -def generate_wikidump_filename(args):
84 - return '%s-%s-%s' % (retrieve_projectname(args), 'latest', get_value(args, 'file'))
 83+def generate_wikidump_filename(project, args):
 84+ return '%s-%s-%s' % (project, 'latest', get_value(args, 'file'))
8585
8686
8787 def determine_file_locations(args):
8888 locations = {}
8989 location = get_value(args, 'location') if get_value(args, 'location') != None else settings.XML_FILE_LOCATION
90 - locations['language_code'] = retrieve_language(args)
91 - locations['location'] = os.path.join(location, retrieve_language(args))
 90+ project = retrieve_project(args)
 91+ language_code = retrieve_language(args)
 92+ locations['language_code'] = language_code
 93+ locations['location'] = os.path.join(location, language_code, project)
9294 locations['project'] = retrieve_projectname(args)
93 - locations['filename'] = generate_wikidump_filename(args)
 95+ locations['filename'] = generate_wikidump_filename(project, args)
9496 return locations
9597
9698
@@ -189,6 +191,12 @@
190192 except UnicodeEncodeError:
191193 print '%s' % language
192194
 195+
 196+def detect_python_version():
 197+ version = ''.join(sys.version_info[0:2])
 198+ if version < settings.MINIMUM_PYTHON_VERSION:
 199+ raise 'Please upgrade to Python 2.6 or higher (but not Python 3.x).'
 200+
193201 def about():
194202 print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.'
195203 print 'Written by Diederik van Liere (dvanliere@gmail.com).'
@@ -253,6 +261,7 @@
254262 parser.add_argument('-prog', '--progress', action='store_true', default=True,
255263 help='Indicate whether you want to have a progressbar.')
256264
 265+ detect_python_version()
257266 args = parser.parse_args()
258267 config.load_configuration(args)
259268 locations = determine_file_locations(args)
Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -88,20 +88,22 @@
8989 return - 1
9090
9191
92 -def output_editor_information(elem, data_queue, **kwargs):
 92+def output_editor_information(elem, output, **kwargs):
9393 '''
9494 @elem is an XML element containing 1 revision from a page
95 - @data_queue is where to store the data
 95+ @output is where to store the data, either a queue or a filehandle
9696 @**kwargs contains extra information
9797
9898 the variable tags determines which attributes are being parsed, the values in
9999 this dictionary are the functions used to extract the data.
100100 '''
101 - tags = {'contributor': {'editor': extract_contributor_id, 'bot': determine_username_is_bot},
 101+ tags = {'contributor': {'editor': extract_contributor_id,
 102+ 'bot': determine_username_is_bot},
102103 'timestamp': {'date': xml.extract_text},
103104 }
104105 vars = {}
105 -
 106+ headers = ['editor', 'date', 'article']
 107+ destination = kwargs.pop('destination')
106108 revisions = elem.findall('revision')
107109 for revision in revisions:
108110 vars['article'] = elem.find('id').text.decode(settings.ENCODING)
@@ -114,12 +116,19 @@
115117 #print '%s\t%s\t%s\t%s\t' % (vars['article'], vars['contributor'], vars['timestamp'], vars['bot'])
116118 if vars['bot'] == 0 and vars['editor'] != -1 and vars['editor'] != None:
117119 vars.pop('bot')
118 - vars['date'] = utils.convert_timestamp_to_date(vars['date'])
119 - data_queue.put(vars)
 120+ if destination == 'queue':
 121+ output.put(vars)
 122+ vars['date'] = utils.convert_timestamp_to_date(vars['date'])
 123+ elif destination == 'file':
 124+ data =[]
 125+ for head in headers:
 126+ data.append(vars[head])
 127+ utils.write_list_to_csv(data, output)
 128+ output.write('\n')
120129 vars = {}
121130
122131
123 -def parse_editors(xml_queue, data_queue, pbar, bots, **kwargs):
 132+def parse_editors(xml_queue, output, pbar, bots, **kwargs):
124133 '''
125134 @xml_queue contains the filenames of the files to be parsed
126135 @data_queue is an instance of Queue where the extracted data is stored for
@@ -130,8 +139,10 @@
131140
132141 Output is the data_queue that will be used by store_editors()
133142 '''
134 - file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'))
135 - debug = kwargs.get('debug', None)
 143+ file_location = os.path.join(settings.XML_FILE_LOCATION, kwargs.get('language', 'en'), kwargs.get('project', 'wiki'))
 144+ debug = kwargs.get('debug', False)
 145+ destination = kwargs.get('destination', 'file')
 146+
136147 if settings.DEBUG:
137148 messages = {}
138149 vars = {}
@@ -145,9 +156,13 @@
146157 if file == None:
147158 print 'Swallowed a poison pill'
148159 break
 160+
149161 data = xml.read_input(utils.create_txt_filehandle(file_location,
150162 file, 'r',
151163 encoding=settings.ENCODING))
 164+ if destination == 'file':
 165+ name = file[:-4] + '.txt'
 166+ output = utils.create_txt_filehandle(file_location, name, 'w', settings.ENCODING)
152167 for raw_data in data:
153168 xml_buffer = cStringIO.StringIO()
154169 raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
@@ -156,7 +171,7 @@
157172 raw_data = ''.join(raw_data)
158173 xml_buffer.write(raw_data)
159174 elem = cElementTree.XML(xml_buffer.getvalue())
160 - output_editor_information(elem, data_queue, bots=bots)
 175+ output_editor_information(elem, output, bots=bots, destination=destination)
161176 except SyntaxError, error:
162177 print error
163178 '''
@@ -176,26 +191,30 @@
177192 print file, error
178193 print raw_data[:12]
179194 print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
 195+ if destination == 'queue':
 196+ output.put('NEXT')
 197+ while True:
 198+ if output.qsize() < 100000:
 199+ break
 200+ else:
 201+ time.sleep(10)
 202+ print 'Still sleeping, queue is %s items long' % output.qsize()
180203
181 - data_queue.put('NEXT')
 204+ else:
 205+ output.close()
 206+
182207 if pbar:
183 - print file, xml_queue.qsize(), data_queue.qsize()
 208+ print file, xml_queue.qsize()
184209 #utils.update_progressbar(pbar, xml_queue)
 210+
185211 if debug:
186212 break
187 -
188 - while True:
189 - if data_queue.qsize() < 100000:
190 - break
191 - else:
192 - time.sleep(10)
193 - print 'Still sleeping, queue is %s items long' % data_queue.qsize()
194 -
 213+
195214 except Empty:
196215 break
197216
198 - #for x in xrange(4):
199 - data_queue.put(None)
 217+ if destination == 'queue':
 218+ data_queue.put(None)
200219
201220 if settings.DEBUG:
202221 utils.report_error_messages(messages, parse_editors)
@@ -263,9 +282,9 @@
264283 cache[c] = {}
265284 editor_cache.add('NEXT', '')
266285 cache = {}
267 -
268286
269287
 288+
270289 def load_bot_ids():
271290 '''
272291 Loader function to retrieve list of id's of known Wikipedia bots.
@@ -279,17 +298,20 @@
280299 return ids
281300
282301
283 -def run_parse_editors(dbname, language, location):
 302+def run_parse_editors(location, language, project):
284303 ids = load_bot_ids()
285304 kwargs = {'bots': ids,
286 - 'dbname': dbname,
 305+ 'dbname': language + project,
 306+ 'language': language,
 307+ 'project': project,
287308 'pbar': True,
288 - 'nr_input_processors': 2,
289 - 'nr_output_processors': 2,
290 - 'language': language,
 309+ 'destination': 'file',
 310+ 'nr_input_processors': settings.NUMBER_OF_PROCESSES,
 311+ 'nr_output_processors': settings.NUMBER_OF_PROCESSES,
291312 }
292313 chunks = {}
293 - files = utils.retrieve_file_list(location, 'xml')
 314+ source = os.path.join(location, language, project)
 315+ files = utils.retrieve_file_list(source, 'xml')
294316 parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
295317 a = 0
296318 for x in xrange(settings.NUMBER_OF_PROCESSES):
@@ -297,18 +319,18 @@
298320 chunks[x] = files[a:b]
299321 a = (x + 1) * parts
300322
301 - pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors, True, **kwargs)
302 - search_cache_for_missed_editors(dbname)
 323+ pc.build_scaffolding(pc.load_queue, parse_editors, chunks, False, False, **kwargs)
 324+ #search_cache_for_missed_editors(dbname)
303325
304326
305327 def debug_parse_editors(dbname):
306328 q = JoinableQueue()
307 - parse_editors('en\\522.xml', q, None, None, True)
 329+ parse_editors('522.xml', q, None, None, debug=True, destination='file')
308330 store_editors(q, [], dbname)
309 - search_cache_for_missed_editors(dbname)
 331+ #search_cache_for_missed_editors(dbname)
310332
311333
312334 if __name__ == "__main__":
313 - #debug_parse_editors('test')
314 - run_parse_editors('test', 'en')
 335+ #debug_parse_editors('test2')
 336+ run_parse_editors(settings.XML_FILE_LOCATION, 'en', 'wiki')
315337 pass
Index: trunk/tools/editor_trends/settings.py
@@ -41,6 +41,7 @@
4242 IGNORE_DIRS = ['wikistats', 'zips']
4343 ROOT = '/' if OS != 'Windows' else 'c:\\'
4444
 45+MINIMUM_PYTHON_VERSION = 2.6
4546
4647 dirs = [name for name in os.listdir(WORKING_DIRECTORY) if
4748 os.path.isdir(os.path.join(WORKING_DIRECTORY, name))]
Index: trunk/tools/editor_trends/utils/utils.py
@@ -132,6 +132,11 @@
133133
134134 # read / write data related functions
135135 def read_data_from_csv(filename, encoding):
 136+ '''
 137+ @filename is the path (either absolute or relative) including the name of
 138+ of the file
 139+ @encoding is usually utf-8
 140+ '''
136141 if hasattr(filename, '__call__'):
137142 filename = construct_filename(filename)
138143
@@ -156,6 +161,10 @@
157162
158163
159164 def determine_file_mode(extension):
 165+ '''
 166+ Checks if a given extension is an ASCII extension or not. The settings file
 167+ provides known ASCII extensions.
 168+ '''
160169 if extension in settings.ASCII:
161170 return 'w'
162171 else:
@@ -163,15 +172,30 @@
164173
165174
166175 def write_list_to_csv(data, fh, recursive=False):
 176+ '''
 177+ @data is a list which can contain other lists that will be written as a
 178+ single line to a textfile
 179+ @fh is a handle to an open text
 180+
 181+ The calling function is responsible for:
 182+ 1) writing a newline
 183+ 2) closing the filehandle
 184+ '''
 185+ tab = False
167186 if recursive:
168187 recursive = False
169 - for d in data:
 188+ for x, d in enumerate(data):
 189+ if tab:
 190+ fh.write('\t')
170191 if type(d) == type([]):
171192 recursive = write_list_to_csv(d, fh, True)
172193 else:
173 - fh.write('%s\t' % d)
 194+ fh.write('%s' % d)
 195+ tab = True
174196 if recursive:
 197+ tab = False
175198 return True
 199+ fh.write('\n')
176200
177201
178202 def write_dict_to_csv(data, fh):
@@ -267,31 +291,37 @@
268292
269293
270294 def create_dict_from_csv_file(filename, encoding):
 295+ '''
 296+ Constructs a dictionary from a txtfile
 297+ '''
271298 d = {}
272299 for line in read_data_from_csv(filename, encoding):
273300 line = clean_string(line)
274301 value, key = line.split('\t')
275302 d[key] = value
276 -
277303 return d
278304
279305
280 -def retrieve_file_list(location, extension, mask=''):
 306+def retrieve_file_list(location, extension, mask=None):
281307 '''
282308 Retrieve a list of files from a specified location.
283309 @location: either an absolute or relative path
284310 @extension: only include files with extension (optional)
285 - @mask: only include files that start with mask (optional)
 311+ @mask: only include files that start with mask (optional), this is
 312+ interpreted as a regular expression.
286313
287314 @return: a list of files matching the criteria
288315 '''
 316+ if mask:
 317+ mask = re.compile(mask)
 318+ else:
 319+ mask = re.compile('[\w\d*]')
289320 all_files = os.listdir(location)
290 - if not extension.startswith('.'):
291 - extension = '.' + extension
292321 files = []
293322 for file in all_files:
294 - if file.startswith(mask) and file.endswith(extension):
295 - files.append(file)
 323+ file = file.split('.')
 324+ if re.match(mask, file[0]) and file[1].endswith(extension):
 325+ files.append('.'.join(file))
296326 return files
297327
298328
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -57,6 +57,7 @@
5858 nr_output_processors = kwargs.pop('nr_output_processors')
5959 input_queues = {}
6060 result_queues = {}
 61+
6162 #assert len(obj) == nr_input_processors
6263 #if result_queue:
6364 # assert len(obj)== nr_output_processors
Index: trunk/tools/editor_trends/utils/sort.py
@@ -0,0 +1,119 @@
 2+#!/usr/bin/python
 3+# -*- coding: utf-8 -*-
 4+
 5+'''
 6+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
 7+This program is free software; you can redistribute it and/or
 8+modify it under the terms of the GNU General Public License version 2
 9+as published by the Free Software Foundation.
 10+This program is distributed in the hope that it will be useful,
 11+but WITHOUT ANY WARRANTY; without even the implied warranty of
 12+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 13+See the GNU General Public License for more details, at
 14+http://www.fsf.org/licenses/gpl.html
 15+'''
 16+
 17+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 18+__author__email = 'dvanliere at gmail dot com'
 19+__date__ = '2010-11-07'
 20+__version__ = '0.1'
 21+
 22+'''
 23+This module provides a small number of sorting algorithms including mergesort,
 24+external mergesort and quicksort. By presorting the data, considerable
 25+efficiency gains can be realized when inserting the data in MongoDB.
 26+'''
 27+
 28+import heapq
 29+
 30+import settings
 31+import utils
 32+
def quick_sort(obs):
    """Return a new, ascending-sorted list of the items in *obs*.

    Classic recursive quicksort: the first element is the pivot, and the
    remainder is partitioned into items below it and items at-or-above it.
    """
    if obs == []:
        return []
    pivot, rest = obs[0], obs[1:]
    below = [item for item in rest if item < pivot]
    at_or_above = [item for item in rest if item >= pivot]
    return quick_sort(below) + [pivot] + quick_sort(at_or_above)
 41+
def _merge_ordered(front, back):
    """Merge two sorted lists into a new sorted list in O(len(front) + len(back)).

    Index-based replacement for the module's ``merge`` helper, whose
    ``list.pop(0)`` calls (each O(n)) made every merge pass quadratic.
    Stable: on ties the element from *front* is taken first.
    """
    result = []
    i = j = 0
    while i < len(front) and j < len(back):
        if front[i] <= back[j]:
            result.append(front[i])
            i += 1
        else:
            result.append(back[j])
            j += 1
    result.extend(front[i:])
    result.extend(back[j:])
    return result


def mergesort(n):
    """Recursively merge sort a list. Returns a new sorted list; *n* is unmodified.

    Fixes vs. the original:
    - the midpoint uses floor division (``//``); ``len(n) / 2`` yields a
      float under Python 3, which cannot be used as a slice index;
    - merging is delegated to an O(n) index-based helper instead of the
      quadratic ``pop(0)``-based merge.
    """
    if len(n) <= 1:
        return list(n)
    mid = len(n) // 2
    return _merge_ordered(mergesort(n[:mid]), mergesort(n[mid:]))
 53+
 54+
def merge(front, back):
    """Merge two sorted lists together. Returns the merged list.

    Two-pointer, index-based merge running in O(len(front) + len(back)).
    The original popped from the head of a list — ``list.pop(0)`` is
    O(n) — giving quadratic running time overall, and it emptied its
    input lists as a side effect; this version leaves both inputs
    intact. Stable: on ties the element from *front* comes first.
    """
    result = []
    i = j = 0
    while i < len(front) and j < len(back):
        if front[i] <= back[j]:
            result.append(front[i])
            i += 1
        else:
            result.append(back[j])
            j += 1
    # append whichever tail remains
    result.extend(front[i:])
    result.extend(back[j:])
    return result
 65+
 66+
def readline(file):
    """Yield a list of tab-separated fields for each non-blank line of *file*.

    Bug fix: the original compared the raw line against '' to skip
    blanks, but lines obtained by iterating a file keep their trailing
    newline, so a blank line arrived as '\n', slipped past the guard,
    and was yielded as ['']. Strip the newline first, then skip lines
    that are genuinely empty.
    """
    for line in file:
        line = line.rstrip('\n')
        if line == '':
            continue
        yield line.split('\t')
 75+
 76+
def merge_sorted_files(output, files):
    """k-way merge pre-sorted files into <output>/merged.txt; return line count.

    @output is the directory for the merged file
    @files is a sequence of open file handles, each already sorted

    Bug fix: readline() yields *lists* of fields and file.write() only
    accepts strings, so the original raised TypeError on the very first
    line. Re-join the fields with tabs and restore the newline that
    readline() stripped.
    """
    output = utils.create_txt_filehandle(output, 'merged.txt', 'w', settings.ENCODING)
    lines = 0
    # heapq.merge lazily merges the already-sorted streams in O(total) time
    for line in heapq.merge(*[readline(file) for file in files]):
        output.write('\t'.join(line) + '\n')
        lines += 1
    output.close()
    return lines
 85+
 86+
def write_sorted_file(sorted_data, file, output):
    """Write *sorted_data* to directory *output* as '<stem>_sorted.<ext>'.

    The target filename is derived from *file* by appending '_sorted'
    to the part before the first dot.
    """
    stem, sep, rest = file.partition('.')
    target = stem + '_sorted' + sep + rest
    fh = utils.create_txt_filehandle(output, target, 'w', settings.ENCODING)
    utils.write_list_to_csv(sorted_data, fh)
    fh.close()
 94+
 95+
 96+def debug_merge_sorted_files(input, output):
 97+ files = utils.retrieve_file_list(input, 'txt', mask='')
 98+ filehandles = [utils.create_txt_filehandle(input, file, 'r', settings.ENCODING) for file in files]
 99+ lines = merge_sorted_files(output, filehandles)
 100+ filehandles = [fh.close() for fh in filehandles]
 101+ print lines
 102+
 103+
def debug_mergesort(input, output):
    """Debug driver: mergesort each unsorted .txt file in *input*.

    Files whose names already carry the '_sorted' marker are excluded
    by the mask; each remaining file is read, split into tab-separated
    rows, sorted, and written out via write_sorted_file().
    """
    for file in utils.retrieve_file_list(input, 'txt', mask='((?!_sorted)\d)'):
        fh = utils.create_txt_filehandle(input, file, 'r', settings.ENCODING)
        data = fh.readlines()
        fh.close()
        rows = [line.replace('\n', '') for line in data]
        rows = [line.split('\t') for line in rows]
        write_sorted_file(mergesort(rows), file, output)
 114+
 115+
if __name__ == '__main__':
    # Bug fix: sort.py imports only heapq, settings and utils, so the
    # os.path.join calls below raised NameError; import os locally.
    import os
    input = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki')
    output = os.path.join(settings.XML_FILE_LOCATION, 'en', 'wiki', 'sorted')
    debug_mergesort(input, output)
    #debug_merge_sorted_files(input, output)
Property changes on: trunk/tools/editor_trends/utils/sort.py
___________________________________________________________________
Added: svn:eol-style
1121 + native
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -126,11 +126,11 @@
127127 pc.build_scaffolding(pc.load_queue, retrieve_edits_by_contributor, 'contributors')
128128
129129
130 -def debug_retrieve_edits_by_contributor_launcher():
 130+def debug_retrieve_edits_by_contributor_launcher(dbname):
131131 kwargs = {'debug': False,
132 - 'dbname': 'enwiki',
 132+ 'dbname': dbname,
133133 }
134 - ids = retrieve_editor_ids_mongo('enwiki', 'editors')
 134+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
135135 input_queue = pc.load_queue(ids)
136136 q = Queue()
137137 generate_editor_dataset(input_queue, q, False, kwargs)
@@ -159,7 +159,6 @@
160160 def generate_editor_dataset_debug(dbname):
161161 ids = retrieve_editor_ids_mongo(dbname, 'editors')
162162 input_queue = pc.load_queue(ids)
163 - #write_dataset(input_queue, [], 'enwiki')
164163 kwargs = {'nr_input_processors': 1,
165164 'nr_output_processors': 1,
166165 'debug': True,
Index: trunk/tools/editor_trends/database/cache.py
@@ -86,25 +86,10 @@
8787
8888 if self.editors[key]['obs'] == self.treshold:
8989 self.treshold_editors.add(key)
90 -# self.update(key, self.editors[key]['edits'])
91 -# del self.editors[key]
92 -# self.n -= 10
93 -# self.number_editors -= 1
9490
9591 def update(self, editor, values):
96 - #t = datetime.datetime.now()
9792 self.collection.update({'editor': editor}, {'$pushAll': {'edits': values}}, upsert=True)
98 - #print 'It took %s to store editor %s;and the cache contains %s editors and %s items' % (datetime.datetime.now() - t, editor, self.number_editors, self.n)
9993
100 - def quick_sort(self, obs):
101 - if obs == []:
102 - return []
103 - else:
104 - pivot = obs[0]
105 - lesser = self.quick_sort([x for x in obs[1:] if x < pivot])
106 - greater = self.quick_sort([x for x in obs[1:] if x >= pivot])
107 - return lesser + [pivot] + greater
108 -
10994 def store(self):
11095 utils.store_object(self, settings.BINARY_OBJECT_FILE_LOCATION, self.__repr__())
11196

Status & tagging log