r75218 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r75217 | r75218 | r75219 >
Date:22:42, 22 October 2010
Author:diederik
Status:deferred
Tags:
Comment:
Performance improvements.
Modified paths:
  • /trunk/tools/editor_trends/construct_datasets.py (modified) (history)
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/settings.py (modified) (history)
  • /trunk/tools/editor_trends/split_xml_file.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -35,6 +35,7 @@
3636 from database import db_settings
3737 from database import db
3838 from wikitree import xml
 39+from statistics import dataset
3940 from utils import process_constructor as pc
4041
4142
@@ -44,15 +45,10 @@
4546 except ImportError:
4647 pass
4748
48 -contributors = {}
 49+#contributors = {}
4950
5051 RE_BOT = re.compile('bot', re.IGNORECASE)
5152 RE_SCRIPT = re.compile('script', re.IGNORECASE)
52 -#RE_NUMERIC_CHARACTER = re.compile('&#[\d{1,5}]+;')
53 -#
54 -#def remove_numeric_character_references(text):
55 -# return re.sub(RE_NUMERIC_CHARACTER, '', text)
56 -#
5753
5854
5955 def determine_username_is_bot(username, kwargs):
@@ -108,7 +104,7 @@
109105 data_queue.put(vars)
110106 vars = {}
111107
112 -def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
 108+def parse_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
113109 if settings.DEBUG:
114110 messages = {}
115111 vars = {}
@@ -118,14 +114,12 @@
119115 file = xml_queue
120116 else:
121117 file = xml_queue.get(block=False)
122 - #print 'parsing %s' % file
123118 if file == None:
124119 print 'Swallowed a poison pill'
125120 break
126121 data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION,
127122 file, 'r',
128123 encoding=settings.ENCODING))
129 - #data = read_input(sys.stdin)
130124 for raw_data in data:
131125 xml_buffer = cStringIO.StringIO()
132126 raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
@@ -155,16 +149,15 @@
156150 sure which one yet. This happens when raw_data =
157151 ''.join(raw_data) is called. 18-22
158152 '''
159 - print error
 153+ print file, error
160154 print raw_data[:12]
161155 print 'String was supposed to be %s characters long' % sum([len(raw) for raw in raw_data])
162156 if settings.DEBUG:
163157 utils.track_errors(xml_buffer, error, file, messages)
164158
165 -
166159 if pbar:
167 - print xml_queue.qsize()
168 - #utils.update_progressbar(pbar, xml_queue)
 160+ #print xml_queue.qsize()
 161+ utils.update_progressbar(pbar, xml_queue)
169162 if debug:
170163 break
171164
@@ -175,25 +168,17 @@
176169 utils.report_error_messages(messages, lookup_new_editors)
177170
178171
179 -def store_data_mongo(data_queue, pids, dbname):
 172+def store_editors(data_queue, pids, dbname):
180173 mongo = db.init_mongo_db(dbname)
181174 collection = mongo['editors']
182175 mongo.collection.ensure_index('editor')
183 - contributors = {}
184176 while True:
185177 try:
186178 edit = data_queue.get(block=False)
187179 contributor = edit['editor']
188 - if contributor not in contributors:
189 - collection.insert({'editor': contributor, 'edit_count': 0, })
190 - contributors[contributor] = 1
191 -
192 - key = str(contributors[contributor])
193180 value = {'date':edit['date'], 'article': edit['article']}
194181 collection.update({'editor': contributor}, {'$inc': {'edit_count': 1},
195 - '$push': {'edits': value}})
196 - contributors[contributor] += 1
197 -
 182+ '$push': {'edits': value}}, True)
198183 except Empty:
199184 '''
200185 This checks whether the Queue is empty because the preprocessors are
@@ -202,16 +187,32 @@
203188 are finished and this Queue is empty than break, else wait for the
204189 Queue to fill.
205190 '''
206 -
207191 if all([utils.check_if_process_is_running(pid) for pid in pids]):
208192 pass
209193 #print 'Empty queue or not %s?' % data_queue.qsize()
210194 else:
211195 break
212 - except Exception, error:
213 - print error
214196
215197
 198+def optimize_editors(dbname, input_queue, **kwargs):
 199+ mongo = db.init_mongo_db(dbname)
 200+ collection = mongo['editors']
 201+ definition = kwargs.pop('definition')
 202+ while True:
 203+ try:
 204+ id = input_queue.get(block=False)
 205+ #id = '94033'
 206+ editor = collection.find_one({'editor': id})
 207+ edits = editor['edits']
 208+ edits.sort()
 209+ year = edits[0]['date'].year
 210+ new_wikipedian = dataset.determine_editor_is_new_wikipedian(edits, defintion)
 211+ collection.update({'editor': id}, {'$set': {'edits': edits, 'year_joined': year, 'new_wikipedian': new_wikipedian}})
 212+
 213+ except Empty:
 214+ break
 215+
 216+
216217 def store_data_db(data_queue, pids):
217218 connection = db.init_database()
218219 cursor = connection.cursor()
@@ -243,43 +244,38 @@
244245 connection.close()
245246
246247
247 -def run_stand_alone():
 248+def run_stand_alone(dbname):
248249 files = utils.retrieve_file_list(settings.XML_FILE_LOCATION, 'xml')
249250 #files = files[:2]
 251+ kwargs = {'bots': ids,
 252+ 'dbname': dbname,
 253+ 'pbar': True,
 254+ 'definition': 'traditional'}
 255+
250256 mongo = db.init_mongo_db('bots')
251257 bots = mongo['ids']
252258 ids = {}
253259 cursor = bots.find()
254 -
255 - kwargs = {'bots': ids,
256 - 'dbname': 'enwiki',
257 - 'pbar': True}
258 -
259260 for bot in cursor:
260261 ids[bot['id']] = bot['name']
261 - pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, **kwargs)
262 - keys = ['editor']
263 - for key in keys:
264 - db.add_index_to_collection('enwiki', 'editors', key)
 262+
 263+ pc.build_scaffolding(pc.load_queue, parse_editors, files, store_editors, True, **kwargs)
 264+ ids = retrieve_ids_mongo_new(dbname, 'editors')
 265+ pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False, **kwargs)
265266
266267 def debug_lookup_new_editors():
267268 q = Queue()
268269 import progressbar
269270 pbar = progressbar.ProgressBar().start()
270271 #edits = db.init_mongo_db('editors')
271 - lookup_new_editors('464.xml', q, None, None, True)
 272+ parse_editors('464.xml', q, None, None, True)
272273 store_data_mongo(q, [], 'test')
273274 #keys = ['editor']
274275 #for key in keys:
275276 # db.add_index_to_collection('editors', 'editors', key)
276277
277 -
278 -
279 -def run_hadoop():
280 - pass
281 -
282 -
283278 if __name__ == "__main__":
 279+ #optimize_editors('enwiki')
284280 #debug_lookup_new_editors()
285281
286282 if settings.RUN_MODE == 'stand_alone':
Index: trunk/tools/editor_trends/settings.py
@@ -79,6 +79,8 @@
8080
8181 DATASETS_FILE_LOCATION = WORKING_DIRECTORY + '/datasets/'
8282
 83+TXT_FILE_LOCATION = WORKING_DIRECTORY + '/csv/'
 84+
8385 #This section contains configuration variables for parsing / encoding and
8486 #working with the XML files.
8587
@@ -99,8 +101,3 @@
100102
101103 WP_DUMP_LOCATION = 'http://download.wikimedia.org'
102104
103 -LANGUAGE_MAPPING = {
104 -'English': '/enwiki/latest/',
105 -'Russian': '/ruwiki/latest/',
106 -'German': '/dewiki/latest',
107 -}
Index: trunk/tools/editor_trends/utils/utils.py
@@ -256,7 +256,7 @@
257257 ValueError: I/O operation on closed file
258258 Not sure how to fix this, that's why the line is commented.
259259 '''
260 - #pbar.update(x)
 260+ pbar.update(pbar.currval + x)
261261
262262
263263 def humanize_time_difference(seconds_elapsed):
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -66,18 +66,16 @@
6767
6868
6969 input_processes = [models.ProcessInputQueue(main, input_queue, result_queue,
70 - **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES)]
 70+ **kwargs) for i in xrange(settings.NUMBER_OF_PROCESSES -1)]
7171
7272 for input_process in input_processes:
7373 input_process.start()
7474 pids = [p.pid for p in input_processes]
7575 kwargs['pids'] = pids
7676
77 -
78 -
7977 if result_queue:
8078 result_processes = [models.ProcessResultQueue(result_processor,
81 - result_queue, **kwargs) for i in xrange(1)]
 79+ result_queue, **kwargs) for i in xrange(24)]
8280 for result_process in result_processes:
8381 result_process.start()
8482
Index: trunk/tools/editor_trends/split_xml_file.py
@@ -13,8 +13,10 @@
1414 '''
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
 17+__author__email = 'dvanliere at gmail dot com'
 18+__date__ = '2010-10-21'
 19+__version__ = '0.1'
1720
18 -
1921 import xml.etree.cElementTree as cElementTree
2022 import codecs
2123 import utils
@@ -30,10 +32,7 @@
3133
3234 RE_NUMERIC_CHARACTER = re.compile('&#(\d+);')
3335
34 -#def convert_html_entities(text):
35 -# return utils.unescape(text)
3636
37 -
3837 def remove_numeric_character_references(text):
3938 return re.sub(RE_NUMERIC_CHARACTER, lenient_deccharref, text).encode('utf-8')
4039
@@ -57,26 +56,17 @@
5857 for revision in revisions:
5958 comment = revision.find('comment')
6059 timestamp = revision.find('timestamp').text
61 - #if timestamp == '2007-11-25T09:21:11Z':
62 - # print 'debug'
63 - # text = comment.text
64 - #test2 = text.encode('utf-8')
65 - #test = text.decode('utf-8')
6660
6761 # text1 = remove_ascii_control_characters(text)
6862 # text2 = remove_numeric_character_references(text)
6963 # text3 = convert_html_entities(text)
7064
7165 if comment != None and comment.text != None:
72 - #print comment.text.encode('utf-8')
73 -
7466 comment.text = function(comment.text)
75 - #text = comment.text
76 - #print text
7767 return xml
7868
7969
80 -def write_xml_file(element, fh, counter):
 70+def write_xml_file(element, fh, counter, language):
8171 '''Get file handle and write xml element to file'''
8272 size = len(cElementTree.tostring(element))
8373 fh, counter = create_xml_file_handle(fh, counter, size)
@@ -89,20 +79,24 @@
9080 '''Create file handle if none is supplied or if file size > max file size.'''
9181 if not fh:
9282 counter = 0
93 - fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
 83+ fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
9484 return fh, counter
9585 elif (fh.tell() + size) > settings.MAX_XML_FILE_SIZE:
9686 print 'Created chunk %s' % counter
9787 fh.close
9888 counter += 1
99 - fh = codecs.open(settings.LOCATION + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
 89+ fh = codecs.open(settings.LOCATION + '/' + language + '/' + str(counter) + '.xml', 'w', encoding=settings.ENCODING)
10090 return fh, counter
10191 else:
10292 return fh, counter
10393
10494
105 -def split_xml():
 95+def split_xml(language):
10696 '''Reads xml file and splits it in N chunks'''
 97+ result = utils.create_directory(language)
 98+ if not result:
 99+ return
 100+
107101 fh = None
108102 counter = None
109103 tag = '{%s}page' % settings.NAME_SPACE
@@ -118,10 +112,10 @@
119113 elem = parse_comments(elem, remove_numeric_character_references)
120114 #elem = parse_comments(elem, convert_html_entities)
121115 #elem = parse_comments(elem, remove_ascii_control_characters)
122 - fh, counter = write_xml_file(elem, fh, counter)
 116+ fh, counter = write_xml_file(elem, fh, counter, language)
123117 #print cElementTree.tostring(elem)
124118 root.clear() # when done parsing a section clear the tree to safe memory
125119
126120
127121 if __name__ == "__main__":
128 - split_xml()
 122+ split_xml('enwiki')
Index: trunk/tools/editor_trends/construct_datasets.py
@@ -36,26 +36,38 @@
3737
3838
3939 def retrieve_editor_ids_mongo(RANDOM_SAMPLE=True):
40 - if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
 40+ raise DeprecatedError
 41+# if utils.check_file_exists(settings.BINARY_OBJECT_FILE_LOCATION,
 42+# retrieve_editor_ids_mongo):
 43+# contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
 44+# retrieve_editor_ids_mongo)
 45+# else:
 46+# mongo = db.init_mongo_db('editors')
 47+# editors = mongo['editors']
 48+# contributors = set()
 49+# #ids = editors.find().distinct('editor')
 50+# ids = editors.find()
 51+# for x, id in enumerate(ids):
 52+# contributors.add(id['editor'])
 53+# if len(contributors) == 100000:
 54+# if RANDOM_SAMPLE:
 55+# break
 56+# if contributors != set():
 57+# utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
 58+# return contributors
 59+
 60+def retrieve_ids_mongo_new(dbname, collection):
 61+ if utils.check_file_exists(settings.TXT_FILE_LOCATION,
4162 retrieve_editor_ids_mongo):
42 - contributors = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
 63+ ids = utils.load_object(settings.TXT_FILE_LOCATION,
4364 retrieve_editor_ids_mongo)
4465 else:
45 - mongo = db.init_mongo_db('editors')
46 - editors = mongo['editors']
47 - contributors = set()
48 - #ids = editors.find().distinct('editor')
49 - ids = editors.find()
50 - for x, id in enumerate(ids):
51 - contributors.add(id['editor'])
52 - if len(contributors) == 100000:
53 - if RANDOM_SAMPLE:
54 - break
55 - if contributors != set():
56 - utils.store_object(contributors, settings.BINARY_OBJECT_FILE_LOCATION, retrieve_editor_ids_mongo)
57 - return contributors
 66+ mongo = db.init_mongo_db(dbname)
 67+ editors = mongo[collection]
 68+ ids = editors.distinct()
 69+ utils.store_object(contributors, settings.TXT_FILE_LOCATION, retrieve_editor_ids_mongo)
 70+ return ids
5871
59 -
6072 def generate_editor_dataset(input_queue, data_queue, pbar, kwargs):
6173 definition = kwargs.pop('definition')
6274 limit = kwargs.pop('limit')
@@ -72,7 +84,7 @@
7385
7486 print input_queue.qsize()
7587 if definition == 'Traditional':
76 -
 88+
7789 obs = editors.find({'editor': id}, {'date':1}).sort('date').limit(limit)
7890 contributors = []
7991 for ob in obs:

Status & tagging log