r75085 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r75084‎ | r75085 | r75086 >
Date:12:54, 20 October 2010
Author:diederik
Status:deferred
Tags:
Comment:
Fixed a memory leak caused by not closing a file handle on the Windows platform.
Modified paths:
  • /trunk/tools/editor_trends/map_wiki_editors.py (modified) (history)
  • /trunk/tools/editor_trends/utils/process_constructor.py (modified) (history)
  • /trunk/tools/editor_trends/utils/utils.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/map_wiki_editors.py
@@ -24,6 +24,7 @@
2525 import xml.etree.cElementTree as cElementTree
2626 from multiprocessing import Queue
2727 from Queue import Empty
 28+import pymongo
2829
2930 # Custom written files
3031 import settings
@@ -71,15 +72,15 @@
7273 ignore anonymous editors. If you are interested in collecting data on
7374 anonymous editors then add the string 'ip' to the tags variable.
7475 '''
75 - tags = ['id']
 76+ tags = ['id']
7677 if contributor.get('deleted'):
77 - return -1 #Not sure if this is the best way to code deleted contributors.
 78+ return - 1 #Not sure if this is the best way to code deleted contributors.
7879 for elem in contributor:
7980 if elem.tag in tags:
8081 if elem.text != None:
8182 return elem.text.decode('utf-8')
8283 else:
83 - return -1
 84+ return - 1
8485
8586
8687 def output_editor_information(elem, data_queue, **kwargs):
@@ -104,8 +105,8 @@
105106 vars.pop('bot')
106107 vars['date'] = utils.convert_timestamp_to_date(vars['date'])
107108 data_queue.put(vars)
108 - vars={}
109 -
 109+ vars = {}
 110+
110111 def lookup_new_editors(xml_queue, data_queue, pbar, bots, debug=False, separator='\t'):
111112 if settings.DEBUG:
112113 messages = {}
@@ -118,37 +119,42 @@
119120 file = xml_queue.get(block=False)
120121 #print 'parsing %s' % file
121122 if file == None:
 123+ print 'Swallowed a poison pill'
122124 break
123 -
124 - data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION
 125+
 126+ data = xml.read_input(utils.open_txt_file(settings.XML_FILE_LOCATION
125127 + file, 'r', encoding=settings.ENCODING))
126128 #data = read_input(sys.stdin)
127129 #print xml_queue.qsize()
128130 for raw_data in data:
129131 xml_buffer = cStringIO.StringIO()
130132 raw_data.insert(0, '<?xml version="1.0" encoding="UTF-8" ?>\n')
131 - raw_data = ''.join(raw_data)
132 - xml_buffer.write(raw_data)
133 -
134133 try:
 134+ raw_data = ''.join(raw_data)
 135+ xml_buffer.write(raw_data)
135136 elem = cElementTree.XML(xml_buffer.getvalue())
136137 output_editor_information(elem, data_queue, bots=bots)
137138 except SyntaxError, error:
138139 print error
139 - #There are few cases with invalid tokens, they are fixed
140 - #here and then reinserted into the XML DOM
141 - #data = convert_html_entities(xml_buffer.getvalue())
142 - #elem = cElementTree.XML(data)
143 - #output_editor_information(elem)
144 - if settings.DEBUG:
145 - utils.track_errors(xml_buffer, error, file, messages)
 140+ '''
 141+ There are few cases with invalid tokens, they are fixed
 142+ here and then reinserted into the XML DOM
 143+ data = convert_html_entities(xml_buffer.getvalue())
 144+ elem = cElementTree.XML(data)
 145+ output_editor_information(elem)
 146+ '''
146147 except UnicodeEncodeError, error:
147148 print error
 149+ except MemoryError, error:
 150+ '''
 151+ There is one xml file causing an out of memory error, not
 152+ sure which one yet.
 153+ '''
 154+ print error
 155+ finally:
148156 if settings.DEBUG:
149157 utils.track_errors(xml_buffer, error, file, messages)
150 - #finally:
151158
152 -
153159 if pbar:
154160 print xml_queue.qsize()
155161 #utils.update_progressbar(pbar, xml_queue)
@@ -171,11 +177,12 @@
172178 chunk = data_queue.get(block=False)
173179 values.append(chunk)
174180 #print chunk
175 - if len(values) == 100000:
176 - collection.insert(values)
 181+ if len(values) == 25000:
 182+ collection.insert(chunk)
177183 values = []
178184 #print data_queue.qsize()
179 - data_queue.task_done()
 185+
 186+
180187 except Empty:
181188 # The queue is empty but store the remaining values if present
182189 if values != []:
@@ -190,12 +197,15 @@
191198 are finished and this Queue is empty then break, else wait for the
192199 Queue to fill.
193200 '''
 201+
194202 if all([utils.check_if_process_is_running(pid) for pid in pids]):
195203 pass
 204+ #print 'Empty queue or not %s?' % data_queue.qsize()
196205 else:
197206 break
198207
199208
 209+
200210 def store_data_db(data_queue, pids):
201211 connection = db.init_database()
202212 cursor = connection.cursor()
@@ -238,8 +248,8 @@
239249 for bot in cursor:
240250 ids[bot['id']] = bot['name']
241251 pc.build_scaffolding(pc.load_queue, lookup_new_editors, files, store_data_mongo, True, bots=ids)
242 - db.add_index_to_collection('editors', 'date')
243 - db.add_index_to_collection('editors', 'name')
 252+ keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)]
 253+ db.add_index_to_collection('editors', 'editors', keys)
244254
245255 def debug_lookup_new_editors():
246256 q = Queue()
@@ -247,18 +257,18 @@
248258 pbar = progressbar.ProgressBar().start()
249259 edits = db.init_mongo_db('editors')
250260 lookup_new_editors('1.xml', q, None, None, True)
251 - db.add_index_to_collection('editors', 'date')
252 - db.add_index_to_collection('editors', 'name')
253 -
 261+ keys = [('date', pymongo.ASCENDING), ('name', pymongo.ASCENDING)]
 262+ db.add_index_to_collection('editors', 'editors', keys)
254263
255264
 265+
256266 def run_hadoop():
257267 pass
258268
259269
260270 if __name__ == "__main__":
261271 #debug_lookup_new_editors()
262 -
 272+
263273 if settings.RUN_MODE == 'stand_alone':
264274 run_stand_alone()
265275 print 'Finished processing XML files.'
Index: trunk/tools/editor_trends/utils/utils.py
@@ -55,6 +55,7 @@
5656 if settings.OS == 'Windows':
5757 PROCESS_TERMINATE = 1
5858 handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
 59+ ctypes.windll.kernel32.CloseHandle(handle)
5960 if handle != 0:
6061 return True
6162 else:
Index: trunk/tools/editor_trends/utils/process_constructor.py
@@ -14,7 +14,7 @@
1515
1616 __author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
1717
18 -from multiprocessing import Process, Queue, JoinableQueue
 18+from multiprocessing import Process, Queue
1919 from Queue import Empty
2020
2121 import settings
@@ -52,7 +52,7 @@
5353
5454 input_queue = Queue()
5555 if result_queue:
56 - result_queue = JoinableQueue()
 56+ result_queue = Queue()
5757
5858 load_input_queue(input_queue, obj, poison_pill=True)
5959

Status & tagging log