r85092 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r85091 | r85092 | r85093 >
Date: 21:49, 31 March 2011
Author: diederik
Status: deferred
Tags:
Comment:
Fixed out-of-memory problem.
Modified paths:
  • /trunk/tools/editor_trends/etl/sort.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/store.py
@@ -107,8 +107,15 @@
108108 for line in fh:
109109 line = line.strip()
110110 #print line.encode('utf-8')
111 - id, title = line.split('\t')
112 - collection.insert({'id':id, 'title':title})
 111+ line = line.split('\t')
 112+ title = line[-1]
 113+ id = line[0]
 114+ ns = line[1]
 115+ if len(line) == 4:
 116+ category = line[2]
 117+ collection.insert({'id':id, 'title':title, 'category': category, 'ns': ns})
 118+ else:
 119+ collection.insert({'id':id, 'title':title, 'ns': ns})
113120 fh.close()
114121 print 'Done...'
115122
Index: trunk/tools/editor_trends/etl/sort.py
@@ -43,20 +43,26 @@
4444 if filename == None:
4545 self.result.put(None)
4646 break
47 -
 47+ elif filename.startswith('comments') or filename.startswith('title'):
 48+ continue
4849 fh = file_utils.create_txt_filehandle(self.rts.txt,
4950 filename,
5051 'r',
5152 self.rts.encoding)
5253 data = file_utils.read_unicode_text(fh)
5354 fh.close()
54 - data = [d.strip() for d in data]
55 - data = [d.split('\t') for d in data]
 55+ for x, d in enumerate(data):
 56+ d = d.strip().split('\t')
 57+ data[x] = d
 58+ #data = [d.strip() for d in data]
 59+ #data = [d.split('\t') for d in data]
5660 sorted_data = mergesort(data)
5761 write_sorted_file(sorted_data, filename, self.rts)
5862 self.result.put(True)
5963 except UnicodeDecodeError, e:
6064 print 'Error: %s, (%s)' % (e, filename)
 65+ except MemoryError, e:
 66+ print 'Error: %s, (%s)' % (e, filename)
6167 except Empty:
6268 pass
6369