r85158 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r85157 | r85158 | r85159 >
Date:21:29, 1 April 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added handling for the out-of-disk-space error, plus some extra indexes to reduce query time.
Modified paths:
  • /trunk/tools/editor_trends/database/cache.py (modified) (history)
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)
  • /trunk/tools/editor_trends/etl/transformer.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/store.py
@@ -64,7 +64,7 @@
6565 editor_cache.add(prev_editor, 'NEXT')
6666
6767 data = self.prepare_data(line)
68 -
 68+ print editor, data['username']
6969 editor_cache.add(editor, data)
7070 prev_editor = editor
7171 fh.close()
@@ -93,6 +93,7 @@
9494 }
9595 return data
9696
 97+
9798 def store_articles(rts):
9899 '''
99100 This function reads titles.csv and stores it in a separate collection.
@@ -104,10 +105,14 @@
105106 mongo = db.init_mongo_db(rts.dbname)
106107 db.drop_collection(rts.dbname, rts.articles_raw)
107108 collection = mongo[rts.articles_raw]
108 - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'id')
109 - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'title')
110 - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'ns')
111 - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'category')
 109+ collection.create_index('id')
 110+ collection.create_index('title')
 111+ collection.create_index('ns')
 112+ collection.create_index('category')
 113+ collection.ensure_index('id')
 114+ collection.ensure_index('title')
 115+ collection.ensure_index('ns')
 116+ collection.ensure_index('category')
112117
113118 location = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt')
114119 fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', rts.encoding)
Index: trunk/tools/editor_trends/etl/enricher.py
@@ -333,7 +333,7 @@
334334 ns = namespace['namespace']
335335 title_meta['ns'] = ns
336336 if title.startswith('List of'):
337 - title_meta['list'] = True
 337+ title_meta['category'] = 'List'
338338 elif ns == 4 or ns == 5:
339339 if title.find('Articles for deletion') > -1:
340340 title_meta['category'] = 'Deletion'
@@ -549,6 +549,8 @@
550550 hashes = deque()
551551 size = {}
552552 revisions = article['revisions']
 553+ if revisions:
 554+ return
553555 for revision in revisions:
554556 cache.stats.count_revisions += 1
555557 if revision == None:
@@ -772,7 +774,7 @@
773775 lock2 = RLock()
774776 lock3 = RLock()
775777 locks = [lock1, lock2, lock3]
776 - setup(storage, rts)
 778+ #setup(storage, rts)
777779 multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts)
778780
779781
Index: trunk/tools/editor_trends/etl/transformer.py
@@ -122,10 +122,10 @@
123123
124124 def determine_number_edits(edits, first_year, final_year):
125125 count = 0
126 - for year in edits:
127 - for edit in edits[year]:
128 - if edit['ns'] == 0:
129 - count += 1
 126+ for edit in edits:
 127+ if edit['ns'] == 0:
 128+ print edit['ns']
 129+ count += 1
130130 return count
131131
132132
Index: trunk/tools/editor_trends/database/cache.py
@@ -21,6 +21,7 @@
2222 import datetime
2323 import sys
2424 import bson
 25+from pymongo.errors import OperationFailure
2526
2627 if '..' not in sys.path:
2728 sys.path.append('..')
@@ -85,6 +86,10 @@
8687 self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True)
8788 except bson.errors.InvalidDocument:
8889 print 'BSON document too large, unable to store %s' % (username)
 90+ except OperationFailure, error:
 91+ print error
 92+ print 'It seems that you are running out of disk space.'
 93+ sys.exit(-1)
8994
9095 def store(self):
9196 file_utils.store_object(self, settings.binary_location, self.__repr__())