Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -64,7 +64,7 @@ |
65 | 65 | editor_cache.add(prev_editor, 'NEXT') |
66 | 66 | |
67 | 67 | data = self.prepare_data(line) |
68 | | - |
| 68 | + print editor, data['username'] |
69 | 69 | editor_cache.add(editor, data) |
70 | 70 | prev_editor = editor |
71 | 71 | fh.close() |
— | — | @@ -93,6 +93,7 @@ |
94 | 94 | } |
95 | 95 | return data |
96 | 96 | |
| 97 | + |
97 | 98 | def store_articles(rts): |
98 | 99 | ''' |
99 | 100 | This function reads titles.csv and stores it in a separate collection. |
— | — | @@ -104,10 +105,14 @@ |
105 | 106 | mongo = db.init_mongo_db(rts.dbname) |
106 | 107 | db.drop_collection(rts.dbname, rts.articles_raw) |
107 | 108 | collection = mongo[rts.articles_raw] |
108 | | - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'id') |
109 | | - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'title') |
110 | | - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'ns') |
111 | | - db.add_index_to_collection(rts.dbname, rts.articles_raw, 'category') |
| 109 | + collection.create_index('id') |
| 110 | + collection.create_index('title') |
| 111 | + collection.create_index('ns') |
| 112 | + collection.create_index('category') |
| 113 | + collection.ensure_index('id') |
| 114 | + collection.ensure_index('title') |
| 115 | + collection.ensure_index('ns') |
| 116 | + collection.ensure_index('category') |
112 | 117 | |
113 | 118 | location = os.path.join(rts.input_location, rts.language.code, rts.project.name, 'txt') |
114 | 119 | fh = file_utils.create_txt_filehandle(location, 'titles.csv', 'r', rts.encoding) |
Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -333,7 +333,7 @@ |
334 | 334 | ns = namespace['namespace'] |
335 | 335 | title_meta['ns'] = ns |
336 | 336 | if title.startswith('List of'): |
337 | | - title_meta['list'] = True |
| 337 | + title_meta['category'] = 'List' |
338 | 338 | elif ns == 4 or ns == 5: |
339 | 339 | if title.find('Articles for deletion') > -1: |
340 | 340 | title_meta['category'] = 'Deletion' |
— | — | @@ -549,6 +549,8 @@ |
550 | 550 | hashes = deque() |
551 | 551 | size = {} |
552 | 552 | revisions = article['revisions'] |
| 553 | + if revisions: |
| 554 | + return |
553 | 555 | for revision in revisions: |
554 | 556 | cache.stats.count_revisions += 1 |
555 | 557 | if revision == None: |
— | — | @@ -772,7 +774,7 @@ |
773 | 775 | lock2 = RLock() |
774 | 776 | lock3 = RLock() |
775 | 777 | locks = [lock1, lock2, lock3] |
776 | | - setup(storage, rts) |
| 778 | + #setup(storage, rts) |
777 | 779 | multiprocessor_launcher(function, path, dataset, storage, processors, extension, locks, rts) |
778 | 780 | |
779 | 781 | |
Index: trunk/tools/editor_trends/etl/transformer.py |
— | — | @@ -122,10 +122,10 @@ |
123 | 123 | |
124 | 124 | def determine_number_edits(edits, first_year, final_year): |
125 | 125 | count = 0 |
126 | | - for year in edits: |
127 | | - for edit in edits[year]: |
128 | | - if edit['ns'] == 0: |
129 | | - count += 1 |
| 126 | + for edit in edits: |
| 127 | + if edit['ns'] == 0: |
| 128 | + print edit['ns'] |
| 129 | + count += 1 |
130 | 130 | return count |
131 | 131 | |
132 | 132 | |
Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -21,6 +21,7 @@ |
22 | 22 | import datetime |
23 | 23 | import sys |
24 | 24 | import bson |
| 25 | +from pymongo.errors import OperationFailure |
25 | 26 | |
26 | 27 | if '..' not in sys.path: |
27 | 28 | sys.path.append('..') |
— | — | @@ -85,6 +86,10 @@ |
86 | 87 | self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
87 | 88 | except bson.errors.InvalidDocument: |
88 | 89 | print 'BSON document too large, unable to store %s' % (username) |
| 90 | + except OperationFailure, error: |
| 91 | + print error |
| 92 | + print 'It seems that you are running out of disk space.' |
| 93 | + sys.exit(-1) |
89 | 94 | |
90 | 95 | def store(self): |
91 | 96 | file_utils.store_object(self, settings.binary_location, self.__repr__()) |