Index: trunk/tools/editor_trends/database/cache.py |
— | — | @@ -82,7 +82,7 @@ |
83 | 83 | try: |
84 | 84 | self.collection.insert({'editor': editor, 'edits': values, 'username': username}, safe=True) |
85 | 85 | except bson.errors.InvalidDocument: |
86 | | - print 'BSON document too large' |
| 86 | + print 'BSON document too large, unable to store %s' % (username) |
87 | 87 | |
88 | 88 | def store(self): |
89 | 89 | file_utils.store_object(self, settings.binary_location, self.__repr__()) |
Index: trunk/tools/editor_trends/etl/store.py |
— | — | @@ -44,7 +44,6 @@ |
45 | 45 | |
46 | 46 | editor_cache = cache.EditorCache(collection) |
47 | 47 | prev_contributor = -1 |
48 | | - #edits = 0 |
49 | 48 | while True: |
50 | 49 | try: |
51 | 50 | filename = tasks.get(block=False) |
— | — | @@ -58,19 +57,12 @@ |
59 | 58 | print '%s files left in the queue.' % messages.show(tasks.qsize) |
60 | 59 | |
61 | 60 | fh = file_utils.create_txt_filehandle(source, filename, 'r', settings.encoding) |
62 | | - print fh |
63 | 61 | for line in file_utils.read_raw_data(fh): |
64 | 62 | if len(line) > 1: |
65 | 63 | contributor = line[0] |
66 | 64 | #print 'Parsing %s' % contributor |
67 | | - if prev_contributor != contributor: |
68 | | - #if edits > 9: |
| 65 | + if prev_contributor != contributor and prev_contributor != -1: |
69 | 66 | editor_cache.add(prev_contributor, 'NEXT') |
70 | | - print 'Stored %s' % prev_contributor |
71 | | - #else: |
72 | | - # editor_cache.clear(prev_contributor) |
73 | | - #edits = 0 |
74 | | - edits += 1 |
75 | 67 | date = text_utils.convert_timestamp_to_datetime_utc(line[1]) |
76 | 68 | article_id = int(line[2]) |
77 | 69 | username = line[3].encode(settings.encoding) |