Index: trunk/tools/editor_trends/classes/buffer.py |
— | — | @@ -59,6 +59,7 @@ |
60 | 60 | self.process_id = process_id |
61 | 61 | self.count_articles = 0 |
62 | 62 | self.count_revisions = 0 |
| 63 | + self.n = 0 |
63 | 64 | self.filehandles = [file_utils.create_txt_filehandle(self.rts.txt, |
64 | 65 | file_id, 'a', 'utf-8') for file_id in xrange(self.rts.max_filehandles)] |
65 | 66 | self.keys = ['revision_id', 'article_id', 'id', 'username', 'namespace', |
— | — | @@ -87,53 +88,34 @@ |
88 | 89 | hashes[file_id].append(editor) |
89 | 90 | return hashes |
90 | 91 | |
91 | | - def group_revisions_by_fileid(self): |
92 | | - ''' |
93 | | - This function groups observation by editor id and then by file_id, |
94 | | - this way we have to make fewer file opening calls and should reduce |
95 | | - processing time. |
96 | | - ''' |
97 | | - data = {} |
98 | | - editors = {} |
99 | | - #first, we group all revisions by editor |
100 | | - |
101 | | - for revision in self.revisions.values(): |
102 | | - row = [] |
103 | | - #strip away the keys and make sure that the values are always in the same sequence |
104 | | - for key in self.keys: |
105 | | - row.append(revision[key].decode('utf-8')) |
106 | | - editor_id = row[0] |
107 | | - data.setdefault(editor_id, []) |
108 | | - data[editor_id].append(row) |
109 | | - editors.setdefault(editor_id, self.get_hash(editor_id)) |
110 | | - |
111 | | - #now, we are going to group all editors by file_id |
112 | | - print editors |
113 | | - file_ids = self.invert_dictionary(editors) |
114 | | - print file_ids |
115 | | - self.revisions = {} |
116 | | - for file_id, editors in file_ids.iteritems(): |
117 | | - for editor in editors: |
118 | | - self.revisions.setdefault(file_id, []) |
119 | | - self.revisions[file_id].extend(data[editor]) |
120 | | - print file_id, data[editor] |
121 | | - |
122 | 92 | def add(self, revision): |
123 | | - self.stringify(revision) |
124 | | - id = revision['revision_id'] |
125 | | - self.revisions[id] = revision |
126 | | - if len(self.revisions) > 10000: |
| 93 | + revision = self.stringify(revision) |
| 94 | + id = revision['id'] |
| 95 | + file_id = self.get_hash(id) |
| 96 | + revision = self.simplify(revision) |
| 97 | + self.revisions.setdefault(file_id, []) |
| 98 | + self.revisions[file_id].append(revision) |
| 99 | + if self.n > 10000: |
127 | 100 | #print '%s: Emptying buffer %s - buffer size %s' % (datetime.datetime.now(), self.id, len(self.revisions)) |
128 | 101 | self.store() |
| 102 | + else: |
| 103 | + self.n += 1 |
129 | 104 | |
| 105 | + def simplify(self, revision): |
| 106 | + row = [] |
| 107 | + for key in self.keys: |
| 108 | + row.append(revision[key].decode('utf-8'))
| 109 | + return row |
130 | 110 | |
131 | 111 | def stringify(self, revision): |
132 | 112 | for key, value in revision.iteritems(): |
| 113 | + value = revision[key] |
133 | 114 | try: |
134 | 115 | value = str(value) |
135 | 116 | except UnicodeEncodeError: |
136 | 117 | value = value.encode('utf-8') |
137 | 118 | revision[key] = value |
| 119 | + return revision |
138 | 120 | |
139 | 121 | |
140 | 122 | def summary(self): |
— | — | @@ -188,7 +170,7 @@ |
189 | 171 | |
190 | 172 | def write_revisions(self): |
191 | 173 | #t0 = datetime.datetime.now() |
192 | | - self.group_revisions_by_fileid() |
| 174 | + #self.group_revisions_by_fileid() |
193 | 175 | file_ids = self.revisions.keys() |
194 | 176 | for file_id in file_ids: |
195 | 177 | wait = True |