Index: trunk/tools/editor_trends/etl/store.py
@@ -68,7 +68,7 @@
     date = text_utils.convert_timestamp_to_datetime_utc(line[1])
     article_id = int(line[2])
     username = line[3].encode(self.rts.encoding)
-    ns = int(line[4])
+    ns = int(line[5])
     value = {'date': date,
              'article': article_id,
              'username': username,
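The namespace is now read from column 5 instead of column 4 of the split input line. A minimal sketch of the field layout this implies, using a hypothetical parse_fields helper; the column meanings are inferred from the hunk above and are otherwise an assumption:

def parse_fields(line):
    # line is an already split record; column meanings inferred from the hunk above
    date = line[1]                 # timestamp, converted to a datetime elsewhere
    article_id = int(line[2])
    username = line[3]
    ns = int(line[5])              # namespace moved from column 4 to column 5
    return date, article_id, username, ns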
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -24,6 +24,9 @@
 import progressbar
 from Queue import Empty
 
+if '..' not in sys.path:
+    sys.path.append('..')
+
 import wikitree.parser
 from bots import detector
 from utils import file_utils
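The new guard puts the parent directory on sys.path so the package imports below (wikitree, bots, utils) resolve when the module is run directly. A sketch of a variant that anchors the path on the module's own location rather than the current working directory; this is an alternative, not what the commit does:

import os
import sys

# resolve '..' relative to this file instead of wherever the script was launched
parent = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
if parent not in sys.path:
    sys.path.append(parent)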
@@ -148,7 +151,8 @@
 def extract_username(contributor, **kwargs):
     contributor = contributor.find('username')
     if contributor != None:
-        return contributor.text
+        contributor = contributor.text.encode('utf-8')
+        return contributor.decode('utf-8')
     else:
         return None
 
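extract_username now pushes the <username> text through a UTF-8 encode/decode cycle so callers receive unicode. A hypothetical to_unicode helper sketching the same normalization more defensively (assumes UTF-8 byte strings and Python 2 semantics; not part of the patch):

def to_unicode(text, encoding='utf-8'):
    # pass None and unicode through untouched, decode byte strings
    if text is None or isinstance(text, unicode):
        return text
    return text.decode(encoding)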
@@ -182,6 +186,14 @@
     return None
 
 
+def parse_title(title):
+    if type(title.text) == type('str'):
+        title = title.text.decode('utf-8')
+    else:
+        title = title.text
+    return title
+
+
 def output_editor_information(revisions, page, bots, rts):
     '''
     @elem is an XML element containing 1 revision from a page
@@ -282,7 +294,7 @@
         namespace = parse_article(title, ns)
         if namespace != False:
             article_id = page.find('id').text
-            title = page.find('title').text
+            title = parse_title(title)
             revisions = page.findall('revision')
             revisions = parse_comments(rts, revisions, remove_numeric_character_references)
             output = output_editor_information(revisions, article_id, bot_ids, rts)
@@ -297,7 +309,10 @@
     fh2.close()
     print 'Closing %s...' % (os.path.join(location, filename))
     print 'Total pages: %s' % total
-    print 'Pages processed: %s (%s)' % (processed, processed / total)
+    try:
+        print 'Pages processed: %s (%s)' % (processed, processed / total)
+    except ZeroDivisionError:
+        print 'Pages processed: %s' % processed
 
     return True
 
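The try/except guards against total being 0. A sketch of an equivalent check-before-divide report; the float cast is an addition here, not in the patch, since processed / total is integer division under Python 2:

def report_progress(processed, total):
    # avoid dividing by zero and report a readable percentage
    if total:
        print 'Pages processed: %s (%.1f%%)' % (processed, 100.0 * processed / total)
    else:
        print 'Pages processed: %s' % processed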
@@ -312,6 +327,7 @@
         id = o[0]
         if id not in d:
             d[id] = []
+        #if len(o) == 6:
         d[id].append(o)
     return d
 
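For reference, the grouping done here can also be written with collections.defaultdict; a behaviorally equivalent sketch, not part of the patch:

from collections import defaultdict

def group_observations(observations):
    d = defaultdict(list)
    for o in observations:
        d[o[0]].append(o)   # group every observation under its editor id
    return d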
@@ -319,7 +335,8 @@
 def write_output(observations, filehandles, lock, rts):
     observations = group_observations(observations)
     for obs in observations:
-        lock.acquire() #lock the write around all edits of an editor for a particular page
+        #lock the write around all edits of an editor for a particular page
+        lock.acquire()
         try:
             for i, o in enumerate(observations[obs]):
                 if i == 0:
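The acquire now sits below the comment; since threading locks are context managers, the acquire/try pattern could also be expressed with a with block. A sketch under that assumption, with write_one standing in as a hypothetical placeholder for the actual write logic:

def write_grouped(observations, lock, write_one):
    for obs in observations:
        # one lock scope per editor, covering all of that editor's edits to a page
        with lock:
            for i, o in enumerate(observations[obs]):
                write_one(i, o)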