r84947 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84946 | r84947 | r84948 >
Date:04:56, 29 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Fixed some utf8 encoding problems.
Modified paths:
  • /trunk/tools/editor_trends/etl/extracter.py (modified) (history)
  • /trunk/tools/editor_trends/etl/store.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/store.py
@@ -68,7 +68,7 @@
6969 date = text_utils.convert_timestamp_to_datetime_utc(line[1])
7070 article_id = int(line[2])
7171 username = line[3].encode(self.rts.encoding)
72 - ns = int(line[4])
 72+ ns = int(line[5])
7373 value = {'date': date,
7474 'article': article_id,
7575 'username': username,
Index: trunk/tools/editor_trends/etl/extracter.py
@@ -24,6 +24,9 @@
2525 import progressbar
2626 from Queue import Empty
2727
 28+if '..' not in sys.path:
 29+ sys.path.append('..')
 30+
2831 import wikitree.parser
2932 from bots import detector
3033 from utils import file_utils
@@ -148,7 +151,8 @@
149152 def extract_username(contributor, **kwargs):
150153 contributor = contributor.find('username')
151154 if contributor != None:
152 - return contributor.text
 155+ contributor = contributor.text.encode('utf-8')
 156+ return contributor.decode('utf-8')
153157 else:
154158 return None
155159
@@ -182,6 +186,14 @@
183187 return None
184188
185189
 190+def parse_title(title):
 191+ if type(title.text) == type('str'):
 192+ title = title.text.decode('utf-8')
 193+ else:
 194+ title = title.text
 195+ return title
 196+
 197+
186198 def output_editor_information(revisions, page, bots, rts):
187199 '''
188200 @elem is an XML element containing 1 revision from a page
@@ -282,7 +294,7 @@
283295 namespace = parse_article(title, ns)
284296 if namespace != False:
285297 article_id = page.find('id').text
286 - title = page.find('title').text
 298+ title = parse_title(title)
287299 revisions = page.findall('revision')
288300 revisions = parse_comments(rts, revisions, remove_numeric_character_references)
289301 output = output_editor_information(revisions, article_id, bot_ids, rts)
@@ -297,7 +309,10 @@
298310 fh2.close()
299311 print 'Closing %s...' % (os.path.join(location, filename))
300312 print 'Total pages: %s' % total
301 - print 'Pages processed: %s (%s)' % (processed, processed / total)
 313+ try:
 314+ print 'Pages processed: %s (%s)' % (processed, processed / total)
 315+ except ZeroDivisionError:
 316+ print 'Pages processed: %s' % processed
302317
303318 return True
304319
@@ -312,6 +327,7 @@
313328 id = o[0]
314329 if id not in d:
315330 d[id] = []
 331+ #if len(o) == 6:
316332 d[id].append(o)
317333 return d
318334
@@ -319,7 +335,8 @@
320336 def write_output(observations, filehandles, lock, rts):
321337 observations = group_observations(observations)
322338 for obs in observations:
323 - lock.acquire() #lock the write around all edits of an editor for a particular page
 339+ #lock the write around all edits of an editor for a particular page
 340+ lock.acquire()
324341 try:
325342 for i, o in enumerate(observations[obs]):
326343 if i == 0: