r84695 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r84694 | r84695 | r84696 >
Date:19:58, 24 March 2011
Author:diederik
Status:deferred
Tags:
Comment:
Added counting of the number of edits for the prediction dataset.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (modified) (history)

Diff [purge]

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -34,6 +34,8 @@
3535 if '..' not in sys.path:
3636 sys.path.append('..')
3737
 38+from utils import file_utils
 39+
3840 try:
3941 from database import cassandra
4042 import pycassa
@@ -265,7 +267,11 @@
266268 if ns == {}:
267269 for namespace in NAMESPACE.values():
268270 if title.startswith(namespace):
269 - ns = False #article does not belong to either the main namespace, user, talk or user talk namespace.
 271+ '''
 272+ article does not belong to either the main namespace, user,
 273+ talk or user talk namespace.
 274+ '''
 275+ ns = False
270276 return ns
271277 ns['namespace'] = 0
272278 else:
@@ -283,7 +289,7 @@
284290
285291 def is_revision_reverted(hash_cur, hashes):
286292 revert = {}
287 - if hash_cur in hashes:
 293+ if hash_cur in hashes and hash_cur != -1:
288294 revert['revert'] = 1
289295 else:
290296 revert['revert'] = 0
@@ -296,12 +302,34 @@
297303 if text != None and text.text != None:
298304 comment[revision_id] = text.text.encode('utf-8')
299305 return comment
300 -
301306
 307+
 308+
 309+def count_edits(article, counts, bots):
 310+ title = article['title'].text
 311+ namespace = determine_namespace(article['title'])
 312+
 313+ if namespace != False:
 314+ article_id = article['id'].text
 315+ revisions = article['revisions']
 316+ for revision in revisions:
 317+ if revision == None:
 318+ #the entire revision is empty, weird.
 319+ continue
 320+ contributor = revision.find('contributor')
 321+ contributor = parse_contributor(contributor, bots)
 322+ if not contributor:
 323+ #editor is anonymous, ignore
 324+ continue
 325+ counts.setdefault(contributor['username'], 0)
 326+ counts[contributor['username']] += 1
 327+ return counts
 328+
 329+
302330 def create_variables(article, cache, bots):
303331 title = article['title'].text
304332 namespace = determine_namespace(article['title'])
305 -
 333+
306334 if namespace != False:
307335 cache.stats.count_articles += 1
308336 article_id = article['id'].text
@@ -324,7 +352,7 @@
325353 if revision_id == None:
326354 #revision_id is missing, which is weird
327355 continue
328 -
 356+
329357 row = prefill_row(title, article_id, namespace)
330358 row['revision_id'] = revision_id
331359 text = extract_revision_text(revision)
@@ -332,7 +360,7 @@
333361
334362 comment = extract_comment_text(revision_id, revision)
335363 cache.comments.update(comment)
336 -
 364+
337365 timestamp = revision.find('timestamp').text
338366 row['timestamp'] = timestamp
339367
@@ -366,12 +394,17 @@
367395 return article
368396
369397
370 -def stream_raw_xml(input_queue, storage, id):
 398+def stream_raw_xml(input_queue, storage, id, dataset='training'):
371399 buffer = cStringIO.StringIO()
372400 parsing = False
 401+ i = 0
373402 bots = detector.retrieve_bots('en')
374 - cache = Buffer(storage, id)
375 - i = 0
 403+
 404+ if dataset == 'training':
 405+ cache = Buffer(storage, id)
 406+ else:
 407+ counts = {}
 408+
376409 while True:
377410 filename = input_queue.get()
378411 input_queue.task_done()
@@ -379,38 +412,32 @@
380413 break
381414
382415 for data in unzip(filename):
383 - if data.startswith('<page>'):
 416+ if data.find('<page>') > -1:
384417 parsing = True
385418 if parsing:
386419 buffer.write(data)
387 - buffer.write('\n')
388 - if data == '</page>':
 420+ if data.find('</page>') > -1:
389421 i += 1
390422 buffer.seek(0)
391423 article = parse_xml(buffer)
392 - create_variables(article, cache, bots)
 424+ if dataset == 'training':
 425+ function(article, cache, bots)
 426+ else:
 427+ counts = function(article, counts, bots)
393428 buffer = cStringIO.StringIO()
394429
395430 if i % 10000 == 0:
396431 print 'Worker %s parsed %s articles' % (id, i)
397432
398 -
399 - cache.empty()
400 - print 'Finished parsing bz2 archives'
401 - cache.stats.summary()
 433+ if dataset == 'training':
 434+ cache.empty()
 435+ print 'Finished parsing bz2 archives'
 436+ cache.stats.summary()
 437+ else:
 438+ location = os.getcwd()
 439+ file_utils.store_object(counts, location, 'counts.bin')
402440
403441
404 -def debug():
405 - input_queue = JoinableQueue()
406 - result_queue = JoinableQueue()
407 - files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
408 -
409 - for file in files:
410 - input_queue.put(file)
411 -
412 - stream_raw_xml(input_queue, result_queue)
413 -
414 -
415442 def unzip(filename):
416443 '''
417444 Filename should be a fully qualified path to the bz2 file that will be
@@ -419,26 +446,26 @@
420447 '''
421448 fh = bz2.BZ2File(filename, 'r')
422449 for line in fh:
423 - line = line.strip()
424450 yield line
425451 fh.close()
426452 print 'Reached end of BZ2 file.'
427453
 454+
428455 def setup(storage):
429456 keyspace_name = 'enwiki'
430457 if storage == 'cassandra':
431458 cassandra.install_schema(keyspace_name, drop_first=True)
432459
433460
434 -def launcher():
 461+def launcher(function, path):
435462 storage = 'csv'
436463 setup(storage)
437464 input_queue = JoinableQueue()
438465 #files = ['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
439466 #files = ['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']
440 - path = '/media/wikipedia_dumps/batch1/'
441 - files = file_utils.retrieve_file_list(path, 'bz2', mask=None)
442467
 468+ files = file_utils.retrieve_file_list(path, 'bz2')
 469+
443470 for file in files:
444471 filename = os.path.join(path, file)
445472 print filename
@@ -447,20 +474,19 @@
448475 for x in xrange(cpu_count()):
449476 input_queue.put(None)
450477
451 - extracters = [Process(target=stream_raw_xml, args=[input_queue, storage, x])
 478+ extracters = [Process(target=stream_raw_xml, args=[input_queue, function, storage, x])
452479 for x in xrange(cpu_count())]
453480 for extracter in extracters:
454481 extracter.start()
455482
456 - #creators = [Process(target=create_variables, args=[result_queue, storage, x])
457 - # for x in xrange(cpu_count())]
458 - #for creator in creators:
459 - # creator.start()
460 -
461 -
462483 input_queue.join()
463484
464485
465486 if __name__ == '__main__':
466 - #debug()
467 - launcher()
 487+ path1 = '/media/wikipedia_dumps/batch1/'
 488+ path2 = '/media/wikipedia_dumps/batch2/'
 489+ function1 = create_variables
 490+ function2 = count_edits
 491+
 492+ launcher(function1, path1) # launcher for creating training data
 493+ launcher(function2, path2) # launcher for creating test data

Status & tagging log