r82621 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r82620 | r82621 | r82622 >
Date: 18:36, 22 February 2011
Author: diederik
Status: deferred
Tags:
Comment:
Code to create data competition files.
Modified paths:
  • /trunk/tools/editor_trends/etl/enricher.py (added)

Diff

Index: trunk/tools/editor_trends/etl/enricher.py
@@ -0,0 +1,182 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com)
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-02-06'
+__version__ = '0.1'
+
+
+import bz2
+import cStringIO
+import hashlib
+import codecs
+import re
+from multiprocessing import JoinableQueue, Process
+#from xml.etree.cElementTree import iterparse
+from xml.etree.cElementTree import fromstring
+
+# Note: defined but currently unused.
+RE_CATEGORY = re.compile('\(.*\`\,\.\-\:\'\)')
+
+def extract_categories():
+    '''
+    Field 1: page id
+    Field 2: category name
+    Field 3: sort key
+    Field 4: timestamp of last change
+    '''
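+    # A single dump line bundles many rows, e.g. (hypothetical values):
+    #   INSERT INTO `categorylinks` VALUES (12,'Anarchism','anarchism',
+    #   '2010-01-22 03:19:20'),(25,'Autism','autism','2009-11-30 15:14:59');
+    # The loop below strips the prefix and quotes, splits rows on '),(' and
+    # keeps the page id and category name of every four-field row.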
+    filename = '/Users/diederik/Downloads/enwiki-20110115-categorylinks.sql'
+    output = codecs.open('categories.csv', 'w', encoding='utf-8')
+    fh = codecs.open(filename, 'r', encoding='utf-8')
+
+    for line in fh:
+        if line.startswith('INSERT INTO `categorylinks` VALUES ('):
+            line = line.replace('INSERT INTO `categorylinks` VALUES (', '')
+            line = line.replace("'", '')
+            categories = line.split('),(')
+            for category in categories:
+                category = category.split(',')
+                if len(category) == 4:
+                    output.write('%s\t%s\n' % (category[0], category[1]))
+
+    output.close()
+    fh.close()
+
+def extract_revision_text(revision):
+    rev = revision.find('text')
+    if rev is not None and rev.text is not None:
+        return rev.text.encode('utf-8')
+    else:
+        return None
+
+def create_md5hash(revision):
+    if revision is None:
+        return False
+    rev = extract_revision_text(revision)
+    if rev is not None:
+        m = hashlib.md5()
+        m.update(rev)
+        return m.hexdigest()
+    else:
+        return False
+
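+# Example (illustrative): two revisions whose text is byte-for-byte identical
+# produce the same hexdigest, so duplicates can be spotted by comparing hashes.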
+
+def calculate_delta_article_size(prev_size, revision):
+    # Always return a (delta, prev_size) tuple so callers can unpack safely.
+    if revision is None:
+        return 0, prev_size
+    rev = extract_revision_text(revision)
+    if rev is None:
+        return 0, prev_size
+    else:
+        delta = len(rev) - prev_size
+        prev_size = len(rev)
+        return delta, prev_size
+
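+# Worked example: starting from prev_size == 0, a 1000-byte revision text
+# yields (1000, 1000); a following 800-byte revision then yields (-200, 800).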
+
+
+def create_variables(result_queue):
+    '''
+    This function creates three variables per revision:
+    1) a MD5 hash of the revision text
+    2) the size of the current revision
+    3) the delta between the size of the current and the previous revision
+    '''
+    while True:
+        try:
+            article = result_queue.get(block=True)
+            result_queue.task_done()
+            if article is None:
+                break
+            article = fromstring(article)
+            prev_size = 0
+            revisions = article.findall('revision')
+            for revision in revisions:
+                revision_id = revision.find('id').text
+                md5_hash = create_md5hash(revision)
+                delta, prev_size = calculate_delta_article_size(prev_size, revision)
+                print revision_id, md5_hash, delta, prev_size
+        except ValueError, e:
+            pass
+            #print e
+
+
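+# The two functions below form a small producer/consumer pipeline:
+# create_article() workers read bz2 dump filenames from input_queue and push
+# complete <page> fragments onto result_queue, where create_variables()
+# workers parse them. A None item on either queue acts as a shutdown sentinel.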
+def create_article(input_queue, result_queue):
+    '''
+    Reads bz2 dump filenames from input_queue, reassembles each
+    <page> ... </page> fragment line by line and puts the complete
+    fragment on result_queue.
+    '''
+    buffer = cStringIO.StringIO()
+    parsing = False
+    while True:
+        filename = input_queue.get()
+        input_queue.task_done()
+        if filename is None:
+            break
+
+        for data in unzip(filename):
+            if data.startswith('<page>'):
+                parsing = True
+            if parsing:
+                buffer.write(data)
+                if data == '</page>':
+                    xml1 = buffer.getvalue()
+                    if xml1 != '':
+                        result_queue.put(xml1)
+                    buffer = cStringIO.StringIO()
+                    parsing = False
+
+    result_queue.put(None)
+    print 'Finished parsing bz2 archives'
+
+def unzip(filename):
+    '''
+    Filename should be a fully qualified path to the bz2 file that will be
+    decompressed. The file is read line by line; each stripped line is
+    yielded back to create_article.
+    '''
+    fh = bz2.BZ2File(filename, 'r')
+    for line in fh:
+        line = line.strip()
+        yield line
+    fh.close()
+
+
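+# launcher() wires the pipeline together. One None sentinel is queued per
+# extracter process; each extracter in turn puts one None on result_queue,
+# matching the number of creator processes that read from it.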
+def launcher():
+    input_queue = JoinableQueue()
+    result_queue = JoinableQueue()
+    files = ['/Users/diederik/Downloads/enwiki-20110115-pages-articles1.xml.bz2']
+    for filename in files:
+        input_queue.put(filename)
+
+    # One sentinel per extracter process.
+    for x in xrange(2):
+        input_queue.put(None)
+
+    extracters = [Process(target=create_article, args=[input_queue, result_queue]) for x in xrange(2)]
+    for extracter in extracters:
+        extracter.start()
+
+    creators = [Process(target=create_variables, args=[result_queue]) for x in xrange(2)]
+    for creator in creators:
+        creator.start()
+
+    input_queue.join()
+    result_queue.join()
+
+
+if __name__ == '__main__':
+    extract_categories()
+    #launcher()
\ No newline at end of file
