Index: trunk/tools/editor_trends/etl/enricher.py |
— | — | @@ -0,0 +1,182 @@ |
| 2 | +#!/usr/bin/python |
| 3 | +# -*- coding: utf-8 -*- |
| 4 | +''' |
| 5 | +Copyright (C) 2010 by Diederik van Liere (dvanliere@gmail.com) |
| 6 | +This program is free software; you can redistribute it and/or |
| 7 | +modify it under the terms of the GNU General Public License version 2 |
| 8 | +as published by the Free Software Foundation. |
| 9 | +This program is distributed in the hope that it will be useful, |
| 10 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 12 | +See the GNU General Public License for more details, at |
| 13 | +http://www.fsf.org/licenses/gpl.html |
| 14 | +''' |
| 15 | + |
| 16 | +__author__ = '''\n'''.join(['Diederik van Liere (dvanliere@gmail.com)', ]) |
| 17 | +__email__ = 'dvanliere at gmail dot com' |
| 18 | +__date__ = '2011-02-06' |
| 19 | +__version__ = '0.1' |
| 20 | + |
| 21 | + |
| 22 | +import bz2 |
| 23 | +import cStringIO |
| 24 | +import hashlib |
| 25 | +import codecs |
| 26 | +import re |
| 27 | +from multiprocessing import JoinableQueue, Process |
| 28 | +#from xml.etree.cElementTree. import iterparse |
| 29 | +from xml.etree.cElementTree import fromstring |
| 30 | + |
# Byte-identical to the original non-raw pattern: unrecognized escapes
# (\(, \`, \, ...) were preserved literally, while \' collapsed to a
# plain quote. A raw string makes that explicit and avoids the
# invalid-escape deprecation in newer Pythons.
RE_CATEGORY = re.compile(r"\(.*\`\,\.\-\:'\)")
| 32 | + |
def extract_categories(filename='/Users/diederik/Downloads/enwiki-20110115-categorylinks.sql',
                       output_file='categories.csv'):
    '''
    Parse a MediaWiki categorylinks SQL dump and write a tab-separated
    (page id, category name) file.

    Each row of the `categorylinks` INSERT statements has four fields:
    Field 1: page id
    Field 2: name category
    Field 3: sort key
    Field 4: timestamp last change

    :param filename: path to the categorylinks .sql dump (default kept
        for backward compatibility with the original hard-coded path).
    :param output_file: path of the TSV file to write.
    '''
    # NOTE(review): this naive parser drops all single quotes, so category
    # names containing escaped quotes or commas will be mangled -- acceptable
    # for a rough extraction, but not lossless.
    output = codecs.open(output_file, 'w', encoding='utf-8')
    fh = codecs.open(filename, 'r', encoding='utf-8')
    try:
        prefix = 'INSERT INTO `categorylinks` VALUES ('
        for line in fh:
            if not line.startswith(prefix):
                continue
            line = line.replace(prefix, '')
            line = line.replace("'", '')
            for row in line.split('),('):
                fields = row.split(',')
                # Only well-formed 4-field rows are written out.
                if len(fields) == 4:
                    output.write('%s\t%s\n' % (fields[0], fields[1]))
    finally:
        # Close both handles even if parsing raises mid-file.
        output.close()
        fh.close()
| 56 | + |
def extract_revision_text(revision):
    '''
    Return the UTF-8 encoded contents of the revision's <text> child,
    or None when there is no <text> element or it is empty.

    Fix: an empty <text/> element has .text == None; the original called
    .encode() on it and raised AttributeError.
    '''
    rev = revision.find('text')
    if rev is not None and rev.text is not None:
        return rev.text.encode('utf-8')
    return None
| 63 | + |
def create_md5hash(revision):
    '''
    Return the hex MD5 digest of the revision's text, used as a cheap
    revision-identity fingerprint (e.g. for revert detection).

    Returns False when the revision is None or has no text, mirroring the
    original sentinel convention used by its callers.
    '''
    if revision is None:
        return False
    rev = extract_revision_text(revision)
    if rev is None:
        return False
    m = hashlib.md5()
    m.update(rev)
    return m.hexdigest()
| 75 | + |
| 76 | + |
def calculate_delta_article_size(prev_size, revision):
    '''
    Return (delta, new_size): the size change of this revision relative
    to prev_size, and the size to carry into the next call.

    Returns False (not a tuple) when revision is None -- kept for
    backward compatibility with the original sentinel, though callers
    that unpack the result must guard for it.
    An empty revision counts as no change: (0, prev_size).
    '''
    if revision is None:
        return False
    rev = extract_revision_text(revision)
    if rev is None:
        return 0, prev_size
    new_size = len(rev)
    return new_size - prev_size, new_size
| 87 | + |
| 88 | + |
| 89 | + |
def create_variables(result_queue):
    '''
    Worker: consume raw <page> XML strings from result_queue and, for
    every <revision>, print the revision id, its MD5 hash, and the size
    delta versus the previous revision. A None item is the shutdown
    sentinel.
    '''
    while True:
        try:
            article = result_queue.get(block=True)
            result_queue.task_done()
            if article is None:
                break
            article = fromstring(article)
            prev_size = 0
            for revision in article.findall('revision'):
                revision_id = revision.find('id').text
                # 'rev_hash' instead of 'hash': do not shadow the builtin.
                rev_hash = create_md5hash(revision)
                delta, prev_size = calculate_delta_article_size(prev_size, revision)
                # Single %-formatted argument prints identically to the
                # original comma-separated form.
                print('%s %s %s %s' % (revision_id, rev_hash, delta, prev_size))
        except ValueError:
            # Best-effort: skip articles whose XML fails to parse and
            # keep consuming the queue.
            pass
| 108 | + |
| 109 | + |
| 110 | + |
def create_article(input_queue, result_queue):
    '''
    Worker: read bz2 dump filenames from input_queue, cut each <page>
    element out of the stream as a raw XML string and put it on
    result_queue for the create_variables workers.

    A None filename is the shutdown sentinel; one None is forwarded to
    result_queue so a downstream worker can stop as well.
    '''
    # 'buf' instead of 'buffer': do not shadow the builtin.
    buf = cStringIO.StringIO()
    parsing = False
    while True:
        filename = input_queue.get()
        input_queue.task_done()
        if filename is None:
            break

        for data in unzip(filename):
            if data.startswith('<page>'):
                parsing = True
            if parsing:
                buf.write(data)
            if data == '</page>':
                # Flush the completed page unconditionally: getvalue()
                # always returns a str here, so the original
                # `if xml1 != None` check was dead code.
                result_queue.put(buf.getvalue())
                buf = cStringIO.StringIO()
                # Bug fix: stop buffering until the next <page> tag.
                # Previously `parsing` stayed True forever, so lines
                # between pages leaked into the next page's buffer.
                parsing = False

    result_queue.put(None)
    print('Finished parsing bz2 archives')
| 143 | + |
def unzip(filename):
    '''
    Decompress the bz2 file at *filename* and yield its lines one by one
    (stripped of surrounding whitespace) to create_article.

    :param filename: fully qualified path to the bz2 file.

    Fix: close the handle in a finally block so it is released even when
    the consumer abandons the generator early -- the original close()
    after the loop never executed in that case.
    '''
    fh = bz2.BZ2File(filename, 'r')
    try:
        for line in fh:
            yield line.strip()
    finally:
        fh.close()
| 155 | + |
| 156 | + |
def launcher():
    '''
    Wire up the extraction pipeline: two processes split the bz2 dump
    into raw <page> strings, two processes derive per-revision variables
    from them. Blocks until both queues are fully drained.
    '''
    input_queue = JoinableQueue()
    result_queue = JoinableQueue()

    dumps = ['/Users/diederik/Downloads/enwiki-20110115-pages-articles1.xml.bz2']
    for dump in dumps:
        input_queue.put(dump)

    # One shutdown sentinel per extracter process.
    for _ in range(2):
        input_queue.put(None)

    workers = []
    for _ in range(2):
        workers.append(Process(target=create_article, args=[input_queue, result_queue]))
    for _ in range(2):
        workers.append(Process(target=create_variables, args=[result_queue]))
    for worker in workers:
        worker.start()

    input_queue.join()
    result_queue.join()
| 179 | + |
| 180 | + |
if __name__ == '__main__':
    # Currently only runs the one-shot category extraction; the
    # multiprocessing enrichment pipeline is disabled.
    extract_categories()
    #launcher()
\ No newline at end of file |