r75338 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r75337 | r75338 | r75339 >
Date:	20:00, 24 October 2010
Author:	ariel
Status:	deferred
Tags:
Comment:	initial import, routines for bzip2 compressed xml data, still needs lots of work
Modified paths:	/branches/ariel/xmldumps-backup/Bzip2Xml.py (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/Bzip2Xml.py
—	—	@@ -0,0 +1,178 @@
	2	+import getopt
	3	+import os
	4	+import re
	5	+import sys
	6	+import time
	7	+import bz2
	8	+import xml.sax
	9	+import Bzip2RandomAccess
	10	+from Bzip2RandomAccess import BzFile
	11	+
	12	+class PageInXml:
	13	+ """One page in XML, minus most of the content which we don't care about"""
	14	+
	15	+ def __init__(self, title, id, revisionids):
	16	+ self.title = title
	17	+ self.id = id
	18	+ self.revisionids = revisionids
	19	+
	20	+class XmlFileChunk(object):
	21	+ """find pageID in last complete or partial bzip2 block before end of file,
	22	+ something like that."""
	23	+ def __init__(self, fileName):
	24	+ self._fileName = fileName
	25	+ self._dataBlock = None
	26	+ self._pageID = None
	27	+ self._f = BzFile(self._fileName)
	28	+ # not convinced I need this now, let's see
	29	+ self._seekOffset = None
	30	+
	31	+ def getPageID(self, pageData):
	32	+ # there is the possibility that this chunk of data will have had the page tag
	33	+ # but not the page ID tag in it.
	34	+ titleAndIDPattern = re.compile('<title>(?P<title>.+?)</title>\s*' + '<id>(?P<pageid>\d+?)</id>')
	35	+ result = titleAndIDPattern.search(pageData)
	36	+ if (not result):
	37	+ return None
	38	+ print result.group('title')
	39	+ return result.group('pageid')
	40	+
	41	+ def uncompressedPageDataCount(self,page,pattern):
	42	+ """from one page (ie <page> through close tag)
	43	+ count the nuber of some given tag or other string"""
	44	+ return(page.count(pattern))
	45	+
	46	+ def countRevisionsInData(self, uncompressedData):
	47	+ """return number of revisions in uncompressedPageData,
	48	+ looking at revision start tags only"""
	49	+ if not uncompressedPageData:
	50	+ return 0
	51	+ return(self.uncompressedPageDataCount(uncompressedPageData,"<revision>"))
	52	+
	53	+ # FIXME used but incomplete, is this really the way to get revision counts for a
	54	+ # page is to get a whole page worth of data and count the revisions?
	55	+ # I doubt it, we really just set state = in page, count revisions
	56	+ # until we get to state not in page. something like that.
	57	+ def getOnePage(self,data,offset=0):
	58	+ """get one page starting at offset specified from uncompressed data"""
	59	+ offsetPage = data[offset:]
	60	+ pageStart = offsetPage.find("<page>")
	61	+ if (not pageStart):
	62	+ return None
	63	+ pageEnd = offsetPage[pageStart:].find("</page>")
	64	+ if (not pageEnd):
	65	+ # FIXME we should go get more blocks or something?
	66	+ return None
	67	+ return offsetPage[pageStart:pageStart + pageEnd+len("</page>")]
	68	+
	69	+ # FIXME too (unused and incomplete)
	70	+ def findPageInBlock(self, uncompressedData):
	71	+ # format:
	72	+ # <page>
	73	+ # <title>MediaWiki:Categories</title>
	74	+ # <id>1</id>
	75	+ # etc.
	76	+ pageStartPattern = re.compile('<page>\s*');
	77	+ result = pageStartPattern.search(uncompressedData)
	78	+ if not result:
	79	+ return None
	80	+ # now we look for the end page marker. we will
	81	+ # put the uncompressed page someplace
	82	+ pageEndPattern = re.compile('</page>');
	83	+ result = pageEndPattern.search(uncompressedData)
	84	+ if not result:
	85	+ # need to grab the next block...
	86	+ # FIXME from here
	87	+ pass
	88	+
	89	+ def findPageIDInBlock(self, uncompressedData):
	90	+ # format:
	91	+ # <page>
	92	+ # <title>MediaWiki:Categories</title>
	93	+ # <id>1</id>
	94	+ # etc.
	95	+
	96	+ pageStartPattern = re.compile('<page>\s*');
	97	+ result = pageStartPattern.search(uncompressedData)
	98	+ if not result:
	99	+ return None
	100	+
	101	+ # we want the first page available in this block I guess
	102	+ # hmm, this block might have stuff from the previous pageID.
	103	+ # or some one much earlier than that, if some pages were deleted.
	104	+ # how can we tell? have to go find it??
	105	+
	106	+ pages = uncompressedData[result.start():].split("<page>")
	107	+ for page in pages:
	108	+ ID = self.getPageID(page)
	109	+ if (ID):
	110	+ return(ID)
	111	+ return None
	112	+
	113	+ def findPageIDFromSeekpoint(self, seek, maxBlocksToCheck = None):
	114	+ block = self._f.findBzBlockFromSeekPoint(seek)
	115	+ if not block:
	116	+ print "DEBUG: findPageIDFromSeekpoint: no block found, wtf"
	117	+ return (None, None)
	118	+ uncompressedData = block.getUncompressedData()
	119	+ if not uncompressedData:
	120	+ print "DEBUG: findPageIDFromSeekpoint: no bzip2 block found"
	121	+ return (None, None)
	122	+ # we got a block, we can look for a pageid in it (or in it plus the next
	123	+ # one, if there is a next one)
	124	+ pageID = self.findPageIDInBlock(uncompressedData)
	125	+ print "DEBUG: findPageIDFromSeekpoint: trying to find pageid in block"
	126	+ if (pageID):
	127	+ self._dataBlock = block
	128	+ self._pageID = pageID
	129	+ self._seekOffset = -1*seek
	130	+ return(pageID, uncompressedData)
	131	+
	132	+ blockCount = 1
	133	+ pageID = None
	134	+ while (True):
	135	+ if (maxBlocksToCheck and (blockCount > maxBlocksToCheck)):
	136	+ break
	137	+ seek = seek + block.getBlockLength()
	138	+ block = self._f.findBzBlockFromSeekPoint(seek)
	139	+ # the n is length of <title> plus </title> plus <id> plus </id> plus <page>
	140	+ # plus max title length plus a few for good measure. so title length max is 255
	141	+ # let's future proof this a bit
	142	+ prevBytes = uncompressedData[-1050:]
	143	+ uncompressedData = block.getUncompressedData()
	144	+ if not uncompressedData:
	145	+ break
	146	+ uncompressedData = prevBytes + uncompressedData
	147	+ pageID = self.findPageIDInBlock(uncompressedData)
	148	+ if (pageID):
	149	+ self._dataBlock = block
	150	+ self._pageID = pageID
	151	+ self._seekOffset = -1*seek
	152	+ break
	153	+ return(pageID, uncompressedData)
	154	+
	155	+ def close(self):
	156	+ self._f.close()
	157	+
	158	+if __name__ == "__main__":
	159	+ try:
	160	+# f = XmlFileChunk("/home/ariel/elwikt/elwiktionary-20100305-pages-articles.xml.bz2")
	161	+ f = XmlFileChunk("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2")
	162	+# f = XmlFileChunk("/mnt/dataset1/xmldatadumps/public/enwiki/20100904/enwiki-20100904-pages-meta-history11.xml.bz2")
	163	+ if not f:
	164	+ print "couldn't initialize file for searching"
	165	+ f.close()
	166	+ os.sys.exit()
	167	+
	168	+ for i in range(1,100):
	169	+ (id,stuff) = f.findPageIDFromSeekpoint(1315000*i)
	170	+ if (id):
	171	+ print "page id:", id, " offset from eof:", f._seekOffset, "number of revisions: ", f.countRevisionsInData(f.getOnePage(stuff))
	172	+
	173	+ else:
	174	+ print "no id found"
	175	+
	176	+ f.close()
	177	+ except(IOError):
	178	+ print "there was no such file, you fool"
	179	+
Property changes on: branches/ariel/xmldumps-backup/Bzip2Xml.py
___________________________________________________________________
Added: svn:eol-style
1	180	+ native

Status & tagging log

21:54, 3 December 2010 Reedy (talk | contribs) changed the status of r75338 [removed: new added: deferred]