r75338 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: r75337 | r75338 | r75339 >
Date:20:00, 24 October 2010
Author:ariel
Status:deferred
Tags:
Comment:
initial import, routines for bzip2 compressed xml data, still needs lots of work
Modified paths:
  • /branches/ariel/xmldumps-backup/Bzip2Xml.py (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/Bzip2Xml.py
@@ -0,0 +1,178 @@
 2+import getopt
 3+import os
 4+import re
 5+import sys
 6+import time
 7+import bz2
 8+import xml.sax
 9+import Bzip2RandomAccess
 10+from Bzip2RandomAccess import BzFile
 11+
class PageInXml:
    """Skeletal record of one page from an XML dump: title, page id and
    the list of its revision ids; the bulk of the page content is
    deliberately not kept since we don't care about it."""

    def __init__(self, title, id, revisionids):
        # NOTE: 'id' shadows the builtin but is kept for interface compatibility
        self.revisionids = revisionids
        self.id = id
        self.title = title
 19+
class XmlFileChunk(object):
    """Find the pageID in the last complete or partial bzip2 block before
    end of file, something like that: scan bz2 blocks of a compressed XML
    dump starting at an arbitrary seek point and locate page ids."""

    def __init__(self, fileName):
        # path of the bz2-compressed XML dump to scan
        self._fileName = fileName
        # block in which the most recent page id was found
        self._dataBlock = None
        # most recently found page id (string of digits), or None
        self._pageID = None
        self._f = BzFile(self._fileName)
        # not convinced I need this now, let's see
        self._seekOffset = None

    def getPageID(self, pageData):
        """Return the first page id (string of digits) found in pageData,
        or None. There is the possibility that this chunk of data will
        have had the page tag but not the page ID tag in it, in which
        case there is nothing to find."""
        titleAndIDPattern = re.compile(r'<title>(?P<title>.+?)</title>\s*<id>(?P<pageid>\d+?)</id>')
        result = titleAndIDPattern.search(pageData)
        if not result:
            return None
        print(result.group('title'))
        return result.group('pageid')

    def uncompressedPageDataCount(self, page, pattern):
        """From one page (ie <page> through close tag) count the number
        of occurrences of some given tag or other string."""
        return page.count(pattern)

    def countRevisionsInData(self, uncompressedData):
        """Return number of revisions in uncompressedData, looking at
        revision start tags only."""
        # FIX: previously referenced the undefined name 'uncompressedPageData',
        # raising NameError for any non-empty input
        if not uncompressedData:
            return 0
        return self.uncompressedPageDataCount(uncompressedData, "<revision>")

    # FIXME used but incomplete, is this really the way to get revision counts
    # for a page, to get a whole page worth of data and count the revisions?
    # I doubt it, we really just set state = in page, count revisions
    # until we get to state not in page. something like that.
    def getOnePage(self, data, offset=0):
        """Return one page (<page> through its close tag) starting at the
        offset specified from uncompressed data, or None if no complete
        page is present."""
        offsetPage = data[offset:]
        pageStart = offsetPage.find("<page>")
        # FIX: str.find returns -1 on failure; 'not pageStart' wrongly
        # rejected a page at position 0 and accepted a missing one
        if pageStart == -1:
            return None
        pageEnd = offsetPage[pageStart:].find("</page>")
        if pageEnd == -1:
            # FIXME we should go get more blocks or something?
            return None
        return offsetPage[pageStart:pageStart + pageEnd + len("</page>")]

    # FIXME too (unused and incomplete)
    def findPageInBlock(self, uncompressedData):
        """Locate the start of a page in uncompressedData; handling a page
        that runs past the block is not implemented yet."""
        # format:
        # <page>
        #   <title>MediaWiki:Categories</title>
        #   <id>1</id>
        # etc.
        pageStartPattern = re.compile(r'<page>\s*')
        result = pageStartPattern.search(uncompressedData)
        if not result:
            return None
        # now we look for the end page marker. we will
        # put the uncompressed page someplace
        pageEndPattern = re.compile(r'</page>')
        result = pageEndPattern.search(uncompressedData)
        if not result:
            # need to grab the next block...
            # FIXME from here
            pass

    def findPageIDInBlock(self, uncompressedData):
        """Return the first page id found after a <page> tag in
        uncompressedData, or None.

        We want the first page available in this block I guess.
        hmm, this block might have stuff from the previous pageID,
        or some one much earlier than that, if some pages were deleted.
        how can we tell? have to go find it??"""
        # format:
        # <page>
        #   <title>MediaWiki:Categories</title>
        #   <id>1</id>
        # etc.
        pageStartPattern = re.compile(r'<page>\s*')
        result = pageStartPattern.search(uncompressedData)
        if not result:
            return None
        pages = uncompressedData[result.start():].split("<page>")
        for page in pages:
            ID = self.getPageID(page)
            if ID:
                return ID
        return None

    def findPageIDFromSeekpoint(self, seek, maxBlocksToCheck=None):
        """Scan forward from the bz2 block at/after the given seek point
        until a page id is found.

        Returns (pageID, uncompressedData); pageID is None if nothing was
        found within maxBlocksToCheck blocks (or before data ran out)."""
        block = self._f.findBzBlockFromSeekPoint(seek)
        if not block:
            print("DEBUG: findPageIDFromSeekpoint: no block found, wtf")
            return (None, None)
        uncompressedData = block.getUncompressedData()
        if not uncompressedData:
            print("DEBUG: findPageIDFromSeekpoint: no bzip2 block found")
            return (None, None)
        # we got a block, we can look for a pageid in it (or in it plus the next
        # one, if there is a next one)
        pageID = self.findPageIDInBlock(uncompressedData)
        print("DEBUG: findPageIDFromSeekpoint: trying to find pageid in block")
        if pageID:
            self._dataBlock = block
            self._pageID = pageID
            self._seekOffset = -1 * seek
            return (pageID, uncompressedData)

        blockCount = 1
        pageID = None
        while True:
            if maxBlocksToCheck and (blockCount > maxBlocksToCheck):
                break
            seek = seek + block.getBlockLength()
            block = self._f.findBzBlockFromSeekPoint(seek)
            # FIX: guard against running off the end of the file; previously
            # a None block raised AttributeError on getUncompressedData()
            if not block:
                break
            # keep the tail of the previous block in case a title/id pair
            # straddles the boundary: the n is length of <title> plus </title>
            # plus <id> plus </id> plus <page> plus max title length (255)
            # plus a few for good measure; let's future proof this a bit
            prevBytes = uncompressedData[-1050:]
            uncompressedData = block.getUncompressedData()
            if not uncompressedData:
                break
            uncompressedData = prevBytes + uncompressedData
            pageID = self.findPageIDInBlock(uncompressedData)
            if pageID:
                self._dataBlock = block
                self._pageID = pageID
                self._seekOffset = -1 * seek
                break
            # FIX: blockCount was never incremented, so maxBlocksToCheck
            # could never take effect
            blockCount += 1
        return (pageID, uncompressedData)

    def close(self):
        """Release the underlying bz2 file."""
        self._f.close()
 157+
if __name__ == "__main__":
    # Crude smoke test: probe a history dump at a series of seek points and
    # report the first page id found at or after each one, with its revision
    # count. Hard-coded path; other dumps used during development:
    #   /home/ariel/elwikt/elwiktionary-20100305-pages-articles.xml.bz2
    #   /mnt/dataset1/xmldatadumps/public/enwiki/20100904/enwiki-20100904-pages-meta-history11.xml.bz2
    try:
        # FIX: dropped the dead 'if not f' check -- an instance is always
        # truthy, and if the constructor raised, f was never bound anyway;
        # the IOError handler below covers a missing file
        f = XmlFileChunk("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2")
        for i in range(1, 100):
            # FIX: renamed 'id' -> 'pageID' to avoid shadowing the builtin
            (pageID, stuff) = f.findPageIDFromSeekpoint(1315000 * i)
            if pageID:
                # same output bytes as the old comma-separated print
                print("page id: %s  offset from eof: %s number of revisions:  %s"
                      % (pageID, f._seekOffset, f.countRevisionsInData(f.getOnePage(stuff))))
            else:
                print("no id found")
        f.close()
    except IOError:
        print("there was no such file, you fool")
 179+
Property changes on: branches/ariel/xmldumps-backup/Bzip2Xml.py
___________________________________________________________________
Added: svn:eol-style
1180 + native

Status & tagging log