r81588 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r81587‎ | r81588 | r81589 >
Date:06:12, 6 February 2011
Author:ariel
Status:deferred
Tags:
Comment:
new functions: check mw footer existence, get last pageID from last complete block (from possibly truncated file)
Modified paths:
  • /branches/ariel/xmldumps-backup/Bzip2Xml.py (modified) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/Bzip2Xml.py
@@ -6,8 +6,110 @@
77 import bz2
88 import xml.sax
99 import Bzip2RandomAccess
10 -from Bzip2RandomAccess import BzFile
1110
 11+from os.path import dirname, exists, getsize, join, realpath
 12+from Bzip2RandomAccess import BzFile, BzConstants
 13+
class BzXmlFile(object):
    """Convenience functions for a possibly truncated bzip2-compressed
    XML dump file: check whether the closing </mediawiki> footer is present
    in the last complete block, and find the last page ID that can be read
    from the complete blocks of the file."""

    # How many trailing characters of the text scanned so far are carried
    # over and prepended to the next block, so that a page ID pattern split
    # across two blocks can still be matched.
    _holdoverLength = 1025

    def __init__(self, fileName):
        """Open fileName (path to the bzip2 file) through BzFile."""
        self._fileName = fileName
        self._dataBlock = None
        self._pageID = None
        self._bzf = BzFile(self._fileName)

    def getFooter(self):
        """Return True if the uncompressed text of the last full bzip2
        block contains the closing </mediawiki> tag, False otherwise
        (including when no full block or no data can be retrieved)."""
        block = self._bzf.findLastFullBzBlock()
        if not block:
            return False
        uncompressedData = block.getUncompressedData()
        if not uncompressedData:
            return False
        footerPattern = re.compile(r'</mediawiki>\s*')
        return footerPattern.search(uncompressedData) is not None

    def _getScanInterval(self):
        """Return how far back, in bytes, each backward jump goes: twice
        the maximum compressed block size for this file's block size
        multiplier. Blocks should always be smaller than this, so starting
        this far back should be guaranteed to find a block (unless the
        file is hopelessly corrupted)."""
        return BzConstants.getMaxCompressedBlockSize(self._bzf.getBlockSizeMultiplier()) * 2

    def getLastPageIDFromFile(self):
        """Return the last page ID found in the file, as a string of
        digits, or None if no page ID could be found.

        Start from a fixed length back from the end of the file and read
        forward through the blocks looking for page ID info; the goal is
        to find the *last one* in the file. If none is found in those
        blocks, step back that same length further and try again. Repeat
        until one is found or we run out of file."""
        # NOTE(review): reaches into BzFile's private file handle to get
        # the file size, as the original code did.
        self._bzf._f.seek(0, os.SEEK_END)
        filesize = self._bzf._f.tell()

        # Because stuff isn't byte aligned, after multiple iterations of
        # getting blocks and matches we could be a few bytes off in
        # various length counts; this slack absorbs that.
        fudgefactor = 10

        interval = self._getScanInterval()
        pageIDPattern = re.compile(r'</title>\n\s*<id>(?P<pageid>\d+?)</id>')

        offset = filesize
        # the position in the file after which we have already scanned
        didAlready = filesize
        # The pattern could be positioned so that it is split across two
        # blocks, so leftover text is kept here to be prepended to the
        # text of the next block.
        holdoverString = ""
        # Initialized up front so every early return below has a value.
        currentPageID = None

        # Outer loop: jump back in the file from where we are, far enough
        # that we are guaranteed to find at least one bzip2 block in there
        # if the file isn't garbage, then read through it looking for page
        # IDs until we get to ground we've already covered. Found one?
        # Return it. Otherwise jump back even further and repeat.
        while offset > 0:
            start = offset - interval
            if start < 0:
                if didAlready != fudgefactor:
                    start = 0
                else:
                    return currentPageID
            didAlready = min(offset + fudgefactor, filesize)
            currentPageID = None

            # Inner loop: walk through the blocks going forward, noting
            # every page ID we encounter so that the last one wins. Only
            # if we find none do we fall through to the outer loop and
            # back up further towards the beginning of the file.
            while start < didAlready:
                block = self._bzf.findBzBlockFromSeekPoint(start)
                if not block:
                    return currentPageID
                uncompressedData = block.getUncompressedData()
                if uncompressedData:
                    # Search the holdover plus the new data (not just the
                    # new data) so a match straddling the block boundary
                    # is seen, and take the last match rather than the
                    # first one.
                    text = holdoverString + uncompressedData
                    for match in pageIDPattern.finditer(text):
                        currentPageID = match.group('pageid')
                    holdoverString = text[-self._holdoverLength:]
                start = start + block.getBlockLength() - fudgefactor

            if currentPageID:
                return currentPageID

            # NOTE(review): the original author was unsure whether this
            # should instead just add the block length (maybe minus a few
            # bytes); the arithmetic is left unchanged.
            offset = start - interval
            if offset < 0:
                return currentPageID
            holdoverString = ""

        return None
 113+
12114 class PageInXml:
13115 """One page in XML, minus most of the content which we don't care about"""
14116
@@ -16,6 +118,11 @@
17119 self.id = id
18120 self.revisionids = revisionids
19121
 122+#
 123+#
 124+# the rest of this crap may get tossed... we shall see
 125+#
 126+#
20127 class XmlFileChunk(object):
21128 """find pageID in last complete or partial bzip2 block before end of file,
22129 something like that."""
@@ -109,7 +216,7 @@
110217 return(ID)
111218 return None
112219
113 - def findPageIDFromSeekpoint(self, seek, maxBlocksToCheck = None):
 220+ def findPageIDFromSeekpoint(self, seek, maxBlocksToCheck = None, maxOffset = None):
114221 block = self._f.findBzBlockFromSeekPoint(seek)
115222 if not block:
116223 print "DEBUG: findPageIDFromSeekpoint: no block found, wtf"
@@ -134,6 +241,8 @@
135242 if (maxBlocksToCheck and (blockCount > maxBlocksToCheck)):
136243 break
137244 seek = seek + block.getBlockLength()
 245+ if (maxOffset and (seek > maxOffset)):
 246+ break
138247 block = self._f.findBzBlockFromSeekPoint(seek)
139248 # the n is length of <title> plus </title> plus <id> plus </id> plus <page>
140249 # plus max title length plus a few for good measure. so title length max is 255
@@ -155,6 +264,20 @@
156265 self._f.close()
157266
158267 if __name__ == "__main__":
 268+ myfile = BzXmlFile("/home/ariel/src/mediawiki/testing/dumps/public/elwikidb/20110205/elwikidb-20110205-pages-articles4.xml.bz2")
 269+ pageID = myfile.getLastPageIDFromFile()
 270+ if pageID:
 271+ print "pageID", pageID
 272+ else:
 273+ print "no pageID for you!"
 274+
 275+
 276+ result = myfile.getFooter()
 277+ if (result):
 278+ print "and this file has a footer"
 279+ else:
 280+ print "no footer on this baby"
 281+
159282 try:
160283 # f = XmlFileChunk("/home/ariel/elwikt/elwiktionary-20100305-pages-articles.xml.bz2")
161284 f = XmlFileChunk("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2")
@@ -176,3 +299,4 @@
177300 except(IOError):
178301 print "there was no such file, you fool"
179302
 303+

Status & tagging log