r75337 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r75336‎ | r75337 | r75338 >
Date:19:56, 24 October 2010
Author:ariel
Status:deferred
Tags:
Comment:
initial import, bzip block-level management routines
Modified paths:
  • /branches/ariel/xmldumps-backup/Bzip2RandomAccess.py (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/Bzip2RandomAccess.py
@@ -0,0 +1,657 @@
 2+import getopt
 3+import os
 4+import re
 5+import sys
 6+import time
 7+import bz2
 8+import xml.sax
 9+
 10+class ShiftedData(object):
 11+ """This class manages strings of data that have been left-shifted
 12+ 0 through 7 bits."""
 13+
 14+ def __init__(self, data, n=0, padding = None):
 15+ """Arguments:
 16+ data -- the data to shift
 17+ n -- the number of bits to shift left
 18+ padding -- whether to add 1's on upper end of new
 19+ leftmost byte and lower end of rightmost byte"""
 20+ self._data = data
 21+ self._bitsShiftedLeft = n % 8
 22+ self._padding = padding
 23+ self._shiftedData = self.shiftLeftNBits()
 24+
 25+ def getShiftedData(self):
 26+ return self._shiftedData
 27+
 28+ def getData(self):
 29+ return self._data
 30+
 31+ def getBitsShifted(self):
 32+ return(self._bitsShiftedLeft)
 33+
 34+ def getlength(self):
 35+ return len(self._data)
 36+
 37+ def getShiftedLength(self):
 38+ return len(self._shiftedData)
 39+
 40+ def shiftLeftNBits(self):
 41+ """shift a string of crap n bits left, pushing
 42+ overflow into a new leftmost byte"""
 43+ return ByteAlignedDataMethods.shiftLeftNBits(self._data,self._bitsShiftedLeft,self._padding)
 44+
 45+class ByteAlignedDataMethods(object):
 46+ """Contains various methods for byte-aligned data"""
 47+
 48+ def shiftLeftNBits(data, bits, padding = False):
 49+ """shift a string of crap n bits left, pushing
 50+ overflow into a new leftmost byte, padding on right
 51+ with 1s if requested, otherwise with 0s"""
 52+ if (bits == 0):
 53+ return data
 54+
 55+ resultList = []
 56+
 57+ # overflow from shift off left end, may be 0
 58+ overflow = ord(data[0])>> (8 - bits)
 59+ resultList.append(chr(overflow))
 60+
 61+ if (padding):
 62+ resultList[-1] = chr(ord(resultList[-1]) | (256 - 2**bits))
 63+
 64+ for i in range(0,len(data)):
 65+ c = ord(data[i])
 66+ if i == len(data)-1:
 67+ next = 0
 68+ else:
 69+ next = ord(data[i+1])
 70+
 71+ # grab stuff shifted off the left end of the next byte
 72+ resultList.append(chr((c<<bits) & 255 | (next >> (8 - bits))))
 73+
 74+ if (padding):
 75+ resultList[-1] = chr(ord(resultList[-1]) | (2**bits -1))
 76+
 77+ resultString = "".join(resultList)
 78+ return(resultString)
 79+
 80+ def getByteAlignedData(data, startByte, startBit):
 81+ """given a string of data, a starting byte number (from 0)
 82+ and a starting bit in that byte (counting 0 from the leftmost bit),
 83+ return the string of bits starting from there and going to the end of the
 84+ string of data. The last byte is 0-padded on the right if necessary."""
 85+ if (startByte >= len(data)):
 86+ return None
 87+ startBit = startBit % 8
 88+ shifted = ByteAlignedDataMethods.shiftLeftNBits(data[startByte:],startBit)
 89+ if (startBit):
 90+ # the new uppermost byte is the extra bits that we didn't want anyways
 91+ return(shifted[1:])
 92+ else:
 93+ return(shifted)
 94+
 95+ shiftLeftNBits = staticmethod(shiftLeftNBits)
 96+ getByteAlignedData = staticmethod(getByteAlignedData)
 97+
 98+
 99+class shiftedSearchString(object):
 100+ """This class manages search strings that may searched for in
 101+ bit-shifted data."""
 102+
 103+ def __init__(self, data):
 104+ """Arguments:
 105+ data -- the data to shift"""
 106+ self._searchStringShifted=[]
 107+ for i in range(0,8):
 108+ self._searchStringShifted.append(ShiftedData(data,i,padding=True))
 109+
 110+ def getLength(self):
 111+ return len(self._searchStringShifted[0].getData())
 112+
 113+ def findAllMatchesOfStringNonaligned(self,data):
 114+ """search for all matches of a given pattern in a given string of data
 115+ not byte aligned. returns an array n0, n1, n2.. where n0 = list of
 116+ starting bytes where pattern was found with no bit padding,
 117+ n1 = with 1 bit shifted, etc."""
 118+
 119+ # FIXME this might be with the bit pattern shifted and not the string, dunno.
 120+
 121+ if (not data):
 122+ return (None, None)
 123+ matches = []
 124+ for i in range(0,8):
 125+ results = self.findAllMatchesOfStringFromLeft(data,self._searchStringShifted[i].getShiftedData())
 126+ matches.append(results)
 127+ return(matches)
 128+
 129+
 130+ def findAllMatchesOfStringFromLeft(self,data,pattern):
 131+ """byte aligned already"""
 132+ if (not data):
 133+ return (None, None)
 134+
 135+ positions = []
 136+ offset = 0
 137+ while (offset < len(data)):
 138+ # do all but the first and last byte (which may have padding)
 139+ result = data[offset:].find(pattern[1:-1])
 140+ if (result >= 0):
 141+ if (offset+result-1 >= 0):
 142+ offset = offset -1
 143+ firstByte = data[offset+result]
 144+ if firstByte == chr(ord(pattern[0]) & ord(firstByte)):
 145+ lastByte = data[offset+result+len(pattern) -1]
 146+ if lastByte == chr(ord(pattern[-1]) & ord(lastByte)):
 147+ positions.append(result)
 148+
 149+ # that submatch isn't at a block boundary, false alarm
 150+ offset = offset + result + 2 # +1 because we start match at byte 2 of pattern and we want to move up one also
 151+ else:
 152+ return(positions)
 153+
 154+
 155+ def findStringInDataFromLeft(self,data):
 156+ """Find first occurence of string in (bit-shifted) data
 157+ (occurence may not be byte aligned)
 158+ Arguments: a string of data or a ShiftedData object
 159+ Returns: tuple consisting of
 160+ - the starting position of the first match
 161+ - number of bits data must be left-shifted in order to find
 162+ byte-aligned match"""
 163+
 164+ if (not data):
 165+ return (None, None)
 166+
 167+ while(True):
 168+ firstMatch = None
 169+ shiftedBy = None
 170+ for i in range (0,8):
 171+ offset = 0
 172+ # do all but the first and last byte (which may have padding)
 173+ bytesShifted = self._searchStringShifted[i].getShiftedData()
 174+ result = data[offset:].find(bytesShifted[1:-1])
 175+ if (result >= 0):
 176+ if (offset+result-1 >= 0):
 177+ offset = offset -1
 178+ firstByte = data[offset+result]
 179+ if firstByte == chr(ord(bytesShifted[0]) & ord(firstByte)):
 180+ lastByte = data[offset+result+len(bytesShifted) -1]
 181+ if lastByte == chr(ord(bytesShifted[-1]) & ord(lastByte)):
 182+ if (firstMatch == None or result+offset < firstMatch):
 183+ firstMatch = result+offset
 184+ shiftedBy = i
 185+ # that submatch isn't at a block boundary, false alarm
 186+ offset = offset + result + 2 # +1 because we start match at byte 2 of pattern and we want to move up one also
 187+ if (firstMatch == None):
 188+ return(None, None)
 189+ else:
 190+ return (firstMatch,(8 - shiftedBy)%8)
 191+
 192+ def findStringInDataFromRight(self,data):
 193+ """Find last occurence of string in (bit-shifted) data
 194+ (occurence may not be byte aligned)
 195+ Arguments: a string of data or a ShiftedData object
 196+ Returns: tuple consisting of
 197+ - the starting position of the first match, starting from byte 0
 198+ - number of bits pattern was left-shifted in order to find match"""
 199+
 200+ if (not data):
 201+ return (None, None)
 202+
 203+ if isinstance(data, ShiftedData):
 204+ checkData = data.getShiftedData()
 205+ else:
 206+ checkData = data
 207+
 208+ while(True):
 209+ firstMatch = None
 210+ shiftedBy = None
 211+ for i in range (0,8):
 212+ offset = 0
 213+ # do all but the first and last byte (which may have padding)
 214+ bytesShifted = self._searchStringShifted[i].getShiftedData()
 215+ result = checkData[offset:].rfind(bytesShifted[1:-1])
 216+ if (result >= 0):
 217+ if (offset+result-1 >= 0):
 218+ offset = offset -1
 219+ firstByte = checkData[offset+result]
 220+ if firstByte == chr(ord(bytesShifted[0]) & ord(firstByte)):
 221+ lastByte = checkData[offset+result+len(bytesShifted) -1]
 222+ if lastByte == chr(ord(bytesShifted[-1]) & ord(lastByte)):
 223+ if (firstMatch == None or result+offset > firstMatch):
 224+ firstMatch = result+offset
 225+ shiftedBy = i
 226+ # that submatch isn't at a block boundary, false alarm
 227+ offset = offset - result + 2 # +1 because we start match at 1 byte before end pattern and we want to skip a byte also
 228+
 229+ if (firstMatch == None):
 230+ return(None, None)
 231+ else:
 232+ return (firstMatch,(8 - shiftedBy)%8)
 233+
 234+ def dumpSearchString(self):
 235+ for i in range(0, len(self._searchStringShifted)):
 236+ BzConstants.dumpstring(self._searchStringShifted[i].getShiftedData(),"search string shifted %s:" % i)
 237+
 238+class BzConstants(object):
 239+ """Contains various defines for bz2 data"""
 240+
 241+ def getFooter():
 242+ """Return string which is at the end of every bzip2 stream or file"""
 243+ footer = [ '0x17', '0x72', '0x45', '0x38', '0x50', '0x90' ]
 244+ for i in range(0,len(footer)):
 245+ footer[i] = chr(int(footer[i],16))
 246+ footerString = "".join(footer)
 247+ return footerString
 248+
 249+ def getBlockMarker():
 250+ """Return string which is at the beginning of every bzip2 compressed block"""
 251+ return "1AY&SY"
 252+
 253+ def getMaxCompressedBlockSize(bzBlockSizeMultiplier):
 254+ """Return the maximum compressed bzip2 block size based on the
 255+ block size multipler (from the bzip2 header) passed as an argument"""
 256+ # max length of compressed data block given size of uncompressed data as specified in header
 257+ # "To guarantee that the compressed data will fit in its buffer, allocate an output buffer of
 258+ # size 1% larger than the uncompressed data, plus six hundred extra bytes." (Plus paranoia :-P)
 259+ return (bzBlockSizeMultiplier + bzBlockSizeMultiplier/100)*100000 + 650
 260+
 261+ def dumpstring(string,message="dumping string:"):
 262+ print message,
 263+ for i in range(0,len(string)):
 264+ print hex(ord(string[i])),
 265+ print
 266+
 267+ def isZeros(data):
 268+ for i in range(0,len(data)):
 269+ if (ord(data[i]) != 0):
 270+ return False
 271+ return True
 272+
 273+ def checkForFooter(data):
 274+ """See if data passed as argument ends in the bz2 footer
 275+ We expect: 6 bytes of footer, 4 bytes of crc, 0 to 7 bits of padding"""
 276+ footerSearchString = shiftedSearchString(BzConstants.getFooter())
 277+ ( offset, bitsShifted ) = footerSearchString.findStringInDataFromRight(data[-30:])
 278+ if (offset != None):
 279+ if (bitsShifted > 0):
 280+ paddingByte = 1
 281+ else:
 282+ paddingByte = 0
 283+ # starts from 0
 284+ # expect 0-filled bytes at the end, if there are any
 285+ extraBytes = -30+(offset + 6 + 4 + paddingByte)
 286+ # so this iszeros thing... see, this data we got passed may have not been byte aligned.
 287+ # so maybe it got left-bit shifted for some block marker.
 288+ # then if we foudn another marker farther down, maybe it got shifted again
 289+ # and so on... so there could be a bunch of extra zero bytes at the end
 290+ # FIXME this is the wrong place to check for that but I don't know the
 291+ # right place yet. And keeping track is absolutely out of the question.
 292+ # we could instead of using the newly byte aligned data from the stream
 293+ # and continually left shifting it, go back to the original stream each
 294+ # time which would limit this some. ? needs thought.
 295+ if (extraBytes <= 0 or BzConstants.isZeros(data[-30+(offset + 6 + 4 + paddingByte):])):
 296+ # starting byte of footer, counting from end. may not be byte aligned.
 297+ return (-30 + offset)
 298+ return None
 299+
 300+ def getDecompressedBlock(data, bzHeader):
 301+ """takes string of data plus 4 byte bzip2 header, returns decompressed block"""
 302+ block = bzHeader + data
 303+ try:
 304+ bz = bz2.BZ2Decompressor()
 305+ out = bz.decompress(block)
 306+ return(out)
 307+ except Exception, ex:
 308+ print ex
 309+ return(None)
 310+
 311+ getFooter = staticmethod(getFooter)
 312+ getBlockMarker = staticmethod(getBlockMarker)
 313+ getMaxCompressedBlockSize = staticmethod(getMaxCompressedBlockSize)
 314+ dumpstring = staticmethod(dumpstring)
 315+ checkForFooter = staticmethod(checkForFooter)
 316+ getDecompressedBlock = staticmethod(getDecompressedBlock)
 317+ isZeros = staticmethod(isZeros)
 318+
 319+class BzBlockStart(object):
 320+ """This class manages bzip2 block markers (which mark the start of bzip2 blocks)
 321+ in a blob of data. Because the block marker may not be byte aligned, it includes
 322+ the number of bits shifted left for the block start marker to be byte aligned.
 323+ It also contains a copy of the byte aligned data beginning at the start of the block
 324+ and including this marker.
 325+ Arguments:
 326+ data -- a blob of compressed data within which to find a bzip2 block
 327+ bzBlockSizeMultiplier -- 1 through 9.
 328+ This number is typically retrieved from a bzip2 header; it indicates the bzip2 block
 329+ sizes used for the uncompressed data, in units of 100K.
 330+ This function is a bit wasteful in that it attempts a decompression of the data and
 331+ throws away the results; it does this in order to verify that the block marker really
 332+ is at the start of a block.
 333+ It will detect a footer in the appropriate place at the end of the data stream
 334+ (it has to in order for uncompression to work correctly)
 335+ Use this function for seeking into an arbitrary place in a bzip2 file and digging up
 336+ data, or finding the last whole block written out to an arbitrarily truncated bzip2 file,
 337+ not for regular stream decompression."""
 338+
 339+ def __init__(self, data, header):
 340+ """Arguments: data, 4-byte header from bzip stream
 341+ If the data does not contain a bzip2 block marker
 342+ (that really begins a block, i.e. the data afterwards
 343+ uncompresses properly), subsequent calls to
 344+ getBzBlockMarkerStart() will return None.
 345+ Otherwise, getBzBlockMarkerStart() will return the byte in the
 346+ data where the marker starts, (counting from 0),
 347+ getBitsShifted() will get the number of bits that the
 348+ data at that byte must be left-shifted in order for the
 349+ block marker to be byte-aligned, and
 350+ getByteAlignedData() will return the byte-aligned data
 351+ including the block marker.
 352+ Note that we test for the block marker to be a genuine start of
 353+ block by trying decompression; this can fail sometimes if the
 354+ bzip2 end of stream footer is present, so we will attempt to
 355+ remove it and getByteAlignedData() returns the data with
 356+ that removed (FIXME should it? Do we even need that function?)"""
 357+
 358+ self._data = data
 359+ # 4 byte bzip2 header at the beginning of every compressed file or stream
 360+ self._bzHeader = header
 361+ self._bzBlockSizeMultiplier = int(header[3])
 362+ self._bzBlockSize = self._bzBlockSizeMultiplier * 100000
 363+ # this byte string is at the start of every bzip2 compressed block
 364+ self._bzBlockMarker = shiftedSearchString(BzConstants.getBlockMarker())
 365+ # index of byte in data where block marker found
 366+ self._bzBlockMarkerStart = None
 367+ # number of bits block was shifted in order to be byte aligned
 368+ self._bitsShifted = None
 369+ # block marker + following data, byte aligned
 370+ self._byteAlignedData = None
 371+ self.findBzBlockMarker()
 372+
 373+ def findBzBlockMarker(self):
 374+ data = self._data
 375+ offset = 0
 376+
 377+ while (True):
 378+ ( bzBlockMarkerStart, bitsShifted ) = self._bzBlockMarker.findStringInDataFromLeft(data)
 379+ if ( bzBlockMarkerStart == None):
 380+ return None
 381+ offset = offset + bzBlockMarkerStart
 382+
 383+ data = ByteAlignedDataMethods.getByteAlignedData(data,bzBlockMarkerStart, bitsShifted)
 384+
 385+ # try uncompression to see if it's a valid block. since the block marker can
 386+ # appear in the data stream randomly, we don't try to bound this possible block by the next
 387+ # appearance of a block marker; the decompress routine may barf on a partial block
 388+ # (and we don't know how much truncation is allowed). So pass something guaranteed to be
 389+ # a full block size and then some, for the test.
 390+ toDO = BzConstants.getMaxCompressedBlockSize(self._bzBlockSizeMultiplier)
 391+
 392+ dataToUncompress = data[:toDO]
 393+
 394+ # if there is a footer we will toss it
 395+ footerOffset = BzConstants.checkForFooter(dataToUncompress)
 396+ if (footerOffset != None):
 397+ # what do we think about this, give a partial footer?
 398+ dataToTry = dataToUncompress[:footerOffset+5]
 399+ else:
 400+ dataToTry = dataToUncompress
 401+
 402+ uncompressedData = BzConstants.getDecompressedBlock(dataToTry, self._bzHeader)
 403+ if (uncompressedData != None):
 404+ # w00t!
 405+ self._byteAlignedData = dataToUncompress
 406+ self._bzBlockMarkerStart = offset
 407+ self._bitsShifted = bitsShifted
 408+ return True
 409+
 410+ # no possibilities left
 411+ if (len(data) <= len(BzConstants.getBlockMarker())):
 412+ self._bzBlockMarkerStart = None
 413+
 414+ return None
 415+ # not a real block. on to the next possibility
 416+ else:
 417+ data = data[len(BzConstants.getBlockMarker()):]
 418+
 419+ def getBitsShifted(self):
 420+ return self._bitsShifted
 421+
 422+ def getBzBlockMarkerStart(self):
 423+ return self._bzBlockMarkerStart
 424+
 425+ def getByteAlignedData(self):
 426+ return self._byteAlignedData
 427+
 428+class BzBlock(object):
 429+ """This class manipulates bzipped data blocks (which include the
 430+ block start marker). Because the block may not have been byte
 431+ aligned wthin the original compressed stream or file, it also
 432+ includes the number of bits shifted left for the block start marker to
 433+ be byte aligned, as well as the number of bits to shift for
 434+ the start of the next block, it there is one, in the stream
 435+ or file.
 436+ It may additionally include some or all of the uncompressed data.
 437+ Takes as arguments:
 438+ data -- the stream of data in which to find a block
 439+ header -- the 4 byte bzip2 header which tells us block size among other things"""
 440+
 441+ def __init__(self, blockData, header):
 442+ self._blockData = blockData
 443+ self._compressedData = None
 444+ self._bzHeader = header
 445+ self._bzBlockStart = None
 446+ self._uncompressedData = None
 447+ self._bzBlockLength = None
 448+ self.findAndUncompressFirstBlock()
 449+
 450+ def getBitMask(self, bits, left = False):
 451+ """return bitmask starting from left or right end, of specified number of bits.
 452+ default is start from right end"""
 453+ if bits < 0:
 454+ return 0
 455+
 456+ bits = bits % 8
 457+ if (left):
 458+ return 255 - 2**(8-bits) +1
 459+ pass
 460+ else:
 461+ return 2**bits -1
 462+
 463+ def getMasked(self, byte, bitCount, left = False):
 464+ """return leftmost or rightmost bitCount bits from byte
 465+ default is rightmost. expect byte to be ord, not char"""
 466+ mask = self.getBitMask(bitCount,left)
 467+ return mask & byte
 468+
 469+ def findAndUncompressFirstBlock(self):
 470+ bzBlockStart = BzBlockStart(self._blockData, self._bzHeader)
 471+ if (not bzBlockStart.getBzBlockMarkerStart()):
 472+ return None
 473+
 474+ dataToUncompress = bzBlockStart.getByteAlignedData()
 475+
 476+ # ok now we want to get the start of the next block in here,
 477+ nextBlockStart = BzBlockStart(bzBlockStart.getByteAlignedData()[1:],self._bzHeader)
 478+ if (not nextBlockStart.getBzBlockMarkerStart()):
 479+ footerOffset = BzConstants.checkForFooter(dataToUncompress)
 480+ if (footerOffset != None):
 481+ # partial footer?
 482+ endMarker = footerOffset+5
 483+ else:
 484+ return None
 485+ else:
 486+ footerOffset = None
 487+ endMarker = nextBlockStart.getBzBlockMarkerStart() + 8
 488+
 489+ # this is either an additional 4 or 7 characters. not 5 or 8. python is stupid that way.
 490+ dataToUncompress = dataToUncompress[:endMarker]
 491+
 492+ self._uncompressedData = BzConstants.getDecompressedBlock(dataToUncompress, self._bzHeader)
 493+
 494+ if (self._uncompressedData == None):
 495+ return None
 496+ else:
 497+ # fixme is this next line right?
 498+ self._compressedData = dataToUncompress
 499+ self._bzBlockLength = len(dataToUncompress)
 500+ # now set *real* block length, not the length of the block plus a
 501+ # a few bytes of the following block marker or footer.
 502+ # NOTE that truncating your data to this byte may be a bad idea since the next byte,
 503+ # while it will have the start of the next block marker in it, may not be
 504+ # byte aligned; i.e. the first so many bits of that byte may be the end of this block. :-P
 505+ # also note that truncating your block here for purposes of decompression with the
 506+ # python bindings *will not work*, it needs to see the footer or the beginning of
 507+ # the next block or it will fail and complain. sorry dudes.
 508+ if (footerOffset == None):
 509+ self._bzBlockLength = self._bzBlockLength - 7
 510+ else:
 511+ self._bzBlockLength = self._bzBlockLength - 4
 512+ self._bzBlockStart = bzBlockStart
 513+ return bzBlockStart
 514+
 515+ def getOffset(self):
 516+ if (self._bzBlockStart):
 517+ return self._bzBlockStart.getBzBlockMarkerStart()
 518+ else:
 519+ return None
 520+
 521+ def getCompressedData(self):
 522+ return self._compressedData
 523+
 524+ def getUncompressedData(self):
 525+ return self._uncompressedData
 526+
 527+ def getBlockLength(self):
 528+ """return length of the (compressed) bz2 block"""
 529+ return(self._bzBlockLength)
 530+
 531+class BzFile:
 532+ """handle bzip2 files, which means we can seek to arbitrary places
 533+ in the compressed data, find the next block, uncompress it,
 534+ uncompress the following n blocks, get the last complete block
 535+ from before the eof, etc."""
 536+ def __init__(self, fileName):
 537+ self._fileName = fileName
 538+ self._dataBlock = None
 539+ self._seekOffset = None
 540+ self._blocksize = None
 541+ self._header = None # bzip2 header, 4 bytes
 542+ self._f = open(fileName,"r")
 543+ self.readHeader()
 544+ self._blockSizeMultiplier = int(self._header[3])
 545+ self._footer = BzConstants.getFooter()
 546+ self._filesize = None
 547+
 548+ def getBlockSizeMultiplier(self):
 549+ return(self._blockSizeMultiplier)
 550+
 551+ def readHeader(self):
 552+ if self._header == None:
 553+ self._f.seek(0)
 554+ # header is BZhn (n = multiplier of 100k for compression blocksize)
 555+ self._header = self._f.read(4)
 556+
 557+ def close(self):
 558+ self._f.close()
 559+
 560+ def findLastFullBzBlock(self):
 561+ """find last full bzip2 block written out before eof (by seeking
 562+ to near the eof). This is useful in case you have a truncated XML dump
 563+ and you want to know where to restart the run from; for very large files,
 564+ decompressing the blocks starting from the beginning of the file
 565+ can be quite slow.
 566+ Returns a pointer to the DataBlock object or None if there was no
 567+ bzip2 block found."""
 568+
 569+ if not self._filesize:
 570+ self._f.seek(0,os.SEEK_END)
 571+ self._filesize = self._f.tell()
 572+
 573+ seekBackTo = BzConstants.getMaxCompressedBlockSize(self._blockSizeMultiplier)*2
 574+ if self._filesize < seekBackTo:
 575+ seekBackTo = self._filesize
 576+ self._f.seek(seekBackTo * -1,os.SEEK_END)
 577+ # we are guaranteed to have a full block in here (if the file isn't less than a block long)
 578+ # so start walking through this data til we find the last full block before eof.
 579+ data = self._f.read()
 580+ previousBlock = None
 581+
 582+ while True:
 583+ blockFound = BzBlock(data, self._header)
 584+ if not blockFound.getUncompressedData(): # truncated block?
 585+ if previousBlock:
 586+ self._dataBlock = previousBlock
 587+ self._seekOffset = self._filesize - previousSeekBack
 588+ return previousBlock
 589+ else:
 590+ return None
 591+ previousBlock = blockFound
 592+ offset = blockFound.getBlockLength()
 593+ # otheroffset = where the fricking block started in the data we passed it
 594+ otheroffset = blockFound.getOffset()
 595+
 596+ previousSeekBack = seekBackTo - otheroffset
 597+ seekBackTo = seekBackTo - offset - otheroffset + 1
 598+ data = data[seekBackTo * -1:]
 599+
 600+ def findBzBlockFromSeekPoint(self,seek):
 601+ """Seek to given offset in file, search for and return
 602+ first bzip2 block found in file after seek point, or
 603+ None if none was found"""
 604+ self._f.seek(seek,os.SEEK_SET)
 605+ data = self._f.read(BzConstants.getMaxCompressedBlockSize(self._blockSizeMultiplier)*2)
 606+
 607+ blockFound = BzBlock(data, self._header)
 608+ if not blockFound.getUncompressedData():
 609+ return None
 610+ self._dataBlock = blockFound
 611+ self._seekOffset = seek + blockFound.getOffset()
 612+ return blockFound
 613+
 614+ def getOffset(self):
 615+ return self._seekOffset
 616+
 617+if __name__ == "__main__":
 618+ try:
 619+# works
 620+ f = BzFile("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2")
 621+
 622+# works
 623+# f = BzFile("/home/ariel/elwiki-20100925-pages-meta-history.xml.bz2")
 624+
 625+# works hmm for from certain point, fails, because seek point > end of file :-P
 626+# f = BzFile("/home/ariel/src/mediawiki/testing/sample-last-but-0.bz2")
 627+
 628+# works
 629+# f = BzFile("/home/ariel/sample-file.txt.bz2")
 630+
 631+# works
 632+# f = BzFile("/home/ariel/sample-file-bz9.txt.bz2")
 633+
 634+# offset = f.getBlockSizeMultiplier()*100000 + 600
 635+ offset = 14315000
 636+
 637+ # in these all our results are byte-aligned block markers out of the box
 638+ # maybe that indicates a little problem? check. yes it's a bug, should have something around
 639+ # 14254438 + 61571 and don't. so where is it? only finding start block aligned and
 640+ # end block at shifted by 7, that's really weird. this must be recent, have
 641+ # this behavior for the other routine too.
 642+
 643+ block = f.findBzBlockFromSeekPoint(offset)
 644+
 645+# block = f.findLastFullBzBlock()
 646+ offset = None
 647+ if (block):
 648+ print "found this block (at offset in file %s, original seek point was %s, length %s): " % ( f.getOffset(), offset, block.getBlockLength()), block.getUncompressedData()[-500:]
 649+ print "doublecheck..."
 650+ f._f.seek(f.getOffset(),os.SEEK_SET)
 651+ datatemp = f._f.read(100)
 652+ BzConstants.dumpstring(datatemp[0:30],"contents of file from that offset")
 653+ else:
 654+ print "no block found"
 655+
 656+ f.close()
 657+ except(IOError):
 658+ print "there was no such file, you fool"
Property changes on: branches/ariel/xmldumps-backup/Bzip2RandomAccess.py
___________________________________________________________________
Added: svn:eol-style
1659 + native

Status & tagging log