Index: branches/ariel/xmldumps-backup/Bzip2RandomAccess.py |
— | — | @@ -0,0 +1,657 @@ |
| 2 | +import getopt |
| 3 | +import os |
| 4 | +import re |
| 5 | +import sys |
| 6 | +import time |
| 7 | +import bz2 |
| 8 | +import xml.sax |
| 9 | + |
| 10 | +class ShiftedData(object): |
| 11 | + """This class manages strings of data that have been left-shifted |
| 12 | + 0 through 7 bits.""" |
| 13 | + |
| 14 | + def __init__(self, data, n=0, padding = None): |
| 15 | + """Arguments: |
| 16 | + data -- the data to shift |
| 17 | + n -- the number of bits to shift left |
| 18 | + padding -- whether to add 1's on upper end of new |
| 19 | + leftmost byte and lower end of rightmost byte""" |
| 20 | + self._data = data |
| 21 | + self._bitsShiftedLeft = n % 8 |
| 22 | + self._padding = padding |
| 23 | + self._shiftedData = self.shiftLeftNBits() |
| 24 | + |
| 25 | + def getShiftedData(self): |
| 26 | + return self._shiftedData |
| 27 | + |
| 28 | + def getData(self): |
| 29 | + return self._data |
| 30 | + |
| 31 | + def getBitsShifted(self): |
| 32 | + return(self._bitsShiftedLeft) |
| 33 | + |
| 34 | + def getlength(self): |
| 35 | + return len(self._data) |
| 36 | + |
| 37 | + def getShiftedLength(self): |
| 38 | + return len(self._shiftedData) |
| 39 | + |
| 40 | + def shiftLeftNBits(self): |
| 41 | + """shift a string of crap n bits left, pushing |
| 42 | + overflow into a new leftmost byte""" |
| 43 | + return ByteAlignedDataMethods.shiftLeftNBits(self._data,self._bitsShiftedLeft,self._padding) |
| 44 | + |
| 45 | +class ByteAlignedDataMethods(object): |
| 46 | + """Contains various methods for byte-aligned data""" |
| 47 | + |
| 48 | + def shiftLeftNBits(data, bits, padding = False): |
| 49 | + """shift a string of crap n bits left, pushing |
| 50 | + overflow into a new leftmost byte, padding on right |
| 51 | + with 1s if requested, otherwise with 0s""" |
| 52 | + if (bits == 0): |
| 53 | + return data |
| 54 | + |
| 55 | + resultList = [] |
| 56 | + |
| 57 | + # overflow from shift off left end, may be 0 |
| 58 | + overflow = ord(data[0])>> (8 - bits) |
| 59 | + resultList.append(chr(overflow)) |
| 60 | + |
| 61 | + if (padding): |
| 62 | + resultList[-1] = chr(ord(resultList[-1]) | (256 - 2**bits)) |
| 63 | + |
| 64 | + for i in range(0,len(data)): |
| 65 | + c = ord(data[i]) |
| 66 | + if i == len(data)-1: |
| 67 | + next = 0 |
| 68 | + else: |
| 69 | + next = ord(data[i+1]) |
| 70 | + |
| 71 | + # grab stuff shifted off the left end of the next byte |
| 72 | + resultList.append(chr((c<<bits) & 255 | (next >> (8 - bits)))) |
| 73 | + |
| 74 | + if (padding): |
| 75 | + resultList[-1] = chr(ord(resultList[-1]) | (2**bits -1)) |
| 76 | + |
| 77 | + resultString = "".join(resultList) |
| 78 | + return(resultString) |
| 79 | + |
| 80 | + def getByteAlignedData(data, startByte, startBit): |
| 81 | + """given a string of data, a starting byte number (from 0) |
| 82 | + and a starting bit in that byte (counting 0 from the leftmost bit), |
| 83 | + return the string of bits starting from there and going to the end of the |
| 84 | + string of data. The last byte is 0-padded on the right if necessary.""" |
| 85 | + if (startByte >= len(data)): |
| 86 | + return None |
| 87 | + startBit = startBit % 8 |
| 88 | + shifted = ByteAlignedDataMethods.shiftLeftNBits(data[startByte:],startBit) |
| 89 | + if (startBit): |
| 90 | + # the new uppermost byte is the extra bits that we didn't want anyways |
| 91 | + return(shifted[1:]) |
| 92 | + else: |
| 93 | + return(shifted) |
| 94 | + |
| 95 | + shiftLeftNBits = staticmethod(shiftLeftNBits) |
| 96 | + getByteAlignedData = staticmethod(getByteAlignedData) |
| 97 | + |
| 98 | + |
| 99 | +class shiftedSearchString(object): |
| 100 | + """This class manages search strings that may searched for in |
| 101 | + bit-shifted data.""" |
| 102 | + |
| 103 | + def __init__(self, data): |
| 104 | + """Arguments: |
| 105 | + data -- the data to shift""" |
| 106 | + self._searchStringShifted=[] |
| 107 | + for i in range(0,8): |
| 108 | + self._searchStringShifted.append(ShiftedData(data,i,padding=True)) |
| 109 | + |
| 110 | + def getLength(self): |
| 111 | + return len(self._searchStringShifted[0].getData()) |
| 112 | + |
| 113 | + def findAllMatchesOfStringNonaligned(self,data): |
| 114 | + """search for all matches of a given pattern in a given string of data |
| 115 | + not byte aligned. returns an array n0, n1, n2.. where n0 = list of |
| 116 | + starting bytes where pattern was found with no bit padding, |
| 117 | + n1 = with 1 bit shifted, etc.""" |
| 118 | + |
| 119 | + # FIXME this might be with the bit pattern shifted and not the string, dunno. |
| 120 | + |
| 121 | + if (not data): |
| 122 | + return (None, None) |
| 123 | + matches = [] |
| 124 | + for i in range(0,8): |
| 125 | + results = self.findAllMatchesOfStringFromLeft(data,self._searchStringShifted[i].getShiftedData()) |
| 126 | + matches.append(results) |
| 127 | + return(matches) |
| 128 | + |
| 129 | + |
| 130 | + def findAllMatchesOfStringFromLeft(self,data,pattern): |
| 131 | + """byte aligned already""" |
| 132 | + if (not data): |
| 133 | + return (None, None) |
| 134 | + |
| 135 | + positions = [] |
| 136 | + offset = 0 |
| 137 | + while (offset < len(data)): |
| 138 | + # do all but the first and last byte (which may have padding) |
| 139 | + result = data[offset:].find(pattern[1:-1]) |
| 140 | + if (result >= 0): |
| 141 | + if (offset+result-1 >= 0): |
| 142 | + offset = offset -1 |
| 143 | + firstByte = data[offset+result] |
| 144 | + if firstByte == chr(ord(pattern[0]) & ord(firstByte)): |
| 145 | + lastByte = data[offset+result+len(pattern) -1] |
| 146 | + if lastByte == chr(ord(pattern[-1]) & ord(lastByte)): |
| 147 | + positions.append(result) |
| 148 | + |
| 149 | + # that submatch isn't at a block boundary, false alarm |
| 150 | + offset = offset + result + 2 # +1 because we start match at byte 2 of pattern and we want to move up one also |
| 151 | + else: |
| 152 | + return(positions) |
| 153 | + |
| 154 | + |
| 155 | + def findStringInDataFromLeft(self,data): |
| 156 | + """Find first occurence of string in (bit-shifted) data |
| 157 | + (occurence may not be byte aligned) |
| 158 | + Arguments: a string of data or a ShiftedData object |
| 159 | + Returns: tuple consisting of |
| 160 | + - the starting position of the first match |
| 161 | + - number of bits data must be left-shifted in order to find |
| 162 | + byte-aligned match""" |
| 163 | + |
| 164 | + if (not data): |
| 165 | + return (None, None) |
| 166 | + |
| 167 | + while(True): |
| 168 | + firstMatch = None |
| 169 | + shiftedBy = None |
| 170 | + for i in range (0,8): |
| 171 | + offset = 0 |
| 172 | + # do all but the first and last byte (which may have padding) |
| 173 | + bytesShifted = self._searchStringShifted[i].getShiftedData() |
| 174 | + result = data[offset:].find(bytesShifted[1:-1]) |
| 175 | + if (result >= 0): |
| 176 | + if (offset+result-1 >= 0): |
| 177 | + offset = offset -1 |
| 178 | + firstByte = data[offset+result] |
| 179 | + if firstByte == chr(ord(bytesShifted[0]) & ord(firstByte)): |
| 180 | + lastByte = data[offset+result+len(bytesShifted) -1] |
| 181 | + if lastByte == chr(ord(bytesShifted[-1]) & ord(lastByte)): |
| 182 | + if (firstMatch == None or result+offset < firstMatch): |
| 183 | + firstMatch = result+offset |
| 184 | + shiftedBy = i |
| 185 | + # that submatch isn't at a block boundary, false alarm |
| 186 | + offset = offset + result + 2 # +1 because we start match at byte 2 of pattern and we want to move up one also |
| 187 | + if (firstMatch == None): |
| 188 | + return(None, None) |
| 189 | + else: |
| 190 | + return (firstMatch,(8 - shiftedBy)%8) |
| 191 | + |
| 192 | + def findStringInDataFromRight(self,data): |
| 193 | + """Find last occurence of string in (bit-shifted) data |
| 194 | + (occurence may not be byte aligned) |
| 195 | + Arguments: a string of data or a ShiftedData object |
| 196 | + Returns: tuple consisting of |
| 197 | + - the starting position of the first match, starting from byte 0 |
| 198 | + - number of bits pattern was left-shifted in order to find match""" |
| 199 | + |
| 200 | + if (not data): |
| 201 | + return (None, None) |
| 202 | + |
| 203 | + if isinstance(data, ShiftedData): |
| 204 | + checkData = data.getShiftedData() |
| 205 | + else: |
| 206 | + checkData = data |
| 207 | + |
| 208 | + while(True): |
| 209 | + firstMatch = None |
| 210 | + shiftedBy = None |
| 211 | + for i in range (0,8): |
| 212 | + offset = 0 |
| 213 | + # do all but the first and last byte (which may have padding) |
| 214 | + bytesShifted = self._searchStringShifted[i].getShiftedData() |
| 215 | + result = checkData[offset:].rfind(bytesShifted[1:-1]) |
| 216 | + if (result >= 0): |
| 217 | + if (offset+result-1 >= 0): |
| 218 | + offset = offset -1 |
| 219 | + firstByte = checkData[offset+result] |
| 220 | + if firstByte == chr(ord(bytesShifted[0]) & ord(firstByte)): |
| 221 | + lastByte = checkData[offset+result+len(bytesShifted) -1] |
| 222 | + if lastByte == chr(ord(bytesShifted[-1]) & ord(lastByte)): |
| 223 | + if (firstMatch == None or result+offset > firstMatch): |
| 224 | + firstMatch = result+offset |
| 225 | + shiftedBy = i |
| 226 | + # that submatch isn't at a block boundary, false alarm |
| 227 | + offset = offset - result + 2 # +1 because we start match at 1 byte before end pattern and we want to skip a byte also |
| 228 | + |
| 229 | + if (firstMatch == None): |
| 230 | + return(None, None) |
| 231 | + else: |
| 232 | + return (firstMatch,(8 - shiftedBy)%8) |
| 233 | + |
| 234 | + def dumpSearchString(self): |
| 235 | + for i in range(0, len(self._searchStringShifted)): |
| 236 | + BzConstants.dumpstring(self._searchStringShifted[i].getShiftedData(),"search string shifted %s:" % i) |
| 237 | + |
| 238 | +class BzConstants(object): |
| 239 | + """Contains various defines for bz2 data""" |
| 240 | + |
| 241 | + def getFooter(): |
| 242 | + """Return string which is at the end of every bzip2 stream or file""" |
| 243 | + footer = [ '0x17', '0x72', '0x45', '0x38', '0x50', '0x90' ] |
| 244 | + for i in range(0,len(footer)): |
| 245 | + footer[i] = chr(int(footer[i],16)) |
| 246 | + footerString = "".join(footer) |
| 247 | + return footerString |
| 248 | + |
| 249 | + def getBlockMarker(): |
| 250 | + """Return string which is at the beginning of every bzip2 compressed block""" |
| 251 | + return "1AY&SY" |
| 252 | + |
| 253 | + def getMaxCompressedBlockSize(bzBlockSizeMultiplier): |
| 254 | + """Return the maximum compressed bzip2 block size based on the |
| 255 | + block size multipler (from the bzip2 header) passed as an argument""" |
| 256 | + # max length of compressed data block given size of uncompressed data as specified in header |
| 257 | + # "To guarantee that the compressed data will fit in its buffer, allocate an output buffer of |
| 258 | + # size 1% larger than the uncompressed data, plus six hundred extra bytes." (Plus paranoia :-P) |
| 259 | + return (bzBlockSizeMultiplier + bzBlockSizeMultiplier/100)*100000 + 650 |
| 260 | + |
| 261 | + def dumpstring(string,message="dumping string:"): |
| 262 | + print message, |
| 263 | + for i in range(0,len(string)): |
| 264 | + print hex(ord(string[i])), |
| 265 | + print |
| 266 | + |
| 267 | + def isZeros(data): |
| 268 | + for i in range(0,len(data)): |
| 269 | + if (ord(data[i]) != 0): |
| 270 | + return False |
| 271 | + return True |
| 272 | + |
| 273 | + def checkForFooter(data): |
| 274 | + """See if data passed as argument ends in the bz2 footer |
| 275 | + We expect: 6 bytes of footer, 4 bytes of crc, 0 to 7 bits of padding""" |
| 276 | + footerSearchString = shiftedSearchString(BzConstants.getFooter()) |
| 277 | + ( offset, bitsShifted ) = footerSearchString.findStringInDataFromRight(data[-30:]) |
| 278 | + if (offset != None): |
| 279 | + if (bitsShifted > 0): |
| 280 | + paddingByte = 1 |
| 281 | + else: |
| 282 | + paddingByte = 0 |
| 283 | + # starts from 0 |
| 284 | + # expect 0-filled bytes at the end, if there are any |
| 285 | + extraBytes = -30+(offset + 6 + 4 + paddingByte) |
| 286 | + # so this iszeros thing... see, this data we got passed may have not been byte aligned. |
| 287 | + # so maybe it got left-bit shifted for some block marker. |
| 288 | + # then if we foudn another marker farther down, maybe it got shifted again |
| 289 | + # and so on... so there could be a bunch of extra zero bytes at the end |
| 290 | + # FIXME this is the wrong place to check for that but I don't know the |
| 291 | + # right place yet. And keeping track is absolutely out of the question. |
| 292 | + # we could instead of using the newly byte aligned data from the stream |
| 293 | + # and continually left shifting it, go back to the original stream each |
| 294 | + # time which would limit this some. ? needs thought. |
| 295 | + if (extraBytes <= 0 or BzConstants.isZeros(data[-30+(offset + 6 + 4 + paddingByte):])): |
| 296 | + # starting byte of footer, counting from end. may not be byte aligned. |
| 297 | + return (-30 + offset) |
| 298 | + return None |
| 299 | + |
| 300 | + def getDecompressedBlock(data, bzHeader): |
| 301 | + """takes string of data plus 4 byte bzip2 header, returns decompressed block""" |
| 302 | + block = bzHeader + data |
| 303 | + try: |
| 304 | + bz = bz2.BZ2Decompressor() |
| 305 | + out = bz.decompress(block) |
| 306 | + return(out) |
| 307 | + except Exception, ex: |
| 308 | + print ex |
| 309 | + return(None) |
| 310 | + |
| 311 | + getFooter = staticmethod(getFooter) |
| 312 | + getBlockMarker = staticmethod(getBlockMarker) |
| 313 | + getMaxCompressedBlockSize = staticmethod(getMaxCompressedBlockSize) |
| 314 | + dumpstring = staticmethod(dumpstring) |
| 315 | + checkForFooter = staticmethod(checkForFooter) |
| 316 | + getDecompressedBlock = staticmethod(getDecompressedBlock) |
| 317 | + isZeros = staticmethod(isZeros) |
| 318 | + |
| 319 | +class BzBlockStart(object): |
| 320 | + """This class manages bzip2 block markers (which mark the start of bzip2 blocks) |
| 321 | + in a blob of data. Because the block marker may not be byte aligned, it includes |
| 322 | + the number of bits shifted left for the block start marker to be byte aligned. |
| 323 | + It also contains a copy of the byte aligned data beginning at the start of the block |
| 324 | + and including this marker. |
| 325 | + Arguments: |
| 326 | + data -- a blob of compressed data within which to find a bzip2 block |
| 327 | + bzBlockSizeMultiplier -- 1 through 9. |
| 328 | + This number is typically retrieved from a bzip2 header; it indicates the bzip2 block |
| 329 | + sizes used for the uncompressed data, in units of 100K. |
| 330 | + This function is a bit wasteful in that it attempts a decompression of the data and |
| 331 | + throws away the results; it does this in order to verify that the block marker really |
| 332 | + is at the start of a block. |
| 333 | + It will detect a footer in the appropriate place at the end of the data stream |
| 334 | + (it has to in order for uncompression to work correctly) |
| 335 | + Use this function for seeking into an arbitrary place in a bzip2 file and digging up |
| 336 | + data, or finding the last whole block written out to an arbitrarily truncated bzip2 file, |
| 337 | + not for regular stream decompression.""" |
| 338 | + |
| 339 | + def __init__(self, data, header): |
| 340 | + """Arguments: data, 4-byte header from bzip stream |
| 341 | + If the data does not contain a bzip2 block marker |
| 342 | + (that really begins a block, i.e. the data afterwards |
| 343 | + uncompresses properly), subsequent calls to |
| 344 | + getBzBlockMarkerStart() will return None. |
| 345 | + Otherwise, getBzBlockMarkerStart() will return the byte in the |
| 346 | + data where the marker starts, (counting from 0), |
| 347 | + getBitsShifted() will get the number of bits that the |
| 348 | + data at that byte must be left-shifted in order for the |
| 349 | + block marker to be byte-aligned, and |
| 350 | + getByteAlignedData() will return the byte-aligned data |
| 351 | + including the block marker. |
| 352 | + Note that we test for the block marker to be a genuine start of |
| 353 | + block by trying decompression; this can fail sometimes if the |
| 354 | + bzip2 end of stream footer is present, so we will attempt to |
| 355 | + remove it and getByteAlignedData() returns the data with |
| 356 | + that removed (FIXME should it? Do we even need that function?)""" |
| 357 | + |
| 358 | + self._data = data |
| 359 | + # 4 byte bzip2 header at the beginning of every compressed file or stream |
| 360 | + self._bzHeader = header |
| 361 | + self._bzBlockSizeMultiplier = int(header[3]) |
| 362 | + self._bzBlockSize = self._bzBlockSizeMultiplier * 100000 |
| 363 | + # this byte string is at the start of every bzip2 compressed block |
| 364 | + self._bzBlockMarker = shiftedSearchString(BzConstants.getBlockMarker()) |
| 365 | + # index of byte in data where block marker found |
| 366 | + self._bzBlockMarkerStart = None |
| 367 | + # number of bits block was shifted in order to be byte aligned |
| 368 | + self._bitsShifted = None |
| 369 | + # block marker + following data, byte aligned |
| 370 | + self._byteAlignedData = None |
| 371 | + self.findBzBlockMarker() |
| 372 | + |
| 373 | + def findBzBlockMarker(self): |
| 374 | + data = self._data |
| 375 | + offset = 0 |
| 376 | + |
| 377 | + while (True): |
| 378 | + ( bzBlockMarkerStart, bitsShifted ) = self._bzBlockMarker.findStringInDataFromLeft(data) |
| 379 | + if ( bzBlockMarkerStart == None): |
| 380 | + return None |
| 381 | + offset = offset + bzBlockMarkerStart |
| 382 | + |
| 383 | + data = ByteAlignedDataMethods.getByteAlignedData(data,bzBlockMarkerStart, bitsShifted) |
| 384 | + |
| 385 | + # try uncompression to see if it's a valid block. since the block marker can |
| 386 | + # appear in the data stream randomly, we don't try to bound this possible block by the next |
| 387 | + # appearance of a block marker; the decompress routine may barf on a partial block |
| 388 | + # (and we don't know how much truncation is allowed). So pass something guaranteed to be |
| 389 | + # a full block size and then some, for the test. |
| 390 | + toDO = BzConstants.getMaxCompressedBlockSize(self._bzBlockSizeMultiplier) |
| 391 | + |
| 392 | + dataToUncompress = data[:toDO] |
| 393 | + |
| 394 | + # if there is a footer we will toss it |
| 395 | + footerOffset = BzConstants.checkForFooter(dataToUncompress) |
| 396 | + if (footerOffset != None): |
| 397 | + # what do we think about this, give a partial footer? |
| 398 | + dataToTry = dataToUncompress[:footerOffset+5] |
| 399 | + else: |
| 400 | + dataToTry = dataToUncompress |
| 401 | + |
| 402 | + uncompressedData = BzConstants.getDecompressedBlock(dataToTry, self._bzHeader) |
| 403 | + if (uncompressedData != None): |
| 404 | + # w00t! |
| 405 | + self._byteAlignedData = dataToUncompress |
| 406 | + self._bzBlockMarkerStart = offset |
| 407 | + self._bitsShifted = bitsShifted |
| 408 | + return True |
| 409 | + |
| 410 | + # no possibilities left |
| 411 | + if (len(data) <= len(BzConstants.getBlockMarker())): |
| 412 | + self._bzBlockMarkerStart = None |
| 413 | + |
| 414 | + return None |
| 415 | + # not a real block. on to the next possibility |
| 416 | + else: |
| 417 | + data = data[len(BzConstants.getBlockMarker()):] |
| 418 | + |
| 419 | + def getBitsShifted(self): |
| 420 | + return self._bitsShifted |
| 421 | + |
| 422 | + def getBzBlockMarkerStart(self): |
| 423 | + return self._bzBlockMarkerStart |
| 424 | + |
| 425 | + def getByteAlignedData(self): |
| 426 | + return self._byteAlignedData |
| 427 | + |
| 428 | +class BzBlock(object): |
| 429 | + """This class manipulates bzipped data blocks (which include the |
| 430 | + block start marker). Because the block may not have been byte |
| 431 | + aligned wthin the original compressed stream or file, it also |
| 432 | + includes the number of bits shifted left for the block start marker to |
| 433 | + be byte aligned, as well as the number of bits to shift for |
| 434 | + the start of the next block, it there is one, in the stream |
| 435 | + or file. |
| 436 | + It may additionally include some or all of the uncompressed data. |
| 437 | + Takes as arguments: |
| 438 | + data -- the stream of data in which to find a block |
| 439 | + header -- the 4 byte bzip2 header which tells us block size among other things""" |
| 440 | + |
| 441 | + def __init__(self, blockData, header): |
| 442 | + self._blockData = blockData |
| 443 | + self._compressedData = None |
| 444 | + self._bzHeader = header |
| 445 | + self._bzBlockStart = None |
| 446 | + self._uncompressedData = None |
| 447 | + self._bzBlockLength = None |
| 448 | + self.findAndUncompressFirstBlock() |
| 449 | + |
| 450 | + def getBitMask(self, bits, left = False): |
| 451 | + """return bitmask starting from left or right end, of specified number of bits. |
| 452 | + default is start from right end""" |
| 453 | + if bits < 0: |
| 454 | + return 0 |
| 455 | + |
| 456 | + bits = bits % 8 |
| 457 | + if (left): |
| 458 | + return 255 - 2**(8-bits) +1 |
| 459 | + pass |
| 460 | + else: |
| 461 | + return 2**bits -1 |
| 462 | + |
| 463 | + def getMasked(self, byte, bitCount, left = False): |
| 464 | + """return leftmost or rightmost bitCount bits from byte |
| 465 | + default is rightmost. expect byte to be ord, not char""" |
| 466 | + mask = self.getBitMask(bitCount,left) |
| 467 | + return mask & byte |
| 468 | + |
| 469 | + def findAndUncompressFirstBlock(self): |
| 470 | + bzBlockStart = BzBlockStart(self._blockData, self._bzHeader) |
| 471 | + if (not bzBlockStart.getBzBlockMarkerStart()): |
| 472 | + return None |
| 473 | + |
| 474 | + dataToUncompress = bzBlockStart.getByteAlignedData() |
| 475 | + |
| 476 | + # ok now we want to get the start of the next block in here, |
| 477 | + nextBlockStart = BzBlockStart(bzBlockStart.getByteAlignedData()[1:],self._bzHeader) |
| 478 | + if (not nextBlockStart.getBzBlockMarkerStart()): |
| 479 | + footerOffset = BzConstants.checkForFooter(dataToUncompress) |
| 480 | + if (footerOffset != None): |
| 481 | + # partial footer? |
| 482 | + endMarker = footerOffset+5 |
| 483 | + else: |
| 484 | + return None |
| 485 | + else: |
| 486 | + footerOffset = None |
| 487 | + endMarker = nextBlockStart.getBzBlockMarkerStart() + 8 |
| 488 | + |
| 489 | + # this is either an additional 4 or 7 characters. not 5 or 8. python is stupid that way. |
| 490 | + dataToUncompress = dataToUncompress[:endMarker] |
| 491 | + |
| 492 | + self._uncompressedData = BzConstants.getDecompressedBlock(dataToUncompress, self._bzHeader) |
| 493 | + |
| 494 | + if (self._uncompressedData == None): |
| 495 | + return None |
| 496 | + else: |
| 497 | + # fixme is this next line right? |
| 498 | + self._compressedData = dataToUncompress |
| 499 | + self._bzBlockLength = len(dataToUncompress) |
| 500 | + # now set *real* block length, not the length of the block plus a |
| 501 | + # a few bytes of the following block marker or footer. |
| 502 | + # NOTE that truncating your data to this byte may be a bad idea since the next byte, |
| 503 | + # while it will have the start of the next block marker in it, may not be |
| 504 | + # byte aligned; i.e. the first so many bits of that byte may be the end of this block. :-P |
| 505 | + # also note that truncating your block here for purposes of decompression with the |
| 506 | + # python bindings *will not work*, it needs to see the footer or the beginning of |
| 507 | + # the next block or it will fail and complain. sorry dudes. |
| 508 | + if (footerOffset == None): |
| 509 | + self._bzBlockLength = self._bzBlockLength - 7 |
| 510 | + else: |
| 511 | + self._bzBlockLength = self._bzBlockLength - 4 |
| 512 | + self._bzBlockStart = bzBlockStart |
| 513 | + return bzBlockStart |
| 514 | + |
| 515 | + def getOffset(self): |
| 516 | + if (self._bzBlockStart): |
| 517 | + return self._bzBlockStart.getBzBlockMarkerStart() |
| 518 | + else: |
| 519 | + return None |
| 520 | + |
| 521 | + def getCompressedData(self): |
| 522 | + return self._compressedData |
| 523 | + |
| 524 | + def getUncompressedData(self): |
| 525 | + return self._uncompressedData |
| 526 | + |
| 527 | + def getBlockLength(self): |
| 528 | + """return length of the (compressed) bz2 block""" |
| 529 | + return(self._bzBlockLength) |
| 530 | + |
| 531 | +class BzFile: |
| 532 | + """handle bzip2 files, which means we can seek to arbitrary places |
| 533 | + in the compressed data, find the next block, uncompress it, |
| 534 | + uncompress the following n blocks, get the last complete block |
| 535 | + from before the eof, etc.""" |
| 536 | + def __init__(self, fileName): |
| 537 | + self._fileName = fileName |
| 538 | + self._dataBlock = None |
| 539 | + self._seekOffset = None |
| 540 | + self._blocksize = None |
| 541 | + self._header = None # bzip2 header, 4 bytes |
| 542 | + self._f = open(fileName,"r") |
| 543 | + self.readHeader() |
| 544 | + self._blockSizeMultiplier = int(self._header[3]) |
| 545 | + self._footer = BzConstants.getFooter() |
| 546 | + self._filesize = None |
| 547 | + |
| 548 | + def getBlockSizeMultiplier(self): |
| 549 | + return(self._blockSizeMultiplier) |
| 550 | + |
| 551 | + def readHeader(self): |
| 552 | + if self._header == None: |
| 553 | + self._f.seek(0) |
| 554 | + # header is BZhn (n = multiplier of 100k for compression blocksize) |
| 555 | + self._header = self._f.read(4) |
| 556 | + |
| 557 | + def close(self): |
| 558 | + self._f.close() |
| 559 | + |
| 560 | + def findLastFullBzBlock(self): |
| 561 | + """find last full bzip2 block written out before eof (by seeking |
| 562 | + to near the eof). This is useful in case you have a truncated XML dump |
| 563 | + and you want to know where to restart the run from; for very large files, |
| 564 | + decompressing the blocks starting from the beginning of the file |
| 565 | + can be quite slow. |
| 566 | + Returns a pointer to the DataBlock object or None if there was no |
| 567 | + bzip2 block found.""" |
| 568 | + |
| 569 | + if not self._filesize: |
| 570 | + self._f.seek(0,os.SEEK_END) |
| 571 | + self._filesize = self._f.tell() |
| 572 | + |
| 573 | + seekBackTo = BzConstants.getMaxCompressedBlockSize(self._blockSizeMultiplier)*2 |
| 574 | + if self._filesize < seekBackTo: |
| 575 | + seekBackTo = self._filesize |
| 576 | + self._f.seek(seekBackTo * -1,os.SEEK_END) |
| 577 | + # we are guaranteed to have a full block in here (if the file isn't less than a block long) |
| 578 | + # so start walking through this data til we find the last full block before eof. |
| 579 | + data = self._f.read() |
| 580 | + previousBlock = None |
| 581 | + |
| 582 | + while True: |
| 583 | + blockFound = BzBlock(data, self._header) |
| 584 | + if not blockFound.getUncompressedData(): # truncated block? |
| 585 | + if previousBlock: |
| 586 | + self._dataBlock = previousBlock |
| 587 | + self._seekOffset = self._filesize - previousSeekBack |
| 588 | + return previousBlock |
| 589 | + else: |
| 590 | + return None |
| 591 | + previousBlock = blockFound |
| 592 | + offset = blockFound.getBlockLength() |
| 593 | + # otheroffset = where the fricking block started in the data we passed it |
| 594 | + otheroffset = blockFound.getOffset() |
| 595 | + |
| 596 | + previousSeekBack = seekBackTo - otheroffset |
| 597 | + seekBackTo = seekBackTo - offset - otheroffset + 1 |
| 598 | + data = data[seekBackTo * -1:] |
| 599 | + |
| 600 | + def findBzBlockFromSeekPoint(self,seek): |
| 601 | + """Seek to given offset in file, search for and return |
| 602 | + first bzip2 block found in file after seek point, or |
| 603 | + None if none was found""" |
| 604 | + self._f.seek(seek,os.SEEK_SET) |
| 605 | + data = self._f.read(BzConstants.getMaxCompressedBlockSize(self._blockSizeMultiplier)*2) |
| 606 | + |
| 607 | + blockFound = BzBlock(data, self._header) |
| 608 | + if not blockFound.getUncompressedData(): |
| 609 | + return None |
| 610 | + self._dataBlock = blockFound |
| 611 | + self._seekOffset = seek + blockFound.getOffset() |
| 612 | + return blockFound |
| 613 | + |
| 614 | + def getOffset(self): |
| 615 | + return self._seekOffset |
| 616 | + |
| 617 | +if __name__ == "__main__": |
| 618 | + try: |
| 619 | +# works |
| 620 | + f = BzFile("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2") |
| 621 | + |
| 622 | +# works |
| 623 | +# f = BzFile("/home/ariel/elwiki-20100925-pages-meta-history.xml.bz2") |
| 624 | + |
| 625 | +# works hmm for from certain point, fails, because seek point > end of file :-P |
| 626 | +# f = BzFile("/home/ariel/src/mediawiki/testing/sample-last-but-0.bz2") |
| 627 | + |
| 628 | +# works |
| 629 | +# f = BzFile("/home/ariel/sample-file.txt.bz2") |
| 630 | + |
| 631 | +# works |
| 632 | +# f = BzFile("/home/ariel/sample-file-bz9.txt.bz2") |
| 633 | + |
| 634 | +# offset = f.getBlockSizeMultiplier()*100000 + 600 |
| 635 | + offset = 14315000 |
| 636 | + |
| 637 | + # in these all our results are byte-aligned block markers out of the box |
| 638 | + # maybe that indicates a little problem? check. yes it's a bug, should have something around |
| 639 | + # 14254438 + 61571 and don't. so where is it? only finding start block aligned and |
| 640 | + # end block at shifted by 7, that's really weird. this must be recent, have |
| 641 | + # this behavior for the other routine too. |
| 642 | + |
| 643 | + block = f.findBzBlockFromSeekPoint(offset) |
| 644 | + |
| 645 | +# block = f.findLastFullBzBlock() |
| 646 | + offset = None |
| 647 | + if (block): |
| 648 | + print "found this block (at offset in file %s, original seek point was %s, length %s): " % ( f.getOffset(), offset, block.getBlockLength()), block.getUncompressedData()[-500:] |
| 649 | + print "doublecheck..." |
| 650 | + f._f.seek(f.getOffset(),os.SEEK_SET) |
| 651 | + datatemp = f._f.read(100) |
| 652 | + BzConstants.dumpstring(datatemp[0:30],"contents of file from that offset") |
| 653 | + else: |
| 654 | + print "no block found" |
| 655 | + |
| 656 | + f.close() |
| 657 | + except(IOError): |
| 658 | + print "there was no such file, you fool" |
Property changes on: branches/ariel/xmldumps-backup/Bzip2RandomAccess.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 659 | + native |