r75337 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r75336‎ \| r75337 \| r75338 >
Date:	19:56, 24 October 2010
Author:	ariel
Status:	deferred
Tags:
Comment:	initial import, bzip block-level management routines
Modified paths:	/branches/ariel/xmldumps-backup/Bzip2RandomAccess.py (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/Bzip2RandomAccess.py
—	—	@@ -0,0 +1,657 @@
	2	+import getopt
	3	+import os
	4	+import re
	5	+import sys
	6	+import time
	7	+import bz2
	8	+import xml.sax
	9	+
	10	+class ShiftedData(object):
	11	+ """This class manages strings of data that have been left-shifted
	12	+ 0 through 7 bits."""
	13	+
	14	+ def __init__(self, data, n=0, padding = None):
	15	+ """Arguments:
	16	+ data -- the data to shift
	17	+ n -- the number of bits to shift left
	18	+ padding -- whether to add 1's on upper end of new
	19	+ leftmost byte and lower end of rightmost byte"""
	20	+ self._data = data
	21	+ self._bitsShiftedLeft = n % 8
	22	+ self._padding = padding
	23	+ self._shiftedData = self.shiftLeftNBits()
	24	+
	25	+ def getShiftedData(self):
	26	+ return self._shiftedData
	27	+
	28	+ def getData(self):
	29	+ return self._data
	30	+
	31	+ def getBitsShifted(self):
	32	+ return(self._bitsShiftedLeft)
	33	+
	34	+ def getlength(self):
	35	+ return len(self._data)
	36	+
	37	+ def getShiftedLength(self):
	38	+ return len(self._shiftedData)
	39	+
	40	+ def shiftLeftNBits(self):
	41	+ """shift a string of crap n bits left, pushing
	42	+ overflow into a new leftmost byte"""
	43	+ return ByteAlignedDataMethods.shiftLeftNBits(self._data,self._bitsShiftedLeft,self._padding)
	44	+
	45	+class ByteAlignedDataMethods(object):
	46	+ """Contains various methods for byte-aligned data"""
	47	+
	48	+ def shiftLeftNBits(data, bits, padding = False):
	49	+ """shift a string of crap n bits left, pushing
	50	+ overflow into a new leftmost byte, padding on right
	51	+ with 1s if requested, otherwise with 0s"""
	52	+ if (bits == 0):
	53	+ return data
	54	+
	55	+ resultList = []
	56	+
	57	+ # overflow from shift off left end, may be 0
	58	+ overflow = ord(data[0])>> (8 - bits)
	59	+ resultList.append(chr(overflow))
	60	+
	61	+ if (padding):
	62	+ resultList[-1] = chr(ord(resultList[-1]) \| (256 - 2**bits))
	63	+
	64	+ for i in range(0,len(data)):
	65	+ c = ord(data[i])
	66	+ if i == len(data)-1:
	67	+ next = 0
	68	+ else:
	69	+ next = ord(data[i+1])
	70	+
	71	+ # grab stuff shifted off the left end of the next byte
	72	+ resultList.append(chr((c<<bits) & 255 \| (next >> (8 - bits))))
	73	+
	74	+ if (padding):
	75	+ resultList[-1] = chr(ord(resultList[-1]) \| (2**bits -1))
	76	+
	77	+ resultString = "".join(resultList)
	78	+ return(resultString)
	79	+
	80	+ def getByteAlignedData(data, startByte, startBit):
	81	+ """given a string of data, a starting byte number (from 0)
	82	+ and a starting bit in that byte (counting 0 from the leftmost bit),
	83	+ return the string of bits starting from there and going to the end of the
	84	+ string of data. The last byte is 0-padded on the right if necessary."""
	85	+ if (startByte >= len(data)):
	86	+ return None
	87	+ startBit = startBit % 8
	88	+ shifted = ByteAlignedDataMethods.shiftLeftNBits(data[startByte:],startBit)
	89	+ if (startBit):
	90	+ # the new uppermost byte is the extra bits that we didn't want anyways
	91	+ return(shifted[1:])
	92	+ else:
	93	+ return(shifted)
	94	+
	95	+ shiftLeftNBits = staticmethod(shiftLeftNBits)
	96	+ getByteAlignedData = staticmethod(getByteAlignedData)
	97	+
	98	+
	99	+class shiftedSearchString(object):
	100	+ """This class manages search strings that may searched for in
	101	+ bit-shifted data."""
	102	+
	103	+ def __init__(self, data):
	104	+ """Arguments:
	105	+ data -- the data to shift"""
	106	+ self._searchStringShifted=[]
	107	+ for i in range(0,8):
	108	+ self._searchStringShifted.append(ShiftedData(data,i,padding=True))
	109	+
	110	+ def getLength(self):
	111	+ return len(self._searchStringShifted[0].getData())
	112	+
	113	+ def findAllMatchesOfStringNonaligned(self,data):
	114	+ """search for all matches of a given pattern in a given string of data
	115	+ not byte aligned. returns an array n0, n1, n2.. where n0 = list of
	116	+ starting bytes where pattern was found with no bit padding,
	117	+ n1 = with 1 bit shifted, etc."""
	118	+
	119	+ # FIXME this might be with the bit pattern shifted and not the string, dunno.
	120	+
	121	+ if (not data):
	122	+ return (None, None)
	123	+ matches = []
	124	+ for i in range(0,8):
	125	+ results = self.findAllMatchesOfStringFromLeft(data,self._searchStringShifted[i].getShiftedData())
	126	+ matches.append(results)
	127	+ return(matches)
	128	+
	129	+
	130	+ def findAllMatchesOfStringFromLeft(self,data,pattern):
	131	+ """byte aligned already"""
	132	+ if (not data):
	133	+ return (None, None)
	134	+
	135	+ positions = []
	136	+ offset = 0
	137	+ while (offset < len(data)):
	138	+ # do all but the first and last byte (which may have padding)
	139	+ result = data[offset:].find(pattern[1:-1])
	140	+ if (result >= 0):
	141	+ if (offset+result-1 >= 0):
	142	+ offset = offset -1
	143	+ firstByte = data[offset+result]
	144	+ if firstByte == chr(ord(pattern[0]) & ord(firstByte)):
	145	+ lastByte = data[offset+result+len(pattern) -1]
	146	+ if lastByte == chr(ord(pattern[-1]) & ord(lastByte)):
	147	+ positions.append(result)
	148	+
	149	+ # that submatch isn't at a block boundary, false alarm
	150	+ offset = offset + result + 2 # +1 because we start match at byte 2 of pattern and we want to move up one also
	151	+ else:
	152	+ return(positions)
	153	+
	154	+
	155	+ def findStringInDataFromLeft(self,data):
	156	+ """Find first occurence of string in (bit-shifted) data
	157	+ (occurence may not be byte aligned)
	158	+ Arguments: a string of data or a ShiftedData object
	159	+ Returns: tuple consisting of
	160	+ - the starting position of the first match
	161	+ - number of bits data must be left-shifted in order to find
	162	+ byte-aligned match"""
	163	+
	164	+ if (not data):
	165	+ return (None, None)
	166	+
	167	+ while(True):
	168	+ firstMatch = None
	169	+ shiftedBy = None
	170	+ for i in range (0,8):
	171	+ offset = 0
	172	+ # do all but the first and last byte (which may have padding)
	173	+ bytesShifted = self._searchStringShifted[i].getShiftedData()
	174	+ result = data[offset:].find(bytesShifted[1:-1])
	175	+ if (result >= 0):
	176	+ if (offset+result-1 >= 0):
	177	+ offset = offset -1
	178	+ firstByte = data[offset+result]
	179	+ if firstByte == chr(ord(bytesShifted[0]) & ord(firstByte)):
	180	+ lastByte = data[offset+result+len(bytesShifted) -1]
	181	+ if lastByte == chr(ord(bytesShifted[-1]) & ord(lastByte)):
	182	+ if (firstMatch == None or result+offset < firstMatch):
	183	+ firstMatch = result+offset
	184	+ shiftedBy = i
	185	+ # that submatch isn't at a block boundary, false alarm
	186	+ offset = offset + result + 2 # +1 because we start match at byte 2 of pattern and we want to move up one also
	187	+ if (firstMatch == None):
	188	+ return(None, None)
	189	+ else:
	190	+ return (firstMatch,(8 - shiftedBy)%8)
	191	+
	192	+ def findStringInDataFromRight(self,data):
	193	+ """Find last occurence of string in (bit-shifted) data
	194	+ (occurence may not be byte aligned)
	195	+ Arguments: a string of data or a ShiftedData object
	196	+ Returns: tuple consisting of
	197	+ - the starting position of the first match, starting from byte 0
	198	+ - number of bits pattern was left-shifted in order to find match"""
	199	+
	200	+ if (not data):
	201	+ return (None, None)
	202	+
	203	+ if isinstance(data, ShiftedData):
	204	+ checkData = data.getShiftedData()
	205	+ else:
	206	+ checkData = data
	207	+
	208	+ while(True):
	209	+ firstMatch = None
	210	+ shiftedBy = None
	211	+ for i in range (0,8):
	212	+ offset = 0
	213	+ # do all but the first and last byte (which may have padding)
	214	+ bytesShifted = self._searchStringShifted[i].getShiftedData()
	215	+ result = checkData[offset:].rfind(bytesShifted[1:-1])
	216	+ if (result >= 0):
	217	+ if (offset+result-1 >= 0):
	218	+ offset = offset -1
	219	+ firstByte = checkData[offset+result]
	220	+ if firstByte == chr(ord(bytesShifted[0]) & ord(firstByte)):
	221	+ lastByte = checkData[offset+result+len(bytesShifted) -1]
	222	+ if lastByte == chr(ord(bytesShifted[-1]) & ord(lastByte)):
	223	+ if (firstMatch == None or result+offset > firstMatch):
	224	+ firstMatch = result+offset
	225	+ shiftedBy = i
	226	+ # that submatch isn't at a block boundary, false alarm
	227	+ offset = offset - result + 2 # +1 because we start match at 1 byte before end pattern and we want to skip a byte also
	228	+
	229	+ if (firstMatch == None):
	230	+ return(None, None)
	231	+ else:
	232	+ return (firstMatch,(8 - shiftedBy)%8)
	233	+
	234	+ def dumpSearchString(self):
	235	+ for i in range(0, len(self._searchStringShifted)):
	236	+ BzConstants.dumpstring(self._searchStringShifted[i].getShiftedData(),"search string shifted %s:" % i)
	237	+
	238	+class BzConstants(object):
	239	+ """Contains various defines for bz2 data"""
	240	+
	241	+ def getFooter():
	242	+ """Return string which is at the end of every bzip2 stream or file"""
	243	+ footer = [ '0x17', '0x72', '0x45', '0x38', '0x50', '0x90' ]
	244	+ for i in range(0,len(footer)):
	245	+ footer[i] = chr(int(footer[i],16))
	246	+ footerString = "".join(footer)
	247	+ return footerString
	248	+
	249	+ def getBlockMarker():
	250	+ """Return string which is at the beginning of every bzip2 compressed block"""
	251	+ return "1AY&SY"
	252	+
	253	+ def getMaxCompressedBlockSize(bzBlockSizeMultiplier):
	254	+ """Return the maximum compressed bzip2 block size based on the
	255	+ block size multipler (from the bzip2 header) passed as an argument"""
	256	+ # max length of compressed data block given size of uncompressed data as specified in header
	257	+ # "To guarantee that the compressed data will fit in its buffer, allocate an output buffer of
	258	+ # size 1% larger than the uncompressed data, plus six hundred extra bytes." (Plus paranoia :-P)
	259	+ return (bzBlockSizeMultiplier + bzBlockSizeMultiplier/100)*100000 + 650
	260	+
	261	+ def dumpstring(string,message="dumping string:"):
	262	+ print message,
	263	+ for i in range(0,len(string)):
	264	+ print hex(ord(string[i])),
	265	+ print
	266	+
	267	+ def isZeros(data):
	268	+ for i in range(0,len(data)):
	269	+ if (ord(data[i]) != 0):
	270	+ return False
	271	+ return True
	272	+
	273	+ def checkForFooter(data):
	274	+ """See if data passed as argument ends in the bz2 footer
	275	+ We expect: 6 bytes of footer, 4 bytes of crc, 0 to 7 bits of padding"""
	276	+ footerSearchString = shiftedSearchString(BzConstants.getFooter())
	277	+ ( offset, bitsShifted ) = footerSearchString.findStringInDataFromRight(data[-30:])
	278	+ if (offset != None):
	279	+ if (bitsShifted > 0):
	280	+ paddingByte = 1
	281	+ else:
	282	+ paddingByte = 0
	283	+ # starts from 0
	284	+ # expect 0-filled bytes at the end, if there are any
	285	+ extraBytes = -30+(offset + 6 + 4 + paddingByte)
	286	+ # so this iszeros thing... see, this data we got passed may have not been byte aligned.
	287	+ # so maybe it got left-bit shifted for some block marker.
	288	+ # then if we foudn another marker farther down, maybe it got shifted again
	289	+ # and so on... so there could be a bunch of extra zero bytes at the end
	290	+ # FIXME this is the wrong place to check for that but I don't know the
	291	+ # right place yet. And keeping track is absolutely out of the question.
	292	+ # we could instead of using the newly byte aligned data from the stream
	293	+ # and continually left shifting it, go back to the original stream each
	294	+ # time which would limit this some. ? needs thought.
	295	+ if (extraBytes <= 0 or BzConstants.isZeros(data[-30+(offset + 6 + 4 + paddingByte):])):
	296	+ # starting byte of footer, counting from end. may not be byte aligned.
	297	+ return (-30 + offset)
	298	+ return None
	299	+
	300	+ def getDecompressedBlock(data, bzHeader):
	301	+ """takes string of data plus 4 byte bzip2 header, returns decompressed block"""
	302	+ block = bzHeader + data
	303	+ try:
	304	+ bz = bz2.BZ2Decompressor()
	305	+ out = bz.decompress(block)
	306	+ return(out)
	307	+ except Exception, ex:
	308	+ print ex
	309	+ return(None)
	310	+
	311	+ getFooter = staticmethod(getFooter)
	312	+ getBlockMarker = staticmethod(getBlockMarker)
	313	+ getMaxCompressedBlockSize = staticmethod(getMaxCompressedBlockSize)
	314	+ dumpstring = staticmethod(dumpstring)
	315	+ checkForFooter = staticmethod(checkForFooter)
	316	+ getDecompressedBlock = staticmethod(getDecompressedBlock)
	317	+ isZeros = staticmethod(isZeros)
	318	+
	319	+class BzBlockStart(object):
	320	+ """This class manages bzip2 block markers (which mark the start of bzip2 blocks)
	321	+ in a blob of data. Because the block marker may not be byte aligned, it includes
	322	+ the number of bits shifted left for the block start marker to be byte aligned.
	323	+ It also contains a copy of the byte aligned data beginning at the start of the block
	324	+ and including this marker.
	325	+ Arguments:
	326	+ data -- a blob of compressed data within which to find a bzip2 block
	327	+ bzBlockSizeMultiplier -- 1 through 9.
	328	+ This number is typically retrieved from a bzip2 header; it indicates the bzip2 block
	329	+ sizes used for the uncompressed data, in units of 100K.
	330	+ This function is a bit wasteful in that it attempts a decompression of the data and
	331	+ throws away the results; it does this in order to verify that the block marker really
	332	+ is at the start of a block.
	333	+ It will detect a footer in the appropriate place at the end of the data stream
	334	+ (it has to in order for uncompression to work correctly)
	335	+ Use this function for seeking into an arbitrary place in a bzip2 file and digging up
	336	+ data, or finding the last whole block written out to an arbitrarily truncated bzip2 file,
	337	+ not for regular stream decompression."""
	338	+
	339	+ def __init__(self, data, header):
	340	+ """Arguments: data, 4-byte header from bzip stream
	341	+ If the data does not contain a bzip2 block marker
	342	+ (that really begins a block, i.e. the data afterwards
	343	+ uncompresses properly), subsequent calls to
	344	+ getBzBlockMarkerStart() will return None.
	345	+ Otherwise, getBzBlockMarkerStart() will return the byte in the
	346	+ data where the marker starts, (counting from 0),
	347	+ getBitsShifted() will get the number of bits that the
	348	+ data at that byte must be left-shifted in order for the
	349	+ block marker to be byte-aligned, and
	350	+ getByteAlignedData() will return the byte-aligned data
	351	+ including the block marker.
	352	+ Note that we test for the block marker to be a genuine start of
	353	+ block by trying decompression; this can fail sometimes if the
	354	+ bzip2 end of stream footer is present, so we will attempt to
	355	+ remove it and getByteAlignedData() returns the data with
	356	+ that removed (FIXME should it? Do we even need that function?)"""
	357	+
	358	+ self._data = data
	359	+ # 4 byte bzip2 header at the beginning of every compressed file or stream
	360	+ self._bzHeader = header
	361	+ self._bzBlockSizeMultiplier = int(header[3])
	362	+ self._bzBlockSize = self._bzBlockSizeMultiplier * 100000
	363	+ # this byte string is at the start of every bzip2 compressed block
	364	+ self._bzBlockMarker = shiftedSearchString(BzConstants.getBlockMarker())
	365	+ # index of byte in data where block marker found
	366	+ self._bzBlockMarkerStart = None
	367	+ # number of bits block was shifted in order to be byte aligned
	368	+ self._bitsShifted = None
	369	+ # block marker + following data, byte aligned
	370	+ self._byteAlignedData = None
	371	+ self.findBzBlockMarker()
	372	+
	373	+ def findBzBlockMarker(self):
	374	+ data = self._data
	375	+ offset = 0
	376	+
	377	+ while (True):
	378	+ ( bzBlockMarkerStart, bitsShifted ) = self._bzBlockMarker.findStringInDataFromLeft(data)
	379	+ if ( bzBlockMarkerStart == None):
	380	+ return None
	381	+ offset = offset + bzBlockMarkerStart
	382	+
	383	+ data = ByteAlignedDataMethods.getByteAlignedData(data,bzBlockMarkerStart, bitsShifted)
	384	+
	385	+ # try uncompression to see if it's a valid block. since the block marker can
	386	+ # appear in the data stream randomly, we don't try to bound this possible block by the next
	387	+ # appearance of a block marker; the decompress routine may barf on a partial block
	388	+ # (and we don't know how much truncation is allowed). So pass something guaranteed to be
	389	+ # a full block size and then some, for the test.
	390	+ toDO = BzConstants.getMaxCompressedBlockSize(self._bzBlockSizeMultiplier)
	391	+
	392	+ dataToUncompress = data[:toDO]
	393	+
	394	+ # if there is a footer we will toss it
	395	+ footerOffset = BzConstants.checkForFooter(dataToUncompress)
	396	+ if (footerOffset != None):
	397	+ # what do we think about this, give a partial footer?
	398	+ dataToTry = dataToUncompress[:footerOffset+5]
	399	+ else:
	400	+ dataToTry = dataToUncompress
	401	+
	402	+ uncompressedData = BzConstants.getDecompressedBlock(dataToTry, self._bzHeader)
	403	+ if (uncompressedData != None):
	404	+ # w00t!
	405	+ self._byteAlignedData = dataToUncompress
	406	+ self._bzBlockMarkerStart = offset
	407	+ self._bitsShifted = bitsShifted
	408	+ return True
	409	+
	410	+ # no possibilities left
	411	+ if (len(data) <= len(BzConstants.getBlockMarker())):
	412	+ self._bzBlockMarkerStart = None
	413	+
	414	+ return None
	415	+ # not a real block. on to the next possibility
	416	+ else:
	417	+ data = data[len(BzConstants.getBlockMarker()):]
	418	+
	419	+ def getBitsShifted(self):
	420	+ return self._bitsShifted
	421	+
	422	+ def getBzBlockMarkerStart(self):
	423	+ return self._bzBlockMarkerStart
	424	+
	425	+ def getByteAlignedData(self):
	426	+ return self._byteAlignedData
	427	+
	428	+class BzBlock(object):
	429	+ """This class manipulates bzipped data blocks (which include the
	430	+ block start marker). Because the block may not have been byte
	431	+ aligned wthin the original compressed stream or file, it also
	432	+ includes the number of bits shifted left for the block start marker to
	433	+ be byte aligned, as well as the number of bits to shift for
	434	+ the start of the next block, it there is one, in the stream
	435	+ or file.
	436	+ It may additionally include some or all of the uncompressed data.
	437	+ Takes as arguments:
	438	+ data -- the stream of data in which to find a block
	439	+ header -- the 4 byte bzip2 header which tells us block size among other things"""
	440	+
	441	+ def __init__(self, blockData, header):
	442	+ self._blockData = blockData
	443	+ self._compressedData = None
	444	+ self._bzHeader = header
	445	+ self._bzBlockStart = None
	446	+ self._uncompressedData = None
	447	+ self._bzBlockLength = None
	448	+ self.findAndUncompressFirstBlock()
	449	+
	450	+ def getBitMask(self, bits, left = False):
	451	+ """return bitmask starting from left or right end, of specified number of bits.
	452	+ default is start from right end"""
	453	+ if bits < 0:
	454	+ return 0
	455	+
	456	+ bits = bits % 8
	457	+ if (left):
	458	+ return 255 - 2**(8-bits) +1
	459	+ pass
	460	+ else:
	461	+ return 2**bits -1
	462	+
	463	+ def getMasked(self, byte, bitCount, left = False):
	464	+ """return leftmost or rightmost bitCount bits from byte
	465	+ default is rightmost. expect byte to be ord, not char"""
	466	+ mask = self.getBitMask(bitCount,left)
	467	+ return mask & byte
	468	+
	469	+ def findAndUncompressFirstBlock(self):
	470	+ bzBlockStart = BzBlockStart(self._blockData, self._bzHeader)
	471	+ if (not bzBlockStart.getBzBlockMarkerStart()):
	472	+ return None
	473	+
	474	+ dataToUncompress = bzBlockStart.getByteAlignedData()
	475	+
	476	+ # ok now we want to get the start of the next block in here,
	477	+ nextBlockStart = BzBlockStart(bzBlockStart.getByteAlignedData()[1:],self._bzHeader)
	478	+ if (not nextBlockStart.getBzBlockMarkerStart()):
	479	+ footerOffset = BzConstants.checkForFooter(dataToUncompress)
	480	+ if (footerOffset != None):
	481	+ # partial footer?
	482	+ endMarker = footerOffset+5
	483	+ else:
	484	+ return None
	485	+ else:
	486	+ footerOffset = None
	487	+ endMarker = nextBlockStart.getBzBlockMarkerStart() + 8
	488	+
	489	+ # this is either an additional 4 or 7 characters. not 5 or 8. python is stupid that way.
	490	+ dataToUncompress = dataToUncompress[:endMarker]
	491	+
	492	+ self._uncompressedData = BzConstants.getDecompressedBlock(dataToUncompress, self._bzHeader)
	493	+
	494	+ if (self._uncompressedData == None):
	495	+ return None
	496	+ else:
	497	+ # fixme is this next line right?
	498	+ self._compressedData = dataToUncompress
	499	+ self._bzBlockLength = len(dataToUncompress)
	500	+ # now set real block length, not the length of the block plus a
	501	+ # a few bytes of the following block marker or footer.
	502	+ # NOTE that truncating your data to this byte may be a bad idea since the next byte,
	503	+ # while it will have the start of the next block marker in it, may not be
	504	+ # byte aligned; i.e. the first so many bits of that byte may be the end of this block. :-P
	505	+ # also note that truncating your block here for purposes of decompression with the
	506	+ # python bindings will not work, it needs to see the footer or the beginning of
	507	+ # the next block or it will fail and complain. sorry dudes.
	508	+ if (footerOffset == None):
	509	+ self._bzBlockLength = self._bzBlockLength - 7
	510	+ else:
	511	+ self._bzBlockLength = self._bzBlockLength - 4
	512	+ self._bzBlockStart = bzBlockStart
	513	+ return bzBlockStart
	514	+
	515	+ def getOffset(self):
	516	+ if (self._bzBlockStart):
	517	+ return self._bzBlockStart.getBzBlockMarkerStart()
	518	+ else:
	519	+ return None
	520	+
	521	+ def getCompressedData(self):
	522	+ return self._compressedData
	523	+
	524	+ def getUncompressedData(self):
	525	+ return self._uncompressedData
	526	+
	527	+ def getBlockLength(self):
	528	+ """return length of the (compressed) bz2 block"""
	529	+ return(self._bzBlockLength)
	530	+
	531	+class BzFile:
	532	+ """handle bzip2 files, which means we can seek to arbitrary places
	533	+ in the compressed data, find the next block, uncompress it,
	534	+ uncompress the following n blocks, get the last complete block
	535	+ from before the eof, etc."""
	536	+ def __init__(self, fileName):
	537	+ self._fileName = fileName
	538	+ self._dataBlock = None
	539	+ self._seekOffset = None
	540	+ self._blocksize = None
	541	+ self._header = None # bzip2 header, 4 bytes
	542	+ self._f = open(fileName,"r")
	543	+ self.readHeader()
	544	+ self._blockSizeMultiplier = int(self._header[3])
	545	+ self._footer = BzConstants.getFooter()
	546	+ self._filesize = None
	547	+
	548	+ def getBlockSizeMultiplier(self):
	549	+ return(self._blockSizeMultiplier)
	550	+
	551	+ def readHeader(self):
	552	+ if self._header == None:
	553	+ self._f.seek(0)
	554	+ # header is BZhn (n = multiplier of 100k for compression blocksize)
	555	+ self._header = self._f.read(4)
	556	+
	557	+ def close(self):
	558	+ self._f.close()
	559	+
	560	+ def findLastFullBzBlock(self):
	561	+ """find last full bzip2 block written out before eof (by seeking
	562	+ to near the eof). This is useful in case you have a truncated XML dump
	563	+ and you want to know where to restart the run from; for very large files,
	564	+ decompressing the blocks starting from the beginning of the file
	565	+ can be quite slow.
	566	+ Returns a pointer to the DataBlock object or None if there was no
	567	+ bzip2 block found."""
	568	+
	569	+ if not self._filesize:
	570	+ self._f.seek(0,os.SEEK_END)
	571	+ self._filesize = self._f.tell()
	572	+
	573	+ seekBackTo = BzConstants.getMaxCompressedBlockSize(self._blockSizeMultiplier)*2
	574	+ if self._filesize < seekBackTo:
	575	+ seekBackTo = self._filesize
	576	+ self._f.seek(seekBackTo * -1,os.SEEK_END)
	577	+ # we are guaranteed to have a full block in here (if the file isn't less than a block long)
	578	+ # so start walking through this data til we find the last full block before eof.
	579	+ data = self._f.read()
	580	+ previousBlock = None
	581	+
	582	+ while True:
	583	+ blockFound = BzBlock(data, self._header)
	584	+ if not blockFound.getUncompressedData(): # truncated block?
	585	+ if previousBlock:
	586	+ self._dataBlock = previousBlock
	587	+ self._seekOffset = self._filesize - previousSeekBack
	588	+ return previousBlock
	589	+ else:
	590	+ return None
	591	+ previousBlock = blockFound
	592	+ offset = blockFound.getBlockLength()
	593	+ # otheroffset = where the fricking block started in the data we passed it
	594	+ otheroffset = blockFound.getOffset()
	595	+
	596	+ previousSeekBack = seekBackTo - otheroffset
	597	+ seekBackTo = seekBackTo - offset - otheroffset + 1
	598	+ data = data[seekBackTo * -1:]
	599	+
	600	+ def findBzBlockFromSeekPoint(self,seek):
	601	+ """Seek to given offset in file, search for and return
	602	+ first bzip2 block found in file after seek point, or
	603	+ None if none was found"""
	604	+ self._f.seek(seek,os.SEEK_SET)
	605	+ data = self._f.read(BzConstants.getMaxCompressedBlockSize(self._blockSizeMultiplier)*2)
	606	+
	607	+ blockFound = BzBlock(data, self._header)
	608	+ if not blockFound.getUncompressedData():
	609	+ return None
	610	+ self._dataBlock = blockFound
	611	+ self._seekOffset = seek + blockFound.getOffset()
	612	+ return blockFound
	613	+
	614	+ def getOffset(self):
	615	+ return self._seekOffset
	616	+
	617	+if __name__ == "__main__":
	618	+ try:
	619	+# works
	620	+ f = BzFile("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2")
	621	+
	622	+# works
	623	+# f = BzFile("/home/ariel/elwiki-20100925-pages-meta-history.xml.bz2")
	624	+
	625	+# works hmm for from certain point, fails, because seek point > end of file :-P
	626	+# f = BzFile("/home/ariel/src/mediawiki/testing/sample-last-but-0.bz2")
	627	+
	628	+# works
	629	+# f = BzFile("/home/ariel/sample-file.txt.bz2")
	630	+
	631	+# works
	632	+# f = BzFile("/home/ariel/sample-file-bz9.txt.bz2")
	633	+
	634	+# offset = f.getBlockSizeMultiplier()*100000 + 600
	635	+ offset = 14315000
	636	+
	637	+ # in these all our results are byte-aligned block markers out of the box
	638	+ # maybe that indicates a little problem? check. yes it's a bug, should have something around
	639	+ # 14254438 + 61571 and don't. so where is it? only finding start block aligned and
	640	+ # end block at shifted by 7, that's really weird. this must be recent, have
	641	+ # this behavior for the other routine too.
	642	+
	643	+ block = f.findBzBlockFromSeekPoint(offset)
	644	+
	645	+# block = f.findLastFullBzBlock()
	646	+ offset = None
	647	+ if (block):
	648	+ print "found this block (at offset in file %s, original seek point was %s, length %s): " % ( f.getOffset(), offset, block.getBlockLength()), block.getUncompressedData()[-500:]
	649	+ print "doublecheck..."
	650	+ f._f.seek(f.getOffset(),os.SEEK_SET)
	651	+ datatemp = f._f.read(100)
	652	+ BzConstants.dumpstring(datatemp[0:30],"contents of file from that offset")
	653	+ else:
	654	+ print "no block found"
	655	+
	656	+ f.close()
	657	+ except(IOError):
	658	+ print "there was no such file, you fool"
Property changes on: branches/ariel/xmldumps-backup/Bzip2RandomAccess.py
___________________________________________________________________
Added: svn:eol-style
1	659	+ native

Status & tagging log

21:54, 3 December 2010 Reedy (talk | contribs) changed the status of r75337 [removed: new added: deferred]