Index: branches/ariel/xmldumps-backup/Bzip2Xml.py |
— | — | @@ -6,8 +6,110 @@ |
7 | 7 | import bz2 |
8 | 8 | import xml.sax |
9 | 9 | import Bzip2RandomAccess |
10 | | -from Bzip2RandomAccess import BzFile |
11 | 10 | |
| 11 | +from os.path import dirname, exists, getsize, join, realpath |
| 12 | +from Bzip2RandomAccess import BzFile, BzConstants |
| 13 | + |
| 14 | +class BzXmlFile(object): |
| 15 | + """Convenience functions that do things like find the last pageID |
| 16 | + from the last complete block in a possibly truncated bzip2 file, etc""" |
| 17 | + def __init__(self, fileName): |
| 18 | + self._fileName = fileName |
| 19 | + self._dataBlock = None |
| 20 | + self._pageID = None |
| 21 | + self._bzf = BzFile(self._fileName) |
| 22 | + |
| 23 | + def getFooter(self): |
| 24 | + block = self._bzf.findLastFullBzBlock() |
| 25 | + if (not block): |
| 26 | + return False |
| 27 | + uncompressedData = block.getUncompressedData() |
 | 28 | +        if (not uncompressedData):
| 29 | + return False |
 | 30 | +        footerPattern = re.compile(r'</mediawiki>\s*')
| 31 | + result = footerPattern.search(uncompressedData) |
| 32 | + if (not result): |
| 33 | + return False |
| 34 | + return True |
| 35 | + |
 | 36 | +    # Start from a fixed length back from the end of the file and read
 | 37 | +    # through blocks looking for page id info; the goal is to find the
 | 38 | +    # *last one* in the file.
 | 39 | +    # If we don't find any in those blocks, we step back that same length
 | 40 | +    # further and try again...
 | 41 | +    # Repeat until we find one (in which case return the last one available)
 | 42 | +    # or we run out of file.
| 43 | + def getLastPageIDFromFile(self): |
 | 44 | +        self._bzf._f.seek(0, os.SEEK_END)
| 45 | + filesize = self._bzf._f.tell() |
| 46 | + |
 | 47 | +        # Because nothing here is byte aligned, after multiple
 | 48 | +        # iterations of getting blocks and matches our various
 | 49 | +        # length counts could be off by a few bytes.
 | 50 | +        # Maybe that's just paranoia, but that's what this is for anyway.
| 51 | + fudgefactor = 10 |
| 52 | + |
| 53 | + offset = filesize |
| 54 | + # the pattern we are looking for could be perfectly positioned |
| 55 | + # to be split across two blocks, so put leftover stuff in here |
| 56 | + # to be concatenated onto the beginning of the next block |
| 57 | + holdoverString = "" |
| 58 | + |
| 59 | + # blocks should always be smaller than this so if we start from |
| 60 | + # this far back in the file we should be guaranteed to find |
| 61 | + # a block. (unless the file is hopelessly corrupted) |
| 62 | + interval = BzConstants.getMaxCompressedBlockSize(self._bzf.getBlockSizeMultiplier())*2 |
| 63 | + |
 | 64 | +        pageIDPattern = re.compile(r'</title>\n\s*<id>(?P<pageid>\d+?)</id>')
| 65 | + |
| 66 | + # the position in the file after which we have already scanned |
| 67 | + didAlready = filesize |
| 68 | + |
 | 69 | +        # Outer loop: jump back in the file from where we are, far
 | 70 | +        # enough back that we are guaranteed to find at least one
 | 71 | +        # bzip2 block in there if the file isn't garbage, then read
 | 72 | +        # through it looking for a pageID until we get to ground
 | 73 | +        # we've already covered.
 | 74 | +        # Found one? Great, return it. Otherwise,
 | 75 | +        # jump back even further and repeat.
| 76 | + while offset > 0: |
| 77 | + start = offset - interval |
| 78 | + if (start < 0): |
| 79 | + if (didAlready != fudgefactor): |
| 80 | + start = 0 |
| 81 | + else: |
| 82 | + return currentPageID |
| 83 | + didAlready = offset + fudgefactor |
| 84 | + if didAlready > filesize: |
| 85 | + didAlready = filesize |
| 86 | + doingNow = start |
| 87 | + currentPageID = None |
| 88 | + while start < didAlready: |
 | 89 | +                # Walk forward through the blocks, collecting every pageID
 | 90 | +                # we encounter; if we find any, we return the last one.
 | 91 | +                # Only if we find none do we fall through to the outer
 | 92 | +                # loop to back up further towards the beginning of the
 | 93 | +                # file.
| 94 | + block = self._bzf.findBzBlockFromSeekPoint(start) |
| 95 | + if not block: |
| 96 | + return currentPageID |
| 97 | + uncompressedData = block.getUncompressedData() |
| 98 | + text = holdoverString + uncompressedData |
 | 99 | +                matches = pageIDPattern.findall(text)
 | 100 | +                if (matches):
 | 101 | +                    currentPageID = matches[-1]
| 102 | + start = start + block.getBlockLength() - fudgefactor |
| 103 | + holdoverString = uncompressedData[-1025:] |
| 104 | + |
| 105 | + if currentPageID: |
| 106 | + return currentPageID |
| 107 | + |
 | 108 | +            # note: we may just need to add the block length here, maybe minus a few bytes
| 109 | + offset = start - interval |
| 110 | + if offset < 0: |
| 111 | + return currentPageID |
| 112 | + holdoverString = "" |
| 113 | + |
12 | 114 | class PageInXml: |
13 | 115 | """One page in XML, minus most of the content which we don't care about""" |
14 | 116 | |
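A minimal standalone sketch (not part of the patch) of the holdover idea used in getLastPageIDFromFile above, assuming plain string chunks instead of BzFile blocks; the function name and chunk source are illustrative only. When scanning decompressed chunks in order, keep the tail of the previous chunk so a match split across a chunk boundary can still be found, and remember only the last match seen:

    import re

    def last_page_id_in_chunks(chunks, overlap=1024):
        # Keep the tail of the previous chunk so a '</title>\n  <id>...</id>'
        # sequence split across two chunks can still match.
        pattern = re.compile(r'</title>\n\s*<id>(?P<pageid>\d+?)</id>')
        holdover = ""
        found = None
        for chunk in chunks:
            text = holdover + chunk
            for match in pattern.finditer(text):
                found = match.group('pageid')   # remember the last match seen
            holdover = chunk[-overlap:]         # carry the tail into the next chunk
        return found
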
— | — | @@ -16,6 +118,11 @@ |
17 | 119 | self.id = id |
18 | 120 | self.revisionids = revisionids |
19 | 121 | |
| 122 | +# |
| 123 | +# |
 | 124 | +# the rest of this code may get tossed... we shall see
| 125 | +# |
| 126 | +# |
20 | 127 | class XmlFileChunk(object): |
21 | 128 | """find pageID in last complete or partial bzip2 block before end of file, |
22 | 129 | something like that.""" |
— | — | @@ -109,7 +216,7 @@ |
110 | 217 | return(ID) |
111 | 218 | return None |
112 | 219 | |
113 | | - def findPageIDFromSeekpoint(self, seek, maxBlocksToCheck = None): |
| 220 | + def findPageIDFromSeekpoint(self, seek, maxBlocksToCheck = None, maxOffset = None): |
114 | 221 | block = self._f.findBzBlockFromSeekPoint(seek) |
115 | 222 | if not block: |
116 | 223 | print "DEBUG: findPageIDFromSeekpoint: no block found, wtf" |
— | — | @@ -134,6 +241,8 @@ |
135 | 242 | if (maxBlocksToCheck and (blockCount > maxBlocksToCheck)): |
136 | 243 | break |
137 | 244 | seek = seek + block.getBlockLength() |
| 245 | + if (maxOffset and (seek > maxOffset)): |
| 246 | + break |
138 | 247 | block = self._f.findBzBlockFromSeekPoint(seek) |
139 | 248 | # the n is length of <title> plus </title> plus <id> plus </id> plus <page> |
140 | 249 | # plus max title length plus a few for good measure. so title length max is 255 |
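The new maxOffset argument bounds the forward walk through blocks in the same way maxBlocksToCheck already does. A reduced sketch of that loop shape, with a hypothetical find_block callable standing in for findBzBlockFromSeekPoint (this is an illustration, not the patch's code):

    def walk_blocks(find_block, seek, maxBlocksToCheck=None, maxOffset=None):
        # find_block(seek) should return an object with getBlockLength(),
        # or None when no further block is found.
        blockCount = 0
        block = find_block(seek)
        while block:
            blockCount += 1
            if maxBlocksToCheck and blockCount > maxBlocksToCheck:
                break
            seek = seek + block.getBlockLength()
            if maxOffset and seek > maxOffset:
                break
            block = find_block(seek)
        return seek
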
— | — | @@ -155,6 +264,20 @@ |
156 | 265 | self._f.close() |
157 | 266 | |
158 | 267 | if __name__ == "__main__": |
| 268 | + myfile = BzXmlFile("/home/ariel/src/mediawiki/testing/dumps/public/elwikidb/20110205/elwikidb-20110205-pages-articles4.xml.bz2") |
| 269 | + pageID = myfile.getLastPageIDFromFile() |
| 270 | + if pageID: |
| 271 | + print "pageID", pageID |
| 272 | + else: |
| 273 | + print "no pageID for you!" |
| 274 | + |
| 275 | + |
| 276 | + result = myfile.getFooter() |
| 277 | + if (result): |
| 278 | + print "and this file has a footer" |
| 279 | + else: |
| 280 | + print "no footer on this baby" |
| 281 | + |
159 | 282 | try: |
160 | 283 | # f = XmlFileChunk("/home/ariel/elwikt/elwiktionary-20100305-pages-articles.xml.bz2") |
161 | 284 | f = XmlFileChunk("/home/ariel/src/mediawiki/testing/enwiki-20100904-pages-meta-history9.xml.bz2") |
— | — | @@ -176,3 +299,4 @@ |
177 | 300 | except(IOError): |
178 | 301 | print "there was no such file, you fool" |
179 | 302 | |
| 303 | + |
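For comparison only, a naive footer check that decompresses the whole dump with the stdlib bz2 module; getFooter above exists precisely to avoid full decompression on huge or truncated dumps. This assumes a small, intact test file, and the path passed in is a placeholder:

    import bz2
    import re

    def has_footer_naive(path):
        # Decompress everything and look for the closing </mediawiki> tag
        # at the end; only practical for small test dumps.
        data = bz2.BZ2File(path).read()
        return re.search(r'</mediawiki>\s*$', data) is not None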