Index: branches/ariel/xmldumps-backup/worker.py |
— | — | @@ -1792,6 +1792,35 @@ |
1793 | 1793 | dumpFile = DumpFilename(self.wiki, None, self.checksums.getChecksumFileNameBasename()) |
1794 | 1794 | self.symLinks.saveSymlink(dumpFile) |
1795 | 1795 | self.symLinks.cleanupSymLinks() |
| 1796 | + |
| 1797 | + for item in self.dumpItemList.dumpItems: |
| 1798 | + dumpNames = item.getDumpName() |
| 1799 | + if type(dumpNames).__name__!='list': |
| 1800 | + dumpNames = [ dumpNames ] |
| 1801 | + |
| 1802 | + if (item._chunksEnabled): |
| 1803 | + # if there is a specific chunk, we want to only clear out |
| 1804 | + # old files for that piece, because new files for the other |
| 1805 | + # pieces may not have been generated yet. |
| 1806 | + chunk = item._chunkToDo |
| 1807 | + else: |
| 1808 | + chunk = None |
| 1809 | + |
| 1810 | + checkpoint = None |
| 1811 | + if (item._checkpointsEnabled): |
| 1812 | + if (item.checkpointFile): |
| 1813 | + # if there's a specific checkpoint file we are |
| 1814 | + # rerunning, we would only clear out old copies |
| 1815 | + # of that very file. meh. how likely is it that we |
| 1816 | + # have one? these files are time based and the start/end pageids |
| 1817 | + # are going to fluctuate. whatever |
| 1818 | + cf = DumpFilename(self.wiki) |
| 1819 | + cf.newFromFilename(item.checkpointFile) |
| 1820 | + checkpoint = cf.checkpoint |
| 1821 | + |
| 1822 | + for d in dumpNames: |
| 1823 | + self.symLinks.removeSymLinksFromOldRuns(self.wiki.date, d, chunk, checkpoint ) |
| 1824 | + |
1796 | 1825 | self.feeds.cleanupFeeds() |
1797 | 1826 | |
1798 | 1827 | def makeDir(self, dir): |
— | — | @@ -1826,10 +1855,15 @@ |
1827 | 1856 | link = os.path.join(self.dumpDir.latestDir(), latestFilename) |
1828 | 1857 | if exists(link) or os.path.islink(link): |
1829 | 1858 | if os.path.islink(link): |
1830 | | - realfile = os.readlink(link) |
| 1859 | + oldrealfile = os.readlink(link) |
1831 | 1860 | # format of these links should be... ../20110228/elwikidb-20110228-templatelinks.sql.gz |
1832 | 1861 | rellinkpattern = re.compile('^\.\./(20[0-9]+)/') |
1833 | | - dateinterval = int(self.wiki.date) - int(dumpFile.date) |
| 1862 | + dateinlink = rellinkpattern.search(oldrealfile) |
| 1863 | + if (dateinlink): |
| 1864 | + dateoflinkedfile = dateinlink.group(1) |
| 1865 | + dateinterval = int(self.wiki.date) - int(dateoflinkedfile) |
| 1866 | + else: |
| 1867 | + dateinterval = 0 |
1834 | 1868 | # no file or it's older than ours... *then* remove the link |
1835 | 1869 | if not exists(os.path.realpath(link)) or dateinterval > 0: |
1836 | 1870 | self.debugfn("Removing old symlink %s" % link) |
— | — | @@ -1854,6 +1888,34 @@ |
1855 | 1889 | if not exists(os.path.join(latestDir,realfile)): |
1856 | 1890 | os.remove(link) |
1857 | 1891 | |
| 1892 | + # if the args are False or None, we remove all the old links for all values of the arg. |
| 1893 | + # example: if chunk is False or None then we remove all old values for all chunks |
| 1894 | + # "old" means "older than the specified datestring". |
| 1895 | + def removeSymLinksFromOldRuns(self, dateString, dumpName=None, chunk=None, checkpoint=None): |
| 1896 | + # fixme this needs to do more work if there are chunks or checkpoint files linked in here from |
| 1897 | + # earlier dates. checkpoint ranges change, and configuration of chunks changes too, so maybe |
| 1898 | + # old files still exist and the links need to be removed because we have newer files for the |
| 1899 | + # same phase of the dump. |
| 1900 | + if (self._enabled): |
| 1901 | + latestDir = self.dumpDir.latestDir() |
| 1902 | + files = os.listdir(latestDir) |
| 1903 | + for f in files: |
| 1904 | + link = os.path.join(latestDir,f) |
| 1905 | + if os.path.islink(link): |
| 1906 | + realfile = os.readlink(link) |
| 1907 | + fileObj = DumpFilename(self.dumpDir._wiki) |
| 1908 | + fileObj.newFromFilename(os.path.basename(realfile)) |
| 1909 | + if fileObj.date < dateString: |
| 1910 | + # fixme check that these are ok if the value is None |
| 1911 | + if dumpName and (fileObj.dumpName != dumpName): |
| 1912 | + continue |
| 1913 | + if chunk and (fileObj.chunk != chunk): |
| 1914 | + continue |
| 1915 | + if checkpoint and (fileObj.checkpoint != checkpoint): |
| 1916 | + continue |
| 1917 | + self.debugfn("Removing old symlink %s -> %s" % (link, realfile)) |
| 1918 | + os.remove(link) |
| 1919 | + |
1858 | 1920 | class Feeds(object): |
1859 | 1921 | def __init__(self, wiki, dumpDir, dbName, debugfn, enabled): |
1860 | 1922 | self.wiki = wiki |
— | — | @@ -1884,7 +1946,8 @@ |
1885 | 1947 | "description": xmlEscape("<a href=\"%s\">%s</a>" % (filenameAndPath, fileObj.filename)), |
1886 | 1948 | "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()) } |
1887 | 1949 | directory = self.dumpDir.latestDir() |
1888 | | - rssPath = os.path.join(self.dumpDir.latestDir(), fileObj.basename + "-rss.xml") |
| 1950 | + rssPath = os.path.join(self.dumpDir.latestDir(), self.dbName + "-latest-" + fileObj.basename + "-rss.xml") |
| 1951 | + self.debugfn( "adding rss feed file %s " % rssPath ) |
1889 | 1952 | FileUtils.writeFile(self.wiki.config.tempDir, rssPath, rssText, self.wiki.config.fileperms) |
1890 | 1953 | |
1891 | 1954 | def cleanupFeeds(self): |
— | — | @@ -1896,9 +1959,10 @@ |
1897 | 1960 | files = os.listdir(latestDir) |
1898 | 1961 | for f in files: |
1899 | 1962 | if f.endswith("-rss.xml"): |
1900 | | - filename = f[:-8]; |
1901 | | - link = os.path.join(latestDir,f) |
| 1963 | + filename = f[:-8] |
| 1964 | + link = os.path.join(latestDir,filename) |
1902 | 1965 | if not exists(link): |
| 1966 | + self.debugfn("Removing old rss feed %s for link %s" % (os.path.join(latestDir,f), link)) |
1903 | 1967 | os.remove(os.path.join(latestDir,f)) |
1904 | 1968 | |
1905 | 1969 | class Dump(object): |
— | — | @@ -1919,6 +1983,8 @@ |
1920 | 1984 | self._checkpointsEnabled = False |
1921 | 1985 | if not hasattr(self, 'checkpointFile'): |
1922 | 1986 | self.checkpointFile = False |
| 1987 | + if not hasattr(self, '_chunkToDo'): |
| 1988 | + self._chunkToDo = False |
1923 | 1989 | |
1924 | 1990 | def name(self): |
1925 | 1991 | return self.runInfo.name() |
— | — | @@ -1996,7 +2062,6 @@ |
1997 | 2063 | runner.log.addToLogQueue(line) |
1998 | 2064 | sys.stderr.write(line) |
1999 | 2065 | self.progress = line.strip() |
2000 | | - # FIXME test this a lot!! does the updateStatus work? |
2001 | 2066 | runner.status.updateStatusFiles() |
2002 | 2067 | runner.runInfoFile.saveDumpRunInfoFile(runner.dumpItemList.reportDumpRunInfo()) |
2003 | 2068 | |
— | — | @@ -2392,7 +2457,6 @@ |
2393 | 2458 | files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
2394 | 2459 | files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
2395 | 2460 | else: |
2396 | | - # fixme this should be a list |
2397 | 2461 | # we will pass list of chunks or chunkToDo, or False, depending on the job setup. |
2398 | 2462 | files.extend(self.listRegularFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
2399 | 2463 | return files |
— | — | @@ -2532,7 +2596,6 @@ |
2533 | 2597 | return files |
2534 | 2598 | |
2535 | 2599 | def listOutputFilesForCleanup(self, dumpDir): |
2536 | | - # fixme should this pass a list instead of one item? |
2537 | 2600 | dumpNames = self.listDumpNames() |
2538 | 2601 | files = [] |
2539 | 2602 | files.extend(Dump.listOutputFilesForCleanup(self, dumpDir, dumpNames)) |
— | — | @@ -2549,7 +2612,6 @@ |
2550 | 2613 | if (not exists( runner.wiki.config.php ) ): |
2551 | 2614 | raise BackupError("php command %s not found" % runner.wiki.config.php) |
2552 | 2615 | |
2553 | | - # fixme we have a list of all the files for all three dumpNames, we want to split them up by dumpName. oops. |
2554 | 2616 | articlesFile = runner.dumpDir.filenamePublicPath(f) |
2555 | 2617 | historyFile = runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date, self.historyDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp)) |
2556 | 2618 | currentFile = runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date, self.currentDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp)) |
— | — | @@ -2724,7 +2786,6 @@ |
2725 | 2787 | self.cleanupOldFiles(runner.dumpDir) |
2726 | 2788 | # just get the files pertaining to our dumpName, which is *one* of articles, pages-current, pages-history. |
2727 | 2789 | # stubs include all of them together. |
2728 | | - # FIXME this needs some other jobname here. uuuggghhh and how would this job know what that is? bah |
2729 | 2790 | if not self.dumpName.startswith(self.getDumpNameBase()): |
2730 | 2791 | raise BackupError("dumpName %s of unknown form for this job" % self.dumpName) |
2731 | 2792 | dumpName = self.dumpName[len(self.getDumpNameBase()):] |
— | — | @@ -2822,7 +2883,6 @@ |
2823 | 2884 | |
2824 | 2885 | if (self.checkpointFile): |
2825 | 2886 | outputFile = f |
2826 | | - print "outputFile is ", outputFile.filename |
2827 | 2887 | elif (self._checkpointsEnabled): |
2828 | 2888 | # we write a temp file, it will be checkpointed every so often. |
2829 | 2889 | outputFile = DumpFilename(self.wiki, f.date, self.dumpName, f.fileType, self.fileExt, f.chunk, f.checkpoint, temp = True) |
— | — | @@ -2935,11 +2995,10 @@ |
2936 | 2996 | for fileObj in fileList: |
2937 | 2997 | firstPageIdInFile = int(fileObj.firstPageID) |
2938 | 2998 | |
2939 | | - # fixme what do we do here? this could be very expensive. is that |
2940 | | - # worth it?? |
| 2999 | + # fixme what do we do here? this could be very expensive. is that worth it?? |
2941 | 3000 | if not fileObj.lastPageID: |
2942 | 3001 | # (b) nasty hack, see (a) |
2943 | | - # it's not a chekcpoint fle or we'd have the pageid in the filename |
| 3002 | + # it's not a checkpoint file or we'd have the pageid in the filename
2944 | 3003 | # so... temporary hack which will give expensive results |
2945 | 3004 | # if chunk file, and it's the last chunk, put none |
2946 | 3005 | # if it's not the last chunk, get the first pageid in the next chunk and subtract 1 |
— | — | @@ -3184,7 +3243,6 @@ |
3185 | 3244 | files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
3186 | 3245 | files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
3187 | 3246 | else: |
3188 | | - # fixme this should be a list |
3189 | 3247 | # we will pass list of chunks or chunkToDo, or False, depending on the job setup. |
3190 | 3248 | files.extend(self.listRegularFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
3191 | 3249 | return files |
— | — | @@ -3264,7 +3322,6 @@ |
3265 | 3323 | return "xml" |
3266 | 3324 | |
3267 | 3325 | def getFileExt(self): |
3268 | | - # fixme no file extension, see what this means for everything |
3269 | 3326 | return "" |
3270 | 3327 | |
3271 | 3328 | def buildCommand(self, runner, f): |
— | — | @@ -3396,7 +3453,6 @@ |
3397 | 3454 | |
3398 | 3455 | def run(self, runner): |
3399 | 3456 | error = 0 |
3400 | | - # FIXME check this code |
3401 | 3457 | files = self.itemForRecombine.listOutputFilesForInput(runner.dumpDir) |
3402 | 3458 | outputFileList = self.listOutputFilesForBuildCommand(runner.dumpDir) |
3403 | 3459 | for outputFile in outputFileList: |