r96826 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r96825‎ | r96826 | r96827 >
Date:07:38, 12 September 2011
Author:ariel
Status:deferred
Tags:
Comment:
remove some dead fixmes, add _chunkToDo to base Dump class, seriously clean up linking to rss feed files and removal of old latest links; these were pretty broken after checkpoint files went in
Modified paths:
  • /branches/ariel/xmldumps-backup/worker.py (modified) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/worker.py
@@ -1792,6 +1792,35 @@
17931793 dumpFile = DumpFilename(self.wiki, None, self.checksums.getChecksumFileNameBasename())
17941794 self.symLinks.saveSymlink(dumpFile)
17951795 self.symLinks.cleanupSymLinks()
 1796+
 1797+ for item in self.dumpItemList.dumpItems:
 1798+ dumpNames = item.getDumpName()
 1799+ if type(dumpNames).__name__!='list':
 1800+ dumpNames = [ dumpNames ]
 1801+
 1802+ if (item._chunksEnabled):
 1803+ # if there is a specific chunk, we want to only clear out
 1804+ # old files for that piece, because new files for the other
 1805+ # pieces may not have been generated yet.
 1806+ chunk = item._chunkToDo
 1807+ else:
 1808+ chunk = None
 1809+
 1810+ checkpoint = None
 1811+ if (item._checkpointsEnabled):
 1812+ if (item.checkpointFile):
 1813+ # if there's a specific checkpoint file we are
 1814+ # rerunning, we would only clear out old copies
 1815+ # of that very file. meh. how likely is it that we
 1816+ # have one? these files are time based and the start/end pageids
 1817+ # are going to fluctuate. whatever
 1818+ cf = DumpFilename(self.wiki)
 1819+ cf.newFromFilename(item.checkpointFile)
 1820+ checkpoint = cf.checkpoint
 1821+
 1822+ for d in dumpNames:
 1823+ self.symLinks.removeSymLinksFromOldRuns(self.wiki.date, d, chunk, checkpoint )
 1824+
17961825 self.feeds.cleanupFeeds()
17971826
17981827 def makeDir(self, dir):
@@ -1826,10 +1855,15 @@
18271856 link = os.path.join(self.dumpDir.latestDir(), latestFilename)
18281857 if exists(link) or os.path.islink(link):
18291858 if os.path.islink(link):
1830 - realfile = os.readlink(link)
 1859+ oldrealfile = os.readlink(link)
18311860 # format of these links should be... ../20110228/elwikidb-20110228-templatelinks.sql.gz
18321861 rellinkpattern = re.compile('^\.\./(20[0-9]+)/')
1833 - dateinterval = int(self.wiki.date) - int(dumpFile.date)
 1862+ dateinlink = rellinkpattern.search(oldrealfile)
 1863+ if (dateinlink):
 1864+ dateoflinkedfile = dateinlink.group(1)
 1865+ dateinterval = int(self.wiki.date) - int(dateoflinkedfile)
 1866+ else:
 1867+ dateinterval = 0
18341868 # no file or it's older than ours... *then* remove the link
18351869 if not exists(os.path.realpath(link)) or dateinterval > 0:
18361870 self.debugfn("Removing old symlink %s" % link)
@@ -1854,6 +1888,34 @@
18551889 if not exists(os.path.join(latestDir,realfile)):
18561890 os.remove(link)
18571891
 1892+ # if the args are False or None, we remove all the old links for all values of the arg.
 1893+ # example: if chunk is False or None then we remove all old values for all chunks
 1894+ # "old" means "older than the specified datestring".
 1895+ def removeSymLinksFromOldRuns(self, dateString, dumpName=None, chunk=None, checkpoint=None):
 1896+ # fixme this needs to do more work if there are chunks or checkpoint files linked in here from
 1897+ # earlier dates. checkpoint ranges change, and configuration of chunks changes too, so maybe
 1898+ # old files still exist and the links need to be removed because we have newer files for the
 1899+ # same phase of the dump.
 1900+ if (self._enabled):
 1901+ latestDir = self.dumpDir.latestDir()
 1902+ files = os.listdir(latestDir)
 1903+ for f in files:
 1904+ link = os.path.join(latestDir,f)
 1905+ if os.path.islink(link):
 1906+ realfile = os.readlink(link)
 1907+ fileObj = DumpFilename(self.dumpDir._wiki)
 1908+ fileObj.newFromFilename(os.path.basename(realfile))
 1909+ if fileObj.date < dateString:
 1910+ # fixme check that these are ok if the value is None
 1911+ if dumpName and (fileObj.dumpName != dumpName):
 1912+ continue
 1913+ if chunk and (fileObj.chunk != chunk):
 1914+ continue
 1915+ if checkpoint and (fileObj.checkpoint != checkpoint):
 1916+ continue
 1917+ self.debugfn("Removing old symlink %s -> %s" % (link, realfile))
 1918+ os.remove(link)
 1919+
18581920 class Feeds(object):
18591921 def __init__(self, wiki, dumpDir, dbName, debugfn, enabled):
18601922 self.wiki = wiki
@@ -1884,7 +1946,8 @@
18851947 "description": xmlEscape("<a href=\"%s\">%s</a>" % (filenameAndPath, fileObj.filename)),
18861948 "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()) }
18871949 directory = self.dumpDir.latestDir()
1888 - rssPath = os.path.join(self.dumpDir.latestDir(), fileObj.basename + "-rss.xml")
 1950+ rssPath = os.path.join(self.dumpDir.latestDir(), self.dbName + "-latest-" + fileObj.basename + "-rss.xml")
 1951+ self.debugfn( "adding rss feed file %s " % rssPath )
18891952 FileUtils.writeFile(self.wiki.config.tempDir, rssPath, rssText, self.wiki.config.fileperms)
18901953
18911954 def cleanupFeeds(self):
@@ -1896,9 +1959,10 @@
18971960 files = os.listdir(latestDir)
18981961 for f in files:
18991962 if f.endswith("-rss.xml"):
1900 - filename = f[:-8];
1901 - link = os.path.join(latestDir,f)
 1963+ filename = f[:-8]
 1964+ link = os.path.join(latestDir,filename)
19021965 if not exists(link):
 1966+ self.debugfn("Removing old rss feed %s for link %s" % (os.path.join(latestDir,f), link))
19031967 os.remove(os.path.join(latestDir,f))
19041968
19051969 class Dump(object):
@@ -1919,6 +1983,8 @@
19201984 self._checkpointsEnabled = False
19211985 if not hasattr(self, 'checkpointFile'):
19221986 self.checkpointFile = False
 1987+ if not hasattr(self, '_chunkToDo'):
 1988+ self._chunkToDo = False
19231989
19241990 def name(self):
19251991 return self.runInfo.name()
@@ -1996,7 +2062,6 @@
19972063 runner.log.addToLogQueue(line)
19982064 sys.stderr.write(line)
19992065 self.progress = line.strip()
2000 - # FIXME test this a lot!! does the updateStatus work?
20012066 runner.status.updateStatusFiles()
20022067 runner.runInfoFile.saveDumpRunInfoFile(runner.dumpItemList.reportDumpRunInfo())
20032068
@@ -2392,7 +2457,6 @@
23932458 files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames))
23942459 files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames))
23952460 else:
2396 - # fixme this should be a list
23972461 # we will pass list of chunks or chunkToDo, or False, depending on the job setup.
23982462 files.extend(self.listRegularFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames))
23992463 return files
@@ -2532,7 +2596,6 @@
25332597 return files
25342598
25352599 def listOutputFilesForCleanup(self, dumpDir):
2536 - # fixme should this pass a list instead of one item?
25372600 dumpNames = self.listDumpNames()
25382601 files = []
25392602 files.extend(Dump.listOutputFilesForCleanup(self, dumpDir, dumpNames))
@@ -2549,7 +2612,6 @@
25502613 if (not exists( runner.wiki.config.php ) ):
25512614 raise BackupError("php command %s not found" % runner.wiki.config.php)
25522615
2553 - # fixme we have a list of all the files for all three dumpNames, we want to split them up by dumpName. oops.
25542616 articlesFile = runner.dumpDir.filenamePublicPath(f)
25552617 historyFile = runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date, self.historyDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
25562618 currentFile = runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date, self.currentDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
@@ -2724,7 +2786,6 @@
27252787 self.cleanupOldFiles(runner.dumpDir)
27262788 # just get the files pertaining to our dumpName, which is *one* of articles, pages-current, pages-history.
27272789 # stubs include all of them together.
2728 - # FIXME this needs some other jobname here. uuuggghhh and how would this job know what that is? bah
27292790 if not self.dumpName.startswith(self.getDumpNameBase()):
27302791 raise BackupError("dumpName %s of unknown form for this job" % self.dumpName)
27312792 dumpName = self.dumpName[len(self.getDumpNameBase()):]
@@ -2822,7 +2883,6 @@
28232884
28242885 if (self.checkpointFile):
28252886 outputFile = f
2826 - print "outputFile is ", outputFile.filename
28272887 elif (self._checkpointsEnabled):
28282888 # we write a temp file, it will be checkpointed every so often.
28292889 outputFile = DumpFilename(self.wiki, f.date, self.dumpName, f.fileType, self.fileExt, f.chunk, f.checkpoint, temp = True)
@@ -2935,11 +2995,10 @@
29362996 for fileObj in fileList:
29372997 firstPageIdInFile = int(fileObj.firstPageID)
29382998
2939 - # fixme what do we do here? this could be very expensive. is that
2940 - # worth it??
 2999+ # fixme what do we do here? this could be very expensive. is that worth it??
29413000 if not fileObj.lastPageID:
29423001 # (b) nasty hack, see (a)
2943 - # it's not a chekcpoint fle or we'd have the pageid in the filename
 3002+ # it's not a checkpoint file or we'd have the pageid in the filename
29443003 # so... temporary hack which will give expensive results
29453004 # if chunk file, and it's the last chunk, put none
29463005 # if it's not the last chunk, get the first pageid in the next chunk and subtract 1
@@ -3184,7 +3243,6 @@
31853244 files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames))
31863245 files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames))
31873246 else:
3188 - # fixme this should be a list
31893247 # we will pass list of chunks or chunkToDo, or False, depending on the job setup.
31903248 files.extend(self.listRegularFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames))
31913249 return files
@@ -3264,7 +3322,6 @@
32653323 return "xml"
32663324
32673325 def getFileExt(self):
3268 - # fixme no file extension, see what this means for everything
32693326 return ""
32703327
32713328 def buildCommand(self, runner, f):
@@ -3396,7 +3453,6 @@
33973454
33983455 def run(self, runner):
33993456 error = 0
3400 - # FIXME check this code
34013457 files = self.itemForRecombine.listOutputFilesForInput(runner.dumpDir)
34023458 outputFileList = self.listOutputFilesForBuildCommand(runner.dumpDir)
34033459 for outputFile in outputFileList:

Status & tagging log