Index: branches/ariel/xmldumps-backup/worker.py |
— | — | @@ -1792,6 +1792,35 @@ |
1793 | 1793 | dumpFile = DumpFilename(self.wiki, None, self.checksums.getChecksumFileNameBasename()) |
1794 | 1794 | self.symLinks.saveSymlink(dumpFile) |
1795 | 1795 | self.symLinks.cleanupSymLinks() |
| 1796 | + |
| 1797 | + for item in self.dumpItemList.dumpItems: |
| 1798 | + dumpNames = item.getDumpName() |
| 1799 | + if type(dumpNames).__name__!='list': |
| 1800 | + dumpNames = [ dumpNames ] |
| 1801 | + |
| 1802 | + if (item._chunksEnabled): |
| 1803 | + # if there is a specific chunk, we want to only clear out |
| 1804 | + # old files for that piece, because new files for the other |
| 1805 | + # pieces may not have been generated yet. |
| 1806 | + chunk = item._chunkToDo |
| 1807 | + else: |
| 1808 | + chunk = None |
| 1809 | + |
| 1810 | + checkpoint = None |
| 1811 | + if (item._checkpointsEnabled): |
| 1812 | + if (item.checkpointFile): |
| 1813 | + # if there's a specific checkpoint file we are |
| 1814 | + # rerunning, we would only clear out old copies |
| 1815 | + # of that very file. meh. how likely is it that we |
| 1816 | + # have one? these files are time based and the start/end pageids |
| 1817 | + # are going to fluctuate. whatever |
| 1818 | + cf = DumpFilename(self.wiki) |
| 1819 | + cf.newFromFilename(item.checkpointFile) |
| 1820 | + checkpoint = cf.checkpoint |
| 1821 | + |
| 1822 | + for d in dumpNames: |
| 1823 | + self.symLinks.removeSymLinksFromOldRuns(self.wiki.date, d, chunk, checkpoint ) |
| 1824 | + |
1796 | 1825 | self.feeds.cleanupFeeds() |
1797 | 1826 | |
1798 | 1827 | def makeDir(self, dir): |
— | — | @@ -1826,10 +1855,15 @@ |
1827 | 1856 | link = os.path.join(self.dumpDir.latestDir(), latestFilename) |
1828 | 1857 | if exists(link) or os.path.islink(link): |
1829 | 1858 | if os.path.islink(link): |
1830 | | - realfile = os.readlink(link) |
| 1859 | + oldrealfile = os.readlink(link) |
1831 | 1860 | # format of these links should be... ../20110228/elwikidb-20110228-templatelinks.sql.gz |
1832 | 1861 | rellinkpattern = re.compile('^\.\./(20[0-9]+)/') |
1833 | | - dateinterval = int(self.wiki.date) - int(dumpFile.date) |
| 1862 | + dateinlink = rellinkpattern.search(oldrealfile) |
| 1863 | + if (dateinlink): |
| 1864 | + dateoflinkedfile = dateinlink.group(1) |
| 1865 | + dateinterval = int(self.wiki.date) - int(dateoflinkedfile) |
| 1866 | + else: |
| 1867 | + dateinterval = 0 |
1834 | 1868 | # no file or it's older than ours... *then* remove the link |
1835 | 1869 | if not exists(os.path.realpath(link)) or dateinterval > 0: |
1836 | 1870 | self.debugfn("Removing old symlink %s" % link) |
— | — | @@ -1854,6 +1888,34 @@ |
1855 | 1889 | if not exists(os.path.join(latestDir,realfile)): |
1856 | 1890 | os.remove(link) |
1857 | 1891 | |
| 1892 | + # if the args are False or None, we remove all the old links for all values of the arg. |
| 1893 | + # example: if chunk is False or None then we remove all old values for all chunks |
| 1894 | + # "old" means "older than the specified datestring". |
| 1895 | + def removeSymLinksFromOldRuns(self, dateString, dumpName=None, chunk=None, checkpoint=None): |
| 1896 | + # fixme this needs to do more work if there are chunks or checkpoint files linked in here from |
| 1897 | + # earlier dates. checkpoint ranges change, and configuration of chunks changes too, so maybe |
| 1898 | + # old files still exist and the links need to be removed because we have newer files for the |
| 1899 | + # same phase of the dump. |
| 1900 | + if (self._enabled): |
| 1901 | + latestDir = self.dumpDir.latestDir() |
| 1902 | + files = os.listdir(latestDir) |
| 1903 | + for f in files: |
| 1904 | + link = os.path.join(latestDir,f) |
| 1905 | + if os.path.islink(link): |
| 1906 | + realfile = os.readlink(link) |
| 1907 | + fileObj = DumpFilename(self.dumpDir._wiki) |
| 1908 | + fileObj.newFromFilename(os.path.basename(realfile)) |
| 1909 | + if fileObj.date < dateString: |
| 1910 | + # fixme check that these are ok if the value is None |
| 1911 | + if dumpName and (fileObj.dumpName != dumpName): |
| 1912 | + continue |
| 1913 | + if chunk and (fileObj.chunk != chunk): |
| 1914 | + continue |
| 1915 | + if checkpoint and (fileObj.checkpoint != checkpoint): |
| 1916 | + continue |
| 1917 | + self.debugfn("Removing old symlink %s -> %s" % (link, realfile)) |
| 1918 | + os.remove(link) |
| 1919 | + |
1858 | 1920 | class Feeds(object): |
1859 | 1921 | def __init__(self, wiki, dumpDir, dbName, debugfn, enabled): |
1860 | 1922 | self.wiki = wiki |
— | — | @@ -1884,7 +1946,8 @@ |
1885 | 1947 | "description": xmlEscape("<a href=\"%s\">%s</a>" % (filenameAndPath, fileObj.filename)), |
1886 | 1948 | "date": time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()) } |
1887 | 1949 | directory = self.dumpDir.latestDir() |
1888 | | - rssPath = os.path.join(self.dumpDir.latestDir(), fileObj.basename + "-rss.xml") |
| 1950 | + rssPath = os.path.join(self.dumpDir.latestDir(), self.dbName + "-latest-" + fileObj.basename + "-rss.xml") |
| 1951 | + self.debugfn( "adding rss feed file %s " % rssPath ) |
1889 | 1952 | FileUtils.writeFile(self.wiki.config.tempDir, rssPath, rssText, self.wiki.config.fileperms) |
1890 | 1953 | |
1891 | 1954 | def cleanupFeeds(self): |
— | — | @@ -1896,9 +1959,10 @@ |
1897 | 1960 | files = os.listdir(latestDir) |
1898 | 1961 | for f in files: |
1899 | 1962 | if f.endswith("-rss.xml"): |
1900 | | - filename = f[:-8]; |
1901 | | - link = os.path.join(latestDir,f) |
| 1963 | + filename = f[:-8] |
| 1964 | + link = os.path.join(latestDir,filename) |
1902 | 1965 | if not exists(link): |
| 1966 | + self.debugfn("Removing old rss feed %s for link %s" % (os.path.join(latestDir,f), link)) |
1903 | 1967 | os.remove(os.path.join(latestDir,f)) |
1904 | 1968 | |
1905 | 1969 | class Dump(object): |
— | — | @@ -1919,6 +1983,8 @@ |
1920 | 1984 | self._checkpointsEnabled = False |
1921 | 1985 | if not hasattr(self, 'checkpointFile'): |
1922 | 1986 | self.checkpointFile = False |
| 1987 | + if not hasattr(self, '_chunkToDo'): |
| 1988 | + self._chunkToDo = False |
1923 | 1989 | |
1924 | 1990 | def name(self): |
1925 | 1991 | return self.runInfo.name() |
— | — | @@ -1996,7 +2062,6 @@ |
1997 | 2063 | runner.log.addToLogQueue(line) |
1998 | 2064 | sys.stderr.write(line) |
1999 | 2065 | self.progress = line.strip() |
2000 | | - # FIXME test this a lot!! does the updateStatus work? |
2001 | 2066 | runner.status.updateStatusFiles() |
2002 | 2067 | runner.runInfoFile.saveDumpRunInfoFile(runner.dumpItemList.reportDumpRunInfo()) |
2003 | 2068 | |
— | — | @@ -2392,7 +2457,6 @@ |
2393 | 2458 | files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
2394 | 2459 | files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
2395 | 2460 | else: |
2396 | | - # fixme this should be a list |
2397 | 2461 | # we will pass list of chunks or chunkToDo, or False, depending on the job setup. |
2398 | 2462 | files.extend(self.listRegularFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
2399 | 2463 | return files |
— | — | @@ -2532,7 +2596,6 @@ |
2533 | 2597 | return files |
2534 | 2598 | |
2535 | 2599 | def listOutputFilesForCleanup(self, dumpDir): |
2536 | | - # fixme should this pass a list instead of one item? |
2537 | 2600 | dumpNames = self.listDumpNames() |
2538 | 2601 | files = [] |
2539 | 2602 | files.extend(Dump.listOutputFilesForCleanup(self, dumpDir, dumpNames)) |
— | — | @@ -2549,7 +2612,6 @@ |
2550 | 2613 | if (not exists( runner.wiki.config.php ) ): |
2551 | 2614 | raise BackupError("php command %s not found" % runner.wiki.config.php) |
2552 | 2615 | |
2553 | | - # fixme we have a list of all the files for all three dumpNames, we want to split them up by dumpName. oops. |
2554 | 2616 | articlesFile = runner.dumpDir.filenamePublicPath(f) |
2555 | 2617 | historyFile = runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date, self.historyDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp)) |
2556 | 2618 | currentFile = runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date, self.currentDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp)) |
— | — | @@ -2724,7 +2786,6 @@ |
2725 | 2787 | self.cleanupOldFiles(runner.dumpDir) |
2726 | 2788 | # just get the files pertaining to our dumpName, which is *one* of articles, pages-current, pages-history. |
2727 | 2789 | # stubs include all of them together. |
2728 | | - # FIXME this needs some other jobname here. uuuggghhh and how would this job know what that is? bah |
2729 | 2790 | if not self.dumpName.startswith(self.getDumpNameBase()): |
2730 | 2791 | raise BackupError("dumpName %s of unknown form for this job" % self.dumpName) |
2731 | 2792 | dumpName = self.dumpName[len(self.getDumpNameBase()):] |
— | — | @@ -2822,7 +2883,6 @@ |
2823 | 2884 | |
2824 | 2885 | if (self.checkpointFile): |
2825 | 2886 | outputFile = f |
2826 | | - print "outputFile is ", outputFile.filename |
2827 | 2887 | elif (self._checkpointsEnabled): |
2828 | 2888 | # we write a temp file, it will be checkpointed every so often. |
2829 | 2889 | outputFile = DumpFilename(self.wiki, f.date, self.dumpName, f.fileType, self.fileExt, f.chunk, f.checkpoint, temp = True) |
— | — | @@ -2935,11 +2995,10 @@ |
2936 | 2996 | for fileObj in fileList: |
2937 | 2997 | firstPageIdInFile = int(fileObj.firstPageID) |
2938 | 2998 | |
2939 | | - # fixme what do we do here? this could be very expensive. is that |
2940 | | - # worth it?? |
| 2999 | + # fixme what do we do here? this could be very expensive. is that worth it?? |
2941 | 3000 | if not fileObj.lastPageID: |
2942 | 3001 | # (b) nasty hack, see (a) |
2943 | | - # it's not a chekcpoint fle or we'd have the pageid in the filename |
| 3002 | + # it's not a checkpoint file or we'd have the pageid in the filename
2944 | 3003 | # so... temporary hack which will give expensive results |
2945 | 3004 | # if chunk file, and it's the last chunk, put none |
2946 | 3005 | # if it's not the last chunk, get the first pageid in the next chunk and subtract 1 |
— | — | @@ -3184,7 +3243,6 @@ |
3185 | 3244 | files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
3186 | 3245 | files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
3187 | 3246 | else: |
3188 | | - # fixme this should be a list |
3189 | 3247 | # we will pass list of chunks or chunkToDo, or False, depending on the job setup. |
3190 | 3248 | files.extend(self.listRegularFilesPerChunkExisting(dumpDir, self.getChunkList(), dumpNames)) |
3191 | 3249 | return files |
— | — | @@ -3264,7 +3322,6 @@ |
3265 | 3323 | return "xml" |
3266 | 3324 | |
3267 | 3325 | def getFileExt(self): |
3268 | | - # fixme no file extension, see what this means for everything |
3269 | 3326 | return "" |
3270 | 3327 | |
3271 | 3328 | def buildCommand(self, runner, f): |
— | — | @@ -3396,7 +3453,6 @@ |
3397 | 3454 | |
3398 | 3455 | def run(self, runner): |
3399 | 3456 | error = 0 |
3400 | | - # FIXME check this code |
3401 | 3457 | files = self.itemForRecombine.listOutputFilesForInput(runner.dumpDir) |
3402 | 3458 | outputFileList = self.listOutputFilesForBuildCommand(runner.dumpDir) |
3403 | 3459 | for outputFile in outputFileList: |