Index: branches/ariel/xmldumps-backup/WikiDump.py |
— | — | @@ -172,6 +172,7 @@ |
173 | 173 | "tail": "/usr/bin/tail", |
174 | 174 | "cat": "/bin/cat", |
175 | 175 | "grep": "/bin/grep", |
| 176 | + "checkforbz2footer": "/usr/local/bin/checkforbz2footer", |
176 | 177 | #"cleanup": { |
177 | 178 | "keep": "3", |
178 | 179 | #"chunks": { |
— | — | @@ -249,6 +250,7 @@ |
250 | 251 | self.tail = conf.get("tools", "tail") |
251 | 252 | self.cat = conf.get("tools", "cat") |
252 | 253 | self.grep = conf.get("tools", "grep") |
| 254 | + self.checkforbz2footer = conf.get("tools","checkforbz2footer") |
253 | 255 | |
254 | 256 | if not conf.has_section('chunks'): |
255 | 257 | conf.add_section('chunks') |
Index: branches/ariel/xmldumps-backup/worker.py |
— | — | @@ -988,7 +988,7 @@ |
989 | 989 | # callbackinterval: how often we will call callbackTimed (in milliseconds), defaults to every 5 secs |
990 | 990 | def runCommand(self, commandSeriesList, callbackStderr=None, callbackStderrArg=None, callbackTimed=None, callbackTimedArg=None, shell = False, callbackInterval=5000): |
991 | 991 | """Nonzero return code from the shell from any command in any pipeline will cause this |
992 | | - function to print an error message and return 1, indictating error. |
| 992 | + function to print an error message and return 1, indicating error. |
993 | 993 | Returns 0 on success. |
994 | 994 | If a callback function is passed, it will receive lines of |
995 | 995 | output from the call. If the callback function takes another argument (which will |
— | — | @@ -1561,8 +1561,9 @@ |
1562 | 1562 | self.cleanupOldFiles(runner) |
1563 | 1563 | series = self.buildCommand(runner) |
1564 | 1564 | commands.append(series) |
1565 | | - result = runner.runCommand(commands, callbackStderr=self.progressCallback, callbackStderrArg=runner) |
1566 | | - return result |
| 1565 | + error = runner.runCommand(commands, callbackStderr=self.progressCallback, callbackStderrArg=runner) |
| 1566 | + if (error): |
| 1567 | + raise BackupError("error producing stub files") |
1567 | 1568 | |
1568 | 1569 | class RecombineXmlStub(XmlStub): |
1569 | 1570 | def __init__(self, name, desc, chunks): |
— | — | @@ -1576,7 +1577,7 @@ |
1577 | 1578 | return(XmlStub.listFiles(self, runner, unnumbered=True)) |
1578 | 1579 | |
1579 | 1580 | def run(self, runner): |
1580 | | - errorresult=0 |
| 1581 | + error=0 |
1581 | 1582 | if (self._chunks): |
1582 | 1583 | files = XmlStub.listFiles(self,runner) |
1583 | 1584 | outputFileList = self.listFiles(runner) |
— | — | @@ -1599,8 +1600,9 @@ |
1600 | 1601 | series = [ recombinePipeline ] |
1601 | 1602 | result = runner.runCommand([ series ], callbackTimed=self.progressCallback, callbackTimedArg=runner, shell = True) |
1602 | 1603 | if result: |
1603 | | - errorresult = result |
1604 | | - return errorresult |
| 1604 | + error = result |
| 1605 | + if (error): |
| 1606 | + raise BackupError("error recombining stub files") |
1605 | 1607 | |
1606 | 1608 | class XmlLogging(Dump): |
1607 | 1609 | """ Create a logging dump of all page activity """ |
— | — | @@ -1635,8 +1637,9 @@ |
1636 | 1638 | "--output=gzip:%s" % logging ] |
1637 | 1639 | pipeline = [ command ] |
1638 | 1640 | series = [ pipeline ] |
1639 | | - result = runner.runCommand([ series ], callbackStderr=self.progressCallback, callbackStderrArg=runner) |
1640 | | - return result |
| 1641 | + error = runner.runCommand([ series ], callbackStderr=self.progressCallback, callbackStderrArg=runner) |
| 1642 | + if (error): |
| 1643 | + raise BackupError("error dumping log files") |
1641 | 1644 | |
1642 | 1645 | class XmlDump(Dump): |
1643 | 1646 | """Primary XML dumps, one section at a time.""" |
— | — | @@ -1671,9 +1674,29 @@ |
1672 | 1675 | else: |
1673 | 1676 | series = self.buildCommand(runner) |
1674 | 1677 | commands.append(series) |
1675 | | - result = runner.runCommand(commands, callbackStderr=self.progressCallback, callbackStderrArg=runner) |
1676 | | - return result |
| 1678 | + error = runner.runCommand(commands, callbackStderr=self.progressCallback, callbackStderrArg=runner) |
1677 | 1679 | |
| 1680 | + checkforbz2footer = "%s" % runner.config.checkforbz2footer |
| 1681 | + if exists(checkforbz2footer): |
| 1682 | + # check to see if any of the output files are truncated |
| 1683 | + files = [] |
| 1684 | + if (self._chunks): |
| 1685 | + for i in range(1, len(self._chunks)+1): |
| 1686 | + files.append( self._path(runner, 'bz2', i ) ) |
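| 1687 | + # NOTE(review): duplicate append of the same chunk path removed (a truncated file would be renamed, then fail the second check); unchunked output is not added to the list here -- confirm whether an else branch was intended |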
| 1688 | + |
| 1689 | + for f in files: |
| 1690 | + pipeline = [] |
| 1691 | + pipeline.append([ checkforbz2footer, f ]) |
| 1692 | + p = CommandPipeline(pipeline, quiet=True) |
| 1693 | + p.runPipelineAndGetOutput() |
| 1694 | + if not p.exitedSuccessfully(): |
| 1695 | + runner.logAndPrint("file %s is truncated, moving out of the way" %f ) |
| 1696 | + os.rename( f, f + ".truncated" ) |
| 1697 | + error = 1 |
| 1698 | + if (error): |
| 1699 | + raise BackupError("error producing xml bz2 file(s) %s" % self._subset) |
| 1700 | + |
1678 | 1701 | def buildEta(self, runner): |
1679 | 1702 | """Tell the dumper script whether to make ETA estimate on page or revision count.""" |
1680 | 1703 | return "--current" |
— | — | @@ -1969,7 +1992,7 @@ |
1970 | 1993 | return(XmlDump.listFiles(self, runner, unnumbered=True)) |
1971 | 1994 | |
1972 | 1995 | def run(self, runner): |
1973 | | - errorresult=0 |
| 1996 | + error=0 |
1974 | 1997 | if (self._chunks): |
1975 | 1998 | files = XmlDump.listFiles(self,runner) |
1976 | 1999 | outputFileList = self.listFiles(runner) |
— | — | @@ -1992,8 +2015,9 @@ |
1993 | 2016 | series = [ recombinePipeline ] |
1994 | 2017 | result = runner.runCommand([ series ], callbackTimed=self.progressCallback, callbackTimedArg=runner, shell = True) |
1995 | 2018 | if result: |
1996 | | - errorresult = result |
1997 | | - return errorresult |
| 2019 | + error = result |
| 2020 | + if (error): |
| 2021 | + raise BackupError("error recombining xml bz2 files") |
1998 | 2022 | |
1999 | 2023 | class BigXmlDump(XmlDump): |
2000 | 2024 | """XML page dump for something larger, where a 7-Zip compressed copy |
— | — | @@ -2068,7 +2092,7 @@ |
2069 | 2093 | self.cleanupOldFiles(runner) |
2070 | 2094 | series = self.buildCommand(runner) |
2071 | 2095 | commands.append(series) |
2072 | | - result = runner.runCommand(commands, callbackTimed=self.progressCallback, callbackTimedArg=runner, shell = True) |
| 2096 | + error = runner.runCommand(commands, callbackTimed=self.progressCallback, callbackTimedArg=runner, shell = True) |
2073 | 2097 | # temp hack force 644 permissions until ubuntu bug # 370618 is fixed - tomasz 5/1/2009 |
2074 | 2098 | # some hacks aren't so temporary - atg 3 sept 2010 |
2075 | 2099 | if (self._chunks): |
— | — | @@ -2080,7 +2104,8 @@ |
2081 | 2105 | xml7z = self.buildOutputFilename(runner) |
2082 | 2106 | if exists(xml7z): |
2083 | 2107 | os.chmod(xml7z, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH ) |
2084 | | - return(result) |
| 2108 | + if (error): |
| 2109 | + raise BackupError("error recompressing bz2 file(s)") |
2085 | 2110 | |
2086 | 2111 | def listFiles(self, runner, unnumbered = False): |
2087 | 2112 | if (self._chunks) and not unnumbered: |
— | — | @@ -2113,8 +2138,7 @@ |
2114 | 2139 | runner.remove(filename) |
2115 | 2140 | |
2116 | 2141 | def run(self, runner): |
2117 | | - print "here we are" |
2118 | | - errorresult = 0 |
| 2142 | + error = 0 |
2119 | 2143 | if (self._chunks): |
2120 | 2144 | self.cleanupOldFiles(runner) |
2121 | 2145 | files = XmlRecompressDump.listFiles(self,runner) |
— | — | @@ -2138,8 +2162,9 @@ |
2139 | 2163 | series = [ recombinePipeline ] |
2140 | 2164 | result = runner.runCommand([ series ], callbackTimed=self.progressCallback, callbackTimedArg=runner, shell = True) |
2141 | 2165 | if result: |
2142 | | - errorresult = result |
2143 | | - return errorresult |
| 2166 | + error = result |
| 2167 | + if (error): |
| 2168 | + raise BackupError("error recombining xml bz2 file(s)") |
2144 | 2169 | |
2145 | 2170 | class AbstractDump(Dump): |
2146 | 2171 | """XML dump for Yahoo!'s Active Abstracts thingy""" |
— | — | @@ -2186,8 +2211,11 @@ |
2187 | 2212 | else: |
2188 | 2213 | series = self.buildCommand(runner) |
2189 | 2214 | commands.append(series) |
2190 | | - runner.runCommand(commands, callbackStderr=self.progressCallback, callbackStderrArg=runner) |
| 2215 | + error = runner.runCommand(commands, callbackStderr=self.progressCallback, callbackStderrArg=runner) |
| 2216 | + if (error): |
| 2217 | + raise BackupError("error producing abstract dump") |
2191 | 2218 | |
| 2219 | + |
2192 | 2220 | def _variants(self, runner): |
2193 | 2221 | # If the database name looks like it's marked as Chinese language, |
2194 | 2222 | # return a list including Simplified and Traditional versions, so |
— | — | @@ -2234,7 +2262,7 @@ |
2235 | 2263 | return(AbstractDump.listFiles(self,runner, unnumbered = True)) |
2236 | 2264 | |
2237 | 2265 | def run(self, runner): |
2238 | | - errorresult = 0 |
| 2266 | + error = 0 |
2239 | 2267 | if (self._chunks): |
2240 | 2268 | files = AbstractDump.listFiles(self,runner) |
2241 | 2269 | outputFileList = self.listFiles(runner) |
— | — | @@ -2256,8 +2284,9 @@ |
2257 | 2285 | series = [ recombinePipeline ] |
2258 | 2286 | result = runner.runCommand([ series ], callbackTimed=self.progressCallback, callbackTimedArg=runner, shell = True) |
2259 | 2287 | if result: |
2260 | | - errorresult = result |
2261 | | - return errorresult |
| 2288 | + error = result |
| 2289 | + if (error): |
| 2290 | + raise BackupError("error recombining abstract dump files") |
2262 | 2291 | |
2263 | 2292 | class TitleDump(Dump): |
2264 | 2293 | """This is used by "wikiproxy", a program to add Wikipedia links to BBC news online""" |
— | — | @@ -2271,7 +2300,8 @@ |
2272 | 2301 | retries = retries + 1 |
2273 | 2302 | time.sleep(5) |
2274 | 2303 | error = runner.saveSql(query, runner.dumpDir.publicPath("all-titles-in-ns0.gz")) |
2275 | | - return error |
| 2304 | + if (error): |
| 2305 | + raise BackupError("error dumping titles list") |
2276 | 2306 | |
2277 | 2307 | def listFiles(self, runner): |
2278 | 2308 | return ["all-titles-in-ns0.gz"] |