r92523 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: r92522 | r92523 | r92524 >
Date:11:30, 19 July 2011
Author:ariel
Status:deferred
Tags:
Comment:
run one "chunk" by itself, from command line,
new classes RunInfo and RunInfoFile,
clean up some dead code,
some formatting cleanup,
listFiles functions cleanup in all dump jobs,
add "enabled" arg for notices, dumpruninfo file, md5sum generation, etc.
so these can be turned on and off at will
Modified paths:
  • /branches/ariel/xmldumps-backup/worker.py (modified) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/worker.py
@@ -24,9 +24,6 @@
2525 from WikiDump import FileUtils, MiscUtils, TimeUtils
2626 from CommandManagement import CommandPipeline, CommandSeries, CommandsInParallel
2727
28 -def xmlEscape(text):
29 - return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
30 -
3128 class Logger(object):
3229
3330 def __init__(self, logFileName=None):
@@ -234,7 +231,6 @@
235232
236233
237234 class RunSimpleCommand(object):
238 -
239235 # FIXME rewrite to not use popen2
240236 def runAndReturn(command, logCallback = None):
241237 """Run a command and return the output as a string.
@@ -315,6 +311,146 @@
316312 class BackupError(Exception):
317313 pass
318314
 315+class RunInfoFile(object):
 316+ def __init__(self, wiki, enabled):
 317+ self.wiki = wiki
 318+ self._enabled = enabled
 319+
 320+ def saveDumpRunInfoFile(self, text):
 321+ """Write out a simple text file with the status for this wiki's dump."""
 322+ if (self._enabled):
 323+ try:
 324+ self._writeDumpRunInfoFile(text)
 325+ except:
 326+ print "Couldn't save dump run info file. Continuing anyways"
 327+
 328+ def statusOfOldDumpIsDone(self, runner, date, jobName, jobDesc):
 329+ oldDumpRunInfoFilename=self._getDumpRunInfoFileName(date)
 330+ status = self._getStatusForJobFromRunInfoFile(oldDumpRunInfoFilename, jobName)
 331+ if (status == "done"):
 332+ return 1
 333+ elif (not status == None):
 334+ # failure, in progress, some other useless thing
 335+ return 0
 336+
 337+ # ok, there was no info there to be had, try the index file. yuck.
 338+ indexFilename = os.path.join(runner.wiki.publicDir(), date, runner.config.perDumpIndex)
 339+ status = self._getStatusForJobFromIndexFile(indexFilename, jobDesc)
 340+ if (status == "done"):
 341+ return 1
 342+ else:
 343+ return 0
 344+
 345+ def getOldRunInfoFromFile(self):
 346+ # read the dump run info file in, if there is one, and get info about which dumps
 347+ # have already been run and whether they were successful
 348+ dumpRunInfoFileName = self._getDumpRunInfoFileName()
 349+ results = []
 350+ try:
 351+ infile = open(dumpRunInfoFileName,"r")
 352+ for line in infile:
 353+ results.append(self._getOldRunInfoFromLine(line))
 354+ infile.close
 355+ return results
 356+ except:
 357+ return False
 358+
 359+ #
 360+ # functions internal to the class
 361+ #
 362+ def _getDumpRunInfoFileName(self, date = None):
 363+ # sometimes need to get this info for an older run to check status of a file for
 364+ # possible prefetch
 365+ if (date):
 366+ return os.path.join(self.wiki.publicDir(), date, "dumpruninfo.txt")
 367+ else:
 368+ return os.path.join(self.wiki.publicDir(), self.wiki.date, "dumpruninfo.txt")
 369+
 370+ def _getDumpRunInfoDirName(self, date=None):
 371+ if (date):
 372+ return os.path.join(self.wiki.publicDir(), date);
 373+ else:
 374+ return os.path.join(self.wiki.publicDir(), self.wiki.date);
 375+
 376+ # format: name:%; updated:%; status:%
 377+ def _getOldRunInfoFromLine(self, line):
 378+ # get rid of leading/trailing/blanks
 379+ line = line.strip(" ")
 380+ line = line.replace("\n","")
 381+ fields = line.split(';',3)
 382+ dumpRunInfo = RunInfo()
 383+ for field in fields:
 384+ field = field.strip(" ")
 385+ (fieldName, separator, fieldValue) = field.partition(':')
 386+ if (fieldName == "name"):
 387+ dumpRunInfo.setName(fieldValue)
 388+ elif (fieldName == "status"):
 389+ dumpRunInfo.setStatus(fieldValue,False)
 390+ elif (fieldName == "updated"):
 391+ dumpRunInfo.setUpdated(fieldValue)
 392+ return(dumpRunInfo)
 393+
 394+ def _writeDumpRunInfoFile(self, text):
 395+ directory = self._getDumpRunInfoDirName()
 396+ dumpRunInfoFilename = self._getDumpRunInfoFileName()
 397+ FileUtils.writeFile(directory, dumpRunInfoFilename, text, self.wiki.config.fileperms)
 398+
 399+ # format: name:%; updated:%; status:%
 400+ def _getStatusForJobFromRunInfoFileLine(self, line, jobName):
 401+ # get rid of leading/trailing/embedded blanks
 402+ line = line.replace(" ","")
 403+ line = line.replace("\n","")
 404+ fields = line.split(';',3)
 405+ for field in fields:
 406+ (fieldName, separator, fieldValue) = field.partition(':')
 407+ if (fieldName == "name"):
 408+ if (not fieldValue == jobName):
 409+ return None
 410+ elif (fieldName == "status"):
 411+ return fieldValue
 412+
 413+ def _getStatusForJobFromRunInfoFile(self, filename, jobName = ""):
 414+ # read the dump run info file in, if there is one, and find out whether
 415+ # a particular job (one step only, not a multiple piece job) has been
 416+ # already run and whether it was successful (use to examine status
 417+ # of step from some previous run)
 418+ try:
 419+ infile = open(filename,"r")
 420+ for line in infile:
 421+ result = self._getStatusForJobFromRunInfoFileLine(line, jobName)
 422+ if (not result == None):
 423+ return result
 424+ infile.close
 425+ return None
 426+ except:
 427+ return None
 428+
 429+ # find desc in there, look for "class='done'"
 430+ def _getStatusForJobFromIndexFileLine(self, line, desc):
 431+ if not(">"+desc+"<" in line):
 432+ return None
 433+ if "<li class='done'>" in line:
 434+ return "done"
 435+ else:
 436+ return "other"
 437+
 438+ def _getStatusForJobFromIndexFile(self, filename, desc):
 439+ # read the index file in, if there is one, and find out whether
 440+ # a particular job (one step only, not a multiple piece job) has been
 441+ # already run and whether it was successful (use to examine status
 442+ # of step from some previous run)
 443+ try:
 444+ infile = open(filename,"r")
 445+ for line in infile:
 446+ result = self._getStatusForJobFromIndexFileLine(line, desc)
 447+ if (not result == None):
 448+ return result
 449+ infile.close
 450+ return None
 451+ except:
 452+ return None
 453+
 454+
319455 class RunInfo(object):
320456 def __init__(self, name="", status="", updated="", toBeRun = False):
321457 self._name = name
@@ -347,8 +483,7 @@
348484 self._toBeRun = toBeRun
349485
350486 class DumpItemList(object):
351 -
352 - def __init__(self, wiki, prefetch, spawn, date, chunkInfo):
 487+ def __init__(self, wiki, prefetch, spawn, date, chunkToDo, singleJob, chunkInfo, runInfoFile):
353488 self.date = date
354489 self.wiki = wiki
355490 self._hasFlaggedRevs = self.wiki.hasFlaggedRevs()
@@ -356,7 +491,18 @@
357492 self._prefetch = prefetch
358493 self._spawn = spawn
359494 self.chunkInfo = chunkInfo
 495+ self._chunkToDo = chunkToDo
 496+ self._singleJob = singleJob
 497+ self._runInfoFile = runInfoFile
360498
 499+ if (self._singleJob and self._chunkToDo):
 500+ if (self._singleJob[-5:] == 'table' or
 501+ self._singleJob[-9:] == 'recombine' or
 502+ self._singleJob == 'noop' or
 503+ self._singleJob == 'xmlpagelogsdump' or
 504+ self._singleJob == 'pagetitlesdump'):
 505+ raise BackupError("You cannot specify a chunk with the job %s, exiting.\n" % self._singleJob)
 506+
361507 self.dumpItems = [PrivateTable("user", "usertable", "User account data."),
362508 PrivateTable("watchlist", "watchlisttable", "Users' watchlist settings."),
363509 PrivateTable("ipblocks", "ipblockstable", "Data for blocks of IP addresses, ranges, and users."),
@@ -390,12 +536,12 @@
391537
392538 TitleDump("pagetitlesdump", "List of page titles"),
393539
394 - AbstractDump("abstractsdump","Extracted page abstracts for Yahoo", self.chunkInfo.getPagesPerChunkAbstract())]
 540+ AbstractDump("abstractsdump","Extracted page abstracts for Yahoo", self._getChunkToDo("abstractsdump"), self.chunkInfo.getPagesPerChunkAbstract())]
395541
396542 if (self.chunkInfo.chunksEnabled()):
397543 self.dumpItems.append(RecombineAbstractDump("abstractsdumprecombine", "Recombine extracted page abstracts for Yahoo", self.chunkInfo.getPagesPerChunkAbstract()))
398544
399 - self.dumpItems.append(XmlStub("xmlstubsdump", "First-pass for page XML data dumps", self.chunkInfo.getPagesPerChunkHistory()))
 545+ self.dumpItems.append(XmlStub("xmlstubsdump", "First-pass for page XML data dumps", self._getChunkToDo("xmlstubsdump"), self.chunkInfo.getPagesPerChunkHistory()))
400546 if (self.chunkInfo.chunksEnabled()):
401547 self.dumpItems.append(RecombineXmlStub("xmlstubsdumprecombine", "Recombine first-pass for page XML data dumps", self.chunkInfo.getPagesPerChunkHistory()))
402548
@@ -404,7 +550,7 @@
405551 XmlDump("articles",
406552 "articlesdump",
407553 "<big><b>Articles, templates, image descriptions, and primary meta-pages.</b></big>",
408 - "This contains current versions of article content, and is the archive most mirror sites will probably want.", self._prefetch, self._spawn, self.chunkInfo.getPagesPerChunkHistory()))
 554+ "This contains current versions of article content, and is the archive most mirror sites will probably want.", self._prefetch, self._spawn, self._getChunkToDo("articlesdump"), self.chunkInfo.getPagesPerChunkHistory()))
409555 if (self.chunkInfo.chunksEnabled()):
410556 self.dumpItems.append(RecombineXmlDump("articles","articlesdumprecombine", "<big><b>Recombine articles, templates, image descriptions, and primary meta-pages.</b></big>","This contains current versions of article content, and is the archive most mirror sites will probably want.", self.chunkInfo.getPagesPerChunkHistory()))
411557
@@ -412,7 +558,7 @@
413559 XmlDump("meta-current",
414560 "metacurrentdump",
415561 "All pages, current versions only.",
416 - "Discussion and user pages are included in this complete archive. Most mirrors won't want this extra material.", self._prefetch, self._spawn, self.chunkInfo.getPagesPerChunkHistory()))
 562+ "Discussion and user pages are included in this complete archive. Most mirrors won't want this extra material.", self._prefetch, self._spawn, self._getChunkToDo("metacurrentdump"), self.chunkInfo.getPagesPerChunkHistory()))
417563
418564 if (self.chunkInfo.chunksEnabled()):
419565 self.dumpItems.append(RecombineXmlDump("meta-current","metacurrentdumprecombine", "Recombine all pages, current versions only.","Discussion and user pages are included in this complete archive. Most mirrors won't want this extra material.", self.chunkInfo.getPagesPerChunkHistory()))
@@ -432,7 +578,7 @@
433579 "metahistorybz2dump",
434580 "All pages with complete page edit history (.bz2)",
435581 "These dumps can be *very* large, uncompressing up to 20 times the archive download size. " +
436 - "Suitable for archival and statistical use, most mirror sites won't want or need this.", self._prefetch, self._spawn, self.chunkInfo.getPagesPerChunkHistory()))
 582+ "Suitable for archival and statistical use, most mirror sites won't want or need this.", self._prefetch, self._spawn, self._getChunkToDo("metahistorybz2dump"), self.chunkInfo.getPagesPerChunkHistory()))
437583 if (self.chunkInfo.chunksEnabled() and self.chunkInfo.recombineHistory()):
438584 self.dumpItems.append(
439585 RecombineXmlDump("meta-history",
@@ -445,7 +591,7 @@
446592 "metahistory7zdump",
447593 "All pages with complete edit history (.7z)",
448594 "These dumps can be *very* large, uncompressing up to 100 times the archive download size. " +
449 - "Suitable for archival and statistical use, most mirror sites won't want or need this.", self.chunkInfo.getPagesPerChunkHistory()))
 595+ "Suitable for archival and statistical use, most mirror sites won't want or need this.", self._getChunkToDo("metahistory7zdump"), self.chunkInfo.getPagesPerChunkHistory()))
450596 if (self.chunkInfo.chunksEnabled() and self.chunkInfo.recombineHistory()):
451597 self.dumpItems.append(
452598 RecombineXmlRecompressDump("meta-history",
@@ -453,76 +599,15 @@
454600 "Recombine all pages with complete edit history (.7z)",
455601 "These dumps can be *very* large, uncompressing up to 100 times the archive download size. " +
456602 "Suitable for archival and statistical use, most mirror sites won't want or need this.", self.chunkInfo.getPagesPerChunkHistory()))
457 - self.oldRunInfoRetrieved = self._getOldRunInfoFromFile()
458 -
459 -
460 -
461 - # read in contents from dump run info file and stuff into dumpItems for later reference
462 -
463 - # sometimes need to get this info for an older run to check status of a file for
464 - # possible prefetch
465 - def _getDumpRunInfoFileName(self, date=None):
466 - if (date):
467 - return os.path.join(self.wiki.publicDir(), date, "dumpruninfo.txt")
 603+ results = self._runInfoFile.getOldRunInfoFromFile()
 604+ if (results):
 605+ for runInfoObj in results:
 606+ self._setDumpItemRunInfo(runInfoObj)
 607+ self.oldRunInfoRetrieved = True
468608 else:
469 - return os.path.join(self.wiki.publicDir(), self.date, "dumpruninfo.txt")
470 -
471 - def _getDumpRunInfoDirName(self, date=None):
472 - if (date):
473 - return os.path.join(self.wiki.publicDir(), date);
474 - else:
475 - return os.path.join(self.wiki.publicDir(), self.date);
476 -
477 - def _setDumpItemRunInfo(self, runInfo):
478 - if (not runInfo.name()):
479 - return False
480 - for item in self.dumpItems:
481 - if (item.name() == runInfo.name()):
482 - item.setStatus(runInfo.status(),False)
483 - item.setUpdated(runInfo.updated())
484 - item.setToBeRun(runInfo.toBeRun())
485 - return True
486 - return False
487 -
488 - # format: name:%; updated:%; status:%
489 - def _getOldRunInfoFromLine(self, line):
490 - # get rid of leading/trailing/blanks
491 - line = line.strip(" ")
492 - line = line.replace("\n","")
493 - fields = line.split(';',3)
494 - dumpRunInfo = RunInfo()
495 - for field in fields:
496 - field = field.strip(" ")
497 - (fieldName, separator, fieldValue) = field.partition(':')
498 - if (fieldName == "name"):
499 - dumpRunInfo.setName(fieldValue)
500 - elif (fieldName == "status"):
501 - dumpRunInfo.setStatus(fieldValue,False)
502 - elif (fieldName == "updated"):
503 - dumpRunInfo.setUpdated(fieldValue)
504 - self._setDumpItemRunInfo(dumpRunInfo)
505 -
506 - def _getOldRunInfoFromFile(self):
507 - # read the dump run info file in, if there is one, and get info about which dumps
508 - # have already been run and whether they were successful
509 - dumpRunInfoFileName = self._getDumpRunInfoFileName()
510 - try:
511 - infile = open(dumpRunInfoFileName,"r")
512 - for line in infile:
513 - self._getOldRunInfoFromLine(line)
514 - infile.close
515 - return True
516 - except:
517 - return False
518 -
519 - # write dump run info file
520 - # (this file is rewritten with updates after each dumpItem completes)
521 -
522 - def _reportDumpRunInfoLine(self, item):
523 - # even if the item has never been run we will at least have "waiting" in the status
524 - return "name:%s; status:%s; updated:%s" % (item.name(), item.status(), item.updated())
525 -
526 - def _reportDumpRunInfo(self, done=False):
 609+ self.oldRunInfoRetrieved = False
 610+
 611+ def reportDumpRunInfo(self, done=False):
527612 """Put together a dump run info listing for this database, with all its component dumps."""
528613 runInfoLines = [self._reportDumpRunInfoLine(item) for item in self.dumpItems]
529614 runInfoLines.reverse()
@@ -530,18 +615,6 @@
531616 text = text + "\n"
532617 return text
533618
534 - def writeDumpRunInfoFile(self, text):
535 - directory = self._getDumpRunInfoDirName()
536 - dumpRunInfoFilename = self._getDumpRunInfoFileName()
537 - FileUtils.writeFile(directory, dumpRunInfoFilename, text, self.wiki.config.fileperms)
538 -
539 - def saveDumpRunInfoFile(self, done=False):
540 - """Write out a simple text file with the status for this wiki's dump."""
541 - try:
542 - self.writeDumpRunInfoFile(self._reportDumpRunInfo(done))
543 - except:
544 - print "Couldn't save dump run info file. Continuing anyways"
545 -
546619 def allPossibleJobsDone(self):
547620 for item in self.dumpItems:
548621 if (item.status() != "done" and item.status() != "failed"):
@@ -583,7 +656,6 @@
584657 break
585658
586659 # see whether job needs previous jobs that have not completed successfully
587 -
588660 def jobDoneSuccessfully(self, job):
589661 for item in self.dumpItems:
590662 if (item.name() == job):
@@ -622,29 +694,72 @@
623695 return False
624696 return True
625697
 698+ def _getChunkToDo(self, jobName):
 699+ if (self._singleJob):
 700+ if (self._singleJob == jobName):
 701+ return(self._chunkToDo)
 702+ return(False)
 703+
 704+ # read in contents from dump run info file and stuff into dumpItems for later reference
 705+ def _setDumpItemRunInfo(self, runInfo):
 706+ if (not runInfo.name()):
 707+ return False
 708+ for item in self.dumpItems:
 709+ if (item.name() == runInfo.name()):
 710+ item.setStatus(runInfo.status(),False)
 711+ item.setUpdated(runInfo.updated())
 712+ item.setToBeRun(runInfo.toBeRun())
 713+ return True
 714+ return False
 715+
 716+ # write dump run info file
 717+ # (this file is rewritten with updates after each dumpItem completes)
 718+ def _reportDumpRunInfoLine(self, item):
 719+ # even if the item has never been run we will at least have "waiting" in the status
 720+ return "name:%s; status:%s; updated:%s" % (item.name(), item.status(), item.updated())
 721+
626722 class Checksummer(object):
627 - def __init__(self,wiki,dumpDir):
 723+ def __init__(self,wiki,dumpDir, enabled = True):
628724 self.wiki = wiki
629725 self.dumpDir = dumpDir
630726 self.timestamp = time.strftime("%Y%m%d%H%M%S", time.gmtime())
631 -
 727+ self._enabled = enabled
 728+
 729+ def prepareChecksums(self):
 730+ """Create a temporary md5 checksum file.
 731+ Call this at the start of the dump run, and move the file
 732+ into the final location at the completion of the dump run."""
 733+ if (self._enabled):
 734+ checksumFileName = self._getChecksumFileNameTmp()
 735+ output = file(checksumFileName, "w")
 736+
 737+ def checksum(self, filename, runner):
 738+ """Run checksum for an output file, and append to the list."""
 739+ if (self._enabled):
 740+ checksumFileName = self._getChecksumFileNameTmp()
 741+ output = file(checksumFileName, "a")
 742+ self._saveChecksum(filename, output, runner)
 743+ output.close()
 744+
 745+ def moveMd5FileIntoPlace(self):
 746+ if (self._enabled):
 747+ tmpFileName = self._getChecksumFileNameTmp()
 748+ realFileName = self._getChecksumFileName()
 749+ os.rename(tmpFileName, realFileName)
 750+
632751 def getChecksumFileNameBasename(self):
633752 return ("md5sums.txt")
634753
635 - def getChecksumFileName(self):
 754+ #
 755+ # functions internal to the class
 756+ #
 757+ def _getChecksumFileName(self):
636758 return (self.dumpDir.publicPath(self.getChecksumFileNameBasename()))
637759
638 - def getChecksumFileNameTmp(self):
 760+ def _getChecksumFileNameTmp(self):
639761 return (self.dumpDir.publicPath(self.getChecksumFileNameBasename() + "." + self.timestamp + ".tmp"))
640762
641 - def prepareChecksums(self):
642 - """Create a temporary md5 checksum file.
643 - Call this at the start of the dump run, and move the file
644 - into the final location at the completion of the dump run."""
645 - checksumFileName = self.getChecksumFileNameTmp()
646 - output = file(checksumFileName, "w")
647 -
648 - def md5File(self, filename):
 763+ def _md5File(self, filename):
649764 summer = md5.new()
650765 infile = file(filename, "rb")
651766 bufsize = 4192 * 32
@@ -655,28 +770,16 @@
656771 infile.close()
657772 return summer.hexdigest()
658773
659 - def md5FileLine(self, filename):
660 - return "%s %s\n" % (self.md5File(filename), os.path.basename(filename))
 774+ def _md5FileLine(self, filename):
 775+ return "%s %s\n" % (self._md5File(filename), os.path.basename(filename))
661776
662 - def saveChecksum(self, file, output, runner):
 777+ def _saveChecksum(self, file, output, runner):
663778 runner.debug("Checksumming %s" % file)
664779 path = self.dumpDir.publicPath(file)
665780 if os.path.exists(path):
666 - checksum = self.md5FileLine(path)
 781+ checksum = self._md5FileLine(path)
667782 output.write(checksum)
668783
669 - def checksum(self, filename, runner):
670 - """Run checksum for an output file, and append to the list."""
671 - checksumFileName = self.getChecksumFileNameTmp()
672 - output = file(checksumFileName, "a")
673 - self.saveChecksum(filename, output, runner)
674 - output.close()
675 -
676 - def moveMd5FileIntoPlace(self):
677 - tmpFileName = self.getChecksumFileNameTmp()
678 - realFileName = self.getChecksumFileName()
679 - os.rename(tmpFileName, realFileName)
680 -
681784 class DumpDir(object):
682785 def __init__(self, wiki, dbName, date):
683786 self._wiki = wiki
@@ -711,7 +814,7 @@
712815 # everything that has to do with reporting the status of a piece
713816 # of a dump is collected here
714817 class Status(object):
715 - def __init__(self, wiki, dumpDir, date, items, checksums, noticeFile = None, errorCallback=None):
 818+ def __init__(self, wiki, dumpDir, date, items, checksums, enabled, noticeFile = None, errorCallback=None):
716819 self.wiki = wiki
717820 self.config = wiki.config
718821 self.dbName = wiki.dbName
@@ -723,16 +826,48 @@
724827 self.noticeFile = noticeFile
725828 self.errorCallback = errorCallback
726829 self.failCount = 0
 830+ self._enabled = enabled
727831
728 - def saveStatusSummaryAndDetail(self, done=False):
 832+ def updateStatusFiles(self, done=False):
 833+ if self._enabled:
 834+ self._saveStatusSummaryAndDetail(done)
 835+
 836+ def reportFailure(self):
 837+ if self._enabled:
 838+ if self.config.adminMail:
 839+ subject = "Dump failure for " + self.dbName
 840+ message = self.config.readTemplate("errormail.txt") % {
 841+ "db": self.dbName,
 842+ "date": self.date,
 843+ "time": TimeUtils.prettyTime(),
 844+ "url": "/".join((self.config.webRoot, self.dbName, self.date, ''))}
 845+ config.mail(subject, message)
 846+
 847+ # this is a per-dump-item report (well per file generated by the item)
 848+ # Report on the file size & item status of the current output and output a link if we are done
 849+ def reportFile(self, file, itemStatus):
 850+ filepath = self.dumpDir.publicPath(file)
 851+ if itemStatus == "in-progress" and exists (filepath):
 852+ size = FileUtils.prettySize(getsize(filepath))
 853+ return "<li class='file'>%s %s (written) </li>" % (file, size)
 854+ elif itemStatus == "done" and exists(filepath):
 855+ size = FileUtils.prettySize(getsize(filepath))
 856+ webpath = self.dumpDir.webPath(file)
 857+ return "<li class='file'><a href=\"%s\">%s</a> %s</li>" % (webpath, file, size)
 858+ else:
 859+ return "<li class='missing'>%s</li>" % file
 860+
 861+ #
 862+ # functions internal to the class
 863+ #
 864+ def _saveStatusSummaryAndDetail(self, done=False):
729865 """Write out an HTML file with the status for this wiki's dump
730866 and links to completed files, as well as a summary status in a separate file."""
731867 try:
732868 # Comprehensive report goes here
733 - self.wiki.writePerDumpIndex(self.reportDatabaseStatusDetailed(done))
734 -
 869+ self.wiki.writePerDumpIndex(self._reportDatabaseStatusDetailed(done))
735870 # Short line for report extraction goes here
736 - self.wiki.writeStatus(self.reportDatabaseStatusSummary(done))
 871+ self.wiki.writeStatus(self._reportDatabaseStatusSummary(done))
737872 except:
738873 message = "Couldn't update status files. Continuing anyways"
739874 if self.errorCallback:
@@ -740,47 +875,34 @@
741876 else:
742877 print(message)
743878
744 - def updateStatusFiles(self, done=False):
745 - self.saveStatusSummaryAndDetail(done)
746 -
747 - def reportDatabaseStatusSummary(self, done=False):
 879+ def _reportDatabaseStatusSummary(self, done = False):
748880 """Put together a brief status summary and link for the current database."""
749 - status = self.reportStatusSummaryLine(done)
 881+ status = self._reportStatusSummaryLine(done)
750882 html = self.wiki.reportStatusLine(status)
751883
752884 activeItems = [x for x in self.items if x.status() == "in-progress"]
753885 if activeItems:
754 - return html + "<ul>" + "\n".join([self.reportItem(x) for x in activeItems]) + "</ul>"
 886+ return html + "<ul>" + "\n".join([self._reportItem(x) for x in activeItems]) + "</ul>"
755887 else:
756888 return html
757889
758 - def reportDatabaseStatusDetailed(self, done=False):
 890+ def _reportDatabaseStatusDetailed(self, done = False):
759891 """Put together a status page for this database, with all its component dumps."""
760892 self.noticeFile.refreshNotice()
761 - statusItems = [self.reportItem(item) for item in self.items]
 893+ statusItems = [self._reportItem(item) for item in self.items]
762894 statusItems.reverse()
763895 html = "\n".join(statusItems)
764896 return self.config.readTemplate("report.html") % {
765897 "db": self.dbName,
766898 "date": self.date,
767899 "notice": self.noticeFile.notice,
768 - "status": self.reportStatusSummaryLine(done),
769 - "previous": self.reportPreviousDump(done),
 900+ "status": self._reportStatusSummaryLine(done),
 901+ "previous": self._reportPreviousDump(done),
770902 "items": html,
771903 "checksum": self.dumpDir.webPath(self.checksums.getChecksumFileNameBasename()),
772904 "index": self.config.index}
773905
774 - def reportFailure(self):
775 - if self.config.adminMail:
776 - subject = "Dump failure for " + self.dbName
777 - message = self.config.readTemplate("errormail.txt") % {
778 - "db": self.dbName,
779 - "date": self.date,
780 - "time": TimeUtils.prettyTime(),
781 - "url": "/".join((self.config.webRoot, self.dbName, self.date, ''))}
782 - config.mail(subject, message)
783 -
784 - def reportPreviousDump(self, done):
 906+ def _reportPreviousDump(self, done):
785907 """Produce a link to the previous dump, if any"""
786908 # get the list of dumps for this wiki in order, find me in the list, find the one prev to me.
787909 # why? we might be rerunning a job from an older dumps. we might have two
@@ -805,7 +927,7 @@
806928 message = "previous dump from"
807929 return "%s<a href=\"../%s/\">%s %s</a>" % (prefix, rawDate, message, prettyDate)
808930
809 - def reportStatusSummaryLine(self, done=False):
 931+ def _reportStatusSummaryLine(self, done=False):
810932 if (done == "done"):
811933 classes = "done"
812934 text = "Dump complete"
@@ -824,12 +946,15 @@
825947 text += ", %d item%s failed" % (self.failCount, ess)
826948 return "<span class='%s'>%s</span>" % (classes, text)
827949
828 - def reportItem(self, item):
 950+ def _reportItem(self, item):
829951 """Return an HTML fragment with info on the progress of this item."""
 952+ item.status()
 953+ item.updated()
 954+ item.description()
830955 html = "<li class='%s'><span class='updates'>%s</span> <span class='status'>%s</span> <span class='title'>%s</span>" % (item.status(), item.updated(), item.status(), item.description())
831956 if item.progress:
832957 html += "<div class='progress'>%s</div>\n" % item.progress
833 - files = item.listFiles(self)
 958+ files = item.listOutputFiles(self)
834959 if files:
835960 listItems = [self.reportFile(file, item.status()) for file in files]
836961 html += "<ul>"
@@ -841,59 +966,53 @@
842967 html += "</li>"
843968 return html
844969
845 - # this is a per-dump-item report (well per file generated by the item)
846 - # Report on the file size & item status of the current output and output a link if we are done
847 - def reportFile(self, file, itemStatus):
848 - filepath = self.dumpDir.publicPath(file)
849 - if itemStatus == "in-progress" and exists (filepath):
850 - size = FileUtils.prettySize(getsize(filepath))
851 - return "<li class='file'>%s %s (written) </li>" % (file, size)
852 - elif itemStatus == "done" and exists(filepath):
853 - size = FileUtils.prettySize(getsize(filepath))
854 - webpath = self.dumpDir.webPath(file)
855 - return "<li class='file'><a href=\"%s\">%s</a> %s</li>" % (webpath, file, size)
856 - else:
857 - return "<li class='missing'>%s</li>" % file
858 -
859970 class NoticeFile(object):
860 - def __init__(self, wiki, date, notice):
 971+ def __init__(self, wiki, date, notice, enabled):
861972 self.wiki = wiki
862973 self.date = date
863974 self.notice = notice
 975+ self._enabled = enabled
 976+ self.writeNoticeFile()
864977
865 - noticeFile = self.getNoticeFilename()
866 - # delnotice. toss any existing file
867 - if self.notice == False:
868 - if exists(noticeFile):
869 - os.remove(noticeFile)
870 - self.notice = ""
871 - # addnotice, stuff notice in a file for other jobs etc
872 - elif self.notice != "":
873 - noticeDir = self.getNoticeDir()
874 - FileUtils.writeFile(noticeDir, noticeFile, self.notice, self.wiki.config.fileperms)
875 - # default case. if there is a file get the contents, otherwise
876 - # we have empty contents, all good
877 - else:
878 - if exists(noticeFile):
879 - self.notice = FileUtils.readFile(noticeFile)
880 -
881 - def getNoticeFilename(self):
882 - return os.path.join(self.wiki.publicDir(), self.date, "notice.txt")
 978+ def writeNoticeFile(self):
 979+ if (self._enabled):
 980+ noticeFile = self._getNoticeFilename()
 981+ # delnotice. toss any existing file
 982+ if self.notice == False:
 983+ if exists(noticeFile):
 984+ os.remove(noticeFile)
 985+ self.notice = ""
 986+ # addnotice, stuff notice in a file for other jobs etc
 987+ elif self.notice != "":
 988+ noticeDir = self._getNoticeDir()
 989+ FileUtils.writeFile(noticeDir, noticeFile, self.notice, self.wiki.config.fileperms)
 990+ # default case. if there is a file get the contents, otherwise
 991+ # we have empty contents, all good
 992+ else:
 993+ if exists(noticeFile):
 994+ self.notice = FileUtils.readFile(noticeFile)
883995
884 - def getNoticeDir(self):
885 - return os.path.join(self.wiki.publicDir(), self.date);
886 -
887996 def refreshNotice(self):
888997 # if the notice file has changed or gone away, we comply.
889 - noticeFile = self.getNoticeFilename()
 998+ noticeFile = self._getNoticeFilename()
890999 if exists(noticeFile):
8911000 self.notice = FileUtils.readFile(noticeFile)
8921001 else:
8931002 self.notice = ""
8941003
 1004+
 1005+ #
 1006+ # functions internal to class
 1007+ #
 1008+ def _getNoticeFilename(self):
 1009+ return os.path.join(self.wiki.publicDir(), self.date, "notice.txt")
 1010+
 1011+ def _getNoticeDir(self):
 1012+ return os.path.join(self.wiki.publicDir(), self.date);
 1013+
8951014 class Runner(object):
8961015
897 - def __init__(self, wiki, date=None, prefetch=True, spawn=True, job=None, restart=False, notice="", dryrun = False, loggingEnabled=False):
 1016+ def __init__(self, wiki, date=None, prefetch=True, spawn=True, job=None, restart=False, notice="", dryrun = False, loggingEnabled=False, chunkToDo = False):
8981017 self.wiki = wiki
8991018 self.config = wiki.config
9001019 self.dbName = wiki.dbName
@@ -905,7 +1024,24 @@
9061025 self.htmlNoticeFile = None
9071026 self.log = None
9081027 self.dryrun = dryrun
 1028+ self._chunkToDo = chunkToDo
 1029+ self._statusEnabled = True
 1030+ self._checksummerEnabled = True
 1031+ self._runInfoFileEnabled = True
 1032+ self._symLinksEnabled = True
 1033+ self._feedsEnabled = True
 1034+ self._noticeFileEnabled = True
9091035
 1036+ if self.dryrun or self._chunkToDo:
 1037+ self._statusEnabled = False
 1038+ self._checksummerEnabled = False
 1039+ self._runInfoFileEnabled = False
 1040+ self._symLinksEnabled = False
 1041+ self._feedsEnabled = False
 1042+ self._noticeFileEnabled = False
 1043+ if self.dryrun:
 1044+ self.loggingEnabled = False
 1045+
9101046 if date:
9111047 # Override, continuing a past dump?
9121048 self.date = date
@@ -913,34 +1049,28 @@
9141050 self.date = TimeUtils.today()
9151051 wiki.setDate(self.date)
9161052
917 - self.lastFailed = False
918 -
9191053 self.jobRequested = job
9201054 self.dbServerInfo = DbServerInfo(self.wiki, self.dbName, self.logAndPrint)
921 -
9221055 self.dumpDir = DumpDir(self.wiki, self.dbName, self.date)
9231056
924 - # this must come after the dumpdir setup so we know which directory we are in
925 - # for the log file.
926 - if (loggingEnabled and not self.dryrun):
 1057+ self.lastFailed = False
 1058+
 1059+ # these must come after the dumpdir setup so we know which directory we are in
 1060+ if (loggingEnabled):
9271061 self.logFileName = self.dumpDir.publicPath(config.logFile)
9281062 self.makeDir(join(self.wiki.publicDir(), self.date))
9291063 self.log = Logger(self.logFileName)
9301064 thread.start_new_thread(self.logQueueReader,(self.log,))
 1065+ self.runInfoFile = RunInfoFile(wiki,self._runInfoFileEnabled)
 1066+ self.symLinks = SymLinks(self.wiki, self.dumpDir, self. date, self.logAndPrint, self.debug, self._symLinksEnabled)
 1067+ self.feeds = Feeds(self.wiki,self.dumpDir,self.config, self.dbName, self.debug, self._feedsEnabled)
 1068+ self.htmlNoticeFile = NoticeFile(self.wiki, self.date, notice, self._noticeFileEnabled)
 1069+ self.checksums = Checksummer(self.wiki, self.dumpDir, self._checksummerEnabled)
9311070
932 - # have to handle the notice file here instead of main, it goes in the per-run directory
933 - if not dryrun:
934 - self.htmlNoticeFile = NoticeFile(self.wiki, self.date, notice)
935 -
936 - if not dryrun:
937 - self.checksums = Checksummer(self.wiki, self.dumpDir)
938 -
9391071 # some or all of these dumpItems will be marked to run
940 - self.dumpItemList = DumpItemList(self.wiki, self.prefetch, self.spawn, self.date, self.chunkInfo);
 1072+ self.dumpItemList = DumpItemList(self.wiki, self.prefetch, self.spawn, self.date, self._chunkToDo, self.jobRequested, self.chunkInfo, self.runInfoFile);
 1073+ self.status = Status(self.wiki, self.dumpDir, self.date, self.dumpItemList.dumpItems, self.checksums, self._statusEnabled, self.htmlNoticeFile, self.logAndPrint)
9411074
942 - if not self.dryrun:
943 - self.status = Status(self.wiki, self.dumpDir, self.date, self.dumpItemList.dumpItems, self.checksums, self.htmlNoticeFile, self.logAndPrint)
944 -
9451075 def logQueueReader(self,log):
9461076 if not log:
9471077 return
@@ -1045,19 +1175,20 @@
10461176 self.lastFailed = True
10471177
10481178 def runUpdateItemFileInfo(self, item):
1049 - for f in item.listFiles(self):
 1179+ for f in item.listOutputFiles(self):
10501180 print f
10511181 if exists(self.dumpDir.publicPath(f)):
10521182 # why would the file not exist? because we changed chunk numbers in the
10531183 # middle of a run, and now we list more files for the next stage than there
10541184 # were for earlier ones
1055 - self.saveSymlink(f)
1056 - self.saveFeed(f)
 1185+ self.symLinks.saveSymlink(f)
 1186+ self.feeds.saveFeed(f)
10571187 self.checksums.checksum(f, self)
10581188
10591189 def run(self):
10601190 if (self.jobRequested):
10611191 if ((not self.dumpItemList.oldRunInfoRetrieved) and (self.wiki.existsPerDumpIndex())):
 1192+
10621193 # There was a previous run of all or part of this date, but...
10631194 # There was no old RunInfo to be had (or an error was encountered getting it)
10641195 # so we can't rerun a step and keep all the status information about the old run around.
@@ -1097,41 +1228,36 @@
10981229 self.cleanOldDumps()
10991230 self.showRunnerState("Starting backup of %s" % self.dbName)
11001231
1101 - files = self.listFilesFor(self.dumpItemList.dumpItems)
1102 -
11031232 if (self.jobRequested):
1104 - if not self.dryrun:
1105 - self.checksums.prepareChecksums()
 1233+ self.checksums.prepareChecksums()
11061234
11071235 for item in self.dumpItemList.dumpItems:
11081236 if (item.toBeRun()):
11091237 item.start(self)
1110 - if not self.dryrun:
1111 - self.status.updateStatusFiles()
1112 - self.dumpItemList.saveDumpRunInfoFile()
 1238+ self.status.updateStatusFiles()
 1239+ self.runInfoFile.saveDumpRunInfoFile(self.dumpItemList.reportDumpRunInfo())
11131240 try:
11141241 item.dump(self)
11151242 except Exception, ex:
11161243 self.debug("*** exception! " + str(ex))
11171244 item.setStatus("failed")
1118 - if item.status() == "failed" and not self.dryrun:
 1245+ if item.status() == "failed" and not self.dryrun and not self._chunkToDo:
11191246 self.runHandleFailure()
11201247 else:
11211248 self.lastFailed = False
11221249 # this ensures that, previous run or new one, the old or new md5sums go to the file
1123 - if item.status() == "done" and not self.dryrun:
 1250+ if item.status() == "done" and not self.dryrun and not self._chunkToDo:
11241251 self.runUpdateItemFileInfo(item)
11251252
1126 - if not self.dryrun:
1127 - if (self.dumpItemList.allPossibleJobsDone()):
1128 - self.status.updateStatusFiles("done")
1129 - else:
1130 - self.status.updateStatusFiles("partialdone")
1131 - self.dumpItemList.saveDumpRunInfoFile()
1132 -
 1253+ if (self.dumpItemList.allPossibleJobsDone()):
 1254+ self.status.updateStatusFiles("done")
 1255+ else:
 1256+ self.status.updateStatusFiles("partialdone")
 1257+ self.runInfoFile.saveDumpRunInfoFile(self.dumpItemList.reportDumpRunInfo())
 1258+ if not self.dryrun and not self._chunkToDo:
11331259 # if any job succeeds we might as well make the sym link
11341260 if (self.status.failCount < 1):
1135 - self.completeDump(files)
 1261+ self.completeDump()
11361262
11371263 if (self.restart):
11381264 self.showRunnerState("Completed run restarting from job %s for %s" % (self.jobRequested, self.dbName))
@@ -1139,31 +1265,29 @@
11401266 self.showRunnerState("Completed job %s for %s" % (self.jobRequested, self.dbName))
11411267
11421268 else:
1143 - if not self.dryrun:
1144 - self.checksums.prepareChecksums()
 1269+ self.checksums.prepareChecksums()
11451270
11461271 for item in self.dumpItemList.dumpItems:
11471272 item.start(self)
1148 - if not self.dryrun:
1149 - self.status.updateStatusFiles()
1150 - self.dumpItemList.saveDumpRunInfoFile()
 1273+ self.status.updateStatusFiles()
 1274+ self.runInfoFile.saveDumpRunInfoFile(self.dumpItemList.reportDumpRunInfo())
11511275 try:
11521276 item.dump(self)
11531277 except Exception, ex:
11541278 self.debug("*** exception! " + str(ex))
11551279 item.setStatus("failed")
1156 - if item.status() == "failed" and not self.dryrun:
 1280+ if item.status() == "failed" and not self.dryrun and not self._chunkToDo:
11571281 self.runHandleFailure()
11581282 else:
1159 - if not self.dryrun:
 1283+ if not self.dryrun and not self._chunkToDo:
11601284 self.runUpdateItemFileInfo(item)
11611285 self.lastFailed = False
11621286
1163 - if not self.dryrun:
1164 - self.status.updateStatusFiles("done")
1165 - self.dumpItemList.saveDumpRunInfoFile()
 1287+ self.status.updateStatusFiles("done")
 1288+ if not self.dryrun and not self._chunkToDo:
 1289+ self.runInfoFile.saveDumpRunInfoFile(self.dumpItemList.reportDumpRunInfo())
11661290 if self.status.failCount < 1:
1167 - self.completeDump(files)
 1291+ self.completeDump()
11681292
11691293 self.showRunnerStateComplete()
11701294
@@ -1180,57 +1304,25 @@
11811305 if old:
11821306 for dump in old:
11831307 self.showRunnerState("Purging old dump %s for %s" % (dump, self.dbName))
1184 - if not self.dryrun:
 1308+ if not self.dryrun and not self._chunkToDo:
11851309 base = os.path.join(self.wiki.publicDir(), dump)
11861310 shutil.rmtree("%s" % base)
11871311 else:
11881312 self.showRunnerState("No old dumps to purge.")
11891313
1190 - def listFilesFor(self, items):
1191 - files = []
1192 - for item in items:
1193 - for file in item.listFiles(self):
1194 - files.append(file)
1195 - return files
1196 -
1197 -
1198 - def lockFileName(self):
1199 - return self.dumpDir.publicPath("lock")
1200 -
1201 - def doneFileName(self):
1202 - return self.dumpDir.publicPath("done")
1203 -
1204 - def lock(self):
1205 - self.showRunnerState("Creating lock file.")
1206 - lockfile = self.lockFileName()
1207 - donefile = self.doneFileName()
1208 - if exists(lockfile):
1209 - raise BackupError("Lock file %s already exists" % lockfile)
1210 - if exists(donefile):
1211 - self.showRunnerState("Removing completion marker %s" % donefile)
1212 - os.remove(donefile)
1213 - try:
1214 - os.remove(lockfile)
1215 - except:
1216 - # failure? let it die
1217 - pass
1218 -
1219 - def unlock(self):
1220 - self.showRunnerState("Marking complete.")
1221 -
12221314 def showRunnerState(self, message):
12231315 self.debug(message)
12241316
12251317 def showRunnerStateComplete(self):
12261318 self.debug("SUCCESS: done.")
12271319
1228 - def completeDump(self, files):
 1320+ def completeDump(self):
12291321 # note that it's possible for links in "latest" to point to
12301322 # files from different runs, in which case the md5sums file
12311323 # will have accurate checksums for the run for which it was
12321324 # produced, but not the other files. FIXME
12331325 self.checksums.moveMd5FileIntoPlace()
1234 - self.saveSymlink(self.checksums.getChecksumFileNameBasename())
 1326+ self.symLinks.saveSymlink(self.checksums.getChecksumFileNameBasename())
12351327
12361328 def makeDir(self, dir):
12371329 if exists(dir):
@@ -1239,34 +1331,67 @@
12401332 self.debug("Creating %s ..." % dir)
12411333 os.makedirs(dir)
12421334
 1335+class SymLinks(object):
 1336+ def __init__(self, wiki, dumpDir, date, logfn, debugfn, enabled):
 1337+ self.wiki = wiki
 1338+ self.dumpDir = dumpDir
 1339+ self.date = date
 1340+ self._enabled = enabled
 1341+ self.logfn = logfn
 1342+ self.debugfn = debugfn
 1343+
 1344+ def makeDir(self, dir):
 1345+ if exists(dir):
 1346+ self.debugfn("Checkdir dir %s ..." % dir)
 1347+ else:
 1348+ self.debugfn("Creating %s ..." % dir)
 1349+ os.makedirs(dir)
 1350+
12431351 def saveSymlink(self, file):
1244 - self.makeDir(join(self.wiki.publicDir(), 'latest'))
1245 - real = self.dumpDir.publicPath(file)
1246 - link = self.dumpDir.latestPath(file)
1247 - if exists(link) or os.path.islink(link):
1248 - if os.path.islink(link):
1249 - realfile = os.readlink(link)
1250 - # format of these links should be... ../20110228/elwikidb-20110228-templatelinks.sql.gz
1251 - rellinkpattern = re.compile('^\.\./(20[0-9]+)/');
1252 - dateinlink = rellinkpattern.search(realfile)
1253 - if (dateinlink):
1254 - dateoflinkedfile = dateinlink.group(1)
1255 - dateinterval = int(self.date) - int(dateoflinkedfile)
 1352+ if (self._enabled):
 1353+ self.makeDir(join(self.wiki.publicDir(), 'latest'))
 1354+ real = self.dumpDir.publicPath(file)
 1355+ link = self.dumpDir.latestPath(file)
 1356+ if exists(link) or os.path.islink(link):
 1357+ if os.path.islink(link):
 1358+ realfile = os.readlink(link)
 1359+ # format of these links should be... ../20110228/elwikidb-20110228-templatelinks.sql.gz
 1360+ rellinkpattern = re.compile('^\.\./(20[0-9]+)/');
 1361+ dateinlink = rellinkpattern.search(realfile)
 1362+ if (dateinlink):
 1363+ dateoflinkedfile = dateinlink.group(1)
 1364+ dateinterval = int(self.date) - int(dateoflinkedfile)
 1365+ else:
 1366+ dateinterval = 0
 1367+ # no file or it's older than ours... *then* remove the link
 1368+ if not exists(os.path.realpath(link)) or dateinterval > 0:
 1369+ self.debug("Removing old symlink %s" % link)
 1370+ os.remove(link)
12561371 else:
1257 - dateinterval = 0
1258 - # no file or it's older than ours... *then* remove the link
1259 - if not exists(os.path.realpath(link)) or dateinterval > 0:
1260 - self.debug("Removing old symlink %s" % link)
1261 - os.remove(link)
1262 - else:
1263 - self.logAndPrint("What the hell dude, %s is not a symlink" % link)
1264 - raise BackupError("What the hell dude, %s is not a symlink" % link)
1265 - relative = FileUtils.relativePath(real, dirname(link))
1266 - # if we removed the link cause it's obsolete, make the new one
1267 - if exists(real) and not exists(link):
1268 - self.debug("Adding symlink %s -> %s" % (link, relative))
1269 - os.symlink(relative, link)
 1372+ self.logfn("What the hell dude, %s is not a symlink" % link)
 1373+ raise BackupError("What the hell dude, %s is not a symlink" % link)
 1374+ relative = FileUtils.relativePath(real, dirname(link))
 1375+ # if we removed the link cause it's obsolete, make the new one
 1376+ if exists(real) and not exists(link):
 1377+ self.debugfn("Adding symlink %s -> %s" % (link, relative))
 1378+ os.symlink(relative, link)
12701379
 1380+class Feeds(object):
 1381+ def __init__(self, wiki, dumpDir, config, dbName, debugfn, enabled):
 1382+ self.wiki = wiki
 1383+ self.dumpDir = dumpDir
 1384+ self.config = config
 1385+ self.dbName = dbName
 1386+ self.debugfn = debugfn
 1387+ self._enabled = enabled
 1388+
 1389+ def makeDir(self, dir):
 1390+ if exists(dir):
 1391+ self.debugfn("Checkdir dir %s ..." % dir)
 1392+ else:
 1393+ self.debugfn("Creating %s ..." % dir)
 1394+ os.makedirs(dir)
 1395+
12711396 def saveFeed(self, file):
12721397 self.makeDir(join(self.wiki.publicDir(), 'latest'))
12731398 filePath = self.dumpDir.webPath(file)
@@ -1284,6 +1409,7 @@
12851410 rssPath = self.dumpDir.latestPath(file + "-rss.xml")
12861411 FileUtils.writeFile(directory, rssPath, rssText, self.config.fileperms)
12871412
 1413+
12881414 class Dump(object):
12891415 def __init__(self, name, desc):
12901416 self._desc = desc
@@ -1326,7 +1452,7 @@
13271453 """Optionally return additional text to appear under the heading."""
13281454 return None
13291455
1330 - def listFiles(self, runner):
 1456+ def listOutputFiles(self, runner):
13311457 """Return a list of filenames which should be exported and checksummed"""
13321458 return []
13331459
@@ -1356,7 +1482,7 @@
13571483 sys.stderr.write(line)
13581484 self.progress = line.strip()
13591485 runner.status.updateStatusFiles()
1360 - runner.dumpItemList.saveDumpRunInfoFile()
 1486+ runner.runInfoFile.saveDumpRunInfoFile(runner.dumpItemList.reportDumpRunInfo())
13611487
13621488 def timeToWait(self):
13631489 # we use wait this many secs for a command to complete that
@@ -1462,7 +1588,7 @@
14631589 if (error):
14641590 raise BackupError("error dumping table %s" % self._table)
14651591
1466 - def listFiles(self, runner):
 1592+ def listOutputFiles(self, runner):
14671593 return [self._file()]
14681594
14691595 class PrivateTable(PublicTable):
@@ -1474,7 +1600,7 @@
14751601 def _path(self, runner):
14761602 return runner.dumpDir.privatePath(self._file())
14771603
1478 - def listFiles(self, runner):
 1604+ def listOutputFiles(self, runner):
14791605 """Private table won't have public files to list."""
14801606 return []
14811607
@@ -1484,21 +1610,31 @@
14851611 A second pass will import text from prior dumps or the database to make
14861612 full files for the public."""
14871613
1488 - def __init__(self, name, desc, chunks = False):
 1614+ def __init__(self, name, desc, chunkToDo, chunks = False):
14891615 Dump.__init__(self, name, desc)
 1616+ self._chunkToDo = chunkToDo
14901617 self._chunks = chunks
14911618
14921619 def detail(self):
14931620 return "These files contain no page text, only revision metadata."
14941621
1495 - def listFiles(self, runner, unnumbered=False):
1496 - if (self._chunks) and not unnumbered:
1497 - files = []
1498 - for i in range(1, len(self._chunks) + 1):
1499 - files.append("stub-meta-history%s.xml.gz" % i)
1500 - files.append("stub-meta-current%s.xml.gz" % i)
1501 - files.append("stub-articles%s.xml.gz" % i)
1502 - return files
 1622+ def listOutputFiles(self, runner, unnumbered=False):
 1623+ if (self._chunks):
 1624+ if (self._chunkToDo):
 1625+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 1626+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 1627+ files = []
 1628+ files.append("stub-meta-history%s.xml.gz" % self._chunkToDo)
 1629+ files.append("stub-meta-current%s.xml.gz" % self._chunkToDo)
 1630+ files.append("stub-articles%s.xml.gz" % self._chunkToDo)
 1631+ return files
 1632+ else:
 1633+ files = []
 1634+ for i in range(1, len(self._chunks) + 1):
 1635+ files.append("stub-meta-history%s.xml.gz" % i)
 1636+ files.append("stub-meta-current%s.xml.gz" % i)
 1637+ files.append("stub-articles%s.xml.gz" % i)
 1638+ return files
15031639 else:
15041640 return ["stub-meta-history.xml.gz",
15051641 "stub-meta-current.xml.gz",
@@ -1578,10 +1714,17 @@
15791715 def run(self, runner):
15801716 commands = []
15811717 if self._chunks:
1582 - for i in range(1, len(self._chunks)+1):
1583 - self.cleanupOldFiles(runner,i)
1584 - series = self.buildCommand(runner, i)
 1718+ if (self._chunkToDo):
 1719+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 1720+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 1721+ self.cleanupOldFiles(runner,self._chunkToDo)
 1722+ series = self.buildCommand(runner, self._chunkToDo)
15851723 commands.append(series)
 1724+ else:
 1725+ for i in range(1, len(self._chunks)+1):
 1726+ self.cleanupOldFiles(runner,i)
 1727+ series = self.buildCommand(runner, i)
 1728+ commands.append(series)
15861729 else:
15871730 self.cleanupOldFiles(runner)
15881731 series = self.buildCommand(runner)
@@ -1592,20 +1735,24 @@
15931736
15941737 class RecombineXmlStub(XmlStub):
15951738 def __init__(self, name, desc, chunks):
1596 - XmlStub.__init__(self, name, desc, chunks)
 1739+ XmlStub.__init__(self, name, desc, False, chunks)
15971740 # this is here only so that a callback can capture output from some commands
15981741 # related to recombining files if we did parallel runs of the recompression
15991742 self._output = None
16001743
1601 - # oh crap we need to be able to produce a list of output files, what else?
1602 - def listFiles(self, runner):
1603 - return(XmlStub.listFiles(self, runner, unnumbered=True))
 1744+ def listInputFiles(self, runner):
 1745+ return(XmlStub.listOutputFiles(self, runner))
16041746
 1747+ def listOutputFiles(self, runner):
 1748+ return ["stub-meta-history.xml.gz",
 1749+ "stub-meta-current.xml.gz",
 1750+ "stub-articles.xml.gz"]
 1751+
16051752 def run(self, runner):
16061753 error=0
16071754 if (self._chunks):
1608 - files = XmlStub.listFiles(self,runner)
1609 - outputFileList = self.listFiles(runner)
 1755+ files = self.listInputFiles(runner)
 1756+ outputFileList = self.listOutputFiles(runner)
16101757 for outputFile in outputFileList:
16111758 inputFiles = []
16121759 for inFile in files:
@@ -1641,7 +1788,7 @@
16421789 def detail(self):
16431790 return "This contains the log of actions performed on pages."
16441791
1645 - def listFiles(self, runner):
 1792+ def listOutputFiles(self, runner):
16461793 return ["pages-logging.xml.gz"]
16471794
16481795 def cleanupOldFiles(self, runner):
@@ -1672,14 +1819,16 @@
16731820
16741821 class XmlDump(Dump):
16751822 """Primary XML dumps, one section at a time."""
1676 - def __init__(self, subset, name, desc, detail, prefetch, spawn, chunks = False):
 1823+ def __init__(self, subset, name, desc, detail, prefetch, spawn, chunkToDo, chunks = False):
16771824 Dump.__init__(self, name, desc)
16781825 self._subset = subset
16791826 self._detail = detail
 1827+ self._desc = desc
16801828 self._prefetch = prefetch
16811829 self._spawn = spawn
16821830 self._chunks = chunks
16831831 self._pageID = {}
 1832+ self._chunkToDo = chunkToDo
16841833
16851834 def detail(self):
16861835 """Optionally return additional text to appear under the heading."""
@@ -1697,9 +1846,15 @@
16981847 def run(self, runner):
16991848 commands = []
17001849 if (self._chunks):
1701 - for i in range(1, len(self._chunks)+1):
1702 - series = self.buildCommand(runner, i)
 1850+ if (self._chunkToDo):
 1851+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 1852+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 1853+ series = self.buildCommand(runner, self._chunkToDo)
17031854 commands.append(series)
 1855+ else:
 1856+ for i in range(1, len(self._chunks)+1):
 1857+ series = self.buildCommand(runner, i)
 1858+ commands.append(series)
17041859 else:
17051860 series = self.buildCommand(runner)
17061861 commands.append(series)
@@ -1712,9 +1867,13 @@
17131868 # check to see if any of the output files are truncated
17141869 files = []
17151870 if (self._chunks):
1716 - for i in range(1, len(self._chunks)+1):
1717 - files.append( self._path(runner, 'bz2', i ) )
1718 - files.append( self._path(runner, 'bz2', i ) )
 1871+ if (self._chunkToDo):
 1872+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 1873+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 1874+ files.append( self._path(runner, 'bz2', self._chunkToDo ) )
 1875+ else:
 1876+ for i in range(1, len(self._chunks)+1):
 1877+ files.append( self._path(runner, 'bz2', i ) )
17191878
17201879 for f in files:
17211880 pipeline = []
@@ -1900,7 +2059,7 @@
19012060 if realpath(old) == current:
19022061 runner.debug("skipping current dump for prefetch %s" % possible)
19032062 continue
1904 - if not self.statusOfOldDumpIsDone(runner, date):
 2063+ if not runner.runInfoFile.statusOfOldDumpIsDone(runner, date, self.name, self._desc):
19052064 runner.debug("skipping incomplete or failed dump for prefetch %s" % possible)
19062065 continue
19072066 if (chunk) and (self.filenameHasChunk(possible, "bz2")):
@@ -1936,82 +2095,8 @@
19372096 runner.debug("Could not locate a prefetchable dump.")
19382097 return None
19392098
1940 - # find desc in there, look for "class='done'"
1941 - def _getStatusForJobFromIndexFileLine(self, line, desc):
1942 - if not(">"+desc+"<" in line):
1943 - return None
1944 - if "<li class='done'>" in line:
1945 - return "done"
1946 - else:
1947 - return "other"
1948 -
1949 - def _getStatusForJobFromIndexFile(self, filename, desc):
1950 - # read the index file in, if there is one, and find out whether
1951 - # a particular job (one step only, not a multiple piece job) has been
1952 - # already run and whether it was successful (use to examine status
1953 - # of step from some previous run)
1954 - try:
1955 - infile = open(filename,"r")
1956 - for line in infile:
1957 - result = self._getStatusForJobFromIndexFileLine(line, desc)
1958 - if (not result == None):
1959 - return result
1960 - infile.close
1961 - return None
1962 - except:
1963 - return None
1964 -
1965 - # format: name:%; updated:%; status:%
1966 - def _getStatusForJobFromRunInfoFileLine(self, line, jobName):
1967 - # get rid of leading/trailing/embedded blanks
1968 - line = line.replace(" ","")
1969 - line = line.replace("\n","")
1970 - fields = line.split(';',3)
1971 - for field in fields:
1972 - (fieldName, separator, fieldValue) = field.partition(':')
1973 - if (fieldName == "name"):
1974 - if (not fieldValue == jobName):
1975 - return None
1976 - elif (fieldName == "status"):
1977 - return fieldValue
1978 -
1979 - def _getStatusForJobFromRunInfoFile(self, filename, jobName = ""):
1980 - # read the dump run info file in, if there is one, and find out whether
1981 - # a particular job (one step only, not a multiple piece job) has been
1982 - # already run and whether it was successful (use to examine status
1983 - # of step from some previous run)
1984 - try:
1985 - infile = open(filename,"r")
1986 - for line in infile:
1987 - result = self._getStatusForJobFromRunInfoFileLine(line, jobName)
1988 - if (not result == None):
1989 - return result
1990 - infile.close
1991 - return None
1992 - except:
1993 - return None
1994 -
1995 - def statusOfOldDumpIsDone(self, runner, date):
1996 - oldDumpRunInfoFilename=runner.dumpItemList._getDumpRunInfoFileName(date)
1997 - jobName = self.name()
1998 - status = self._getStatusForJobFromRunInfoFile(oldDumpRunInfoFilename, jobName)
1999 - if (status == "done"):
2000 - return 1
2001 - elif (not status == None):
2002 - # failure, in progress, some other useless thing
2003 - return 0
2004 -
2005 - # ok, there was no info there to be had, try the index file. yuck.
2006 - indexFilename = os.path.join(runner.wiki.publicDir(), date, runner.config.perDumpIndex)
2007 - desc = self._desc
2008 - status = self._getStatusForJobFromIndexFile(indexFilename, desc)
2009 - if (status == "done"):
2010 - return 1
2011 - else:
2012 - return 0
2013 -
2014 - def listFiles(self, runner, unnumbered = False):
2015 - if (self._chunks) and not unnumbered:
 2099+ def listOutputFiles(self, runner):
 2100+ if (self._chunks):
20162101 files = []
20172102 for i in range(1, len(self._chunks)+1):
20182103 files.append(self._file("bz2",i))
@@ -2022,19 +2107,22 @@
20232108 class RecombineXmlDump(XmlDump):
20242109 def __init__(self, subset, name, desc, detail, chunks = False):
20252110 # no prefetch, no spawn
2026 - XmlDump.__init__(self, subset, name, desc, detail, None, None, chunks)
 2111+ XmlDump.__init__(self, subset, name, desc, detail, None, None, False, chunks)
20272112 # this is here only so that a callback can capture output from some commands
20282113 # related to recombining files if we did parallel runs of the recompression
20292114 self._output = None
20302115
2031 - def listFiles(self, runner):
2032 - return(XmlDump.listFiles(self, runner, unnumbered=True))
 2116+ def listInputFiles(self, runner):
 2117+ return XmlDump.listOutputFiles(self,runner)
20332118
 2119+ def listOutputFiles(self, runner):
 2120+ return [ self._file("bz2",0) ]
 2121+
20342122 def run(self, runner):
20352123 error=0
20362124 if (self._chunks):
2037 - files = XmlDump.listFiles(self,runner)
2038 - outputFileList = self.listFiles(runner)
 2125+ files = self.listInputFiles(runner)
 2126+ outputFileList = self.listOutputFiles(runner)
20392127 for outputFile in outputFileList:
20402128 inputFiles = []
20412129 for inFile in files:
@@ -2071,11 +2159,12 @@
20722160 class XmlRecompressDump(Dump):
20732161 """Take a .bz2 and recompress it as 7-Zip."""
20742162
2075 - def __init__(self, subset, name, desc, detail, chunks = False):
 2163+ def __init__(self, subset, name, desc, detail, chunkToDo, chunks = False):
20762164 Dump.__init__(self, name, desc)
20772165 self._subset = subset
20782166 self._detail = detail
20792167 self._chunks = chunks
 2168+ self._chunkToDo = chunkToDo
20802169
20812170 def detail(self):
20822171 """Optionally return additional text to appear under the heading."""
@@ -2127,11 +2216,18 @@
21282217 raise BackupError("bz2 dump incomplete, not recompressing")
21292218 commands = []
21302219 if (self._chunks):
2131 - for i in range(1, len(self._chunks)+1):
2132 - # Clear prior 7zip attempts; 7zip will try to append an existing archive
2133 - self.cleanupOldFiles(runner, i)
2134 - series = self.buildCommand(runner, i)
 2220+ if (self._chunkToDo):
 2221+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 2222+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 2223+ self.cleanupOldFiles(runner, self._chunkToDo)
 2224+ series = self.buildCommand(runner, self._chunkToDo)
21352225 commands.append(series)
 2226+ else:
 2227+ for i in range(1, len(self._chunks)+1):
 2228+ # Clear prior 7zip attempts; 7zip will try to append an existing archive
 2229+ self.cleanupOldFiles(runner, i)
 2230+ series = self.buildCommand(runner, i)
 2231+ commands.append(series)
21362232 else:
21372233 # Clear prior 7zip attempts; 7zip will try to append an existing archive
21382234 self.cleanupOldFiles(runner)
@@ -2141,10 +2237,17 @@
21422238 # temp hack force 644 permissions until ubuntu bug # 370618 is fixed - tomasz 5/1/2009
21432239 # some hacks aren't so temporary - atg 3 sept 2010
21442240 if (self._chunks):
2145 - for i in range(1, len(self._chunks)+1):
2146 - xml7z = self.buildOutputFilename(runner,i)
 2241+ if (self._chunkToDo):
 2242+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 2243+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 2244+ xml7z = self.buildOutputFilename(runner,self._chunkToDo)
21472245 if exists(xml7z):
21482246 os.chmod(xml7z, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH )
 2247+ else:
 2248+ for i in range(1, len(self._chunks)+1):
 2249+ xml7z = self.buildOutputFilename(runner,i)
 2250+ if exists(xml7z):
 2251+ os.chmod(xml7z, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH )
21492252 else:
21502253 xml7z = self.buildOutputFilename(runner)
21512254 if exists(xml7z):
@@ -2152,12 +2255,19 @@
21532256 if (error):
21542257 raise BackupError("error recompressing bz2 file(s)")
21552258
2156 - def listFiles(self, runner, unnumbered = False):
2157 - if (self._chunks) and not unnumbered:
2158 - files = []
2159 - for i in range(1, len(self._chunks)+1):
2160 - files.append(self._file("7z",i))
2161 - return files
 2259+ def listOutputFiles(self, runner):
 2260+ if (self._chunks):
 2261+ if (self._chunkToDo):
 2262+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 2263+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 2264+ files = []
 2265+ files.append(self._file("7z",self._chunkToDo))
 2266+ return files
 2267+ else:
 2268+ files = []
 2269+ for i in range(1, len(self._chunks)+1):
 2270+ files.append(self._file("7z",i))
 2271+ return files
21622272 else:
21632273 return [ self._file("7z",0) ]
21642274
@@ -2166,17 +2276,19 @@
21672277
21682278 class RecombineXmlRecompressDump(XmlRecompressDump):
21692279 def __init__(self, subset, name, desc, detail, chunks):
2170 - XmlRecompressDump.__init__(self, subset, name, desc, detail, chunks)
 2280+ XmlRecompressDump.__init__(self, subset, name, desc, detail, False, chunks)
21712281 # this is here only so that a callback can capture output from some commands
21722282 # related to recombining files if we did parallel runs of the recompression
21732283 self._output = None
21742284
2175 - def listFiles(self, runner):
2176 - return(XmlRecompressDump.listFiles(self, runner, unnumbered=True))
 2285+ def listInputFiles(self, runner):
 2286+ return XmlRecompressDump.listOutputFiles(self,runner)
21772287
 2288+ def listOutputFiles(self, runner):
 2289+ return [ self._file("7z",0) ]
 2290+
21782291 def cleanupOldFiles(self, runner):
2179 - files = self.listFiles(runner)
2180 - print "here is cleanup"
 2292+ files = self.listOutputFiles(runner)
21812293 for filename in files:
21822294 filename = runner.dumpDir.publicPath(filename)
21832295 if exists(filename):
@@ -2186,8 +2298,8 @@
21872299 error = 0
21882300 if (self._chunks):
21892301 self.cleanupOldFiles(runner)
2190 - files = XmlRecompressDump.listFiles(self,runner)
2191 - outputFileList = self.listFiles(runner)
 2302+ files = self.listInputFiles(runner)
 2303+ outputFileList = self.listOutputFiles(runner)
21922304 for outputFile in outputFileList:
21932305 inputFiles = []
21942306 for inFile in files:
@@ -2216,8 +2328,9 @@
22172329 class AbstractDump(Dump):
22182330 """XML dump for Yahoo!'s Active Abstracts thingy"""
22192331
2220 - def __init__(self, name, desc, chunks = False):
 2332+ def __init__(self, name, desc, chunkToDo, chunks = False):
22212333 Dump.__init__(self, name, desc)
 2334+ self._chunkToDo = chunkToDo
22222335 self._chunks = chunks
22232336
22242337 def buildCommand(self, runner, chunk = 0):
@@ -2254,9 +2367,15 @@
22552368 def run(self, runner):
22562369 commands = []
22572370 if (self._chunks):
2258 - for i in range(1, len(self._chunks)+1):
2259 - series = self.buildCommand(runner, i)
 2371+ if (self._chunkToDo):
 2372+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 2373+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 2374+ series = self.buildCommand(runner, self._chunkToDo)
22602375 commands.append(series)
 2376+ else:
 2377+ for i in range(1, len(self._chunks)+1):
 2378+ series = self.buildCommand(runner, i)
 2379+ commands.append(series)
22612380 else:
22622381 series = self.buildCommand(runner)
22632382 commands.append(series)
@@ -2290,31 +2409,42 @@
22912410 else:
22922411 return( "abstract-%s%s.xml" % (variant, chunkInfo) )
22932412
2294 - def listFiles(self, runner, unnumbered = False):
 2413+ def listOutputFiles(self, runner):
22952414 files = []
22962415 for x in self._variants(runner):
2297 - if (self._chunks) and not unnumbered:
2298 - for i in range(1, len(self._chunks)+1):
2299 - files.append(self._variantFile(x, i))
 2416+ if (self._chunks):
 2417+ if (self._chunkToDo):
 2418+ if (self._chunkToDo < 1 or self._chunkToDo > len(self._chunks)):
 2419+ raise BackupError("chunk option must be in range of available chunks to rerun, 1 through %s\n" % str(len(self._chunks)))
 2420+ files.append(self._variantFile(x, self._chunkToDo))
 2421+ else:
 2422+ for i in range(1, len(self._chunks)+1):
 2423+ files.append(self._variantFile(x, i))
23002424 else:
23012425 files.append(self._variantFile(x))
23022426 return files
23032427
23042428 class RecombineAbstractDump(AbstractDump):
23052429 def __init__(self, name, desc, chunks):
2306 - AbstractDump.__init__(self, name, desc, chunks)
 2430+ AbstractDump.__init__(self, name, desc, False, chunks)
23072431 # this is here only so that a callback can capture output from some commands
23082432 # related to recombining files if we did parallel runs of the recompression
23092433 self._output = None
23102434
2311 - def listFiles(self, runner):
2312 - return(AbstractDump.listFiles(self,runner, unnumbered = True))
 2435+ def listOutputFiles(self, runner):
 2436+ files = []
 2437+ for x in self._variants(runner):
 2438+ files.append(self._variantFile(x))
 2439+ return files
23132440
 2441+ def listInputFiles(self, runner):
 2442+ return(AbstractDump.listOutputFiles(self,runner))
 2443+
23142444 def run(self, runner):
23152445 error = 0
23162446 if (self._chunks):
2317 - files = AbstractDump.listFiles(self,runner)
2318 - outputFileList = self.listFiles(runner)
 2447+ files = AbstractDump.listOutputFiles(self,runner)
 2448+ outputFileList = self.listOutputFiles(runner)
23192449 for outputFile in outputFileList:
23202450 inputFiles = []
23212451 for inFile in files:
@@ -2354,11 +2484,11 @@
23552485 if (error):
23562486 raise BackupError("error dumping titles list")
23572487
2358 - def listFiles(self, runner):
 2488+ def listOutputFiles(self, runner):
23592489 return ["all-titles-in-ns0.gz"]
23602490
23612491
2362 -def findAndLockNextWiki(config):
 2492+def findAndLockNextWiki(config, locksEnabled):
23632493 if config.halt:
23642494 print "Dump process halted by config."
23652495 return None
@@ -2371,13 +2501,17 @@
23722502 for db in next:
23732503 wiki = WikiDump.Wiki(config, db)
23742504 try:
2375 - wiki.lock()
 2505+ if (locksEnabled):
 2506+ wiki.lock()
23762507 return wiki
23772508 except:
23782509 print "Couldn't lock %s, someone else must have got it..." % db
23792510 continue
23802511 return None
23812512
 2513+def xmlEscape(text):
 2514+ return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
 2515+
23822516 def usage(message = None):
23832517 if message:
23842518 print message
@@ -2425,10 +2559,11 @@
24262560 log = None
24272561 htmlNotice = ""
24282562 dryrun = False
 2563+ chunkToDo = False
24292564
24302565 try:
24312566 (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
2432 - ['date=', 'job=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun', 'noprefetch', 'nospawn', 'restartfrom', 'log'])
 2567+ ['date=', 'job=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun', 'noprefetch', 'nospawn', 'restartfrom', 'log', 'chunk=' ])
24332568 except:
24342569 usage("Unknown option specified")
24352570
@@ -2437,6 +2572,8 @@
24382573 date = val
24392574 elif opt == "--configfile":
24402575 configFile = val
 2576+ elif opt == '--chunk':
 2577+ chunkToDo = int(val)
24412578 elif opt == "--force":
24422579 forceLock = True
24432580 elif opt == "--noprefetch":
@@ -2455,7 +2592,7 @@
24562593 htmlNotice = val
24572594 elif opt == "--delnotice":
24582595 htmlNotice = False
2459 -
 2596+
24602597 if dryrun and (len(remainder) == 0):
24612598 usage("--dryrun requires the name of a wikidb to be specified")
24622599 if jobRequested and (len(remainder) == 0):
@@ -2464,6 +2601,10 @@
24652602 usage("--force cannot be used with --job option")
24662603 if (restart and not jobRequested):
24672604 usage("--restartfrom requires --job and the job from which to restart")
 2605+ if (chunkToDo and not jobRequested):
 2606+ usage("--chunk option requires a specific job for which to rerun that chunk")
 2607+ if (chunkToDo and restart):
 2608+ usage("--chunk option can be specified only for one specific job")
24682609
24692610 # allow alternate config file
24702611 if (configFile):
@@ -2471,6 +2612,11 @@
24722613 else:
24732614 config = WikiDump.Config()
24742615
 2616+ if dryrun or chunkToDo or (jobRequested and not restart):
 2617+ locksEnabled = False
 2618+ else:
 2619+ locksEnabled = True
 2620+
24752621 if dryrun:
24762622 print "***"
24772623 print "Dry run only, no files will be updated."
@@ -2478,22 +2624,20 @@
24792625
24802626 if len(remainder) > 0:
24812627 wiki = WikiDump.Wiki(config, remainder[0])
2482 - if not dryrun:
2483 - # if we are doing one piece only of the dump, we don't try to grab a lock
2484 - # unless told to.
 2628+ if locksEnabled:
24852629 if forceLock and wiki.isLocked():
24862630 wiki.unlock()
2487 - if restart or not jobRequested:
 2631+ if locksEnabled:
24882632 wiki.lock()
24892633
24902634 else:
2491 - wiki = findAndLockNextWiki(config)
 2635+ wiki = findAndLockNextWiki(config, locksEnabled)
24922636
24932637 if wiki:
24942638 # process any per-project configuration options
24952639 config.parseConfFilePerProject(wiki.dbName)
24962640
2497 - runner = Runner(wiki, date, prefetch, spawn, jobRequested, restart, htmlNotice, dryrun, enableLogging)
 2641+ runner = Runner(wiki, date, prefetch, spawn, jobRequested, restart, htmlNotice, dryrun, enableLogging, chunkToDo)
24982642 if (restart):
24992643 print "Running %s, restarting from job %s..." % (wiki.dbName, jobRequested)
25002644 elif (jobRequested):
@@ -2502,9 +2646,8 @@
25032647 print "Running %s..." % wiki.dbName
25042648 runner.run()
25052649 # if we are doing one piece only of the dump, we don't unlock either
2506 - if not dryrun:
2507 - if restart or not jobRequested:
2508 - wiki.unlock()
 2650+ if locksEnabled:
 2651+ wiki.unlock()
25092652 else:
25102653 print "No wikis available to run."
25112654 finally:

Status & tagging log