r99655 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r99654‎ | r99655 | r99656 >
Date:23:24, 12 October 2011
Author:ariel
Status:deferred
Tags:
Comment:
initial checkin of adds/changes dumps
Modified paths:
  • /branches/ariel/xmldumps-backup/incrementals (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/IncrDumpLib.py (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/README.config (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/README.txt (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/all.dblist (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/closed.dblist (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/dumpincr.conf.sample (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/generateincrementals.py (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/generatemaxrevids.py (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/incrmonitor (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/incrmonitor.py (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/incrs-index.html (added) (history)
  • /branches/ariel/xmldumps-backup/incrementals/private.dblist (added) (history)

Diff [purge]

Index: branches/ariel/xmldumps-backup/incrementals/IncrDumpLib.py
@@ -0,0 +1,390 @@
 2+# shared classes for incrementals
 3+import os
 4+import sys
 5+import re
 6+import ConfigParser
 7+import WikiDump
 8+from WikiDump import FileUtils, TimeUtils, MiscUtils
 9+from os.path import exists
 10+import socket
 11+import subprocess
 12+from subprocess import Popen, PIPE
 13+
class ContentFile(object):
    """Base description of one file belonging to an adds/changes dump run.

    Subclasses override getFileName() (and in some cases getPath()) to
    describe a specific file for the given wiki and date.
    """

    def __init__(self, config, date, wikiName):
        self.wikiName = wikiName
        self.date = date
        self._config = config
        self.incrDir = IncrementDir(config, date)

    # override this.
    def getFileName(self):
        """Return the bare name of the file."""
        return "content.txt"

    def getPath(self):
        """Return the file name joined onto the dated directory of the wiki."""
        directory = self.incrDir.getIncDir(self.wikiName)
        return os.path.join(directory, self.getFileName())

    def getFileInfo(self):
        """Return what FileUtils.fileInfo reports for this file's path."""
        path = self.getPath()
        return FileUtils.fileInfo(path)
 30+
class MaxRevIDFile(ContentFile):
    """The file in which the max rev id for a wiki and date is recorded."""

    def getFileName(self):
        name = "maxrevid.txt"
        return name
 34+
class StubFile(ContentFile):
    """The gzipped stubs file for a wiki and date."""

    def getFileName(self):
        fields = (self.wikiName, self.date)
        return "%s-%s-stubs-meta-hist-incr.xml.gz" % fields
 38+
class RevsFile(ContentFile):
    """The bzip2-compressed revision text file for a wiki and date."""

    def getFileName(self):
        fields = (self.wikiName, self.date)
        return "%s-%s-pages-meta-hist-incr.xml.bz2" % fields
 42+
class StatusFile(ContentFile):
    """The status file of a dump run; its path may be asked for any date."""

    def getFileName(self):
        return "status.txt"

    def getPath(self, date = None):
        """Return the status file path for the given date.

        The date is handed straight to IncrementDir.getIncDir(), which
        substitutes the date of this object when it is None.
        """
        directory = self.incrDir.getIncDir(self.wikiName, date)
        return os.path.join(directory, self.getFileName())
 49+
class LockFile(ContentFile):
    """A lock file; it lives in the per-wiki directory, not the dated one."""

    def getFileName(self):
        fields = (self.wikiName, self.date)
        return "%s-%s.lock" % fields

    def getPath(self):
        wikiDir = self.incrDir.getIncDirNoDate(self.wikiName)
        return os.path.join(wikiDir, self.getFileName())
 56+
class MaxRevIDLockFile(LockFile):
    """Lock file name used while the max rev id of a wiki is being recorded."""

    def getFileName(self):
        fields = (self.wikiName, self.date)
        return "%s-%s-maxrevid.lock" % fields
 60+
class IncrDumpLockFile(LockFile):
    """Lock file name used while the adds/changes dump of a wiki is running."""

    def getFileName(self):
        fields = (self.wikiName, self.date)
        return "%s-%s-incrdump.lock" % fields
 64+
class MD5File(ContentFile):
    """The file holding the md5 sums of the dump files for a wiki and date."""

    def getFileName(self):
        fields = (self.wikiName, self.date)
        return "%s-%s-md5sums.txt" % fields
 68+
class IndexFile(ContentFile):
    """The index.html file at the top of the incrementals directory tree.

    Unlike the other ContentFile classes this one is tied to neither a
    wiki nor a date, so only the config is stored.
    """

    def __init__(self, config):
        self._config = config
        self.incrDir = IncrementDir(config)

    def getFileName(self):
        return "index.html"

    def getPath(self):
        baseDir = self.incrDir.getIncDirBase()
        return os.path.join(baseDir, self.getFileName())
 79+
class StatusInfo(object):
    """Read and write the status file of one wiki's dump for one date."""

    def __init__(self, config, date, wikiName):
        self.wikiName = wikiName
        self.date = date
        self._config = config
        self.statusFile = StatusFile(config, date, wikiName)

    def getStatus(self, date = None):
        """Return True if the status file exists and contains "done".

        Trailing whitespace in the file is ignored.  Returns False when
        the file is missing or holds anything else.
        """
        path = self.statusFile.getPath(date)
        if not exists(path):
            return False
        return FileUtils.readFile(path).rstrip() == "done"

    def setStatus(self, status):
        """Write the given status text to the status file of this object's date."""
        path = self.statusFile.getPath()
        FileUtils.writeFileInPlace(path, status, self._config.fileperms)
 96+
class Lock(object):
    """A lock, represented by a lock file, for one wiki and date."""

    def __init__(self, config, date, wikiName):
        self._config = config
        self.date = date
        self.wikiName = wikiName
        self.lockFile = LockFile(self._config, self.date, self.wikiName)

    def isLocked(self):
        """Return True if the lock file exists."""
        return exists(self.lockFile.getPath())

    def getLock(self):
        """Try to create the lock file; return True on success, else False.

        The file is created through FileUtils.atomicCreate and is filled
        with the fully qualified host name and the pid of this process.
        """
        try:
            if not exists(self._config.incrementalsDir):
                os.makedirs(self._config.incrementalsDir)
            f = FileUtils.atomicCreate(self.lockFile.getPath(), "w")
            try:
                f.write("%s %d" % (socket.getfqdn(), os.getpid()))
            finally:
                # never leave the handle open, even if the write fails
                f.close()
            return True
        except Exception:
            # Deliberately broad: any failure means "could not get the lock".
            # NOTE(review): atomicCreate is not visible here; presumably it
            # raises OSError when the file already exists -- confirm and
            # narrow this clause.
            return False

    def unlock(self):
        """Remove the lock file (raises OSError if it is not there)."""
        os.remove(self.lockFile.getPath())

    def getLockInfo(self):
        """Return the mtime of the lock file as 'YYYY-MM-DD HH:MM:SS'.

        Returns None if the lock file cannot be stat'ed.
        """
        # imported here because this module does not import time at the top
        import time
        try:
            timestamp = os.stat(self.lockFile.getPath()).st_mtime
        except OSError:
            return None
        # strftime needs a time tuple, not the float that st_mtime gives us.
        # NOTE(review): UTC is assumed here -- confirm this matches how the
        # rest of the dump code formats times.
        return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(timestamp))
 127+
class IncrDumpLock(Lock):
    """Lock backed by an IncrDumpLockFile instead of the plain LockFile."""

    def __init__(self, config, date, wikiName):
        self.wikiName = wikiName
        self.date = date
        self._config = config
        self.lockFile = IncrDumpLockFile(config, date, wikiName)
 134+
class MaxRevIDLock(Lock):
    """Lock backed by a MaxRevIDLockFile instead of the plain LockFile."""

    def __init__(self,config, date, wikiName):
        self.wikiName = wikiName
        self.date = date
        self._config = config
        self.lockFile = MaxRevIDLockFile(config, date, wikiName)
 141+
class Config(object):
    """Settings for the adds/changes dumps, read from configuration files.

    Three files are handed to the parser, in this order: the named file
    (default "dumpincr.conf") in the directory of the running script,
    "/etc/dumpincrementals.conf", and ".dumpincr.conf" in the home
    directory of the user.
    """

    def __init__(self, configFile=False):
        """Read the configuration.

        configFile -- name of the config file to look for in the directory
                      of the script; "dumpincr.conf" if not given.

        Raises ConfigParser.NoSectionError if there is no "wiki" section.
        """
        self.projectName = False

        home = os.path.dirname(sys.argv[0])
        if not configFile:
            configFile = "dumpincr.conf"
        self.files = [
            os.path.join(home, configFile),
            "/etc/dumpincrementals.conf",
            # expanduser copes with an unset HOME, where joining onto
            # os.getenv("HOME") would raise
            os.path.join(os.path.expanduser("~"), ".dumpincr.conf")]
        defaults = {
            #"wiki": {
            "allwikislist": "",
            "privatewikislist": "",
            "closedwikislist": "",
            #"output": {
            "incrementalsdir": "/dumps/public/incr",
            "templatedir": home,
            "temp":"/dumps/temp",
            "webroot": "http://localhost/dumps/incr",
            "fileperms": "0640",
            "delay": "43200",
            #"database": {
            "user": "root",
            "password": "",
            #"tools": {
            "mediawiki" : "",
            "php": "/bin/php",
            "gzip": "/usr/bin/gzip",
            "bzip2": "/usr/bin/bzip2",
            "mysql": "/usr/bin/mysql",
            "checkforbz2footer": "/usr/local/bin/checkforbz2footer",
            "writeuptopageid": "/usr/local/bin/writeuptopageid",
            "multiversion": "",
            #"cleanup": {
            "keep": "3",
            }

        self.conf = ConfigParser.SafeConfigParser(defaults)
        self.conf.read(self.files)

        if not self.conf.has_section("wiki"):
            print("The mandatory configuration section 'wiki' was not defined.")
            raise ConfigParser.NoSectionError('wiki')

        # NOTE(review): "mediawiki" is also listed in the defaults above, so
        # has_option() is presumably always true and this check never fires;
        # decide whether an empty value should be rejected instead.
        if not self.conf.has_option("wiki","mediawiki"):
            print("The mandatory setting 'mediawiki' in the section 'wiki' was not defined.")
            raise ConfigParser.NoOptionError('wiki','mediawiki')

        self.parseConfFile()

    def parseConfFile(self):
        """Copy the parsed settings into attributes of this object."""
        self.mediawiki = self.conf.get("wiki", "mediawiki")
        self.allWikisList = MiscUtils.dbList(self.conf.get("wiki", "allwikislist"))
        self.privateWikisList = MiscUtils.dbList(self.conf.get("wiki", "privatewikislist"))
        self.closedWikisList = MiscUtils.dbList(self.conf.get("wiki", "closedwikislist"))

        # sections missing from the files are created empty so that the
        # defaults can be read through them
        for section in ('output', 'tools', 'cleanup', 'database'):
            if not self.conf.has_section(section):
                self.conf.add_section(section)

        self.incrementalsDir = self.conf.get("output", "incrementalsdir")
        self.tempDir = self.conf.get("output", "temp")
        self.templateDir = self.conf.get("output", "templateDir")
        self.webRoot = self.conf.get("output", "webroot")
        # permissions are always octal; with base 0 a value such as "640"
        # would be read as decimal, and "0640" is rejected by Python 3
        self.fileperms = int(self.conf.get("output", "fileperms"), 8)
        self.delay = int(self.conf.get("output", "delay"), 0)

        self.php = self.conf.get("tools", "php")
        self.gzip = self.conf.get("tools", "gzip")
        self.bzip2 = self.conf.get("tools", "bzip2")
        self.mysql = self.conf.get("tools", "mysql")
        self.checkforbz2footer = self.conf.get("tools","checkforbz2footer")
        self.writeuptopageid = self.conf.get("tools","writeuptopageid")
        self.multiversion = self.conf.get("tools","multiversion")

        self.keep = self.conf.getint("cleanup", "keep")

        self.dbUser = self.conf.get("database", "user")
        self.dbPassword = self.conf.get("database", "password")

    def readTemplate(self, name):
        """Return the contents of the named file in the template directory."""
        template = os.path.join(self.templateDir, name)
        return FileUtils.readFile(template)
 233+
class RunSimpleCommand(object):
    """Run external commands, retrying a limited number of times."""

    @staticmethod
    def _commandString(command):
        """Return the command as one string, for use in error messages."""
        if isinstance(command, (list, tuple)):
            return " ".join(command)
        return command

    @staticmethod
    def _run(command, maxtries, shell, captureOutput):
        """Run the command until it exits with 0 or maxtries is used up.

        Returns what communicate() gave for stdout on the successful run
        (None unless captureOutput is set).
        Raises IncrementDumpsError if no run succeeded.
        """
        proc = None
        error = ""
        if captureOutput:
            stdout = PIPE
        else:
            stdout = None
        for _ in range(maxtries):
            proc = Popen(command, shell = shell, stdout = stdout, stderr = PIPE)
            output, error = proc.communicate()
            if not proc.returncode:
                return output
        commandString = RunSimpleCommand._commandString(command)
        if proc is None:
            # maxtries < 1: nothing was run, so there is no return code
            raise IncrementDumpsError("command '%s' was not run (maxtries is %s)"
                                      % (commandString, maxtries))
        raise IncrementDumpsError("command '%s' failed with return code %s  and error '%s'"
                                  % (commandString, proc.returncode, error))

    @staticmethod
    def runWithOutput(command, maxtries = 3, shell=False):
        """Run a command and return the output as a string.
        Raises IncrementDumpsError on non-zero return code."""
        return RunSimpleCommand._run(command, maxtries, shell, True)

    @staticmethod
    def runWithNoOutput(command, maxtries = 3, shell=False):
        """Run a command, expecting no output.
        Raises IncrementDumpsError on non-zero return code."""
        # stdout is not captured; the result of communicate() for it is
        # None, and nothing is returned to the caller
        RunSimpleCommand._run(command, maxtries, shell, False)
 278+
class MultiVersion(object):
    """Build the invocation of a MediaWiki maintenance script."""

    @staticmethod
    def MWScriptAsString(config, maintenanceScript):
        """Return the result of MWScriptAsArray joined with spaces."""
        parts = MultiVersion.MWScriptAsArray(config, maintenanceScript)
        return " ".join(parts)

    @staticmethod
    def MWScriptAsArray(config, maintenanceScript):
        """Return the script invocation as a list of arguments.

        If config.multiversion is set and that path exists, the list is
        that path followed by the script name; otherwise it is the single
        path of the script under the maintenance directory of
        config.mediawiki.
        """
        wrapper = config.multiversion
        if wrapper != "" and exists(wrapper):
            return [ wrapper, maintenanceScript ]
        return [ "%s/maintenance/%s" % (config.mediawiki, maintenanceScript) ]
 291+
class DBServer(object):
    """The database server to be used for queries against one wiki."""

    def __init__(self, config, wikiName):
        self.config = config
        self.wikiName = wikiName
        self.dbServer = self.defaultServer()

    def defaultServer(self):
        """Return the output of getSlaveServer.php (group "dump") for the wiki.

        Raises IncrementDumpsError if the php command is not found or if
        the script fails.
        """
        if not exists(self.config.php):
            # BackupError, raised here originally, is not defined in this module
            raise IncrementDumpsError("php command %s not found" % self.config.php)
        commandList = MultiVersion.MWScriptAsArray(self.config, "getSlaveServer.php")
        command = [ self.config.php, "-q" ]
        command.extend(commandList)
        command.extend( [ "--wiki=%s" % self.wikiName, "--group=dump" ])
        return RunSimpleCommand.runWithOutput(command, shell=False).rstrip()

    def buildSqlCommand(self, query):
        """Put together a command to execute an sql query to the server for this DB.

        The result is a shell command string.
        Raises IncrementDumpsError if the mysql command is not found.
        """
        if not exists(self.config.mysql):
            raise IncrementDumpsError("mysql command %s not found" % self.config.mysql)
        # NOTE(review): the query is pasted between single quotes without any
        # escaping, so it must never contain a single quote or come from
        # untrusted input.
        command = "/bin/echo '%s' | %s -h %s -u %s " % ( query, self.config.mysql, self.dbServer, self.config.dbUser )
        if self.config.dbPassword != "":
            # NOTE(review): a password given as a command line argument is
            # typically visible to other users in the process list.
            command = command + "-p" + self.config.dbPassword
        command = command + " -r --silent " + self.wikiName
        return command
 316+
class IncrementDumpsError(Exception):
    """Error raised by the adds/changes dump code."""
 319+
class IncrementDir(object):
    """Compute the directories in which adds/changes dumps are kept."""

    def __init__(self, config, date = None):
        self.date = date
        self._config = config

    def getIncDirBase(self):
        """Return the top level directory of all adds/changes dumps."""
        return self._config.incrementalsDir

    def getIncDirNoDate(self, wikiName):
        """Return the directory of the given wiki, without any date part."""
        baseDir = self.getIncDirBase()
        return os.path.join(baseDir, wikiName)

    def getIncDir(self, wikiName, date = None):
        """Return the dated directory of the given wiki.

        The date of this object is used when no date is passed in.
        """
        if date is None:
            date = self.date
        return os.path.join(self.getIncDirBase(), wikiName, date)
 336+
class IncrementDumpsError(Exception):
    """Error raised by the adds/changes dump code.

    NOTE(review): a class of this name is already defined earlier in this
    module; this second definition rebinds the name and looks redundant.
    """
 339+
class IncDumpDirs(object):
    """List and clean up the dated dump directories of one wiki."""

    def __init__(self, config, wikiName):
        self._config = config
        self.wikiName = wikiName
        self.incrDir = IncrementDir(self._config)

    def getIncDumpDirs(self):
        """Return the sorted names of the wiki's entries that look like dates.

        Only names of exactly eight digits are kept.  An empty list is
        returned if the directory of the wiki cannot be listed.
        """
        base = self.incrDir.getIncDirNoDate(self.wikiName)
        digits = re.compile(r"^\d{4}\d{2}\d{2}$")
        try:
            entries = os.listdir(base)
        except OSError:
            return []
        return sorted([entry for entry in entries if digits.match(entry)])

    def cleanupOldIncrDumps(self, date):
        """Remove old dump directories of this wiki.

        The newest directory is left alone if it is the one for the given
        date.  Of the rest, the newest config.keep directories are kept
        (when keep is greater than 0) and all others are removed.
        """
        # imported here because this module does not import shutil at the top
        import shutil
        old = self.getIncDumpDirs()
        if old:
            if old[-1] == date:
                old = old[:-1]
            if self._config.keep > 0:
                old = old[:-(self._config.keep)]
            for dump in old:
                toRemove = os.path.join(self.incrDir.getIncDirNoDate(self.wikiName), dump)
                shutil.rmtree("%s" % toRemove)

    def getPrevIncrDate(self, date):
        """Return the date of the most recent successfully completed dump
        before the specified date, or None if there is none."""
        previous = None
        for dump in self.getIncDumpDirs():
            # names are YYYYMMDD and sorted, so string comparison orders
            # them by date; stop at the specified date even when it has
            # no directory of its own
            if dump >= date:
                break
            statusInfo = StatusInfo(self._config, dump, self.wikiName)
            # getStatus returns True only for a status of "done"
            if statusInfo.getStatus(dump):
                previous = dump
        return previous

    def getLatestIncrDate(self):
        """Return the date of the most recent dump directory, or None."""
        dirs = self.getIncDumpDirs()
        if dirs:
            return dirs[-1]
        return None
Property changes on: branches/ariel/xmldumps-backup/incrementals/IncrDumpLib.py
___________________________________________________________________
Added: svn:eol-style
1392 + native
Index: branches/ariel/xmldumps-backup/incrementals/README.config
@@ -0,0 +1,38 @@
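 2+By default, configuration options are read from the file "dumpincr.conf" in the directory containing the dump scripts, followed by "/etc/dumpincrementals.conf" and ".dumpincr.conf" in the user's home directory.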
 3+A different filename may be specified at run time.
 4+
 5+The following configuration options are accepted:
 6+
 7+In the "wiki" section,
 8+mediawiki -- full path to the directory of the MediaWiki installation
 9+allwikislist -- full path to a list of all projects to be dumped, as they appear in MySQL
 10+privatewikislist -- full path to a list of all projects that are private and hence should not be dumped, if any
 11+closedwikislist -- full path to a list of all projects that are closed and hence should not be dumped, if any
 12+
 13+In the "output" section,
 14+incrementalsdir -- full path to the top level directory where adds/changes dumps will be written; this should
 15+ be web-accessible
 16+templatedir -- full path to the directory containing template html files such as incrs-index.html (typically
 17+ the same directory as that which contains the dump scripts)
 18+temp -- full path to a directory which is used for the generation of temporary files; this should
 19+ not be web-accessible
 20+webroot -- url to top level directory with the main index page, for example http://localhost/mydumps
 21+fileperms -- read and write permissions that will be assigned to created files; this is in octal four-digit
 22+ format, for example 0644
 23+delay -- number of seconds to wait after a max rev_id has been recorded, before dumping revisions
 24+
 25+In the "database" section,
 26+user -- the name of a database user with read access to all tables in the databases
 27+ which will be dumped
 28+password -- the password for the above user
 29+
 30+In the "tools" section,
 31+php -- the full path to the php command
 32+mysql -- the full path to the mysql command
 33+gzip -- the full path to the gzip command
 34+bzip2 -- the full path to the bzip2 command
 35+checkforbz2footer -- the full path to the checkforbz2footer command
 36+writeuptopageid -- the full path to the writeuptopageid command
 37+
 38+In the "cleanup" section,
 39+keep -- the number of old dumps to keep, per project.
Index: branches/ariel/xmldumps-backup/incrementals/incrmonitor
@@ -0,0 +1,26 @@
#!/bin/bash

# Run incrmonitor.py over and over, sleeping 15 seconds between sweeps.
# Usage: incrmonitor [configfile [verbose]]

WIKIDUMP_BASE=`dirname "$0"`

if [ ! -z "$1" ]; then
    configFile="$1"
else
    configFile=""
fi

# start from a known value so that nothing is picked up from the environment
verbose=""
if [ ! -z "$2" ]; then
    if [ "$2" == "verbose" ]; then
        verbose="--verbose"
    else
        echo "Unknown option $2"
        exit 1
    fi
fi

while true; do
    echo ""
    echo "Sweeping!"
    # quoted so that a script directory containing spaces still works
    python "$WIKIDUMP_BASE/incrmonitor.py" "$configFile" "$verbose"
    echo "sleeping"
    sleep 15
done
Property changes on: branches/ariel/xmldumps-backup/incrementals/incrmonitor
___________________________________________________________________
Added: svn:executable
128 + *
Index: branches/ariel/xmldumps-backup/incrementals/generatemaxrevids.py
@@ -0,0 +1,138 @@
 2+# for every wiki, find and record the max rev_id in use.
 3+# this is phase 1 of daily xml change/adds dumps.
 4+
 5+import ConfigParser
 6+import getopt
 7+import os
 8+import re
 9+import sys
 10+import WikiDump
 11+from WikiDump import FileUtils, TimeUtils, MiscUtils
 12+import subprocess
 13+import socket
 14+import time
 15+import IncrDumpLib
 16+from IncrDumpLib import Lock, Config, RunSimpleCommand, MultiVersion, DBServer, IncrementDir, IncrementDumpsError, MaxRevIDFile, MaxRevIDLockFile, IncrDumpLock, MaxRevIDLock
 17+from subprocess import Popen, PIPE
 18+from os.path import exists
 19+import traceback
 20+
class MaxRevID(object):
    """Look up the max rev id of one wiki and record it for one date."""

    def __init__(self, config, wikiName, date):
        self.maxID = 0
        self.date = date
        self.wikiName = wikiName
        self._config = config
        self.maxRevIdFile = MaxRevIDFile(config, date, wikiName)

    def getMaxRevID(self):
        """Query the database for MAX(rev_id) and keep the raw output in maxID."""
        db = DBServer(self._config, self.wikiName)
        sqlCommand = db.buildSqlCommand("select MAX(rev_id) from revision")
        # get the result
        self.maxID = RunSimpleCommand.runWithOutput(sqlCommand, shell = True)

    def recordMaxRevID(self):
        """Fetch the max rev id and write it to the maxrevid file."""
        self.getMaxRevID()
        # write the max id in a file in the right place
        path = self.maxRevIdFile.getPath()
        FileUtils.writeFileInPlace(path, self.maxID, self._config.fileperms)

    def exists(self):
        """Return True if the maxrevid file is already there."""
        path = self.maxRevIdFile.getPath()
        return exists(path)
 42+
class MaxIDDump(object):
    """Record the max rev id for one wiki or for all wikis in the config."""

    def __init__(self,config, date, verbose):
        self._config = config
        self.date = date
        self.incrDir = IncrementDir(self._config, self.date)
        self.verbose = verbose

    def doOneWiki(self, w):
        """Record the max rev id of wiki w unless it is already recorded.

        Private and closed wikis are left alone.  Returns False only when
        recording was attempted and failed; a wiki whose lock could not be
        obtained is not counted as a failure.
        """
        success = True
        if w not in self._config.privateWikisList and w not in self._config.closedWikisList:
            if not exists(self.incrDir.getIncDir(w)):
                os.makedirs(self.incrDir.getIncDir(w))
            lock = MaxRevIDLock(self._config, self.date, w)
            if not lock.getLock():
                # there is no exception in flight here, so no traceback is
                # printed, and no claim of success is made either
                if self.verbose:
                    print("Wiki  %s failed to get lock." % w)
                return success
            try:
                maxRevID = MaxRevID(self._config, w, self.date)
                if not maxRevID.exists():
                    maxRevID.recordMaxRevID()
            except Exception:
                if self.verbose:
                    print("Wiki  %s failed to get max revid." % w)
                    traceback.print_exc(file=sys.stdout)
                success = False
            finally:
                # the lock is released no matter how the attempt ended
                lock.unlock()
        if success:
            if self.verbose:
                print("Success! Wiki %s adds/changes dump complete." % w)
        return success

    def doRunOnAllWikis(self):
        """Run doOneWiki on every wiki in the list; return the number of failures."""
        failures = 0
        for w in self._config.allWikisList:
            if not self.doOneWiki(w):
                failures = failures + 1
        return failures

    def doAllWikisTilDone(self,numFails):
        """Repeat the run over all wikis until one has no failures.

        Raises IncrementDumpsError once more than numFails runs had failures.
        """
        fails = 0
        while 1:
            result = self.doRunOnAllWikis()
            if not result:
                break
            fails = fails + 1
            if fails > numFails:
                # a plain string cannot be raised; use the module's exception
                raise IncrementDumpsError("Too many consecutive failures, giving up")
            # wait 5 minutes and try another loop
            time.sleep(300)
 94+
def usage(message = None):
    """Print the optional message and the usage text, then exit with status 1."""
    if message:
        print(message)
    # this script is generatemaxrevids.py; the text used to name the other one
    print("Usage: python generatemaxrevids.py [options] [wikidbname]")
    print("Options: --configfile, --date, --verbose")
    print("--configfile:  Specify an alternate config file to read. Default file is 'dumpincr.conf' in the current directory.")
    print("--date:        (Re)run incremental of a given date (use with care).")
    print("--verbose:     Print error messages and other informative messages (normally the")
    print("               script runs silently).")
    print("wikidbname:    Run the dumps only for the specific wiki.")
    sys.exit(1)
 106+
if __name__ == "__main__":
    # phase 1 of the adds/changes dumps: record the max rev id of one
    # wiki (if a name is given on the command line) or of all wikis
    configFile = False
    date = None
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['date=', 'configfile=', 'verbose' ])
    except getopt.GetoptError:
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--date":
            date = val
        elif opt == "--configfile":
            configFile = val
        elif opt == "--verbose":
            verbose = True

    # Config falls back to its default file name when handed False
    config = Config(configFile)

    if not date:
        date = TimeUtils.today()

    dump = MaxIDDump(config, date, verbose)
    if len(remainder) > 0:
        dump.doOneWiki(remainder[0])
    else:
        dump.doAllWikisTilDone(3)
Property changes on: branches/ariel/xmldumps-backup/incrementals/generatemaxrevids.py
___________________________________________________________________
Added: svn:eol-style
1140 + native
Index: branches/ariel/xmldumps-backup/incrementals/dumpincr.conf.sample
@@ -0,0 +1,32 @@
 2+# sample configuration file
 3+
 4+[wiki]
 5+mediawiki=/src/mediawiki/118wmf1/1.18wmf1
 6+allwikislist=/home/backups/incrementals/all.dblist
 7+privatewikislist=/home/backups/incrementals/private.dblist
 8+closedwikislist=/home/backups/incrementals/closed.dblist
 9+
 10+[output]
 11+incrementalsdir=/dumps/public/incr
 12+templatedir=/home/backups/incrementals
 13+temp=/dumps/temp
 14+webroot=http://localhost/mydumps
 15+fileperms=0644
 16+# minimum number of seconds from revision creation
 17+# til it can be dumped
 18+delay=43200
 19+
 20+[database]
 21+user=dbuser
 22+password=leet
 23+
 24+[tools]
 25+php=/usr/bin/php
 26+mysql=/usr/bin/mysql
 27+gzip=/usr/bin/gzip
 28+bzip2=/usr/bin/bzip2
 29+checkforbz2footer=/usr/local/bin/checkforbz2footer
 30+writeuptopageid=/usr/local/bin/writeuptopageid
 31+
 32+[cleanup]
 33+keep=20
Index: branches/ariel/xmldumps-backup/incrementals/generateincrementals.py
@@ -0,0 +1,266 @@
 2+# for every wiki, read the maxid and the prev maxid
 3+# recorded for incrementals, dump stubs and dump history file
 4+# based on stubs.
 5+# this is phase 2 of daily xml change/adds dumps.
 6+
 7+import ConfigParser
 8+import getopt
 9+import os
 10+import re
 11+import sys
 12+import WikiDump
 13+from WikiDump import FileUtils, TimeUtils, MiscUtils
 14+import subprocess
 15+import socket
 16+import time
 17+import IncrDumpLib
 18+from IncrDumpLib import Lock, Config, RunSimpleCommand, MultiVersion, DBServer, IncrementDir, IncrementDumpsError, MaxRevIDFile, StatusFile, IncrDumpLockFile, StubFile, RevsFile, MD5File, IncDumpDirs, IncrDumpLock, MaxRevIDLock, StatusInfo
 19+from subprocess import Popen, PIPE
 20+from os.path import exists
 21+import hashlib
 22+import traceback
 23+
class DumpResults(object):
    """Result codes for the dump of one wiki, held as instance attributes."""

    def __init__(self):
        self.TODO, self.FAILED, self.OK = 1, -1, 0
 29+
class IncrDump(object):
    """Produce the adds/changes dump (stubs, revision text, md5 sums) of
    one wiki for one date."""

    def __init__(self,config, date, wikiName, doStubs, doRevs, dryrun, verbose):
        self._config = config
        self.date = date
        self.wikiName = wikiName
        self.incrDir = IncrementDir(self._config, self.date)
        self.doStubs = doStubs
        self.doRevs = doRevs
        self.dryrun = dryrun
        self.maxRevIDFile = MaxRevIDFile(self._config, self.date, self.wikiName)
        self.statusInfo = StatusInfo(self._config, self.date, self.wikiName)
        self.stubFile = StubFile(self._config, self.date, self.wikiName)
        self.revsFile = RevsFile(self._config, self.date, self.wikiName)
        self.incrDumpsDirs = IncDumpDirs(self._config, self.wikiName)
        self.verbose = verbose

    def getMaxRevIdFromFile(self, date = None):
        """Return the max rev id recorded for the given date (default: the
        date of this dump), without trailing whitespace."""
        if date is None:
            date = self.date
        maxRevIDFile = MaxRevIDFile(self._config, date, self.wikiName)
        # strip the content that was read, not the path
        return FileUtils.readFile(maxRevIDFile.getPath()).rstrip()

    def doOneWiki(self):
        """Run the dump for this wiki and return one of the DumpResults codes.

        OK     -- dump done (now or earlier), or the wiki is private/closed
        TODO   -- not yet possible: max rev id missing or too recent, or
                  the wiki is locked
        FAILED -- a dump step failed
        """
        retCodes = DumpResults()
        if self.wikiName not in self._config.privateWikisList and self.wikiName not in self._config.closedWikisList:
            if not exists(self.incrDir.getIncDir(self.wikiName)):
                os.makedirs(self.incrDir.getIncDir(self.wikiName))
            # getStatus gives True only when the status file says "done"
            if self.statusInfo.getStatus():
                if self.verbose:
                    print("wiki %s skipped, adds/changes dump already complete" % self.wikiName)
                return retCodes.OK
            maxRevIDPath = self.maxRevIDFile.getPath()
            if not exists(maxRevIDPath):
                # nothing recorded yet for this date; getmtime would raise
                if self.verbose:
                    print("wiki %s skipped, max rev id has not been recorded yet" % self.wikiName)
                return retCodes.TODO
            if time.time() - os.path.getmtime(maxRevIDPath) < self._config.delay:
                if self.verbose:
                    print("wiki %s skipped, must wait for configured delay interval" % self.wikiName)
                return retCodes.TODO
            lock = None
            if not self.dryrun:
                lock = IncrDumpLock(self._config, self.date, self.wikiName)
                if not lock.getLock():
                    if self.verbose:
                        print("wiki %s skipped, wiki is locked, another process should be doing the job" % self.wikiName)
                    return retCodes.TODO
            try:
                if not self._runDumpSteps():
                    return retCodes.FAILED
            except Exception:
                if self.verbose:
                    traceback.print_exc(file=sys.stdout)
                return retCodes.FAILED
            finally:
                # release the lock on every way out, failures included
                if lock is not None:
                    lock.unlock()
        if self.verbose:
            print("Success! Wiki %s incremental dump complete." % self.wikiName)
        return retCodes.OK

    def _runDumpSteps(self):
        """Run cleanup, stubs, revision text and md5 sums as configured.

        Returns False as soon as one step reports failure, True otherwise.
        """
        if not self.dryrun:
            self.incrDumpsDirs.cleanupOldIncrDumps(self.date)
        maxRevID = self.getMaxRevIdFromFile()
        prevDate = self.incrDumpsDirs.getPrevIncrDate(self.date)
        prevRevID = None
        if prevDate:
            prevRevID = self.getMaxRevIdFromFile(prevDate)
        if not prevRevID:
            # no earlier dump to start from: take the last ten revisions
            prevRevID = str(max(int(maxRevID) - 10, 1))
        else:
            # this incr will cover every revision from the last incremental
            # through the maxid we wrote out in phase one of this job.
            prevRevID = str(int(prevRevID) + 1)
        if self.doStubs:
            endRevID = str(int(maxRevID) + 1) # end rev id is not included in dump
            if not self.dumpStub(prevRevID, endRevID):
                return False
        if self.doRevs:
            if not self.dumpRevs():
                return False
        if not self.dryrun:
            if not self.md5sums():
                return False
            self.statusInfo.setStatus("done")
        return True

    def dumpStub(self, startRevID, endRevID):
        """Run dumpBackup.php to write the stubs file for the rev id range.

        In a dry run the command is only printed.  Returns True unless the
        command runner handed back an error.
        """
        scriptCommand = MultiVersion.MWScriptAsArray(self._config, "dumpBackup.php")
        command = [ "%s" % self._config.php, "-q" ]
        command.extend(scriptCommand)
        command.extend(["--wiki=%s" % self.wikiName, "--stub", "--quiet",
                        "--force-normal", "--output=gzip:%s" % self.stubFile.getPath(),
                        "--revrange", "--revstart=%s" % startRevID, "--revend=%s" % endRevID ])
        if self.dryrun:
            print("would run command for stubs dump: %s" % (command,))
        else:
            error = RunSimpleCommand.runWithNoOutput(command, shell = False)
            if error:
                if self.verbose:
                    print("error producing stub files for wiki %s" % self.wikiName)
                return False
        return True

    def dumpRevs(self):
        """Run dumpTextPass.php to write the revision text file from the stubs.

        In a dry run the command is only printed.  Returns True unless the
        command runner handed back an error.
        """
        scriptCommand = MultiVersion.MWScriptAsArray(self._config, "dumpTextPass.php")
        command = [ "%s" % self._config.php, "-q" ]
        command.extend(scriptCommand)
        command.extend(["--wiki=%s" % self.wikiName, "--stub=gzip:%s" % self.stubFile.getPath(),
                        "--force-normal", "--quiet", "--spawn=%s" % self._config.php,
                        "--output=bzip2:%s" % self.revsFile.getPath()
                        ])
        if self.dryrun:
            print("would run command for revs dump: %s" % (command,))
        else:
            error = RunSimpleCommand.runWithNoOutput(command, shell = False)
            if error:
                if self.verbose:
                    print("error producing revision text files for wiki %s" % self.wikiName)
                return False
        return True

    def md5sumOneFile(self, filename):
        """Return the hex md5 digest of the contents of the named file."""
        summer = hashlib.md5()
        bufsize = 4192 * 32
        infile = open(filename, "rb")
        try:
            buffer = infile.read(bufsize)
            while buffer:
                summer.update(buffer)
                buffer = infile.read(bufsize)
        finally:
            infile.close()
        return summer.hexdigest()

    def md5sums(self):
        """Write the md5 sums of the files that were dumped, one per line,
        to the md5 file.  Returns True on success and False on any error."""
        try:
            md5File = MD5File(self._config, self.date, self.wikiName)
            files = []
            if self.doStubs:
                files.append(self.stubFile.getPath())
            if self.doRevs:
                files.append(self.revsFile.getPath())
            text = "".join(["%s\n" % self.md5sumOneFile(f) for f in files])
            FileUtils.writeFileInPlace(md5File.getPath(), text, self._config.fileperms)
            return True
        except Exception:
            if self.verbose:
                traceback.print_exc(file=sys.stdout)
            return False
 172+
class IncrDumpLoop(object):
    """Run the adds/changes dump over all wikis listed in the config."""

    def __init__(self, config, date, doStubs, doRevs, dryrun, verbose):
        self._config = config
        self.date = date
        self.doStubs = doStubs
        self.doRevs = doRevs
        self.dryrun = dryrun
        self.verbose = verbose

    def doRunOnAllWikis(self):
        """Dump every wiki once; return the tuple (failures, todos)."""
        retCodes = DumpResults()
        failures = 0
        todos = 0
        for w in self._config.allWikisList:
            # use the settings handed to this object rather than names that
            # exist only when the module is run as a script
            dump = IncrDump(self._config, self.date, w, self.doStubs,
                            self.doRevs, self.dryrun, self.verbose)
            result = dump.doOneWiki()
            if result == retCodes.FAILED:
                failures = failures + 1
            elif result == retCodes.TODO:
                todos = todos + 1
        return (failures, todos)

    def doAllWikisTilDone(self,numFails):
        """Repeat the run over all wikis until nothing failed or is left to do.

        Raises IncrementDumpsError once more than numFails runs were incomplete.
        """
        fails = 0
        while 1:
            (failures, todos) = self.doRunOnAllWikis()
            if not failures and not todos:
                break
            fails = fails + 1
            if fails > numFails:
                raise IncrementDumpsError("Too many consecutive failures, giving up")
            # wait 5 minutes and try another loop
            time.sleep(300)
 207+
def usage(message = None):
    """Print the optional message and the usage text, then exit with status 1."""
    if message:
        print(message)
    print("Usage: python generateincrementals.py [options] [wikidbname]")
    print("Options: --configfile, --date, --dryrun, --revsonly, --stubsonly, --verbose")
    print("--configfile:  Specify an alternate config file to read. Default file is 'dumpincr.conf' in the current directory.")
    print("--date:        (Re)run incremental of a given date (use with care).")
    print("--dryrun:      Don't actually dump anything but print the commands that would be run.")
    # the descriptions of the next two options used to be swapped
    print("--revsonly:    Do only the revision text part of the dumps.")
    print("--stubsonly:   Do only the stubs part of the dumps.")
    print("--verbose:     Print error messages and other informative messages (normally the")
    print("               script runs silently).")
    print("wikidbname:    Run the dumps only for the specific wiki.")
    sys.exit(1)
 222+
if __name__ == "__main__":
    # phase 2 of the adds/changes dumps: dump one wiki (if a name is
    # given on the command line) or all wikis
    configFile = False
    date = None
    doStubs = True
    doRevs = True
    dryrun = False
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['date=', 'configfile=', 'stubsonly', 'revsonly', 'dryrun', 'verbose' ])
    except getopt.GetoptError:
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--date":
            date = val
        elif opt == "--configfile":
            configFile = val
        elif opt == "--stubsonly":
            doRevs = False
        elif opt == "--revsonly":
            doStubs = False
        elif opt == "--dryrun":
            dryrun = True
        elif opt == "--verbose":
            verbose = True

    if not doRevs and not doStubs:
        usage("You may not specify stubsonly and revsonly options together.")

    # Config falls back to its default file name when handed False
    config = Config(configFile)

    if not date:
        date = TimeUtils.today()

    if len(remainder) > 0:
        dump = IncrDump(config, date, remainder[0], doStubs, doRevs, dryrun, verbose)
        # the dump object used to be created here but never run
        dump.doOneWiki()
    else:
        dump = IncrDumpLoop(config, date, doStubs, doRevs, dryrun, verbose)
        dump.doAllWikisTilDone(3)
Property changes on: branches/ariel/xmldumps-backup/incrementals/generateincrementals.py
___________________________________________________________________
Added: svn:eol-style
1268 + native
Index: branches/ariel/xmldumps-backup/incrementals/incrs-index.html
@@ -0,0 +1,118 @@
 2+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 3+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 4+
 5+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 6+<head>
 7+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 8+ <title>Incremental dumps</title>
 9+ <style type="text/css">
 10+ html, body {
 11+ background-color: #ffffff;
 12+ color: black;
 13+ }
 14+ .siteinfo {
 15+ text-align: center;
 16+ }
 17+ li {
 18+ background-color: #ffffff;
 19+ list-style-type: none;
 20+ }
 21+ li li {
 22+ background-color: white;
 23+ }
 24+ li ul {
 25+ margin-top: 4px;
 26+ margin-bottom: 8px;
 27+ }
 28+ .detail {
 29+ font-weight: normal;
 30+ font-style: italic;
 31+ }
 32+ .updates {
 33+ font-family: monospace;
 34+ font-size: smaller;
 35+ }
 36+ .status {
 37+ font-weight: bold;
 38+ padding-left: 1em;
 39+ padding-right: 1em;
 40+ }
 41+ .in-progress {
 42+ font-weight: bold;
 43+ }
 44+ .failed {
 45+ color: Maroon;
 46+ font-weight: bold;
 47+ }
 48+ .waiting {
 49+ color: Silver; /* Gray ? */
 50+ }
 51+ .progress {
 52+ font-family: monospace;
 53+ font-size: 80%%;
 54+ margin-left: .5in;
 55+ }
 56+ </style>
 57+</head>
 58+
 59+<body>
 60+ <h1>Adds/changes dumps</h1>
 61+
 62+ <p class="siteinfo">
 63+ This is the Wikimedia adds/changes dump service.
 64+ Please read the <a href='legal.html'>copyright</a> information.
 65+ See <a href="http://meta.wikimedia.org/wiki/Data_dumps">Meta:Data dumps</a>
 66+ for documentation on the provided data formats.
 67+ </p>
 68+ <p>
 69+ Here's the big fat disclaimer.
 70+ </p>
 71+ <p>
 72+ This service is experimental. At any time it may not be working, for a day, a week or a month.
 73+ It is not intended to replace the full XML dumps. We don't expect users to be able to construct
 74+ full dumps of a given date from the incrementals and an older dump.
 75+ </p>
 76+ <p>
 77+ The data provided in these files is <em>partial data</em>. To be precise:
 78+ <ul>
 79+ <li>* Revisions included in these dumps are not up to the minute. We write out those that were
 80+ created up to 18 hours ago; this gives local editing communities time to delete revisions
 81+ with sensitive information, vulgarities and other vandalism, etc.</li>
 82+ <li>* New pages entered for the first time during the time interval are included</li>
 83+ <li>* Revisions of undeleted pages will be included only if new revision IDs need to be assigned to
 84+ the restored revisions. For most revisions this will not be the case. </li>
 85+ <li>* Information about moves and deletes is not included.</li>
 86+ <li>* Imported revisions will be included if they were imported during the time interval, since they
 87+ will have new revision IDs.</li>
 88+ <li>* As with all dumps, hidden revisions or more generally revisions not readable by the general public
 89+ are not provided.</li>
 90+ </ul>
 91+ </p>
 92+ <p>
 93+ What is in these files:
 94+ </p>
 95+ <p>
 96+ The stubs file consists of the metadata for revision texts of each page, where the revision texts were
 97+ added within the time interval. These look just like the history stubs files you would find on our XML data dumps
 98+ page, having the exact same format but only new revisions since the last adds/changes dump. This means you get
 99+ metadata for articles, user pages, discussion pages, etc. If you want articles only, you will need to write a
 100+ filter to grab just those entries.
 101+ </p>
 102+ <p>
 103+ The revs file consists of the metadata plus the wikitext for each new revision since the last adds/changes dump.
 104+ This is in the same format as the pages-meta-history files you would find on our XML data dumps page. This means
 105+ you get articles, user pages, discussion pages, etc. If you want articles only, you will need to write a
 106+ filter to grab just those entries.
 107+ </p>
 108+ <h2>Adds/changes dump listing</h2>
 109+ <ul>
 110+ %(items)s
 111+ </ul>
 112+ <hr />
 113+ <p>
 114+ Return to <a href="http://dumps.wikimedia.org/other/">our other datasets</a>, the
 115+ <a href="http://dumps.wikimedia.org/backup-index.html">XML data dumps</a>, or
 116+ <a href="http://dumps.wikimedia.org/index.html">the main index</a>.
 117+ </p>
 118+</body>
 119+</html>
Index: branches/ariel/xmldumps-backup/incrementals/all.dblist
@@ -0,0 +1,5 @@
 2+elwikidb
 3+simplewikidb
 4+testAw118wmf1
 5+testBw118wmf1
 6+testCw118wmf1
\ No newline at end of file
Index: branches/ariel/xmldumps-backup/incrementals/closed.dblist
Index: branches/ariel/xmldumps-backup/incrementals/incrmonitor.py
@@ -0,0 +1,134 @@
 2+# generate an index page covering the status of and links to
 3+# incremental files for the latest date for each project
 4+
 5+import ConfigParser
 6+import getopt
 7+import os
 8+import re
 9+import sys
 10+import WikiDump
 11+from WikiDump import FileUtils, TimeUtils, MiscUtils
 12+import subprocess
 13+import socket
 14+import time
 15+import IncrDumpLib
 16+from IncrDumpLib import Lock, Config, RunSimpleCommand, MultiVersion, DBServer, IncrementDir, IncrementDumpsError, IndexFile, IncrDumpLockFile, IncDumpDirs, IncrDumpLock, MaxRevIDLock, StubFile, RevsFile, StatusFile
 17+from subprocess import Popen, PIPE
 18+from os.path import exists
 19+import hashlib
 20+import traceback
 21+
class Link(object):
    """Helper for building HTML anchor elements."""

    @staticmethod
    def makeLink(path, linkText):
        """Return an anchor tag whose href is path and whose text is linkText."""
        pieces = ('<a href = "', path, '">', linkText, "</a>")
        return "".join(pieces)
 28+
class Index(object):
    """Builds the HTML index page listing the latest adds/changes dump of each wiki."""

    def __init__(self, config, verbose):
        """Store the configuration and create the index file / dump directory helpers.

        config  -- configuration object; this class reads its wiki lists,
                   template reader and file permissions
        verbose -- if true, diagnostics are printed to stdout
        """
        self._config = config
        self.indexFile = IndexFile(self._config)
        self.incrDir = IncrementDir(self._config)
        self.verbose = verbose

    def doAllWikis(self):
        """Collect one list item per wiki and write the filled-in index template."""
        items = []
        for w in self._config.allWikisList:
            result = self.doOneWiki(w)
            # wikis with nothing to report (None) are left out of the page
            if result:
                items.append("<li>" + result + "</li>\n")
        indexText = self._config.readTemplate("incrs-index.html") % { "items" : "".join(items) }
        FileUtils.writeFileInPlace(self.indexFile.getPath(), indexText, self._config.fileperms)

    def doOneWiki(self, w):
        """Return the HTML status text for wiki w.

        Returns None when the wiki is private or closed, or when it has no
        dump directory or no dated run yet.  When gathering or formatting the
        information fails, a plain error string naming the wiki is returned
        so that the index page can still be generated.
        """
        if w in self._config.privateWikisList or w in self._config.closedWikisList:
            return None

        self.incrDumpsDirs = IncDumpDirs(self._config, w)
        if not exists(self.incrDir.getIncDirNoDate(w)):
            if self.verbose:
                print("No dump for wiki  %s" % w)
            # skip this wiki; the original bare "next" was a no-op and fell through
            return None

        incrDate = self.incrDumpsDirs.getLatestIncrDate()
        if not incrDate:
            if self.verbose:
                print("No dump for wiki  %s" % w)
            return None

        # Gather lock, file and status information for the latest run.
        try:
            lock = IncrDumpLock(self._config, incrDate, w)
            lockDate = lock.getLockInfo()

            stub = StubFile(self._config, incrDate, w)
            (stubDate, stubSize) = stub.getFileInfo()
            revs = RevsFile(self._config, incrDate, w)
            (revsDate, revsSize) = revs.getFileInfo()
            stat = StatusFile(self._config, incrDate, w)
            statContents = FileUtils.readFile(stat.getPath())
        except Exception:
            if self.verbose:
                traceback.print_exc(file=sys.stdout)
            # must be a single string: the caller concatenates it into "<li>...</li>"
            return "Error encountered, no information available for wiki %s" % w

        # Turn whatever information is present into HTML fragments.
        try:
            wikinameText = "<strong>%s</strong>" % w
            if lockDate:
                lockText = "run started on %s." % lockDate
            else:
                lockText = None
            if stubDate:
                stubText = "stubs: %s (size %s)" % (Link.makeLink(os.path.join(w, incrDate, stub.getFileName()), stubDate), stubSize)
            else:
                stubText = None
            if revsDate:
                revsText = "revs: %s (size %s)" % (Link.makeLink(os.path.join(w, incrDate, revs.getFileName()), revsDate), revsSize)
            else:
                revsText = None
            if statContents:
                statText = "(%s)" % (statContents)
            else:
                statText = None

            # filter(None, ...) drops the pieces for which no information exists
            wikiInfo = " ".join( filter( None, [ wikinameText, lockText, statText ] ) ) + "<br />"
            wikiInfo = wikiInfo + " &nbsp;&nbsp; " + " | ".join( filter( None, [ stubText, revsText ] ))
        except Exception:
            if self.verbose:
                traceback.print_exc(file=sys.stdout)
            return "Error encountered formatting information for wiki %s" % w

        return wikiInfo
 102+
def usage(message = None):
    """Print an optional error message and the usage help, then exit with status 1.

    message -- text printed before the usage help; nothing extra is printed
               when it is empty or None
    """
    if message:
        print(message)
    # The script is incrmonitor.py and takes no positional arguments.
    print("Usage: python incrmonitor.py [options]")
    print("Options: --configfile, --verbose")
    print("--configfile: Specify an alternate config file to read. Default file is 'dumpincr.conf' in the current directory.")
    print("--verbose: Print error messages and other informative messages (normally the")
    print(" script runs silently).")
    sys.exit(1)
 112+
if __name__ == "__main__":
    # Command-line entry point: parse the options, load the configuration
    # and regenerate the index page for all wikis.
    configFile = False
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
            ['configfile=', 'verbose'])
    except getopt.GetoptError:
        # Only option-parsing failures are reported as a usage error; a bare
        # except here would also swallow SystemExit and KeyboardInterrupt.
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--configfile":
            configFile = val
        elif opt == '--verbose':
            verbose = True

    if configFile:
        config = Config(configFile)
    else:
        config = Config()

    index = Index(config, verbose)
    index.doAllWikis()
Property changes on: branches/ariel/xmldumps-backup/incrementals/incrmonitor.py
___________________________________________________________________
Added: svn:eol-style
1136 + native
Index: branches/ariel/xmldumps-backup/incrementals/private.dblist
Index: branches/ariel/xmldumps-backup/incrementals/README.txt
@@ -0,0 +1,66 @@
 2+The adds/changes dumps are a supplementary set of dumps intended to accompany
 3+the regular XML dump files.
 4+
 5+The adds/changes dumps are produced in two stages.
 6+
 7+In stage one, the max rev_id value at the time of the run is written out to a file for each project for the given date. Script name: generatemaxrevids.py
 8+
 9+In stage two, intended to be run at a later time, a stub file is written
 10+containing all revisions from the previous adds/changes dump through the max
 11+rev_id just recorded. This file is sorted by page id, just as the regular XML stubs files
 12+are. Next a history file containing metadata and page text for those
 13+revisions is written, in the same format as the pages-meta-history file
 14+generated for the regular XML dumps. A status file is written to indicate
 15+that the job is done, and the md5sums of the stub and revision text files
 16+are written to a file as well. Script name: generateincrementals.py
 17+
 18+The reason that there are two stages run via two separate scripts is that
 19+you may want to allow editors time to delete or hide sensitive or offensive
 20+material newly entered. A delay of an arbitrary number of seconds between
 21+the recording of the max rev_id to dump and the start of the stub and
 22+revision text dump is configurable in the configuration file; see
 23+README.config for information on that.
 24+
 25+Installation:
 26+
 27+Seriously? You want to install this already? This is version 0.0.1. Know
 28+what that means? It's buggy, risky, and could eat your data.
 29+
 30+However, if you just want to play around with it on your laptop, fine.
 31+* Put the files generateincrementals.py, generatemaxrevids.py, incrmonitor.py,
 32+ incrmonitor and IncrDumpLib.py together with the sample configuration file
 33+ dumpincr.conf into a directory from which the job will run.
 34+ Make sure you have a copy or a symlink of WikiDump.py from the regular XML
 35+ dumps in this same directory.
 36+ Also make sure you have a template for the top level index.html file, called
 37+ "incrs-index.html" in the same directory with these scripts. See the existing
 38+ incrs-index.html file for the format; the key here is that you want the
 39+ string "%(items)s" in between <ul> and </ul> tags. The status of the dump
 40+ for each wiki, along with links to the stub and revisions files, will be
 41+ included as a list item in that spot in the file.
 42+* See README.config for information on the various options in the config file.
 43+* Create the top level directory underneath which there will be a directory
 44+ for each project you want to generate additions/changes. You needn't create
 45+ the subdirectories, this will be done for you at run time.
 46+* Do a test run; run generatemaxrevids.py by hand. Then look in the top level
 47+ directory you created earlier. Is there a directory for each project? Is
 48+ there a subdirectory under each of these with the date, in YYYYMMDD format?
 49+ In the date subdirectory is there a file maxrevid.txt containing a positive
 50+ integer?
 51+* Do the phase 2 test run: run generateincrementals.py by hand. If you have
 52+ configured a large delay, you will need to wait at least that amount of time
 53+ before running this script. When it has completed, check the subdirectory
 54+ from phase 1; are there files analogous to the following?
 55+ mywiki-yyyymmdd-md5sums.txt
 56+ mywiki-yyyymmdd-pages-meta-hist-incr.xml.bz2
 57+ mywiki-yyyymmdd-stubs-meta-hist-incr.xml.gz
 58+ maxrevid.txt
 59+ status.txt
 60+ Does the status.txt file contain "done"?
 61+* If the runs look like they are producing the right files, do the html
 62+ generation by hand; run incrmonitor.py. In the top level directory for the
 63+ adds/changes dumps, do you see the file index.html? If you view that
 64+ file in a browser, do the contents look reasonable?
 65+* If that looks good, put phase 1 and phase 2 into separate cron jobs,
 66+ spacing them out as appropriate.
 67+
Property changes on: branches/ariel/xmldumps-backup/incrementals/README.txt
___________________________________________________________________
Added: svn:eol-style
168 + native

Status & tagging log