r12613 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r12612‎ | r12613 | r12614 >
Date:00:47, 12 January 2006
Author:vibber
Status:old
Tags:
Comment:
Initial revision
Modified paths:
  • /trunk/backup (added) (history)
  • /trunk/backup/WikiBackup.py (added) (history)

Diff [purge]

Index: trunk/backup/WikiBackup.py
@@ -0,0 +1,369 @@
 2+#!/usr/bin/python
 3+
 4+"""Backup/public data dump runner for Wikimedia's MediaWiki-based sites.
 5+
 6+This replaces the old set of hacky bash scripts we used to use.
 7+
 8+Current state:
 9+* Seems to dump basic files correctly on my test system.
 10+
 11+TODO:
 12+* detect and handle error conditions ;)
 13+* lock files / looping
 14+* use date-based subdirectories
 15+* generate HTML pages with status and navigable links
 16+* generate file checksums
 17+* symlink files to a stable directory on completion
 18+* make upload tarballs?
 19+* detect low disk space and either call for help or automatically clear old files
 20+
 21+
 22+To run, make a wrapper script something like this:
 23+runner = WikiBackup.Runner(
 24+ public="/dumps/public",
 25+ private="/dumps/private",
 26+ dblist=["onesix"],
 27+ privatelist=(),
 28+ dbserver="localhost",
 29+ dbuser="root",
 30+ dbpassword="",
 31+ wikidir="/opt/web/pages/head",
 32+ php="/opt/php51/bin/php")
 33+runner.run()
 34+"""
 35+
 36+import os
 37+
def dbList(filename):
    """Read a list of database names from a text file.

    Each line of the file is stripped of surrounding whitespace; lines that
    are empty after stripping are skipped.

    filename -- path of the list file to read

    Returns the remaining lines as a list of strings, in file order.
    Raises whatever open() raises if the file cannot be opened.
    """
    infile = open(filename)
    try:
        # Close the file even if reading fails part-way through.
        stripped = [line.strip() for line in infile]
    finally:
        infile.close()
    return [name for name in stripped if name != ""]
 47+
def shellEscape(param):
    """Escape a string parameter, or set of strings, for the shell.

    param -- a string, None, or an iterable of those (nested iterables are
             handled recursively)

    Returns:
    * for a string: the string wrapped in single quotes, with any embedded
      single quote rewritten as '\\'' so the shell reads it literally;
    * for None: an empty string, so the parameter vanishes from the command;
    * for any other iterable: a tuple of the escaped members, suitable as the
      right-hand side of a "%" format operation.
    """
    try:
        stringTypes = basestring
    except NameError:
        # Python 3 dropped 'basestring'; plain str is the text type there.
        stringTypes = str

    if isinstance(param, stringTypes):
        return "'" + param.replace("'", "'\\''") + "'"
    if param is None:
        # A blank string might actually be needed; None means we can leave it out
        return ""
    return tuple([shellEscape(item) for item in param])
 57+
class BackupError(Exception):
    """Raised when a dump run cannot proceed, e.g. a lock file already exists."""


class Runner(object):
    """Run the configured set of dumps for each wiki database in turn.

    Constructor arguments:
    public      -- base directory for public dump files
    private     -- base directory for private dump files
    dblist      -- iterable of database names to dump
    privatelist -- database names whose "public" files are written under
                   the private directory instead
    dbserver    -- MySQL host passed to mysql/mysqldump via -h
    dbuser      -- MySQL user passed via -u
    dbpassword  -- MySQL password; an empty string means "send no password"
    wikidir     -- MediaWiki directory containing maintenance/
    php         -- PHP command-line binary to run maintenance scripts with

    The database currently being dumped is kept in self.db, which is set by
    run() before each call to doBackup().
    """

    def __init__(self, public, private, dblist, privatelist, dbserver, dbuser, dbpassword, wikidir, php="php"):
        self.public = public
        self.private = private
        self.dblist = dblist
        self.privatelist = privatelist
        self.dbserver = dbserver
        self.dbuser = dbuser
        self.dbpassword = dbpassword
        self.wikidir = wikidir
        self.php = php

    # Public methods for the manager script...

    def run(self):
        """Iterate through the list of wikis and dump them!"""
        self.debug("Starting dump...")
        for db in self.dblist:
            self.db = db
            self.doBackup()
        self.debug("Done!")

    # Public methods for dumps to use...

    def privatePath(self, filename=""):
        """Take a given filename in the private dump dir for the selected database."""
        return os.path.join(self.private, self.db, filename)

    def publicPath(self, filename=""):
        """Take a given filename in the public dump dir for the selected database.
        If this database is marked as private, will use the private dir instead.
        """
        if self.db in self.privatelist:
            return self.privatePath(filename)
        return os.path.join(self.public, self.db, filename)

    def passwordOption(self):
        """Return the mysql password option, or None when no password is set.

        If you pass '-pfoo' mysql uses the password 'foo', but if you pass
        '-p' it prompts. Sigh. So an empty password yields None, which
        shellEscape() turns into nothing at all on the command line.

        The value is returned UNESCAPED: saveTable() and saveSql() pass it
        through shellEscape() together with the other parameters. Escaping it
        here as well would hand mysql a password wrapped in literal quotes.
        """
        if self.dbpassword == "":
            return None
        return "-p" + self.dbpassword

    def saveTable(self, table, outfile):
        """Dump a table from the current DB with mysqldump, save to a gzipped sql file.

        Returns the exit status reported by os.system().
        """
        command = "mysqldump -h %s -u %s %s --opt --quote-names %s %s | gzip" % shellEscape((
            self.dbserver,
            self.dbuser,
            self.passwordOption(),
            self.db,
            table))
        return self.saveCommand(command, outfile)

    def saveSql(self, query, outfile):
        """Pass some SQL commands to the server for this DB and save output to a file.

        The output is gzipped. Returns the exit status reported by os.system().
        """
        command = "echo %s | mysql -h %s -u %s %s %s | gzip" % shellEscape((
            query,
            self.dbserver,
            self.dbuser,
            self.passwordOption(),
            self.db))
        return self.saveCommand(command, outfile)

    def saveCommand(self, command, outfile):
        """Shell out and redirect output to a given file.

        'command' is used as-is (the caller must already have escaped it);
        'outfile' is escaped here.
        """
        return self.runCommand(command + " > " + shellEscape(outfile))

    def runCommand(self, command):
        """Shell out; output is assumed to be saved usefully somehow.

        Returns the value of os.system(command).
        """
        self.debug("runCommand: " + command)
        return os.system(command)

    def debug(self, stuff):
        """Print a progress/debug message to standard output."""
        # Single-argument call form prints identically on Python 2 and 3.
        print(stuff)

    # Settings carried over from the old shell scripts, not yet ported:
    # auto-set
    #OutputDir=$PublicDir/$DirLang
    #StatusLog=$OutputDir/backup.log
    #StatusLockFile=$OutputDir/backup.lock
    #StatusDoneFile=$OutputDir/backup.done

    #GlobalLog=/var/backup/public/backup.log

    def makeDir(self, dir):
        """Create directory 'dir' if it does not exist yet (one level only)."""
        if os.path.exists(dir):
            self.debug("Checkdir dir %s ..." % dir)
        else:
            self.debug("Creating %s ..." % dir)
            os.mkdir(dir)

    def doBackup(self):
        """Run every dump item for the database currently held in self.db.

        Creates the public and private output directories as needed, takes
        the lock, runs each item in order, then runs the checksum and
        completion steps and reports success. The return values of the
        individual items are not inspected.
        """
        self.makeDir(self.public)
        self.makeDir(self.publicPath())

        self.makeDir(self.private)
        self.makeDir(self.privatePath())

        self.status("Starting backup of %s" % self.db)
        self.lock()

        items = [
            PrivateTable("user", "User account data."),
            PrivateTable("user_groups", "User group assignments."),
            PrivateTable("watchlist", "Users' watchlist settings."),
            PrivateTable("ipblocks", "Data for blocks of IP addresses, ranges, and users."),
            PrivateTable("archive", "Deleted page and revision data."),
            PrivateTable("updates", "Update dataset for OAI updater system."),

            PublicTable("site_stats", "A few statistics such as the page count."),
            PublicTable("image", "Metadata on current versions of uploaded images."),
            PublicTable("oldimage", "Metadata on prior versions of uploaded images."),
            PublicTable("pagelinks", "Wiki page-to-page link records."),
            PublicTable("categorylinks", "Wiki category membership link records."),
            PublicTable("imagelinks", "Wiki image usage records."),
            PublicTable("templatelinks", "Wiki template inclusion link records."),
            PublicTable("interwiki", "Set of defined interwiki prefixes and links for this wiki."),
            PublicTable("logging", "Data for various events (deletions, uploads, etc)."),

            PublicTable("page", "Base per-page data (id, title, restrictions, etc)."),
            #PublicTable("revision", "Base per-revision data (does not include text)."), // safe?
            #PrivateTable("text", "Text blob storage. May be compressed, etc."), // ?

            XmlStub("First-pass for page XML data dumps"),
            XmlDump("full", "All pages with complete page edit history (very large!)"),
            XmlDump("current", "All pages, current versions only"),
            XmlDump("articles", "Articles, templates, image descriptions, and main meta-pages (recommended)"),

            TitleDump("List of page titles"),

            # YahooDump(),
        ]

        for item in items:
            item.run(self)

        self.checksums()
        self.completeDump()

        self.unlock()
        self.statusComplete()

    def lockFile(self):
        """Return the path of the lock marker for the current database."""
        return self.publicPath("lock")

    def doneFile(self):
        """Return the path of the completion marker for the current database."""
        return self.publicPath("done")

    def lock(self):
        """Refuse to run if a lock file exists; clear any old completion marker.

        Raises BackupError when the lock file is already present.
        """
        self.status("Creating lock file.")
        lockfile = self.lockFile()
        donefile = self.doneFile()
        if os.path.exists(lockfile):
            raise BackupError("Lock file %s already exists" % lockfile)
        if os.path.exists(donefile):
            self.status("Removing completion marker %s" % donefile)
            os.remove(donefile)
        # NOTE(review): the lock file is never actually written yet (see the
        # shell line below); this best-effort removal is kept from the
        # original and only ignores filesystem errors.
        try:
            os.remove(lockfile)
        except OSError:
            # failure? let it die
            pass
        #####date -u > $StatusLockFile

    def unlock(self):
        """Report completion; writing the done marker is not implemented yet."""
        self.status("Marking complete.")
        ######date -u > $StatusDoneFile

    def dateStamp(self):
        """Placeholder for a UTC ISO-8601 timestamp; currently returns None."""
        #date -u --iso-8601=seconds
        pass

    def status(self, message):
        """Report a normal progress message (currently just debug output)."""
        #echo $DatabaseName `dateStamp` OK: "$1" | tee -a $StatusLog | tee -a $GlobalLog
        self.debug(message)

    def statusError(self, message):
        """Report an error message (currently just debug output)."""
        # echo $DatabaseName `dateStamp` ABORT: "$1" | tee -a $StatusLog | tee -a $GlobalLog
        # echo "Backup of $DatabaseName failed at: $1" | \
        # mail -s "Wikimedia backup error on $DatabaseName" $AbortEmail
        # exit -1
        self.debug(message)

    def statusComplete(self):
        """Report successful completion of the current database's dump."""
        # echo $DatabaseName `dateStamp` SUCCESS: "done." | tee -a $StatusLog | tee -a $GlobalLog
        self.debug("SUCCESS: done.")

    def checksums(self):
        """Stub: only prints a notice; no checksums are computed yet."""
        self.debug("If this script were finished, it would be checksumming files here")

    def completeDump(self):
        """Stub: only prints a notice; no symlinks are created yet."""
        self.debug("If this script were finished, it would be adding symlinks or something")
 249+
class Dump(object):
    """Base class for one dump step; holds a human-readable description."""

    def __init__(self, desc):
        # Remember the caller-supplied text for description().
        self._desc = desc

    def description(self):
        """Return the description text given to the constructor."""
        return self._desc
 256+
class PublicTable(Dump):
    """Dump a single database table to <table>.sql.gz in the public dump dir."""

    def __init__(self, table, descr):
        """table -- name of the table to dump; descr -- description text."""
        # Go through Dump so that description() finds its _desc attribute;
        # storing the text only as _descr made description() raise
        # AttributeError.
        Dump.__init__(self, descr)
        self._table = table
        # Old attribute name, retained in case anything still reads it.
        self._descr = descr

    def _path(self, runner, filename):
        """Return where 'filename' should be written for this kind of table."""
        return runner.publicPath(filename)

    def run(self, runner):
        """Save the table via runner.saveTable() and return its result."""
        path = self._path(runner, self._table + ".sql.gz")
        return runner.saveTable(self._table, path)
 268+
class PrivateTable(PublicTable):
    """Like PublicTable, but the output file goes to the private dump dir."""

    def __init__(self, table, descr):
        """table -- name of the table to dump; descr -- description text."""
        PublicTable.__init__(self, table, descr)
        # description() below reads _desc, so make sure it is set here;
        # previously only _descr was stored and description() raised
        # AttributeError.
        self._desc = descr

    def description(self):
        """Return the description text, flagged as private."""
        return self._desc + " (private)"

    def _path(self, runner, filename):
        """Return the location of 'filename' in the private dump dir."""
        return runner.privatePath(filename)
 279+
 280+
class XmlStub(Dump):
    """Create lightweight skeleton dumps, minus bulk text.
    A second pass will import text from prior dumps or the database to make
    full files for the public."""

    def description(self):
        """Return a fixed progress message (the constructor text is not used)."""
        return "creating split stub dumps..."

    def run(self, runner):
        """Run dumpBackup.php once, writing three gzipped stub files.

        The outputs are written to the private dir as stub-full.xml.gz,
        stub-current.xml.gz and stub-articles.xml.gz; each --filter applies
        to the --output that precedes it.

        Returns the exit status from runner.runCommand(), like the other
        dump classes do.
        """
        # Assemble the command explicitly instead of relying on
        # backslash-newline continuations inside a triple-quoted string.
        template = " ".join([
            "%s -q %s/maintenance/dumpBackup.php %s",
            "--full",
            "--stub",
            "--output=gzip:%s",
            "--output=gzip:%s",
            "--filter=latest",
            "--output=gzip:%s",
            "--filter=latest",
            "--filter=notalk",
            # The shell must still see "\!", so the backslash is doubled here.
            "--filter=namespace:\\!NS_USER",
        ])
        command = template % shellEscape((
            runner.php,
            runner.wikidir,
            runner.db,
            runner.privatePath("stub-full.xml.gz"),
            runner.privatePath("stub-current.xml.gz"),
            runner.privatePath("stub-articles.xml.gz")))
        return runner.runCommand(command)
 309+
class XmlDump(Dump):
    """Primary XML dumps, one section at a time."""

    def __init__(self, subset, desc):
        """subset -- section name used in file names, e.g. "full"; desc -- description text."""
        self._subset = subset
        self._desc = desc

    def run(self, runner):
        """Build the bzip2 and 7zip page dumps for this subset.

        The matching stub file from the private dir is decompressed and piped
        into dumpTextPass.php. If a previous dump is found, it is offered as
        a text prefetch source.

        Returns the exit status from runner.runCommand().
        """
        xmlbz2 = runner.publicPath("pages_" + self._subset + ".xml.bz2")
        xml7z = runner.publicPath("pages_" + self._subset + ".xml.7z")

        # Clear prior 7zip attempts; 7zip will try to append an existing archive
        if os.path.exists(xml7z):
            os.remove(xml7z)

        # Page and revision data pulled from this skeleton dump...
        # (A stray trailing comma used to make this a 1-tuple, and the path
        # went into the shell command unescaped.)
        stub = runner.privatePath("stub-%s.xml.gz" % self._subset)
        stubCommand = "gzip -dc %s" % shellEscape(stub)

        # Try to pull text from the previous run; most stuff hasn't changed
        #Source=$OutputDir/pages_$section.xml.bz2
        source = self._findPreviousDump(runner)
        if os.path.exists(source):
            runner.status("... building %s XML dump, with text prefetch from %s..." % (self._subset, source))
            prefetch = "--prefetch=bzip2:%s" % (source)
        else:
            runner.status("... building %s XML dump, no text prefetch..." % self._subset)
            # None is dropped by shellEscape(); "" would be escaped to '' and
            # passed to the PHP script as a bogus empty argument.
            prefetch = None

        dumpCommand = "%s -q %s/maintenance/dumpTextPass.php %s %s --output=bzip2:%s --output=7zip:%s" % shellEscape((
            runner.php,
            runner.wikidir,
            runner.db,
            prefetch,
            xmlbz2,
            xml7z))
        command = stubCommand + " | " + dumpCommand

        return runner.runCommand(command)

    def _findPreviousDump(self, runner):
        """Placeholder: always returns a fixed dummy path."""
        return "/tmp/fake/foo"
 351+
class TitleDump(Dump):
    """This is used by "wikiproxy", a program to add Wikipedia links to BBC news online"""

    def run(self, runner):
        """Save the page titles of namespace 0 via runner.saveSql() and return its result."""
        query = "select page_title from page where page_namespace=0;"
        target = runner.publicPath("all_titles_in_ns0.gz")
        return runner.saveSql(query, target)
 357+
 358+
class Checksums(Dump):
    """Write md5sum output for the public dump files to md5sums.txt."""

    def description(self):
        """Return a fixed progress message."""
        return "calculating MD5 hashes"

    def run(self, runner):
        """Run md5sum over the public dump files and save its output.

        The glob patterns are deliberately left unescaped so that the shell
        expands them. Returns the result of runner.saveCommand().
        """
        # FIXME: run checksums only on the master server?
        patterns = ("*.xml.*", "*.sql.gz", "all_titles_in_ns0.gz")
        targets = [runner.publicPath(pattern) for pattern in patterns]
        command = "md5sum " + " ".join(targets)
        return runner.saveCommand(command, runner.publicPath("md5sums.txt"))
 370+
Property changes on: trunk/backup/WikiBackup.py
___________________________________________________________________
Added: svn:keywords
1371 + Author Date Id Revision
Added: svn:eol-style
2372 + native

Status & tagging log