Index: trunk/backup/WikiBackup.py |
| | @@ -0,0 +1,369 @@ |
| 2 | +#!/usr/bin/python |
| 3 | + |
| 4 | +"""Backup/public data dump runner for Wikimedia's MediaWiki-based sites. |
| 5 | + |
| 6 | +This replaces the old set of hacky bash scripts we used to use. |
| 7 | + |
| 8 | +Current state: |
| 9 | +* Seems to dump basic files correctly on my test system. |
| 10 | + |
| 11 | +TODO: |
| 12 | +* detect and handle error conditions ;) |
| 13 | +* lock files / looping |
| 14 | +* use date-based subdirectories |
| 15 | +* generate HTML pages with status and navigable links |
| 16 | +* generate file checksums |
| 17 | +* symlink files to a stable directory on completion |
| 18 | +* make upload tarballs? |
| 19 | +* detect low disk space and either call for help or automatically clear old files |
| 20 | + |
| 21 | + |
| 22 | +To run, make a wrapper script something like this: |
| 23 | +runner = WikiBackup.Runner( |
| 24 | + public="/dumps/public", |
| 25 | + private="/dumps/private", |
| 26 | + dblist=["onesix"], |
| 27 | + privatelist=(), |
| 28 | + dbserver="localhost", |
| 29 | + dbuser="root", |
| 30 | + dbpassword="", |
| 31 | + wikidir="/opt/web/pages/head", |
| 32 | + php="/opt/php51/bin/php") |
| 33 | +runner.run() |
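| | + |
| | +The database list can also be read from a plain text file (one name per line) |
| | +with the dbList() helper, for example dblist=WikiBackup.dbList("all.dblist"); |
| | +the file name here is only an illustration. |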
| 34 | +""" |
| 35 | + |
| 36 | +import os |
| | +import time |
| 37 | + |
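| | +# lock() below raises BackupError, so give it a minimal definition here: |
| | +class BackupError(Exception): |
| | +	pass |
| | + |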
| 38 | +def dbList(filename): |
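| | +	"""Read a list of database names from a file, one per line, ignoring blank lines.""" |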
| 39 | + infile = open(filename) |
| 40 | + dbs = [] |
| 41 | + for line in infile: |
| 42 | + line = line.strip() |
| 43 | + if line != "": |
| 44 | + dbs.append(line) |
| 45 | + infile.close() |
| 46 | + return dbs |
| 47 | + |
| 48 | +def shellEscape(param): |
| 49 | + """Escape a string parameter, or set of strings, for the shell.""" |
| 50 | + if isinstance(param, basestring): |
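| | +		# Close the quote, insert an escaped literal quote, and reopen it: |
| | +		# e.g. shellEscape("it's") returns 'it'\''s' |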
| 51 | + return "'" + param.replace("'", "'\\''") + "'" |
| 52 | + elif param is None: |
| 53 | + # A blank string might actually be needed; None means we can leave it out |
| 54 | + return "" |
| 55 | + else: |
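| | +		# Escape each element; returning a tuple lets the result be used directly with "%" formatting (as in saveTable below). |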
| 56 | + return tuple([shellEscape(x) for x in param]) |
| 57 | + |
| 58 | +class Runner(object): |
| 59 | + def __init__(self, public, private, dblist, privatelist, dbserver, dbuser, dbpassword, wikidir, php="php"): |
| 60 | + self.public = public |
| 61 | + self.private = private |
| 62 | + self.dblist = dblist |
| 63 | + self.privatelist = privatelist |
| 64 | + self.dbserver = dbserver |
| 65 | + self.dbuser = dbuser |
| 66 | + self.dbpassword = dbpassword |
| 67 | + self.wikidir = wikidir |
| 68 | + self.php = php |
| 69 | + |
| 70 | + """Public methods for the manager script...""" |
| 71 | + |
| 72 | + def run(self): |
| 73 | + """Iterate through the list of wikis and dump them!""" |
| 74 | + self.debug("Starting dump...") |
| 75 | + for db in self.dblist: |
| 76 | + self.db = db |
| 77 | + self.doBackup() |
| 78 | + self.debug("Done!") |
| 79 | + |
| 80 | + """Public methods for dumps to use...""" |
| 81 | + |
| 82 | + def privatePath(self, filename=""): |
| 83 | + """Take a given filename in the private dump dir for the selected database.""" |
| 84 | + return os.path.join(self.private, self.db, filename) |
| 85 | + |
| 86 | + def publicPath(self, filename=""): |
| 87 | + """Take a given filename in the public dump dir for the selected database. |
| 88 | + If this database is marked as private, will use the private dir instead. |
| 89 | + """ |
| 90 | + if self.db in self.privatelist: |
| 91 | + return self.privatePath(filename) |
| 92 | + else: |
| 93 | + return os.path.join(self.public, self.db, filename) |
| 94 | + |
| 95 | + def passwordOption(self): |
| 96 | + """If you pass '-pfoo' mysql uses the password 'foo', but if you pass '-p' it prompts. Sigh.""" |
| 97 | + if self.dbpassword == "": |
| 98 | + return None |
| 99 | +		else: |
| | +			# Return the raw option; saveTable()/saveSql() shell-escape the whole |
| | +			# parameter list, so escaping the password here too would double-quote it. |
| 100 | +			return "-p" + self.dbpassword |
| 101 | + |
| 102 | + def saveTable(self, table, outfile): |
| 103 | + """Dump a table from the current DB with mysqldump, save to a gzipped sql file.""" |
| 104 | + command = "mysqldump -h %s -u %s %s --opt --quote-names %s %s | gzip" % shellEscape(( |
| 105 | + self.dbserver, |
| 106 | + self.dbuser, |
| 107 | + self.passwordOption(), |
| 108 | + self.db, |
| 109 | + table)) |
| 110 | + return self.saveCommand(command, outfile) |
| 111 | + |
| 112 | + def saveSql(self, query, outfile): |
| 113 | + """Pass some SQL commands to the server for this DB and save output to a file.""" |
| 114 | + command = "echo %s | mysql -h %s -u %s %s %s | gzip" % shellEscape(( |
| 115 | + query, |
| 116 | + self.dbserver, |
| 117 | + self.dbuser, |
| 118 | + self.passwordOption(), |
| 119 | + self.db)) |
| 120 | + return self.saveCommand(command, outfile) |
| 121 | + |
| 122 | + def saveCommand(self, command, outfile): |
| 123 | + """Shell out and redirect output to a given file.""" |
| 124 | + return self.runCommand(command + " > " + shellEscape(outfile)) |
| 125 | + |
| 126 | + def runCommand(self, command): |
| 127 | + """Shell out; output is assumed to be saved usefully somehow.""" |
| 128 | + self.debug("runCommand: " + command) |
| 129 | + return os.system(command) |
| 130 | + |
| 131 | + def debug(self, stuff): |
| 132 | + print stuff |
| 133 | + |
| 134 | + # auto-set |
| 135 | + #OutputDir=$PublicDir/$DirLang |
| 136 | + #StatusLog=$OutputDir/backup.log |
| 137 | + #StatusLockFile=$OutputDir/backup.lock |
| 138 | + #StatusDoneFile=$OutputDir/backup.done |
| 139 | + |
| 140 | + #GlobalLog=/var/backup/public/backup.log |
| 141 | + |
| 142 | + def makeDir(self, dir): |
| 143 | + if os.path.exists(dir): |
| 144 | + self.debug("Checkdir dir %s ..." % dir) |
| 145 | + else: |
| 146 | + self.debug("Creating %s ..." % dir) |
| 147 | + os.mkdir(dir) |
| 148 | + |
| 149 | + def doBackup(self): |
| 150 | + self.makeDir(self.public) |
| 151 | + self.makeDir(self.publicPath()) |
| 152 | + |
| 153 | + self.makeDir(self.private) |
| 154 | + self.makeDir(self.privatePath()) |
| 155 | + |
| 156 | + self.status("Starting backup of %s" % self.db) |
| 157 | + self.lock() |
| 158 | + |
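| | +		# Each task below implements run(runner); doBackup() invokes them in this order. |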
| 159 | + items = [PrivateTable("user", "User account data."), |
| 160 | + PrivateTable("user_groups", "User group assignments."), |
| 161 | + PrivateTable("watchlist", "Users' watchlist settings."), |
| 162 | + PrivateTable("ipblocks", "Data for blocks of IP addresses, ranges, and users."), |
| 163 | + PrivateTable("archive", "Deleted page and revision data."), |
| 164 | + PrivateTable("updates", "Update dataset for OAI updater system."), |
| 165 | + |
| 166 | + PublicTable("site_stats", "A few statistics such as the page count."), |
| 167 | + PublicTable("image", "Metadata on current versions of uploaded images."), |
| 168 | + PublicTable("oldimage", "Metadata on prior versions of uploaded images."), |
| 169 | + PublicTable("pagelinks", "Wiki page-to-page link records."), |
| 170 | + PublicTable("categorylinks", "Wiki category membership link records."), |
| 171 | + PublicTable("imagelinks", "Wiki image usage records."), |
| 172 | + PublicTable("templatelinks", "Wiki template inclusion link records."), |
| 173 | + PublicTable("interwiki", "Set of defined interwiki prefixes and links for this wiki."), |
| 174 | + PublicTable("logging", "Data for various events (deletions, uploads, etc)."), |
| 175 | + |
| 176 | + PublicTable("page", "Base per-page data (id, title, restrictions, etc)."), |
| 177 | + #PublicTable("revision", "Base per-revision data (does not include text)."), // safe? |
| 178 | + #PrivateTable("text", "Text blob storage. May be compressed, etc."), // ? |
| 179 | + |
| 180 | + XmlStub("First-pass for page XML data dumps"), |
| 181 | + XmlDump("full", "All pages with complete page edit history (very large!)"), |
| 182 | + XmlDump("current", "All pages, current versions only"), |
| 183 | + XmlDump("articles", "Articles, templates, image descriptions, and main meta-pages (recommended)"), |
| 184 | + |
| 185 | + TitleDump("List of page titles"), |
| 186 | + |
| 187 | + # YahooDump(), |
| 188 | +		] |
| 189 | + |
| 190 | + for item in items: |
| 191 | + item.run(self) |
| 192 | + |
| 193 | + self.checksums() |
| 194 | + self.completeDump() |
| 195 | + |
| 196 | + self.unlock() |
| 197 | + self.statusComplete() |
| 198 | + |
| 199 | + def lockFile(self): |
| 200 | + return self.publicPath("lock") |
| 201 | + |
| 202 | + def doneFile(self): |
| 203 | + return self.publicPath("done") |
| 204 | + |
| 205 | + def lock(self): |
| 206 | + self.status("Creating lock file.") |
| 207 | + lockfile = self.lockFile() |
| 208 | + donefile = self.doneFile() |
| 209 | + if os.path.exists(lockfile): |
| 210 | + raise BackupError("Lock file %s already exists" % lockfile) |
| 211 | + if os.path.exists(donefile): |
| 212 | + self.status("Removing completion marker %s" % donefile) |
| 213 | + os.remove(donefile) |
| | +		# Create the lock file with a timestamp (the old bash script did `date -u > $StatusLockFile`). |
| | +		lockf = open(lockfile, "w") |
| | +		lockf.write(self.dateStamp() + "\n") |
| | +		lockf.close() |
| 220 | + |
| 221 | +	def unlock(self): |
| 222 | +		self.status("Marking complete.") |
| | +		# Leave a completion marker and drop the lock (the old bash script did `date -u > $StatusDoneFile`). |
| | +		open(self.doneFile(), "w").write(self.dateStamp() + "\n") |
| | +		if os.path.exists(self.lockFile()): |
| | +			os.remove(self.lockFile()) |
| 224 | + |
| 225 | +	def dateStamp(self): |
| 226 | +		# Python version of `date -u --iso-8601=seconds` |
| 227 | +		return time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime()) |
| 228 | + |
| 229 | + def status(self, message): |
| 230 | + #echo $DatabaseName `dateStamp` OK: "$1" | tee -a $StatusLog | tee -a $GlobalLog |
| 231 | + self.debug(message) |
| 232 | + |
| 233 | + def statusError(self, message): |
| 234 | + # echo $DatabaseName `dateStamp` ABORT: "$1" | tee -a $StatusLog | tee -a $GlobalLog |
| 235 | + # echo "Backup of $DatabaseName failed at: $1" | \ |
| 236 | + # mail -s "Wikimedia backup error on $DatabaseName" $AbortEmail |
| 237 | + # exit -1 |
| 238 | + self.debug(message) |
| 239 | + |
| 240 | + def statusComplete(self): |
| 241 | + # echo $DatabaseName `dateStamp` SUCCESS: "done." | tee -a $StatusLog | tee -a $GlobalLog |
| 242 | + self.debug("SUCCESS: done.") |
| 243 | + |
| 244 | + def checksums(self): |
| 245 | + self.debug("If this script were finished, it would be checksumming files here") |
| 246 | + |
| 247 | + def completeDump(self): |
| 248 | + self.debug("If this script were finished, it would be adding symlinks or something") |
| 249 | + |
| 250 | +class Dump(object): |
| 251 | + def __init__(self, desc): |
| 252 | + self._desc = desc |
| 253 | + |
| 254 | + def description(self): |
| 255 | + return self._desc |
| 256 | + |
| 257 | +class PublicTable(Dump): |
| 258 | + def __init__(self, table, descr): |
| 259 | + self._table = table |
| 260 | +		self._desc = descr |
| 261 | + |
| 262 | + def _path(self, runner, filename): |
| 263 | + return runner.publicPath(filename) |
| 264 | + |
| 265 | + def run(self, runner): |
| 266 | + path = self._path(runner, self._table + ".sql.gz") |
| 267 | + return runner.saveTable(self._table, path) |
| 268 | + |
| 269 | +class PrivateTable(PublicTable): |
| 270 | + def __init__(self, table, descr): |
| 271 | + self._table = table |
| 272 | +		self._desc = descr |
| 273 | + |
| 274 | + def description(self): |
| 275 | + return self._desc + " (private)" |
| 276 | + |
| 277 | + def _path(self, runner, filename): |
| 278 | + return runner.privatePath(filename) |
| 279 | + |
| 280 | + |
| 281 | +class XmlStub(Dump): |
| 282 | + """Create lightweight skeleton dumps, minus bulk text. |
| 283 | + A second pass will import text from prior dumps or the database to make |
| 284 | + full files for the public.""" |
| 285 | + |
| 286 | + def description(self): |
| 287 | + return "creating split stub dumps..." |
| 288 | + |
| 289 | + def run(self, runner): |
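| | +		# One dumpBackup.php pass writes three stub files: full history, current |
| | +		# revisions only, and current revisions without talk or user pages. |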
| 290 | + command = """ |
| 291 | +%s -q %s/maintenance/dumpBackup.php %s \ |
| 292 | + --full \ |
| 293 | + --stub \ |
| 294 | + --output=gzip:%s \ |
| 295 | + --output=gzip:%s \ |
| 296 | + --filter=latest \ |
| 297 | + --output=gzip:%s \ |
| 298 | + --filter=latest \ |
| 299 | + --filter=notalk \ |
| 300 | + --filter=namespace:\!NS_USER \ |
| 301 | +""" % shellEscape(( |
| 302 | + runner.php, |
| 303 | + runner.wikidir, |
| 304 | + runner.db, |
| 305 | + runner.privatePath("stub-full.xml.gz"), |
| 306 | + runner.privatePath("stub-current.xml.gz"), |
| 307 | + runner.privatePath("stub-articles.xml.gz"))) |
| 308 | + runner.runCommand(command) |
| 309 | + |
| 310 | +class XmlDump(Dump): |
| 311 | + """Primary XML dumps, one section at a time.""" |
| 312 | + def __init__(self, subset, desc): |
| 313 | + self._subset = subset |
| 314 | + self._desc = desc |
| 315 | + |
| 316 | + def run(self, runner): |
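| | +		# Second pass: decompress the stub and pipe it through dumpTextPass.php, |
| | +		# which fills in revision text (prefetching from a prior dump when one |
| | +		# exists) and writes bzip2 and 7zip output. |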
| 317 | + xmlbz2 = runner.publicPath("pages_" + self._subset + ".xml.bz2") |
| 318 | + xml7z = runner.publicPath("pages_" + self._subset + ".xml.7z") |
| 319 | + |
| 320 | + # Clear prior 7zip attempts; 7zip will try to append an existing archive |
| 321 | + if os.path.exists(xml7z): |
| 322 | + os.remove(xml7z) |
| 323 | + |
| 324 | + # Page and revision data pulled from this skeleton dump... |
| 325 | +		stub = runner.privatePath("stub-%s.xml.gz" % self._subset) |
| 326 | +		stubCommand = "gzip -dc %s" % shellEscape(stub) |
| 327 | + |
| 328 | + # Try to pull text from the previous run; most stuff hasn't changed |
| 329 | + #Source=$OutputDir/pages_$section.xml.bz2 |
| 330 | + source = self._findPreviousDump(runner) |
| 331 | + if os.path.exists(source): |
| 332 | + runner.status("... building %s XML dump, with text prefetch from %s..." % (self._subset, source)) |
| 333 | + prefetch = "--prefetch=bzip2:%s" % (source) |
| 334 | + else: |
| 335 | + runner.status("... building %s XML dump, no text prefetch..." % self._subset) |
| 336 | + prefetch = "" |
| 337 | + |
| 338 | + dumpCommand = "%s -q %s/maintenance/dumpTextPass.php %s %s --output=bzip2:%s --output=7zip:%s" % shellEscape(( |
| 339 | + runner.php, |
| 340 | + runner.wikidir, |
| 341 | + runner.db, |
| 342 | + prefetch, |
| 343 | + xmlbz2, |
| 344 | + xml7z)) |
| 345 | + command = stubCommand + " | " + dumpCommand |
| 346 | + |
| 347 | + return runner.runCommand(command) |
| 348 | + |
| 349 | + def _findPreviousDump(self, runner): |
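| | +		# Placeholder: until date-based dump directories exist (see TODO), there is no previous dump to prefetch from. |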
| 350 | + return "/tmp/fake/foo" |
| 351 | + |
| 352 | +class TitleDump(Dump): |
| 353 | + """This is used by "wikiproxy", a program to add Wikipedia links to BBC news online""" |
| 354 | + def run(self, runner): |
| 355 | + return runner.saveSql("select page_title from page where page_namespace=0;", |
| 356 | + runner.publicPath("all_titles_in_ns0.gz")) |
| 357 | + |
| 358 | + |
| 359 | +class Checksums(Dump): |
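| | +	# Not yet wired into doBackup()'s item list; Runner.checksums() is still a stub. |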
| 360 | + def description(self): |
| 361 | + return "calculating MD5 hashes" |
| 362 | + |
| 363 | + def run(self, runner): |
| 364 | + # FIXME: run checksums only on the master server? |
| 365 | + command = "md5sum " + \ |
| 366 | + runner.publicPath("*.xml.*") + " " + \ |
| 367 | + runner.publicPath("*.sql.gz") + " " + \ |
| 368 | + runner.publicPath("all_titles_in_ns0.gz") |
| 369 | + return runner.saveCommand(command, runner.publicPath("md5sums.txt")) |
| 370 | + |
Property changes on: trunk/backup/WikiBackup.py |
___________________________________________________________________ |
Added: svn:keywords |
1 | 371 | + Author Date Id Revision |
Added: svn:eol-style |
2 | 372 | + native |