Index: branches/ariel/xmldumps-backup/incrementals/IncrDumpLib.py |
— | — | @@ -0,0 +1,390 @@ |
| 2 | +# shared classes for incrementals |
| 3 | +import os |
| 4 | +import sys |
| 5 | +import re |
| 6 | +import ConfigParser |
| 7 | +import WikiDump |
| 8 | +from WikiDump import FileUtils, TimeUtils, MiscUtils |
| 9 | +from os.path import exists |
| 10 | +import socket |
| 11 | +import subprocess |
| 12 | +from subprocess import Popen, PIPE |
| 13 | + |
class ContentFile(object):
    """Base description of one file kept in a wiki's dated incrementals directory.

    Subclasses override getFileName() (and, where the file lives somewhere
    other than the dated directory, getPath()).
    """

    def __init__(self, config, date, wikiName):
        self._config = config
        self.date = date
        self.wikiName = wikiName
        self.incrDir = IncrementDir(self._config, date)

    # override this.
    def getFileName(self):
        """Return the bare file name (no directory part)."""
        return "content.txt"

    def getPath(self):
        """Return the file name joined onto this wiki's dated directory."""
        directory = self.incrDir.getIncDir(self.wikiName)
        return os.path.join(directory, self.getFileName())

    def getFileInfo(self):
        """Return whatever FileUtils.fileInfo reports for getPath()."""
        path = self.getPath()
        return FileUtils.fileInfo(path)
| 30 | + |
class MaxRevIDFile(ContentFile):
    """File in which the max rev_id for a wiki and date is recorded."""

    def getFileName(self):
        fileName = "maxrevid.txt"
        return fileName
| 34 | + |
class StubFile(ContentFile):
    """Gzipped stubs file of the incremental dump for a wiki and date."""

    def getFileName(self):
        nameParts = (self.wikiName, self.date)
        return "%s-%s-stubs-meta-hist-incr.xml.gz" % nameParts
| 38 | + |
class RevsFile(ContentFile):
    """Bzip2-compressed revision text file of the incremental dump."""

    def getFileName(self):
        nameParts = (self.wikiName, self.date)
        return "%s-%s-pages-meta-hist-incr.xml.bz2" % nameParts
| 42 | + |
class StatusFile(ContentFile):
    """Status file of an incremental dump run; its path may be asked for any date."""

    def getFileName(self):
        fileName = "status.txt"
        return fileName

    def getPath(self, date = None):
        # the date (possibly None) is handed straight to IncrementDir.getIncDir
        directory = self.incrDir.getIncDir(self.wikiName, date)
        return os.path.join(directory, self.getFileName())
| 49 | + |
class LockFile(ContentFile):
    """Lock file for a wiki and date, kept in the wiki's undated directory."""

    def getFileName(self):
        nameParts = (self.wikiName, self.date)
        return "%s-%s.lock" % nameParts

    def getPath(self):
        # lock files sit beside the dated directories, not inside them
        directory = self.incrDir.getIncDirNoDate(self.wikiName)
        return os.path.join(directory, self.getFileName())
| 56 | + |
class MaxRevIDLockFile(LockFile):
    """Lock file name used while the max rev_id of a wiki is being recorded."""

    def getFileName(self):
        nameParts = (self.wikiName, self.date)
        return "%s-%s-maxrevid.lock" % nameParts
| 60 | + |
class IncrDumpLockFile(LockFile):
    """Lock file name used while the incremental dump of a wiki is produced."""

    def getFileName(self):
        nameParts = (self.wikiName, self.date)
        return "%s-%s-incrdump.lock" % nameParts
| 64 | + |
class MD5File(ContentFile):
    """File holding the md5 sums of the dump files for a wiki and date."""

    def getFileName(self):
        nameParts = (self.wikiName, self.date)
        return "%s-%s-md5sums.txt" % nameParts
| 68 | + |
class IndexFile(ContentFile):
    """The index.html file at the top of the incrementals tree.

    Unlike the other content files it is tied to neither a wiki nor a date,
    so the constructor takes only the configuration.
    """

    def __init__(self, config):
        self._config = config
        self.incrDir = IncrementDir(self._config)

    def getFileName(self):
        fileName = "index.html"
        return fileName

    def getPath(self):
        baseDir = self.incrDir.getIncDirBase()
        return os.path.join(baseDir, self.getFileName())
| 79 | + |
class StatusInfo(object):
    """Reads and writes the status file of one wiki's incremental dump."""

    def __init__(self, config, date, wikiName):
        self._config = config
        self.date = date
        self.wikiName = wikiName
        self.statusFile = StatusFile(self._config, self.date, self.wikiName)

    def getStatus(self, date = None):
        """Return True if the status file for the date exists and contains
        "done" (trailing whitespace ignored), False otherwise.

        Note that the result is a boolean, not the status text itself.
        """
        if not exists(self.statusFile.getPath(date)):
            return False
        contents = FileUtils.readFile(self.statusFile.getPath(date))
        return contents.rstrip() == "done"

    def setStatus(self, status):
        """Write the status text to the status file using the configured permissions."""
        path = self.statusFile.getPath()
        FileUtils.writeFileInPlace(path, status, self._config.fileperms)
| 96 | + |
class Lock(object):
    """File-based lock for one wiki and date.

    The lock is a file created atomically; it records the host name and the
    process id of the holder.
    """

    def __init__(self, config, date, wikiName):
        self._config = config
        self.date = date
        self.wikiName = wikiName
        self.lockFile = LockFile(self._config, self.date, self.wikiName)

    def isLocked(self):
        """Return True if the lock file currently exists."""
        return exists(self.lockFile.getPath())

    def getLock(self):
        """Try to create the lock file; return True on success, False otherwise."""
        try:
            if not exists(self._config.incrementalsDir):
                os.makedirs(self._config.incrementalsDir)
            f = FileUtils.atomicCreate(self.lockFile.getPath(), "w")
            try:
                f.write("%s %d" % (socket.getfqdn(), os.getpid()))
            finally:
                # close the handle even when the write fails
                f.close()
            return True
        except Exception:
            # any failure (typically: the lock file already exists) means
            # the lock was not obtained; Ctrl-C is no longer swallowed here
            return False

    def unlock(self):
        """Remove the lock file; raises OSError if it cannot be removed."""
        os.remove(self.lockFile.getPath())

    def getLockInfo(self):
        """Return the lock file's modification time formatted as
        "YYYY-MM-DD HH:MM:SS", or None if the lock file cannot be examined.
        """
        # imported here because the module does not import time at top level
        import time
        try:
            timestamp = os.stat(self.lockFile.getPath()).st_mtime
        except OSError:
            return None
        # strftime needs a time tuple, not the float that stat() returns
        # NOTE(review): formatted in UTC; switch to time.localtime if local
        # time is wanted
        return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(timestamp))
| 127 | + |
class IncrDumpLock(Lock):
    """Lock taken while the incremental dump of a wiki is produced."""

    def __init__(self, config, date, wikiName):
        # same attributes as Lock, but backed by the incr dump lock file
        self._config = config
        self.wikiName = wikiName
        self.date = date
        self.lockFile = IncrDumpLockFile(config, date, wikiName)
| 134 | + |
class MaxRevIDLock(Lock):
    """Lock taken while the max rev_id of a wiki is recorded."""

    def __init__(self,config, date, wikiName):
        # same attributes as Lock, but backed by the max rev id lock file
        self._config = config
        self.wikiName = wikiName
        self.date = date
        self.lockFile = MaxRevIDLockFile(config, date, wikiName)
| 141 | + |
class Config(object):
    """Configuration for the incremental dumps, read from ini-style files.

    Files are read in this order: <configFile> in the directory of the
    running script, /etc/dumpincrementals.conf, and .dumpincr.conf in the
    directory named by the HOME environment variable (when it is set).
    """

    def __init__(self, configFile=False):
        """Read the configuration.

        configFile -- name of the config file looked up in the script's
                      directory; "dumpincr.conf" when false.
        Raises ConfigParser.NoSectionError if there is no "wiki" section and
        ConfigParser.NoOptionError if "mediawiki" is not available in it.
        """
        self.projectName = False

        home = os.path.dirname(sys.argv[0])
        if (not configFile):
            configFile = "dumpincr.conf"
        self.files = [
            os.path.join(home,configFile),
            "/etc/dumpincrementals.conf"]
        # HOME may be unset (e.g. under cron or some init systems);
        # os.path.join would fail on None, so only add the per-user file
        # when the variable is there
        userHome = os.getenv("HOME")
        if userHome is not None:
            self.files.append(os.path.join(userHome, ".dumpincr.conf"))
        defaults = {
            #"wiki": {
            "allwikislist": "",
            "privatewikislist": "",
            "closedwikislist": "",
            #"output": {
            "incrementalsdir": "/dumps/public/incr",
            "templatedir": home,
            "temp":"/dumps/temp",
            "webroot": "http://localhost/dumps/incr",
            "fileperms": "0640",
            "delay": "43200",
            #"database": {
            "user": "root",
            "password": "",
            #"tools": {
            "mediawiki" : "",
            "php": "/bin/php",
            "gzip": "/usr/bin/gzip",
            "bzip2": "/usr/bin/bzip2",
            "mysql": "/usr/bin/mysql",
            "checkforbz2footer": "/usr/local/bin/checkforbz2footer",
            "writeuptopageid": "/usr/local/bin/writeuptopageid",
            "multiversion": "",
            #"cleanup": {
            "keep": "3",
            }

        self.conf = ConfigParser.SafeConfigParser(defaults)
        self.conf.read(self.files)

        if not self.conf.has_section("wiki"):
            print("The mandatory configuration section 'wiki' was not defined.")
            raise ConfigParser.NoSectionError('wiki')

        if not self.conf.has_option("wiki","mediawiki"):
            print("The mandatory setting 'mediawiki' in the section 'wiki' was not defined.")
            raise ConfigParser.NoOptionError('wiki','mediawiki')

        self.parseConfFile()

    def parseConfFile(self):
        """Copy the parsed settings into attributes, adding any missing
        optional section so that the defaults apply to it."""
        self.mediawiki = self.conf.get("wiki", "mediawiki")
        self.allWikisList = MiscUtils.dbList(self.conf.get("wiki", "allwikislist"))
        self.privateWikisList = MiscUtils.dbList(self.conf.get("wiki", "privatewikislist"))
        self.closedWikisList = MiscUtils.dbList(self.conf.get("wiki", "closedwikislist"))

        if not self.conf.has_section('output'):
            self.conf.add_section('output')
        self.incrementalsDir = self.conf.get("output", "incrementalsdir")
        self.tempDir = self.conf.get("output", "temp")
        self.templateDir = self.conf.get("output", "templatedir")
        self.webRoot = self.conf.get("output", "webroot")
        # base 0: the literal's own prefix decides the radix
        self.fileperms = int(self.conf.get("output", "fileperms"), 0)
        self.delay = int(self.conf.get("output", "delay"), 0)

        if not self.conf.has_section('tools'):
            self.conf.add_section('tools')
        self.php = self.conf.get("tools", "php")
        self.gzip = self.conf.get("tools", "gzip")
        self.bzip2 = self.conf.get("tools", "bzip2")
        self.mysql = self.conf.get("tools", "mysql")
        self.checkforbz2footer = self.conf.get("tools","checkforbz2footer")
        self.writeuptopageid = self.conf.get("tools","writeuptopageid")
        self.multiversion = self.conf.get("tools","multiversion")

        if not self.conf.has_section('cleanup'):
            self.conf.add_section('cleanup')
        self.keep = self.conf.getint("cleanup", "keep")

        if not self.conf.has_section('database'):
            self.conf.add_section('database')
        self.dbUser = self.conf.get("database", "user")
        self.dbPassword = self.conf.get("database", "password")

    def readTemplate(self, name):
        """Return the contents of the named file in the template directory."""
        template = os.path.join(self.templateDir, name)
        return FileUtils.readFile(template)
| 233 | + |
class RunSimpleCommand(object):
    """Helpers that run an external command, retrying it on failure."""

    @staticmethod
    def _commandAsString(command):
        """Return the command as one string for use in error messages."""
        if isinstance(command, (list, tuple)):
            return " ".join(command)
        return command

    @staticmethod
    def _run(command, maxtries, shell, captureOutput):
        """Run the command up to maxtries times; return its stdout (None when
        not captured) on the first zero exit code, else raise
        IncrementDumpsError."""
        proc = None
        error = ""
        tries = 0
        while tries < maxtries:
            tries = tries + 1
            if captureOutput:
                proc = Popen(command, shell = shell, stdout = PIPE, stderr = PIPE)
            else:
                proc = Popen(command, shell = shell, stderr = PIPE)
            output, error = proc.communicate()
            if not proc.returncode:
                return output
        commandString = RunSimpleCommand._commandAsString(command)
        if proc is None:
            # maxtries < 1: nothing was ever run
            raise IncrementDumpsError("command '" + commandString + "' failed: it was not run (maxtries = %s)" % maxtries)
        if not isinstance(error, str):
            # stderr arrives as bytes on Python 3
            error = error.decode("utf-8", "replace")
        raise IncrementDumpsError("command '" + commandString + ( "' failed with return code %s " % proc.returncode ) + " and error '" + error + "'")

    @staticmethod
    def runWithOutput(command, maxtries = 3, shell=False):
        """Run a command and return the output as a string.
        Raises IncrementDumpsError on non-zero return code."""
        return RunSimpleCommand._run(command, maxtries, shell, True)

    @staticmethod
    def runWithNoOutput(command, maxtries = 3, shell=False):
        """Run a command, expecting no output (stdout is not captured).
        Raises IncrementDumpsError on non-zero return code; returns None."""
        RunSimpleCommand._run(command, maxtries, shell, False)
| 278 | + |
class MultiVersion(object):
    """Builds the invocation of a MediaWiki maintenance script, going through
    the configured multiversion wrapper when that wrapper exists on disk."""

    @staticmethod
    def MWScriptAsString(config, maintenanceScript):
        """Return the invocation as one space-separated string."""
        parts = MultiVersion.MWScriptAsArray(config, maintenanceScript)
        return " ".join(parts)

    @staticmethod
    def MWScriptAsArray(config, maintenanceScript):
        """Return the invocation as a list of command words."""
        wrapper = config.multiversion
        useWrapper = wrapper != "" and exists(wrapper)
        if not useWrapper:
            # fall back to the script inside the MediaWiki installation
            return [ "%s/maintenance/%s" % (config.mediawiki, maintenanceScript) ]
        return [ wrapper, maintenanceScript ]
| 291 | + |
class DBServer(object):
    """Finds the database server for a wiki and builds mysql commands for it."""

    def __init__(self, config, wikiName):
        self.config = config
        self.wikiName = wikiName
        self.dbServer = self.defaultServer()

    def defaultServer(self):
        """Return the server name printed by getSlaveServer.php for this wiki.

        Raises IncrementDumpsError if the php command is missing or fails.
        """
        if (not exists( self.config.php ) ):
            # BackupError is not defined in this module; use the module's
            # own exception class instead of dying with a NameError
            raise IncrementDumpsError("php command %s not found" % self.config.php)
        commandList = MultiVersion.MWScriptAsArray(self.config, "getSlaveServer.php")
        command = [ self.config.php, "-q" ]
        command.extend(commandList)
        command.extend( [ "--wiki=%s" % self.wikiName, "--group=dump" ])
        return RunSimpleCommand.runWithOutput(command, shell=False).rstrip()

    def buildSqlCommand(self, query):
        """Put together a command to execute an sql query to the server for this DB.

        The result is a shell command line (a string).  The query, user and
        password are interpolated unquoted, so they must come from trusted
        sources and the query must not contain single quotes.
        Raises IncrementDumpsError if the mysql command is missing.
        """
        if (not exists( self.config.mysql ) ):
            raise IncrementDumpsError("mysql command %s not found" % self.config.mysql)
        command = "/bin/echo '%s' | %s -h %s -u %s " % ( query, self.config.mysql, self.dbServer, self.config.dbUser )
        if self.config.dbPassword != "":
            command = command + "-p" + self.config.dbPassword
        command = command + " -r --silent " + self.wikiName
        return command
| 316 | + |
class IncrementDumpsError(Exception):
    """Error raised by the incremental dumps code."""
| 319 | + |
class IncrementDir(object):
    """Computes directory paths inside the incrementals tree."""

    def __init__(self, config, date = None):
        self._config = config
        self.date = date

    def getIncDirBase(self):
        """Return the configured top-level incrementals directory."""
        return self._config.incrementalsDir

    def getIncDirNoDate(self, wikiName):
        """Return the per-wiki directory, without any date component."""
        base = self.getIncDirBase()
        return os.path.join(base, wikiName)

    def getIncDir(self, wikiName, date = None):
        """Return the per-wiki directory for the given date, or for the date
        this object was created with when none is given."""
        if date is not None:
            return os.path.join(self.getIncDirBase(), wikiName, date)
        return os.path.join(self.getIncDirBase(), wikiName, self.date)
| 336 | + |
class IncrementDumpsError(Exception):
    """Error raised by the incremental dumps code.

    NOTE(review): a class of the same name is also defined earlier in this
    module; this later definition is the one the name ends up bound to.
    """
| 339 | + |
class IncDumpDirs(object):
    """Lists and cleans up the dated incremental dump directories of a wiki."""

    def __init__(self, config, wikiName):
        self._config = config
        self.wikiName = wikiName
        self.incrDir = IncrementDir(self._config)

    def getIncDumpDirs(self):
        """Return the sorted names of the wiki's subdirectories that look
        like dates (eight digits); an empty list if the wiki's directory
        cannot be listed."""
        base = self.incrDir.getIncDirNoDate(self.wikiName)
        digits = re.compile(r"^\d{4}\d{2}\d{2}$")
        try:
            entries = os.listdir(base)
        except OSError:
            return []
        return sorted([entry for entry in entries if digits.match(entry)])

    def cleanupOldIncrDumps(self, date):
        """Remove old dump directories of this wiki.

        The directory for the given date is never removed when it is the
        newest one; of the remaining ones the newest config.keep are kept.
        """
        # imported here because the module does not import shutil at top level
        import shutil
        old = self.getIncDumpDirs()
        if old and old[-1] == date:
            old = old[:-1]
        if old and self._config.keep > 0:
            old = old[:-(self._config.keep)]
        for dump in old:
            toRemove = os.path.join(self.incrDir.getIncDirNoDate(self.wikiName), dump)
            shutil.rmtree("%s" % toRemove)

    def getPrevIncrDate(self, date):
        """Return the date of the most recent incr dump before the specified
        date that completed successfully, or None if there is none."""
        previous = None
        for dump in self.getIncDumpDirs():
            # names are fixed-width digit strings, so string order is date
            # order; stop at the given date even if it has no directory
            if dump >= date:
                return previous
            statusInfo = StatusInfo(self._config, dump, self.wikiName)
            # getStatus() reports completion as a true value, so test its
            # truth rather than comparing against the text "done"
            if statusInfo.getStatus(dump):
                previous = dump
        return previous

    def getLatestIncrDate(self):
        """Return the date of the most recent incr dump directory, or None."""
        dirs = self.getIncDumpDirs()
        if dirs:
            return dirs[-1]
        return None
Property changes on: branches/ariel/xmldumps-backup/incrementals/IncrDumpLib.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 392 | + native |
Index: branches/ariel/xmldumps-backup/incrementals/README.config |
— | — | @@ -0,0 +1,38 @@ |
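| 2 | +By default, configuration options are read from the file "dumpincr.conf" in the directory containing the dump scripts, then from /etc/dumpincrementals.conf and from .dumpincr.conf in the user's home directory, if these exist.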
| 3 | +A different filename may be specified at run time. |
| 4 | + |
| 5 | +The following configuration options are accepted: |
| 6 | + |
| 7 | +In the "wiki" section, |
| 8 | +mediawiki -- full path to the directory of the MediaWiki installation |
| 9 | +allwikislist -- full path to a list of all projects to be dumped, as they appear in MySql |
| 10 | +privatewikislist -- full path to a list of all projects that are private and hence should not be dumped, if any |
| 11 | +closedwikislist -- full path to a list of all projects that are closed and hence should not be dumped, if any |
| 12 | + |
| 13 | +In the "output" section, |
| 14 | +incrementalsdir -- full path to the top level directory where adds/changes dumps will be written; this should |
| 15 | + be web-accessible |
| 16 | +templatedir -- full path to the directory containing template html files such as incrs-index.html (typically |
| 17 | + the same directory as that which contains the dump scripts) |
| 18 | +temp -- full path to a directory which is used for the generation of temporary files; this should
| 19 | + not be web-accessible |
| 20 | +webroot -- url to top level directory with the main index page, for example http://localhost/mydumps |
| 21 | +fileperms -- read and write permissions that will be assigned to created files; this is in octal four-digit |
| 22 | + format, for example 0644 |
| 23 | +delay -- number of seconds to wait after a max rev_id has been recorded, before dumping revisions |
| 24 | + |
| 25 | +In the "database" section, |
| 26 | +user -- the name of a database user with read access to all tables in the databases |
| 27 | + which will be dumped |
| 28 | +password -- the password for the above user |
| 29 | + |
| 30 | +In the "tools" section, |
| 31 | +php -- the full path to the php command |
| 32 | +mysql -- the full path to the mysql command |
| 33 | +gzip -- the full path to the gzip command |
| 34 | +bzip2 -- the full path to the bzip2 command |
| 35 | +checkforbz2footer -- the full path to the checkforbz2footer command |
| 36 | +writeuptopageid -- the full path to the writeuptopageid command |
| 37 | + |
| 38 | +In the "cleanup" section, |
| 39 | +keep -- the number of old dumps to keep, per project. |
Index: branches/ariel/xmldumps-backup/incrementals/incrmonitor |
— | — | @@ -0,0 +1,26 @@ |
#!/bin/bash

# Run incrmonitor.py over and over, pausing 15 seconds between sweeps.
# Usage: incrmonitor [configfile] [verbose]
#   configfile -- passed to incrmonitor.py as its first argument
#                 (an empty string when not given)
#   verbose    -- the literal word "verbose" makes --verbose the second
#                 argument; any other value is an error

WIKIDUMP_BASE=$(dirname "$0")

if [ ! -z "$1" ]; then
    configFile="$1"
else
    configFile=""
fi

# start from a known value so that a "verbose" variable inherited from the
# environment cannot leak into the python command line
verbose=""
if [ ! -z "$2" ]; then
    if [ "$2" == "verbose" ]; then
        verbose="--verbose"
    else
        echo "Unknown option $2"
        exit 1
    fi
fi

while true; do
    echo ""
    echo "Sweeping!"
    # quoted so that an install path containing spaces still works
    python "$WIKIDUMP_BASE/incrmonitor.py" "$configFile" "$verbose"
    echo "sleeping"
    sleep 15
done
Property changes on: branches/ariel/xmldumps-backup/incrementals/incrmonitor |
___________________________________________________________________ |
Added: svn:executable |
1 | 28 | + * |
Index: branches/ariel/xmldumps-backup/incrementals/generatemaxrevids.py |
— | — | @@ -0,0 +1,138 @@ |
| 2 | +# for every wiki, find and record the max rev_id in use. |
| 3 | +# this is phase 1 of daily xml change/adds dumps. |
| 4 | + |
| 5 | +import ConfigParser |
| 6 | +import getopt |
| 7 | +import os |
| 8 | +import re |
| 9 | +import sys |
| 10 | +import WikiDump |
| 11 | +from WikiDump import FileUtils, TimeUtils, MiscUtils |
| 12 | +import subprocess |
| 13 | +import socket |
| 14 | +import time |
| 15 | +import IncrDumpLib |
| 16 | +from IncrDumpLib import Lock, Config, RunSimpleCommand, MultiVersion, DBServer, IncrementDir, IncrementDumpsError, MaxRevIDFile, MaxRevIDLockFile, IncrDumpLock, MaxRevIDLock |
| 17 | +from subprocess import Popen, PIPE |
| 18 | +from os.path import exists |
| 19 | +import traceback |
| 20 | + |
class MaxRevID(object):
    """Looks up the max rev_id of a wiki and records it in the max rev id
    file for the given date."""

    def __init__(self, config, wikiName, date):
        self._config = config
        self.wikiName = wikiName
        self.date = date
        self.maxID = 0
        self.maxRevIdFile = MaxRevIDFile(self._config, self.date, self.wikiName)

    def getMaxRevID(self):
        """Query the database and keep the command's output in self.maxID."""
        db = DBServer(self._config, self.wikiName)
        sqlCommand = db.buildSqlCommand("select MAX(rev_id) from revision")
        # get the result
        self.maxID = RunSimpleCommand.runWithOutput(sqlCommand, shell = True)

    def recordMaxRevID(self):
        """Fetch the max rev_id and write it to the max rev id file."""
        self.getMaxRevID()
        # write the max id in a file in the right place
        path = self.maxRevIdFile.getPath()
        FileUtils.writeFileInPlace(path, self.maxID, self._config.fileperms)

    def exists(self):
        """Return True if the max rev id file is already there."""
        path = self.maxRevIdFile.getPath()
        return exists(path)
| 42 | + |
class MaxIDDump(object):
    """Phase 1 of the adds/changes dumps: record the max rev_id of wikis."""

    def __init__(self,config, date, verbose):
        self._config = config
        self.date = date
        self.incrDir = IncrementDir(self._config, self.date)
        self.verbose = verbose

    def doOneWiki(self, w):
        """Record the max rev_id for wiki w unless it is already recorded.

        Private and closed wikis are skipped.  Returns False only when
        recording the max rev_id raised an error; a wiki that is skipped or
        whose lock could not be obtained still counts as True.
        """
        success = True
        gotLock = True
        if w not in self._config.privateWikisList and w not in self._config.closedWikisList:
            incDir = self.incrDir.getIncDir(w)
            if not exists(incDir):
                os.makedirs(incDir)
            lock = MaxRevIDLock(self._config, self.date, w)
            gotLock = lock.getLock()
            if gotLock:
                try:
                    maxRevID = MaxRevID(self._config, w, self.date)
                    if not maxRevID.exists():
                        maxRevID.recordMaxRevID()
                except Exception:
                    if (self.verbose):
                        print("Wiki  %s failed to get max revid." % w)
                        traceback.print_exc(file=sys.stdout)
                    success = False
                finally:
                    lock.unlock()
            else:
                # no exception is active here, so there is no traceback to show
                if (self.verbose):
                    print("Wiki  %s failed to get lock." % w)
        # do not claim success for a wiki some other holder of the lock is
        # presumably working on
        if success and gotLock:
            if (self.verbose):
                print("Success! Wiki %s adds/changes dump complete." % w)
        return success

    def doRunOnAllWikis(self):
        """Run doOneWiki on every configured wiki; return the number of failures."""
        failures = 0
        for w in self._config.allWikisList:
            if not self.doOneWiki(w):
                failures = failures + 1
        return failures

    def doAllWikisTilDone(self,numFails):
        """Repeat doRunOnAllWikis, waiting 5 minutes between passes, until a
        pass has no failures.

        Raises IncrementDumpsError once more than numFails passes had failures.
        """
        fails = 0
        while 1:
            result = self.doRunOnAllWikis()
            if not result:
                break
            fails = fails + 1
            if fails > numFails:
                # raising a bare string is not valid; raise a real exception
                raise IncrementDumpsError("Too many consecutive failures, giving up")
            # wait 5 minutes and try another loop
            time.sleep(300)
| 94 | + |
def usage(message = None):
    """Print the optional message and the usage text, then exit with status 1."""
    if message:
        print(message)
    # this is generatemaxrevids.py, not generateincrementals.py
    print("Usage: python generatemaxrevids.py [options] [wikidbname]")
    print("Options: --configfile, --date, --verbose")
    print("--configfile: Specify an alternate config file to read. Default file is 'dumpincr.conf' in the current directory.")
    print("--date: (Re)run incremental of a given date (use with care).")
    print("--verbose: Print error messages and other informative messages (normally the")
    print(" script runs silently).")
    print("wikidbname: Run the dumps only for the specific wiki.")
    sys.exit(1)
| 106 | + |
if __name__ == "__main__":
    # command line defaults
    configFile = False
    date = None
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['date=', 'configfile=', 'verbose' ])
    except getopt.GetoptError:
        # only option parsing errors lead to the usage message
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--date":
            date = val
        elif opt == "--configfile":
            configFile = val
        elif opt == "--verbose":
            verbose = True

    # Config treats a false configFile as "use the default file name"
    config = Config(configFile)

    if not date:
        date = TimeUtils.today()

    dump = MaxIDDump(config, date, verbose)
    if len(remainder) > 0:
        # a wiki name was given: process just that one, once
        dump.doOneWiki(remainder[0])
    else:
        dump.doAllWikisTilDone(3)
Property changes on: branches/ariel/xmldumps-backup/incrementals/generatemaxrevids.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 140 | + native |
Index: branches/ariel/xmldumps-backup/incrementals/dumpincr.conf.sample |
— | — | @@ -0,0 +1,32 @@ |
| 2 | +# sample configuration file |
| 3 | + |
| 4 | +[wiki] |
| 5 | +mediawiki=/src/mediawiki/118wmf1/1.18wmf1 |
| 6 | +allwikislist=/home/backups/incrementals/all.dblist |
| 7 | +privatewikislist=/home/backups/incrementals/private.dblist |
| 8 | +closedwikislist=/home/backups/incrementals/closed.dblist |
| 9 | + |
| 10 | +[output] |
| 11 | +incrementalsdir=/dumps/public/incr |
| 12 | +templatedir=/home/backups/incrementals |
| 13 | +temp=/dumps/temp |
| 14 | +webroot=http://localhost/mydumps |
| 15 | +fileperms=0644 |
| 16 | +# minimum number of seconds from revision creation |
| 17 | +# til it can be dumped |
| 18 | +delay=43200 |
| 19 | + |
| 20 | +[database] |
| 21 | +user=dbuser |
| 22 | +password=leet |
| 23 | + |
| 24 | +[tools] |
| 25 | +php=/usr/bin/php |
| 26 | +mysql=/usr/bin/mysql |
| 27 | +gzip=/usr/bin/gzip |
| 28 | +bzip2=/usr/bin/bzip2 |
| 29 | +checkforbz2footer=/usr/local/bin/checkforbz2footer |
| 30 | +writeuptopageid=/usr/local/bin/writeuptopageid |
| 31 | + |
| 32 | +[cleanup] |
| 33 | +keep=20 |
Index: branches/ariel/xmldumps-backup/incrementals/generateincrementals.py |
— | — | @@ -0,0 +1,266 @@ |
| 2 | +# for every wiki, read the maxid and the prev maxid |
| 3 | +# recorded for incrementals, dump stubs and dump history file |
| 4 | +# based on stubs. |
| 5 | +# this is phase 2 of daily xml change/adds dumps. |
| 6 | + |
| 7 | +import ConfigParser |
| 8 | +import getopt |
| 9 | +import os |
| 10 | +import re |
| 11 | +import sys |
| 12 | +import WikiDump |
| 13 | +from WikiDump import FileUtils, TimeUtils, MiscUtils |
| 14 | +import subprocess |
| 15 | +import socket |
| 16 | +import time |
| 17 | +import IncrDumpLib |
| 18 | +from IncrDumpLib import Lock, Config, RunSimpleCommand, MultiVersion, DBServer, IncrementDir, IncrementDumpsError, MaxRevIDFile, StatusFile, IncrDumpLockFile, StubFile, RevsFile, MD5File, IncDumpDirs, IncrDumpLock, MaxRevIDLock, StatusInfo |
| 19 | +from subprocess import Popen, PIPE |
| 20 | +from os.path import exists |
| 21 | +import hashlib |
| 22 | +import traceback |
| 23 | + |
class DumpResults(object):
    """Result codes for the dump of one wiki.

    OK (0), TODO (1) and FAILED (-1) are set on each instance.
    """

    def __init__(self):
        self.OK = 0
        self.TODO = 1
        self.FAILED = -1
| 29 | + |
| 30 | +class IncrDump(object): |
| 31 | + def __init__(self,config, date, wikiName, doStubs, doRevs, dryrun, verbose): |
| 32 | + self._config = config |
| 33 | + self.date = date |
| 34 | + self.wikiName = wikiName |
| 35 | + self.incrDir = IncrementDir(self._config, self.date) |
| 36 | + self.doStubs = doStubs |
| 37 | + self.doRevs = doRevs |
| 38 | + self.dryrun = dryrun |
| 39 | + self.maxRevIDFile = MaxRevIDFile(self._config, self.date, self.wikiName) |
| 40 | + self.statusInfo = StatusInfo(self._config, self.date, self.wikiName) |
| 41 | + self.stubFile = StubFile(self._config, self.date, self.wikiName) |
| 42 | + self.revsFile = RevsFile(self._config, self.date, self.wikiName) |
| 43 | + self.incrDumpsDirs = IncDumpDirs(self._config, self.wikiName) |
| 44 | + self.verbose = verbose |
| 45 | + |
| 46 | + def getMaxRevIdFromFile(self, date = None): |
| 47 | + if date == None: |
| 48 | + date = self.date |
| 49 | + maxRevIDFile = MaxRevIDFile(self._config, date, self.wikiName) |
| 50 | + return FileUtils.readFile(maxRevIDFile.getPath().rstrip()) |
| 51 | + |
| 52 | + def doOneWiki(self): |
| 53 | + retCodes = DumpResults() |
| 54 | + if self.wikiName not in self._config.privateWikisList and self.wikiName not in self._config.closedWikisList: |
| 55 | + if not exists(self.incrDir.getIncDir(self.wikiName)): |
| 56 | + os.makedirs(self.incrDir.getIncDir(self.wikiName)) |
| 57 | + status = self.statusInfo.getStatus() |
| 58 | + if status == "done": |
| 59 | + if (self.verbose): |
| 60 | + print "wiki",self.wikiName,"skipped, adds/changes dump already complete" |
| 61 | + return retCodes.OK |
| 62 | + if time.time() - os.path.getmtime(self.maxRevIDFile.getPath()) < self._config.delay: |
| 63 | + if (self.verbose): |
| 64 | + print "wiki",self.wikiName,"skipped, must wait for configured delay interval" |
| 65 | + return retCodes.TODO |
| 66 | + if not dryrun: |
| 67 | + lock = IncrDumpLock(self._config, self.date, self.wikiName) |
| 68 | + if not lock.getLock(): |
| 69 | + if (self.verbose): |
| 70 | + print "wiki",self.wikiName,"skipped, wiki is locked, another process should be doing the job" |
| 71 | + return retCodes.TODO |
| 72 | + try: |
| 73 | + if not dryrun: |
| 74 | + self.incrDumpsDirs.cleanupOldIncrDumps(self.date) |
| 75 | + maxRevID = self.getMaxRevIdFromFile() |
| 76 | + prevDate = self.incrDumpsDirs.getPrevIncrDate(self.date) |
| 77 | + prevRevID = None |
| 78 | + if prevDate: |
| 79 | + prevRevID = self.getMaxRevIdFromFile(prevDate) |
| 80 | + if not prevRevID: |
| 81 | + prevRevID = str(int(maxRevID) - 10) |
| 82 | + if int(prevRevID) < 1: |
| 83 | + prevRevID = str(1) |
| 84 | + else: |
| 85 | + # this incr will cover every revision from the last incremental |
| 86 | + # through the maxid we wrote out in phase one of this job. |
| 87 | + prevRevID = str(int(prevRevID) + 1) |
| 88 | + if doStubs: |
| 89 | + maxRevID = str(int(maxRevID) + 1) # end rev id is not included in dump |
| 90 | + if not self.dumpStub(prevRevID, maxRevID): |
| 91 | + return retCodes.FAILED |
| 92 | + if doRevs: |
| 93 | + if not self.dumpRevs(): |
| 94 | + return retCodes.FAILED |
| 95 | + if not dryrun: |
| 96 | + if not self.md5sums(): |
| 97 | + return retCodes.FAILED |
| 98 | + self.statusInfo.setStatus("done") |
| 99 | + lock.unlock() |
| 100 | + except: |
| 101 | + if (self.verbose): |
| 102 | + traceback.print_exc(file=sys.stdout) |
| 103 | + if not dryrun: |
| 104 | + lock.unlock() |
| 105 | + return retCodes.FAILED |
| 106 | + if (self.verbose): |
| 107 | + print "Success! Wiki", self.wikiName, "incremental dump complete." |
| 108 | + return retCodes.OK |
| 109 | + |
| 110 | + def dumpStub(self, startRevID, endRevID): |
| 111 | + scriptCommand = MultiVersion.MWScriptAsArray(self._config, "dumpBackup.php") |
| 112 | + command = [ "%s" % self._config.php, "-q" ] |
| 113 | + command.extend(scriptCommand) |
| 114 | + command.extend(["--wiki=%s" % self.wikiName, "--stub", "--quiet", |
| 115 | + "--force-normal", "--output=gzip:%s" % self.stubFile.getPath(), |
| 116 | + "--revrange", "--revstart=%s" % startRevID, "--revend=%s" % endRevID ]) |
| 117 | + if dryrun: |
| 118 | + print "would run command for stubs dump:", command |
| 119 | + else: |
| 120 | + error = RunSimpleCommand.runWithNoOutput(command, shell = False) |
| 121 | + if (error): |
| 122 | + if (self.verbose): |
| 123 | + print ("error producing stub files for wiki" % self.wikiName) |
| 124 | + return False |
| 125 | + return True |
| 126 | + |
def dumpRevs(self):
    """Produce the bzip2-compressed revision text file from the stubs file.

    Builds a dumpTextPass.php command line for this wiki that reads the
    gzipped stubs file at self.stubFile's path and writes bzip2 output to
    the path given by self.revsFile.  On a dry run the command is only
    printed.

    Returns True on success (or dry run), False if the command reported
    an error.
    """
    command = [ "%s" % self._config.php, "-q" ]
    command.extend(MultiVersion.MWScriptAsArray(self._config, "dumpTextPass.php"))
    command.extend(["--wiki=%s" % self.wikiName, "--stub=gzip:%s" % self.stubFile.getPath(),
                    "--force-normal", "--quiet", "--spawn=%s" % self._config.php,
                    "--output=bzip2:%s" % self.revsFile.getPath()
                    ])
    # NOTE(review): 'dryrun' is looked up as a module-level name (it is set
    # in the __main__ block), not on the instance -- confirm whether
    # self.dryrun was intended.
    if dryrun:
        print("would run command for revs dump: %s" % (command,))
        return True

    error = RunSimpleCommand.runWithNoOutput(command, shell = False)
    if error:
        if self.verbose:
            # the original format string had no %s placeholder, so this
            # line raised TypeError instead of reporting the failure
            print("error producing revision text files for wiki %s" % self.wikiName)
        return False
    return True
| 144 | + |
def md5sumOneFile(self, filename):
    """Return the hexadecimal MD5 digest of the contents of filename.

    The file is read in binary mode in fixed-size chunks so that large
    dump files are never held in memory all at once.  Errors opening or
    reading the file propagate to the caller.
    """
    summer = hashlib.md5()
    bufsize = 4192 * 32
    infile = open(filename, "rb")
    try:
        chunk = infile.read(bufsize)
        while chunk:
            summer.update(chunk)
            chunk = infile.read(bufsize)
    finally:
        # close the handle even if a read fails part way through
        infile.close()
    return summer.hexdigest()
| 155 | + |
def md5sums(self):
    """Write the MD5 digests of the files produced by this run to the md5 file.

    One line (the hex digest followed by a newline) is written for the
    stubs file if self.doStubs is set and for the revision text file if
    self.doRevs is set, in that order.

    Returns True if the md5 file was written, False if anything went
    wrong; when self.verbose is set the traceback is printed to stdout
    so the failure is not silent.
    """
    try:
        md5File = MD5File(self._config, self.date, self.wikiName)
        files = []
        if self.doStubs:
            files.append(self.stubFile.getPath())
        if self.doRevs:
            files.append(self.revsFile.getPath())
        text = "".join([ "%s\n" % self.md5sumOneFile(f) for f in files ])
        FileUtils.writeFileInPlace(md5File.getPath(), text, self._config.fileperms)
        return True
    except Exception:
        # best effort: report failure to the caller rather than raising,
        # but let KeyboardInterrupt/SystemExit through
        if self.verbose:
            traceback.print_exc(file=sys.stdout)
        return False
| 172 | + |
class IncrDumpLoop(object):
    """Run the incremental dump for every wiki in the config, retrying
    whole passes until no wiki reports a failure or pending work."""

    def __init__(self, config, date, doStubs, doRevs, dryrun, verbose):
        """Remember the run settings; they are handed unchanged to each
        per-wiki IncrDump."""
        self._config = config
        self.date = date
        self.doStubs = doStubs
        self.doRevs = doRevs
        self.dryrun = dryrun
        self.verbose = verbose

    def doRunOnAllWikis(self):
        """Make one pass over self._config.allWikisList.

        Returns a tuple (failures, todos): how many wikis returned
        FAILED and how many returned TODO from doOneWiki().
        """
        retCodes = DumpResults()
        failures = 0
        todos = 0
        for w in self._config.allWikisList:
            # use the settings stored on this instance; the original read
            # same-named module globals, which exist only when the file is
            # run as a script
            dump = IncrDump(self._config, self.date, w, self.doStubs,
                            self.doRevs, self.dryrun, self.verbose)
            result = dump.doOneWiki()
            if result == retCodes.FAILED:
                failures = failures + 1
            elif result == retCodes.TODO:
                todos = todos + 1
        return (failures, todos)

    def doAllWikisTilDone(self, numFails):
        """Repeat full passes until one finishes with no failures and no todos.

        Each incomplete pass counts against numFails; once more than
        numFails passes have been incomplete, IncrementDumpsError is
        raised.  Between passes the loop sleeps for 5 minutes.
        """
        fails = 0
        while True:
            (failures, todos) = self.doRunOnAllWikis()
            if not failures and not todos:
                break
            fails = fails + 1
            if fails > numFails:
                raise IncrementDumpsError("Too many consecutive failures, giving up")
            # wait 5 minutes and try another loop
            time.sleep(300)
| 207 | + |
def usage(message = None):
    """Print an optional message followed by the command line help, then
    exit with status 1."""
    if message:
        print(message)
    print("Usage: python generateincrementals.py [options] [wikidbname]")
    print("Options: --configfile, --date, --dryrun, --revsonly, --stubsonly, --verbose")
    print("--configfile: Specify an alternate config file to read. Default file is 'dumpincr.conf' in the current directory.")
    print("--date: (Re)run incremental of a given date (use with care).")
    print("--dryrun: Don't actually dump anything but print the commands that would be run.")
    # the two descriptions below were swapped in the original text:
    # --revsonly turns off the stubs step, --stubsonly turns off the revs step
    print("--revsonly: Do only the revision text part of the dumps.")
    print("--stubsonly: Do only the stubs part of the dumps.")
    print("--verbose: Print error messages and other informative messages (normally the")
    print(" script runs silently).")
    print("wikidbname: Run the dumps only for the specific wiki.")
    sys.exit(1)
| 222 | + |
if __name__ == "__main__":
    # Script entry point: parse options, load the config, then dump either
    # the single wiki named on the command line or all configured wikis.
    # These names stay at module level on purpose: methods defined earlier
    # in this file read 'dryrun' as a global.
    configFile = False
    result = False
    date = None
    doStubs = True
    doRevs = True
    dryrun = False
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['date=', 'configfile=', 'stubsonly', 'revsonly', 'dryrun', 'verbose' ])
    except getopt.GetoptError:
        # usage() exits, so nothing below runs with undefined options
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--date":
            date = val
        elif opt == "--configfile":
            configFile = val
        elif opt == "--stubsonly":
            doRevs = False
        elif opt == "--revsonly":
            doStubs = False
        elif opt == "--dryrun":
            dryrun = True
        elif opt == "--verbose":
            verbose = True

    if not doRevs and not doStubs:
        usage("You may not specify stubsonly and revsonly options together.")

    if (configFile):
        config = Config(configFile)
    else:
        config = Config()

    if not date:
        date = TimeUtils.today()

    if len(remainder) > 0:
        dump = IncrDump(config, date, remainder[0], doStubs, doRevs, dryrun, verbose)
        # the original built the object but never ran it, so naming a
        # wiki on the command line did nothing
        dump.doOneWiki()
    else:
        dump = IncrDumpLoop(config, date, doStubs, doRevs, dryrun, verbose)
        dump.doAllWikisTilDone(3)
Property changes on: branches/ariel/xmldumps-backup/incrementals/generateincrementals.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 268 | + native |
Index: branches/ariel/xmldumps-backup/incrementals/incrs-index.html |
— | — | @@ -0,0 +1,118 @@ |
| 2 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" |
| 3 | + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> |
| 4 | + |
| 5 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> |
| 6 | +<head> |
| 7 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> |
| 8 | + <title>Incremental dumps</title> |
| 9 | + <style type="text/css"> |
| 10 | + html, body { |
| 11 | + background-color: #ffffff; |
| 12 | + color: black; |
| 13 | + } |
| 14 | + .siteinfo { |
| 15 | + text-align: center; |
| 16 | + } |
| 17 | + li { |
| 18 | + background-color: #ffffff; |
| 19 | + list-style-type: none; |
| 20 | + } |
| 21 | + li li { |
| 22 | + background-color: white; |
| 23 | + } |
| 24 | + li ul { |
| 25 | + margin-top: 4px; |
| 26 | + margin-bottom: 8px; |
| 27 | + } |
| 28 | + .detail { |
| 29 | + font-weight: normal; |
| 30 | + font-style: italic; |
| 31 | + } |
| 32 | + .updates { |
| 33 | + font-family: monospace; |
| 34 | + font-size: smaller; |
| 35 | + } |
| 36 | + .status { |
| 37 | + font-weight: bold; |
| 38 | + padding-left: 1em; |
| 39 | + padding-right: 1em; |
| 40 | + } |
| 41 | + .in-progress { |
| 42 | + font-weight: bold; |
| 43 | + } |
| 44 | + .failed { |
| 45 | + color: Maroon; |
| 46 | + font-weight: bold; |
| 47 | + } |
| 48 | + .waiting { |
| 49 | + color: Silver; /* Gray ? */ |
| 50 | + } |
| 51 | + .progress { |
| 52 | + font-family: monospace; |
| 53 | + font-size: 80%%; |
| 54 | + margin-left: .5in; |
| 55 | + } |
| 56 | + </style> |
| 57 | +</head> |
| 58 | + |
| 59 | +<body> |
| 60 | + <h1>Adds/changes dumps</h1> |
| 61 | + |
| 62 | + <p class="siteinfo"> |
| 63 | + This is the Wikimedia adds/changes dump service. |
| 64 | + Please read the <a href='legal.html'>copyrights</a> information. |
| 65 | + See <a href="http://meta.wikimedia.org/wiki/Data_dumps">Meta:Data dumps</a> |
| 66 | + for documentation on the provided data formats. |
| 67 | + </p> |
| 68 | + <p> |
| 69 | + Here's the big fat disclaimer. |
| 70 | + </p> |
| 71 | + <p> |
| 72 | + This service is experimental. At any time it may not be working, for a day, a week or a month. |
| 73 | + It is not intended to replace the full XML dumps. We don't expect users to be able to construct |
| 74 | + full dumps of a given date from the incrementals and an older dump. |
| 75 | + </p> |
| 76 | + <p> |
| 77 | + The data provided in these files is <em>partial data</em>. To be precise: |
| 78 | + <ul> |
| 79 | + <li>* Revisions included in these dumps are not up to the minute. We write out those that were |
| 80 | + created up to 18 hours ago; this gives local editing communities time to delete revisions |
| 81 | + with sensitive information, vulgarities and other vandalism, etc.</li> |
| 82 | + <li>* New pages entered for the first time during the time interval are included</li> |
| 83 | + <li>* Revisions of undeleted pages will be included only if new revision IDs need to be assigned to |
| 84 | + the restored revisions. For most revisions this will not be the case. </li> |
| 85 | + <li>* Information about moves and deletes is not included.</li> |
| 86 | + <li>* Imported revisions will be included if they were imported during the time interval, since they |
| 87 | + will have new revision IDs.</li> |
| 88 | + <li>* As with all dumps, hidden revisions or more generally revisions not readable by the general public |
| 89 | + are not provided.</li> |
| 90 | + </ul> |
| 91 | + </p> |
| 92 | + <p> |
| 93 | + What is in these files: |
| 94 | + </p> |
| 95 | + <p> |
| 96 | + The stubs file consists of the metadata for revision texts of each page, where the revision texts were |
| 97 | + added within the time interval. These look just like the history stubs files you would find on our XML data dumps |
| 98 | + page, having the exact same format but only new revisions since the last adds/changes dump. This means you get |
| 99 | + metadata for articles, user pages, discussion pages, etc. If you want articles only, you will need to write a |
| 100 | + filter to grab just those entries. |
| 101 | + </p> |
| 102 | + <p> |
| 103 | + The revs file consists of the metadata plus the wikitext for each new revision since the last adds/changes dump. |
| 104 | + This is in the same format as the pages-meta-history files you would find on our XML data dumps page. This means |
| 105 | + you get articles, user pages, discussion pages, etc. If you want articles only, you will need to write a |
| 106 | + filter to grab just those entries. |
| 107 | + </p> |
| 108 | + <h2>Adds/changes dump listing</h2> |
| 109 | + <ul> |
| 110 | + %(items)s |
| 111 | + </ul> |
| 112 | + <hr /> |
| 113 | + <p> |
| 114 | + Return to <a href="http://dumps.wikimedia.org/other/">our other datasets</a>, the |
| 115 | + <a href="http://dumps.wikimedia.org/backup-index.html">XML data dumps</a>, or |
| 116 | + <a href="http://dumps.wikimedia.org/index.html">the main index</a>. |
| 117 | + </p> |
| 118 | +</body> |
| 119 | +</html> |
Index: branches/ariel/xmldumps-backup/incrementals/all.dblist |
— | — | @@ -0,0 +1,5 @@ |
| 2 | +elwikidb |
| 3 | +simplewikidb |
| 4 | +testAw118wmf1 |
| 5 | +testBw118wmf1 |
| 6 | +testCw118wmf1 |
\ No newline at end of file |
Index: branches/ariel/xmldumps-backup/incrementals/closed.dblist |
Index: branches/ariel/xmldumps-backup/incrementals/incrmonitor.py |
— | — | @@ -0,0 +1,134 @@ |
| 2 | +# generate an index page covering the status of and links to |
| 3 | +# incremental files for the latest date for each project |
| 4 | + |
| 5 | +import ConfigParser |
| 6 | +import getopt |
| 7 | +import os |
| 8 | +import re |
| 9 | +import sys |
| 10 | +import WikiDump |
| 11 | +from WikiDump import FileUtils, TimeUtils, MiscUtils |
| 12 | +import subprocess |
| 13 | +import socket |
| 14 | +import time |
| 15 | +import IncrDumpLib |
| 16 | +from IncrDumpLib import Lock, Config, RunSimpleCommand, MultiVersion, DBServer, IncrementDir, IncrementDumpsError, IndexFile, IncrDumpLockFile, IncDumpDirs, IncrDumpLock, MaxRevIDLock, StubFile, RevsFile, StatusFile |
| 17 | +from subprocess import Popen, PIPE |
| 18 | +from os.path import exists |
| 19 | +import hashlib |
| 20 | +import traceback |
| 21 | + |
class Link(object):
    """Helper for building HTML anchor markup."""

    @staticmethod
    def makeLink(path, linkText):
        """Return an <a> element whose href is path and whose text is linkText.

        Both arguments are inserted as given; no escaping is applied.
        """
        pieces = ['<a href = "', path, '">', linkText, "</a>"]
        return "".join(pieces)
| 28 | + |
class Index(object):
    """Generate the top-level HTML index page showing, for each wiki, the
    status of and links to the files of its latest incremental dump."""

    def __init__(self, config, verbose):
        self._config = config
        self.indexFile = IndexFile(self._config)
        self.incrDir = IncrementDir(self._config)
        self.verbose = verbose

    def doAllWikis(self):
        """Collect one <li> entry per wiki that has something to report,
        substitute them for %(items)s in the incrs-index.html template and
        write the result to the index file."""
        items = []
        for w in self._config.allWikisList:
            result = self.doOneWiki(w)
            if result:
                items.append("<li>"+ result + "</li>\n")
        indexText = self._config.readTemplate("incrs-index.html") % { "items" : "".join(items) }
        FileUtils.writeFileInPlace(self.indexFile.getPath(), indexText, self._config.fileperms)

    def doOneWiki(self, w):
        """Return the HTML fragment describing the latest incremental of wiki w.

        Returns None (nothing is listed) for private or closed wikis and
        for wikis that have no incremental directory or no dated run yet.
        If gathering or formatting the information fails, a plain error
        message string is returned so that it shows up in the index.
        """
        if w in self._config.privateWikisList or w in self._config.closedWikisList:
            return None

        self.incrDumpsDirs = IncDumpDirs(self._config, w)
        if not exists(self.incrDir.getIncDirNoDate(w)):
            if (self.verbose):
                print("No dump for wiki  %s" % (w,))
            # the original had a bare 'next' here, which is a no-op
            # expression in Python; skipping the wiki was the intent
            return None

        incrDate = self.incrDumpsDirs.getLatestIncrDate()
        if not incrDate:
            if (self.verbose):
                print("No dump for wiki  %s" % (w,))
            return None

        try:
            lock = IncrDumpLock(self._config, incrDate, w)
            lockDate = lock.getLockInfo()

            stub = StubFile(self._config, incrDate, w)
            (stubDate, stubSize) = stub.getFileInfo()
            revs = RevsFile(self._config, incrDate, w)
            (revsDate, revsSize) = revs.getFileInfo()
            stat = StatusFile(self._config, incrDate, w)
            statContents = FileUtils.readFile(stat.getPath())

        except:
            if (self.verbose):
                traceback.print_exc(file=sys.stdout)
            # return one string; the original returned a (str, w) tuple,
            # which broke the "<li>" + result concatenation in doAllWikis
            return "Error encountered, no information available for wiki %s" % (w,)

        try:
            wikinameText = "<strong>%s</strong>" % w
            if lockDate:
                lockText = "run started on %s." % lockDate
            else:
                lockText = None
            if stubDate:
                stubText = "stubs: %s (size %s)" % (Link.makeLink(os.path.join(w, incrDate, stub.getFileName()),stubDate), stubSize)
            else:
                stubText = None
            if revsDate:
                revsText = "revs: %s (size %s)" % (Link.makeLink(os.path.join(w, incrDate, revs.getFileName()),revsDate), revsSize)
            else:
                revsText = None
            if statContents:
                statText = "(%s)" % (statContents)
            else:
                statText = None

            # first line: wiki name, run start time, status;
            # second line: links to whichever files exist
            wikiInfo = " ".join( filter( None, [ wikinameText, lockText, statText ] ) ) + "<br />"
            wikiInfo = wikiInfo + " " + " | ".join( filter( None, [ stubText, revsText ] ))
        except:
            if (self.verbose):
                traceback.print_exc(file=sys.stdout)
            return "Error encountered formatting information for wiki %s" % (w,)

        return wikiInfo
| 102 | + |
def usage(message = None):
    """Print an optional message followed by the command line help, then
    exit with status 1."""
    if message:
        print(message)
    # this script is incrmonitor.py and takes no positional arguments; the
    # original text named "monitor.py" and advertised a [wikidbname]
    # argument that the script ignores
    print("Usage: python incrmonitor.py [options]")
    print("Options: --configfile, --verbose")
    print("--configfile: Specify an alternate config file to read. Default file is 'dumpincr.conf' in the current directory.")
    print("--verbose: Print error messages and other informative messages (normally the")
    print(" script runs silently).")
    sys.exit(1)
| 112 | + |
if __name__ == "__main__":
    # Script entry point: parse options, load the config and regenerate
    # the index page for all wikis.
    configFile = False
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
                                                 ['configfile=', 'verbose' ])
    except getopt.GetoptError:
        # only option-parsing errors are reported as bad options;
        # usage() exits, so nothing below runs with undefined options
        usage("Unknown option specified")

    for (opt, val) in options:
        if opt == "--configfile":
            configFile = val
        elif opt == '--verbose':
            verbose = True

    if (configFile):
        config = Config(configFile)
    else:
        config = Config()

    index = Index(config, verbose)
    index.doAllWikis()
Property changes on: branches/ariel/xmldumps-backup/incrementals/incrmonitor.py |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 136 | + native |
Index: branches/ariel/xmldumps-backup/incrementals/private.dblist |
Index: branches/ariel/xmldumps-backup/incrementals/README.txt |
— | — | @@ -0,0 +1,66 @@ |
| 2 | +The adds/changes dumps are a supplementary set of dumps intended to accompany |
| 3 | +the regular XML dump files. |
| 4 | + |
| 5 | +The adds/changes dumps are produced in two stages. |
| 6 | + |
| 7 | +In stage one, the max rev_id value at the time of the run is written out to a file for each project for the given date. Script name: generatemaxrevids.py |
| 8 | + |
| 9 | +In stage two, intended to be run at a later time, a stub file is written containing all |
| 10 | +revisions from the previous adds/changes dump through the max rev_id just |
| 11 | +recorded. This file is sorted by page id, just as the regular XML stubs files |
| 12 | +are. Next a history file containing metadata and page text for those |
| 13 | +revisions is written, in the same format as the pages-meta-history file |
| 14 | +generated for the regular XML dumps. A status file is written to indicate |
| 15 | +that the job is done, and the md5sums of the stub and revision text files |
| 16 | +are written to a file as well. Script name: generateincrementals.py |
| 17 | + |
| 18 | +The reason that there are two stages run via two separate scripts is that |
| 19 | +you may want to allow editors time to delete or hide sensitive or offensive |
| 20 | +material newly entered. A delay of an arbitrary number of seconds between |
| 21 | +the recording of the max rev_id to dump and the start of the stub and |
| 22 | +revision text dump is configurable in the configuration file; see |
| 23 | +README.config for information on that. |
| 24 | + |
| 25 | +Installation: |
| 26 | + |
| 27 | +Seriously? You want to install this already? This is version 0.0.1. Know |
| 28 | +what that means? It's buggy, risky, and could eat your data. |
| 29 | + |
| 30 | +However, if you just want to play around with it on your laptop, fine. |
| 31 | +* Put the files generateincrementals.py, generatemaxrevids.py, incrmonitor.py, |
| 32 | + incrmonitor and IncrDumpLib.py together with the sample configuration file |
| 33 | + dumpincr.conf into a directory from which the job will run. |
| 34 | + Make sure you have a copy or a symlink of WikiDump.py from the regular XML |
| 35 | + dumps in this same directory. |
| 36 | + Also make sure you have a template for the top level index.html file, called |
| 37 | + "incrs-index.html" in the same directory with these scripts. See the existing |
| 38 | + incrs-index.html file for the format; the key here is that you want the |
| 39 | + string "%(items)s" in between <ul> and </ul> tags. The status of the dump |
| 40 | + for each wiki, along with links to the stub and revisions files, will be |
| 41 | + included as a list item in that spot in the file. |
| 42 | +* See README.config for information on the various options in the config file. |
| 43 | +* Create the top level directory underneath which there will be a directory |
| 44 | + for each project you want to generate additions/changes. You needn't create |
| 45 | + the subdirectories, this will be done for you at run time. |
| 46 | +* Do a test run; run generatemaxrevids.py by hand. Then look in the top level |
| 47 | + directory you created earlier. Is there a directory for each project? Is |
| 48 | + there a subdirectory under each of these with the date, in YYYYMMDD format? |
| 49 | + In the date subdirectory is there a file maxrevid.txt containing a positive |
| 50 | + integer? |
| 51 | +* Do the phase 2 test run: run generateincrementals.py by hand. If you have |
| 52 | + configured a large delay, you will need to wait at least that amount of time |
| 53 | + before running this script. When it has completed, check the subdirectory |
| 54 | + from phase 1; are there files analogous to the following? |
| 55 | + mywiki-yyyymmdd-md5sums.txt |
| 56 | + mywiki-yyyymmdd-pages-meta-hist-incr.xml.bz2 |
| 57 | + mywiki-yyyymmdd-stubs-meta-hist-incr.xml.gz |
| 58 | + maxrevid.txt |
| 59 | + status.txt |
| 60 | + Does the status.txt file contain "done"? |
| 61 | +* If the runs look like they are producing the right files, do the html |
| 62 | + generation by hand; run incrmonitor.py. In the top level directory for the |
| 63 | + adds/changes dumps, do you see the file index.html? If you view that |
| 64 | + file in a browser, do the contents look reasonable? |
| 65 | +* If that looks good, put phase 1 and phase 2 into separate cron jobs, |
| 66 | + spacing them out as appropriate. |
| 67 | + |
Property changes on: branches/ariel/xmldumps-backup/incrementals/README.txt |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 68 | + native |