Index: trunk/backup/worker.py |
— | — | @@ -23,7 +23,7 @@ |
24 | 24 | parts.insert(0, file) |
25 | 25 | (path, file) = os.path.split(path) |
26 | 26 | return parts |
27 | | - |
| 27 | + |
28 | 28 | def relativePath(path, base): |
29 | 29 | """Return a relative path to 'path' from the directory 'base'.""" |
30 | 30 | path = splitPath(path) |
— | — | @@ -56,7 +56,7 @@ |
57 | 57 | pass |
58 | 58 | |
59 | 59 | class Runner(object): |
60 | | - |
| 60 | + |
61 | 61 | def __init__(self, wiki, date=None, checkpoint=None, prefetch=True, spawn=True): |
62 | 62 | self.wiki = wiki |
63 | 63 | self.config = wiki.config |
— | — | @@ -68,12 +68,12 @@ |
69 | 69 | else: |
70 | 70 | self.date = WikiDump.today() |
71 | 71 | wiki.setDate(self.date) |
72 | | - |
| 72 | + |
73 | 73 | self.failCount = 0 |
74 | 74 | self.lastFailed = False |
75 | | - |
| 75 | + |
76 | 76 | self.checkpoint = checkpoint |
77 | | - |
| 77 | + |
78 | 78 | def passwordOption(self): |
79 | 79 | """If you pass '-pfoo' mysql uses the password 'foo', |
80 | 80 | but if you pass '-p' it prompts. Sigh.""" |
— | — | @@ -81,13 +81,13 @@ |
82 | 82 | return None |
83 | 83 | else: |
84 | 84 | return "-p" + self.config.dbPassword |
85 | | - |
| 85 | + |
86 | 86 | def forceNormalOption(self): |
87 | 87 | if self.config.forceNormal: |
88 | 88 | return "--force-normal" |
89 | 89 | else: |
90 | 90 | return "" |
91 | | - |
| 91 | + |
92 | 92 | def getDBTablePrefix(self): |
93 | 93 | """Get the prefix for all tables for the specific wiki ($wgDBprefix)""" |
94 | 94 | command = "echo 'print $wgDBprefix; ' | %s -q %s/maintenance/eval.php --wiki=%s" % shellEscape(( |
— | — | @@ -96,14 +96,14 @@ |
97 | 97 | |
98 | 98 | def saveTable(self, table, outfile): |
99 | 99 | """Dump a table from the current DB with mysqldump, save to a gzipped sql file.""" |
100 | | - command = "mysqldump -h %s -u %s %s --extended-insert --skip-opt --quick --create-options --add-drop-table --extended-insert --set-charset --quote-names %s %s | gzip" % shellEscape(( |
| 100 | + command = "mysqldump -h %s -u %s %s --opt --quick --skip-add-locks --skip-lock-tables %s %s | gzip" % shellEscape(( |
101 | 101 | self.dbServer, |
102 | 102 | self.config.dbUser, |
103 | 103 | self.passwordOption(), |
104 | 104 | self.dbName, |
105 | 105 | self.getDBTablePrefix() + table)) |
106 | 106 | return self.saveCommand(command, outfile, pipe=True) |
107 | | - |
| 107 | + |
108 | 108 | def saveSql(self, query, outfile): |
109 | 109 | """Pass some SQL commands to the server for this DB and save output to a file.""" |
110 | 110 | command = "echo %s | mysql -h %s -u %s %s %s -r | gzip" % shellEscape(( |
— | — | @@ -113,11 +113,11 @@ |
114 | 114 | self.passwordOption(), |
115 | 115 | self.dbName)) |
116 | 116 | return self.saveCommand(command, outfile, pipe=True) |
117 | | - |
| 117 | + |
118 | 118 | def saveCommand(self, command, outfile, pipe=False): |
119 | 119 | """Shell out and redirect output to a given file.""" |
120 | 120 | return self.runCommand(command + " > " + shellEscape(outfile), pipe) |
121 | | - |
| 121 | + |
122 | 122 | def runCommand(self, command, pipe=False, callback=None): |
123 | 123 | """Shell out; output is assumed to be saved usefully somehow. |
124 | 124 | Nonzero return code from the shell will raise a BackupError. |
— | — | @@ -135,7 +135,7 @@ |
136 | 136 | if retval: |
137 | 137 | raise BackupError("nonzero return code from '%s'" % command) |
138 | 138 | return retval |
139 | | - |
| 139 | + |
140 | 140 | def runAndReport(self, command, callback): |
141 | 141 | """Shell out to a command, and feed output lines to the callback function. |
142 | 142 | Returns the exit code from the program once complete. |
— | — | @@ -149,7 +149,7 @@ |
150 | 150 | callback(self, line) |
151 | 151 | line = proc.fromchild.readline() |
152 | 152 | return proc.wait() |
153 | | - |
| 153 | + |
154 | 154 | def runAndReturn(self, command): |
155 | 155 | """Run a command and return the output as a string. |
156 | 156 | Raises BackupError on non-zero return code.""" |
— | — | @@ -160,16 +160,16 @@ |
161 | 161 | raise BackupError("Non-zero return code from '%s'" % command) |
162 | 162 | else: |
163 | 163 | return output |
164 | | - |
| 164 | + |
165 | 165 | def debug(self, stuff): |
166 | 166 | print "%s: %s %s" % (prettyTime(), self.dbName, stuff) |
167 | | - |
| 167 | + |
168 | 168 | def buildDir(self, base, version): |
169 | 169 | return join(base, self.dbName, version) |
170 | | - |
| 170 | + |
171 | 171 | def buildPath(self, base, version, filename): |
172 | 172 | return join(base, version, "%s-%s-%s" % (self.dbName, version, filename)) |
173 | | - |
| 173 | + |
174 | 174 | def privatePath(self, filename): |
175 | 175 | """Take a given filename in the private dump dir for the selected database.""" |
176 | 176 | return self.buildPath(self.wiki.privateDir(), self.date, filename) |
— | — | @@ -179,38 +179,38 @@ |
180 | 180 | If this database is marked as private, will use the private dir instead. |
181 | 181 | """ |
182 | 182 | return self.buildPath(self.wiki.publicDir(), self.date, filename) |
183 | | - |
| 183 | + |
184 | 184 | def latestPath(self, filename): |
185 | 185 | return self.buildPath(self.wiki.publicDir(), "latest", filename) |
186 | | - |
| 186 | + |
187 | 187 | def webPath(self, filename): |
188 | 188 | return self.buildPath(self.wiki.webDir(), self.date, filename) |
189 | | - |
| 189 | + |
190 | 190 | def makeDir(self, dir): |
191 | 191 | if exists(dir): |
192 | 192 | self.debug("Checkdir dir %s ..." % dir) |
193 | 193 | else: |
194 | 194 | self.debug("Creating %s ..." % dir) |
195 | 195 | os.makedirs(dir) |
196 | | - |
| 196 | + |
197 | 197 | def selectDatabaseServer(self): |
198 | 198 | self.dbServer = self.defaultServer() |
199 | | - |
| 199 | + |
200 | 200 | def defaultServer(self): |
201 | 201 | command = "%s -q %s/maintenance/getSlaveServer.php --wiki=%s --group=dump" % shellEscape(( |
202 | 202 | self.config.php, self.config.wikiDir, self.dbName)) |
203 | 203 | return self.runAndReturn(command).strip() |
204 | | - |
| 204 | + |
205 | 205 | def run(self): |
206 | 206 | self.makeDir(join(self.wiki.publicDir(), self.date)) |
207 | 207 | self.makeDir(join(self.wiki.privateDir(), self.date)) |
208 | | - |
| 208 | + |
209 | 209 | self.status("Cleaning up old dumps for %s" % self.dbName) |
210 | 210 | self.cleanOldDumps() |
211 | | - |
| 211 | + |
212 | 212 | self.status("Starting backup of %s" % self.dbName) |
213 | 213 | self.selectDatabaseServer() |
214 | | - |
| 214 | + |
215 | 215 | self.items = [PrivateTable("user", "User account data."), |
216 | 216 | PrivateTable("watchlist", "Users' watchlist settings."), |
217 | 217 | PrivateTable("ipblocks", "Data for blocks of IP addresses, ranges, and users."), |
— | — | @@ -219,7 +219,7 @@ |
220 | 220 | PrivateTable("logging", "Data for various events (deletions, uploads, etc)."), |
221 | 221 | #PrivateTable("oldimage", "Metadata on prior versions of uploaded images."), |
222 | 222 | #PrivateTable("filearchive", "Deleted image data"), |
223 | | - |
| 223 | + |
224 | 224 | PublicTable("site_stats", "A few statistics such as the page count."), |
225 | 225 | PublicTable("image", "Metadata on current versions of uploaded images."), |
226 | 226 | PublicTable("oldimage", "Metadata on prior versions of uploaded images."), |
— | — | @@ -232,7 +232,7 @@ |
233 | 233 | PublicTable("interwiki", "Set of defined interwiki prefixes and links for this wiki."), |
234 | 234 | PublicTable("user_groups", "User group assignments."), |
235 | 235 | PublicTable("category", "Category information."), |
236 | | - |
| 236 | + |
237 | 237 | PublicTable("page", "Base per-page data (id, title, old restrictions, etc)."), |
238 | 238 | PublicTable("page_restrictions", "Newer per-page restrictions table."), |
239 | 239 | PublicTable("page_props", "Name/value pairs for pages."), |
— | — | @@ -240,11 +240,11 @@ |
241 | 241 | #PublicTable("revision", "Base per-revision data (does not include text)."), // safe? |
242 | 242 | #PrivateTable("text", "Text blob storage. May be compressed, etc."), // ? |
243 | 243 | PublicTable("redirect", "Redirect list"), |
244 | | - |
| 244 | + |
245 | 245 | TitleDump("List of page titles"), |
246 | | - |
| 246 | + |
247 | 247 | AbstractDump("Extracted page abstracts for Yahoo"), |
248 | | - |
| 248 | + |
249 | 249 | XmlStub("First-pass for page XML data dumps"), |
250 | 250 | XmlDump("articles", |
251 | 251 | "<big><b>Articles, templates, image descriptions, and primary meta-pages.</b></big>", |
— | — | @@ -257,7 +257,7 @@ |
258 | 258 | self.items.append( |
259 | 259 | PublicTable( "flaggedpages", "This contains a row for each flagged article, containing the stable revision ID, if the lastest edit was flagged, and how long edits have been pending." )) |
260 | 260 | self.items.append( |
261 | | - PublicTable( "flaggedrevs", "This contains a row for each flagged revision, containing who flagged it, when it was flagged, reviewer comments, the flag values, and the quality tier those flags fall under." )) |
| 261 | + PublicTable( "flaggedrevs", "This contains a row for each flagged revision, containing who flagged it, when it was flagged, reviewer comments, the flag values, and the quality tier those flags fall under." )) |
262 | 262 | |
263 | 263 | if not self.wiki.isBig(): |
264 | 264 | self.items.append( |
— | — | @@ -270,10 +270,10 @@ |
271 | 271 | "All pages with complete edit history (.7z)", |
272 | 272 | "These dumps can be *very* large, uncompressing up to 100 times the archive download size. " + |
273 | 273 | "Suitable for archival and statistical use, most mirror sites won't want or need this.")) |
274 | | - |
| 274 | + |
275 | 275 | files = self.listFilesFor(self.items) |
276 | 276 | self.prepareChecksums() |
277 | | - |
| 277 | + |
278 | 278 | for item in self.items: |
279 | 279 | item.start(self) |
280 | 280 | self.updateStatusFiles() |
— | — | @@ -306,9 +306,9 @@ |
307 | 307 | |
308 | 308 | if self.failCount < 1: |
309 | 309 | self.completeDump(files) |
310 | | - |
| 310 | + |
311 | 311 | self.statusComplete() |
312 | | - |
| 312 | + |
313 | 313 | def cleanOldDumps(self): |
314 | 314 | old = self.wiki.dumpDirs() |
315 | 315 | if old: |
— | — | @@ -327,7 +327,7 @@ |
328 | 328 | self.runCommand(command) |
329 | 329 | else: |
330 | 330 | self.status("No old dumps to purge.") |
331 | | - |
| 331 | + |
332 | 332 | def reportFailure(self): |
333 | 333 | if self.config.adminMail: |
334 | 334 | subject = "Dump failure for " + self.dbName |
— | — | @@ -337,27 +337,27 @@ |
338 | 338 | "time": prettyTime(), |
339 | 339 | "url": "/".join((self.config.webRoot, self.dbName, self.date, ''))} |
340 | 340 | config.mail(subject, message) |
341 | | - |
| 341 | + |
342 | 342 | def listFilesFor(self, items): |
343 | 343 | files = [] |
344 | 344 | for item in items: |
345 | 345 | for file in item.listFiles(self): |
346 | 346 | files.append(file) |
347 | 347 | return files |
348 | | - |
| 348 | + |
349 | 349 | def updateStatusFiles(self, done=False): |
350 | 350 | self.saveStatus(self.items, done) |
351 | | - |
| 351 | + |
352 | 352 | def saveStatus(self, items, done=False): |
353 | 353 | """Write out an HTML file with the status for this wiki's dump and links to completed files.""" |
354 | | - try: |
| 354 | + try: |
355 | 355 | self.wiki.writeIndex(self.reportStatus(items, done)) |
356 | | - |
| 356 | + |
357 | 357 | # Short line for report extraction |
358 | 358 | self.wiki.writeStatus(self.reportDatabase(items, done)) |
359 | 359 | except: |
360 | 360 | print "Couldn't update status files. Continuing anyways" |
361 | | - |
| 361 | + |
362 | 362 | def progressReports(self): |
363 | 363 | status = {} |
364 | 364 | for db in self.dblist: |
— | — | @@ -366,7 +366,7 @@ |
367 | 367 | status[db] = item |
368 | 368 | # sorted by name... |
369 | 369 | return [status[db] for db in self.dblist if db in status] |
370 | | - |
| 370 | + |
371 | 371 | def readProgress(self, db): |
372 | 372 | dir = self.latestDump(db) |
373 | 373 | if dir: |
— | — | @@ -378,18 +378,18 @@ |
379 | 379 | else: |
380 | 380 | self.debug("No dump dir for %s?" % db) |
381 | 381 | return None |
382 | | - |
| 382 | + |
383 | 383 | def reportDatabase(self, items, done=False): |
384 | 384 | """Put together a brief status summary and link for the current database.""" |
385 | 385 | status = self.reportStatusLine(done) |
386 | 386 | html = self.wiki.reportStatusLine(status) |
387 | | - |
| 387 | + |
388 | 388 | activeItems = [x for x in items if x.status == "in-progress"] |
389 | 389 | if activeItems: |
390 | 390 | return html + "<ul>" + "\n".join([self.reportItem(x) for x in activeItems]) + "</ul>" |
391 | 391 | else: |
392 | 392 | return html |
393 | | - |
| 393 | + |
394 | 394 | def reportStatus(self, items, done=False): |
395 | 395 | """Put together a status page for this database, with all its component dumps.""" |
396 | 396 | statusItems = [self.reportItem(item) for item in items] |
— | — | @@ -403,7 +403,7 @@ |
404 | 404 | "items": html, |
405 | 405 | "checksum": self.webPath("md5sums.txt"), |
406 | 406 | "index": self.config.index} |
407 | | - |
| 407 | + |
408 | 408 | def reportPreviousDump(self, done): |
409 | 409 | """Produce a link to the previous dump, if any""" |
410 | 410 | try: |
— | — | @@ -418,7 +418,7 @@ |
419 | 419 | prefix = "This dump is in progress; see also the " |
420 | 420 | message = "previous dump from" |
421 | 421 | return "%s<a href=\"../%s/\">%s %s</a>" % (prefix, raw, message, date) |
422 | | - |
| 422 | + |
423 | 423 | def reportStatusLine(self, done=False): |
424 | 424 | if done: |
425 | 425 | classes = "done" |
— | — | @@ -434,7 +434,7 @@ |
435 | 435 | ess = "s" |
436 | 436 | text += ", %d item%s failed" % (self.failCount, ess) |
437 | 437 | return "<span class='%s'>%s</span>" % (classes, text) |
438 | | - |
| 438 | + |
439 | 439 | def reportItem(self, item): |
440 | 440 | """Return an HTML fragment with info on the progress of this item.""" |
441 | 441 | html = "<li class='%s'><span class='updates'>%s</span> <span class='status'>%s</span> <span class='title'>%s</span>" % (item.status, item.updated, item.status, item.description()) |
— | — | @@ -451,8 +451,8 @@ |
452 | 452 | html += "</ul>" |
453 | 453 | html += "</li>" |
454 | 454 | return html |
455 | | - |
456 | | - # Report on the file size & status of the current output and output a link if we're done |
| 455 | + |
| 456 | + # Report on the file size & status of the current output and output a link if we're done |
457 | 457 | def reportFile(self, file, status): |
458 | 458 | filepath = self.publicPath(file) |
459 | 459 | if status == "in-progress" and exists (filepath): |
— | — | @@ -464,13 +464,13 @@ |
465 | 465 | return "<li class='file'><a href=\"%s\">%s</a> %s</li>" % (webpath, file, size) |
466 | 466 | else: |
467 | 467 | return "<li class='missing'>%s</li>" % file |
468 | | - |
| 468 | + |
469 | 469 | def lockFile(self): |
470 | 470 | return self.publicPath("lock") |
471 | | - |
| 471 | + |
472 | 472 | def doneFile(self): |
473 | 473 | return self.publicPath("done") |
474 | | - |
| 474 | + |
475 | 475 | def lock(self): |
476 | 476 | self.status("Creating lock file.") |
477 | 477 | lockfile = self.lockFile() |
— | — | @@ -486,46 +486,46 @@ |
487 | 487 | # failure? let it die |
488 | 488 | pass |
489 | 489 | #####date -u > $StatusLockFile |
490 | | - |
| 490 | + |
491 | 491 | def unlock(self): |
492 | 492 | self.status("Marking complete.") |
493 | 493 | ######date -u > $StatusDoneFile |
494 | | - |
| 494 | + |
495 | 495 | def dateStamp(self): |
496 | 496 | #date -u --iso-8601=seconds |
497 | 497 | pass |
498 | | - |
| 498 | + |
499 | 499 | def status(self, message): |
500 | 500 | #echo $DatabaseName `dateStamp` OK: "$1" | tee -a $StatusLog | tee -a $GlobalLog |
501 | 501 | self.debug(message) |
502 | | - |
| 502 | + |
503 | 503 | def statusComplete(self): |
504 | 504 | # echo $DatabaseName `dateStamp` SUCCESS: "done." | tee -a $StatusLog | tee -a $GlobalLog |
505 | 505 | self.debug("SUCCESS: done.") |
506 | | - |
| 506 | + |
507 | 507 | def prepareChecksums(self): |
508 | 508 | """Create the md5 checksum file at the start of the run. |
509 | 509 | This will overwrite a previous run's output, if any.""" |
510 | 510 | output = file(self.publicPath("md5sums.txt"), "w") |
511 | | - |
| 511 | + |
512 | 512 | def checksum(self, filename): |
513 | 513 | """Run checksum for an output file, and append to the list.""" |
514 | 514 | output = file(self.publicPath("md5sums.txt"), "a") |
515 | 515 | self.saveChecksum(filename, output) |
516 | 516 | output.close() |
517 | | - |
| 517 | + |
518 | 518 | def saveChecksum(self, file, output): |
519 | 519 | self.debug("Checksumming %s" % file) |
520 | 520 | path = self.publicPath(file) |
521 | 521 | if os.path.exists(path): |
522 | 522 | checksum = md5FileLine(path) |
523 | 523 | output.write(checksum) |
524 | | - |
| 524 | + |
525 | 525 | def completeDump(self, files): |
526 | 526 | # FIXME: md5sums.txt won't be consistent with mixed data. |
527 | 527 | # Buuuuut life sucks, huh? |
528 | 528 | self.saveSymlink("md5sums.txt") |
529 | | - |
| 529 | + |
530 | 530 | def saveSymlink(self, file): |
531 | 531 | self.makeDir(join(self.wiki.publicDir(), 'latest')) |
532 | 532 | real = self.publicPath(file) |
— | — | @@ -539,7 +539,7 @@ |
540 | 540 | relative = relativePath(real, dirname(link)) |
541 | 541 | self.debug("Adding symlink %s -> %s" % (link, relative)) |
542 | 542 | os.symlink(relative, link) |
543 | | - |
| 543 | + |
544 | 544 | def saveFeed(self, file): |
545 | 545 | self.makeDir(join(self.wiki.publicDir(), 'latest')) |
546 | 546 | filePath = self.webPath(file) |
— | — | @@ -562,26 +562,26 @@ |
563 | 563 | self.updated = "" |
564 | 564 | self.status = "waiting" |
565 | 565 | self.progress = "" |
566 | | - |
| 566 | + |
567 | 567 | def description(self): |
568 | 568 | return self._desc |
569 | | - |
| 569 | + |
570 | 570 | def detail(self): |
571 | 571 | """Optionally return additional text to appear under the heading.""" |
572 | 572 | return None |
573 | | - |
| 573 | + |
574 | 574 | def setStatus(self, status): |
575 | 575 | self.status = status |
576 | 576 | self.updated = prettyTime() |
577 | | - |
| 577 | + |
578 | 578 | def listFiles(self, runner): |
579 | 579 | """Return a list of filenames which should be exported and checksummed""" |
580 | 580 | return [] |
581 | | - |
| 581 | + |
582 | 582 | def start(self, runner): |
583 | 583 | """Set the 'in progress' flag so we can output status.""" |
584 | 584 | self.setStatus("in-progress") |
585 | | - |
| 585 | + |
586 | 586 | def dump(self, runner): |
587 | 587 | """Attempt to run the operation, updating progress/status info.""" |
588 | 588 | try: |
— | — | @@ -590,52 +590,52 @@ |
591 | 591 | self.setStatus("failed") |
592 | 592 | raise ex |
593 | 593 | self.setStatus("done") |
594 | | - |
| 594 | + |
595 | 595 | def run(self, runner): |
596 | 596 | """Actually do something!""" |
597 | 597 | pass |
598 | | - |
| 598 | + |
599 | 599 | def progressCallback(self, runner, line): |
600 | 600 | """Receive a status line from a shellout and update the status files.""" |
601 | 601 | # pass through... |
602 | 602 | sys.stderr.write(line) |
603 | 603 | self.progress = line.strip() |
604 | 604 | runner.updateStatusFiles() |
605 | | - |
| 605 | + |
606 | 606 | def matchCheckpoint(self, checkpoint): |
607 | 607 | return checkpoint == self.__class__.__name__ |
608 | 608 | |
609 | 609 | class PublicTable(Dump): |
610 | 610 | """Dump of a table using MySQL's mysqldump utility.""" |
611 | | - |
| 611 | + |
612 | 612 | def __init__(self, table, desc): |
613 | 613 | Dump.__init__(self, desc) |
614 | 614 | self._table = table |
615 | | - |
| 615 | + |
616 | 616 | def _file(self): |
617 | 617 | return self._table + ".sql.gz" |
618 | | - |
| 618 | + |
619 | 619 | def _path(self, runner): |
620 | 620 | return runner.publicPath(self._file()) |
621 | | - |
| 621 | + |
622 | 622 | def run(self, runner): |
623 | 623 | return runner.saveTable(self._table, self._path(runner)) |
624 | | - |
| 624 | + |
625 | 625 | def listFiles(self, runner): |
626 | 626 | return [self._file()] |
627 | | - |
| 627 | + |
628 | 628 | def matchCheckpoint(self, checkpoint): |
629 | 629 | return checkpoint == self.__class__.__name__ + "." + self._table |
630 | 630 | |
631 | 631 | class PrivateTable(PublicTable): |
632 | 632 | """Hidden table dumps for private data.""" |
633 | | - |
| 633 | + |
634 | 634 | def description(self): |
635 | 635 | return self._desc + " (private)" |
636 | | - |
| 636 | + |
637 | 637 | def _path(self, runner): |
638 | 638 | return runner.privatePath(self._file()) |
639 | | - |
| 639 | + |
640 | 640 | def listFiles(self, runner): |
641 | 641 | """Private table won't have public files to list.""" |
642 | 642 | return [] |
— | — | @@ -645,18 +645,18 @@ |
646 | 646 | """Create lightweight skeleton dumps, minus bulk text. |
647 | 647 | A second pass will import text from prior dumps or the database to make |
648 | 648 | full files for the public.""" |
649 | | - |
| 649 | + |
650 | 650 | def description(self): |
651 | 651 | return "Creating split stub dumps..." |
652 | | - |
| 652 | + |
653 | 653 | def detail(self): |
654 | 654 | return "These files contain no page text, only revision metadata." |
655 | | - |
| 655 | + |
656 | 656 | def listFiles(self, runner): |
657 | 657 | return ["stub-meta-history.xml.gz", |
658 | 658 | "stub-meta-current.xml.gz", |
659 | 659 | "stub-articles.xml.gz",] |
660 | | - |
| 660 | + |
661 | 661 | def run(self, runner): |
662 | 662 | history = runner.publicPath("stub-meta-history.xml.gz") |
663 | 663 | current = runner.publicPath("stub-meta-current.xml.gz") |
— | — | @@ -688,20 +688,20 @@ |
689 | 689 | history, |
690 | 690 | current, |
691 | 691 | articles)) |
692 | | - runner.runCommand(command, callback=self.progressCallback) |
| 692 | + runner.runCommand(command, callback=self.progressCallback) |
693 | 693 | |
694 | 694 | class XmlLogging(Dump): |
695 | 695 | """ Create a logging dump of all page activity """ |
696 | | - |
| 696 | + |
697 | 697 | def description(self): |
698 | 698 | return "<big><b>Log events to all pages.</big></b>" |
699 | | - |
| 699 | + |
700 | 700 | def detail(self): |
701 | 701 | return "This contains the log of actions performed on pages." |
702 | | - |
| 702 | + |
703 | 703 | def listFiles(self, runner): |
704 | 704 | return ["pages-logging.xml.gz"] |
705 | | - |
| 705 | + |
706 | 706 | def run(self, runner): |
707 | 707 | logging = runner.publicPath("pages-logging.xml.gz") |
708 | 708 | if exists(logging): |
— | — | @@ -735,24 +735,24 @@ |
736 | 736 | def detail(self): |
737 | 737 | """Optionally return additional text to appear under the heading.""" |
738 | 738 | return self._detail |
739 | | - |
| 739 | + |
740 | 740 | def _file(self, ext): |
741 | 741 | return "pages-" + self._subset + ".xml." + ext |
742 | | - |
| 742 | + |
743 | 743 | def _path(self, runner, ext): |
744 | 744 | return runner.publicPath(self._file(ext)) |
745 | | - |
| 745 | + |
746 | 746 | def run(self, runner): |
747 | 747 | filters = self.buildFilters(runner) |
748 | 748 | command = self.buildCommand(runner) |
749 | 749 | eta = self.buildEta(runner) |
750 | 750 | return runner.runCommand(command + " " + filters + " " + eta, |
751 | 751 | callback=self.progressCallback) |
752 | | - |
| 752 | + |
753 | 753 | def buildEta(self, runner): |
754 | 754 | """Tell the dumper script whether to make ETA estimate on page or revision count.""" |
755 | 755 | return "--current" |
756 | | - |
| 756 | + |
757 | 757 | def buildFilters(self, runner): |
758 | 758 | """Construct the output filter options for dumpTextPass.php""" |
759 | 759 | xmlbz2 = self._path(runner, "bz2") |
— | — | @@ -761,14 +761,14 @@ |
762 | 762 | else: |
763 | 763 | bz2mode = "bzip2" |
764 | 764 | return "--output=%s:%s" % shellEscape((bz2mode, xmlbz2)) |
765 | | - |
| 765 | + |
766 | 766 | def buildCommand(self, runner): |
767 | 767 | """Build the command line for the dump, minus output and filter options""" |
768 | | - |
| 768 | + |
769 | 769 | # Page and revision data pulled from this skeleton dump... |
770 | 770 | stub = runner.publicPath("stub-%s.xml.gz" % self._subset), |
771 | 771 | stubOption = "--stub=gzip:%s" % stub |
772 | | - |
| 772 | + |
773 | 773 | # Try to pull text from the previous run; most stuff hasn't changed |
774 | 774 | #Source=$OutputDir/pages_$section.xml.bz2 |
775 | 775 | if self._prefetch: |
— | — | @@ -806,7 +806,7 @@ |
807 | 807 | spawn)) |
808 | 808 | command = dumpCommand |
809 | 809 | return command |
810 | | - |
| 810 | + |
811 | 811 | def _findPreviousDump(self, runner): |
812 | 812 | """The previously-linked previous successful dump.""" |
813 | 813 | bzfile = self._file("bz2") |
— | — | @@ -830,52 +830,52 @@ |
831 | 831 | return old |
832 | 832 | runner.debug("Could not locate a prefetchable dump.") |
833 | 833 | return None |
834 | | - |
| 834 | + |
835 | 835 | def listFiles(self, runner): |
836 | 836 | return [self._file("bz2")] |
837 | | - |
| 837 | + |
838 | 838 | def matchCheckpoint(self, checkpoint): |
839 | 839 | return checkpoint == self.__class__.__name__ + "." + self._subset |
840 | 840 | |
841 | 841 | class BigXmlDump(XmlDump): |
842 | 842 | """XML page dump for something larger, where a 7-Zip compressed copy |
843 | 843 | could save 75% of download time for some users.""" |
844 | | - |
| 844 | + |
845 | 845 | def buildEta(self, runner): |
846 | 846 | """Tell the dumper script whether to make ETA estimate on page or revision count.""" |
847 | 847 | return "--full" |
848 | 848 | |
849 | 849 | class XmlRecompressDump(Dump): |
850 | 850 | """Take a .bz2 and recompress it as 7-Zip.""" |
851 | | - |
| 851 | + |
852 | 852 | def __init__(self, subset, desc, detail): |
853 | 853 | Dump.__init__(self, desc) |
854 | 854 | self._subset = subset |
855 | 855 | self._detail = detail |
856 | | - |
| 856 | + |
857 | 857 | def detail(self): |
858 | 858 | """Optionally return additional text to appear under the heading.""" |
859 | 859 | return self._detail |
860 | | - |
| 860 | + |
861 | 861 | def _file(self, ext): |
862 | 862 | return "pages-" + self._subset + ".xml." + ext |
863 | | - |
| 863 | + |
864 | 864 | def _path(self, runner, ext): |
865 | 865 | return runner.publicPath(self._file(ext)) |
866 | | - |
| 866 | + |
867 | 867 | def run(self, runner): |
868 | 868 | if runner.lastFailed: |
869 | 869 | raise BackupError("bz2 dump incomplete, not recompressing") |
870 | | - |
| 870 | + |
871 | 871 | xmlbz2 = self._path(runner, "bz2") |
872 | 872 | xml7z = self._path(runner, "7z") |
873 | | - |
| 873 | + |
874 | 874 | # Clear prior 7zip attempts; 7zip will try to append an existing archive |
875 | 875 | if exists(xml7z): |
876 | 876 | os.remove(xml7z) |
877 | | - |
| 877 | + |
878 | 878 | # temp hack force 644 permissions until ubuntu bug # 370618 is fixed - tomasz 5/1/2009 |
879 | | - command = "%s -dc < %s | %s a -si %s ; chmod 644 %s" % shellEscape(( |
| 879 | + command = "%s -dc < %s | %s a -si %s ; chmod 644 %s" % shellEscape(( |
880 | 880 | runner.config.bzip2, |
881 | 881 | xmlbz2, |
882 | 882 | runner.config.sevenzip, |
— | — | @@ -883,16 +883,16 @@ |
884 | 884 | xml7z)); |
885 | 885 | |
886 | 886 | return runner.runCommand(command, callback=self.progressCallback) |
887 | | - |
| 887 | + |
888 | 888 | def listFiles(self, runner): |
889 | 889 | return [self._file("7z")] |
890 | | - |
| 890 | + |
891 | 891 | def matchCheckpoint(self, checkpoint): |
892 | 892 | return checkpoint == self.__class__.__name__ + "." + self._subset |
893 | 893 | |
894 | 894 | class AbstractDump(Dump): |
895 | 895 | """XML dump for Yahoo!'s Active Abstracts thingy""" |
896 | | - |
| 896 | + |
897 | 897 | def run(self, runner): |
898 | 898 | command = """ |
899 | 899 | %s -q %s/maintenance/dumpBackup.php \ |
— | — | @@ -919,7 +919,7 @@ |
920 | 920 | self._variantOption(variant))) |
921 | 921 | command = command + "\n" |
922 | 922 | runner.runCommand(command, callback=self.progressCallback) |
923 | | - |
| 923 | + |
924 | 924 | def _variants(self, runner): |
925 | 925 | # If the database name looks like it's marked as Chinese language, |
926 | 926 | # return a list including Simplified and Traditional versions, so |
— | — | @@ -928,28 +928,28 @@ |
929 | 929 | return ("", "zh-cn", "zh-tw") |
930 | 930 | else: |
931 | 931 | return ("",) |
932 | | - |
| 932 | + |
933 | 933 | def _variantOption(self, variant): |
934 | 934 | if variant == "": |
935 | 935 | return "" |
936 | 936 | else: |
937 | 937 | return ":variant=%s" % variant |
938 | | - |
| 938 | + |
939 | 939 | def _variantFile(self, variant): |
940 | 940 | if variant == "": |
941 | 941 | return "abstract.xml" |
942 | 942 | else: |
943 | 943 | return "abstract-%s.xml" % variant |
944 | | - |
| 944 | + |
945 | 945 | def listFiles(self, runner): |
946 | 946 | return [self._variantFile(x) for x in self._variants(runner)] |
947 | | - |
| 947 | + |
948 | 948 | class TitleDump(Dump): |
949 | 949 | """This is used by "wikiproxy", a program to add Wikipedia links to BBC news online""" |
950 | 950 | def run(self, runner): |
951 | 951 | return runner.saveSql("select page_title from page where page_namespace=0;", |
952 | 952 | runner.publicPath("all-titles-in-ns0.gz")) |
953 | | - |
| 953 | + |
954 | 954 | def listFiles(self, runner): |
955 | 955 | return ["all-titles-in-ns0.gz"] |
956 | 956 | |
— | — | @@ -958,12 +958,12 @@ |
959 | 959 | if config.halt: |
960 | 960 | print "Dump process halted by config." |
961 | 961 | return None |
962 | | - |
| 962 | + |
963 | 963 | next = config.dbListByAge() |
964 | 964 | next.reverse() |
965 | 965 | |
966 | 966 | print "Finding oldest unlocked wiki..." |
967 | | - |
| 967 | + |
968 | 968 | for db in next: |
969 | 969 | wiki = WikiDump.Wiki(config, db) |
970 | 970 | try: |
— | — | @@ -973,11 +973,11 @@ |
974 | 974 | print "Couldn't lock %s, someone else must have got it..." % db |
975 | 975 | continue |
976 | 976 | return None |
977 | | - |
| 977 | + |
978 | 978 | if __name__ == "__main__": |
979 | 979 | try: |
980 | 980 | config = WikiDump.Config() |
981 | | - |
| 981 | + |
982 | 982 | date = None |
983 | 983 | checkpoint = None |
984 | 984 | forceLock = False |
— | — | @@ -1006,7 +1006,7 @@ |
1007 | 1007 | wiki.lock() |
1008 | 1008 | else: |
1009 | 1009 | wiki = findAndLockNextWiki(config) |
1010 | | - |
| 1010 | + |
1011 | 1011 | if wiki: |
1012 | 1012 | runner = Runner(wiki, date, checkpoint, prefetch, spawn) |
1013 | 1013 | print "Running %s..." % wiki.dbName |