r62124 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r62123‎ | r62124 | r62125 >
Date:12:36, 8 February 2010
Author:tstarling
Status:deferred
Tags:
Comment:
Fix script for bug 20757.
Modified paths:
  • /trunk/phase3/maintenance/storage/fixBug20757.php (added) (history)

Diff [purge]

Index: trunk/phase3/maintenance/storage/fixBug20757.php
@@ -0,0 +1,265 @@
 2+<?php
 3+
 4+require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
 5+
/**
 * Maintenance script that repairs `text` rows broken by bug 20757.
 *
 * It walks the `text` table looking at rows whose old_flags is exactly
 * 'object'. Rows that unserialize to a HistoryBlobStub which
 * Revision::getRevisionText() can no longer resolve are matched, via the
 * stub's mOldId, against blob_tracking rows with bt_moved = 1. When the text
 * can be fetched from the tracked blob and its MD5 matches the stub's hash,
 * the broken row is rewritten as a legacy 'external' pointer of the form
 * "DB://<cluster>/<blob_id>/<hash>" and a fresh blob_tracking row is added
 * for it.
 *
 * As the script description says, this assumes blob_tracking is intact.
 */
class FixBug20757 extends Maintenance {
	/** Number of text rows fetched per batch query. */
	public $batchSize = 10000;

	/** Cache of page_id => array( rev_text_id => rev_id ), see getRevTextMap(). */
	public $mapCache = array();

	/** Total number of rev_text_id entries currently held in $mapCache. */
	public $mapCacheSize = 0;

	/** Eviction threshold for $mapCacheSize. */
	public $maxMapCacheSize = 1000000;

	/**
	 * Registers the script description and the --dry-run option.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
	}

	/**
	 * Scans the text table in batches and repairs (or, with --dry-run, only
	 * reports) broken HistoryBlobStub rows. Progress, one line per
	 * unrecoverable or resolved row, and a final summary are printed to
	 * standard output.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = 0;
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Highest old_id; only used as the denominator of the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of 'object' rows, paging by old_id
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags' => 'object'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Remember how far we got so the next batch starts after this row
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				// Check if it really is broken
				$text = Revision::getRevisionText( $row );
				if ( $text !== false ) {
					// Not broken yet
					++$numGood;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Nothing broken in this batch; move on to the next one
				continue;
			}

			// Run the batch query on blob_tracking
			$trackRes = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
					'bt_moved' => 1,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $trackRes as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracking row: look at the secondary text row only to
					// give a more specific diagnostic
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
					}
					++$numBad;
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row,
				// so start from the tracking row's page (previously this read an
				// undefined property of $this, which made every stub an orphan).
				$pageId = intval( $trackRow->bt_page );
				$revId = $pageId ? $this->findTextIdInPage( $pageId, $primaryId ) : null;
				if ( $revId === null ) {
					// Actually an orphan: no revision of that page uses this text row.
					// NOTE(review): 0 is used instead of NULL on the assumption that
					// blob_tracking.bt_page / bt_rev_id are NOT NULL columns where 0
					// means "orphan" -- confirm against blob_tracking.sql.
					$pageId = 0;
					$revId = 0;
				}

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => 'external', // use legacy encoding
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							// Use the page resolved above so orphans are recorded as such
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Not yet broken: $numGood\n";
	}

	/**
	 * Throttle helper: calls wfWaitForSlaves( 5 ) once the call counter
	 * exceeds 50, then restarts the count.
	 *
	 * The original condition `++$iteration > 50 == 0` parsed as
	 * `( ++$iteration > 50 ) == 0`, which is true for every count up to 50, and
	 * the counter was also incremented twice; the net effect was a replication
	 * wait after every single write.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration > 50 ) {
			wfWaitForSlaves( 5 );
			$iteration = 0;
		}
	}

	/**
	 * Looks up which revision of a page uses a given text row.
	 *
	 * @param $pageId Page ID whose revisions are searched
	 * @param $textId Text row ID (rev_text_id) to look for
	 * @return The matching rev_id, or null if no revision of the page uses it
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Returns the rev_text_id => rev_id map for all revisions of a page,
	 * loading it from the revision table on first use and caching it.
	 *
	 * @param $pageId Page ID
	 * @return Array mapping rev_text_id to rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size by dropping the oldest entries first. reset() makes
			// key() reliably return the first remaining key; the emptiness check
			// keeps the loop from spinning if the size counter ever drifts.
			while ( $this->mapCacheSize > $this->maxMapCacheSize && count( $this->mapCache ) ) {
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

}
 263+
 264+$maintClass = 'FixBug20757';
 265+require_once( DO_MAINTENANCE );
 266+
Property changes on: trunk/phase3/maintenance/storage/fixBug20757.php
___________________________________________________________________
Name: svn:eol-style
   + native

Follow-up revisions

Revision | Commit summary | Author | Date
r63304 | Don't allow trackBlobs.php to continue if there is a potential for corruption... | tstarling | 23:59, 5 March 2010

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r62119 | Initial fix for bug 20757. Committing for test on server. | tstarling | 07:01, 8 February 2010

Status & tagging log