Index: trunk/phase3/maintenance/storage/fixBug20757.php |
— | — | @@ -0,0 +1,265 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +require_once( dirname( __FILE__ ) . '/../Maintenance.php' ); |
| 5 | + |
/**
 * Maintenance script that repairs text rows affected by bug 20757.
 *
 * It scans the text table for rows flagged 'object' whose serialized
 * HistoryBlobStub can no longer be resolved by Revision::getRevisionText(),
 * looks up the stub's secondary text ID in blob_tracking (rows with
 * bt_moved = 1), verifies that the referenced external blob still contains
 * text with the expected MD5 hash, and then rewrites the broken text row as
 * a direct 'external' pointer to that blob. A new blob_tracking row is
 * inserted for every repaired text row.
 *
 * With --dry-run nothing is written; the script only reports what it found.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched per batch query */
	public $batchSize = 10000;

	/** Cache of page ID => ( rev_text_id => rev_id ) maps, see getRevTextMap() */
	public $mapCache = array();

	/** Total number of rev_text_id entries currently held in $mapCache */
	public $mapCacheSize = 0;

	/** Eviction threshold for $mapCacheSize */
	public $maxMapCacheSize = 1000000;

	/**
	 * Register the script description and the --dry-run option.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
	}

	/**
	 * Walk the text table in batches of $batchSize rows, classify every
	 * 'object' row as good, fixable or unrecoverable, repair the fixable
	 * ones (unless --dry-run was given) and print a summary at the end.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = 0;
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Highest text ID; only used for the progress indicator
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of 'object' rows after the last ID seen
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags' => 'object'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Advance the batch cursor even for rows that get skipped below
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				// Check if it really is broken
				$text = Revision::getRevisionText( $row );
				if ( $text !== false ) {
					// Not broken yet
					++$numGood;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Nothing broken in this batch, move on to the next one
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
					'bt_moved' => 1,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracking row: look at the secondary text row only to
					// produce a more specific diagnostic message
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
					}
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row,
				// so start from the page recorded in its tracking row.
				$pageId = $trackRow->bt_page;
				if ( $pageId === null ) {
					$revId = null;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( $revId === null ) {
						// Actually an orphan
						// NOTE(review): $pageId is not read after this point; the
						// insert below stores $trackRow->bt_page. Confirm whether
						// orphans should be recorded with a different bt_page.
						$pageId = null;
					}
				}

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => 'external', // use legacy encoding
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $trackRow->bt_page,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Not yet broken: $numGood\n";
	}

	/**
	 * Throttle helper called after every committed fix. It counts calls and
	 * invokes wfWaitForSlaves( 5 ) on every 50th call, then restarts the
	 * count.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			// Presumably the argument is the tolerated lag in seconds; confirm
			// against wfWaitForSlaves() before changing it.
			wfWaitForSlaves( 5 );
			$iteration = 0;
		}
	}

	/**
	 * Look up the revision of a page that uses the given text row.
	 *
	 * @param $pageId Page ID whose revisions are searched
	 * @param $textId Text row ID (rev_text_id) to look for
	 * @return The matching rev_id, or null if no revision of the page uses that text row
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the rev_text_id => rev_id map for all revisions of a page.
	 *
	 * Results are cached in $mapCache. Before a new page is loaded, the
	 * oldest cached pages are dropped until the number of cached entries is
	 * no longer above $maxMapCacheSize.
	 *
	 * @param $pageId Page ID (rev_page)
	 * @return Array mapping rev_text_id to rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size. The emptiness check guarantees termination
			// even if the size counter and the cache ever disagree.
			while ( $this->mapCacheSize > $this->maxMapCacheSize && count( $this->mapCache ) ) {
				// Always evict from the front, i.e. the oldest inserted page
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

}
| 263 | + |
// Publish the script's class name in $maintClass, then include the file named
// by the DO_MAINTENANCE constant. Presumably that file instantiates and runs
// the class; confirm against Maintenance.php.
$maintClass = 'FixBug20757';
require_once DO_MAINTENANCE;
| 266 | + |
Property changes on: trunk/phase3/maintenance/storage/fixBug20757.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 267 | + native |