Index: branches/wmf-deployment/maintenance/storage/fixBug20757.php |
— | — | @@ -0,0 +1,314 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +require_once( dirname( __FILE__ ) . '/../Maintenance.php' ); |
| 5 | + |
/**
 * Maintenance script for bug 20757.
 *
 * Scans the text table for non-external rows flagged "object" whose content is
 * a serialized HistoryBlobStub. For each such stub, the blob_tracking row of the
 * text row the stub points at (its "secondary" row) is used to locate the
 * original external blob. If the blob still contains the item identified by the
 * stub's hash, the stub row is rewritten as a direct external-store pointer and
 * a matching blob_tracking row is added.
 *
 * Options:
 *   --dry-run  Report what would be done without writing to the database.
 *   --start    old_id to resume from; only rows with a strictly greater old_id
 *              are examined.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched by each batch query */
	public $batchSize = 10000;

	/** Cache of (rev_text_id => rev_id) maps, keyed by page ID */
	public $mapCache = array();

	/** Total number of entries held across all maps in $mapCache */
	public $mapCacheSize = 0;

	/** Old maps are evicted once $mapCacheSize exceeds this value */
	public $maxMapCacheSize = 1000000;

	/**
	 * Set the script description and register the command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		// NOTE(review): the trailing (false, true) presumably mean "not required,
		// takes an argument" -- confirm against Maintenance::addOption().
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Walk the text table in batches, classify every HistoryBlobStub row as
	 * good, fixable or unrecoverable, and (unless --dry-run is given) repair
	 * the fixable ones. A summary of the three counters is printed at the end.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Highest old_id in the table; only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of candidate stub rows after $startId
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					'LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Remember how far we got so the next batch continues after this row
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags: rows without a UTF-8 flag are treated as legacy-encoded
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
					$legacyEncoding = false;
				} else {
					$legacyEncoding = true;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row in this batch failed the sanity checks
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				// Printed and counted in dry-run mode too, where nothing was written
				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Throttle writes: on every 50th call, call wfWaitForSlaves( 5 ) and
	 * restart the count.
	 *
	 * The previous condition, "++$iteration > 50 == 0", parsed as
	 * "( ++$iteration > 50 ) == 0", which was true on every call, and the
	 * counter was incremented twice. The net effect was a wait after every
	 * single commit.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves( 5 );
			$iteration = 0;
		}
	}

	/**
	 * Look up the revision of a page that uses the given text row.
	 *
	 * @param $pageId Integer: page ID to search in
	 * @param $textId Integer: text row ID (rev_text_id) to look for
	 * @return Mixed: the rev_id, or null if no revision of that page uses the text row
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the (rev_text_id => rev_id) map for a page, loading it from the
	 * revision table on a cache miss.
	 *
	 * @param $pageId Integer: page ID
	 * @return Array: map of rev_text_id to rev_id for all revisions of the page
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size by dropping the oldest maps first. The non-empty
			// guard ensures the loop terminates even if the size counter and
			// the cache contents ever disagree.
			while ( $this->mapCacheSize > $this->maxMapCacheSize && count( $this->mapCache ) ) {
				// Make sure key() refers to the first (oldest) entry
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * This is based on part of HistoryBlobStub::getText().
	 * Determine if the text can be retrieved from the row in the normal way.
	 *
	 * @param $stub Array: queued stub info; only the 'hash' element is used
	 * @param $secondaryRow Object: text row with old_flags and old_text fields
	 * @return Boolean: true if the secondary row yields an object from which
	 *   the item with the stub's hash can be fetched
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;
		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// A usable URL needs a non-empty path after the protocol separator
			$parts = explode( '://', $url, 2 );
			if ( count( $parts ) < 2 || $parts[1] === '' ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// The external blob could not be loaded
				return false;
			}
		}
		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$obj = unserialize( gzinflate( $text ) );
		} else {
			$obj = unserialize( $text );
		}

		if ( !is_object( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		// The row may hold an object that is not a blob container (for example
		// another stub). Treat that as broken instead of dying with a fatal
		// error in the middle of the run.
		if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}
| 312 | + |
// Name the maintenance class, then include the bootstrap file named by
// DO_MAINTENANCE (presumably it instantiates and runs $maintClass -- confirm
// against Maintenance.php).
$maintClass = 'FixBug20757';
require_once DO_MAINTENANCE;
| 315 | + |
Property changes on: branches/wmf-deployment/maintenance/storage/fixBug20757.php |
___________________________________________________________________ |
Name: svn:mergeinfo |
1 | 316 | + /branches/REL1_15/phase3/maintenance/storage/fixBug20757.php:51646 |
/branches/sqlite/maintenance/storage/fixBug20757.php:58211-58321 |
Name: svn:eol-style |
2 | 317 | + native |