r62808 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r62807 | r62808 | r62809 >
Date:02:48, 22 February 2010
Author:tstarling
Status:ok
Tags:
Comment:
Copying fixBug20757.php from trunk to wmf-deployment
Modified paths:
  • /branches/wmf-deployment/maintenance/storage/fixBug20757.php (added) (history)

Diff [purge]

Index: branches/wmf-deployment/maintenance/storage/fixBug20757.php
@@ -0,0 +1,314 @@
 2+<?php
 3+
 4+require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
 5+
 6+class FixBug20757 extends Maintenance {
 7+ var $batchSize = 10000;
 8+ var $mapCache = array();
 9+ var $mapCacheSize = 0;
 10+ var $maxMapCacheSize = 1000000;
 11+
 12+ function __construct() {
 13+ parent::__construct();
 14+ $this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
 15+ $this->addOption( 'dry-run', 'Report only' );
 16+ $this->addOption( 'start', 'old_id to start at', false, true );
 17+ }
 18+
 19+ function execute() {
 20+ $dbr = wfGetDB( DB_SLAVE );
 21+ $dbw = wfGetDB( DB_MASTER );
 22+
 23+ $dryRun = $this->getOption( 'dry-run' );
 24+ if ( $dryRun ) {
 25+ print "Dry run only.\n";
 26+ }
 27+
 28+ $startId = $this->getOption( 'start', 0 );
 29+ $numGood = 0;
 30+ $numFixed = 0;
 31+ $numBad = 0;
 32+
 33+ $totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
 34+
 35+ while ( true ) {
 36+ print "ID: $startId / $totalRevs\r";
 37+
 38+ $res = $dbr->select(
 39+ 'text',
 40+ array( 'old_id', 'old_flags', 'old_text' ),
 41+ array(
 42+ 'old_id > ' . intval( $startId ),
 43+ 'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
 44+ 'LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
 45+ ),
 46+ __METHOD__,
 47+ array(
 48+ 'ORDER BY' => 'old_id',
 49+ 'LIMIT' => $this->batchSize,
 50+ )
 51+ );
 52+
 53+ if ( !$res->numRows() ) {
 54+ break;
 55+ }
 56+
 57+ $secondaryIds = array();
 58+ $stubs = array();
 59+
 60+ foreach ( $res as $row ) {
 61+ $startId = $row->old_id;
 62+
 63+ // Basic sanity checks
 64+ $obj = unserialize( $row->old_text );
 65+ if ( $obj === false ) {
 66+ print "{$row->old_id}: unrecoverable: cannot unserialize\n";
 67+ ++$numBad;
 68+ continue;
 69+ }
 70+
 71+ if ( !is_object( $obj ) ) {
 72+ print "{$row->old_id}: unrecoverable: unserialized to type " .
 73+ gettype( $obj ) . ", possible double-serialization\n";
 74+ ++$numBad;
 75+ continue;
 76+ }
 77+
 78+ if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
 79+ print "{$row->old_id}: unrecoverable: unexpected object class " .
 80+ get_class( $obj ) . "\n";
 81+ ++$numBad;
 82+ continue;
 83+ }
 84+
 85+ // Process flags
 86+ $flags = explode( ',', $row->old_flags );
 87+ if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
 88+ $legacyEncoding = false;
 89+ } else {
 90+ $legacyEncoding = true;
 91+ }
 92+
 93+ // Queue the stub for future batch processing
 94+ $id = intval( $obj->mOldId );
 95+ $secondaryIds[] = $id;
 96+ $stubs[$row->old_id] = array(
 97+ 'legacyEncoding' => $legacyEncoding,
 98+ 'secondaryId' => $id,
 99+ 'hash' => $obj->mHash,
 100+ );
 101+ }
 102+
 103+ $secondaryIds = array_unique( $secondaryIds );
 104+
 105+ if ( !count( $secondaryIds ) ) {
 106+ continue;
 107+ }
 108+
 109+ // Run the batch query on blob_tracking
 110+ $res = $dbr->select(
 111+ 'blob_tracking',
 112+ '*',
 113+ array(
 114+ 'bt_text_id' => $secondaryIds,
 115+ ),
 116+ __METHOD__
 117+ );
 118+ $trackedBlobs = array();
 119+ foreach ( $res as $row ) {
 120+ $trackedBlobs[$row->bt_text_id] = $row;
 121+ }
 122+
 123+ // Process the stubs
 124+ $stubsToFix = array();
 125+ foreach ( $stubs as $primaryId => $stub ) {
 126+ $secondaryId = $stub['secondaryId'];
 127+ if ( !isset( $trackedBlobs[$secondaryId] ) ) {
 128+ // No tracked blob. Work out what went wrong
 129+ $secondaryRow = $dbr->selectRow(
 130+ 'text',
 131+ array( 'old_flags', 'old_text' ),
 132+ array( 'old_id' => $secondaryId ),
 133+ __METHOD__
 134+ );
 135+ if ( !$secondaryRow ) {
 136+ print "$primaryId: unrecoverable: secondary row is missing\n";
 137+ ++$numBad;
 138+ } elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
 139+ // Not broken yet, and not in the tracked clusters so it won't get
 140+ // broken by the current RCT run.
 141+ ++$numGood;
 142+ } elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
 143+ print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
 144+ ++$numBad;
 145+ } else {
 146+ print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
 147+ ++$numBad;
 148+ }
 149+ unset( $stubs[$primaryId] );
 150+ continue;
 151+ }
 152+ $trackRow = $trackedBlobs[$secondaryId];
 153+
 154+ // Check that the specified text really is available in the tracked source row
 155+ $url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
 156+ $text = ExternalStore::fetchFromURL( $url );
 157+ if ( $text === false ) {
 158+ print "$primaryId: unrecoverable: source text missing\n";
 159+ ++$numBad;
 160+ unset( $stubs[$primaryId] );
 161+ continue;
 162+ }
 163+ if ( md5( $text ) !== $stub['hash'] ) {
 164+ print "$primaryId: unrecoverable: content hashes do not match\n";
 165+ ++$numBad;
 166+ unset( $stubs[$primaryId] );
 167+ continue;
 168+ }
 169+
 170+ // Find the page_id and rev_id
 171+ // The page is probably the same as the page of the secondary row
 172+ $pageId = intval( $trackRow->bt_page );
 173+ if ( !$pageId ) {
 174+ $revId = $pageId = 0;
 175+ } else {
 176+ $revId = $this->findTextIdInPage( $pageId, $primaryId );
 177+ if ( !$revId ) {
 178+ // Actually an orphan
 179+ $pageId = $revId = 0;
 180+ }
 181+ }
 182+
 183+ $newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';
 184+
 185+ if ( !$dryRun ) {
 186+ // Reset the text row to point to the original copy
 187+ $dbw->begin();
 188+ $dbw->update(
 189+ 'text',
 190+ // SET
 191+ array(
 192+ 'old_flags' => $newFlags,
 193+ 'old_text' => $url
 194+ ),
 195+ // WHERE
 196+ array( 'old_id' => $primaryId ),
 197+ __METHOD__
 198+ );
 199+
 200+ // Add a blob_tracking row so that the new reference can be recompressed
 201+ // without needing to run trackBlobs.php again
 202+ $dbw->insert( 'blob_tracking',
 203+ array(
 204+ 'bt_page' => $pageId,
 205+ 'bt_rev_id' => $revId,
 206+ 'bt_text_id' => $primaryId,
 207+ 'bt_cluster' => $trackRow->bt_cluster,
 208+ 'bt_blob_id' => $trackRow->bt_blob_id,
 209+ 'bt_cgz_hash' => $stub['hash'],
 210+ 'bt_new_url' => null,
 211+ 'bt_moved' => 0,
 212+ ),
 213+ __METHOD__
 214+ );
 215+ $dbw->commit();
 216+ $this->waitForSlaves();
 217+ }
 218+
 219+ print "$primaryId: resolved to $url\n";
 220+ ++$numFixed;
 221+ }
 222+ }
 223+
 224+ print "\n";
 225+ print "Fixed: $numFixed\n";
 226+ print "Unrecoverable: $numBad\n";
 227+ print "Good stubs: $numGood\n";
 228+ }
 229+
 230+ function waitForSlaves() {
 231+ static $iteration = 0;
 232+ ++$iteration;
 233+ if ( ++$iteration > 50 == 0 ) {
 234+ wfWaitForSlaves( 5 );
 235+ $iteration = 0;
 236+ }
 237+ }
 238+
 239+ function findTextIdInPage( $pageId, $textId ) {
 240+ $ids = $this->getRevTextMap( $pageId );
 241+ if ( !isset( $ids[$textId] ) ) {
 242+ return null;
 243+ } else {
 244+ return $ids[$textId];
 245+ }
 246+ }
 247+
 248+ function getRevTextMap( $pageId ) {
 249+ if ( !isset( $this->mapCache[$pageId] ) ) {
 250+ // Limit cache size
 251+ while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
 252+ $key = key( $this->mapCache );
 253+ $this->mapCacheSize -= count( $this->mapCache[$key] );
 254+ unset( $this->mapCache[$key] );
 255+ }
 256+
 257+ $dbr = wfGetDB( DB_SLAVE );
 258+ $map = array();
 259+ $res = $dbr->select( 'revision',
 260+ array( 'rev_id', 'rev_text_id' ),
 261+ array( 'rev_page' => $pageId ),
 262+ __METHOD__
 263+ );
 264+ foreach ( $res as $row ) {
 265+ $map[$row->rev_text_id] = $row->rev_id;
 266+ }
 267+ $this->mapCache[$pageId] = $map;
 268+ $this->mapCacheSize += count( $map );
 269+ }
 270+ return $this->mapCache[$pageId];
 271+ }
 272+
 273+ /**
 274+ * This is based on part of HistoryBlobStub::getText().
 275+ * Determine if the text can be retrieved from the row in the normal way.
 276+ */
 277+ function isUnbrokenStub( $stub, $secondaryRow ) {
 278+ $flags = explode( ',', $secondaryRow->old_flags );
 279+ $text = $secondaryRow->old_text;
 280+ if( in_array( 'external', $flags ) ) {
 281+ $url = $text;
 282+ @list( /* $proto */ , $path ) = explode( '://', $url, 2 );
 283+ if ( $path == "" ) {
 284+ return false;
 285+ }
 286+ $text = ExternalStore::fetchFromUrl( $url );
 287+ }
 288+ if( !in_array( 'object', $flags ) ) {
 289+ return false;
 290+ }
 291+
 292+ if( in_array( 'gzip', $flags ) ) {
 293+ $obj = unserialize( gzinflate( $text ) );
 294+ } else {
 295+ $obj = unserialize( $text );
 296+ }
 297+
 298+ if( !is_object( $obj ) ) {
 299+ // Correct for old double-serialization bug.
 300+ $obj = unserialize( $obj );
 301+ }
 302+
 303+ if ( !is_object( $obj ) ) {
 304+ return false;
 305+ }
 306+
 307+ $obj->uncompress();
 308+ $text = $obj->getItem( $stub['hash'] );
 309+ return $text !== false;
 310+ }
 311+}
 312+
 313+$maintClass = 'FixBug20757';
 314+require_once( DO_MAINTENANCE );
 315+
Property changes on: branches/wmf-deployment/maintenance/storage/fixBug20757.php
___________________________________________________________________
Name: svn:mergeinfo
1316 + /branches/REL1_15/phase3/maintenance/storage/fixBug20757.php:51646
/branches/sqlite/maintenance/storage/fixBug20757.php:58211-58321
Name: svn:eol-style
2317 + native

Status & tagging log