r104258 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r104257‎ | r104258 | r104259 >
Date:23:46, 25 November 2011
Author:aaron
Status:deferred
Tags:
Comment:
* More cleanup and robustness changes to DBFileLockManager
* Added sql patch for DBFileLockManager
* Minor FSFileBackend cleanups
Modified paths:
  • /branches/FileBackend/phase3/includes/filerepo/backend/FSFileBackend.php (modified) (history)
  • /branches/FileBackend/phase3/includes/filerepo/backend/FileLockManager.php (modified) (history)
  • /branches/FileBackend/phase3/maintenance/locking (added) (history)
  • /branches/FileBackend/phase3/maintenance/locking/file_locks.sql (added) (history)

Diff [purge]

Index: branches/FileBackend/phase3/maintenance/locking/file_locks.sql
@@ -0,0 +1,4 @@
 2+-- Create table to handle resource locking
 3+CREATE TABLE /*$wgDBprefix*/file_locks (
 4+ fl_key binary(40) NOT NULL default '' PRIMARY KEY
 5+) ENGINE=InnoDB, DEFAULT CHARSET=binary;
Property changes on: branches/FileBackend/phase3/maintenance/locking/file_locks.sql
___________________________________________________________________
Added: svn:eol-style
16 + native
Property changes on: branches/FileBackend/phase3/maintenance/locking
___________________________________________________________________
Added: bugtraq:number
27 + true
Index: branches/FileBackend/phase3/includes/filerepo/backend/FSFileBackend.php
@@ -303,6 +303,10 @@
304304 $status->fatal( 'directoryreadonlyerror', $param['directory'] );
305305 return $status;
306306 }
 307+ if ( !is_readable( $dir ) ) {
 308+ $status->fatal( 'directorynotreadableerror', $param['directory'] );
 309+ return $status;
 310+ }
307311 return $status;
308312 }
309313
@@ -506,7 +510,7 @@
507511 // If the first thing we find is a directory, then return
508512 // the first file that it contains (via recursion).
509513 // We exclude symlink dirs in order to avoid cycles.
510 - if ( is_dir( "$dir/$file" ) && !is_link( "$dir/$file" ) ) {
 514+ if ( is_dir( "{$dir}/{$file}" ) && !is_link( "{$dir}/{$file}" ) ) {
511515 $subHandle = opendir( "$dir/$file" );
512516 if ( $subHandle ) {
513517 $this->pushDirectory( "{$dir}/{$file}", $subHandle );
@@ -515,8 +519,8 @@
516520 return $nextFile; // found the next one!
517521 }
518522 }
519 - } elseif ( is_file( "$dir/$file" ) ) {
520 - return "$dir/$file"; // found the next one!
 523+ } elseif ( is_file( "{$dir}/{$file}" ) ) {
 524+ return "{$dir}/{$file}"; // found the next one!
521525 }
522526 }
523527 }
Index: branches/FileBackend/phase3/includes/filerepo/backend/FileLockManager.php
@@ -220,28 +220,32 @@
221221
222222 /**
223223 * Version of FileLockManager based on using DB table locks.
 224+ * This is meant for multi-wiki systems that may share share files.
224225 *
225 - * This is meant for multi-wiki systems that may share share files.
226 - * One or several database servers are set up having a `file_locking`
227 - * table with one field, fl_key, the PRIMARY KEY. The table engine should
228 - * have row-level locking. All lock requests for a resource, identified
229 - * by a hash string, will map to one bucket.
 226+ * All lock requests for a resource, identified by a hash string, will
 227+ * map to one bucket. Each bucket maps to one or several peer DB servers,
 228+ * each of which have a `file_locks` table with row-level locking.
 229+ *
 230+ * All peer servers must agree to a lock in order for it to be acquired.
 231+ * As long as one peer server is up, lock requests will not be blocked
 232+ * just because another peer server cannot be contacted. A global status
 233+ * cache can be setup to track servers that recently missed queries; such
 234+ * servers will not be trusted for obtaining locks.
230235 *
231 - * Each bucket maps to one or several pier servers.
232 - * All pier servers must agree to a lock in order for it to be acquired.
233 - * As long as one pier server is up, lock requests will not be blocked
234 - * just because another pier server cannot be contacted.
235 - *
236236 * For performance, deadlock detection should be disabled and a small
237237 * lock-wait timeout should be set via server config. In innoDB, this can
238238 * done via the innodb_deadlock_detect and innodb_lock_wait_timeout settings.
239239 */
240240 class DBFileLockManager extends FileLockManager {
241 - /** @var Array Map of bucket indexes to server names */
242 - protected $serverMap; // (index => (server1, server2, ...))
 241+ /** @var Array Map of bucket indexes to peer sets */
 242+ protected $dbsByBucket; // (bucket index => (ldb1, ldb2, ...))
 243+ /** @var BagOStuff */
 244+ protected $statusCache;
 245+
243246 protected $webTimeout; // integer number of seconds
244247 protected $cliTimeout; // integer number of seconds
245 - protected $resetDelay; // integer number of seconds
 248+ protected $safeDelay; // integer number of seconds
 249+
246250 /** @var Array Map of (locked key => lock type => 1) */
247251 protected $locksHeld = array();
248252 /** $var Array Map Lock-active database connections (server name => Database) */
@@ -250,47 +254,47 @@
251255 /**
252256 * Construct a new instance from configuration.
253257 * $config paramaters include:
254 - * 'serverMap' : Array of no more than 16 consecutive integer keys,
255 - * starting from 0, with a list of servers as values.
256 - * The first server in each list is the main server and
257 - * the others are pier servers.
258 - * 'webTimeout' : Connection timeout (seconds) for non-CLI scripts.
259 - * This tells the DB server how long to wait before giving up
260 - * and releasing all the locks made in a session transaction.
261 - * 'cliTimeout' : Connection timeout (seconds) for CLI scripts.
262 - * This tells the DB server how long to wait before giving up
263 - * and releasing all the locks made in a session transaction.
264 - * 'resetDelay' : How long (seconds) to avoid using a DB server after it restarted.
265 - * This should reflect the highest max_execution_time that a PHP
266 - * script might use on this wiki. Locks are lost on server restart.
 258+ * 'dbsByBucket' : Array of 1-16 consecutive integer keys, starting from 0,
 259+ * with a list of database names (peers) as values.
 260+ * Each DB should be on its own server.
 261+ * 'statusCache' : $wgMemc if set to a global memcached instance. [optional]
 262+ * This tracks peer servers that couldn't be queried recently.
 263+ * 'webTimeout' : Connection timeout (seconds) for non-CLI scripts. [optional]
 264+ * This tells the DB server how long to wait before giving up
 265+ * and releasing all the locks made in a session transaction.
 266+ * 'cliTimeout' : Connection timeout (seconds) for CLI scripts. [optional]
 267+ * 'safeDelay' : Seconds to mistrust a DB after restart/query loss. [optional]
 268+ * This should reflect the highest max_execution_time that PHP
 269+ * scripts might use on a wiki. Locks are lost on server restart.
267270 *
268271 * @param Array $config
269272 */
270273 function __construct( array $config ) {
271 - $this->serverMap = (array)$config['serverMap'];
272 - // Sanitize serverMap against bad config to prevent PHP errors
273 - for ( $i=0; $i < count( $this->serverMap ); $i++ ) {
274 - if (
275 - !isset( $this->serverMap[$i] ) || // not consecutive
276 - !is_array( $this->serverMap[$i] ) || // bad type
277 - !count( $this->serverMap[$i] ) // empty list
 274+ $this->dbsByBucket = $config['dbsByBucket'];
 275+ // Sanitize against bad config to prevent PHP errors
 276+ for ( $i=0; $i < count( $this->dbsByBucket ); $i++ ) {
 277+ if ( !isset( $this->dbsByBucket[$i] ) // not consecutive
 278+ || !is_array( $this->dbsByBucket[$i] ) // bad type
278279 ) {
279 - $this->serverMap[$i] = null; // see getBucketFromKey()
280 - wfWarn( "No key for bucket $i in serverMap or server list is empty." );
 280+ $this->dbsByBucket[$i] = array();
 281+ wfWarn( "No valid key for bucket $i in dbsByBucket." );
281282 }
282283 }
283 - if ( !empty( $config['webTimeout'] ) ) { // disallow 0
 284+ if ( isset( $config['statusCache'] ) ) {
 285+ $this->statusCache = $config['statusCache'];
 286+ }
 287+ if ( isset( $config['webTimeout'] ) ) {
284288 $this->webTimeout = $config['webTimeout'];
285 - } elseif ( ini_get( 'max_execution_time' ) > 0 ) {
286 - $this->webTimeout = ini_get( 'max_execution_time' );
287 - } else { // cli?
288 - $this->webTimeout = 60; // some sane number
 289+ } else {
 290+ $this->webTimeout = ini_get( 'max_execution_time' ) // disallow 0
 291+ ? ini_get( 'max_execution_time' )
 292+ : 60; // some sane number
289293 }
290 - $this->cliTimeout = !empty( $config['cliTimeout'] ) // disallow 0
 294+ $this->cliTimeout = isset( $config['cliTimeout'] )
291295 ? $config['cliTimeout']
292296 : 60; // some sane number
293 - $this->resetDelay = isset( $config['resetDelay'] )
294 - ? $config['resetDelay']
 297+ $this->safeDelay = isset( $config['safeDelay'] )
 298+ ? $config['safeDelay']
295299 : max( $this->cliTimeout, $this->webTimeout );
296300 }
297301
@@ -306,7 +310,7 @@
307311 $status->warning( 'lockmanager-alreadylocked', $key );
308312 } else {
309313 $bucket = $this->getBucketFromKey( $key );
310 - if ( $bucket === null ) { // config error?
 314+ if ( !$bucket ) { // config error?
311315 $status->fatal( 'lockmanager-fail-config', $key );
312316 return $status;
313317 }
@@ -319,9 +323,9 @@
320324 foreach ( $keysToLock as $bucket => $keys ) {
321325 // Acquire the locks for this server. Three main cases can happen:
322326 // (a) First server is up; common case
323 - // (b) First server is down but a pier is up
324 - // (c) First server is down and no pier are up (or none defined)
325 - $count = $this->doLockingSelectAll( $bucket, $keys, $type );
 327+ // (b) First server is down but a peer is up
 328+ // (c) First server is down and no peer are up (or none defined)
 329+ $count = $this->doLockingQueryAll( $bucket, $keys, $type );
326330 if ( $count == -1 ) {
327331 // Resources already locked by another process.
328332 // Abort and unlock everything we just locked.
@@ -331,7 +335,7 @@
332336 } elseif ( $count <= 0 ) {
333337 // Couldn't contact any servers for this bucket.
334338 // Abort and unlock everything we just locked.
335 - $status->fatal( 'lockmanager-fail-db', $bucket );
 339+ $status->fatal( 'lockmanager-fail-db-bucket', $bucket );
336340 $status->merge( $this->doUnlock( $lockedKeys, $type ) );
337341 return $status; // error
338342 }
@@ -369,7 +373,7 @@
370374
371375 // Reference count the locks held and COMMIT when zero
372376 if ( !count( $this->locksHeld ) ) {
373 - $this->commitLockTransactions();
 377+ $this->finishLockTransactions();
374378 }
375379
376380 return $status;
@@ -383,7 +387,7 @@
384388 * @param $type integer FileLockManager::LOCK_EX or FileLockManager::LOCK_SH
385389 * @return void
386390 */
387 - protected function doLockingSelect( $server, array $keys, $type ) {
 391+ protected function doLockingQuery( $server, array $keys, $type ) {
388392 if ( !isset( $this->activeConns[$server] ) ) {
389393 $this->activeConns[$server] = wfGetDB( DB_MASTER, array(), $server );
390394 $this->activeConns[$server]->begin(); // start transaction
@@ -392,16 +396,20 @@
393397 # This won't handle the case of server reboots however.
394398 $options = array();
395399 if ( php_sapi_name() == 'cli' ) { // maintenance scripts
396 - $options['connTimeout'] = $this->cliTimeout;
 400+ if ( $this->cliTimeout > 0 ) {
 401+ $options['connTimeout'] = $this->cliTimeout;
 402+ }
397403 } else { // web requests
398 - $options['connTimeout'] = $this->webTimeout;
 404+ if ( $this->webTimeout > 0 ) {
 405+ $options['connTimeout'] = $this->webTimeout;
 406+ }
399407 }
400408 $this->activeConns[$server]->setSessionOptions( $options );
401409 }
402410 $db = $this->activeConns[$server];
403411 # Try to get the locks...this should be the last query of this function
404412 if ( $type == self::LOCK_SH ) { // reader locks
405 - $db->select( 'file_locking', '1',
 413+ $db->select( 'file_locks', '1',
406414 array( 'fl_key' => $keys ),
407415 __METHOD__,
408416 array( 'LOCK IN SHARE MODE' ) // single-row gap locks
@@ -411,36 +419,41 @@
412420 foreach ( $keys as $key ) {
413421 $data[] = array( 'fl_key' => $key );
414422 }
415 - $db->insert( 'file_locking', $data, __METHOD__ ); // error on duplicate
 423+ $db->insert( 'file_locks', $data, __METHOD__ );
416424 }
417425 }
418426
419427 /**
420428 * Attept to acquire a lock on the primary server as well
421 - * as all pier servers for a bucket. Returns the number of
422 - * servers with locks made or -1 if any of them claimed that
423 - * any of the keys were already locked by another process.
 429+ * as all peer servers for a bucket. Return value is either:
 430+ * a) The number of servers, considered reliable, where the locks were acquired
 431+ * b) -1; if any server claimed that a resource was already locked
424432 * This should avoid throwing any exceptions.
425433 *
426434 * @param $bucket integer
427 - * @param $keys Array
 435+ * @param $keys Array List of resource keys to lock
428436 * @param $type integer FileLockManager::LOCK_EX or FileLockManager::LOCK_SH
429437 * @return integer
430438 */
431 - protected function doLockingSelectAll( $bucket, array $keys, $type ) {
432 - $locksMade = 0;
433 - for ( $i=0; $i < count( $this->serverMap[$bucket] ); $i++ ) {
434 - $server = $this->serverMap[$bucket][$i];
 439+ protected function doLockingQueryAll( $bucket, array $keys, $type ) {
 440+ $locksMade = 0; // locks made on trustable servers
 441+ foreach ( $this->dbsByBucket[$bucket] as $server ) {
435442 try {
436 - $this->doLockingSelect( $server, $keys, $type );
437 - if ( $this->checkServerUptime( $server ) ) {
438 - ++$locksMade; // success for this pier
 443+ $this->doLockingQuery( $server, $keys, $type );
 444+ // Servers that have any signs of lock loss are treated as suspect
 445+ if ( $this->checkReliable( $server ) ) {
 446+ ++$locksMade; // success for this peer
 447+ } elseif ( !$this->statusCache ) {
 448+ // If we are only checking for restarts, this won't catch
 449+ // cases were are only server got a lock and was restarted.
 450+ return 0;
439451 }
440452 } catch ( DBError $e ) {
441453 if ( $this->lastErrorIndicatesLocked( $server ) ) {
442454 return -1; // resource locked
 455+ } else { // can't connect?
 456+ $this->recordFailure( $server );
443457 }
444 - // oh well; logged via wfLogDBError()
445458 }
446459 }
447460 return $locksMade;
@@ -452,7 +465,7 @@
453466 *
454467 * @return void
455468 */
456 - protected function commitLockTransactions() {
 469+ protected function finishLockTransactions() {
457470 foreach ( $this->activeConns as $server => $db ) {
458471 try {
459472 $db->rollback(); // finish transaction and kill any rows
@@ -480,22 +493,69 @@
481494 }
482495
483496 /**
484 - * Check if the DB server has been up long enough to be safe
485 - * to use. This is to get around the problems of locks falling
486 - * off when servers restart.
 497+ * Checks if none of the following happened:
 498+ * a) The DB server recently restarted.
 499+ * This curtails the problem of locks falling off when servers restart.
 500+ * b) The DB server has recently missed lock queries.
 501+ * This curtails the problem of peers occasionally not getting locks.
487502 *
488503 * @param $server string
489504 * @return bool
490505 */
491 - protected function checkServerUptime( $server ) {
 506+ protected function checkReliable( $server ) {
492507 if ( isset( $this->activeConns[$server] ) ) { // sanity
493 - $db = $this->activeConns[$server];
494 - return ( $db->getServerUptime() >= $this->resetDelay );
 508+ if ( $this->safeDelay > 0 ) {
 509+ $db = $this->activeConns[$server];
 510+ if ( $db->getServerUptime() < $this->safeDelay ) {
 511+ return false;
 512+ }
 513+ if ( $this->statusCache ) {
 514+ $key = $this->getMissKey( $server );
 515+ $misses = $this->statusCache->get( $key );
 516+ if ( $misses > 0 ) {
 517+ return false;
 518+ }
 519+ }
 520+ }
 521+ return true;
495522 }
496523 return false;
497524 }
498525
499526 /**
 527+ * Log a lock request failure to the log server.
 528+ *
 529+ * Worst case scenario is that a resource lock was only
 530+ * on one peer and then that peer is restarted or goes down.
 531+ * Clients trying to get locks need to know if a server is down.
 532+ *
 533+ * @param $server string
 534+ * @return bool Success
 535+ */
 536+ protected function recordFailure( $server ) {
 537+ if ( $this->statusCache && $this->safeDelay > 0 ) {
 538+ $key = $this->getMissKey( $server );
 539+ $misses = $this->statusCache->get( $key );
 540+ if ( $misses ) {
 541+ return $this->statusCache->incr( $key );
 542+ } else {
 543+ return $this->statusCache->add( $key, 1, $this->safeDelay );
 544+ }
 545+ }
 546+ return true;
 547+ }
 548+
 549+ /**
 550+ * Get a cache key for recent query misses for a server
 551+ *
 552+ * @param $server string
 553+ * @return string
 554+ */
 555+ protected function getMissKey( $server ) {
 556+ return "lockmanager:querymisses:srv:$server";
 557+ }
 558+
 559+ /**
500560 * Get the bucket for lock key.
501561 * This should avoid throwing any exceptions.
502562 *
@@ -504,12 +564,7 @@
505565 */
506566 protected function getBucketFromKey( $key ) {
507567 $prefix = substr( $key, 0, 2 ); // first 2 hex chars (8 bits)
508 - $bucket = intval( base_convert( $prefix, 16, 10 ) ) % count( $this->serverMap );
509 - // Sanity check that at least one server is handling this bucket
510 - if ( !isset( $this->serverMap[$bucket] ) ) {
511 - return null; // bad config?
512 - }
513 - return $bucket;
 568+ return intval( base_convert( $prefix, 16, 10 ) ) % count( $this->dbsByBucket );
514569 }
515570 }
516571

Status & tagging log