r54841 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r54840‎ | r54841 | r54842 >
Date:05:00, 12 August 2009
Author:tstarling
Status:ok
Tags:
Comment:
In response to a report from Domas that we are seeing HTMLCacheUpdate::invalidate() queries that touch hundreds of thousands of rows and cause significant slave lag:
* Check the number of rows to be updated before actually doing the query, and if it is too large, repartition the job. Due to caching and job queue lag, it is possible that the original partitioning could be pathologically inaccurate.
* Respect $wgRowsPerQuery (regression due to r47317) but increase the default from 10 to 100. It was originally chosen with a low value because I imagined that it would help reduce slave lag, but this is not generally the case since the queries may be in the same transaction.
* Fix lack of initialisation of $jobs in insertJobs() (sloppy but not a bug)
* To avoid queueing up jobs unnecessarily and to reduce the chance of jobs being repartitioned a large number of times as links are incrementally added, make the size threshold for queueing double the job size instead of equal to the job size
* Add a check of title array size to the immediate case, to avoid updating hundreds of thousands of rows when an incorrect size is stored to memcached.
Modified paths:
  • /trunk/phase3/includes/DefaultSettings.php (modified) (history)
  • /trunk/phase3/includes/HTMLCacheUpdate.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/HTMLCacheUpdate.php
@@ -25,38 +25,119 @@
2626 */
2727 class HTMLCacheUpdate
2828 {
29 - public $mTitle, $mTable, $mPrefix;
 29+ public $mTitle, $mTable, $mPrefix, $mStart, $mEnd;
3030 public $mRowsPerJob, $mRowsPerQuery;
3131
32 - function __construct( $titleTo, $table ) {
 32+ function __construct( $titleTo, $table, $start = false, $end = false ) {
3333 global $wgUpdateRowsPerJob, $wgUpdateRowsPerQuery;
3434
3535 $this->mTitle = $titleTo;
3636 $this->mTable = $table;
 37+ $this->mStart = $start;
 38+ $this->mEnd = $end;
3739 $this->mRowsPerJob = $wgUpdateRowsPerJob;
3840 $this->mRowsPerQuery = $wgUpdateRowsPerQuery;
3941 $this->mCache = $this->mTitle->getBacklinkCache();
4042 }
4143
4244 public function doUpdate() {
43 - # Fetch the IDs
 45+ if ( $this->mStart || $this->mEnd ) {
 46+ $this->doPartialUpdate();
 47+ return;
 48+ }
 49+
 50+ # Get an estimate of the number of rows from the BacklinkCache
4451 $numRows = $this->mCache->getNumLinks( $this->mTable );
45 -
46 - if ( $numRows != 0 ) {
47 - if ( $numRows > $this->mRowsPerJob ) {
48 - $this->insertJobs();
 52+ if ( $numRows > $this->mRowsPerJob * 2 ) {
 53+ # Do fast cached partition
 54+ $this->insertJobs();
 55+ } else {
 56+ # Get the links from the DB
 57+ $titleArray = $this->mCache->getLinks( $this->mTable );
 58+ # Check if the row count estimate was correct
 59+ if ( $titleArray->count() > $this->mRowsPerJob * 2 ) {
 60+ # Not correct, do accurate partition
 61+ wfDebug( __METHOD__.": row count estimate was incorrect, repartitioning\n" );
 62+ $this->insertJobsFromTitles( $titleArray );
4963 } else {
50 - $this->invalidate();
 64+ $this->invalidateTitles( $titleArray );
5165 }
5266 }
5367 wfRunHooks( 'HTMLCacheUpdate::doUpdate', array($this->mTitle) );
5468 }
5569
 70+ /**
 71+ * Update some of the backlinks, defined by a page ID range
 72+ */
 73+ protected function doPartialUpdate() {
 74+ $titleArray = $this->mCache->getLinks( $this->mTable, $this->mStart, $this->mEnd );
 75+ if ( $titleArray->count() <= $this->mRowsPerJob * 2 ) {
 76+ # This partition is small enough, do the update
 77+ $this->invalidateTitles( $titleArray );
 78+ } else {
 79+ # Partitioning was excessively inaccurate. Divide the job further.
 80+ # This can occur when a large number of links are added in a short
 81+ # period of time, say by updating a heavily-used template.
 82+ $this->insertJobsFromTitles( $titleArray );
 83+ }
 84+ }
 85+
 86+ /**
 87+ * Partition the current range given by $this->mStart and $this->mEnd,
 88+ * using a pre-calculated title array which gives the links in that range.
 89+ * Queue the resulting jobs.
 90+ */
 91+ protected function insertJobsFromTitles( $titleArray ) {
 92+ # We make subpartitions in the sense that the start of the first job
 93+ # will be the start of the parent partition, and the end of the last
 94+ # job will be the end of the parent partition.
 95+ $jobs = array();
 96+ $start = $this->mStart; # start of the current job
 97+ $numTitles = 0;
 98+ foreach ( $titleArray as $title ) {
 99+ $id = $title->getArticleID();
 100+ # $numTitles is now the number of titles in the current job not
 101+ # including the current ID
 102+ if ( $numTitles >= $this->mRowsPerJob ) {
 103+ # Add a job up to but not including the current ID
 104+ $params = array(
 105+ 'table' => $this->mTable,
 106+ 'start' => $start,
 107+ 'end' => $id - 1
 108+ );
 109+ $jobs[] = new HTMLCacheUpdateJob( $this->mTitle, $params );
 110+ $start = $id;
 111+ $numTitles = 0;
 112+ }
 113+ $numTitles++;
 114+ }
 115+ # Last job
 116+ $params = array(
 117+ 'table' => $this->mTable,
 118+ 'start' => $start,
 119+ 'end' => $this->mEnd
 120+ );
 121+ $jobs[] = new HTMLCacheUpdateJob( $this->mTitle, $params );
 122+ wfDebug( __METHOD__.": repartitioning into " . count( $jobs ) . " jobs\n" );
 123+
 124+ if ( count( $jobs ) < 2 ) {
 125+ # I don't think this is possible at present, but handling this case
 126+ # makes the code a bit more robust against future code updates and
 127+ # avoids a potential infinite loop of repartitioning
 128+ wfDebug( __METHOD__.": repartitioning failed!\n" );
 129+ $this->invalidateTitles( $titleArray );
 130+ return;
 131+ }
 132+
 133+ Job::batchInsert( $jobs );
 134+ }
 135+
56136 protected function insertJobs() {
57137 $batches = $this->mCache->partition( $this->mTable, $this->mRowsPerJob );
58138 if ( !$batches ) {
59139 return;
60140 }
 141+ $jobs = array();
61142 foreach ( $batches as $batch ) {
62143 $params = array(
63144 'table' => $this->mTable,
@@ -68,18 +149,21 @@
69150 Job::batchInsert( $jobs );
70151 }
71152
 153+ /**
 154+ * Invalidate a range of pages, right now
 155+ * @deprecated
 156+ */
 157+ public function invalidate( $startId = false, $endId = false ) {
 158+ $titleArray = $this->mCache->getLinks( $this->mTable, $startId, $endId );
 159+ $this->invalidateTitles( $titleArray );
 160+ }
72161
73162 /**
74 - * Invalidate a set of pages, right now
 163+ * Invalidate an array (or iterator) of Title objects, right now
75164 */
76 - public function invalidate( $startId = false, $endId = false ) {
 165+ protected function invalidateTitles( $titleArray ) {
77166 global $wgUseFileCache, $wgUseSquid;
78167
79 - $titleArray = $this->mCache->getLinks( $this->mTable, $startId, $endId );
80 - if ( $titleArray->count() == 0 ) {
81 - return;
82 - }
83 -
84168 $dbw = wfGetDB( DB_MASTER );
85169 $timestamp = $dbw->timestamp();
86170
@@ -88,12 +172,20 @@
89173 foreach ( $titleArray as $title ) {
90174 $ids[] = $title->getArticleID();
91175 }
 176+
 177+ if ( !$ids ) {
 178+ return;
 179+ }
 180+
92181 # Update page_touched
93 - $dbw->update( 'page',
94 - array( 'page_touched' => $timestamp ),
95 - array( 'page_id IN (' . $dbw->makeList( $ids ) . ')' ),
96 - __METHOD__
97 - );
 182+ $batches = array_chunk( $ids, $this->mRowsPerQuery );
 183+ foreach ( $batches as $batch ) {
 184+ $dbw->update( 'page',
 185+ array( 'page_touched' => $timestamp ),
 186+ array( 'page_id IN (' . $dbw->makeList( $batch ) . ')' ),
 187+ __METHOD__
 188+ );
 189+ }
98190
99191 # Update squid
100192 if ( $wgUseSquid ) {
@@ -108,6 +200,7 @@
109201 }
110202 }
111203 }
 204+
112205 }
113206
114207 /**
@@ -133,8 +226,8 @@
134227 }
135228
136229 public function run() {
137 - $update = new HTMLCacheUpdate( $this->title, $this->table );
138 - $update->invalidate( $this->start, $this->end );
 230+ $update = new HTMLCacheUpdate( $this->title, $this->table, $this->start, $this->end );
 231+ $update->doUpdate();
139232 return true;
140233 }
141234 }
Index: trunk/phase3/includes/DefaultSettings.php
@@ -3635,7 +3635,7 @@
36363636 /**
36373637 * Number of rows to update per query
36383638 */
3639 -$wgUpdateRowsPerQuery = 10;
 3639+$wgUpdateRowsPerQuery = 100;
36403640
36413641 /**
36423642 * Enable AJAX framework

Follow-up revisions

RevisionCommit summaryAuthorDate
r59718Fix bug in BacklinkCache: the lack of an ORDER BY clause in getLinks(), combi...tstarling01:55, 4 December 2009

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r47317* Mostly reverted r41081 and related. Although the motivation (to save a quer...tstarling14:26, 16 February 2009

Status & tagging log