r2080 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r2079 | r2080 | r2081 >
Date:01:10, 27 November 2003
Author:e23
Status:old
Tags:
Comment:
Speed improvements
Modified paths:
  • /trunk/phase3/maintenance/rebuildlinks.inc (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/rebuildlinks.inc
@@ -11,15 +11,16 @@
1212 # Buffer this many rows before inserting them all in one sweep. More
1313 # than about 1000 will probably not increase speed significantly on
1414 # most setups.
15 -/* private */ $rowbuf_size = 2000; // 2000 rows ~ 80 kB
 15+/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
1616
1717 function rebuildLinkTables()
1818 {
19 - global $wgLang, $wgUseMemoryTables, $rowbuf_size;
 19+ error_reporting (E_ALL);
 20+ global $wgLang, $wgUseMemoryTables, $wgLinkCache, $rowbuf_size;
2021
21 - print "This script may take many hours to complete. If you abort during that time,\n";
22 - print "your wiki will be in an inconsistent state and you may have problems running\n";
23 - print "this script again. If you are going to abort, this is the time to do it.\n\n";
 22+ print "This script may take several hours to complete. If you abort during that time,\n";
 23+ print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
 24+ print "the time to do it.\n\n";
2425 print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
2526 sleep(15);
2627
@@ -29,6 +30,11 @@
3031 print "Setting AUTOCOMMIT=1\n";
3132 wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);
3233
 34+ print "Locking tables\n";
 35+ $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
 36+ "links WRITE, brokenlinks WRITE, imagelinks WRITE";
 37+ wfQuery( $sql, DB_WRITE );
 38+
3339 print "Deleting old data in links table.\n";
3440 $sql = "DELETE FROM links";
3541 wfQuery( $sql, DB_WRITE );
@@ -41,41 +47,21 @@
4248 $sql = "DELETE FROM imagelinks";
4349 wfQuery( $sql, DB_WRITE );
4450
45 - print "\nAdding temporary unique index on links, brokenlinks and imagelinks.\n";
46 - print "->If build aborts now, you probably aborted a previous build. If that is\n";
47 - print " the case, you can clean up the remains with the following SQL commands,\n";
48 - print " and then try again.\n";
49 - print " ALTER TABLE links DROP INDEX tmp_unique;\n";
50 - print " ALTER TABLE brokenlinks DROP INDEX tmp_unique;\n";
51 - print " ALTER TABLE imagelinks DROP INDEX tmp_unique;\n\n";
52 -
53 - $sql = "ALTER TABLE links ADD UNIQUE tmp_unique (l_from, l_to)";
54 - wfQuery( $sql, DB_WRITE );
55 - $sql = "ALTER TABLE brokenlinks ADD UNIQUE tmp_unique (bl_from, bl_to)";
56 - wfQuery( $sql, DB_WRITE );
57 - $sql = "ALTER TABLE imagelinks ADD UNIQUE tmp_unique (il_from, il_to(244))";
58 - wfQuery( $sql, DB_WRITE );
59 - print "Temporary unique index added ok. Forget what I said.\n\n";
60 -
61 - print "Locking tables\n";
62 - $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
63 - "links WRITE, brokenlinks WRITE, imagelinks WRITE";
64 - wfQuery( $sql, DB_WRITE );
65 -
66 - print "Finding number of articles to process\n";
 51+ print "Finding number of articles to process... ";
6752 $sql = "SELECT COUNT(*) as count FROM cur";
6853 $res = wfQuery( $sql, DB_READ );
6954 $obj = wfFetchObject( $res );
7055 $total = $obj->count;
 56+ print "$total\n";
7157
7258 print "Finding highest article id\n";
7359 $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
7460 $res = wfQuery( $sql, DB_READ );
7561 $obj = wfFetchObject( $res );
76 -
 62+
7763 $cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
7864 "FROM cur WHERE cur_id ",
79 - $obj->min, $obj->max, $rowbuf_size);
 65+ $obj->min, $obj->max, 100);
8066
8167 $brokenlinks_inserter = new InsertBuffer(
8268 "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);
@@ -93,93 +79,150 @@
9480
9581 $tc = Title::legalChars();
9682
 83+ $titleCache = new MRUCache( 10000 );
 84+ $titlecount = 0;
9785 $start_time = time();
 86+
9887 while ( $row = $cur_pulser->next() ) {
99 - $from_id = $row->cur_id;
100 - $ns = $wgLang->getNsText( $row->cur_namespace );
10188
102 - $raw_title = $row->cur_title;
 89+ $from_id = intval($row->cur_id);
 90+ $ns = $wgLang->getNsText( $row->cur_namespace );
 91+ $from_full_title = $row->cur_title;
10392 if ( "" != $ns ) {
104 - $raw_title = "$ns:{$raw_title}";
 93+ $from_full_title = "$ns:{$from_full_title}";
10594 }
106 - $title = addslashes( $raw_title );
 95+ $from_full_title_with_slashes = addslashes( $from_full_title );
10796 $text = $row->cur_text;
10897
10998 $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
11099 $m, PREG_PATTERN_ORDER );
111100
112 - for ( $i = 0; $i < $numlinks; ++$i ) {
 101+ $seen_links = array(); // seen links in this article
 102+ $titles_ready_for_insertion = array();
 103+ $titles_needing_curdata = array();
 104+ $titles_needing_curdata_pos = array();
 105+ $links_corresponding_to_titles = array();
 106+
 107+ for ( $i = 0 ; $i < $numlinks; ++$i ) {
 108+ $link = $m[1][$i];
 109+
 110+ // We're only interested in the link once per article
 111+ if( isset( $seen_links[$link] ) )
 112+ continue;
 113+ $seen_links[$link] = 1;
 114+
113115 if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
114116 # an URL link; not for us!
115117 continue;
116118 }
 119+
 120+ # FIXME: Handle subpage links
 121+ $nt = $titleCache->get( $link );
 122+ if( $nt != false ){
 123+ $titles_ready_for_insertion[] = $nt;
 124+ } else {
 125+ $nt = Title::newFromText( $link );
 126+ if (! $nt) {
 127+ print "\nerror in '$ns:{$from_full_title}': '$link'\n";
 128+ continue;
 129+ }
 130+ if( $nt->getInterwiki() != "" ) {
 131+ # Interwiki links are not stored in the link tables
 132+ continue;
 133+ }
 134+ if( $nt->getNamespace() == Namespace::getSpecial() ) {
 135+ # Special links not stored in link tables
 136+ continue;
 137+ }
 138+ if( $nt->getNamespace() == Namespace::getMedia() ) {
 139+ # treat media: links as image: links
 140+ $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
 141+ }
 142+ $nt->mArticleID = 0; // assume broken link until proven otherwise
117143
118 - # FIXME: Handle subpage links
119 - $nt = Title::newFromText( $m[1][$i] );
120 -
121 - if (! $nt)
122 - {
123 - $txt = $m[1][$i];
124 - print "error in '$ns:{$row->cur_title}' :\t'$txt'\n";
125 - continue;
 144+ $pos = array_push($titles_needing_curdata, $nt) - 1;
 145+ $titles_needing_curdata_pos[$nt->getDBkey()] = $pos;
 146+ $links_corresponding_to_titles[] = $link;
 147+ unset( $link ); // useless outside this loop, but tempting
126148 }
127 - if( $nt->getInterwiki() != "" ) {
128 - # Interwiki links are not stored in the link tables
129 - continue;
 149+ }
 150+
 151+
 152+ if ( count( $titles_needing_curdata ) > 0 ){
 153+ $parts = array();
 154+ foreach ($titles_needing_curdata as $nt ) {
 155+ $parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
 156+ "cur_title='" . wfStrencode( $nt->getDBkey() ) . "' AND ".
 157+ "cur_namespace=" . intval( $nt->getNamespace() ) . ")";
130158 }
131 - if( $nt->getNamespace() == Namespace::getSpecial() ) {
132 - # Special links not stored in link tables
133 - continue;
 159+ $sql = "SELECT cur_title, cur_id FROM cur WHERE " . implode(" OR ", $parts);
 160+ $res = wfQuery( $sql, DB_WRITE );
 161+ while($row = wfFetchObject( $res ) ){
 162+ $pos = $titles_needing_curdata_pos[$row->cur_title];
 163+ $titles_needing_curdata[$pos]->mArticleID = intval($row->cur_id);
134164 }
135 - if( $nt->getNamespace() == Namespace::getMedia() ) {
136 - # treat media: links as image: links
137 - $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
 165+ for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
 166+ $tmplink = $links_corresponding_to_titles[$k];
 167+ $titleCache->set( $tmplink, $titles_needing_curdata[$k] );
 168+ $titles_ready_for_insertion[] = $titles_needing_curdata[$k];
138169 }
 170+ }
139171
 172+ foreach ( $titles_ready_for_insertion as $nt ) {
140173 $dest = addslashes( $nt->getPrefixedDBkey() );
141174 $dest_id = $nt->getArticleID();
 175+ $from = $from_full_title_with_slashes;
142176
143 - if ( 0 == strncmp( "$ins:", $raw_title, $inslen ) ) {
144 - $iname = addslashes( substr( $raw_title, $inslen ) );
145 - $imagelinks_inserter->insert( "('{$title}','{$iname}')" );
 177+ # print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";
 178+ if ( 0 == strncmp( "$ins:", $from_full_title, $inslen ) ) {
 179+ $iname = addslashes( substr( $from_full_title, $inslen ) );
 180+ $imagelinks_inserter->insert( "('{$from}','{$iname}')" );
146181 } else if ( 0 == $dest_id ) {
147182 $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
148183 } else {
149 - $links_inserter->insert( "('{$title}',{$dest_id})" );
 184+ $links_inserter->insert( "('{$from}',{$dest_id})" );
150185 }
 186+ $titlecount++;
151187 }
152188
153 - if ( ( $count % 10 ) == 0 )
 189+ if ( ( $count % 20 ) == 0 )
154190 print ".";
155191
156192 if ( ( ++$count % 1000 ) == 0 ) {
157193 $dt = time() - $start_time;
158194 $start_time = time();
159 - $rps = ($dt == 0 ? "lots of" : intval(1000/$dt));
160 - print "\n$count of $total articles scanned ({$rps} articles per second)\n";
 195+ $rps = persec(1000, $dt);
 196+ $tps = persec($titlecount, $dt);
 197+ $titlecount = 0;
 198+ print "\n$count of $total articles scanned ({$rps} articles ".
 199+ "and {$tps} titles per second)\n";
 200+ print "Title cache hits: " . $titleCache->getPerformance() . "%\n";
 201+
161202 }
162203
163204 }
164205
 206+ print "\nFlushing insertion buffers...";
165207 $imagelinks_inserter->flush();
166208 $links_inserter->flush();
167209 $brokenlinks_inserter->flush();
 210+ print "ok\n";
168211
169 - print "$total articles scanned.\n";
 212+ print "$count articles scanned.\n";
170213
171 - print "Removing temporary unique indexes from tables links, brokenlinks and imagelinks.\n";
172 - $sql = "ALTER TABLE links DROP INDEX tmp_unique";
173 - wfQuery( $sql, DB_WRITE );
174 - $sql = "ALTER TABLE brokenlinks DROP INDEX tmp_unique";
175 - wfQuery( $sql, DB_WRITE );
176 - $sql = "ALTER TABLE imagelinks DROP INDEX tmp_unique";
177 - wfQuery( $sql, DB_WRITE );
178 -
179214 $sql = "UNLOCK TABLES";
180215 wfQuery( $sql, DB_WRITE );
181216 print "Done\n";
182217 }
183218
 219+/* private */ function persec($n, $t){
 220+ if($n == 0)
 221+ return "zero";
 222+ if($t == 0)
 223+ return "lots of";
 224+ return intval($n/$t);
 225+}
 226+
184227 # InsertBuffer increases performance slightly by inserting many rows
185228 # at once. The gain is small (<5%) when running against a local, idle
186229 # database, but may be significant in other circumstances. It also
@@ -221,6 +264,7 @@
222265 # Select parts from a large table by using the "BETWEEN X AND Y"
223266 # operator on the id column. Avoids buffering the whole thing in
224267 # RAM. It's also convenient.
 268+
225269 class SelectPulser {
226270 /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;
227271
@@ -248,7 +292,7 @@
249293 while ( $row = wfFetchObject( $res ) ) {
250294 $this->mSet[] = $row;
251295 }
252 -
 296+ wfFreeResult( $res );
253297 if( count( $this->mSet ) > 0 ){
254298 return $this->next();
255299 }
@@ -257,4 +301,62 @@
258302 }
259303 }
260304
 305+# A simple MRU for general cacheing.
 306+
 307+class MRUCache {
 308+ /* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
 309+ /* private */ var $hits, $misses;
 310+
 311+ function MRUCache( $size, $purgefreq = -1 ) {
 312+ // purgefreq is 1/10 of $size if not stated
 313+ $purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq);
 314+ $purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq);
 315+
 316+ $this->mSize = $size;
 317+ $this->mMru = array();
 318+ $this->mCache = array();
 319+ $this->mPurgefreq = $purgefreq;
 320+ $this->nexti = 1;
 321+ print "purgefreq = " . $this->mPurgefreq . "\n";
 322+ }
 323+
 324+ function get( $key ){
 325+ if ( ! array_key_exists( $key, $this->mCache) ){
 326+ $this->misses++;
 327+ return false;
 328+ }
 329+ $this->hits++;
 330+ $this->mMru[$key] = $this->nexti++;
 331+ return $this->mCache[$key];
 332+ }
 333+
 334+ function set( $key, $value ){
 335+ $this->mMru[$key] = $this->nexti++;
 336+ $this->mCache[$key] = $value;
 337+
 338+ if($this->nexti % $this->mPurgefreq == 0)
 339+ $this->purge();
 340+ }
 341+
 342+ function purge(){
 343+ $to_remove = count( $this->mMru ) - $this->mSize;
 344+ if( $to_remove <= 0 ){
 345+ return;
 346+ }
 347+ asort( $this->mMru );
 348+ $removed = array_splice( $this->mMru, 0, $to_remove );
 349+ foreach( array_keys( $removed ) as $key ){
 350+ unset( $this->mCache[$key] );
 351+ }
 352+ }
 353+
 354+ function getPerformance(){
 355+ $tot = $this->hits + $this->misses;
 356+ if($tot > 0)
 357+ return intval(100.0 * $this->hits / $tot);
 358+ else
 359+ return 0;
 360+ }
 361+}
 362+
261363 ?>

Status & tagging log