Index: trunk/phase3/maintenance/rebuildlinks.inc |
— | — | @@ -11,15 +11,16 @@ |
12 | 12 | # Buffer this many rows before inserting them all in one sweep. More |
13 | 13 | # than about 1000 will probably not increase speed significantly on |
14 | 14 | # most setups. |
15 | | -/* private */ $rowbuf_size = 2000; // 2000 rows ~ 80 kB |
| 15 | +/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB |
16 | 16 | |
17 | 17 | function rebuildLinkTables() |
18 | 18 | { |
19 | | - global $wgLang, $wgUseMemoryTables, $rowbuf_size; |
| 19 | + error_reporting (E_ALL); |
| 20 | + global $wgLang, $wgUseMemoryTables, $wgLinkCache, $rowbuf_size; |
20 | 21 | |
21 | | - print "This script may take many hours to complete. If you abort during that time,\n"; |
22 | | - print "your wiki will be in an inconsistent state and you may have problems running\n"; |
23 | | - print "this script again. If you are going to abort, this is the time to do it.\n\n"; |
| 22 | + print "This script may take several hours to complete. If you abort during that time,\n"; |
| 23 | + print "your wiki will be in an inconsistent state. If you are going to abort, this is\n"; |
| 24 | + print "the time to do it.\n\n"; |
24 | 25 | print "Press control-c to abort (will proceed automatically in 15 seconds)\n"; |
25 | 26 | sleep(15); |
26 | 27 | |
— | — | @@ -29,6 +30,11 @@ |
30 | 31 | print "Setting AUTOCOMMIT=1\n"; |
31 | 32 | wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE); |
32 | 33 | |
| 34 | + print "Locking tables\n"; |
| 35 | + $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " . |
| 36 | + "links WRITE, brokenlinks WRITE, imagelinks WRITE"; |
| 37 | + wfQuery( $sql, DB_WRITE ); |
| 38 | + |
33 | 39 | print "Deleting old data in links table.\n"; |
34 | 40 | $sql = "DELETE FROM links"; |
35 | 41 | wfQuery( $sql, DB_WRITE ); |
— | — | @@ -41,41 +47,21 @@ |
42 | 48 | $sql = "DELETE FROM imagelinks"; |
43 | 49 | wfQuery( $sql, DB_WRITE ); |
44 | 50 | |
45 | | - print "\nAdding temporary unique index on links, brokenlinks and imagelinks.\n"; |
46 | | - print "->If build aborts now, you probably aborted a previous build. If that is\n"; |
47 | | - print " the case, you can clean up the remains with the following SQL commands,\n"; |
48 | | - print " and then try again.\n"; |
49 | | - print " ALTER TABLE links DROP INDEX tmp_unique;\n"; |
50 | | - print " ALTER TABLE brokenlinks DROP INDEX tmp_unique;\n"; |
51 | | - print " ALTER TABLE imagelinks DROP INDEX tmp_unique;\n\n"; |
52 | | - |
53 | | - $sql = "ALTER TABLE links ADD UNIQUE tmp_unique (l_from, l_to)"; |
54 | | - wfQuery( $sql, DB_WRITE ); |
55 | | - $sql = "ALTER TABLE brokenlinks ADD UNIQUE tmp_unique (bl_from, bl_to)"; |
56 | | - wfQuery( $sql, DB_WRITE ); |
57 | | - $sql = "ALTER TABLE imagelinks ADD UNIQUE tmp_unique (il_from, il_to(244))"; |
58 | | - wfQuery( $sql, DB_WRITE ); |
59 | | - print "Temporary unique index added ok. Forget what I said.\n\n"; |
60 | | - |
61 | | - print "Locking tables\n"; |
62 | | - $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " . |
63 | | - "links WRITE, brokenlinks WRITE, imagelinks WRITE"; |
64 | | - wfQuery( $sql, DB_WRITE ); |
65 | | - |
66 | | - print "Finding number of articles to process\n"; |
| 51 | + print "Finding number of articles to process... "; |
67 | 52 | $sql = "SELECT COUNT(*) as count FROM cur"; |
68 | 53 | $res = wfQuery( $sql, DB_READ ); |
69 | 54 | $obj = wfFetchObject( $res ); |
70 | 55 | $total = $obj->count; |
| 56 | + print "$total\n"; |
71 | 57 | |
72 | 58 | print "Finding highest article id\n"; |
73 | 59 | $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur"; |
74 | 60 | $res = wfQuery( $sql, DB_READ ); |
75 | 61 | $obj = wfFetchObject( $res ); |
76 | | - |
| 62 | + |
77 | 63 | $cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " . |
78 | 64 | "FROM cur WHERE cur_id ", |
79 | | - $obj->min, $obj->max, $rowbuf_size); |
| 65 | + $obj->min, $obj->max, 100); |
80 | 66 | |
81 | 67 | $brokenlinks_inserter = new InsertBuffer( |
82 | 68 | "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size); |
— | — | @@ -93,93 +79,150 @@ |
94 | 80 | |
95 | 81 | $tc = Title::legalChars(); |
96 | 82 | |
| 83 | + $titleCache = new MRUCache( 10000 ); |
| 84 | + $titlecount = 0; |
97 | 85 | $start_time = time(); |
| 86 | + |
98 | 87 | while ( $row = $cur_pulser->next() ) { |
99 | | - $from_id = $row->cur_id; |
100 | | - $ns = $wgLang->getNsText( $row->cur_namespace ); |
101 | 88 | |
102 | | - $raw_title = $row->cur_title; |
| 89 | + $from_id = intval($row->cur_id); |
| 90 | + $ns = $wgLang->getNsText( $row->cur_namespace ); |
| 91 | + $from_full_title = $row->cur_title; |
103 | 92 | if ( "" != $ns ) { |
104 | | - $raw_title = "$ns:{$raw_title}"; |
| 93 | + $from_full_title = "$ns:{$from_full_title}"; |
105 | 94 | } |
106 | | - $title = addslashes( $raw_title ); |
| 95 | + $from_full_title_with_slashes = addslashes( $from_full_title ); |
107 | 96 | $text = $row->cur_text; |
108 | 97 | |
109 | 98 | $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text, |
110 | 99 | $m, PREG_PATTERN_ORDER ); |
111 | 100 | |
112 | | - for ( $i = 0; $i < $numlinks; ++$i ) { |
| 101 | + $seen_links = array(); // seen links in this article |
| 102 | + $titles_ready_for_insertion = array(); |
| 103 | + $titles_needing_curdata = array(); |
| 104 | + $titles_needing_curdata_pos = array(); |
| 105 | + $links_corresponding_to_titles = array(); |
| 106 | + |
| 107 | + for ( $i = 0 ; $i < $numlinks; ++$i ) { |
| 108 | + $link = $m[1][$i]; |
| 109 | + |
| 110 | + // We're only interested in the link once per article |
| 111 | + if( isset( $seen_links[$link] ) ) |
| 112 | + continue; |
| 113 | + $seen_links[$link] = 1; |
| 114 | + |
113 | 115 | if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) { |
114 | 116 | # an URL link; not for us! |
115 | 117 | continue; |
116 | 118 | } |
| 119 | + |
| 120 | + # FIXME: Handle subpage links |
| 121 | + $nt = $titleCache->get( $link ); |
| 122 | + if( $nt != false ){ |
| 123 | + $titles_ready_for_insertion[] = $nt; |
| 124 | + } else { |
| 125 | + $nt = Title::newFromText( $link ); |
| 126 | + if (! $nt) { |
| 127 | + print "\nerror in '$ns:{$from_full_title}': '$link'\n"; |
| 128 | + continue; |
| 129 | + } |
| 130 | + if( $nt->getInterwiki() != "" ) { |
| 131 | + # Interwiki links are not stored in the link tables |
| 132 | + continue; |
| 133 | + } |
| 134 | + if( $nt->getNamespace() == Namespace::getSpecial() ) { |
| 135 | + # Special links not stored in link tables |
| 136 | + continue; |
| 137 | + } |
| 138 | + if( $nt->getNamespace() == Namespace::getMedia() ) { |
| 139 | + # treat media: links as image: links |
| 140 | + $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() ); |
| 141 | + } |
| 142 | + $nt->mArticleID = 0; // assume broken link until proven otherwise |
117 | 143 | |
118 | | - # FIXME: Handle subpage links |
119 | | - $nt = Title::newFromText( $m[1][$i] ); |
120 | | - |
121 | | - if (! $nt) |
122 | | - { |
123 | | - $txt = $m[1][$i]; |
124 | | - print "error in '$ns:{$row->cur_title}' :\t'$txt'\n"; |
125 | | - continue; |
| 144 | + $pos = array_push($titles_needing_curdata, $nt) - 1; |
| 145 | + $titles_needing_curdata_pos[$nt->getDBkey()] = $pos; |
| 146 | + $links_corresponding_to_titles[] = $link; |
| 147 | + unset( $link ); // useless outside this loop, but tempting |
126 | 148 | } |
127 | | - if( $nt->getInterwiki() != "" ) { |
128 | | - # Interwiki links are not stored in the link tables |
129 | | - continue; |
| 149 | + } |
| 150 | + |
| 151 | + |
| 152 | + if ( count( $titles_needing_curdata ) > 0 ){ |
| 153 | + $parts = array(); |
| 154 | + foreach ($titles_needing_curdata as $nt ) { |
| 155 | + $parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " . |
| 156 | + "cur_title='" . wfStrencode( $nt->getDBkey() ) . "' AND ". |
| 157 | + "cur_namespace=" . intval( $nt->getNamespace() ) . ")"; |
130 | 158 | } |
131 | | - if( $nt->getNamespace() == Namespace::getSpecial() ) { |
132 | | - # Special links not stored in link tables |
133 | | - continue; |
| 159 | + $sql = "SELECT cur_title, cur_id FROM cur WHERE " . implode(" OR ", $parts); |
| 160 | + $res = wfQuery( $sql, DB_WRITE ); |
| 161 | + while($row = wfFetchObject( $res ) ){ |
| 162 | + $pos = $titles_needing_curdata_pos[$row->cur_title]; |
| 163 | + $titles_needing_curdata[$pos]->mArticleID = intval($row->cur_id); |
134 | 164 | } |
135 | | - if( $nt->getNamespace() == Namespace::getMedia() ) { |
136 | | - # treat media: links as image: links |
137 | | - $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() ); |
| 165 | + for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) { |
| 166 | + $tmplink = $links_corresponding_to_titles[$k]; |
| 167 | + $titleCache->set( $tmplink, $titles_needing_curdata[$k] ); |
| 168 | + $titles_ready_for_insertion[] = $titles_needing_curdata[$k]; |
138 | 169 | } |
| 170 | + } |
139 | 171 | |
| 172 | + foreach ( $titles_ready_for_insertion as $nt ) { |
140 | 173 | $dest = addslashes( $nt->getPrefixedDBkey() ); |
141 | 174 | $dest_id = $nt->getArticleID(); |
| 175 | + $from = $from_full_title_with_slashes; |
142 | 176 | |
143 | | - if ( 0 == strncmp( "$ins:", $raw_title, $inslen ) ) { |
144 | | - $iname = addslashes( substr( $raw_title, $inslen ) ); |
145 | | - $imagelinks_inserter->insert( "('{$title}','{$iname}')" ); |
| 177 | + # print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n"; |
| 178 | + if ( 0 == strncmp( "$ins:", $from_full_title, $inslen ) ) { |
| 179 | + $iname = addslashes( substr( $from_full_title, $inslen ) ); |
| 180 | + $imagelinks_inserter->insert( "('{$from}','{$iname}')" ); |
146 | 181 | } else if ( 0 == $dest_id ) { |
147 | 182 | $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" ); |
148 | 183 | } else { |
149 | | - $links_inserter->insert( "('{$title}',{$dest_id})" ); |
| 184 | + $links_inserter->insert( "('{$from}',{$dest_id})" ); |
150 | 185 | } |
| 186 | + $titlecount++; |
151 | 187 | } |
152 | 188 | |
153 | | - if ( ( $count % 10 ) == 0 ) |
| 189 | + if ( ( $count % 20 ) == 0 ) |
154 | 190 | print "."; |
155 | 191 | |
156 | 192 | if ( ( ++$count % 1000 ) == 0 ) { |
157 | 193 | $dt = time() - $start_time; |
158 | 194 | $start_time = time(); |
159 | | - $rps = ($dt == 0 ? "lots of" : intval(1000/$dt)); |
160 | | - print "\n$count of $total articles scanned ({$rps} articles per second)\n"; |
| 195 | + $rps = persec(1000, $dt); |
| 196 | + $tps = persec($titlecount, $dt); |
| 197 | + $titlecount = 0; |
| 198 | + print "\n$count of $total articles scanned ({$rps} articles ". |
| 199 | + "and {$tps} titles per second)\n"; |
| 200 | + print "Title cache hits: " . $titleCache->getPerformance() . "%\n"; |
| 201 | + |
161 | 202 | } |
162 | 203 | |
163 | 204 | } |
164 | 205 | |
| 206 | + print "\nFlushing insertion buffers..."; |
165 | 207 | $imagelinks_inserter->flush(); |
166 | 208 | $links_inserter->flush(); |
167 | 209 | $brokenlinks_inserter->flush(); |
| 210 | + print "ok\n"; |
168 | 211 | |
169 | | - print "$total articles scanned.\n"; |
| 212 | + print "$count articles scanned.\n"; |
170 | 213 | |
171 | | - print "Removing temporary unique indexes from tables links, brokenlinks and imagelinks.\n"; |
172 | | - $sql = "ALTER TABLE links DROP INDEX tmp_unique"; |
173 | | - wfQuery( $sql, DB_WRITE ); |
174 | | - $sql = "ALTER TABLE brokenlinks DROP INDEX tmp_unique"; |
175 | | - wfQuery( $sql, DB_WRITE ); |
176 | | - $sql = "ALTER TABLE imagelinks DROP INDEX tmp_unique"; |
177 | | - wfQuery( $sql, DB_WRITE ); |
178 | | - |
179 | 214 | $sql = "UNLOCK TABLES"; |
180 | 215 | wfQuery( $sql, DB_WRITE ); |
181 | 216 | print "Done\n"; |
182 | 217 | } |
183 | 218 | |
| 219 | +/* private */ function persec($n, $t){ |
| 220 | + if($n == 0) |
| 221 | + return "zero"; |
| 222 | + if($t == 0) |
| 223 | + return "lots of"; |
| 224 | + return intval($n/$t); |
| 225 | +} |
| 226 | + |
184 | 227 | # InsertBuffer increases performance slightly by inserting many rows |
185 | 228 | # at once. The gain is small (<5%) when running against a local, idle |
186 | 229 | # database, but may be significant in other circumstances. It also |
— | — | @@ -221,6 +264,7 @@ |
222 | 265 | # Select parts from a large table by using the "BETWEEN X AND Y" |
223 | 266 | # operator on the id column. Avoids buffering the whole thing in |
224 | 267 | # RAM. It's also convenient. |
| 268 | + |
225 | 269 | class SelectPulser { |
226 | 270 | /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet; |
227 | 271 | |
— | — | @@ -248,7 +292,7 @@ |
249 | 293 | while ( $row = wfFetchObject( $res ) ) { |
250 | 294 | $this->mSet[] = $row; |
251 | 295 | } |
252 | | - |
| 296 | + wfFreeResult( $res ); |
253 | 297 | if( count( $this->mSet ) > 0 ){ |
254 | 298 | return $this->next(); |
255 | 299 | } |
— | — | @@ -257,4 +301,62 @@ |
258 | 302 | } |
259 | 303 | } |
260 | 304 | |
| 305 | +# A simple MRU for general cacheing. |
| 306 | + |
| 307 | +class MRUCache { |
| 308 | + /* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti; |
| 309 | + /* private */ var $hits, $misses; |
| 310 | + |
| 311 | + function MRUCache( $size, $purgefreq = -1 ) { |
| 312 | + // purgefreq is 1/10 of $size if not stated |
| 313 | + $purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq); |
| 314 | + $purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq); |
| 315 | + |
| 316 | + $this->mSize = $size; |
| 317 | + $this->mMru = array(); |
| 318 | + $this->mCache = array(); |
| 319 | + $this->mPurgefreq = $purgefreq; |
| 320 | + $this->nexti = 1; |
| 321 | + print "purgefreq = " . $this->mPurgefreq . "\n"; |
| 322 | + } |
| 323 | + |
| 324 | + function get( $key ){ |
| 325 | + if ( ! array_key_exists( $key, $this->mCache) ){ |
| 326 | + $this->misses++; |
| 327 | + return false; |
| 328 | + } |
| 329 | + $this->hits++; |
| 330 | + $this->mMru[$key] = $this->nexti++; |
| 331 | + return $this->mCache[$key]; |
| 332 | + } |
| 333 | + |
| 334 | + function set( $key, $value ){ |
| 335 | + $this->mMru[$key] = $this->nexti++; |
| 336 | + $this->mCache[$key] = $value; |
| 337 | + |
| 338 | + if($this->nexti % $this->mPurgefreq == 0) |
| 339 | + $this->purge(); |
| 340 | + } |
| 341 | + |
| 342 | + function purge(){ |
| 343 | + $to_remove = count( $this->mMru ) - $this->mSize; |
| 344 | + if( $to_remove <= 0 ){ |
| 345 | + return; |
| 346 | + } |
| 347 | + asort( $this->mMru ); |
| 348 | + $removed = array_splice( $this->mMru, 0, $to_remove ); |
| 349 | + foreach( array_keys( $removed ) as $key ){ |
| 350 | + unset( $this->mCache[$key] ); |
| 351 | + } |
| 352 | + } |
| 353 | + |
| 354 | + function getPerformance(){ |
| 355 | + $tot = $this->hits + $this->misses; |
| 356 | + if($tot > 0) |
| 357 | + return intval(100.0 * $this->hits / $tot); |
| 358 | + else |
| 359 | + return 0; |
| 360 | + } |
| 361 | +} |
| 362 | + |
261 | 363 | ?> |