Index: trunk/phase3/maintenance/refreshLinks.inc |
— | — | @@ -136,13 +136,23 @@ |
137 | 137 | $dbw->immediateCommit(); |
138 | 138 | } |
139 | 139 | |
140 | | -function deleteLinksFromNonexistent( $maxLag = 0 ) { |
| 140 | +/** |
| 141 | + * Removes rows whose source page no longer exists from the pagelinks, imagelinks, |
| 142 | + * categorylinks, templatelinks and externallinks tables. |
| 143 | + * |
| 144 | + * @param $maxLag Maximum replication lag, passed to wfWaitForSlaves() |
| 145 | + * @param $batchSize The size of deletion batches (values below 1 fall back to 100) |
| 146 | + * |
| 147 | + * @author Merlijn van Deen <valhallasw@arctus.nl> |
| 148 | + */ |
| 149 | +function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) { |
141 | 150 | $fname = 'deleteLinksFromNonexistent'; |
142 | | - |
143 | 151 | wfWaitForSlaves( $maxLag ); |
144 | | - |
| 152 | + |
145 | 153 | $dbw = wfGetDB( DB_MASTER ); |
146 | | - |
| 154 | + $dbr = wfGetDB( DB_SLAVE ); |
| 155 | + // Results stay buffered: numRows() is called below before any row is fetched. |
| 156 | + $batchSize = intval( $batchSize ) > 0 ? intval( $batchSize ) : 100; // avoid modulo by zero |
147 | 157 | $linksTables = array( |
148 | 158 | 'pagelinks' => 'pl_from', |
149 | 159 | 'imagelinks' => 'il_from', |
— | — | @@ -150,27 +160,65 @@ |
151 | 161 | 'templatelinks' => 'tl_from', |
152 | 162 | 'externallinks' => 'el_from', |
153 | 163 | ); |
154 | | - |
155 | | - $page = $dbw->tableName( 'page' ); |
156 | | - |
157 | | - |
| 164 | + |
| 165 | + |
| 166 | + $readPage = $dbr->tableName( 'page' ); |
158 | 167 | foreach ( $linksTables as $table => $field ) { |
159 | | - if ( !$dbw->ping() ) { |
160 | | - print "DB disconnected, reconnecting..."; |
161 | | - while ( !$dbw->ping() ) { |
162 | | - print "."; |
163 | | - sleep(10); |
164 | | - } |
| 168 | + $readLinks = $dbr->tableName( $table ); |
| 169 | + |
| 170 | + $sql = "SELECT DISTINCT( $field ) FROM $readLinks LEFT JOIN $readPage ON $field=page_id WHERE page_id IS NULL;"; |
| 171 | + print "Retrieving illegal entries from $table: \tRUNNING"; |
| 172 | + |
| 173 | + $results = $dbr->query( $sql, $fname . ':' . $readLinks ); |
| 174 | + print "\x08\x08\x08\x08\x08\x08\x08" . $results->numRows() . " illegal " . $field. "s. "; |
| 175 | + |
| 176 | + if ( $results->numRows() == 0 ) { |
165 | 177 | print "\n"; |
| 178 | + continue; |
166 | 179 | } |
| 180 | + |
| 181 | + $counter = 0; |
| 182 | + $list = array(); |
| 183 | + print "Removing illegal links: 1.."; |
| 184 | + foreach( $results as $row ) { |
| 185 | + $counter++; |
| 186 | + $list[] = $row->$field; |
| 187 | + if ( ( $counter % $batchSize ) == 0 ) { |
| 188 | + print $counter . ".."; |
| 189 | + deleteBatch($dbw, $table, $field, $list); |
| 190 | + $list = array(); |
| 191 | + } |
| 192 | + } |
| 193 | + print $counter . "\n"; |
| 194 | + deleteBatch($dbw, $table, $field, $list); |
| 195 | + } |
| 196 | +} |
167 | 197 | |
168 | | - $pTable = $dbw->tableName( $table ); |
169 | | - $sql = "DELETE $pTable FROM $pTable LEFT JOIN $page ON page_id=$field WHERE page_id IS NULL"; |
| 198 | +/** Deletes a batch of items from a table. |
| 199 | + * Runs the query: DELETE FROM <$table> WHERE <$field> IN (<$list>) |
| 200 | + * |
| 201 | + * @param $dbw Database Database object to run the DELETE query on |
| 202 | + * @param $table table to work on; will be converted via $dbw->tableName. |
| 203 | + * @param $field column to search in |
| 204 | + * @param $list values to remove. Array; values are quoted via $dbw->makeList(). Non-arrays and empty arrays are ignored. |
| 205 | + * |
| 206 | + * @author Merlijn van Deen <valhallasw@arctus.nl> |
| 207 | + */ |
| 208 | +function deleteBatch($dbw, $table, $field, $list) { |
| 209 | + if ( !is_array( $list ) || count( $list ) == 0 ) return; |
| 210 | + |
| 211 | + $masterLinks = $dbw->tableName( $table ); |
| 212 | + $fname = "deleteBatch:$masterLinks"; |
| 213 | + |
| 214 | + if ( !$dbw->ping() ) { |
| 215 | + print "\nDB disconnected, reconnecting..."; |
| 216 | + while ( !$dbw->ping() ) { |
| 217 | + print "."; |
| 218 | + sleep(10); |
| 219 | + } |
| 220 | + print "\n"; |
| 221 | + } |
170 | 222 | |
171 | | - print "Deleting $table from non-existent articles..."; |
172 | | - $dbw->query( $sql, $fname ); |
173 | | - print " fixed " .$dbw->affectedRows() . " row(s)\n"; |
174 | | - } |
| 223 | + $sql = "DELETE FROM $masterLinks WHERE $field IN (" . $dbw->makeList( $list ) . ");"; |
| 224 | + $dbw->query($sql, $fname); |
175 | 225 | } |
176 | | - |
177 | | -?> |
Index: trunk/phase3/maintenance/refreshLinks.php |
— | — | @@ -18,14 +18,16 @@ |
19 | 19 | [--new-only] [--redirects-only] |
20 | 20 | php refreshLinks.php [<start>] [-e <end>] [-m <maxlag>] --old-redirects-only |
21 | 21 | |
22 | | - --help : This help message |
23 | | - --dfn-only : Delete links from nonexistent articles only |
24 | | - --new-only : Only affect articles with just a single edit |
25 | | - --redirects-only : Only fix redirects, not all links |
26 | | - --old-redirects-only : Only fix redirects with no redirect table entry |
27 | | - -m <number> : Maximum replication lag |
28 | | - <start> : First page id to refresh |
29 | | - -e <number> : Last page id to refresh |
| 22 | + --help : This help message |
| 23 | + --dfn-only : Delete links from nonexistent articles only |
| 24 | + --batch-size <number> : The delete batch size when removing links from |
| 25 | + nonexistent articles (default 100) |
| 26 | + --new-only : Only affect articles with just a single edit |
| 27 | + --redirects-only : Only fix redirects, not all links |
| 28 | + --old-redirects-only : Only fix redirects with no redirect table entry |
| 29 | + -m <number> : Maximum replication lag |
| 30 | + <start> : First page id to refresh |
| 31 | + -e <number> : Last page id to refresh |
30 | 32 | |
31 | 33 | TEXT; |
32 | 34 | exit(0); |
— | — | @@ -44,10 +46,8 @@ |
45 | 47 | } |
46 | 48 | // this bit's bad for replication: disabling temporarily |
47 | 49 | // --brion 2005-07-16 |
48 | | -//deleteLinksFromNonexistent(); |
| 50 | +deleteLinksFromNonexistent( isset( $options['m'] ) ? $options['m'] : 0, isset( $options['batch-size'] ) ? $options['batch-size'] : 100 ); // omitted CLI options fall back to the function's own defaults |
49 | 51 | |
50 | 52 | if ( $options['globals'] ) { |
51 | 53 | print_r( $GLOBALS ); |
52 | 54 | } |
53 | | - |
54 | | - |