r45514 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r45513 | r45514 | r45515 >
Date: 19:51, 7 January 2009
Author: valhallasw
Status: resolved (Comments)
Tags:
Comment:
Recommit of r45431 with these changes:
* Removed backspace characters in output
* Small code update
* Removed 'disabled' comment from refreshLinks.php
Modified paths:
  • /trunk/phase3/maintenance/refreshLinks.inc (modified) (history)
  • /trunk/phase3/maintenance/refreshLinks.php (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/refreshLinks.inc
@@ -136,41 +136,90 @@
137137 $dbw->immediateCommit();
138138 }
139139
140 -function deleteLinksFromNonexistent( $maxLag = 0 ) {
 140+/*
 141+ * Removes non-existing links from pages from pagelinks, imagelinks,
 142+ * categorylinks, templatelinks and externallinks tables.
 143+ *
 144+ * @param $maxLag
 145+ * @param $batchSize The size of deletion batches
 146+ *
 147+ * @author Merlijn van Deen <valhallasw@arctus.nl>
 148+ */
 149+function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
141150 $fname = 'deleteLinksFromNonexistent';
142 -
143151 wfWaitForSlaves( $maxLag );
144 -
 152+
145153 $dbw = wfGetDB( DB_MASTER );
146 -
147 - $linksTables = array(
 154+ $dbr = wfGetDB( DB_SLAVE );
 155+ $dbr->bufferResults(false);
 156+
 157+ $linksTables = array( // table name => page_id field
148158 'pagelinks' => 'pl_from',
149159 'imagelinks' => 'il_from',
150160 'categorylinks' => 'cl_from',
151161 'templatelinks' => 'tl_from',
152162 'externallinks' => 'el_from',
153163 );
154 -
155 - $page = $dbw->tableName( 'page' );
156 -
157 -
 164+
 165+ $readPage = $dbr->tableName( 'page' );
 166+
158167 foreach ( $linksTables as $table => $field ) {
159 - if ( !$dbw->ping() ) {
160 - print "DB disconnected, reconnecting...";
161 - while ( !$dbw->ping() ) {
162 - print ".";
163 - sleep(10);
 168+ $readLinks = $dbr->tableName( $table );
 169+
 170+ print "Retrieving illegal entries from $table... ";
 171+
 172+ $sql = "SELECT DISTINCT( $field ) FROM $readLinks LEFT JOIN $readPage ON $field=page_id WHERE page_id IS NULL;";
 173+ $results = $dbr->query( $sql, $fname . ':' . $readLinks );
 174+
 175+ print $results->numRows() . " illegal " . $field. "s. ";
 176+
 177+ if ( $results->numRows() > 0 ) {
 178+ $counter = 0;
 179+ $list = array();
 180+ print "Removing illegal links: 1..";
 181+
 182+ foreach( $results as $row ) {
 183+ $counter++;
 184+ $list[] = $row->$field;
 185+ if ( ( $counter % $batchSize ) == 0 ) {
 186+ print $counter . "..";
 187+ deleteBatch($dbw, $table, $field, $list);
 188+ $list = array();
 189+ }
164190 }
165 - print "\n";
 191+ print $counter;
 192+ deleteBatch($dbw, $table, $field, $list);
166193 }
 194+
 195+ print "\n";
 196+ }
 197+}
167198
168 - $pTable = $dbw->tableName( $table );
169 - $sql = "DELETE $pTable FROM $pTable LEFT JOIN $page ON page_id=$field WHERE page_id IS NULL";
 199+/* Deletes a batch of items from a table.
 200+ * Runs the query: DELETE FROM <$table> WHERE <$field> IN (<$list>)
 201+ *
 202+ * @param $dbw Database Database object to run the DELETE query on
 203+ * @param $table table to work on; will be converted via $dbw->tableName.
 204+ * @param $field column to search in
 205+ * @param $list values to remove. Array with SQL-safe (!) values.
 206+ *
 207+ * @author Merlijn van Deen <valhallasw@arctus.nl>
 208+ */
 209+function deleteBatch($dbw, $table, $field, $list) {
 210+ if (count($list) == 0) return;
 211+
 212+ $masterLinks = $dbw->tableName( $table );
 213+ $fname = "deleteBatch:masterLinks";
 214+
 215+ if ( !$dbw->ping() ) {
 216+ print "\nDB disconnected, reconnecting...";
 217+ while ( !$dbw->ping() ) {
 218+ print ".";
 219+ sleep(10);
 220+ }
 221+ print "\n";
 222+ }
170223
171 - print "Deleting $table from non-existent articles...";
172 - $dbw->query( $sql, $fname );
173 - print " fixed " .$dbw->affectedRows() . " row(s)\n";
174 - }
 224+ $sql = "DELETE FROM $masterLinks WHERE $field IN (" . join("," , $list) . ");";
 225+ $dbw->query($sql, $fname);
175226 }
176 -
177 -?>
Index: trunk/phase3/maintenance/refreshLinks.php
@@ -18,14 +18,16 @@
1919 [--new-only] [--redirects-only]
2020 php refreshLinks.php [<start>] [-e <end>] [-m <maxlag>] --old-redirects-only
2121
22 - --help : This help message
23 - --dfn-only : Delete links from nonexistent articles only
24 - --new-only : Only affect articles with just a single edit
25 - --redirects-only : Only fix redirects, not all links
26 - --old-redirects-only : Only fix redirects with no redirect table entry
27 - -m <number> : Maximum replication lag
28 - <start> : First page id to refresh
29 - -e <number> : Last page id to refresh
 22+ --help : This help message
 23+ --dfn-only : Delete links from nonexistent articles only
 24+ --batch-size <number> : The delete batch size when removing links from
 25+ nonexistent articles (default 100)
 26+ --new-only : Only affect articles with just a single edit
 27+ --redirects-only : Only fix redirects, not all links
 28+ --old-redirects-only : Only fix redirects with no redirect table entry
 29+ -m <number> : Maximum replication lag
 30+ <start> : First page id to refresh
 31+ -e <number> : Last page id to refresh
3032
3133 TEXT;
3234 exit(0);
@@ -42,12 +44,9 @@
4345
4446 refreshLinks( $start, $options['new-only'], $options['m'], $options['e'], $options['redirects-only'], $options['old-redirects-only'] );
4547 }
46 -// this bit's bad for replication: disabling temporarily
47 -// --brion 2005-07-16
48 -//deleteLinksFromNonexistent();
4948
 49+deleteLinksFromNonexistent($options['m'], $options['batch-size']);
 50+
5051 if ( $options['globals'] ) {
5152 print_r( $GLOBALS );
5253 }
53 -
54 -

Follow-up revisions

Revision | Commit summary | Author | Date
r45721 | Fixes for r45514 and r45516:... | valhallasw | 23:58, 13 January 2009

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r45431 | Updated deleteLinksFromNonexistent function:... | valhallasw | 02:10, 6 January 2009

Comments

#Comment by Brion VIBBER (talk | contribs)   18:42, 13 January 2009

This doesn't seem to work. I inserted 1000 bogus pagelinks entries into my table and ran refreshLinks.php --dfn-only and it didn't find or delete any of them.

#Comment by Aaron Schulz (talk | contribs)   23:18, 20 January 2009

Fixed in r45721

Status & tagging log