r109224 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r109223‎ | r109224 | r109225 >
Date:21:57, 17 January 2012
Author:reedy
Status:ok
Tags:miscextensions 
Comment:
refreshLinks in maintenance/ is in better shape
Modified paths:
  • /trunk/extensions/WikimediaMaintenance/rL.php (deleted) (history)

Diff [purge]

Index: trunk/extensions/WikimediaMaintenance/rL.php
@@ -1,286 +0,0 @@
2 -<?php
3 -/**
4 - * This program is free software; you can redistribute it and/or modify
5 - * it under the terms of the GNU General Public License as published by
6 - * the Free Software Foundation; either version 2 of the License, or
7 - * (at your option) any later version.
8 - *
9 - * This program is distributed in the hope that it will be useful,
10 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 - * GNU General Public License for more details.
13 - *
14 - * You should have received a copy of the GNU General Public License along
15 - * with this program; if not, write to the Free Software Foundation, Inc.,
16 - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 - * http://www.gnu.org/copyleft/gpl.html
18 - *
19 - * @ingroup Maintenance
20 - */
21 -
22 -require_once( dirname( __FILE__ ) . '/WikimediaMaintenance.php' );
23 -
/**
 * Maintenance script that refreshes the link tables and the redirect table,
 * and removes link rows whose source page no longer exists.
 *
 * @ingroup Maintenance
 */
class RefreshLinks extends WikimediaMaintenance {
	/**
	 * Declare the command-line options and arguments accepted by the script.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Refresh link tables";
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'm', 'Maximum replication lag', false, true );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Script entry point: refresh links unless --dfn-only was given, then
	 * always delete link rows that point from nonexistent pages.
	 */
	public function execute() {
		global $wgCacheEpoch;

		// NOTE(review): pins the cache epoch to a fixed timestamp for this run;
		// the reason is not recorded in this file.
		$wgCacheEpoch = '20100423090000';

		// Read before the branch: the cleanup below needs the value even when
		// --dfn-only skips the refresh (it used to be undefined in that case).
		$max = $this->getOption( 'm', false );

		if ( !$this->hasOption( 'dfn-only' ) ) {
			$start = $this->getArg( 0, 1 );
			$new = $this->getOption( 'new-only', false );
			$end = $this->getOption( 'e', 0 );
			$redir = $this->getOption( 'redirects-only', false );
			$oldRedir = $this->getOption( 'old-redirects-only', false );
			$this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
		}

		$this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
	}

	/**
	 * Do the actual link refreshing.
	 *
	 * Exactly one of three modes runs: old redirects only, new pages only,
	 * or a full sweep over the page_id range.
	 *
	 * @param $start int Page_id to start from
	 * @param $newOnly bool Only do pages with 1 edit
	 * @param $maxLag int Max DB replication lag
	 * @param $end int Page_id to stop at (0 means no upper bound was given)
	 * @param $redirectsOnly bool Only fix redirects
	 * @param $oldRedirectsOnly bool Only fix redirects without redirect entries
	 */
	private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
		$end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
		global $wgUser, $wgUseTidy;

		$reportingInterval = 100;
		$dbr = wfGetDB( DB_SLAVE );

		// Both bounds end up inside SQL conditions, so force them to integers.
		$start = intval( $start );
		$end = intval( $end );

		# Don't generate TeX PNGs (lack of a sensible current directory causes errors anyway)
		$wgUser->setOption( 'math', MW_MATH_SOURCE );

		# Don't use HTML tidy
		$wgUseTidy = false;

		$what = $redirectsOnly ? "redirects" : "links";

		if ( $oldRedirectsOnly ) {
			// Pages flagged as redirects that have no row in the redirect table.
			$conds = array(
				'page_is_redirect=1',
				'rd_from IS NULL',
				$end == 0
					? "page_id >= $start"
					: "page_id BETWEEN $start AND $end",
			);
			$res = $dbr->select(
				array( 'page', 'redirect' ),
				'page_id',
				$conds,
				__METHOD__,
				array(),
				array( 'redirect' => array( 'LEFT JOIN', 'page_id=rd_from' ) )
			);
			$num = $dbr->numRows( $res );
			$this->output( "Refreshing $num old redirects from $start...\n" );

			// The counter must start at zero; it was previously left unset here.
			$i = 0;
			foreach ( $res as $row ) {
				$this->reportProgress( ++$i, $reportingInterval, $maxLag );
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			$res = $dbr->select( 'page',
				array( 'page_id' ),
				array(
					'page_is_new' => 1,
					"page_id >= $start" ),
				__METHOD__
			);
			$num = $dbr->numRows( $res );
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				$this->reportProgress( ++$i, $reportingInterval, $maxLag );
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					$this->fixLinksFromArticle( $row->page_id );
				}
			}
		} else {
			if ( !$end ) {
				// No explicit end: sweep up to the highest id known to either table.
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', false, __METHOD__ );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false, __METHOD__ );
				$end = max( $maxPage, $maxRD );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {
				$this->reportProgress( $id, $reportingInterval, $maxLag );
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links table.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {
					$this->reportProgress( $id, $reportingInterval, $maxLag );
					$this->fixLinksFromArticle( $id );
				}
			}
		}
	}

	/**
	 * Print the counter and wait for the slaves every $interval steps.
	 *
	 * @param $count int Current counter or page_id
	 * @param $interval int Report whenever $count is a multiple of this
	 * @param $maxLag int Max DB replication lag, passed to wfWaitForSlaves()
	 */
	private function reportProgress( $count, $interval, $maxLag ) {
		if ( $count % $interval === 0 ) {
			$this->output( "$count\n" );
			wfWaitForSlaves( $maxLag );
		}
	}

	/**
	 * Update the redirect entry for a given page
	 *
	 * Deletes the redirect row when the page does not exist or is not a
	 * redirect; otherwise rewrites it from the page's redirect target.
	 *
	 * @param $id int The page_id of the redirect
	 */
	private function fixRedirect( $id ) {
		global $wgTitle;

		$wgTitle = Title::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		if ( is_null( $wgTitle ) ) {
			// This page doesn't exist (any more)
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ), __METHOD__ );
			return;
		}

		$article = new Article( $wgTitle );
		$rt = $article->followRedirect();

		if ( !is_object( $rt ) ) {
			// $wgTitle is not a redirect
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ), __METHOD__ );
		} else {
			$article->updateRedirectOn( $dbw, $rt );
		}
	}

	/**
	 * Run LinksUpdate for all links on a given page_id
	 *
	 * Does nothing beyond clearing the link cache when the page or its
	 * revision cannot be loaded.
	 *
	 * @param $id int The page_id
	 */
	private function fixLinksFromArticle( $id ) {
		global $wgTitle, $wgParser;

		$wgTitle = Title::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		$linkCache = LinkCache::singleton();
		$linkCache->clear();

		if ( is_null( $wgTitle ) ) {
			return;
		}

		$revision = Revision::newFromTitle( $wgTitle );
		if ( !$revision ) {
			return;
		}

		// Open the transaction only once an update is certain, so the early
		// returns above can never leave one dangling.
		$dbw->begin();

		$options = new ParserOptions;
		$parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() );
		$update = new LinksUpdate( $wgTitle, $parserOutput, false );
		$update->doUpdate();
		$dbw->commit();
	}

	/**
	 * Removes non-existing links from pages from pagelinks, imagelinks,
	 * categorylinks, templatelinks and externallinks tables.
	 *
	 * Rows are read over a separate, unbuffered slave connection and deleted
	 * on the master in batches.
	 *
	 * @param $maxLag
	 * @param $batchSize The size of deletion batches
	 *
	 * @author Merlijn van Deen <valhallasw@arctus.nl>
	 */
	private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
		wfWaitForSlaves( $maxLag );

		$dbw = wfGetDB( DB_MASTER );

		$lb = wfGetLBFactory()->newMainLB();
		$dbr = $lb->getConnection( DB_SLAVE );
		$dbr->bufferResults( false );

		$linksTables = array( // table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
		);

		foreach ( $linksTables as $table => $field ) {
			$this->output( "Retrieving illegal entries from $table... " );

			// SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
			$results = $dbr->select( array( $table, 'page' ),
				$field,
				array( 'page_id' => null ),
				__METHOD__,
				'DISTINCT',
				array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
			);

			$counter = 0;
			$list = array();
			$this->output( "0.." );

			foreach ( $results as $row ) {
				$counter++;
				$list[] = $row->$field;
				if ( ( $counter % $batchSize ) == 0 ) {
					// Flush a full batch.
					wfWaitForSlaves( 5 );
					$dbw->delete( $table, array( $field => $list ), __METHOD__ );

					$this->output( $counter . ".." );
					$list = array();
				}
			}
			$this->output( $counter );

			// Flush whatever is left over from the last, partial batch.
			if ( count( $list ) > 0 ) {
				$dbw->delete( $table, array( $field => $list ), __METHOD__ );
			}
			$this->output( "\n" );
		}
		$lb->closeAll();
	}
}
285 -
// Name the class to run, then load the file DO_MAINTENANCE points at.
// NOTE(review): presumably that file instantiates $maintClass and calls
// execute() — confirm against the maintenance bootstrap.
$maintClass = 'RefreshLinks';
require_once DO_MAINTENANCE;

Status & tagging log