Index: trunk/phase3/maintenance/rebuildlinks.inc |
— | — | @@ -97,7 +97,7 @@ |
98 | 98 | $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text, |
99 | 99 | $m, PREG_PATTERN_ORDER ); |
100 | 100 | |
101 | | - $seen_links = array(); // seen links in this article |
| 101 | + $seen_dbtitles = array(); // seen links (normalized and with ns, see below) |
102 | 102 | $titles_ready_for_insertion = array(); |
103 | 103 | $titles_needing_curdata = array(); |
104 | 104 | $titles_needing_curdata_pos = array(); |
— | — | @@ -105,12 +105,6 @@ |
106 | 106 | |
107 | 107 | for ( $i = 0 ; $i < $numlinks; ++$i ) { |
108 | 108 | $link = $m[1][$i]; |
109 | | - |
110 | | - // We're only interested in the link once per article |
111 | | - if( isset( $seen_links[$link] ) ) |
112 | | - continue; |
113 | | - $seen_links[$link] = 1; |
114 | | - |
115 | 109 | if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) { |
116 | 110 | # an URL link; not for us! |
117 | 111 | continue; |
— | — | @@ -119,13 +113,26 @@ |
120 | 114 | # FIXME: Handle subpage links |
121 | 115 | $nt = $titleCache->get( $link ); |
122 | 116 | if( $nt != false ){ |
123 | | - $titles_ready_for_insertion[] = $nt; |
| 117 | + // Only process each unique link once per page |
| 118 | + $nt_key = $nt->getDBkey() . $nt->getNamespace(); |
| 119 | + if( isset( $seen_dbtitles[$nt_key] ) ) |
| 120 | + continue; |
| 121 | + $seen_dbtitles[$nt_key] = 1; |
| 122 | + |
| 123 | + $titles_ready_for_insertion[] = $nt; |
124 | 124 | } else { |
125 | 125 | $nt = Title::newFromText( $link ); |
126 | 126 | if (! $nt) { |
127 | | - print "\nerror in '$ns:{$from_full_title}': '$link'\n"; |
| 127 | + print "\nInvalid link in page '$ns:{$from_full_title}': '$link'\n"; |
128 | 128 | continue; |
129 | 129 | } |
| 130 | + |
| 131 | + // Only process each unique link once per page |
| 132 | + $nt_key = $nt->getDBkey() . $nt->getNamespace(); |
| 133 | + if( isset( $seen_dbtitles[$nt_key] ) ) |
| 134 | + continue; |
| 135 | + $seen_dbtitles[$nt_key] = 1; |
| 136 | + |
130 | 137 | if( $nt->getInterwiki() != "" ) { |
131 | 138 | # Interwiki links are not stored in the link tables |
132 | 139 | continue; |
— | — | @@ -152,8 +159,7 @@ |
153 | 160 | $parts = array(); |
154 | 161 | foreach ($titles_needing_curdata as $nt ) { |
155 | 162 | $parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " . |
156 | | - "cur_title='" . wfStrencode( $nt->getDBkey() ) . "' AND ". |
157 | | - "cur_namespace=" . intval( $nt->getNamespace() ) . ")"; |
| 163 | + "cur_title='" . wfStrencode( $nt->getDBkey() ) . "')"; |
158 | 164 | } |
159 | 165 | $sql = "SELECT cur_title, cur_id FROM cur WHERE " . implode(" OR ", $parts); |
160 | 166 | $res = wfQuery( $sql, DB_WRITE ); |
— | — | @@ -169,13 +175,15 @@ |
170 | 176 | } |
171 | 177 | |
172 | 178 | foreach ( $titles_ready_for_insertion as $nt ) { |
173 | | - $dest = addslashes( $nt->getPrefixedDBkey() ); |
| 179 | + $dest_noslashes = $nt->getPrefixedDBkey(); |
| 180 | + $dest = addslashes( $dest_noslashes ); |
174 | 181 | $dest_id = $nt->getArticleID(); |
175 | 182 | $from = $from_full_title_with_slashes; |
176 | 183 | |
177 | 184 | # print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n"; |
178 | | - if ( 0 == strncmp( "$ins:", $from_full_title, $inslen ) ) { |
179 | | - $iname = addslashes( substr( $from_full_title, $inslen ) ); |
| 185 | + |
| 186 | + if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) { |
| 187 | + $iname = addslashes( substr( $dest_noslashes, $inslen ) ); |
180 | 188 | $imagelinks_inserter->insert( "('{$from}','{$iname}')" ); |
181 | 189 | } else if ( 0 == $dest_id ) { |
182 | 190 | $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" ); |