| Index: trunk/phase3/maintenance/archives/patch-externallinks.sql |
| — | — | @@ -0,0 +1,13 @@ |
| | 2 | +-- |
| | 3 | +-- Track links to external URLs (the blob columns declare no DEFAULT, MySQL does not allow one) |
| | 4 | +-- |
| | 5 | +CREATE TABLE /*$wgDBprefix*/externallinks ( |
| | 6 | + el_from int(8) unsigned NOT NULL default '0', |
| | 7 | + el_to blob NOT NULL, |
| | 8 | + el_index blob NOT NULL, |
| | 9 | + |
| | 10 | + KEY (el_from, el_to(40)), |
| | 11 | + KEY (el_to(60), el_from), |
| | 12 | + KEY (el_index(60)) |
| | 13 | +) TYPE=InnoDB; |
| | 14 | + |
| Property changes on: trunk/phase3/maintenance/archives/patch-externallinks.sql |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| 1 | 15 | + native |
| Added: svn:keywords |
| 2 | 16 | + Author Date Id Revision |
| Index: trunk/phase3/maintenance/refreshLinks.inc |
| — | — | @@ -69,7 +69,7 @@ |
| 70 | 70 | } |
| 71 | 71 | |
| 72 | 72 | function fixLinksFromArticle( $id ) { |
| 73 | | - global $wgTitle, $wgArticle, $wgOut, $wgParser, $wgLinkCache; |
| | 73 | + global $wgTitle, $wgArticle, $wgOut, $wgParser; |
| 74 | 74 | |
| 75 | 75 | $wgTitle = Title::newFromID( $id ); |
| 76 | 76 | $dbw =& wfGetDB( DB_MASTER ); |
| — | — | @@ -105,6 +105,8 @@ |
| 106 | 106 | 'pagelinks' => 'pl_from', |
| 107 | 107 | 'imagelinks' => 'il_from', |
| 108 | 108 | 'categorylinks' => 'cl_from', |
| | 109 | + 'templatelinks' => 'tl_from', |
| | 110 | + 'externallinks' => 'el_from', |
| 109 | 111 | ); |
| 110 | 112 | |
| 111 | 113 | $page = $dbw->tableName( 'page' ); |
| Index: trunk/phase3/maintenance/mysql5/tables.sql |
| — | — | @@ -463,6 +463,34 @@ |
| 464 | 464 | ) TYPE=InnoDB, DEFAULT CHARSET=utf8; |
| 465 | 465 | |
| 466 | 466 | -- |
| | 467 | +-- Track links to external URLs |
| | 468 | +-- |
| | 469 | +CREATE TABLE /*$wgDBprefix*/externallinks ( |
| | 470 | + -- page_id of the referring page |
| | 471 | + el_from int(8) unsigned NOT NULL default '0', |
| | 472 | + |
| | 473 | + -- The URL. No DEFAULT is declared: MySQL does not allow one on BLOB columns |
| | 474 | + el_to blob NOT NULL, |
| | 475 | + |
| | 476 | + -- In the case of HTTP URLs, this is the URL with any username or password |
| | 477 | + -- removed, and with the labels in the hostname reversed and converted to |
| | 478 | + -- lower case. An extra dot is added to allow for matching of either |
| | 479 | + -- example.com or *.example.com in a single scan. |
| | 480 | + -- Example: |
| | 481 | + -- http://user:password@sub.example.com/page.html |
| | 482 | + -- becomes |
| | 483 | + -- http://com.example.sub./page.html |
| | 484 | + -- which allows for fast searching for all pages under example.com with the |
| | 485 | + -- clause: |
| | 486 | + -- WHERE el_index LIKE 'http://com.example.%' |
| | 487 | + el_index blob NOT NULL, |
| | 488 | + |
| | 489 | + KEY (el_from, el_to(40)), |
| | 490 | + KEY (el_to(60), el_from), |
| | 491 | + KEY (el_index(60)) |
| | 492 | +) TYPE=InnoDB, DEFAULT CHARSET=utf8; |
| | 493 | + |
| | 494 | +-- |
| 467 | 495 | -- Contains a single row with some aggregate info |
| 468 | 496 | -- on the state of the site. |
| 469 | 497 | -- |
| Index: trunk/phase3/maintenance/updaters.inc |
| — | — | @@ -26,6 +26,7 @@ |
| 27 | 27 | array( 'user_newtalk', 'patch-usernewtalk2.sql' ), |
| 28 | 28 | array( 'transcache', 'patch-transcache.sql' ), |
| 29 | 29 | array( 'trackbacks', 'patch-trackbacks.sql' ), |
| | 30 | + array( 'externallinks', 'patch-externallinks.sql' ), |
| 30 | 31 | ); |
| 31 | 32 | |
| 32 | 33 | $wgNewFields = array( |
| Index: trunk/phase3/maintenance/tables.sql |
| — | — | @@ -450,6 +450,34 @@ |
| 451 | 451 | ) TYPE=InnoDB; |
| 452 | 452 | |
| 453 | 453 | -- |
| | 454 | +-- Track links to external URLs |
| | 455 | +-- |
| | 456 | +CREATE TABLE /*$wgDBprefix*/externallinks ( |
| | 457 | + -- page_id of the referring page |
| | 458 | + el_from int(8) unsigned NOT NULL default '0', |
| | 459 | + |
| | 460 | + -- The URL. No DEFAULT is declared: MySQL does not allow one on BLOB columns |
| | 461 | + el_to blob NOT NULL, |
| | 462 | + |
| | 463 | + -- In the case of HTTP URLs, this is the URL with any username or password |
| | 464 | + -- removed, and with the labels in the hostname reversed and converted to |
| | 465 | + -- lower case. An extra dot is added to allow for matching of either |
| | 466 | + -- example.com or *.example.com in a single scan. |
| | 467 | + -- Example: |
| | 468 | + -- http://user:password@sub.example.com/page.html |
| | 469 | + -- becomes |
| | 470 | + -- http://com.example.sub./page.html |
| | 471 | + -- which allows for fast searching for all pages under example.com with the |
| | 472 | + -- clause: |
| | 473 | + -- WHERE el_index LIKE 'http://com.example.%' |
| | 474 | + el_index blob NOT NULL, |
| | 475 | + |
| | 476 | + KEY (el_from, el_to(40)), |
| | 477 | + KEY (el_to(60), el_from), |
| | 478 | + KEY (el_index(60)) |
| | 479 | +) TYPE=InnoDB; |
| | 480 | + |
| | 481 | +-- |
| 454 | 482 | -- Contains a single row with some aggregate info |
| 455 | 483 | -- on the state of the site. |
| 456 | 484 | -- |
| Index: trunk/phase3/includes/GlobalFunctions.php |
| — | — | @@ -1808,4 +1808,34 @@ |
| 1809 | 1809 | } |
| 1810 | 1810 | } |
| 1811 | 1811 | |
| | 1812 | +/** |
| | 1813 | + * Make a URL index, appropriate for the el_index field of externallinks. |
| | 1814 | + * @param string $url URL to index |
| | 1815 | + * @return mixed The index string, or false unless $url parses as an http URL with a hostname |
| | 1816 | + */ |
| | 1817 | +function wfMakeUrlIndex( $url ) { |
| | 1818 | + wfSuppressWarnings(); |
| | 1819 | + $bits = parse_url( $url ); |
| | 1820 | + wfRestoreWarnings(); |
| | 1821 | + // parse_url() leaves out the keys of absent components, so check before reading them |
| | 1822 | + if ( !$bits || !isset( $bits['scheme'] ) || !isset( $bits['host'] ) || $bits['scheme'] !== 'http' ) { |
| | 1823 | + return false; |
| | 1824 | + } |
| | 1825 | + // Convert to lower case and drop the root dot of a fully qualified name ("example.com.") |
| | 1826 | + $host = rtrim( strtolower( $bits['host'] ), '.' ); |
| | 1827 | + // Reverse the labels in the hostname |
| | 1828 | + $reversedHost = implode( '.', array_reverse( explode( '.', $host ) ) ); |
| | 1829 | + // Add an extra dot to the end |
| | 1830 | + if ( substr( $reversedHost, -1 ) !== '.' ) { |
| | 1831 | + $reversedHost .= '.'; |
| | 1832 | + } |
| | 1833 | + // Reconstruct the pseudo-URL. Leave out user and password. Add the port, path, query and fragment |
| | 1834 | + $index = "http://$reversedHost"; |
| | 1835 | + if ( isset( $bits['port'] ) ) $index .= ':' . $bits['port']; |
| | 1836 | + $index .= isset( $bits['path'] ) ? $bits['path'] : '/'; |
| | 1837 | + if ( isset( $bits['query'] ) ) $index .= '?' . $bits['query']; |
| | 1838 | + if ( isset( $bits['fragment'] ) ) $index .= '#' . $bits['fragment']; |
| | 1839 | + return $index; |
| | 1840 | +} |
| | 1841 | + |
| 1812 | 1842 | ?> |
| Index: trunk/phase3/includes/Parser.php |
| — | — | @@ -1121,19 +1121,23 @@ |
| 1122 | 1122 | |
| 1123 | 1123 | # Replace &amp; from obsolete syntax with &. |
| 1124 | 1124 | # All HTML entities will be escaped by makeExternalLink() |
| 1125 | | - # or maybeMakeExternalImage() |
| 1126 | 1125 | $url = str_replace( '&', '&', $url ); |
| | 1126 | + # Replace unnecessary URL escape codes with the referenced character |
| | 1127 | + # This prevents spammers from hiding links from the filters |
| | 1128 | + $url = Parser::replaceUnusualEscapes( $url ); |
| 1127 | 1129 | |
| 1128 | 1130 | # Process the trail (i.e. everything after this link up until start of the next link), |
| 1129 | 1131 | # replacing any non-bracketed links |
| 1130 | 1132 | $trail = $this->replaceFreeExternalLinks( $trail ); |
| 1131 | 1133 | |
| 1132 | | - |
| 1133 | 1134 | # Use the encoded URL |
| 1134 | 1135 | # This means that users can paste URLs directly into the text |
| 1135 | 1136 | # Funny characters like ö aren't valid in URLs anyway |
| 1136 | 1137 | # This was changed in August 2004 |
| 1137 | 1138 | $s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail; |
| | 1139 | + |
| | 1140 | + # Register link in the output object |
| | 1141 | + $this->mOutput->addExternalLink( $url ); |
| 1138 | 1142 | } |
| 1139 | 1143 | |
| 1140 | 1144 | wfProfileOut( $fname ); |
| — | — | @@ -1189,12 +1193,16 @@ |
| 1190 | 1194 | # All HTML entities will be escaped by makeExternalLink() |
| 1191 | 1195 | # or maybeMakeExternalImage() |
| 1192 | 1196 | $url = str_replace( '&', '&', $url ); |
| | 1197 | + # Replace unnecessary URL escape codes with their equivalent characters |
| | 1198 | + $url = Parser::replaceUnusualEscapes( $url ); |
| 1193 | 1199 | |
| 1194 | 1200 | # Is this an external image? |
| 1195 | 1201 | $text = $this->maybeMakeExternalImage( $url ); |
| 1196 | 1202 | if ( $text === false ) { |
| 1197 | 1203 | # Not an image, make a link |
| 1198 | 1204 | $text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free' ); |
| | 1205 | + # Register it in the output object |
| | 1206 | + $this->mOutput->addExternalLink( $url ); |
| 1199 | 1207 | } |
| 1200 | 1208 | $s .= $text . $trail; |
| 1201 | 1209 | } else { |
| — | — | @@ -1206,6 +1214,36 @@ |
| 1207 | 1215 | } |
| 1208 | 1216 | |
| 1209 | 1217 | /** |
| | 1218 | + * Pass every %XX escape code in the URL through replaceUnusualEscapesCallback(), which decides whether to decode it |
| | 1219 | + * @param string $url |
| | 1220 | + * @return string |
| | 1221 | + * @static |
| | 1222 | + */ |
| | 1223 | + function replaceUnusualEscapes( $url ) { |
| | 1224 | + $callback = array( 'Parser', 'replaceUnusualEscapesCallback' ); |
| | 1225 | + return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $url ); |
| | 1226 | + } |
| | 1227 | + |
| | 1228 | + /** |
| | 1229 | + * Callback function used in replaceUnusualEscapes(). |
| | 1230 | + * Decodes one %XX escape code unless the character must stay escaped; $matches[0] is the escape code |
| | 1231 | + * @static |
| | 1232 | + * @access private |
| | 1233 | + */ |
| | 1234 | + function replaceUnusualEscapesCallback( $matches ) { |
| | 1235 | + $char = urldecode( $matches[0] ); |
| | 1236 | + $ord = ord( $char ); |
| | 1237 | + // Is it an unsafe (including "%") or reserved (including ":", "@", "=", "&") character according to RFC 1738? |
| | 1238 | + if ( $ord > 32 && $ord < 127 && strpos( '<>"#%{}|\^~[]`;/?:@=&', $char ) === false ) { |
| | 1239 | + // No, shouldn't be escaped |
| | 1240 | + return $char; |
| | 1241 | + } else { |
| | 1242 | + // Yes, leave it escaped: decoding it would change what the URL means |
| | 1243 | + return $matches[0]; |
| | 1244 | + } |
| | 1245 | + } |
| | 1246 | + |
| | 1247 | + /** |
| 1210 | 1248 | * make an image if it's allowed, either through the global |
| 1211 | 1249 | * option or through the exception |
| 1212 | 1250 | * @access private |
| — | — | @@ -3742,7 +3780,8 @@ |
| 3743 | 3781 | $mTitleText, # title text of the chosen language variant |
| 3744 | 3782 | $mLinks, # 2-D map of NS/DBK to ID for the links in the document. ID=zero for broken. |
| 3745 | 3783 | $mTemplates, # 2-D map of NS/DBK to ID for the template references. ID=zero for broken. |
| 3746 | | - $mImages; # DB keys of the images used, in the array key only |
| | 3784 | + $mImages, # DB keys of the images used, in the array key only |
| | 3785 | + $mExternalLinks; # External link URLs, in the key only |
| 3747 | 3786 | |
| 3748 | 3787 | function ParserOutput( $text = '', $languageLinks = array(), $categoryLinks = array(), |
| 3749 | 3788 | $containsOldMagic = false, $titletext = '' ) |
| — | — | @@ -3757,6 +3796,7 @@ |
| 3758 | 3797 | $this->mLinks = array(); |
| 3759 | 3798 | $this->mTemplates = array(); |
| 3760 | 3799 | $this->mImages = array(); |
| | 3800 | + $this->mExternalLinks = array(); |
| 3761 | 3801 | } |
| 3762 | 3802 | |
| 3763 | 3803 | function getText() { return $this->mText; } |
| — | — | @@ -3768,6 +3808,7 @@ |
| 3769 | 3809 | function &getLinks() { return $this->mLinks; } |
| 3770 | 3810 | function &getTemplates() { return $this->mTemplates; } |
| 3771 | 3811 | function &getImages() { return $this->mImages; } |
| | 3812 | + function &getExternalLinks() { return $this->mExternalLinks; } |
| 3772 | 3813 | |
| 3773 | 3814 | function containsOldMagic() { return $this->mContainsOldMagic; } |
| 3774 | 3815 | function setText( $text ) { return wfSetVar( $this->mText, $text ); } |
| — | — | @@ -3780,6 +3821,7 @@ |
| 3781 | 3822 | function addCategory( $c, $sort ) { $this->mCategories[$c] = $sort; } |
| 3782 | 3823 | function addImage( $name ) { $this->mImages[$name] = 1; } |
| 3783 | 3824 | function addLanguageLink( $t ) { $this->mLanguageLinks[] = $t; } |
| | 3825 | + function addExternalLink( $url ) { $this->mExternalLinks[$url] = 1; } |
| 3784 | 3826 | |
| 3785 | 3827 | function addLink( $title, $id ) { |
| 3786 | 3828 | $ns = $title->getNamespace(); |
| Index: trunk/phase3/includes/LinksUpdate.php |
| — | — | @@ -19,6 +19,7 @@ |
| 20 | 20 | $mLinks, # Map of title strings to IDs for the links in the document |
| 21 | 21 | $mImages, # DB keys of the images used, in the array key only |
| 22 | 22 | $mTemplates, # Map of title strings to IDs for the template references, including broken ones |
| | 23 | + $mExternals, # URLs of external links, array key only |
| 23 | 24 | $mCategories, # Map of category names to sort keys |
| 24 | 25 | $mDb, # Database connection reference |
| 25 | 26 | $mOptions; # SELECT options to be used (array) |
| — | — | @@ -52,6 +53,7 @@ |
| 53 | 54 | $this->mLinks =& $this->mParserOutput->getLinks(); |
| 54 | 55 | $this->mImages =& $this->mParserOutput->getImages(); |
| 55 | 56 | $this->mTemplates =& $this->mParserOutput->getTemplates(); |
| | 57 | + $this->mExternals =& $this->mParserOutput->getExternalLinks(); |
| 56 | 58 | $this->mCategories =& $this->mParserOutput->getCategories(); |
| 57 | 59 | |
| 58 | 60 | } |
| — | — | @@ -87,6 +89,11 @@ |
| 88 | 90 | $this->incrTableUpdate( 'imagelinks', 'il', $this->getImageDeletions( $existing ), |
| 89 | 91 | $this->getImageInsertions( $existing ) ); |
| 90 | 92 | |
| | 93 | + # External links |
| | 94 | + $existing = $this->getExistingExternals(); |
| | 95 | + $this->incrTableUpdate( 'externallinks', 'el', $this->getExternalDeletions( $existing ), |
| | 96 | + $this->getExternalInsertions( $existing ) ); |
| | 97 | + |
| 91 | 98 | # Category links |
| 92 | 99 | $existing = $this->getExistingCategories(); |
| 93 | 100 | $this->incrTableUpdate( 'categorylinks', 'cl', $this->getCategoryDeletions( $existing ), |
| — | — | @@ -117,6 +124,7 @@ |
| 118 | 125 | $this->dumbTableUpdate( 'imagelinks', $this->getImageInsertions(), 'il_from' ); |
| 119 | 126 | $this->dumbTableUpdate( 'categorylinks', $this->getCategoryInsertions(), 'cl_from' ); |
| 120 | 127 | $this->dumbTableUpdate( 'templatelinks', $this->getTemplateInsertions(), 'tl_from' ); |
| | 128 | + $this->dumbTableUpdate( 'externallinks', $this->getExternalInsertions(), 'el_from' ); |
| 121 | 129 | |
| 122 | 130 | # Update the cache of all the category pages |
| 123 | 131 | $this->invalidateCategories( $categoryUpdates ); |
| — | — | @@ -238,7 +246,7 @@ |
| 239 | 247 | function getImageInsertions( $existing = array() ) { |
| 240 | 248 | $arr = array(); |
| 241 | 249 | $diffs = array_diff_key( $this->mImages, $existing ); |
| 242 | | - foreach( $diffs as $iname => $val ) { |
| | 250 | + foreach( $diffs as $iname => $dummy ) { |
| 243 | 251 | $arr[] = array( |
| 244 | 252 | 'il_from' => $this->mId, |
| 245 | 253 | 'il_to' => $iname |
| — | — | @@ -248,6 +256,23 @@ |
| 249 | 257 | } |
| 250 | 258 | |
| 251 | 259 | /** |
| | 260 | + * Build the externallinks rows to insert: one per URL in $this->mExternals that is not a key of $existing |
| | 261 | + * @param array $existing URLs (in the keys) to leave out |
| | 262 | + * @access private |
| | 263 | + */ |
| | 264 | + function getExternalInsertions( $existing = array() ) { |
| | 265 | + $rows = array(); |
| | 266 | + foreach( array_keys( array_diff_key( $this->mExternals, $existing ) ) as $url ) { |
| | 267 | + $rows[] = array( |
| | 268 | + 'el_from' => $this->mId, |
| | 269 | + 'el_to' => $url, |
| | 270 | + 'el_index' => wfMakeUrlIndex( $url ), |
| | 271 | + ); |
| | 272 | + } |
| | 273 | + return $rows; |
| | 274 | + } |
| | 275 | + |
| | 276 | + /** |
| 252 | 277 | * Get an array of category insertions |
| 253 | 278 | * @param array $existing Array mapping existing category names to sort keys. If both |
| 254 | 279 | * match a link in $this, the link will be omitted from the output |
| — | — | @@ -309,6 +334,15 @@ |
| 310 | 335 | return array_diff_key( $existing, $this->mImages ); |
| 311 | 336 | } |
| 312 | 337 | |
| | 338 | + /** |
| | 339 | + * Pick out of $existing the external links whose keys are absent from $this->mExternals, i.e. the ones to delete |
| | 340 | + * @access private |
| | 341 | + */ |
| | 342 | + function getExternalDeletions( $existing ) { |
| | 343 | + $obsolete = array_diff_key( $existing, $this->mExternals ); |
| | 344 | + return $obsolete; |
| | 345 | + } |
| | 346 | + |
| 313 | 347 | /** |
| 314 | 348 | * Given an array of existing categories, returns those categories which are not in $this |
| 315 | 349 | * and thus should be deleted. |
| — | — | @@ -333,6 +367,7 @@ |
| 334 | 368 | } |
| 335 | 369 | $arr[$row->pl_namespace][$row->pl_title] = 1; |
| 336 | 370 | } |
| | 371 | + $this->mDb->freeResult( $res ); |
| 337 | 372 | return $arr; |
| 338 | 373 | } |
| 339 | 374 | |
| — | — | @@ -351,6 +386,7 @@ |
| 352 | 387 | } |
| 353 | 388 | $arr[$row->tl_namespace][$row->tl_title] = 1; |
| 354 | 389 | } |
| | 390 | + $this->mDb->freeResult( $res ); |
| 355 | 391 | return $arr; |
| 356 | 392 | } |
| 357 | 393 | |
| — | — | @@ -366,10 +402,27 @@ |
| 367 | 403 | while ( $row = $this->mDb->fetchObject( $res ) ) { |
| 368 | 404 | $arr[$row->il_to] = 1; |
| 369 | 405 | } |
| | 406 | + $this->mDb->freeResult( $res ); |
| 370 | 407 | return $arr; |
| 371 | 408 | } |
| 372 | 409 | |
| 373 | 410 | /** |
| | 411 | + * Load the external link URLs already stored in externallinks with el_from equal to $this->mId |
| | 412 | + * @return array The el_to value of each matching row, in the keys |
| | 413 | + * @access private |
| | 414 | + */ |
| | 415 | + function getExistingExternals() { |
| | 416 | + $result = $this->mDb->select( 'externallinks', array( 'el_to' ), array( 'el_from' => $this->mId ), |
| | 417 | + 'LinksUpdate::getExistingExternals', $this->mOptions ); |
| | 418 | + $urls = array(); |
| | 419 | + while ( $row = $this->mDb->fetchObject( $result ) ) { |
| | 420 | + $urls[$row->el_to] = 1; |
| | 421 | + } |
| | 422 | + $this->mDb->freeResult( $result ); |
| | 423 | + return $urls; |
| | 424 | + } |
| | 425 | + |
| | 426 | + /** |
| 374 | 427 | * Get an array of existing categories, with the name in the key and sort key in the value. |
| 375 | 428 | * @access private |
| 376 | 429 | */ |
| — | — | @@ -381,6 +434,7 @@ |
| 382 | 435 | while ( $row = $this->mDb->fetchObject( $res ) ) { |
| 383 | 436 | $arr[$row->cl_to] = $row->cl_sortkey; |
| 384 | 437 | } |
| | 438 | + $this->mDb->freeResult( $res ); |
| 385 | 439 | return $arr; |
| 386 | 440 | } |
| 387 | 441 | } |
| Index: trunk/phase3/RELEASE-NOTES |
| — | — | @@ -64,6 +64,7 @@ |
| 65 | 65 | namespace are changed |
| 66 | 66 | * Respect database prefix in dumpHTML.inc |
| 67 | 67 | * Removed read-only check from Database::query() |
| | 68 | +* Added externallinks table, to track links to arbitrary URLs |
| 68 | 69 | |
| 69 | 70 | Documentation: |
| 70 | 71 | * (bug 3306) Document $wgLocalTZoffset |
| — | — | @@ -251,6 +252,9 @@ |
| 252 | 253 | * Fix XML validity checks in parser tests on PHP 5.1 |
| 253 | 254 | * (bug 4377) "[" is not valid in URLs |
| 254 | 255 | * (bug 4453) fix for __TOC__ dollar-number breakage |
| | 256 | +* Convert unnecessary URL escape codes in external links to their equivalent |
| | 257 | + character before doing anything with them. This prevents certain kinds of |
| | 258 | + spam filter evasion. |
| 255 | 259 | |
| 256 | 260 | Upload: |
| 257 | 261 | * (bug 2527) Always set destination filename when new file is selected |