r12874 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r12873 | r12874 | r12875 >
Date:13:29, 26 January 2006
Author:timstarling
Status:old
Tags:
Comment:
* Added externallinks table, to track links to arbitrary URLs
* Convert unnecessary URL escape codes in external links to their equivalent
character before doing anything with them. This prevents certain kinds of
spam filter evasion. (Parser.php only)
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/GlobalFunctions.php (modified) (history)
  • /trunk/phase3/includes/LinksUpdate.php (modified) (history)
  • /trunk/phase3/includes/Parser.php (modified) (history)
  • /trunk/phase3/maintenance/archives/patch-externallinks.sql (added) (history)
  • /trunk/phase3/maintenance/mysql5/tables.sql (modified) (history)
  • /trunk/phase3/maintenance/refreshLinks.inc (modified) (history)
  • /trunk/phase3/maintenance/tables.sql (modified) (history)
  • /trunk/phase3/maintenance/updaters.inc (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/archives/patch-externallinks.sql
@@ -0,0 +1,13 @@
 2+--
 3+-- Track links to external URLs
 4+--
 5+CREATE TABLE /*$wgDBprefix*/externallinks (
 6+ el_from int(8) unsigned NOT NULL default '0',
 7+ el_to blob NOT NULL default '',
 8+ el_index blob NOT NULL default '',
 9+
 10+ KEY (el_from, el_to(40)),
 11+ KEY (el_to(60), el_from),
 12+ KEY (el_index(60))
 13+) TYPE=InnoDB;
 14+
Property changes on: trunk/phase3/maintenance/archives/patch-externallinks.sql
___________________________________________________________________
Added: svn:eol-style
115 + native
Added: svn:keywords
216 + Author Date Id Revision
Index: trunk/phase3/maintenance/refreshLinks.inc
@@ -69,7 +69,7 @@
7070 }
7171
7272 function fixLinksFromArticle( $id ) {
73 - global $wgTitle, $wgArticle, $wgOut, $wgParser, $wgLinkCache;
 73+ global $wgTitle, $wgArticle, $wgOut, $wgParser;
7474
7575 $wgTitle = Title::newFromID( $id );
7676 $dbw =& wfGetDB( DB_MASTER );
@@ -105,6 +105,8 @@
106106 'pagelinks' => 'pl_from',
107107 'imagelinks' => 'il_from',
108108 'categorylinks' => 'cl_from',
 109+ 'templatelinks' => 'tl_from',
 110+ 'externallinks' => 'el_from',
109111 );
110112
111113 $page = $dbw->tableName( 'page' );
Index: trunk/phase3/maintenance/mysql5/tables.sql
@@ -463,6 +463,34 @@
464464 ) TYPE=InnoDB, DEFAULT CHARSET=utf8;
465465
466466 --
 467+-- Track links to external URLs
 468+--
 469+CREATE TABLE /*$wgDBprefix*/externallinks (
 470+ -- page_id of the referring page
 471+ el_from int(8) unsigned NOT NULL default '0',
 472+
 473+ -- The URL
 474+ el_to blob NOT NULL default '',
 475+
 476+ -- In the case of HTTP URLs, this is the URL with any username or password
 477+ -- removed, and with the labels in the hostname reversed and converted to
 478+ -- lower case. An extra dot is added to allow for matching of either
 479+ -- example.com or *.example.com in a single scan.
 480+ -- Example:
 481+ -- http://user:password@sub.example.com/page.html
 482+ -- becomes
 483+ -- http://com.example.sub./page.html
 484+ -- which allows for fast searching for all pages under example.com with the
 485+ -- clause:
 486+ -- WHERE el_index LIKE 'http://com.example.%'
 487+ el_index blob NOT NULL default '',
 488+
 489+ KEY (el_from, el_to(40)),
 490+ KEY (el_to(60), el_from),
 491+ KEY (el_index(60))
 492+) TYPE=InnoDB, DEFAULT CHARSET=utf8;
 493+
 494+--
467495 -- Contains a single row with some aggregate info
468496 -- on the state of the site.
469497 --
Index: trunk/phase3/maintenance/updaters.inc
@@ -26,6 +26,7 @@
2727 array( 'user_newtalk', 'patch-usernewtalk2.sql' ),
2828 array( 'transcache', 'patch-transcache.sql' ),
2929 array( 'trackbacks', 'patch-trackbacks.sql' ),
 30+ array( 'externallinks', 'patch-externallinks.sql' ),
3031 );
3132
3233 $wgNewFields = array(
Index: trunk/phase3/maintenance/tables.sql
@@ -450,6 +450,34 @@
451451 ) TYPE=InnoDB;
452452
453453 --
 454+-- Track links to external URLs
 455+--
 456+CREATE TABLE /*$wgDBprefix*/externallinks (
 457+ -- page_id of the referring page
 458+ el_from int(8) unsigned NOT NULL default '0',
 459+
 460+ -- The URL
 461+ el_to blob NOT NULL default '',
 462+
 463+ -- In the case of HTTP URLs, this is the URL with any username or password
 464+ -- removed, and with the labels in the hostname reversed and converted to
 465+ -- lower case. An extra dot is added to allow for matching of either
 466+ -- example.com or *.example.com in a single scan.
 467+ -- Example:
 468+ -- http://user:password@sub.example.com/page.html
 469+ -- becomes
 470+ -- http://com.example.sub./page.html
 471+ -- which allows for fast searching for all pages under example.com with the
 472+ -- clause:
 473+ -- WHERE el_index LIKE 'http://com.example.%'
 474+ el_index blob NOT NULL default '',
 475+
 476+ KEY (el_from, el_to(40)),
 477+ KEY (el_to(60), el_from),
 478+ KEY (el_index(60))
 479+) TYPE=InnoDB;
 480+
 481+--
454482 -- Contains a single row with some aggregate info
455483 -- on the state of the site.
456484 --
Index: trunk/phase3/includes/GlobalFunctions.php
@@ -1808,4 +1808,34 @@
18091809 }
18101810 }
18111811
 1812+/**
 1813+ * Make a URL index, appropriate for the el_index field of externallinks.
 1814+ */
 1815+function wfMakeUrlIndex( $url ) {
 1816+ wfSuppressWarnings();
 1817+ $bits = parse_url( $url );
 1818+ wfRestoreWarnings();
 1819+ if ( !$bits || $bits['scheme'] !== 'http' ) {
 1820+ return false;
 1821+ }
 1822+ // Reverse the labels in the hostname, convert to lower case
 1823+ $reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
 1824+ // Add an extra dot to the end
 1825+ if ( substr( $reversedHost, -1 ) !== '.' ) {
 1826+ $reversedHost .= '.';
 1827+ }
 1828+ // Reconstruct the pseudo-URL
 1829+ $index = "http://$reversedHost";
 1830+ // Leave out user and password. Add the port, path, query and fragment
 1831+ if ( isset( $bits['port'] ) ) $index .= ':' . $bits['port'];
 1832+ if ( isset( $bits['path'] ) ) {
 1833+ $index .= $bits['path'];
 1834+ } else {
 1835+ $index .= '/';
 1836+ }
 1837+ if ( isset( $bits['query'] ) ) $index .= '?' . $bits['query'];
 1838+ if ( isset( $bits['fragment'] ) ) $index .= '#' . $bits['fragment'];
 1839+ return $index;
 1840+}
 1841+
18121842 ?>
Index: trunk/phase3/includes/Parser.php
@@ -1121,19 +1121,23 @@
11221122
11231123 # Replace &amp; from obsolete syntax with &.
11241124 # All HTML entities will be escaped by makeExternalLink()
1125 - # or maybeMakeExternalImage()
11261125 $url = str_replace( '&amp;', '&', $url );
 1126+ # Replace unnecessary URL escape codes with the referenced character
 1127+ # This prevents spammers from hiding links from the filters
 1128+ $url = Parser::replaceUnusualEscapes( $url );
11271129
11281130 # Process the trail (i.e. everything after this link up until start of the next link),
11291131 # replacing any non-bracketed links
11301132 $trail = $this->replaceFreeExternalLinks( $trail );
11311133
1132 -
11331134 # Use the encoded URL
11341135 # This means that users can paste URLs directly into the text
11351136 # Funny characters like ö aren't valid in URLs anyway
11361137 # This was changed in August 2004
11371138 $s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail;
 1139+
 1140+ # Register link in the output object
 1141+ $this->mOutput->addExternalLink( $url );
11381142 }
11391143
11401144 wfProfileOut( $fname );
@@ -1189,12 +1193,16 @@
11901194 # All HTML entities will be escaped by makeExternalLink()
11911195 # or maybeMakeExternalImage()
11921196 $url = str_replace( '&amp;', '&', $url );
 1197+ # Replace unnecessary URL escape codes with their equivalent characters
 1198+ $url = Parser::replaceUnusualEscapes( $url );
11931199
11941200 # Is this an external image?
11951201 $text = $this->maybeMakeExternalImage( $url );
11961202 if ( $text === false ) {
11971203 # Not an image, make a link
11981204 $text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free' );
 1205+ # Register it in the output object
 1206+ $this->mOutput->addExternalLink( $url );
11991207 }
12001208 $s .= $text . $trail;
12011209 } else {
@@ -1206,6 +1214,36 @@
12071215 }
12081216
12091217 /**
 1218+ * Replace unusual URL escape codes with their equivalent characters
 1219+ * @param string
 1220+ * @return string
 1221+ * @static
 1222+ */
 1223+ function replaceUnusualEscapes( $url ) {
 1224+ return preg_replace_callback( '/%[0-9A-Fa-f]{2}/',
 1225+ array( 'Parser', 'replaceUnusualEscapesCallback' ), $url );
 1226+ }
 1227+
 1228+ /**
 1229+ * Callback function used in replaceUnusualEscapes().
 1230+ * Replaces unusual URL escape codes with their equivalent character
 1231+ * @static
 1232+ * @access private
 1233+ */
 1234+ function replaceUnusualEscapesCallback( $matches ) {
 1235+ $char = urldecode( $matches[0] );
 1236+ $ord = ord( $char );
 1237+ // Is it an unsafe or HTTP reserved character according to RFC 1738?
 1238+ if ( $ord > 32 && $ord < 127 && strpos( '<>"#{}|\^~[]`;/?', $char ) === false ) {
 1239+ // No, shouldn't be escaped
 1240+ return $char;
 1241+ } else {
 1242+ // Yes, leave it escaped
 1243+ return $matches[0];
 1244+ }
 1245+ }
 1246+
 1247+ /**
12101248 * make an image if it's allowed, either through the global
12111249 * option or through the exception
12121250 * @access private
@@ -3742,7 +3780,8 @@
37433781 $mTitleText, # title text of the chosen language variant
37443782 $mLinks, # 2-D map of NS/DBK to ID for the links in the document. ID=zero for broken.
37453783 $mTemplates, # 2-D map of NS/DBK to ID for the template references. ID=zero for broken.
3746 - $mImages; # DB keys of the images used, in the array key only
 3784+ $mImages, # DB keys of the images used, in the array key only
 3785+ $mExternalLinks; # External link URLs, in the key only
37473786
37483787 function ParserOutput( $text = '', $languageLinks = array(), $categoryLinks = array(),
37493788 $containsOldMagic = false, $titletext = '' )
@@ -3757,6 +3796,7 @@
37583797 $this->mLinks = array();
37593798 $this->mTemplates = array();
37603799 $this->mImages = array();
 3800+ $this->mExternalLinks = array();
37613801 }
37623802
37633803 function getText() { return $this->mText; }
@@ -3768,6 +3808,7 @@
37693809 function &getLinks() { return $this->mLinks; }
37703810 function &getTemplates() { return $this->mTemplates; }
37713811 function &getImages() { return $this->mImages; }
 3812+ function &getExternalLinks() { return $this->mExternalLinks; }
37723813
37733814 function containsOldMagic() { return $this->mContainsOldMagic; }
37743815 function setText( $text ) { return wfSetVar( $this->mText, $text ); }
@@ -3780,6 +3821,7 @@
37813822 function addCategory( $c, $sort ) { $this->mCategories[$c] = $sort; }
37823823 function addImage( $name ) { $this->mImages[$name] = 1; }
37833824 function addLanguageLink( $t ) { $this->mLanguageLinks[] = $t; }
 3825+ function addExternalLink( $url ) { $this->mExternalLinks[$url] = 1; }
37843826
37853827 function addLink( $title, $id ) {
37863828 $ns = $title->getNamespace();
Index: trunk/phase3/includes/LinksUpdate.php
@@ -19,6 +19,7 @@
2020 $mLinks, # Map of title strings to IDs for the links in the document
2121 $mImages, # DB keys of the images used, in the array key only
2222 $mTemplates, # Map of title strings to IDs for the template references, including broken ones
 23+ $mExternals, # URLs of external links, array key only
2324 $mCategories, # Map of category names to sort keys
2425 $mDb, # Database connection reference
2526 $mOptions; # SELECT options to be used (array)
@@ -52,6 +53,7 @@
5354 $this->mLinks =& $this->mParserOutput->getLinks();
5455 $this->mImages =& $this->mParserOutput->getImages();
5556 $this->mTemplates =& $this->mParserOutput->getTemplates();
 57+ $this->mExternals =& $this->mParserOutput->getExternalLinks();
5658 $this->mCategories =& $this->mParserOutput->getCategories();
5759
5860 }
@@ -87,6 +89,11 @@
8890 $this->incrTableUpdate( 'imagelinks', 'il', $this->getImageDeletions( $existing ),
8991 $this->getImageInsertions( $existing ) );
9092
 93+ # External links
 94+ $existing = $this->getExistingExternals();
 95+ $this->incrTableUpdate( 'externallinks', 'el', $this->getExternalDeletions( $existing ),
 96+ $this->getExternalInsertions( $existing ) );
 97+
9198 # Category links
9299 $existing = $this->getExistingCategories();
93100 $this->incrTableUpdate( 'categorylinks', 'cl', $this->getCategoryDeletions( $existing ),
@@ -117,6 +124,7 @@
118125 $this->dumbTableUpdate( 'imagelinks', $this->getImageInsertions(), 'il_from' );
119126 $this->dumbTableUpdate( 'categorylinks', $this->getCategoryInsertions(), 'cl_from' );
120127 $this->dumbTableUpdate( 'templatelinks', $this->getTemplateInsertions(), 'tl_from' );
 128+ $this->dumbTableUpdate( 'externallinks', $this->getExternalInsertions(), 'el_from' );
121129
122130 # Update the cache of all the category pages
123131 $this->invalidateCategories( $categoryUpdates );
@@ -238,7 +246,7 @@
239247 function getImageInsertions( $existing = array() ) {
240248 $arr = array();
241249 $diffs = array_diff_key( $this->mImages, $existing );
242 - foreach( $diffs as $iname => $val ) {
 250+ foreach( $diffs as $iname => $dummy ) {
243251 $arr[] = array(
244252 'il_from' => $this->mId,
245253 'il_to' => $iname
@@ -248,6 +256,23 @@
249257 }
250258
251259 /**
 260+ * Get an array of externallinks insertions. Skips the names specified in $existing
 261+ * @access private
 262+ */
 263+ function getExternalInsertions( $existing = array() ) {
 264+ $arr = array();
 265+ $diffs = array_diff_key( $this->mExternals, $existing );
 266+ foreach( $diffs as $url => $dummy ) {
 267+ $arr[] = array(
 268+ 'el_from' => $this->mId,
 269+ 'el_to' => $url,
 270+ 'el_index' => wfMakeUrlIndex( $url ),
 271+ );
 272+ }
 273+ return $arr;
 274+ }
 275+
 276+ /**
252277 * Get an array of category insertions
253278 * @param array $existing Array mapping existing category names to sort keys. If both
254279 * match a link in $this, the link will be omitted from the output
@@ -309,6 +334,15 @@
310335 return array_diff_key( $existing, $this->mImages );
311336 }
312337
 338+ /**
 339+ * Given an array of existing external links, returns those links which are not
 340+ * in $this and thus should be deleted.
 341+ * @access private
 342+ */
 343+ function getExternalDeletions( $existing ) {
 344+ return array_diff_key( $existing, $this->mExternals );
 345+ }
 346+
313347 /**
314348 * Given an array of existing categories, returns those categories which are not in $this
315349 * and thus should be deleted.
@@ -333,6 +367,7 @@
334368 }
335369 $arr[$row->pl_namespace][$row->pl_title] = 1;
336370 }
 371+ $this->mDb->freeResult( $res );
337372 return $arr;
338373 }
339374
@@ -351,6 +386,7 @@
352387 }
353388 $arr[$row->tl_namespace][$row->tl_title] = 1;
354389 }
 390+ $this->mDb->freeResult( $res );
355391 return $arr;
356392 }
357393
@@ -366,10 +402,27 @@
367403 while ( $row = $this->mDb->fetchObject( $res ) ) {
368404 $arr[$row->il_to] = 1;
369405 }
 406+ $this->mDb->freeResult( $res );
370407 return $arr;
371408 }
372409
373410 /**
 411+ * Get an array of existing external links, URLs in the keys
 412+ * @access private
 413+ */
 414+ function getExistingExternals() {
 415+ $fname = 'LinksUpdate::getExistingExternals';
 416+ $res = $this->mDb->select( 'externallinks', array( 'el_to' ),
 417+ array( 'el_from' => $this->mId ), $fname, $this->mOptions );
 418+ $arr = array();
 419+ while ( $row = $this->mDb->fetchObject( $res ) ) {
 420+ $arr[$row->el_to] = 1;
 421+ }
 422+ $this->mDb->freeResult( $res );
 423+ return $arr;
 424+ }
 425+
 426+ /**
374427 * Get an array of existing categories, with the name in the key and sort key in the value.
375428 * @access private
376429 */
@@ -381,6 +434,7 @@
382435 while ( $row = $this->mDb->fetchObject( $res ) ) {
383436 $arr[$row->cl_to] = $row->cl_sortkey;
384437 }
 438+ $this->mDb->freeResult( $res );
385439 return $arr;
386440 }
387441 }
Index: trunk/phase3/RELEASE-NOTES
@@ -64,6 +64,7 @@
6565 namespace are changed
6666 * Respect database prefix in dumpHTML.inc
6767 * Removed read-only check from Database::query()
 68+* Added externallinks table, to track links to arbitrary URLs
6869
6970 Documentation:
7071 * (bug 3306) Document $wgLocalTZoffset
@@ -251,6 +252,9 @@
252253 * Fix XML validity checks in parser tests on PHP 5.1
253254 * (bug 4377) "[" is not valid in URLs
254255 * (bug 4453) fix for __TOC__ dollar-number breakage
 256+* Convert unnecessary URL escape codes in external links to their equivalent
 257+ character before doing anything with them. This prevents certain kinds of
 258+ spam filter evasion.
255259
256260 Upload:
257261 * (bug 2527) Always set destination filename when new file is selected

Status & tagging log