r113136 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r113135‎ | r113136 | r113137 >
Date:13:49, 6 March 2012
Author:gwicke
Status:deferred
Tags:
Comment:
Reworked percent encoding handling for URIs to get closer to the 'url
construction' part of the HTML5 spec:
http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation

Removed a few whitelisted test cases that are now passing directly.

The encoding canonicalization could also be moved to the Sanitizer. Doing this
early in token stream processing however has the advantage of providing further
transformations uniform data to work with. We could even consider to move this
even further into the tokenizer.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
@@ -46,36 +46,24 @@
4747
4848 /* Missing token transform functionality */
4949
50 -// We don't implement percent encoding for URIs yet.
51 -testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a data-mw-type=\"internal\" href=\"/wiki/Lista d''e paise d''o munno\">Lista d''e paise d''o munno</a></p>";
 50+// Single quotes are legal in HTML5 URIs. See
 51+// http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation
 52+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e paise d''o munno</a></p>";
5253
53 -testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a data-mw-type=\"internal\" href=\"/wiki/&lt;%23\">&lt;%23</a><a data-mw-type=\"internal\" href=\"/wiki/&gt;%23\">&gt;%23</a></p>";
5454
55 -
5655 // Sanitizer
5756 testWhiteList["Invalid attributes in table cell (bug 1830)"] = "<table><tbody><tr><td Cell:=\"\">broken</td></tr></tbody></table>";
5857 testWhiteList["Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] = "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" onmouseover=\"alert(document.cookie)\"&gt;test</td></tr></tbody></table>";
5958
60 -// Sanitizer, but UTF8 in link might actually be ok in HTML5
 59+// Sanitizer, but UTF8 in link is ok in HTML5
6160 testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\" data-mw-type=\"external\" data-mw-rt=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a href=\"/wiki/Museo_Picasso_(París)\" data-mw-type=\"internal\">Museo Picasso</a>.</p>";
6261
63 -// plain percent sign is also valid in HTML5
64 -testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\" data-mw-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>";
65 -
6662 testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\" data-mw-type=\"external\" data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
6763
68 -testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a href=\"http://www.example.com/?title=100%_Bran\" data-mw-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>";
69 -
7064 testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre width=\"8\" style=\"\">Narrow screen goodies</pre>";
7165
72 -testWhiteList["Link containing % (not as a hex sequence)"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
 66+//testWhiteList["Piped link to URL"] = "<p>Piped link to URL: [<a href=\"http://www.example.com|an\" data-mw-type=\"external\">example URL</a>]</p>";
7367
74 -testWhiteList["Link containing % as a single hex sequence interpreted to char"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
75 -
76 -testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e paise d''o munno</a></p>";
77 -
78 -testWhiteList["Brackets in urls"] = "<p><a href=\"http://example.com/index.php?foozoid[]=bar\">http://example.com/index.php?foozoid[]=bar</a></p><p><a href=\"http://example.com/index.php?foozoid[]=bar\">http://example.com/index.php?foozoid[]=bar</a></p>";
79 -
8068 if (typeof module == "object") {
8169 module.exports.testWhiteList = testWhiteList;
8270 }
Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
@@ -645,7 +645,7 @@
646646 )
647647 / s:[.:,] !(space / eolf) { return s }
648648 / htmlentity
649 - / urlencoded_char
 649+ /// urlencoded_char
650650 / [&%] )+
651651 {
652652 return proto + addr + rest.join('');
@@ -1674,17 +1674,19 @@
16751675
16761676
16771677 wikilink_preprocessor_text
1678 - = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
1679 - / urlencoded_char
 1678+ = r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
 1679+ /// urlencoded_char
16801680 / directive
16811681 / !inline_breaks !"|" !"]]" text_char )+ {
16821682 return flatten_stringlist ( r );
16831683 }
16841684
16851685 extlink_preprocessor_text
 1686+ // added special separator character class inline: separates url from
 1687+ // description / text
16861688 = r:( t:[^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
16871689 / directive
1688 - / urlencoded_char
 1690+ /// urlencoded_char
16891691 / !inline_breaks no_punctuation_char
16901692 / s:[.:,] !(space / eolf) { return s }
16911693 / [&%] )+ {
Index: trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
@@ -54,7 +54,7 @@
5555 }
5656 content = out;
5757 } else {
58 - content = href;
 58+ content = [ env.decodeURI( env.tokensToString( href ) ) ];
5959 }
6060 if ( tail ) {
6161 content.push( tail );
@@ -105,19 +105,19 @@
106106
107107 var content = token.attribs.slice(1, -1);
108108
109 - // XXX: get /wiki from config!
 109+ // TODO: get /wiki from config!
110110 var a = new TagTk( 'a', [ new KV( 'href', '/wiki' + title.makeLink() ) ] );
111111 a.dataAttribs = token.dataAttribs;
112112
113113 var MD5 = new jshashes.MD5(),
114114 hash = MD5.hex( title.key ),
115 - // XXX: Hackhack..
 115+ // TODO: Hackhack.. Move to proper test harness setup!
116116 path = 'http://example.com/images/' +
117117 [ hash[0], hash.substr(0, 2) ].join('/') + '/' + title.key;
118118
119119
120120
121 - // XXX: extract options
 121+ // extract options
122122 var options = [],
123123 caption = null;
124124 for( var i = 0, l = content.length; i<l; i++ ) {
@@ -132,7 +132,7 @@
133133 }
134134 } else {
135135 var bits = oText[0].split( '=', 2 );
136 - if ( bits.length > 1 && this._prefixImageOptions[ bits[0].strip ] ) {
 136+ if ( bits.length > 1 && this._prefixImageOptions[ bits[0].trim() ] ) {
137137 console.log('handle prefix ' + bits );
138138 } else {
139139 caption = oContent;
@@ -217,7 +217,9 @@
218218 };
219219
220220 ExternalLinkHandler.prototype.onUrlLink = function ( token, manager, cb ) {
221 - var href = this.manager.env.lookupKV( token.attribs, 'href' ).v;
 221+ var href = this.manager.env.sanitizeURI(
 222+ this.manager.env.lookupKV( token.attribs, 'href' ).v
 223+ );
222224 if ( this._isImageLink( href ) ) {
223225 return { token: new SelfclosingTagTk( 'img',
224226 [
@@ -241,6 +243,8 @@
242244 ExternalLinkHandler.prototype.onExtLink = function ( token, manager, cb ) {
243245 var href = this.manager.env.lookupKV( token.attribs, 'href' ).v,
244246 content= this.manager.env.lookupKV( token.attribs, 'content' ).v;
 247+ href = this.manager.env.sanitizeURI( href );
 248+ console.warn('extlink href: ' + href );
245249 //console.warn( 'content: ' + JSON.stringify( content, null, 2 ) );
246250 // validate the href
247251 if ( this.imageParser.parseURL( href ) ) {
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
@@ -10,7 +10,7 @@
1111 Title.prototype.makeLink = function () {
1212 // XXX: links always point to the canonical namespace name.
1313 if ( false && this.nskey ) {
14 - return this.env.wgScriptPath + this.nskey + ':' + this.key;
 14+ return this.env.sanitizeURI( this.env.wgScriptPath + this.nskey + ':' + this.key );
1515 } else {
1616 var l = this.env.wgScriptPath,
1717 ns = this.ns.getDefaultName();
@@ -18,7 +18,7 @@
1919 if ( ns ) {
2020 l += ns + ':';
2121 }
22 - return l + this.key;
 22+ return this.env.sanitizeURI( l + this.key );
2323 }
2424 };
2525
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
@@ -208,7 +208,45 @@
209209 return out.join('');
210210 };
211211
 212+MWParserEnvironment.prototype.decodeURI = function ( s ) {
 213+ return s.replace( /%[0-9a-f][0-9a-f]/g, function( m ) {
 214+ try {
 215+ return decodeURI( m );
 216+ } catch ( e ) {
 217+ return m;
 218+ }
 219+ } );
 220+};
212221
 222+MWParserEnvironment.prototype.sanitizeURI = function ( s ) {
 223+ var host = s.match(/^[a-zA-Z]+:\/\/[^\/]+(?:\/|$)/),
 224+ path = s,
 225+ anchor = null;
 226+ console.warn( 'host: ' + host );
 227+ if ( host ) {
 228+ path = s.substr( host[0].length );
 229+ host = host[0];
 230+ } else {
 231+ host = '';
 232+ }
 233+ var bits = path.split('#');
 234+ if ( bits.length > 1 ) {
 235+ anchor = bits[bits.length - 1];
 236+ path = path.substr(0, path.length - anchor.length - 1);
 237+ }
 238+ host = host.replace( /%(?![0-9a-fA-F][0-9a-fA-F])|[#|]/g, function ( m ) {
 239+ return encodeURIComponent( m );
 240+ } );
 241+ path = path.replace( /%(?![0-9a-fA-F][0-9a-fA-F])|[\[\]#|]/g, function ( m ) {
 242+ return encodeURIComponent( m );
 243+ } );
 244+ s = host + path;
 245+ if ( anchor !== null ) {
 246+ s += '#' + anchor;
 247+ }
 248+ return s;
 249+};
 250+
213251 /**
214252 * Simple debug helper
215253 */

Status & tagging log