r107633 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107632‎ | r107633 | r107634 >
Date:12:47, 30 December 2011
Author:gwicke
Status:deferred
Tags:
Comment:
Improvements to link trail handling, and two tweaks to the whitelist. 182
tests now passing.

Link trails depend on language-dependent positive character classes in the PHP
parser. These classes all seem to disallow punctuation implicitly and list
differing plain text characters instead, so it might be possible to get away
with identifying a common class of non-trail punctuation instead. This would
help to keep the tokenizer independent of configurations, which is very
desirable for caching and simplified external parsing.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
@@ -23,6 +23,7 @@
2424 // empty table tags / with only a caption are legal in HTML5.
2525 testWhiteList["A table with no data."] = "<table></table>";
2626 testWhiteList["A table with nothing but a caption"] = "<table><caption> caption</caption></table>";
 27+testWhiteList["Fuzz testing: Parser22"] = "<p data-sourcePos=\"0:23\"><a href=\"http://===r:::https://b\">http://===r:::https://b</a></p><table></table>";
2728
2829 // MediaWiki changes the order of attributes in tables, ignore that
2930 testWhiteList["Multiplication table"] = "<table border=\"1\" cellpadding=\"2\"><caption>Multiplication table</caption><tbody><tr><th> × </th><th> 1 </th><th> 2 </th><th> 3</th></tr><tr><th> 1</th><td> 1 </td><td> 2 </td><td> 3</td></tr><tr><th> 2</th><td> 2 </td><td> 4 </td><td> 6</td></tr><tr><th> 3</th><td> 3 </td><td> 6 </td><td> 9</td></tr><tr><th> 4</th><td> 4 </td><td> 8 </td><td> 12</td></tr><tr><th> 5</th><td> 5 </td><td> 10 </td><td> 15</td></tr></tbody></table>";
@@ -51,7 +52,7 @@
5253 testWhiteList["Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] = "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" onmouseover=\"alert(document.cookie)\"&gt;test</td></tr></tbody></table>";
5354
5455 // Sanitizer, but UTF8 in link might actually be ok in HTML5
55 -testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a data-type=\"internal\" href=\"Museo Picasso (París)\">Museo Picasso</a></p>";
 56+testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a data-type=\"internal\" href=\"Museo Picasso (París)\">Museo Picasso</a>.</p>";
5657
5758
5859 if (typeof module == "object") {
Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
@@ -653,10 +653,13 @@
654654 }
655655
656656 //[^][<>"\\x00-\\x20\\x7F\p{Zs}]
 657+
 658+no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
 659+
657660 url
658661 = proto:url_protocol
659662 rest:( ( !inline_breaks
660 - c:[^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
 663+ c:no_punctuation_char
661664 { return c }
662665 )
663666 / s:[.:,] !(space / eolf) { return s }
@@ -735,13 +738,16 @@
736739 }
737740 / & { clearFlag('template'); return false; }
738741
 742+// TODO: handle link prefixes as in al[[Razi]]
739743 wikilink
740744 = "[["
741745 ! url
742746 target:link_target
743747 lcontent:( "|" lt:link_text { return lt } )*
744748 "]]"
745 - suffix:(![ \]] tc:text_char { return tc })* {
 749+ // XXX In real MediaWiki, this is a language-dependent positive character
 750+ // class. Can we work out a static negative class instead?
 751+ trail:(! [ \t(),.:-] tc:text_char { return tc })* {
746752 var obj = {
747753 type: 'TAG',
748754 name: 'a',
@@ -749,21 +755,20 @@
750756 ['data-type', 'internal']
751757 ]
752758 },
753 - suffixTokens = [],
754759 textTokens = [];
755760 obj.attribs.push(['href', target]);
756761 if (lcontent && lcontent.length) {
757762 textTokens = lcontent;
758 - if (suffix) {
759 - suffixTokens = [{ type: 'TEXT', value: suffix.join('') }];
 763+ if (trail) {
 764+ textTokens.push( { type: 'TEXT', value: trail.join('') } );
760765 }
761766 } else {
762 - if (suffix) {
763 - target += suffix.join('');
 767+ if (trail) {
 768+ target += trail.join('');
764769 }
765770 textTokens = [{type: 'TEXT', value: target}];
766771 }
767 - return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}], suffixTokens);
 772+ return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
768773 }
769774
770775 link_target

Status & tagging log