r113020 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r113019‎ | r113020 | r113021 >
Date:12:00, 5 March 2012
Author:gwicke
Status:deferred
Tags:
Comment:
Change wikilink tokenization strategy to split on pipes. This makes it
possible to support template / template argument expansion in image options,
and causes little trouble for wikilinks. Non-image wikilinks with multiple
text pipes are quite rare in the dumps, and concatenating description tokens
with a plain '|' is quite easy. 261 parser tests passing.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -202,7 +202,7 @@
203203 fetchTemplates: false,
204204 debug: this.argv.debug,
205205 trace: this.argv.trace,
206 - wgScriptPath: '/'
 206+ wgScriptPath: '/wiki/'
207207 });
208208 }
209209
@@ -308,7 +308,7 @@
309309 return this.htmlparser.document.getElementsByTagName('body')[0]
310310 .innerHTML
311311 // a few things we ignore for now..
312 - .replace(/\/wiki\/Main_Page/g, 'Main Page')
 312+ //.replace(/\/wiki\/Main_Page/g, 'Main Page')
313313 // do not expect a toc for now
314314 .replace(/<table[^>]+?id="toc"[^>]*>.+?<\/table>/mg, '')
315315 // do not expect section editing for now
@@ -316,7 +316,7 @@
317317 // general class and titles, typically on links
318318 .replace(/(title|class|rel)="[^"]+"/g, '')
319319 // strip red link markup, we do not check if a page exists yet
320 - .replace(/\/index.php\?title=([^']+)&amp;action=edit&amp;redlink=1/g, '$1')
 320+ .replace(/\/index.php\?title=([^']+?)&amp;action=edit&amp;redlink=1/g, '/wiki/$1')
321321 // the expected html has some extra space in tags, strip it
322322 .replace(/<a +href/g, '<a href')
323323 .replace(/" +>/g, '">');
Index: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
@@ -13,7 +13,7 @@
1414 testWhiteList["Unclosed and unmatched quotes"] = "<p data-mw-sourcePos=\"0:66\"><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning two paragraphs (should not work).<b></b></p><p><b>Bold tag left open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
1515
1616 // The expected result for this test is really broken html.
17 -testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a data-mw-type=\"internal\" href=\"Link\">pretty </a></i><a data-mw-type=\"internal\" href=\"Link\">italics<i> and stuff</i></a><i>!</i></p>";
 17+testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a data-mw-type=\"internal\" href=\"/wiki/Link\">pretty </a></i><a data-mw-type=\"internal\" href=\"/wiki/Link\">italics<i> and stuff</i></a><i>!</i></p>";
1818
1919 testWhiteList["External link containing double-single-quotes in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a href=\"http://example.com/\">pretty </a></i><a href=\"http://example.com/\">italics<i> and stuff</i></a><i>!</i></p>";
2020
@@ -47,9 +47,9 @@
4848 /* Missing token transform functionality */
4949
5050 // We don't implement percent encoding for URIs yet.
51 -testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a data-mw-type=\"internal\" href=\"Lista d''e paise d''o munno\">Lista d''e paise d''o munno</a></p>";
 51+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a data-mw-type=\"internal\" href=\"/wiki/Lista d''e paise d''o munno\">Lista d''e paise d''o munno</a></p>";
5252
53 -testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a data-mw-type=\"internal\" href=\"&lt;%23\">&lt;%23</a><a data-mw-type=\"internal\" href=\"&gt;%23\">&gt;%23</a></p>";
 53+testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a data-mw-type=\"internal\" href=\"/wiki/&lt;%23\">&lt;%23</a><a data-mw-type=\"internal\" href=\"/wiki/&gt;%23\">&gt;%23</a></p>";
5454
5555
5656 // Sanitizer
@@ -57,17 +57,23 @@
5858 testWhiteList["Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] = "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" onmouseover=\"alert(document.cookie)\"&gt;test</td></tr></tbody></table>";
5959
6060 // Sanitizer, but UTF8 in link might actually be ok in HTML5
61 -testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a data-mw-type=\"internal\" href=\"Museo Picasso (París)\">Museo Picasso</a>.</p>";
 61+testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\" data-mw-type=\"external\" data-mw-rt=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a href=\"/wiki/Museo_Picasso_(París)\" data-mw-type=\"internal\">Museo Picasso</a>.</p>";
6262
6363 // plain percent sign is also valid in HTML5
6464 testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\" data-mw-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>";
6565
66 -testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\" data-mw-type=\"external\" data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" href=\"wikilink\">wikilink</a> embedded in ext link</p>";
 66+testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\" data-mw-type=\"external\" data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
6767
6868 testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a href=\"http://www.example.com/?title=100%_Bran\" data-mw-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>";
6969
7070 testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre width=\"8\" style=\"\">Narrow screen goodies</pre>";
7171
 72+testWhiteList["Link containing % (not as a hex sequence)"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
 73+
 74+testWhiteList["Link containing % as a single hex sequence interpreted to char"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
 75+
 76+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e paise d''o munno</a></p>";
 77+
7278 if (typeof module == "object") {
7379 module.exports.testWhiteList = testWhiteList;
7480 }
Index: trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
@@ -366,7 +366,11 @@
367367 };
368368
369369 ParserFunctions.prototype['pf_localurl'] = function ( target, argList, argDict ) {
370 - return ( this.manager.env.wgScriptPath + 'index' +
 370+ return (
 371+ '/' +
 372+ // FIXME! Figure out correct prefix to use
 373+ //this.manager.env.wgScriptPath +
 374+ 'index' +
371375 this.manager.env.wgScriptExtension + '?title=' +
372376 this.manager.env.normalizeTitle( target ) + '&' +
373377 argList.map(
Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
@@ -770,14 +770,14 @@
771771 }
772772
773773 / {
774 - return { pos: posStack.pop('lcontent' , pos), content: null };
 774+ return { pos: posStack.pop('lcontent' , pos), content: [] };
775775 }
776776 )
777777 "]]"
778778 // XXX In real MediaWiki, this is a language-dependent positive character
779779 // class. Can we work out a static negative class instead?
780780 // XXX: Exclude uppercase chars from non-latin languages too!
781 - trail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
 781+ tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
782782 var obj = new SelfclosingTagTk( 'wikilink' ),
783783 textTokens = [];
784784 obj.attribs.push( new KV('href', target) );
@@ -788,22 +788,12 @@
789789 // XXX: Point to object with path, revision and input information
790790 obj.source = input;
791791
 792+ //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
792793 // Deal with content. XXX: Properly support pipe-trick etc
793 - if (lcontent.content && lcontent.content.length) {
794 - textTokens = lcontent.content;
795 - if (trail) {
796 - textTokens.push( trail.join('') );
797 - }
798 - } else {
799 - if (trail) {
800 - textTokens = $.extend(true, [], target).concat( [ trail.join('') ] );
801 - } else {
802 - // copy list
803 - textTokens = $.extend(true, [], target);
804 - }
805 - }
 794+ lcontent.tail = tail && tail.join('') || '';
806795
807 - obj.attribs.push( new KV( 'content', flatten( textTokens ) ) );
 796+ obj.attribs.push( new KV( 'content', lcontent.content ) );
 797+ obj.attribs.push( new KV( 'tail', lcontent.tail ) );
808798 //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
809799 return [obj];
810800 }
@@ -826,6 +816,24 @@
827817 }
828818 / & { return clearFlag('linkdesc'); }
829819
 820+link_option
 821+ = & { setFlag('pipe'); return setFlag('linkdesc'); }
 822+ h:inline
 823+ // 'equal' syntaxFlag is set for links in template parameters. Consume the
 824+ // '=' here.
 825+ hs:( '=' inline)?
 826+ {
 827+ //console.warn('link_text' + pp(h) + pp(hs));
 828+ clearFlag('pipe');
 829+ clearFlag('linkdesc');
 830+ if( hs !== '' ) {
 831+ return h.concat(hs);
 832+ } else {
 833+ return h;
 834+ }
 835+ }
 836+ / & { clearFlag('pipe'); return clearFlag('linkdesc'); }
 837+
830838 link_end = "]]"
831839
832840 /* Generic quote production for italic and bold, further processed in a token
@@ -1674,7 +1682,7 @@
16751683 = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
16761684 / urlencoded_char
16771685 / directive
1678 - / !inline_breaks !"]]" text_char )+ {
 1686+ / !inline_breaks !"|" !"]]" text_char )+ {
16791687 return flatten_stringlist ( r );
16801688 }
16811689
Index: trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
@@ -24,12 +24,12 @@
2525 WikiLinkHandler.prototype.rank = 1.15; // after AttributeExpander
2626
2727 WikiLinkHandler.prototype.onWikiLink = function ( token, manager, cb ) {
28 - var env = this.manager.env;
 28+ var env = this.manager.env,
 29+ href = env.lookupKV( token.attribs, 'href' ).v,
 30+ tail = env.lookupKV( token.attribs, 'tail' ).v;
2931 var title = this.manager.env.makeTitleFromPrefixedText(
30 - env.tokensToString(
31 - env.lookupKV( token.attribs, 'href' ).v
32 - )
33 - );
 32+ env.tokensToString( href )
 33+ );
3434
3535 if ( title.ns.isFile() ) {
3636 return this.renderFile( token, manager, cb, title );
@@ -39,16 +39,65 @@
4040 } else {
4141 // Check if page exists
4242 //
43 - var obj = new TagTk( 'a', [ this.manager.env.lookupKV( token.attribs, 'href' ) ] );
 43+ //console.warn( 'title: ' + JSON.stringify( title ) );
 44+ var obj = new TagTk( 'a', [ new KV( 'href', title.makeLink() ) ] ),
 45+ content = this.manager.env.lookupKV( token.attribs, 'content' ).v;
 46+ //console.warn('content: ' + JSON.stringify( content, null, 2 ) );
 47+ // XXX: handle trail
 48+ if ( content.length ) {
 49+ var out = []
 50+ for ( var i = 0, l = content.length; i < l ; i++ ) {
 51+ out = out.concat( content[i] );
 52+ if ( i < l - 1 ) {
 53+ out.push( '|' );
 54+ }
 55+ }
 56+ content = out;
 57+ } else {
 58+ content = href;
 59+ }
 60+ if ( tail ) {
 61+ content.push( tail );
 62+ }
 63+
4464 obj.attribs.push( new KV('data-mw-type', 'internal') );
45 - var out = [obj].concat( this.manager.env.lookupKV( token.attribs, 'content' ).v,
46 - new EndTagTk( 'a' ) );
 65+ var out = [obj].concat( content, new EndTagTk( 'a' ) );
4766 //console.warn( JSON.stringify( out, null, 2 ) );
4867 return { tokens: out };
4968 }
5069 };
5170
 71+WikiLinkHandler.prototype._simpleImageOptions = {
 72+ // halign
 73+ 'left': 'halign',
 74+ 'right': 'halign',
 75+ 'center': 'halign',
 76+ 'none': 'halign',
 77+ // valign
 78+ 'baseline': 'valign',
 79+ 'sub': 'valign',
 80+ 'super': 'valign',
 81+ 'top': 'valign',
 82+ 'text-top': 'valign',
 83+ 'middle': 'valign',
 84+ 'bottom': 'valign',
 85+ 'text-bottom': 'valign',
 86+ // format
 87+ 'border': 'format',
 88+ 'frameless': 'format',
 89+ 'frame': 'format',
 90+ 'thumbnail': 'format',
 91+ 'thumb': 'format'
 92+};
5293
 94+WikiLinkHandler.prototype._prefixImageOptions = {
 95+ 'link': 'link',
 96+ 'alt': 'alt',
 97+ 'page': 'page',
 98+ 'thumbnail': 'thumb',
 99+ 'thumb': 'thumb'
 100+};
 101+
53102 WikiLinkHandler.prototype.renderFile = function ( token, manager, cb, title ) {
54103 var env = manager.env;
55104 // distinguish media types
@@ -67,15 +116,39 @@
68117 [ hash[0], hash.substr(0, 2) ].join('/') + '/' + title.key;
69118
70119
71 - // XXX: parse options
72 - var contentPos = token.dataAttribs.contentPos;
73 - var optionSource = token.source.substr( contentPos[0], contentPos[1] - contentPos[0] );
74 - console.log( 'optionSource: ' + optionSource );
 120+
 121+ // XXX: extract options
 122+ var options = [],
 123+ caption = null;
 124+ for( var i = 0, l = content.length; i<l; i++ ) {
 125+ var oContent = content[i],
 126+ oText = manager.env.tokensToString( oContent, true );
 127+ if ( oText.constructor === String ) {
 128+ var oText = oText.trim();
 129+ if ( this._simpleImageOptions[ oText ] ) {
 130+ options.push( new KV( this._simpleImageOptions[ oText ],
 131+ oText ) );
 132+ continue;
 133+ }
 134+ } else {
 135+ var bits = oText[0].split( '=', 2 );
 136+ if ( bits.length > 1 && this._prefixImageOptions[ bits[0].strip ] ) {
 137+ console.log('handle prefix ' + bits );
 138+ } else {
 139+ caption = oContent;
 140+ }
 141+ }
 142+ }
 143+
 144+
 145+ //var contentPos = token.dataAttribs.contentPos;
 146+ //var optionSource = token.source.substr( contentPos[0], contentPos[1] - contentPos[0] );
 147+ //console.log( 'optionSource: ' + optionSource );
75148 // XXX: The trouble with re-parsing is the need to re-expand templates.
76 - // Figure out often non-image links contain image-like parameters!
77 - var options = this.imageParser.processImageOptions( optionSource );
 149+ // Figure out how often non-image links contain image-like parameters!
 150+ //var options = this.imageParser.processImageOptions( optionSource );
78151 //console.log( JSON.stringify( options, null, 2 ) );
79 - // XXX: check if the file exists, generate thumbnail
 152+ // XXX: check if the file exists, generate thumbnail, get size
80153 // XXX: render according to mode (inline, thumb, framed etc)
81154 var img = new SelfclosingTagTk( 'img',
82155 [
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
@@ -12,7 +12,13 @@
1313 if ( false && this.nskey ) {
1414 return this.env.wgScriptPath + this.nskey + ':' + this.key;
1515 } else {
16 - return this.env.wgScriptPath + [this.ns.getDefaultName(), this.key].join(':');
 16+ var l = this.env.wgScriptPath,
 17+ ns = this.ns.getDefaultName();
 18+
 19+ if ( ns ) {
 20+ l += ns + ':';
 21+ }
 22+ return l + this.key;
1723 }
1824 };
1925
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
@@ -132,6 +132,7 @@
133133 },
134134 '|': function ( input, pos, syntaxFlags ) {
135135 return syntaxFlags.template ||
 136+ syntaxFlags.linkdesc ||
136137 ( syntaxFlags.table &&
137138 ( input[pos + 1].match(/[|}]/) !== null ||
138139 syntaxFlags.tableCellArg
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
@@ -117,7 +117,7 @@
118118 return new Title( text, 0, '', this );
119119 }
120120 } else {
121 - return new Title( text, 0, this );
 121+ return new Title( text, 0, '', this );
122122 }
123123 };
124124
@@ -174,7 +174,7 @@
175175 return name;
176176 };
177177
178 -MWParserEnvironment.prototype.tokensToString = function ( tokens ) {
 178+MWParserEnvironment.prototype.tokensToString = function ( tokens, strict ) {
179179 var out = [];
180180 //console.warn( 'MWParserEnvironment.tokensToString, tokens: ' + JSON.stringify( tokens ) );
181181 // XXX: quick hack, track down non-array sources later!
@@ -195,6 +195,9 @@
196196 } else if ( token.type === 'COMMENT' || token.type === 'NEWLINE' ) {
197197 // strip comments and newlines
198198 } else {
 199+ if ( strict ) {
 200+ return [out.join(''), null];
 201+ }
199202 var tstring = JSON.stringify( token );
200203 this.dp ( 'MWParserEnvironment.tokensToString, non-text token: ' +
201204 tstring + JSON.stringify( tokens, null, 2 ) );

Status & tagging log