r104852 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r104851‎ | r104852 | r104853 >
Date:14:25, 1 December 2011
Author:gwicke
Status:deferred
Tags:
Comment:
Improve external links and definition lists, now 133 tests passing ;)
Also add printwhitelist option to test runner, provides js code copy/pastable
to whitelist.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
@@ -8,6 +8,10 @@
99 // formatting is identical
1010 testWhiteList['Italics and bold'] = "<ul><li> plain</li><li> plain<i>italic</i>plain</li><li> plain<i>italic</i>plain<i>italic</i>plain</li><li> plain<b>bold</b>plain</li><li> plain<b>bold</b>plain<b>bold</b>plain</li><li> plain<i>italic</i>plain<b>bold</b>plain</li><li> plain<b>bold</b>plain<i>italic</i>plain</li><li> plain<i>italic<b>bold-italic</b>italic</i>plain</li><li> plain<b>bold<i>bold-italic</i>bold</b>plain</li><li> plain<i><b>bold-italic</b>italic</i>plain</li><li> plain<i><b>bold-italic</b></i><b>bold</b>plain</li><li> plain<i>italic<b>bold-italic</b></i>plain</li><li> plain<b>bold<i>bold-italic</i></b>plain</li><li> plain l'<i>italic</i>plain</li><li> plain l'<b>bold</b> plain</li></ul>";
1111
 12+// We don't care about existing or non-existing pages for now, so don't fail
 13+// on missing redlink
 14+testWhiteList["Definition list with wikilink containing colon"] = "<dl><dt> <a data-type=\"internal\" href=\"Help:FAQ\">Help:FAQ</a></dt><dd> The least-read page on Wikipedia</dd></dl>";
 15+
1216 if (typeof module == "object") {
1317 module.exports.testWhiteList = testWhiteList;
1418 }
Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -44,7 +44,7 @@
4545 alias: 'regex',
4646 },
4747 'whitelist': {
48 - description: 'Alternatively compare against manually verified parser output from whitelist (default true)',
 48+ description: 'Alternatively compare against manually verified parser output from whitelist',
4949 default: true,
5050 boolean: true,
5151 },
@@ -57,8 +57,8 @@
5858 default: false,
5959 boolean: true,
6060 },
61 - 'jsonout': {
62 - description: 'Print out a JSON serialization (default false) of parser output.',
 61+ 'printwhitelist': {
 62+ description: 'Print out a whitelist entry for failing tests. Default false.',
6363 default: false,
6464 boolean: true,
6565 },
@@ -318,7 +318,7 @@
319319 var normalizedOut = normalizeOut(out);
320320 var normalizedExpected = normalizeHTML(item.result);
321321 if ( normalizedOut !== normalizedExpected ) {
322 - if (argv.whiteList &&
 322+ if (argv.whitelist &&
323323 item.title in testWhiteList &&
324324 normalizeOut(testWhiteList[item.title]) === normalizedOut) {
325325 if( !argv.quiet ) {
@@ -368,9 +368,12 @@
369369
370370 console.log( colored_diff );
371371
372 - if(argv.jsonout) {
373 - console.log("JSON of parser output:");
374 - console.log(JSON.stringify(out));
 372+ if(argv.printwhitelist) {
 373+ console.log("Whitelist entry:");
 374+ console.log("testWhiteList[" +
 375+ JSON.stringify(item.title) + "] = " +
 376+ JSON.stringify(out) +
 377+ ";");
375378 }
376379 }
377380 } else {
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
@@ -355,6 +355,8 @@
356356
357357 // text start position
358358 var textStart = 0;
 359+
 360+ var linkCount = 1;
359361 }
360362
361363 start
@@ -363,15 +365,60 @@
364366 }
365367
366368
367 -anyblock = block / inline
368 -anyblockline = block / inlineline
 369+/* All chars that cannot start syntactic structures in the middle of a line
 370+ * XXX: ] and other end delimiters should probably only be activated inside
 371+ * structures to avoid unnecessarily leaving the text production on plain
 372+ * content. */
369373
 374+text_char = [^'<~[{\n\r:\]}]
370375
371 -// All chars that cannot start syntactic structures in the middle of a line
372 -// XXX: ] and other end delimiters should probably only be activated inside
373 -// structures to avoid unnecessarily leaving the text production on plain
374 -// content.
375 -text = t:[^'<~\][\n\r{}:]+ { return t.join(''); }
 376+text = t:text_char+ { return t.join(''); }
 377+
 378+/* Explanation of chars
 379+ * ' quotes (italic/bold)
 380+ * < start of xmlish_tag
 381+ * ~ signatures/dates
 382+ * [ start of links
 383+ * { start of parser functions, transclusion and template args
 384+ * \n all sorts of block-level markup at start of line
 385+ * \r ditto
 386+ * h http(s) urls
 387+ * n nntp(s) urls
 388+ * m mailto urls
 389+ *
 390+ * The following chars are also included for now, but only apply in some
 391+ * contexts and should probably be enabled only in those:
 392+ * : separate definition in ; term : definition
 393+ * ] end of link
 394+ * } end of parser func/transclusion/template arg
 395+ */
 396+
 397+urltext = ( t:[^'<~[{\n\rfghimnstw:\]} ]+ { return t.join(''); }
 398+ / urllink
 399+ // Convert trailing space into &nbsp;
 400+ // XXX: This should be moved to a serializer
 401+ / ' ' & ':' { return "\u00a0"; }
 402+ / t:text_char )+
 403+
 404+/*
 405+ '//', // for protocol-relative URLs, but not in text!
 406+ 'ftp://',
 407+ 'git://',
 408+ 'gopher://',
 409+ 'http://',
 410+ 'https://',
 411+ 'irc://',
 412+ 'ircs://', // @bug 28503
 413+ 'mailto:',
 414+ 'mms://',
 415+ 'news:',
 416+ 'nntp://', // @bug 3808 RFC 1738
 417+ 'svn://',
 418+ 'telnet://', // Well if we're going to support the above.. -ævar
 419+ 'worldwind://',
 420+*/
 421+
 422+// Old version
376423 //text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
377424
378425 // Experimental tweaked version: avoid expensive single-char substrings
@@ -463,13 +510,16 @@
464511
465512 br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
466513
467 -// Syntax that stops inline expansion
 514+// Syntax stops to limit inline expansion depending on syntactic context
468515 inline_breaks
469516 = //& { console.log(pp(syntaxFlags)); return true; }
470517 & { return syntaxFlags['table']; }
471518 a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; }
 519+ / & { return (syntaxFlags['colon'] &&
 520+ ! syntaxFlags.extlink &&
 521+ ! syntaxFlags.linkdesk); } ":" { return true; }
 522+ / & { return syntaxFlags['extlink']; } "]" { return true; }
472523 / & { return syntaxFlags['linkdesc']; } link_end { return true; }
473 - / & { return syntaxFlags['colon']; } ":" { return true; }
474524 / & { return syntaxFlags['h']; }
475525 ( & { return syntaxFlags['h1'] } '=' newline { return true; }
476526 / & { return syntaxFlags['h2'] } '==' newline { return true; }
@@ -480,7 +530,7 @@
481531 )
482532
483533 inline
484 - = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
 534+ = c:(urltext / inline_element / (!inline_breaks ch:. { return ch; }))+ {
485535 var out = [];
486536 var text = [];
487537 c = flatten(c);
@@ -502,7 +552,7 @@
503553 }
504554
505555 inlineline
506 - = c:(text / !inline_breaks (inline_element / [^\n]))+ {
 556+ = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
507557 var out = [];
508558 var text = [];
509559 c = flatten(c);
@@ -529,10 +579,13 @@
530580 */
531581 inline_element
532582 = comment
533 - / xmlish_tag
 583+ // Can actually also be block-level elements, we don't really try to enforce
 584+ // a content model in the tokenizer. The HTML tree builder and DOM
 585+ // transformations are better equipped to deal with it.
 586+ / xmlish_tag
 587+ / template
 588+ / wikilink
534589 / extlink
535 - / template
536 - / link
537590 / quote
538591
539592 /* Headings */
@@ -635,24 +688,68 @@
636689 = c:[^-] { return c; }
637690 / c:'-' !'->' { return c; }
638691
639 -extlink
640 - = "[" target:url " " text:extlink_text "]" {
 692+
 693+urllink
 694+ = target:url {
641695 return [ { type: 'TAG',
642696 name: 'a',
643 - attribs: [['href', target]] } ]
644 - .concat( text
645 - , [{type: 'ENDTAG', name: 'a'}]);
 697+ attribs: [['href', target]] }
 698+ , {type: 'TEXT', value: target}
 699+ , {type: 'ENDTAG', name: 'a'}
 700+ ];
646701 }
647702
648 -// = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }
 703+extlink
 704+ = "["
 705+ & { return setFlag('extlink'); }
 706+ target:url
 707+ space*
 708+ text:inlineline?
 709+ "]" {
 710+ clearFlag('extlink');
 711+ if ( text == '' ) {
 712+ // XXX: Link numbering should be implemented in post-processor.
 713+ text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
 714+ linkCount++;
 715+ }
 716+ return [ { type: 'TAG',
 717+ name: 'a',
 718+ attribs: [['href', target]] } ]
 719+ .concat( text
 720+ , [{type: 'ENDTAG', name: 'a'}]);
 721+ }
 722+ / "[" & { clearFlag('extlink'); return false; }
649723
 724+/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
 725+ * be configured dynamically. */
 726+url_protocol
 727+ = '//' // for protocol-relative URLs
 728+ / 'ftp://'
 729+ / 'git://'
 730+ / 'gopher://'
 731+ / 'http://'
 732+ / 'https://'
 733+ / 'irc://'
 734+ / 'ircs://' // @bug 28503
 735+ / 'mailto:'
 736+ / 'mms://'
 737+ / 'news:'
 738+ / 'nntp://' // @bug 3808 RFC 1738
 739+ / 'svn://'
 740+ / 'telnet://' // Well if we're going to support the above.. -ævar
 741+ / 'worldwind://'
 742+
 743+unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
 744+
650745 url
651 - = proto:"http:" rest:([^ \]\n]+) { return proto + rest.join(''); }
 746+ = proto:url_protocol
 747+ rest:( [^ :\]\[\n<>\x00-\x20\x7f,.]
 748+ / s:[.:,] !(space / eolf) { return s } )+
 749+{
 750+ return proto + rest.join('');
 751+}
 752+//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
652753
653 -// XXX: convert to syntax flags and generic inline!
654 -extlink_text
655 - = c:(inline_element / ch:[^\]\n] { return {type: 'TEXT', value: ch}; })+
656 -
657754 template
658755 = "{{" target:template_target params:("|" p:template_param { return p })* "}}" {
659756 var obj = { type: 'TAG', name: 'template', attribs: [['target', target]] }
@@ -700,11 +797,11 @@
701798 / xmlish_tag
702799 / extlink
703800 / template
704 - / link
 801+ / wikilink
705802 / quote
706803 / !"}}" x:([^|\n]) { return x }
707804
708 -link
 805+wikilink
709806 = "[[" target:link_target text:("|" link_text)* "]]" {
710807 var obj = {
711808 type: 'TAG',
@@ -984,38 +1081,36 @@
9851082 , c ];
9861083 }
9871084
988 -dtdd = bullets:(!(";" !list_char) list_char)*
989 - ";"
990 - & {return setFlag('colon');}
991 - c:inlineline
992 - ":"
993 - // Fortunately dtdds cannot be nested, so we can simply set the flag
994 - // back to 0 to disable it.
995 - & {syntaxFlags['colon'] = 0; return true;}
996 - d:inlineline
997 - &eolf
998 -{
999 - // Convert trailing space into &nbsp;
1000 - // XXX: This should be moved to a serializer
1001 - var clen = c.length;
1002 - if (clen && c[clen - 1].type === 'TEXT') {
1003 - var val = c[clen - 1].value;
1004 - if(val.length && val[val.length - 1] == ' ') {
1005 - c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
1006 - }
 1085+dtdd
 1086+ = bullets:(!(";" !list_char) list_char)*
 1087+ ";"
 1088+ & {return setFlag('colon');}
 1089+ c:inlineline
 1090+ ":"
 1091+ // Fortunately dtdds cannot be nested, so we can simply set the flag
 1092+ // back to 0 to disable it.
 1093+ & {syntaxFlags['colon'] = 0; return true;}
 1094+ d:inlineline
 1095+ &eolf {
 1096+ // Convert trailing space into &nbsp;
 1097+ // XXX: This should be moved to a serializer
 1098+ //var clen = c.length;
 1099+ //if (clen && c[clen - 1].type === 'TEXT') {
 1100+ // var val = c[clen - 1].value;
 1101+ // if(val.length && val[val.length - 1] == ' ') {
 1102+ // c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
 1103+ // }
 1104+ //}
 1105+
 1106+ return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
 1107+ .concat( c
 1108+ ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
 1109+ , d );
10071110 }
 1111+ // Fall-back case to clear the colon flag
 1112+ / & { return true; } { syntaxFlags['colon'] = 0; return null; }
10081113
1009 - return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
1010 - .concat( c
1011 - ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
1012 - , d );
1013 -}
10141114
1015 -/ bullets:(!(";" !list_char) list_char)*
1016 - ";"
1017 - {syntaxFlags['colon'] = 0; return null; }
1018 -
1019 -
10201115 list_char = [*#:;]
10211116
10221117
@@ -1094,7 +1189,7 @@
10951190 ("||" / newline "|")
10961191 ! [}+-]
10971192 a:thtd_attribs?
1098 - td:(!inline_breaks anyblock)* {
 1193+ td:(!inline_breaks block)* {
10991194 dp("table data result: " + pp(td) + ", attribts: " + pp(a));
11001195 return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
11011196 .concat(td, [{type: 'ENDTAG', name: 'td'}]);

Status & tagging log