Index: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js |
— | — | @@ -8,6 +8,10 @@ |
9 | 9 | // formatting is identical |
10 | 10 | testWhiteList['Italics and bold'] = "<ul><li> plain</li><li> plain<i>italic</i>plain</li><li> plain<i>italic</i>plain<i>italic</i>plain</li><li> plain<b>bold</b>plain</li><li> plain<b>bold</b>plain<b>bold</b>plain</li><li> plain<i>italic</i>plain<b>bold</b>plain</li><li> plain<b>bold</b>plain<i>italic</i>plain</li><li> plain<i>italic<b>bold-italic</b>italic</i>plain</li><li> plain<b>bold<i>bold-italic</i>bold</b>plain</li><li> plain<i><b>bold-italic</b>italic</i>plain</li><li> plain<i><b>bold-italic</b></i><b>bold</b>plain</li><li> plain<i>italic<b>bold-italic</b></i>plain</li><li> plain<b>bold<i>bold-italic</i></b>plain</li><li> plain l'<i>italic</i>plain</li><li> plain l'<b>bold</b> plain</li></ul>"; |
11 | 11 | |
| 12 | +// We don't care about existing or non-existing pages for now, so don't fail |
| 13 | +// on missing redlink |
| 14 | +testWhiteList["Definition list with wikilink containing colon"] = "<dl><dt> <a data-type=\"internal\" href=\"Help:FAQ\">Help:FAQ</a></dt><dd> The least-read page on Wikipedia</dd></dl>"; |
| 15 | + |
12 | 16 | if (typeof module == "object") { |
13 | 17 | module.exports.testWhiteList = testWhiteList; |
14 | 18 | } |
Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js |
— | — | @@ -44,7 +44,7 @@ |
45 | 45 | alias: 'regex', |
46 | 46 | }, |
47 | 47 | 'whitelist': { |
48 | | - description: 'Alternatively compare against manually verified parser output from whitelist (default true)', |
| 48 | + description: 'Alternatively compare against manually verified parser output from whitelist', |
49 | 49 | default: true, |
50 | 50 | boolean: true, |
51 | 51 | }, |
— | — | @@ -57,8 +57,8 @@ |
58 | 58 | default: false, |
59 | 59 | boolean: true, |
60 | 60 | }, |
61 | | - 'jsonout': { |
62 | | - description: 'Print out a JSON serialization (default false) of parser output.', |
| 61 | + 'printwhitelist': { |
| 62 | + description: 'Print out a whitelist entry for failing tests. Default false.', |
63 | 63 | default: false, |
64 | 64 | boolean: true, |
65 | 65 | }, |
— | — | @@ -318,7 +318,7 @@ |
319 | 319 | var normalizedOut = normalizeOut(out); |
320 | 320 | var normalizedExpected = normalizeHTML(item.result); |
321 | 321 | if ( normalizedOut !== normalizedExpected ) { |
322 | | - if (argv.whiteList && |
| 322 | + if (argv.whitelist && |
323 | 323 | item.title in testWhiteList && |
324 | 324 | normalizeOut(testWhiteList[item.title]) === normalizedOut) { |
325 | 325 | if( !argv.quiet ) { |
— | — | @@ -368,9 +368,12 @@ |
369 | 369 | |
370 | 370 | console.log( colored_diff ); |
371 | 371 | |
372 | | - if(argv.jsonout) { |
373 | | - console.log("JSON of parser output:"); |
374 | | - console.log(JSON.stringify(out)); |
| 372 | + if(argv.printwhitelist) { |
| 373 | + console.log("Whitelist entry:"); |
| 374 | + console.log("testWhiteList[" + |
| 375 | + JSON.stringify(item.title) + "] = " + |
| 376 | + JSON.stringify(out) + |
| 377 | + ";"); |
375 | 378 | } |
376 | 379 | } |
377 | 380 | } else { |
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt |
— | — | @@ -355,6 +355,8 @@ |
356 | 356 | |
357 | 357 | // text start position |
358 | 358 | var textStart = 0; |
| 359 | + |
| 360 | + var linkCount = 1; |
359 | 361 | } |
360 | 362 | |
361 | 363 | start |
— | — | @@ -363,15 +365,60 @@ |
364 | 366 | } |
365 | 367 | |
366 | 368 | |
367 | | -anyblock = block / inline |
368 | | -anyblockline = block / inlineline |
| 369 | +/* All chars that cannot start syntactic structures in the middle of a line |
| 370 | + * XXX: ] and other end delimiters should probably only be activated inside |
| 371 | + * structures to avoid unnecessarily leaving the text production on plain |
| 372 | + * content. */ |
369 | 373 | |
| 374 | +text_char = [^'<~[{\n\r:\]}] |
370 | 375 | |
371 | | -// All chars that cannot start syntactic structures in the middle of a line |
372 | | -// XXX: ] and other end delimiters should probably only be activated inside |
373 | | -// structures to avoid unnecessarily leaving the text production on plain |
374 | | -// content. |
375 | | -text = t:[^'<~\][\n\r{}:]+ { return t.join(''); } |
| 376 | +text = t:text_char+ { return t.join(''); } |
| 377 | + |
| 378 | +/* Explanation of chars |
| 379 | + * ' quotes (italic/bold) |
| 380 | + * < start of xmlish_tag |
| 381 | + * ~ signatures/dates |
| 382 | + * [ start of links |
| 383 | + * { start of parser functions, transclusion and template args |
| 384 | + * \n all sort of block-level markup at start of line |
| 385 | + * \r ditto |
| 386 | + * f, g, h, i start of ftp, git/gopher, http(s), irc(s) urls
| 387 | + * m, n start of mailto/mms, news/nntp urls
| 388 | + * s, t, w start of svn, telnet, worldwind urls
| 389 | + * |
| 390 | + * The following chars are also included for now, but only apply in some |
| 391 | + * contexts and should probably be enabled only in those: |
| 392 | + * : separate definition in ; term : definition |
| 393 | + * ] end of link |
| 394 | + * } end of parser func/transclusion/template arg |
| 395 | + */ |
| 396 | + |
| 397 | +urltext = ( t:[^'<~[{\n\rfghimnstw:\]} ]+ { return t.join(''); } |
| 398 | + / urllink |
| 399 | + // Convert trailing space into a non-breaking space (&nbsp;)
| 400 | + // XXX: This should be moved to a serializer |
| 401 | + / ' ' & ':' { return "\u00a0"; } |
| 402 | + / t:text_char )+ |
| 403 | + |
| 404 | +/* |
| 405 | + '//', // for protocol-relative URLs, but not in text! |
| 406 | + 'ftp://', |
| 407 | + 'git://', |
| 408 | + 'gopher://', |
| 409 | + 'http://', |
| 410 | + 'https://', |
| 411 | + 'irc://', |
| 412 | + 'ircs://', // @bug 28503 |
| 413 | + 'mailto:', |
| 414 | + 'mms://', |
| 415 | + 'news:', |
| 416 | + 'nntp://', // @bug 3808 RFC 1738 |
| 417 | + 'svn://', |
| 418 | + 'telnet://', // Well if we're going to support the above.. -ævar |
| 419 | + 'worldwind://', |
| 420 | +*/ |
| 421 | + |
| 422 | +// Old version |
376 | 423 | //text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') } |
377 | 424 | |
378 | 425 | // Experimental tweaked version: avoid expensive single-char substrings |
— | — | @@ -463,13 +510,16 @@ |
464 | 511 | |
465 | 512 | br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} } |
466 | 513 | |
467 | | -// Syntax that stops inline expansion |
| 514 | +// Syntax stops to limit inline expansion depending on syntactic context
468 | 515 | inline_breaks |
469 | 516 | = //& { console.log(pp(syntaxFlags)); return true; } |
470 | 517 | & { return syntaxFlags['table']; } |
471 | 518 | a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; } |
| 519 | + / & { return (syntaxFlags['colon'] &&
| 520 | + ! syntaxFlags.extlink &&
| 521 | + ! syntaxFlags.linkdesc); } ":" { return true; } // ":" ends a dt term only outside ext/wiki link content
| 522 | + / & { return syntaxFlags['extlink']; } "]" { return true; } |
472 | 523 | / & { return syntaxFlags['linkdesc']; } link_end { return true; } |
473 | | - / & { return syntaxFlags['colon']; } ":" { return true; } |
474 | 524 | / & { return syntaxFlags['h']; } |
475 | 525 | ( & { return syntaxFlags['h1'] } '=' newline { return true; } |
476 | 526 | / & { return syntaxFlags['h2'] } '==' newline { return true; } |
— | — | @@ -480,7 +530,7 @@ |
481 | 531 | ) |
482 | 532 | |
483 | 533 | inline |
484 | | - = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ { |
| 534 | + = c:(urltext / inline_element / (!inline_breaks ch:. { return ch; }))+ { |
485 | 535 | var out = []; |
486 | 536 | var text = []; |
487 | 537 | c = flatten(c); |
— | — | @@ -502,7 +552,7 @@ |
503 | 553 | } |
504 | 554 | |
505 | 555 | inlineline |
506 | | - = c:(text / !inline_breaks (inline_element / [^\n]))+ { |
| 556 | + = c:(urltext / !inline_breaks (inline_element / [^\n]))+ { |
507 | 557 | var out = []; |
508 | 558 | var text = []; |
509 | 559 | c = flatten(c); |
— | — | @@ -529,10 +579,13 @@ |
530 | 580 | */ |
531 | 581 | inline_element |
532 | 582 | = comment |
533 | | - / xmlish_tag |
| 583 | + // Can actually also be block-level elements, we don't really try to enforce |
| 584 | + // a content model in the tokenizer. The HTML tree builder and DOM |
| 585 | + // transformations are better equipped to deal with it. |
| 586 | + / xmlish_tag |
| 587 | + / template |
| 588 | + / wikilink |
534 | 589 | / extlink |
535 | | - / template |
536 | | - / link |
537 | 590 | / quote |
538 | 591 | |
539 | 592 | /* Headings */ |
— | — | @@ -635,24 +688,68 @@ |
636 | 689 | = c:[^-] { return c; } |
637 | 690 | / c:'-' !'->' { return c; } |
638 | 691 | |
639 | | -extlink |
640 | | - = "[" target:url " " text:extlink_text "]" { |
| 692 | + |
| 693 | +urllink |
| 694 | + = target:url { |
641 | 695 | return [ { type: 'TAG', |
642 | 696 | name: 'a', |
643 | | - attribs: [['href', target]] } ] |
644 | | - .concat( text |
645 | | - , [{type: 'ENDTAG', name: 'a'}]); |
| 697 | + attribs: [['href', target]] } |
| 698 | + , {type: 'TEXT', value: target} |
| 699 | + , {type: 'ENDTAG', name: 'a'} |
| 700 | + ]; |
646 | 701 | } |
647 | 702 | |
648 | | -// = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } } |
| 703 | +extlink |
| 704 | + = "[" |
| 705 | + & { return setFlag('extlink'); } |
| 706 | + target:url |
| 707 | + space* |
| 708 | + text:inlineline? |
| 709 | + "]" { |
| 710 | + clearFlag('extlink'); |
| 711 | + if ( text == '' ) { |
| 712 | + // XXX: Link numbering should be implemented in post-processor. |
| 713 | + text = [{type: 'TEXT', value: "[" + linkCount + "]"}]; |
| 714 | + linkCount++; |
| 715 | + } |
| 716 | + return [ { type: 'TAG', |
| 717 | + name: 'a', |
| 718 | + attribs: [['href', target]] } ] |
| 719 | + .concat( text |
| 720 | + , [{type: 'ENDTAG', name: 'a'}]); |
| 721 | + } |
| 722 | + / "[" & { clearFlag('extlink'); return false; } |
649 | 723 | |
| 724 | +/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
| 725 | + * be configured dynamically. */ |
| 726 | +url_protocol |
| 727 | + = '//' // for protocol-relative URLs |
| 728 | + / 'ftp://' |
| 729 | + / 'git://' |
| 730 | + / 'gopher://' |
| 731 | + / 'http://' |
| 732 | + / 'https://' |
| 733 | + / 'irc://' |
| 734 | + / 'ircs://' // @bug 28503 |
| 735 | + / 'mailto:' |
| 736 | + / 'mms://' |
| 737 | + / 'news:' |
| 738 | + / 'nntp://' // @bug 3808 RFC 1738 |
| 739 | + / 'svn://' |
| 740 | + / 'telnet://' // Well if we're going to support the above.. -ævar |
| 741 | + / 'worldwind://' |
| 742 | + |
| 743 | +unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] |
| 744 | + |
650 | 745 | url |
651 | | - = proto:"http:" rest:([^ \]\n]+) { return proto + rest.join(''); } |
| 746 | + = proto:url_protocol |
| 747 | + rest:( [^ :\]\[\n<>\x00-\x20\x7f,.] |
| 748 | + / s:[.:,] !(space / eolf) { return s } )+ |
| 749 | +{ |
| 750 | + return proto + rest.join(''); |
| 751 | +} |
| 752 | +//[^][<>"\\x00-\\x20\\x7F\p{Zs}] |
652 | 753 | |
653 | | -// XXX: convert to syntax flags and generic inline! |
654 | | -extlink_text |
655 | | - = c:(inline_element / ch:[^\]\n] { return {type: 'TEXT', value: ch}; })+ |
656 | | - |
657 | 754 | template |
658 | 755 | = "{{" target:template_target params:("|" p:template_param { return p })* "}}" { |
659 | 756 | var obj = { type: 'TAG', name: 'template', attribs: [['target', target]] } |
— | — | @@ -700,11 +797,11 @@ |
701 | 798 | / xmlish_tag |
702 | 799 | / extlink |
703 | 800 | / template |
704 | | - / link |
| 801 | + / wikilink |
705 | 802 | / quote |
706 | 803 | / !"}}" x:([^|\n]) { return x } |
707 | 804 | |
708 | | -link |
| 805 | +wikilink |
709 | 806 | = "[[" target:link_target text:("|" link_text)* "]]" { |
710 | 807 | var obj = { |
711 | 808 | type: 'TAG', |
— | — | @@ -984,38 +1081,36 @@ |
985 | 1082 | , c ]; |
986 | 1083 | } |
987 | 1084 | |
988 | | -dtdd = bullets:(!(";" !list_char) list_char)* |
989 | | - ";" |
990 | | - & {return setFlag('colon');} |
991 | | - c:inlineline |
992 | | - ":" |
993 | | - // Fortunately dtdds cannot be nested, so we can simply set the flag |
994 | | - // back to 0 to disable it. |
995 | | - & {syntaxFlags['colon'] = 0; return true;} |
996 | | - d:inlineline |
997 | | - &eolf |
998 | | -{ |
999 | | - // Convert trailing space into |
1000 | | - // XXX: This should be moved to a serializer |
1001 | | - var clen = c.length; |
1002 | | - if (clen && c[clen - 1].type === 'TEXT') { |
1003 | | - var val = c[clen - 1].value; |
1004 | | - if(val.length && val[val.length - 1] == ' ') { |
1005 | | - c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0"; |
1006 | | - } |
| 1085 | +dtdd |
| 1086 | + = bullets:(!(";" !list_char) list_char)* |
| 1087 | + ";" |
| 1088 | + & {return setFlag('colon');} |
| 1089 | + c:inlineline |
| 1090 | + ":" |
| 1091 | + // Fortunately dtdds cannot be nested, so we can simply set the flag |
| 1092 | + // back to 0 to disable it. |
| 1093 | + & {syntaxFlags['colon'] = 0; return true;} |
| 1094 | + d:inlineline |
| 1095 | + &eolf { |
| 1096 | + // Convert trailing space into a non-breaking space (&nbsp;)
| 1097 | + // XXX: This should be moved to a serializer |
| 1098 | + //var clen = c.length; |
| 1099 | + //if (clen && c[clen - 1].type === 'TEXT') { |
| 1100 | + // var val = c[clen - 1].value; |
| 1101 | + // if(val.length && val[val.length - 1] == ' ') { |
| 1102 | + // c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0"; |
| 1103 | + // } |
| 1104 | + //} |
| 1105 | + |
| 1106 | + return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ] |
| 1107 | + .concat( c |
| 1108 | + ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ] |
| 1109 | + , d ); |
1007 | 1110 | } |
| 1111 | + // Fall-back case to clear the colon flag |
| 1112 | + / & { return true; } { syntaxFlags['colon'] = 0; return null; } |
1008 | 1113 | |
1009 | | - return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ] |
1010 | | - .concat( c |
1011 | | - ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ] |
1012 | | - , d ); |
1013 | | -} |
1014 | 1114 | |
1015 | | -/ bullets:(!(";" !list_char) list_char)* |
1016 | | - ";" |
1017 | | - {syntaxFlags['colon'] = 0; return null; } |
1018 | | - |
1019 | | - |
1020 | 1115 | list_char = [*#:;] |
1021 | 1116 | |
1022 | 1117 | |
— | — | @@ -1094,7 +1189,7 @@ |
1095 | 1190 | ("||" / newline "|") |
1096 | 1191 | ! [}+-] |
1097 | 1192 | a:thtd_attribs? |
1098 | | - td:(!inline_breaks anyblock)* { |
| 1193 | + td:(!inline_breaks block)* { |
1099 | 1194 | dp("table data result: " + pp(td) + ", attribts: " + pp(a)); |
1100 | 1195 | return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}] |
1101 | 1196 | .concat(td, [{type: 'ENDTAG', name: 'td'}]); |