Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js |
— | — | @@ -147,7 +147,10 @@ |
148 | 148 | .getElementsByTagName('body')[0] |
149 | 149 | .innerHTML |
150 | 150 | // a few things we ignore for now.. |
151 | | - .replace(/(title|class|rel)="[^"]+"/g, ''); |
| 151 | + .replace(/\/wiki\/Main_Page/g, 'Main Page') |
| 152 | + .replace(/(title|class|rel)="[^"]+"/g, '') |
| 153 | + .replace(/<a +href/g, '<a href') |
| 154 | + .replace(/" +>/g, '">'); |
152 | 155 | } catch(e) { |
153 | 156 | console.log("normalizeHTML failed on" + |
154 | 157 | source + " with the following error: " + e); |
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js |
— | — | @@ -43,7 +43,7 @@ |
44 | 44 | var child = cnodes[i - deleted], |
45 | 45 | ctype = child.nodeType; |
46 | 46 | //console.log(child + ctype); |
47 | | - if (ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) || |
| 47 | + if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) || |
48 | 48 | (ctype !== 3 && // text |
49 | 49 | ctype !== 8 && // comment |
50 | 50 | !isBlock(child.nodeName))) { |
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt |
— | — | @@ -107,6 +107,7 @@ |
108 | 108 | out.push({type: 'TAG', name: newName}); |
109 | 109 | endtags.push({type: 'ENDTAG', name: newName}); |
110 | 110 | } else { |
| 111 | + popTags(bs.length - prefixLen); |
111 | 112 | for(var i = prefixLen; i < bn.length; i++) { |
112 | 113 | switch (bn[i]) { |
113 | 114 | case '*': |
— | — | @@ -351,6 +352,9 @@ |
352 | 353 | var isEOF = function (pos) { |
353 | 354 | return pos === inputLength; |
354 | 355 | }; |
| 356 | + |
| 357 | + // text start position |
| 358 | + var textStart = 0; |
355 | 359 | } |
356 | 360 | |
357 | 361 | start |
— | — | @@ -363,9 +367,24 @@ |
364 | 368 | anyblockline = block / inlineline |
365 | 369 | |
366 | 370 | |
367 | | -// All chars that cannot start syntactic structures |
368 | | -text = t:[A-Za-z0-9,._ -]+ { return t.join('') } |
| 371 | +// All chars that cannot start syntactic structures in the middle of a line |
| 372 | +// XXX: ] and other end delimiters should probably only be activated inside |
| 373 | +// structures to avoid unnecessarily leaving the text production on plain |
| 374 | +// content. |
| 375 | +text = t:[^'<~\][\n\r{}]+ { return t.join(''); } |
| 376 | +//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') } |
369 | 377 | |
| 378 | +// Experimental tweaked version: avoid expensive single-char substrings |
| 379 | +// This did not bring the expected performance boost, however. |
| 380 | +//text = [A-Za-z0-9,._ -] { |
| 381 | +// textStart = pos; |
| 382 | +// |
| 383 | +// var res = input.substr(textStart - 1, inputLength) |
| 384 | +// .match(/[A-Za-z0-9,._ -]+/)[0]; |
| 385 | +// pos = pos + (res.length - 1); |
| 386 | +// return res |
| 387 | +// } |
| 388 | + |
370 | 389 | space |
371 | 390 | = s:[ \t]+ { return s.join(''); } |
372 | 391 | |
— | — | @@ -403,18 +422,18 @@ |
404 | 423 | = block_lines |
405 | 424 | / pre |
406 | 425 | / comment &eolf |
| 426 | + / nowiki |
407 | 427 | / pre |
408 | 428 | / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag |
409 | 429 | / para |
410 | 430 | / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor |
411 | | - / (s:sol { |
412 | | - if (s) { |
413 | | - return [s, {type: 'NEWLINE'}]; |
414 | | - } else { |
415 | | - return [{type: 'NEWLINE'}]; |
| 431 | + / s:sol { |
| 432 | + if (s) { |
| 433 | + return [s, {type: 'NEWLINE'}]; |
| 434 | + } else { |
| 435 | + return [{type: 'NEWLINE'}]; |
| 436 | + } |
416 | 437 | } |
417 | | - } |
418 | | - ) |
419 | 438 | |
420 | 439 | block_lines |
421 | 440 | = s:sol |
— | — | @@ -434,6 +453,87 @@ |
435 | 454 | / pre |
436 | 455 | |
437 | 456 | |
| 457 | + |
| 458 | + |
| 459 | +// TODO: convert inline content to annotations! |
| 460 | +para |
| 461 | + = s1:sol s2:sol c:inlineline { |
| 462 | + return s1.concat(s2, [{type: 'TAG', name: 'p'}], c); |
| 463 | + } |
| 464 | + |
| 465 | +br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} } |
| 466 | + |
| 467 | +// Syntax that stops inline expansion |
| 468 | +inline_breaks |
| 469 | + = //& { console.log(pp(syntaxFlags)); return true; } |
| 470 | + & { return syntaxFlags['table']; } |
| 471 | + a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; } |
| 472 | + / & { return syntaxFlags['linkdesc']; } link_end { return true; } |
| 473 | + / & { return syntaxFlags['h']; } |
| 474 | + ( & { return syntaxFlags['h1'] } '=' newline { return true; } |
| 475 | + / & { return syntaxFlags['h2'] } '==' newline { return true; } |
| 476 | + / & { return syntaxFlags['h3'] } '===' newline { return true; } |
| 477 | + / & { return syntaxFlags['h4'] } '====' newline { return true; } |
| 478 | + / & { return syntaxFlags['h5'] } '=====' newline { return true; } |
| 479 | + / & { return syntaxFlags['h6'] } '======' newline { return true; } |
| 480 | + ) |
| 481 | + |
| 482 | +inline |
| 483 | + = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ { |
| 484 | + var out = []; |
| 485 | + var text = []; |
| 486 | + c = flatten(c); |
| 487 | + for (var i = 0; i < c.length; i++) { |
| 488 | + if (typeof c[i] == 'string') { |
| 489 | + text.push(c[i]); |
| 490 | + } else { |
| 491 | + if (text.length) { |
| 492 | + out.push({ type: "TEXT", value: text.join('') }); |
| 493 | + text = []; |
| 494 | + } |
| 495 | + out.concat(c[i]); |
| 496 | + } |
| 497 | + } |
| 498 | + if (text.length) { |
| 499 | + out.push({ type: 'TEXT', value: text.join('') }); |
| 500 | + } |
| 501 | + return out; |
| 502 | +} |
| 503 | + |
| 504 | +inlineline |
| 505 | + = c:(text / !inline_breaks (inline_element / [^\n]))+ { |
| 506 | + var out = []; |
| 507 | + var text = []; |
| 508 | + c = flatten(c); |
| 509 | + for (var i = 0; i < c.length; i++) { |
| 510 | + if (typeof c[i] == 'string') { |
| 511 | + text.push(c[i]); |
| 512 | + } else { |
| 513 | + if (text.length) { |
| 514 | + out.push({type: 'TEXT', value: text.join('')}); |
| 515 | + text = []; |
| 516 | + } |
| 517 | + out.push(c[i]); |
| 518 | + } |
| 519 | + } |
| 520 | + if (text.length) { |
| 521 | + out.push({type: 'TEXT', value: text.join('')}); |
| 522 | + } |
| 523 | + //dp('inlineline out:', pp(out)); |
| 524 | + return out; |
| 525 | +} |
| 526 | + |
| 527 | +/* TODO: convert all these to annotations! |
| 528 | + * -> need (start, end) offsets within block |
| 529 | + */ |
| 530 | +inline_element |
| 531 | + = comment |
| 532 | + / xmlish_tag |
| 533 | + / extlink |
| 534 | + / template |
| 535 | + / link |
| 536 | + / quote |
| 537 | + |
438 | 538 | /* Headings */ |
439 | 539 | h = h1 / h2 / h3 / h4 / h5 / h6 |
440 | 540 | |
— | — | @@ -513,15 +613,6 @@ |
514 | 614 | heading_text |
515 | 615 | = h:( !(heading_marker newline) x:inlineline { return x } )* { return h.join(''); } |
516 | 616 | |
517 | | - |
518 | | -// TODO: convert inline content to annotations! |
519 | | -para |
520 | | - = s1:sol s2:sol c:inlineline { |
521 | | - return s1.concat(s2, [{type: 'TAG', name: 'p'}], c); |
522 | | - } |
523 | | - |
524 | | -br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} } |
525 | | - |
526 | 617 | pre_indent |
527 | 618 | = l:pre_indent_line ls:(sol pre_indent_line)* { |
528 | 619 | return [{type: 'TAG', name: 'pre'}] |
— | — | @@ -532,79 +623,7 @@ |
533 | 624 | return [{type: 'TEXT', value: '\n'}].concat(l); |
534 | 625 | } |
535 | 626 | |
536 | | -// Syntax that stops inline expansion |
537 | | -inline_breaks |
538 | | - = //& { console.log(pp(syntaxFlags)); return true; } |
539 | | - & { return syntaxFlags['table']; } |
540 | | - a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; } |
541 | | - / & { return syntaxFlags['linkdesc']; } link_end { return true; } |
542 | | - / & { return syntaxFlags['h']; } |
543 | | - ( & { return syntaxFlags['h1'] } '=' newline { return true; } |
544 | | - / & { return syntaxFlags['h2'] } '==' newline { return true; } |
545 | | - / & { return syntaxFlags['h3'] } '===' newline { return true; } |
546 | | - / & { return syntaxFlags['h4'] } '====' newline { return true; } |
547 | | - / & { return syntaxFlags['h5'] } '=====' newline { return true; } |
548 | | - / & { return syntaxFlags['h6'] } '======' newline { return true; } |
549 | | - ) |
550 | 627 | |
551 | | - |
552 | | - |
553 | | -inline |
554 | | - = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ { |
555 | | - var out = []; |
556 | | - var text = []; |
557 | | - c = flatten(c); |
558 | | - for (var i = 0; i < c.length; i++) { |
559 | | - if (typeof c[i] == 'string') { |
560 | | - text.push(c[i]); |
561 | | - } else { |
562 | | - if (text.length) { |
563 | | - out.push({ type: "TEXT", value: text.join('') }); |
564 | | - text = []; |
565 | | - } |
566 | | - out.concat(c[i]); |
567 | | - } |
568 | | - } |
569 | | - if (text.length) { |
570 | | - out.push({ type: 'TEXT', value: text.join('') }); |
571 | | - } |
572 | | - return out; |
573 | | -} |
574 | | - |
575 | | -inlineline |
576 | | - = c:(text / !inline_breaks (inline_element / [^\n]))+ { |
577 | | - var out = []; |
578 | | - var text = []; |
579 | | - c = flatten(c); |
580 | | - for (var i = 0; i < c.length; i++) { |
581 | | - if (typeof c[i] == 'string') { |
582 | | - text.push(c[i]); |
583 | | - } else { |
584 | | - if (text.length) { |
585 | | - out.push({type: 'TEXT', value: text.join('')}); |
586 | | - text = []; |
587 | | - } |
588 | | - out.push(c[i]); |
589 | | - } |
590 | | - } |
591 | | - if (text.length) { |
592 | | - out.push({type: 'TEXT', value: text.join('')}); |
593 | | - } |
594 | | - //dp('inlineline out:', pp(out)); |
595 | | - return out; |
596 | | -} |
597 | | - |
598 | | -/* TODO: convert all these to annotations! |
599 | | - * -> need (start, end) offsets within block |
600 | | - */ |
601 | | -inline_element |
602 | | - = comment |
603 | | - / xmlish_tag |
604 | | - / extlink |
605 | | - / template |
606 | | - / link |
607 | | - / quote |
608 | | - |
609 | 628 | comment |
610 | 629 | = '<!--' c:comment_chars* ('-->' / eof) |
611 | 630 | cs:(space* newline space* cn:comment { return cn })* { |
— | — | @@ -633,14 +652,20 @@ |
634 | 653 | = c:(inline_element / ch:[^\]] { return {type: 'TEXT', value: ch}; })+ |
635 | 654 | |
636 | 655 | template |
637 | | - = "{{" target:link_target params:("|" p:template_param { return p })* "}}" { |
638 | | - var obj = { type: 'SELFCLOSINGTAG', name: 'template', attribs: [['target', target]] } |
| 656 | + = "{{" target:template_target params:("|" p:template_param { return p })* "}}" { |
| 657 | + var obj = { type: 'TAG', name: 'template', attribs: [['target', target]] } |
639 | 658 | if (params && params.length) { |
640 | 659 | obj.attribs.push(params); |
641 | 660 | } |
642 | | - return obj; |
| 661 | + // Should actually use a self-closing tag here, but the Node HTML5 |
| 662 | + // parser only recognizes known self-closing tags for now, so use an |
| 663 | + // explicit end tag for now. |
| 664 | + return [obj, {type: 'ENDTAG', name: 'template'}]; |
643 | 665 | } |
644 | 666 | |
| 667 | +template_target |
| 668 | + = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); } |
| 669 | + |
645 | 670 | template_param |
646 | 671 | = name:template_param_name "=" c:template_param_text { |
647 | 672 | return [name, c]; |
— | — | @@ -739,26 +764,30 @@ |
740 | 765 | return [ {type: 'TAG', name: 'pre', attribs: attribs} ] |
741 | 766 | .concat(ts, [{type: 'ENDTAG', name: 'pre'}]); |
742 | 767 | } |
743 | | - /// "</pre>" { return {type: 'TEXT', value: "</pre>"}; } |
| 768 | + / "</pre>" { return {type: 'TEXT', value: "</pre>"}; } |
744 | 769 | |
745 | 770 | nowiki |
746 | | - = "<nowiki>" nc:nowiki_content "</nowiki>" { return nc } |
| 771 | + = "<nowiki>" nc:nowiki_content "</nowiki>" { |
| 772 | + // console.log(pp(nc)); |
| 773 | + return nc; |
| 774 | + } |
747 | 775 | / "<nowiki>" { |
748 | | - // console.log('nowiki fallback'); |
| 776 | + //console.log('nowiki fallback'); |
749 | 777 | return [{type: 'TEXT', value: '<nowiki>'}]; |
750 | 778 | } |
751 | 779 | / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; } |
752 | 780 | |
753 | 781 | nowiki_content |
754 | | - = ts:( t:[^<]+ { return t.join('') } |
755 | | - / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" { |
756 | | - //console.log('nested pre in nowiki'); |
757 | | - return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join(''); |
758 | | - } |
759 | | - / (!("</nowiki>" / "</pre>") c:. {return c}) )* { |
760 | | - // return nowiki tags as well? |
761 | | - return [{type: 'TEXT', value: ts.join('')}]; |
762 | | - } |
| 782 | + = ts:( t:[^<]+ { return t.join('') } |
| 783 | + / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" { |
| 784 | + //console.log('nested pre in nowiki'); |
| 785 | + return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join(''); |
| 786 | + } |
| 787 | + / (!("</nowiki>" / "</pre>") c:. {return c}) |
| 788 | + )* { |
| 789 | + // return nowiki tags as well? |
| 790 | + return [{type: 'TEXT', value: ts.join('')}]; |
| 791 | + } |
763 | 792 | |
764 | 793 | // See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and |
765 | 794 | // following paragraphs |
— | — | @@ -953,30 +982,24 @@ |
954 | 983 | , c ]; |
955 | 984 | } |
956 | 985 | |
957 | | -dtdd = bullets:list_char+ |
| 986 | +dtdd = bullets:(!(";" !list_char) list_char)* |
| 987 | + ";" |
| 988 | + // XXX: convert to inline stops! |
958 | 989 | c:(inline_element / (n:[^:\n] { return {type: 'TEXT', value: n}; }))+ |
959 | 990 | ":" |
960 | | - d:(inline_element / (n:[^\n] { return {type: 'TEXT', value: n}; }))+ |
| 991 | + d:inlineline |
961 | 992 | &eolf |
962 | 993 | { |
963 | | - // reject rule if bullets do not end in semicolon |
964 | | - if (bullets[bullets.length - 1] != ';') { |
965 | | - return null; |
966 | | - } else { |
967 | | - var dtbullets = bullets.slice(0, bullets.length - 1); |
968 | | - dtbullets.push(':'); |
969 | | - |
970 | | - // convert trailing space into |
971 | | - var clen = c.length; |
972 | | - if (clen && c[clen - 1].type === 'TEXT' && c[clen - 1].value == ' ') { |
973 | | - c[clen - 1].value = "\u00a0"; |
974 | | - } |
975 | | - |
976 | | - return [ { type: 'TAG', name: 'listItem', bullets: bullets } ] |
977 | | - .concat( c |
978 | | - ,[{ type: 'TAG', name: 'listItem', bullets: dtbullets } ] |
979 | | - , d ); |
| 994 | + // convert trailing space into |
| 995 | + var clen = c.length; |
| 996 | + if (clen && c[clen - 1].type === 'TEXT' && c[clen - 1].value == ' ') { |
| 997 | + c[clen - 1].value = "\u00a0"; |
980 | 998 | } |
| 999 | + |
| 1000 | + return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ] |
| 1001 | + .concat( c |
| 1002 | + ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ] |
| 1003 | + , d ); |
981 | 1004 | } |
982 | 1005 | |
983 | 1006 | |
— | — | @@ -1082,38 +1105,5 @@ |
1083 | 1106 | table_end = newline? "|}" { clearFlag('table'); } |
1084 | 1107 | |
1085 | 1108 | |
1086 | | -/* Wikidom TODO: |
1087 | | - * split off text into content nodes |
1088 | | - * convert inlines into annotations |
1089 | | - * change contents into children |
1090 | | - * |
1091 | | - * { text: text, |
1092 | | - * annotations: [(normal annotations)], |
1093 | | - * maybeannotations: [ |
1094 | | - * { type: 'something', |
1095 | | - * side: MA_START, |
1096 | | - * tag: { start: x, length: y } |
1097 | | - * } |
1098 | | - * ] |
1099 | | - * } |
1100 | | - * offsets in annotations: presume maybeannotations are actually text |
1101 | | - * -> need to transform annotations if match found |
1102 | | - * -> format annotations, comments can run to the end (re-opened after |
1103 | | - * block-level tags); only closed on table cells, object,? |
1104 | | - * -> other annotations (images, templates etc) are limited by block-level |
1105 | | - * elements, tightly bound |
1106 | | - * |
1107 | | - * Block-level elements |
1108 | | - * -------------------- |
1109 | | - * - Need some early clean-up to provide structure and offsets |
1110 | | - * - Establish scope limits for some inlines |
1111 | | - * - Line-based balanced by construction |
1112 | | - * - HTML tags need balancing/ matching / implicit close |
1113 | | - * - content in illegal places (e.g. between table and td tags) needs foster |
1114 | | - * parenting |
1115 | | - * - grammar will match outermost pair if unmatched pairs are recognized as |
1116 | | - * tokens (or as text) |
1117 | | - * - post-processing needed, but has to be limited by scope |
1118 | | - */ |
1119 | 1109 | /* Tabs do not mix well with the hybrid production syntax */ |
1120 | 1110 | /* vim: filetype=javascript:expandtabs:ts=4:sw=4:cindent */ |