r104677 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r104676 | r104677 | r104678 >
Date:13:40, 30 November 2011
Author:gwicke
Status:deferred
Tags:
Comment:
A bit of tokenizer grammar clean-up and additional expected-html
normalization. 99 parser tests now passing.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -147,7 +147,10 @@
148148 .getElementsByTagName('body')[0]
149149 .innerHTML
150150 // a few things we ignore for now..
151 - .replace(/(title|class|rel)="[^"]+"/g, '');
 151+ .replace(/\/wiki\/Main_Page/g, 'Main Page')
 152+ .replace(/(title|class|rel)="[^"]+"/g, '')
 153+ .replace(/<a +href/g, '<a href')
 154+ .replace(/" +>/g, '">');
152155 } catch(e) {
153156 console.log("normalizeHTML failed on" +
154157 source + " with the following error: " + e);
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js
@@ -43,7 +43,7 @@
4444 var child = cnodes[i - deleted],
4545 ctype = child.nodeType;
4646 //console.log(child + ctype);
47 - if (ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
 47+ if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
4848 (ctype !== 3 && // text
4949 ctype !== 8 && // comment
5050 !isBlock(child.nodeName))) {
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
@@ -107,6 +107,7 @@
108108 out.push({type: 'TAG', name: newName});
109109 endtags.push({type: 'ENDTAG', name: newName});
110110 } else {
 111+ popTags(bs.length - prefixLen);
111112 for(var i = prefixLen; i < bn.length; i++) {
112113 switch (bn[i]) {
113114 case '*':
@@ -351,6 +352,9 @@
352353 var isEOF = function (pos) {
353354 return pos === inputLength;
354355 };
 356+
 357+ // text start position
 358+ var textStart = 0;
355359 }
356360
357361 start
@@ -363,9 +367,24 @@
364368 anyblockline = block / inlineline
365369
366370
367 -// All chars that cannot start syntactic structures
368 -text = t:[A-Za-z0-9,._ -]+ { return t.join('') }
 371+// All chars that cannot start syntactic structures in the middle of a line
 372+// XXX: ] and other end delimiters should probably only be activated inside
 373+// structures to avoid unnecessarily leaving the text production on plain
 374+// content.
 375+text = t:[^'<~\][\n\r{}]+ { return t.join(''); }
 376+//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
369377
 378+// Experimental tweaked version: avoid expensive single-char substrings
 379+// This did not bring the expected performance boost, however.
 380+//text = [A-Za-z0-9,._ -] {
 381+// textStart = pos;
 382+//
 383+// var res = input.substr(textStart - 1, inputLength)
 384+// .match(/[A-Za-z0-9,._ -]+/)[0];
 385+// pos = pos + (res.length - 1);
 386+// return res
 387+// }
 388+
370389 space
371390 = s:[ \t]+ { return s.join(''); }
372391
@@ -403,18 +422,18 @@
404423 = block_lines
405424 / pre
406425 / comment &eolf
 426+ / nowiki
407427 / pre
408428 / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
409429 / para
410430 / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
411 - / (s:sol {
412 - if (s) {
413 - return [s, {type: 'NEWLINE'}];
414 - } else {
415 - return [{type: 'NEWLINE'}];
 431+ / s:sol {
 432+ if (s) {
 433+ return [s, {type: 'NEWLINE'}];
 434+ } else {
 435+ return [{type: 'NEWLINE'}];
 436+ }
416437 }
417 - }
418 - )
419438
420439 block_lines
421440 = s:sol
@@ -434,6 +453,87 @@
435454 / pre
436455
437456
 457+
 458+
 459+// TODO: convert inline content to annotations!
 460+para
 461+ = s1:sol s2:sol c:inlineline {
 462+ return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
 463+ }
 464+
 465+br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
 466+
 467+// Syntax that stops inline expansion
 468+inline_breaks
 469+ = //& { console.log(pp(syntaxFlags)); return true; }
 470+ & { return syntaxFlags['table']; }
 471+ a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; }
 472+ / & { return syntaxFlags['linkdesc']; } link_end { return true; }
 473+ / & { return syntaxFlags['h']; }
 474+ ( & { return syntaxFlags['h1'] } '=' newline { return true; }
 475+ / & { return syntaxFlags['h2'] } '==' newline { return true; }
 476+ / & { return syntaxFlags['h3'] } '===' newline { return true; }
 477+ / & { return syntaxFlags['h4'] } '====' newline { return true; }
 478+ / & { return syntaxFlags['h5'] } '=====' newline { return true; }
 479+ / & { return syntaxFlags['h6'] } '======' newline { return true; }
 480+ )
 481+
 482+inline
 483+ = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
 484+ var out = [];
 485+ var text = [];
 486+ c = flatten(c);
 487+ for (var i = 0; i < c.length; i++) {
 488+ if (typeof c[i] == 'string') {
 489+ text.push(c[i]);
 490+ } else {
 491+ if (text.length) {
 492+ out.push({ type: "TEXT", value: text.join('') });
 493+ text = [];
 494+ }
 495+ out.concat(c[i]);
 496+ }
 497+ }
 498+ if (text.length) {
 499+ out.push({ type: 'TEXT', value: text.join('') });
 500+ }
 501+ return out;
 502+}
 503+
 504+inlineline
 505+ = c:(text / !inline_breaks (inline_element / [^\n]))+ {
 506+ var out = [];
 507+ var text = [];
 508+ c = flatten(c);
 509+ for (var i = 0; i < c.length; i++) {
 510+ if (typeof c[i] == 'string') {
 511+ text.push(c[i]);
 512+ } else {
 513+ if (text.length) {
 514+ out.push({type: 'TEXT', value: text.join('')});
 515+ text = [];
 516+ }
 517+ out.push(c[i]);
 518+ }
 519+ }
 520+ if (text.length) {
 521+ out.push({type: 'TEXT', value: text.join('')});
 522+ }
 523+ //dp('inlineline out:', pp(out));
 524+ return out;
 525+}
 526+
 527+/* TODO: convert all these to annotations!
 528+ * -> need (start, end) offsets within block
 529+ */
 530+inline_element
 531+ = comment
 532+ / xmlish_tag
 533+ / extlink
 534+ / template
 535+ / link
 536+ / quote
 537+
438538 /* Headings */
439539 h = h1 / h2 / h3 / h4 / h5 / h6
440540
@@ -513,15 +613,6 @@
514614 heading_text
515615 = h:( !(heading_marker newline) x:inlineline { return x } )* { return h.join(''); }
516616
517 -
518 -// TODO: convert inline content to annotations!
519 -para
520 - = s1:sol s2:sol c:inlineline {
521 - return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
522 - }
523 -
524 -br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
525 -
526617 pre_indent
527618 = l:pre_indent_line ls:(sol pre_indent_line)* {
528619 return [{type: 'TAG', name: 'pre'}]
@@ -532,79 +623,7 @@
533624 return [{type: 'TEXT', value: '\n'}].concat(l);
534625 }
535626
536 -// Syntax that stops inline expansion
537 -inline_breaks
538 - = //& { console.log(pp(syntaxFlags)); return true; }
539 - & { return syntaxFlags['table']; }
540 - a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; }
541 - / & { return syntaxFlags['linkdesc']; } link_end { return true; }
542 - / & { return syntaxFlags['h']; }
543 - ( & { return syntaxFlags['h1'] } '=' newline { return true; }
544 - / & { return syntaxFlags['h2'] } '==' newline { return true; }
545 - / & { return syntaxFlags['h3'] } '===' newline { return true; }
546 - / & { return syntaxFlags['h4'] } '====' newline { return true; }
547 - / & { return syntaxFlags['h5'] } '=====' newline { return true; }
548 - / & { return syntaxFlags['h6'] } '======' newline { return true; }
549 - )
550627
551 -
552 -
553 -inline
554 - = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
555 - var out = [];
556 - var text = [];
557 - c = flatten(c);
558 - for (var i = 0; i < c.length; i++) {
559 - if (typeof c[i] == 'string') {
560 - text.push(c[i]);
561 - } else {
562 - if (text.length) {
563 - out.push({ type: "TEXT", value: text.join('') });
564 - text = [];
565 - }
566 - out.concat(c[i]);
567 - }
568 - }
569 - if (text.length) {
570 - out.push({ type: 'TEXT', value: text.join('') });
571 - }
572 - return out;
573 -}
574 -
575 -inlineline
576 - = c:(text / !inline_breaks (inline_element / [^\n]))+ {
577 - var out = [];
578 - var text = [];
579 - c = flatten(c);
580 - for (var i = 0; i < c.length; i++) {
581 - if (typeof c[i] == 'string') {
582 - text.push(c[i]);
583 - } else {
584 - if (text.length) {
585 - out.push({type: 'TEXT', value: text.join('')});
586 - text = [];
587 - }
588 - out.push(c[i]);
589 - }
590 - }
591 - if (text.length) {
592 - out.push({type: 'TEXT', value: text.join('')});
593 - }
594 - //dp('inlineline out:', pp(out));
595 - return out;
596 -}
597 -
598 -/* TODO: convert all these to annotations!
599 - * -> need (start, end) offsets within block
600 - */
601 -inline_element
602 - = comment
603 - / xmlish_tag
604 - / extlink
605 - / template
606 - / link
607 - / quote
608 -
609628 comment
610629 = '<!--' c:comment_chars* ('-->' / eof)
611630 cs:(space* newline space* cn:comment { return cn })* {
@@ -633,14 +652,20 @@
634653 = c:(inline_element / ch:[^\]] { return {type: 'TEXT', value: ch}; })+
635654
636655 template
637 - = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
638 - var obj = { type: 'SELFCLOSINGTAG', name: 'template', attribs: [['target', target]] }
 656+ = "{{" target:template_target params:("|" p:template_param { return p })* "}}" {
 657+ var obj = { type: 'TAG', name: 'template', attribs: [['target', target]] }
639658 if (params && params.length) {
640659 obj.attribs.push(params);
641660 }
642 - return obj;
 661+ // Should actually use a self-closing tag here, but the Node HTML5
 662+ // parser only recognizes known self-closing tags for now, so use an
 663+ // explicit end tag for now.
 664+ return [obj, {type: 'ENDTAG', name: 'template'}];
643665 }
644666
 667+template_target
 668+ = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }
 669+
645670 template_param
646671 = name:template_param_name "=" c:template_param_text {
647672 return [name, c];
@@ -739,26 +764,30 @@
740765 return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
741766 .concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
742767 }
743 - /// "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
 768+ / "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
744769
745770 nowiki
746 - = "<nowiki>" nc:nowiki_content "</nowiki>" { return nc }
 771+ = "<nowiki>" nc:nowiki_content "</nowiki>" {
 772+ // console.log(pp(nc));
 773+ return nc;
 774+ }
747775 / "<nowiki>" {
748 - // console.log('nowiki fallback');
 776+ //console.log('nowiki fallback');
749777 return [{type: 'TEXT', value: '<nowiki>'}];
750778 }
751779 / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
752780
753781 nowiki_content
754 - = ts:( t:[^<]+ { return t.join('') }
755 - / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
756 - //console.log('nested pre in nowiki');
757 - return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
758 - }
759 - / (!("</nowiki>" / "</pre>") c:. {return c}) )* {
760 - // return nowiki tags as well?
761 - return [{type: 'TEXT', value: ts.join('')}];
762 - }
 782+ = ts:( t:[^<]+ { return t.join('') }
 783+ / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
 784+ //console.log('nested pre in nowiki');
 785+ return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
 786+ }
 787+ / (!("</nowiki>" / "</pre>") c:. {return c})
 788+ )* {
 789+ // return nowiki tags as well?
 790+ return [{type: 'TEXT', value: ts.join('')}];
 791+ }
763792
764793 // See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
765794 // following paragraphs
@@ -953,30 +982,24 @@
954983 , c ];
955984 }
956985
957 -dtdd = bullets:list_char+
 986+dtdd = bullets:(!(";" !list_char) list_char)*
 987+ ";"
 988+ // XXX: convert to inline stops!
958989 c:(inline_element / (n:[^:\n] { return {type: 'TEXT', value: n}; }))+
959990 ":"
960 - d:(inline_element / (n:[^\n] { return {type: 'TEXT', value: n}; }))+
 991+ d:inlineline
961992 &eolf
962993 {
963 - // reject rule if bullets do not end in semicolon
964 - if (bullets[bullets.length - 1] != ';') {
965 - return null;
966 - } else {
967 - var dtbullets = bullets.slice(0, bullets.length - 1);
968 - dtbullets.push(':');
969 -
970 - // convert trailing space into &nbsp;
971 - var clen = c.length;
972 - if (clen && c[clen - 1].type === 'TEXT' && c[clen - 1].value == ' ') {
973 - c[clen - 1].value = "\u00a0";
974 - }
975 -
976 - return [ { type: 'TAG', name: 'listItem', bullets: bullets } ]
977 - .concat( c
978 - ,[{ type: 'TAG', name: 'listItem', bullets: dtbullets } ]
979 - , d );
 994+ // convert trailing space into &nbsp;
 995+ var clen = c.length;
 996+ if (clen && c[clen - 1].type === 'TEXT' && c[clen - 1].value == ' ') {
 997+ c[clen - 1].value = "\u00a0";
980998 }
 999+
 1000+ return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
 1001+ .concat( c
 1002+ ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
 1003+ , d );
9811004 }
9821005
9831006
@@ -1082,38 +1105,5 @@
10831106 table_end = newline? "|}" { clearFlag('table'); }
10841107
10851108
1086 -/* Wikidom TODO:
1087 - * split off text into content nodes
1088 - * convert inlines into annotations
1089 - * change contents into children
1090 - *
1091 - * { text: text,
1092 - * annotations: [(normal annotations)],
1093 - * maybeannotations: [
1094 - * { type: 'something',
1095 - * side: MA_START,
1096 - * tag: { start: x, length: y }
1097 - * }
1098 - * ]
1099 - * }
1100 - * offsets in annotations: presume maybeannotations are actually text
1101 - * -> need to transform annotations if match found
1102 - * -> format annotations, comments can run to the end (re-opened after
1103 - * block-level tags); only closed on table cells, object,?
1104 - * -> other annotations (images, templates etc) are limited by block-level
1105 - * elements, tightly bound
1106 - *
1107 - * Block-level elements
1108 - * --------------------
1109 - * - Need some early clean-up to provide structure and offsets
1110 - * - Establish scope limits for some inlines
1111 - * - Line-based balanced by construction
1112 - * - HTML tags need balancing/ matching / implicit close
1113 - * - content in illegal places (e.g. between table and td tags) needs foster
1114 - * parenting
1115 - * - grammar will match outermost pair if unmatched pairs are recognized as
1116 - * tokens (or as text)
1117 - * - post-processing needed, but has to be limited by scope
1118 - */
11191109 /* Tabs do not mix well with the hybrid production syntax */
11201110 /* vim: filetype=javascript:expandtabs:ts=4:sw=4:cindent */

Status & tagging log