r104567 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r104566 | r104567 | r104568 >
Date:15:11, 29 November 2011
Author:gwicke
Status:deferred
Tags:
Comment:
* Split paragraph handling between tokenizer and DOM postprocessor for better
HTML markup handling.
* Remove global 'use strict' declarations from the HTML5 parser.
* Add trailing-whitespace handling for definition terms (dt).

Overall, 55 parser tests are now passing.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/html5/parser.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/parser/after_head_phase.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/parser/before_head_phase.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/parser/in_body_phase.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/parser/in_head_phase.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/serializer.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/treebuilder.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js (added) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.html5TokenEmitter.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -11,7 +11,7 @@
1212 */
1313
1414 (function() {
15 -"use strict";
 15+//"use strict";
1616
1717 var fs = require('fs'),
1818 path = require('path'),
@@ -54,7 +54,8 @@
5555 _import(pj('parser', 'mediawiki.parser.environment.js'), ['MWParserEnvironment']);
5656 _import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
5757
58 -_require(pj('parser', 'mediawiki.html5TokenEmitter.js'));
 58+_import(pj('parser', 'mediawiki.html5TokenEmitter.js'), ['FauxHTML5']);
 59+_import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
5960
6061 // WikiDom and serializers
6162 _require(pj('es', 'es.js'));
@@ -131,21 +132,25 @@
132133 return $('<div>').append(node).html();
133134 }
134135
 136+var htmlparser = new HTML5.Parser();
 137+
135138 /* Normalize the expected parser output by parsing it using a HTML5 parser and
136139 * re-serializing it to HTML. Ideally, the parser would normalize inter-tag
137140 * whitespace for us. For now, we fake that by simply stripping all newlines.
138141 */
139142 function normalizeHTML(source) {
140 - var parser = new HTML5.Parser();
141143 // TODO: Do not strip newlines in pre and nowiki blocks!
142144 source = source.replace(/\n/g, '');
143145 try {
144 - parser.parse('<body>' + source + '</body>');
145 - return parser.document
 146+ htmlparser.parse('<body>' + source + '</body>');
 147+ return htmlparser.document
146148 .getElementsByTagName('body')[0]
147 - .innerHTML;
 149+ .innerHTML
 150+ // a few things we ignore for now..
 151+ .replace(/(title|class|rel)="[^"]+"/g, '');
148152 } catch(e) {
149 - console.log("normalizeHTML failed:" + e);
 153+ console.log("normalizeHTML failed on" +
 154+ source + " with the following error: " + e);
150155 console.trace();
151156 return source;
152157 }
@@ -172,7 +177,8 @@
173178 failOutputTests = 0;
174179
175180 function processTest(item) {
176 - var tokenizer = new FauxHTML5.Tokenizer();
 181+ var tokenizer = new FauxHTML5.Tokenizer(),
 182+ postProcessor = new DOMPostProcessor();
177183 if (!('title' in item)) {
178184 console.log(item);
179185 throw new Error('Missing title from test case.');
@@ -208,7 +214,15 @@
209215 });
210216 //var res = es.HtmlSerializer.stringify(tokens,environment);
211217 //console.log(JSON.stringify(tokens));
 218+
 219+ // Build a DOM tree from tokens using the HTML tree
 220+ // builder/parser.
212221 processTokens(tokens, tokenizer);
 222+
 223+ // Perform post-processing on DOM.
 224+ postProcessor.doPostProcess(tokenizer.parser.document);
 225+
 226+ // And serialize the result.
213227 var out = tokenizer.parser.document
214228 .getElementsByTagName('body')[0]
215229 .innerHTML;
@@ -217,7 +231,12 @@
218232 printTitle();
219233 failTreeTests++;
220234 console.log('RENDER FAIL', err);
221 - } else if ( normalizeOut(out) !== normalizeHTML(item.result) ) {
 235+ return;
 236+ }
 237+
 238+ var normalizedOut = normalizeOut(out);
 239+ var normalizedExpected = normalizeHTML(item.result);
 240+ if ( normalizedOut !== normalizedExpected ) {
222241 printTitle();
223242 failOutputTests++;
224243 console.log('RAW EXPECTED:');
@@ -226,12 +245,12 @@
227246 console.log('RAW RENDERED:');
228247 console.log(formatHTML(out) + "\n");
229248
230 - var a = formatHTML(normalizeHTML( item.result ));
 249+ var a = formatHTML(normalizedExpected);
231250
232251 console.log('NORMALIZED EXPECTED:');
233252 console.log(a + "\n");
234253
235 - var b = formatHTML(normalizeOut( out ));
 254+ var b = formatHTML(normalizedOut);
236255
237256 console.log('NORMALIZED RENDERED:')
238257 console.log(formatHTML(normalizeOut(out)) + "\n");
@@ -241,7 +260,7 @@
242261 console.log(patch.replace(/^[^\n]*\n[^\n]*\n[^\n]*\n[^\n]*\n/, ''));
243262 } else {
244263 passedTests++;
245 - console.log( 'PASS: ' + item.title );
 264+ console.log( 'PASSED: ' + item.title );
246265 }
247266 }
248267 });
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.html5TokenEmitter.js
@@ -3,16 +3,16 @@
44
55 var HTML5 = require('./html5/index');
66
7 -FauxHTML5 = {}
 7+FauxHTML5 = {};
88
99
1010 FauxHTML5.Tokenizer = function ( ) {
1111 this.parser = new HTML5.Parser();
1212 this.parser.parse(this);
1313 return this;
14 -}
 14+};
1515
16 -FauxHTML5.Tokenizer.prototype = new events.EventEmitter;
 16+FauxHTML5.Tokenizer.prototype = new events.EventEmitter();
1717
1818 FauxHTML5.Tokenizer.prototype.processToken = function (token) {
1919 var att = function (maybeAttribs) {
@@ -43,7 +43,7 @@
4444 data: att(token.attribs)});
4545 break;
4646 case "SELFCLOSINGTAG":
47 - this.emit('token', {type: 'EmptyTag',
 47+ this.emit('token', {type: 'StartTag',
4848 name: token.name,
4949 data: att(token.attribs)});
5050 break;
@@ -61,4 +61,8 @@
6262 console.log("Unhandled token: " + JSON.stringify(token));
6363 break;
6464 }
 65+};
 66+
 67+if (typeof module == "object") {
 68+ module.exports.FauxHTML5 = FauxHTML5;
6569 }
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js
@@ -0,0 +1,92 @@
 2+/* Perform post-processing steps on an already-built HTML DOM. */
 3+
 4+var isBlock = function isBlock (name) {
 5+ switch (name.toLowerCase()) {
 6+ case 'div':
 7+ case 'table':
 8+ case 'td':
 9+ case 'tr':
 10+ case 'tbody':
 11+ case 'p':
 12+ case 'ul':
 13+ case 'ol':
 14+ case 'li':
 15+ case 'dl':
 16+ case 'dt':
 17+ case 'dd':
 18+ case 'img': // hmm!
 19+ case 'pre':
 20+ case 'center':
 21+ case 'blockquote':
 22+ return true;
 23+ default:
 24+ return false;
 25+ }
 26+};
 27+
 28+var process_inlines_in_p = function ( document ) {
 29+ // document.body does not always work in jsdom
 30+ var body = document.getElementsByTagName('body')[0],
 31+ children = body.cloneNode(false),
 32+ cnodes = body.childNodes,
 33+ inlineStack = [];
 34+
 35+ function wrapInlines (inlines) {
 36+ var newp = document.createElement('p');
 37+ for(var i = 0, length = inlines.length; i < length; i++) {
 38+ newp.appendChild(inlines[i]);
 39+ }
 40+ body.appendChild(newp);
 41+ inlineStack = [];
 42+ }
 43+ var i,
 44+ length = cnodes.length;
 45+ // Clear body
 46+ for(i = 0; i < length; i++) {
 47+ var cnode = body.firstChild;
 48+ children.appendChild(cnode);
 49+ }
 50+
 51+ function isElementContentWhitespace ( e ) {
 52+ return (e.data.match(/^[ \r\n\t]*$/) !== null);
 53+ }
 54+
 55+ // Now re-append all block elements and inline elements wrapped in
 56+ // paragraphs.
 57+ for(i = 0; i < length; i++) {
 58+ var child = children.firstChild,
 59+ ctype = child.nodeType;
 60+ //console.log(child + ctype);
 61+ if ((ctype === 3 && (inlineStack.length || !isElementContentWhitespace(child))) ||
 62+ (ctype !== 3 && // text
 63+ ctype !== 8 && // comment
 64+ !isBlock(child.nodeName))) {
 65+ // text node
 66+ inlineStack.push(child);
 67+ } else if (inlineStack.length) {
 68+ wrapInlines(inlineStack);
 69+ body.appendChild(child);
 70+ } else {
 71+ body.appendChild(child);
 72+ }
 73+ }
 74+
 75+ if (inlineStack.length) {
 76+ wrapInlines(inlineStack);
 77+ }
 78+};
 79+
 80+function DOMPostProcessor () {
 81+ this.processors = [process_inlines_in_p];
 82+}
 83+
 84+DOMPostProcessor.prototype.doPostProcess = function ( document ) {
 85+ for(var i = 0; i < this.processors.length; i++) {
 86+ this.processors[i](document);
 87+ }
 88+};
 89+
 90+
 91+if (typeof module == "object") {
 92+ module.exports.DOMPostProcessor = DOMPostProcessor;
 93+}
Property changes on: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js
___________________________________________________________________
Added: svn:eol-style
194 + native
Index: trunk/extensions/VisualEditor/modules/parser/html5/treebuilder.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33
44 var HTML5 = require('../html5');
55 var assert = require('assert');
Index: trunk/extensions/VisualEditor/modules/parser/html5/serializer.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33 var HTML5 = require('../html5');
44 var events = require('events');
55
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33
44 var HTML5 = exports.HTML5 = require('../html5');
55
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/in_head_phase.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33 var Phase = require('./phase').Phase;
44 var HTML5 = require('../../html5');
55
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/after_head_phase.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33 var Phase = require('./phase').Phase;
44 var HTML5 = require('../../html5');
55
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/in_body_phase.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33 var HTML5 = require('../../html5');
44 var Phase = require('./phase').Phase;
55 var assert = require('assert')
@@ -599,7 +599,7 @@
600600 }
601601
602602 p.prototype.endTagHeading = function(name) {
603 - for(i in HTML5.HEADING_ELEMENTS) {
 603+ for(var i in HTML5.HEADING_ELEMENTS) {
604604 var el = HTML5.HEADING_ELEMENTS[i];
605605 if(this.inScope(el)) {
606606 this.tree.generateImpliedEndTags();
@@ -610,7 +610,7 @@
611611 if(this.tree.open_elements[this.tree.open_elements.length - 1].tagName.toLowerCase() != name)
612612 this.parse_error('end-tag-too-early', {name: name});
613613
614 - for(i in HTML5.HEADING_ELEMENTS) {
 614+ for(var i in HTML5.HEADING_ELEMENTS) {
615615 var el = HTML5.HEADING_ELEMENTS[i];
616616 if(this.inScope(el)) {
617617 this.tree.remove_open_elements_until(function(e) {
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/before_head_phase.js
@@ -1,4 +1,4 @@
2 -"use strict";
 2+//"use strict";
33 var Phase = require('./phase').Phase;
44
55 var start_tag_handlers = {
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
@@ -403,9 +403,9 @@
404404 = block_lines
405405 / pre
406406 / comment &eolf
 407+ / pre
407408 / para
408 - / pre
409 - / block_tag // TODO: handle nesting of inline content for these!
 409+ / inline // includes generic tags; wrapped into paragraphs in DOM postprocessor
410410 / (s:sol {
411411 if (s) {
412412 return [s, {type: 'NEWLINE'}];
@@ -511,9 +511,8 @@
512512
513513 // TODO: convert inline content to annotations!
514514 para
515 - = (sol br)? pl:para_line pls:(!block_lines para_line)* {
516 - return [{type: 'TAG', name: 'p'}]
517 - .concat([pl], pls, [{type: 'ENDTAG', name: 'p'}]);
 515+ = s1:sol s2:sol c:inlineline {
 516+ return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
518517 }
519518
520519 para_line
@@ -795,6 +794,7 @@
796795 generic_tag
797796 = "<" end:"/"? name:[0-9a-zA-Z]+
798797 attribs:generic_attribute*
 798+ space*
799799 selfclose:"/"?
800800 ">" {
801801 var res = {name: name.join(''), attribs: attribs};
@@ -970,6 +970,13 @@
971971 } else {
972972 var dtbullets = bullets.slice(0, bullets.length - 1);
973973 dtbullets.push(':');
 974+
 975+ // convert trailing space into &nbsp;
 976+ var clen = c.length;
 977+ if (clen && c[clen - 1].type === 'TEXT' && c[clen - 1].value == ' ') {
 978+ c[clen - 1].value = "\u00a0";
 979+ }
 980+
974981 return [ { type: 'TAG', name: 'listItem', bullets: bullets } ]
975982 .concat( c
976983 ,[{ type: 'TAG', name: 'listItem', bullets: dtbullets } ]

Status & tagging log