r104567 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r104566 | r104567 | r104568 >
Date:	15:11, 29 November 2011
Author:	gwicke
Status:	deferred
Tags:
Comment:	* Split paragraph handling between tokenizer and DOM postprocessor for better html markup handling. * Remove global 'use strict' declarations from html5 parser. * Add trailing whitespace handling in dt. Overall, 55 parser tests are now passing.
Modified paths:	/trunk/extensions/VisualEditor/modules/parser/html5/parser.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/html5/parser/after_head_phase.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/html5/parser/before_head_phase.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/html5/parser/in_body_phase.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/html5/parser/in_head_phase.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/html5/serializer.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/html5/treebuilder.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js (added) (history) /trunk/extensions/VisualEditor/modules/parser/mediawiki.html5TokenEmitter.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (modified) (history) /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
—	—	@@ -11,7 +11,7 @@
12	12	*/
13	13
14	14	(function() {
15		~~-"use strict";~~
	15	+//"use strict";
16	16
17	17	var fs = require('fs'),
18	18	path = require('path'),
—	—	@@ -54,7 +54,8 @@
55	55	_import(pj('parser', 'mediawiki.parser.environment.js'), ['MWParserEnvironment']);
56	56	_import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
57	57
58		~~-_require(pj('parser', 'mediawiki.html5TokenEmitter.js'));~~
	58	+_import(pj('parser', 'mediawiki.html5TokenEmitter.js'), ['FauxHTML5']);
	59	+_import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
59	60
60	61	// WikiDom and serializers
61	62	_require(pj('es', 'es.js'));
—	—	@@ -131,21 +132,25 @@
132	133	return $('<div>').append(node).html();
133	134	}
134	135
	136	+var htmlparser = new HTML5.Parser();
	137	+
135	138	/* Normalize the expected parser output by parsing it using a HTML5 parser and
136	139	* re-serializing it to HTML. Ideally, the parser would normalize inter-tag
137	140	* whitespace for us. For now, we fake that by simply stripping all newlines.
138	141	*/
139	142	function normalizeHTML(source) {
140		~~- var parser = new HTML5.Parser();~~
141	143	// TODO: Do not strip newlines in pre and nowiki blocks!
142	144	source = source.replace(/\n/g, '');
143	145	try {
144		~~- parser.parse('<body>' + source + '</body>');~~
145		~~- return parser.document~~
	146	+ htmlparser.parse('<body>' + source + '</body>');
	147	+ return htmlparser.document
146	148	.getElementsByTagName('body')[0]
147		~~- .innerHTML;~~
	149	+ .innerHTML
	150	+ // a few things we ignore for now..
	151	+ .replace(/(title|class|rel)="[^"]+"/g, '');
148	152	} catch(e) {
149		~~- console.log("normalizeHTML failed:" + e);~~
	153	+ console.log("normalizeHTML failed on" +
	154	+ source + " with the following error: " + e);
150	155	console.trace();
151	156	return source;
152	157	}
—	—	@@ -172,7 +177,8 @@
173	178	failOutputTests = 0;
174	179
175	180	function processTest(item) {
176		~~- var tokenizer = new FauxHTML5.Tokenizer();~~
	181	+ var tokenizer = new FauxHTML5.Tokenizer(),
	182	+ postProcessor = new DOMPostProcessor();
177	183	if (!('title' in item)) {
178	184	console.log(item);
179	185	throw new Error('Missing title from test case.');
—	—	@@ -208,7 +214,15 @@
209	215	});
210	216	//var res = es.HtmlSerializer.stringify(tokens,environment);
211	217	//console.log(JSON.stringify(tokens));
	218	+
	219	+ // Build a DOM tree from tokens using the HTML tree
	220	+ // builder/parser.
212	221	processTokens(tokens, tokenizer);
	222	+
	223	+ // Perform post-processing on DOM.
	224	+ postProcessor.doPostProcess(tokenizer.parser.document);
	225	+
	226	+ // And serialize the result.
213	227	var out = tokenizer.parser.document
214	228	.getElementsByTagName('body')[0]
215	229	.innerHTML;
—	—	@@ -217,7 +231,12 @@
218	232	printTitle();
219	233	failTreeTests++;
220	234	console.log('RENDER FAIL', err);
221		~~- } else if ( normalizeOut(out) !== normalizeHTML(item.result) ) {~~
	235	+ return;
	236	+ }
	237	+
	238	+ var normalizedOut = normalizeOut(out);
	239	+ var normalizedExpected = normalizeHTML(item.result);
	240	+ if ( normalizedOut !== normalizedExpected ) {
222	241	printTitle();
223	242	failOutputTests++;
224	243	console.log('RAW EXPECTED:');
—	—	@@ -226,12 +245,12 @@
227	246	console.log('RAW RENDERED:');
228	247	console.log(formatHTML(out) + "\n");
229	248
230		~~- var a = formatHTML(normalizeHTML( item.result ));~~
	249	+ var a = formatHTML(normalizedExpected);
231	250
232	251	console.log('NORMALIZED EXPECTED:');
233	252	console.log(a + "\n");
234	253
235		~~- var b = formatHTML(normalizeOut( out ));~~
	254	+ var b = formatHTML(normalizedOut);
236	255
237	256	console.log('NORMALIZED RENDERED:')
238	257	console.log(formatHTML(normalizeOut(out)) + "\n");
—	—	@@ -241,7 +260,7 @@
242	261	console.log(patch.replace(/^[^\n]\n[^\n]\n[^\n]\n[^\n]\n/, ''));
243	262	} else {
244	263	passedTests++;
245		~~- console.log( 'PASS: ' + item.title );~~
	264	+ console.log( 'PASSED: ' + item.title );
246	265	}
247	266	}
248	267	});
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.html5TokenEmitter.js
—	—	@@ -3,16 +3,16 @@
4	4
5	5	var HTML5 = require('./html5/index');
6	6
7		~~-FauxHTML5 = {}~~
	7	+FauxHTML5 = {};
8	8
9	9
10	10	FauxHTML5.Tokenizer = function ( ) {
11	11	this.parser = new HTML5.Parser();
12	12	this.parser.parse(this);
13	13	return this;
14		-}
	14	+};
15	15
16		~~-FauxHTML5.Tokenizer.prototype = new events.EventEmitter;~~
	16	+FauxHTML5.Tokenizer.prototype = new events.EventEmitter();
17	17
18	18	FauxHTML5.Tokenizer.prototype.processToken = function (token) {
19	19	var att = function (maybeAttribs) {
—	—	@@ -43,7 +43,7 @@
44	44	data: att(token.attribs)});
45	45	break;
46	46	case "SELFCLOSINGTAG":
47		~~- this.emit('token', {type: 'EmptyTag',~~
	47	+ this.emit('token', {type: 'StartTag',
48	48	name: token.name,
49	49	data: att(token.attribs)});
50	50	break;
—	—	@@ -61,4 +61,8 @@
62	62	console.log("Unhandled token: " + JSON.stringify(token));
63	63	break;
64	64	}
	65	+};
	66	+
	67	+if (typeof module == "object") {
	68	+ module.exports.FauxHTML5 = FauxHTML5;
65	69	}
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js
—	—	@@ -0,0 +1,92 @@
	2	+/* Perform post-processing steps on an already-built HTML DOM. */
	3	+
	4	+var isBlock = function isBlock (name) {
	5	+ switch (name.toLowerCase()) {
	6	+ case 'div':
	7	+ case 'table':
	8	+ case 'td':
	9	+ case 'tr':
	10	+ case 'tbody':
	11	+ case 'p':
	12	+ case 'ul':
	13	+ case 'ol':
	14	+ case 'li':
	15	+ case 'dl':
	16	+ case 'dt':
	17	+ case 'dd':
	18	+ case 'img': // hmm!
	19	+ case 'pre':
	20	+ case 'center':
	21	+ case 'blockquote':
	22	+ return true;
	23	+ default:
	24	+ return false;
	25	+ }
	26	+};
	27	+
	28	+var process_inlines_in_p = function ( document ) {
	29	+ // document.body does not always work in jsdom
	30	+ var body = document.getElementsByTagName('body')[0],
	31	+ children = body.cloneNode(false),
	32	+ cnodes = body.childNodes,
	33	+ inlineStack = [];
	34	+
	35	+ function wrapInlines (inlines) {
	36	+ var newp = document.createElement('p');
	37	+ for(var i = 0, length = inlines.length; i < length; i++) {
	38	+ newp.appendChild(inlines[i]);
	39	+ }
	40	+ body.appendChild(newp);
	41	+ inlineStack = [];
	42	+ }
	43	+ var i,
	44	+ length = cnodes.length;
	45	+ // Clear body
	46	+ for(i = 0; i < length; i++) {
	47	+ var cnode = body.firstChild;
	48	+ children.appendChild(cnode);
	49	+ }
	50	+
	51	+ function isElementContentWhitespace ( e ) {
	52	+ return (e.data.match(/^[ \r\n\t]*$/) !== null);
	53	+ }
	54	+
	55	+ // Now re-append all block elements and inline elements wrapped in
	56	+ // paragraphs.
	57	+ for(i = 0; i < length; i++) {
	58	+ var child = children.firstChild,
	59	+ ctype = child.nodeType;
	60	+ //console.log(child + ctype);
	61	+ if ((ctype === 3 && (inlineStack.length || !isElementContentWhitespace(child))) ||
	62	+ (ctype !== 3 && // text
	63	+ ctype !== 8 && // comment
	64	+ !isBlock(child.nodeName))) {
	65	+ // text node
	66	+ inlineStack.push(child);
	67	+ } else if (inlineStack.length) {
	68	+ wrapInlines(inlineStack);
	69	+ body.appendChild(child);
	70	+ } else {
	71	+ body.appendChild(child);
	72	+ }
	73	+ }
	74	+
	75	+ if (inlineStack.length) {
	76	+ wrapInlines(inlineStack);
	77	+ }
	78	+};
	79	+
	80	+function DOMPostProcessor () {
	81	+ this.processors = [process_inlines_in_p];
	82	+}
	83	+
	84	+DOMPostProcessor.prototype.doPostProcess = function ( document ) {
	85	+ for(var i = 0; i < this.processors.length; i++) {
	86	+ this.processors[i](document);
	87	+ }
	88	+};
	89	+
	90	+
	91	+if (typeof module == "object") {
	92	+ module.exports.DOMPostProcessor = DOMPostProcessor;
	93	+}
Property changes on: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js
___________________________________________________________________
Added: svn:eol-style
1	94	+ native
Index: trunk/extensions/VisualEditor/modules/parser/html5/treebuilder.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3
4	4	var HTML5 = require('../html5');
5	5	var assert = require('assert');
Index: trunk/extensions/VisualEditor/modules/parser/html5/serializer.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3	var HTML5 = require('../html5');
4	4	var events = require('events');
5	5
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3
4	4	var HTML5 = exports.HTML5 = require('../html5');
5	5
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/in_head_phase.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3	var Phase = require('./phase').Phase;
4	4	var HTML5 = require('../../html5');
5	5
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/after_head_phase.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3	var Phase = require('./phase').Phase;
4	4	var HTML5 = require('../../html5');
5	5
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/in_body_phase.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3	var HTML5 = require('../../html5');
4	4	var Phase = require('./phase').Phase;
5	5	var assert = require('assert')
—	—	@@ -599,7 +599,7 @@
600	600	}
601	601
602	602	p.prototype.endTagHeading = function(name) {
603		~~- for(i in HTML5.HEADING_ELEMENTS) {~~
	603	+ for(var i in HTML5.HEADING_ELEMENTS) {
604	604	var el = HTML5.HEADING_ELEMENTS[i];
605	605	if(this.inScope(el)) {
606	606	this.tree.generateImpliedEndTags();
—	—	@@ -610,7 +610,7 @@
611	611	if(this.tree.open_elements[this.tree.open_elements.length - 1].tagName.toLowerCase() != name)
612	612	this.parse_error('end-tag-too-early', {name: name});
613	613
614		~~- for(i in HTML5.HEADING_ELEMENTS) {~~
	614	+ for(var i in HTML5.HEADING_ELEMENTS) {
615	615	var el = HTML5.HEADING_ELEMENTS[i];
616	616	if(this.inScope(el)) {
617	617	this.tree.remove_open_elements_until(function(e) {
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser/before_head_phase.js
—	—	@@ -1,4 +1,4 @@
2		~~-"use strict";~~
	2	+//"use strict";
3	3	var Phase = require('./phase').Phase;
4	4
5	5	var start_tag_handlers = {
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
—	—	@@ -403,9 +403,9 @@
404	404	= block_lines
405	405	/ pre
406	406	/ comment &eolf
	407	+ / pre
407	408	/ para
408		~~- / pre~~
409		~~- / block_tag // TODO: handle nesting of inline content for these!~~
	409	+ / inline // includes generic tags; wrapped into paragraphs in DOM postprocessor
410	410	/ (s:sol {
411	411	if (s) {
412	412	return [s, {type: 'NEWLINE'}];
—	—	@@ -511,9 +511,8 @@
512	512
513	513	// TODO: convert inline content to annotations!
514	514	para
515		~~- = (sol br)? pl:para_line pls:(!block_lines para_line)* {~~
516		~~- return [{type: 'TAG', name: 'p'}]~~
517		~~- .concat([pl], pls, [{type: 'ENDTAG', name: 'p'}]);~~
	515	+ = s1:sol s2:sol c:inlineline {
	516	+ return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
518	517	}
519	518
520	519	para_line
—	—	@@ -795,6 +794,7 @@
796	795	generic_tag
797	796	= "<" end:"/"? name:[0-9a-zA-Z]+
798	797	attribs:generic_attribute*
	798	+ space*
799	799	selfclose:"/"?
800	800	">" {
801	801	var res = {name: name.join(''), attribs: attribs};
—	—	@@ -970,6 +970,13 @@
971	971	} else {
972	972	var dtbullets = bullets.slice(0, bullets.length - 1);
973	973	dtbullets.push(':');
	974	+
	975	+ // convert trailing space into a non-breaking space (&nbsp;)
	976	+ var clen = c.length;
	977	+ if (clen && c[clen - 1].type === 'TEXT' && c[clen - 1].value == ' ') {
	978	+ c[clen - 1].value = "\u00a0";
	979	+ }
	980	+
974	981	return [ { type: 'TAG', name: 'listItem', bullets: bullets } ]
975	982	.concat( c
976	983	,[{ type: 'TAG', name: 'listItem', bullets: dtbullets } ]

Status & tagging log

16:36, 30 November 2011 Hashar (talk | contribs) changed the status of r104567 [removed: new added: deferred]