r112026 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r112025‎ \| r112026 \| r112027 >
Date:	17:21, 21 February 2012
Author:	gwicke
Status:	deferred
Tags:
Comment:	Tidy up and comment the tokenizer a bit more. Start to move code into mediawiki.tokenizer.js module, and pass a reference to parse(). Faster inline_breaks production using a JS function which seems to be generally correct, but still breaks five tests when enabled. Seems to be some weird interaction with peg.js, possibly something to do with caching.
Modified paths:	/trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
—	—	@@ -292,6 +292,10 @@
293	293
294	294	}
295	295
	296	+/*********************************************************
	297	+ * The top-level production
	298	+ *********************************************************/
	299	+
296	300	start
297	301	= e:toplevelblock* newline* {
298	302	// end is passed inline as a token, as well as a separate event for now.
—	—	@@ -310,241 +314,26 @@
311	315	}
312	316
313	317
314		~~-/* All chars that cannot start syntactic structures in the middle of a line~~
315		~~- * XXX: ] and other end delimiters should probably only be activated inside~~
316		~~- * structures to avoid unnecessarily leaving the text production on plain~~
317		~~- * content. */~~
318		-
319		~~-text_char = [^'<~[{\n\r:\]}\|!=]~~
320		-
321		~~-text = t:text_char+ { return t.join(''); }~~
322		-
323		~~-/* Legend~~
324		~~- * ' quotes (italic/bold)~~
325		~~- * < start of xmlish_tag~~
326		~~- * ~ signatures/dates~~
327		~~- * [ start of links~~
328		~~- * { start of parser functions, transclusion and template args~~
329		~~- * \n all sort of block-level markup at start of line~~
330		~~- * \r ditto~~
331		~~- * h http(s) urls~~
332		~~- * n nntp(s) urls~~
333		~~- * m mailto urls~~
334		- *
335		~~- * ! and \| table cell delimiters, might be better to specialize those~~
336		~~- * = headings - also specialize those!~~
337		- *
338		~~- * The following chars are also included for now, but only apply in some~~
339		~~- * contexts and should probably be enabled only in those:~~
340		~~- * : separate definition in ; term : definition~~
341		~~- * ] end of link~~
342		~~- * } end of parser func/transclusion/template arg~~
	318	+/*
	319	+ * A document (start production) is a sequence of toplevelblocks. Tokens are
	320	+ * emitted in chunks per toplevelblock to avoid buffering the full document.
343	321	*/
344		-
345		~~-urltext = ( t:[^'<~[{\n\rfghimnstw\|!:\]} &=]+ { return t.join(''); }~~
346		~~- / & url_chars urllink~~
347		~~- / htmlentity~~
348		~~- // Convert trailing space into &nbsp;~~
349		~~- // XXX: This should be moved to a serializer~~
350		~~- / ' ' & ':' { return "\u00a0"; }~~
351		~~- / t:text_char )+~~
352		-
353		-
354		-
355		-
356		-/*
357		~~- '//', // for protocol-relative URLs, but not in text!~~
358		~~- 'ftp://',~~
359		~~- 'git://',~~
360		~~- 'gopher://',~~
361		~~- 'http://',~~
362		~~- 'https://',~~
363		~~- 'irc://',~~
364		~~- 'ircs://', // @bug 28503~~
365		~~- 'mailto:',~~
366		~~- 'mms://',~~
367		~~- 'news:',~~
368		~~- 'nntp://', // @bug 3808 RFC 1738~~
369		~~- 'svn://',~~
370		~~- 'telnet://', // Well if we're going to support the above.. -ævar~~
371		~~- 'worldwind://',~~
372		~~-*/~~
373		-
374		~~-// Old version~~
375		~~-//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }~~
376		-
377		~~-// Experimental tweaked version: avoid expensive single-char substrings~~
378		~~-// This did not bring the expected performance boost, however.~~
379		~~-//text = [A-Za-z0-9,._ -] {~~
380		~~-// textStart = pos;~~
381		~~-//~~
382		~~-// var res = input.substr(textStart - 1, inputLength)~~
383		~~-// .match(/[A-Za-z0-9,._ -]+/)[0];~~
384		~~-// pos = pos + (res.length - 1);~~
385		~~-// return res~~
386		~~-// }~~
387		-
388		~~-htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {~~
389		~~- return unentity("&" + c.join('') + ";")~~
390		-}
391		-
392		~~-space~~
393		~~- = s:[ \t]+ { return s.join(''); }~~
394		-
395		~~-optionalSpaceToken~~
396		~~- = s:space* {~~
397		~~- if ( s.length ) {~~
398		~~- return [s.join('')];~~
399		~~- } else {~~
400		~~- return [];~~
401		~~- }~~
402		~~- }~~
403		-
404		-
405		~~-// Start of line~~
406		~~-sol = nl:(newlineToken / & { return pos === 0; } { return [] })~~
407		~~- // Eat multi-line comments, so that syntax after still matches as if it~~
408		~~- // was actually preceded by a newline~~
409		~~- cn:( c:comment n:newline? {~~
410		~~- if ( n !== '' ) {~~
411		~~- return [c, n];~~
412		~~- } else {~~
413		~~- return [c];~~
414		~~- }~~
415		~~- }~~
416		- )*
417		~~- // Eat includeonly/noinclude at start of line, so that start-of-line~~
418		~~- // syntax after it still matches~~
419		~~- ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?~~
420		~~- {~~
421		~~- var niToken = [];~~
422		~~- if ( ni !== '') {~~
423		~~- if ( ni[0] === '/' ) {~~
424		~~- niToken = [new EndTagTk( ni[1] )];~~
425		~~- } else {~~
426		~~- niToken = [new TagTk( ni[1] )];~~
427		~~- }~~
428		~~- }~~
429		-
430		~~- return nl.concat(cn, niToken);~~
431		~~- }~~
432		-
433		~~-eof = & { return isEOF(pos); } { return true; }~~
434		-
435		-
436		~~-newline~~
437		~~- = '\n' / '\r\n'~~
438		-
439		~~-newlineToken = newline { return [new NlTk()] }~~
440		-
441		~~-eolf = newline / eof~~
442		-
443		-
444		~~-// 'Preprocessor' directive- higher-level things that can occur in otherwise~~
445		~~-// plain-text content.~~
446		~~-directive~~
447		~~- = comment~~
448		~~- / tplarg_or_template~~
449		~~- / htmlentity~~
450		-
451		~~-// Plain text, but can contain templates, template arguments, comments etc-~~
452		~~-// all stuff that is normally handled by the preprocessor~~
453		~~-// Returns either a list of tokens, or a plain string (if nothing is to be~~
454		~~-// processed).~~
455		~~-preprocessor_text~~
456		~~- = r:( t:[^<~[{\n\r\t\|!\]} &=]+ { return t.join(''); }~~
457		~~- / directive~~
458		~~- / !inline_breaks text_char )+ {~~
459		~~- return flatten ( r );~~
460		~~- }~~
461		-
462		~~-spaceless_preprocessor_text~~
463		~~- = r:( t:[^'<~[{\n\r\|!\]}\t &=]+ { return t.join(''); }~~
464		~~- / directive~~
465		~~- / !inline_breaks !' ' text_char )+ {~~
466		~~- return flatten_string ( r );~~
467		~~- }~~
468		-
469		-
470		~~-wikilink_preprocessor_text~~
471		~~- = r:( t:[^%<~[{\n\r\t\|!\]} &=]+ { return t.join(''); }~~
472		~~- / urlencoded_char~~
473		~~- / directive~~
474		~~- / !inline_breaks !"]]" text_char )+ {~~
475		~~- return flatten_stringlist ( r );~~
476		~~- }~~
477		-
478		~~-extlink_preprocessor_text~~
479		~~- = r:( t:[^'<~[{\n\r\|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }~~
480		~~- / directive~~
481		~~- / urlencoded_char~~
482		~~- / !inline_breaks no_punctuation_char~~
483		~~- / s:[.:,] !(space / eolf) { return s }~~
484		~~- / [&%] )+ {~~
485		~~- return flatten_string ( r );~~
486		~~- }~~
487		-
488		~~-// Attribute values with preprocessor support~~
489		~~-attribute_preprocessor_text~~
490		~~- = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }~~
491		~~- / directive~~
492		~~- / !inline_breaks [&%] )+ {~~
493		~~- //console.warn('prep');~~
494		~~- return flatten_string ( r );~~
495		~~- }~~
496		~~-attribute_preprocessor_text_single~~
497		~~- = r:( t:[^{&']+ { return t.join(''); }~~
498		~~- / directive~~
499		~~- / !inline_breaks [{&] )* {~~
500		~~- return flatten_string ( r );~~
501		~~- }~~
502		~~-attribute_preprocessor_text_double~~
503		~~- = r:( t:[^{&"]+ { return t.join(''); }~~
504		~~- / directive~~
505		~~- / !inline_breaks [{&] )* {~~
506		~~- //console.warn( 'double:' + pp(r) );~~
507		~~- return flatten_string ( r );~~
508		~~- }~~
509		-
510		~~-// Variants with the entire attribute on a single line~~
511		~~-attribute_preprocessor_text_line~~
512		~~- = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }~~
513		~~- / directive~~
514		~~- / !inline_breaks !'\n' [&%] )+ {~~
515		~~- //console.warn('prep');~~
516		~~- return flatten_string ( r );~~
517		~~- }~~
518		~~-attribute_preprocessor_text_single_line~~
519		~~- = r:( t:[^{&']+ { return t.join(''); }~~
520		~~- / directive~~
521		~~- / !inline_breaks !'\n' [{&] )* {~~
522		~~- return flatten_string ( r );~~
523		~~- }~~
524		~~-attribute_preprocessor_text_double_line~~
525		~~- = r:( t:[^{&"]+ { return t.join(''); }~~
526		~~- / directive~~
527		~~- / !inline_breaks !'\n' [{&] )* {~~
528		~~- //console.warn( 'double:' + pp(r) );~~
529		~~- return flatten_string ( r );~~
530		~~- }~~
531		-
532		~~-// A document (start production) is a sequence of toplevelblocks. Tokens are~~
533		~~-// emitted in chunks per toplevelblock to avoid buffering the full document.~~
534	322	toplevelblock
535	323	= & { blockStart = pos; return true; } b:block {
536	324	b = flatten(b);
	325	+
	326	+ // Add source offsets for round-tripping. XXX: Add these not just for
	327	+ // toplevelblocks!
537	328	if ( b.length ) {
538	329	var bs = b[0];
539	330	if ( bs.constructor === String && bs.attribs === undefined ) {
540	331	b[0] = new String( bs );
541	332	bs = b[0];
542	333	}
543		~~- //dp('toplevelblock:' + pp(b));~~
544	334	if (bs.dataAttribs === undefined) {
545	335	bs.dataAttribs = {};
546	336	}
547	337	bs.dataAttribs.sourcePos = [blockStart, pos];
548		~~- //console.warn( 'toplevelblock: ' + pp( bs ));~~
549	338	}
550	339
551	340	// Emit tokens for this toplevelblock. This feeds a chunk to the parser
—	—	@@ -556,24 +345,32 @@
557	346	return true;
558	347	}
559	348
	349	+/*
	350	+ * The actual contents of each block.
	351	+ */
560	352	block
561		~~- = !inline_breaks~~
562		~~- r:( block_lines~~
563		~~- / pre~~
564		~~- / comment &eolf~~
565		~~- / nowiki~~
566		~~- / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag~~
567		~~- / para~~
568		~~- / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor~~
569		~~- / s:sol /*{~~
570		~~- if (s) {~~
571		~~- return [s, {type: 'NEWLINE'}];~~
572		~~- } else {~~
573		~~- return [{type: 'NEWLINE'}];~~
574		~~- }~~
575		~~- }*/~~
576		~~- ) { return r }~~
	353	+ = block_lines
	354	+ / & '<' r:( pre // tag variant can start anywhere
	355	+ / comment &eolf
	356	+ / nowiki
	357	+ // avoid a paragraph if we know that the line starts with a block tag
	358	+ / bt:block_tag { return [bt] }
	359	+ ) { return r; }
	360	+ / para
	361	+ // Inlineline includes generic tags; wrapped into paragraphs in token
	362	+ // transform and DOM postprocessor
	363	+ / inlineline
	364	+ / sol
577	365
	366	+/*
	367	+ * A block nested in other constructs. Avoid eating end delimiters for other
	368	+ * constructs by checking against inline_breaks first.
	369	+ */
	370	+nested_block = !inline_breaks b:block { return b }
	371	+
	372	+/*
	373	+ * Line-based block constructs.
	374	+ */
578	375	block_lines
579	376	= s:sol
580	377	// eat an empty line before the block
—	—	@@ -583,10 +380,11 @@
584	381	return s.concat(s2_, bl);
585	382	}
586	383
587		~~-// Block structures with start-of-line wiki syntax~~
	384	+/*
	385	+ * Block structures with start-of-line wiki syntax
	386	+ */
588	387	block_line
589	388	= h
590		~~- /// table~~
591	389	/ & [{}\|] tl:table_lines { return tl; }
592	390	/ lists
593	391	// tag-only lines should not trigger pre
—	—	@@ -599,9 +397,11 @@
600	398	/ pre
601	399
602	400
603		~~-// A paragraph. We don't emit 'p' tokens to avoid issues with template~~
604		~~-// transclusions, <p> tags in the source and the like. Instead, we perform~~
605		~~-// some paragraph wrapping on the DOM.~~
	401	+/*
	402	+ * A paragraph. We don't emit 'p' tokens to avoid issues with template
	403	+ * transclusions, <p> tags in the source and the like. Instead, we perform
	404	+ * some paragraph wrapping on the DOM.
	405	+ */
606	406	para
607	407	= s1:sol s2:sol c:inlineline {
608	408	return s1.concat(s2, /* [new TagTk('p')],*/ c);
—	—	@@ -609,37 +409,60 @@
610	410
611	411	br = space* &newline { return new SelfclosingTagTk( 'br' ) }
612	412
613		~~-// Syntax stops to limit inline expansion defending on syntactic context~~
	413	+/*
	414	+ * Syntax stops: Avoid eating significant tokens for higher-level productions
	415	+ * in nested inline productions.
	416	+ *
	417	+ * XXX: Repeated testing of flags is not terribly efficient.
	418	+ */
614	419	inline_breaks
615		~~- =~~
	420	+ = & [=\|!}:\r\n\]<] // don't check further if char cannot match
	421	+ res:(
	422	+ & { // Important hack: disable caching for this production, as the default
	423	+ // cache key does not take into account flag states!
	424	+ cacheKey = '';
	425	+ console.warn('ilb: ' + input.substr(pos, 5) );
	426	+ return true;
	427	+ }
	428	+
	429	+ & { return syntaxFlags['table']; }
	430	+ ( a:(newline [!\|] / '\|\|' / '!!' / '\|}') {
	431	+ //console.warn("table break" + pp(a) + pos);
	432	+ return true;
	433	+ }
	434	+ / & { return syntaxFlags['tableCellArg'] }
	435	+ "\|" { return true }
	436	+ )
	437	+ / & { return (syntaxFlags['colon'] &&
	438	+ ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
	439	+ ! syntaxFlags.linkdesc); } ":" { return true; }
	440	+ / & { return syntaxFlags['extlink']; } "]" { return true; }
	441	+ / & { return syntaxFlags['linkdesc']; } link_end { return true; }
	442	+ / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
	443	+ / & { return syntaxFlags['template']; } ('\|' / '}}' ) {
	444	+ //console.warn( 'template break @' + pos + input.substr(pos-1, 4) );
	445	+ return true;
	446	+ }
	447	+ / & { return syntaxFlags['equal']; } '=' {
	448	+ //console.warn( 'equal stop @' + pos + input.substr(pos-1, 4) );
	449	+ return true;
	450	+ }
	451	+ / & { return syntaxFlags['pre']; } '</pre>' {
	452	+ //console.warn( 'pre stop @' + pos + input.substr(pos-1, 4) );
	453	+ return true;
	454	+ }
	455	+ ) { return res }
	456	+
	457	+inline_breaks_experiment
	458	+ = & [=\|!}:\r\n\]<]
616	459	& { // Important hack: disable caching for this production, as the default
617	460	// cache key does not take into account flag states!
618	461	cacheKey = '';
	462	+ //console.warn('ilbf: ' + input.substr(pos, 5) );
619	463	return true;
620		~~- }~~
621		~~- & { return syntaxFlags['table']; }~~
622		~~- ( a:(newline [!\|] / '\|\|' / '!!' / '\|}') { dp("table break" + pp(a) + pos); return true; }~~
623		~~- / & { return syntaxFlags['tableCellArg'] }~~
624		~~- "\|" { return true }~~
625		~~- )~~
626		~~- / & { return (syntaxFlags['colon'] &&~~
627		~~- ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition~~
628		~~- ! syntaxFlags.linkdesc); } ":" { return true; }~~
629		~~- / & { return syntaxFlags['extlink']; } "]" { return true; }~~
630		~~- / & { return syntaxFlags['linkdesc']; } link_end { return true; }~~
631		~~- / & { return syntaxFlags['h']; } '='+ space* newline { return true; }~~
632		~~- / & { return syntaxFlags['template']; } ('\|' / '}}' ) {~~
633		~~- //console.warn( 'template break @' + pos + input.substr(pos-1, 4) );~~
634		~~- return true;~~
635		~~- }~~
636		~~- / & { return syntaxFlags['equal']; } '=' {~~
637		~~- //console.warn( 'equal stop @' + pos + input.substr(pos-1, 4) );~~
638		~~- return true;~~
639		~~- }~~
640		~~- / & { return syntaxFlags['pre']; } '</pre>' {~~
641		~~- //console.warn( 'pre stop @' + pos + input.substr(pos-1, 4) );~~
642		~~- return true;~~
643		~~- }~~
	464	+ }
	465	+ .
	466	+ { return __parseArgs[3].inline_breaks( input, pos - 1, syntaxFlags ) && true \|\| null ; }
644	467
645	468	inline
646	469	= c:(urltext / (! inline_breaks (inline_element / . )))+ {
—	—	@@ -703,38 +526,6 @@
704	527	/ & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
705	528	) { return r }
706	529
707		-
708		~~-pre_indent~~
709		~~- = pre_indent_in_tags~~
710		~~- / l:pre_indent_line ls:(sol pre_indent_line)* {~~
711		~~- return [new TagTk( 'pre' )]~~
712		~~- .concat( [l], ls~~
713		~~- , [new EndTagTk( 'pre' )]);~~
714		~~- }~~
715		-
716		~~-// An indented pre block that is surrounded with pre tags. The pre tags are~~
717		~~-// used directly.~~
718		~~-pre_indent_in_tags~~
719		~~- = space+ // XXX: capture space for round-tripping~~
720		~~- "<pre"~~
721		- attribs:generic_attribute*
722		~~- ">"~~
723		~~- & { return setFlag('pre'); }~~
724		~~- l:inlineline~~
725		- ls:(sol pre_indent_line)*
726		~~- "</pre>"~~
727		~~- {~~
728		~~- clearFlag('pre');~~
729		~~- return [ new TagTk( 'pre', attribs ) ]~~
730		~~- .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );~~
731		~~- }~~
732		~~- / & { return clearFlag('pre'); }~~
733		-
734		~~-pre_indent_line = space l:inlineline {~~
735		~~- return [ '\n' ].concat(l);~~
736		-}
737		-
738		-
739	530	comment
740	531	= '<!--' c:comment_chars* ('-->' / eof)
741	532	cs:(space* newline space* cn:comment { return cn })* {
—	—	@@ -746,6 +537,11 @@
747	538	/ c:'-' !'->' { return c; }
748	539
749	540
	541	+
	542	+/**************************************************************
	543	+ * External (bracketed and autolinked) links
	544	+ **************************************************************/
	545	+
750	546	urllink
751	547	= ! { return syntaxFlags['extlink'] }
752	548	target:url {
—	—	@@ -851,6 +647,11 @@
852	648	return flatten( a ).join('');
853	649	}
854	650
	651	+
	652	+/**************************************************************
	653	+ * Templates, -arguments and wikilinks
	654	+ **************************************************************/
	655	+
855	656	tplarg_or_template = & '{{{{{' template / tplarg / template
856	657
857	658	template
—	—	@@ -992,17 +793,49 @@
993	794	return res;
994	795	}
995	796
996		~~-/* XXX: Extension tags can require a change in the tokenizer mode, which~~
997		~~- * returns any text between extension tags verbatim. For now, we simply~~
998		~~- * continue to parse the contained text and return the tokens. The original~~
999		~~- * input source can be recovered from the source positions added on tag~~
1000		~~- * tokens. This won't however work in all cases. For example, a comment start~~
1001		~~- * (<!--) between extension tags would cause the remaining text to be consumed~~
1002		~~- * as a comment. To avoid this, we might need to look ahead for the end tag~~
1003		~~- * and limit the content parsing to this section. */~~
1004	797
1005		~~-xmlish_tag = nowiki / generic_tag~~
1006	798
	799	+/***********************************************************
	800	+ * Pre and xmlish tags
	801	+ ***********************************************************/
	802	+
	803	+// Indented pre blocks differ from their non-indented (purely tag-based)
	804	+// cousins by having their contents parsed.
	805	+pre_indent
	806	+ = pre_indent_in_tags
	807	+ / l:pre_indent_line ls:(sol pre_indent_line)* {
	808	+ return [new TagTk( 'pre' )]
	809	+ .concat( [l], ls
	810	+ , [new EndTagTk( 'pre' )]);
	811	+ }
	812	+
	813	+// An indented pre block that is surrounded with pre tags. The pre tags are
	814	+// used directly.
	815	+pre_indent_in_tags
	816	+ = space+ // XXX: capture space for round-tripping
	817	+ "<pre"
	818	+ attribs:generic_attribute*
	819	+ ">"
	820	+ & { return setFlag('pre'); }
	821	+ l:inlineline
	822	+ ls:(sol pre_indent_line)*
	823	+ "</pre>"
	824	+ {
	825	+ clearFlag('pre');
	826	+ return [ new TagTk( 'pre', attribs ) ]
	827	+ .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
	828	+ }
	829	+ / & { return clearFlag('pre'); }
	830	+
	831	+pre_indent_line = space l:inlineline {
	832	+ return [ '\n' ].concat(l);
	833	+}
	834	+
	835	+/*
	836	+ * Pre blocks defined using non-indented HTML tags only parse nowiki tags
	837	+ * inside them, and convert other content to verbatim text. Nowiki inside pre
	838	+ * is not functionally needed, but supported for backwards compatibility.
	839	+ */
1007	840	pre
1008	841	= "<pre"
1009	842	attribs:generic_attribute*
—	—	@@ -1020,6 +853,24 @@
1021	854	}
1022	855	/ "</pre>" { return "</pre>"; }
1023	856
	857	+/* XXX: Extension tags can require a change in the tokenizer mode, which
	858	+ * returns any text between extension tags verbatim. For now, we simply
	859	+ * continue to parse the contained text and return the tokens. The original
	860	+ * input source can be recovered from the source positions added on tag
	861	+ * tokens. This won't however work in all cases. For example, a comment start
	862	+ * (<!--) between extension tags would cause the remaining text to be consumed
	863	+ * as a comment. To avoid this, we might need to look ahead for the end tag
	864	+ * and limit the content parsing to this section. */
	865	+
	866	+xmlish_tag = nowiki / generic_tag
	867	+
	868	+/*
	869	+ * Nowiki treats anything inside it as plain text. It could thus also be
	870	+ * defined as an extension that returns its raw input text, possibly wrapped
	871	+ * in a span for round-trip information. The special treatment for nowiki in
	872	+ * pre blocks would still remain in the grammar though, so overall handling it
	873	+ * all here is cleaner.
	874	+ */
1024	875	nowiki
1025	876	= "<nowiki>" nc:nowiki_content "</nowiki>" {
1026	877	//console.warn( 'full nowiki return: ' + pp(nc));
—	—	@@ -1050,27 +901,6 @@
1051	902	return [ts.join('')];
1052	903	}
1053	904
1054		~~-// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and~~
1055		~~-// following paragraphs~~
1056		~~-block_tag~~
1057		~~- = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })~~
1058		- attribs:generic_attribute*
1059		~~- selfclose:"/"?~~
1060		~~- ">" {~~
1061		~~- if (block_names[name.toLowerCase()] !== true) {~~
1062		~~- // abort match if tag is not block-level~~
1063		~~- return null;~~
1064		~~- }~~
1065		~~- var res;~~
1066		~~- if ( end != '' ) {~~
1067		~~- res = new EndTagTk( name, attribs );~~
1068		~~- } else if ( selfclose != '' ) {~~
1069		~~- res = new SelfclosingTagTk( name, attribs );~~
1070		~~- } else {~~
1071		~~- res = new TagTk( name, attribs );~~
1072		~~- }~~
1073		~~- return [res];~~
1074		~~- }~~
1075	905
1076	906	// The list of HTML5 tags, mainly used for the identification of non-html
1077	907	// tags. These terminate otherwise tag-eating productions (see list below) in
—	—	@@ -1139,6 +969,7 @@
1140	970	return res;
1141	971	}
1142	972
	973	+// A generic attribute that can span multiple lines.
1143	974	generic_newline_attribute
1144	975	= s:( space / newline )*
1145	976	name:generic_attribute_name
—	—	@@ -1152,6 +983,7 @@
1153	984	}
1154	985	}
1155	986
	987	+// A single-line attribute.
1156	988	generic_attribute
1157	989	= s:space*
1158	990	name:generic_attribute_name
—	—	@@ -1168,12 +1000,13 @@
1169	1001	}
1170	1002	}
1171	1003
1172		~~-// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also~~
1173		~~-// disallow newlines, \| and {.~~
1174		~~-generic_attribute_plain_name~~
1175		~~- = n:[^ \t\0/"'>=\n\|{]+ {~~
1176		~~- return n.join('');~~
1177		~~- }~~
	1004	+// ( Replaced by generic_attribute_name for template / parameter support. )
	1005	+//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
	1006	+//// disallow newlines, \| and {.
	1007	+//generic_attribute_plain_name
	1008	+// = n:[^ \t\0/"'>=\n\|{]+ {
	1009	+// return n.join('');
	1010	+// }
1178	1011
1179	1012	generic_attribute_name
1180	1013	= & { return setFlag( 'equal' ) }
—	—	@@ -1186,34 +1019,61 @@
1187	1020	}
1188	1021	/ & { return clearFlag( 'equal' ) }
1189	1022
	1023	+// A generic attribute value, possibly spanning multiple lines.
1190	1024	generic_attribute_newline_value
1191	1025	= "=" (space / newline )* v:xml_att_value {
1192	1026	return v;
1193	1027	}
	1028	+// A generic but single-line attribute value.
1194	1029	generic_attribute_value
1195	1030	= "=" space* v:att_value {
1196	1031	return v;
1197	1032	}
1198	1033
1199		~~-// XXX: attributes can contain templates and template args!!~~
	1034	+// Attribute value, quoted variants can span multiple lines.
1200	1035	xml_att_value
1201	1036	= "'" t:attribute_preprocessor_text_single "'" { return t; }
1202	1037	/ '"' t:attribute_preprocessor_text_double '"' { return t; }
1203	1038	/ attribute_preprocessor_text
1204	1039
1205		~~-// XXX: attributes can contain templates and template args!!~~
	1040	+// Attribute value, restricted to a single line.
1206	1041	att_value
1207	1042	= "'" t:attribute_preprocessor_text_single_line "'" { return t; }
1208	1043	/ '"' t:attribute_preprocessor_text_double_line '"' { return t; }
1209	1044	/ attribute_preprocessor_text_line
1210		~~-// = t:(!inline_breaks c:[^ \t'"<>='\n] { return c } )+ {~~
1211		~~-// return t.join('');~~
1212		~~-// }~~
1213		~~-// // XXX: is "\"" also valid html? or just Wikitext?~~
1214		~~-// / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }~~
1215		~~-// / '"' t:[^">]* '"' { return unquote('"', t.join('')); }~~
1216	1045
1217		~~-/* Lists */~~
	1046	+/*
	1047	+ * A variant of generic_tag, but also checks if the tag name is a block-level
	1048	+ * tag as defined in
	1049	+ * http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
	1050	+ * paragraphs.
	1051	+ */
	1052	+block_tag
	1053	+ = "<" end:"/"?
	1054	+ name:(cs:[a-zA-Z]+ { return cs.join('') })
	1055	+ attribs:generic_newline_attribute*
	1056	+ ( space / newline ) *
	1057	+ selfclose:"/"?
	1058	+ ">" {
	1059	+ if (block_names[name.toLowerCase()] !== true) {
	1060	+ // abort match if tag is not block-level
	1061	+ return null;
	1062	+ }
	1063	+ var res;
	1064	+ if ( end != '' ) {
	1065	+ res = new EndTagTk( name, attribs );
	1066	+ } else if ( selfclose != '' ) {
	1067	+ res = new SelfclosingTagTk( name, attribs );
	1068	+ } else {
	1069	+ res = new TagTk( name, attribs );
	1070	+ }
	1071	+ return [res];
	1072	+ }
	1073	+
	1074	+
	1075	+/*********************************************************
	1076	+ * Lists
	1077	+ *********************************************************/
1218	1078	lists = e:(dtdd / li) es:(sol (dtdd / li))*
1219	1079	{
1220	1080	return annotateList( [ new TagTk( 'list' ) ]
—	—	@@ -1265,7 +1125,9 @@
1266	1126
1267	1127	list_char = [*#:;]
1268	1128
1269		-/**
	1129	+
	1130	+
	1131	+/*********************************************************************
1270	1132	* Tables
1271	1133	*
1272	1134	* Table productions are geared to support independent parsing of fragments in
—	—	@@ -1276,7 +1138,7 @@
1277	1139	*
1278	1140	* The separate table_lines production is faster than moving those productions
1279	1141	* directly to block_lines.
1280		~~- * */~~
	1142	+ *********************************************************************/
1281	1143
1282	1144	table_lines
1283	1145	= & { return setFlag('table'); }
—	—	@@ -1351,9 +1213,9 @@
1352	1214	a:table_cell_args?
1353	1215	//& { console.warn("past attrib, pos=" + pos + input.substr(pos,10)); return true; }
1354	1216	// use inline_breaks to break on tr etc
1355		~~- td:( !inline_breaks~~
1356		~~- //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }~~
1357		- b:block { return b } )*
	1217	+ td:( //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
	1218	+ b:nested_block { return b }
	1219	+ )*
1358	1220	{
1359	1221	if ( a == '' ) {
1360	1222	a = [];
—	—	@@ -1477,9 +1339,10 @@
1478	1340	a:(as:generic_attribute+ space* pipe !pipe { return as } )?
1479	1341	//& { dp('past attrib, pos=' + pos); return true; }
1480	1342	// use inline_breaks to break on tr etc
1481		~~- td:(!inline_breaks~~
	1343	+ td:(
1482	1344	//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
1483		~~- b:block { return b })* {~~
	1345	+ b:nested_block { return b }
	1346	+ )* {
1484	1347	if ( a == '' ) {
1485	1348	a = [];
1486	1349	}
—	—	@@ -1516,6 +1379,226 @@
1517	1380	}
1518	1381
1519	1382
	1383	+
	1384	+/*******************************************************************
	1385	+ * Text variants and other general productions
	1386	+ *******************************************************************/
	1387	+
	1388	+/* All chars that cannot start syntactic structures in the middle of a line
	1389	+ * XXX: ] and other end delimiters should probably only be activated inside
	1390	+ * structures to avoid unnecessarily leaving the text production on plain
	1391	+ * content. */
	1392	+
	1393	+text_char = [^'<~[{\n\r:\]}\|!=]
	1394	+
	1395	+text = t:text_char+ { return t.join(''); }
	1396	+
	1397	+/* Legend
	1398	+ * ' quotes (italic/bold)
	1399	+ * < start of xmlish_tag
	1400	+ * ~ signatures/dates
	1401	+ * [ start of links
	1402	+ * { start of parser functions, transclusion and template args
	1403	+ * \n all sort of block-level markup at start of line
	1404	+ * \r ditto
	1405	+ * h http(s) urls
	1406	+ * n nntp(s) urls
	1407	+ * m mailto urls
	1408	+ *
	1409	+ * ! and \| table cell delimiters, might be better to specialize those
	1410	+ * = headings - also specialize those!
	1411	+ *
	1412	+ * The following chars are also included for now, but only apply in some
	1413	+ * contexts and should probably be enabled only in those:
	1414	+ * : separate definition in ; term : definition
	1415	+ * ] end of link
	1416	+ * } end of parser func/transclusion/template arg
	1417	+ */
	1418	+
	1419	+urltext = ( t:[^'<~[{\n\rfghimnstw\|!:\]} &=]+ { return t.join(''); }
	1420	+ / & url_chars urllink
	1421	+ / htmlentity
	1422	+ // Convert trailing space into &nbsp; (non-breaking space)
	1423	+ // XXX: This should be moved to a serializer
	1424	+ / ' ' & ':' { return "\u00a0"; }
	1425	+ / t:text_char )+
	1426	+
	1427	+/*
	1428	+ '//', // for protocol-relative URLs, but not in text!
	1429	+ 'ftp://',
	1430	+ 'git://',
	1431	+ 'gopher://',
	1432	+ 'http://',
	1433	+ 'https://',
	1434	+ 'irc://',
	1435	+ 'ircs://', // @bug 28503
	1436	+ 'mailto:',
	1437	+ 'mms://',
	1438	+ 'news:',
	1439	+ 'nntp://', // @bug 3808 RFC 1738
	1440	+ 'svn://',
	1441	+ 'telnet://', // Well if we're going to support the above.. -ævar
	1442	+ 'worldwind://',
	1443	+*/
	1444	+
	1445	+// Old version
	1446	+//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
	1447	+
	1448	+// Experimental tweaked version: avoid expensive single-char substrings
	1449	+// This did not bring the expected performance boost, however.
	1450	+//text = [A-Za-z0-9,._ -] {
	1451	+// textStart = pos;
	1452	+//
	1453	+// var res = input.substr(textStart - 1, inputLength)
	1454	+// .match(/[A-Za-z0-9,._ -]+/)[0];
	1455	+// pos = pos + (res.length - 1);
	1456	+// return res
	1457	+// }
	1458	+
	1459	+htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
	1460	+ return unentity("&" + c.join('') + ";")
	1461	+}
	1462	+
	1463	+space
	1464	+ = s:[ \t]+ { return s.join(''); }
	1465	+
	1466	+optionalSpaceToken
	1467	+ = s:space* {
	1468	+ if ( s.length ) {
	1469	+ return [s.join('')];
	1470	+ } else {
	1471	+ return [];
	1472	+ }
	1473	+ }
	1474	+
	1475	+
	1476	+// Start of line
	1477	+sol = nl:(newlineToken / & { return pos === 0; } { return [] })
	1478	+ // Eat multi-line comments, so that syntax after still matches as if it
	1479	+ // was actually preceded by a newline
	1480	+ cn:( c:comment n:newline? {
	1481	+ if ( n !== '' ) {
	1482	+ return [c, n];
	1483	+ } else {
	1484	+ return [c];
	1485	+ }
	1486	+ }
	1487	+ )*
	1488	+ // Eat includeonly/noinclude at start of line, so that start-of-line
	1489	+ // syntax after it still matches
	1490	+ ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
	1491	+ {
	1492	+ var niToken = [];
	1493	+ if ( ni !== '') {
	1494	+ if ( ni[0] === '/' ) {
	1495	+ niToken = [new EndTagTk( ni[1] )];
	1496	+ } else {
	1497	+ niToken = [new TagTk( ni[1] )];
	1498	+ }
	1499	+ }
	1500	+
	1501	+ return nl.concat(cn, niToken);
	1502	+ }
	1503	+
	1504	+eof = & { return isEOF(pos); } { return true; }
	1505	+
	1506	+
	1507	+newline
	1508	+ = '\n' / '\r\n'
	1509	+
	1510	+newlineToken = newline { return [new NlTk()] }
	1511	+
	1512	+eolf = newline / eof
	1513	+
	1514	+
	1515	+// 'Preprocessor' directive- higher-level things that can occur in otherwise
	1516	+// plain-text content.
	1517	+directive
	1518	+ = comment
	1519	+ / tplarg_or_template
	1520	+ / htmlentity
	1521	+
	1522	+// Plain text, but can contain templates, template arguments, comments etc-
	1523	+// all stuff that is normally handled by the preprocessor
	1524	+// Returns either a list of tokens, or a plain string (if nothing is to be
	1525	+// processed).
	1526	+preprocessor_text
	1527	+ = r:( t:[^<~[{\n\r\t\|!\]} &=]+ { return t.join(''); }
	1528	+ / directive
	1529	+ / !inline_breaks text_char )+ {
	1530	+ return flatten ( r );
	1531	+ }
	1532	+
	1533	+spaceless_preprocessor_text
	1534	+ = r:( t:[^'<~[{\n\r\|!\]}\t &=]+ { return t.join(''); }
	1535	+ / directive
	1536	+ / !inline_breaks !' ' text_char )+ {
	1537	+ return flatten_string ( r );
	1538	+ }
	1539	+
	1540	+
	1541	+wikilink_preprocessor_text
	1542	+ = r:( t:[^%<~[{\n\r\t\|!\]} &=]+ { return t.join(''); }
	1543	+ / urlencoded_char
	1544	+ / directive
	1545	+ / !inline_breaks !"]]" text_char )+ {
	1546	+ return flatten_stringlist ( r );
	1547	+ }
	1548	+
	1549	+extlink_preprocessor_text
	1550	+ = r:( t:[^'<~[{\n\r\|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
	1551	+ / directive
	1552	+ / urlencoded_char
	1553	+ / !inline_breaks no_punctuation_char
	1554	+ / s:[.:,] !(space / eolf) { return s }
	1555	+ / [&%] )+ {
	1556	+ return flatten_string ( r );
	1557	+ }
	1558	+
	1559	+// Attribute values with preprocessor support
	1560	+attribute_preprocessor_text
	1561	+ = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
	1562	+ / directive
	1563	+ / !inline_breaks [&%] )+ {
	1564	+ //console.warn('prep');
	1565	+ return flatten_string ( r );
	1566	+ }
	1567	+attribute_preprocessor_text_single
	1568	+ = r:( t:[^{&']+ { return t.join(''); }
	1569	+ / directive
	1570	+ / !inline_breaks [{&] )* {
	1571	+ return flatten_string ( r );
	1572	+ }
	1573	+attribute_preprocessor_text_double
	1574	+ = r:( t:[^{&"]+ { return t.join(''); }
	1575	+ / directive
	1576	+ / !inline_breaks [{&] )* {
	1577	+ //console.warn( 'double:' + pp(r) );
	1578	+ return flatten_string ( r );
	1579	+ }
	1580	+
	1581	+// Variants with the entire attribute on a single line
	1582	+attribute_preprocessor_text_line
	1583	+ = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
	1584	+ / directive
	1585	+ / !inline_breaks !'\n' [&%] )+ {
	1586	+ //console.warn('prep');
	1587	+ return flatten_string ( r );
	1588	+ }
	1589	+attribute_preprocessor_text_single_line
	1590	+ = r:( t:[^{&']+ { return t.join(''); }
	1591	+ / directive
	1592	+ / !inline_breaks !'\n' [{&] )* {
	1593	+ return flatten_string ( r );
	1594	+ }
	1595	+attribute_preprocessor_text_double_line
	1596	+ = r:( t:[^{&"]+ { return t.join(''); }
	1597	+ / directive
	1598	+ / !inline_breaks !'\n' [{&] )* {
	1599	+ //console.warn( 'double:' + pp(r) );
	1600	+ return flatten_string ( r );
	1601	+ }
	1602	+
// A pipe, either literal or spelled as the {{!}} pipe template.
pipe
  = "|"
  / "{{!}}"
1522	1605
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
—	—	@@ -54,7 +54,12 @@
55	55	// reasonable traces. Calling a trace on the extension does not really cut
56	56	// it.
57	57	//try {
58		~~- this.parser.parse(text, 'start', this.emit.bind( this, 'chunk' ));~~
	58	+ this.parser.parse(text, 'start',
	59	+ // callback
	60	+ this.emit.bind( this, 'chunk' ),
	61	+ // inline break test
	62	+ this
	63	+ );
59	64	// emit tokens here until we get that to work per toplevelblock in the
60	65	// actual tokenizer
61	66	//this.emit('chunk', out.concat( [{ type: 'END' }] ) );
—	—	@@ -67,6 +72,61 @@
68	73	//}
69	74	};
70	75
	76	+PegTokenizer.prototype.breakMap = {
	77	+ '=': function(input, pos, syntaxFlags) {
	78	+ return syntaxFlags.equal \|\|
	79	+ ( syntaxFlags.h &&
	80	+ input.substr( pos + 1, 200)
	81	+ .match(/[ \t]*[\r\n]/) !== null ) \|\| null;
	82	+ },
	83	+ '\|': function ( input, pos, syntaxFlags ) {
	84	+ return syntaxFlags.template \|\|
	85	+ ( syntaxFlags.table &&
	86	+ ( input[pos + 1].match(/[\|}]/) !== null \|\|
	87	+ syntaxFlags.tableCellArg
	88	+ )
	89	+ ) \|\| null;
	90	+ },
	91	+ "!": function ( input, pos, syntaxFlags ) {
	92	+ return syntaxFlags.table && input[pos + 1] === "!" \|\|
	93	+ null;
	94	+ },
	95	+ "}": function ( input, pos, syntaxFlags ) {
	96	+ return syntaxFlags.template && input[pos + 1] === "}" \|\| null;
	97	+ },
	98	+ ":": function ( input, pos, syntaxFlags ) {
	99	+ return syntaxFlags.colon &&
	100	+ ! syntaxFlags.extlink &&
	101	+ ! syntaxFlags.linkdesc \|\| null;
	102	+ },
	103	+ "\r": function ( input, pos, syntaxFlags ) {
	104	+ return syntaxFlags.table &&
	105	+ input[pos + 1] !== '!' &&
	106	+ input[pos + 1] !== '\|' \|\|
	107	+ null;
	108	+ },
	109	+ "\n": function ( input, pos, syntaxFlags ) {
	110	+ return syntaxFlags.table &&
	111	+ input[pos + 1] !== '!' &&
	112	+ input[pos + 1] !== '\|' \|\|
	113	+ null;
	114	+ },
	115	+ "]": function ( input, pos, syntaxFlags ) {
	116	+ return syntaxFlags.extlink \|\|
	117	+ ( syntaxFlags.linkdesc && input[pos + 1] === ']' ) \|\|
	118	+ null;
	119	+ },
	120	+ "<": function ( input, pos, syntaxFlags ) {
	121	+ return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' \|\| null;
	122	+ }
	123	+};
	124	+
	125	+PegTokenizer.prototype.inline_breaks = function (input, pos, syntaxFlags ) {
	126	+ var res = this.breakMap[ input[pos] ]( input, pos, syntaxFlags);
	127	+ console.warn( 'ilb res: ' + JSON.stringify( [ res, input.substr( pos, 4 ) ] ) );
	128	+ return res;
	129	+};
	130	+
71	131	/*****************************************************************************
72	132	* LEGACY stuff
73	133	*
—	—	@@ -173,6 +233,7 @@
174	234	}
175	235	};
176	236
	237	+
177	238	if (typeof module == "object") {
178	239	module.exports.PegTokenizer = PegTokenizer;
179	240	}

Status & tagging log

17:23, 21 February 2012 GWicke (talk | contribs) changed the status of r112026 [removed: new added: deferred]