r112026 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r112025‎ | r112026 | r112027 >
Date:17:21, 21 February 2012
Author:gwicke
Status:deferred
Tags:
Comment:
Tidy up and comment the tokenizer a bit more. Start to move code into
mediawiki.tokenizer.js module, and pass a reference to parse(). Faster
inline_breaks production using a JS function which seems to be generally
correct, but still breaks five tests when enabled. Seems to be some weird
interaction with peg.js, possibly something to do with caching.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
@@ -292,6 +292,10 @@
293293
294294 }
295295
 296+/*********************************************************
 297+ * The top-level production
 298+ *********************************************************/
 299+
296300 start
297301 = e:toplevelblock* newline* {
298302 // end is passed inline as a token, as well as a separate event for now.
@@ -310,241 +314,26 @@
311315 }
312316
313317
314 -/* All chars that cannot start syntactic structures in the middle of a line
315 - * XXX: ] and other end delimiters should probably only be activated inside
316 - * structures to avoid unnecessarily leaving the text production on plain
317 - * content. */
318 -
319 -text_char = [^'<~[{\n\r:\]}|!=]
320 -
321 -text = t:text_char+ { return t.join(''); }
322 -
323 -/* Legend
324 - * ' quotes (italic/bold)
325 - * < start of xmlish_tag
326 - * ~ signatures/dates
327 - * [ start of links
328 - * { start of parser functions, transclusion and template args
329 - * \n all sort of block-level markup at start of line
330 - * \r ditto
331 - * h http(s) urls
332 - * n nntp(s) urls
333 - * m mailto urls
334 - *
335 - * ! and | table cell delimiters, might be better to specialize those
336 - * = headings - also specialize those!
337 - *
338 - * The following chars are also included for now, but only apply in some
339 - * contexts and should probably be enabled only in those:
340 - * : separate definition in ; term : definition
341 - * ] end of link
342 - * } end of parser func/transclusion/template arg
 318+/*
 319+ * A document (start production) is a sequence of toplevelblocks. Tokens are
 320+ * emitted in chunks per toplevelblock to avoid buffering the full document.
343321 */
344 -
345 -urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
346 - / & url_chars urllink
347 - / htmlentity
348 - // Convert trailing space into &nbsp;
349 - // XXX: This should be moved to a serializer
350 - / ' ' & ':' { return "\u00a0"; }
351 - / t:text_char )+
352 -
353 -
354 -
355 -
356 -/*
357 - '//', // for protocol-relative URLs, but not in text!
358 - 'ftp://',
359 - 'git://',
360 - 'gopher://',
361 - 'http://',
362 - 'https://',
363 - 'irc://',
364 - 'ircs://', // @bug 28503
365 - 'mailto:',
366 - 'mms://',
367 - 'news:',
368 - 'nntp://', // @bug 3808 RFC 1738
369 - 'svn://',
370 - 'telnet://', // Well if we're going to support the above.. -ævar
371 - 'worldwind://',
372 -*/
373 -
374 -// Old version
375 -//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
376 -
377 -// Experimental tweaked version: avoid expensive single-char substrings
378 -// This did not bring the expected performance boost, however.
379 -//text = [A-Za-z0-9,._ -] {
380 -// textStart = pos;
381 -//
382 -// var res = input.substr(textStart - 1, inputLength)
383 -// .match(/[A-Za-z0-9,._ -]+/)[0];
384 -// pos = pos + (res.length - 1);
385 -// return res
386 -// }
387 -
388 -htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
389 - return unentity("&" + c.join('') + ";")
390 -}
391 -
392 -space
393 - = s:[ \t]+ { return s.join(''); }
394 -
395 -optionalSpaceToken
396 - = s:space* {
397 - if ( s.length ) {
398 - return [s.join('')];
399 - } else {
400 - return [];
401 - }
402 - }
403 -
404 -
405 -// Start of line
406 -sol = nl:(newlineToken / & { return pos === 0; } { return [] })
407 - // Eat multi-line comments, so that syntax after still matches as if it
408 - // was actually preceded by a newline
409 - cn:( c:comment n:newline? {
410 - if ( n !== '' ) {
411 - return [c, n];
412 - } else {
413 - return [c];
414 - }
415 - }
416 - )*
417 - // Eat includeonly/noinclude at start of line, so that start-of-line
418 - // syntax after it still matches
419 - ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
420 - {
421 - var niToken = [];
422 - if ( ni !== '') {
423 - if ( ni[0] === '/' ) {
424 - niToken = [new EndTagTk( ni[1] )];
425 - } else {
426 - niToken = [new TagTk( ni[1] )];
427 - }
428 - }
429 -
430 - return nl.concat(cn, niToken);
431 - }
432 -
433 -eof = & { return isEOF(pos); } { return true; }
434 -
435 -
436 -newline
437 - = '\n' / '\r\n'
438 -
439 -newlineToken = newline { return [new NlTk()] }
440 -
441 -eolf = newline / eof
442 -
443 -
444 -// 'Preprocessor' directive- higher-level things that can occur in otherwise
445 -// plain-text content.
446 -directive
447 - = comment
448 - / tplarg_or_template
449 - / htmlentity
450 -
451 -// Plain text, but can contain templates, template arguments, comments etc-
452 -// all stuff that is normally handled by the preprocessor
453 -// Returns either a list of tokens, or a plain string (if nothing is to be
454 -// processed).
455 -preprocessor_text
456 - = r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
457 - / directive
458 - / !inline_breaks text_char )+ {
459 - return flatten ( r );
460 - }
461 -
462 -spaceless_preprocessor_text
463 - = r:( t:[^'<~[{\n\r|!\]}\t &=]+ { return t.join(''); }
464 - / directive
465 - / !inline_breaks !' ' text_char )+ {
466 - return flatten_string ( r );
467 - }
468 -
469 -
470 -wikilink_preprocessor_text
471 - = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
472 - / urlencoded_char
473 - / directive
474 - / !inline_breaks !"]]" text_char )+ {
475 - return flatten_stringlist ( r );
476 - }
477 -
478 -extlink_preprocessor_text
479 - = r:( t:[^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
480 - / directive
481 - / urlencoded_char
482 - / !inline_breaks no_punctuation_char
483 - / s:[.:,] !(space / eolf) { return s }
484 - / [&%] )+ {
485 - return flatten_string ( r );
486 - }
487 -
488 -// Attribute values with preprocessor support
489 -attribute_preprocessor_text
490 - = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
491 - / directive
492 - / !inline_breaks [&%] )+ {
493 - //console.warn('prep');
494 - return flatten_string ( r );
495 - }
496 -attribute_preprocessor_text_single
497 - = r:( t:[^{&']+ { return t.join(''); }
498 - / directive
499 - / !inline_breaks [{&] )* {
500 - return flatten_string ( r );
501 - }
502 -attribute_preprocessor_text_double
503 - = r:( t:[^{&"]+ { return t.join(''); }
504 - / directive
505 - / !inline_breaks [{&] )* {
506 - //console.warn( 'double:' + pp(r) );
507 - return flatten_string ( r );
508 - }
509 -
510 -// Variants with the entire attribute on a single line
511 -attribute_preprocessor_text_line
512 - = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
513 - / directive
514 - / !inline_breaks !'\n' [&%] )+ {
515 - //console.warn('prep');
516 - return flatten_string ( r );
517 - }
518 -attribute_preprocessor_text_single_line
519 - = r:( t:[^{&']+ { return t.join(''); }
520 - / directive
521 - / !inline_breaks !'\n' [{&] )* {
522 - return flatten_string ( r );
523 - }
524 -attribute_preprocessor_text_double_line
525 - = r:( t:[^{&"]+ { return t.join(''); }
526 - / directive
527 - / !inline_breaks !'\n' [{&] )* {
528 - //console.warn( 'double:' + pp(r) );
529 - return flatten_string ( r );
530 - }
531 -
532 -// A document (start production) is a sequence of toplevelblocks. Tokens are
533 -// emitted in chunks per toplevelblock to avoid buffering the full document.
534322 toplevelblock
535323 = & { blockStart = pos; return true; } b:block {
536324 b = flatten(b);
 325+
 326+ // Add source offsets for round-tripping. XXX: Add these not just for
 327+ // toplevelblocks!
537328 if ( b.length ) {
538329 var bs = b[0];
539330 if ( bs.constructor === String && bs.attribs === undefined ) {
540331 b[0] = new String( bs );
541332 bs = b[0];
542333 }
543 - //dp('toplevelblock:' + pp(b));
544334 if (bs.dataAttribs === undefined) {
545335 bs.dataAttribs = {};
546336 }
547337 bs.dataAttribs.sourcePos = [blockStart, pos];
548 - //console.warn( 'toplevelblock: ' + pp( bs ));
549338 }
550339
551340 // Emit tokens for this toplevelblock. This feeds a chunk to the parser
@@ -556,24 +345,32 @@
557346 return true;
558347 }
559348
 349+/*
 350+ * The actual contents of each block.
 351+ */
560352 block
561 - = !inline_breaks
562 - r:( block_lines
563 - / pre
564 - / comment &eolf
565 - / nowiki
566 - / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
567 - / para
568 - / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
569 - / s:sol /*{
570 - if (s) {
571 - return [s, {type: 'NEWLINE'}];
572 - } else {
573 - return [{type: 'NEWLINE'}];
574 - }
575 - }*/
576 - ) { return r }
 353+ = block_lines
 354+ / & '<' r:( pre // tag variant can start anywhere
 355+ / comment &eolf
 356+ / nowiki
 357+ // avoid a paragraph if we know that the line starts with a block tag
 358+ / bt:block_tag { return [bt] }
 359+ ) { return r; }
 360+ / para
 361+ // Inlineline includes generic tags; wrapped into paragraphs in token
 362+ // transform and DOM postprocessor
 363+ / inlineline
 364+ / sol
577365
 366+/*
 367+ * A block nested in other constructs. Avoid eating end delimiters for other
 368+ * constructs by checking against inline_breaks first.
 369+ */
 370+nested_block = !inline_breaks b:block { return b }
 371+
 372+/*
 373+ * Line-based block constructs.
 374+ */
578375 block_lines
579376 = s:sol
580377 // eat an empty line before the block
@@ -583,10 +380,11 @@
584381 return s.concat(s2_, bl);
585382 }
586383
587 -// Block structures with start-of-line wiki syntax
 384+/*
 385+ * Block structures with start-of-line wiki syntax
 386+ */
588387 block_line
589388 = h
590 - /// table
591389 / & [{}|] tl:table_lines { return tl; }
592390 / lists
593391 // tag-only lines should not trigger pre
@@ -599,9 +397,11 @@
600398 / pre
601399
602400
603 -// A paragraph. We don't emit 'p' tokens to avoid issues with template
604 -// transclusions, <p> tags in the source and the like. Instead, we perform
605 -// some paragraph wrapping on the DOM.
 401+/*
 402+ * A paragraph. We don't emit 'p' tokens to avoid issues with template
 403+ * transclusions, <p> tags in the source and the like. Instead, we perform
 404+ * some paragraph wrapping on the DOM.
 405+ */
606406 para
607407 = s1:sol s2:sol c:inlineline {
608408 return s1.concat(s2, /* [new TagTk('p')],*/ c);
@@ -609,37 +409,60 @@
610410
611411 br = space* &newline { return new SelfclosingTagTk( 'br' ) }
612412
613 -// Syntax stops to limit inline expansion defending on syntactic context
 413+/*
 414+ * Syntax stops: Avoid eating significant tokens for higher-level productions
 415+ * in nested inline productions.
 416+ *
 417+ * XXX: Repeated testing of flags is not terribly efficient.
 418+ */
614419 inline_breaks
615 - =
 420+ = & [=|!}:\r\n\]<] // don't check further if char cannot match
 421+ res:(
 422+ & { // Important hack: disable caching for this production, as the default
 423+ // cache key does not take into account flag states!
 424+ cacheKey = '';
 425+ console.warn('ilb: ' + input.substr(pos, 5) );
 426+ return true;
 427+ }
 428+
 429+ & { return syntaxFlags['table']; }
 430+ ( a:(newline [!|] / '||' / '!!' / '|}') {
 431+ //console.warn("table break" + pp(a) + pos);
 432+ return true;
 433+ }
 434+ / & { return syntaxFlags['tableCellArg'] }
 435+ "|" { return true }
 436+ )
 437+ / & { return (syntaxFlags['colon'] &&
 438+ ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
 439+ ! syntaxFlags.linkdesc); } ":" { return true; }
 440+ / & { return syntaxFlags['extlink']; } "]" { return true; }
 441+ / & { return syntaxFlags['linkdesc']; } link_end { return true; }
 442+ / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
 443+ / & { return syntaxFlags['template']; } ('|' / '}}' ) {
 444+ //console.warn( 'template break @' + pos + input.substr(pos-1, 4) );
 445+ return true;
 446+ }
 447+ / & { return syntaxFlags['equal']; } '=' {
 448+ //console.warn( 'equal stop @' + pos + input.substr(pos-1, 4) );
 449+ return true;
 450+ }
 451+ / & { return syntaxFlags['pre']; } '</pre>' {
 452+ //console.warn( 'pre stop @' + pos + input.substr(pos-1, 4) );
 453+ return true;
 454+ }
 455+ ) { return res }
 456+
 457+inline_breaks_experiment
 458+ = & [=|!}:\r\n\]<]
616459 & { // Important hack: disable caching for this production, as the default
617460 // cache key does not take into account flag states!
618461 cacheKey = '';
 462+ //console.warn('ilbf: ' + input.substr(pos, 5) );
619463 return true;
620 - }
621 - & { return syntaxFlags['table']; }
622 - ( a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
623 - / & { return syntaxFlags['tableCellArg'] }
624 - "|" { return true }
625 - )
626 - / & { return (syntaxFlags['colon'] &&
627 - ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
628 - ! syntaxFlags.linkdesc); } ":" { return true; }
629 - / & { return syntaxFlags['extlink']; } "]" { return true; }
630 - / & { return syntaxFlags['linkdesc']; } link_end { return true; }
631 - / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
632 - / & { return syntaxFlags['template']; } ('|' / '}}' ) {
633 - //console.warn( 'template break @' + pos + input.substr(pos-1, 4) );
634 - return true;
635 - }
636 - / & { return syntaxFlags['equal']; } '=' {
637 - //console.warn( 'equal stop @' + pos + input.substr(pos-1, 4) );
638 - return true;
639 - }
640 - / & { return syntaxFlags['pre']; } '</pre>' {
641 - //console.warn( 'pre stop @' + pos + input.substr(pos-1, 4) );
642 - return true;
643 - }
 464+ }
 465+ .
 466+ { return __parseArgs[3].inline_breaks( input, pos - 1, syntaxFlags ) && true || null ; }
644467
645468 inline
646469 = c:(urltext / (! inline_breaks (inline_element / . )))+ {
@@ -703,38 +526,6 @@
704527 / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
705528 ) { return r }
706529
707 -
708 -pre_indent
709 - = pre_indent_in_tags
710 - / l:pre_indent_line ls:(sol pre_indent_line)* {
711 - return [new TagTk( 'pre' )]
712 - .concat( [l], ls
713 - , [new EndTagTk( 'pre' )]);
714 - }
715 -
716 -// An indented pre block that is surrounded with pre tags. The pre tags are
717 -// used directly.
718 -pre_indent_in_tags
719 - = space+ // XXX: capture space for round-tripping
720 - "<pre"
721 - attribs:generic_attribute*
722 - ">"
723 - & { return setFlag('pre'); }
724 - l:inlineline
725 - ls:(sol pre_indent_line)*
726 - "</pre>"
727 - {
728 - clearFlag('pre');
729 - return [ new TagTk( 'pre', attribs ) ]
730 - .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
731 - }
732 - / & { return clearFlag('pre'); }
733 -
734 -pre_indent_line = space l:inlineline {
735 - return [ '\n' ].concat(l);
736 -}
737 -
738 -
739530 comment
740531 = '<!--' c:comment_chars* ('-->' / eof)
741532 cs:(space* newline space* cn:comment { return cn })* {
@@ -746,6 +537,11 @@
747538 / c:'-' !'->' { return c; }
748539
749540
 541+
 542+/**************************************************************
 543+ * External (bracketed and autolinked) links
 544+ **************************************************************/
 545+
750546 urllink
751547 = ! { return syntaxFlags['extlink'] }
752548 target:url {
@@ -851,6 +647,11 @@
852648 return flatten( a ).join('');
853649 }
854650
 651+
 652+/**************************************************************
 653+ * Templates, -arguments and wikilinks
 654+ **************************************************************/
 655+
855656 tplarg_or_template = & '{{{{{' template / tplarg / template
856657
857658 template
@@ -992,17 +793,49 @@
993794 return res;
994795 }
995796
996 -/* XXX: Extension tags can require a change in the tokenizer mode, which
997 - * returns any text between extension tags verbatim. For now, we simply
998 - * continue to parse the contained text and return the tokens. The original
999 - * input source can be recovered from the source positions added on tag
1000 - * tokens. This won't however work in all cases. For example, a comment start
1001 - * (<!--) between extension tags would cause the remaining text to be consumed
1002 - * as a comment. To avoid this, we might need to look ahead for the end tag
1003 - * and limit the content parsing to this section. */
1004797
1005 -xmlish_tag = nowiki / generic_tag
1006798
 799+/***********************************************************
 800+ * Pre and xmlish tags
 801+ ***********************************************************/
 802+
 803+// Indented pre blocks differ from their non-indented (purely tag-based)
 804+// cousins by having their contents parsed.
 805+pre_indent
 806+ = pre_indent_in_tags
 807+ / l:pre_indent_line ls:(sol pre_indent_line)* {
 808+ return [new TagTk( 'pre' )]
 809+ .concat( [l], ls
 810+ , [new EndTagTk( 'pre' )]);
 811+ }
 812+
 813+// An indented pre block that is surrounded with pre tags. The pre tags are
 814+// used directly.
 815+pre_indent_in_tags
 816+ = space+ // XXX: capture space for round-tripping
 817+ "<pre"
 818+ attribs:generic_attribute*
 819+ ">"
 820+ & { return setFlag('pre'); }
 821+ l:inlineline
 822+ ls:(sol pre_indent_line)*
 823+ "</pre>"
 824+ {
 825+ clearFlag('pre');
 826+ return [ new TagTk( 'pre', attribs ) ]
 827+ .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
 828+ }
 829+ / & { return clearFlag('pre'); }
 830+
 831+pre_indent_line = space l:inlineline {
 832+ return [ '\n' ].concat(l);
 833+}
 834+
 835+/*
 836+ * Pre blocks defined using non-indented HTML tags only parse nowiki tags
 837+ * inside them, and convert other content to verbatim text. Nowiki inside pre
 838+ * is not functionally needed, but supported for backwards compatibility.
 839+ */
1007840 pre
1008841 = "<pre"
1009842 attribs:generic_attribute*
@@ -1020,6 +853,24 @@
1021854 }
1022855 / "</pre>" { return "</pre>"; }
1023856
 857+/* XXX: Extension tags can require a change in the tokenizer mode, which
 858+ * returns any text between extension tags verbatim. For now, we simply
 859+ * continue to parse the contained text and return the tokens. The original
 860+ * input source can be recovered from the source positions added on tag
 861+ * tokens. This won't however work in all cases. For example, a comment start
 862+ * (<!--) between extension tags would cause the remaining text to be consumed
 863+ * as a comment. To avoid this, we might need to look ahead for the end tag
 864+ * and limit the content parsing to this section. */
 865+
 866+xmlish_tag = nowiki / generic_tag
 867+
 868+/*
 869+ * Nowiki treats anything inside it as plain text. It could thus also be
 870+ * defined as an extension that returns its raw input text, possibly wrapped
 871+ * in a span for round-trip information. The special treatment for nowiki in
 872+ * pre blocks would still remain in the grammar though, so overall handling it
 873+ * all here is cleaner.
 874+ */
1024875 nowiki
1025876 = "<nowiki>" nc:nowiki_content "</nowiki>" {
1026877 //console.warn( 'full nowiki return: ' + pp(nc));
@@ -1050,27 +901,6 @@
1051902 return [ts.join('')];
1052903 }
1053904
1054 -// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
1055 -// following paragraphs
1056 -block_tag
1057 - = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
1058 - attribs:generic_attribute*
1059 - selfclose:"/"?
1060 - ">" {
1061 - if (block_names[name.toLowerCase()] !== true) {
1062 - // abort match if tag is not block-level
1063 - return null;
1064 - }
1065 - var res;
1066 - if ( end != '' ) {
1067 - res = new EndTagTk( name, attribs );
1068 - } else if ( selfclose != '' ) {
1069 - res = new SelfclosingTagTk( name, attribs );
1070 - } else {
1071 - res = new TagTk( name, attribs );
1072 - }
1073 - return [res];
1074 - }
1075905
1076906 // The list of HTML5 tags, mainly used for the identification of non-html
1077907 // tags. These terminate otherwise tag-eating productions (see list below) in
@@ -1139,6 +969,7 @@
1140970 return res;
1141971 }
1142972
 973+// A generic attribute that can span multiple lines.
1143974 generic_newline_attribute
1144975 = s:( space / newline )*
1145976 name:generic_attribute_name
@@ -1152,6 +983,7 @@
1153984 }
1154985 }
1155986
 987+// A single-line attribute.
1156988 generic_attribute
1157989 = s:space*
1158990 name:generic_attribute_name
@@ -1168,12 +1000,13 @@
11691001 }
11701002 }
11711003
1172 -// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
1173 -// disallow newlines, | and {.
1174 -generic_attribute_plain_name
1175 - = n:[^ \t\0/"'>=\n|{]+ {
1176 - return n.join('');
1177 - }
 1004+// ( Replaced by generic_attribute_name for template / parameter support. )
 1005+//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
 1006+//// disallow newlines, | and {.
 1007+//generic_attribute_plain_name
 1008+// = n:[^ \t\0/"'>=\n|{]+ {
 1009+// return n.join('');
 1010+// }
11781011
11791012 generic_attribute_name
11801013 = & { return setFlag( 'equal' ) }
@@ -1186,34 +1019,61 @@
11871020 }
11881021 / & { return clearFlag( 'equal' ) }
11891022
 1023+// A generic attribute, possibly spanning multiple lines.
11901024 generic_attribute_newline_value
11911025 = "=" (space / newline )* v:xml_att_value {
11921026 return v;
11931027 }
 1028+// A generic but single-line attribute.
11941029 generic_attribute_value
11951030 = "=" space* v:att_value {
11961031 return v;
11971032 }
11981033
1199 -// XXX: attributes can contain templates and template args!!
 1034+// Attribute value, quoted variants can span multiple lines.
12001035 xml_att_value
12011036 = "'" t:attribute_preprocessor_text_single "'" { return t; }
12021037 / '"' t:attribute_preprocessor_text_double '"' { return t; }
12031038 / attribute_preprocessor_text
12041039
1205 -// XXX: attributes can contain templates and template args!!
 1040+// Attribute value, restricted to a single line.
12061041 att_value
12071042 = "'" t:attribute_preprocessor_text_single_line "'" { return t; }
12081043 / '"' t:attribute_preprocessor_text_double_line '"' { return t; }
12091044 / attribute_preprocessor_text_line
1210 -// = t:(!inline_breaks c:[^ \t'"<>='\n] { return c } )+ {
1211 -// return t.join('');
1212 -// }
1213 -// // XXX: is "\"" also valid html? or just Wikitext?
1214 -// / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
1215 -// / '"' t:[^">]* '"' { return unquote('"', t.join('')); }
12161045
1217 -/* Lists */
 1046+/*
 1047+ * A variant of generic_tag, but also checks if the tag name is a block-level
 1048+ * tag as defined in
 1049+ * http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
 1050+ * paragraphs.
 1051+ */
 1052+block_tag
 1053+ = "<" end:"/"?
 1054+ name:(cs:[a-zA-Z]+ { return cs.join('') })
 1055+ attribs:generic_newline_attribute*
 1056+ ( space / newline ) *
 1057+ selfclose:"/"?
 1058+ ">" {
 1059+ if (block_names[name.toLowerCase()] !== true) {
 1060+ // abort match if tag is not block-level
 1061+ return null;
 1062+ }
 1063+ var res;
 1064+ if ( end != '' ) {
 1065+ res = new EndTagTk( name, attribs );
 1066+ } else if ( selfclose != '' ) {
 1067+ res = new SelfclosingTagTk( name, attribs );
 1068+ } else {
 1069+ res = new TagTk( name, attribs );
 1070+ }
 1071+ return [res];
 1072+ }
 1073+
 1074+
 1075+/*********************************************************
 1076+ * Lists
 1077+ *********************************************************/
12181078 lists = e:(dtdd / li) es:(sol (dtdd / li))*
12191079 {
12201080 return annotateList( [ new TagTk( 'list' ) ]
@@ -1265,7 +1125,9 @@
12661126
12671127 list_char = [*#:;]
12681128
1269 -/**
 1129+
 1130+
 1131+/*********************************************************************
12701132 * Tables
12711133 *
12721134 * Table productions are geared to support independent parsing of fragments in
@@ -1276,7 +1138,7 @@
12771139 *
12781140 * The separate table_lines production is faster than moving those productions
12791141 * directly to block_lines.
1280 - * */
 1142+ *********************************************************************/
12811143
12821144 table_lines
12831145 = & { return setFlag('table'); }
@@ -1351,9 +1213,9 @@
13521214 a:table_cell_args?
13531215 //& { console.warn("past attrib, pos=" + pos + input.substr(pos,10)); return true; }
13541216 // use inline_breaks to break on tr etc
1355 - td:( !inline_breaks
1356 - //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
1357 - b:block { return b } )*
 1217+ td:( //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
 1218+ b:nested_block { return b }
 1219+ )*
13581220 {
13591221 if ( a == '' ) {
13601222 a = [];
@@ -1477,9 +1339,10 @@
14781340 a:(as:generic_attribute+ space* pipe !pipe { return as } )?
14791341 //& { dp('past attrib, pos=' + pos); return true; }
14801342 // use inline_breaks to break on tr etc
1481 - td:(!inline_breaks
 1343+ td:(
14821344 //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
1483 - b:block { return b })* {
 1345+ b:nested_block { return b }
 1346+ )* {
14841347 if ( a == '' ) {
14851348 a = [];
14861349 }
@@ -1516,6 +1379,226 @@
15171380 }
15181381
15191382
 1383+
 1384+/*******************************************************************
 1385+ * Text variants and other general productions
 1386+ *******************************************************************/
 1387+
 1388+/* All chars that cannot start syntactic structures in the middle of a line
 1389+ * XXX: ] and other end delimiters should probably only be activated inside
 1390+ * structures to avoid unnecessarily leaving the text production on plain
 1391+ * content. */
 1392+
 1393+text_char = [^'<~[{\n\r:\]}|!=]
 1394+
 1395+text = t:text_char+ { return t.join(''); }
 1396+
 1397+/* Legend
 1398+ * ' quotes (italic/bold)
 1399+ * < start of xmlish_tag
 1400+ * ~ signatures/dates
 1401+ * [ start of links
 1402+ * { start of parser functions, transclusion and template args
 1403+ * \n all sorts of block-level markup at start of line
 1404+ * \r ditto
 1405+ * h http(s) urls
 1406+ * n nntp(s) urls
 1407+ * m mailto urls
 1408+ *
 1409+ * ! and | table cell delimiters, might be better to specialize those
 1410+ * = headings - also specialize those!
 1411+ *
 1412+ * The following chars are also included for now, but only apply in some
 1413+ * contexts and should probably be enabled only in those:
 1414+ * : separate definition in ; term : definition
 1415+ * ] end of link
 1416+ * } end of parser func/transclusion/template arg
 1417+ */
 1418+
 1419+urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
 1420+ / & url_chars urllink
 1421+ / htmlentity
 1422+ // Convert trailing space into &nbsp;
 1423+ // XXX: This should be moved to a serializer
 1424+ / ' ' & ':' { return "\u00a0"; }
 1425+ / t:text_char )+
 1426+
 1427+/*
 1428+ '//', // for protocol-relative URLs, but not in text!
 1429+ 'ftp://',
 1430+ 'git://',
 1431+ 'gopher://',
 1432+ 'http://',
 1433+ 'https://',
 1434+ 'irc://',
 1435+ 'ircs://', // @bug 28503
 1436+ 'mailto:',
 1437+ 'mms://',
 1438+ 'news:',
 1439+ 'nntp://', // @bug 3808 RFC 1738
 1440+ 'svn://',
 1441+ 'telnet://', // Well if we're going to support the above.. -ævar
 1442+ 'worldwind://',
 1443+*/
 1444+
 1445+// Old version
 1446+//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
 1447+
 1448+// Experimental tweaked version: avoid expensive single-char substrings
 1449+// This did not bring the expected performance boost, however.
 1450+//text = [A-Za-z0-9,._ -] {
 1451+// textStart = pos;
 1452+//
 1453+// var res = input.substr(textStart - 1, inputLength)
 1454+// .match(/[A-Za-z0-9,._ -]+/)[0];
 1455+// pos = pos + (res.length - 1);
 1456+// return res
 1457+// }
 1458+
 1459+htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
 1460+ return unentity("&" + c.join('') + ";")
 1461+}
 1462+
 1463+space
 1464+ = s:[ \t]+ { return s.join(''); }
 1465+
 1466+optionalSpaceToken
 1467+ = s:space* {
 1468+ if ( s.length ) {
 1469+ return [s.join('')];
 1470+ } else {
 1471+ return [];
 1472+ }
 1473+ }
 1474+
 1475+
 1476+// Start of line
 1477+sol = nl:(newlineToken / & { return pos === 0; } { return [] })
 1478+ // Eat multi-line comments, so that syntax after still matches as if it
 1479+ // was actually preceded by a newline
 1480+ cn:( c:comment n:newline? {
 1481+ if ( n !== '' ) {
 1482+ return [c, n];
 1483+ } else {
 1484+ return [c];
 1485+ }
 1486+ }
 1487+ )*
 1488+ // Eat includeonly/noinclude at start of line, so that start-of-line
 1489+ // syntax after it still matches
 1490+ ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
 1491+ {
 1492+ var niToken = [];
 1493+ if ( ni !== '') {
 1494+ if ( ni[0] === '/' ) {
 1495+ niToken = [new EndTagTk( ni[1] )];
 1496+ } else {
 1497+ niToken = [new TagTk( ni[1] )];
 1498+ }
 1499+ }
 1500+
 1501+ return nl.concat(cn, niToken);
 1502+ }
 1503+
 1504+eof = & { return isEOF(pos); } { return true; }
 1505+
 1506+
 1507+newline
 1508+ = '\n' / '\r\n'
 1509+
 1510+newlineToken = newline { return [new NlTk()] }
 1511+
 1512+eolf = newline / eof
 1513+
 1514+
 1515+// 'Preprocessor' directive- higher-level things that can occur in otherwise
 1516+// plain-text content.
 1517+directive
 1518+ = comment
 1519+ / tplarg_or_template
 1520+ / htmlentity
 1521+
 1522+// Plain text, but can contain templates, template arguments, comments etc-
 1523+// all stuff that is normally handled by the preprocessor
 1524+// Returns either a list of tokens, or a plain string (if nothing is to be
 1525+// processed).
 1526+preprocessor_text
 1527+ = r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
 1528+ / directive
 1529+ / !inline_breaks text_char )+ {
 1530+ return flatten ( r );
 1531+ }
 1532+
 1533+spaceless_preprocessor_text
 1534+ = r:( t:[^'<~[{\n\r|!\]}\t &=]+ { return t.join(''); }
 1535+ / directive
 1536+ / !inline_breaks !' ' text_char )+ {
 1537+ return flatten_string ( r );
 1538+ }
 1539+
 1540+
 1541+wikilink_preprocessor_text
 1542+ = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
 1543+ / urlencoded_char
 1544+ / directive
 1545+ / !inline_breaks !"]]" text_char )+ {
 1546+ return flatten_stringlist ( r );
 1547+ }
 1548+
 1549+extlink_preprocessor_text
 1550+ = r:( t:[^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
 1551+ / directive
 1552+ / urlencoded_char
 1553+ / !inline_breaks no_punctuation_char
 1554+ / s:[.:,] !(space / eolf) { return s }
 1555+ / [&%] )+ {
 1556+ return flatten_string ( r );
 1557+ }
 1558+
 1559+// Attribute values with preprocessor support
 1560+attribute_preprocessor_text
 1561+ = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
 1562+ / directive
 1563+ / !inline_breaks [&%] )+ {
 1564+ //console.warn('prep');
 1565+ return flatten_string ( r );
 1566+ }
 1567+attribute_preprocessor_text_single
 1568+ = r:( t:[^{&']+ { return t.join(''); }
 1569+ / directive
 1570+ / !inline_breaks [{&] )* {
 1571+ return flatten_string ( r );
 1572+ }
 1573+attribute_preprocessor_text_double
 1574+ = r:( t:[^{&"]+ { return t.join(''); }
 1575+ / directive
 1576+ / !inline_breaks [{&] )* {
 1577+ //console.warn( 'double:' + pp(r) );
 1578+ return flatten_string ( r );
 1579+ }
 1580+
 1581+// Variants with the entire attribute on a single line
 1582+attribute_preprocessor_text_line
 1583+ = r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
 1584+ / directive
 1585+ / !inline_breaks !'\n' [&%] )+ {
 1586+ //console.warn('prep');
 1587+ return flatten_string ( r );
 1588+ }
 1589+attribute_preprocessor_text_single_line
 1590+ = r:( t:[^{&']+ { return t.join(''); }
 1591+ / directive
 1592+ / !inline_breaks !'\n' [{&] )* {
 1593+ return flatten_string ( r );
 1594+ }
 1595+attribute_preprocessor_text_double_line
 1596+ = r:( t:[^{&"]+ { return t.join(''); }
 1597+ / directive
 1598+ / !inline_breaks !'\n' [{&] )* {
 1599+ //console.warn( 'double:' + pp(r) );
 1600+ return flatten_string ( r );
 1601+ }
 1602+
15201603 // Special-case support for those pipe templates
15211604 pipe = "|" / "{{!}}"
15221605
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
@@ -54,7 +54,12 @@
5555 // reasonable traces. Calling a trace on the extension does not really cut
5656 // it.
5757 //try {
58 - this.parser.parse(text, 'start', this.emit.bind( this, 'chunk' ));
 58+ this.parser.parse(text, 'start',
 59+ // callback
 60+ this.emit.bind( this, 'chunk' ),
 61+ // tokenizer instance; the grammar's inline_breaks_experiment production calls inline_breaks() on it
 62+ this
 63+ );
5964 // emit tokens here until we get that to work per toplevelblock in the
6065 // actual tokenizer
6166 //this.emit('chunk', out.concat( [{ type: 'END' }] ) );
@@ -67,6 +72,61 @@
6873 //}
6974 };
7075
/**
 * Per-character inline break tests, keyed by the character found at `pos`.
 *
 * This is a plain-JavaScript counterpart of the `inline_breaks` production in
 * pegTokenizer.pegjs.txt. Each handler takes ( input, pos, syntaxFlags ) and
 * returns a truthy value if the text at `pos` terminates the current inline
 * context under the given syntax flags, and null otherwise.
 *
 * Differences from the first version of this map, all made to match the
 * grammar:
 *  - newline handlers break when the NEXT line starts with '!' or '|'
 *    (grammar: newline [!|]); the test used to be inverted.
 *  - "\r" only counts as part of a "\r\n" newline and looks past the "\n".
 *  - the heading test is anchored: '='+ space* newline must follow directly.
 *  - no handler throws when `pos` is the last character of the input.
 */
PegTokenizer.prototype.breakMap = ( function () {
	// True if c can start a table row/cell line ('!' or '|').
	function startsTableLine( c ) {
		return c === '!' || c === '|';
	}

	// True if input at index i matches: '='* [ \t]* newline, where newline
	// is "\n" or "\r\n". Scans in place to avoid copying substrings.
	function endsHeading( input, i ) {
		while ( input.charAt( i ) === '=' ) {
			i++;
		}
		while ( input.charAt( i ) === ' ' || input.charAt( i ) === '\t' ) {
			i++;
		}
		return input.charAt( i ) === '\n' ||
			( input.charAt( i ) === '\r' && input.charAt( i + 1 ) === '\n' );
	}

	return {
		// Attribute name/value separator, or the closing '=' run of a heading.
		'=': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.equal ||
				( syntaxFlags.h && endsHeading( input, pos + 1 ) ) ||
				null;
		},
		// Template argument separator, or table delimiters '||' / '|}'
		// (a single '|' while parsing table cell arguments).
		'|': function ( input, pos, syntaxFlags ) {
			// charAt() yields '' past the end of input instead of undefined
			var next = input.charAt( pos + 1 );
			return syntaxFlags.template ||
				( syntaxFlags.table &&
					( next === '|' || next === '}' ||
						syntaxFlags.tableCellArg )
				) || null;
		},
		// Table header cell separator '!!'.
		'!': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.table && input.charAt( pos + 1 ) === '!' ||
				null;
		},
		// Template end '}}'.
		'}': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.template && input.charAt( pos + 1 ) === '}' ||
				null;
		},
		// Definition separator in '; term : definition', but not inside
		// links (example: ; [[Link:Term]] : Definition).
		':': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.colon &&
				! syntaxFlags.extlink &&
				! syntaxFlags.linkdesc || null;
		},
		// "\r\n" followed by a table line start.
		'\r': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.table &&
				input.charAt( pos + 1 ) === '\n' &&
				startsTableLine( input.charAt( pos + 2 ) ) ||
				null;
		},
		// "\n" followed by a table line start.
		'\n': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.table &&
				startsTableLine( input.charAt( pos + 1 ) ) ||
				null;
		},
		// End of an external link, or ']]' ending a link description.
		']': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.extlink ||
				( syntaxFlags.linkdesc && input.charAt( pos + 1 ) === ']' ) ||
				null;
		},
		// Closing tag of a pre block.
		'<': function ( input, pos, syntaxFlags ) {
			return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' ||
				null;
		}
	};
}() );
 124+
/**
 * Test whether the character at `pos` ends the current inline context.
 *
 * Looks up the handler for input[pos] in this.breakMap and delegates to it.
 *
 * @param {string} input The full input being tokenized.
 * @param {number} pos Index of the character to test.
 * @param {Object} syntaxFlags Flag states passed through to the handler.
 * @returns {*} The handler's result (truthy for a break), or null if there
 *   is no handler for the character or `pos` is outside the input.
 */
PegTokenizer.prototype.inline_breaks = function ( input, pos, syntaxFlags ) {
	// charAt() returns '' out of range, which has no breakMap entry.
	var handler = this.breakMap[ input.charAt( pos ) ];
	if ( typeof handler !== 'function' ) {
		// Characters without a handler can never be inline breaks; return
		// null rather than throwing on an undefined handler.
		return null;
	}
	// Call through breakMap so handlers keep the map as their `this`.
	return handler.call( this.breakMap, input, pos, syntaxFlags );
};
 130+
71131 /*****************************************************************************
72132 * LEGACY stuff
73133 *
@@ -173,6 +233,7 @@
174234 }
175235 };
176236
 237+
177238 if (typeof module == "object") {
178239 module.exports.PegTokenizer = PegTokenizer;
179240 }

Status & tagging log