r103468 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r103467‎ | r103468 | r103469 >
Date:15:26, 17 November 2011
Author:gwicke
Status:deferred
Tags:
Comment:
Convert PEG parser to tokenizer for back-end HTML parser. Now emits a list of
tokens, which for now is still completely built before parsing can proceed.
For each top-level block, the source start/end positions are added as
attributes to the top-most tokens. No tracking of wiki vs. html syntax yet.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/lib.pegjs.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -109,7 +109,43 @@
110110 function nodeToHtml(node) {
111111 return $('<div>').append(node).html();
112112 }
 113+ /* Temporary debugging help. Is there anything similar in JS or a library? */
 114+ var print_r = function (arr, level) {
113115
 116+ var dumped_text = "";
 117+ if (!level) level = 0;
 118+
 119+ //The padding given at the beginning of the line.
 120+ var level_padding = "";
 121+ var bracket_level_padding = "";
 122+
 123+ for (var j = 0; j < level + 1; j++) level_padding += " ";
 124+ for (var b = 0; b < level; b++) bracket_level_padding += " ";
 125+
 126+ if (typeof(arr) == 'object') { //Array/Hashes/Objects
 127+ dumped_text += "Array\n";
 128+ dumped_text += bracket_level_padding + "(\n";
 129+ for (var item in arr) {
 130+
 131+ var value = arr[item];
 132+
 133+ if (typeof(value) == 'object') { //If it is an array,
 134+ dumped_text += level_padding + "[" + item + "] => ";
 135+ dumped_text += print_r(value, level + 2);
 136+ } else {
 137+ dumped_text += level_padding + "[" + item + "] => '" + value + "'\n";
 138+ }
 139+
 140+ }
 141+ dumped_text += bracket_level_padding + ")\n\n";
 142+ } else { //Strings/Chars/Numbers etc.
 143+ dumped_text = "=>" + arr + "<=(" + typeof(arr) + ")";
 144+ }
 145+
 146+ return dumped_text;
 147+
 148+ };
 149+
114150 function processTest(item) {
115151 if (!('title' in item)) {
116152 console.log(item);
@@ -137,16 +173,16 @@
138174 'references': MWReferencesTagHook
139175 }
140176 });
141 - var res = es.HtmlSerializer.stringify(tree,environment);
142 - if (err) {
143 - console.log('RENDER FAIL', err);
144 - } else {
145 - console.log('EXPECTED:');
146 - console.log(item.result + "\n");
 177+ //var res = es.HtmlSerializer.stringify(tree,environment);
 178+ if (err) {
 179+ console.log('RENDER FAIL', err);
 180+ } else {
 181+ console.log('EXPECTED:');
 182+ console.log(item.result + "\n");
147183
148 - console.log('RENDERED:');
149 - console.log(res + "\n");
150 - }
 184+ console.log('RENDERED:');
 185+ console.log(print_r(tree));
 186+ }
151187 }
152188 });
153189 }
Index: trunk/extensions/VisualEditor/modules/parser/lib.pegjs.js
@@ -3823,6 +3823,7 @@
38243824 }
38253825
38263826 var source = this.emitter(ast);
 3827+ //console.log(source);
38273828 var result = eval(source);
38283829 result._source = source;
38293830
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
@@ -1,11 +1,10 @@
22 /* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
3 -
43 {
54 var dp = function ( msg ) {
65 if ( false ) {
76 console.log(msg);
87 }
9 - }
 8+ };
109
1110 /*
1211 * Flags for specific parse environments (inside tables, links etc). Flags
@@ -23,10 +22,10 @@
2423 syntaxFlags[flag] = 1;
2524 }
2625 return true;
27 - }
 26+ };
2827 var clearFlag = function(flag) {
2928 syntaxFlags[flag]--;
30 - }
 29+ };
3130
3231
3332
@@ -65,7 +64,7 @@
6665
6766 return dumped_text;
6867
69 - }
 68+ };
7069
7170 // Convert list prefixes to a list of WikiDom list styles
7271 var bulletsToTypes = function (bullets) {
@@ -86,26 +85,35 @@
8786 return bTypes;
8887 };
8988
90 - var extractInline = function ( node ) {
91 - return { text: extractText(node)
92 - }
 89+ /*var extractInline = function ( node ) {
 90+ return { text: extractText(node, 0) };
9391 };
9492
9593
96 - var extractText = function ( node ) {
 94+ // return [text [annotations]]
 95+ var extractText = function ( node, offset ) {
9796 dp("extract: " + print_r(node));
9897 if (typeof node === 'string') {
99 - return node;
 98+ return [node, []];
10099 } else if ($.isArray(node)) {
101 - var texts = [];
 100+ var texts = [],
 101+ annotations = [];
102102 for (var i = 0, length = node.length; i < length; i++) {
103 - texts.push(extractText(node[i]));
 103+ var res = extractText(node[i], offset);
 104+ texts.push(res[0]);
 105+ annotations.concat(res[1]);
 106+ offset += res[0].length;
104107 }
105 - return texts.join('');
 108+ return [texts.join(''), annotations];
106109 } else if ( 'text' in node ) {
107 - return extractText(node.text);
 110+ var res = extractText(node, offset);
 111+ if ('annotations' in node) {
 112+ return [res[0], node.annotations.concat(res[1])];
 113+ } else {
 114+ return res;
 115+ }
108116 } else if ( 'content' in node ) {
109 - return extractText(node.content);
 117+ return extractText(node.content, offset);
110118 } else if ( 'children' in node ) {
111119 var texts = [];
112120 for (var i = 0, length = node.children.length; i < length; i++) {
@@ -113,27 +121,35 @@
114122 }
115123 return texts.join('');
116124 } else {
117 - console.log("extract failed!" + print_r(node));
118125 throw ("extract failed: " + print_r(node));
119126 }
120127 };
 128+ */
 129+
 130+ // Start position of top-level block
 131+ var blockStart = 0;
 132+
 133+ var unquote = function (quotec, text) {
 134+ return text.replace('\\' + quotec, quotec);
 135+ };
 136+
 137+ var flatten = function ( e ) {
 138+ var es = [];
 139+ // flatten sub-arrays
 140+ for(var i = 0, length = e.length; i < length; i++) {
 141+ var ei = e[i];
 142+ if ($.isArray(ei))
 143+ es = es.concat(flatten(ei));
 144+ else
 145+ es.push(ei);
 146+ };
 147+ return es;
 148+ };
121149 }
122150
123151 start
124 - = e:block* newline* {
125 - var es = [];
126 - // flatten sub-arrays, as a list block can contain multiple lists
127 - for(var i = 0, length = e.length; i < length; i++) {
128 - var ei = e[i];
129 - if ($.isArray(ei))
130 - es = es.concat(ei);
131 - else
132 - es.push(ei);
133 - };
134 - return {
135 - type: 'page',
136 - children: es
137 - }
 152+ = e:toplevelblock* newline* {
 153+ return flatten(e);
138154 }
139155
140156 anyblock = block / inline
@@ -157,12 +173,26 @@
158174 newline
159175 = '\n' / '\r\n'
160176
 177+toplevelblock
 178+ = & { blockStart = pos; return true; } b:block {
 179+ b = flatten(b);
 180+ var bs = b[0];
 181+ dp('toplevelblock:' + print_r(b) + bs);
 182+ if (bs.attribs === undefined) {
 183+ bs.attribs = [];
 184+ }
 185+ bs.attribs.push(['startPos', blockStart]);
 186+ bs.attribs.push(['endPos', pos]);
 187+ return b;
 188+ }
 189+
161190 block
162 - = (sol space* &newline)? block_lines
 191+ = (sol space* &newline)? bl:block_lines { return bl; }
163192 / para
164193 / comment
165 - / sol
 194+ / (s:sol { return [{type: 'TEXT', value: s}]; })
166195
 196+// Block structures with start-of-line wiki syntax
167197 block_lines
168198 = h
169199 / table
@@ -176,14 +206,11 @@
177207 h1 = sol '='
178208 (
179209 & { setFlag('h'); return setFlag('h1') }
180 - c:inlineline '=' &newline {
 210+ c:inlineline '=' comment? &newline {
181211 clearFlag('h');
182212 clearFlag('h1');
183 - return {
184 - type: 'heading',
185 - attributes: {level: 1},
186 - content: extractInline(c)
187 - }
 213+ return [{type: 'TAG', name: 'h1'}]
 214+ .concat(c, [{type: 'ENDTAG', name: 'h1'}]);
188215 }
189216 / { clearFlag('h'); clearFlag('h1'); return null }
190217 )
@@ -191,14 +218,11 @@
192219 h2 = sol '=='
193220 (
194221 & { setFlag('h'); return setFlag('h2') }
195 - c:inlineline '==' &newline {
 222+ c:inlineline '==' comment? &newline {
196223 clearFlag('h');
197224 clearFlag('h2');
198 - return {
199 - type: 'heading',
200 - attributes: {level: 2},
201 - content: extractInline(c)
202 - }
 225+ return [{type: 'TAG', name: 'h2'}]
 226+ .concat(c, [{type: 'ENDTAG', name: 'h2'}]);
203227 }
204228 / { clearFlag('h'); clearFlag('h2'); return null }
205229 )
@@ -206,57 +230,45 @@
207231 h3 = sol '==='
208232 (
209233 & { setFlag('h'); return setFlag('h3') }
210 - c:inlineline '===' &newline {
 234+ c:inlineline '===' comment? &newline {
211235 clearFlag('h');
212236 clearFlag('h3');
213 - return {
214 - type: 'heading',
215 - attributes: {level: 3},
216 - content: extractInline(c)
217 - }
218 - }
 237+ return [{type: 'TAG', name: 'h3'}]
 238+ .concat(c, [{type: 'ENDTAG', name: 'h3'}]);
 239+ }
219240 / { clearFlag('h'); clearFlag('h3'); return null }
220241 )
221242
222243 h4 = sol '===='
223244 (
224245 & { setFlag('h'); return setFlag('h4') }
225 - c:inlineline '====' &newline {
 246+ c:inlineline '====' comment? &newline {
226247 clearFlag('h');
227248 clearFlag('h4');
228 - return {
229 - type: 'heading',
230 - attributes: {level: 4},
231 - content: extractInline(c)
232 - }
233 - }
 249+ return [{type: 'TAG', name: 'h4'}]
 250+ .concat(c, [{type: 'ENDTAG', name: 'h4'}]);
 251+ }
234252 / { clearFlag('h'); clearFlag('h4'); return null }
235253 )
236254
237255 h5 = sol '====='
238256 (& { setFlag('h'); return setFlag('h5') }
239 - c:inlineline '=====' &newline {
 257+ c:inlineline '=====' comment? &newline {
240258 clearFlag('h');
241259 clearFlag('h5');
242 - return {
243 - type: 'heading',
244 - attributes: {level: 5},
245 - content: extractInline(c)
246 - }
 260+ return [{type: 'TAG', name: 'h5'}]
 261+ .concat(c, [{type: 'ENDTAG', name: 'h5'}]);
247262 }
248263 / { clearFlag('h'); clearFlag('h5'); return null }
249264 )
250265
251266 h6 = sol '======'
252267 (& { setFlag('h'); return setFlag('h6') }
253 - c:inlineline '======' &newline {
 268+ c:inlineline '======' comment? &newline {
254269 clearFlag('h');
255270 clearFlag('h6');
256 - return {
257 - type: 'heading',
258 - attributes: {level: 6},
259 - content: extractInline(c)
260 - }
 271+ return [{type: 'TAG', name: 'h6'}]
 272+ .concat(c, [{type: 'ENDTAG', name: 'h6'}]);
261273 }
262274 / { clearFlag('h'); clearFlag('h6'); return null }
263275 )
@@ -270,24 +282,25 @@
271283
272284 // TODO: convert inline content to annotations!
273285 para
274 - = (sol br)? para_lines
 286+ = (sol br)? pl:para_lines { return pl; }
275287
276288 para_lines
277289 = s:sol c:inlineline cs:(!block_lines para_lines)* {
278 - return {
279 - type: 'paragraph',
280 - content: extractInline([s].concat([c]).concat(cs))
281 - }
 290+ var res = [{type: 'TAG', name: 'p'}];
 291+ if (s !== '') {
 292+ res.push(s)
 293+ }
 294+ //console.log('paralines' + print_r(res.concat(c, cs, [{type: 'ENDTAG', name: 'p'}])));
 295+ return res.concat(c, cs, [{type: 'ENDTAG', name: 'p'}]);
282296 }
283297
284 -br = space* &newline { return {type: 'br'} }
 298+br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
285299
286300 pre_indent
287301 = l:pre_indent_line+ {
288 - return {
289 - type: 'pre',
290 - content: extractInline(l)
291 - }
 302+ return [{type: 'TAG', name: 'pre'}]
 303+ .concat( l
 304+ , [{type: 'ENDTAG', name: 'pre'}]);
292305 }
293306 pre_indent_line = sol space l:inlineline { return l }
294307
@@ -319,20 +332,14 @@
320333 text += c[i];
321334 } else {
322335 if (text.length) {
323 - out.push({
324 - type: 'text',
325 - text: text
326 - });
 336+ out.push({ type: "TEXT", value: text });
327337 text = '';
328338 }
329 - out.push(c[i]);
 339+ out.concat(c[i]);
330340 }
331341 }
332342 if (text.length) {
333 - out.push({
334 - type: 'text',
335 - text: text
336 - });
 343+ out.push({ type: 'TEXT', value: text });
337344 }
338345 return out;
339346 }
@@ -347,21 +354,16 @@
348355 text += c[i];
349356 } else {
350357 if (text.length) {
351 - out.push({
352 - type: 'text',
353 - text: text
354 - });
 358+ out.push({type: 'TEXT', value: text});
355359 text = '';
356360 }
357361 out.push(c[i]);
358362 }
359363 }
360364 if (text.length) {
361 - out.push({
362 - text: text,
363 - //annotations: []
364 - });
 365+ out.push({type: 'TEXT', value: text});
365366 }
 367+ //dp('inlineline out:', print_r(out));
366368 return out;
367369 }
368370
@@ -380,10 +382,7 @@
381383 comment
382384 = '<!--' c:comment_chars* '-->'
383385 (space* newline space* comment)* {
384 - return {
385 - type: 'comment',
386 - text: c.join('')
387 - }
 386+ return { type: 'COMMENT', value: c.join('') };
388387 }
389388
390389 comment_chars
@@ -392,11 +391,11 @@
393392
394393 extlink
395394 = "[" target:url " " text:extlink_text "]" {
396 - return {
397 - type: 'extlink',
398 - target: target,
399 - text: text
400 - }
 395+ return [ { type: 'TAG',
 396+ name: 'a',
 397+ attribs: [['href', target]] }
 398+ , {type: 'TEXT', value: text}
 399+ , {type: 'ENDTAG', name: 'a'}];
401400 }
402401
403402 // = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }
@@ -409,36 +408,29 @@
410409
411410 template
412411 = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
413 - var obj = {
414 - type: 'template',
415 - target: target
416 - };
 412+ var obj = { type: 'SELFCLOSINGTAG', name: 'template', attribs: [['target', target]] }
417413 if (params && params.length) {
418 - obj.params = params;
 414+ obj.attribs.push(params);
419415 }
420416 return obj;
421417 }
422418
423419 template_param
424420 = name:template_param_name "=" c:template_param_text {
425 - return {
426 - name: name,
427 - content: c
428 - };
 421+ return [name, c];
429422 } / c:template_param_text {
430 - return {
431 - content: c
432 - };
 423+ return [null, c];
433424 }
434425
435426 tplarg
436427 = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
437 - var obj = {
438 - type: 'tplarg',
439 - name: name
 428+ var obj = {
 429+ type: 'SELFCLOSINGTAG',
 430+ name: 'templatearg',
 431+ attribs: [['argname', name]]
440432 };
441433 if (params && params.length) {
442 - obj.params = params;
 434+ obj.attribs.push(params);
443435 }
444436 return obj;
445437 }
@@ -463,13 +455,14 @@
464456 link
465457 = "[[" target:link_target text:("|" link_text)* "]]" {
466458 var obj = {
467 - type: 'link',
468 - target: target
 459+ type: 'TAG',
 460+ name: 'a',
 461+ attribs: [['data-type', 'internal']]
469462 };
470463 if (text && text.length) {
471 - obj.text = text[0][1]; // ehhhh
 464+ obj.attribs.push(['href', text[0][1]]); // ehhhh
472465 }
473 - return obj;
 466+ return [obj, {type: 'ENDTAG', name: 'a'}];
474467 }
475468
476469 link_target
@@ -492,10 +485,8 @@
493486 c:inlineline
494487 bold_marker {
495488 clearFlag('bold');
496 - return {
497 - type: 'b',
498 - content: {text: c}
499 - }
 489+ return [{ type: 'TAG', name: 'b' }]
 490+ .concat(c, [{type: 'ENDTAG', name: 'b'}]);
500491 }
501492 / bold_marker { clearFlag('bold'); return null }
502493
@@ -510,11 +501,9 @@
511502 italic_marker {
512503 clearFlag('italic');
513504 dp('ileave:' + pos);
514 - return {
515 - type: 'i',
516 - content: {text: c}
517 - }
518 - }
 505+ return [{ type: 'TAG', name: 'i' }]
 506+ .concat(c, [{ type: 'ENDTAG', name: 'i'}]);
 507+ }
519508 / italic_marker { clearFlag('italic'); return null }
520509
521510 italic_marker
@@ -530,25 +519,24 @@
531520 /* Can we do backreferences to genericize this? */
532521 ref_full
533522 = start:ref_start ">" content:ref_content* close:ref_end {
534 - return {
535 - type: 'ext',
536 - name: 'ref',
537 - params: start.params,
538 - ws: start.ws,
539 - content: content,
540 - close: close
541 - }
 523+ return [
 524+ { type: 'TAG',
 525+ name: 'ext',
 526+ attribs: [['data-extname', 'ref']]
 527+ .concat(start.params, [['data-startws', start.ws]])},
 528+ content,
 529+ {type: 'ENDTAG', name: 'ref'}
 530+ ];
542531 }
543532
544533 ref_empty
545534 = start:ref_start close:(space* "/>") {
546 - return {
547 - type: 'ext',
548 - name: 'ref',
549 - ws: start.ws,
550 - params: start.params,
551 - close: close
552 - }
 535+ return [{ type: 'SELFCLOSINGTAG',
 536+ name: 'ext',
 537+ attribs: [['data-extname', 'ref']]
 538+ .concat(start.params
 539+ ,[['data-startws', start.ws]])
 540+ }];
553541 }
554542
555543 ref_start
@@ -565,7 +553,7 @@
566554 }
567555
568556 ref_content
569 - = !ref_end a:(inline) {
 557+ = !ref_end a:inline { // XXX: ineffective syntactic stop
570558 return a;
571559 }
572560
@@ -574,25 +562,27 @@
575563
576564 references_full
577565 = start:references_start ">" content:references_content* close:references_end {
578 - return {
579 - type: 'ext',
580 - name: 'references',
581 - params: start.params,
582 - ws: start.ws,
583 - content: content,
584 - close: close
585 - }
 566+ return [
 567+ { type: 'TAG',
 568+ name: 'ext',
 569+ attribs: [['data-extname', 'references']]
 570+ .concat(start.params
 571+ ,[['data-startws', start.ws]])
 572+ },
 573+ content,
 574+ { type: 'ENDTAG', name: 'ext' }
 575+ ];
586576 }
587577
588578 references_empty
589579 = start:references_start close:(space* "/>") {
590 - return {
591 - type: 'ext',
592 - name: 'references',
593 - ws: start.ws,
594 - params: start.params,
595 - close: close
596 - }
 580+ return
 581+ [{ type: 'SELFCLOSINGTAG',
 582+ name: 'ext',
 583+ attribs: [['data-extname', 'references']]
 584+ .concat(start.params
 585+ ,[['data-startws', start.ws]])
 586+ }];
597587 }
598588
599589 references_start
@@ -609,14 +599,14 @@
610600 }
611601
612602 references_content
613 - = !references_end a:(inline) {
 603+ = !references_end a:inline {
614604 return a;
615605 }
616606
617607
618608 ext_param
619609 = space* name:ext_param_name "=" val:ext_param_val {
620 - val.name = name;
 610+ val[0] = name;
621611 return val;
622612 }
623613
@@ -626,9 +616,9 @@
627617 }
628618
629619 ext_param_val
630 - = t:[0-9A-Za-z]+ { return {text: t.join('') } }
631 - / "'" t:[^'>]+ "'" { return { quote: "'", text: t.join('') } }
632 - / '"' t:[^">]+ '"' { return { quote: '"', text: t.join('') } }
 620+ = t:[0-9A-Za-z]+ { return [null, t.join('')]; }
 621+ / "'" t:[^'>]+ "'" { return [null, unquote("'", t.join(''))]; }
 622+ / '"' t:[^">]+ '"' { return [null, unquote('"', t.join(''))]; }
633623
634624 lists = es:(dtdd / li)+
635625 {
@@ -643,10 +633,10 @@
644634 flatEs.push(ei);
645635 }
646636 }
647 - return {
648 - type: 'list',
649 - children: flatEs
650 - }
 637+ return [ { type: 'TAG',
 638+ name: 'ul'} ] // XXX!!
 639+ .concat(flatEs
 640+ ,[{ type: 'ENDTAG', name: 'ul' }]);
651641 }
652642
653643 li = sol
@@ -654,38 +644,33 @@
655645 c:inlineline
656646 &newline
657647 {
658 - return {
659 - type: 'listItem',
660 - attributes: {
661 - styles: bulletsToTypes(bullets)
662 - },
663 - content: extractInline(c)
664 - };
 648+ return [ { type: 'TAG',
 649+ name: 'li',
 650+ attribs: [['data-styles', bullets]] }
 651+ , c
 652+ , { type: 'ENDTAG', name: 'li' }
 653+ ];
665654 }
666655
667656 dtdd = sol
668657 bullets:list_char+
669 - c:(inline_element / [^:\n])+
 658+ c:(inline_element / (n:[^:\n] { return {type: 'TEXT', value: n}; }))+
670659 ":"
671 - d:(inline_element / [^\n])+
 660+ d:(inline_element / (n:[^\n] { return {type: 'TEXT', value: n}; }))+
672661 &newline
673662 {
674663 // reject rule if bullets do not end in semicolon
675664 if (bullets[bullets.length - 1] != ';') {
676665 return null;
677666 } else {
678 - return [
679 - {
680 - type: 'listItem',
681 - attributes: {styles: bulletsToTypes(bullets)},
682 - content: extractInline(c)
683 - }, {
684 - type: 'listItem',
685 - attributes: {styles: bulletsToTypes(
686 - bullets.slice(0, bullets.length - 1) + ':')},
687 - content: extractInline(d)
688 - }
689 - ]
 667+ return [ { type: 'TAG', name: 'dl', attribs: [['data-styles', bullets]] }
 668+ , { type: 'TAG', name: 'dt' } ]
 669+ .concat( c
 670+ , [ {type: 'ENDTAG', name: 'dt'}
 671+ , {type: 'TAG', name: 'dd'} ]
 672+ , d
 673+ , [ {type: 'ENDTAG', name: 'dd'}
 674+ , {type: 'ENDTAG', name: 'dl'} ]);
690675 }
691676 }
692677
@@ -697,19 +682,23 @@
698683
699684 table
700685 = tas:table_start c:table_caption? b:table_body? table_end {
701 - var res = {type: 'table'}
 686+ var res = {type: 'TAG', name: 'table'}
702687 var body = b !== '' ? b : [];
703 - if (c !== '') {
704 - res.children = [c].concat(body);
705 - } else {
706 - res.children = body;
707 - }
708688 if (tas.length > 0) {
709689 // FIXME: actually parse and build structure
710 - res.attributes = { unparsed: tas }
 690+ res.attribs = [['data-unparsed', tas.join('')]];
711691 }
 692+
 693+ if (c !== '') {
 694+ var caption = [{type: 'TAG', name: 'caption'}]
 695+ .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
 696+ } else {
 697+ var caption = [];
712698 //dp(print_r(res));
713 - return res;
 699+
 700+ return [res].concat(caption, body,
 701+ [{type: 'ENDTAG', name: 'table'}]);
 702+ }
714703 }
715704
716705 table_start
@@ -729,10 +718,7 @@
730719 table_caption
731720 = newline
732721 "|+" c:inline* {
733 - return {
734 - type: 'tableCaption',
735 - content: c[0]
736 - }
 722+ return c;
737723 }
738724
739725 table_body
@@ -749,20 +735,16 @@
750736
751737 table_firstrow
752738 = td:table_data+ {
753 - return {
754 - type: 'tableRow',
755 - children: td
756 - };
 739+ return [{ type: 'TAG', name: 'tr' }]
 740+ .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
757741 }
758742
759743 table_row
760744 = & { dp("table row enter"); return true; }
761745 newline
762746 "|-" thtd_attribs? space* td:(table_data / table_header)* {
763 - return {
764 - type: 'tableRow',
765 - children: td
766 - };
 747+ return [{type: 'TAG', name: 'tr'}]
 748+ .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
767749 }
768750
769751 table_data
@@ -772,22 +754,16 @@
773755 a:thtd_attribs?
774756 td:(!inline_breaks anyblock)* {
775757 dp("table data result: " + print_r(td) + ", attribts: " + print_r(a));
776 - return {
777 - type: 'tableCell',
778 - attributes: { unparsed: a },
779 - children: td
780 - };
 758+ return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
 759+ .concat(td, [{type: 'ENDTAG', name: 'td'}]);
781760 }
782761
783762 table_header
784763 = ("!!" / newline "!")
785764 a:thtd_attribs?
786765 c:inline {
787 - return {
788 - type: 'tableHeading',
789 - attributes: { unparsed: a },
790 - children: c
791 - }
 766+ return [{type: 'TAG', name: 'th', attribs: [['data-unparsed', a]]}]
 767+ .concat(c, [{type: 'ENDTAG', name: 'th'}]);
792768 }
793769
794770 thtd_attribs
@@ -804,7 +780,34 @@
805781 * split off text into content nodes
806782 * convert inlines into annotations
807783 * change contents into children
 784+ *
 785+ * { text: text,
 786+ * annotations: [(normal annotations)],
 787+ * maybeannotations: [
 788+ * { type: 'something',
 789+ * side: MA_START,
 790+ * tag: { start: x, length: y }
 791+ * }
 792+ * ]
 793+ * }
 794+ * offsets in annotations: presume maybeannotations are actually text
 795+ * -> need to transform annotations if match found
 796+ * -> format annotations, comments can run to the end (re-opened after
 797+ * block-level tags); only closed on table cells, object,?
 798+ * -> other annotations (images, templates etc) are limited by block-level
 799+ * elements, tightly bound
 800+ *
 801+ * Block-level elements
 802+ * --------------------
 803+ * - Need some early clean-up to provide structure and offsets
 804+ * - Establish scope limits for some inlines
 805+ * - Line-based balanced by construction
 806+ * - HTML tags need balancing/ matching / implicit close
 807+ * - content in illegal places (e.g. between table and td tags) needs foster
 808+ * parenting
 809+ * - grammar will match outermost pair if unmatched pairs are recognized as
 810+ * tokens (or as text)
 811+ * - post-processing needed, but has to be limited by scope
808812 */
809 -
810813 /* Tabs do not mix well with the hybrid production syntax */
811814 /* vim: et:ts=4:sw=4:cindent */

Status & tagging log