r101360 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r101359‎ | r101360 | r101361 >
Date:14:56, 31 October 2011
Author:gwicke
Status:deferred
Tags:parserplayground 
Comment:
Experimental support for nested blocks with syntactical restrictions enforced
using a stack of regexps on matched sections. Initially used for tables
containing other blocks including other tables, but mechanism is general
enough to handle other dynamic restrictions as well. Better performance would
be possible using dynamic character classes in parsers, but this is not
supported by the JavaScript PEG parser generator.
Modified paths:
  • /trunk/extensions/ParserPlayground/modules/pegParser.pegjs.txt (modified) (history)
  • /trunk/extensions/ParserPlayground/tests/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/ParserPlayground/tests/parserTests.js
@@ -58,7 +58,12 @@
5959 console.log(testFileName);
6060 }
6161
62 -var testParser = PEG.buildParser(fs.readFileSync('parserTests.pegjs', 'utf8'));
 62+try {
 63+ var testParser = PEG.buildParser(fs.readFileSync('parserTests.pegjs', 'utf8'));
 64+} catch (e) {
 65+ console.log(e);
 66+}
 67+
6368 var testFile = fs.readFileSync(testFileName, 'utf8');
6469
6570
Index: trunk/extensions/ParserPlayground/modules/pegParser.pegjs.txt
@@ -1,7 +1,50 @@
22 /* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
33
44 {
 5+ var dp = function ( msg ) {
 6+ if ( false ) {
 7+ console.log(msg);
 8+ }
 9+ }
 10+
511
 12+ // Forbidden chars in anything content
 13+ var forbiddenThingRegexp = {
 14+ // key: regexp
 15+ // value: nesting counter, regexp removed if zero reached
 16+ kvs: {},
 17+ regexp: "",
 18+ rebuild: function () {
 19+ var keys = [];
 20+ for (var key in this.kvs) {
 21+ keys.push(key);
 22+ }
 23+ this.regexp = keys.join('|');
 24+ },
 25+ push: function (key) {
 26+ if (this.kvs[key] !== undefined) {
 27+ this.kvs[key]++;
 28+ } else {
 29+ this.kvs[key] = 1;
 30+ this.rebuild();
 31+ }
 32+ },
 33+ pop: function (key) {
 34+ if (this.kvs[key] !== undefined) {
 35+ if(this.kvs[key] == 1) {
 36+ delete this.kvs[key];
 37+ this.rebuild();
 38+ } else {
 39+ this.kvs[key]--;
 40+ }
 41+ } else {
 42+ throw "Trying to pop non-existing forbiddenThingRegexp";
 43+ }
 44+ }
 45+ }
 46+
 47+
 48+
649 /* Temporary debugging help. Is there anything similar in JS or a library? */
750 var print_r = function (arr, level) {
851
@@ -26,7 +69,7 @@
2770 dumped_text += level_padding + "[" + item + "] => ";
2871 dumped_text += print_r(value, level + 2);
2972 } else {
30 - dumped_text += level_padding + "[" + item + "] => " + value + "\n";
 73+ dumped_text += level_padding + "[" + item + "] => '" + value + "'\n";
3174 }
3275
3376 }
@@ -50,18 +93,73 @@
5194 else
5295 es.push(ei);
5396 });
54 - //console.log(print_r(es, 10));
 97+ //dp(print_r(es));
5598 return {
5699 type: 'page',
57100 content: es
58101 }
59102 }
60103
61 -anyblock = block / inline_element
 104+anyblock = block / inline
 105+anyblockline = block / inlineline
62106
63107 anything
64 - = text / [^\n]
 108+ = text
 109+ / s:.+ {
 110+ // reject match if forbiddenThingRegexp matches
 111+ var str = s.join('');
 112+ dp("anything: " +print_r(str));
 113+ if (forbiddenThingRegexp.regexp !== '') {
 114+ var m = str.search(forbiddenThingRegexp.regexp)
 115+ if ( m > 0 ) {
 116+ dp("anything reverse " + (str.length - m)
 117+ + ", matched: " + str.substr(0,m));
 118+ // reverse parser position
 119+ pos -= str.length - m;
 120+ return str.substr(0,m);
 121+ } else {
 122+ if (m == 0) {
 123+ pos -= str.length;
 124+ return null;
 125+ } else {
 126+ return str;
 127+ }
 128+ }
 129+ } else {
 130+ return str;
 131+ }
 132+ }
65133
 134+anyline
 135+ = text
 136+ / s:[^\n]+ {
 137+ // reject match if forbiddenThingRegexp matches
 138+ var str = s.join('');
 139+ dp("anyline: " + print_r(str) + ", pos:" + pos);
 140+ if (forbiddenThingRegexp.regexp !== '') {
 141+ var m = str.search(forbiddenThingRegexp.regexp)
 142+ if ( m > 0 ) {
 143+ // reverse parser position
 144+ pos -= str.length - m;
 145+ dp("anyline reverse " + (str.length - m)
 146+ + ", matched: " + str.substr(0,m));
 147+ return str.substr(0,m);
 148+ } else {
 149+ if (m == 0) {
 150+ pos -= str.length;
 151+ return null;
 152+ } else {
 153+ return str;
 154+ }
 155+ }
 156+ } else {
 157+ return str;
 158+ }
 159+ }
 160+
 161+
 162+
 163+
66164 // All chars that cannot start syntactic structures
67165 text = t:[A-Za-z0-9,._ -]+ { return t.join('') }
68166
@@ -132,14 +230,14 @@
133231 = '=' '='*
134232
135233 heading_text
136 - = h:( !heading_marker x:(anything) { return x } )* { return h.join(''); }
 234+ = h:( !heading_marker x:(anyline) { return x } )* { return h.join(''); }
137235
138236 br
139237 = newline { return {type: 'br'} }
140238
141239 para
142 - = c:inline newline { return {type: 'para', content: c } } /
143 - c:anything
 240+ = c:inlineline newline { return {type: 'para', content: c } } /
 241+ c:anyline
144242
145243 inline
146244 = c:(inline_element / anything)+ {
@@ -168,6 +266,33 @@
169267 return out;
170268 }
171269
 270+inlineline
 271+ = c:(inline_element / anyline)+ {
 272+ var out = [];
 273+ var text = '';
 274+ for (var i = 0; i < c.length; i++) {
 275+ if (typeof c[i] == 'string') {
 276+ text += c[i];
 277+ } else {
 278+ if (text.length) {
 279+ out.push({
 280+ type: 'text',
 281+ text: text
 282+ });
 283+ text = '';
 284+ }
 285+ out.push(c[i]);
 286+ }
 287+ }
 288+ if (text.length) {
 289+ out.push({
 290+ type: 'text',
 291+ text: text
 292+ });
 293+ }
 294+ return out;
 295+}
 296+
172297 inline_element
173298 = comment
174299 / xmlish_tag
@@ -277,7 +402,7 @@
278403 = h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }
279404
280405 link_text
281 - = h:( !"]]" x:(anything) { return x } )* { return h.join(''); }
 406+ = h:( !"]]" x:(anyline) { return x } )* { return h.join(''); }
282407
283408 bold
284409 = bold_marker c:bold_text bold_marker {
@@ -291,7 +416,7 @@
292417 = "'''"
293418
294419 bold_text
295 - = h:( !bold_marker x:(anything) { return x } )+ { return h.join(''); }
 420+ = h:( !bold_marker x:(anyline) { return x } )+ { return h.join(''); }
296421
297422 italic
298423 = italic_marker c:italic_text italic_marker {
@@ -305,7 +430,7 @@
306431 = "''"
307432
308433 italic_text
309 - = h:( !italic_marker x:(anything) { return x } )+ { return h.join(''); }
 434+ = h:( !italic_marker x:(anyline) { return x } )+ { return h.join(''); }
310435
311436 /* Will need to check anything xmlish against known/allowed HTML tags and
312437 * registered extensions, otherwise fail the match. Should ref be treated as a
@@ -354,7 +479,7 @@
355480 }
356481
357482 ref_content
358 - = !ref_end a:(inline_element / anything) {
 483+ = !ref_end a:(inline_element / anyline) {
359484 return a;
360485 }
361486
@@ -398,7 +523,7 @@
399524 }
400525
401526 references_content
402 - = !references_end a:(inline_element / anything) {
 527+ = !references_end a:(inline_element / anyline) {
403528 return a;
404529 }
405530
@@ -481,12 +606,12 @@
482607 }
483608 bstack = bnext;
484609 });
485 - //console.log("out: " + print_r(out, 5));
 610+ //dp("out: " + print_r(out, 5));
486611 return out;
487612 }
488613
489614 li = bullets:list_char+
490 - c:(inline / anything)
 615+ c:(inlineline / anyline)
491616 newline
492617 {
493618 var type;
@@ -507,7 +632,7 @@
508633 dtdd = bullets:list_char+
509634 c:(inline_element / [^:\n])+
510635 ":"
511 - d:(inline / anything)
 636+ d:(inline / anyline)
512637 newline
513638 {
514639 // reject rule if bullets do not end in semicolon
@@ -551,16 +676,23 @@
552677 // FIXME: actually parse and build structure
553678 res.attributes = { unparsed: tas }
554679 }
555 - console.log(print_r(res));
 680+ dp(print_r(res));
556681 return res;
557682 }
558683
559 -table_start = "{|" ta:table_attribs* space* newline? { return ta }
 684+table_start
 685+ = "{|"
 686+ & { forbiddenThingRegexp.push('(\n|^)\\||\\|[|\x7d+]'); return true; }
 687+ ta:table_attribs* space* newline? {
 688+ dp("table_start " + print_r(ta) + ", pos:" + pos);
 689+ return ta;
 690+ }
 691+ / "{|" { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); return null; }
560692
561 -table_attribs = anything
 693+table_attribs = anyline
562694
563695 table_caption
564 - = "|+" c:(inline_element / anything / [^\n])* newline? {
 696+ = "|+" c:inline* newline? {
565697 return {
566698 type: 'tableCaption',
567699 content: c
@@ -568,13 +700,14 @@
569701 }
570702
571703 table_body
572 - = firstrow:table_firstrow otherrows:table_row* {
573 - /* console.log('table first and otherrows: '
 704+ = & { dp("table_body enter"); return true; }
 705+ firstrow:table_firstrow otherrows:table_row* {
 706+ /* dp('table first and otherrows: '
574707 * + print_r([firstrow].concat(otherrows))); */
575708 return [firstrow].concat(otherrows);
576709 }
577710 / otherrows:table_row* {
578 - //console.log('table otherrows: ' + print_r(otherrows));
 711+ //dp('table otherrows: ' + print_r(otherrows));
579712 return otherrows;
580713 }
581714
@@ -600,43 +733,43 @@
601734 * starting with a pipe might be used. Checking this requires access to the
602735 * matched source string. */
603736 table_data
604 - = t:table {
605 - //console.log("recursive table result:" + print_r(t));
606 - return {
607 - type: 'tableCell',
608 - content: [t]
609 - }
610 - }
611 - / "||" td_attr? td:[^|]* newline? {
 737+ = & { dp("table_data enter, pos=" + pos); return true; }
 738+ "||" td_attr? td:anyblock* newline? {
 739+ //dp("table || result:" + print_r(td));
612740 return {
613741 type: 'tableCell',
614 - content: td.join('')
 742+ content: td
615743 };
616744 }
617 - / "|" ![}+-] td_attr? td:[^|\n]* newline? {
 745+ / & { dp("table_data : | enter pos=" + pos); return true; }
 746+ "|" ![}+-] td_attr? td:anyblock* newline? {
 747+ //dp("table | result:" + print_r(td));
618748 return {
619749 type: 'tableCell',
620 - content: td.join('')
 750+ content: td
621751 };
622752 }
623753
624 -td_attr = a:[^\n|]+ "|" !"|" { return a.join('') }
 754+td_attr = a:[^\n|]+ "|" !"|" {
 755+ dp("td_attr: " + a.join(''));
 756+ return a.join('');
 757+}
625758
626759 table_header
627 - = "!!" c:(block/ inline_element / text / ("!" !"!") / [^!\n])* newline? {
 760+ = "!!" c:(inline / ("!" !"!") / [^!\n])* newline? {
628761 return {
629762 type: 'tableHeader',
630763 content: c
631764 }
632765 }
633 - / "!" c:(block/ inline_element / text / '!' !'!' / [^!\n])* newline? {
 766+ / "!" c:(inline / text / '!' !'!' / [^!\n])* newline? {
634767 return {
635768 type: 'tableHeader',
636769 content: c
637770 }
638771 }
639772
640 -table_end = "|}" newline?
 773+table_end = "|}" newline? { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); }
641774
642775
643776 /* Wikidom TODO:

Status & tagging log