r102285 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r102284‎ | r102285 | r102286 >
Date:14:39, 7 November 2011
Author:gwicke
Status:deferred
Tags:
Comment:
Grammar improvements

* Replaced the regexp stack with a set of break rules for inline content within
specialized parse contexts; switched more rules to the generic
inlineline/inline/block rules.
* Don't consume the end-of-line, so that start-of-line matching works properly.
* Added some support for pre (indented preformatted blocks).
* Still no conversion of inline elements to annotations.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/es/serializers/es.HtmlSerializer.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/modules/es/serializers/es.HtmlSerializer.js
@@ -69,9 +69,15 @@
7070 };
7171
7272 es.HtmlSerializer.prototype.comment = function( node ) {
73 - return '<!--' + node.text + '-->';
 73+ return '<!--(' + node.text + ')-->';
7474 };
7575
 76+es.HtmlSerializer.prototype.pre = function( node ) {
 77+ return es.Html.makeTag(
 78+ 'pre', {}, this.document( node, true )
 79+ );
 80+};
 81+
7682 es.HtmlSerializer.prototype.horizontalRule = function( node ) {
7783 return es.Html.makeTag( 'hr', {}, false );
7884 };
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
@@ -6,45 +6,30 @@
77 console.log(msg);
88 }
99 }
10 -
1110
12 - // Forbidden chars in anything content
13 - var forbiddenThingRegexp = {
14 - // key: regexp
15 - // value: nesting counter, regexp removed if zero reached
16 - kvs: {},
17 - regexp: "",
18 - rebuild: function () {
19 - var keys = [];
20 - for (var key in this.kvs) {
21 - keys.push(key);
22 - }
23 - this.regexp = keys.join('|');
24 - },
25 - push: function (key) {
26 - if (this.kvs[key] !== undefined) {
27 - this.kvs[key]++;
28 - } else {
29 - this.kvs[key] = 1;
30 - this.rebuild();
31 - }
32 - },
33 - pop: function (key) {
34 - if (this.kvs[key] !== undefined) {
35 - if(this.kvs[key] == 1) {
36 - delete this.kvs[key];
37 - this.rebuild();
38 - } else {
39 - this.kvs[key]--;
40 - }
41 - } else {
42 - throw "Trying to pop non-existing forbiddenThingRegexp";
43 - }
 11+ /*
 12+ * Flags for specific parse environments (inside tables, links etc). Flags
 13+ * trigger syntactic stops in the inline_breaks production, which
 14+ * terminates inline and attribute matches. Flags merely reduce the number
 15+ * of productions needed: The grammar is still context-free as the
 16+ * productions can just be unrolled for all combinations of environments
 17+ * at the cost of a much larger grammar.
 18+ */
 19+ var syntaxFlags = {};
 20+ var setFlag = function(flag) {
 21+ if (syntaxFlags[flag] !== undefined) {
 22+ syntaxFlags[flag]++;
 23+ } else {
 24+ syntaxFlags[flag] = 1;
4425 }
 26+ return true;
4527 }
46 -
 28+ var clearFlag = function(flag) {
 29+ syntaxFlags[flag]--;
 30+ }
4731
4832
 33+
4934 /* Temporary debugging help. Is there anything similar in JS or a library? */
5035 var print_r = function (arr, level) {
5136
@@ -106,13 +91,13 @@
10792 = e:block* {
10893 var es = [];
10994 // flatten sub-arrays, as a list block can contain multiple lists
110 - $.each(e, function(i, ei) {
111 - if (ei.constructor == Array)
 95+ for(var i = 0, length = e.length; i < length; i++) {
 96+ var ei = e[i];
 97+ if ($.isArray(ei))
11298 es = es.concat(ei);
11399 else
114100 es.push(ei);
115 - });
116 - dp(es);
 101+ };
117102 return {
118103 type: 'page',
119104 children: es
@@ -122,126 +107,88 @@
123108 anyblock = block / inline
124109 anyblockline = block / inlineline
125110
126 -anything
127 - = text
128 - / s:.+ {
129 - // reject match if forbiddenThingRegexp matches
130 - var str = s.join('');
131 - dp("anything: " +print_r(str));
132 - if (forbiddenThingRegexp.regexp !== '') {
133 - var m = str.search(forbiddenThingRegexp.regexp)
134 - if ( m > 0 ) {
135 - dp("anything reverse " + (str.length - m)
136 - + ", matched: " + str.substr(0,m));
137 - // reverse parser position
138 - pos -= str.length - m;
139 - return {text: str.substr(0,m)};
140 - } else {
141 - if (m == 0) {
142 - pos -= str.length;
143 - return null;
144 - } else {
145 - return {text: str};
146 - }
147 - }
148 - } else {
149 - return {text: str};
150 - }
151 - }
152111
153 -anyline
154 - = text
155 - / s:[^\n]+ {
156 - // reject match if forbiddenThingRegexp matches
157 - var str = s.join('');
158 - dp("anyline: " + print_r(str) + ", pos:" + pos);
159 - if (forbiddenThingRegexp.regexp !== '') {
160 - var m = str.search(forbiddenThingRegexp.regexp)
161 - if ( m > 0 ) {
162 - // reverse parser position
163 - pos -= str.length - m;
164 - dp("anyline reverse " + (str.length - m)
165 - + ", matched: " + str.substr(0,m));
166 - return {text: str.substr(0,m)};
167 - } else {
168 - if (m == 0) {
169 - pos -= str.length;
170 - return null;
171 - } else {
172 - return {text : str};
173 - }
174 - }
175 - } else {
176 - return {text: str};
177 - }
178 - }
179 -
180 -
181 -
182 -
183112 // All chars that cannot start syntactic structures
184113 text = t:[A-Za-z0-9,._ -]+ { return t.join('') }
185114
186115 space
187 - = s:[ ]+ { return s.join(''); }
 116+ = s:[ \t]+ { return s.join(''); }
188117
 118+
 119+// Start of line
 120+sol = (newline / & { return pos === 0; } { return true; }) comment?
 121+
189122 newline
190 - = [\n]
 123+ = '\n' / '\r\n'
191124
192125 block
193 - = br
194 - / h
 126+ = block_lines
 127+ / para
 128+ / br
 129+ / newline &newline
 130+ / comment
 131+
 132+block_lines
 133+ = h
195134 / table
196135 / lists
197 - / para
 136+ / pre_indent
198137
 138+
 139+/* Headings */
199140 h = h1 / h2 / h3 / h4 / h5 / h6
200141
201 -h1 = '=' c:heading_text '=' newline {
 142+h1 = sol
 143+ '=' c:heading_text '=' &newline {
202144 return {
203145 type: 'heading',
204 - level: 1,
 146+ attributes: {level: 1},
205147 text: c
206148 }
207149 }
208150
209 -h2 = '==' c:heading_text '==' newline {
 151+h2 = sol
 152+ '==' c:heading_text '==' &newline {
210153 return {
211154 type: 'heading',
212 - level: 2,
213 - text: c
 155+ attributes: {level: 2},
 156+ content: c
214157 }
215158 }
216159
217 -h3 = '===' c:heading_text '===' newline {
 160+h3 = sol
 161+ '===' c:heading_text '===' &newline {
218162 return {
219163 type: 'heading',
220 - level: 3,
221 - text: c
 164+ attributes: {level: 3},
 165+ content: c
222166 }
223167 }
224168
225 -h4 = '====' c:heading_text '====' newline {
 169+h4 = sol
 170+ '====' c:heading_text '====' &newline {
226171 return {
227172 type: 'heading',
228 - level: 4,
229 - text: c
 173+ attributes: {level: 4},
 174+ content: c
230175 }
231176 }
232177
233 -h5 = '=====' c:heading_text '=====' newline {
 178+h5 = sol
 179+ '=====' c:heading_text '=====' &newline {
234180 return {
235181 type: 'heading',
236 - level: 5,
237 - text: c
 182+ attributes: {level: 5},
 183+ content: c
238184 }
239185 }
240186
241 -h6 = '======' c:heading_text '======' newline {
 187+h6 = sol
 188+ '======' c:heading_text '======' &newline {
242189 return {
243190 type: 'heading',
244 - level: 6,
245 - text: c
 191+ attributes: {level: 6},
 192+ content: c
246193 }
247194 }
248195
@@ -249,18 +196,36 @@
250197 = '=' '='*
251198
252199 heading_text
253 - = h:( !heading_marker x:(anyline) { return x } )* { return h.join(''); }
 200+ = h:( !heading_marker x:(inlineline) { return x } )* { return h.join(''); }
254201
255202 br
256 - = newline { return {type: 'br'} }
 203+ = newline !newline { return {type: 'br'} }
257204
258205 // TODO: convert inline content to annotations!
259206 para
260 - = c:inlineline newline { return {type: 'paragraph', content: c[0] } } /
261 - c:anyline
 207+ = sol c:inlineline cs:(!block_lines para)* { return {type: 'paragraph', content: c[0] } }
262208
 209+pre_indent
 210+ = l:pre_indent_line+ {
 211+ return {
 212+ type: 'pre',
 213+ children: l
 214+ }
 215+ }
 216+pre_indent_line = sol space l:inlineline { return l }
 217+
 218+// Syntax that stops inline expansion
 219+inline_breaks
 220+ = //& { console.log(print_r(syntaxFlags)); return true; }
 221+ & { return syntaxFlags['table']; }
 222+ a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + print_r(a)); return true; }
 223+ / & { return syntaxFlags['italic']; } italic_marker { return true; }
 224+ / & { return syntaxFlags['bold']; } bold_marker { return true; }
 225+ / & { return syntaxFlags['linkdesc']; } link_end { return true; }
 226+
 227+
263228 inline
264 - = c:(inline_element / anything)+ {
 229+ = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
265230 var out = [];
266231 var text = '';
267232 for (var i = 0; i < c.length; i++) {
@@ -287,9 +252,10 @@
288253 }
289254
290255 inlineline
291 - = c:(inline_element / anyline)+ {
 256+ = c:(text / !inline_breaks (inline_element / [^\n]))+ {
292257 var out = [];
293258 var text = '';
 259+ dp("inlineline: " + print_r(c));
294260 for (var i = 0; i < c.length; i++) {
295261 if (typeof c[i] == 'string') {
296262 text += c[i];
@@ -313,7 +279,9 @@
314280 return out;
315281 }
316282
317 -// TODO: convert all these to annotations!
 283+/* TODO: convert all these to annotations!
 284+ * -> need (start, end) offsets within block
 285+ */
318286 inline_element
319287 = comment
320288 / xmlish_tag
@@ -333,11 +301,8 @@
334302
335303 comment_chars
336304 = c:[^-] { return c; }
337 - / c:'-' !'-' { return c; }
 305+ / c:'-' !'->' { return c; }
338306
339 -inline_text_run
340 - = c:[^\n]+ { return c.join(''); }
341 -
342307 extlink
343308 = "[" target:url " " text:extlink_text "]" {
344309 return {
@@ -423,36 +388,50 @@
424389 = h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }
425390
426391 link_text
427 - = h:( !"]]" x:(anyline) { return x } )* { return h.join(''); }
 392+ = h:( & { return setFlag('linkdesc'); }
 393+ x:inlineline { return x }
 394+ )* {
 395+ clearFlag('linkdesc')
 396+ return h.join('');
 397+ }
 398+ / & { clearFlag('linkdesc') } { return null; }
428399
 400+link_end = "]]"
 401+
429402 bold
430 - = bold_marker c:bold_text bold_marker {
431 - return {
432 - type: 'b',
433 - text: c,
434 - }
435 -}
 403+ = bold_marker
 404+ & { dp('benter:' + pos); return setFlag('bold'); }
 405+ c:inlineline
 406+ bold_marker {
 407+ clearFlag('bold');
 408+ return {
 409+ type: 'b',
 410+ text: c,
 411+ }
 412+ }
 413+ / bold_marker { clearFlag('bold'); return null }
436414
437415 bold_marker
438416 = "'''"
439417
440 -bold_text
441 - = h:( !bold_marker x:(anyline) { return x } )+ { return h.join(''); }
442418
443419 italic
444 - = italic_marker c:italic_text italic_marker {
445 - return {
446 - type: 'i',
447 - text: c
448 - }
449 -}
 420+ = italic_marker
 421+ & { dp('ienter:' + pos); return setFlag('italic'); }
 422+ c:inlineline
 423+ italic_marker {
 424+ clearFlag('italic');
 425+ dp('ileave:' + pos);
 426+ return {
 427+ type: 'i',
 428+ text: c
 429+ }
 430+ }
 431+ / italic_marker { clearFlag('italic'); return null }
450432
451433 italic_marker
452434 = "''"
453435
454 -italic_text
455 - = h:( !italic_marker x:(anyline) { return x } )+ { return h.join(''); }
456 -
457436 /* Will need to check anything xmlish against known/allowed HTML tags and
458437 * registered extensions, otherwise fail the match. Should ref be treated as a
459438 * regular extension? */
@@ -500,7 +479,7 @@
501480 }
502481
503482 ref_content
504 - = !ref_end a:(inline_element / anyline) {
 483+ = !ref_end a:(inline) {
505484 return a;
506485 }
507486
@@ -544,7 +523,7 @@
545524 }
546525
547526 references_content
548 - = !references_end a:(inline_element / anyline) {
 527+ = !references_end a:(inline) {
549528 return a;
550529 }
551530
@@ -584,9 +563,10 @@
585564 }
586565 }
587566
588 -li = bullets:list_char+
589 - c:(inlineline / anyline)
590 - newline
 567+li = sol
 568+ bullets:list_char+
 569+ c:inlineline
 570+ &newline
591571 {
592572 return {
593573 type: 'listItem',
@@ -597,11 +577,12 @@
598578 };
599579 }
600580
601 -dtdd = bullets:list_char+
 581+dtdd = sol
 582+ bullets:list_char+
602583 c:(inline_element / [^:\n])+
603584 ":"
604585 d:(inline_element / [^\n])+
605 - newline
 586+ &newline
606587 {
607588 // reject rule if bullets do not end in semicolon
608589 if (bullets[bullets.length - 1] != ';') {
@@ -641,23 +622,27 @@
642623 // FIXME: actually parse and build structure
643624 res.attributes = { unparsed: tas }
644625 }
645 - dp(print_r(res));
 626+ //dp(print_r(res));
646627 return res;
647628 }
648629
649630 table_start
650 - = "{|"
651 - & { forbiddenThingRegexp.push('(\n|^)\\||\\|[|\x7d+]'); return true; }
652 - ta:table_attribs* space* newline? {
653 - dp("table_start " + print_r(ta) + ", pos:" + pos);
 631+ = sol
 632+ "{|"
 633+ & { setFlag('table'); return true; }
 634+ ta:table_attribs*
 635+ space* {
 636+ //dp("table_start " + print_r(ta) + ", pos:" + pos);
654637 return ta;
655638 }
656 - / "{|" { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); return null; }
 639+ / sol "{|" { clearFlag('table'); return null; }
657640
658 -table_attribs = anyline
 641+table_attribs
 642+ = text / ! inline_breaks !newline .
659643
660644 table_caption
661 - = "|+" c:inline* newline? {
 645+ = newline
 646+ "|+" c:inline* {
662647 return {
663648 type: 'tableCaption',
664649 content: c[0]
@@ -685,7 +670,9 @@
686671 }
687672
688673 table_row
689 - = "|-" space* newline? td:(table_data / table_header)* {
 674+ = & { dp("table row enter"); return true; }
 675+ newline
 676+ "|-" thtd_attribs? space* td:(table_data / table_header)* {
690677 return {
691678 type: 'tableRow',
692679 children: td
@@ -694,44 +681,39 @@
695682
696683 table_data
697684 = & { dp("table_data enter, pos=" + pos); return true; }
698 - "||" td_attr? td:anyblock* newline? {
699 - //dp("table || result:" + print_r(td));
 685+ ("||" / newline "|")
 686+ ! [}+-]
 687+ a:thtd_attribs?
 688+ td:(!inline_breaks anyblock)* {
 689+ dp("table data result: " + print_r(td) + ", attribts: " + print_r(a));
700690 return {
701691 type: 'tableCell',
 692+ attributes: { unparsed: a },
702693 children: td
703694 };
704695 }
705 - / & { dp("table_data : | enter pos=" + pos); return true; }
706 - "|" ![}+-] td_attr? td:anyblock* newline? {
707 - //dp("table | result:" + print_r(td));
708 - return {
709 - type: 'tableCell',
710 - children: td
711 - };
712 - }
713696
714 -td_attr = a:[^\n|]+ "|" !"|" {
715 - dp("td_attr: " + a.join(''));
716 - return a.join('');
717 -}
 697+table_header
 698+ = ("!!" / newline "!")
 699+ a:thtd_attribs?
 700+ c:inline {
 701+ return {
 702+ type: 'tableHeading',
 703+ attributes: { unparsed: a },
 704+ children: c
 705+ }
 706+ }
718707
719 -table_header
720 - = "!!" c:(inline / ("!" !"!") / [^!\n])* newline? {
721 - return {
722 - type: 'tableHeading',
723 - children: c
724 - }
725 - }
726 - / "!" c:(inline / text / '!' !'!' / [^!\n])* newline? {
727 - return {
728 - type: 'tableHeading',
729 - children: c
730 - }
731 - }
 708+thtd_attribs
 709+ // In particular, do not match [|\n]
 710+ = a:(text / ! inline_breaks [="':;/,.-] )+ "|" ! [|}+-] {
 711+ return a;
 712+ }
732713
733 -table_end = "|}" newline? { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); }
734714
 715+table_end = newline? "|}" { clearFlag('table'); }
735716
 717+
736718 /* Wikidom TODO:
737719 * split off text into content nodes
738720 * convert inlines into annotations

Status & tagging log