r105536 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: r105535 | r105536 | r105537 >
Date: 10:59, 8 December 2011
Author: gwicke
Status: deferred
Tags:
Comment:
Further renaming, this time from pegParser to pegTokenizer.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.peg.js (deleted) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js (added) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (deleted) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (added) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)
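For context, a minimal usage sketch of the renamed entry point, based on the updated parserTests.js in the diff below. This is not part of the revision itself: it assumes Node.js, that PEG.js is available as the global PEG the module expects, and that the grammar file path matches your checkout; everything other than PegTokenizer, .src, and tokenize() is illustrative.

var fs = require('fs');
var path = require('path');
var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer;

// Preload the grammar source, as the updated parserTests.js does.
PegTokenizer.src = fs.readFileSync(
    path.join(__dirname, 'pegTokenizer.pegjs.txt'), 'utf8');

var wikiTokenizer = new PegTokenizer();

// tokenize() builds a parser from PegTokenizer.src and calls back with
// (tokens, err) -- the same callback shape parseToTree() used before the rename.
wikiTokenizer.tokenize("''Hello'' [[world]]\n", function(tokens, err) {
    if (err) {
        console.error(err);
    } else {
        console.log(JSON.stringify(tokens, null, 2));
    }
});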

Diff

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -142,7 +142,7 @@
143143
144144 var testWhiteList = require('./parserTests-whitelist.js').testWhiteList;
145145
146 -_import(pj('parser', 'mediawiki.parser.peg.js'), ['PegParser']);
 146+_import(pj('parser', 'mediawiki.tokenizer.peg.js'), ['PegTokenizer']);
147147 _import(pj('parser', 'mediawiki.parser.environment.js'), ['MWParserEnvironment']);
148148 _import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
149149
@@ -158,9 +158,9 @@
159159 _require(pj('es', 'serializers', 'es.JsonSerializer.js'));
160160
161161 // Preload the grammar file...
162 -PegParser.src = fs.readFileSync(path.join(basePath, 'parser', 'pegParser.pegjs.txt'), 'utf8');
 162+PegTokenizer.src = fs.readFileSync(path.join(basePath, 'parser', 'pegTokenizer.pegjs.txt'), 'utf8');
163163
164 -var parser = new PegParser();
 164+var wikiTokenizer = new PegTokenizer();
165165
166166 var testFileName = '../../../../phase3/tests/parser/parserTests.txt'; // default
167167 var testFileName2 = '../../../../tests/parser/parserTests.txt'; // Fallback. Not everyone fetch at phase3 level
@@ -378,7 +378,7 @@
379379 console.log(item.input + "\n");
380380 }
381381
382 - parser.parseToTree(item.input + "\n", function(tokens, err) {
 382+ wikiTokenizer.tokenize(item.input + "\n", function(tokens, err) {
383383 if (err) {
384384 printTitle();
385385 failParseTests++;
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.peg.js
@@ -1,133 +0,0 @@
2 -/**
3 - * Parser for wikitext to provisional temp structure, using PEG.js and
4 - * a separate PEG grammar file (pegParser.pegjs.txt)
5 - *
6 - * Use along with the MWTreeRenderer and MWTreeSerializer classes for
7 - * HTML output and source round-tripping.
8 - *
9 - * If installed as a user script or to customize, set parserPlaygroundPegPage
10 - * to point at the MW page name containing the parser peg definition; default
11 - * is 'MediaWiki:Gadget-ParserPlayground-PegParser.pegjs'.
12 - */
13 -function PegParser(env) {
14 - this.env = env || {};
15 -}
16 -
17 -PegParser.src = false;
18 -
19 -PegParser.prototype.parseToTree = function(text, callback) {
20 - this.initSource(function() {
21 - var out, err;
22 - try {
23 - var parser = PEG.buildParser(PegParser.src);
24 - out = parser.parse(text);
25 - } catch (e) {
26 - err = e;
27 - console.trace();
28 - } finally {
29 - callback(out, err);
30 - }
31 - });
32 -}
33 -
34 -/**
35 - * @param {object} tree
36 - * @param {function(tree, error)} callback
37 - */
38 -PegParser.prototype.expandTree = function(tree, callback) {
39 - var self = this;
40 - var subParseArray = function(listOfTrees) {
41 - var content = [];
42 - $.each(listOfTrees, function(i, subtree) {
43 - self.expandTree(subtree, function(substr, err) {
44 - content.push(tree);
45 - });
46 - });
47 - return content;
48 - };
49 - var src;
50 - if (typeof tree === "string") {
51 - callback(tree);
52 - return;
53 - }
54 - if (tree.type == 'template') {
55 - // expand a template node!
56 -
57 - // Resolve a possibly relative link
58 - var templateName = this.env.resolveTitle( tree.target, 'Template' );
59 - this.env.fetchTemplate( tree.target, tree.params || {}, function( templateSrc, error ) {
60 - // @fixme should pre-parse/cache these too?
61 - self.parseToTree( templateSrc, function( templateTree, error ) {
62 - if ( error ) {
63 - callback({
64 - type: 'placeholder',
65 - orig: tree,
66 - content: [
67 - {
68 - // @fixme broken link?
69 - type: 'link',
70 - target: templateName
71 - }
72 - ]
73 - });
74 - } else {
75 - callback({
76 - type: 'placeholder',
77 - orig: tree,
78 - content: self.env.expandTemplateArgs( templateTree, tree.params )
79 - });
80 - }
81 - })
82 - } );
83 - // Wait for async...
84 - return;
85 - }
86 - var out = $.extend( tree ); // @fixme prefer a deep copy?
87 - if (tree.content) {
88 - out.content = subParseArray(tree.content);
89 - }
90 - callback(out);
91 -};
92 -
93 -PegParser.prototype.initSource = function(callback) {
94 - if (PegParser.src) {
95 - callback();
96 - } else {
97 - if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
98 - $.ajax({
99 - url: wgScriptPath + '/api' + wgScriptExtension,
100 - data: {
101 - format: 'json',
102 - action: 'query',
103 - prop: 'revisions',
104 - rvprop: 'content',
105 - titles: parserPlaygroundPegPage
106 - },
107 - success: function(data, xhr) {
108 - $.each(data.query.pages, function(i, page) {
109 - if (page.revisions && page.revisions.length) {
110 - PegParser.src = page.revisions[0]['*'];
111 - }
112 - });
113 - callback()
114 - },
115 - dataType: 'json',
116 - cache: false
117 - }, 'json');
118 - } else {
119 - $.ajax({
120 - url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',
121 - success: function(data) {
122 - PegParser.src = data;
123 - callback();
124 - },
125 - dataType: 'text',
126 - cache: false
127 - });
128 - }
129 - }
130 -};
131 -
132 -if (typeof module == "object") {
133 - module.exports.PegParser = PegParser;
134 -}
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
@@ -1,1199 +0,0 @@
2 -/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
3 -{
4 - /* Fixme: use static functions to separate module! Unfortunately, this
5 - * does not work:
6 - * var tu = require('./mediawiki.tokenizer.utils.js');
7 - * console.log(tu.flatten([]));
8 - * Using exports in the module gets a bit further, but accesses to
9 - * tu.flatten in productions still fail. Thus, I just moved the functions
10 - * here until a solution is found:
11 - */
12 -
13 - /* Static utilities */
14 -
15 - // Flatten a list of lists.
16 - var flatten = function ( e ) {
17 - var es = [];
18 - // flatten sub-arrays
19 - for(var i = 0, length = e.length; i < length; i++) {
20 - var ei = e[i];
21 - if ($.isArray(ei))
22 - es = es.concat(flatten(ei));
23 - else
24 - es.push(ei);
25 - };
26 - return es;
27 - };
28 -
29 - // Remove escaped quotes from attributes etc
30 - var unquote = function (quotec, text) {
31 - return text.replace('\\' + quotec, quotec);
32 - };
33 -
34 - // Decode html entities. In a browser, this should only be fed the entity,
35 - // not untrusted html! XXX: replace with safer version.
36 - var unentity = function ( entity ) {
37 - return $("<div/>").html(entity).text();
38 - };
39 -
40 - // Debug print with global switch
41 - var dp = function ( msg ) {
42 - if ( false ) {
43 - console.log(msg);
44 - }
45 - };
46 -
47 - var pp = function ( s ) { return JSON.stringify(s, null, 2); }
48 -
49 - /*
50 - * Annotate a token stream with list items with appropriate list tokens
51 - *
52 - * @static
53 - * @method
54 - * @param {[tokens]} Token stream with li tokens
55 - * @returns {[tokens]} Token stream, possibly with additional list tokens
56 - * */
57 - var annotateList = function ( tokens ) {
58 - var out = [], // List of tokens
59 - bstack = [], // Bullet stack, previous element's listStyle
60 - bnext = [], // Next element's listStyle
61 - endtags = []; // Stack of end tags
62 -
63 - var commonPrefixLength = function (x, y) {
64 - var minLength = Math.min(x.length, y.length);
65 - for(var i = 0; i < minLength; i++) {
66 - if (x[i] != y[i])
67 - break;
68 - }
69 - return i;
70 - };
71 -
72 - var pushList = function ( listName, itemName ) {
73 - out.push({type: 'TAG', name: listName});
74 - out.push({type: 'TAG', name: itemName});
75 - endtags.push({type: 'ENDTAG', name: listName});
76 - endtags.push({type: 'ENDTAG', name: itemName});
77 - };
78 -
79 - var popTags = function ( n ) {
80 - for(;n > 0; n--) {
81 - // push list item..
82 - out.push(endtags.pop());
83 - // and the list end tag
84 - out.push(endtags.pop());
85 - }
86 - };
87 -
88 - var isDlDd = function (a, b) {
89 - var ab = [a,b].sort();
90 - return (ab[0] === ':' && ab[1] === ';');
91 - };
92 -
93 - var doListItem = function ( bs, bn ) {
94 - var prefixLen = commonPrefixLength (bs, bn);
95 - var changeLen = Math.max(bs.length, bn.length) - prefixLen;
96 - var prefix = bn.slice(0, prefixLen);
97 - // emit close tag tokens for closed lists
98 - if (changeLen === 0) {
99 - var itemToken = endtags.pop();
100 - out.push(itemToken);
101 - out.push({type: 'TAG', name: itemToken.name});
102 - endtags.push({type: 'ENDTAG', name: itemToken.name});
103 - } else if ( bs.length == bn.length
104 - && changeLen == 1
105 - && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
106 - // handle dd/dt transitions
107 - out.push(endtags.pop());
108 - if( bn[prefixLen] == ';') {
109 - var newName = 'dt';
110 - } else {
111 - var newName = 'dd';
112 - }
113 - out.push({type: 'TAG', name: newName});
114 - endtags.push({type: 'ENDTAG', name: newName});
115 - } else {
116 - popTags(bs.length - prefixLen);
117 -
118 - if (prefixLen > 0 && bn.length == prefixLen ) {
119 - var itemToken = endtags.pop();
120 - out.push(itemToken);
121 - out.push({type: 'TAG', name: itemToken.name});
122 - endtags.push({type: 'ENDTAG', name: itemToken.name});
123 - }
124 -
125 - for(var i = prefixLen; i < bn.length; i++) {
126 - switch (bn[i]) {
127 - case '*':
128 - pushList('ul', 'li');
129 - break;
130 - case '#':
131 - pushList('ol', 'li');
132 - break;
133 - case ';':
134 - pushList('dl', 'dt');
135 - break;
136 - case ':':
137 - pushList('dl', 'dd');
138 - break;
139 - default:
140 - throw("Unknown node prefix " + prefix[i]);
141 - }
142 - }
143 - }
144 - };
145 -
146 - for (var i = 0, length = tokens.length; i < length; i++) {
147 - var token = tokens[i];
148 - switch ( token.type ) {
149 - case 'TAG':
150 - switch (token.name) {
151 - case 'list':
152 - // ignore token
153 - break;
154 - case 'listItem':
155 - // convert listItem to list and list item tokens
156 - bnext = token.bullets;
157 - doListItem( bstack, bnext );
158 - bstack = bnext;
159 - break;
160 - default:
161 - // pass through all remaining start tags
162 - out.push(token);
163 - break;
164 - }
165 - break;
166 - case 'ENDTAG':
167 - if ( token.name == 'list' ) {
168 - // pop all open list item tokens
169 - popTags(bstack.length);
170 - bstack = [];
171 - } else {
172 - out.push(token);
173 - }
174 - break;
175 - default:
176 - out.push(token);
177 - break;
178 - }
179 - }
180 - return out;
181 - };
182 -
183 - /*
184 - * Italic/Bold handling.
185 - *
186 - * - list of tokens
187 - * - NEWLINE
188 - * - ticks (2+) -> list with link in line token list?
189 - * - process on newline
190 - * - need access to text nodes before/after for conversion back to text
191 - */
192 - var doQuotes = function ( tokens ) {
193 -
194 - var italics = [],
195 - bolds = [],
196 - out = [],
197 - inserted = 0;
198 -
199 - var convertBold = function ( i ) {
200 - var index = bolds[i];
201 - var txt = out[index - 1];
202 - txt.value += "'";
203 - if ( i > 0 ) {
204 - bolds = bolds.slice(0, i)
205 - .concat(bolds.slice(i + 1, bolds.length - i - 1));
206 - } else {
207 - bolds.shift();
208 - }
209 -
210 - italics.push(index);
211 - italics.sort(function(a,b) { return a - b });
212 - };
213 -
214 - // convert italics/bolds into tags
215 - var quotesToTags = function ( offsets, name ) {
216 - var toggle = true;
217 - for (var j = 0; j < offsets.length; j++) {
218 - var t = out[offsets[j]];
219 - if(toggle) {
220 - t.type = 'TAG';
221 - } else {
222 - t.type = 'ENDTAG';
223 - }
224 - t.name = name;
225 - delete t.value;
226 - toggle = !toggle;
227 - }
228 - if (!toggle) {
229 - // add end tag
230 - out.push({type: 'ENDTAG', name: name});
231 - inserted++;
232 - }
233 - toggle = true;
234 - };
235 -
236 - for (var i = 0, length = tokens.length; i < length; i++) {
237 - var token = tokens[i];
238 - switch (token.type) {
239 - case 'QUOTE':
240 - // depending on length, add starting 's to preceding text node
241 - // (if any)
242 - // add token index to italic/bold lists
243 - // add placeholder for token
244 - var qlen = token.value.length;
245 - switch (qlen) {
246 - case 2: italics.push(i + inserted); out.push(token); break;
247 - case 3: bolds.push(i + inserted); out.push(token); break;
248 - case 4:
249 - token.value = "'''";
250 - if (i > 0 && tokens[i-1].type === 'TEXT') {
251 - tokens[i-1].value += "'";
252 - } else {
253 - out.push({type: 'TEXT', value: "'"});
254 - inserted++;
255 - }
256 - bolds.push(i + inserted);
257 - out.push(token);
258 - break;
259 - case 5:
260 - // order does not matter here, will be fixed
261 - // by HTML parser backend
262 - italics.push(i + inserted);
263 - out.push({type: 'QUOTE', value: "''"});
264 - inserted++;
265 - bolds.push(i + inserted);
266 - out.push({type: 'QUOTE', value: "'''"});
267 - break;
268 - default: // longer than 5, only use the last 5 ticks
269 - token.value = "'''''";
270 - var newvalue = token.value.substr(0, qlen - 5 );
271 - if (i > 0 && tokens[i-1].type === 'TEXT') {
272 - tokens[i-1].value += newvalue;
273 - } else {
274 - out.push({type: 'TEXT', value: newvalue});
275 - inserted++;
276 - }
277 - italics.push(i + inserted);
278 - out.push({type: 'QUOTE', value: "''"});
279 - inserted++;
280 - bolds.push(i + inserted);
281 - out.push({type: 'QUOTE', value: "'''"});
282 - break;
283 - }
284 - break;
285 -
286 - case 'NEWLINE':
287 - // balance out tokens, convert placeholders into tags
288 - if (italics.length % 2 && bolds.length % 2) {
289 - dp("balancing!");
290 - var firstsingleletterword = -1,
291 - firstmultiletterword = -1,
292 - firstspace = -1;
293 - for (var j = 0; j < bolds.length; j++) {
294 - var ticki = bolds[j];
295 - if (ticki > 0 && out[ticki - 1].type === 'TEXT') {
296 - var txt = out[ticki - 1],
297 - lastchar = txt.value[txt.value.length - 1],
298 - secondtolastchar = txt.value[txt.value.length - 2];
299 - dp('txt: ' + pp(txt));
300 - if (lastchar === ' ' && firstspace === -1) {
301 - firstspace = j;
302 - } else if (lastchar !== ' ') {
303 - if ( secondtolastchar === ' ' &&
304 - firstsingleletterword === -1)
305 - {
306 - firstsingleletterword = j;
307 - } else if ( firstmultiletterword == -1) {
308 - firstmultiletterword = j;
309 - }
310 - }
311 - }
312 - }
313 -
314 -
315 - // now see if we can convert a bold to an italic and
316 - // an apostrophe
317 - if (firstsingleletterword > -1) {
318 - convertBold(firstsingleletterword);
319 - } else if (firstmultiletterword > -1) {
320 - convertBold(firstmultiletterword);
321 - } else if (firstspace > -1) {
322 - convertBold(firstspace);
323 - }
324 - }
325 -
326 - quotesToTags(bolds, 'b');
327 - quotesToTags(italics, 'i');
328 - bolds = [];
329 - italics = [];
330 - out.push(token);
331 - break;
332 - default:
333 - out.push(token);
334 - }
335 - }
336 - return out;
337 - };
338 -
339 -
340 - /* End static utilities */
341 -
342 - /*
343 - * Flags for specific parse environments (inside tables, links etc). Flags
344 - * trigger syntactic stops in the inline_breaks production, which
345 - * terminates inline and attribute matches. Flags merely reduce the number
346 - * of productions needed: The grammar is still context-free as the
347 - * productions can just be unrolled for all combinations of environments
348 - * at the cost of a much larger grammar.
349 - */
350 - var syntaxFlags = {};
351 - var setFlag = function(flag) {
352 - if (syntaxFlags[flag] !== undefined) {
353 - syntaxFlags[flag]++;
354 - } else {
355 - syntaxFlags[flag] = 1;
356 - }
357 - return true;
358 - };
359 - var clearFlag = function(flag) {
360 - syntaxFlags[flag]--;
361 - };
362 -
363 - // Start position of top-level block
364 - // Could also provide positions for lower-level blocks using a stack.
365 - var blockStart = 0;
366 -
367 - // Start position of generic tag production
368 - var tagStartPos = 0;
369 -
370 - // cache the input length
371 - var inputLength = input.length;
372 -
373 - // pseudo-production that matches at end of input
374 - var isEOF = function (pos) {
375 - return pos === inputLength;
376 - };
377 -
378 - // text start position
379 - var textStart = 0;
380 -
381 - // hack to support numbered external links ([http://example.com]).
382 - // XXX: Move to token stream transform after templates are expanded!
383 - var linkCount = 1;
384 -
385 - // Define block-level tags in JS, so we can use toLowerCase to match tags
386 - // case-independently. This would be quite ugly (and possibly slower) if
387 - // done manually in the grammar.
388 - var block_names = (function () {
389 - var names = [ "p", "table", "td", "tr", "ul", "ol"
390 - , "li", "dl", "dt", "dd", "div", "center"
391 - , "blockquote" ];
392 - var bnames = {};
393 - for(var i = 0, l = names.length; i < l; i++) {
394 - bnames[names[i]] = true;
395 - }
396 - return bnames;
397 - })();
398 -
399 -
400 -}
401 -
402 -start
403 - = e:toplevelblock* newline* {
404 - return flatten(e);
405 - }
406 -
407 -
408 -/* All chars that cannot start syntactic structures in the middle of a line
409 - * XXX: ] and other end delimiters should probably only be activated inside
410 - * structures to avoid unnecessarily leaving the text production on plain
411 - * content. */
412 -
413 -text_char = [^'<~[{\n\r:\]}|!=]
414 -
415 -text = t:text_char+ { return t.join(''); }
416 -
417 -/* Explanation of chars
418 - * ' quotes (italic/bold)
419 - * < start of xmlish_tag
420 - * ~ signatures/dates
421 - * [ start of links
422 - * { start of parser functions, transclusion and template args
423 - * \n all sort of block-level markup at start of line
424 - * \r ditto
425 - * h http(s) urls
426 - * n nntp(s) urls
427 - * m mailto urls
428 - *
429 - * ! and | table cell delimiters, might be better to specialize those
430 - * = headings - also specialize those!
431 - *
432 - * The following chars are also included for now, but only apply in some
433 - * contexts and should probably be enabled only in those:
434 - * : separate definition in ; term : definition
435 - * ] end of link
436 - * } end of parser func/transclusion/template arg
437 - */
438 -
439 -urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
440 - / htmlentity
441 - / urllink
442 - // Convert trailing space into &nbsp;
443 - // XXX: This should be moved to a serializer
444 - / ' ' & ':' { return "\u00a0"; }
445 - / t:text_char )+
446 -
447 -/*
448 - '//', // for protocol-relative URLs, but not in text!
449 - 'ftp://',
450 - 'git://',
451 - 'gopher://',
452 - 'http://',
453 - 'https://',
454 - 'irc://',
455 - 'ircs://', // @bug 28503
456 - 'mailto:',
457 - 'mms://',
458 - 'news:',
459 - 'nntp://', // @bug 3808 RFC 1738
460 - 'svn://',
461 - 'telnet://', // Well if we're going to support the above.. -ævar
462 - 'worldwind://',
463 -*/
464 -
465 -// Old version
466 -//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
467 -
468 -// Experimental tweaked version: avoid expensive single-char substrings
469 -// This did not bring the expected performance boost, however.
470 -//text = [A-Za-z0-9,._ -] {
471 -// textStart = pos;
472 -//
473 -// var res = input.substr(textStart - 1, inputLength)
474 -// .match(/[A-Za-z0-9,._ -]+/)[0];
475 -// pos = pos + (res.length - 1);
476 -// return res
477 -// }
478 -
479 -htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
480 - return unentity("&" + c.join('') + ";")
481 -}
482 -
483 -space
484 - = s:[ \t]+ { return s.join(''); }
485 -
486 -optionalSpaceToken
487 - = s:space* {
488 - if ( s.length ) {
489 - return [{type: 'TEXT', value: s.join('')}];
490 - } else {
491 - return [];
492 - }
493 - }
494 -
495 -
496 -// Start of line
497 -sol = (newline / & { return pos === 0; } { return true; })
498 - cn:(c:comment n:newline? { return [c, {type: 'TEXT', value: n}] })* {
499 - return [{type: 'NEWLINE'}].concat(cn);
500 - }
501 -
502 -eof = & { return isEOF(pos); } { return true; }
503 -
504 -
505 -newline
506 - = '\n' / '\r\n'
507 -
508 -eolf = newline / eof
509 -
510 -toplevelblock
511 - = & { blockStart = pos; return true; } b:block {
512 - b = flatten(b);
513 - var bs = b[0];
514 - //dp('toplevelblock:' + pp(b));
515 - if (bs.attribs === undefined) {
516 - bs.attribs = [];
517 - }
518 - bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
519 - // XXX: only run this for lines that actually need it!
520 - b.push({type: 'NEWLINE'});
521 - b = doQuotes(b);
522 - return b;
523 - }
524 -
525 -block
526 - = block_lines
527 - / pre
528 - / comment &eolf
529 - / nowiki
530 - / pre
531 - / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
532 - / para
533 - / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
534 - / s:sol {
535 - if (s) {
536 - return [s, {type: 'NEWLINE'}];
537 - } else {
538 - return [{type: 'NEWLINE'}];
539 - }
540 - }
541 -
542 -block_lines
543 - = s:sol
544 - // eat an empty line before the block
545 - s2:(ss:space* so:sol { return [{type: 'TEXT', value: ss.join('')}].concat(so) })?
546 - bl:block_line {
547 - var s2_ = (s2 !== '') ? s2 : [];
548 - return s.concat(s2_, bl);
549 - }
550 -
551 -// Block structures with start-of-line wiki syntax
552 -block_line
553 - = h
554 - / table
555 - / lists
556 - // tag-only lines should not trigger pre
557 - / st:optionalSpaceToken
558 - bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
559 - &eolf {
560 - return st.concat(bt);
561 - }
562 - / pre_indent
563 - / pre
564 -
565 -
566 -
567 -
568 -// TODO: convert inline content to annotations!
569 -para
570 - = s1:sol s2:sol c:inlineline {
571 - return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
572 - }
573 -
574 -br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
575 -
576 -// Syntax stops to limit inline expansion defending on syntactic context
577 -inline_breaks
578 - =
579 - & { // Important hack: disable caching for this production, as the default
580 - // cache key does not take into account flag states!
581 - cacheKey = '';
582 - return true;
583 - }
584 - & { return syntaxFlags['table']; }
585 - a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
586 - / & { return (syntaxFlags['colon'] &&
587 - ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
588 - ! syntaxFlags.linkdesc); } ":" { return true; }
589 - / & { return syntaxFlags['extlink']; } "]" { return true; }
590 - / & { return syntaxFlags['linkdesc']; } link_end { return true; }
591 - / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
592 - / & { return syntaxFlags['template']; } ('|' / '}}') { return true; }
593 -
594 -inline
595 - = c:(urltext / (! inline_breaks (inline_element / . )))+ {
596 - var out = [];
597 - var text = [];
598 - c = flatten(c);
599 - for (var i = 0, l = c.length; i < l; i++) {
600 - var ci = c[i];
601 - if (typeof ci == 'string') {
602 - text.push(ci);
603 - } else {
604 - if (text.length) {
605 - out.push({ type: "TEXT", value: text.join('') });
606 - text = [];
607 - }
608 - out.push(ci);
609 - }
610 - }
611 - if (text.length) {
612 - out.push({ type: 'TEXT', value: text.join('') });
613 - }
614 - //dp('inline out:' + pp(out));
615 - return out;
616 -}
617 -
618 -
619 -inlineline
620 - = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
621 - var out = [];
622 - var text = [];
623 - c = flatten(c);
624 - for (var i = 0; i < c.length; i++) {
625 - var ci = c[i]
626 - if (typeof ci == 'string') {
627 - text.push(ci);
628 - } else {
629 - if (text.length) {
630 - out.push({type: 'TEXT', value: text.join('')});
631 - text = [];
632 - }
633 - out.push(ci);
634 - }
635 - }
636 - if (text.length) {
637 - out.push({type: 'TEXT', value: text.join('')});
638 - }
639 - //dp('inlineline out:' + pp(out));
640 - return out;
641 -}
642 -
643 -inline_element
644 - = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
645 - & '<' ( comment / xmlish_tag )
646 - / & '{' ( template / tplarg )
647 - / & '[' ( wikilink / extlink )
648 - / & "'" quote
649 -
650 -/* Headings */
651 -
652 -h = & "=" // guard, to make sure '='+ will match.
653 - // XXX: Also check to end to avoid inline parsing?
654 - r:(
655 - s:'='+ // moved in here to make s accessible to inner action
656 - & { return setFlag('h'); }
657 - c:inlineline
658 - e:'='+
659 - spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
660 - &eolf
661 - {
662 - clearFlag('h');
663 - var level = Math.min(s.length, e.length);
664 - // convert surplus equals into text
665 - if(s.length > level) {
666 - var extras = s.substr(0, s.length - level);
667 - if(c[0].type == 'TEXT') {
668 - c[0].value = extras + c[0].value;
669 - } else {
670 - c.unshift({type: 'TEXT', value: extras});
671 - }
672 - }
673 - if(e.length > level) {
674 - var extras = e.substr(0, e.length - level),
675 - lastElem = c[c.length - 1];
676 - if(lastElem.type == 'TEXT') {
677 - lastElem.value = lastElem.value + extras;
678 - } else {
679 - c.push({type: 'TEXT', value: extras});
680 - }
681 - }
682 -
683 - return [{type: 'TAG', name: 'h' + level}]
684 - .concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
685 - }
686 - / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
687 - ) { return r }
688 -
689 -
690 -pre_indent
691 - = l:pre_indent_line ls:(sol pre_indent_line)* {
692 - return [{type: 'TAG', name: 'pre'}]
693 - .concat( [l], ls
694 - , [{type: 'ENDTAG', name: 'pre'}]);
695 - }
696 -pre_indent_line = space l:inlineline {
697 - return [{type: 'TEXT', value: '\n'}].concat(l);
698 -}
699 -
700 -
701 -comment
702 - = '<!--' c:comment_chars* ('-->' / eof)
703 - cs:(space* newline space* cn:comment { return cn })* {
704 - return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
705 - }
706 -
707 -comment_chars
708 - = c:[^-] { return c; }
709 - / c:'-' !'->' { return c; }
710 -
711 -
712 -urllink
713 - = target:url {
714 - return [ { type: 'TAG',
715 - name: 'a',
716 - attribs: [['href', target]] }
717 - , {type: 'TEXT', value: target}
718 - , {type: 'ENDTAG', name: 'a'}
719 - ];
720 - }
721 -
722 -extlink
723 - = "["
724 - & { return setFlag('extlink'); }
725 - target:url
726 - space*
727 - text:inlineline?
728 - "]" {
729 - clearFlag('extlink');
730 - if ( text == '' ) {
731 - // XXX: Link numbering should be implemented in post-processor.
732 - text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
733 - linkCount++;
734 - }
735 - return [ { type: 'TAG',
736 - name: 'a',
737 - attribs: [['href', target]] } ]
738 - .concat( text
739 - , [{type: 'ENDTAG', name: 'a'}]);
740 - }
741 - / "[" & { clearFlag('extlink'); return false; }
742 -
743 -/* Defaul URL protocols in MediaWiki (see DefaultSettings). Normally these can
744 - * be configured dynamically. */
745 -url_protocol
746 - = '//' // for protocol-relative URLs
747 - / 'ftp://'
748 - / 'git://'
749 - / 'gopher://'
750 - / 'http://'
751 - / 'https://'
752 - / 'irc://'
753 - / 'ircs://' // @bug 28503
754 - / 'mailto:'
755 - / 'mms://'
756 - / 'news:'
757 - / 'nntp://' // @bug 3808 RFC 1738
758 - / 'svn://'
759 - / 'telnet://' // Well if we're going to support the above.. -ævar
760 - / 'worldwind://'
761 -
762 -// javascript does not support unicode features..
763 -unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
764 -
765 -
766 -urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
767 - return decodeURI("%" + c0 + c1)
768 -}
769 -
770 -//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
771 -url
772 - = proto:url_protocol
773 - rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
774 - / s:[.:,] !(space / eolf) { return s }
775 - / htmlentity
776 - / urlencoded_char
777 - / [&%] )+
778 -{
779 - return proto + rest.join('');
780 -}
781 -
782 -template
783 - = "{{" target:template_target
784 - params:(newline? "|" newline? p:template_param { return p })*
785 - "}}" {
786 - var obj = { type: 'TAG', name: 'template',
787 - attribs: [['target', target]],
788 - args: {}}
789 - if (params && params.length) {
790 - var position = 1;
791 - for ( var i = 0, l = params.length; i < l; i++ ) {
792 - var param = params[i];
793 - if ( param[0] === null ) {
794 - obj.args[position] = param[1];
795 - position++;
796 - } else {
797 - obj.args[param[0]] = param[1];
798 - }
799 - }
800 - // HACK: temporarily also push the args into an attribute
801 - // (just for debugging)
802 - obj.attribs.push(['data-args', JSON.stringify(obj.args)]);
803 - }
804 - // Should actually use a self-closing tag here, but the Node HTML5
805 - // parser only recognizes known self-closing tags for now, so use an
806 - // explicit end tag for now.
807 - //console.log(pp(obj));
808 - return obj;
809 - }
810 -
811 -template_target
812 - = h:( !"}}" x:([^|\n]) { return x } )* { return h.join(''); }
813 -
814 -template_param
815 - = name:template_param_name space* "=" space* c:template_param_text {
816 - return [name, c];
817 - } / c:template_param_text {
818 - return [null, c];
819 - }
820 -
821 -tplarg
822 - = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
823 - var obj = {
824 - type: 'SELFCLOSINGTAG',
825 - name: 'templatearg',
826 - attribs: [['argname', name]]
827 - };
828 - if (params && params.length) {
829 - // HACK, not final.
830 - obj.attribs.push(['data-args', JSON.stringify(params)]);
831 - }
832 - return obj;
833 - }
834 -
835 -template_param_name
836 - = h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
837 -
838 -template_param_text
839 - = & { return setFlag('template') }
840 - il:inline+ {
841 - clearFlag('template');
842 - return il;
843 - }
844 - / & { clearFlag('template'); return false; }
845 -
846 -wikilink
847 - = "[["
848 - ! url
849 - target:link_target text:("|" lt:link_text { return lt })* "]]" suffix:text? {
850 - var obj = {
851 - type: 'TAG',
852 - name: 'a',
853 - attribs: [['data-type', 'internal']]
854 - };
855 - obj.attribs.push(['href', target]);
856 - if (text && text.length) {
857 - var textTokens = text;
858 - } else {
859 - if (suffix !== '') {
860 - target += suffix;
861 - }
862 - var textTokens = [{type: 'TEXT', value: target}];
863 - }
864 - return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
865 - }
866 -
867 -link_target
868 - = h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
869 - / !"]]"
870 - hi:(
871 - [^|%\n]
872 - / urlencoded_char
873 - / '%'
874 - ) { return hi }
875 - )* { return h.join(''); }
876 -
877 -link_text
878 - = h:( & { return setFlag('linkdesc'); }
879 - x:inlineline { return x }
880 - )* {
881 - clearFlag('linkdesc')
882 - return h;
883 - }
884 - / & { clearFlag('linkdesc') } { return null; }
885 -
886 -link_end = "]]"
887 -
888 -/* Generic quote production for italic and bold, further processed in a token
889 - * stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
890 - * for each line of text to balance quotes per line.
891 - *
892 - * We are not using a simple pair rule here as we need to support mis-nested
893 - * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
894 - * all not context free. */
895 -quote = "''" x:"'"* {
896 - return {
897 - type : 'QUOTE',
898 - value: "''" + x.join('')
899 - }
900 -}
901 -
902 -/* XXX: Extension tags can require a change in the tokenizer mode, which
903 - * returns any text between extension tags verbatim. For now, we simply
904 - * continue to parse the contained text and return the tokens. The original
905 - * input source can be recovered from the source positions added on tag
906 - * tokens. This won't however work in all cases. For example, a comment start
907 - * (<!--) between extension tags would cause the remaining text to be consumed
908 - * as a comment. To avoid this, we might need to look ahead for the end tag
909 - * and limit the content parsing to this section. */
910 -
911 -xmlish_tag = nowiki / generic_tag
912 -
913 -pre
914 - = "<pre"
915 - attribs:generic_attribute*
916 - ">"
917 - ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }
918 - / nowiki
919 - / !"</pre>" t2:. {return {type:'TEXT',value:t2}})+
920 - ("</pre>" / eof) {
921 - // return nowiki tags as well?
922 - //console.log('inpre');
923 - return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
924 - .concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
925 - }
926 - / "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
927 -
928 -nowiki
929 - = "<nowiki>" nc:nowiki_content "</nowiki>" {
930 - // console.log(pp(nc));
931 - return nc;
932 - }
933 - / "<nowiki>" {
934 - //console.log('nowiki fallback');
935 - return [{type: 'TEXT', value: '<nowiki>'}];
936 - }
937 - / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
938 -
939 -nowiki_content
940 - = ts:( t:[^<]+ { return t.join('') }
941 - / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
942 - //console.log('nested pre in nowiki');
943 - return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
944 - }
945 - / (!("</nowiki>" / "</pre>") c:. {return c})
946 - )* {
947 - // return nowiki tags as well?
948 - return [{type: 'TEXT', value: ts.join('')}];
949 - }
950 -
951 -// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
952 -// following paragraphs
953 -block_tag
954 - = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
955 - attribs:generic_attribute*
956 - selfclose:"/"?
957 - ">" {
958 - if (block_names[name.toLowerCase()] !== true) {
959 - // abort match if tag is not block-level
960 - return null;
961 - }
962 - var res = {name: name, attribs: attribs};
963 - if ( end != '' ) {
964 - res.type = 'ENDTAG';
965 - } else if ( selfclose != '' ) {
966 - res.type = 'SELFCLOSINGTAG';
967 - } else {
968 - res.type = 'TAG';
969 - }
970 - return [res];
971 - }
972 -
973 -/* Generic XML-like tags
974 - *
975 - * These also cover extensions (including Cite), which will hook into the
976 - * token stream for further processing. The content of extension tags is
977 - * parsed as regular inline, but the source positions of the tag are added
978 - * to allow reconstructing the unparsed text from the input. */
979 -
980 -// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
981 -// following paragraphs
982 -generic_tag
983 - = "<"
984 - & { tagStartPos = pos; return true; } // remember the start position of this tag
985 - end:"/"? name:[0-9a-zA-Z]+
986 - attribs:generic_attribute*
987 - space*
988 - selfclose:"/"?
989 - ">" {
990 - var res = {name: name.join(''), attribs: attribs};
991 - if ( end != '' ) {
992 - res.type = 'ENDTAG';
993 - } else if ( selfclose != '' ) {
994 - res.type = 'SELFCLOSINGTAG';
995 - } else {
996 - res.type = 'TAG';
997 - }
998 - res.attribs.push(['data-sourceTagPos', (tagStartPos - 1) + ":" + pos]);
999 - return res;
1000 - }
1001 -
1002 -generic_attribute
1003 - = s:space*
1004 - name:generic_attribute_name
1005 - value:(space*
1006 - v:generic_attribute_value { return v })?
1007 -{
1008 - if ( value !== '' ) {
1009 - return [name, value];
1010 - } else {
1011 - return [name,''];
1012 - }
1013 -}
1014 -
1015 -// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
1016 -// disallow newlines and |.
1017 -generic_attribute_name
1018 - = n:[^ \t\0/"'>=\n|]+ {
1019 - return n.join('');
1020 - }
1021 -
1022 -generic_attribute_value
1023 - = "=" space* v:att_value {return v}
1024 -
1025 -att_value
1026 - = t:[^ \t'"<>='\n]+ { return t.join(''); }
1027 - // XXX: is "\"" also valid html? or just Wikitext?
1028 - / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
1029 - / '"' t:[^">]* '"' { return unquote('"', t.join('')); }
1030 -
1031 -
1032 -/* Lists */
1033 -lists = e:(dtdd / li) es:(sol (dtdd / li))*
1034 -{
1035 - return annotateList( [ { type: 'TAG', name: 'list'} ]
1036 - .concat(flatten([e].concat(es))
1037 - ,[{ type: 'ENDTAG', name: 'list' }]));
1038 -}
1039 -
1040 -li = bullets:list_char+
1041 - c:inlineline?
1042 - &eolf
1043 -{
1044 - if ( c == '' )
1045 - c = [];
1046 - return [ { type: 'TAG',
1047 - name: 'listItem',
1048 - bullets: bullets }
1049 - , c ];
1050 -}
1051 -
1052 -dtdd
1053 - = bullets:(!(";" !list_char) list_char)*
1054 - ";"
1055 - & {return setFlag('colon');}
1056 - c:inlineline
1057 - ":"
1058 - // Fortunately dtdds cannot be nested, so we can simply set the flag
1059 - // back to 0 to disable it.
1060 - & {syntaxFlags['colon'] = 0; return true;}
1061 - d:inlineline
1062 - &eolf {
1063 - // Convert trailing space into &nbsp;
1064 - // XXX: This should be moved to a serializer
1065 - //var clen = c.length;
1066 - //if (clen && c[clen - 1].type === 'TEXT') {
1067 - // var val = c[clen - 1].value;
1068 - // if(val.length && val[val.length - 1] == ' ') {
1069 - // c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
1070 - // }
1071 - //}
1072 -
1073 - return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
1074 - .concat( c
1075 - ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
1076 - , d );
1077 - }
1078 - // Fall-back case to clear the colon flag
1079 - / & { return true; } { syntaxFlags['colon'] = 0; return null; }
1080 -
1081 -
1082 -list_char = [*#:;]
1083 -
1084 -
1085 -/* Tables */
1086 -table
1087 - = tas:table_start space* c:table_caption? b:table_body? table_end {
1088 - var res = {type: 'TAG', name: 'table'}
1089 - var body = b !== '' ? b : [];
1090 - dp("body: " + pp(body));
1091 - if (tas.length > 0) {
1092 - // FIXME: actually parse and build structure
1093 - //res.attribs = [['data-unparsed', tas.join('')]];
1094 - res.attribs = tas;
1095 - }
1096 -
1097 - if (c != '') {
1098 - var caption = [{type: 'TAG', name: 'caption'}]
1099 - .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
1100 - } else {
1101 - var caption = [];
1102 - }
1103 - //dp(pp(res));
1104 -
1105 - return [res].concat(caption, body,
1106 - [{type: 'ENDTAG', name: 'table'}]);
1107 - }
1108 -
1109 -table_start
1110 - = "{|"
1111 - res:(
1112 - & { setFlag('table'); return true; }
1113 - ta:generic_attribute*
1114 - {
1115 - dp("table_start " + pp(ta) + ", pos:" + pos);
1116 - return ta;
1117 - }
1118 - / & { clearFlag('table'); return false; } { return null; }
1119 - ) { return res }
1120 -
1121 -table_attribs
1122 - = text / ! inline_breaks !newline ![|] c:. { return c }
1123 -
1124 -table_caption
1125 - = newline
1126 - "|+" c:inline* {
1127 - return c;
1128 - }
1129 -
1130 -table_body
1131 - = //& { dp("table_body enter"); return true; }
1132 - firstrow:table_firstrow otherrows:table_row* {
1133 - /* dp('table first and otherrows: '
1134 - * + pp([firstrow].concat(otherrows))); */
1135 - return [firstrow].concat(otherrows);
1136 - }
1137 - / otherrows:table_row* {
1138 - //dp('table otherrows: ' + pp(otherrows));
1139 - return otherrows;
1140 - }
1141 -
1142 -table_firstrow
1143 - = td:table_data+ {
1144 - //dp('firstrow: ' + pp(td));
1145 - return [{ type: 'TAG', name: 'tr' }]
1146 - .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
1147 - }
1148 -
1149 -table_row
1150 - = //& { dp("table row enter"); return true; }
1151 - newline
1152 - "|-" thtd_attribs? space* td:(table_data / table_header)* {
1153 - return [{type: 'TAG', name: 'tr'}]
1154 - .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
1155 - }
1156 -
1157 -table_data
1158 - = //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
1159 - ("||" / newline "|")
1160 - ! [}+-]
1161 - //& { dp('before attrib, pos=' + pos); return true; }
1162 - a:(as:generic_attribute+ space* "|" !"|" { return as } )?
1163 - //& { dp('past attrib, pos=' + pos); return true; }
1164 - // use inline_breaks to break on tr etc
1165 - td:(!inline_breaks
1166 - //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
1167 - b:block { return b })* {
1168 - if ( a == '' ) {
1169 - a = [];
1170 - }
1171 - //dp("table data result: " + pp(td) + ", attribts: " + pp(a));
1172 - return [{ type: 'TAG', name: 'td', attribs: a}]
1173 - .concat(td, [{type: 'ENDTAG', name: 'td'}]);
1174 - }
1175 -
1176 -table_header
1177 - = ("!!" / newline "!")
1178 - a:(as:generic_attribute+ "!" !"!" { return as } )?
1179 - c:inline {
1180 - if ( a == '' ) {
1181 - a = [];
1182 - }
1183 - return [{type: 'TAG', name: 'th', attribs: a}]
1184 - .concat(c, [{type: 'ENDTAG', name: 'th'}]);
1185 - }
1186 -
1187 -thtd_attribs
1188 - // In particular, do not match [|\n]
1189 - = a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
1190 - return a;
1191 - }
1192 -
1193 -
1194 -table_end
1195 - = newline? "|}" { clearFlag('table'); }
1196 - / newline? eof
1197 -
1198 -
1199 -/* Tabs do not mix well with the hybrid production syntax */
1200 -/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent: */
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@@ -1,6 +1,9 @@
 2+/* Front-end/Wrapper for a particular tree builder, in this case the
 3+ * parser/tree builder from the node 'html5' module. Feed it tokens using
 4+ * processToken, and it will build you a DOM tree retrievable using .document
 5+ * or .body(). */
 6+
27 var events = require('events');
3 -
4 -
58 var HTML5 = require('./html5/index');
69
710 FauxHTML5 = {};
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
@@ -0,0 +1,133 @@
 2+/**
 3+ * Parser for wikitext to provisional temp structure, using PEG.js and
 4+ * a separate PEG grammar file (pegParser.pegjs.txt)
 5+ *
 6+ * Use along with the MWTreeRenderer and MWTreeSerializer classes for
 7+ * HTML output and source round-tripping.
 8+ *
 9+ * If installed as a user script or to customize, set parserPlaygroundPegPage
 10+ * to point at the MW page name containing the parser peg definition; default
 11+ * is 'MediaWiki:Gadget-ParserPlayground-PegParser.pegjs'.
 12+ */
 13+function PegTokenizer(env) {
 14+ this.env = env || {};
 15+}
 16+
 17+PegTokenizer.src = false;
 18+
 19+PegTokenizer.prototype.tokenize = function(text, callback) {
 20+ this.initSource(function() {
 21+ var out, err;
 22+ try {
 23+ var parser = PEG.buildParser(PegTokenizer.src);
 24+ out = parser.parse(text);
 25+ } catch (e) {
 26+ err = e;
 27+ console.trace();
 28+ } finally {
 29+ callback(out, err);
 30+ }
 31+ });
 32+}
 33+
 34+/**
 35+ * @param {object} tree
 36+ * @param {function(tree, error)} callback
 37+ */
 38+PegTokenizer.prototype.expandTree = function(tree, callback) {
 39+ var self = this;
 40+ var subParseArray = function(listOfTrees) {
 41+ var content = [];
 42+ $.each(listOfTrees, function(i, subtree) {
 43+ self.expandTree(subtree, function(substr, err) {
 44+ content.push(tree);
 45+ });
 46+ });
 47+ return content;
 48+ };
 49+ var src;
 50+ if (typeof tree === "string") {
 51+ callback(tree);
 52+ return;
 53+ }
 54+ if (tree.type == 'template') {
 55+ // expand a template node!
 56+
 57+ // Resolve a possibly relative link
 58+ var templateName = this.env.resolveTitle( tree.target, 'Template' );
 59+ this.env.fetchTemplate( tree.target, tree.params || {}, function( templateSrc, error ) {
 60+ // @fixme should pre-parse/cache these too?
 61+ self.parseToTree( templateSrc, function( templateTree, error ) {
 62+ if ( error ) {
 63+ callback({
 64+ type: 'placeholder',
 65+ orig: tree,
 66+ content: [
 67+ {
 68+ // @fixme broken link?
 69+ type: 'link',
 70+ target: templateName
 71+ }
 72+ ]
 73+ });
 74+ } else {
 75+ callback({
 76+ type: 'placeholder',
 77+ orig: tree,
 78+ content: self.env.expandTemplateArgs( templateTree, tree.params )
 79+ });
 80+ }
 81+ })
 82+ } );
 83+ // Wait for async...
 84+ return;
 85+ }
 86+ var out = $.extend( tree ); // @fixme prefer a deep copy?
 87+ if (tree.content) {
 88+ out.content = subParseArray(tree.content);
 89+ }
 90+ callback(out);
 91+};
 92+
 93+PegTokenizer.prototype.initSource = function(callback) {
 94+ if (PegTokenizer.src) {
 95+ callback();
 96+ } else {
 97+ if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
 98+ $.ajax({
 99+ url: wgScriptPath + '/api' + wgScriptExtension,
 100+ data: {
 101+ format: 'json',
 102+ action: 'query',
 103+ prop: 'revisions',
 104+ rvprop: 'content',
 105+ titles: parserPlaygroundPegPage
 106+ },
 107+ success: function(data, xhr) {
 108+ $.each(data.query.pages, function(i, page) {
 109+ if (page.revisions && page.revisions.length) {
 110+ PegTokenizer.src = page.revisions[0]['*'];
 111+ }
 112+ });
 113+ callback()
 114+ },
 115+ dataType: 'json',
 116+ cache: false
 117+ }, 'json');
 118+ } else {
 119+ $.ajax({
 120+ url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',
 121+ success: function(data) {
 122+ PegTokenizer.src = data;
 123+ callback();
 124+ },
 125+ dataType: 'text',
 126+ cache: false
 127+ });
 128+ }
 129+ }
 130+};
 131+
 132+if (typeof module == "object") {
 133+ module.exports.PegTokenizer = PegTokenizer;
 134+}
Property changes on: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
___________________________________________________________________
Added: svn:eol-style
1135 + native
Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
@@ -0,0 +1,1199 @@
 2+/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
 3+{
 4+ /* Fixme: use static functions to separate module! Unfortunately, this
 5+ * does not work:
 6+ * var tu = require('./mediawiki.tokenizer.utils.js');
 7+ * console.log(tu.flatten([]));
 8+ * Using exports in the module gets a bit further, but accesses to
 9+ * tu.flatten in productions still fail. Thus, I just moved the functions
 10+ * here until a solution is found:
 11+ */
 12+
 13+ /* Static utilities */
 14+
 15+ // Flatten a list of lists.
 16+ var flatten = function ( e ) {
 17+ var es = [];
 18+ // flatten sub-arrays
 19+ for(var i = 0, length = e.length; i < length; i++) {
 20+ var ei = e[i];
 21+ if ($.isArray(ei))
 22+ es = es.concat(flatten(ei));
 23+ else
 24+ es.push(ei);
 25+ };
 26+ return es;
 27+ };
 28+
 29+ // Remove escaped quotes from attributes etc
 30+ var unquote = function (quotec, text) {
 31+ return text.replace('\\' + quotec, quotec);
 32+ };
 33+
 34+ // Decode html entities. In a browser, this should only be fed the entity,
 35+ // not untrusted html! XXX: replace with safer version.
 36+ var unentity = function ( entity ) {
 37+ return $("<div/>").html(entity).text();
 38+ };
 39+
 40+ // Debug print with global switch
 41+ var dp = function ( msg ) {
 42+ if ( false ) {
 43+ console.log(msg);
 44+ }
 45+ };
 46+
 47+ var pp = function ( s ) { return JSON.stringify(s, null, 2); }
 48+
 49+ /*
 50+ * Annotate a token stream with list items with appropriate list tokens
 51+ *
 52+ * @static
 53+ * @method
 54+ * @param {[tokens]} Token stream with li tokens
 55+ * @returns {[tokens]} Token stream, possibly with additional list tokens
 56+ * */
 57+ var annotateList = function ( tokens ) {
 58+ var out = [], // List of tokens
 59+ bstack = [], // Bullet stack, previous element's listStyle
 60+ bnext = [], // Next element's listStyle
 61+ endtags = []; // Stack of end tags
 62+
 63+ var commonPrefixLength = function (x, y) {
 64+ var minLength = Math.min(x.length, y.length);
 65+ for(var i = 0; i < minLength; i++) {
 66+ if (x[i] != y[i])
 67+ break;
 68+ }
 69+ return i;
 70+ };
 71+
 72+ var pushList = function ( listName, itemName ) {
 73+ out.push({type: 'TAG', name: listName});
 74+ out.push({type: 'TAG', name: itemName});
 75+ endtags.push({type: 'ENDTAG', name: listName});
 76+ endtags.push({type: 'ENDTAG', name: itemName});
 77+ };
 78+
 79+ var popTags = function ( n ) {
 80+ for(;n > 0; n--) {
 81+ // push list item..
 82+ out.push(endtags.pop());
 83+ // and the list end tag
 84+ out.push(endtags.pop());
 85+ }
 86+ };
 87+
 88+ var isDlDd = function (a, b) {
 89+ var ab = [a,b].sort();
 90+ return (ab[0] === ':' && ab[1] === ';');
 91+ };
 92+
 93+ var doListItem = function ( bs, bn ) {
 94+ var prefixLen = commonPrefixLength (bs, bn);
 95+ var changeLen = Math.max(bs.length, bn.length) - prefixLen;
 96+ var prefix = bn.slice(0, prefixLen);
 97+ // emit close tag tokens for closed lists
 98+ if (changeLen === 0) {
 99+ var itemToken = endtags.pop();
 100+ out.push(itemToken);
 101+ out.push({type: 'TAG', name: itemToken.name});
 102+ endtags.push({type: 'ENDTAG', name: itemToken.name});
 103+ } else if ( bs.length == bn.length
 104+ && changeLen == 1
 105+ && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
 106+ // handle dd/dt transitions
 107+ out.push(endtags.pop());
 108+ if( bn[prefixLen] == ';') {
 109+ var newName = 'dt';
 110+ } else {
 111+ var newName = 'dd';
 112+ }
 113+ out.push({type: 'TAG', name: newName});
 114+ endtags.push({type: 'ENDTAG', name: newName});
 115+ } else {
 116+ popTags(bs.length - prefixLen);
 117+
 118+ if (prefixLen > 0 && bn.length == prefixLen ) {
 119+ var itemToken = endtags.pop();
 120+ out.push(itemToken);
 121+ out.push({type: 'TAG', name: itemToken.name});
 122+ endtags.push({type: 'ENDTAG', name: itemToken.name});
 123+ }
 124+
 125+ for(var i = prefixLen; i < bn.length; i++) {
 126+ switch (bn[i]) {
 127+ case '*':
 128+ pushList('ul', 'li');
 129+ break;
 130+ case '#':
 131+ pushList('ol', 'li');
 132+ break;
 133+ case ';':
 134+ pushList('dl', 'dt');
 135+ break;
 136+ case ':':
 137+ pushList('dl', 'dd');
 138+ break;
 139+ default:
 140+ throw("Unknown node prefix " + prefix[i]);
 141+ }
 142+ }
 143+ }
 144+ };
 145+
 146+ for (var i = 0, length = tokens.length; i < length; i++) {
 147+ var token = tokens[i];
 148+ switch ( token.type ) {
 149+ case 'TAG':
 150+ switch (token.name) {
 151+ case 'list':
 152+ // ignore token
 153+ break;
 154+ case 'listItem':
 155+ // convert listItem to list and list item tokens
 156+ bnext = token.bullets;
 157+ doListItem( bstack, bnext );
 158+ bstack = bnext;
 159+ break;
 160+ default:
 161+ // pass through all remaining start tags
 162+ out.push(token);
 163+ break;
 164+ }
 165+ break;
 166+ case 'ENDTAG':
 167+ if ( token.name == 'list' ) {
 168+ // pop all open list item tokens
 169+ popTags(bstack.length);
 170+ bstack = [];
 171+ } else {
 172+ out.push(token);
 173+ }
 174+ break;
 175+ default:
 176+ out.push(token);
 177+ break;
 178+ }
 179+ }
 180+ return out;
 181+ };
 182+
 183+ /*
 184+ * Italic/Bold handling.
 185+ *
 186+ * - list of tokens
 187+ * - NEWLINE
 188+ * - ticks (2+) -> list with link in line token list?
 189+ * - process on newline
 190+ * - need access to text nodes before/after for conversion back to text
 191+ */
 192+ var doQuotes = function ( tokens ) {
 193+
 194+ var italics = [],
 195+ bolds = [],
 196+ out = [],
 197+ inserted = 0;
 198+
 199+ var convertBold = function ( i ) {
 200+ var index = bolds[i];
 201+ var txt = out[index - 1];
 202+ txt.value += "'";
 203+ if ( i > 0 ) {
 204+ bolds = bolds.slice(0, i)
 205+ .concat(bolds.slice(i + 1, bolds.length - i - 1));
 206+ } else {
 207+ bolds.shift();
 208+ }
 209+
 210+ italics.push(index);
 211+ italics.sort(function(a,b) { return a - b });
 212+ };
 213+
 214+ // convert italics/bolds into tags
 215+ var quotesToTags = function ( offsets, name ) {
 216+ var toggle = true;
 217+ for (var j = 0; j < offsets.length; j++) {
 218+ var t = out[offsets[j]];
 219+ if(toggle) {
 220+ t.type = 'TAG';
 221+ } else {
 222+ t.type = 'ENDTAG';
 223+ }
 224+ t.name = name;
 225+ delete t.value;
 226+ toggle = !toggle;
 227+ }
 228+ if (!toggle) {
 229+ // add end tag
 230+ out.push({type: 'ENDTAG', name: name});
 231+ inserted++;
 232+ }
 233+ toggle = true;
 234+ };
 235+
 236+ for (var i = 0, length = tokens.length; i < length; i++) {
 237+ var token = tokens[i];
 238+ switch (token.type) {
 239+ case 'QUOTE':
 240+ // depending on length, add starting 's to preceding text node
 241+ // (if any)
 242+ // add token index to italic/bold lists
 243+ // add placeholder for token
 244+ var qlen = token.value.length;
 245+ switch (qlen) {
 246+ case 2: italics.push(i + inserted); out.push(token); break;
 247+ case 3: bolds.push(i + inserted); out.push(token); break;
 248+ case 4:
 249+ token.value = "'''";
 250+ if (i > 0 && tokens[i-1].type === 'TEXT') {
 251+ tokens[i-1].value += "'";
 252+ } else {
 253+ out.push({type: 'TEXT', value: "'"});
 254+ inserted++;
 255+ }
 256+ bolds.push(i + inserted);
 257+ out.push(token);
 258+ break;
 259+ case 5:
 260+ // order does not matter here, will be fixed
 261+ // by HTML parser backend
 262+ italics.push(i + inserted);
 263+ out.push({type: 'QUOTE', value: "''"});
 264+ inserted++;
 265+ bolds.push(i + inserted);
 266+ out.push({type: 'QUOTE', value: "'''"});
 267+ break;
 268+ default: // longer than 5, only use the last 5 ticks
 269+ token.value = "'''''";
 270+ var newvalue = token.value.substr(0, qlen - 5 );
 271+ if (i > 0 && tokens[i-1].type === 'TEXT') {
 272+ tokens[i-1].value += newvalue;
 273+ } else {
 274+ out.push({type: 'TEXT', value: newvalue});
 275+ inserted++;
 276+ }
 277+ italics.push(i + inserted);
 278+ out.push({type: 'QUOTE', value: "''"});
 279+ inserted++;
 280+ bolds.push(i + inserted);
 281+ out.push({type: 'QUOTE', value: "'''"});
 282+ break;
 283+ }
 284+ break;
 285+
 286+ case 'NEWLINE':
 287+ // balance out tokens, convert placeholders into tags
 288+ if (italics.length % 2 && bolds.length % 2) {
 289+ dp("balancing!");
 290+ var firstsingleletterword = -1,
 291+ firstmultiletterword = -1,
 292+ firstspace = -1;
 293+ for (var j = 0; j < bolds.length; j++) {
 294+ var ticki = bolds[j];
 295+ if (ticki > 0 && out[ticki - 1].type === 'TEXT') {
 296+ var txt = out[ticki - 1],
 297+ lastchar = txt.value[txt.value.length - 1],
 298+ secondtolastchar = txt.value[txt.value.length - 2];
 299+ dp('txt: ' + pp(txt));
 300+ if (lastchar === ' ' && firstspace === -1) {
 301+ firstspace = j;
 302+ } else if (lastchar !== ' ') {
 303+ if ( secondtolastchar === ' ' &&
 304+ firstsingleletterword === -1)
 305+ {
 306+ firstsingleletterword = j;
 307+ } else if ( firstmultiletterword == -1) {
 308+ firstmultiletterword = j;
 309+ }
 310+ }
 311+ }
 312+ }
 313+
 314+
 315+ // now see if we can convert a bold to an italic and
 316+ // an apostrophe
 317+ if (firstsingleletterword > -1) {
 318+ convertBold(firstsingleletterword);
 319+ } else if (firstmultiletterword > -1) {
 320+ convertBold(firstmultiletterword);
 321+ } else if (firstspace > -1) {
 322+ convertBold(firstspace);
 323+ }
 324+ }
 325+
 326+ quotesToTags(bolds, 'b');
 327+ quotesToTags(italics, 'i');
 328+ bolds = [];
 329+ italics = [];
 330+ out.push(token);
 331+ break;
 332+ default:
 333+ out.push(token);
 334+ }
 335+ }
 336+ return out;
 337+ };
 338+
 339+
 340+ /* End static utilities */
 341+
 342+ /*
 343+ * Flags for specific parse environments (inside tables, links etc). Flags
 344+ * trigger syntactic stops in the inline_breaks production, which
 345+ * terminates inline and attribute matches. Flags merely reduce the number
 346+ * of productions needed: The grammar is still context-free as the
 347+ * productions can just be unrolled for all combinations of environments
 348+ * at the cost of a much larger grammar.
 349+ */
 350+ var syntaxFlags = {};
 351+ var setFlag = function(flag) {
 352+ if (syntaxFlags[flag] !== undefined) {
 353+ syntaxFlags[flag]++;
 354+ } else {
 355+ syntaxFlags[flag] = 1;
 356+ }
 357+ return true;
 358+ };
 359+ var clearFlag = function(flag) {
 360+ syntaxFlags[flag]--;
 361+ };
 362+
 363+ // Start position of top-level block
 364+ // Could also provide positions for lower-level blocks using a stack.
 365+ var blockStart = 0;
 366+
 367+ // Start position of generic tag production
 368+ var tagStartPos = 0;
 369+
 370+ // cache the input length
 371+ var inputLength = input.length;
 372+
 373+ // pseudo-production that matches at end of input
 374+ var isEOF = function (pos) {
 375+ return pos === inputLength;
 376+ };
 377+
 378+ // text start position
 379+ var textStart = 0;
 380+
 381+ // hack to support numbered external links ([http://example.com]).
 382+ // XXX: Move to token stream transform after templates are expanded!
 383+ var linkCount = 1;
 384+
 385+ // Define block-level tags in JS, so we can use toLowerCase to match tags
 386+ // case-independently. This would be quite ugly (and possibly slower) if
 387+ // done manually in the grammar.
 388+ var block_names = (function () {
 389+ var names = [ "p", "table", "td", "tr", "ul", "ol"
 390+ , "li", "dl", "dt", "dd", "div", "center"
 391+ , "blockquote" ];
 392+ var bnames = {};
 393+ for(var i = 0, l = names.length; i < l; i++) {
 394+ bnames[names[i]] = true;
 395+ }
 396+ return bnames;
 397+ })();
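    // For example, block_names['div'] === true, so the block_tag production
    // below accepts <div ...> as a block-level tag, while a non-block tag such
    // as <span> fails that production (see its "abort match" comment) and is
    // handled by generic_tag via the inline productions instead.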
 398+
 399+
 400+}
 401+
 402+start
 403+ = e:toplevelblock* newline* {
 404+ return flatten(e);
 405+ }
 406+
 407+
 408+/* All chars that cannot start syntactic structures in the middle of a line
 409+ * XXX: ] and other end delimiters should probably only be activated inside
 410+ * structures to avoid unnecessarily leaving the text production on plain
 411+ * content. */
 412+
 413+text_char = [^'<~[{\n\r:\]}|!=]
 414+
 415+text = t:text_char+ { return t.join(''); }
 416+
 417+/* Explanation of chars
 418+ * ' quotes (italic/bold)
 419+ * < start of xmlish_tag
 420+ * ~ signatures/dates
 421+ * [ start of links
 422+ * { start of parser functions, transclusion and template args
  423+ * \n all sorts of block-level markup at start of line
 424+ * \r ditto
 425+ * h http(s) urls
 426+ * n nntp(s) urls
 427+ * m mailto urls
 428+ *
 429+ * ! and | table cell delimiters, might be better to specialize those
 430+ * = headings - also specialize those!
 431+ *
 432+ * The following chars are also included for now, but only apply in some
 433+ * contexts and should probably be enabled only in those:
 434+ * : separate definition in ; term : definition
 435+ * ] end of link
 436+ * } end of parser func/transclusion/template arg
 437+ */
 438+
 439+urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
 440+ / htmlentity
 441+ / urllink
 442+ // Convert trailing space into &nbsp;
 443+ // XXX: This should be moved to a serializer
 444+ / ' ' & ':' { return "\u00a0"; }
 445+ / t:text_char )+
 446+
 447+/*
 448+ '//', // for protocol-relative URLs, but not in text!
 449+ 'ftp://',
 450+ 'git://',
 451+ 'gopher://',
 452+ 'http://',
 453+ 'https://',
 454+ 'irc://',
 455+ 'ircs://', // @bug 28503
 456+ 'mailto:',
 457+ 'mms://',
 458+ 'news:',
 459+ 'nntp://', // @bug 3808 RFC 1738
 460+ 'svn://',
 461+ 'telnet://', // Well if we're going to support the above.. -ævar
 462+ 'worldwind://',
 463+*/
 464+
 465+// Old version
 466+//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
 467+
 468+// Experimental tweaked version: avoid expensive single-char substrings
 469+// This did not bring the expected performance boost, however.
 470+//text = [A-Za-z0-9,._ -] {
 471+// textStart = pos;
 472+//
 473+// var res = input.substr(textStart - 1, inputLength)
 474+// .match(/[A-Za-z0-9,._ -]+/)[0];
 475+// pos = pos + (res.length - 1);
 476+// return res
 477+// }
 478+
 479+htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
 480+ return unentity("&" + c.join('') + ";")
 481+}
 482+
 483+space
 484+ = s:[ \t]+ { return s.join(''); }
 485+
 486+optionalSpaceToken
 487+ = s:space* {
 488+ if ( s.length ) {
 489+ return [{type: 'TEXT', value: s.join('')}];
 490+ } else {
 491+ return [];
 492+ }
 493+ }
 494+
 495+
 496+// Start of line
 497+sol = (newline / & { return pos === 0; } { return true; })
 498+ cn:(c:comment n:newline? { return [c, {type: 'TEXT', value: n}] })* {
 499+ return [{type: 'NEWLINE'}].concat(cn);
 500+ }
 501+
 502+eof = & { return isEOF(pos); } { return true; }
 503+
 504+
 505+newline
 506+ = '\n' / '\r\n'
 507+
 508+eolf = newline / eof
 509+
 510+toplevelblock
 511+ = & { blockStart = pos; return true; } b:block {
 512+ b = flatten(b);
 513+ var bs = b[0];
 514+ //dp('toplevelblock:' + pp(b));
 515+ if (bs.attribs === undefined) {
 516+ bs.attribs = [];
 517+ }
 518+ bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
 519+ // XXX: only run this for lines that actually need it!
 520+ b.push({type: 'NEWLINE'});
 521+ b = doQuotes(b);
 522+ return b;
 523+ }
 524+
 525+block
 526+ = block_lines
 527+ / pre
 528+ / comment &eolf
 529+ / nowiki
 530+ / pre
 531+ / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
 532+ / para
 533+ / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
 534+ / s:sol {
 535+ if (s) {
 536+ return [s, {type: 'NEWLINE'}];
 537+ } else {
 538+ return [{type: 'NEWLINE'}];
 539+ }
 540+ }
 541+
 542+block_lines
 543+ = s:sol
 544+ // eat an empty line before the block
 545+ s2:(ss:space* so:sol { return [{type: 'TEXT', value: ss.join('')}].concat(so) })?
 546+ bl:block_line {
 547+ var s2_ = (s2 !== '') ? s2 : [];
 548+ return s.concat(s2_, bl);
 549+ }
 550+
 551+// Block structures with start-of-line wiki syntax
 552+block_line
 553+ = h
 554+ / table
 555+ / lists
 556+ // tag-only lines should not trigger pre
 557+ / st:optionalSpaceToken
 558+ bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
 559+ &eolf {
 560+ return st.concat(bt);
 561+ }
 562+ / pre_indent
 563+ / pre
 564+
 565+
 566+
 567+
 568+// TODO: convert inline content to annotations!
 569+para
 570+ = s1:sol s2:sol c:inlineline {
 571+ return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
 572+ }
 573+
 574+br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
 575+
  576+// Syntax stops to limit inline expansion depending on the syntactic context
 577+inline_breaks
 578+ =
 579+ & { // Important hack: disable caching for this production, as the default
 580+ // cache key does not take into account flag states!
 581+ cacheKey = '';
 582+ return true;
 583+ }
 584+ & { return syntaxFlags['table']; }
 585+ a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
 586+ / & { return (syntaxFlags['colon'] &&
 587+ ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
 588+ ! syntaxFlags.linkdesc); } ":" { return true; }
 589+ / & { return syntaxFlags['extlink']; } "]" { return true; }
 590+ / & { return syntaxFlags['linkdesc']; } link_end { return true; }
 591+ / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
 592+ / & { return syntaxFlags['template']; } ('|' / '}}') { return true; }
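// For example, while the 'table' flag is set (see table_start below), a '||',
// '!!', '|}' or a newline followed by '|' or '!' terminates the inline match
// here, so that the table_data / table_row productions can consume the
// delimiter themselves.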
 593+
 594+inline
 595+ = c:(urltext / (! inline_breaks (inline_element / . )))+ {
 596+ var out = [];
 597+ var text = [];
 598+ c = flatten(c);
 599+ for (var i = 0, l = c.length; i < l; i++) {
 600+ var ci = c[i];
 601+ if (typeof ci == 'string') {
 602+ text.push(ci);
 603+ } else {
 604+ if (text.length) {
 605+ out.push({ type: "TEXT", value: text.join('') });
 606+ text = [];
 607+ }
 608+ out.push(ci);
 609+ }
 610+ }
 611+ if (text.length) {
 612+ out.push({ type: 'TEXT', value: text.join('') });
 613+ }
 614+ //dp('inline out:' + pp(out));
 615+ return out;
 616+}
 617+
 618+
 619+inlineline
 620+ = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
 621+ var out = [];
 622+ var text = [];
 623+ c = flatten(c);
 624+ for (var i = 0; i < c.length; i++) {
 625+ var ci = c[i]
 626+ if (typeof ci == 'string') {
 627+ text.push(ci);
 628+ } else {
 629+ if (text.length) {
 630+ out.push({type: 'TEXT', value: text.join('')});
 631+ text = [];
 632+ }
 633+ out.push(ci);
 634+ }
 635+ }
 636+ if (text.length) {
 637+ out.push({type: 'TEXT', value: text.join('')});
 638+ }
 639+ //dp('inlineline out:' + pp(out));
 640+ return out;
 641+}
 642+
 643+inline_element
 644+ = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
 645+ & '<' ( comment / xmlish_tag )
 646+ / & '{' ( template / tplarg )
 647+ / & '[' ( wikilink / extlink )
 648+ / & "'" quote
 649+
 650+/* Headings */
 651+
 652+h = & "=" // guard, to make sure '='+ will match.
 653+ // XXX: Also check to end to avoid inline parsing?
 654+ r:(
 655+ s:'='+ // moved in here to make s accessible to inner action
 656+ & { return setFlag('h'); }
 657+ c:inlineline
 658+ e:'='+
 659+ spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
 660+ &eolf
 661+ {
 662+ clearFlag('h');
 663+ var level = Math.min(s.length, e.length);
 664+ // convert surplus equals into text
 665+ if(s.length > level) {
 666+ var extras = s.substr(0, s.length - level);
 667+ if(c[0].type == 'TEXT') {
 668+ c[0].value = extras + c[0].value;
 669+ } else {
 670+ c.unshift({type: 'TEXT', value: extras});
 671+ }
 672+ }
 673+ if(e.length > level) {
 674+ var extras = e.substr(0, e.length - level),
 675+ lastElem = c[c.length - 1];
 676+ if(lastElem.type == 'TEXT') {
 677+ lastElem.value = lastElem.value + extras;
 678+ } else {
 679+ c.push({type: 'TEXT', value: extras});
 680+ }
 681+ }
 682+
 683+ return [{type: 'TAG', name: 'h' + level}]
 684+ .concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
 685+ }
 686+ / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
 687+ ) { return r }
 688+
 689+
 690+pre_indent
 691+ = l:pre_indent_line ls:(sol pre_indent_line)* {
 692+ return [{type: 'TAG', name: 'pre'}]
 693+ .concat( [l], ls
 694+ , [{type: 'ENDTAG', name: 'pre'}]);
 695+ }
 696+pre_indent_line = space l:inlineline {
 697+ return [{type: 'TEXT', value: '\n'}].concat(l);
 698+}
 699+
 700+
 701+comment
 702+ = '<!--' c:comment_chars* ('-->' / eof)
 703+ cs:(space* newline space* cn:comment { return cn })* {
 704+ return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
 705+ }
 706+
 707+comment_chars
 708+ = c:[^-] { return c; }
 709+ / c:'-' !'->' { return c; }
 710+
 711+
 712+urllink
 713+ = target:url {
 714+ return [ { type: 'TAG',
 715+ name: 'a',
 716+ attribs: [['href', target]] }
 717+ , {type: 'TEXT', value: target}
 718+ , {type: 'ENDTAG', name: 'a'}
 719+ ];
 720+ }
 721+
 722+extlink
 723+ = "["
 724+ & { return setFlag('extlink'); }
 725+ target:url
 726+ space*
 727+ text:inlineline?
 728+ "]" {
 729+ clearFlag('extlink');
 730+ if ( text == '' ) {
 731+ // XXX: Link numbering should be implemented in post-processor.
 732+ text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
 733+ linkCount++;
 734+ }
 735+ return [ { type: 'TAG',
 736+ name: 'a',
 737+ attribs: [['href', target]] } ]
 738+ .concat( text
 739+ , [{type: 'ENDTAG', name: 'a'}]);
 740+ }
 741+ / "[" & { clearFlag('extlink'); return false; }
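// For example, [http://example.com] has no link text, so the tokenizer emits
// an <a> tag whose text is the running counter "[1]" (see the XXX above),
// while [http://example.com some label] keeps "some label" as the link text.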
 742+
  743+/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
 744+ * be configured dynamically. */
 745+url_protocol
 746+ = '//' // for protocol-relative URLs
 747+ / 'ftp://'
 748+ / 'git://'
 749+ / 'gopher://'
 750+ / 'http://'
 751+ / 'https://'
 752+ / 'irc://'
 753+ / 'ircs://' // @bug 28503
 754+ / 'mailto:'
 755+ / 'mms://'
 756+ / 'news:'
 757+ / 'nntp://' // @bug 3808 RFC 1738
 758+ / 'svn://'
 759+ / 'telnet://' // Well if we're going to support the above.. -ævar
 760+ / 'worldwind://'
 761+
  762+// JavaScript regexes do not support Unicode property classes (e.g. \p{Zs}), so list the separator code points explicitly.
 763+unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
 764+
 765+
 766+urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
 767+ return decodeURI("%" + c0 + c1)
 768+}
 769+
 770+//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
 771+url
 772+ = proto:url_protocol
 773+ rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
 774+ / s:[.:,] !(space / eolf) { return s }
 775+ / htmlentity
 776+ / urlencoded_char
 777+ / [&%] )+
 778+{
 779+ return proto + rest.join('');
 780+}
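// For example, in "See http://example.com. Next" the url production stops
// before the final '.', since '.', ',' and ':' are only consumed when not
// followed by whitespace or the end of line/input.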
 781+
 782+template
 783+ = "{{" target:template_target
 784+ params:(newline? "|" newline? p:template_param { return p })*
 785+ "}}" {
 786+ var obj = { type: 'TAG', name: 'template',
 787+ attribs: [['target', target]],
 788+ args: {}}
 789+ if (params && params.length) {
 790+ var position = 1;
 791+ for ( var i = 0, l = params.length; i < l; i++ ) {
 792+ var param = params[i];
 793+ if ( param[0] === null ) {
 794+ obj.args[position] = param[1];
 795+ position++;
 796+ } else {
 797+ obj.args[param[0]] = param[1];
 798+ }
 799+ }
 800+ // HACK: temporarily also push the args into an attribute
 801+ // (just for debugging)
 802+ obj.attribs.push(['data-args', JSON.stringify(obj.args)]);
 803+ }
 804+ // Should actually use a self-closing tag here, but the Node HTML5
 805+ // parser only recognizes known self-closing tags for now, so use an
 806+ // explicit end tag for now.
 807+ //console.log(pp(obj));
 808+ return obj;
 809+ }
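// For example, {{foo|bar|name=val}} yields a 'template' tag with target
// "foo" and args {1: <tokens for "bar">, name: <tokens for "val">}:
// positional parameters are numbered from 1, and the args are additionally
// mirrored into a data-args attribute for debugging (see the HACK above).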
 810+
 811+template_target
 812+ = h:( !"}}" x:([^|\n]) { return x } )* { return h.join(''); }
 813+
 814+template_param
 815+ = name:template_param_name space* "=" space* c:template_param_text {
 816+ return [name, c];
 817+ } / c:template_param_text {
 818+ return [null, c];
 819+ }
 820+
 821+tplarg
 822+ = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
 823+ var obj = {
 824+ type: 'SELFCLOSINGTAG',
 825+ name: 'templatearg',
 826+ attribs: [['argname', name]]
 827+ };
 828+ if (params && params.length) {
 829+ // HACK, not final.
 830+ obj.attribs.push(['data-args', JSON.stringify(params)]);
 831+ }
 832+ return obj;
 833+ }
 834+
 835+template_param_name
 836+ = h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
 837+
 838+template_param_text
 839+ = & { return setFlag('template') }
 840+ il:inline+ {
 841+ clearFlag('template');
 842+ return il;
 843+ }
 844+ / & { clearFlag('template'); return false; }
 845+
 846+wikilink
 847+ = "[["
 848+ ! url
 849+ target:link_target text:("|" lt:link_text { return lt })* "]]" suffix:text? {
 850+ var obj = {
 851+ type: 'TAG',
 852+ name: 'a',
 853+ attribs: [['data-type', 'internal']]
 854+ };
 855+ obj.attribs.push(['href', target]);
 856+ if (text && text.length) {
 857+ var textTokens = text;
 858+ } else {
 859+ if (suffix !== '') {
 860+ target += suffix;
 861+ }
 862+ var textTokens = [{type: 'TEXT', value: target}];
 863+ }
 864+ return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
 865+ }
 866+
 867+link_target
 868+ = h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
 869+ / !"]]"
 870+ hi:(
 871+ [^|%\n]
 872+ / urlencoded_char
 873+ / '%'
 874+ ) { return hi }
 875+ )* { return h.join(''); }
 876+
 877+link_text
 878+ = h:( & { return setFlag('linkdesc'); }
 879+ x:inlineline { return x }
 880+ )* {
 881+ clearFlag('linkdesc')
 882+ return h;
 883+ }
 884+ / & { clearFlag('linkdesc') } { return null; }
 885+
 886+link_end = "]]"
 887+
 888+/* Generic quote production for italic and bold, further processed in a token
 889+ * stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
 890+ * for each line of text to balance quotes per line.
 891+ *
 892+ * We are not using a simple pair rule here as we need to support mis-nested
  893+ * bolds/italics and MediaWiki's special heuristics for apostrophes, none of
  894+ * which are context-free. */
 895+quote = "''" x:"'"* {
 896+ return {
 897+ type : 'QUOTE',
 898+ value: "''" + x.join('')
 899+ }
 900+}
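// For example, a line such as ''italic'' and '''bold''' yields QUOTE tokens
// with values "''" and "'''" around the plain TEXT tokens; the NEWLINE token
// emitted at the end of the line then lets doQuotes pair them up into i/b
// tags, or rebalance them when the per-line counts are odd.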
 901+
 902+/* XXX: Extension tags can require a change in the tokenizer mode, which
 903+ * returns any text between extension tags verbatim. For now, we simply
 904+ * continue to parse the contained text and return the tokens. The original
 905+ * input source can be recovered from the source positions added on tag
 906+ * tokens. This won't however work in all cases. For example, a comment start
 907+ * (<!--) between extension tags would cause the remaining text to be consumed
 908+ * as a comment. To avoid this, we might need to look ahead for the end tag
 909+ * and limit the content parsing to this section. */
 910+
 911+xmlish_tag = nowiki / generic_tag
 912+
 913+pre
 914+ = "<pre"
 915+ attribs:generic_attribute*
 916+ ">"
 917+ ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }
 918+ / nowiki
 919+ / !"</pre>" t2:. {return {type:'TEXT',value:t2}})+
 920+ ("</pre>" / eof) {
 921+ // return nowiki tags as well?
 922+ //console.log('inpre');
 923+ return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
 924+ .concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
 925+ }
 926+ / "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
 927+
 928+nowiki
 929+ = "<nowiki>" nc:nowiki_content "</nowiki>" {
 930+ // console.log(pp(nc));
 931+ return nc;
 932+ }
 933+ / "<nowiki>" {
 934+ //console.log('nowiki fallback');
 935+ return [{type: 'TEXT', value: '<nowiki>'}];
 936+ }
 937+ / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
 938+
 939+nowiki_content
 940+ = ts:( t:[^<]+ { return t.join('') }
 941+ / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
 942+ //console.log('nested pre in nowiki');
 943+ return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
 944+ }
 945+ / (!("</nowiki>" / "</pre>") c:. {return c})
 946+ )* {
 947+ // return nowiki tags as well?
 948+ return [{type: 'TEXT', value: ts.join('')}];
 949+ }
 950+
 951+// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
 952+// following paragraphs
 953+block_tag
 954+ = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
 955+ attribs:generic_attribute*
 956+ selfclose:"/"?
 957+ ">" {
 958+ if (block_names[name.toLowerCase()] !== true) {
 959+ // abort match if tag is not block-level
 960+ return null;
 961+ }
 962+ var res = {name: name, attribs: attribs};
 963+ if ( end != '' ) {
 964+ res.type = 'ENDTAG';
 965+ } else if ( selfclose != '' ) {
 966+ res.type = 'SELFCLOSINGTAG';
 967+ } else {
 968+ res.type = 'TAG';
 969+ }
 970+ return [res];
 971+ }
 972+
 973+/* Generic XML-like tags
 974+ *
 975+ * These also cover extensions (including Cite), which will hook into the
 976+ * token stream for further processing. The content of extension tags is
 977+ * parsed as regular inline, but the source positions of the tag are added
 978+ * to allow reconstructing the unparsed text from the input. */
 979+
 980+// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
 981+// following paragraphs
 982+generic_tag
 983+ = "<"
 984+ & { tagStartPos = pos; return true; } // remember the start position of this tag
 985+ end:"/"? name:[0-9a-zA-Z]+
 986+ attribs:generic_attribute*
 987+ space*
 988+ selfclose:"/"?
 989+ ">" {
 990+ var res = {name: name.join(''), attribs: attribs};
 991+ if ( end != '' ) {
 992+ res.type = 'ENDTAG';
 993+ } else if ( selfclose != '' ) {
 994+ res.type = 'SELFCLOSINGTAG';
 995+ } else {
 996+ res.type = 'TAG';
 997+ }
 998+ res.attribs.push(['data-sourceTagPos', (tagStartPos - 1) + ":" + pos]);
 999+ return res;
 1000+ }
 1001+
 1002+generic_attribute
 1003+ = s:space*
 1004+ name:generic_attribute_name
 1005+ value:(space*
 1006+ v:generic_attribute_value { return v })?
 1007+{
 1008+ if ( value !== '' ) {
 1009+ return [name, value];
 1010+ } else {
 1011+ return [name,''];
 1012+ }
 1013+}
 1014+
 1015+// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
 1016+// disallow newlines and |.
 1017+generic_attribute_name
 1018+ = n:[^ \t\0/"'>=\n|]+ {
 1019+ return n.join('');
 1020+ }
 1021+
 1022+generic_attribute_value
 1023+ = "=" space* v:att_value {return v}
 1024+
 1025+att_value
 1026+ = t:[^ \t'"<>='\n]+ { return t.join(''); }
 1027+ // XXX: is "\"" also valid html? or just Wikitext?
 1028+ / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
 1029+ / '"' t:[^">]* '"' { return unquote('"', t.join('')); }
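// For example, in <div class="foo" style='bar'> the generic_attribute
// production yields the pairs [['class', 'foo'], ['style', 'bar']], while a
// bare attribute without '=' comes back as [name, ''].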
 1030+
 1031+
 1032+/* Lists */
 1033+lists = e:(dtdd / li) es:(sol (dtdd / li))*
 1034+{
 1035+ return annotateList( [ { type: 'TAG', name: 'list'} ]
 1036+ .concat(flatten([e].concat(es))
 1037+ ,[{ type: 'ENDTAG', name: 'list' }]));
 1038+}
 1039+
 1040+li = bullets:list_char+
 1041+ c:inlineline?
 1042+ &eolf
 1043+{
 1044+ if ( c == '' )
 1045+ c = [];
 1046+ return [ { type: 'TAG',
 1047+ name: 'listItem',
 1048+ bullets: bullets }
 1049+ , c ];
 1050+}
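// For example, "** second level" produces a listItem tag with
// bullets ['*', '*'] followed by its inline content; these bullet runs are
// what annotateList (defined above) uses to derive the list nesting.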
 1051+
 1052+dtdd
 1053+ = bullets:(!(";" !list_char) list_char)*
 1054+ ";"
 1055+ & {return setFlag('colon');}
 1056+ c:inlineline
 1057+ ":"
 1058+ // Fortunately dtdds cannot be nested, so we can simply set the flag
 1059+ // back to 0 to disable it.
 1060+ & {syntaxFlags['colon'] = 0; return true;}
 1061+ d:inlineline
 1062+ &eolf {
 1063+ // Convert trailing space into &nbsp;
 1064+ // XXX: This should be moved to a serializer
 1065+ //var clen = c.length;
 1066+ //if (clen && c[clen - 1].type === 'TEXT') {
 1067+ // var val = c[clen - 1].value;
 1068+ // if(val.length && val[val.length - 1] == ' ') {
 1069+ // c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
 1070+ // }
 1071+ //}
 1072+
 1073+ return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
 1074+ .concat( c
 1075+ ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
 1076+ , d );
 1077+ }
 1078+ // Fall-back case to clear the colon flag
 1079+ / & { return true; } { syntaxFlags['colon'] = 0; return null; }
 1080+
 1081+
 1082+list_char = [*#:;]
 1083+
 1084+
 1085+/* Tables */
 1086+table
 1087+ = tas:table_start space* c:table_caption? b:table_body? table_end {
 1088+ var res = {type: 'TAG', name: 'table'}
 1089+ var body = b !== '' ? b : [];
 1090+ dp("body: " + pp(body));
 1091+ if (tas.length > 0) {
 1092+ // FIXME: actually parse and build structure
 1093+ //res.attribs = [['data-unparsed', tas.join('')]];
 1094+ res.attribs = tas;
 1095+ }
 1096+
 1097+ if (c != '') {
 1098+ var caption = [{type: 'TAG', name: 'caption'}]
 1099+ .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
 1100+ } else {
 1101+ var caption = [];
 1102+ }
 1103+ //dp(pp(res));
 1104+
 1105+ return [res].concat(caption, body,
 1106+ [{type: 'ENDTAG', name: 'table'}]);
 1107+ }
 1108+
 1109+table_start
 1110+ = "{|"
 1111+ res:(
 1112+ & { setFlag('table'); return true; }
 1113+ ta:generic_attribute*
 1114+ {
 1115+ dp("table_start " + pp(ta) + ", pos:" + pos);
 1116+ return ta;
 1117+ }
 1118+ / & { clearFlag('table'); return false; } { return null; }
 1119+ ) { return res }
 1120+
 1121+table_attribs
 1122+ = text / ! inline_breaks !newline ![|] c:. { return c }
 1123+
 1124+table_caption
 1125+ = newline
 1126+ "|+" c:inline* {
 1127+ return c;
 1128+ }
 1129+
 1130+table_body
 1131+ = //& { dp("table_body enter"); return true; }
 1132+ firstrow:table_firstrow otherrows:table_row* {
 1133+ /* dp('table first and otherrows: '
 1134+ * + pp([firstrow].concat(otherrows))); */
 1135+ return [firstrow].concat(otherrows);
 1136+ }
 1137+ / otherrows:table_row* {
 1138+ //dp('table otherrows: ' + pp(otherrows));
 1139+ return otherrows;
 1140+ }
 1141+
 1142+table_firstrow
 1143+ = td:table_data+ {
 1144+ //dp('firstrow: ' + pp(td));
 1145+ return [{ type: 'TAG', name: 'tr' }]
 1146+ .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
 1147+ }
 1148+
 1149+table_row
 1150+ = //& { dp("table row enter"); return true; }
 1151+ newline
 1152+ "|-" thtd_attribs? space* td:(table_data / table_header)* {
 1153+ return [{type: 'TAG', name: 'tr'}]
 1154+ .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
 1155+ }
 1156+
 1157+table_data
 1158+ = //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
 1159+ ("||" / newline "|")
 1160+ ! [}+-]
 1161+ //& { dp('before attrib, pos=' + pos); return true; }
 1162+ a:(as:generic_attribute+ space* "|" !"|" { return as } )?
 1163+ //& { dp('past attrib, pos=' + pos); return true; }
 1164+ // use inline_breaks to break on tr etc
 1165+ td:(!inline_breaks
 1166+ //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
 1167+ b:block { return b })* {
 1168+ if ( a == '' ) {
 1169+ a = [];
 1170+ }
 1171+ //dp("table data result: " + pp(td) + ", attribts: " + pp(a));
 1172+ return [{ type: 'TAG', name: 'td', attribs: a}]
 1173+ .concat(td, [{type: 'ENDTAG', name: 'td'}]);
 1174+ }
 1175+
 1176+table_header
 1177+ = ("!!" / newline "!")
 1178+ a:(as:generic_attribute+ "!" !"!" { return as } )?
 1179+ c:inline {
 1180+ if ( a == '' ) {
 1181+ a = [];
 1182+ }
 1183+ return [{type: 'TAG', name: 'th', attribs: a}]
 1184+ .concat(c, [{type: 'ENDTAG', name: 'th'}]);
 1185+ }
 1186+
 1187+thtd_attribs
 1188+ // In particular, do not match [|\n]
 1189+ = a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
 1190+ return a;
 1191+ }
 1192+
 1193+
 1194+table_end
 1195+ = newline? "|}" { clearFlag('table'); }
 1196+ / newline? eof
 1197+
 1198+
 1199+/* Tabs do not mix well with the hybrid production syntax */
 1200+/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent: */
Property changes on: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
___________________________________________________________________
Added: svn:eol-style
11201 + native

Status & tagging log