r105536 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r105535‎ \| r105536 \| r105537 >
Date:	10:59, 8 December 2011
Author:	gwicke
Status:	deferred
Tags:
Comment:	Further renaming, this time from pegParser to pegTokenizer.
Modified paths:	/trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js (modified) (history) /trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.peg.js (deleted) (history) /trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js (added) (history) /trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt (deleted) (history) /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (added) (history) /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
—	—	@@ -142,7 +142,7 @@
143	143
144	144	var testWhiteList = require('./parserTests-whitelist.js').testWhiteList;
145	145
146		~~-_import(pj('parser', 'mediawiki.parser.peg.js'), ['PegParser']);~~
	146	+_import(pj('parser', 'mediawiki.tokenizer.peg.js'), ['PegTokenizer']);
147	147	_import(pj('parser', 'mediawiki.parser.environment.js'), ['MWParserEnvironment']);
148	148	_import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
149	149
—	—	@@ -158,9 +158,9 @@
159	159	_require(pj('es', 'serializers', 'es.JsonSerializer.js'));
160	160
161	161	// Preload the grammar file...
162		~~-PegParser.src = fs.readFileSync(path.join(basePath, 'parser', 'pegParser.pegjs.txt'), 'utf8');~~
	162	+PegTokenizer.src = fs.readFileSync(path.join(basePath, 'parser', 'pegTokenizer.pegjs.txt'), 'utf8');
163	163
164		~~-var parser = new PegParser();~~
	164	+var wikiTokenizer = new PegTokenizer();
165	165
166	166	var testFileName = '../../../../phase3/tests/parser/parserTests.txt'; // default
167	167	var testFileName2 = '../../../../tests/parser/parserTests.txt'; // Fallback. Not everyone fetch at phase3 level
—	—	@@ -378,7 +378,7 @@
379	379	console.log(item.input + "\n");
380	380	}
381	381
382		~~- parser.parseToTree(item.input + "\n", function(tokens, err) {~~
	382	+ wikiTokenizer.tokenize(item.input + "\n", function(tokens, err) {
383	383	if (err) {
384	384	printTitle();
385	385	failParseTests++;
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.peg.js
—	—	@@ -1,133 +0,0 @@
2		-/**
3		~~- * Parser for wikitext to provisional temp structure, using PEG.js and~~
4		~~- * a separate PEG grammar file (pegParser.pegjs.txt)~~
5		- *
6		~~- * Use along with the MWTreeRenderer and MWTreeSerializer classes for~~
7		~~- * HTML output and source round-tripping.~~
8		- *
9		~~- * If installed as a user script or to customize, set parserPlaygroundPegPage~~
10		~~- * to point at the MW page name containing the parser peg definition; default~~
11		~~- * is 'MediaWiki:Gadget-ParserPlayground-PegParser.pegjs'.~~
12		~~- */~~
13		~~-function PegParser(env) {~~
14		~~- this.env = env \|\| {};~~
15		-}
16		-
17		~~-PegParser.src = false;~~
18		-
19		~~-PegParser.prototype.parseToTree = function(text, callback) {~~
20		~~- this.initSource(function() {~~
21		~~- var out, err;~~
22		~~- try {~~
23		~~- var parser = PEG.buildParser(PegParser.src);~~
24		~~- out = parser.parse(text);~~
25		~~- } catch (e) {~~
26		~~- err = e;~~
27		~~- console.trace();~~
28		~~- } finally {~~
29		~~- callback(out, err);~~
30		~~- }~~
31		~~- });~~
32		-}
33		-
34		-/**
35		~~- * @param {object} tree~~
36		~~- * @param {function(tree, error)} callback~~
37		~~- */~~
38		~~-PegParser.prototype.expandTree = function(tree, callback) {~~
39		~~- var self = this;~~
40		~~- var subParseArray = function(listOfTrees) {~~
41		~~- var content = [];~~
42		~~- $.each(listOfTrees, function(i, subtree) {~~
43		~~- self.expandTree(subtree, function(substr, err) {~~
44		~~- content.push(tree);~~
45		~~- });~~
46		~~- });~~
47		~~- return content;~~
48		~~- };~~
49		~~- var src;~~
50		~~- if (typeof tree === "string") {~~
51		~~- callback(tree);~~
52		~~- return;~~
53		~~- }~~
54		~~- if (tree.type == 'template') {~~
55		~~- // expand a template node!~~
56		-
57		~~- // Resolve a possibly relative link~~
58		~~- var templateName = this.env.resolveTitle( tree.target, 'Template' );~~
59		~~- this.env.fetchTemplate( tree.target, tree.params \|\| {}, function( templateSrc, error ) {~~
60		~~- // @fixme should pre-parse/cache these too?~~
61		~~- self.parseToTree( templateSrc, function( templateTree, error ) {~~
62		~~- if ( error ) {~~
63		~~- callback({~~
64		~~- type: 'placeholder',~~
65		~~- orig: tree,~~
66		~~- content: [~~
67		~~- {~~
68		~~- // @fixme broken link?~~
69		~~- type: 'link',~~
70		~~- target: templateName~~
71		~~- }~~
72		~~- ]~~
73		~~- });~~
74		~~- } else {~~
75		~~- callback({~~
76		~~- type: 'placeholder',~~
77		~~- orig: tree,~~
78		~~- content: self.env.expandTemplateArgs( templateTree, tree.params )~~
79		~~- });~~
80		~~- }~~
81		~~- })~~
82		~~- } );~~
83		~~- // Wait for async...~~
84		~~- return;~~
85		~~- }~~
86		~~- var out = $.extend( tree ); // @fixme prefer a deep copy?~~
87		~~- if (tree.content) {~~
88		~~- out.content = subParseArray(tree.content);~~
89		~~- }~~
90		~~- callback(out);~~
91		~~-};~~
92		-
93		~~-PegParser.prototype.initSource = function(callback) {~~
94		~~- if (PegParser.src) {~~
95		~~- callback();~~
96		~~- } else {~~
97		~~- if ( typeof parserPlaygroundPegPage !== 'undefined' ) {~~
98		~~- $.ajax({~~
99		~~- url: wgScriptPath + '/api' + wgScriptExtension,~~
100		~~- data: {~~
101		~~- format: 'json',~~
102		~~- action: 'query',~~
103		~~- prop: 'revisions',~~
104		~~- rvprop: 'content',~~
105		~~- titles: parserPlaygroundPegPage~~
106		~~- },~~
107		~~- success: function(data, xhr) {~~
108		~~- $.each(data.query.pages, function(i, page) {~~
109		~~- if (page.revisions && page.revisions.length) {~~
110		~~- PegParser.src = page.revisions[0]['*'];~~
111		~~- }~~
112		~~- });~~
113		~~- callback()~~
114		~~- },~~
115		~~- dataType: 'json',~~
116		~~- cache: false~~
117		~~- }, 'json');~~
118		~~- } else {~~
119		~~- $.ajax({~~
120		~~- url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',~~
121		~~- success: function(data) {~~
122		~~- PegParser.src = data;~~
123		~~- callback();~~
124		~~- },~~
125		~~- dataType: 'text',~~
126		~~- cache: false~~
127		~~- });~~
128		~~- }~~
129		~~- }~~
130		~~-};~~
131		-
132		~~-if (typeof module == "object") {~~
133		~~- module.exports.PegParser = PegParser;~~
134		-}
Index: trunk/extensions/VisualEditor/modules/parser/pegParser.pegjs.txt
—	—	@@ -1,1199 +0,0 @@
2		~~-/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */~~
3		-{
4		~~- /* Fixme: use static functions to separate module! Unfortunately, this~~
5		~~- * does not work:~~
6		~~- * var tu = require('./mediawiki.tokenizer.utils.js');~~
7		~~- * console.log(tu.flatten([]));~~
8		~~- * Using exports in the module gets a bit further, but accesses to~~
9		~~- * tu.flatten in productions still fail. Thus, I just moved the functions~~
10		~~- * here until a solution is found:~~
11		~~- */~~
12		-
13		~~- /* Static utilities */~~
14		-
15		~~- // Flatten a list of lists.~~
16		~~- var flatten = function ( e ) {~~
17		~~- var es = [];~~
18		~~- // flatten sub-arrays~~
19		~~- for(var i = 0, length = e.length; i < length; i++) {~~
20		~~- var ei = e[i];~~
21		~~- if ($.isArray(ei))~~
22		~~- es = es.concat(flatten(ei));~~
23		~~- else~~
24		~~- es.push(ei);~~
25		~~- };~~
26		~~- return es;~~
27		~~- };~~
28		-
29		~~- // Remove escaped quotes from attributes etc~~
30		~~- var unquote = function (quotec, text) {~~
31		~~- return text.replace('\\' + quotec, quotec);~~
32		~~- };~~
33		-
34		~~- // Decode html entities. In a browser, this should only be fed the entity,~~
35		~~- // not untrusted html! XXX: replace with safer version.~~
36		~~- var unentity = function ( entity ) {~~
37		~~- return $("<div/>").html(entity).text();~~
38		~~- };~~
39		-
40		~~- // Debug print with global switch~~
41		~~- var dp = function ( msg ) {~~
42		~~- if ( false ) {~~
43		~~- console.log(msg);~~
44		~~- }~~
45		~~- };~~
46		-
47		~~- var pp = function ( s ) { return JSON.stringify(s, null, 2); }~~
48		-
49		- /*
50		~~- * Annotate a token stream with list items with appropriate list tokens~~
51		- *
52		~~- * @static~~
53		~~- * @method~~
54		~~- * @param {[tokens]} Token stream with li tokens~~
55		~~- * @returns {[tokens]} Token stream, possibly with additional list tokens~~
56		~~- * */~~
57		~~- var annotateList = function ( tokens ) {~~
58		~~- var out = [], // List of tokens~~
59		~~- bstack = [], // Bullet stack, previous element's listStyle~~
60		~~- bnext = [], // Next element's listStyle~~
61		~~- endtags = []; // Stack of end tags~~
62		-
63		~~- var commonPrefixLength = function (x, y) {~~
64		~~- var minLength = Math.min(x.length, y.length);~~
65		~~- for(var i = 0; i < minLength; i++) {~~
66		~~- if (x[i] != y[i])~~
67		~~- break;~~
68		~~- }~~
69		~~- return i;~~
70		~~- };~~
71		-
72		~~- var pushList = function ( listName, itemName ) {~~
73		~~- out.push({type: 'TAG', name: listName});~~
74		~~- out.push({type: 'TAG', name: itemName});~~
75		~~- endtags.push({type: 'ENDTAG', name: listName});~~
76		~~- endtags.push({type: 'ENDTAG', name: itemName});~~
77		~~- };~~
78		-
79		~~- var popTags = function ( n ) {~~
80		~~- for(;n > 0; n--) {~~
81		~~- // push list item..~~
82		~~- out.push(endtags.pop());~~
83		~~- // and the list end tag~~
84		~~- out.push(endtags.pop());~~
85		~~- }~~
86		~~- };~~
87		-
88		~~- var isDlDd = function (a, b) {~~
89		~~- var ab = [a,b].sort();~~
90		~~- return (ab[0] === ':' && ab[1] === ';');~~
91		~~- };~~
92		-
93		~~- var doListItem = function ( bs, bn ) {~~
94		~~- var prefixLen = commonPrefixLength (bs, bn);~~
95		~~- var changeLen = Math.max(bs.length, bn.length) - prefixLen;~~
96		~~- var prefix = bn.slice(0, prefixLen);~~
97		~~- // emit close tag tokens for closed lists~~
98		~~- if (changeLen === 0) {~~
99		~~- var itemToken = endtags.pop();~~
100		~~- out.push(itemToken);~~
101		~~- out.push({type: 'TAG', name: itemToken.name});~~
102		~~- endtags.push({type: 'ENDTAG', name: itemToken.name});~~
103		~~- } else if ( bs.length == bn.length~~
104		~~- && changeLen == 1~~
105		~~- && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {~~
106		~~- // handle dd/dt transitions~~
107		~~- out.push(endtags.pop());~~
108		~~- if( bn[prefixLen] == ';') {~~
109		~~- var newName = 'dt';~~
110		~~- } else {~~
111		~~- var newName = 'dd';~~
112		~~- }~~
113		~~- out.push({type: 'TAG', name: newName});~~
114		~~- endtags.push({type: 'ENDTAG', name: newName});~~
115		~~- } else {~~
116		~~- popTags(bs.length - prefixLen);~~
117		-
118		~~- if (prefixLen > 0 && bn.length == prefixLen ) {~~
119		~~- var itemToken = endtags.pop();~~
120		~~- out.push(itemToken);~~
121		~~- out.push({type: 'TAG', name: itemToken.name});~~
122		~~- endtags.push({type: 'ENDTAG', name: itemToken.name});~~
123		~~- }~~
124		-
125		~~- for(var i = prefixLen; i < bn.length; i++) {~~
126		~~- switch (bn[i]) {~~
127		~~- case '*':~~
128		~~- pushList('ul', 'li');~~
129		~~- break;~~
130		~~- case '#':~~
131		~~- pushList('ol', 'li');~~
132		~~- break;~~
133		~~- case ';':~~
134		~~- pushList('dl', 'dt');~~
135		~~- break;~~
136		~~- case ':':~~
137		~~- pushList('dl', 'dd');~~
138		~~- break;~~
139		~~- default:~~
140		~~- throw("Unknown node prefix " + prefix[i]);~~
141		~~- }~~
142		~~- }~~
143		~~- }~~
144		~~- };~~
145		-
146		~~- for (var i = 0, length = tokens.length; i < length; i++) {~~
147		~~- var token = tokens[i];~~
148		~~- switch ( token.type ) {~~
149		~~- case 'TAG':~~
150		~~- switch (token.name) {~~
151		~~- case 'list':~~
152		~~- // ignore token~~
153		~~- break;~~
154		~~- case 'listItem':~~
155		~~- // convert listItem to list and list item tokens~~
156		~~- bnext = token.bullets;~~
157		~~- doListItem( bstack, bnext );~~
158		~~- bstack = bnext;~~
159		~~- break;~~
160		~~- default:~~
161		~~- // pass through all remaining start tags~~
162		~~- out.push(token);~~
163		~~- break;~~
164		~~- }~~
165		~~- break;~~
166		~~- case 'ENDTAG':~~
167		~~- if ( token.name == 'list' ) {~~
168		~~- // pop all open list item tokens~~
169		~~- popTags(bstack.length);~~
170		~~- bstack = [];~~
171		~~- } else {~~
172		~~- out.push(token);~~
173		~~- }~~
174		~~- break;~~
175		~~- default:~~
176		~~- out.push(token);~~
177		~~- break;~~
178		~~- }~~
179		~~- }~~
180		~~- return out;~~
181		~~- };~~
182		-
183		- /*
184		~~- * Italic/Bold handling.~~
185		- *
186		~~- * - list of tokens~~
187		~~- * - NEWLINE~~
188		~~- * - ticks (2+) -> list with link in line token list?~~
189		~~- * - process on newline~~
190		~~- * - need access to text nodes before/after for conversion back to text~~
191		~~- */~~
192		~~- var doQuotes = function ( tokens ) {~~
193		-
194		~~- var italics = [],~~
195		~~- bolds = [],~~
196		~~- out = [],~~
197		~~- inserted = 0;~~
198		-
199		~~- var convertBold = function ( i ) {~~
200		~~- var index = bolds[i];~~
201		~~- var txt = out[index - 1];~~
202		~~- txt.value += "'";~~
203		~~- if ( i > 0 ) {~~
204		~~- bolds = bolds.slice(0, i)~~
205		~~- .concat(bolds.slice(i + 1, bolds.length - i - 1));~~
206		~~- } else {~~
207		~~- bolds.shift();~~
208		~~- }~~
209		-
210		~~- italics.push(index);~~
211		~~- italics.sort(function(a,b) { return a - b });~~
212		~~- };~~
213		-
214		~~- // convert italics/bolds into tags~~
215		~~- var quotesToTags = function ( offsets, name ) {~~
216		~~- var toggle = true;~~
217		~~- for (var j = 0; j < offsets.length; j++) {~~
218		~~- var t = out[offsets[j]];~~
219		~~- if(toggle) {~~
220		~~- t.type = 'TAG';~~
221		~~- } else {~~
222		~~- t.type = 'ENDTAG';~~
223		~~- }~~
224		~~- t.name = name;~~
225		~~- delete t.value;~~
226		~~- toggle = !toggle;~~
227		~~- }~~
228		~~- if (!toggle) {~~
229		~~- // add end tag~~
230		~~- out.push({type: 'ENDTAG', name: name});~~
231		~~- inserted++;~~
232		~~- }~~
233		~~- toggle = true;~~
234		~~- };~~
235		-
236		~~- for (var i = 0, length = tokens.length; i < length; i++) {~~
237		~~- var token = tokens[i];~~
238		~~- switch (token.type) {~~
239		~~- case 'QUOTE':~~
240		~~- // depending on length, add starting 's to preceding text node~~
241		~~- // (if any)~~
242		~~- // add token index to italic/bold lists~~
243		~~- // add placeholder for token~~
244		~~- var qlen = token.value.length;~~
245		~~- switch (qlen) {~~
246		~~- case 2: italics.push(i + inserted); out.push(token); break;~~
247		~~- case 3: bolds.push(i + inserted); out.push(token); break;~~
248		~~- case 4:~~
249		~~- token.value = "'''";~~
250		~~- if (i > 0 && tokens[i-1].type === 'TEXT') {~~
251		~~- tokens[i-1].value += "'";~~
252		~~- } else {~~
253		~~- out.push({type: 'TEXT', value: "'"});~~
254		~~- inserted++;~~
255		~~- }~~
256		~~- bolds.push(i + inserted);~~
257		~~- out.push(token);~~
258		~~- break;~~
259		~~- case 5:~~
260		~~- // order does not matter here, will be fixed~~
261		~~- // by HTML parser backend~~
262		~~- italics.push(i + inserted);~~
263		~~- out.push({type: 'QUOTE', value: "''"});~~
264		~~- inserted++;~~
265		~~- bolds.push(i + inserted);~~
266		~~- out.push({type: 'QUOTE', value: "'''"});~~
267		~~- break;~~
268		~~- default: // longer than 5, only use the last 5 ticks~~
269		~~- token.value = "'''''";~~
270		~~- var newvalue = token.value.substr(0, qlen - 5 );~~
271		~~- if (i > 0 && tokens[i-1].type === 'TEXT') {~~
272		~~- tokens[i-1].value += newvalue;~~
273		~~- } else {~~
274		~~- out.push({type: 'TEXT', value: newvalue});~~
275		~~- inserted++;~~
276		~~- }~~
277		~~- italics.push(i + inserted);~~
278		~~- out.push({type: 'QUOTE', value: "''"});~~
279		~~- inserted++;~~
280		~~- bolds.push(i + inserted);~~
281		~~- out.push({type: 'QUOTE', value: "'''"});~~
282		~~- break;~~
283		~~- }~~
284		~~- break;~~
285		-
286		~~- case 'NEWLINE':~~
287		~~- // balance out tokens, convert placeholders into tags~~
288		~~- if (italics.length % 2 && bolds.length % 2) {~~
289		~~- dp("balancing!");~~
290		~~- var firstsingleletterword = -1,~~
291		~~- firstmultiletterword = -1,~~
292		~~- firstspace = -1;~~
293		~~- for (var j = 0; j < bolds.length; j++) {~~
294		~~- var ticki = bolds[j];~~
295		~~- if (ticki > 0 && out[ticki - 1].type === 'TEXT') {~~
296		~~- var txt = out[ticki - 1],~~
297		~~- lastchar = txt.value[txt.value.length - 1],~~
298		~~- secondtolastchar = txt.value[txt.value.length - 2];~~
299		~~- dp('txt: ' + pp(txt));~~
300		~~- if (lastchar === ' ' && firstspace === -1) {~~
301		~~- firstspace = j;~~
302		~~- } else if (lastchar !== ' ') {~~
303		~~- if ( secondtolastchar === ' ' &&~~
304		~~- firstsingleletterword === -1)~~
305		~~- {~~
306		~~- firstsingleletterword = j;~~
307		~~- } else if ( firstmultiletterword == -1) {~~
308		~~- firstmultiletterword = j;~~
309		~~- }~~
310		~~- }~~
311		~~- }~~
312		~~- }~~
313		-
314		-
315		~~- // now see if we can convert a bold to an italic and~~
316		~~- // an apostrophe~~
317		~~- if (firstsingleletterword > -1) {~~
318		~~- convertBold(firstsingleletterword);~~
319		~~- } else if (firstmultiletterword > -1) {~~
320		~~- convertBold(firstmultiletterword);~~
321		~~- } else if (firstspace > -1) {~~
322		~~- convertBold(firstspace);~~
323		~~- }~~
324		~~- }~~
325		-
326		~~- quotesToTags(bolds, 'b');~~
327		~~- quotesToTags(italics, 'i');~~
328		~~- bolds = [];~~
329		~~- italics = [];~~
330		~~- out.push(token);~~
331		~~- break;~~
332		~~- default:~~
333		~~- out.push(token);~~
334		~~- }~~
335		~~- }~~
336		~~- return out;~~
337		~~- };~~
338		-
339		-
340		~~- /* End static utilities */~~
341		-
342		- /*
343		~~- * Flags for specific parse environments (inside tables, links etc). Flags~~
344		~~- * trigger syntactic stops in the inline_breaks production, which~~
345		~~- * terminates inline and attribute matches. Flags merely reduce the number~~
346		~~- * of productions needed: The grammar is still context-free as the~~
347		~~- * productions can just be unrolled for all combinations of environments~~
348		~~- * at the cost of a much larger grammar.~~
349		~~- */~~
350		~~- var syntaxFlags = {};~~
351		~~- var setFlag = function(flag) {~~
352		~~- if (syntaxFlags[flag] !== undefined) {~~
353		~~- syntaxFlags[flag]++;~~
354		~~- } else {~~
355		~~- syntaxFlags[flag] = 1;~~
356		~~- }~~
357		~~- return true;~~
358		~~- };~~
359		~~- var clearFlag = function(flag) {~~
360		~~- syntaxFlags[flag]--;~~
361		~~- };~~
362		-
363		~~- // Start position of top-level block~~
364		~~- // Could also provide positions for lower-level blocks using a stack.~~
365		~~- var blockStart = 0;~~
366		-
367		~~- // Start position of generic tag production~~
368		~~- var tagStartPos = 0;~~
369		-
370		~~- // cache the input length~~
371		~~- var inputLength = input.length;~~
372		-
373		~~- // pseudo-production that matches at end of input~~
374		~~- var isEOF = function (pos) {~~
375		~~- return pos === inputLength;~~
376		~~- };~~
377		-
378		~~- // text start position~~
379		~~- var textStart = 0;~~
380		-
381		~~- // hack to support numbered external links ([http://example.com]).~~
382		~~- // XXX: Move to token stream transform after templates are expanded!~~
383		~~- var linkCount = 1;~~
384		-
385		~~- // Define block-level tags in JS, so we can use toLowerCase to match tags~~
386		~~- // case-independently. This would be quite ugly (and possibly slower) if~~
387		~~- // done manually in the grammar.~~
388		~~- var block_names = (function () {~~
389		~~- var names = [ "p", "table", "td", "tr", "ul", "ol"~~
390		~~- , "li", "dl", "dt", "dd", "div", "center"~~
391		~~- , "blockquote" ];~~
392		~~- var bnames = {};~~
393		~~- for(var i = 0, l = names.length; i < l; i++) {~~
394		~~- bnames[names[i]] = true;~~
395		~~- }~~
396		~~- return bnames;~~
397		~~- })();~~
398		-
399		-
400		-}
401		-
402		~~-start~~
403		~~- = e:toplevelblock* newline* {~~
404		~~- return flatten(e);~~
405		~~- }~~
406		-
407		-
408		~~-/* All chars that cannot start syntactic structures in the middle of a line~~
409		~~- * XXX: ] and other end delimiters should probably only be activated inside~~
410		~~- * structures to avoid unnecessarily leaving the text production on plain~~
411		~~- * content. */~~
412		-
413		~~-text_char = [^'<~[{\n\r:\]}\|!=]~~
414		-
415		~~-text = t:text_char+ { return t.join(''); }~~
416		-
417		~~-/* Explanation of chars~~
418		~~- * ' quotes (italic/bold)~~
419		~~- * < start of xmlish_tag~~
420		~~- * ~ signatures/dates~~
421		~~- * [ start of links~~
422		~~- * { start of parser functions, transclusion and template args~~
423		~~- * \n all sort of block-level markup at start of line~~
424		~~- * \r ditto~~
425		~~- * h http(s) urls~~
426		~~- * n nntp(s) urls~~
427		~~- * m mailto urls~~
428		- *
429		~~- * ! and \| table cell delimiters, might be better to specialize those~~
430		~~- * = headings - also specialize those!~~
431		- *
432		~~- * The following chars are also included for now, but only apply in some~~
433		~~- * contexts and should probably be enabled only in those:~~
434		~~- * : separate definition in ; term : definition~~
435		~~- * ] end of link~~
436		~~- * } end of parser func/transclusion/template arg~~
437		~~- */~~
438		-
439		~~-urltext = ( t:[^'<~[{\n\rfghimnstw\|!:\]} &=]+ { return t.join(''); }~~
440		~~- / htmlentity~~
441		~~- / urllink~~
442		~~- // Convert trailing space into &nbsp;~~
443		~~- // XXX: This should be moved to a serializer~~
444		~~- / ' ' & ':' { return "\u00a0"; }~~
445		~~- / t:text_char )+~~
446		-
447		-/*
448		~~- '//', // for protocol-relative URLs, but not in text!~~
449		~~- 'ftp://',~~
450		~~- 'git://',~~
451		~~- 'gopher://',~~
452		~~- 'http://',~~
453		~~- 'https://',~~
454		~~- 'irc://',~~
455		~~- 'ircs://', // @bug 28503~~
456		~~- 'mailto:',~~
457		~~- 'mms://',~~
458		~~- 'news:',~~
459		~~- 'nntp://', // @bug 3808 RFC 1738~~
460		~~- 'svn://',~~
461		~~- 'telnet://', // Well if we're going to support the above.. -ævar~~
462		~~- 'worldwind://',~~
463		~~-*/~~
464		-
465		~~-// Old version~~
466		~~-//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }~~
467		-
468		~~-// Experimental tweaked version: avoid expensive single-char substrings~~
469		~~-// This did not bring the expected performance boost, however.~~
470		~~-//text = [A-Za-z0-9,._ -] {~~
471		~~-// textStart = pos;~~
472		~~-//~~
473		~~-// var res = input.substr(textStart - 1, inputLength)~~
474		~~-// .match(/[A-Za-z0-9,._ -]+/)[0];~~
475		~~-// pos = pos + (res.length - 1);~~
476		~~-// return res~~
477		~~-// }~~
478		-
479		~~-htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {~~
480		~~- return unentity("&" + c.join('') + ";")~~
481		-}
482		-
483		~~-space~~
484		~~- = s:[ \t]+ { return s.join(''); }~~
485		-
486		~~-optionalSpaceToken~~
487		~~- = s:space* {~~
488		~~- if ( s.length ) {~~
489		~~- return [{type: 'TEXT', value: s.join('')}];~~
490		~~- } else {~~
491		~~- return [];~~
492		~~- }~~
493		~~- }~~
494		-
495		-
496		~~-// Start of line~~
497		~~-sol = (newline / & { return pos === 0; } { return true; })~~
498		~~- cn:(c:comment n:newline? { return [c, {type: 'TEXT', value: n}] })* {~~
499		~~- return [{type: 'NEWLINE'}].concat(cn);~~
500		~~- }~~
501		-
502		~~-eof = & { return isEOF(pos); } { return true; }~~
503		-
504		-
505		~~-newline~~
506		~~- = '\n' / '\r\n'~~
507		-
508		~~-eolf = newline / eof~~
509		-
510		~~-toplevelblock~~
511		~~- = & { blockStart = pos; return true; } b:block {~~
512		~~- b = flatten(b);~~
513		~~- var bs = b[0];~~
514		~~- //dp('toplevelblock:' + pp(b));~~
515		~~- if (bs.attribs === undefined) {~~
516		~~- bs.attribs = [];~~
517		~~- }~~
518		~~- bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);~~
519		~~- // XXX: only run this for lines that actually need it!~~
520		~~- b.push({type: 'NEWLINE'});~~
521		~~- b = doQuotes(b);~~
522		~~- return b;~~
523		~~- }~~
524		-
525		~~-block~~
526		~~- = block_lines~~
527		~~- / pre~~
528		~~- / comment &eolf~~
529		~~- / nowiki~~
530		~~- / pre~~
531		~~- / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag~~
532		~~- / para~~
533		~~- / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor~~
534		~~- / s:sol {~~
535		~~- if (s) {~~
536		~~- return [s, {type: 'NEWLINE'}];~~
537		~~- } else {~~
538		~~- return [{type: 'NEWLINE'}];~~
539		~~- }~~
540		~~- }~~
541		-
542		~~-block_lines~~
543		~~- = s:sol~~
544		~~- // eat an empty line before the block~~
545		~~- s2:(ss:space* so:sol { return [{type: 'TEXT', value: ss.join('')}].concat(so) })?~~
546		~~- bl:block_line {~~
547		~~- var s2_ = (s2 !== '') ? s2 : [];~~
548		~~- return s.concat(s2_, bl);~~
549		~~- }~~
550		-
551		~~-// Block structures with start-of-line wiki syntax~~
552		~~-block_line~~
553		~~- = h~~
554		~~- / table~~
555		~~- / lists~~
556		~~- // tag-only lines should not trigger pre~~
557		~~- / st:optionalSpaceToken~~
558		~~- bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+~~
559		~~- &eolf {~~
560		~~- return st.concat(bt);~~
561		~~- }~~
562		~~- / pre_indent~~
563		~~- / pre~~
564		-
565		-
566		-
567		-
568		~~-// TODO: convert inline content to annotations!~~
569		~~-para~~
570		~~- = s1:sol s2:sol c:inlineline {~~
571		~~- return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);~~
572		~~- }~~
573		-
574		~~-br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }~~
575		-
576		~~-// Syntax stops to limit inline expansion defending on syntactic context~~
577		~~-inline_breaks~~
578		~~- =~~
579		~~- & { // Important hack: disable caching for this production, as the default~~
580		~~- // cache key does not take into account flag states!~~
581		~~- cacheKey = '';~~
582		~~- return true;~~
583		~~- }~~
584		~~- & { return syntaxFlags['table']; }~~
585		~~- a:(newline [!\|] / '\|\|' / '!!' / '\|}') { dp("table break" + pp(a) + pos); return true; }~~
586		~~- / & { return (syntaxFlags['colon'] &&~~
587		~~- ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition~~
588		~~- ! syntaxFlags.linkdesc); } ":" { return true; }~~
589		~~- / & { return syntaxFlags['extlink']; } "]" { return true; }~~
590		~~- / & { return syntaxFlags['linkdesc']; } link_end { return true; }~~
591		~~- / & { return syntaxFlags['h']; } '='+ space* newline { return true; }~~
592		~~- / & { return syntaxFlags['template']; } ('\|' / '}}') { return true; }~~
593		-
594		~~-inline~~
595		~~- = c:(urltext / (! inline_breaks (inline_element / . )))+ {~~
596		~~- var out = [];~~
597		~~- var text = [];~~
598		~~- c = flatten(c);~~
599		~~- for (var i = 0, l = c.length; i < l; i++) {~~
600		~~- var ci = c[i];~~
601		~~- if (typeof ci == 'string') {~~
602		~~- text.push(ci);~~
603		~~- } else {~~
604		~~- if (text.length) {~~
605		~~- out.push({ type: "TEXT", value: text.join('') });~~
606		~~- text = [];~~
607		~~- }~~
608		~~- out.push(ci);~~
609		~~- }~~
610		~~- }~~
611		~~- if (text.length) {~~
612		~~- out.push({ type: 'TEXT', value: text.join('') });~~
613		~~- }~~
614		~~- //dp('inline out:' + pp(out));~~
615		~~- return out;~~
616		-}
617		-
618		-
619		~~-inlineline~~
620		~~- = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {~~
621		~~- var out = [];~~
622		~~- var text = [];~~
623		~~- c = flatten(c);~~
624		~~- for (var i = 0; i < c.length; i++) {~~
625		~~- var ci = c[i]~~
626		~~- if (typeof ci == 'string') {~~
627		~~- text.push(ci);~~
628		~~- } else {~~
629		~~- if (text.length) {~~
630		~~- out.push({type: 'TEXT', value: text.join('')});~~
631		~~- text = [];~~
632		~~- }~~
633		~~- out.push(ci);~~
634		~~- }~~
635		~~- }~~
636		~~- if (text.length) {~~
637		~~- out.push({type: 'TEXT', value: text.join('')});~~
638		~~- }~~
639		~~- //dp('inlineline out:' + pp(out));~~
640		~~- return out;~~
641		-}
642		-
643		~~-inline_element~~
644		~~- = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }~~
645		~~- & '<' ( comment / xmlish_tag )~~
646		~~- / & '{' ( template / tplarg )~~
647		~~- / & '[' ( wikilink / extlink )~~
648		~~- / & "'" quote~~
649		-
650		~~-/* Headings */~~
651		-
652		~~-h = & "=" // guard, to make sure '='+ will match.~~
653		~~- // XXX: Also check to end to avoid inline parsing?~~
654		~~- r:(~~
655		~~- s:'='+ // moved in here to make s accessible to inner action~~
656		~~- & { return setFlag('h'); }~~
657		~~- c:inlineline~~
658		~~- e:'='+~~
659		- spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
660		~~- &eolf~~
661		~~- {~~
662		~~- clearFlag('h');~~
663		~~- var level = Math.min(s.length, e.length);~~
664		~~- // convert surplus equals into text~~
665		~~- if(s.length > level) {~~
666		~~- var extras = s.substr(0, s.length - level);~~
667		~~- if(c[0].type == 'TEXT') {~~
668		~~- c[0].value = extras + c[0].value;~~
669		~~- } else {~~
670		~~- c.unshift({type: 'TEXT', value: extras});~~
671		~~- }~~
672		~~- }~~
673		~~- if(e.length > level) {~~
674		~~- var extras = e.substr(0, e.length - level),~~
675		~~- lastElem = c[c.length - 1];~~
676		~~- if(lastElem.type == 'TEXT') {~~
677		~~- lastElem.value = lastElem.value + extras;~~
678		~~- } else {~~
679		~~- c.push({type: 'TEXT', value: extras});~~
680		~~- }~~
681		~~- }~~
682		-
683		~~- return [{type: 'TAG', name: 'h' + level}]~~
684		~~- .concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);~~
685		~~- }~~
686		~~- / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }~~
687		~~- ) { return r }~~
688		-
689		-
690		~~-pre_indent~~
691		~~- = l:pre_indent_line ls:(sol pre_indent_line)* {~~
692		~~- return [{type: 'TAG', name: 'pre'}]~~
693		~~- .concat( [l], ls~~
694		~~- , [{type: 'ENDTAG', name: 'pre'}]);~~
695		~~- }~~
696		~~-pre_indent_line = space l:inlineline {~~
697		~~- return [{type: 'TEXT', value: '\n'}].concat(l);~~
698		-}
699		-
700		-
701		~~-comment~~
702		~~- = '<!--' c:comment_chars* ('-->' / eof)~~
703		~~- cs:(space* newline space* cn:comment { return cn })* {~~
704		~~- return [{ type: 'COMMENT', value: c.join('') }].concat(cs);~~
705		~~- }~~
706		-
707		~~-comment_chars~~
708		~~- = c:[^-] { return c; }~~
709		~~- / c:'-' !'->' { return c; }~~
710		-
711		-
712		~~-urllink~~
713		~~- = target:url {~~
714		~~- return [ { type: 'TAG',~~
715		~~- name: 'a',~~
716		~~- attribs: [['href', target]] }~~
717		~~- , {type: 'TEXT', value: target}~~
718		~~- , {type: 'ENDTAG', name: 'a'}~~
719		~~- ];~~
720		~~- }~~
721		-
722		~~-extlink~~
723		~~- = "["~~
724		~~- & { return setFlag('extlink'); }~~
725		~~- target:url~~
726		- space*
727		~~- text:inlineline?~~
728		~~- "]" {~~
729		~~- clearFlag('extlink');~~
730		~~- if ( text == '' ) {~~
731		~~- // XXX: Link numbering should be implemented in post-processor.~~
732		~~- text = [{type: 'TEXT', value: "[" + linkCount + "]"}];~~
733		~~- linkCount++;~~
734		~~- }~~
735		~~- return [ { type: 'TAG',~~
736		~~- name: 'a',~~
737		~~- attribs: [['href', target]] } ]~~
738		~~- .concat( text~~
739		~~- , [{type: 'ENDTAG', name: 'a'}]);~~
740		~~- }~~
741		~~- / "[" & { clearFlag('extlink'); return false; }~~
742		-
743		~~-/* Defaul URL protocols in MediaWiki (see DefaultSettings). Normally these can~~
744		~~- * be configured dynamically. */~~
745		~~-url_protocol~~
746		~~- = '//' // for protocol-relative URLs~~
747		~~- / 'ftp://'~~
748		~~- / 'git://'~~
749		~~- / 'gopher://'~~
750		~~- / 'http://'~~
751		~~- / 'https://'~~
752		~~- / 'irc://'~~
753		~~- / 'ircs://' // @bug 28503~~
754		~~- / 'mailto:'~~
755		~~- / 'mms://'~~
756		~~- / 'news:'~~
757		~~- / 'nntp://' // @bug 3808 RFC 1738~~
758		~~- / 'svn://'~~
759		~~- / 'telnet://' // Well if we're going to support the above.. -ævar~~
760		~~- / 'worldwind://'~~
761		-
762		~~-// javascript does not support unicode features..~~
763		~~-unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]~~
764		-
765		-
766		~~-urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {~~
767		~~- return decodeURI("%" + c0 + c1)~~
768		-}
769		-
770		~~-//[^][<>"\\x00-\\x20\\x7F\p{Zs}]~~
771		~~-url~~
772		~~- = proto:url_protocol~~
773		~~- rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]~~
774		~~- / s:[.:,] !(space / eolf) { return s }~~
775		~~- / htmlentity~~
776		~~- / urlencoded_char~~
777		~~- / [&%] )+~~
778		-{
779		~~- return proto + rest.join('');~~
780		-}
781		-
782		~~-template~~
783		~~- = "{{" target:template_target~~
784		- params:(newline? "\|" newline? p:template_param { return p })*
785		~~- "}}" {~~
786		~~- var obj = { type: 'TAG', name: 'template',~~
787		~~- attribs: [['target', target]],~~
788		~~- args: {}}~~
789		~~- if (params && params.length) {~~
790		~~- var position = 1;~~
791		~~- for ( var i = 0, l = params.length; i < l; i++ ) {~~
792		~~- var param = params[i];~~
793		~~- if ( param[0] === null ) {~~
794		~~- obj.args[position] = param[1];~~
795		~~- position++;~~
796		~~- } else {~~
797		~~- obj.args[param[0]] = param[1];~~
798		~~- }~~
799		~~- }~~
800		~~- // HACK: temporarily also push the args into an attribute~~
801		~~- // (just for debugging)~~
802		~~- obj.attribs.push(['data-args', JSON.stringify(obj.args)]);~~
803		~~- }~~
804		~~- // Should actually use a self-closing tag here, but the Node HTML5~~
805		~~- // parser only recognizes known self-closing tags for now, so use an~~
806		~~- // explicit end tag for now.~~
807		~~- //console.log(pp(obj));~~
808		~~- return obj;~~
809		~~- }~~
810		-
811		~~-template_target~~
812		~~- = h:( !"}}" x:([^\|\n]) { return x } )* { return h.join(''); }~~
813		-
814		~~-template_param~~
815		~~- = name:template_param_name space* "=" space* c:template_param_text {~~
816		~~- return [name, c];~~
817		~~- } / c:template_param_text {~~
818		~~- return [null, c];~~
819		~~- }~~
820		-
821		~~-tplarg~~
822		~~- = "{{{" name:link_target params:("\|" p:template_param { return p })* "}}}" {~~
823		~~- var obj = {~~
824		~~- type: 'SELFCLOSINGTAG',~~
825		~~- name: 'templatearg',~~
826		~~- attribs: [['argname', name]]~~
827		~~- };~~
828		~~- if (params && params.length) {~~
829		~~- // HACK, not final.~~
830		~~- obj.attribs.push(['data-args', JSON.stringify(params)]);~~
831		~~- }~~
832		~~- return obj;~~
833		~~- }~~
834		-
835		~~-template_param_name~~
836		~~- = h:( !"}}" x:([^=\|\n]) { return x } )* { return h.join(''); }~~
837		-
838		~~-template_param_text~~
839		~~- = & { return setFlag('template') }~~
840		~~- il:inline+ {~~
841		~~- clearFlag('template');~~
842		~~- return il;~~
843		~~- }~~
844		~~- / & { clearFlag('template'); return false; }~~
845		-
846		~~-wikilink~~
847		~~- = "[["~~
848		~~- ! url~~
849		~~- target:link_target text:("\|" lt:link_text { return lt })* "]]" suffix:text? {~~
850		~~- var obj = {~~
851		~~- type: 'TAG',~~
852		~~- name: 'a',~~
853		~~- attribs: [['data-type', 'internal']]~~
854		~~- };~~
855		~~- obj.attribs.push(['href', target]);~~
856		~~- if (text && text.length) {~~
857		~~- var textTokens = text;~~
858		~~- } else {~~
859		~~- if (suffix !== '') {~~
860		~~- target += suffix;~~
861		~~- }~~
862		~~- var textTokens = [{type: 'TEXT', value: target}];~~
863		~~- }~~
864		~~- return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);~~
865		~~- }~~
866		-
867		~~-link_target~~
868		~~- = h:( c:[^\|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious~~
869		~~- / !"]]"~~
870		~~- hi:(~~
871		~~- [^\|%\n]~~
872		~~- / urlencoded_char~~
873		~~- / '%'~~
874		~~- ) { return hi }~~
875		~~- )* { return h.join(''); }~~
876		-
877		~~-link_text~~
878		~~- = h:( & { return setFlag('linkdesc'); }~~
879		~~- x:inlineline { return x }~~
880		~~- )* {~~
881		~~- clearFlag('linkdesc')~~
882		~~- return h;~~
883		~~- }~~
884		~~- / & { clearFlag('linkdesc') } { return null; }~~
885		-
886		~~-link_end = "]]"~~
887		-
888		~~-/* Generic quote production for italic and bold, further processed in a token~~
889		~~- * stream transformation in doQuotes. Relies on NEWLINE tokens being emitted~~
890		~~- * for each line of text to balance quotes per line.~~
891		- *
892		~~- * We are not using a simple pair rule here as we need to support mis-nested~~
893		~~- * bolds/italics and MediaWiki's special heuristics for apostrophes, which are~~
894		~~- * all not context free. */~~
895		~~-quote = "''" x:"'"* {~~
896		~~- return {~~
897		~~- type : 'QUOTE',~~
898		~~- value: "''" + x.join('')~~
899		~~- }~~
900		-}
901		-
902		~~-/* XXX: Extension tags can require a change in the tokenizer mode, which~~
903		~~- * returns any text between extension tags verbatim. For now, we simply~~
904		~~- * continue to parse the contained text and return the tokens. The original~~
905		~~- * input source can be recovered from the source positions added on tag~~
906		~~- * tokens. This won't however work in all cases. For example, a comment start~~
907		~~- * (<!--) between extension tags would cause the remaining text to be consumed~~
908		~~- * as a comment. To avoid this, we might need to look ahead for the end tag~~
909		~~- * and limit the content parsing to this section. */~~
910		-
911		~~-xmlish_tag = nowiki / generic_tag~~
912		-
913		~~-pre~~
914		~~- = "<pre"~~
915		- attribs:generic_attribute*
916		~~- ">"~~
917		~~- ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }~~
918		~~- / nowiki~~
919		~~- / !"</pre>" t2:. {return {type:'TEXT',value:t2}})+~~
920		~~- ("</pre>" / eof) {~~
921		~~- // return nowiki tags as well?~~
922		~~- //console.log('inpre');~~
923		~~- return [ {type: 'TAG', name: 'pre', attribs: attribs} ]~~
924		~~- .concat(ts, [{type: 'ENDTAG', name: 'pre'}]);~~
925		~~- }~~
926		~~- / "</pre>" { return {type: 'TEXT', value: "</pre>"}; }~~
927		-
928		~~-nowiki~~
929		~~- = "<nowiki>" nc:nowiki_content "</nowiki>" {~~
930		~~- // console.log(pp(nc));~~
931		~~- return nc;~~
932		~~- }~~
933		~~- / "<nowiki>" {~~
934		~~- //console.log('nowiki fallback');~~
935		~~- return [{type: 'TEXT', value: '<nowiki>'}];~~
936		~~- }~~
937		~~- / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }~~
938		-
939		~~-nowiki_content~~
940		~~- = ts:( t:[^<]+ { return t.join('') }~~
941		~~- / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {~~
942		~~- //console.log('nested pre in nowiki');~~
943		~~- return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');~~
944		~~- }~~
945		~~- / (!("</nowiki>" / "</pre>") c:. {return c})~~
946		~~- )* {~~
947		~~- // return nowiki tags as well?~~
948		~~- return [{type: 'TEXT', value: ts.join('')}];~~
949		~~- }~~
950		-
951		~~-// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and~~
952		~~-// following paragraphs~~
953		~~-block_tag~~
954		~~- = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })~~
955		- attribs:generic_attribute*
956		~~- selfclose:"/"?~~
957		~~- ">" {~~
958		~~- if (block_names[name.toLowerCase()] !== true) {~~
959		~~- // abort match if tag is not block-level~~
960		~~- return null;~~
961		~~- }~~
962		~~- var res = {name: name, attribs: attribs};~~
963		~~- if ( end != '' ) {~~
964		~~- res.type = 'ENDTAG';~~
965		~~- } else if ( selfclose != '' ) {~~
966		~~- res.type = 'SELFCLOSINGTAG';~~
967		~~- } else {~~
968		~~- res.type = 'TAG';~~
969		~~- }~~
970		~~- return [res];~~
971		~~- }~~
972		-
973		~~-/* Generic XML-like tags~~
974		- *
975		~~- * These also cover extensions (including Cite), which will hook into the~~
976		~~- * token stream for further processing. The content of extension tags is~~
977		~~- * parsed as regular inline, but the source positions of the tag are added~~
978		~~- * to allow reconstructing the unparsed text from the input. */~~
979		-
980		~~-// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and~~
981		~~-// following paragraphs~~
982		~~-generic_tag~~
983		~~- = "<"~~
984		~~- & { tagStartPos = pos; return true; } // remember the start position of this tag~~
985		~~- end:"/"? name:[0-9a-zA-Z]+~~
986		- attribs:generic_attribute*
987		- space*
988		~~- selfclose:"/"?~~
989		~~- ">" {~~
990		~~- var res = {name: name.join(''), attribs: attribs};~~
991		~~- if ( end != '' ) {~~
992		~~- res.type = 'ENDTAG';~~
993		~~- } else if ( selfclose != '' ) {~~
994		~~- res.type = 'SELFCLOSINGTAG';~~
995		~~- } else {~~
996		~~- res.type = 'TAG';~~
997		~~- }~~
998		~~- res.attribs.push(['data-sourceTagPos', (tagStartPos - 1) + ":" + pos]);~~
999		~~- return res;~~
1000		~~- }~~
1001		-
1002		~~-generic_attribute~~
1003		- = s:space*
1004		~~- name:generic_attribute_name~~
1005		- value:(space*
1006		~~- v:generic_attribute_value { return v })?~~
1007		-{
1008		~~- if ( value !== '' ) {~~
1009		~~- return [name, value];~~
1010		~~- } else {~~
1011		~~- return [name,''];~~
1012		~~- }~~
1013		-}
1014		-
1015		~~-// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also~~
1016		~~-// disallow newlines and \|.~~
1017		~~-generic_attribute_name~~
1018		~~- = n:[^ \t\0/"'>=\n\|]+ {~~
1019		~~- return n.join('');~~
1020		~~- }~~
1021		-
1022		~~-generic_attribute_value~~
1023		~~- = "=" space* v:att_value {return v}~~
1024		-
1025		~~-att_value~~
1026		~~- = t:[^ \t'"<>='\n]+ { return t.join(''); }~~
1027		~~- // XXX: is "\"" also valid html? or just Wikitext?~~
1028		~~- / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }~~
1029		~~- / '"' t:[^">]* '"' { return unquote('"', t.join('')); }~~
1030		-
1031		-
1032		~~-/* Lists */~~
1033		-lists = e:(dtdd / li) es:(sol (dtdd / li))*
1034		-{
1035		~~- return annotateList( [ { type: 'TAG', name: 'list'} ]~~
1036		~~- .concat(flatten([e].concat(es))~~
1037		~~- ,[{ type: 'ENDTAG', name: 'list' }]));~~
1038		-}
1039		-
1040		~~-li = bullets:list_char+~~
1041		~~- c:inlineline?~~
1042		~~- &eolf~~
1043		-{
1044		~~- if ( c == '' )~~
1045		~~- c = [];~~
1046		~~- return [ { type: 'TAG',~~
1047		~~- name: 'listItem',~~
1048		~~- bullets: bullets }~~
1049		~~- , c ];~~
1050		-}
1051		-
1052		~~-dtdd~~
1053		- = bullets:(!(";" !list_char) list_char)*
1054		~~- ";"~~
1055		~~- & {return setFlag('colon');}~~
1056		~~- c:inlineline~~
1057		~~- ":"~~
1058		~~- // Fortunately dtdds cannot be nested, so we can simply set the flag~~
1059		~~- // back to 0 to disable it.~~
1060		~~- & {syntaxFlags['colon'] = 0; return true;}~~
1061		~~- d:inlineline~~
1062		~~- &eolf {~~
1063		~~- // Convert trailing space into &nbsp;~~
1064		~~- // XXX: This should be moved to a serializer~~
1065		~~- //var clen = c.length;~~
1066		~~- //if (clen && c[clen - 1].type === 'TEXT') {~~
1067		~~- // var val = c[clen - 1].value;~~
1068		~~- // if(val.length && val[val.length - 1] == ' ') {~~
1069		~~- // c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";~~
1070		~~- // }~~
1071		~~- //}~~
1072		-
1073		~~- return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]~~
1074		~~- .concat( c~~
1075		~~- ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]~~
1076		~~- , d );~~
1077		~~- }~~
1078		~~- // Fall-back case to clear the colon flag~~
1079		~~- / & { return true; } { syntaxFlags['colon'] = 0; return null; }~~
1080		-
1081		-
1082		~~-list_char = [*#:;]~~
1083		-
1084		-
1085		~~-/* Tables */~~
1086		~~-table~~
1087		~~- = tas:table_start space* c:table_caption? b:table_body? table_end {~~
1088		~~- var res = {type: 'TAG', name: 'table'}~~
1089		~~- var body = b !== '' ? b : [];~~
1090		~~- dp("body: " + pp(body));~~
1091		~~- if (tas.length > 0) {~~
1092		~~- // FIXME: actually parse and build structure~~
1093		~~- //res.attribs = [['data-unparsed', tas.join('')]];~~
1094		~~- res.attribs = tas;~~
1095		~~- }~~
1096		-
1097		~~- if (c != '') {~~
1098		~~- var caption = [{type: 'TAG', name: 'caption'}]~~
1099		~~- .concat(c, [{type: 'ENDTAG', name: 'caption'}]);~~
1100		~~- } else {~~
1101		~~- var caption = [];~~
1102		~~- }~~
1103		~~- //dp(pp(res));~~
1104		-
1105		~~- return [res].concat(caption, body,~~
1106		~~- [{type: 'ENDTAG', name: 'table'}]);~~
1107		~~- }~~
1108		-
1109		~~-table_start~~
1110		~~- = "{\|"~~
1111		~~- res:(~~
1112		~~- & { setFlag('table'); return true; }~~
1113		- ta:generic_attribute*
1114		~~- {~~
1115		~~- dp("table_start " + pp(ta) + ", pos:" + pos);~~
1116		~~- return ta;~~
1117		~~- }~~
1118		~~- / & { clearFlag('table'); return false; } { return null; }~~
1119		~~- ) { return res }~~
1120		-
1121		~~-table_attribs~~
1122		~~- = text / ! inline_breaks !newline ![\|] c:. { return c }~~
1123		-
1124		~~-table_caption~~
1125		~~- = newline~~
1126		~~- "\|+" c:inline* {~~
1127		~~- return c;~~
1128		~~- }~~
1129		-
1130		~~-table_body~~
1131		~~- = //& { dp("table_body enter"); return true; }~~
1132		~~- firstrow:table_firstrow otherrows:table_row* {~~
1133		~~- /* dp('table first and otherrows: '~~
1134		~~- * + pp([firstrow].concat(otherrows))); */~~
1135		~~- return [firstrow].concat(otherrows);~~
1136		~~- }~~
1137		~~- / otherrows:table_row* {~~
1138		~~- //dp('table otherrows: ' + pp(otherrows));~~
1139		~~- return otherrows;~~
1140		~~- }~~
1141		-
1142		~~-table_firstrow~~
1143		~~- = td:table_data+ {~~
1144		~~- //dp('firstrow: ' + pp(td));~~
1145		~~- return [{ type: 'TAG', name: 'tr' }]~~
1146		~~- .concat(td, [{type: 'ENDTAG', name: 'tr'}]);~~
1147		~~- }~~
1148		-
1149		~~-table_row~~
1150		~~- = //& { dp("table row enter"); return true; }~~
1151		~~- newline~~
1152		~~- "\|-" thtd_attribs? space* td:(table_data / table_header)* {~~
1153		~~- return [{type: 'TAG', name: 'tr'}]~~
1154		~~- .concat(td, [{type: 'ENDTAG', name: 'tr'}]);~~
1155		~~- }~~
1156		-
1157		~~-table_data~~
1158		~~- = //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }~~
1159		~~- ("\|\|" / newline "\|")~~
1160		~~- ! [}+-]~~
1161		~~- //& { dp('before attrib, pos=' + pos); return true; }~~
1162		~~- a:(as:generic_attribute+ space* "\|" !"\|" { return as } )?~~
1163		~~- //& { dp('past attrib, pos=' + pos); return true; }~~
1164		~~- // use inline_breaks to break on tr etc~~
1165		~~- td:(!inline_breaks~~
1166		~~- //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }~~
1167		~~- b:block { return b })* {~~
1168		~~- if ( a == '' ) {~~
1169		~~- a = [];~~
1170		~~- }~~
1171		~~- //dp("table data result: " + pp(td) + ", attribts: " + pp(a));~~
1172		~~- return [{ type: 'TAG', name: 'td', attribs: a}]~~
1173		~~- .concat(td, [{type: 'ENDTAG', name: 'td'}]);~~
1174		~~- }~~
1175		-
1176		~~-table_header~~
1177		~~- = ("!!" / newline "!")~~
1178		~~- a:(as:generic_attribute+ "!" !"!" { return as } )?~~
1179		~~- c:inline {~~
1180		~~- if ( a == '' ) {~~
1181		~~- a = [];~~
1182		~~- }~~
1183		~~- return [{type: 'TAG', name: 'th', attribs: a}]~~
1184		~~- .concat(c, [{type: 'ENDTAG', name: 'th'}]);~~
1185		~~- }~~
1186		-
1187		~~-thtd_attribs~~
1188		~~- // In particular, do not match [\|\n]~~
1189		~~- = a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "\|" ! "\|" {~~
1190		~~- return a;~~
1191		~~- }~~
1192		-
1193		-
1194		~~-table_end~~
1195		~~- = newline? "\|}" { clearFlag('table'); }~~
1196		~~- / newline? eof~~
1197		-
1198		-
1199		~~-/* Tabs do not mix well with the hybrid production syntax */~~
1200		~~-/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent: */~~
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js
—	—	@@ -1,6 +1,9 @@
	2	+/* Front-end/Wrapper for a particular tree builder, in this case the
	3	+ * parser/tree builder from the node 'html5' module. Feed it tokens using
	4	+ * processToken, and it will build you a DOM tree retrievable using .document
	5	+ * or .body(). */
	6	+
2	7	var events = require('events');
3		-
4		-
5	8	var HTML5 = require('./html5/index');
6	9
7	10	FauxHTML5 = {};
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
—	—	@@ -0,0 +1,133 @@
	2	+/**
	3	+ * Parser for wikitext to provisional temp structure, using PEG.js and
	4	+ * a separate PEG grammar file (pegTokenizer.pegjs.txt)
	5	+ *
	6	+ * Use along with the MWTreeRenderer and MWTreeSerializer classes for
	7	+ * HTML output and source round-tripping.
	8	+ *
	9	+ * If installed as a user script or to customize, set parserPlaygroundPegPage
	10	+ * to point at the MW page name containing the parser peg definition; default
	11	+ * is 'MediaWiki:Gadget-ParserPlayground-PegParser.pegjs'.
	12	+ */
	13	+function PegTokenizer(env) {
	14	+ this.env = env \|\| {};
	15	+}
	16	+
	17	+PegTokenizer.src = false;
	18	+
	19	+PegTokenizer.prototype.tokenize = function(text, callback) {
	20	+ this.initSource(function() {
	21	+ var out, err;
	22	+ try {
	23	+ var parser = PEG.buildParser(PegTokenizer.src);
	24	+ out = parser.parse(text);
	25	+ } catch (e) {
	26	+ err = e;
	27	+ console.trace();
	28	+ } finally {
	29	+ callback(out, err);
	30	+ }
	31	+ });
	32	+}
	33	+
	34	+/**
	35	+ * @param {object} tree
	36	+ * @param {function(tree, error)} callback
	37	+ */
	38	+PegTokenizer.prototype.expandTree = function(tree, callback) {
	39	+ var self = this;
	40	+ var subParseArray = function(listOfTrees) {
	41	+ var content = [];
	42	+ $.each(listOfTrees, function(i, subtree) {
	43	+ self.expandTree(subtree, function(substr, err) {
	44	+ content.push(tree);
	45	+ });
	46	+ });
	47	+ return content;
	48	+ };
	49	+ var src;
	50	+ if (typeof tree === "string") {
	51	+ callback(tree);
	52	+ return;
	53	+ }
	54	+ if (tree.type == 'template') {
	55	+ // expand a template node!
	56	+
	57	+ // Resolve a possibly relative link
	58	+ var templateName = this.env.resolveTitle( tree.target, 'Template' );
	59	+ this.env.fetchTemplate( tree.target, tree.params \|\| {}, function( templateSrc, error ) {
	60	+ // @fixme should pre-parse/cache these too?
	61	+ self.parseToTree( templateSrc, function( templateTree, error ) {
	62	+ if ( error ) {
	63	+ callback({
	64	+ type: 'placeholder',
	65	+ orig: tree,
	66	+ content: [
	67	+ {
	68	+ // @fixme broken link?
	69	+ type: 'link',
	70	+ target: templateName
	71	+ }
	72	+ ]
	73	+ });
	74	+ } else {
	75	+ callback({
	76	+ type: 'placeholder',
	77	+ orig: tree,
	78	+ content: self.env.expandTemplateArgs( templateTree, tree.params )
	79	+ });
	80	+ }
	81	+ })
	82	+ } );
	83	+ // Wait for async...
	84	+ return;
	85	+ }
	86	+ var out = $.extend( tree ); // @fixme prefer a deep copy?
	87	+ if (tree.content) {
	88	+ out.content = subParseArray(tree.content);
	89	+ }
	90	+ callback(out);
	91	+};
	92	+
	93	+PegTokenizer.prototype.initSource = function(callback) {
	94	+ if (PegTokenizer.src) {
	95	+ callback();
	96	+ } else {
	97	+ if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
	98	+ $.ajax({
	99	+ url: wgScriptPath + '/api' + wgScriptExtension,
	100	+ data: {
	101	+ format: 'json',
	102	+ action: 'query',
	103	+ prop: 'revisions',
	104	+ rvprop: 'content',
	105	+ titles: parserPlaygroundPegPage
	106	+ },
	107	+ success: function(data, xhr) {
	108	+ $.each(data.query.pages, function(i, page) {
	109	+ if (page.revisions && page.revisions.length) {
	110	+ PegTokenizer.src = page.revisions[0]['*'];
	111	+ }
	112	+ });
	113	+ callback()
	114	+ },
	115	+ dataType: 'json',
	116	+ cache: false
	117	+ }, 'json');
	118	+ } else {
	119	+ $.ajax({
	120	+ url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',
	121	+ success: function(data) {
	122	+ PegTokenizer.src = data;
	123	+ callback();
	124	+ },
	125	+ dataType: 'text',
	126	+ cache: false
	127	+ });
	128	+ }
	129	+ }
	130	+};
	131	+
	132	+if (typeof module == "object") {
	133	+ module.exports.PegTokenizer = PegTokenizer;
	134	+}
Property changes on: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
___________________________________________________________________
Added: svn:eol-style
1	135	+ native
Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
—	—	@@ -0,0 +1,1199 @@
	2	+/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
	3	+{
	4	+ /* Fixme: move the static functions to a separate module! Unfortunately, this
	5	+ * does not work:
	6	+ * var tu = require('./mediawiki.tokenizer.utils.js');
	7	+ * console.log(tu.flatten([]));
	8	+ * Using exports in the module gets a bit further, but accesses to
	9	+ * tu.flatten in productions still fail. Thus, I just moved the functions
	10	+ * here until a solution is found:
	11	+ */
	12	+
	13	+ /* Static utilities */
	14	+
	15	+ // Flatten a list of lists.
	16	+ var flatten = function ( e ) {
	17	+ var es = [];
	18	+ // flatten sub-arrays
	19	+ for(var i = 0, length = e.length; i < length; i++) {
	20	+ var ei = e[i];
	21	+ if ($.isArray(ei))
	22	+ es = es.concat(flatten(ei));
	23	+ else
	24	+ es.push(ei);
	25	+ };
	26	+ return es;
	27	+ };
	28	+
	29	+ // Remove escaped quotes from attributes etc
	30	+ var unquote = function (quotec, text) {
	31	+ return text.replace('\\' + quotec, quotec);
	32	+ };
	33	+
	34	+ // Decode html entities. In a browser, this should only be fed the entity,
	35	+ // not untrusted html! XXX: replace with safer version.
	36	+ var unentity = function ( entity ) {
	37	+ return $("<div/>").html(entity).text();
	38	+ };
	39	+
	40	+ // Debug print with global switch
	41	+ var dp = function ( msg ) {
	42	+ if ( false ) {
	43	+ console.log(msg);
	44	+ }
	45	+ };
	46	+
	47	+ var pp = function ( s ) { return JSON.stringify(s, null, 2); }
	48	+
	49	+ /*
	50	+ * Annotate a token stream with list items with appropriate list tokens
	51	+ *
	52	+ * @static
	53	+ * @method
	54	+ * @param {[tokens]} Token stream with li tokens
	55	+ * @returns {[tokens]} Token stream, possibly with additional list tokens
	56	+ * */
	57	+ var annotateList = function ( tokens ) {
	58	+ var out = [], // List of tokens
	59	+ bstack = [], // Bullet stack, previous element's listStyle
	60	+ bnext = [], // Next element's listStyle
	61	+ endtags = []; // Stack of end tags
	62	+
	63	+ var commonPrefixLength = function (x, y) {
	64	+ var minLength = Math.min(x.length, y.length);
	65	+ for(var i = 0; i < minLength; i++) {
	66	+ if (x[i] != y[i])
	67	+ break;
	68	+ }
	69	+ return i;
	70	+ };
	71	+
	72	+ var pushList = function ( listName, itemName ) {
	73	+ out.push({type: 'TAG', name: listName});
	74	+ out.push({type: 'TAG', name: itemName});
	75	+ endtags.push({type: 'ENDTAG', name: listName});
	76	+ endtags.push({type: 'ENDTAG', name: itemName});
	77	+ };
	78	+
	79	+ var popTags = function ( n ) {
	80	+ for(;n > 0; n--) {
	81	+ // push list item..
	82	+ out.push(endtags.pop());
	83	+ // and the list end tag
	84	+ out.push(endtags.pop());
	85	+ }
	86	+ };
	87	+
	88	+ var isDlDd = function (a, b) {
	89	+ var ab = [a,b].sort();
	90	+ return (ab[0] === ':' && ab[1] === ';');
	91	+ };
	92	+
	93	+ var doListItem = function ( bs, bn ) {
	94	+ var prefixLen = commonPrefixLength (bs, bn);
	95	+ var changeLen = Math.max(bs.length, bn.length) - prefixLen;
	96	+ var prefix = bn.slice(0, prefixLen);
	97	+ // emit close tag tokens for closed lists
	98	+ if (changeLen === 0) {
	99	+ var itemToken = endtags.pop();
	100	+ out.push(itemToken);
	101	+ out.push({type: 'TAG', name: itemToken.name});
	102	+ endtags.push({type: 'ENDTAG', name: itemToken.name});
	103	+ } else if ( bs.length == bn.length
	104	+ && changeLen == 1
	105	+ && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
	106	+ // handle dd/dt transitions
	107	+ out.push(endtags.pop());
	108	+ if( bn[prefixLen] == ';') {
	109	+ var newName = 'dt';
	110	+ } else {
	111	+ var newName = 'dd';
	112	+ }
	113	+ out.push({type: 'TAG', name: newName});
	114	+ endtags.push({type: 'ENDTAG', name: newName});
	115	+ } else {
	116	+ popTags(bs.length - prefixLen);
	117	+
	118	+ if (prefixLen > 0 && bn.length == prefixLen ) {
	119	+ var itemToken = endtags.pop();
	120	+ out.push(itemToken);
	121	+ out.push({type: 'TAG', name: itemToken.name});
	122	+ endtags.push({type: 'ENDTAG', name: itemToken.name});
	123	+ }
	124	+
	125	+ for(var i = prefixLen; i < bn.length; i++) {
	126	+ switch (bn[i]) {
	127	+ case '*':
	128	+ pushList('ul', 'li');
	129	+ break;
	130	+ case '#':
	131	+ pushList('ol', 'li');
	132	+ break;
	133	+ case ';':
	134	+ pushList('dl', 'dt');
	135	+ break;
	136	+ case ':':
	137	+ pushList('dl', 'dd');
	138	+ break;
	139	+ default:
	140	+ throw("Unknown node prefix " + prefix[i]);
	141	+ }
	142	+ }
	143	+ }
	144	+ };
	145	+
	146	+ for (var i = 0, length = tokens.length; i < length; i++) {
	147	+ var token = tokens[i];
	148	+ switch ( token.type ) {
	149	+ case 'TAG':
	150	+ switch (token.name) {
	151	+ case 'list':
	152	+ // ignore token
	153	+ break;
	154	+ case 'listItem':
	155	+ // convert listItem to list and list item tokens
	156	+ bnext = token.bullets;
	157	+ doListItem( bstack, bnext );
	158	+ bstack = bnext;
	159	+ break;
	160	+ default:
	161	+ // pass through all remaining start tags
	162	+ out.push(token);
	163	+ break;
	164	+ }
	165	+ break;
	166	+ case 'ENDTAG':
	167	+ if ( token.name == 'list' ) {
	168	+ // pop all open list item tokens
	169	+ popTags(bstack.length);
	170	+ bstack = [];
	171	+ } else {
	172	+ out.push(token);
	173	+ }
	174	+ break;
	175	+ default:
	176	+ out.push(token);
	177	+ break;
	178	+ }
	179	+ }
	180	+ return out;
	181	+ };
	182	+
	183	+ /*
	184	+ * Italic/Bold handling.
	185	+ *
	186	+ * - list of tokens
	187	+ * - NEWLINE
	188	+ * - ticks (2+) -> list with link in line token list?
	189	+ * - process on newline
	190	+ * - need access to text nodes before/after for conversion back to text
	191	+ */
	192	+ var doQuotes = function ( tokens ) {
	193	+
	194	+ var italics = [],
	195	+ bolds = [],
	196	+ out = [],
	197	+ inserted = 0;
	198	+
	199	+ var convertBold = function ( i ) {
	200	+ var index = bolds[i];
	201	+ var txt = out[index - 1];
	202	+ txt.value += "'";
	203	+ if ( i > 0 ) {
	204	+ bolds = bolds.slice(0, i)
	205	+ .concat(bolds.slice(i + 1, bolds.length - i - 1));
	206	+ } else {
	207	+ bolds.shift();
	208	+ }
	209	+
	210	+ italics.push(index);
	211	+ italics.sort(function(a,b) { return a - b });
	212	+ };
	213	+
	214	+ // convert italics/bolds into tags
	215	+ var quotesToTags = function ( offsets, name ) {
	216	+ var toggle = true;
	217	+ for (var j = 0; j < offsets.length; j++) {
	218	+ var t = out[offsets[j]];
	219	+ if(toggle) {
	220	+ t.type = 'TAG';
	221	+ } else {
	222	+ t.type = 'ENDTAG';
	223	+ }
	224	+ t.name = name;
	225	+ delete t.value;
	226	+ toggle = !toggle;
	227	+ }
	228	+ if (!toggle) {
	229	+ // add end tag
	230	+ out.push({type: 'ENDTAG', name: name});
	231	+ inserted++;
	232	+ }
	233	+ toggle = true;
	234	+ };
	235	+
	236	+ for (var i = 0, length = tokens.length; i < length; i++) {
	237	+ var token = tokens[i];
	238	+ switch (token.type) {
	239	+ case 'QUOTE':
	240	+ // depending on length, add starting 's to preceding text node
	241	+ // (if any)
	242	+ // add token index to italic/bold lists
	243	+ // add placeholder for token
	244	+ var qlen = token.value.length;
	245	+ switch (qlen) {
	246	+ case 2: italics.push(i + inserted); out.push(token); break;
	247	+ case 3: bolds.push(i + inserted); out.push(token); break;
	248	+ case 4:
	249	+ token.value = "'''";
	250	+ if (i > 0 && tokens[i-1].type === 'TEXT') {
	251	+ tokens[i-1].value += "'";
	252	+ } else {
	253	+ out.push({type: 'TEXT', value: "'"});
	254	+ inserted++;
	255	+ }
	256	+ bolds.push(i + inserted);
	257	+ out.push(token);
	258	+ break;
	259	+ case 5:
	260	+ // order does not matter here, will be fixed
	261	+ // by HTML parser backend
	262	+ italics.push(i + inserted);
	263	+ out.push({type: 'QUOTE', value: "''"});
	264	+ inserted++;
	265	+ bolds.push(i + inserted);
	266	+ out.push({type: 'QUOTE', value: "'''"});
	267	+ break;
	268	+ default: // longer than 5, only use the last 5 ticks
	269	+ token.value = "'''''";
	270	+ var newvalue = token.value.substr(0, qlen - 5 );
	271	+ if (i > 0 && tokens[i-1].type === 'TEXT') {
	272	+ tokens[i-1].value += newvalue;
	273	+ } else {
	274	+ out.push({type: 'TEXT', value: newvalue});
	275	+ inserted++;
	276	+ }
	277	+ italics.push(i + inserted);
	278	+ out.push({type: 'QUOTE', value: "''"});
	279	+ inserted++;
	280	+ bolds.push(i + inserted);
	281	+ out.push({type: 'QUOTE', value: "'''"});
	282	+ break;
	283	+ }
	284	+ break;
	285	+
	286	+ case 'NEWLINE':
	287	+ // balance out tokens, convert placeholders into tags
	288	+ if (italics.length % 2 && bolds.length % 2) {
	289	+ dp("balancing!");
	290	+ var firstsingleletterword = -1,
	291	+ firstmultiletterword = -1,
	292	+ firstspace = -1;
	293	+ for (var j = 0; j < bolds.length; j++) {
	294	+ var ticki = bolds[j];
	295	+ if (ticki > 0 && out[ticki - 1].type === 'TEXT') {
	296	+ var txt = out[ticki - 1],
	297	+ lastchar = txt.value[txt.value.length - 1],
	298	+ secondtolastchar = txt.value[txt.value.length - 2];
	299	+ dp('txt: ' + pp(txt));
	300	+ if (lastchar === ' ' && firstspace === -1) {
	301	+ firstspace = j;
	302	+ } else if (lastchar !== ' ') {
	303	+ if ( secondtolastchar === ' ' &&
	304	+ firstsingleletterword === -1)
	305	+ {
	306	+ firstsingleletterword = j;
	307	+ } else if ( firstmultiletterword == -1) {
	308	+ firstmultiletterword = j;
	309	+ }
	310	+ }
	311	+ }
	312	+ }
	313	+
	314	+
	315	+ // now see if we can convert a bold to an italic and
	316	+ // an apostrophe
	317	+ if (firstsingleletterword > -1) {
	318	+ convertBold(firstsingleletterword);
	319	+ } else if (firstmultiletterword > -1) {
	320	+ convertBold(firstmultiletterword);
	321	+ } else if (firstspace > -1) {
	322	+ convertBold(firstspace);
	323	+ }
	324	+ }
	325	+
	326	+ quotesToTags(bolds, 'b');
	327	+ quotesToTags(italics, 'i');
	328	+ bolds = [];
	329	+ italics = [];
	330	+ out.push(token);
	331	+ break;
	332	+ default:
	333	+ out.push(token);
	334	+ }
	335	+ }
	336	+ return out;
	337	+ };
	338	+
	339	+
	340	+ /* End static utilities */
	341	+
	342	+ /*
	343	+ * Flags for specific parse environments (inside tables, links etc). Flags
	344	+ * trigger syntactic stops in the inline_breaks production, which
	345	+ * terminates inline and attribute matches. Flags merely reduce the number
	346	+ * of productions needed: The grammar is still context-free as the
	347	+ * productions can just be unrolled for all combinations of environments
	348	+ * at the cost of a much larger grammar.
	349	+ */
	350	+ var syntaxFlags = {};
	351	+ var setFlag = function(flag) {
	352	+ if (syntaxFlags[flag] !== undefined) {
	353	+ syntaxFlags[flag]++;
	354	+ } else {
	355	+ syntaxFlags[flag] = 1;
	356	+ }
	357	+ return true;
	358	+ };
	359	+ var clearFlag = function(flag) {
	360	+ syntaxFlags[flag]--;
	361	+ };
	362	+
	363	+ // Start position of top-level block
	364	+ // Could also provide positions for lower-level blocks using a stack.
	365	+ var blockStart = 0;
	366	+
	367	+ // Start position of generic tag production
	368	+ var tagStartPos = 0;
	369	+
	370	+ // cache the input length
	371	+ var inputLength = input.length;
	372	+
	373	+ // pseudo-production that matches at end of input
	374	+ var isEOF = function (pos) {
	375	+ return pos === inputLength;
	376	+ };
	377	+
	378	+ // text start position
	379	+ var textStart = 0;
	380	+
	381	+ // hack to support numbered external links ([http://example.com]).
	382	+ // XXX: Move to token stream transform after templates are expanded!
	383	+ var linkCount = 1;
	384	+
	385	+ // Define block-level tags in JS, so we can use toLowerCase to match tags
	386	+ // case-independently. This would be quite ugly (and possibly slower) if
	387	+ // done manually in the grammar.
	388	+ var block_names = (function () {
	389	+ var names = [ "p", "table", "td", "tr", "ul", "ol"
	390	+ , "li", "dl", "dt", "dd", "div", "center"
	391	+ , "blockquote" ];
	392	+ var bnames = {};
	393	+ for(var i = 0, l = names.length; i < l; i++) {
	394	+ bnames[names[i]] = true;
	395	+ }
	396	+ return bnames;
	397	+ })();
	398	+
	399	+
	400	+}
	401	+
	402	+start
	403	+ = e:toplevelblock* newline* {
	404	+ return flatten(e);
	405	+ }
	406	+
	407	+
	408	+/* All chars that cannot start syntactic structures in the middle of a line
	409	+ * XXX: ] and other end delimiters should probably only be activated inside
	410	+ * structures to avoid unnecessarily leaving the text production on plain
	411	+ * content. */
	412	+
	413	+text_char = [^'<~[{\n\r:\]}\|!=]
	414	+
	415	+text = t:text_char+ { return t.join(''); }
	416	+
	417	+/* Explanation of chars
	418	+ * ' quotes (italic/bold)
	419	+ * < start of xmlish_tag
	420	+ * ~ signatures/dates
	421	+ * [ start of links
	422	+ * { start of parser functions, transclusion and template args
	423	+ * \n all sort of block-level markup at start of line
	424	+ * \r ditto
	425	+ * h http(s) urls
	426	+ * n nntp(s) urls
	427	+ * m mailto urls
	428	+ *
	429	+ * ! and \| table cell delimiters, might be better to specialize those
	430	+ * = headings - also specialize those!
	431	+ *
	432	+ * The following chars are also included for now, but only apply in some
	433	+ * contexts and should probably be enabled only in those:
	434	+ * : separate definition in ; term : definition
	435	+ * ] end of link
	436	+ * } end of parser func/transclusion/template arg
	437	+ */
	438	+
	439	+urltext = ( t:[^'<~[{\n\rfghimnstw\|!:\]} &=]+ { return t.join(''); }
	440	+ / htmlentity
	441	+ / urllink
	442	+ // Convert a space directly before ':' into a non-breaking space (&nbsp;)
	443	+ // XXX: This should be moved to a serializer
	444	+ / ' ' & ':' { return "\u00a0"; }
	445	+ / t:text_char )+
	446	+
	447	+/*
	448	+ '//', // for protocol-relative URLs, but not in text!
	449	+ 'ftp://',
	450	+ 'git://',
	451	+ 'gopher://',
	452	+ 'http://',
	453	+ 'https://',
	454	+ 'irc://',
	455	+ 'ircs://', // @bug 28503
	456	+ 'mailto:',
	457	+ 'mms://',
	458	+ 'news:',
	459	+ 'nntp://', // @bug 3808 RFC 1738
	460	+ 'svn://',
	461	+ 'telnet://', // Well if we're going to support the above.. -ævar
	462	+ 'worldwind://',
	463	+*/
	464	+
	465	+// Old version
	466	+//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
	467	+
	468	+// Experimental tweaked version: avoid expensive single-char substrings
	469	+// This did not bring the expected performance boost, however.
	470	+//text = [A-Za-z0-9,._ -] {
	471	+// textStart = pos;
	472	+//
	473	+// var res = input.substr(textStart - 1, inputLength)
	474	+// .match(/[A-Za-z0-9,._ -]+/)[0];
	475	+// pos = pos + (res.length - 1);
	476	+// return res
	477	+// }
	478	+
	479	+htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
	480	+ return unentity("&" + c.join('') + ";")
	481	+}
	482	+
	483	+space
	484	+ = s:[ \t]+ { return s.join(''); }
	485	+
	486	+optionalSpaceToken
	487	+ = s:space* {
	488	+ if ( s.length ) {
	489	+ return [{type: 'TEXT', value: s.join('')}];
	490	+ } else {
	491	+ return [];
	492	+ }
	493	+ }
	494	+
	495	+
	496	+// Start of line
	497	+sol = (newline / & { return pos === 0; } { return true; })
	498	+ cn:(c:comment n:newline? { return [c, {type: 'TEXT', value: n}] })* {
	499	+ return [{type: 'NEWLINE'}].concat(cn);
	500	+ }
	501	+
	502	+eof = & { return isEOF(pos); } { return true; }
	503	+
	504	+
	505	+newline
	506	+ = '\n' / '\r\n'
	507	+
	508	+eolf = newline / eof
	509	+
	510	+toplevelblock
	511	+ = & { blockStart = pos; return true; } b:block {
	512	+ b = flatten(b);
	513	+ var bs = b[0];
	514	+ //dp('toplevelblock:' + pp(b));
	515	+ if (bs.attribs === undefined) {
	516	+ bs.attribs = [];
	517	+ }
	518	+ bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
	519	+ // XXX: only run this for lines that actually need it!
	520	+ b.push({type: 'NEWLINE'});
	521	+ b = doQuotes(b);
	522	+ return b;
	523	+ }
	524	+
	525	+block
	526	+ = block_lines
	527	+ / pre
	528	+ / comment &eolf
	529	+ / nowiki
	530	+ / pre
	531	+ / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
	532	+ / para
	533	+ / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
	534	+ / s:sol {
	535	+ if (s) {
	536	+ return [s, {type: 'NEWLINE'}];
	537	+ } else {
	538	+ return [{type: 'NEWLINE'}];
	539	+ }
	540	+ }
	541	+
	542	+block_lines
	543	+ = s:sol
	544	+ // eat an empty line before the block
	545	+ s2:(ss:space* so:sol { return [{type: 'TEXT', value: ss.join('')}].concat(so) })?
	546	+ bl:block_line {
	547	+ var s2_ = (s2 !== '') ? s2 : [];
	548	+ return s.concat(s2_, bl);
	549	+ }
	550	+
	551	+// Block structures with start-of-line wiki syntax
	552	+block_line
	553	+ = h
	554	+ / table
	555	+ / lists
	556	+ // tag-only lines should not trigger pre
	557	+ / st:optionalSpaceToken
	558	+ bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
	559	+ &eolf {
	560	+ return st.concat(bt);
	561	+ }
	562	+ / pre_indent
	563	+ / pre
	564	+
	565	+
	566	+
	567	+
	568	+// TODO: convert inline content to annotations!
	569	+para
	570	+ = s1:sol s2:sol c:inlineline {
	571	+ return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
	572	+ }
	573	+
	574	+br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
	575	+
	576	+// Syntax stops to limit inline expansion depending on syntactic context
	577	+inline_breaks
	578	+ =
	579	+ & { // Important hack: disable caching for this production, as the default
	580	+ // cache key does not take into account flag states!
	581	+ cacheKey = '';
	582	+ return true;
	583	+ }
	584	+ & { return syntaxFlags['table']; }
	585	+ a:(newline [!\|] / '\|\|' / '!!' / '\|}') { dp("table break" + pp(a) + pos); return true; }
	586	+ / & { return (syntaxFlags['colon'] &&
	587	+ ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
	588	+ ! syntaxFlags.linkdesc); } ":" { return true; }
	589	+ / & { return syntaxFlags['extlink']; } "]" { return true; }
	590	+ / & { return syntaxFlags['linkdesc']; } link_end { return true; }
	591	+ / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
	592	+ / & { return syntaxFlags['template']; } ('\|' / '}}') { return true; }
	593	+
	594	+inline
	595	+ = c:(urltext / (! inline_breaks (inline_element / . )))+ {
	596	+ var out = [];
	597	+ var text = [];
	598	+ c = flatten(c);
	599	+ for (var i = 0, l = c.length; i < l; i++) {
	600	+ var ci = c[i];
	601	+ if (typeof ci == 'string') {
	602	+ text.push(ci);
	603	+ } else {
	604	+ if (text.length) {
	605	+ out.push({ type: "TEXT", value: text.join('') });
	606	+ text = [];
	607	+ }
	608	+ out.push(ci);
	609	+ }
	610	+ }
	611	+ if (text.length) {
	612	+ out.push({ type: 'TEXT', value: text.join('') });
	613	+ }
	614	+ //dp('inline out:' + pp(out));
	615	+ return out;
	616	+}
	617	+
	618	+
	619	+inlineline
	620	+ = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
	621	+ var out = [];
	622	+ var text = [];
	623	+ c = flatten(c);
	624	+ for (var i = 0; i < c.length; i++) {
	625	+ var ci = c[i]
	626	+ if (typeof ci == 'string') {
	627	+ text.push(ci);
	628	+ } else {
	629	+ if (text.length) {
	630	+ out.push({type: 'TEXT', value: text.join('')});
	631	+ text = [];
	632	+ }
	633	+ out.push(ci);
	634	+ }
	635	+ }
	636	+ if (text.length) {
	637	+ out.push({type: 'TEXT', value: text.join('')});
	638	+ }
	639	+ //dp('inlineline out:' + pp(out));
	640	+ return out;
	641	+}
	642	+
	643	+inline_element
	644	+ = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
	645	+ & '<' ( comment / xmlish_tag )
	646	+ / & '{' ( template / tplarg )
	647	+ / & '[' ( wikilink / extlink )
	648	+ / & "'" quote
	649	+
	650	+/* Headings */
	651	+
	652	+h = & "=" // guard, to make sure '='+ will match.
	653	+ // XXX: Also check to end to avoid inline parsing?
	654	+ r:(
	655	+ s:'='+ // moved in here to make s accessible to inner action
	656	+ & { return setFlag('h'); }
	657	+ c:inlineline
	658	+ e:'='+
	659	+ spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
	660	+ &eolf
	661	+ {
	662	+ clearFlag('h');
	663	+ var level = Math.min(s.length, e.length);
	664	+ // convert surplus equals into text
	665	+ if(s.length > level) {
	666	+ var extras = s.substr(0, s.length - level);
	667	+ if(c[0].type == 'TEXT') {
	668	+ c[0].value = extras + c[0].value;
	669	+ } else {
	670	+ c.unshift({type: 'TEXT', value: extras});
	671	+ }
	672	+ }
	673	+ if(e.length > level) {
	674	+ var extras = e.substr(0, e.length - level),
	675	+ lastElem = c[c.length - 1];
	676	+ if(lastElem.type == 'TEXT') {
	677	+ lastElem.value = lastElem.value + extras;
	678	+ } else {
	679	+ c.push({type: 'TEXT', value: extras});
	680	+ }
	681	+ }
	682	+
	683	+ return [{type: 'TAG', name: 'h' + level}]
	684	+ .concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
	685	+ }
	686	+ / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
	687	+ ) { return r }
	688	+
	689	+
	690	+pre_indent
	691	+ = l:pre_indent_line ls:(sol pre_indent_line)* {
	692	+ return [{type: 'TAG', name: 'pre'}]
	693	+ .concat( [l], ls
	694	+ , [{type: 'ENDTAG', name: 'pre'}]);
	695	+ }
	696	+pre_indent_line = space l:inlineline {
	697	+ return [{type: 'TEXT', value: '\n'}].concat(l);
	698	+}
	699	+
	700	+
	701	+comment
	702	+ = '<!--' c:comment_chars* ('-->' / eof)
	703	+ cs:(space* newline space* cn:comment { return cn })* {
	704	+ return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
	705	+ }
	706	+
	707	+comment_chars
	708	+ = c:[^-] { return c; }
	709	+ / c:'-' !'->' { return c; }
	710	+
	711	+
	712	+urllink
	713	+ = target:url {
	714	+ return [ { type: 'TAG',
	715	+ name: 'a',
	716	+ attribs: [['href', target]] }
	717	+ , {type: 'TEXT', value: target}
	718	+ , {type: 'ENDTAG', name: 'a'}
	719	+ ];
	720	+ }
	721	+
	722	+extlink
	723	+ = "["
	724	+ & { return setFlag('extlink'); }
	725	+ target:url
	726	+ space*
	727	+ text:inlineline?
	728	+ "]" {
	729	+ clearFlag('extlink');
	730	+ if ( text == '' ) {
	731	+ // XXX: Link numbering should be implemented in post-processor.
	732	+ text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
	733	+ linkCount++;
	734	+ }
	735	+ return [ { type: 'TAG',
	736	+ name: 'a',
	737	+ attribs: [['href', target]] } ]
	738	+ .concat( text
	739	+ , [{type: 'ENDTAG', name: 'a'}]);
	740	+ }
	741	+ / "[" & { clearFlag('extlink'); return false; }
	742	+
	743	+/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
	744	+ * be configured dynamically. */
	745	+url_protocol
	746	+ = '//' // for protocol-relative URLs
	747	+ / 'ftp://'
	748	+ / 'git://'
	749	+ / 'gopher://'
	750	+ / 'http://'
	751	+ / 'https://'
	752	+ / 'irc://'
	753	+ / 'ircs://' // @bug 28503
	754	+ / 'mailto:'
	755	+ / 'mms://'
	756	+ / 'news:'
	757	+ / 'nntp://' // @bug 3808 RFC 1738
	758	+ / 'svn://'
	759	+ / 'telnet://' // Well if we're going to support the above.. -ævar
	760	+ / 'worldwind://'
	761	+
	762	+// javascript does not support unicode features..
	763	+unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
	764	+
	765	+
	766	+urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
	767	+ return decodeURI("%" + c0 + c1)
	768	+}
	769	+
	770	+//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
	771	+url
	772	+ = proto:url_protocol
	773	+ rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
	774	+ / s:[.:,] !(space / eolf) { return s }
	775	+ / htmlentity
	776	+ / urlencoded_char
	777	+ / [&%] )+
	778	+{
	779	+ return proto + rest.join('');
	780	+}
	781	+
	782	+template
	783	+ = "{{" target:template_target
	784	+ params:(newline? "\|" newline? p:template_param { return p })*
	785	+ "}}" {
	786	+ var obj = { type: 'TAG', name: 'template',
	787	+ attribs: [['target', target]],
	788	+ args: {}}
	789	+ if (params && params.length) {
	790	+ var position = 1;
	791	+ for ( var i = 0, l = params.length; i < l; i++ ) {
	792	+ var param = params[i];
	793	+ if ( param[0] === null ) {
	794	+ obj.args[position] = param[1];
	795	+ position++;
	796	+ } else {
	797	+ obj.args[param[0]] = param[1];
	798	+ }
	799	+ }
	800	+ // HACK: temporarily also push the args into an attribute
	801	+ // (just for debugging)
	802	+ obj.attribs.push(['data-args', JSON.stringify(obj.args)]);
	803	+ }
	804	+ // Should actually use a self-closing tag here, but the Node HTML5
	805	+ // parser only recognizes known self-closing tags for now, so use an
	806	+ // explicit end tag for now.
	807	+ //console.log(pp(obj));
	808	+ return obj;
	809	+ }
	810	+
	811	+template_target
	812	+ = h:( !"}}" x:([^\|\n]) { return x } )* { return h.join(''); }
	813	+
	814	+template_param
	815	+ = name:template_param_name space* "=" space* c:template_param_text {
	816	+ return [name, c];
	817	+ } / c:template_param_text {
	818	+ return [null, c];
	819	+ }
	820	+
	821	+tplarg
	822	+ = "{{{" name:link_target params:("\|" p:template_param { return p })* "}}}" {
	823	+ var obj = {
	824	+ type: 'SELFCLOSINGTAG',
	825	+ name: 'templatearg',
	826	+ attribs: [['argname', name]]
	827	+ };
	828	+ if (params && params.length) {
	829	+ // HACK, not final.
	830	+ obj.attribs.push(['data-args', JSON.stringify(params)]);
	831	+ }
	832	+ return obj;
	833	+ }
	834	+
	835	+template_param_name
	836	+ = h:( !"}}" x:([^=\|\n]) { return x } )* { return h.join(''); }
	837	+
	838	+template_param_text
	839	+ = & { return setFlag('template') }
	840	+ il:inline+ {
	841	+ clearFlag('template');
	842	+ return il;
	843	+ }
	844	+ / & { clearFlag('template'); return false; }
	845	+
	846	+wikilink
	847	+ = "[["
	848	+ ! url
	849	+ target:link_target text:("\|" lt:link_text { return lt })* "]]" suffix:text? {
	850	+ var obj = {
	851	+ type: 'TAG',
	852	+ name: 'a',
	853	+ attribs: [['data-type', 'internal']]
	854	+ };
	855	+ obj.attribs.push(['href', target]);
	856	+ if (text && text.length) {
	857	+ var textTokens = text;
	858	+ } else {
	859	+ if (suffix !== '') {
	860	+ target += suffix;
	861	+ }
	862	+ var textTokens = [{type: 'TEXT', value: target}];
	863	+ }
	864	+ return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
	865	+ }
	866	+
	867	+link_target
	868	+ = h:( c:[^\|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
	869	+ / !"]]"
	870	+ hi:(
	871	+ [^\|%\n]
	872	+ / urlencoded_char
	873	+ / '%'
	874	+ ) { return hi }
	875	+ )* { return h.join(''); }
	876	+
	877	+link_text
	878	+ = h:( & { return setFlag('linkdesc'); }
	879	+ x:inlineline { return x }
	880	+ )* {
	881	+ clearFlag('linkdesc')
	882	+ return h;
	883	+ }
	884	+ / & { clearFlag('linkdesc') } { return null; }
	885	+
	886	+link_end = "]]"
	887	+
	888	+/* Generic quote production for italic and bold, further processed in a token
	889	+ * stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
	890	+ * for each line of text to balance quotes per line.
	891	+ *
	892	+ * We are not using a simple pair rule here as we need to support mis-nested
	893	+ * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
	894	+ * all not context free. */
	895	+quote = "''" x:"'"* {
	896	+ return {
	897	+ type : 'QUOTE',
	898	+ value: "''" + x.join('')
	899	+ }
	900	+}
	901	+
	902	+/* XXX: Extension tags can require a change in the tokenizer mode, which
	903	+ * returns any text between extension tags verbatim. For now, we simply
	904	+ * continue to parse the contained text and return the tokens. The original
	905	+ * input source can be recovered from the source positions added on tag
	906	+ * tokens. This won't however work in all cases. For example, a comment start
	907	+ * (<!--) between extension tags would cause the remaining text to be consumed
	908	+ * as a comment. To avoid this, we might need to look ahead for the end tag
	909	+ * and limit the content parsing to this section. */
	910	+
	911	+xmlish_tag = nowiki / generic_tag
	912	+
	913	+pre
	914	+ = "<pre"
	915	+ attribs:generic_attribute*
	916	+ ">"
	917	+ ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }
	918	+ / nowiki
	919	+ / !"</pre>" t2:. {return {type:'TEXT',value:t2}})+
	920	+ ("</pre>" / eof) {
	921	+ // return nowiki tags as well?
	922	+ //console.log('inpre');
	923	+ return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
	924	+ .concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
	925	+ }
	926	+ / "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
	927	+
	928	+nowiki
	929	+ = "<nowiki>" nc:nowiki_content "</nowiki>" {
	930	+ // console.log(pp(nc));
	931	+ return nc;
	932	+ }
	933	+ / "<nowiki>" {
	934	+ //console.log('nowiki fallback');
	935	+ return [{type: 'TEXT', value: '<nowiki>'}];
	936	+ }
	937	+ / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
	938	+
	939	+nowiki_content
	940	+ = ts:( t:[^<]+ { return t.join('') }
	941	+ / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
	942	+ //console.log('nested pre in nowiki');
	943	+ return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
	944	+ }
	945	+ / (!("</nowiki>" / "</pre>") c:. {return c})
	946	+ )* {
	947	+ // return nowiki tags as well?
	948	+ return [{type: 'TEXT', value: ts.join('')}];
	949	+ }
	950	+
	951	+// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
	952	+// following paragraphs
	953	+block_tag
	954	+ = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
	955	+ attribs:generic_attribute*
	956	+ selfclose:"/"?
	957	+ ">" {
	958	+ if (block_names[name.toLowerCase()] !== true) {
	959	+ // abort match if tag is not block-level
	960	+ return null;
	961	+ }
	962	+ var res = {name: name, attribs: attribs};
	963	+ if ( end != '' ) {
	964	+ res.type = 'ENDTAG';
	965	+ } else if ( selfclose != '' ) {
	966	+ res.type = 'SELFCLOSINGTAG';
	967	+ } else {
	968	+ res.type = 'TAG';
	969	+ }
	970	+ return [res];
	971	+ }
	972	+
	973	+/* Generic XML-like tags
	974	+ *
	975	+ * These also cover extensions (including Cite), which will hook into the
	976	+ * token stream for further processing. The content of extension tags is
	977	+ * parsed as regular inline, but the source positions of the tag are added
	978	+ * to allow reconstructing the unparsed text from the input. */
	979	+
	980	+// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
	981	+// following paragraphs
	982	+generic_tag
	983	+ = "<"
	984	+ & { tagStartPos = pos; return true; } // remember the start position of this tag
	985	+ end:"/"? name:[0-9a-zA-Z]+
	986	+ attribs:generic_attribute*
	987	+ space*
	988	+ selfclose:"/"?
	989	+ ">" {
	990	+ var res = {name: name.join(''), attribs: attribs};
	991	+ if ( end != '' ) {
	992	+ res.type = 'ENDTAG';
	993	+ } else if ( selfclose != '' ) {
	994	+ res.type = 'SELFCLOSINGTAG';
	995	+ } else {
	996	+ res.type = 'TAG';
	997	+ }
	998	+ res.attribs.push(['data-sourceTagPos', (tagStartPos - 1) + ":" + pos]);
	999	+ return res;
	1000	+ }
	1001	+
	1002	+generic_attribute
	1003	+ = s:space*
	1004	+ name:generic_attribute_name
	1005	+ value:(space*
	1006	+ v:generic_attribute_value { return v })?
	1007	+{
	1008	+ if ( value !== '' ) {
	1009	+ return [name, value];
	1010	+ } else {
	1011	+ return [name,''];
	1012	+ }
	1013	+}
	1014	+
	1015	+// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
	1016	+// disallow newlines and \|.
	1017	+generic_attribute_name
	1018	+ = n:[^ \t\0/"'>=\n\|]+ {
	1019	+ return n.join('');
	1020	+ }
	1021	+
	1022	+generic_attribute_value
	1023	+ = "=" space* v:att_value {return v}
	1024	+
	1025	+att_value
	1026	+ = t:[^ \t'"<>='\n]+ { return t.join(''); }
	1027	+ // XXX: is "\"" also valid html? or just Wikitext?
	1028	+ / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
	1029	+ / '"' t:[^">]* '"' { return unquote('"', t.join('')); }
	1030	+
	1031	+
	1032	+/* Lists */
	1033	+lists = e:(dtdd / li) es:(sol (dtdd / li))*
	1034	+{
	1035	+ return annotateList( [ { type: 'TAG', name: 'list'} ]
	1036	+ .concat(flatten([e].concat(es))
	1037	+ ,[{ type: 'ENDTAG', name: 'list' }]));
	1038	+}
	1039	+
	1040	+li = bullets:list_char+
	1041	+ c:inlineline?
	1042	+ &eolf
	1043	+{
	1044	+ if ( c == '' )
	1045	+ c = [];
	1046	+ return [ { type: 'TAG',
	1047	+ name: 'listItem',
	1048	+ bullets: bullets }
	1049	+ , c ];
	1050	+}
	1051	+
	1052	+dtdd
	1053	+ = bullets:(!(";" !list_char) list_char)*
	1054	+ ";"
	1055	+ & {return setFlag('colon');}
	1056	+ c:inlineline
	1057	+ ":"
	1058	+ // Fortunately dtdds cannot be nested, so we can simply set the flag
	1059	+ // back to 0 to disable it.
	1060	+ & {syntaxFlags['colon'] = 0; return true;}
	1061	+ d:inlineline
	1062	+ &eolf {
	1063	+ // Convert trailing space into &nbsp; (disabled here; urltext emits \u00a0 before ':')
	1064	+ // XXX: This should be moved to a serializer
	1065	+ //var clen = c.length;
	1066	+ //if (clen && c[clen - 1].type === 'TEXT') {
	1067	+ // var val = c[clen - 1].value;
	1068	+ // if(val.length && val[val.length - 1] == ' ') {
	1069	+ // c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
	1070	+ // }
	1071	+ //}
	1072	+
	1073	+ return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ] // NOTE(review): bullets looks like an array here, so + stringifies it - confirm annotateList expects that
	1074	+ .concat( c
	1075	+ ,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
	1076	+ , d );
	1077	+ }
	1078	+ // Fall-back case to clear the colon flag
	1079	+ / & { return true; } { syntaxFlags['colon'] = 0; return null; }
	1080	+
	1081	+
	1082	+list_char = [*#:;]
	1083	+
	1084	+
	1085	+/* Tables */
	1086	+table
	1087	+ = tas:table_start space* c:table_caption? b:table_body? table_end {
	1088	+ var res = {type: 'TAG', name: 'table'}
	1089	+ var body = b !== '' ? b : [];
	1090	+ dp("body: " + pp(body));
	1091	+ if (tas.length > 0) {
	1092	+ // FIXME: actually parse and build structure
	1093	+ //res.attribs = [['data-unparsed', tas.join('')]];
	1094	+ res.attribs = tas;
	1095	+ }
	1096	+
	1097	+ if (c != '') {
	1098	+ var caption = [{type: 'TAG', name: 'caption'}]
	1099	+ .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
	1100	+ } else {
	1101	+ var caption = [];
	1102	+ }
	1103	+ //dp(pp(res));
	1104	+
	1105	+ return [res].concat(caption, body,
	1106	+ [{type: 'ENDTAG', name: 'table'}]);
	1107	+ }
	1108	+
	1109	+table_start
	1110	+ = "{\|"
	1111	+ res:(
	1112	+ & { setFlag('table'); return true; }
	1113	+ ta:generic_attribute*
	1114	+ {
	1115	+ dp("table_start " + pp(ta) + ", pos:" + pos);
	1116	+ return ta;
	1117	+ }
	1118	+ / & { clearFlag('table'); return false; } { return null; }
	1119	+ ) { return res }
	1120	+
	1121	+table_attribs
	1122	+ = text / ! inline_breaks !newline ![\|] c:. { return c }
	1123	+
	1124	+table_caption
	1125	+ = newline
	1126	+ "\|+" c:inline* {
	1127	+ return c;
	1128	+ }
	1129	+
	1130	+table_body
	1131	+ = //& { dp("table_body enter"); return true; }
	1132	+ firstrow:table_firstrow otherrows:table_row* {
	1133	+ /* dp('table first and otherrows: '
	1134	+ * + pp([firstrow].concat(otherrows))); */
	1135	+ return [firstrow].concat(otherrows);
	1136	+ }
	1137	+ / otherrows:table_row* {
	1138	+ //dp('table otherrows: ' + pp(otherrows));
	1139	+ return otherrows;
	1140	+ }
	1141	+
	1142	+table_firstrow
	1143	+ = td:table_data+ {
	1144	+ //dp('firstrow: ' + pp(td));
	1145	+ return [{ type: 'TAG', name: 'tr' }]
	1146	+ .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
	1147	+ }
	1148	+
	1149	+table_row
	1150	+ = //& { dp("table row enter"); return true; }
	1151	+ newline
	1152	+ "\|-" thtd_attribs? space* td:(table_data / table_header)* {
	1153	+ return [{type: 'TAG', name: 'tr'}]
	1154	+ .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
	1155	+ }
	1156	+
	1157	+table_data
	1158	+ = //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
	1159	+ ("\|\|" / newline "\|")
	1160	+ ! [}+-]
	1161	+ //& { dp('before attrib, pos=' + pos); return true; }
	1162	+ a:(as:generic_attribute+ space* "\|" !"\|" { return as } )?
	1163	+ //& { dp('past attrib, pos=' + pos); return true; }
	1164	+ // use inline_breaks to break on tr etc
	1165	+ td:(!inline_breaks
	1166	+ //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
	1167	+ b:block { return b })* {
	1168	+ if ( a == '' ) {
	1169	+ a = [];
	1170	+ }
	1171	+ //dp("table data result: " + pp(td) + ", attribts: " + pp(a));
	1172	+ return [{ type: 'TAG', name: 'td', attribs: a}]
	1173	+ .concat(td, [{type: 'ENDTAG', name: 'td'}]);
	1174	+ }
	1175	+
	1176	+table_header
	1177	+ = ("!!" / newline "!")
	1178	+ a:(as:generic_attribute+ "!" !"!" { return as } )?
	1179	+ c:inline {
	1180	+ if ( a == '' ) {
	1181	+ a = [];
	1182	+ }
	1183	+ return [{type: 'TAG', name: 'th', attribs: a}]
	1184	+ .concat(c, [{type: 'ENDTAG', name: 'th'}]);
	1185	+ }
	1186	+
	1187	+thtd_attribs
	1188	+ // In particular, do not match [\|\n]
	1189	+ = a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "\|" ! "\|" {
	1190	+ return a;
	1191	+ }
	1192	+
	1193	+
	1194	+table_end
	1195	+ = newline? "\|}" { clearFlag('table'); }
	1196	+ / newline? eof
	1197	+
	1198	+
	1199	+/* Tabs do not mix well with the hybrid production syntax */
	1200	+/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent: */
Property changes on: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
___________________________________________________________________
Added: svn:eol-style
1	1201	+ native

Status & tagging log

11:00, 8 December 2011 GWicke (talk | contribs) changed the status of r105536 [removed: new added: deferred]