r107921 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r107920‎ | r107921 | r107922 >
Date:18:44, 3 January 2012
Author:gwicke
Status:deferred
Tags:
Comment:
Land big TokenTransformDispatcher and eventization refactoring.

The TokenTransformDispatcher now actually implements an asynchronous, phased
token transformation framework as described in
https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations.

Additionally, the parser pipeline is now mostly held together using events.
The tokenizer still emits just a single event containing all tokens, as block-level
emission failed due to scoping issues specific to the PEG.js parser generator.
All stages clean up when receiving the end tokens, so that the full pipeline
can be used for repeated parsing.

The QuoteTransformer has not yet been fully adapted to the new interface, and
the Cite extension is disabled for now pending adaptation. Bold-italic related
tests are currently failing.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/ext.core.QuoteTransformer.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/html5/parser.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.TokenTransformDispatcher.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -382,33 +382,18 @@
383383 this.currentItem = item;
384384
385385 // Tokenize the input
386 - var res = pThingy.wikiTokenizer.tokenize(item.input);
 386+ pThingy.parse(item.input);
 387+ var doc = pThingy.document;
387388
388389 // Check for errors
389 - if (res.err) {
 390+ if (doc.err) {
390391 this.printTitle(item);
391392 this.failParseTests++;
392393 console.log('PARSE FAIL', res.err);
393394 } else {
394 - //var res = es.HtmlSerializer.stringify(tokens,environment);
 395+ // Check the result vs. the expected result.
 396+ this.checkResult( this.currentItem, doc.body.innerHTML );
395397
396 - //Slightly better token output debugging:
397 - //console.log( util.inspect( res.tokens, false, null ).yellow);
398 -
399 - // Transform tokens using the TokenTransformDispatcher. When done, the
400 - // TokenTransformDispatcher calls buildTree() and checkResult() with the
401 - // transformed tokens.
402 -
403 - //console.log(JSON.stringify(res.tokens, null, 2));
404 -
405 - pThingy.tokenDispatcher.transformTokens( res.tokens );
406 -
407 - // XXX make this NOT a property
408 - var out = pThingy.document.body.innerHTML;
409 -
410 - // Finally, check the result vs. the expected result.
411 - this.checkResult( this.currentItem, out );
412 -
413398 if ( this.argv.wikidom ) {
414399 // Test HTML DOM -> WikiDOM conversion
415400 this.printWikiDom( pThingy.getWikiDom() );
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.TokenTransformDispatcher.js
@@ -2,27 +2,14 @@
33 * expansion. Individual transformations register for the token types they are
44 * interested in and are called on each matching token.
55 *
6 - * A transformer might set TokenContext.token to null, a single token, or an
7 - * array of tokens before returning it.
8 - * - Null removes the token and stops further processing for this token.
9 - * - A single token is further processed using the remaining transformations
10 - * registered for this token, and finally placed in the output token list.
11 - * - A list of tokens stops the processing for this token. Instead, processing
12 - * restarts with the first returned token.
13 - *
14 - * Additionally, transformers performing asynchronous actions on a token can
15 - * create a new TokenAccumulator using .newAccumulator(). This creates a new
16 - * accumulator for each asynchronous result, with the asynchronously processed
17 - * token last in its internal accumulator. This setup avoids the need to apply
18 - * operational-transform-like index transformations when parallel expansions
19 - * insert tokens in front of other ongoing expansion tasks.
 6+ * See
 7+ * https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations
 8+ * for more documentation.
209 *
21 - * XXX: I am not completely happy with the mutable TokenContext construct. At
22 - * least the token should probably be passed as a separate argument. Also,
23 - * integrate the general environment (configuration, cache etc). (gwicke)
24 - * */
 10+ * @author Gabriel Wicke <gwicke@wikimedia.org>
 11+ */
2512
26 -$ = require('jquery');
 13+var events = require('events');
2714
2815 /**
2916 * Central dispatcher for potentially asynchronous token transformations.
@@ -32,347 +19,603 @@
3320 * @param {Function} callback, a callback function accepting a token list as
3421 * its only argument.
3522 */
36 -function TokenTransformDispatcher( callback ) {
37 - this.cb = callback; // Called with transformed token list when done
 23+function TokenTransformDispatcher( ) {
3824 this.transformers = {
39 - tag: {}, // for TAG, ENDTAG, SELFCLOSINGTAG, keyed on name
40 - text: [],
41 - newline: [],
42 - comment: [],
43 - end: [], // eof
44 - martian: [], // none of the above (unknown token type)
45 - any: [] // all tokens, before more specific handlers are run
 25+ // phase 0 and 1, rank 2 marks tokens as fully processed for these
 26+ // phases.
 27+ 2: {
 28+ tag: {}, // for TAG, ENDTAG, SELFCLOSINGTAG, keyed on name
 29+ text: [],
 30+ newline: [],
 31+ comment: [],
 32+ end: [], // eof
 33+ martian: [], // none of the above (unknown token type)
 34+ any: [] // all tokens, before more specific handlers are run
 35+ },
 36+ // phase 3, with ranks >= 2 but < 3. 3 marks tokens as fully
 37+ // processed.
 38+ 3: {
 39+ tag: {}, // for TAG, ENDTAG, SELFCLOSINGTAG, keyed on name
 40+ text: [],
 41+ newline: [],
 42+ comment: [],
 43+ end: [], // eof
 44+ martian: [], // none of the above (unknown token type)
 45+ any: [] // all tokens, before more specific handlers are run
 46+ }
4647 };
4748 this.reset();
4849 }
4950
 51+// Inherit from EventEmitter
 52+TokenTransformDispatcher.prototype = new events.EventEmitter();
 53+
5054 /**
 55+ * Register to a token source, normally the tokenizer.
 56+ * The event emitter emits an 'tokens' event which contains a chunk of tokens,
 57+ * and signals the end of tokens by triggering the 'end' event.
 58+ *
 59+ * @param {Object} EventEmitter token even emitter.
 60+ */
 61+TokenTransformDispatcher.prototype.subscribeToTokenEmitter = function ( tokenEmitter ) {
 62+ tokenEmitter.addListener('chunk', this.transformTokens.bind( this ) );
 63+ tokenEmitter.addListener('end', this.onEndEvent.bind( this ) );
 64+};
 65+
 66+
 67+/**
5168 * Reset the internal token and outstanding-callback state of the
5269 * TokenTransformDispatcher, but keep registrations untouched.
5370 *
5471 * @method
5572 */
56 -TokenTransformDispatcher.prototype.reset = function () {
 73+TokenTransformDispatcher.prototype.reset = function ( env ) {
 74+ this.tailAccumulator = undefined;
 75+ this.phase2TailCB = this.returnTokens01.bind( this );
5776 this.accum = new TokenAccumulator(null);
5877 this.firstaccum = this.accum;
59 - this.outstanding = 1; // Number of outstanding processing steps
60 - // (e.g., async template fetches/expansions)
 78+ this.prevToken = undefined;
 79+ this.frame = {
 80+ args: {}, // no arguments at the top level
 81+ env: this.env
 82+ };
 83+ // Should be as static as possible re this and frame
 84+ // This is circular, but that should not really matter for non-broken GCs
 85+ // that handle pure JS ref loops.
 86+ this.frame.transformPhase = this.transformPhase01.bind( this, this.frame );
6187 };
6288
 89+TokenTransformDispatcher.prototype._rankToPhase = function ( rank ) {
 90+ if ( rank < 0 || rank > 3 ) {
 91+ throw "TransformDispatcher error: Invalid transformation rank " + rank;
 92+ }
 93+ if ( rank <= 2 ) {
 94+ return 2;
 95+ } else {
 96+ return 3;
 97+ }
 98+};
 99+
63100 /**
64 - * Append a listener registration. The new listener will be executed after
65 - * other listeners for the same token have been called.
 101+ * Add a transform registration.
66102 *
67103 * @method
68 - * @param {Function} listener, a function accepting a TokenContext and
69 - * returning a TokenContext.
 104+ * @param {Function} transform.
70105 * @param {String} type, one of 'tag', 'text', 'newline', 'comment', 'end',
71106 * 'martian' (unknown token), 'any' (any token, matched before other matches).
72107 * @param {String} tag name for tags, omitted for non-tags
73108 */
74 -TokenTransformDispatcher.prototype.appendListener = function ( listener, type, name ) {
 109+TokenTransformDispatcher.prototype.addTransform = function ( transformation, rank, type, name ) {
 110+ var phase = this._rankToPhase( rank ),
 111+ transArr,
 112+ transformer = {
 113+ transform: transformation,
 114+ rank: rank
 115+ };
75116 if ( type === 'tag' ) {
76117 name = name.toLowerCase();
77 - if ( $.isArray(this.transformers.tag.name) ) {
78 - this.transformers.tag[name].push(listener);
79 - } else {
80 - this.transformers.tag[name] = [listener];
 118+ transArr = this.transformers[phase].tag[name];
 119+ if ( ! transArr ) {
 120+ transArr = this.transformers[phase].tag[name] = [];
81121 }
82122 } else {
83 - this.transformers[type].push(listener);
 123+ transArr = this.transformers[phase][type];
84124 }
 125+ transArr.push(transformer);
 126+ // sort ascending by rank
 127+ transArr.sort( function ( t1, t2 ) { return t1.rank - t2.rank; } );
85128 };
86129
87130 /**
88 - * Prepend a listener registration. The new listener will be called before
89 - * other listeners for the same token have been called.
 131+ * Remove a transform registration
90132 *
91133 * @method
92 - * @param {Function} listener, a function accepting a TokenContext and
93 - * returning a TokenContext.
 134+ * @param {Number} rank, the numeric rank of the handler.
94135 * @param {String} type, one of 'tag', 'text', 'newline', 'comment', 'end',
95136 * 'martian' (unknown token), 'any' (any token, matched before other matches).
96137 * @param {String} tag name for tags, omitted for non-tags
97138 */
98 -TokenTransformDispatcher.prototype.prependListener = function ( listener, type, name ) {
 139+TokenTransformDispatcher.prototype.removeTransform = function ( rank, type, name ) {
 140+ var i = -1,
 141+ phase = this._rankToPhase( rank ),
 142+ ts;
 143+
 144+ function rankUnEqual ( i ) {
 145+ return i.rank !== rank;
 146+ }
 147+
99148 if ( type === 'tag' ) {
100149 name = name.toLowerCase();
101 - if ( $.isArray(this.transformers.tag.name) ) {
102 - this.transformers.tag[name].unshift(listener);
103 - } else {
104 - this.transformers.tag[name] = [listener];
 150+ var maybeTransArr = this.transformers[phase].tag.name;
 151+ if ( maybeTransArr ) {
 152+ this.transformers[phase].tag.name = maybeTransArr.filter( rankUnEqual );
105153 }
106154 } else {
107 - this.transformers[type].unshift(listener);
 155+ this.transformers[phase][type] = this.transformers[phase][type].filter( rankUnEqual ) ;
108156 }
109157 };
110158
111159 /**
112 - * Remove a listener registration
113 - *
114 - * XXX: matching the function for equality is not ideal. Use a string key
115 - * instead?
116 - *
117 - * @method
118 - * @param {Function} listener, a function accepting a TokenContext and
119 - * returning a TokenContext.
120 - * @param {String} type, one of 'tag', 'text', 'newline', 'comment', 'end',
121 - * 'martian' (unknown token), 'any' (any token, matched before other matches).
122 - * @param {String} tag name for tags, omitted for non-tags
 160+ * Enforce separation between phases when token types or tag names have
 161+ * changed, or when multiple tokens were returned. Processing will restart
 162+ * with the new rank.
123163 */
124 -TokenTransformDispatcher.prototype.removeListener = function ( listener, type, name ) {
125 - var i = -1;
126 - var ts;
127 - if ( type === 'tag' ) {
128 - name = name.toLowerCase();
129 - if ( $.isArray(this.transformers.tag.name) ) {
130 - ts = this.transformers.tag[name];
131 - i = ts.indexOf(listener);
 164+TokenTransformDispatcher.prototype._resetTokenRank = function ( res, transformer ) {
 165+ if ( res.token ) {
 166+ // reset rank after type or name change
 167+ if ( transformer.rank < 1 ) {
 168+ res.token.rank = 0;
 169+ } else {
 170+ res.token.rank = 1;
132171 }
133 - } else {
134 - ts = this.transformers[type];
135 - i = ts.indexOf(listener);
 172+ } else if ( res.tokens && transformer.rank > 2 ) {
 173+ for ( var i = 0; i < res.tokens.length; i++ ) {
 174+ if ( res.tokens[i].rank === undefined ) {
 175+ // Do not run phase 0 on newly created tokens from
 176+ // phase 1.
 177+ res.tokens[i].rank = 2;
 178+ }
 179+ }
136180 }
137 - if ( i >= 0 ) {
138 - ts.splice(i, 1);
139 - }
140181 };
141182
142 -/* Constructor for information context relevant to token transformers
143 - *
144 - * @param token The token to precess
145 - * @param accum {TokenAccumulator} The active TokenAccumulator.
146 - * @param processor {TokenTransformDispatcher} The TokenTransformDispatcher object.
147 - * @param lastToken Last returned token or {undefined}.
148 - * @returns {TokenContext}.
149 - */
150 -function TokenContext ( token, accum, dispatcher, lastToken ) {
151 - this.token = token;
152 - this.accum = accum;
153 - this.dispatcher = dispatcher;
154 - this.lastToken = lastToken;
155 - return this;
156 -}
157183
158184 /* Call all transformers on a tag.
159185 *
160 - * @param {TokenContext} The current token and its context.
161 - * @returns {TokenContext} Context with updated token and/or accum.
 186+ * @param {Object} The current token.
 187+ * @param {Function} Completion callback for async processing.
 188+ * @param {Number} Rank of phase end, both key for transforms and rank for
 189+ * processed tokens.
 190+ * @param {Object} The frame, contains a reference to the environment.
 191+ * @returns {Object} Token(s) and async indication.
162192 */
163 -TokenTransformDispatcher.prototype._transformTagToken = function ( tokenCTX ) {
 193+TokenTransformDispatcher.prototype._transformTagToken = function ( token, cb, phaseEndRank, frame ) {
164194 // prepend 'any' transformers
165 - var ts = this.transformers.any;
166 - var tagts = this.transformers.tag[tokenCTX.token.name.toLowerCase()];
 195+ var ts = this.transformers[phaseEndRank].any,
 196+ res = { token: token },
 197+ transform,
 198+ l, i,
 199+ aborted = false,
 200+ tName = token.name.toLowerCase(),
 201+ tagts = this.transformers[phaseEndRank].tag[tName];
 202+
167203 if ( tagts ) {
168204 ts = ts.concat(tagts);
169205 }
170206 //console.log(JSON.stringify(ts, null, 2));
171207 if ( ts ) {
172 - for (var i = 0, l = ts.length; i < l; i++ ) {
 208+ for ( i = 0, l = ts.length; i < l; i++ ) {
 209+ transformer = ts[i];
 210+ if ( res.token.rank && transformer.rank <= res.token.rank ) {
 211+ // skip transformation, was already applied.
 212+ continue;
 213+ }
173214 // Transform token with side effects
174 - tokenCTX = ts[i]( tokenCTX );
175 - if ( tokenCTX.token === null || $.isArray(tokenCTX.token) ) {
 215+ res = transformer.transform( res.token, cb, frame, this.prevToken );
 216+ // if multiple tokens or null token: process returned tokens (in parent)
 217+ if ( !res.token || // async implies tokens instead of token, so no
 218+ // need to check explicitly
 219+ res.token.type !== token.type ||
 220+ res.token.name !== token.name ) {
 221+ this._resetTokenRank ( res, transformer );
 222+ aborted = true;
176223 break;
177224 }
178 -
 225+ // track progress on token
 226+ res.token.rank = transformer.rank;
179227 }
 228+ if ( ! aborted ) {
 229+ // Mark token as fully processed.
 230+ res.token.rank = phaseEndRank;
 231+ }
180232 }
181 - return tokenCTX;
 233+ return res;
182234 };
183235
184236 /* Call all transformers on non-tag token types.
185237 *
186 - * @param tokenCTX {TokenContext} The current token and its context.
187 - * @param ts List of token transformers for this token type.
188 - * @returns {TokenContext} Context with updated token and/or accum.
 238+ * @param {Object} The current token.
 239+ * @param {Function} Completion callback for async processing.
 240+ * @param {Number} Rank of phase end, both key for transforms and rank for
 241+ * processed tokens.
 242+ * @param {Object} The frame, contains a reference to the environment.
 243+ * @param {Array} ts List of token transformers for this token type.
 244+ * @returns {Object} Token(s) and async indication.
189245 */
190 -TokenTransformDispatcher.prototype._transformToken = function ( tokenCTX, ts ) {
 246+TokenTransformDispatcher.prototype._transformToken = function ( token, cb, phaseEndRank, frame, ts ) {
191247 // prepend 'any' transformers
192 - ts = this.transformers.any.concat(ts);
 248+ ts = this.transformers[phaseEndRank].any.concat(ts);
 249+ var transformer,
 250+ res = { token: token },
 251+ aborted = false;
193252 if ( ts ) {
194253 for (var i = 0, l = ts.length; i < l; i++ ) {
 254+ transformer = ts[i];
 255+ if ( res.token.rank && transformer.rank <= res.token.rank ) {
 256+ // skip transformation, was already applied.
 257+ continue;
 258+ }
195259 // Transform token with side effects
196 - tokenCTX = ts[i]( tokenCTX );
197 - if ( tokenCTX.token === null || $.isArray(tokenCTX.token) ) {
 260+ // XXX: it should be a better idea to move the token.rank out of
 261+ // token and into a wrapper object to ensure that transformations
 262+ // don't mess with it!
 263+ res = transformer.transform( res.token, cb, frame, this.prevToken );
 264+ if ( !res.token ||
 265+ res.token.type !== token.type ) {
 266+ this._resetTokenRank ( res, transformer );
 267+ aborted = true;
198268 break;
199269 }
 270+ res.token.rank = transformer.rank;
200271 }
 272+ if ( ! aborted ) {
 273+ // mark token as completely processed
 274+ res.token.rank = phaseEndRank; // need phase passed in!
 275+ }
 276+
201277 }
202 - return tokenCTX;
 278+ return res;
203279 };
204280
205281 /**
206282 * Transform and expand tokens.
207283 *
208 - * Normally called with undefined accum. Asynchronous expansions will call
209 - * this with their known accum, which allows expanded tokens to be spliced in
210 - * at the appropriate location in the token list, which is always at the tail
211 - * end of the current accumulator. Calls back registered callback if there are
212 - * no more outstanding asynchronous expansions.
213 - *
214 - * @param {Array} Tokens to process.
215 - * @param {Object} TokenAccumulator object. Undefined for first call, set to
216 - * accumulator with expanded token at tail for asynchronous expansions.
217 - * @param {Int} delta, default 1. Decrement the outstanding async callback
218 - * count by this much to determine when all outstanding actions are done.
219 - * Main use of this argument is to avoid counting some extra callbacks from
220 - * actions before they are done.
 284+ * Callback for token chunks emitted from the tokenizer.
221285 */
222 -TokenTransformDispatcher.prototype.transformTokens = function ( tokens, accum, delta ) {
223 - if ( accum === undefined ) {
224 - this.reset();
225 - accum = this.accum;
 286+TokenTransformDispatcher.prototype.transformTokens = function ( tokens ) {
 287+ //console.log('TokenTransformDispatcher transformTokens');
 288+ var res = this.transformPhase01 ( this.frame, tokens, this.phase2TailCB );
 289+ this.phase2TailCB( tokens, true );
 290+ if ( res.async ) {
 291+ this.tailAccumulator = res.async;
 292+ this.phase2TailCB = res.async.getParentCB ( 'sibling' );
226293 }
 294+};
227295
228 - //console.log('transformTokens: ' + JSON.stringify(tokens) + JSON.stringify(accum.accum) );
 296+/**
 297+ * Callback for the event emitted from the tokenizer.
 298+ *
 299+ * This simply decrements the outstanding counter on the top-level
 300+ */
 301+TokenTransformDispatcher.prototype.onEndEvent = function () {
 302+ if ( this.tailAccumulator ) {
 303+ this.tailAccumulator.siblingDone();
 304+ } else {
 305+ // nothing was asynchronous, so we'll have to emit end here.
 306+ this.emit('end');
 307+ }
 308+};
229309
230 - var tokenCTX = new TokenContext(undefined, accum, this, undefined);
231 - var origLen = tokens.length;
232 - for ( var i = 0; i < tokens.length; i++ ) {
233 - tokenCTX.lastToken = tokenCTX.token; // FIXME: Fix re-entrant case!
234 - tokenCTX.token = tokens[i];
235 - tokenCTX.pos = i;
236 - tokenCTX.accum = accum;
237 - switch(tokenCTX.token.type) {
 310+/**
 311+ * add parent, parentref args
 312+ * return
 313+ * {tokens: [tokens], async: true}: async expansion -> outstanding++ in parent
 314+ * {tokens: [tokens], async: false}: fully expanded
 315+ * {token: {token}}: single-token return
 316+ * child after first expand (example: template expanded)
 317+ * return some finished tokens, reuse parent accumulator
 318+ * if new accumulator: set parent, ref
 319+ */
 320+
 321+TokenTransformDispatcher.prototype.transformPhase01 = function ( frame, tokens, parentCB ) {
 322+
 323+ //console.log('transformPhase01: ' + JSON.stringify(tokens) );
 324+
 325+ var res,
 326+ phaseEndRank = 2,
 327+ // Prepare a new accumulator, to be used by async children (if any)
 328+ localAccum = [],
 329+ accum = new TokenAccumulator( parentCB ),
 330+ cb = accum.getParentCB( 'child' ),
 331+ activeAccum = null,
 332+ tokensLength = tokens.length,
 333+ token,
 334+ ts = this.transformers[phaseEndRank];
 335+
 336+ for ( var i = 0; i < tokensLength; i++ ) {
 337+ token = tokens[i];
 338+
 339+ switch( token.type ) {
238340 case 'TAG':
239341 case 'ENDTAG':
240342 case 'SELFCLOSINGTAG':
241 - tokenCTX = this._transformTagToken( tokenCTX );
 343+ res = this._transformTagToken( token, cb, phaseEndRank, frame );
242344 break;
243345 case 'TEXT':
244 - tokenCTX = this._transformToken( tokenCTX, this.transformers.text );
 346+ res = this._transformToken( token, cb, phaseEndRank, frame, ts.text );
245347 break;
246348 case 'COMMENT':
247 - tokenCTX = this._transformToken( tokenCTX, this.transformers.comment);
 349+ res = this._transformToken( token, cb, phaseEndRank, frame, ts.comment);
248350 break;
249351 case 'NEWLINE':
250 - tokenCTX = this._transformToken( tokenCTX, this.transformers.newline );
 352+ res = this._transformToken( token, cb, phaseEndRank, frame, ts.newline );
251353 break;
252354 case 'END':
253 - tokenCTX = this._transformToken( tokenCTX, this.transformers.end );
 355+ res = this._transformToken( token, cb, phaseEndRank, frame, ts.end );
254356 break;
255357 default:
256 - tokenCTX = this._transformToken( tokenCTX, this.transformers.martian );
 358+ res = this._transformToken( token, cb, phaseEndRank, frame, ts.martian );
257359 break;
258360 }
259 - // add special DELAYED value
260 - if( $.isArray(tokenCTX.token) ) {
 361+
 362+ if( res.tokens ) {
261363 // Splice in the returned tokens (while replacing the original
262364 // token), and process them next.
263 - [].splice.apply(tokens, [i, 1].concat(tokenCTX.token));
264 - //l += tokenCTX.token.length - 1;
 365+ [].splice.apply( tokens, [i, 1].concat(res.tokens) );
 366+ tokensLength = tokens.length;
265367 i--; // continue at first inserted token
266 - } else if (tokenCTX.token) {
267 - // push to accumulator
268 - accum.push(tokenCTX.token);
 368+ } else if ( res.token ) {
 369+ if ( res.token.rank === 2 ) {
 370+ // token is done.
 371+ if ( activeAccum ) {
 372+ // push to accumulator
 373+ activeAccum.push( res.token );
 374+ } else {
 375+ // If there is no accumulator yet, then directly return the
 376+ // token to the parent. Collect them in localAccum for this
 377+ // purpose.
 378+ localAccum.push(res.token);
 379+ }
 380+ } else {
 381+ // re-process token.
 382+ tokens[i] = res.token;
 383+ i--;
 384+ }
 385+ } else if ( res.async ) {
 386+ // The child now switched to activeAccum, we have to create a new
 387+ // accumulator for the next potential child.
 388+ activeAccum = accum;
 389+ accum = new TokenAccumulator( activeAccum.getParentCB( 'sibling' ) );
 390+ cb = accum.getParentCB( 'child' );
269391 }
270 - // Update current accum, in case a new one was spliced in by a
271 - // transformation starting asynch work.
272 - accum = tokenCTX.accum;
273392 }
274393
275 - if ( delta === undefined ) {
276 - delta = 1;
277 - }
278 -
279 - this.finish( delta );
 394+ // Return finished tokens directly to caller, and indicate if further
 395+ // async actions are outstanding. The caller needs to point a sibling to
 396+ // the returned accumulator, or call .siblingDone() to mark the end of a
 397+ // chain.
 398+ return { tokens: localAccum, async: activeAccum };
280399 };
281400
282401 /**
283 - * Decrement the number of outstanding async actions by delta and call the
284 - * callback with a list of tokens if none are remaining.
285 - *
286 - * @method
287 - * @param {Int} delta, how much to decrement the number of outstanding async
288 - * actions.
 402+ * Callback from tokens fully processed for phase 0 and 1, which are now ready
 403+ * for synchronous and globally in-order phase 2 processing.
289404 */
290 -TokenTransformDispatcher.prototype.finish = function ( delta ) {
291 - this.outstanding -= delta;
292 - if ( this.outstanding === 0 ) {
293 - // Join the token accumulators back into a single token list
294 - var a = this.firstaccum;
295 - var tokens = a.accum;
296 - while ( a.next !== null ) {
297 - a = a.next;
298 - tokens = tokens.concat(a.accum);
299 - }
300 - //console.log('TOKENS: ' + JSON.stringify(tokens, null, 2));
301 - // Call our callback with the flattened token list
302 - this.cb(tokens);
 405+TokenTransformDispatcher.prototype.returnTokens01 = function ( tokens, notYetDone ) {
 406+ // FIXME: store frame in object?
 407+ tokens = this.transformPhase2( this.frame, tokens, this.parentCB );
 408+ //console.log('returnTokens01, after transformPhase2.');
 409+
 410+ this.emit( 'chunk', tokens );
 411+
 412+ if ( ! notYetDone ) {
 413+ console.log('returnTokens01 done.');
 414+ // signal our done-ness to consumers.
 415+ this.emit( 'end' );
 416+ // and reset internal state.
 417+ this.reset();
303418 }
304419 };
305420
 421+
306422 /**
307 - * Start a new accumulator for asynchronous work.
 423+ * Phase 2
308424 *
309 - * @param {Object} TokenAccumulator object after which to insert a new
310 - * accumulator
311 - * @count {Int} (optional, default 1) The number of callbacks to expect before
312 - * considering the asynch work on the new accumulator done.
313 - * */
314 -TokenTransformDispatcher.prototype.newAccumulator = function ( accum, count ) {
315 - if ( count !== undefined ) {
316 - this.outstanding += count;
317 - } else {
318 - this.outstanding++;
 425+ * Global in-order traversal on expanded token stream (after async phase 1).
 426+ * Very similar to transformPhase01, but without async handling.
 427+ */
 428+TokenTransformDispatcher.prototype.transformPhase2 = function ( frame, tokens, cb ) {
 429+ var res,
 430+ phaseEndRank = 3,
 431+ localAccum = [],
 432+ localAccumLength = 0,
 433+ tokensLength = tokens.length,
 434+ token,
 435+ ts = this.transformers[phaseEndRank];
 436+
 437+ for ( var i = 0; i < tokensLength; i++ ) {
 438+ token = tokens[i];
 439+
 440+ switch( token.type ) {
 441+ case 'TAG':
 442+ case 'ENDTAG':
 443+ case 'SELFCLOSINGTAG':
 444+ res = this._transformTagToken( token, cb, phaseEndRank,
 445+ frame );
 446+ break;
 447+ case 'TEXT':
 448+ res = this._transformToken( token, cb, phaseEndRank, frame,
 449+ ts.text );
 450+ break;
 451+ case 'COMMENT':
 452+ res = this._transformToken( token, cb, phaseEndRank, frame,
 453+ ts.comment );
 454+ break;
 455+ case 'NEWLINE':
 456+ res = this._transformToken( token, cb, phaseEndRank, frame,
 457+ ts.newline );
 458+ break;
 459+ case 'END':
 460+ res = this._transformToken( token, cb, phaseEndRank, frame,
 461+ ts.end );
 462+ break;
 463+ default:
 464+ res = this._transformToken( token, cb, phaseEndRank, frame,
 465+ ts.martian );
 466+ break;
 467+ }
 468+
 469+ if( res.tokens ) {
 470+ // Splice in the returned tokens (while replacing the original
 471+ // token), and process them next.
 472+ [].splice.apply( tokens, [i, 1].concat(res.tokens) );
 473+ tokensLength = tokens.length;
 474+ i--; // continue at first inserted token
 475+ } else if ( res.token ) {
 476+ if ( res.token.rank === phaseEndRank ) {
 477+ // token is done.
 478+ localAccum.push(res.token);
 479+ this.prevToken = res.token;
 480+ } else {
 481+ // re-process token.
 482+ tokens[i] = res.token;
 483+ i--;
 484+ }
 485+ }
319486 }
320 - if ( accum === undefined ) {
321 - accum = this.accum;
322 - }
323 - return accum.insertAccumulator( );
 487+ return localAccum;
324488 };
325489
 490+
326491 /**
327 - * Token accumulators in a linked list. Using a linked list simplifies async
328 - * callbacks for template expansions as it avoids stable references to chunks.
 492+ * Token accumulators buffer tokens between asynchronous processing points,
 493+ * and return fully processed token chunks in-order and as soon as possible.
329494 *
330495 * @class
331496 * @constructor
332497 * @param {Object} next TokenAccumulator to link to
333498 * @param {Array} (optional) tokens, init accumulator with tokens or []
334499 */
335 -function TokenAccumulator ( next, tokens ) {
336 - this.next = next;
337 - if ( tokens ) {
338 - this.accum = tokens;
339 - } else {
340 - this.accum = [];
341 - }
342 - return this;
 500+function TokenAccumulator ( parentCB ) {
 501+ this.parentCB = parentCB;
 502+ this.accum = [];
 503+ // Wait for child and sibling by default
 504+ // Note: Need to decrement outstanding on last accum
 505+ // in a chain.
 506+ this.outstanding = 2;
343507 }
344508
345509 /**
346 - * Push a token into the accumulator
 510+ * Curry a parentCB with the object and reference.
347511 *
348 - * @method
349 - * @param {Object} token
 512+ * @param {Object} TokenAccumulator
 513+ * @param {misc} Reference / key for callback
 514+ * @returns {Function}
350515 */
351 -TokenAccumulator.prototype.push = function ( token ) {
352 - return this.accum.push(token);
 516+TokenAccumulator.prototype.getParentCB = function ( reference ) {
 517+ return this.returnTokens01.bind( this, reference );
353518 };
354519
355520 /**
356 - * Pop a token from the accumulator
 521+ * Pass tokens to an accumulator
357522 *
358523 * @method
359 - * @returns {Object} token
 524+ * @param {Object} token
360525 */
361 -TokenAccumulator.prototype.pop = function ( ) {
362 - return this.accum.pop();
 526+TokenAccumulator.prototype.returnTokens01 = function ( reference, tokens, notYetDone ) {
 527+ var res,
 528+ cb,
 529+ returnTokens = [];
 530+
 531+ if ( ! notYetDone ) {
 532+ this.outstanding--;
 533+ }
 534+
 535+ if ( reference === 'child' ) {
 536+ // XXX: Use some marker to avoid re-transforming token chunks several
 537+ // times?
 538+ res = this.transformPhase01( this.frame, tokens, this.parentCB );
 539+
 540+ if ( res.async ) {
 541+ // new asynchronous expansion started, chain of accumulators
 542+ // created
 543+ if ( this.outstanding === 0 ) {
 544+ // Last accum in chain should only wait for child
 545+ res.async.outstanding--;
 546+ cb = this.parentCB;
 547+ } else {
 548+ cb = this.parentCB;
 549+ // set own callback to new sibling, the end of accumulator chain
 550+ this.parentCB = res.async.getParentCB( 'sibling' );
 551+ }
 552+ }
 553+ if ( ! notYetDone ) {
 554+ // Child is done, return accumulator from sibling. Siblings
 555+ // process tokens themselves, so we concat those to the result of
 556+ // processing tokens from the child.
 557+ tokens = res.tokens.concat( this.accum );
 558+ this.accum = [];
 559+ }
 560+ this.cb( res.tokens, res.async );
 561+ return null;
 562+ } else {
 563+ // sibling
 564+ if ( this.outstanding === 0 ) {
 565+ tokens = this.accum.concat( tokens );
 566+ // A sibling will transform tokens, so we don't have to do this
 567+ // again.
 568+ this.parentCB( res.tokens, false );
 569+ return null;
 570+ } else if ( this.outstanding === 1 && notYetDone ) {
 571+ // Sibling is not yet done, but child is. Return own parentCB to
 572+ // allow the sibling to go direct, and call back parent with
 573+ // tokens. The internal accumulator is empty at this stage, as its
 574+ // tokens are passed to the parent when the child is done.
 575+ return this.parentCB( tokens, true);
 576+ }
 577+
 578+
 579+ }
363580 };
364581
365582 /**
366 - * Insert an accumulator after this one.
 583+ * Mark the sibling as done (normally at the tail of a chain).
 584+ */
 585+TokenAccumulator.prototype.siblingDone = function () {
 586+ this.returnTokens01 ( 'sibling', [], false );
 587+};
 588+
 589+
 590+/**
 591+ * Push a token into the accumulator
367592 *
368593 * @method
369 - * @returns {Object} created TokenAccumulator
 594+ * @param {Object} token
370595 */
371 -TokenAccumulator.prototype.insertAccumulator = function ( ) {
372 - this.next = new TokenAccumulator(this.next);
373 - return this.next;
 596+TokenAccumulator.prototype.push = function ( token ) {
 597+ return this.accum.push(token);
374598 };
375599
 600+
 601+
 602+/* TODO list
 603+ *
 604+ * transformPhase01 called first for phase 0-1 (in-order per source file)
 605+ * then only phase 2 (order independent, if 2 <= token phase < 3, 3 ~ done)
 606+ * -> don't execute order-dependent transforms in this phase!
 607+ * * enforce phase on tokens, but not priority within phase
 608+ * -> cycles possible in async phase
 609+ * final transform (phase 2) globally in-order and synchronous in root returnTokens01
 610+ *
 611+ *
 612+ * Transformation phases
 613+ * [0,2)
 614+ * [2,3] (and 1..2 in templates etc, but clamp phase on *returned* tokens to 2)
 615+ * 3
 616+ *
 617+ */
 618+
 619+
376620 if (typeof module == "object") {
377621 module.exports.TokenTransformDispatcher = TokenTransformDispatcher;
378622 }
379 -
Index: trunk/extensions/VisualEditor/modules/parser/ext.core.QuoteTransformer.js
@@ -1,70 +1,92 @@
22 /*
3 - * Italic/Bold handling.
 3+ * MediaWiki-compatible italic/bold handling as a token stream transformation.
44 *
5 - * - list of tokens
6 - * - NEWLINE
7 - * - ticks (2+) -> list with link in line token list?
8 - * - process on newline
9 - * - need access to text nodes before for conversion back to text
 5+ * @author Gabriel Wicke <gwicke@wikimedia.org>
106 */
117
128 function QuoteTransformer ( ) {
139 // Bold and italic tokens are collected in these lists, and then processed
1410 // in onNewLine.
 11+ this.quoteAndNewlineRank = 2.1;
 12+ this.anyRank = 2.101; // Just after regular quote and newline
 13+ this.reset();
 14+}
 15+
 16+QuoteTransformer.prototype.reset = function ( ) {
1517 this.italics = [];
1618 this.bolds = [];
17 -}
 19+ this.currentChunk = [];
 20+ // List of chunks, each starting with a (potentially) bold or italic token
 21+ // and followed by plain tokens.
 22+ this.chunks = [];
 23+};
1824
 25+
1926 // Register this transformer with the TokenTransformer
2027 QuoteTransformer.prototype.register = function ( dispatcher ) {
 28+ this.dispatcher = dispatcher;
2129 // Register for NEWLINE and QUOTE tag tokens
22 - var self = this;
23 - dispatcher.appendListener( function (ctx) {
24 - return self.onNewLine(ctx);
25 - }, 'newline' );
26 - dispatcher.appendListener( function (ctx) {
27 - return self.onQuote(ctx);
28 - }, 'tag', 'mw-quote' );
 30+ dispatcher.addTransform( this.onNewLine.bind(this),
 31+ this.quoteAndNewlineRank, 'newline' );
 32+ dispatcher.addTransform( this.onQuote.bind(this),
 33+ this.quoteAndNewlineRank, 'tag', 'mw-quote' );
 34+ // Reset internal state when we are done
 35+ dispatcher.addTransform( this.reset.bind(this),
 36+ this.quoteAndNewlineRank, 'end' );
2937 };
3038
3139 // Make a copy of the token context
32 -QuoteTransformer.prototype.ctx = function ( tokenCTX ) {
33 - return $.extend({}, tokenCTX);
 40+QuoteTransformer.prototype._startNewChunk = function ( ) {
 41+ this.currentChunk.pos = this.chunks.length;
 42+ this.chunks.push( this.currentChunk );
 43+ this.currentChunk = [];
3444 };
3545
3646 // Handle QUOTE tags. These are collected in italic/bold lists depending on
3747 // the length of quote string. Actual analysis and conversion to the
3848 // appropriate tag tokens is deferred until the next NEWLINE token triggers
3949 // onNewLine.
40 -QuoteTransformer.prototype.onQuote = function ( tokenCTX ) {
41 - var token = tokenCTX.token,
42 - qlen = token.value.length,
43 - out = null,
44 - lastToken = tokenCTX.lastToken,
45 - ctx = this.ctx(tokenCTX),
46 - ctx2,
47 - accum = tokenCTX.accum;
 50+//
 51+// XXX: Cannot use async stuff here, need to buffer things locally instead!
 52+// FIXME: Convert to internal buffering! -> return all tokens with rank set to
 53+// own rank to avoid reprocessing
 54+QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
 55+ var qlen = token.value.length,
 56+ tokens = [], // output tokens
 57+ ctx = {
 58+ token: token,
 59+ cb: cb,
 60+ frame: frame,
 61+ prevToken: prevToken
 62+ },
 63+ ctx2 = {
 64+ cb: cb,
 65+ frame: frame,
 66+ prevToken: prevToken
 67+ };
 68+
4869
 70+ if ( this.chunks.length === 0 ) {
 71+ // register for any token if not yet active
 72+ this.dispatcher.addTransform( this.onAny.bind(this), this.anyRank, 'tag', 'mw-quote' );
 73+ }
 74+
 75+ this._startNewChunk();
 76+
4977 switch (qlen) {
5078 case 2:
51 - // Start a new accumulator, so we can later go back using the
52 - // reference to this accumulator and append our tags at the end of
53 - // it.
54 - accum = tokenCTX.dispatcher.newAccumulator(accum);
55 - this.italics.push(ctx);
 79+ this.currentChunk.push(ctx);
 80+ this.italics.push(this.currentChunk);
5681 break;
5782 case 3:
58 - accum = tokenCTX.dispatcher.newAccumulator(accum);
59 - this.bolds.push(ctx);
 83+ this.currentChunk.push(ctx);
 84+ this.bolds.push(this.currentChunk);
6085 break;
6186 case 4:
62 - if (lastToken && lastToken.type === 'TEXT') {
63 - lastToken.value += "'";
64 - } else {
65 - out = {type: 'TEXT', value: "'"};
66 - }
67 - accum = tokenCTX.dispatcher.newAccumulator(accum);
68 - this.bolds.push(ctx);
 87+ this.currentChunk.push( {type: 'TEXT', value: "'"} );
 88+ this._startNewChunk();
 89+ this.currentChunk.push(ctx);
 90+ this.bolds.push(this.currentChunk);
6991 break;
7092 case 5:
7193 // The order of italic vs. bold does not matter. Those are
@@ -72,39 +94,50 @@
7395 // by the HTML 5 tree builder. This does not always result in the
7496 // prettiest result, but at least it is always correct and very
7597 // convenient.
76 - accum = tokenCTX.dispatcher.newAccumulator(accum, 2);
77 - this.italics.push(ctx);
78 - ctx2 = this.ctx(tokenCTX);
79 - ctx2.token = {attribs: ctx.token.attribs};
80 - this.bolds.push(ctx2);
 98+ this.currentChunk.push(ctx);
 99+ this.italics.push(this.currentChunk);
 100+ this._startNewChunk();
 101+ ctx2.token = { attribs: token.attribs };
 102+ this.currentChunk.push(ctx2);
 103+ this.bolds.push(this.currentChunk);
81104 break;
82105 default: // longer than 5, only use the last 5 ticks
83106 var newvalue = token.value.substr(0, qlen - 5 );
84 - if (lastToken && lastToken.type === 'TEXT') {
85 - lastToken.value += newvalue;
86 - } else {
87 - out = {type: 'TEXT', value: newvalue};
88 - }
89 - accum = tokenCTX.dispatcher.newAccumulator(accum, 2);
90 - this.italics.push(ctx);
91 - ctx2 = this.ctx(tokenCTX);
92 - ctx2.token = {attribs: ctx.token.attribs};
93 - this.bolds.push(ctx2);
 107+ this.currentChunk.push ( {type: 'TEXT', value: newvalue} );
 108+ this._startNewChunk();
 109+ this.currentChunk.push(ctx);
 110+ this.italics.push(this.currentChunk);
 111+ this._startNewChunk();
 112+ ctx2.token = { attribs: ctx.token.attribs };
 113+ this.currentChunk.push(ctx2);
 114+ this.bolds.push(this.currentChunk);
94115 break;
95116 }
96117
97 - tokenCTX.token = out;
98 - tokenCTX.accum = accum;
99 - return tokenCTX;
 118+ return { token: null };
100119 };
101120
 121+QuoteTransformer.prototype.onAny = function ( token, cb, frame, prevToken ) {
 122+ //console.log('qt onAny: ' + JSON.stringify(token, null, 2));
 123+ this.currentChunk.push( token );
 124+ return {};
 125+};
 126+
102127 // Handle NEWLINE tokens, which trigger the actual quote analysis on the
103128 // collected quote tokens so far.
104 -QuoteTransformer.prototype.onNewLine = function ( tokenCTX ) {
105 - if(!this.bolds && !this.italics) {
 129+QuoteTransformer.prototype.onNewLine = function ( token, cb, frame, prevToken ) {
 130+ var res;
 131+
 132+ if( ! this.chunks.length ) {
106133 // Nothing to do, quick abort.
107 - return tokenCTX;
 134+ return { token: token };
108135 }
 136+
 137+
 138+ token.rank = this.quoteAndNewlineRank;
 139+ this.currentChunk.push( token );
 140+ this._startNewChunk();
 141+
109142 //console.log("onNewLine: " + this.italics + this.bolds);
110143 // balance out tokens, convert placeholders into tags
111144 if (this.italics.length % 2 && this.bolds.length % 2) {
@@ -113,11 +146,11 @@
114147 firstspace = -1;
115148 for (var j = 0; j < this.bolds.length; j++) {
116149 var ctx = this.bolds[j];
117 - //console.log("balancing!" + JSON.stringify(ctx.lastToken, null, 2));
118 - if (ctx.lastToken) {
119 - if (ctx.lastToken.type === 'TEXT') {
120 - var lastchar = ctx.lastToken.value[ctx.lastToken.value.length - 1],
121 - secondtolastchar = ctx.lastToken.value[ctx.lastToken.value.length - 2];
 150+ //console.log("balancing!" + JSON.stringify(ctx.prevToken, null, 2));
 151+ if (ctx.prevToken) {
 152+ if (ctx.prevToken.type === 'TEXT') {
 153+ var lastchar = ctx.prevToken.value[ctx.prevToken.value.length - 1],
 154+ secondtolastchar = ctx.prevToken.value[ctx.prevToken.value.length - 2];
122155 if (lastchar === ' ' && firstspace === -1) {
123156 firstspace = j;
124157 } else if (lastchar !== ' ') {
@@ -129,8 +162,8 @@
130163 firstmultiletterword = j;
131164 }
132165 }
133 - } else if ( ( ctx.lastToken.type === 'NEWLINE' ||
134 - ctx.lastToken.type === 'TAG' ) &&
 166+ } else if ( ( ctx.prevToken.type === 'NEWLINE' ||
 167+ ctx.prevToken.type === 'TAG' ) &&
135168 firstmultiletterword == -1 ) {
136169 // This is an approximation, as the original doQuotes
137170 // operates on the source and just looks at space vs.
@@ -153,51 +186,55 @@
154187 }
155188 }
156189
157 - this.quotesToTags(this.italics, 'i', tokenCTX.dispatcher);
158 - this.quotesToTags(this.bolds, 'b', tokenCTX.dispatcher);
 190+ this.quotesToTags( this.italics, 'i' );
 191+ this.quotesToTags( this.bolds, 'b' );
159192
160 - this.bolds = [];
161 - this.italics = [];
 193+ //console.log('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
162194
163 - // Pass through the NEWLINE token unchanged
164 - return tokenCTX;
 195+ // return all collected tokens including the newline
 196+ res = { tokens: [].concat.apply([], this.chunks) };
 197+
 198+ // prepare for next session
 199+ this.reset();
 200+
 201+ // remove 'any' registration
 202+ this.dispatcher.removeTransform( this.anyRank, 'any' );
 203+
 204+ return res;
 205+
165206 };
166207
167208 // Convert a bold token to italic to balance an uneven number of both bold and
168209 // italic tags. In the process, one quote needs to be converted back to text.
169210 QuoteTransformer.prototype.convertBold = function ( i ) {
170 - var ctx = this.bolds[i];
 211+ var chunk = this.bolds[i],
 212+ textToken = { type: 'TEXT', value: "'" };
171213 //console.log('convertbold!');
172 - if ( ctx.lastToken && ctx.lastToken.type === 'TEXT' ) {
173 - ctx.lastToken.value += "'";
 214+ if ( chunk.pos ) {
 215+ this.chunks[chunk.pos - 1].push( textToken );
174216 } else {
175 - // Add a text token!
176 - ctx.token = [{type: 'TEXT', value: "'"}, ctx.token];
 217+ // prepend another chunk
 218+ this.chunks.unshift( [ textToken ] );
177219 }
178220
 221+ // delete from bolds
179222 this.bolds.splice(i, 1);
180223
181 - this.italics.push(ctx);
 224+ this.italics.push(chunk);
182225 this.italics.sort(function(a,b) { return a.pos - b.pos; } );
183 - //console.log(this.italics.map(function(a) { return a.pos }));
184 - //console.log(this.bolds.map(function(a) { return a.pos }));
185226 };
186227
187228 // Convert italics/bolds into tags
188 -QuoteTransformer.prototype.quotesToTags = function ( contexts, name, dispatcher ) {
 229+QuoteTransformer.prototype.quotesToTags = function ( chunks, name ) {
189230 var toggle = true,
190231 t,
 232+ j,
191233 out = [];
192 - for (var j = 0; j < contexts.length; j++) {
193 - t = contexts[j].token;
194234
195 - if ( $.isArray(t) ) {
196 - // Slip in a text token from bold to italic rebalancing. Don't
197 - // count this callback towards completion.
198 - var realToken = t.pop();
199 - dispatcher.transformTokens( t, contexts[j].accum, 0 );
200 - t = realToken;
201 - }
 235+ for (j = 0; j < chunks.length; j++) {
 236+ //console.log( 'quotesToTags ' + name + ': ' + JSON.stringify( chunks, null, 2 ) );
 237+ t = chunks[j][0].token;
 238+ //console.log( 'quotesToTags t: ' + JSON.stringify( t, null, 2));
202239
203240 if(toggle) {
204241 t.type = 'TAG';
@@ -206,21 +243,13 @@
207244 }
208245 t.name = name;
209246 delete t.value;
 247+ chunks[j][0] = t;
210248 toggle = !toggle;
211 - // Re-add and process the new token with the original accumulator, but
212 - // don't yet count this callback towards callback completion.
213 - dispatcher.transformTokens( [t], contexts[j].accum, 0 );
214249 }
215 - var l = contexts.length;
216250 if (!toggle) {
217251 // Add end tag, but don't count it towards completion.
218 - dispatcher.transformTokens( [{type: 'ENDTAG', name: name}],
219 - contexts[contexts.length - 1].accum, 0 );
 252+ this.currentChunk.push( {type: 'ENDTAG', name: name} );
220253 }
221 - // Now finally count the number of contexts towards completion, which
222 - // causes the dispatcher to call its own callback if no more asynch
223 - // callbacks are outstanding.
224 - dispatcher.finish( contexts.length );
225254 };
226255
227256 if (typeof module == "object") {
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@@ -15,12 +15,38 @@
1616
1717 // Sets up the parser
1818 this.parser.parse(this);
19 - this.document = this.parser.document;
20 - return this;
 19+
 20+ // implicitly start a new document
 21+ this.processToken({type: 'TAG', name: 'body'});
2122 };
2223
2324 FauxHTML5.TreeBuilder.prototype = new events.EventEmitter();
2425
 26+FauxHTML5.TreeBuilder.prototype.subscribeToTokenEmitter = function ( emitter ) {
 27+ emitter.addListener('chunk', this.onChunk.bind( this ) );
 28+ emitter.addListener('end', this.onEnd.bind( this ) );
 29+};
 30+
 31+FauxHTML5.TreeBuilder.prototype.onChunk = function ( tokens ) {
 32+ for (var i = 0, length = tokens.length; i < length; i++) {
 33+ this.processToken(tokens[i]);
 34+ }
 35+};
 36+
 37+FauxHTML5.TreeBuilder.prototype.onEnd = function ( ) {
 38+ //console.log('Fauxhtml5 onEnd');
 39+ // FIXME HACK: For some reason the end token is not processed sometimes,
 40+ // which normally fixes the body reference up.
 41+ this.document = this.parser.document;
 42+ this.document.body = this.parser
 43+ .document.getElementsByTagName('body')[0];
 44+
 45+ // XXX: more clean up to allow reuse.
 46+ this.parser.setup();
 47+ this.processToken({type: 'TAG', name: 'body'});
 48+};
 49+
 50+
2551 // Adapt the token format to internal HTML tree builder format, call the actual
2652 // html tree builder by emitting the token.
2753 FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
@@ -65,6 +91,7 @@
6692 break;
6793 case "END":
6894 this.emit('end');
 95+ this.emit('token', { type: 'EOF' } );
6996 this.document = this.parser.document;
7097 if ( ! this.document.body ) {
7198 // HACK: This should not be needed really.
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
@@ -8,19 +8,25 @@
99
1010 var PEG = require('pegjs'),
1111 path = require('path'),
12 - fs = require('fs');
 12+ fs = require('fs'),
 13+ events = require('events');
1314
1415 function PegTokenizer() {
1516 var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
1617 this.src = fs.readFileSync( pegSrcPath, 'utf8' );
1718 }
1819
 20+// Inherit from EventEmitter
 21+PegTokenizer.prototype = new events.EventEmitter();
 22+
1923 PegTokenizer.src = false;
2024
2125 PegTokenizer.prototype.tokenize = function( text ) {
2226 var out, err;
2327 if ( !this.parser ) {
2428 this.parser = PEG.buildParser(this.src);
 29+ // add reference to this for event emission
 30+ this.parser._tokenizer = this;
2531 }
2632
2733 // some normalization
@@ -28,21 +34,30 @@
2935 text += "\n";
3036 }
3137
32 - try {
 38+ // XXX: Commented out exception handling during development to get
 39+ // reasonable traces. Calling a trace on the exception does not really cut
 40+ // it.
 41+ //try {
3342 out = this.parser.parse(text);
34 - } catch (e) {
35 - err = e;
36 - console.trace();
37 - } finally {
 43+ // emit tokens here until we get that to work per toplevelblock in the
 44+ // actual tokenizer
 45+ this.emit('chunk', out);
 46+ this.emit('end');
 47+ //} catch (e) {
 48+ //err = e;
 49+ //console.trace();
 50+ //} finally {
 51+ return { err: err };
 52+ //}
 53+};
3854
39 - // Append the end (for obvious reasons this should not
40 - // be part of a stream, only when tokenizing complete
41 - // texts)
42 - out.push({type: 'END'});
 55+/*****************************************************************************
 56+ * LEGACY stuff
 57+ *
 58+ * This is kept around as a template for the ongoing template expansion work!
 59+ * It won't work with the token infrastructure.
 60+ */
4361
44 - return {tokens: out, err: err};
45 - }
46 -}
4762
4863 /**
4964 * @param {object} tree
@@ -91,7 +106,7 @@
92107 content: self.env.expandTemplateArgs( templateTree, tree.params )
93108 });
94109 }
95 - })
 110+ });
96111 } );
97112 // Wait for async...
98113 return;
@@ -123,7 +138,7 @@
124139 PegTokenizer.src = page.revisions[0]['*'];
125140 }
126141 });
127 - callback()
 142+ callback();
128143 },
129144 dataType: 'json',
130145 cache: false
Index: trunk/extensions/VisualEditor/modules/parser/html5/parser.js
@@ -44,6 +44,14 @@
4545
4646 Parser.prototype.parse = function(tokenizer) {
4747 this.tokenizer = tokenizer;
 48+
 49+ this.tokenizer.addListener('token', function(t) {
 50+ return function(token) { t.do_token(token); };
 51+ }(this));
 52+ this.tokenizer.addListener('end', function(t) {
 53+ return function() { t.emit('end'); };
 54+ }(this));
 55+
4856 this.setup();
4957 //this.tokenizer.tokenize();
5058 }
@@ -116,12 +124,6 @@
117125 }
118126
119127 Parser.prototype.setup = function(container, encoding) {
120 - this.tokenizer.addListener('token', function(t) {
121 - return function(token) { t.do_token(token); };
122 - }(this));
123 - this.tokenizer.addListener('end', function(t) {
124 - return function() { t.emit('end'); };
125 - }(this));
126128 this.emit('setup', this);
127129
128130 var inner_html = !!container;
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.js
@@ -10,103 +10,73 @@
1111 path = require('path'),
1212 PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
1313 TokenTransformDispatcher = require('./mediawiki.TokenTransformDispatcher.js').TokenTransformDispatcher,
14 - DOMPostProcessor = require('./mediawiki.DOMPostProcessor.js').DOMPostProcessor,
15 - DOMConverter = require('./mediawiki.DOMConverter.js').DOMConverter,
1614 QuoteTransformer = require('./ext.core.QuoteTransformer.js').QuoteTransformer,
1715 Cite = require('./ext.Cite.js').Cite,
18 - MWRefTagHook = require('./ext.cite.taghook.ref.js').MWRefTagHook,
19 - FauxHTML5 = require('./mediawiki.HTML5TreeBuilder.node.js').FauxHTML5;
 16+ FauxHTML5 = require('./mediawiki.HTML5TreeBuilder.node.js').FauxHTML5,
 17+ DOMPostProcessor = require('./mediawiki.DOMPostProcessor.js').DOMPostProcessor,
 18+ DOMConverter = require('./mediawiki.DOMConverter.js').DOMConverter;
2019
2120 function ParseThingy( config ) {
22 - // XXX: move the actual parsing to separate method, only perform pipeline
23 - // setup in the constructor!
 21+ // Set up a simple parser pipeline.
2422
2523 if ( !config ) {
2624 config = {};
2725 }
2826
29 -
3027 this.wikiTokenizer = new PegTokenizer();
3128
32 - this.postProcessor = new DOMPostProcessor();
 29+ this.tokenDispatcher = new TokenTransformDispatcher ();
3330
34 - this.DOMConverter = new DOMConverter();
 31+ // Add token transformations..
 32+ var qt = new QuoteTransformer();
 33+ qt.register(this.tokenDispatcher);
3534
36 - var pthingy = this;
 35+ //var citeExtension = new Cite();
 36+ //citeExtension.register(this.tokenDispatcher);
3737
38 - // Set up the TokenTransformDispatcher with a callback for the remaining
39 - // processing.
40 - // XXX: convert to event listener (listening for token chunks from
41 - // tokenizer) and event emitter (emitting token chunks)
42 - // XXX: A parser environment and configuration will be added here to the
43 - // token transform dispatcher.
44 - this.tokenDispatcher = new TokenTransformDispatcher ( function ( tokens ) {
45 -
46 - //console.log("TOKENS: " + JSON.stringify(tokens, null, 2));
47 -
48 - // Create a new tree builder, which also creates a new document.
49 - // XXX: implicitly clean up old state after processing end token, so
50 - // that we can reuse the tree builder.
51 - // XXX: convert to event listener listening for token chunks from the
52 - // token transformer and and emitting an additional 'done' event after
53 - // processing the 'end' token.
54 - var treeBuilder = new FauxHTML5.TreeBuilder();
 38+ this.tokenDispatcher.subscribeToTokenEmitter( this.wikiTokenizer );
5539
56 - // Build a DOM tree from tokens using the HTML tree builder/parser.
57 - // XXX: convert to event listener (token chunks from
58 - // TokenTransformDispatcher) and event emitter (DOM tree to
59 - // DOMPostProcessor)
60 - pthingy.buildTree( tokens, treeBuilder );
61 -
62 - // Perform post-processing on DOM.
63 - // XXX: convert to event listener (listening on treeBuilder 'finished'
64 - // event)
65 - pthingy.postProcessor.doPostProcess(treeBuilder.document);
 40+ // Create a new tree builder, which also creates a new document.
 41+ // XXX: implicitly clean up old state after processing end token, so
 42+ // that we can reuse the tree builder.
 43+ // XXX: convert to event listener listening for token chunks from the
 44+ // token transformer and emitting an additional 'done' event after
 45+ // processing the 'end' token.
 46+ this.treeBuilder = new FauxHTML5.TreeBuilder();
 47+ this.treeBuilder.subscribeToTokenEmitter( this.tokenDispatcher );
6648
67 - // FIXME: move HTML serialization to separate pipeline!
68 - pthingy.document = treeBuilder.document;
 49+ // Prepare these two, but only call them from parse and getWikiDom for
 50+ // now. These will be called in a callback later, when the full pipeline
 51+ // is used asynchronously.
 52+ this.postProcessor = new DOMPostProcessor();
6953
70 - // XXX: emit event with result
71 - pthingy.getWikiDom = function() {
72 - return JSON.stringify(
73 - pthingy.DOMConverter.HTMLtoWiki( treeBuilder.document.body ),
74 - null,
75 - 2
76 - ) + "\n";
77 - };
 54+ this.DOMConverter = new DOMConverter();
 55+}
7856
79 - });
 57+ParseThingy.prototype.parse = function ( text ) {
 58+ // Set the pipeline in motion by feeding the tokenizer
 59+ this.wikiTokenizer.tokenize( text );
8060
81 - // Add token transformations..
82 - var qt = new QuoteTransformer();
83 - qt.register(this.tokenDispatcher);
 61+ // XXX: this will have to happen in a callback!
 62+ this.document = this.treeBuilder.document;
8463
85 - var citeExtension = new Cite();
86 - citeExtension.register(this.tokenDispatcher);
 64+ //console.log(this.document.body.innerHTML);
8765
88 -}
 66+ // Perform synchronous post-processing on DOM.
 67+ // XXX: convert to event listener (listening on treeBuilder 'finished'
 68+ // event)
 69+ this.postProcessor.doPostProcess( this.document );
 70+};
8971
90 -
91 -ParseThingy.prototype = {
92 - //XXX: This will be moved to the treeBuilder event listener callback,
93 - //where it will process each received chunk.
94 - buildTree: function ( tokens, treeBuilder ) {
95 - // push a body element, just to be sure to have one
96 - treeBuilder.processToken({type: 'TAG', name: 'body'});
97 - // Process all tokens
98 - for (var i = 0, length = tokens.length; i < length; i++) {
99 - treeBuilder.processToken(tokens[i]);
100 - }
101 -
102 - // FIXME HACK: For some reason the end token is not processed sometimes,
103 - // which normally fixes the body reference up.
104 - treeBuilder.document.body = treeBuilder.parser
105 - .document.getElementsByTagName('body')[0];
106 -
107 - }
 72+ParseThingy.prototype.getWikiDom = function () {
 73+ return JSON.stringify(
 74+ pthingy.DOMConverter.HTMLtoWiki( this.document.body ),
 75+ null,
 76+ 2
 77+ );
10878 };
10979
 80+
11081 if (typeof module == "object") {
11182 module.exports.ParseThingy = ParseThingy;
11283 }
113 -
Index: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
@@ -246,11 +246,22 @@
247247 return bnames;
248248 })();
249249
 250+ var self = this;
 251+
250252
251253 }
252254
253255 start
254256 = e:toplevelblock* newline* {
 257+ // end is passed inline as a token, as well as a separate event for now.
 258+
 259+ // this does not work yet.
 260+ //console.log('about to emit' + pp(self));
 261+ //self._tokenizer.emit('chunk', [ { type: 'END' } ] );
 262+ //self._tokenizer.emit('end');
 263+ // Append the end (for obvious reasons this should not
 264+ // be part of a stream, only when tokenizing complete
 265+ // texts)
255266 return flatten(e);
256267 }
257268
@@ -393,6 +404,10 @@
394405 // XXX: only run this for lines that actually need it!
395406 //b.push({type: 'NEWLINE'});
396407 // Move this to a token stream transform!
 408+ //console.log('about to emit' + pp(self));
 409+ //self._tokenizer.emit('chunk', b);
 410+ //console.log('emitted chunk' + pp(b));
 411+ //return [];
397412 return b;
398413 }
399414

Status & tagging log