r106203 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r106202‎ | r106203 | r106204 >
Date:15:15, 14 December 2011
Author:gwicke
Status:deferred
Tags:
Comment:
Add rough HTML DOM to WikiDom conversion. You can see serialized WikiDom of
parser tests using 'node parserTests.js --wikidom'.
Modified paths:
  • /trunk/extensions/VisualEditor/modules/parser/ext.Cite.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMConverter.js (added) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js (modified) (history)
  • /trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js (modified) (history)
  • /trunk/extensions/VisualEditor/tests/parser/parserTests.js (modified) (history)

Diff [purge]

Index: trunk/extensions/VisualEditor/tests/parser/parserTests.js
@@ -63,6 +63,8 @@
6464 _import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
6565 _import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
6666
 67+_import(pj('parser', 'mediawiki.DOMConverter'), ['DOMConverter']);
 68+
6769 _import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
6870
6971 _import(pj('parser', 'ext.Cite.js'), ['Cite']);
@@ -121,6 +123,11 @@
122124 description: 'Print out a whitelist entry for failing tests. Default false.',
123125 default: false,
124126 boolean: true
 127+ },
 128+ 'wikidom': {
 129+ description: 'Print out a WikiDom conversion of the HTML DOM',
 130+ default: false,
 131+ boolean: true
125132 }
126133 }
127134 ).check( function(argv) {
@@ -181,6 +188,8 @@
182189
183190 this.postProcessor = new DOMPostProcessor();
184191
 192+ this.DOMConverter = new DOMConverter();
 193+
185194 var pt = this;
186195
187196 // Set up the TokenTransformDispatcher with a callback for the remaining
@@ -196,13 +205,18 @@
197206 pt.buildTree( tokens, treeBuilder );
198207
199208 // Perform post-processing on DOM.
200 - pt.postProcessor.doPostProcess(treeBuilder.parser.document);
 209+ pt.postProcessor.doPostProcess(treeBuilder.document);
201210
202211 // And serialize the result.
203212 var out = treeBuilder.document.body.innerHTML;
204213
205214 // Finally, check the result vs. the expected result.
206215 pt.checkResult( pt.currentItem, out );
 216+
 217+ if ( pt.argv.wikidom ) {
 218+ // Test HTML DOM -> WikiDOM conversion
 219+ pt.printWikiDom( treeBuilder.document.body );
 220+ }
207221 });
208222
209223 // Add token transformations..
@@ -225,8 +239,6 @@
226240 }
227241
228242
229 -
230 -
231243 /**
232244 * Get an object holding our tests cases. Eventually from a cache file
233245 */
@@ -520,6 +532,21 @@
521533 };
522534
523535
 536+/**
 537+ * Print out a WikiDom conversion of the HTML DOM
 538+ */
 539+ParserTests.prototype.printWikiDom = function ( body ) {
 540+ console.log('WikiDom'.cyan + ':');
 541+ console.log(
 542+ JSON.stringify(
 543+ this.DOMConverter.HTMLtoWiki(body),
 544+ null,
 545+ 2
 546+ ) + "\n"
 547+ );
 548+};
 549+
 550+
524551 ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) {
525552 // push a body element, just to be sure to have one
526553 treeBuilder.processToken({type: 'TAG', name: 'body'});
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@@ -62,8 +62,8 @@
6363 break;
6464 case "END":
6565 this.emit('end');
 66+ this.document = this.parser.document;
6667 // HACK: This should not be needed really.
67 - this.document = this.parser.document;
6868 this.document.body = this.document.getElementsByTagName('body')[0];
6969 break;
7070 case "NEWLINE":
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMPostProcessor.js
@@ -30,12 +30,18 @@
3131 }
3232 };
3333
 34+// Quick HACK: define Node constants
 35+// https://developer.mozilla.org/en/nodeType
 36+var Node = {
 37+ TEXT_NODE: 3,
 38+ COMMENT_NODE: 8
 39+};
 40+
3441 // Wrap all top-level inline elements in paragraphs. This should also be
3542 // applied inside block-level elements, but in that case the first paragraph
3643 // usually remains plain inline.
3744 var process_inlines_in_p = function ( document ) {
38 - // document.body does not always work in jsdom, so work around it.
39 - var body = document.getElementsByTagName('body')[0],
 45+ var body = document.body,
4046 newP = document.createElement('p'),
4147 cnodes = body.childNodes,
4248 haveInlines = false,
@@ -50,8 +56,8 @@
5157 ctype = child.nodeType;
5258 //console.log(child + ctype);
5359 if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
54 - (ctype !== 3 && // text
55 - ctype !== 8 && // comment
 60+ (ctype !== Node.TEXT_NODE &&
 61+ ctype !== Node.COMMENT_NODE &&
5662 !isBlock(child.nodeName))) {
5763 // text node
5864 newP.appendChild(child);
Index: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMConverter.js
@@ -0,0 +1,306 @@
 2+/**
 3+ * Conversions between HTML DOM and WikiDom
 4+ *
 5+ * @class
 6+ * @constructor
 7+ */
 8+function DOMConverter () {
 9+}
 10+
 11+// Quick HACK: define Node constants
 12+// https://developer.mozilla.org/en/nodeType
 13+var Node = {
 14+ ELEMENT_NODE: 1,
 15+ ATTRIBUTE_NODE: 2,
 16+ TEXT_NODE: 3,
 17+ CDATA_SECTION_NODE: 4,
 18+ ENTITY_REFERENCE_NODE: 5,
 19+ ENTITY_NODE: 6,
 20+ PROCESSING_INSTRUCTION_NODE: 7,
 21+ COMMENT_NODE: 8,
 22+ DOCUMENT_NODE: 9,
 23+ DOCUMENT_TYPE_NODE: 10,
 24+ DOCUMENT_FRAGMENT_NODE: 11,
 25+ NOTATION_NODE: 12
 26+};
 27+
 28+DOMConverter.prototype.getHTMLHandlerInfo = function ( nodeName ) {
 29+ switch ( nodeName.toLowerCase() ) {
 30+ case 'p':
 31+ return {
 32+ handler: this._convertHTMLLeaf,
 33+ type: 'paragraph'
 34+ };
 35+ case 'li':
 36+ case 'dl':
 37+ case 'dd':
 38+ return {
 39+ handler: this._convertHTMLLeaf,
 40+ type: 'listItem'
 41+ };
 42+ case 'pre':
 43+ return {
 44+ handler: this._convertHTMLLeaf,
 45+ type: 'pre'
 46+ };
 47+ case 'ul':
 48+ case 'ol':
 49+ case 'dl':
 50+ return {
 51+ handler: this._convertHTMLBranch,
 52+ type: 'list'
 53+ };
 54+ default:
 55+ console.log( 'HTML to Wiki DOM conversion error. Unsupported node name ' +
 56+ nodeName );
 57+ return {
 58+ handler: this._convertHTMLBranch,
 59+ type: nodeName.toLowerCase()
 60+ };
 61+ break;
 62+ }
 63+};
 64+
 65+DOMConverter.prototype.getHTMLAnnotationType = function ( nodeName ) {
 66+ switch ( nodeName.toLowerCase() ) {
 67+ case 'i':
 68+ return 'textStyle/italic';
 69+ case 'b':
 70+ return 'textStyle/bold';
 71+ case 'span':
 72+ return 'textStyle/span';
 73+ case 'a':
 74+ return 'link/unknown'; // XXX: distinguish internal / external etc
 75+ default:
 76+ console.log( 'HTML to Wiki DOM conversion error. Unsupported html annotation ' +
 77+ nodeName );
 78+ return undefined;
 79+ break;
 80+ }
 81+};
 82+
 83+/**
 84+ * Convert a HTML DOM to WikiDom
 85+ *
 86+ * @method
 87+ * @param {Object} root of HTML DOM (usually the body element)
 88+ * @returns {Object} WikiDom version
 89+ */
 90+DOMConverter.prototype.HTMLtoWiki = function ( node ) {
 91+ var children = node.childNodes,
 92+ out = {
 93+ type: 'document',
 94+ children: []
 95+ };
 96+ for ( var i = 0, l = children.length; i < l; i++ ) {
 97+ var cnode = children[i];
 98+ switch ( cnode.nodeType ) {
 99+ case Node.ELEMENT_NODE:
 100+ // Call a handler for the particular node type
 101+ var hi = this.getHTMLHandlerInfo( cnode.nodeName );
 102+ var res = hi.handler.call(this, cnode, 0, hi.type );
 103+ out.children.push( res.node );
 104+ break;
 105+ case Node.TEXT_NODE:
 106+ // Add text as content, and increment offset
 107+ // BUT: Should not appear at toplevel!
 108+ break;
 109+ case Node.COMMENT_NODE:
 110+ // Add a comment annotation to which text? Not clear how this
 111+ // can be represented in WikiDom.
 112+ break;
 113+ default:
 114+ console.log( "HTML to Wiki DOM conversion error. Unhandled node type " +
 115+ cnode.innerHTML );
 116+ break;
 117+ }
 118+ }
 119+ return out;
 120+};
 121+
 122+/**
 123+ * Private HTML branch node handler
 124+ *
 125+ * @param {Object} HTML DOM element
 126+ * @param {Int} WikiDom offset within a block
 127+ * @returns {Object} WikiDom object
 128+ */
 129+DOMConverter.prototype._convertHTMLBranch = function ( node, offset, type ) {
 130+ var children = node.childNodes,
 131+ wnode = {
 132+ type: type,
 133+ attributes: this._HTMLPropertiesToWikiAttributes( node ),
 134+ children: []
 135+ };
 136+ for ( var i = 0, l = children.length; i < l; i++ ) {
 137+ var cnode = children[i];
 138+ switch ( cnode.nodeType ) {
 139+ case Node.ELEMENT_NODE:
 140+ // Call a handler for the particular node type
 141+ var hi = this.getHTMLHandlerInfo( cnode.nodeName );
 142+ var res = hi.handler.call(this, cnode, offset + 1, hi.type );
 143+ wnode.children.push( res.node );
 144+ offset = res.offset;
 145+ break;
 146+ case Node.TEXT_NODE:
 147+ // Create a paragraph and add it to children?
 148+ break;
 149+ case Node.COMMENT_NODE:
 150+ // add a comment node.
 151+ break;
 152+ default:
 153+ console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
 154+ cnode.innerHTML );
 155+ break;
 156+ }
 157+ }
 158+ return {
 159+ offset: offset,
 160+ node: wnode
 161+ };
 162+};
 163+
 164+/**
 165+ * Private HTML leaf node handler
 166+ *
 167+ * @param {Object} HTML DOM element
 168+ * @param {Int} WikiDom offset within a block
 169+ * @returns {Object} WikiDom object
 170+ */
 171+DOMConverter.prototype._convertHTMLLeaf = function ( node, offset, type ) {
 172+ var children = node.childNodes,
 173+ wnode = {
 174+ type: type,
 175+ attributes: this._HTMLPropertiesToWikiAttributes( node ),
 176+ content: {
 177+ text: '',
 178+ annotations: []
 179+ }
 180+ };
 181+ //console.log( 'res wnode: ' + JSON.stringify(wnode, null, 2));
 182+ for ( var i = 0, l = children.length; i < l; i++ ) {
 183+ var cnode = children[i];
 184+ switch ( cnode.nodeType ) {
 185+ case Node.ELEMENT_NODE:
 186+ // Call a handler for the particular annotation node type
 187+ var annotationtype = this.getHTMLAnnotationType( cnode.nodeName );
 188+ if ( annotationtype ) {
 189+ var res = this._convertHTMLAnnotation( cnode, offset, annotationtype );
 190+ //console.log( 'res leaf: ' + JSON.stringify(res, null, 2));
 191+ offset += res.text.length;
 192+ wnode.content.text += res.text;
 193+ //console.log( 'res annotations: ' + JSON.stringify(res, null, 2));
 194+ wnode.content.annotations = wnode.content.annotations
 195+ .concat( res.annotations );
 196+ }
 197+ break;
 198+ case Node.TEXT_NODE:
 199+ // Add text as content, and increment offset
 200+ wnode.content.text += cnode.data;
 201+ offset += cnode.data.length;
 202+ break;
 203+ case Node.COMMENT_NODE:
 204+ // add a comment annotation?
 205+ break;
 206+ default:
 207+ console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
 208+ cnode.innerHTML );
 209+ break;
 210+ }
 211+ }
 212+ return {
 213+ offset: offset,
 214+ node: wnode
 215+ };
 216+};
 217+
 218+DOMConverter.prototype._convertHTMLAnnotation = function ( node, offset, type ) {
 219+ var children = node.childNodes,
 220+ text = '',
 221+ annotations = [
 222+ {
 223+ type: type,
 224+ data: this._HTMLPropertiesToWikiData( node ),
 225+ range: {
 226+ start: offset,
 227+ end: offset
 228+ }
 229+ }
 230+ ];
 231+ for ( var i = 0, l = children.length; i < l; i++ ) {
 232+ var cnode = children[i];
 233+ switch ( cnode.nodeType ) {
 234+ case Node.ELEMENT_NODE:
 235+ // Call a handler for the particular annotation node type
 236+ var annotationtype = this.getHTMLAnnotationType(cnode.nodeName);
 237+ if ( annotationtype ) {
 238+ var res = this._convertHTMLAnnotation( cnode, offset, annotationtype );
 239+ //console.log( 'res annotations 2: ' + JSON.stringify(res, null, 2));
 240+ text += res.text;
 241+ offset += res.text.length;
 242+ annotations = annotations.concat( res.annotations );
 243+ }
 244+ break;
 245+ case Node.TEXT_NODE:
 246+ // Add text as content, and increment offset
 247+ text += cnode.data;
 248+ offset += cnode.data.length;
 249+ break;
 250+ case Node.COMMENT_NODE:
 251+ // add a comment annotation?
 252+ break;
 253+ default:
 254+ console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
 255+ cnode.innerHTML );
 256+ break;
 257+ }
 258+ }
 259+ annotations[0].range.end = offset;
 260+ return {
 261+ text: text,
 262+ annotations: annotations
 263+ };
 264+};
 265+
 266+DOMConverter.prototype._HTMLPropertiesToWikiAttributes = function ( elem ) {
 267+ var attribs = elem.attributes,
 268+ out = {};
 269+ for ( var i = 0, l = attribs.length; i < l; i++ ) {
 270+ var attrib = attribs.item(i),
 271+ key = attrib.name;
 272+ console.log('key: ' + key);
 273+ if ( key.match( /^data-/ ) ) {
 274+ // strip data- prefix from data-*
 275+ out[key.replace( /^data-/, '' )] = attrib.value;
 276+ } else {
 277+ // prefix html properties with html/
 278+ out['html/' + key] = attrib.value;
 279+ }
 280+ }
 281+ return out;
 282+};
 283+
 284+DOMConverter.prototype._HTMLPropertiesToWikiData = function ( elem ) {
 285+ var attribs = elem.attributes,
 286+ out = {};
 287+ for ( var i = 0, l = attribs.length; i < l; i++ ) {
 288+ var attrib = attribs.item(i),
 289+ key = attrib.name;
 290+ if ( key.match( /^data-/ ) ) {
 291+ // strip data- prefix from data-*
 292+ out[key.replace( /^data-/, '' )] = attrib.value;
 293+ } else {
 294+ // pass through a few whitelisted keys
 295+ // XXX: This subsets html DOM
 296+ if ( ['title'].indexOf(key) != -1 ) {
 297+ out[key] = attrib.value;
 298+ }
 299+ }
 300+ }
 301+ return out;
 302+};
 303+
 304+
 305+if (typeof module == "object") {
 306+ module.exports.DOMConverter = DOMConverter;
 307+}
Property changes on: trunk/extensions/VisualEditor/modules/parser/mediawiki.DOMConverter.js
___________________________________________________________________
Added: svn:eol-style
1308 + native
Index: trunk/extensions/VisualEditor/modules/parser/ext.Cite.js
@@ -257,7 +257,8 @@
258258 type: 'TAG',
259259 name: 'ol',
260260 attribs: [
261 - ['class', 'references']
 261+ ['class', 'references'],
 262+ ['data-object', 'references'] // Object type
262263 ]
263264 }
264265 ].concat( listItems, { type: 'ENDTAG', name: 'ol' } );

Status & tagging log