r97542 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r97541‎ | r97542 | r97543 >
Date:19:33, 19 September 2011
Author:maxsem
Status:ok (Comments)
Tags:
Comment:
Rewrote tokenizer so that it can actually be understood by mortal humans. All previous tokenizer's glitches are kept intact FOR NOW, except for:
* Slightly different handling of invalid phonemes/hieroglyphs. Garbage in, garbage out.
* Normalised handling of "!" and "." - now they are handled uniformly no matter if they're separated from other tokens or not, e.g. "a!b" === "a ! b".
* Removed the attempt to handle parenthetical grouping pending a proper fix as requested by bug 31000.
Modified paths:
  • /trunk/extensions/wikihiero/tests.txt (modified) (history)
  • /trunk/extensions/wikihiero/wikihiero.body.php (modified) (history)

Diff [purge]

Index: trunk/extensions/wikihiero/tests.txt
@@ -12,7 +12,7 @@
1313 !! input
1414 <hiero><script>alert("FAIL")</script></hiero>
1515 !! result
16 -<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table class="mw-hiero-table"><tr> <td>&lt;script&gt;alert(FAIL)&lt;script&gt;</td> </tr></table> </td></tr></table>
 16+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table class="mw-hiero-table"><tr> <td>&lt;script&gt;alert(&quot;FAIL&quot;)&lt;/script&gt;</td> </tr></table> </td></tr></table>
1717
1818 !! end
1919
@@ -44,6 +44,15 @@
4545 !! end
4646
4747 !! test
 48+WikiHiero - EOL
 49+!! input
 50+<hiero>A1!B1 ! C1</hiero>
 51+!! result
 52+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' /></td></tr></table><table class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1' /></td></tr></table><table class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1' /></td></tr></table> </td></tr></table>
 53+
 54+!! end
 55+
 56+!! test
4857 WikiHiero - complex text with EOL
4958 !! input
5059 <hiero>M23-X1:R4-X8-Q2:D4-W17-R14-G4-R8-O29:V30-U23 !
@@ -61,3 +70,14 @@
6271 <table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table class="mw-hiero-table"><tr> <td><img class="mw-mirrored" style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' /></td><td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' /></td></tr></table> </td></tr></table>
6372
6473 !! end
 74+
 75+!! test
 76+WikiHiero - void blocks
 77+!! input
 78+<hiero>A1..B1.C1</hiero>
 79+<hiero>A1 .. B1 . C1</hiero>
 80+!!result
 81+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' /></td><td><table class="mw-hiero-table" style="width: 44px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1' /></td><td><table class="mw-hiero-table" style="width: 22px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1' /></td></tr></table> </td></tr></table>
 82+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' /></td><td><table class="mw-hiero-table" style="width: 44px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1' /></td><td><table class="mw-hiero-table" style="width: 22px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1' /></td></tr></table> </td></tr></table>
 83+
 84+!! end
Index: trunk/extensions/wikihiero/wikihiero.body.php
@@ -430,16 +430,14 @@
431431 * Hieroglyphs tokenizer class
432432 */
433433 /*private*/ class HieroTokenizer {
434 - const TYPE_NONE = 0;
435 - const TYPE_GLYPH = 1; // rendered items
436 - const TYPE_CODE = 2; // single code as ':', '*', '!', '(' or ')'
437 - const TYPE_SPECIAL = 3; // advanced code (more than 1 caracter)
438 - const TYPE_END = 4; // end of line '!'
 434+ private static $delimiters = false;
 435+ private static $tokenDelimiters;
 436+ private static $singleChars;
439437
440438 private $text;
441439 private $blocks = false;
442 - private $blocks_id = 0;
443 - private $item_id = 0;
 440+ private $currentBlock;
 441+ private $token;
444442
445443 /**
446444 * Constructor
@@ -448,8 +446,19 @@
449447 */
450448 public function __construct( $text ) {
451449 $this->text = $text;
 450+ self::initStatic();
452451 }
453452
 453+ private static function initStatic() {
 454+ if ( self::$delimiters ) {
 455+ return;
 456+ }
 457+
 458+ self::$delimiters = array_flip( array( ' ', '-', "\t", "\n" ) );
 459+ self::$tokenDelimiters = array_flip( array( '*', ':', '(', ')' ) );
 460+ self::$singleChars = array_flip( array( '!' ) );
 461+ }
 462+
454463 /**
455464 * Split text into blocks, then split blocks into items
456465 *
@@ -459,72 +468,93 @@
460469 if ( $this->blocks !== false ) {
461470 return $this->blocks;
462471 }
463 - $this->blocks = array( array( '' ) );
464 - $parentheses = 0;
465 - $type = self::TYPE_NONE;
 472+ $this->blocks = array();
 473+ $this->currentBlock = array();
 474+ $this->token = '';
 475+
 476+ $text = preg_replace( '/<!--.*?-->/', '', $this->text ); // remove HTML comments
466477
467 - for ( $i = 0; $i < strlen( $this->text ); $i++ ) {
 478+ for ( $i = 0; $i < strlen( $text ); $i++ ) {
468479 $char = $this->text[$i];
469480
470 - if ( $char == '(' ) {
471 - $parentheses++;
472 - } elseif ( $char == ')' ) {
473 - $parentheses--;
 481+ if ( isset( self::$delimiters[$char] ) ) {
 482+ $this->newBlock();
 483+ } elseif ( isset( self::$singleChars[$char] ) ) {
 484+ $this->singleCharBlock( $char );
 485+ } elseif ( $char == '.' ) {
 486+ $this->dot();
 487+ } elseif ( isset( self::$tokenDelimiters[$char] ) ) {
 488+ $this->newToken( $char );
 489+ } else {
 490+ $this->char( $char );
474491 }
 492+ }
475493
476 - if ( $parentheses == 0 ) {
477 - if ( $char == '-' || $char == ' ' ) {
478 - if ( $type != self::TYPE_NONE ) {
479 - $this->addBlock( '' );
480 - $type = self::TYPE_NONE;
481 - }
482 - }
483 - } else {// don't split block if inside parentheses
484 - if ( $char == '-' ) {
485 - $this->addItem( '-' );
486 - $type = self::TYPE_CODE;
487 - }
488 - }
 494+ $this->newBlock(); // flush stuff being processed
489495
490 - if ( $char == '!' ) {
491 - if ( $this->item_id > 0 ) {
492 - $this->addBlock();
493 - }
494 - $this->blocks[$this->blocks_id][$this->item_id] = $char;
495 - $type = self::TYPE_END;
 496+ return $this->blocks;
 497+ }
496498
497 - } elseif ( preg_match( '/[*:()]/', $char ) ) {
498 - if ( $type == self::TYPE_GLYPH || $type == self::TYPE_CODE ) {
499 - $this->addItem( '' );
500 - }
501 - $this->blocks[$this->blocks_id][$this->item_id] = $char;
502 - $type = self::TYPE_CODE;
 499+ /**
 500+ * Handles a block delimiter
 501+ */
 502+ private function newBlock() {
 503+ $this->newToken();
 504+ if( $this->currentBlock ) {
 505+ $this->blocks[] = $this->currentBlock;
 506+ $this->currentBlock = array();
 507+ }
 508+ }
503509
504 - } elseif ( ctype_alnum( $char ) || $char == '.' || $char == '<'
505 - || $char == '>' || $char == '\\' ) {
506 - if ( $type == self::TYPE_END ) {
507 - $this->addBlock( '' );
508 - } elseif ( $type == self::TYPE_CODE ) {
509 - $this->addItem( '' );
510 - }
511 - $this->blocks[$this->blocks_id][$this->item_id] .= $char;
512 - $type = self::TYPE_GLYPH;
513 - }
 510+ /**
 511+ * Flushes current token, optionally adds another one
 512+ *
 513+ * @param $token Mixed: token to add or false
 514+ */
 515+ private function newToken( $token = false ) {
 516+ if ( $this->token !== '' ) {
 517+ $this->currentBlock[] = $this->token;
 518+ $this->token = '';
514519 }
515 - return $this->blocks;
 520+ if ( $token !== false ) {
 521+ $this->currentBlock[] = $token;
 522+ }
516523 }
517524
518 - private function addBlock( $newItem = false ) {
519 - $this->blocks_id++;
520 - $this->blocks[$this->blocks_id] = array();
521 - $this->item_id = 0;
522 - if ( $newItem !== false ) {
523 - $this->blocks[$this->blocks_id][$this->item_id] = $newItem;
 525+ /**
 526+ * Adds a block consisting of one character
 527+ *
 528+ * @param $char string: block character
 529+ */
 530+ private function singleCharBlock( $char ) {
 531+ $this->newBlock();
 532+ $this->blocks[] = array( $char );
 533+ }
 534+
 535+ /**
 536+ * Handles void blocks represented by dots
 537+ */
 538+ private function dot() {
 539+ if ( $this->token == '.' ) {
 540+ $this->token = '..';
 541+ $this->newBlock();
 542+ } else {
 543+ $this->newBlock();
 544+ $this->token = '.';
524545 }
525546 }
526547
527 - private function addItem( $item ) {
528 - $this->item_id++;
529 - $this->blocks[$this->blocks_id][$this->item_id] = $item;
 548+ /**
 549+ * Adds a miscellaneous character to current token
 550+ *
 551+ * @param $char string: character to add
 552+ */
 553+ private function char( $char ) {
 554+ if ( $this->token == '.' ) {
 555+ $this->newBlock();
 556+ $this->token = $char;
 557+ } else {
 558+ $this->token .= $char;
 559+ }
530560 }
531 -}
\ No newline at end of file
 561+}

Follow-up revisions

RevisionCommit summaryAuthorDate
r97639Fixed HTML comments handling from r97542maxsem16:03, 20 September 2011

Comments

#Comment by MaxSem (talk | contribs)   19:37, 19 September 2011

Forgot another thing: this also fixed the handling of stuff like <hiero>[[ ]]</hiero> and <hiero>[& &]</hiero> which was previously broken.

#Comment by Tim Starling (talk | contribs)   05:16, 25 November 2011

Since the tokenizer is necessarily relatively slow and memory-intensive compared to other parts of the parser, you may want to consider adding a limit on total string length, similar to ParserFunctions #time:

self::$mTimeChars += strlen( $format );
if ( self::$mTimeChars > self::$mMaxTimeChars ) {
	return '<strong class="error">' . wfMsgForContent( 'pfunc_time_too_long' ) . '</strong>';

Status & tagging log