r14511 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r14510‎ | r14511 | r14512 >
Date:06:16, 1 June 2006
Author:brion
Status:old
Tags:
Comment:
* (bug 5384) Fix <!-- comments --> in <ref> extension
* Nesting of different tag extensions and comments should now work more
consistently and more safely. A cleaner, one-pass tag strip lets the
'outer' tag either take source (<nowiki>-style) or pass it down to
further parsing (<ref>-style). There should no longer be surprise
expansion of foreign extensions inside HTML output, or differences
in behavior based on the order tags are loaded.
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/Parser.php (modified) (history)
  • /trunk/phase3/maintenance/parserTests.txt (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/parserTests.txt
@@ -301,6 +301,7 @@
302302 &lt;cite&gt;
303303 &lt;em&gt;
304304 </pre>
 305+
305306 !! end
306307
307308 ###
Index: trunk/phase3/includes/Parser.php
@@ -9,6 +9,7 @@
1010 /** */
1111 require_once( 'Sanitizer.php' );
1212 require_once( 'HttpFunctions.php' );
 13+require_once( 'ImageGallery.php' );
1314
1415 /**
1516 * Update this version number when the ParserOutput format
@@ -319,63 +320,60 @@
320321 * If $tag is set to STRIP_COMMENTS, the function will extract
321322 * <!-- HTML comments -->
322323 *
 324+ * $output: array( 'UNIQ-xxxxx' => array(
 325+ * 'element',
 326+ * 'tag content',
 327+ * array( 'param' => 'x' ),
 328+ * '<element param="x">' ) )
323329 * @private
324330 * @static
325331 */
326 - function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){
327 - $rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString();
328 - if ( !$content ) {
329 - $content = array( );
330 - }
 332+ function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
 333+ $rand = Parser::getRandomString();
331334 $n = 1;
332335 $stripped = '';
 336+ $matches = array();
333337
334 - if ( !$tags ) {
335 - $tags = array( );
336 - }
337 -
338 - if ( !$params ) {
339 - $params = array( );
340 - }
341 -
342 - if( $tag == STRIP_COMMENTS ) {
343 - $start = '/<!--()/';
344 - $end = '/-->/';
 338+ if( $elements == STRIP_COMMENTS ) {
 339+ $start = '/<!--()()/';
345340 } else {
346 - $start = "/<$tag(\\s+[^>]*|\\s*\/?)>/i";
347 - $end = "/<\\/$tag\\s*>/i";
 341+ $taglist = implode( '|', $elements );
 342+ $start = "/<($taglist)(\\s+[^>]*|\\s*\/?)>/i";
348343 }
349344
350345 while ( '' != $text ) {
351346 $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
352347 $stripped .= $p[0];
353 - if( count( $p ) < 3 ) {
 348+ if( count( $p ) < 4 ) {
354349 break;
355350 }
356 - $attributes = $p[1];
357 - $inside = $p[2];
 351+ $element = $p[1];
 352+ $attributes = $p[2];
 353+ $inside = $p[3];
358354
359355 // If $attributes ends with '/', we have an empty element tag, <tag />
360 - if( $tag != STRIP_COMMENTS && substr( $attributes, -1 ) == '/' ) {
 356+ if( $element != '' && substr( $attributes, -1 ) == '/' ) {
361357 $attributes = substr( $attributes, 0, -1);
362358 $empty = '/';
363359 } else {
364360 $empty = '';
365361 }
366362
367 - $marker = $rnd . sprintf('%08X', $n++);
 363+ $marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++);
368364 $stripped .= $marker;
369365
370 - $tags[$marker] = "<$tag$attributes$empty>";
371 - $params[$marker] = Sanitizer::decodeTagAttributes( $attributes );
372 -
373366 if ( $empty === '/' ) {
374367 // Empty element tag, <tag />
375 - $content[$marker] = null;
 368+ $content = null;
376369 $text = $inside;
377370 } else {
 371+ if( $element ) {
 372+ $end = "/<\\/$element\\s*>/i";
 373+ } else {
 374+ $end = '/-->/';
 375+ }
378376 $q = preg_split( $end, $inside, 2 );
379 - $content[$marker] = $q[0];
 377+ $content = $q[0];
380378 if( count( $q ) < 2 ) {
381379 # No end tag -- let it run out to the end of the text.
382380 break;
@@ -383,27 +381,16 @@
384382 $text = $q[1];
385383 }
386384 }
 385+
 386+ $matches[$marker] = array( $element,
 387+ $content,
 388+ Sanitizer::decodeTagAttributes( $attributes ),
 389+ "<$element$attributes$empty>" );
387390 }
388391 return $stripped;
389392 }
390393
391394 /**
392 - * Wrapper function for extractTagsAndParams
393 - * for cases where $tags and $params isn't needed
394 - * i.e. where tags will never have params, like <nowiki>
395 - *
396 - * @private
397 - * @static
398 - */
399 - function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) {
400 - $dummy_tags = array();
401 - $dummy_params = array();
402 -
403 - return Parser::extractTagsAndParams( $tag, $text, $content,
404 - $dummy_tags, $dummy_params, $uniq_prefix );
405 - }
406 -
407 - /**
408395 * Strips and renders nowiki, pre, math, hiero
409396 * If $render is set, performs necessary rendering operations on plugins
410397 * Returns the text, and fills an array with data needed in unstrip()
@@ -418,124 +405,102 @@
419406 */
420407 function strip( $text, &$state, $stripcomments = false ) {
421408 $render = ($this->mOutputType == OT_HTML);
422 - $html_content = array();
423 - $nowiki_content = array();
424 - $math_content = array();
425 - $pre_content = array();
426 - $comment_content = array();
427 - $ext_content = array();
428 - $ext_tags = array();
429 - $ext_params = array();
430 - $gallery_content = array();
431409
432410 # Replace any instances of the placeholders
433411 $uniq_prefix = $this->mUniqPrefix;
434412 #$text = str_replace( $uniq_prefix, wfHtmlEscapeFirst( $uniq_prefix ), $text );
435 -
436 - # html
 413+
 414+ $elements = array_merge(
 415+ array( 'nowiki', 'pre', 'gallery' ),
 416+ array_keys( $this->mTagHooks ) );
437417 global $wgRawHtml;
438418 if( $wgRawHtml ) {
439 - $text = Parser::extractTags('html', $text, $html_content, $uniq_prefix);
440 - foreach( $html_content as $marker => $content ) {
441 - if ($render ) {
442 - # Raw and unchecked for validity.
443 - $state['html'][$marker] = $content;
444 - } else {
445 - $state['html'][$marker] = '<html>'.$content.'</html>';
446 - }
447 - }
 419+ $elements[] = 'html';
448420 }
449 -
450 - # nowiki
451 - $text = Parser::extractTags('nowiki', $text, $nowiki_content, $uniq_prefix);
452 - foreach( $nowiki_content as $marker => $content ) {
453 - if( $render ){
454 - $state['nowiki'][$marker] = wfEscapeHTMLTagsOnly( $content );
455 - } else {
456 - $state['nowiki'][$marker] = '<nowiki>'.$content.'</nowiki>';
457 - }
458 - }
459 -
460 - # math
461421 if( $this->mOptions->getUseTeX() ) {
462 - $text = Parser::extractTags('math', $text, $math_content, $uniq_prefix);
463 - foreach( $math_content as $marker => $content ){
464 - if( $render ) {
465 - $state['math'][$marker] = renderMath( $content );
466 - } else {
467 - $state['math'][$marker] = '<math>'.$content.'</math>';
468 - }
469 - }
 422+ $elements[] = 'math';
470423 }
 424+
471425
472 - # pre
473 - $text = Parser::extractTags('pre', $text, $pre_content, $uniq_prefix);
474 - foreach( $pre_content as $marker => $content ){
475 - if( $render ){
476 - $state['pre'][$marker] = '<pre>' . wfEscapeHTMLTagsOnly( $content ) . '</pre>';
477 - } else {
478 - $state['pre'][$marker] = '<pre>'.$content.'</pre>';
479 - }
 426+ // Strip comments in a first pass.
 427+ // This saves us from needlessly rendering extensions in comment text
 428+ $text = Parser::extractTagsAndParams(STRIP_COMMENTS, $text, $comment_matches, $uniq_prefix);
 429+ $commentState = array();
 430+ foreach( $comment_matches as $marker => $data ){
 431+ list( $element, $content, $params, $tag ) = $data;
 432+ $commentState[$marker] = '<!--' . $content . '-->';
480433 }
481 -
482 - # gallery
483 - $text = Parser::extractTags('gallery', $text, $gallery_content, $uniq_prefix);
484 - foreach( $gallery_content as $marker => $content ) {
485 - require_once( 'ImageGallery.php' );
486 - if ( $render ) {
487 - $state['gallery'][$marker] = $this->renderImageGallery( $content );
488 - } else {
489 - $state['gallery'][$marker] = '<gallery>'.$content.'</gallery>';
 434+
 435+ $matches = array();
 436+ $text = Parser::extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
 437+
 438+ foreach( $matches as $marker => $data ) {
 439+ list( $element, $content, $params, $tag ) = $data;
 440+ // Restore any comments; the extension can deal with them.
 441+ if( $content !== null) {
 442+ $content = strtr( $content, $commentState );
490443 }
491 - }
492 -
493 - # Comments
494 - $text = Parser::extractTags(STRIP_COMMENTS, $text, $comment_content, $uniq_prefix);
495 - foreach( $comment_content as $marker => $content ){
496 - $comment_content[$marker] = '<!--'.$content.'-->';
497 - }
498 -
499 - # Extensions
500 - foreach ( $this->mTagHooks as $tag => $callback ) {
501 - $ext_content[$tag] = array();
502 - $text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag],
503 - $ext_tags[$tag], $ext_params[$tag], $uniq_prefix );
504 - foreach( $ext_content[$tag] as $marker => $content ) {
505 - $full_tag = $ext_tags[$tag][$marker];
506 - $params = $ext_params[$tag][$marker];
507 - if ( $render )
508 - $state[$tag][$marker] = call_user_func_array( $callback, array( $content, $params, $this ) );
509 - else {
510 - if ( is_null( $content ) ) {
511 - // Empty element tag
512 - $state[$tag][$marker] = $full_tag;
 444+ if( $render ) {
 445+ switch( $element ) {
 446+ case 'html':
 447+ if( $wgRawHtml ) {
 448+ $output = $content;
 449+ break;
 450+ }
 451+ // Shouldn't happen otherwise. :)
 452+ case 'nowiki':
 453+ $output = wfEscapeHTMLTagsOnly( $content );
 454+ break;
 455+ case 'math':
 456+ $output = renderMath( $content );
 457+ break;
 458+ case 'pre':
 459+ // Backwards-compatibility hack
 460+ $content = preg_replace( '!<nowiki>(.*?)</nowiki>!is', '\\1', $content );
 461+ $output = '<pre>' . wfEscapeHTMLTagsOnly( $content ) . '</pre>';
 462+ break;
 463+ case 'gallery':
 464+ $output = $this->renderImageGallery( $content );
 465+ break;
 466+ default:
 467+ $tagName = strtolower( $element );
 468+ if( isset( $this->mTagHooks[$tagName] ) ) {
 469+ $output = call_user_func_array( $this->mTagHooks[$tagName],
 470+ array( $content, $params, $this ) );
513471 } else {
514 - $state[$tag][$marker] = "$full_tag$content</$tag>";
 472+ wfDebugDieBacktrace( "Invalid call hook $element" );
515473 }
516474 }
 475+ } else {
 476+ // Just stripping tags; keep the source
 477+ if( $content === null ) {
 478+ $output = $tag;
 479+ } else {
 480+ $output = "$tag$content</$element>";
 481+ }
517482 }
 483+ $state[$element][$marker] = $output;
518484 }
519485
520486 # Unstrip comments unless explicitly told otherwise.
521487 # (The comments are always stripped prior to this point, so as to
522488 # not invoke any extension tags / parser hooks contained within
523489 # a comment.)
524 - if ( !$stripcomments ) {
525 - $tempstate = array( 'comment' => $comment_content );
526 - $text = $this->unstrip( $text, $tempstate );
527 - $comment_content = array();
528 - } else {
529 - if( !isset( $state['comment'] ) ) {
530 - $state['comment'] = array();
 490+ if ( $stripcomments ) {
 491+ // Add remaining comments to the state array
 492+ foreach( $commentState as $marker => $content ) {
 493+ $state['comment'][$marker] = $content;
531494 }
532 - $state['comment'] += $comment_content;
 495+ } else {
 496+ // Put them all back and forget them
 497+ $text = strtr( $text, $commentState );
533498 }
534499
535500 return $text;
536501 }
537502
538503 /**
539 - * restores pre, math, and hiero removed by strip()
 504+ * Restores pre, math, and other extensions removed by strip()
540505 *
541506 * always call unstripNoWiki() after this one
542507 * @private
@@ -545,20 +510,21 @@
546511 return $text;
547512 }
548513
549 - # Must expand in reverse order, otherwise nested tags will be corrupted
550 - foreach( array_reverse( $state, true ) as $tag => $contentDict ) {
 514+ $replacements = array();
 515+ foreach( $state as $tag => $contentDict ) {
551516 if( $tag != 'nowiki' && $tag != 'html' ) {
552 - foreach( array_reverse( $contentDict, true ) as $uniq => $content ) {
553 - $text = str_replace( $uniq, $content, $text );
 517+ foreach( $contentDict as $uniq => $content ) {
 518+ $replacements[$uniq] = $content;
554519 }
555520 }
556521 }
 522+ $text = strtr( $text, $replacements );
557523
558524 return $text;
559525 }
560526
561527 /**
562 - * always call this after unstrip() to preserve the order
 528+ * Always call this after unstrip() to preserve the order
563529 *
564530 * @private
565531 */
@@ -567,18 +533,15 @@
568534 return $text;
569535 }
570536
571 - # Must expand in reverse order, otherwise nested tags will be corrupted
572 - if( isset( $state['nowiki'] ) )
573 - foreach( array_reverse( $state['nowiki'], true ) as $uniq => $content ) {
574 - $text = str_replace( $uniq, $content, $text );
 537+ $replacements = array();
 538+ foreach( $state as $tag => $contentDict ) {
 539+ if( $tag == 'nowiki' || $tag == 'html' ) {
 540+ foreach( $contentDict as $uniq => $content ) {
 541+ $replacements[$uniq] = $content;
 542+ }
575543 }
576 -
577 - global $wgRawHtml;
578 - if ($wgRawHtml && isset( $state['html'] ) ) {
579 - foreach( array_reverse( $state['html'], true ) as $uniq => $content ) {
580 - $text = str_replace( $uniq, $content, $text );
581 - }
582544 }
 545+ $text = strtr( $text, $replacements );
583546
584547 return $text;
585548 }
@@ -593,14 +556,7 @@
594557 function insertStripItem( $text, &$state ) {
595558 $rnd = $this->mUniqPrefix . '-item' . Parser::getRandomString();
596559 if ( !$state ) {
597 - $state = array(
598 - 'html' => array(),
599 - 'nowiki' => array(),
600 - 'math' => array(),
601 - 'pre' => array(),
602 - 'comment' => array(),
603 - 'gallery' => array(),
604 - );
 560+ $state = array();
605561 }
606562 $state['item'][$rnd] = $text;
607563 return $rnd;
Index: trunk/phase3/RELEASE-NOTES
@@ -398,6 +398,13 @@
399399 * parserTests.php accepts a --file parameter to run an alternate test sutie
400400 * parser tests can now test extensions using !!hooks sections
401401 * Fix oddity with open tag parameters getting stuck on </li>
 402+* (bug 5384) Fix <!-- comments --> in <ref> extension
 403+* Nesting of different tag extensions and comments should now work more
 404+ consistently and more safely. A cleaner, one-pass tag strip lets the
 405+ 'outer' tag either take source (<nowiki>-style) or pass it down to
 406+ further parsing (<ref>-style). There should no longer be surprise
 407+ expansion of foreign extensions inside HTML output, or differences
 408+ in behavior based on the order tags are loaded.
402409
403410
404411 == Compatibility ==

Follow-up revisions

RevisionCommit summaryAuthorDate
r14586Backport fixes and bump to 1.6.7...brion06:27, 6 June 2006

Status & tagging log