r14530 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r14529 | r14530 | r14531 >
Date:19:38, 1 June 2006
Author:brion
Status:old
Tags:
Comment:
Fix regressions in parser with incomplete tag stripping, plus some old bugs:
* (bug 885) Pre-save transform no longer silently appends close tags
* Pre-save transform no longer changes the case of close tags
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/Parser.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/Parser.php
@@ -311,20 +311,20 @@
312312 function getOptions() { return $this->mOptions; }
313313
314314 /**
315 - * Replaces all occurrences of <$tag>content</$tag> in the text
316 - * with a random marker and returns the new text. the output parameter
317 - * $content will be an associative array filled with data on the form
318 - * $unique_marker => content.
 315+ * Replaces all occurrences of HTML-style comments and the given tags
 316+ * in the text with a random marker and returns the new text. The output
 317+ * parameter $matches will be an associative array filled with data in
 318+ * the form:
 319+ * 'UNIQ-xxxxx' => array(
 320+ * 'element',
 321+ * 'tag content',
 322+ * array( 'param' => 'x' ),
 323+ * '<element param="x">tag content</element>' ) )
319324 *
320 - * If $content is already set, the additional entries will be appended
321 - * If $tag is set to STRIP_COMMENTS, the function will extract
322 - * <!-- HTML comments -->
 325+ * @param $elements list of element names. Comments are always extracted.
 326+ * @param $text Source text string.
 327+ * @param $uniq_prefix
323328 *
324 - * $output: array( 'UNIQ-xxxxx' => array(
325 - * 'element',
326 - * 'tag content',
327 - * array( 'param' => 'x' ),
328 - * '<element param="x">' ) )
329329 * @private
330330 * @static
331331 */
@@ -334,58 +334,59 @@
335335 $stripped = '';
336336 $matches = array();
337337
338 - if( $elements == STRIP_COMMENTS ) {
339 - $start = '/<!--()()/';
340 - } else {
341 - $taglist = implode( '|', $elements );
342 - $start = "/<($taglist)(\\s+[^>]*|\\s*\/?)>/i";
343 - }
 338+ $taglist = implode( '|', $elements );
 339+ $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";
344340
345341 while ( '' != $text ) {
346342 $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
347343 $stripped .= $p[0];
348 - if( count( $p ) < 4 ) {
 344+ if( count( $p ) < 5 ) {
349345 break;
350346 }
351 - $element = $p[1];
352 - $attributes = $p[2];
353 - $inside = $p[3];
354 -
355 - // If $attributes ends with '/', we have an empty element tag, <tag />
356 - if( $element != '' && substr( $attributes, -1 ) == '/' ) {
357 - $attributes = substr( $attributes, 0, -1);
358 - $empty = '/';
 347+ if( count( $p ) > 5 ) {
 348+ // comment
 349+ $element = $p[4];
 350+ $attributes = '';
 351+ $close = '';
 352+ $inside = $p[5];
359353 } else {
360 - $empty = '';
 354+ // tag
 355+ $element = $p[1];
 356+ $attributes = $p[2];
 357+ $close = $p[3];
 358+ $inside = $p[4];
361359 }
362360
363361 $marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++);
364362 $stripped .= $marker;
365363
366 - if ( $empty === '/' ) {
 364+ if ( $close === '/>' ) {
367365 // Empty element tag, <tag />
368366 $content = null;
369367 $text = $inside;
 368+ $tail = null;
370369 } else {
371 - if( $element ) {
372 - $end = "/<\\/$element\\s*>/i";
 370+ if( $element == '!--' ) {
 371+ $end = '/(-->)/';
373372 } else {
374 - $end = '/-->/';
 373+ $end = "/(<\\/$element\\s*>)/i";
375374 }
376 - $q = preg_split( $end, $inside, 2 );
 375+ $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
377376 $content = $q[0];
378 - if( count( $q ) < 2 ) {
 377+ if( count( $q ) < 3 ) {
379378 # No end tag -- let it run out to the end of the text.
 379+ $tail = '';
380380 $text = '';
381381 } else {
382 - $text = $q[1];
 382+ $tail = $q[1];
 383+ $text = $q[2];
383384 }
384385 }
385386
386387 $matches[$marker] = array( $element,
387388 $content,
388389 Sanitizer::decodeTagAttributes( $attributes ),
389 - "<$element$attributes$empty>" );
 390+ "<$element$attributes$close$content$tail" );
390391 }
391392 return $stripped;
392393 }
@@ -409,6 +410,7 @@
410411 # Replace any instances of the placeholders
411412 $uniq_prefix = $this->mUniqPrefix;
412413 #$text = str_replace( $uniq_prefix, wfHtmlEscapeFirst( $uniq_prefix ), $text );
 414+ $commentState = array();
413415
414416 $elements = array_merge(
415417 array( 'nowiki', 'pre', 'gallery' ),
@@ -422,27 +424,24 @@
423425 }
424426
425427
426 - // Strip comments in a first pass.
427 - // This saves us from needlessly rendering extensions in comment text
428 - $text = Parser::extractTagsAndParams(STRIP_COMMENTS, $text, $comment_matches, $uniq_prefix);
429 - $commentState = array();
430 - foreach( $comment_matches as $marker => $data ){
431 - list( $element, $content, $params, $tag ) = $data;
432 - $commentState[$marker] = '<!--' . $content . '-->';
433 - }
434 -
435428 $matches = array();
436429 $text = Parser::extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
437430
438431 foreach( $matches as $marker => $data ) {
439432 list( $element, $content, $params, $tag ) = $data;
440 - // Restore any comments; the extension can deal with them.
441 - if( $content !== null) {
442 - $content = strtr( $content, $commentState );
443 - }
444433 if( $render ) {
445434 $tagName = strtolower( $element );
446435 switch( $tagName ) {
 436+ case '!--':
 437+ // Comment
 438+ if( substr( $tag, -3 ) == '-->' ) {
 439+ $output = $tag;
 440+ } else {
 441+ // Unclosed comment in input.
 442+ // Close it so later stripping can remove it
 443+ $output = "$tag-->";
 444+ }
 445+ break;
447446 case 'html':
448447 if( $wgRawHtml ) {
449448 $output = $content;
@@ -473,25 +472,20 @@
474473 }
475474 } else {
476475 // Just stripping tags; keep the source
477 - if( $content === null ) {
478 - $output = $tag;
479 - } else {
480 - $output = "$tag$content</$element>";
481 - }
 476+ $output = $tag;
482477 }
483 - $state[$element][$marker] = $output;
 478+ if( !$stripcomments && $element == '!--' ) {
 479+ $commentState[$marker] = $output;
 480+ } else {
 481+ $state[$element][$marker] = $output;
 482+ }
484483 }
485484
486485 # Unstrip comments unless explicitly told otherwise.
487486 # (The comments are always stripped prior to this point, so as to
488487 # not invoke any extension tags / parser hooks contained within
489488 # a comment.)
490 - if ( $stripcomments ) {
491 - // Add remaining comments to the state array
492 - foreach( $commentState as $marker => $content ) {
493 - $state['comment'][$marker] = $content;
494 - }
495 - } else {
 489+ if ( !$stripcomments ) {
496490 // Put them all back and forget them
497491 $text = strtr( $text, $commentState );
498492 }
Index: trunk/phase3/RELEASE-NOTES
@@ -405,6 +405,8 @@
406406 further parsing (<ref>-style). There should no longer be surprise
407407 expansion of foreign extensions inside HTML output, or differences
408408 in behavior based on the order tags are loaded.
 409+* (bug 885) Pre-save transform no longer silently appends close tags
 410+* Pre-save transform no longer changes the case of close tags
409411
410412
411413 == Compatibility ==

Follow-up revisions

Revision | Commit summary | Author | Date
r14586 | Backport fixes and bump to 1.6.7... | brion | 06:27, 6 June 2006

Status & tagging log