r9311 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r9310‎ | r9311 | r9312 >
Date:08:12, 3 June 2005
Author:vibber
Status:old
Tags:
Comment:
* (bug 684) Accept an attribute parameter array on parser hook tags
Some parts of http://bugzilla.wikimedia.org/attachment.cgi?id=96&action=view
with heavy modification; using tag matching in the style we accept regular
HTML elements, and decode attribute values to proper strings.
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/Parser.php (modified) (history)
  • /trunk/phase3/includes/Sanitizer.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/Parser.php
@@ -258,32 +258,51 @@
259259 * @access private
260260 * @static
261261 */
262 - function extractTags($tag, $text, &$content, $uniq_prefix = ''){
 262+ function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){
263263 $rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString();
264264 if ( !$content ) {
265265 $content = array( );
266266 }
267267 $n = 1;
268268 $stripped = '';
 269+
 270+ if ( !$tags ) {
 271+ $tags = array( );
 272+ }
 273+
 274+ if ( !$params ) {
 275+ $params = array( );
 276+ }
269277
 278+ if( $tag == STRIP_COMMENTS ) {
 279+ $start = '/<!--()/';
 280+ $end = '/-->/';
 281+ } else {
 282+ $start = "/<$tag([^>]*)>/i";
 283+ $end = "/<\\/$tag\\s*>/i";
 284+ }
 285+
270286 while ( '' != $text ) {
271 - if($tag==STRIP_COMMENTS) {
272 - $p = preg_split( '/<!--/', $text, 2 );
273 - } else {
274 - $p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
 287+ $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
 288+ $stripped .= $p[0];
 289+ if( count( $p ) < 3 ) {
 290+ break;
275291 }
276 - $stripped .= $p[0];
277 - if ( ( count( $p ) < 2 ) || ( '' == $p[1] ) ) {
278 - $text = '';
 292+ $attributes = $p[1];
 293+ $inside = $p[2];
 294+
 295+ $marker = $rnd . sprintf('%08X', $n++);
 296+ $stripped .= $marker;
 297+
 298+ $tags[$marker] = "<$tag$attributes>";
 299+ $params[$marker] = Sanitizer::decodeTagAttributes( $attributes );
 300+
 301+ $q = preg_split( $end, $inside, 2 );
 302+ $content[$marker] = $q[0];
 303+ if( count( $q ) < 1 ) {
 304+ # No end tag -- let it run out to the end of the text.
 305+ break;
279306 } else {
280 - if($tag==STRIP_COMMENTS) {
281 - $q = preg_split( '/-->/i', $p[1], 2 );
282 - } else {
283 - $q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 );
284 - }
285 - $marker = $rnd . sprintf('%08X', $n++);
286 - $content[$marker] = $q[0];
287 - $stripped .= $marker;
288307 $text = $q[1];
289308 }
290309 }
@@ -291,6 +310,22 @@
292311 }
293312
294313 /**
 314+ * Wrapper function for extractTagsAndParams
 315+ * for cases where $tags and $params aren't needed
 316+ * i.e. where tags will never have params, like <nowiki>
 317+ *
 318+ * @access private
 319+ * @static
 320+ */
 321+ function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) {
 322+ $dummy_tags = array();
 323+ $dummy_params = array();
 324+
 325+ return Parser::extractTagsAndParams( $tag, $text, $content,
 326+ $dummy_tags, $dummy_params, $uniq_prefix );
 327+ }
 328+
 329+ /**
295330 * Strips and renders nowiki, pre, math, hiero
296331 * If $render is set, performs necessary rendering operations on plugins
297332 * Returns the text, and fills an array with data needed in unstrip()
@@ -311,6 +346,8 @@
312347 $pre_content = array();
313348 $comment_content = array();
314349 $ext_content = array();
 350+ $ext_tags = array();
 351+ $ext_params = array();
315352 $gallery_content = array();
316353
317354 # Replace any instances of the placeholders
@@ -387,12 +424,15 @@
388425 # Extensions
389426 foreach ( $this->mTagHooks as $tag => $callback ) {
390427 $ext_content[$tag] = array();
391 - $text = Parser::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix );
 428+ $text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag],
 429+ $ext_tags[$tag], $ext_params[$tag], $uniq_prefix );
392430 foreach( $ext_content[$tag] as $marker => $content ) {
 431+ $full_tag = $ext_tags[$tag][$marker];
 432+ $params = $ext_params[$tag][$marker];
393433 if ( $render ) {
394 - $ext_content[$tag][$marker] = $callback( $content );
 434+ $ext_content[$tag][$marker] = $callback( $content, $params );
395435 } else {
396 - $ext_content[$tag][$marker] = "<$tag>$content</$tag>";
 436+ $ext_content[$tag][$marker] = "$full_tag$content</$tag>";
397437 }
398438 }
399439 }
Index: trunk/phase3/includes/Sanitizer.php
@@ -37,6 +37,27 @@
3838 |(&)/x' );
3939
4040 /**
 41+ * Regular expression to match HTML/XML attribute pairs within a tag.
 42+ * Allows some... latitude.
 43+ * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 44+ */
 45+$attrib = '[A-Za-z0-9]';
 46+$space = '[\x09\x0a\x0d\x20]';
 47+define( 'MW_ATTRIBS_REGEX',
 48+ "/(?:^|$space)($attrib+)
 49+ ($space*=$space*
 50+ (?:
 51+ # The attribute value: quoted or alone
 52+ \"([^<\"]*)\"
 53+ | '([^<']*)'
 54+ | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 55+ | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 56+ # colors are specified like this.
 57+ # We'll be normalizing it.
 58+ )
 59+ )?(?=$space|\$)/sx" );
 60+
 61+/**
4162 * List of all named character entities defined in HTML 4.01
4263 * http://www.w3.org/TR/html4/sgml/entities.html
4364 * @access private
@@ -490,21 +511,8 @@
491512 # Unquoted attribute
492513 # Since we quote this later, this can be anything distinguishable
493514 # from the end of the attribute
494 - $attrib = '[A-Za-z0-9]';
495 - $space = '[\x09\x0a\x0d\x20]';
496515 if( !preg_match_all(
497 - "/(?:^|$space)($attrib+)
498 - ($space*=$space*
499 - (?:
500 - # The attribute value: quoted or alone
501 - \"([^<\"]*)\"
502 - | '([^<']*)'
503 - | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
504 - | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
505 - # colors are specified like this.
506 - # We'll be normalizing it.
507 - )
508 - )?(?=$space|\$)/sx",
 516+ MW_ATTRIBS_REGEX,
509517 $text,
510518 $pairs,
511519 PREG_SET_ORDER ) ) {
@@ -517,26 +525,11 @@
518526 $attribute = strtolower( $set[1] );
519527 if( !isset( $whitelist[$attribute] ) ) {
520528 continue;
521 - } elseif( isset( $set[6] ) ) {
522 - # Illegal #XXXXXX color with no quotes.
523 - $value = Sanitizer::normalizeAttributeValue( $set[6] );
524 - } elseif( isset( $set[5] ) ) {
525 - # No quotes.
526 - $value = Sanitizer::normalizeAttributeValue( $set[5] );
527 - } elseif( isset( $set[4] ) ) {
528 - # Single-quoted
529 - $value = str_replace( '"', '&quot;',
530 - Sanitizer::normalizeAttributeValue( $set[4] ) );
531 - } elseif( isset( $set[3] ) ) {
532 - # Double-quoted
533 - $value = Sanitizer::normalizeAttributeValue( $set[3] );
534 - } elseif( !isset( $set[2] ) ) {
535 - # In XHTML, attributes must have a value.
536 - $value = $set[1];
537 - } else {
538 - wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
539529 }
540530
 531+ $raw = Sanitizer::getTagAttributeCallback( $set );
 532+ $value = Sanitizer::normalizeAttributeValue( $raw );
 533+
541534 # Strip javascript "expression" from stylesheets.
542535 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
543536 if( $attribute == 'style' && preg_match(
@@ -558,6 +551,67 @@
559552 }
560553
561554 /**
 555+ * Return an associative array of attribute names and values from
 556+ * a partial tag string. Attribute names are forced to lowercase,
 557+ * character references are decoded to UTF-8 text.
 558+ *
 559+ * @param string
 560+ * @return array
 561+ */
 562+ function decodeTagAttributes( $text ) {
 563+ $attribs = array();
 564+
 565+ if( trim( $text ) == '' ) {
 566+ return $attribs;
 567+ }
 568+
 569+ if( !preg_match_all(
 570+ MW_ATTRIBS_REGEX,
 571+ $text,
 572+ $pairs,
 573+ PREG_SET_ORDER ) ) {
 574+ return $attribs;
 575+ }
 576+
 577+ foreach( $pairs as $set ) {
 578+ $attribute = strtolower( $set[1] );
 579+ $value = Sanitizer::getTagAttributeCallback( $set );
 580+ $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 581+ }
 582+ return $attribs;
 583+ }
 584+
 585+ /**
 586+ * Pick the appropriate attribute value from a match set from the
 587+ * MW_ATTRIBS_REGEX matches.
 588+ *
 589+ * @param array $set
 590+ * @return string
 591+ * @access private
 592+ */
 593+ function getTagAttributeCallback( $set ) {
 594+ if( isset( $set[6] ) ) {
 595+ # Illegal #XXXXXX color with no quotes.
 596+ return $set[6];
 597+ } elseif( isset( $set[5] ) ) {
 598+ # No quotes.
 599+ return $set[5];
 600+ } elseif( isset( $set[4] ) ) {
 601+ # Single-quoted
 602+ return $set[4];
 603+ } elseif( isset( $set[3] ) ) {
 604+ # Double-quoted
 605+ return $set[3];
 606+ } elseif( !isset( $set[2] ) ) {
 607+ # In XHTML, attributes must have a value.
 608+ # For 'reduced' form, return explicitly the attribute name here.
 609+ return $set[1];
 610+ } else {
 611+ wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 612+ }
 613+ }
 614+
 615+ /**
562616 * Normalize whitespace and character references in an XML source-
563617 * encoded text for an attribute value.
564618 *
@@ -570,10 +624,11 @@
571625 * @access private
572626 */
573627 function normalizeAttributeValue( $text ) {
574 - return preg_replace(
575 - '/\r\n|[\x20\x0d\x0a\x09]/',
576 - ' ',
577 - Sanitizer::normalizeCharReferences( $text ) );
 628+ return str_replace( '"', '&quot;',
 629+ preg_replace(
 630+ '/\r\n|[\x20\x0d\x0a\x09]/',
 631+ ' ',
 632+ Sanitizer::normalizeCharReferences( $text ) ) );
578633 }
579634
580635 /**
Index: trunk/phase3/RELEASE-NOTES
@@ -239,6 +239,7 @@
240240 * (bug 2173) Fatal error when removing an article with an empty title from the watchlist
241241 * Removed -f parameter from mail() usage, likely to cause failures and bounces.
242242 * (bug 2130) Fixed interwiki links with fragments
 243+* (bug 684) Accept an attribute parameter array on parser hook tags
243244
244245
245246 === Caveats ===

Status & tagging log