Index: trunk/phase3/includes/Parser.php |
— | — | @@ -258,32 +258,51 @@ |
259 | 259 | * @access private |
260 | 260 | * @static |
261 | 261 | */ |
262 | | - function extractTags($tag, $text, &$content, $uniq_prefix = ''){ |
| 262 | + function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){ |
263 | 263 | $rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString(); |
264 | 264 | if ( !$content ) { |
265 | 265 | $content = array( ); |
266 | 266 | } |
267 | 267 | $n = 1; |
268 | 268 | $stripped = ''; |
| 269 | + |
| 270 | + if ( !$tags ) { |
| 271 | + $tags = array( ); |
| 272 | + } |
| 273 | + |
| 274 | + if ( !$params ) { |
| 275 | + $params = array( ); |
| 276 | + } |
269 | 277 | |
| 278 | + if( $tag == STRIP_COMMENTS ) { |
| 279 | + $start = '/<!--()/'; |
| 280 | + $end = '/-->/'; |
| 281 | + } else { |
| 282 | + $start = "/<$tag([^>]*)>/i"; |
| 283 | + $end = "/<\\/$tag\\s*>/i"; |
| 284 | + } |
| 285 | + |
270 | 286 | while ( '' != $text ) { |
271 | | - if($tag==STRIP_COMMENTS) { |
272 | | - $p = preg_split( '/<!--/', $text, 2 ); |
273 | | - } else { |
274 | | - $p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 ); |
| 287 | + $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE ); |
| 288 | + $stripped .= $p[0]; |
| 289 | + if( count( $p ) < 3 ) { |
| 290 | + break; |
275 | 291 | } |
276 | | - $stripped .= $p[0]; |
277 | | - if ( ( count( $p ) < 2 ) || ( '' == $p[1] ) ) { |
278 | | - $text = ''; |
| 292 | + $attributes = $p[1]; |
| 293 | + $inside = $p[2]; |
| 294 | + |
| 295 | + $marker = $rnd . sprintf('%08X', $n++); |
| 296 | + $stripped .= $marker; |
| 297 | + |
| 298 | + $tags[$marker] = "<$tag$attributes>"; |
| 299 | + $params[$marker] = Sanitizer::decodeTagAttributes( $attributes ); |
| 300 | + |
| 301 | + $q = preg_split( $end, $inside, 2 ); |
| 302 | + $content[$marker] = $q[0]; |
| 303 | + if( count( $q ) < 1 ) { |
| 304 | + # No end tag -- let it run out to the end of the text. |
| 305 | + break; |
279 | 306 | } else { |
280 | | - if($tag==STRIP_COMMENTS) { |
281 | | - $q = preg_split( '/-->/i', $p[1], 2 ); |
282 | | - } else { |
283 | | - $q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 ); |
284 | | - } |
285 | | - $marker = $rnd . sprintf('%08X', $n++); |
286 | | - $content[$marker] = $q[0]; |
287 | | - $stripped .= $marker; |
288 | 307 | $text = $q[1]; |
289 | 308 | } |
290 | 309 | } |
— | — | @@ -291,6 +310,22 @@ |
292 | 311 | } |
293 | 312 | |
294 | 313 | /** |
| 314 | + * Wrapper function for extractTagsAndParams |
| 315 | + * for cases where $tags and $params aren't needed |
| 316 | + * i.e. where tags will never have params, like <nowiki> |
| 317 | + * |
| 318 | + * @access private |
| 319 | + * @static |
| 320 | + */ |
| 321 | + function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) { |
| 322 | + $dummy_tags = array(); |
| 323 | + $dummy_params = array(); |
| 324 | + |
| 325 | + return Parser::extractTagsAndParams( $tag, $text, $content, |
| 326 | + $dummy_tags, $dummy_params, $uniq_prefix ); |
| 327 | + } |
| 328 | + |
| 329 | + /** |
295 | 330 | * Strips and renders nowiki, pre, math, hiero |
296 | 331 | * If $render is set, performs necessary rendering operations on plugins |
297 | 332 | * Returns the text, and fills an array with data needed in unstrip() |
— | — | @@ -311,6 +346,8 @@ |
312 | 347 | $pre_content = array(); |
313 | 348 | $comment_content = array(); |
314 | 349 | $ext_content = array(); |
| 350 | + $ext_tags = array(); |
| 351 | + $ext_params = array(); |
315 | 352 | $gallery_content = array(); |
316 | 353 | |
317 | 354 | # Replace any instances of the placeholders |
— | — | @@ -387,12 +424,15 @@ |
388 | 425 | # Extensions |
389 | 426 | foreach ( $this->mTagHooks as $tag => $callback ) { |
390 | 427 | $ext_content[$tag] = array(); |
391 | | - $text = Parser::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix ); |
| 428 | + $text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag], |
| 429 | + $ext_tags[$tag], $ext_params[$tag], $uniq_prefix ); |
392 | 430 | foreach( $ext_content[$tag] as $marker => $content ) { |
| 431 | + $full_tag = $ext_tags[$tag][$marker]; |
| 432 | + $params = $ext_params[$tag][$marker]; |
393 | 433 | if ( $render ) { |
394 | | - $ext_content[$tag][$marker] = $callback( $content ); |
| 434 | + $ext_content[$tag][$marker] = $callback( $content, $params ); |
395 | 435 | } else { |
396 | | - $ext_content[$tag][$marker] = "<$tag>$content</$tag>"; |
| 436 | + $ext_content[$tag][$marker] = "$full_tag$content</$tag>"; |
397 | 437 | } |
398 | 438 | } |
399 | 439 | } |
Index: trunk/phase3/includes/Sanitizer.php |
— | — | @@ -37,6 +37,27 @@ |
38 | 38 | |(&)/x' ); |
39 | 39 | |
40 | 40 | /** |
| 41 | + * Regular expression to match HTML/XML attribute pairs within a tag. |
| 42 | + * Allows some... latitude. |
| 43 | + * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes |
| 44 | + */ |
| 45 | +$attrib = '[A-Za-z0-9]'; |
| 46 | +$space = '[\x09\x0a\x0d\x20]'; |
| 47 | +define( 'MW_ATTRIBS_REGEX', |
| 48 | + "/(?:^|$space)($attrib+) |
| 49 | + ($space*=$space* |
| 50 | + (?: |
| 51 | + # The attribute value: quoted or alone |
| 52 | + \"([^<\"]*)\" |
| 53 | + | '([^<']*)' |
| 54 | + | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) |
| 55 | + | (\#[0-9a-fA-F]+) # Technically wrong, but lots of |
| 56 | + # colors are specified like this. |
| 57 | + # We'll be normalizing it. |
| 58 | + ) |
| 59 | + )?(?=$space|\$)/sx" ); |
| 60 | + |
| 61 | +/** |
41 | 62 | * List of all named character entities defined in HTML 4.01 |
42 | 63 | * http://www.w3.org/TR/html4/sgml/entities.html |
43 | 64 | * @access private |
— | — | @@ -490,21 +511,8 @@ |
491 | 512 | # Unquoted attribute |
492 | 513 | # Since we quote this later, this can be anything distinguishable |
493 | 514 | # from the end of the attribute |
494 | | - $attrib = '[A-Za-z0-9]'; |
495 | | - $space = '[\x09\x0a\x0d\x20]'; |
496 | 515 | if( !preg_match_all( |
497 | | - "/(?:^|$space)($attrib+) |
498 | | - ($space*=$space* |
499 | | - (?: |
500 | | - # The attribute value: quoted or alone |
501 | | - \"([^<\"]*)\" |
502 | | - | '([^<']*)' |
503 | | - | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) |
504 | | - | (\#[0-9a-fA-F]+) # Technically wrong, but lots of |
505 | | - # colors are specified like this. |
506 | | - # We'll be normalizing it. |
507 | | - ) |
508 | | - )?(?=$space|\$)/sx", |
| 516 | + MW_ATTRIBS_REGEX, |
509 | 517 | $text, |
510 | 518 | $pairs, |
511 | 519 | PREG_SET_ORDER ) ) { |
— | — | @@ -517,26 +525,11 @@ |
518 | 526 | $attribute = strtolower( $set[1] ); |
519 | 527 | if( !isset( $whitelist[$attribute] ) ) { |
520 | 528 | continue; |
521 | | - } elseif( isset( $set[6] ) ) { |
522 | | - # Illegal #XXXXXX color with no quotes. |
523 | | - $value = Sanitizer::normalizeAttributeValue( $set[6] ); |
524 | | - } elseif( isset( $set[5] ) ) { |
525 | | - # No quotes. |
526 | | - $value = Sanitizer::normalizeAttributeValue( $set[5] ); |
527 | | - } elseif( isset( $set[4] ) ) { |
528 | | - # Single-quoted |
529 | | - $value = str_replace( '"', '&quot;', |
530 | | - Sanitizer::normalizeAttributeValue( $set[4] ) ); |
531 | | - } elseif( isset( $set[3] ) ) { |
532 | | - # Double-quoted |
533 | | - $value = Sanitizer::normalizeAttributeValue( $set[3] ); |
534 | | - } elseif( !isset( $set[2] ) ) { |
535 | | - # In XHTML, attributes must have a value. |
536 | | - $value = $set[1]; |
537 | | - } else { |
538 | | - wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." ); |
539 | 529 | } |
540 | 530 | |
| 531 | + $raw = Sanitizer::getTagAttributeCallback( $set ); |
| 532 | + $value = Sanitizer::normalizeAttributeValue( $raw ); |
| 533 | + |
541 | 534 | # Strip javascript "expression" from stylesheets. |
542 | 535 | # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp |
543 | 536 | if( $attribute == 'style' && preg_match( |
— | — | @@ -558,6 +551,67 @@ |
559 | 552 | } |
560 | 553 | |
561 | 554 | /** |
| 555 | + * Return an associative array of attribute names and values from |
| 556 | + * a partial tag string. Attribute names are forced to lowercase, |
| 557 | + * character references are decoded to UTF-8 text. |
| 558 | + * |
| 559 | + * @param string |
| 560 | + * @return array |
| 561 | + */ |
| 562 | + function decodeTagAttributes( $text ) { |
| 563 | + $attribs = array(); |
| 564 | + |
| 565 | + if( trim( $text ) == '' ) { |
| 566 | + return $attribs; |
| 567 | + } |
| 568 | + |
| 569 | + if( !preg_match_all( |
| 570 | + MW_ATTRIBS_REGEX, |
| 571 | + $text, |
| 572 | + $pairs, |
| 573 | + PREG_SET_ORDER ) ) { |
| 574 | + return $attribs; |
| 575 | + } |
| 576 | + |
| 577 | + foreach( $pairs as $set ) { |
| 578 | + $attribute = strtolower( $set[1] ); |
| 579 | + $value = Sanitizer::getTagAttributeCallback( $set ); |
| 580 | + $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); |
| 581 | + } |
| 582 | + return $attribs; |
| 583 | + } |
| 584 | + |
| 585 | + /** |
| 586 | + * Pick the appropriate attribute value from a match set from the |
| 587 | + * MW_ATTRIBS_REGEX matches. |
| 588 | + * |
| 589 | + * @param array $set |
| 590 | + * @return string |
| 591 | + * @access private |
| 592 | + */ |
| 593 | + function getTagAttributeCallback( $set ) { |
| 594 | + if( isset( $set[6] ) ) { |
| 595 | + # Illegal #XXXXXX color with no quotes. |
| 596 | + return $set[6]; |
| 597 | + } elseif( isset( $set[5] ) ) { |
| 598 | + # No quotes. |
| 599 | + return $set[5]; |
| 600 | + } elseif( isset( $set[4] ) ) { |
| 601 | + # Single-quoted |
| 602 | + return $set[4]; |
| 603 | + } elseif( isset( $set[3] ) ) { |
| 604 | + # Double-quoted |
| 605 | + return $set[3]; |
| 606 | + } elseif( !isset( $set[2] ) ) { |
| 607 | + # In XHTML, attributes must have a value. |
| 608 | + # For 'reduced' form, explicitly return the attribute name here. |
| 609 | + return $set[1]; |
| 610 | + } else { |
| 611 | + wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." ); |
| 612 | + } |
| 613 | + } |
| 614 | + |
| 615 | + /** |
562 | 616 | * Normalize whitespace and character references in an XML source- |
563 | 617 | * encoded text for an attribute value. |
564 | 618 | * |
— | — | @@ -570,10 +624,11 @@ |
571 | 625 | * @access private |
572 | 626 | */ |
573 | 627 | function normalizeAttributeValue( $text ) { |
574 | | - return preg_replace( |
575 | | - '/\r\n|[\x20\x0d\x0a\x09]/', |
576 | | - ' ', |
577 | | - Sanitizer::normalizeCharReferences( $text ) ); |
| 628 | + return str_replace( '"', '&quot;', |
| 629 | + preg_replace( |
| 630 | + '/\r\n|[\x20\x0d\x0a\x09]/', |
| 631 | + ' ', |
| 632 | + Sanitizer::normalizeCharReferences( $text ) ) ); |
578 | 633 | } |
579 | 634 | |
580 | 635 | /** |
Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -239,6 +239,7 @@ |
240 | 240 | * (bug 2173) Fatal error when removing an article with an empty title from the watchlist |
241 | 241 | * Removed -f parameter from mail() usage, likely to cause failures and bounces. |
242 | 242 | * (bug 2130) Fixed interwiki links with fragments |
| 243 | +* (bug 684) Accept an attribute parameter array on parser hook tags |
243 | 244 | |
244 | 245 | |
245 | 246 | === Caveats === |