Index: trunk/phase3/maintenance/parserTests.inc |
— | — | @@ -657,7 +657,7 @@ |
658 | 658 | 'wgDefaultExternalStore' => array(), |
659 | 659 | 'wgForeignFileRepos' => array(), |
660 | 660 | 'wgLinkHolderBatchSize' => $linkHolderBatchSize, |
661 | | - 'wgEnforceHtmlIds' => true, |
| 661 | + 'wgExperimentalHtmlIds' => false, |
662 | 662 | 'wgExternalLinkTarget' => false, |
663 | 663 | 'wgAlwaysUseTidy' => false, |
664 | 664 | 'wgHtml5' => true, |
Index: trunk/phase3/includes/parser/Parser.php |
— | — | @@ -3478,7 +3478,7 @@ |
3479 | 3479 | * @private |
3480 | 3480 | */ |
3481 | 3481 | function formatHeadings( $text, $origText, $isMain=true ) { |
3482 | | - global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds; |
| 3482 | + global $wgMaxTocLevel, $wgContLang, $wgExperimentalHtmlIds; |
3483 | 3483 | |
3484 | 3484 | $doNumberHeadings = $this->mOptions->getNumberHeadings(); |
3485 | 3485 | $showEditLink = $this->mOptions->getEditSection(); |
— | — | @@ -3654,11 +3654,7 @@ |
3655 | 3655 | # Save headline for section edit hint before it's escaped |
3656 | 3656 | $headlineHint = $safeHeadline; |
3657 | 3657 | |
3658 | | - if ( $wgEnforceHtmlIds ) { |
3659 | | - $legacyHeadline = false; |
3660 | | - $safeHeadline = Sanitizer::escapeId( $safeHeadline, |
3661 | | - 'noninitial' ); |
3662 | | - } else { |
| 3658 | + if ( $wgExperimentalHtmlIds ) { |
3663 | 3659 | # For reverse compatibility, provide an id that's |
3664 | 3660 | # HTML4-compatible, like we used to. |
3665 | 3661 | # |
— | — | @@ -3670,20 +3666,17 @@ |
3671 | 3667 | # to type in section names like "abc_.D7.93.D7.90.D7.A4" |
3672 | 3668 | # manually, so let's not bother worrying about it. |
3673 | 3669 | $legacyHeadline = Sanitizer::escapeId( $safeHeadline, |
3674 | | - 'noninitial' ); |
3675 | | - $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' ); |
| 3670 | + array( 'noninitial', 'legacy' ) ); |
| 3671 | + $safeHeadline = Sanitizer::escapeId( $safeHeadline ); |
3676 | 3672 | |
3677 | 3673 | if ( $legacyHeadline == $safeHeadline ) { |
3678 | 3674 | # No reason to have both (in fact, we can't) |
3679 | 3675 | $legacyHeadline = false; |
3680 | | - } elseif ( $legacyHeadline != Sanitizer::escapeId( |
3681 | | - $legacyHeadline, 'xml' ) ) { |
3682 | | - # The legacy id is invalid XML. We used to allow this, but |
3683 | | - # there's no reason to do so anymore. Backward |
3684 | | - # compatibility will fail slightly in this case, but it's |
3685 | | - # no big deal. |
3686 | | - $legacyHeadline = false; |
3687 | 3676 | } |
| 3677 | + } else { |
| 3678 | + $legacyHeadline = false; |
| 3679 | + $safeHeadline = Sanitizer::escapeId( $safeHeadline, |
| 3680 | + 'noninitial' ); |
3688 | 3681 | } |
3689 | 3682 | |
3690 | 3683 | # HTML names must be case-insensitively unique (bug 10721). FIXME: |
— | — | @@ -3711,7 +3704,7 @@ |
3712 | 3705 | # Don't number the heading if it is the only one (looks silly) |
3713 | 3706 | if( $doNumberHeadings && count( $matches[3] ) > 1) { |
3714 | 3707 | # the two are different if the line contains a link |
3715 | | - $headline=$numbering . ' ' . $headline; |
| 3708 | + $headline = $numbering . ' ' . $headline; |
3716 | 3709 | } |
3717 | 3710 | |
3718 | 3711 | # Create the anchor for linking from the TOC to the section |
Index: trunk/phase3/includes/Sanitizer.php |
— | — | @@ -651,9 +651,7 @@ |
652 | 652 | } |
653 | 653 | |
654 | 654 | if ( $attribute === 'id' ) { |
655 | | - global $wgEnforceHtmlIds; |
656 | | - $value = Sanitizer::escapeId( $value, |
657 | | - $wgEnforceHtmlIds ? 'noninitial' : 'xml' ); |
| 655 | + $value = Sanitizer::escapeId( $value, 'noninitial' ); |
658 | 656 | } |
659 | 657 | |
660 | 658 | //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity |
— | — | @@ -851,63 +849,62 @@ |
852 | 850 | } |
853 | 851 | |
854 | 852 | /** |
855 | | - * Given a value escape it so that it can be used in an id attribute and |
856 | | - * return it, this does not validate the value however (see first link) |
| 853 | + * Given a value, escape it so that it can be used in an id attribute and |
| 854 | + * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is |
| 855 | + * true, allowing anything but ASCII whitespace. Otherwise it will use |
| 856 | + * HTML 4 rules, which means a narrow subset of ASCII, with bad characters |
| 857 | + * escaped with lots of dots. |
857 | 858 | * |
| 859 | + * To ensure we don't have to bother escaping anything, we also strip ', ", |
| 860 | + * & even if $wgExperimentalIds is true. TODO: Is this the best tactic? |
| 861 | + * |
858 | 862 | * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters |
859 | 863 | * in the id and |
860 | 864 | * name attributes |
861 | 865 | * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute |
| 866 | + * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute |
| 867 | + * HTML5 definition of id attribute |
862 | 868 | * |
863 | | - * @param $id String: id to validate |
| 869 | + * @param $id String: id to escape |
864 | 870 | * @param $options Mixed: string or array of strings (default is array()): |
865 | 871 | * 'noninitial': This is a non-initial fragment of an id, not a full id, |
866 | 872 | * so don't pay attention if the first character isn't valid at the |
867 | | - * beginning of an id. |
868 | | - * 'xml': Don't restrict the id to be HTML4-compatible. This option |
869 | | - * allows any alphabetic character to be used, per the XML standard. |
870 | | - * Therefore, it also completely changes the type of escaping: instead |
871 | | - * of weird dot-encoding, runs of invalid characters (mostly |
872 | | - * whitespace) are just compressed into a single underscore. |
| 873 | + * beginning of an id. Only matters if $wgExperimentalHtmlIds is |
| 874 | + * false. |
| 875 | + * 'legacy': Behave the way the old HTML 4-based ID escaping worked even |
| 876 | + * if $wgExperimentalHtmlIds is used, so we can generate extra |
| 877 | + * anchors and links won't break. |
873 | 878 | * @return String |
874 | 879 | */ |
875 | 880 | static function escapeId( $id, $options = array() ) { |
| 881 | + global $wgExperimentalHtmlIds; |
876 | 882 | $options = (array)$options; |
877 | 883 | |
878 | | - if ( !in_array( 'xml', $options ) ) { |
879 | | - # HTML4-style escaping |
880 | | - static $replace = array( |
881 | | - '%3A' => ':', |
882 | | - '%' => '.' |
883 | | - ); |
884 | | - |
885 | | - $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); |
886 | | - $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); |
887 | | - |
888 | | - if ( !preg_match( '/^[a-zA-Z]/', $id ) |
889 | | - && !in_array( 'noninitial', $options ) ) { |
890 | | - // Initial character must be a letter! |
891 | | - $id = "x$id"; |
| 884 | + if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { |
| 885 | + $id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id ); |
| 886 | + $id = trim( $id, '_' ); |
| 887 | + if ( $id === '' ) { |
| 888 | + # Must have been all whitespace to start with. |
| 889 | + return '_'; |
| 890 | + } else { |
| 891 | + return $id; |
892 | 892 | } |
893 | | - return $id; |
894 | 893 | } |
895 | 894 | |
896 | | - # XML-style escaping. For the patterns used, see the XML 1.0 standard, |
897 | | - # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/> |
898 | | - $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}' |
899 | | - . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}' |
900 | | - . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; |
901 | | - $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}' |
902 | | - . '\x{203F}-\x{2040}'; |
903 | | - # Replace _ as well so we don't get multiple consecutive underscores |
904 | | - $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id ); |
905 | | - $id = trim( $id, '_' ); |
| 895 | + # HTML4-style escaping |
| 896 | + static $replace = array( |
| 897 | + '%3A' => ':', |
| 898 | + '%' => '.' |
| 899 | + ); |
906 | 900 | |
907 | | - if ( !preg_match( "/^[$nameStartChar]/u", $id ) |
908 | | - && !in_array( 'noninitial', $options ) ) { |
909 | | - $id = "_$id"; |
910 | | - } |
| 901 | + $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); |
| 902 | + $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); |
911 | 903 | |
| 904 | + if ( !preg_match( '/^[a-zA-Z]/', $id ) |
| 905 | + && !in_array( 'noninitial', $options ) ) { |
| 906 | + // Initial character must be a letter! |
| 907 | + $id = "x$id"; |
| 908 | + } |
912 | 909 | return $id; |
913 | 910 | } |
914 | 911 | |
Index: trunk/phase3/includes/Title.php |
— | — | @@ -501,13 +501,11 @@ |
502 | 502 | * Escape a text fragment, say from a link, for a URL |
503 | 503 | */ |
504 | 504 | static function escapeFragmentForURL( $fragment ) { |
505 | | - global $wgEnforceHtmlIds; |
506 | 505 | # Note that we don't urlencode the fragment. urlencoded Unicode |
507 | 506 | # fragments appear not to work in IE (at least up to 7) or in at least |
508 | 507 | # one version of Opera 9.x. The W3C validator, for one, doesn't seem |
509 | 508 | # to care if they aren't encoded. |
510 | | - return Sanitizer::escapeId( $fragment, |
511 | | - $wgEnforceHtmlIds ? 'noninitial' : 'xml' ); |
| 509 | + return Sanitizer::escapeId( $fragment, 'noninitial' ); |
512 | 510 | } |
513 | 511 | |
514 | 512 | #---------------------------------------------------------------------------- |
Index: trunk/phase3/includes/DefaultSettings.php |
— | — | @@ -4142,12 +4142,11 @@ |
4143 | 4143 | $wgUniversalEditButton = true; |
4144 | 4144 | |
4145 | 4145 | /** |
4146 | | - * Allow id's that don't conform to HTML4 backward compatibility requirements. |
4147 | | - * This is purely experimental, has multiple known flaws, and will likely be |
4148 | | - * renamed and reconcepted based on HTML5 in the future, so should not be used |
4149 | | - * except for testing. |
| 4146 | + * Should we allow a broader set of characters in id attributes, per HTML5? If |
| 4147 | + * not, use only HTML 4-compatible IDs. This option is for testing -- when the |
| 4148 | + * functionality is ready, it will be on by default with no option. |
4150 | 4149 | */ |
4151 | | -$wgEnforceHtmlIds = true; |
| 4150 | +$wgExperimentalHtmlIds = false; |
4152 | 4151 | |
4153 | 4152 | /** |
4154 | 4153 | * Search form behavior |