r61691 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r61690‎ | r61691 | r61692 >
Date:21:44, 29 January 2010
Author:simetrical
Status:ok
Tags:
Comment:
Refactor $wgEnforceHtmlIds code

Renamed setting to $wgExperimentalHtmlIds, off by default, and updated
the code to enforce the much laxer HTML5 rules. Still needs testing in
various browsers.
Modified paths:
  • /trunk/phase3/includes/DefaultSettings.php (modified) (history)
  • /trunk/phase3/includes/Sanitizer.php (modified) (history)
  • /trunk/phase3/includes/Title.php (modified) (history)
  • /trunk/phase3/includes/parser/Parser.php (modified) (history)
  • /trunk/phase3/maintenance/parserTests.inc (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/parserTests.inc
@@ -657,7 +657,7 @@
658658 'wgDefaultExternalStore' => array(),
659659 'wgForeignFileRepos' => array(),
660660 'wgLinkHolderBatchSize' => $linkHolderBatchSize,
661 - 'wgEnforceHtmlIds' => true,
 661+ 'wgExperimentalHtmlIds' => false,
662662 'wgExternalLinkTarget' => false,
663663 'wgAlwaysUseTidy' => false,
664664 'wgHtml5' => true,
Index: trunk/phase3/includes/parser/Parser.php
@@ -3478,7 +3478,7 @@
34793479 * @private
34803480 */
34813481 function formatHeadings( $text, $origText, $isMain=true ) {
3482 - global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
 3482+ global $wgMaxTocLevel, $wgContLang, $wgExperimentalHtmlIds;
34833483
34843484 $doNumberHeadings = $this->mOptions->getNumberHeadings();
34853485 $showEditLink = $this->mOptions->getEditSection();
@@ -3654,11 +3654,7 @@
36553655 # Save headline for section edit hint before it's escaped
36563656 $headlineHint = $safeHeadline;
36573657
3658 - if ( $wgEnforceHtmlIds ) {
3659 - $legacyHeadline = false;
3660 - $safeHeadline = Sanitizer::escapeId( $safeHeadline,
3661 - 'noninitial' );
3662 - } else {
 3658+ if ( $wgExperimentalHtmlIds ) {
36633659 # For reverse compatibility, provide an id that's
36643660 # HTML4-compatible, like we used to.
36653661 #
@@ -3670,20 +3666,17 @@
36713667 # to type in section names like "abc_.D7.93.D7.90.D7.A4"
36723668 # manually, so let's not bother worrying about it.
36733669 $legacyHeadline = Sanitizer::escapeId( $safeHeadline,
3674 - 'noninitial' );
3675 - $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
 3670+ array( 'noninitial', 'legacy' ) );
 3671+ $safeHeadline = Sanitizer::escapeId( $safeHeadline );
36763672
36773673 if ( $legacyHeadline == $safeHeadline ) {
36783674 # No reason to have both (in fact, we can't)
36793675 $legacyHeadline = false;
3680 - } elseif ( $legacyHeadline != Sanitizer::escapeId(
3681 - $legacyHeadline, 'xml' ) ) {
3682 - # The legacy id is invalid XML. We used to allow this, but
3683 - # there's no reason to do so anymore. Backward
3684 - # compatibility will fail slightly in this case, but it's
3685 - # no big deal.
3686 - $legacyHeadline = false;
36873676 }
 3677+ } else {
 3678+ $legacyHeadline = false;
 3679+ $safeHeadline = Sanitizer::escapeId( $safeHeadline,
 3680+ 'noninitial' );
36883681 }
36893682
36903683 # HTML names must be case-insensitively unique (bug 10721). FIXME:
@@ -3711,7 +3704,7 @@
37123705 # Don't number the heading if it is the only one (looks silly)
37133706 if( $doNumberHeadings && count( $matches[3] ) > 1) {
37143707 # the two are different if the line contains a link
3715 - $headline=$numbering . ' ' . $headline;
 3708+ $headline = $numbering . ' ' . $headline;
37163709 }
37173710
37183711 # Create the anchor for linking from the TOC to the section
Index: trunk/phase3/includes/Sanitizer.php
@@ -651,9 +651,7 @@
652652 }
653653
654654 if ( $attribute === 'id' ) {
655 - global $wgEnforceHtmlIds;
656 - $value = Sanitizer::escapeId( $value,
657 - $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 655+ $value = Sanitizer::escapeId( $value, 'noninitial' );
658656 }
659657
660658 //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
@@ -851,63 +849,62 @@
852850 }
853851
854852 /**
855 - * Given a value escape it so that it can be used in an id attribute and
856 - * return it, this does not validate the value however (see first link)
 853+ * Given a value, escape it so that it can be used in an id attribute and
 854+ * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is
 855+ * true, allowing anything but ASCII whitespace. Otherwise it will use
 856+ * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
 857+ * escaped with lots of dots.
857858 *
 859+ * To ensure we don't have to bother escaping anything, we also strip ', ",
 860+ * & even if $wgExperimentalIds is true. TODO: Is this the best tactic?
 861+ *
858862 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
859863 * in the id and
860864 * name attributes
861865 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 866+ * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
 867+ * HTML5 definition of id attribute
862868 *
863 - * @param $id String: id to validate
 869+ * @param $id String: id to escape
864870 * @param $options Mixed: string or array of strings (default is array()):
865871 * 'noninitial': This is a non-initial fragment of an id, not a full id,
866872 * so don't pay attention if the first character isn't valid at the
867 - * beginning of an id.
868 - * 'xml': Don't restrict the id to be HTML4-compatible. This option
869 - * allows any alphabetic character to be used, per the XML standard.
870 - * Therefore, it also completely changes the type of escaping: instead
871 - * of weird dot-encoding, runs of invalid characters (mostly
872 - * whitespace) are just compressed into a single underscore.
 873+ * beginning of an id. Only matters if $wgExperimentalHtmlIds is
 874+ * false.
 875+ * 'legacy': Behave the way the old HTML 4-based ID escaping worked even
 876+ * if $wgExperimentalHtmlIds is used, so we can generate extra
 877+ * anchors and links won't break.
873878 * @return String
874879 */
875880 static function escapeId( $id, $options = array() ) {
 881+ global $wgExperimentalHtmlIds;
876882 $options = (array)$options;
877883
878 - if ( !in_array( 'xml', $options ) ) {
879 - # HTML4-style escaping
880 - static $replace = array(
881 - '%3A' => ':',
882 - '%' => '.'
883 - );
884 -
885 - $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
886 - $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
887 -
888 - if ( !preg_match( '/^[a-zA-Z]/', $id )
889 - && !in_array( 'noninitial', $options ) ) {
890 - // Initial character must be a letter!
891 - $id = "x$id";
 884+ if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
 885+ $id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id );
 886+ $id = trim( $id, '_' );
 887+ if ( $id === '' ) {
 888+ # Must have been all whitespace to start with.
 889+ return '_';
 890+ } else {
 891+ return $id;
892892 }
893 - return $id;
894893 }
895894
896 - # XML-style escaping. For the patterns used, see the XML 1.0 standard,
897 - # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
898 - $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
899 - . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
900 - . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
901 - $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
902 - . '\x{203F}-\x{2040}';
903 - # Replace _ as well so we don't get multiple consecutive underscores
904 - $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
905 - $id = trim( $id, '_' );
 895+ # HTML4-style escaping
 896+ static $replace = array(
 897+ '%3A' => ':',
 898+ '%' => '.'
 899+ );
906900
907 - if ( !preg_match( "/^[$nameStartChar]/u", $id )
908 - && !in_array( 'noninitial', $options ) ) {
909 - $id = "_$id";
910 - }
 901+ $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 902+ $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
911903
 904+ if ( !preg_match( '/^[a-zA-Z]/', $id )
 905+ && !in_array( 'noninitial', $options ) ) {
 906+ // Initial character must be a letter!
 907+ $id = "x$id";
 908+ }
912909 return $id;
913910 }
914911
Index: trunk/phase3/includes/Title.php
@@ -501,13 +501,11 @@
502502 * Escape a text fragment, say from a link, for a URL
503503 */
504504 static function escapeFragmentForURL( $fragment ) {
505 - global $wgEnforceHtmlIds;
506505 # Note that we don't urlencode the fragment. urlencoded Unicode
507506 # fragments appear not to work in IE (at least up to 7) or in at least
508507 # one version of Opera 9.x. The W3C validator, for one, doesn't seem
509508 # to care if they aren't encoded.
510 - return Sanitizer::escapeId( $fragment,
511 - $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 509+ return Sanitizer::escapeId( $fragment, 'noninitial' );
512510 }
513511
514512 #----------------------------------------------------------------------------
Index: trunk/phase3/includes/DefaultSettings.php
@@ -4142,12 +4142,11 @@
41434143 $wgUniversalEditButton = true;
41444144
41454145 /**
4146 - * Allow id's that don't conform to HTML4 backward compatibility requirements.
4147 - * This is purely experimental, has multiple known flaws, and will likely be
4148 - * renamed and reconcepted based on HTML5 in the future, so should not be used
4149 - * except for testing.
 4146+ * Should we allow a broader set of characters in id attributes, per HTML5? If
 4147+ * not, use only HTML 4-compatible IDs. This option is for testing -- when the
 4148+ * functionality is ready, it will be on by default with no option.
41504149 */
4151 -$wgEnforceHtmlIds = true;
 4150+$wgExperimentalHtmlIds = false;
41524151
41534152 /**
41544153 * Search form behavior

Status & tagging log