r14541 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r14540‎ | r14541 | r14542 >
Date:20:54, 2 June 2006
Author:brion
Status:old
Tags:
Comment:
* Improve handling of ;: definition list construct with overlapping or nested HTML tags
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/Parser.php (modified) (history)
  • /trunk/phase3/maintenance/parserTests.txt (modified) (history)

Diff [purge]

Index: trunk/phase3/maintenance/parserTests.txt
@@ -406,7 +406,28 @@
407407
408408 !! end
409409
 410+!! test
 411+Definition lists: colon in HTML attribute
 412+!! input
 413+;<b style="display: inline">bold</b>
 414+!! result
 415+<dl><dt><b style="display: inline">bold</b>
 416+</dt></dl>
410417
 418+!! end
 419+
 420+
 421+!! test
 422+Definition lists: self-closed tag
 423+!! input
 424+;one<br/>two : two-line fun
 425+!! result
 426+<dl><dt>one<br />two&nbsp;</dt><dd> two-line fun
 427+</dd></dl>
 428+
 429+!! end
 430+
 431+
411432 ###
412433 ### External links
413434 ###
Index: trunk/phase3/includes/Parser.php
@@ -59,6 +59,16 @@
6060 '('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename
6161 );
6262
 63+// State constants for the definition list colon extraction
 64+define( 'MW_COLON_STATE_TEXT', 0 );
 65+define( 'MW_COLON_STATE_TAG', 1 );
 66+define( 'MW_COLON_STATE_TAGSTART', 2 );
 67+define( 'MW_COLON_STATE_CLOSETAG', 3 );
 68+define( 'MW_COLON_STATE_TAGSLASH', 4 );
 69+define( 'MW_COLON_STATE_COMMENT', 5 );
 70+define( 'MW_COLON_STATE_COMMENTDASH', 6 );
 71+define( 'MW_COLON_STATE_COMMENTDASHDASH', 7 );
 72+
6373 /**
6474 * PHP Parser
6575 *
@@ -1963,43 +1973,142 @@
19641974 }
19651975
19661976 /**
1967 - * Split up a string on ':', ignoring any occurences inside
1968 - * <a>..</a> or <span>...</span>
 1977+ * Split up a string on ':', ignoring any occurrences inside tags
 1978+ * to prevent illegal overlapping.
19691979 * @param string $str the string to split
19701980 * @param string &$before set to everything before the ':'
19711981 * @param string &$after set to everything after the ':'
19721982 * @return int|false the position of the ':', or false if none found
19731983 */
19741984 function findColonNoLinks($str, &$before, &$after) {
1975 - # I wonder if we should make this count all tags, not just <a>
1976 - # and <span>. That would prevent us from matching a ':' that
1977 - # comes in the middle of italics other such formatting....
1978 - # -- Wil
19791985 $fname = 'Parser::findColonNoLinks';
19801986 wfProfileIn( $fname );
1981 - $pos = 0;
1982 - do {
1983 - $colon = strpos($str, ':', $pos);
1984 -
1985 - if ($colon !== false) {
1986 - $before = substr($str, 0, $colon);
1987 - $after = substr($str, $colon + 1);
1988 -
1989 - # Skip any ':' within <a> or <span> pairs
1990 - $a = substr_count($before, '<a');
1991 - $s = substr_count($before, '<span');
1992 - $ca = substr_count($before, '</a>');
1993 - $cs = substr_count($before, '</span>');
1994 -
1995 - if ($a <= $ca and $s <= $cs) {
1996 - # Tags are balanced before ':'; ok
 1987+
 1988+ $pos = strpos( $str, ':' );
 1989+ if( $pos === false ) {
 1990+ // Nothing to find!
 1991+ wfProfileOut( $fname );
 1992+ return false;
 1993+ }
 1994+
 1995+ if( strpos( $str, '<' ) === false ) {
 1996+ // Easy; no tag nesting to worry about
 1997+ $before = substr( $str, 0, $pos );
 1998+ $after = substr( $str, $pos+1 );
 1999+ wfProfileOut( $fname );
 2000+ return $pos;
 2001+ }
 2002+
 2003+ // Ugly state machine to walk through avoiding tags.
 2004+ $state = MW_COLON_STATE_TEXT;
 2005+ $stack = 0;
 2006+ $len = strlen( $str );
 2007+ for( $i = 0; $i < $len; $i++ ) {
 2008+ $c = $str{$i};
 2009+
 2010+ switch( $state ) {
 2011+ // (Using the number is a performance hack for common cases)
 2012+ case 0: // MW_COLON_STATE_TEXT:
 2013+ switch( $c ) {
 2014+ case "<":
 2015+ // Could be either a <start> tag or an </end> tag
 2016+ $state = MW_COLON_STATE_TAGSTART;
19972017 break;
 2018+ case ":":
 2019+ if( $stack == 0 ) {
 2020+ // We found it!
 2021+ $before = substr( $str, 0, $i );
 2022+ $after = substr( $str, $i + 1 );
 2023+ wfProfileOut( $fname );
 2024+ return $i;
 2025+ }
 2026+ // Embedded in a tag; don't break it.
 2027+ break;
 2028+ default:
 2029+ // ignore
19982030 }
1999 - $pos = $colon + 1;
 2031+ break;
 2032+ case 1: // MW_COLON_STATE_TAG:
 2033+ // In a <tag>
 2034+ switch( $c ) {
 2035+ case ">":
 2036+ $stack++;
 2037+ $state = MW_COLON_STATE_TEXT;
 2038+ break;
 2039+ case "/":
 2040+ // Slash may be followed by >?
 2041+ $state = MW_COLON_STATE_TAGSLASH;
 2042+ break;
 2043+ default:
 2044+ // ignore
 2045+ }
 2046+ break;
 2047+ case 2: // MW_COLON_STATE_TAGSTART:
 2048+ switch( $c ) {
 2049+ case "/":
 2050+ $state = MW_COLON_STATE_CLOSETAG;
 2051+ break;
 2052+ case "!":
 2053+ $state = MW_COLON_STATE_COMMENT;
 2054+ break;
 2055+ case ">":
 2056+ // Illegal early close? This shouldn't happen D:
 2057+ $state = MW_COLON_STATE_TEXT;
 2058+ break;
 2059+ default:
 2060+ $state = MW_COLON_STATE_TAG;
 2061+ }
 2062+ break;
 2063+ case 3: // MW_COLON_STATE_CLOSETAG:
 2064+ // In a </tag>
 2065+ if( $c == ">" ) {
 2066+ $stack--;
 2067+ if( $stack < 0 ) {
 2068+ wfDebug( "Invalid input in $fname; too many close tags\n" );
 2069+ wfProfileOut( $fname );
 2070+ return false;
 2071+ }
 2072+ $state = MW_COLON_STATE_TEXT;
 2073+ }
 2074+ break;
 2075+ case MW_COLON_STATE_TAGSLASH:
 2076+ if( $c == ">" ) {
 2077+ // Yes, a self-closed tag <blah/>
 2078+ $state = MW_COLON_STATE_TEXT;
 2079+ } else {
 2080+ // Probably we're jumping the gun, and this is an attribute
 2081+ $state = MW_COLON_STATE_TAG;
 2082+ }
 2083+ break;
 2084+ case 5: // MW_COLON_STATE_COMMENT:
 2085+ if( $c == "-" ) {
 2086+ $state = MW_COLON_STATE_COMMENTDASH;
 2087+ }
 2088+ break;
 2089+ case MW_COLON_STATE_COMMENTDASH:
 2090+ if( $c == "-" ) {
 2091+ $state = MW_COLON_STATE_COMMENTDASHDASH;
 2092+ } else {
 2093+ $state = MW_COLON_STATE_COMMENT;
 2094+ }
 2095+ break;
 2096+ case MW_COLON_STATE_COMMENTDASHDASH:
 2097+ if( $c == ">" ) {
 2098+ $state = MW_COLON_STATE_TEXT;
 2099+ } else {
 2100+ $state = MW_COLON_STATE_COMMENT;
 2101+ }
 2102+ break;
 2103+ default:
 2104+ wfDebugDieBacktrace( "State machine error in $fname" );
20002105 }
2001 - } while ($colon !== false);
 2106+ }
 2107+ if( $stack > 0 ) {
 2108+ wfDebug( "Invalid input in $fname; not enough close tags (stack $stack, state $state)\n" );
 2109+ return false;
 2110+ }
20022111 wfProfileOut( $fname );
2003 - return $colon;
 2112+ return false;
20042113 }
20052114
20062115 /**
Index: trunk/phase3/RELEASE-NOTES
@@ -410,7 +410,10 @@
411411 * (bug 6164) Fix regression with <gallery> resetting <ref> state
412412 * Hackaround for IE 7 wrapping bug in MonoBook footer
413413 * New message sp-newimages-showfrom replaces rclistfrom on special:newimages
 414+* Improve handling of ;: definition list construct with overlapping or
 415+ nested HTML tags
414416
 417+
415418 == Compatibility ==
416419
417420 MediaWiki 1.7 requires PHP 5 (5.1 recommended). PHP 4 is no longer supported.

Status & tagging log