Index: trunk/phase3/maintenance/parserTests.txt |
— | — | @@ -406,7 +406,28 @@ |
407 | 407 | |
408 | 408 | !! end |
409 | 409 | |
| 410 | +!! test |
| 411 | +Definition lists: colon in HTML attribute |
| 412 | +!! input |
| 413 | +;<b style="display: inline">bold</b> |
| 414 | +!! result |
| 415 | +<dl><dt><b style="display: inline">bold</b> |
| 416 | +</dt></dl> |
410 | 417 | |
| 418 | +!! end |
| 419 | + |
| 420 | + |
| 421 | +!! test |
| 422 | +Definition lists: self-closed tag |
| 423 | +!! input |
| 424 | +;one<br/>two : two-line fun |
| 425 | +!! result |
| 426 | +<dl><dt>one<br />two </dt><dd> two-line fun |
| 427 | +</dd></dl> |
| 428 | + |
| 429 | +!! end |
| 430 | + |
| 431 | + |
411 | 432 | ### |
412 | 433 | ### External links |
413 | 434 | ### |
Index: trunk/phase3/includes/Parser.php |
— | — | @@ -59,6 +59,16 @@ |
60 | 60 | '('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename |
61 | 61 | ); |
62 | 62 | |
| 63 | +// State constants for the definition list colon extraction
| 64 | +define( 'MW_COLON_STATE_TEXT', 0 ); // plain text; the only state in which ':' can match
| 65 | +define( 'MW_COLON_STATE_TAG', 1 ); // inside an opening <tag ...>
| 66 | +define( 'MW_COLON_STATE_TAGSTART', 2 ); // just saw '<'; kind of tag not yet known
| 67 | +define( 'MW_COLON_STATE_CLOSETAG', 3 ); // inside a closing </tag>
| 68 | +define( 'MW_COLON_STATE_TAGSLASH', 4 ); // saw '/' inside a tag; '>' next means self-closed
| 69 | +define( 'MW_COLON_STATE_COMMENT', 5 ); // entered after '<!'; scanning for the closing '-->'
| 70 | +define( 'MW_COLON_STATE_COMMENTDASH', 6 ); // saw one '-' while in a comment
| 71 | +define( 'MW_COLON_STATE_COMMENTDASHDASH', 7 ); // saw '--' while in a comment; '>' ends it
| 72 | + |
63 | 73 | /** |
64 | 74 | * PHP Parser |
65 | 75 | * |
— | — | @@ -1963,43 +1973,142 @@ |
1964 | 1974 | } |
1965 | 1975 | |
1966 | 1976 | /**
1967 | | - * Split up a string on ':', ignoring any occurences inside
1968 | | - * <a>..</a> or <span>...</span>
| 1977 | + * Split up a string on ':', ignoring any occurrences inside tags
| 1978 | + * to prevent illegal overlapping.
1969 | 1979 | * @param string $str the string to split
1970 | 1980 | * @param string &$before set to everything before the ':'
1971 | 1981 | * @param string &$after set to everything after the ':'
1972 | 1982 | * return string the position of the ':', or false if none found
1973 | 1983 | */
1974 | 1984 | function findColonNoLinks($str, &$before, &$after) {
1975 | | - # I wonder if we should make this count all tags, not just <a>
1976 | | - # and <span>. That would prevent us from matching a ':' that
1977 | | - # comes in the middle of italics other such formatting....
1978 | | - # -- Wil
1979 | 1985 | $fname = 'Parser::findColonNoLinks';
1980 | 1986 | wfProfileIn( $fname );
1981 | | - $pos = 0;
1982 | | - do {
1983 | | - $colon = strpos($str, ':', $pos);
1984 | | - |
1985 | | - if ($colon !== false) {
1986 | | - $before = substr($str, 0, $colon);
1987 | | - $after = substr($str, $colon + 1);
1988 | | - |
1989 | | - # Skip any ':' within <a> or <span> pairs
1990 | | - $a = substr_count($before, '<a');
1991 | | - $s = substr_count($before, '<span');
1992 | | - $ca = substr_count($before, '</a>');
1993 | | - $cs = substr_count($before, '</span>');
1994 | | - |
1995 | | - if ($a <= $ca and $s <= $cs) {
1996 | | - # Tags are balanced before ':'; ok
| 1987 | + |
| 1988 | +$pos = strpos( $str, ':' );
| 1989 | +if( $pos === false ) {
| 1990 | +// Nothing to find!
| 1991 | +wfProfileOut( $fname );
| 1992 | +return false;
| 1993 | +}
| 1994 | + |
| 1995 | +if( strpos( $str, '<' ) === false ) {
| 1996 | +// Easy; no tag nesting to worry about, so the first ':' is the split point
| 1997 | +$before = substr( $str, 0, $pos );
| 1998 | +$after = substr( $str, $pos+1 );
| 1999 | +wfProfileOut( $fname );
| 2000 | +return $pos;
| 2001 | +}
| 2002 | + |
| 2003 | +// Ugly state machine to walk through avoiding tags.
| 2004 | +$state = MW_COLON_STATE_TEXT;
| 2005 | +$stack = 0; // Count of opened-but-not-yet-closed tags
| 2006 | +$len = strlen( $str );
| 2007 | +for( $i = 0; $i < $len; $i++ ) {
| 2008 | +$c = $str[$i];
| 2009 | + |
| 2010 | +switch( $state ) {
| 2011 | +// (Using the number is a performance hack for common cases)
| 2012 | +case 0: // MW_COLON_STATE_TEXT:
| 2013 | +switch( $c ) {
| 2014 | +case "<":
| 2015 | +// Could be either a <start> tag or an </end> tag
| 2016 | +$state = MW_COLON_STATE_TAGSTART;
1997 | 2017 | break;
| 2018 | +case ":":
| 2019 | +if( $stack == 0 ) {
| 2020 | +// We found it!
| 2021 | +$before = substr( $str, 0, $i );
| 2022 | +$after = substr( $str, $i + 1 );
| 2023 | +wfProfileOut( $fname );
| 2024 | +return $i;
| 2025 | +}
| 2026 | +// Embedded in a tag; don't break it.
| 2027 | +break;
| 2028 | +default:
| 2029 | +// ignore
1998 | 2030 | }
1999 | | - $pos = $colon + 1;
| 2031 | +break;
| 2032 | +case 1: // MW_COLON_STATE_TAG:
| 2033 | +// In a <tag>
| 2034 | +switch( $c ) {
| 2035 | +case ">":
| 2036 | +$stack++; // Opening tag completed; one more element is now open
| 2037 | +$state = MW_COLON_STATE_TEXT;
| 2038 | +break;
| 2039 | +case "/":
| 2040 | +// Slash may be followed by >?
| 2041 | +$state = MW_COLON_STATE_TAGSLASH;
| 2042 | +break;
| 2043 | +default:
| 2044 | +// ignore
| 2045 | +}
| 2046 | +break;
| 2047 | +case 2: // MW_COLON_STATE_TAGSTART:
| 2048 | +switch( $c ) {
| 2049 | +case "/":
| 2050 | +$state = MW_COLON_STATE_CLOSETAG;
| 2051 | +break;
| 2052 | +case "!":
| 2053 | +$state = MW_COLON_STATE_COMMENT;
| 2054 | +break;
| 2055 | +case ">":
| 2056 | +// Illegal early close? This shouldn't happen D:
| 2057 | +$state = MW_COLON_STATE_TEXT;
| 2058 | +break;
| 2059 | +default:
| 2060 | +$state = MW_COLON_STATE_TAG;
| 2061 | +}
| 2062 | +break;
| 2063 | +case 3: // MW_COLON_STATE_CLOSETAG:
| 2064 | +// In a </tag>
| 2065 | +if( $c == ">" ) {
| 2066 | +$stack--;
| 2067 | +if( $stack < 0 ) {
| 2068 | +wfDebug( "Invalid input in $fname; too many close tags\n" );
| 2069 | +wfProfileOut( $fname );
| 2070 | +return false;
| 2071 | +}
| 2072 | +$state = MW_COLON_STATE_TEXT;
| 2073 | +}
| 2074 | +break;
| 2075 | +case MW_COLON_STATE_TAGSLASH:
| 2076 | +if( $c == ">" ) {
| 2077 | +// Yes, a self-closed tag <blah/>; nothing was opened, so $stack is untouched
| 2078 | +$state = MW_COLON_STATE_TEXT;
| 2079 | +} else {
| 2080 | +// Probably we're jumping the gun, and this is an attribute
| 2081 | +$state = MW_COLON_STATE_TAG;
| 2082 | +}
| 2083 | +break;
| 2084 | +case 5: // MW_COLON_STATE_COMMENT:
| 2085 | +if( $c == "-" ) {
| 2086 | +$state = MW_COLON_STATE_COMMENTDASH;
| 2087 | +}
| 2088 | +break;
| 2089 | +case MW_COLON_STATE_COMMENTDASH:
| 2090 | +if( $c == "-" ) {
| 2091 | +$state = MW_COLON_STATE_COMMENTDASHDASH;
| 2092 | +} else {
| 2093 | +$state = MW_COLON_STATE_COMMENT;
| 2094 | +}
| 2095 | +break;
| 2096 | +case MW_COLON_STATE_COMMENTDASHDASH:
| 2097 | +if( $c == ">" ) {
| 2098 | +$state = MW_COLON_STATE_TEXT;
| 2099 | +} else {
| 2100 | +$state = MW_COLON_STATE_COMMENT;
| 2101 | +}
| 2102 | +break;
| 2103 | +default:
| 2104 | +wfDebugDieBacktrace( "State machine error in $fname" );
2000 | 2105 | }
2001 | | - } while ($colon !== false);
| 2106 | +}
| 2107 | +if( $stack > 0 ) {
| 2108 | +wfDebug( "Invalid input in $fname; not enough close tags (stack $stack, state $state)\n" );
| 2109 | +// No early return here: fall through so wfProfileOut() below always pairs with wfProfileIn()
| 2110 | +}
2002 | 2111 | wfProfileOut( $fname );
2003 | | - return $colon;
| 2112 | +return false;
2004 | 2113 | }
2005 | 2114 | |
2006 | 2115 | /** |
Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -410,7 +410,10 @@ |
411 | 411 | * (bug 6164) Fix regression with <gallery> resetting <ref> state |
412 | 412 | * Hackaround for IE 7 wrapping bug in MonoBook footer |
413 | 413 | * New message sp-newimages-showfrom replaces rclistfrom on special:newimages |
| 414 | +* Improve handling of ;: definition list construct with overlapping or |
| 415 | + nested HTML tags |
414 | 416 | |
| 417 | + |
415 | 418 | == Compatibility == |
416 | 419 | |
417 | 420 | MediaWiki 1.7 requires PHP 5 (5.1 recommended). PHP 4 is no longer supported. |