Index: trunk/phase3/includes/OutputPage.php |
— | — | @@ -902,7 +902,7 @@ |
903 | 903 | } |
904 | 904 | |
905 | 905 | # Buffer output; final headers may depend on later processing |
906 | | - ob_start(); |
| 906 | + ob_start( array( 'OutputPage', 'cleanCallback') ); |
907 | 907 | |
908 | 908 | $wgRequest->response()->header( "Content-type: $wgMimeType; charset={$wgOutputEncoding}" ); |
909 | 909 | $wgRequest->response()->header( 'Content-language: '.$wgContLanguageCode ); |
— | — | @@ -924,6 +924,13 @@ |
925 | 925 | wfProfileOut( __METHOD__ ); |
926 | 926 | } |
927 | 927 | |
/**
 * Output-buffer callback that strips byte sequences which are invalid
 * in the configured output encoding before the page is sent.
 *
 * Registered through ob_start( array( 'OutputPage', 'cleanCallback' ) ).
 *
 * @param $s String: the buffered output
 * @return String: the buffer after StringUtils::cleanForCharset()
 */
public static function cleanCallback( $s ) {
	# Without this declaration $wgOutputEncoding would be an undefined
	# local here: a notice is raised and NULL is passed as the charset.
	global $wgOutputEncoding;

	wfProfileIn( __METHOD__ );
	$s = StringUtils::cleanForCharset( $s, $wgOutputEncoding );
	wfProfileOut( __METHOD__ );
	return $s;
}
| 934 | + |
928 | 935 | /** |
929 | 936 | * @todo document |
930 | 937 | * @param string $ins |
Index: trunk/phase3/includes/StringUtils.php |
— | — | @@ -179,6 +179,86 @@ |
180 | 180 | return new ArrayIterator( explode( $separator, $subject ) ); |
181 | 181 | } |
182 | 182 | } |
| 183 | + |
/**
 * Clean characters that are invalid in the given character set
 * from a given string.
 *
 * Only UTF-8 is actually cleaned; for any other character set the
 * string is returned unchanged.
 *
 * @param $string \type{\string} String to clean
 * @param $charset \type{\string} Character set (if unspecified or empty, assume $wgOutputEncoding)
 * @return \type{\string} Cleaned string
 */
public static function cleanForCharset( $string, $charset='' ) {
	global $wgOutputEncoding;

	$effective = $charset ? $charset : $wgOutputEncoding;

	# UTF-8 should be all we need to worry about. :)
	# Charset names are case-insensitive, so 'utf-8' must be cleaned too.
	if ( strtoupper( (string)$effective ) === 'UTF-8' ) {
		return self::cleanUtf8( $string );
	}

	return $string;
}
| 202 | + |
/**
 * Clean invalid UTF-8 characters and sequences from a given string,
 * replacing them with U+FFFD.
 * Should be RFC 3629 compliant.
 *
 * Each of the following is replaced by a single U+FFFD:
 *  - a stray continuation byte or an invalid lead byte (0xFE, 0xFF);
 *  - a multi-byte sequence cut short by a non-continuation byte or by
 *    the end of the string;
 *  - an overlong encoding;
 *  - a 5- or 6-byte sequence, or a code point above U+10FFFF;
 *  - a UTF-16 surrogate (U+D800..U+DFFF), U+FFFE or U+FFFF.
 *
 * @param $str \type{\string} String to clean
 * @return \type{\string} Cleaned string
 */
private static function cleanUtf8( $str ) {
	$len = strlen( $str );
	$left = 0;  # continuation bytes still expected for the current sequence
	$bytes = 0; # length in bytes of the span that may have to be replaced
	$sum = 0;   # code point accumulated so far

	for ( $i = 0; $i < $len; $i++ ) {
		$ch = ord( $str[$i] );

		if ( !$left ) {
			if ( !( $ch & 0x80 ) ) {
				# Plain ASCII
				continue;
			}

			# Number of continuation bytes announced by the lead byte
			if ( ( $ch & 0xFE ) == 0xFC ) {
				$left = 5;
			} elseif ( ( $ch & 0xFC ) == 0xF8 ) {
				$left = 4;
			} elseif ( ( $ch & 0xF8 ) == 0xF0 ) {
				$left = 3;
			} elseif ( ( $ch & 0xF0 ) == 0xE0 ) {
				$left = 2;
			} elseif ( ( $ch & 0xE0 ) == 0xC0 ) {
				$left = 1;
			} else {
				$left = 0;
			}

			if ( $left ) {
				$bytes = $left + 1;
				# Keep only the payload bits of the lead byte
				$sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
				continue;
			}

			# Stray continuation byte, or 0xFE / 0xFF: replace this one byte
			$bytes = 1;
		} elseif ( ( $ch & 0xC0 ) == 0x80 ) {
			$sum = ( $sum << 6 ) + ( $ch & 0x3F );
			if ( --$left ) {
				continue;
			}

			# Sequence complete: reject overlong forms, out-of-range values,
			# the whole surrogate block, and U+FFFE / U+FFFF.
			$invalid = ( $bytes == 2 && $sum < 0x80 ) ||
				( $bytes == 3 && $sum < 0x800 ) ||
				( $bytes == 4 && $sum < 0x10000 ) ||
				$bytes > 4 || $sum > 0x10FFFF ||
				( $sum >= 0xD800 && $sum <= 0xDFFF ) ||
				$sum == 0xFFFE || $sum == 0xFFFF;
			if ( !$invalid ) {
				continue;
			}
		} else {
			# Sequence interrupted by a non-continuation byte: replace only
			# the bytes consumed so far and re-examine the current byte.
			$bytes -= $left;
			$i--;
		}

		# Swap the offending $bytes bytes ending at $i for U+FFFD (3 bytes)
		$str = ( substr( $str, 0, $i - $bytes + 1 ) .
			"\xEF\xBF\xBD" .
			substr( $str, $i + 1 ) );
		$i += 3 - $bytes;
		$len += 3 - $bytes;
		$left = 0;
	}

	if ( $left ) {
		# The string ended in the middle of a multi-byte sequence;
		# $bytes - $left bytes of it were consumed.
		$str = substr( $str, 0, $len - ( $bytes - $left ) ) . "\xEF\xBF\xBD";
	}

	return $str;
}
183 | 263 | } |
184 | 264 | |
185 | 265 | /** |