r40837 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r40836 | r40837 | r40838 >
Date: 00:42, 15 September 2008
Author: krimpet
Status: old
Tags:
Comment:
Fix for bug #332 - all UTF-8 output is now cleaned of invalid forms as defined by RFC 3629. All output from MediaWiki should now be valid UTF-8 in all circumstances.
Modified paths:
  • /trunk/phase3/includes/OutputPage.php (modified) (history)
  • /trunk/phase3/includes/StringUtils.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/OutputPage.php
@@ -902,7 +902,7 @@
903903 }
904904
905905 # Buffer output; final headers may depend on later processing
906 - ob_start();
 906+ ob_start( array( 'OutputPage', 'cleanCallback') );
907907
908908 $wgRequest->response()->header( "Content-type: $wgMimeType; charset={$wgOutputEncoding}" );
909909 $wgRequest->response()->header( 'Content-language: '.$wgContLanguageCode );
@@ -924,6 +924,13 @@
925925 wfProfileOut( __METHOD__ );
926926 }
927927
 928+ public static function cleanCallback( $s ) {
 929+ wfProfileIn( __METHOD__ );
 930+ $s = StringUtils::cleanForCharset( $s, $wgOutputEncoding );
 931+ wfProfileOut( __METHOD__ );
 932+ return $s;
 933+ }
 934+
928935 /**
929936 * @todo document
930937 * @param string $ins
Index: trunk/phase3/includes/StringUtils.php
@@ -179,6 +179,86 @@
180180 return new ArrayIterator( explode( $separator, $subject ) );
181181 }
182182 }
 183+
 184+ /**
 185+ * Clean characters that are invalid in the given character set
 186+ * from a given string.
 187+ *
 188+ * @param $string \type{$string} String to clean
 189+ * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
 190+ * @return \type{$string} Cleaned string
 191+ */
 192+ public static function cleanForCharset( $string, $charset='' ) {
 193+ global $wgOutputEncoding;
 194+ switch ( $charset ? $charset : $wgOutputEncoding ) {
 195+ # UTF-8 should be all we need to worry about. :)
 196+ case 'UTF-8':
 197+ return self::cleanUtf8( $string );
 198+ default:
 199+ return $string;
 200+ }
 201+ }
 202+
 203+ /**
 204+ * Clean invalid UTF-8 characters and sequences from a given string,
 205+ * replacing them with U+FFFD.
 206+ * Should be RFC 3629 compliant.
 207+ *
 208+ * @param $string \type{$string} String to clean
 209+ * @return \type{$string} Cleaned string
 210+ */
 211+ private static function cleanUtf8( $str ) {
 212+ # HERE BE DRAGONS!
 213+ # ABANDON ALL HOPE, ALL YE WHO ENTER THE BITWISE HELLFIRE.
 214+
 215+ $illegal = array( 0xD800, 0xDB7F, 0xDB80, 0xDBFF,
 216+ 0xDC00, 0xDF80, 0xDFFF, 0xFFFE, 0xFFFF );
 217+ $len = strlen( $str );
 218+ $left = $bytes = 0;
 219+ for ( $i = 0; $i < $len; $i++ ) {
 220+ $ch = ord( $str[$i] );
 221+ if ( !$left ) {
 222+ if ( !($ch & 0x80 ) )
 223+ continue;
 224+ $left = (( $ch & 0xFE ) == 0xFC ? 5 :
 225+ (( $ch & 0xFC ) == 0xF8 ? 4 :
 226+ (( $ch & 0xF8 ) == 0xF0 ? 3 :
 227+ (( $ch & 0xF0 ) == 0xE0 ? 2 :
 228+ (( $ch & 0xE0 ) == 0xC0 ? 1 :
 229+ 0 )))));
 230+ if ( $left ) {
 231+ $bytes = $left + 1;
 232+ $sum = $ch & ( 0xFF >> $bytes + 1 );
 233+ continue;
 234+ } else if ( $ch & 0x80 ) {
 235+ $bytes = 1;
 236+ }
 237+ } else if ( ( $ch & 0xC0 ) == 0x80 ) {
 238+ $sum <<= 6;
 239+ $sum += $ch & 0x3F;
 240+ if ( --$left ) continue;
 241+ if ( ( $bytes == 2 && $sum < 0x80 ) ||
 242+ ( $bytes == 3 && $sum < 0x800 ) ||
 243+ ( $bytes == 4 && $sum < 0x10000 ) ||
 244+ ( $bytes > 4 || $sum > 0x10FFFF ) ||
 245+ in_array( $sum, $illegal ) ) {
 246+ } else continue;
 247+
 248+ } else {
 249+ $bytes -= $left;
 250+ $i--;
 251+ }
 252+
 253+ $str = ( substr( $str, 0, $i - $bytes + 1 ) .
 254+ "\xEF\xBF\xBD" .
 255+ substr( $str, $i + 1 ) );
 256+ $i += 3 - $bytes;
 257+ $len += 3 - $bytes;
 258+ $left = 0;
 259+ }
 260+
 261+ return $str;
 262+ }
183263 }
184264
185265 /**

Follow-up revisions

Revision | Commit summary | Author | Date
r40839 | Addendum to r40837: only validate/clean the body text, as we can assume the r... | krimpet | 01:27, 15 September 2008
r40840 | Release notes for bug #332 | krimpet | 01:37, 15 September 2008
r40861 | Revert r40837, r40839, r40840 (bug 332 - broken UTF-8)... | brion | 17:51, 15 September 2008
r103362 | (bug 31535; bug 332) Properly truncate upload summaries (img_description) on ... | bawolff | 19:50, 16 November 2011