Index: trunk/phase3/includes/normal/UtfNormal.php |
— | — | @@ -79,7 +79,7 @@ |
80 | 80 | * @return string a clean, shiny, normalized UTF-8 string |
81 | 81 | */ |
82 | 82 | static function cleanUp( $string ) { |
83 | | - if( NORMALIZE_ICU || NORMALIZE_INTL ) { |
| 83 | + if( NORMALIZE_ICU ) { |
84 | 84 | # We exclude a few chars that ICU would not. |
85 | 85 | $string = preg_replace( |
86 | 86 | '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', |
— | — | @@ -90,8 +90,24 @@ |
91 | 91 | |
92 | 92 | # UnicodeString constructor fails if the string ends with a |
93 | 93 | # head byte. Add a junk char at the end, we'll strip it off. |
94 | | - if ( NORMALIZE_ICU ) return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" ); |
95 | | - if ( NORMALIZE_INTL ) return normalizer_normalize( $string, Normalizer::FORM_C ); |
| 94 | + return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" ); |
| 95 | + } elseif( NORMALIZE_INTL ) { |
| 96 | + $norm = normalizer_normalize( $string, Normalizer::FORM_C ); |
| 97 | + if( $norm === null || $norm === false ) { |
| 98 | + # normalizer_normalize returns either false or null |
| 99 | + # (depending on which doc you read) when given an invalid UTF-8 string. |
| 100 | + # quickIsNFCVerify cleans up invalid sequences. |
| 101 | + |
| 102 | + if( UtfNormal::quickIsNFCVerify( $string ) ) { |
| 103 | + # if that's true, the string is actually already normal. |
| 104 | + return $string; |
| 105 | + } else { |
| 106 | + # Now we are valid but non-normal |
| 107 | + return normalizer_normalize( $string, Normalizer::FORM_C ); |
| 108 | + } |
| 109 | + } else { |
| 110 | + return $norm; |
| 111 | + } |
96 | 112 | } elseif( UtfNormal::quickIsNFCVerify( $string ) ) { |
97 | 113 | # Side effect -- $string has had UTF-8 errors cleaned up. |
98 | 114 | return $string; |
Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -237,6 +237,7 @@ |
238 | 238 | * (bug 27473) Fix regression: bold, italic no longer interfere with linktrail for ca, kaa |
239 | 239 | * (bug 28444) Fix regression: edit-on-doubleclick retains revision id again |
240 | 240 | * &apos; character entity is now allowed in wikitext |
| 241 | +* UtfNormal::cleanUp on an invalid UTF-8 sequence no longer returns false if intl is installed. |
241 | 242 | |
242 | 243 | === API changes in 1.18 === |
243 | 244 | * (bug 26339) Throw warning when truncating an overlarge API result |