r69626 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r69625 | r69626 | r69627 >
Date:	15:41, 20 July 2010
Author:	mah
Status:	ok
Tags:
Comment:	Prefer the intl PECL extension for ICU Unicode
Modified paths:	/trunk/phase3/includes/normal/UtfNormal.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/normal/UtfNormal.php
—	—	@@ -45,6 +45,7 @@
46	46	define( 'UNORM_FCD', 6 );
47	47
48	48	define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
	49	+define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
49	50
50	51	/**
51	52	* Unicode normalization routines for working with UTF-8 strings.
—	—	@@ -79,7 +80,7 @@
80	81	return $ret;
81	82	}
82	83
83		~~- if( NORMALIZE_ICU ) {~~
	84	+ if( NORMALIZE_ICU || NORMALIZE_INTL ) {
84	85	# We exclude a few chars that ICU would not.
85	86	$string = preg_replace(
86	87	'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
—	—	@@ -90,7 +91,8 @@
91	92
92	93	# UnicodeString constructor fails if the string ends with a
93	94	# head byte. Add a junk char at the end, we'll strip it off.
94		~~- return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );~~
	95	+ if ( NORMALIZE_ICU ) return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
	96	+ if ( NORMALIZE_INTL ) return normalizer_normalize( $string, Normalizer::FORM_C );
95	97	} elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
96	98	# Side effect -- $string has had UTF-8 errors cleaned up.
97	99	return $string;
—	—	@@ -108,7 +110,9 @@
109	111	* @return string a UTF-8 string in normal form C
110	112	*/
111	113	static function toNFC( $string ) {
112		~~- if( NORMALIZE_ICU )~~
	114	+ if( NORMALIZE_INTL )
	115	+ return normalizer_normalize( $string, Normalizer::FORM_C );
	116	+ elseif( NORMALIZE_ICU )
113	117	return utf8_normalize( $string, UNORM_NFC );
114	118	elseif( UtfNormal::quickIsNFC( $string ) )
115	119	return $string;
—	—	@@ -124,7 +128,9 @@
125	129	* @return string a UTF-8 string in normal form D
126	130	*/
127	131	static function toNFD( $string ) {
128		~~- if( NORMALIZE_ICU )~~
	132	+ if( NORMALIZE_INTL )
	133	+ return normalizer_normalize( $string, Normalizer::FORM_D );
	134	+ elseif( NORMALIZE_ICU )
129	135	return utf8_normalize( $string, UNORM_NFD );
130	136	elseif( preg_match( '/[\x80-\xff]/', $string ) )
131	137	return UtfNormal::NFD( $string );
—	—	@@ -141,7 +147,9 @@
142	148	* @return string a UTF-8 string in normal form KC
143	149	*/
144	150	static function toNFKC( $string ) {
145		~~- if( NORMALIZE_ICU )~~
	151	+ if( NORMALIZE_INTL )
	152	+ return normalizer_normalize( $string, Normalizer::FORM_KC );
	153	+ elseif( NORMALIZE_ICU )
146	154	return utf8_normalize( $string, UNORM_NFKC );
147	155	elseif( preg_match( '/[\x80-\xff]/', $string ) )
148	156	return UtfNormal::NFKC( $string );
—	—	@@ -158,7 +166,9 @@
159	167	* @return string a UTF-8 string in normal form KD
160	168	*/
161	169	static function toNFKD( $string ) {
162		~~- if( NORMALIZE_ICU )~~
	170	+ if( NORMALIZE_INTL )
	171	+ return normalizer_normalize( $string, Normalizer::FORM_KD );
	172	+ elseif( NORMALIZE_ICU )
163	173	return utf8_normalize( $string, UNORM_NFKD );
164	174	elseif( preg_match( '/[\x80-\xff]/', $string ) )
165	175	return UtfNormal::NFKD( $string );

Follow-up revisions

Revision	Commit summary	Author	Date
r86130	(follow-up r69626) Make it so the intl normalizer_normalize function is not...	bawolff	18:39, 15 April 2011

Status & tagging log

12:52, 13 November 2010 Hashar (talk | contribs) changed the status of r69626 [removed: new added: ok]