r14684 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r14683 | r14684 | r14685 >
Date:	15:41, 9 June 2006
Author:	nikerabbit
Status:	old
Tags:
Comment:	* (bug 2069) Merge the LanguageUtf8 class into the Language class * Based on patch from Rotem Liss
Modified paths:	/trunk/phase3/RELEASE-NOTES (modified) (history) /trunk/phase3/languages/Language.php (modified) (history) /trunk/phase3/languages/LanguageUtf8.php (modified) (history)

Diff [purge]

Index: trunk/phase3/RELEASE-NOTES
—	—	@@ -464,6 +464,7 @@
465	465	* (bug 6243) Fix email for usernames containing dots when using PEAR::Mail
466	466	* Remove a number of needless {{ns:project}}-type transforms from messages files. These
467	467	usages already have separate label text. Such transforms are wasteful on each page view.
	468	+* (bug 2069) Merge the LanguageUtf8 class into the Language class
468	469
469	470	== Compatibility ==
470	471
Index: trunk/phase3/languages/Language.php
—	—	@@ -732,41 +732,73 @@
733	733	return iconv( $in, $out, $string );
734	734	}
735	735
736		~~- function ucfirst( $string ) {~~
737		~~- # For most languages, this is a wrapper for ucfirst()~~
738		~~- return ucfirst( $string );~~
	736	+ function ucfirst( $str ) {
	737	+ return $this->uc( $str, true );
739	738	}
740	739
741		~~- function uc( $str ) {~~
742		~~- return strtoupper( $str );~~
	740	+ function uc( $str, $first = false ) {
	741	+ if ( function_exists( 'mb_strtoupper' ) )
	742	+ if ( $first )
	743	+ if ( $this->isMultibyte( $str ) )
	744	+ return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
	745	+ else
	746	+ return ucfirst( $str );
	747	+ else
	748	+ return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );
	749	+ else
	750	+ if ( $this->isMultibyte( $str ) ) {
	751	+ global $wikiUpperChars;
	752	+ $x = $first ? '^' : '';
	753	+ return preg_replace(
	754	+ "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
	755	+ "strtr( \"\$1\" , \$wikiUpperChars )",
	756	+ $str
	757	+ );
	758	+ } else
	759	+ return $first ? ucfirst( $str ) : strtoupper( $str );
743	760	}
744	761
745		~~- function lcfirst( $s ) {~~
746		~~- return strtolower( $s{0} ). substr( $s, 1 );~~
	762	+ function lcfirst( $str ) {
	763	+ return $this->lc( $str, true );
747	764	}
748	765
749		~~- function lc( $str ) {~~
750		~~- return strtolower( $str );~~
	766	+ function lc( $str, $first = false ) {
	767	+ if ( function_exists( 'mb_strtolower' ) )
	768	+ if ( $first )
	769	+ if ( $this->isMultibyte( $str ) )
	770	+ return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
	771	+ else
	772	+ return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
	773	+ else
	774	+ return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );
	775	+ else
	776	+ if ( $this->isMultibyte( $str ) ) {
	777	+ global $wikiLowerChars;
	778	+ $x = $first ? '^' : '';
	779	+ return preg_replace(
	780	+ "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
	781	+ "strtr( \"\$1\" , \$wikiLowerChars )",
	782	+ $str
	783	+ );
	784	+ } else
	785	+ return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
751	786	}
752	787
753	788	function checkTitleEncoding( $s ) {
754	789	global $wgInputEncoding;
755	790
756		~~- # Check for UTF-8 URLs; Internet Explorer produces these if you~~
757		~~- # type non-ASCII chars in the URL bar or follow unescaped links.~~
	791	+ if( is_array( $s ) ) {
	792	+ wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
	793	+ }
	794	+ # Check for non-UTF-8 URLs
758	795	$ishigh = preg_match( '/[\x80-\xff]/', $s);
759		~~- $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .~~
760		~~- '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true );~~
	796	+ if(!$ishigh) return $s;
761	797
762		~~- if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf )~~
763		~~- return @iconv( 'UTF-8', $wgInputEncoding, $s );~~
	798	+ $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
	799	+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
	800	+ if( $isutf8 ) return $s;
764	801
765		~~- if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf )~~
766		~~- return utf8_encode( $s );~~
767		-
768		~~- # Other languages can safely leave this function, or replace~~
769		~~- # it with one to detect and convert another legacy encoding.~~
770		~~- return $s;~~
	802	+ return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
771	803	}
772	804
773	805	/**
—	—	@@ -774,11 +806,33 @@
775	807	* or characters which need to be converted for MySQL's
776	808	* indexing to grok it correctly. Make such changes here.
777	809	*
778		~~- * @param string $in~~
	810	+ * @param string $string
779	811	* @return string
780	812	*/
781		~~- function stripForSearch( $in ) {~~
782		~~- return strtolower( $in );~~
	813	+ function stripForSearch( $string ) {
	814	+ # MySQL fulltext index doesn't grok utf-8, so we
	815	+ # need to fold cases and convert to hex
	816	+
	817	+ # In Language:: it just returns lowercase, maybe
	818	+ # all strtolower on stripped output or argument
	819	+ # should be removed and all stripForSearch
	820	+ # methods adjusted to that.
	821	+
	822	+ wfProfileIn( "Language::stripForSearch" );
	823	+ if( function_exists( 'mb_strtolower' ) ) {
	824	+ $out = preg_replace(
	825	+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
	826	+ "'U8' . bin2hex( \"$1\" )",
	827	+ mb_strtolower( $string ) );
	828	+ } else {
	829	+ global $wikiLowerChars;
	830	+ $out = preg_replace(
	831	+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
	832	+ "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
	833	+ $string );
	834	+ }
	835	+ wfProfileOut( "Language::stripForSearch" );
	836	+ return $out;
783	837	}
784	838
785	839	function convertForSearchResult( $termsArray ) {
—	—	@@ -796,7 +850,10 @@
797	851	* @return string
798	852	*/
799	853	function firstChar( $s ) {
800		~~- return $s[0];~~
	854	+ preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
	855	+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
	856	+
	857	+ return isset( $matches[1] ) ? $matches[1] : "";
801	858	}
802	859
803	860	function initEncoding() {
—	—	@@ -981,7 +1038,7 @@
982	1039	#
983	1040	# $length does not include the optional ellipsis.
984	1041	# If $length is negative, snip from the beginning
985		~~- function truncate( $string, $length, $ellipsis = '' ) {~~
	1042	+ function truncate( $string, $length, $ellipsis = "" ) {
986	1043	if( $length == 0 ) {
987	1044	return $ellipsis;
988	1045	}
—	—	@@ -990,9 +1047,24 @@
991	1048	}
992	1049	if( $length > 0 ) {
993	1050	$string = substr( $string, 0, $length );
	1051	+ $char = ord( $string[strlen( $string ) - 1] );
	1052	+ if ($char >= 0xc0) {
	1053	+ # We got the first byte only of a multibyte char; remove it.
	1054	+ $string = substr( $string, 0, -1 );
	1055	+ } elseif( $char >= 0x80 &&
	1056	+ preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
	1057	+ '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
	1058	+ # We chopped in the middle of a character; remove it
	1059	+ $string = $m[1];
	1060	+ }
994	1061	return $string . $ellipsis;
995	1062	} else {
996	1063	$string = substr( $string, $length );
	1064	+ $char = ord( $string[0] );
	1065	+ if( $char >= 0x80 && $char < 0xc0 ) {
	1066	+ # We chopped in the middle of a character; remove the whole thing
	1067	+ $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
	1068	+ }
997	1069	return $ellipsis . $string;
998	1070	}
999	1071	}
—	—	@@ -1190,13 +1262,34 @@
1191	1263	return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) );
1192	1264	}
1193	1265
	1266	+ function isMultibyte( $str ) {
	1267	+ return (bool)preg_match( '/^[\x80-\xff]/', $str );
	1268	+ }
1194	1269
	1270	+ function fallback8bitEncoding() {
	1271	+ # Windows codepage 1252 is a superset of iso 8859-1
	1272	+ # override this to use difference source encoding to
	1273	+ # translate incoming 8-bit URLs.
	1274	+ return "windows-1252";
	1275	+ }
1195	1276	}
1196	1277
1197		~~-# FIXME: Merge all UTF-8 support code into Language base class.~~
1198		~~-# We no longer support Latin-1 charset.~~
1199		~~-require_once( 'LanguageUtf8.php' );~~
	1278	+if( function_exists( 'mb_strtoupper' ) ) {
	1279	+ mb_internal_encoding('UTF-8');
	1280	+} else {
	1281	+ # Hack our own case conversion routines
1200	1282
	1283	+ # Loading serialized arrays is faster than parsing code :P
	1284	+ $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
	1285	+ $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
	1286	+
	1287	+ if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
	1288	+ require_once( "includes/Utf8Case.php" );
	1289	+ $wgMemc->set( $key1, $wikiUpperChars );
	1290	+ $wgMemc->set( $key2, $wikiLowerChars );
	1291	+ }
	1292	+}
	1293	+
1201	1294	# This should fail gracefully if there's not a localization available
1202	1295	wfSuppressWarnings();
1203	1296	// Preload base classes to work around APC/PHP5 bug
Index: trunk/phase3/languages/LanguageUtf8.php
—	—	@@ -1,199 +1,12 @@
2	2	<?php
3	3	/**
4		~~- * @package MediaWiki~~
5		~~- * @subpackage Language~~
6		~~- */~~
7		-
8		~~-if( defined( "MEDIAWIKI" ) ) {~~
9		-
10		~~-# This file and LanguageLatin1.php may be included from within functions, so~~
11		~~-# we need to have global statements~~
12		-
13		~~-global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;~~
14		~~-global $wgDBname, $wgMemc;~~
15		-
16		~~-$wgInputEncoding = "UTF-8";~~
17		~~-$wgOutputEncoding = "UTF-8";~~
18		-
19		~~-if( function_exists( 'mb_strtoupper' ) ) {~~
20		~~- mb_internal_encoding('UTF-8');~~
21		~~-} else {~~
22		~~- # Hack our own case conversion routines~~
23		-
24		~~- # Loading serialized arrays is faster than parsing code :P~~
25		~~- $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );~~
26		~~- $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );~~
27		-
28		~~- if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {~~
29		~~- require_once( "includes/Utf8Case.php" );~~
30		~~- $wgMemc->set( $key1, $wikiUpperChars );~~
31		~~- $wgMemc->set( $key2, $wikiLowerChars );~~
32		~~- }~~
33		-}
34		-
35		-/**
36	4	* Base stuff useful to all UTF-8 based language files
37	5	* @package MediaWiki
	6	+ *
	7	+ * Will be deleted
38	8	*/
39	9	class LanguageUtf8 extends Language {
40	10
41		~~- # These functions use mbstring library, if it is loaded~~
42		~~- # or compiled and character mapping arrays otherwise.~~
43		~~- # In case of language-specific character mismatch~~
44		~~- # it should be dealt with in Language classes.~~
45		-
46		~~- function ucfirst( $str ) {~~
47		~~- return LanguageUtf8::uc( $str, true );~~
48		~~- }~~
49		-
50		~~- function uc( $str, $first = false ) {~~
51		~~- if ( function_exists( 'mb_strtoupper' ) )~~
52		~~- if ( $first )~~
53		~~- if ( LanguageUtf8::isMultibyte( $str ) )~~
54		~~- return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );~~
55		~~- else~~
56		~~- return ucfirst( $str );~~
57		~~- else~~
58		~~- return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );~~
59		~~- else~~
60		~~- if ( LanguageUtf8::isMultibyte( $str ) ) {~~
61		~~- global $wikiUpperChars;~~
62		~~- $x = $first ? '^' : '';~~
63		~~- return preg_replace(~~
64		~~- "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
65		~~- "strtr( \"\$1\" , \$wikiUpperChars )",~~
66		~~- $str~~
67		~~- );~~
68		~~- } else~~
69		~~- return $first ? ucfirst( $str ) : strtoupper( $str );~~
70		~~- }~~
71		-
72		~~- function lcfirst( $str ) {~~
73		~~- return LanguageUtf8::lc( $str, true );~~
74		~~- }~~
75		-
76		~~- function lc( $str, $first = false ) {~~
77		~~- if ( function_exists( 'mb_strtolower' ) )~~
78		~~- if ( $first )~~
79		~~- if ( LanguageUtf8::isMultibyte( $str ) )~~
80		~~- return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );~~
81		~~- else~~
82		~~- return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );~~
83		~~- else~~
84		~~- return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );~~
85		~~- else~~
86		~~- if ( LanguageUtf8::isMultibyte( $str ) ) {~~
87		~~- global $wikiLowerChars;~~
88		~~- $x = $first ? '^' : '';~~
89		~~- return preg_replace(~~
90		~~- "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
91		~~- "strtr( \"\$1\" , \$wikiLowerChars )",~~
92		~~- $str~~
93		~~- );~~
94		~~- } else~~
95		~~- return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );~~
96		~~- }~~
97		-
98		~~- function isMultibyte( $str ) {~~
99		~~- return (bool)preg_match( '/^[\x80-\xff]/', $str );~~
100		~~- }~~
101		-
102		~~- function stripForSearch( $string ) {~~
103		~~- # MySQL fulltext index doesn't grok utf-8, so we~~
104		~~- # need to fold cases and convert to hex~~
105		-
106		~~- # In Language:: it just returns lowercase, maybe~~
107		~~- # all strtolower on stripped output or argument~~
108		~~- # should be removed and all stripForSearch~~
109		~~- # methods adjusted to that.~~
110		-
111		~~- wfProfileIn( "LanguageUtf8::stripForSearch" );~~
112		~~- if( function_exists( 'mb_strtolower' ) ) {~~
113		~~- $out = preg_replace(~~
114		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
115		~~- "'U8' . bin2hex( \"$1\" )",~~
116		~~- mb_strtolower( $string ) );~~
117		~~- } else {~~
118		~~- global $wikiLowerChars;~~
119		~~- $out = preg_replace(~~
120		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
121		~~- "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",~~
122		~~- $string );~~
123		~~- }~~
124		~~- wfProfileOut( "LanguageUtf8::stripForSearch" );~~
125		~~- return $out;~~
126		~~- }~~
127		-
128		~~- function fallback8bitEncoding() {~~
129		~~- # Windows codepage 1252 is a superset of iso 8859-1~~
130		~~- # override this to use difference source encoding to~~
131		~~- # translate incoming 8-bit URLs.~~
132		~~- return "windows-1252";~~
133		~~- }~~
134		-
135		~~- function checkTitleEncoding( $s ) {~~
136		~~- global $wgInputEncoding;~~
137		-
138		~~- if( is_array( $s ) ) {~~
139		~~- wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );~~
140		~~- }~~
141		~~- # Check for non-UTF-8 URLs~~
142		~~- $ishigh = preg_match( '/[\x80-\xff]/', $s);~~
143		~~- if(!$ishigh) return $s;~~
144		-
145		~~- $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .~~
146		~~- '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );~~
147		~~- if( $isutf8 ) return $s;~~
148		-
149		~~- return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );~~
150		~~- }~~
151		-
152		~~- function firstChar( $s ) {~~
153		~~- preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .~~
154		~~- '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);~~
155		-
156		~~- return isset( $matches[1] ) ? $matches[1] : "";~~
157		~~- }~~
158		-
159		~~- # Crop a string from the beginning or end to a certain number of bytes.~~
160		~~- # (Bytes are used because our storage has limited byte lengths for some~~
161		~~- # columns in the database.) Multibyte charsets will need to make sure that~~
162		~~- # only whole characters are included!~~
163		~~- #~~
164		~~- # $length does not include the optional ellipsis.~~
165		~~- # If $length is negative, snip from the beginning~~
166		~~- function truncate( $string, $length, $ellipsis = "" ) {~~
167		~~- if( $length == 0 ) {~~
168		~~- return $ellipsis;~~
169		~~- }~~
170		~~- if ( strlen( $string ) <= abs( $length ) ) {~~
171		~~- return $string;~~
172		~~- }~~
173		~~- if( $length > 0 ) {~~
174		~~- $string = substr( $string, 0, $length );~~
175		~~- $char = ord( $string[strlen( $string ) - 1] );~~
176		~~- if ($char >= 0xc0) {~~
177		~~- # We got the first byte only of a multibyte char; remove it.~~
178		~~- $string = substr( $string, 0, -1 );~~
179		~~- } elseif( $char >= 0x80 &&~~
180		~~- preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .~~
181		~~- '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {~~
182		~~- # We chopped in the middle of a character; remove it~~
183		~~- $string = $m[1];~~
184		~~- }~~
185		~~- return $string . $ellipsis;~~
186		~~- } else {~~
187		~~- $string = substr( $string, $length );~~
188		~~- $char = ord( $string[0] );~~
189		~~- if( $char >= 0x80 && $char < 0xc0 ) {~~
190		~~- # We chopped in the middle of a character; remove the whole thing~~
191		~~- $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );~~
192		~~- }~~
193		~~- return $ellipsis . $string;~~
194		~~- }~~
195		~~- }~~
196	11	}
197	12
198		~~-} # ifdef MEDIAWIKI~~
199		-
200	13	?>

Follow-up revisions

Revision	Commit summary	Author	Date
r14693	Revert bad patch for (bug 2069) Merge the LanguageUtf8 class into the Languag...	brion	23:23, 9 June 2006

Status & tagging log

01:58, 13 October 2010 😂 (talk | contribs) changed the status of r14684 [removed: new added: old]