r14693 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r14692‎ \| r14693 \| r14694 >
Date:	23:23, 9 June 2006
Author:	brion
Status:	old
Tags:
Comment:	Revert bad patch for (bug 2069) Merge the LanguageUtf8 class into the Language class
Modified paths:	/trunk/phase3/RELEASE-NOTES (modified) (history) /trunk/phase3/languages/Language.php (modified) (history) /trunk/phase3/languages/LanguageUtf8.php (modified) (history)

Diff [purge]

Index: trunk/phase3/RELEASE-NOTES
—	—	@@ -464,7 +464,6 @@
465	465	* (bug 6243) Fix email for usernames containing dots when using PEAR::Mail
466	466	* Remove a number of needless {{ns:project}}-type transforms from messages files. These
467	467	usages already have separate label text. Such transforms are wasteful on each page view.
468		~~-* (bug 2069) Merge the LanguageUtf8 class into the Language class~~
469	468	* Update to Yiddish localization (yi)
470	469	* (bug 6254) Update to Indonesian translation (id) #20
471	470	* (bug 6255) Fix transclusions starting with "#" or "*" in HTML attributes
Index: trunk/phase3/languages/Language.php
—	—	@@ -732,73 +732,41 @@
733	733	return iconv( $in, $out, $string );
734	734	}
735	735
736		~~- function ucfirst( $str ) {~~
737		~~- return $this->uc( $str, true );~~
	736	+ function ucfirst( $string ) {
	737	+ # For most languages, this is a wrapper for ucfirst()
	738	+ return ucfirst( $string );
738	739	}
739	740
740		~~- function uc( $str, $first = false ) {~~
741		~~- if ( function_exists( 'mb_strtoupper' ) )~~
742		~~- if ( $first )~~
743		~~- if ( $this->isMultibyte( $str ) )~~
744		~~- return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );~~
745		~~- else~~
746		~~- return ucfirst( $str );~~
747		~~- else~~
748		~~- return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );~~
749		~~- else~~
750		~~- if ( $this->isMultibyte( $str ) ) {~~
751		~~- global $wikiUpperChars;~~
752		~~- $x = $first ? '^' : '';~~
753		~~- return preg_replace(~~
754		~~- "/$x([a-z]\|[\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
755		~~- "strtr( \"\$1\" , \$wikiUpperChars )",~~
756		~~- $str~~
757		~~- );~~
758		~~- } else~~
759		~~- return $first ? ucfirst( $str ) : strtoupper( $str );~~
	741	+ function uc( $str ) {
	742	+ return strtoupper( $str );
760	743	}
761	744
762		~~- function lcfirst( $str ) {~~
763		~~- return $this->lc( $str, true );~~
	745	+ function lcfirst( $s ) {
	746	+ return strtolower( $s{0} ). substr( $s, 1 );
764	747	}
765	748
766		~~- function lc( $str, $first = false ) {~~
767		~~- if ( function_exists( 'mb_strtolower' ) )~~
768		~~- if ( $first )~~
769		~~- if ( $this->isMultibyte( $str ) )~~
770		~~- return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );~~
771		~~- else~~
772		~~- return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );~~
773		~~- else~~
774		~~- return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );~~
775		~~- else~~
776		~~- if ( $this->isMultibyte( $str ) ) {~~
777		~~- global $wikiLowerChars;~~
778		~~- $x = $first ? '^' : '';~~
779		~~- return preg_replace(~~
780		~~- "/$x([A-Z]\|[\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
781		~~- "strtr( \"\$1\" , \$wikiLowerChars )",~~
782		~~- $str~~
783		~~- );~~
784		~~- } else~~
785		~~- return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );~~
	749	+ function lc( $str ) {
	750	+ return strtolower( $str );
786	751	}
787	752
788	753	function checkTitleEncoding( $s ) {
789	754	global $wgInputEncoding;
790	755
791		~~- if( is_array( $s ) ) {~~
792		~~- wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );~~
793		~~- }~~
794		~~- # Check for non-UTF-8 URLs~~
	756	+ # Check for UTF-8 URLs; Internet Explorer produces these if you
	757	+ # type non-ASCII chars in the URL bar or follow unescaped links.
795	758	$ishigh = preg_match( '/[\x80-\xff]/', $s);
796		~~- if(!$ishigh) return $s;~~
	759	+ $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]\|[\xc0-\xdf][\x80-\xbf]\|' .
	760	+ '[\xe0-\xef][\x80-\xbf]{2}\|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true );
797	761
798		~~- $isutf8 = preg_match( '/^([\x00-\x7f]\|[\xc0-\xdf][\x80-\xbf]\|' .~~
799		~~- '[\xe0-\xef][\x80-\xbf]{2}\|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );~~
800		~~- if( $isutf8 ) return $s;~~
	762	+ if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf )
	763	+ return @iconv( 'UTF-8', $wgInputEncoding, $s );
801	764
802		~~- return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );~~
	765	+ if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf )
	766	+ return utf8_encode( $s );
	767	+
	768	+ # Other languages can safely leave this function, or replace
	769	+ # it with one to detect and convert another legacy encoding.
	770	+ return $s;
803	771	}
804	772
805	773	/**
—	—	@@ -806,33 +774,11 @@
807	775	* or characters which need to be converted for MySQL's
808	776	* indexing to grok it correctly. Make such changes here.
809	777	*
810		~~- * @param string $string~~
	778	+ * @param string $in
811	779	* @return string
812	780	*/
813		~~- function stripForSearch( $string ) {~~
814		~~- # MySQL fulltext index doesn't grok utf-8, so we~~
815		~~- # need to fold cases and convert to hex~~
816		-
817		~~- # In Language:: it just returns lowercase, maybe~~
818		~~- # all strtolower on stripped output or argument~~
819		~~- # should be removed and all stripForSearch~~
820		~~- # methods adjusted to that.~~
821		-
822		~~- wfProfileIn( "Language::stripForSearch" );~~
823		~~- if( function_exists( 'mb_strtolower' ) ) {~~
824		~~- $out = preg_replace(~~
825		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
826		~~- "'U8' . bin2hex( \"$1\" )",~~
827		~~- mb_strtolower( $string ) );~~
828		~~- } else {~~
829		~~- global $wikiLowerChars;~~
830		~~- $out = preg_replace(~~
831		~~- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",~~
832		~~- "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",~~
833		~~- $string );~~
834		~~- }~~
835		~~- wfProfileOut( "Language::stripForSearch" );~~
836		~~- return $out;~~
	781	+ function stripForSearch( $in ) {
	782	+ return strtolower( $in );
837	783	}
838	784
839	785	function convertForSearchResult( $termsArray ) {
—	—	@@ -850,10 +796,7 @@
851	797	* @return string
852	798	*/
853	799	function firstChar( $s ) {
854		~~- preg_match( '/^([\x00-\x7f]\|[\xc0-\xdf][\x80-\xbf]\|' .~~
855		~~- '[\xe0-\xef][\x80-\xbf]{2}\|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);~~
856		-
857		~~- return isset( $matches[1] ) ? $matches[1] : "";~~
	800	+ return $s[0];
858	801	}
859	802
860	803	function initEncoding() {
—	—	@@ -1038,7 +981,7 @@
1039	982	#
1040	983	# $length does not include the optional ellipsis.
1041	984	# If $length is negative, snip from the beginning
1042		~~- function truncate( $string, $length, $ellipsis = "" ) {~~
	985	+ function truncate( $string, $length, $ellipsis = '' ) {
1043	986	if( $length == 0 ) {
1044	987	return $ellipsis;
1045	988	}
—	—	@@ -1047,24 +990,9 @@
1048	991	}
1049	992	if( $length > 0 ) {
1050	993	$string = substr( $string, 0, $length );
1051		~~- $char = ord( $string[strlen( $string ) - 1] );~~
1052		~~- if ($char >= 0xc0) {~~
1053		~~- # We got the first byte only of a multibyte char; remove it.~~
1054		~~- $string = substr( $string, 0, -1 );~~
1055		~~- } elseif( $char >= 0x80 &&~~
1056		~~- preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]\|' .~~
1057		~~- '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {~~
1058		~~- # We chopped in the middle of a character; remove it~~
1059		~~- $string = $m[1];~~
1060		~~- }~~
1061	994	return $string . $ellipsis;
1062	995	} else {
1063	996	$string = substr( $string, $length );
1064		~~- $char = ord( $string[0] );~~
1065		~~- if( $char >= 0x80 && $char < 0xc0 ) {~~
1066		~~- # We chopped in the middle of a character; remove the whole thing~~
1067		~~- $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );~~
1068		~~- }~~
1069	997	return $ellipsis . $string;
1070	998	}
1071	999	}
—	—	@@ -1262,34 +1190,13 @@
1263	1191	return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) );
1264	1192	}
1265	1193
1266		~~- function isMultibyte( $str ) {~~
1267		~~- return (bool)preg_match( '/^[\x80-\xff]/', $str );~~
1268		~~- }~~
1269	1194
1270		~~- function fallback8bitEncoding() {~~
1271		~~- # Windows codepage 1252 is a superset of iso 8859-1~~
1272		~~- # override this to use difference source encoding to~~
1273		~~- # translate incoming 8-bit URLs.~~
1274		~~- return "windows-1252";~~
1275		~~- }~~
1276	1195	}
1277	1196
1278		~~-if( function_exists( 'mb_strtoupper' ) ) {~~
1279		~~- mb_internal_encoding('UTF-8');~~
1280		~~-} else {~~
1281		~~- # Hack our own case conversion routines~~
	1197	+# FIXME: Merge all UTF-8 support code into Language base class.
	1198	+# We no longer support Latin-1 charset.
	1199	+require_once( 'LanguageUtf8.php' );
1282	1200
1283		~~- # Loading serialized arrays is faster than parsing code :P~~
1284		~~- $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );~~
1285		~~- $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );~~
1286		-
1287		~~- if(empty( $wikiUpperChars) \|\| empty($wikiLowerChars )) {~~
1288		~~- require_once( "includes/Utf8Case.php" );~~
1289		~~- $wgMemc->set( $key1, $wikiUpperChars );~~
1290		~~- $wgMemc->set( $key2, $wikiLowerChars );~~
1291		~~- }~~
1292		-}
1293		-
1294	1201	# This should fail gracefully if there's not a localization available
1295	1202	wfSuppressWarnings();
1296	1203	// Preload base classes to work around APC/PHP5 bug
Index: trunk/phase3/languages/LanguageUtf8.php
—	—	@@ -1,12 +1,199 @@
2	2	<?php
3	3	/**
	4	+ * @package MediaWiki
	5	+ * @subpackage Language
	6	+ */
	7	+
	8	+if( defined( "MEDIAWIKI" ) ) {
	9	+
	10	+# This file and LanguageLatin1.php may be included from within functions, so
	11	+# we need to have global statements
	12	+
	13	+global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
	14	+global $wgDBname, $wgMemc;
	15	+
	16	+$wgInputEncoding = "UTF-8";
	17	+$wgOutputEncoding = "UTF-8";
	18	+
	19	+if( function_exists( 'mb_strtoupper' ) ) {
	20	+ mb_internal_encoding('UTF-8');
	21	+} else {
	22	+ # Hack our own case conversion routines
	23	+
	24	+ # Loading serialized arrays is faster than parsing code :P
	25	+ $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
	26	+ $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
	27	+
	28	+ if(empty( $wikiUpperChars) \|\| empty($wikiLowerChars )) {
	29	+ require_once( "includes/Utf8Case.php" );
	30	+ $wgMemc->set( $key1, $wikiUpperChars );
	31	+ $wgMemc->set( $key2, $wikiLowerChars );
	32	+ }
	33	+}
	34	+
	35	+/**
4	36	* Base stuff useful to all UTF-8 based language files
5	37	* @package MediaWiki
6		- *
7		~~- * Will be deleted~~
8	38	*/
9	39	class LanguageUtf8 extends Language {
10	40
	41	+ # These functions use mbstring library, if it is loaded
	42	+ # or compiled and character mapping arrays otherwise.
	43	+ # In case of language-specific character mismatch
	44	+ # it should be dealt with in Language classes.
	45	+
	46	+ function ucfirst( $str ) {
	47	+ return LanguageUtf8::uc( $str, true );
	48	+ }
	49	+
	50	+ function uc( $str, $first = false ) {
	51	+ if ( function_exists( 'mb_strtoupper' ) )
	52	+ if ( $first )
	53	+ if ( LanguageUtf8::isMultibyte( $str ) )
	54	+ return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
	55	+ else
	56	+ return ucfirst( $str );
	57	+ else
	58	+ return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );
	59	+ else
	60	+ if ( LanguageUtf8::isMultibyte( $str ) ) {
	61	+ global $wikiUpperChars;
	62	+ $x = $first ? '^' : '';
	63	+ return preg_replace(
	64	+ "/$x([a-z]\|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
	65	+ "strtr( \"\$1\" , \$wikiUpperChars )",
	66	+ $str
	67	+ );
	68	+ } else
	69	+ return $first ? ucfirst( $str ) : strtoupper( $str );
	70	+ }
	71	+
	72	+ function lcfirst( $str ) {
	73	+ return LanguageUtf8::lc( $str, true );
	74	+ }
	75	+
	76	+ function lc( $str, $first = false ) {
	77	+ if ( function_exists( 'mb_strtolower' ) )
	78	+ if ( $first )
	79	+ if ( LanguageUtf8::isMultibyte( $str ) )
	80	+ return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
	81	+ else
	82	+ return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
	83	+ else
	84	+ return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );
	85	+ else
	86	+ if ( LanguageUtf8::isMultibyte( $str ) ) {
	87	+ global $wikiLowerChars;
	88	+ $x = $first ? '^' : '';
	89	+ return preg_replace(
	90	+ "/$x([A-Z]\|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
	91	+ "strtr( \"\$1\" , \$wikiLowerChars )",
	92	+ $str
	93	+ );
	94	+ } else
	95	+ return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
	96	+ }
	97	+
	98	+ function isMultibyte( $str ) {
	99	+ return (bool)preg_match( '/^[\x80-\xff]/', $str );
	100	+ }
	101	+
	102	+ function stripForSearch( $string ) {
	103	+ # MySQL fulltext index doesn't grok utf-8, so we
	104	+ # need to fold cases and convert to hex
	105	+
	106	+ # In Language:: it just returns lowercase, maybe
	107	+ # all strtolower on stripped output or argument
	108	+ # should be removed and all stripForSearch
	109	+ # methods adjusted to that.
	110	+
	111	+ wfProfileIn( "LanguageUtf8::stripForSearch" );
	112	+ if( function_exists( 'mb_strtolower' ) ) {
	113	+ $out = preg_replace(
	114	+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
	115	+ "'U8' . bin2hex( \"$1\" )",
	116	+ mb_strtolower( $string ) );
	117	+ } else {
	118	+ global $wikiLowerChars;
	119	+ $out = preg_replace(
	120	+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
	121	+ "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
	122	+ $string );
	123	+ }
	124	+ wfProfileOut( "LanguageUtf8::stripForSearch" );
	125	+ return $out;
	126	+ }
	127	+
	128	+ function fallback8bitEncoding() {
	129	+ # Windows codepage 1252 is a superset of iso 8859-1
	130	+ # override this to use different source encoding to
	131	+ # translate incoming 8-bit URLs.
	132	+ return "windows-1252";
	133	+ }
	134	+
	135	+ function checkTitleEncoding( $s ) {
	136	+ global $wgInputEncoding;
	137	+
	138	+ if( is_array( $s ) ) {
	139	+ wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
	140	+ }
	141	+ # Check for non-UTF-8 URLs
	142	+ $ishigh = preg_match( '/[\x80-\xff]/', $s);
	143	+ if(!$ishigh) return $s;
	144	+
	145	+ $isutf8 = preg_match( '/^([\x00-\x7f]\|[\xc0-\xdf][\x80-\xbf]\|' .
	146	+ '[\xe0-\xef][\x80-\xbf]{2}\|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
	147	+ if( $isutf8 ) return $s;
	148	+
	149	+ return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
	150	+ }
	151	+
	152	+ function firstChar( $s ) {
	153	+ preg_match( '/^([\x00-\x7f]\|[\xc0-\xdf][\x80-\xbf]\|' .
	154	+ '[\xe0-\xef][\x80-\xbf]{2}\|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
	155	+
	156	+ return isset( $matches[1] ) ? $matches[1] : "";
	157	+ }
	158	+
	159	+ # Crop a string from the beginning or end to a certain number of bytes.
	160	+ # (Bytes are used because our storage has limited byte lengths for some
	161	+ # columns in the database.) Multibyte charsets will need to make sure that
	162	+ # only whole characters are included!
	163	+ #
	164	+ # $length does not include the optional ellipsis.
	165	+ # If $length is negative, snip from the beginning
	166	+ function truncate( $string, $length, $ellipsis = "" ) {
	167	+ if( $length == 0 ) {
	168	+ return $ellipsis;
	169	+ }
	170	+ if ( strlen( $string ) <= abs( $length ) ) {
	171	+ return $string;
	172	+ }
	173	+ if( $length > 0 ) {
	174	+ $string = substr( $string, 0, $length );
	175	+ $char = ord( $string[strlen( $string ) - 1] );
	176	+ if ($char >= 0xc0) {
	177	+ # We got the first byte only of a multibyte char; remove it.
	178	+ $string = substr( $string, 0, -1 );
	179	+ } elseif( $char >= 0x80 &&
	180	+ preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]\|' .
	181	+ '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
	182	+ # We chopped in the middle of a character; remove it
	183	+ $string = $m[1];
	184	+ }
	185	+ return $string . $ellipsis;
	186	+ } else {
	187	+ $string = substr( $string, $length );
	188	+ $char = ord( $string[0] );
	189	+ if( $char >= 0x80 && $char < 0xc0 ) {
	190	+ # We chopped in the middle of a character; remove the whole thing
	191	+ $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
	192	+ }
	193	+ return $ellipsis . $string;
	194	+ }
	195	+ }
11	196	}
12	197
	198	+} # ifdef MEDIAWIKI
	199	+
13	200	?>

Past revisions this follows-up on

Revision	Commit summary	Author	Date
r14684	* (bug 2069) Merge the LanguageUtf8 class into the Language class...	nikerabbit	15:41, 9 June 2006

Status & tagging log

01:58, 13 October 2010 😂 (talk | contribs) changed the status of r14693 [removed: new added: old]