Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -464,6 +464,7 @@ |
465 | 465 | * (bug 6243) Fix email for usernames containing dots when using PEAR::Mail |
466 | 466 | * Remove a number of needless {{ns:project}}-type transforms from messages files. These |
467 | 467 | usages already have separate label text. Such transforms are wasteful on each page view. |
| 468 | +* (bug 2069) Merge the LanguageUtf8 class into the Language class |
468 | 469 | |
469 | 470 | == Compatibility == |
470 | 471 | |
Index: trunk/phase3/languages/Language.php |
— | — | @@ -732,41 +732,73 @@ |
733 | 733 | return iconv( $in, $out, $string ); |
734 | 734 | } |
735 | 735 | |
736 | | - function ucfirst( $string ) { |
737 | | - # For most languages, this is a wrapper for ucfirst() |
738 | | - return ucfirst( $string ); |
| 736 | + function ucfirst( $str ) { |
| 737 | + return $this->uc( $str, true ); |
739 | 738 | } |
740 | 739 | |
741 | | - function uc( $str ) { |
742 | | - return strtoupper( $str ); |
| 740 | + function uc( $str, $first = false ) { |
| 741 | + if ( function_exists( 'mb_strtoupper' ) ) |
| 742 | + if ( $first ) |
| 743 | + if ( $this->isMultibyte( $str ) ) |
| 744 | + return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
| 745 | + else |
| 746 | + return ucfirst( $str ); |
| 747 | + else |
| 748 | + return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str ); |
| 749 | + else |
| 750 | + if ( $this->isMultibyte( $str ) ) { |
| 751 | + global $wikiUpperChars; |
| 752 | + $x = $first ? '^' : ''; |
| 753 | + return preg_replace( |
| 754 | + "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 755 | + "strtr( \"\$1\" , \$wikiUpperChars )", |
| 756 | + $str |
| 757 | + ); |
| 758 | + } else |
| 759 | + return $first ? ucfirst( $str ) : strtoupper( $str ); |
743 | 760 | } |
744 | 761 | |
745 | | - function lcfirst( $s ) { |
746 | | - return strtolower( $s{0} ). substr( $s, 1 ); |
| 762 | + function lcfirst( $str ) { |
| 763 | + return $this->lc( $str, true ); |
747 | 764 | } |
748 | 765 | |
749 | | - function lc( $str ) { |
750 | | - return strtolower( $str ); |
| 766 | + function lc( $str, $first = false ) { |
| 767 | + if ( function_exists( 'mb_strtolower' ) ) |
| 768 | + if ( $first ) |
| 769 | + if ( $this->isMultibyte( $str ) ) |
| 770 | + return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
| 771 | + else |
| 772 | + return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ); |
| 773 | + else |
| 774 | + return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str ); |
| 775 | + else |
| 776 | + if ( $this->isMultibyte( $str ) ) { |
| 777 | + global $wikiLowerChars; |
| 778 | + $x = $first ? '^' : ''; |
| 779 | + return preg_replace( |
| 780 | + "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 781 | + "strtr( \"\$1\" , \$wikiLowerChars )", |
| 782 | + $str |
| 783 | + ); |
| 784 | + } else |
| 785 | + return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); |
751 | 786 | } |
752 | 787 | |
753 | 788 | function checkTitleEncoding( $s ) { |
754 | 789 | global $wgInputEncoding; |
755 | 790 | |
756 | | - # Check for UTF-8 URLs; Internet Explorer produces these if you |
757 | | - # type non-ASCII chars in the URL bar or follow unescaped links. |
| 791 | + if( is_array( $s ) ) { |
| 792 | + wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); |
| 793 | + } |
| 794 | + # Check for non-UTF-8 URLs |
758 | 795 | $ishigh = preg_match( '/[\x80-\xff]/', $s); |
759 | | - $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
760 | | - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true ); |
| 796 | + if(!$ishigh) return $s; |
761 | 797 | |
762 | | - if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf ) |
763 | | - return @iconv( 'UTF-8', $wgInputEncoding, $s ); |
| 798 | + $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
| 799 | + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); |
| 800 | + if( $isutf8 ) return $s; |
764 | 801 | |
765 | | - if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf ) |
766 | | - return utf8_encode( $s ); |
767 | | - |
768 | | - # Other languages can safely leave this function, or replace |
769 | | - # it with one to detect and convert another legacy encoding. |
770 | | - return $s; |
| 802 | + return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); |
771 | 803 | } |
772 | 804 | |
773 | 805 | /** |
— | — | @@ -774,11 +806,33 @@ |
775 | 807 | * or characters which need to be converted for MySQL's |
776 | 808 | * indexing to grok it correctly. Make such changes here. |
777 | 809 | * |
778 | | - * @param string $in |
| 810 | + * @param string $string |
779 | 811 | * @return string |
780 | 812 | */ |
781 | | - function stripForSearch( $in ) { |
782 | | - return strtolower( $in ); |
| 813 | + function stripForSearch( $string ) { |
| 814 | + # MySQL fulltext index doesn't grok utf-8, so we |
| 815 | + # need to fold cases and convert to hex |
| 816 | + |
| 817 | + # In Language:: it just returns lowercase, maybe |
| 818 | + # all strtolower on stripped output or argument |
| 819 | + # should be removed and all stripForSearch |
| 820 | + # methods adjusted to that. |
| 821 | + |
| 822 | + wfProfileIn( "Language::stripForSearch" ); |
| 823 | + if( function_exists( 'mb_strtolower' ) ) { |
| 824 | + $out = preg_replace( |
| 825 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 826 | + "'U8' . bin2hex( \"$1\" )", |
| 827 | + mb_strtolower( $string ) ); |
| 828 | + } else { |
| 829 | + global $wikiLowerChars; |
| 830 | + $out = preg_replace( |
| 831 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 832 | + "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", |
| 833 | + $string ); |
| 834 | + } |
| 835 | + wfProfileOut( "Language::stripForSearch" ); |
| 836 | + return $out; |
783 | 837 | } |
784 | 838 | |
785 | 839 | function convertForSearchResult( $termsArray ) { |
— | — | @@ -796,7 +850,10 @@ |
797 | 851 | * @return string |
798 | 852 | */ |
799 | 853 | function firstChar( $s ) { |
800 | | - return $s[0]; |
| 854 | + preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
| 855 | + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches); |
| 856 | + |
| 857 | + return isset( $matches[1] ) ? $matches[1] : ""; |
801 | 858 | } |
802 | 859 | |
803 | 860 | function initEncoding() { |
— | — | @@ -981,7 +1038,7 @@ |
982 | 1039 | # |
983 | 1040 | # $length does not include the optional ellipsis. |
984 | 1041 | # If $length is negative, snip from the beginning |
985 | | - function truncate( $string, $length, $ellipsis = '' ) { |
| 1042 | + function truncate( $string, $length, $ellipsis = "" ) { |
986 | 1043 | if( $length == 0 ) { |
987 | 1044 | return $ellipsis; |
988 | 1045 | } |
— | — | @@ -990,9 +1047,24 @@ |
991 | 1048 | } |
992 | 1049 | if( $length > 0 ) { |
993 | 1050 | $string = substr( $string, 0, $length ); |
| 1051 | + $char = ord( $string[strlen( $string ) - 1] ); |
| 1052 | + if ($char >= 0xc0) { |
| 1053 | + # We got the first byte only of a multibyte char; remove it. |
| 1054 | + $string = substr( $string, 0, -1 ); |
| 1055 | + } elseif( $char >= 0x80 && |
| 1056 | + preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . |
| 1057 | + '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { |
| 1058 | + # We chopped in the middle of a character; remove it |
| 1059 | + $string = $m[1]; |
| 1060 | + } |
994 | 1061 | return $string . $ellipsis; |
995 | 1062 | } else { |
996 | 1063 | $string = substr( $string, $length ); |
| 1064 | + $char = ord( $string[0] ); |
| 1065 | + if( $char >= 0x80 && $char < 0xc0 ) { |
| 1066 | + # We chopped in the middle of a character; remove the whole thing |
| 1067 | + $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); |
| 1068 | + } |
997 | 1069 | return $ellipsis . $string; |
998 | 1070 | } |
999 | 1071 | } |
— | — | @@ -1190,13 +1262,34 @@ |
1191 | 1263 | return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) ); |
1192 | 1264 | } |
1193 | 1265 | |
| 1266 | + function isMultibyte( $str ) { |
| 1267 | + return (bool)preg_match( '/^[\x80-\xff]/', $str ); |
| 1268 | + } |
1194 | 1269 | |
| 1270 | + function fallback8bitEncoding() { |
| 1271 | + # Windows codepage 1252 is a superset of iso 8859-1 |
| 1272 | + # override this to use a different source encoding to |
| 1273 | + # translate incoming 8-bit URLs. |
| 1274 | + return "windows-1252"; |
| 1275 | + } |
1195 | 1276 | } |
1196 | 1277 | |
1197 | | -# FIXME: Merge all UTF-8 support code into Language base class. |
1198 | | -# We no longer support Latin-1 charset. |
1199 | | -require_once( 'LanguageUtf8.php' ); |
| 1278 | +if( function_exists( 'mb_strtoupper' ) ) { |
| 1279 | + mb_internal_encoding('UTF-8'); |
| 1280 | +} else { |
| 1281 | + # Hack our own case conversion routines |
1200 | 1282 | |
| 1283 | + # Loading serialized arrays is faster than parsing code :P |
| 1284 | + $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" ); |
| 1285 | + $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" ); |
| 1286 | + |
| 1287 | + if(empty( $wikiUpperChars) || empty($wikiLowerChars )) { |
| 1288 | + require_once( "includes/Utf8Case.php" ); |
| 1289 | + $wgMemc->set( $key1, $wikiUpperChars ); |
| 1290 | + $wgMemc->set( $key2, $wikiLowerChars ); |
| 1291 | + } |
| 1292 | +} |
| 1293 | + |
1201 | 1294 | # This should fail gracefully if there's not a localization available |
1202 | 1295 | wfSuppressWarnings(); |
1203 | 1296 | // Preload base classes to work around APC/PHP5 bug |
Index: trunk/phase3/languages/LanguageUtf8.php |
— | — | @@ -1,199 +1,12 @@ |
2 | 2 | <?php |
3 | 3 | /** |
4 | | - * @package MediaWiki |
5 | | - * @subpackage Language |
6 | | - */ |
7 | | - |
8 | | -if( defined( "MEDIAWIKI" ) ) { |
9 | | - |
10 | | -# This file and LanguageLatin1.php may be included from within functions, so |
11 | | -# we need to have global statements |
12 | | - |
13 | | -global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars; |
14 | | -global $wgDBname, $wgMemc; |
15 | | - |
16 | | -$wgInputEncoding = "UTF-8"; |
17 | | -$wgOutputEncoding = "UTF-8"; |
18 | | - |
19 | | -if( function_exists( 'mb_strtoupper' ) ) { |
20 | | - mb_internal_encoding('UTF-8'); |
21 | | -} else { |
22 | | - # Hack our own case conversion routines |
23 | | - |
24 | | - # Loading serialized arrays is faster than parsing code :P |
25 | | - $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" ); |
26 | | - $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" ); |
27 | | - |
28 | | - if(empty( $wikiUpperChars) || empty($wikiLowerChars )) { |
29 | | - require_once( "includes/Utf8Case.php" ); |
30 | | - $wgMemc->set( $key1, $wikiUpperChars ); |
31 | | - $wgMemc->set( $key2, $wikiLowerChars ); |
32 | | - } |
33 | | -} |
34 | | - |
35 | | -/** |
36 | 4 | * Base stuff useful to all UTF-8 based language files |
37 | 5 | * @package MediaWiki |
| 6 | + * |
| 7 | + * Will be deleted |
38 | 8 | */ |
39 | 9 | class LanguageUtf8 extends Language { |
40 | 10 | |
41 | | - # These functions use mbstring library, if it is loaded |
42 | | - # or compiled and character mapping arrays otherwise. |
43 | | - # In case of language-specific character mismatch |
44 | | - # it should be dealt with in Language classes. |
45 | | - |
46 | | - function ucfirst( $str ) { |
47 | | - return LanguageUtf8::uc( $str, true ); |
48 | | - } |
49 | | - |
50 | | - function uc( $str, $first = false ) { |
51 | | - if ( function_exists( 'mb_strtoupper' ) ) |
52 | | - if ( $first ) |
53 | | - if ( LanguageUtf8::isMultibyte( $str ) ) |
54 | | - return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
55 | | - else |
56 | | - return ucfirst( $str ); |
57 | | - else |
58 | | - return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str ); |
59 | | - else |
60 | | - if ( LanguageUtf8::isMultibyte( $str ) ) { |
61 | | - global $wikiUpperChars; |
62 | | - $x = $first ? '^' : ''; |
63 | | - return preg_replace( |
64 | | - "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
65 | | - "strtr( \"\$1\" , \$wikiUpperChars )", |
66 | | - $str |
67 | | - ); |
68 | | - } else |
69 | | - return $first ? ucfirst( $str ) : strtoupper( $str ); |
70 | | - } |
71 | | - |
72 | | - function lcfirst( $str ) { |
73 | | - return LanguageUtf8::lc( $str, true ); |
74 | | - } |
75 | | - |
76 | | - function lc( $str, $first = false ) { |
77 | | - if ( function_exists( 'mb_strtolower' ) ) |
78 | | - if ( $first ) |
79 | | - if ( LanguageUtf8::isMultibyte( $str ) ) |
80 | | - return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
81 | | - else |
82 | | - return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ); |
83 | | - else |
84 | | - return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str ); |
85 | | - else |
86 | | - if ( LanguageUtf8::isMultibyte( $str ) ) { |
87 | | - global $wikiLowerChars; |
88 | | - $x = $first ? '^' : ''; |
89 | | - return preg_replace( |
90 | | - "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
91 | | - "strtr( \"\$1\" , \$wikiLowerChars )", |
92 | | - $str |
93 | | - ); |
94 | | - } else |
95 | | - return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); |
96 | | - } |
97 | | - |
98 | | - function isMultibyte( $str ) { |
99 | | - return (bool)preg_match( '/^[\x80-\xff]/', $str ); |
100 | | - } |
101 | | - |
102 | | - function stripForSearch( $string ) { |
103 | | - # MySQL fulltext index doesn't grok utf-8, so we |
104 | | - # need to fold cases and convert to hex |
105 | | - |
106 | | - # In Language:: it just returns lowercase, maybe |
107 | | - # all strtolower on stripped output or argument |
108 | | - # should be removed and all stripForSearch |
109 | | - # methods adjusted to that. |
110 | | - |
111 | | - wfProfileIn( "LanguageUtf8::stripForSearch" ); |
112 | | - if( function_exists( 'mb_strtolower' ) ) { |
113 | | - $out = preg_replace( |
114 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
115 | | - "'U8' . bin2hex( \"$1\" )", |
116 | | - mb_strtolower( $string ) ); |
117 | | - } else { |
118 | | - global $wikiLowerChars; |
119 | | - $out = preg_replace( |
120 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
121 | | - "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", |
122 | | - $string ); |
123 | | - } |
124 | | - wfProfileOut( "LanguageUtf8::stripForSearch" ); |
125 | | - return $out; |
126 | | - } |
127 | | - |
128 | | - function fallback8bitEncoding() { |
129 | | - # Windows codepage 1252 is a superset of iso 8859-1 |
130 | | - # override this to use difference source encoding to |
131 | | - # translate incoming 8-bit URLs. |
132 | | - return "windows-1252"; |
133 | | - } |
134 | | - |
135 | | - function checkTitleEncoding( $s ) { |
136 | | - global $wgInputEncoding; |
137 | | - |
138 | | - if( is_array( $s ) ) { |
139 | | - wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); |
140 | | - } |
141 | | - # Check for non-UTF-8 URLs |
142 | | - $ishigh = preg_match( '/[\x80-\xff]/', $s); |
143 | | - if(!$ishigh) return $s; |
144 | | - |
145 | | - $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
146 | | - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); |
147 | | - if( $isutf8 ) return $s; |
148 | | - |
149 | | - return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); |
150 | | - } |
151 | | - |
152 | | - function firstChar( $s ) { |
153 | | - preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
154 | | - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches); |
155 | | - |
156 | | - return isset( $matches[1] ) ? $matches[1] : ""; |
157 | | - } |
158 | | - |
159 | | - # Crop a string from the beginning or end to a certain number of bytes. |
160 | | - # (Bytes are used because our storage has limited byte lengths for some |
161 | | - # columns in the database.) Multibyte charsets will need to make sure that |
162 | | - # only whole characters are included! |
163 | | - # |
164 | | - # $length does not include the optional ellipsis. |
165 | | - # If $length is negative, snip from the beginning |
166 | | - function truncate( $string, $length, $ellipsis = "" ) { |
167 | | - if( $length == 0 ) { |
168 | | - return $ellipsis; |
169 | | - } |
170 | | - if ( strlen( $string ) <= abs( $length ) ) { |
171 | | - return $string; |
172 | | - } |
173 | | - if( $length > 0 ) { |
174 | | - $string = substr( $string, 0, $length ); |
175 | | - $char = ord( $string[strlen( $string ) - 1] ); |
176 | | - if ($char >= 0xc0) { |
177 | | - # We got the first byte only of a multibyte char; remove it. |
178 | | - $string = substr( $string, 0, -1 ); |
179 | | - } elseif( $char >= 0x80 && |
180 | | - preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . |
181 | | - '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { |
182 | | - # We chopped in the middle of a character; remove it |
183 | | - $string = $m[1]; |
184 | | - } |
185 | | - return $string . $ellipsis; |
186 | | - } else { |
187 | | - $string = substr( $string, $length ); |
188 | | - $char = ord( $string[0] ); |
189 | | - if( $char >= 0x80 && $char < 0xc0 ) { |
190 | | - # We chopped in the middle of a character; remove the whole thing |
191 | | - $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); |
192 | | - } |
193 | | - return $ellipsis . $string; |
194 | | - } |
195 | | - } |
196 | 11 | } |
197 | 12 | |
198 | | -} # ifdef MEDIAWIKI |
199 | | - |
200 | 13 | ?> |