Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -464,7 +464,6 @@ |
465 | 465 | * (bug 6243) Fix email for usernames containing dots when using PEAR::Mail |
466 | 466 | * Remove a number of needless {{ns:project}}-type transforms from messages files. These |
467 | 467 | usages already have separate label text. Such transforms are wasteful on each page view. |
468 | | -* (bug 2069) Merge the LanguageUtf8 class into the Language class |
469 | 468 | * Update to Yiddish localization (yi) |
470 | 469 | * (bug 6254) Update to Indonesian translation (id) #20 |
471 | 470 | * (bug 6255) Fix transclusions starting with "#" or "*" in HTML attributes |
Index: trunk/phase3/languages/Language.php |
— | — | @@ -732,73 +732,41 @@ |
733 | 733 | return iconv( $in, $out, $string ); |
734 | 734 | } |
735 | 735 | |
736 | | - function ucfirst( $str ) { |
737 | | - return $this->uc( $str, true ); |
| 736 | + function ucfirst( $string ) { |
| 737 | + # For most languages, this is a wrapper for ucfirst() |
| 738 | + return ucfirst( $string ); |
738 | 739 | } |
739 | 740 | |
740 | | - function uc( $str, $first = false ) { |
741 | | - if ( function_exists( 'mb_strtoupper' ) ) |
742 | | - if ( $first ) |
743 | | - if ( $this->isMultibyte( $str ) ) |
744 | | - return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
745 | | - else |
746 | | - return ucfirst( $str ); |
747 | | - else |
748 | | - return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str ); |
749 | | - else |
750 | | - if ( $this->isMultibyte( $str ) ) { |
751 | | - global $wikiUpperChars; |
752 | | - $x = $first ? '^' : ''; |
753 | | - return preg_replace( |
754 | | - "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
755 | | - "strtr( \"\$1\" , \$wikiUpperChars )", |
756 | | - $str |
757 | | - ); |
758 | | - } else |
759 | | - return $first ? ucfirst( $str ) : strtoupper( $str ); |
| 741 | + function uc( $str ) { |
| 742 | + return strtoupper( $str ); |
760 | 743 | } |
761 | 744 | |
762 | | - function lcfirst( $str ) { |
763 | | - return $this->lc( $str, true ); |
| 745 | + function lcfirst( $s ) { |
| 746 | + return strtolower( $s{0} ). substr( $s, 1 ); |
764 | 747 | } |
765 | 748 | |
766 | | - function lc( $str, $first = false ) { |
767 | | - if ( function_exists( 'mb_strtolower' ) ) |
768 | | - if ( $first ) |
769 | | - if ( $this->isMultibyte( $str ) ) |
770 | | - return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
771 | | - else |
772 | | - return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ); |
773 | | - else |
774 | | - return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str ); |
775 | | - else |
776 | | - if ( $this->isMultibyte( $str ) ) { |
777 | | - global $wikiLowerChars; |
778 | | - $x = $first ? '^' : ''; |
779 | | - return preg_replace( |
780 | | - "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
781 | | - "strtr( \"\$1\" , \$wikiLowerChars )", |
782 | | - $str |
783 | | - ); |
784 | | - } else |
785 | | - return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); |
| 749 | + function lc( $str ) { |
| 750 | + return strtolower( $str ); |
786 | 751 | } |
787 | 752 | |
788 | 753 | function checkTitleEncoding( $s ) { |
789 | 754 | global $wgInputEncoding; |
790 | 755 | |
791 | | - if( is_array( $s ) ) { |
792 | | - wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); |
793 | | - } |
794 | | - # Check for non-UTF-8 URLs |
| 756 | + # Check for UTF-8 URLs; Internet Explorer produces these if you |
| 757 | + # type non-ASCII chars in the URL bar or follow unescaped links. |
795 | 758 | $ishigh = preg_match( '/[\x80-\xff]/', $s); |
796 | | - if(!$ishigh) return $s; |
| 759 | + $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
| 760 | + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true ); |
797 | 761 | |
798 | | - $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
799 | | - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); |
800 | | - if( $isutf8 ) return $s; |
| 762 | + if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf ) |
| 763 | + return @iconv( 'UTF-8', $wgInputEncoding, $s ); |
801 | 764 | |
802 | | - return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); |
| 765 | + if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf ) |
| 766 | + return utf8_encode( $s ); |
| 767 | + |
| 768 | + # Other languages can safely leave this function, or replace |
| 769 | + # it with one to detect and convert another legacy encoding. |
| 770 | + return $s; |
803 | 771 | } |
804 | 772 | |
805 | 773 | /** |
— | — | @@ -806,33 +774,11 @@ |
807 | 775 | * or characters which need to be converted for MySQL's |
808 | 776 | * indexing to grok it correctly. Make such changes here. |
809 | 777 | * |
810 | | - * @param string $string |
| 778 | + * @param string $in |
811 | 779 | * @return string |
812 | 780 | */ |
813 | | - function stripForSearch( $string ) { |
814 | | - # MySQL fulltext index doesn't grok utf-8, so we |
815 | | - # need to fold cases and convert to hex |
816 | | - |
817 | | - # In Language:: it just returns lowercase, maybe |
818 | | - # all strtolower on stripped output or argument |
819 | | - # should be removed and all stripForSearch |
820 | | - # methods adjusted to that. |
821 | | - |
822 | | - wfProfileIn( "Language::stripForSearch" ); |
823 | | - if( function_exists( 'mb_strtolower' ) ) { |
824 | | - $out = preg_replace( |
825 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
826 | | - "'U8' . bin2hex( \"$1\" )", |
827 | | - mb_strtolower( $string ) ); |
828 | | - } else { |
829 | | - global $wikiLowerChars; |
830 | | - $out = preg_replace( |
831 | | - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
832 | | - "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", |
833 | | - $string ); |
834 | | - } |
835 | | - wfProfileOut( "Language::stripForSearch" ); |
836 | | - return $out; |
| 781 | + function stripForSearch( $in ) { |
| 782 | + return strtolower( $in ); |
837 | 783 | } |
838 | 784 | |
839 | 785 | function convertForSearchResult( $termsArray ) { |
— | — | @@ -850,10 +796,7 @@ |
851 | 797 | * @return string |
852 | 798 | */ |
853 | 799 | function firstChar( $s ) { |
854 | | - preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
855 | | - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches); |
856 | | - |
857 | | - return isset( $matches[1] ) ? $matches[1] : ""; |
| 800 | + return $s[0]; |
858 | 801 | } |
859 | 802 | |
860 | 803 | function initEncoding() { |
— | — | @@ -1038,7 +981,7 @@ |
1039 | 982 | # |
1040 | 983 | # $length does not include the optional ellipsis. |
1041 | 984 | # If $length is negative, snip from the beginning |
1042 | | - function truncate( $string, $length, $ellipsis = "" ) { |
| 985 | + function truncate( $string, $length, $ellipsis = '' ) { |
1043 | 986 | if( $length == 0 ) { |
1044 | 987 | return $ellipsis; |
1045 | 988 | } |
— | — | @@ -1047,24 +990,9 @@ |
1048 | 991 | } |
1049 | 992 | if( $length > 0 ) { |
1050 | 993 | $string = substr( $string, 0, $length ); |
1051 | | - $char = ord( $string[strlen( $string ) - 1] ); |
1052 | | - if ($char >= 0xc0) { |
1053 | | - # We got the first byte only of a multibyte char; remove it. |
1054 | | - $string = substr( $string, 0, -1 ); |
1055 | | - } elseif( $char >= 0x80 && |
1056 | | - preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . |
1057 | | - '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { |
1058 | | - # We chopped in the middle of a character; remove it |
1059 | | - $string = $m[1]; |
1060 | | - } |
1061 | 994 | return $string . $ellipsis; |
1062 | 995 | } else { |
1063 | 996 | $string = substr( $string, $length ); |
1064 | | - $char = ord( $string[0] ); |
1065 | | - if( $char >= 0x80 && $char < 0xc0 ) { |
1066 | | - # We chopped in the middle of a character; remove the whole thing |
1067 | | - $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); |
1068 | | - } |
1069 | 997 | return $ellipsis . $string; |
1070 | 998 | } |
1071 | 999 | } |
— | — | @@ -1262,34 +1190,13 @@ |
1263 | 1191 | return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) ); |
1264 | 1192 | } |
1265 | 1193 | |
1266 | | - function isMultibyte( $str ) { |
1267 | | - return (bool)preg_match( '/^[\x80-\xff]/', $str ); |
1268 | | - } |
1269 | 1194 | |
1270 | | - function fallback8bitEncoding() { |
1271 | | - # Windows codepage 1252 is a superset of iso 8859-1 |
1272 | | - # override this to use different source encoding to |
1273 | | - # translate incoming 8-bit URLs. |
1274 | | - return "windows-1252"; |
1275 | | - } |
1276 | 1195 | } |
1277 | 1196 | |
1278 | | -if( function_exists( 'mb_strtoupper' ) ) { |
1279 | | - mb_internal_encoding('UTF-8'); |
1280 | | -} else { |
1281 | | - # Hack our own case conversion routines |
| 1197 | +# FIXME: Merge all UTF-8 support code into Language base class. |
| 1198 | +# We no longer support Latin-1 charset. |
| 1199 | +require_once( 'LanguageUtf8.php' ); |
1282 | 1200 | |
1283 | | - # Loading serialized arrays is faster than parsing code :P |
1284 | | - $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" ); |
1285 | | - $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" ); |
1286 | | - |
1287 | | - if(empty( $wikiUpperChars) || empty($wikiLowerChars )) { |
1288 | | - require_once( "includes/Utf8Case.php" ); |
1289 | | - $wgMemc->set( $key1, $wikiUpperChars ); |
1290 | | - $wgMemc->set( $key2, $wikiLowerChars ); |
1291 | | - } |
1292 | | -} |
1293 | | - |
1294 | 1201 | # This should fail gracefully if there's not a localization available |
1295 | 1202 | wfSuppressWarnings(); |
1296 | 1203 | // Preload base classes to work around APC/PHP5 bug |
Index: trunk/phase3/languages/LanguageUtf8.php |
— | — | @@ -1,12 +1,199 @@ |
2 | 2 | <?php |
3 | 3 | /** |
| 4 | + * @package MediaWiki |
| 5 | + * @subpackage Language |
| 6 | + */ |
| 7 | + |
| 8 | +if( defined( "MEDIAWIKI" ) ) { |
| 9 | + |
| 10 | +# This file and LanguageLatin1.php may be included from within functions, so |
| 11 | +# we need to have global statements |
| 12 | + |
| 13 | +global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars; |
| 14 | +global $wgDBname, $wgMemc; |
| 15 | + |
| 16 | +$wgInputEncoding = "UTF-8"; |
| 17 | +$wgOutputEncoding = "UTF-8"; |
| 18 | + |
| 19 | +if( function_exists( 'mb_strtoupper' ) ) { |
| 20 | + mb_internal_encoding('UTF-8'); |
| 21 | +} else { |
| 22 | + # Hack our own case conversion routines |
| 23 | + |
| 24 | + # Loading serialized arrays is faster than parsing code :P |
| 25 | + $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" ); |
| 26 | + $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" ); |
| 27 | + |
| 28 | + if(empty( $wikiUpperChars) || empty($wikiLowerChars )) { |
| 29 | + require_once( "includes/Utf8Case.php" ); |
| 30 | + $wgMemc->set( $key1, $wikiUpperChars ); |
| 31 | + $wgMemc->set( $key2, $wikiLowerChars ); |
| 32 | + } |
| 33 | +} |
| 34 | + |
| 35 | +/** |
4 | 36 | * Base stuff useful to all UTF-8 based language files |
5 | 37 | * @package MediaWiki |
6 | | - * |
7 | | - * Will be deleted |
8 | 38 | */ |
9 | 39 | class LanguageUtf8 extends Language { |
10 | 40 | |
| 41 | + # These functions use mbstring library, if it is loaded |
| 42 | + # or compiled and character mapping arrays otherwise. |
| 43 | + # In case of language-specific character mismatch |
| 44 | + # it should be dealt with in Language classes. |
| 45 | + |
| 46 | + function ucfirst( $str ) { |
| 47 | + return LanguageUtf8::uc( $str, true ); |
| 48 | + } |
| 49 | + |
| 50 | + function uc( $str, $first = false ) { |
| 51 | + if ( function_exists( 'mb_strtoupper' ) ) |
| 52 | + if ( $first ) |
| 53 | + if ( LanguageUtf8::isMultibyte( $str ) ) |
| 54 | + return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
| 55 | + else |
| 56 | + return ucfirst( $str ); |
| 57 | + else |
| 58 | + return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str ); |
| 59 | + else |
| 60 | + if ( LanguageUtf8::isMultibyte( $str ) ) { |
| 61 | + global $wikiUpperChars; |
| 62 | + $x = $first ? '^' : ''; |
| 63 | + return preg_replace( |
| 64 | + "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 65 | + "strtr( \"\$1\" , \$wikiUpperChars )", |
| 66 | + $str |
| 67 | + ); |
| 68 | + } else |
| 69 | + return $first ? ucfirst( $str ) : strtoupper( $str ); |
| 70 | + } |
| 71 | + |
| 72 | + function lcfirst( $str ) { |
| 73 | + return LanguageUtf8::lc( $str, true ); |
| 74 | + } |
| 75 | + |
| 76 | + function lc( $str, $first = false ) { |
| 77 | + if ( function_exists( 'mb_strtolower' ) ) |
| 78 | + if ( $first ) |
| 79 | + if ( LanguageUtf8::isMultibyte( $str ) ) |
| 80 | + return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); |
| 81 | + else |
| 82 | + return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ); |
| 83 | + else |
| 84 | + return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str ); |
| 85 | + else |
| 86 | + if ( LanguageUtf8::isMultibyte( $str ) ) { |
| 87 | + global $wikiLowerChars; |
| 88 | + $x = $first ? '^' : ''; |
| 89 | + return preg_replace( |
| 90 | + "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 91 | + "strtr( \"\$1\" , \$wikiLowerChars )", |
| 92 | + $str |
| 93 | + ); |
| 94 | + } else |
| 95 | + return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); |
| 96 | + } |
| 97 | + |
| 98 | + function isMultibyte( $str ) { |
| 99 | + return (bool)preg_match( '/^[\x80-\xff]/', $str ); |
| 100 | + } |
| 101 | + |
| 102 | + function stripForSearch( $string ) { |
| 103 | + # MySQL fulltext index doesn't grok utf-8, so we |
| 104 | + # need to fold cases and convert to hex |
| 105 | + |
| 106 | + # In Language:: it just returns lowercase, maybe |
| 107 | + # all strtolower on stripped output or argument |
| 108 | + # should be removed and all stripForSearch |
| 109 | + # methods adjusted to that. |
| 110 | + |
| 111 | + wfProfileIn( "LanguageUtf8::stripForSearch" ); |
| 112 | + if( function_exists( 'mb_strtolower' ) ) { |
| 113 | + $out = preg_replace( |
| 114 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 115 | + "'U8' . bin2hex( \"$1\" )", |
| 116 | + mb_strtolower( $string ) ); |
| 117 | + } else { |
| 118 | + global $wikiLowerChars; |
| 119 | + $out = preg_replace( |
| 120 | + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", |
| 121 | + "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", |
| 122 | + $string ); |
| 123 | + } |
| 124 | + wfProfileOut( "LanguageUtf8::stripForSearch" ); |
| 125 | + return $out; |
| 126 | + } |
| 127 | + |
| 128 | + function fallback8bitEncoding() { |
| 129 | + # Windows codepage 1252 is a superset of iso 8859-1 |
| 130 | + # override this to use different source encoding to |
| 131 | + # translate incoming 8-bit URLs. |
| 132 | + return "windows-1252"; |
| 133 | + } |
| 134 | + |
| 135 | + function checkTitleEncoding( $s ) { |
| 136 | + global $wgInputEncoding; |
| 137 | + |
| 138 | + if( is_array( $s ) ) { |
| 139 | + wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); |
| 140 | + } |
| 141 | + # Check for non-UTF-8 URLs |
| 142 | + $ishigh = preg_match( '/[\x80-\xff]/', $s); |
| 143 | + if(!$ishigh) return $s; |
| 144 | + |
| 145 | + $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
| 146 | + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); |
| 147 | + if( $isutf8 ) return $s; |
| 148 | + |
| 149 | + return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); |
| 150 | + } |
| 151 | + |
| 152 | + function firstChar( $s ) { |
| 153 | + preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . |
| 154 | + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches); |
| 155 | + |
| 156 | + return isset( $matches[1] ) ? $matches[1] : ""; |
| 157 | + } |
| 158 | + |
| 159 | + # Crop a string from the beginning or end to a certain number of bytes. |
| 160 | + # (Bytes are used because our storage has limited byte lengths for some |
| 161 | + # columns in the database.) Multibyte charsets will need to make sure that |
| 162 | + # only whole characters are included! |
| 163 | + # |
| 164 | + # $length does not include the optional ellipsis. |
| 165 | + # If $length is negative, snip from the beginning |
| 166 | + function truncate( $string, $length, $ellipsis = "" ) { |
| 167 | + if( $length == 0 ) { |
| 168 | + return $ellipsis; |
| 169 | + } |
| 170 | + if ( strlen( $string ) <= abs( $length ) ) { |
| 171 | + return $string; |
| 172 | + } |
| 173 | + if( $length > 0 ) { |
| 174 | + $string = substr( $string, 0, $length ); |
| 175 | + $char = ord( $string[strlen( $string ) - 1] ); |
| 176 | + if ($char >= 0xc0) { |
| 177 | + # We got the first byte only of a multibyte char; remove it. |
| 178 | + $string = substr( $string, 0, -1 ); |
| 179 | + } elseif( $char >= 0x80 && |
| 180 | + preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . |
| 181 | + '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { |
| 182 | + # We chopped in the middle of a character; remove it |
| 183 | + $string = $m[1]; |
| 184 | + } |
| 185 | + return $string . $ellipsis; |
| 186 | + } else { |
| 187 | + $string = substr( $string, $length ); |
| 188 | + $char = ord( $string[0] ); |
| 189 | + if( $char >= 0x80 && $char < 0xc0 ) { |
| 190 | + # We chopped in the middle of a character; remove the whole thing |
| 191 | + $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); |
| 192 | + } |
| 193 | + return $ellipsis . $string; |
| 194 | + } |
| 195 | + } |
11 | 196 | } |
12 | 197 | |
| 198 | +} # ifdef MEDIAWIKI |
| 199 | + |
13 | 200 | ?> |