r14684 MediaWiki - Code Review archive

Repository: MediaWiki
Revision: < r14683 | r14684 | r14685 >
Date: 15:41, 9 June 2006
Author: nikerabbit
Status: old
Tags:
Comment:
* (bug 2069) Merge the LanguageUtf8 class into the Language class
* Based on patch from Rotem Liss
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/languages/Language.php (modified) (history)
  • /trunk/phase3/languages/LanguageUtf8.php (modified) (history)

Diff [purge]

Index: trunk/phase3/RELEASE-NOTES
@@ -464,6 +464,7 @@
465465 * (bug 6243) Fix email for usernames containing dots when using PEAR::Mail
466466 * Remove a number of needless {{ns:project}}-type transforms from messages files. These
467467 usages already have separate label text. Such transforms are wasteful on each page view.
 468+* (bug 2069) Merge the LanguageUtf8 class into the Language class
468469
469470 == Compatibility ==
470471
Index: trunk/phase3/languages/Language.php
@@ -732,41 +732,73 @@
733733 return iconv( $in, $out, $string );
734734 }
735735
736 - function ucfirst( $string ) {
737 - # For most languages, this is a wrapper for ucfirst()
738 - return ucfirst( $string );
 736+ function ucfirst( $str ) {
 737+ return $this->uc( $str, true );
739738 }
740739
741 - function uc( $str ) {
742 - return strtoupper( $str );
 740+ function uc( $str, $first = false ) {
 741+ if ( function_exists( 'mb_strtoupper' ) )
 742+ if ( $first )
 743+ if ( $this->isMultibyte( $str ) )
 744+ return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
 745+ else
 746+ return ucfirst( $str );
 747+ else
 748+ return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );
 749+ else
 750+ if ( $this->isMultibyte( $str ) ) {
 751+ global $wikiUpperChars;
 752+ $x = $first ? '^' : '';
 753+ return preg_replace(
 754+ "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
 755+ "strtr( \"\$1\" , \$wikiUpperChars )",
 756+ $str
 757+ );
 758+ } else
 759+ return $first ? ucfirst( $str ) : strtoupper( $str );
743760 }
744761
745 - function lcfirst( $s ) {
746 - return strtolower( $s{0} ). substr( $s, 1 );
 762+ function lcfirst( $str ) {
 763+ return $this->lc( $str, true );
747764 }
748765
749 - function lc( $str ) {
750 - return strtolower( $str );
 766+ function lc( $str, $first = false ) {
 767+ if ( function_exists( 'mb_strtolower' ) )
 768+ if ( $first )
 769+ if ( $this->isMultibyte( $str ) )
 770+ return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
 771+ else
 772+ return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
 773+ else
 774+ return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );
 775+ else
 776+ if ( $this->isMultibyte( $str ) ) {
 777+ global $wikiLowerChars;
 778+ $x = $first ? '^' : '';
 779+ return preg_replace(
 780+ "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
 781+ "strtr( \"\$1\" , \$wikiLowerChars )",
 782+ $str
 783+ );
 784+ } else
 785+ return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
751786 }
752787
753788 function checkTitleEncoding( $s ) {
754789 global $wgInputEncoding;
755790
756 - # Check for UTF-8 URLs; Internet Explorer produces these if you
757 - # type non-ASCII chars in the URL bar or follow unescaped links.
 791+ if( is_array( $s ) ) {
 792+ wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
 793+ }
 794+ # Check for non-UTF-8 URLs
758795 $ishigh = preg_match( '/[\x80-\xff]/', $s);
759 - $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
760 - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true );
 796+ if(!$ishigh) return $s;
761797
762 - if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf )
763 - return @iconv( 'UTF-8', $wgInputEncoding, $s );
 798+ $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 799+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
 800+ if( $isutf8 ) return $s;
764801
765 - if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf )
766 - return utf8_encode( $s );
767 -
768 - # Other languages can safely leave this function, or replace
769 - # it with one to detect and convert another legacy encoding.
770 - return $s;
 802+ return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
771803 }
772804
773805 /**
@@ -774,11 +806,33 @@
775807 * or characters which need to be converted for MySQL's
776808 * indexing to grok it correctly. Make such changes here.
777809 *
778 - * @param string $in
 810+ * @param string $string
779811 * @return string
780812 */
781 - function stripForSearch( $in ) {
782 - return strtolower( $in );
 813+ function stripForSearch( $string ) {
 814+ # MySQL fulltext index doesn't grok utf-8, so we
 815+ # need to fold cases and convert to hex
 816+
 817+ # In Language:: it just returns lowercase, maybe
 818+ # all strtolower on stripped output or argument
 819+ # should be removed and all stripForSearch
 820+ # methods adjusted to that.
 821+
 822+ wfProfileIn( "Language::stripForSearch" );
 823+ if( function_exists( 'mb_strtolower' ) ) {
 824+ $out = preg_replace(
 825+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
 826+ "'U8' . bin2hex( \"$1\" )",
 827+ mb_strtolower( $string ) );
 828+ } else {
 829+ global $wikiLowerChars;
 830+ $out = preg_replace(
 831+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
 832+ "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
 833+ $string );
 834+ }
 835+ wfProfileOut( "Language::stripForSearch" );
 836+ return $out;
783837 }
784838
785839 function convertForSearchResult( $termsArray ) {
@@ -796,7 +850,10 @@
797851 * @return string
798852 */
799853 function firstChar( $s ) {
800 - return $s[0];
 854+ preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 855+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
 856+
 857+ return isset( $matches[1] ) ? $matches[1] : "";
801858 }
802859
803860 function initEncoding() {
@@ -981,7 +1038,7 @@
9821039 #
9831040 # $length does not include the optional ellipsis.
9841041 # If $length is negative, snip from the beginning
985 - function truncate( $string, $length, $ellipsis = '' ) {
 1042+ function truncate( $string, $length, $ellipsis = "" ) {
9861043 if( $length == 0 ) {
9871044 return $ellipsis;
9881045 }
@@ -990,9 +1047,24 @@
9911048 }
9921049 if( $length > 0 ) {
9931050 $string = substr( $string, 0, $length );
 1051+ $char = ord( $string[strlen( $string ) - 1] );
 1052+ if ($char >= 0xc0) {
 1053+ # We got the first byte only of a multibyte char; remove it.
 1054+ $string = substr( $string, 0, -1 );
 1055+ } elseif( $char >= 0x80 &&
 1056+ preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
 1057+ '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
 1058+ # We chopped in the middle of a character; remove it
 1059+ $string = $m[1];
 1060+ }
9941061 return $string . $ellipsis;
9951062 } else {
9961063 $string = substr( $string, $length );
 1064+ $char = ord( $string[0] );
 1065+ if( $char >= 0x80 && $char < 0xc0 ) {
 1066+ # We chopped in the middle of a character; remove the whole thing
 1067+ $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
 1068+ }
9971069 return $ellipsis . $string;
9981070 }
9991071 }
@@ -1190,13 +1262,34 @@
11911263 return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) );
11921264 }
11931265
 1266+ function isMultibyte( $str ) {
 1267+ return (bool)preg_match( '/^[\x80-\xff]/', $str );
 1268+ }
11941269
 1270+ function fallback8bitEncoding() {
 1271+ # Windows codepage 1252 is a superset of iso 8859-1
 1272+ # override this to use difference source encoding to
 1273+ # translate incoming 8-bit URLs.
 1274+ return "windows-1252";
 1275+ }
11951276 }
11961277
1197 -# FIXME: Merge all UTF-8 support code into Language base class.
1198 -# We no longer support Latin-1 charset.
1199 -require_once( 'LanguageUtf8.php' );
 1278+if( function_exists( 'mb_strtoupper' ) ) {
 1279+ mb_internal_encoding('UTF-8');
 1280+} else {
 1281+ # Hack our own case conversion routines
12001282
 1283+ # Loading serialized arrays is faster than parsing code :P
 1284+ $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
 1285+ $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
 1286+
 1287+ if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
 1288+ require_once( "includes/Utf8Case.php" );
 1289+ $wgMemc->set( $key1, $wikiUpperChars );
 1290+ $wgMemc->set( $key2, $wikiLowerChars );
 1291+ }
 1292+}
 1293+
12011294 # This should fail gracefully if there's not a localization available
12021295 wfSuppressWarnings();
12031296 // Preload base classes to work around APC/PHP5 bug
Index: trunk/phase3/languages/LanguageUtf8.php
@@ -1,199 +1,12 @@
22 <?php
33 /**
4 - * @package MediaWiki
5 - * @subpackage Language
6 - */
7 -
8 -if( defined( "MEDIAWIKI" ) ) {
9 -
10 -# This file and LanguageLatin1.php may be included from within functions, so
11 -# we need to have global statements
12 -
13 -global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
14 -global $wgDBname, $wgMemc;
15 -
16 -$wgInputEncoding = "UTF-8";
17 -$wgOutputEncoding = "UTF-8";
18 -
19 -if( function_exists( 'mb_strtoupper' ) ) {
20 - mb_internal_encoding('UTF-8');
21 -} else {
22 - # Hack our own case conversion routines
23 -
24 - # Loading serialized arrays is faster than parsing code :P
25 - $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
26 - $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
27 -
28 - if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
29 - require_once( "includes/Utf8Case.php" );
30 - $wgMemc->set( $key1, $wikiUpperChars );
31 - $wgMemc->set( $key2, $wikiLowerChars );
32 - }
33 -}
34 -
35 -/**
364 * Base stuff useful to all UTF-8 based language files
375 * @package MediaWiki
 6+ *
 7+ * Will be deleted
388 */
399 class LanguageUtf8 extends Language {
4010
41 - # These functions use mbstring library, if it is loaded
42 - # or compiled and character mapping arrays otherwise.
43 - # In case of language-specific character mismatch
44 - # it should be dealt with in Language classes.
45 -
46 - function ucfirst( $str ) {
47 - return LanguageUtf8::uc( $str, true );
48 - }
49 -
50 - function uc( $str, $first = false ) {
51 - if ( function_exists( 'mb_strtoupper' ) )
52 - if ( $first )
53 - if ( LanguageUtf8::isMultibyte( $str ) )
54 - return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
55 - else
56 - return ucfirst( $str );
57 - else
58 - return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );
59 - else
60 - if ( LanguageUtf8::isMultibyte( $str ) ) {
61 - global $wikiUpperChars;
62 - $x = $first ? '^' : '';
63 - return preg_replace(
64 - "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
65 - "strtr( \"\$1\" , \$wikiUpperChars )",
66 - $str
67 - );
68 - } else
69 - return $first ? ucfirst( $str ) : strtoupper( $str );
70 - }
71 -
72 - function lcfirst( $str ) {
73 - return LanguageUtf8::lc( $str, true );
74 - }
75 -
76 - function lc( $str, $first = false ) {
77 - if ( function_exists( 'mb_strtolower' ) )
78 - if ( $first )
79 - if ( LanguageUtf8::isMultibyte( $str ) )
80 - return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
81 - else
82 - return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
83 - else
84 - return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );
85 - else
86 - if ( LanguageUtf8::isMultibyte( $str ) ) {
87 - global $wikiLowerChars;
88 - $x = $first ? '^' : '';
89 - return preg_replace(
90 - "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
91 - "strtr( \"\$1\" , \$wikiLowerChars )",
92 - $str
93 - );
94 - } else
95 - return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
96 - }
97 -
98 - function isMultibyte( $str ) {
99 - return (bool)preg_match( '/^[\x80-\xff]/', $str );
100 - }
101 -
102 - function stripForSearch( $string ) {
103 - # MySQL fulltext index doesn't grok utf-8, so we
104 - # need to fold cases and convert to hex
105 -
106 - # In Language:: it just returns lowercase, maybe
107 - # all strtolower on stripped output or argument
108 - # should be removed and all stripForSearch
109 - # methods adjusted to that.
110 -
111 - wfProfileIn( "LanguageUtf8::stripForSearch" );
112 - if( function_exists( 'mb_strtolower' ) ) {
113 - $out = preg_replace(
114 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
115 - "'U8' . bin2hex( \"$1\" )",
116 - mb_strtolower( $string ) );
117 - } else {
118 - global $wikiLowerChars;
119 - $out = preg_replace(
120 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
121 - "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
122 - $string );
123 - }
124 - wfProfileOut( "LanguageUtf8::stripForSearch" );
125 - return $out;
126 - }
127 -
128 - function fallback8bitEncoding() {
129 - # Windows codepage 1252 is a superset of iso 8859-1
130 - # override this to use difference source encoding to
131 - # translate incoming 8-bit URLs.
132 - return "windows-1252";
133 - }
134 -
135 - function checkTitleEncoding( $s ) {
136 - global $wgInputEncoding;
137 -
138 - if( is_array( $s ) ) {
139 - wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
140 - }
141 - # Check for non-UTF-8 URLs
142 - $ishigh = preg_match( '/[\x80-\xff]/', $s);
143 - if(!$ishigh) return $s;
144 -
145 - $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
146 - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
147 - if( $isutf8 ) return $s;
148 -
149 - return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
150 - }
151 -
152 - function firstChar( $s ) {
153 - preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
154 - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
155 -
156 - return isset( $matches[1] ) ? $matches[1] : "";
157 - }
158 -
159 - # Crop a string from the beginning or end to a certain number of bytes.
160 - # (Bytes are used because our storage has limited byte lengths for some
161 - # columns in the database.) Multibyte charsets will need to make sure that
162 - # only whole characters are included!
163 - #
164 - # $length does not include the optional ellipsis.
165 - # If $length is negative, snip from the beginning
166 - function truncate( $string, $length, $ellipsis = "" ) {
167 - if( $length == 0 ) {
168 - return $ellipsis;
169 - }
170 - if ( strlen( $string ) <= abs( $length ) ) {
171 - return $string;
172 - }
173 - if( $length > 0 ) {
174 - $string = substr( $string, 0, $length );
175 - $char = ord( $string[strlen( $string ) - 1] );
176 - if ($char >= 0xc0) {
177 - # We got the first byte only of a multibyte char; remove it.
178 - $string = substr( $string, 0, -1 );
179 - } elseif( $char >= 0x80 &&
180 - preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
181 - '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
182 - # We chopped in the middle of a character; remove it
183 - $string = $m[1];
184 - }
185 - return $string . $ellipsis;
186 - } else {
187 - $string = substr( $string, $length );
188 - $char = ord( $string[0] );
189 - if( $char >= 0x80 && $char < 0xc0 ) {
190 - # We chopped in the middle of a character; remove the whole thing
191 - $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
192 - }
193 - return $ellipsis . $string;
194 - }
195 - }
19611 }
19712
198 -} # ifdef MEDIAWIKI
199 -
20013 ?>

Follow-up revisions

Revision | Commit summary | Author | Date
r14693 | Revert bad patch for (bug 2069) Merge the LanguageUtf8 class into the Languag... | brion | 23:23, 9 June 2006

Status & tagging log