r14693 MediaWiki - Code Review archive

Repository:MediaWiki
Revision: < r14692 | r14693 | r14694 >
Date:23:23, 9 June 2006
Author:brion
Status:old
Tags:
Comment:
Revert bad patch for (bug 2069) Merge the LanguageUtf8 class into the Language class
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/languages/Language.php (modified) (history)
  • /trunk/phase3/languages/LanguageUtf8.php (modified) (history)

Diff [purge]

Index: trunk/phase3/RELEASE-NOTES
@@ -464,7 +464,6 @@
465465 * (bug 6243) Fix email for usernames containing dots when using PEAR::Mail
466466 * Remove a number of needless {{ns:project}}-type transforms from messages files. These
467467 usages already have separate label text. Such transforms are wasteful on each page view.
468 -* (bug 2069) Merge the LanguageUtf8 class into the Language class
469468 * Update to Yiddish localization (yi)
470469 * (bug 6254) Update to Indonesian translation (id) #20
471470 * (bug 6255) Fix transclusions starting with "#" or "*" in HTML attributes
Index: trunk/phase3/languages/Language.php
@@ -732,73 +732,41 @@
733733 return iconv( $in, $out, $string );
734734 }
735735
736 - function ucfirst( $str ) {
737 - return $this->uc( $str, true );
 736+ function ucfirst( $string ) {
 737+ # For most languages, this is a wrapper for ucfirst()
 738+ return ucfirst( $string );
738739 }
739740
740 - function uc( $str, $first = false ) {
741 - if ( function_exists( 'mb_strtoupper' ) )
742 - if ( $first )
743 - if ( $this->isMultibyte( $str ) )
744 - return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
745 - else
746 - return ucfirst( $str );
747 - else
748 - return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );
749 - else
750 - if ( $this->isMultibyte( $str ) ) {
751 - global $wikiUpperChars;
752 - $x = $first ? '^' : '';
753 - return preg_replace(
754 - "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
755 - "strtr( \"\$1\" , \$wikiUpperChars )",
756 - $str
757 - );
758 - } else
759 - return $first ? ucfirst( $str ) : strtoupper( $str );
 741+ function uc( $str ) {
 742+ return strtoupper( $str );
760743 }
761744
762 - function lcfirst( $str ) {
763 - return $this->lc( $str, true );
 745+ function lcfirst( $s ) {
 746+ return strtolower( $s{0} ). substr( $s, 1 );
764747 }
765748
766 - function lc( $str, $first = false ) {
767 - if ( function_exists( 'mb_strtolower' ) )
768 - if ( $first )
769 - if ( $this->isMultibyte( $str ) )
770 - return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
771 - else
772 - return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
773 - else
774 - return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );
775 - else
776 - if ( $this->isMultibyte( $str ) ) {
777 - global $wikiLowerChars;
778 - $x = $first ? '^' : '';
779 - return preg_replace(
780 - "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
781 - "strtr( \"\$1\" , \$wikiLowerChars )",
782 - $str
783 - );
784 - } else
785 - return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
 749+ function lc( $str ) {
 750+ return strtolower( $str );
786751 }
787752
788753 function checkTitleEncoding( $s ) {
789754 global $wgInputEncoding;
790755
791 - if( is_array( $s ) ) {
792 - wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
793 - }
794 - # Check for non-UTF-8 URLs
 756+ # Check for UTF-8 URLs; Internet Explorer produces these if you
 757+ # type non-ASCII chars in the URL bar or follow unescaped links.
795758 $ishigh = preg_match( '/[\x80-\xff]/', $s);
796 - if(!$ishigh) return $s;
 759+ $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 760+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true );
797761
798 - $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
799 - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
800 - if( $isutf8 ) return $s;
 762+ if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf )
 763+ return @iconv( 'UTF-8', $wgInputEncoding, $s );
801764
802 - return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
 765+ if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf )
 766+ return utf8_encode( $s );
 767+
 768+ # Other languages can safely leave this function, or replace
 769+ # it with one to detect and convert another legacy encoding.
 770+ return $s;
803771 }
804772
805773 /**
@@ -806,33 +774,11 @@
807775 * or characters which need to be converted for MySQL's
808776 * indexing to grok it correctly. Make such changes here.
809777 *
810 - * @param string $string
 778+ * @param string $in
811779 * @return string
812780 */
813 - function stripForSearch( $string ) {
814 - # MySQL fulltext index doesn't grok utf-8, so we
815 - # need to fold cases and convert to hex
816 -
817 - # In Language:: it just returns lowercase, maybe
818 - # all strtolower on stripped output or argument
819 - # should be removed and all stripForSearch
820 - # methods adjusted to that.
821 -
822 - wfProfileIn( "Language::stripForSearch" );
823 - if( function_exists( 'mb_strtolower' ) ) {
824 - $out = preg_replace(
825 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
826 - "'U8' . bin2hex( \"$1\" )",
827 - mb_strtolower( $string ) );
828 - } else {
829 - global $wikiLowerChars;
830 - $out = preg_replace(
831 - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
832 - "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
833 - $string );
834 - }
835 - wfProfileOut( "Language::stripForSearch" );
836 - return $out;
 781+ function stripForSearch( $in ) {
 782+ return strtolower( $in );
837783 }
838784
839785 function convertForSearchResult( $termsArray ) {
@@ -850,10 +796,7 @@
851797 * @return string
852798 */
853799 function firstChar( $s ) {
854 - preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
855 - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
856 -
857 - return isset( $matches[1] ) ? $matches[1] : "";
 800+ return $s[0];
858801 }
859802
860803 function initEncoding() {
@@ -1038,7 +981,7 @@
1039982 #
1040983 # $length does not include the optional ellipsis.
1041984 # If $length is negative, snip from the beginning
1042 - function truncate( $string, $length, $ellipsis = "" ) {
 985+ function truncate( $string, $length, $ellipsis = '' ) {
1043986 if( $length == 0 ) {
1044987 return $ellipsis;
1045988 }
@@ -1047,24 +990,9 @@
1048991 }
1049992 if( $length > 0 ) {
1050993 $string = substr( $string, 0, $length );
1051 - $char = ord( $string[strlen( $string ) - 1] );
1052 - if ($char >= 0xc0) {
1053 - # We got the first byte only of a multibyte char; remove it.
1054 - $string = substr( $string, 0, -1 );
1055 - } elseif( $char >= 0x80 &&
1056 - preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
1057 - '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
1058 - # We chopped in the middle of a character; remove it
1059 - $string = $m[1];
1060 - }
1061994 return $string . $ellipsis;
1062995 } else {
1063996 $string = substr( $string, $length );
1064 - $char = ord( $string[0] );
1065 - if( $char >= 0x80 && $char < 0xc0 ) {
1066 - # We chopped in the middle of a character; remove the whole thing
1067 - $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
1068 - }
1069997 return $ellipsis . $string;
1070998 }
1071999 }
@@ -1262,34 +1190,13 @@
12631191 return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) );
12641192 }
12651193
1266 - function isMultibyte( $str ) {
1267 - return (bool)preg_match( '/^[\x80-\xff]/', $str );
1268 - }
12691194
1270 - function fallback8bitEncoding() {
1271 - # Windows codepage 1252 is a superset of iso 8859-1
1272 - # override this to use difference source encoding to
1273 - # translate incoming 8-bit URLs.
1274 - return "windows-1252";
1275 - }
12761195 }
12771196
1278 -if( function_exists( 'mb_strtoupper' ) ) {
1279 - mb_internal_encoding('UTF-8');
1280 -} else {
1281 - # Hack our own case conversion routines
 1197+# FIXME: Merge all UTF-8 support code into Language base class.
 1198+# We no longer support Latin-1 charset.
 1199+require_once( 'LanguageUtf8.php' );
12821200
1283 - # Loading serialized arrays is faster than parsing code :P
1284 - $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
1285 - $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
1286 -
1287 - if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
1288 - require_once( "includes/Utf8Case.php" );
1289 - $wgMemc->set( $key1, $wikiUpperChars );
1290 - $wgMemc->set( $key2, $wikiLowerChars );
1291 - }
1292 -}
1293 -
12941201 # This should fail gracefully if there's not a localization available
12951202 wfSuppressWarnings();
12961203 // Preload base classes to work around APC/PHP5 bug
Index: trunk/phase3/languages/LanguageUtf8.php
@@ -1,12 +1,199 @@
22 <?php
33 /**
 4+ * @package MediaWiki
 5+ * @subpackage Language
 6+ */
 7+
 8+if( defined( "MEDIAWIKI" ) ) {
 9+
 10+# This file and LanguageLatin1.php may be included from within functions, so
 11+# we need to have global statements
 12+
 13+global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
 14+global $wgDBname, $wgMemc;
 15+
 16+$wgInputEncoding = "UTF-8";
 17+$wgOutputEncoding = "UTF-8";
 18+
 19+if( function_exists( 'mb_strtoupper' ) ) {
 20+ mb_internal_encoding('UTF-8');
 21+} else {
 22+ # Hack our own case conversion routines
 23+
 24+ # Loading serialized arrays is faster than parsing code :P
 25+ $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
 26+ $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
 27+
 28+ if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
 29+ require_once( "includes/Utf8Case.php" );
 30+ $wgMemc->set( $key1, $wikiUpperChars );
 31+ $wgMemc->set( $key2, $wikiLowerChars );
 32+ }
 33+}
 34+
 35+/**
436 * Base stuff useful to all UTF-8 based language files
537 * @package MediaWiki
6 - *
7 - * Will be deleted
838 */
939 class LanguageUtf8 extends Language {
1040
 41+ # These functions use mbstring library, if it is loaded
 42+ # or compiled and character mapping arrays otherwise.
 43+ # In case of language-specific character mismatch
 44+ # it should be dealt with in Language classes.
 45+
 46+ function ucfirst( $str ) {
 47+ return LanguageUtf8::uc( $str, true );
 48+ }
 49+
 50+ function uc( $str, $first = false ) {
 51+ if ( function_exists( 'mb_strtoupper' ) )
 52+ if ( $first )
 53+ if ( LanguageUtf8::isMultibyte( $str ) )
 54+ return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
 55+ else
 56+ return ucfirst( $str );
 57+ else
 58+ return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str );
 59+ else
 60+ if ( LanguageUtf8::isMultibyte( $str ) ) {
 61+ global $wikiUpperChars;
 62+ $x = $first ? '^' : '';
 63+ return preg_replace(
 64+ "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
 65+ "strtr( \"\$1\" , \$wikiUpperChars )",
 66+ $str
 67+ );
 68+ } else
 69+ return $first ? ucfirst( $str ) : strtoupper( $str );
 70+ }
 71+
 72+ function lcfirst( $str ) {
 73+ return LanguageUtf8::lc( $str, true );
 74+ }
 75+
 76+ function lc( $str, $first = false ) {
 77+ if ( function_exists( 'mb_strtolower' ) )
 78+ if ( $first )
 79+ if ( LanguageUtf8::isMultibyte( $str ) )
 80+ return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
 81+ else
 82+ return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 );
 83+ else
 84+ return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str );
 85+ else
 86+ if ( LanguageUtf8::isMultibyte( $str ) ) {
 87+ global $wikiLowerChars;
 88+ $x = $first ? '^' : '';
 89+ return preg_replace(
 90+ "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
 91+ "strtr( \"\$1\" , \$wikiLowerChars )",
 92+ $str
 93+ );
 94+ } else
 95+ return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
 96+ }
 97+
 98+ function isMultibyte( $str ) {
 99+ return (bool)preg_match( '/^[\x80-\xff]/', $str );
 100+ }
 101+
 102+ function stripForSearch( $string ) {
 103+ # MySQL fulltext index doesn't grok utf-8, so we
 104+ # need to fold cases and convert to hex
 105+
 106+ # In Language:: it just returns lowercase, maybe
 107+ # all strtolower on stripped output or argument
 108+ # should be removed and all stripForSearch
 109+ # methods adjusted to that.
 110+
 111+ wfProfileIn( "LanguageUtf8::stripForSearch" );
 112+ if( function_exists( 'mb_strtolower' ) ) {
 113+ $out = preg_replace(
 114+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
 115+ "'U8' . bin2hex( \"$1\" )",
 116+ mb_strtolower( $string ) );
 117+ } else {
 118+ global $wikiLowerChars;
 119+ $out = preg_replace(
 120+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
 121+ "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
 122+ $string );
 123+ }
 124+ wfProfileOut( "LanguageUtf8::stripForSearch" );
 125+ return $out;
 126+ }
 127+
 128+ function fallback8bitEncoding() {
 129+ # Windows codepage 1252 is a superset of iso 8859-1
 130+ # override this to use difference source encoding to
 131+ # translate incoming 8-bit URLs.
 132+ return "windows-1252";
 133+ }
 134+
 135+ function checkTitleEncoding( $s ) {
 136+ global $wgInputEncoding;
 137+
 138+ if( is_array( $s ) ) {
 139+ wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
 140+ }
 141+ # Check for non-UTF-8 URLs
 142+ $ishigh = preg_match( '/[\x80-\xff]/', $s);
 143+ if(!$ishigh) return $s;
 144+
 145+ $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 146+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
 147+ if( $isutf8 ) return $s;
 148+
 149+ return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
 150+ }
 151+
 152+ function firstChar( $s ) {
 153+ preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 154+ '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
 155+
 156+ return isset( $matches[1] ) ? $matches[1] : "";
 157+ }
 158+
 159+ # Crop a string from the beginning or end to a certain number of bytes.
 160+ # (Bytes are used because our storage has limited byte lengths for some
 161+ # columns in the database.) Multibyte charsets will need to make sure that
 162+ # only whole characters are included!
 163+ #
 164+ # $length does not include the optional ellipsis.
 165+ # If $length is negative, snip from the beginning
 166+ function truncate( $string, $length, $ellipsis = "" ) {
 167+ if( $length == 0 ) {
 168+ return $ellipsis;
 169+ }
 170+ if ( strlen( $string ) <= abs( $length ) ) {
 171+ return $string;
 172+ }
 173+ if( $length > 0 ) {
 174+ $string = substr( $string, 0, $length );
 175+ $char = ord( $string[strlen( $string ) - 1] );
 176+ if ($char >= 0xc0) {
 177+ # We got the first byte only of a multibyte char; remove it.
 178+ $string = substr( $string, 0, -1 );
 179+ } elseif( $char >= 0x80 &&
 180+ preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
 181+ '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
 182+ # We chopped in the middle of a character; remove it
 183+ $string = $m[1];
 184+ }
 185+ return $string . $ellipsis;
 186+ } else {
 187+ $string = substr( $string, $length );
 188+ $char = ord( $string[0] );
 189+ if( $char >= 0x80 && $char < 0xc0 ) {
 190+ # We chopped in the middle of a character; remove the whole thing
 191+ $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
 192+ }
 193+ return $ellipsis . $string;
 194+ }
 195+ }
11196 }
12197
 198+} # ifdef MEDIAWIKI
 199+
13200 ?>

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r14684 | * (bug 2069) Merge the LanguageUtf8 class into the Language class... | nikerabbit | 15:41, 9 June 2006

Status & tagging log