Index: trunk/phase3/includes/GlobalFunctions.php |
— | — | @@ -33,18 +33,71 @@ |
34 | 34 | } |
35 | 35 | } |
36 | 36 | |
37 | | -# UTF-8 substr function based on a PHP manual comment |
38 | 37 | if ( !function_exists( 'mb_substr' ) ) { |
39 | | - function mb_substr( $str, $start ) { |
40 | | - $ar = array(); |
41 | | - preg_match_all( '/./us', $str, $ar ); |
42 | | - |
43 | | - if( func_num_args() >= 3 ) { |
44 | | - $end = func_get_arg( 2 ); |
45 | | - return join( '', array_slice( $ar[0], $start, $end ) ); |
/**
 * Fallback implementation of mb_substr(), hardcoded to UTF-8.
 *
 * Attempts to be at least _moderately_ efficient; best optimized for
 * relatively small offset and count values. The original author measured
 * it at about 5x slower than native mbstring for that case.
 *
 * Larger offsets are still fairly efficient for Latin text, but were
 * reported to be up to 100x slower than native if the text is heavily
 * multibyte and a few hundred kb have to be scanned.
 *
 * @param string $str UTF-8 encoded input string.
 * @param int $start Character offset of the first character to keep;
 *   a negative value counts back from the end of the string.
 * @param int|string|null $count Number of characters to keep; a negative
 *   value instead drops that many characters from the end. The default
 *   sentinel 'end' (or null, as native mb_substr() accepts) keeps
 *   everything up to the end of the string.
 * @return string The selected substring.
 */
function mb_substr( $str, $start, $count = 'end' ) {
	if ( $start != 0 ) {
		// Convert the character offset to a byte offset and drop the prefix.
		$split = mb_substr_split_unicode( $str, intval( $start ) );
		// Before PHP 8.0 substr() returned false instead of '' for an
		// out-of-range offset; normalise that to an empty string.
		$str = (string)substr( $str, $split );
	}

	// null means "to the end" for the native function, so treat it like
	// the 'end' sentinel instead of letting intval() turn it into 0.
	if ( $count !== 'end' && $count !== null ) {
		// $str now begins at the requested start, so the count is simply
		// another character offset relative to the remaining string.
		$split = mb_substr_split_unicode( $str, intval( $count ) );
		$str = (string)substr( $str, 0, $split );
	}

	return $str;
}
| 61 | + |
/**
 * Translate a UTF-8 character position into a byte offset within $str.
 *
 * @param string $str UTF-8 encoded string.
 * @param int $splitPos Character position. Positive values count from the
 *   start of the string, negative values count back from its end.
 * @return int Byte offset of the split point, always within
 *   0 .. strlen( $str ).
 */
function mb_substr_split_unicode( $str, $splitPos ) {
	if ( $splitPos == 0 ) {
		return 0;
	}

	$byteLen = strlen( $str );

	// A string never has more characters than bytes, so a position at or
	// beyond the byte length can only mean "end of string". Returning here
	// also keeps the scan below from running past the end of the string.
	if ( $splitPos >= $byteLen ) {
		return $byteLen;
	}

	if ( $splitPos > 0 ) {
		if ( $splitPos > 256 ) {
			// Optimize large string offsets by skipping ahead N bytes.
			// This will cut out most of our slow time on Latin-based text,
			// and 1/2 to 1/3 on East European and Asian scripts.
			$bytePos = $splitPos;
			// Do not land in the middle of a character: step over tail bytes.
			while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
				++$bytePos;
			}
			// Count how many characters the skipped prefix actually held.
			$charPos = mb_strlen( substr( $str, 0, $bytePos ) );
		} else {
			$charPos = 0;
			$bytePos = 0;
		}

		// Walk forward one character at a time until the target is reached
		// or the string runs out.
		while ( $bytePos < $byteLen && $charPos++ < $splitPos ) {
			++$bytePos;
			// Move past any tail bytes
			while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
				++$bytePos;
			}
		}
	} else {
		$splitPosX = $splitPos + 1;
		// Relative to end of string; the actual character position does
		// not matter here, only how many characters have been stepped over.
		$charPos = 0;
		$bytePos = $byteLen;
		while ( $bytePos > 0 && $charPos-- >= $splitPosX ) {
			--$bytePos;
			// Move past any tail bytes
			while ( $bytePos > 0 && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
				--$bytePos;
			}
		}
	}

	return $bytePos;
}
50 | 103 | } |
51 | 104 | |