Index: trunk/phase3/includes/api/ApiFormatJson.php |
— | — | @@ -58,7 +58,10 @@ |
59 | 59 | $suffix = ")"; |
60 | 60 | } |
61 | 61 | |
62 | | - if (!function_exists('json_encode') || $this->getIsHtml()) { |
| 62 | + // Some versions of PHP have a broken json_encode, see PHP bug |
| 63 | + // 46944. Test encoding an affected character (U+20000); note that |
| 64 | + // json_encode() output includes the surrounding double quotes. |
| 65 | + if (!function_exists('json_encode') || $this->getIsHtml() || strtolower(json_encode("\xf0\xa0\x80\x80")) != '"\ud840\udc00"') { |
63 | 66 | $json = new Services_JSON(); |
64 | 67 | $this->printText($prefix . $json->encode($this->getResultData(), $this->getIsHtml()) . $suffix); |
65 | 68 | } else { |
Index: trunk/phase3/includes/api/ApiFormatJson_json.php |
— | — | @@ -168,6 +168,17 @@ |
169 | 169 | return chr(0xC0 | (($bytes >> 6) & 0x1F)) |
170 | 170 | . chr(0x80 | ($bytes & 0x3F)); |
171 | 171 | |
| 172 | + case (0xFC00 & $bytes) == 0xD800 && strlen($utf16) >= 4 && (0xFC & ord($utf16{2})) == 0xDC: |
| 173 | + // return a 4-byte UTF-8 character |
| 174 | + $char = ((($bytes & 0x03FF) << 10) |
| 175 | + | ((ord($utf16{2}) & 0x03) << 8) |
| 176 | + | ord($utf16{3})); |
| 177 | + $char += 0x10000; |
| 178 | + return chr(0xF0 | (($char >> 18) & 0x07)) |
| 179 | + . chr(0x80 | (($char >> 12) & 0x3F)) |
| 180 | + . chr(0x80 | (($char >> 6) & 0x3F)) |
| 181 | + . chr(0x80 | ($char & 0x3F)); |
| 182 | + |
172 | 183 | case (0xFFFF & $bytes) == $bytes: |
173 | 184 | // return a 3-byte UTF-8 character |
174 | 185 | // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 |
— | — | @@ -218,6 +229,20 @@ |
219 | 230 | | (0x0F & (ord($utf8{1}) >> 2))) |
220 | 231 | . chr((0xC0 & (ord($utf8{1}) << 6)) |
221 | 232 | | (0x7F & ord($utf8{2}))); |
| 233 | + |
| 234 | + case 4: |
| 235 | + // return a UTF-16 surrogate pair from a 4-byte UTF-8 char; '' signals an invalid sequence |
| 236 | + if(ord($utf8{0}) > 0xF4) return ''; # invalid: lead byte would encode a value above U+10FFFF |
| 237 | + $char = ((0x1C0000 & (ord($utf8{0}) << 18)) |
| 238 | + | (0x03F000 & (ord($utf8{1}) << 12)) |
| 239 | + | (0x000FC0 & (ord($utf8{2}) << 6)) |
| 240 | + | (0x00003F & ord($utf8{3}))); |
| 241 | + if($char < 0x10000 || $char > 0x10FFFF) return ''; # invalid: overlong form or out of Unicode range |
| 242 | + $char -= 0x10000; # now a 20-bit value: top 10 bits -> high surrogate, low 10 bits -> low surrogate |
| 243 | + return chr(0xD8 | (($char >> 18) & 0x03)) |
| 244 | + . chr(($char >> 10) & 0xFF) |
| 245 | + . chr(0xDC | (($char >> 8) & 0x03)) |
| 246 | + . chr($char & 0xFF); |
222 | 247 | } |
223 | 248 | |
224 | 249 | // ignoring UTF-32 for now, sorry |
— | — | @@ -346,41 +371,20 @@ |
347 | 372 | case (($ord_var_c & 0xF8) == 0xF0): |
348 | 373 | // characters U-00010000 - U-001FFFFF, mask 11110XXX |
349 | 374 | // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 |
| 375 | + // These will always return a surrogate pair |
350 | 376 | $char = pack('C*', $ord_var_c, |
351 | 377 | ord($var{$c + 1}), |
352 | 378 | ord($var{$c + 2}), |
353 | 379 | ord($var{$c + 3})); |
354 | 380 | $c += 3; |
355 | 381 | $utf16 = $this->utf82utf16($char); |
356 | | - $ascii .= sprintf('\u%04s', bin2hex($utf16)); |
| 382 | + if($utf16 == '') { |
| 383 | + $ascii .= '\ufffd'; |
| 384 | + } else { |
| 385 | + $utf16 = str_split($utf16, 2); |
| 386 | + $ascii .= sprintf('\u%04s\u%04s', bin2hex($utf16[0]), bin2hex($utf16[1])); |
| 387 | + } |
357 | 388 | break; |
358 | | - |
359 | | - case (($ord_var_c & 0xFC) == 0xF8): |
360 | | - // characters U-00200000 - U-03FFFFFF, mask 111110XX |
361 | | - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 |
362 | | - $char = pack('C*', $ord_var_c, |
363 | | - ord($var{$c + 1}), |
364 | | - ord($var{$c + 2}), |
365 | | - ord($var{$c + 3}), |
366 | | - ord($var{$c + 4})); |
367 | | - $c += 4; |
368 | | - $utf16 = $this->utf82utf16($char); |
369 | | - $ascii .= sprintf('\u%04s', bin2hex($utf16)); |
370 | | - break; |
371 | | - |
372 | | - case (($ord_var_c & 0xFE) == 0xFC): |
373 | | - // characters U-04000000 - U-7FFFFFFF, mask 1111110X |
374 | | - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 |
375 | | - $char = pack('C*', $ord_var_c, |
376 | | - ord($var{$c + 1}), |
377 | | - ord($var{$c + 2}), |
378 | | - ord($var{$c + 3}), |
379 | | - ord($var{$c + 4}), |
380 | | - ord($var{$c + 5})); |
381 | | - $c += 5; |
382 | | - $utf16 = $this->utf82utf16($char); |
383 | | - $ascii .= sprintf('\u%04s', bin2hex($utf16)); |
384 | | - break; |
385 | 389 | } |
386 | 390 | } |
387 | 391 | |
— | — | @@ -591,6 +595,16 @@ |
592 | 596 | } |
593 | 597 | break; |
594 | 598 | |
| 599 | + case preg_match('/\\\uD[89AB][0-9A-F]{2}\\\uD[C-F][0-9A-F]{2}/i', substr($chrs, $c, 12)): |
| 600 | + // escaped unicode surrogate pair |
| 601 | + $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2))) |
| 602 | + . chr(hexdec(substr($chrs, ($c + 4), 2))) |
| 603 | + . chr(hexdec(substr($chrs, ($c + 8), 2))) |
| 604 | + . chr(hexdec(substr($chrs, ($c + 10), 2))); |
| 605 | + $utf8 .= $this->utf162utf8($utf16); |
| 606 | + $c += 11; |
| 607 | + break; |
| 608 | + |
595 | 609 | case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $c, 6)): |
596 | 610 | // single, escaped unicode character |
597 | 611 | $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2))) |
Index: trunk/phase3/RELEASE-NOTES |
— | — | @@ -39,6 +39,9 @@ |
40 | 40 | * Fixing the caching issue by using -{T|xxx}- syntax (only applies on wiki with LanguageConverter class) |
41 | 41 | * Improving the efficiency by using -{A|xxx}- syntax (only applies on wiki with LanguageConverter class) |
42 | 42 | |
| 43 | +=== API changes in 1.15 === |
| 44 | +* (bug 16798) Fixed JSON encoding errors for some characters outside the Basic Multilingual Plane (BMP) |
| 45 | + |
43 | 46 | === Languages updated in 1.15 === |
44 | 47 | |
45 | 48 | MediaWiki supports over 300 languages. Many localisations are updated |