r45674 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:< r45673 | r45674 | r45675 >
Date:14:11, 12 January 2009
Author:catrope
Status:ok
Tags:
Comment:
API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain UTF-8 characters incorrectly) by falling back to our own formatter if PHP's is broken. Also fix up our own JSON formatter, which messed up these characters too, albeit in a different way. Slightly modified patch by Brad Jorsch.
Modified paths:
  • /trunk/phase3/RELEASE-NOTES (modified) (history)
  • /trunk/phase3/includes/api/ApiFormatJson.php (modified) (history)
  • /trunk/phase3/includes/api/ApiFormatJson_json.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/api/ApiFormatJson.php
@@ -58,7 +58,10 @@
5959 $suffix = ")";
6060 }
6161
62 - if (!function_exists('json_encode') || $this->getIsHtml()) {
 62+ // Some versions of PHP have a broken json_encode, see PHP bug
 63+ // 46944. Test encoding an affected character (U+20000) to
 64+ // avoid this.
 65+ if (!function_exists('json_encode') || $this->getIsHtml() || strtolower(json_encode("\xf0\xa0\x80\x80")) != '\ud840\udc00') {
6366 $json = new Services_JSON();
6467 $this->printText($prefix . $json->encode($this->getResultData(), $this->getIsHtml()) . $suffix);
6568 } else {
Index: trunk/phase3/includes/api/ApiFormatJson_json.php
@@ -168,6 +168,17 @@
169169 return chr(0xC0 | (($bytes >> 6) & 0x1F))
170170 . chr(0x80 | ($bytes & 0x3F));
171171
 172+ case (0xFC00 & $bytes) == 0xD800 && strlen($utf16) >= 4 && (0xFC & ord($utf16{2})) == 0xDC:
 173+ // return a 4-byte UTF-8 character
 174+ $char = ((($bytes & 0x03FF) << 10)
 175+ | ((ord($utf16{2}) & 0x03) << 8)
 176+ | ord($utf16{3}));
 177+ $char += 0x10000;
 178+ return chr(0xF0 | (($char >> 18) & 0x07))
 179+ . chr(0x80 | (($char >> 12) & 0x3F))
 180+ . chr(0x80 | (($char >> 6) & 0x3F))
 181+ . chr(0x80 | ($char & 0x3F));
 182+
172183 case (0xFFFF & $bytes) == $bytes:
173184 // return a 3-byte UTF-8 character
174185 // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
@@ -218,6 +229,20 @@
219230 | (0x0F & (ord($utf8{1}) >> 2)))
220231 . chr((0xC0 & (ord($utf8{1}) << 6))
221232 | (0x7F & ord($utf8{2})));
 233+
 234+ case 4:
 235+ // return a UTF-16 surrogate pair from a 4-byte UTF-8 char
 236+ if(ord($utf8{0}) > 0xF4) return ''; # invalid
 237+ $char = ((0x1C0000 & (ord($utf8{0}) << 18))
 238+ | (0x03F000 & (ord($utf8{1}) << 12))
 239+ | (0x000FC0 & (ord($utf8{2}) << 6))
 240+ | (0x00003F & ord($utf8{3})));
 241+ if($char > 0x10FFFF) return ''; # invalid
 242+ $char -= 0x10000;
 243+ return chr(0xD8 | (($char >> 18) & 0x03))
 244+ . chr(($char >> 10) & 0xFF)
 245+ . chr(0xDC | (($char >> 8) & 0x03))
 246+ . chr($char & 0xFF);
222247 }
223248
224249 // ignoring UTF-32 for now, sorry
@@ -346,41 +371,20 @@
347372 case (($ord_var_c & 0xF8) == 0xF0):
348373 // characters U-00010000 - U-001FFFFF, mask 11110XXX
349374 // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 375+ // These will always return a surrogate pair
350376 $char = pack('C*', $ord_var_c,
351377 ord($var{$c + 1}),
352378 ord($var{$c + 2}),
353379 ord($var{$c + 3}));
354380 $c += 3;
355381 $utf16 = $this->utf82utf16($char);
356 - $ascii .= sprintf('\u%04s', bin2hex($utf16));
 382+ if($utf16 == '') {
 383+ $ascii .= '\ufffd';
 384+ } else {
 385+ $utf16 = str_split($utf16, 2);
 386+ $ascii .= sprintf('\u%04s\u%04s', bin2hex($utf16[0]), bin2hex($utf16[1]));
 387+ }
357388 break;
358 -
359 - case (($ord_var_c & 0xFC) == 0xF8):
360 - // characters U-00200000 - U-03FFFFFF, mask 111110XX
361 - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
362 - $char = pack('C*', $ord_var_c,
363 - ord($var{$c + 1}),
364 - ord($var{$c + 2}),
365 - ord($var{$c + 3}),
366 - ord($var{$c + 4}));
367 - $c += 4;
368 - $utf16 = $this->utf82utf16($char);
369 - $ascii .= sprintf('\u%04s', bin2hex($utf16));
370 - break;
371 -
372 - case (($ord_var_c & 0xFE) == 0xFC):
373 - // characters U-04000000 - U-7FFFFFFF, mask 1111110X
374 - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
375 - $char = pack('C*', $ord_var_c,
376 - ord($var{$c + 1}),
377 - ord($var{$c + 2}),
378 - ord($var{$c + 3}),
379 - ord($var{$c + 4}),
380 - ord($var{$c + 5}));
381 - $c += 5;
382 - $utf16 = $this->utf82utf16($char);
383 - $ascii .= sprintf('\u%04s', bin2hex($utf16));
384 - break;
385389 }
386390 }
387391
@@ -591,6 +595,16 @@
592596 }
593597 break;
594598
 599+ case preg_match('/\\\uD[89AB][0-9A-F]{2}\\\uD[C-F][0-9A-F]{2}/i', substr($chrs, $c, 12)):
 600+ // escaped unicode surrogate pair
 601+ $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
 602+ . chr(hexdec(substr($chrs, ($c + 4), 2)))
 603+ . chr(hexdec(substr($chrs, ($c + 8), 2)))
 604+ . chr(hexdec(substr($chrs, ($c + 10), 2)));
 605+ $utf8 .= $this->utf162utf8($utf16);
 606+ $c += 11;
 607+ break;
 608+
595609 case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $c, 6)):
596610 // single, escaped unicode character
597611 $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
Index: trunk/phase3/RELEASE-NOTES
@@ -39,6 +39,9 @@
4040 * Fixing the caching issue by using -{T|xxx}- syntax (only applies on wiki with LanguageConverter class)
4141 * Improving the efficiency by using -{A|xxx}- syntax (only applies on wiki with LanguageConverter class)
4242
 43+== API changes in 1.15 ==
 44+* (bug 16798) JSON encoding errors for some characters outside the BMP
 45+
4346 === Languages updated in 1.15 ===
4447
4548 MediaWiki supports over 300 languages. Many localisations are updated

Follow-up revisions

Revision | Commit summary | Author | Date
r45682 | Backport r45674:... | raymond | 19:06, 12 January 2009

Status & tagging log