r45682 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r45681‎ | r45682 | r45683 >
Date:19:06, 12 January 2009
Author:raymond
Status:deferred
Tags:
Comment:
Backport r45674:
API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain UTF8 characters incorrectly) by falling back to our own formatter if PHP's is broken. Also fix up our own JSON formatter which also messed up these characters, albeit in a different way. Slightly modified patch by Brad Jorsch.
Committed per request of catrope
Modified paths:
  • /branches/REL1_14/phase3/RELEASE-NOTES (modified) (history)
  • /branches/REL1_14/phase3/includes/api/ApiFormatJson.php (modified) (history)
  • /branches/REL1_14/phase3/includes/api/ApiFormatJson_json.php (modified) (history)

Diff [purge]

Index: branches/REL1_14/phase3/includes/api/ApiFormatJson.php
@@ -58,7 +58,10 @@
5959 $suffix = ")";
6060 }
6161
62 - if (!function_exists('json_encode') || $this->getIsHtml()) {
 62+ // Some versions of PHP have a broken json_encode, see PHP bug
 63+ // 46944. Test encoding an affected character (U+20000) to
 64+ // avoid this.
 65+ if (!function_exists('json_encode') || $this->getIsHtml() || strtolower(json_encode("\xf0\xa0\x80\x80")) != '\ud840\udc00') {
6366 $json = new Services_JSON();
6467 $this->printText($prefix . $json->encode($this->getResultData(), $this->getIsHtml()) . $suffix);
6568 } else {
Index: branches/REL1_14/phase3/includes/api/ApiFormatJson_json.php
@@ -168,6 +168,17 @@
169169 return chr(0xC0 | (($bytes >> 6) & 0x1F))
170170 . chr(0x80 | ($bytes & 0x3F));
171171
 172+ case (0xFC00 & $bytes) == 0xD800 && strlen($utf16) >= 4 && (0xFC & ord($utf16{2})) == 0xDC:
 173+ // return a 4-byte UTF-8 character
 174+ $char = ((($bytes & 0x03FF) << 10)
 175+ | ((ord($utf16{2}) & 0x03) << 8)
 176+ | ord($utf16{3}));
 177+ $char += 0x10000;
 178+ return chr(0xF0 | (($char >> 18) & 0x07))
 179+ . chr(0x80 | (($char >> 12) & 0x3F))
 180+ . chr(0x80 | (($char >> 6) & 0x3F))
 181+ . chr(0x80 | ($char & 0x3F));
 182+
172183 case (0xFFFF & $bytes) == $bytes:
173184 // return a 3-byte UTF-8 character
174185 // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
@@ -218,6 +229,20 @@
219230 | (0x0F & (ord($utf8{1}) >> 2)))
220231 . chr((0xC0 & (ord($utf8{1}) << 6))
221232 | (0x7F & ord($utf8{2})));
 233+
 234+ case 4:
 235+ // return a UTF-16 surrogate pair from a 4-byte UTF-8 char
 236+ if(ord($utf8{0}) > 0xF4) return ''; # invalid
 237+ $char = ((0x1C0000 & (ord($utf8{0}) << 18))
 238+ | (0x03F000 & (ord($utf8{1}) << 12))
 239+ | (0x000FC0 & (ord($utf8{2}) << 6))
 240+ | (0x00003F & ord($utf8{3})));
 241+ if($char > 0x10FFFF) return ''; # invalid
 242+ $char -= 0x10000;
 243+ return chr(0xD8 | (($char >> 18) & 0x03))
 244+ . chr(($char >> 10) & 0xFF)
 245+ . chr(0xDC | (($char >> 8) & 0x03))
 246+ . chr($char & 0xFF);
222247 }
223248
224249 // ignoring UTF-32 for now, sorry
@@ -346,41 +371,20 @@
347372 case (($ord_var_c & 0xF8) == 0xF0):
348373 // characters U-00010000 - U-001FFFFF, mask 11110XXX
349374 // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 375+ // These will always return a surrogate pair
350376 $char = pack('C*', $ord_var_c,
351377 ord($var{$c + 1}),
352378 ord($var{$c + 2}),
353379 ord($var{$c + 3}));
354380 $c += 3;
355381 $utf16 = $this->utf82utf16($char);
356 - $ascii .= sprintf('\u%04s', bin2hex($utf16));
 382+ if($utf16 == '') {
 383+ $ascii .= '\ufffd';
 384+ } else {
 385+ $utf16 = str_split($utf16, 2);
 386+ $ascii .= sprintf('\u%04s\u%04s', bin2hex($utf16[0]), bin2hex($utf16[1]));
 387+ }
357388 break;
358 -
359 - case (($ord_var_c & 0xFC) == 0xF8):
360 - // characters U-00200000 - U-03FFFFFF, mask 111110XX
361 - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
362 - $char = pack('C*', $ord_var_c,
363 - ord($var{$c + 1}),
364 - ord($var{$c + 2}),
365 - ord($var{$c + 3}),
366 - ord($var{$c + 4}));
367 - $c += 4;
368 - $utf16 = $this->utf82utf16($char);
369 - $ascii .= sprintf('\u%04s', bin2hex($utf16));
370 - break;
371 -
372 - case (($ord_var_c & 0xFE) == 0xFC):
373 - // characters U-04000000 - U-7FFFFFFF, mask 1111110X
374 - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
375 - $char = pack('C*', $ord_var_c,
376 - ord($var{$c + 1}),
377 - ord($var{$c + 2}),
378 - ord($var{$c + 3}),
379 - ord($var{$c + 4}),
380 - ord($var{$c + 5}));
381 - $c += 5;
382 - $utf16 = $this->utf82utf16($char);
383 - $ascii .= sprintf('\u%04s', bin2hex($utf16));
384 - break;
385389 }
386390 }
387391
@@ -591,6 +595,16 @@
592596 }
593597 break;
594598
 599+ case preg_match('/\\\uD[89AB][0-9A-F]{2}\\\uD[C-F][0-9A-F]{2}/i', substr($chrs, $c, 12)):
 600+ // escaped unicode surrogate pair
 601+ $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
 602+ . chr(hexdec(substr($chrs, ($c + 4), 2)))
 603+ . chr(hexdec(substr($chrs, ($c + 8), 2)))
 604+ . chr(hexdec(substr($chrs, ($c + 10), 2)));
 605+ $utf8 .= $this->utf162utf8($utf16);
 606+ $c += 11;
 607+ break;
 608+
595609 case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $c, 6)):
596610 // single, escaped unicode character
597611 $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
Index: branches/REL1_14/phase3/RELEASE-NOTES
@@ -18,6 +18,9 @@
1919 Those wishing to use the latest code instead of a branch release can obtain
2020 it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
2121
 22+=== API changes in 1.14rc1 ===
 23+* (bug 16798) JSON encoding errors for some characters outside the BMP
 24+
2225 === Configuration changes in 1.14 ===
2326
2427 * $wgExemptFromUserRobotsControl is an array of namespaces to be exempt from

Follow-up revisions

Revision | Commit summary | Author | Date
r45753 | Remove RELEASE-NOTES entries for backported changes (backported in r45682, r4... | catrope | 21:39, 14 January 2009

Past revisions this follows-up on

Revision | Commit summary | Author | Date
r45674 | API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain ... | catrope | 14:11, 12 January 2009

Status & tagging log