r45682 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r45681 | r45682 | r45683 >
Date:	19:06, 12 January 2009
Author:	raymond
Status:	deferred
Tags:
Comment:	Backport r45674: API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain UTF8 characters incorrectly) by falling back to our own formatter if PHP's is broken. Also fix up our own JSON formatter which also messed up these characters, albeit in a different way. Slightly modified patch by Brad Jorsch. Committed per request of catrope
Modified paths:	/branches/REL1_14/phase3/RELEASE-NOTES (modified) (history) /branches/REL1_14/phase3/includes/api/ApiFormatJson.php (modified) (history) /branches/REL1_14/phase3/includes/api/ApiFormatJson_json.php (modified) (history)

Diff [purge]

Index: branches/REL1_14/phase3/includes/api/ApiFormatJson.php
—	—	@@ -58,7 +58,10 @@
59	59	$suffix = ")";
60	60	}
61	61
62		~~- if (!function_exists('json_encode') || $this->getIsHtml()) {~~
	62	+ // Some versions of PHP have a broken json_encode, see PHP bug
	63	+ // 46944. Test encoding an affected character (U+20000) to
	64	+ // avoid this.
	65	+ if (!function_exists('json_encode') || $this->getIsHtml() || strtolower(json_encode("\xf0\xa0\x80\x80")) != '\ud840\udc00') {
63	66	$json = new Services_JSON();
64	67	$this->printText($prefix . $json->encode($this->getResultData(), $this->getIsHtml()) . $suffix);
65	68	} else {
Index: branches/REL1_14/phase3/includes/api/ApiFormatJson_json.php
—	—	@@ -168,6 +168,17 @@
169	169	return chr(0xC0 | (($bytes >> 6) & 0x1F))
170	170	. chr(0x80 | ($bytes & 0x3F));
171	171
	172	+ case (0xFC00 & $bytes) == 0xD800 && strlen($utf16) >= 4 && (0xFC & ord($utf16{2})) == 0xDC:
	173	+ // return a 4-byte UTF-8 character
	174	+ $char = ((($bytes & 0x03FF) << 10)
	175	+ | ((ord($utf16{2}) & 0x03) << 8)
	176	+ | ord($utf16{3}));
	177	+ $char += 0x10000;
	178	+ return chr(0xF0 | (($char >> 18) & 0x07))
	179	+ . chr(0x80 | (($char >> 12) & 0x3F))
	180	+ . chr(0x80 | (($char >> 6) & 0x3F))
	181	+ . chr(0x80 | ($char & 0x3F));
	182	+
172	183	case (0xFFFF & $bytes) == $bytes:
173	184	// return a 3-byte UTF-8 character
174	185	// see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
—	—	@@ -218,6 +229,20 @@
219	230	| (0x0F & (ord($utf8{1}) >> 2)))
220	231	. chr((0xC0 & (ord($utf8{1}) << 6))
221	232	| (0x7F & ord($utf8{2})));
	233	+
	234	+ case 4:
	235	+ // return a UTF-16 surrogate pair from a 4-byte UTF-8 char
	236	+ if(ord($utf8{0}) > 0xF4) return ''; # invalid
	237	+ $char = ((0x1C0000 & (ord($utf8{0}) << 18))
	238	+ | (0x03F000 & (ord($utf8{1}) << 12))
	239	+ | (0x000FC0 & (ord($utf8{2}) << 6))
	240	+ | (0x00003F & ord($utf8{3})));
	241	+ if($char > 0x10FFFF) return ''; # invalid
	242	+ $char -= 0x10000;
	243	+ return chr(0xD8 | (($char >> 18) & 0x03))
	244	+ . chr(($char >> 10) & 0xFF)
	245	+ . chr(0xDC | (($char >> 8) & 0x03))
	246	+ . chr($char & 0xFF);
222	247	}
223	248
224	249	// ignoring UTF-32 for now, sorry
—	—	@@ -346,41 +371,20 @@
347	372	case (($ord_var_c & 0xF8) == 0xF0):
348	373	// characters U-00010000 - U-001FFFFF, mask 11110XXX
349	374	// see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
	375	+ // These will always return a surrogate pair
350	376	$char = pack('C*', $ord_var_c,
351	377	ord($var{$c + 1}),
352	378	ord($var{$c + 2}),
353	379	ord($var{$c + 3}));
354	380	$c += 3;
355	381	$utf16 = $this->utf82utf16($char);
356		~~- $ascii .= sprintf('\u%04s', bin2hex($utf16));~~
	382	+ if($utf16 == '') {
	383	+ $ascii .= '\ufffd';
	384	+ } else {
	385	+ $utf16 = str_split($utf16, 2);
	386	+ $ascii .= sprintf('\u%04s\u%04s', bin2hex($utf16[0]), bin2hex($utf16[1]));
	387	+ }
357	388	break;
358		-
359		~~- case (($ord_var_c & 0xFC) == 0xF8):~~
360		~~- // characters U-00200000 - U-03FFFFFF, mask 111110XX~~
361		~~- // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8~~
362		~~- $char = pack('C*', $ord_var_c,~~
363		~~- ord($var{$c + 1}),~~
364		~~- ord($var{$c + 2}),~~
365		~~- ord($var{$c + 3}),~~
366		~~- ord($var{$c + 4}));~~
367		~~- $c += 4;~~
368		~~- $utf16 = $this->utf82utf16($char);~~
369		~~- $ascii .= sprintf('\u%04s', bin2hex($utf16));~~
370		~~- break;~~
371		-
372		~~- case (($ord_var_c & 0xFE) == 0xFC):~~
373		~~- // characters U-04000000 - U-7FFFFFFF, mask 1111110X~~
374		~~- // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8~~
375		~~- $char = pack('C*', $ord_var_c,~~
376		~~- ord($var{$c + 1}),~~
377		~~- ord($var{$c + 2}),~~
378		~~- ord($var{$c + 3}),~~
379		~~- ord($var{$c + 4}),~~
380		~~- ord($var{$c + 5}));~~
381		~~- $c += 5;~~
382		~~- $utf16 = $this->utf82utf16($char);~~
383		~~- $ascii .= sprintf('\u%04s', bin2hex($utf16));~~
384		~~- break;~~
385	389	}
386	390	}
387	391
—	—	@@ -591,6 +595,16 @@
592	596	}
593	597	break;
594	598
	599	+ case preg_match('/\\\uD[89AB][0-9A-F]{2}\\\uD[C-F][0-9A-F]{2}/i', substr($chrs, $c, 12)):
	600	+ // escaped unicode surrogate pair
	601	+ $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
	602	+ . chr(hexdec(substr($chrs, ($c + 4), 2)))
	603	+ . chr(hexdec(substr($chrs, ($c + 8), 2)))
	604	+ . chr(hexdec(substr($chrs, ($c + 10), 2)));
	605	+ $utf8 .= $this->utf162utf8($utf16);
	606	+ $c += 11;
	607	+ break;
	608	+
595	609	case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $c, 6)):
596	610	// single, escaped unicode character
597	611	$utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
Index: branches/REL1_14/phase3/RELEASE-NOTES
—	—	@@ -18,6 +18,9 @@
19	19	Those wishing to use the latest code instead of a branch release can obtain
20	20	it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
21	21
	22	+=== API changes in 1.14rc1 ===
	23	+* (bug 16798) JSON encoding errors for some characters outside the BMP
	24	+
22	25	=== Configuration changes in 1.14 ===
23	26
24	27	* $wgExemptFromUserRobotsControl is an array of namespaces to be exempt from

Follow-up revisions

Revision	Commit summary	Author	Date
r45753	Remove RELEASE-NOTES entries for backported changes (backported in r45682, r4...	catrope	21:39, 14 January 2009

Past revisions this follows-up on

Revision	Commit summary	Author	Date
r45674	API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain ...	catrope	14:11, 12 January 2009

Status & tagging log

20:09, 14 January 2009 Brion VIBBER (talk | contribs) changed the status of r45682 [removed: new added: deferred]