r45674 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r45673 | r45674 | r45675 >
Date:	14:11, 12 January 2009
Author:	catrope
Status:	ok
Tags:
Comment:	API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain UTF-8 characters incorrectly) by falling back to our own formatter if PHP's is broken. Also fix our own JSON formatter, which mangled the same characters, albeit in a different way. Slightly modified patch by Brad Jorsch.
Modified paths:	/trunk/phase3/RELEASE-NOTES (modified) (history) /trunk/phase3/includes/api/ApiFormatJson.php (modified) (history) /trunk/phase3/includes/api/ApiFormatJson_json.php (modified) (history)

Diff [purge]

Index: trunk/phase3/includes/api/ApiFormatJson.php
—	—	@@ -58,7 +58,10 @@
59	59	$suffix = ")";
60	60	}
61	61
62		~~- if (!function_exists('json_encode') \|\| $this->getIsHtml()) {~~
	62	+ // Some versions of PHP have a broken json_encode, see PHP bug
	63	+ // 46944. Test encoding an affected character (U+20000) to
	64	+ // avoid this.
	65	+ if (!function_exists('json_encode') \|\| $this->getIsHtml() \|\| strtolower(json_encode("\xf0\xa0\x80\x80")) != '\ud840\udc00') {
63	66	$json = new Services_JSON();
64	67	$this->printText($prefix . $json->encode($this->getResultData(), $this->getIsHtml()) . $suffix);
65	68	} else {
Index: trunk/phase3/includes/api/ApiFormatJson_json.php
—	—	@@ -168,6 +168,17 @@
169	169	return chr(0xC0 \| (($bytes >> 6) & 0x1F))
170	170	. chr(0x80 \| ($bytes & 0x3F));
171	171
	172	+ case (0xFC00 & $bytes) == 0xD800 && strlen($utf16) >= 4 && (0xFC & ord($utf16{2})) == 0xDC:
	173	+ // return a 4-byte UTF-8 character
	174	+ $char = ((($bytes & 0x03FF) << 10)
	175	+ \| ((ord($utf16{2}) & 0x03) << 8)
	176	+ \| ord($utf16{3}));
	177	+ $char += 0x10000;
	178	+ return chr(0xF0 \| (($char >> 18) & 0x07))
	179	+ . chr(0x80 \| (($char >> 12) & 0x3F))
	180	+ . chr(0x80 \| (($char >> 6) & 0x3F))
	181	+ . chr(0x80 \| ($char & 0x3F));
	182	+
172	183	case (0xFFFF & $bytes) == $bytes:
173	184	// return a 3-byte UTF-8 character
174	185	// see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
—	—	@@ -218,6 +229,20 @@
219	230	\| (0x0F & (ord($utf8{1}) >> 2)))
220	231	. chr((0xC0 & (ord($utf8{1}) << 6))
221	232	\| (0x7F & ord($utf8{2})));
	233	+
	234	+ case 4:
	235	+ // return a UTF-16 surrogate pair from a 4-byte UTF-8 char
	236	+ if(ord($utf8{0}) > 0xF4) return ''; # invalid
	237	+ $char = ((0x1C0000 & (ord($utf8{0}) << 18))
	238	+ \| (0x03F000 & (ord($utf8{1}) << 12))
	239	+ \| (0x000FC0 & (ord($utf8{2}) << 6))
	240	+ \| (0x00003F & ord($utf8{3})));
	241	+ if($char > 0x10FFFF) return ''; # invalid
	242	+ $char -= 0x10000;
	243	+ return chr(0xD8 \| (($char >> 18) & 0x03))
	244	+ . chr(($char >> 10) & 0xFF)
	245	+ . chr(0xDC \| (($char >> 8) & 0x03))
	246	+ . chr($char & 0xFF);
222	247	}
223	248
224	249	// ignoring UTF-32 for now, sorry
—	—	@@ -346,41 +371,20 @@
347	372	case (($ord_var_c & 0xF8) == 0xF0):
348	373	// characters U-00010000 - U-001FFFFF, mask 11110XXX
349	374	// see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
	375	+ // These will always return a surrogate pair
350	376	$char = pack('C*', $ord_var_c,
351	377	ord($var{$c + 1}),
352	378	ord($var{$c + 2}),
353	379	ord($var{$c + 3}));
354	380	$c += 3;
355	381	$utf16 = $this->utf82utf16($char);
356		~~- $ascii .= sprintf('\u%04s', bin2hex($utf16));~~
	382	+ if($utf16 == '') {
	383	+ $ascii .= '\ufffd';
	384	+ } else {
	385	+ $utf16 = str_split($utf16, 2);
	386	+ $ascii .= sprintf('\u%04s\u%04s', bin2hex($utf16[0]), bin2hex($utf16[1]));
	387	+ }
357	388	break;
358		-
359		~~- case (($ord_var_c & 0xFC) == 0xF8):~~
360		~~- // characters U-00200000 - U-03FFFFFF, mask 111110XX~~
361		~~- // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8~~
362		~~- $char = pack('C*', $ord_var_c,~~
363		~~- ord($var{$c + 1}),~~
364		~~- ord($var{$c + 2}),~~
365		~~- ord($var{$c + 3}),~~
366		~~- ord($var{$c + 4}));~~
367		~~- $c += 4;~~
368		~~- $utf16 = $this->utf82utf16($char);~~
369		~~- $ascii .= sprintf('\u%04s', bin2hex($utf16));~~
370		~~- break;~~
371		-
372		~~- case (($ord_var_c & 0xFE) == 0xFC):~~
373		~~- // characters U-04000000 - U-7FFFFFFF, mask 1111110X~~
374		~~- // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8~~
375		~~- $char = pack('C*', $ord_var_c,~~
376		~~- ord($var{$c + 1}),~~
377		~~- ord($var{$c + 2}),~~
378		~~- ord($var{$c + 3}),~~
379		~~- ord($var{$c + 4}),~~
380		~~- ord($var{$c + 5}));~~
381		~~- $c += 5;~~
382		~~- $utf16 = $this->utf82utf16($char);~~
383		~~- $ascii .= sprintf('\u%04s', bin2hex($utf16));~~
384		~~- break;~~
385	389	}
386	390	}
387	391
—	—	@@ -591,6 +595,16 @@
592	596	}
593	597	break;
594	598
	599	+ case preg_match('/\\\uD[89AB][0-9A-F]{2}\\\uD[C-F][0-9A-F]{2}/i', substr($chrs, $c, 12)):
	600	+ // escaped unicode surrogate pair
	601	+ $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
	602	+ . chr(hexdec(substr($chrs, ($c + 4), 2)))
	603	+ . chr(hexdec(substr($chrs, ($c + 8), 2)))
	604	+ . chr(hexdec(substr($chrs, ($c + 10), 2)));
	605	+ $utf8 .= $this->utf162utf8($utf16);
	606	+ $c += 11;
	607	+ break;
	608	+
595	609	case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $c, 6)):
596	610	// single, escaped unicode character
597	611	$utf16 = chr(hexdec(substr($chrs, ($c + 2), 2)))
Index: trunk/phase3/RELEASE-NOTES
—	—	@@ -39,6 +39,9 @@
40	40	* Fixing the caching issue by using -{T\|xxx}- syntax (only applies on wiki with LanguageConverter class)
41	41	* Improving the efficiency by using -{A\|xxx}- syntax (only applies on wiki with LanguageConverter class)
42	42
	43	+=== API changes in 1.15 ===
	44	+* (bug 16798) JSON encoding errors for some characters outside the BMP
	45	+
43	46	=== Languages updated in 1.15 ===
44	47
45	48	MediaWiki supports over 300 languages. Many localisations are updated

Follow-up revisions

Revision	Commit summary	Author	Date
r45682	Backport r45674:...	raymond	19:06, 12 January 2009

Status & tagging log

17:05, 4 January 2012 Johnduhart (talk | contribs) changed the tags for r45674 [removed: api]
19:46, 14 January 2009 Brion VIBBER (talk | contribs) changed the status of r45674 [removed: new added: ok]
18:57, 12 January 2009 Catrope (talk | contribs) changed the tags for r45674 [added: api]