r54691 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r54690 | r54691 | r54692 >
Date:	23:42, 9 August 2009
Author:	conrad
Status:	deferred
Tags:
Comment:	General improvements, particularly to ^ and $ handling, bug fixes
Modified paths:	/trunk/extensions/Transliterator/Transliterator.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Transliterator/Transliterator.php
—	—	@@ -18,6 +18,8 @@
19	19	* better i18n support, adjustable limits, minor formal adjustment.
20	20	* @version 1.1.0
21	21	* addition of answer parameter
	22	+ * @version 1.2.0
	23	+ * semi-case-sensitive by default, fix bugs with edge-detection and html-entities
22	24	*/
23	25
24	26	/**
—	—	@@ -38,6 +40,7 @@
39	41	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
40	42	*/
41	43
	44	+error_reporting(E_ALL \| E_WARNING \| E_STRICT);
42	45	if ( !defined( 'MEDIAWIKI' ) )
43	46	{
44	47	die( 'This file is a MediaWiki extension, not a valid entry point.' );
—	—	@@ -49,7 +52,7 @@
50	53
51	54	$wgExtensionCredits['parserhook'][] = array(
52	55	'name' => 'Transliterator',
53		~~- 'version' => '1.1.0',~~
	56	+ 'version' => '1.2.0',
54	57	'descriptionmsg' => 'transliterator-desc',
55	58	'author' => 'Conrad Irwin',
56	59	'url' => 'http://www.mediawiki.org/wiki/Extension:Transliterator',
—	—	@@ -66,6 +69,7 @@
67	70
68	71	class ExtTransliterator {
69	72
	73	+ const DELIMITER = "\x1F"; // A character that will be inserted in places where the ^ and $ should match
70	74	var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template.
71	75	var $mMaps = array();// An Array of "$mapname" => The map parsed from that page.
72	76
—	—	@@ -102,6 +106,23 @@
103	107	}
104	108
105	109	/**
	110	+ * Given a codepoints or letters array returns a list that contains 1 for every
	111	+ * alphabetic character and accent, and 0 otherwise. This allows for edge-of-word
	112	+ * detection.
	113	+ */
	114	+ function alphamap( $letters ) {
	115	+
	116	+ $output = Array();
	117	+ $count = count($letters);
	118	+
	119	+ for ($i = 0; $i < $count; $i++) {
	120	+ $output[] = preg_match( '/\pL/u', $letters[$i]) \|\| isset( $utfCombiningClass[$letters[$i]] );
	121	+ }
	122	+
	123	+ return $output;
	124	+ }
	125	+
	126	+ /**
106	127	* Get all the existing maps in one query, useful given that the default
107	128	* behaviour of failing silently is designed to allow it to be used by
108	129	* templates that don't know if a map exists, so may try far too often.
—	—	@@ -156,9 +177,8 @@
157	178	*
158	179	* Input syntax is a set of lines.
159	180	* All " " are ignored.
160		~~- * Lines starting with # are ignored.~~
	181	+ * Lines starting with # are ignored, remaining lines are split by =>
161	182	* HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
162		~~- * Remaining lines are split by "=>".~~
163	183	*
164	184	* The map created is a set of "from" strings to "to" strings
165	185	* With extra "from" => true for all substrings of "from" strings
—	—	@@ -174,66 +194,86 @@
175	195	$map = array();
176	196	$decompose = false;
177	197
178		~~- // Split lines and remove comments and space~~
179		~~- $lines = split( "\n", html_entity_decode( preg_replace( '/^\s(#.)?(\n\|$)\| */m', '', $input ), ENT_NOQUOTES, "UTF-8" ) );~~
	198	+ // Split lines and remove whitespace at beginning and end
	199	+ $lines = preg_split( "/(^\|\s\n)(\s(#[^\n])?\n)\s*/", $input."\n" );
	200	+ if ( $lines[0] == "" )
	201	+ array_shift( $lines );
180	202
181		~~- // If the last line was a comment then there will be an empty line at the end~~
182		~~- if ( $lines[count( $lines ) - 1] == "" ) {~~
	203	+ if ( $lines[count( $lines ) - 1] == "" )
183	204	array_pop( $lines );
184		~~- }~~
185	205
	206	+
	207	+ // The first line can contain flags
186	208	$first_line = $lines[0];
187	209	if ( strpos( $first_line, "=>") === FALSE ) {
188		~~- # Empty page~~
	210	+ // Or, could just signify that the message was blank
189	211	if ( $first_line == "<$mappage>")
190	212	return false;
	213	+ else if ( preg_replace( '/<(decompose\|sensitive)>/', '', $first_line ) != '')
	214	+ return wfMsg( 'transliterator-error-syntax', $first_line, $mappage );
191	215
192		~~- if ( strpos( $first_line, "<decompose>" ) ) {~~
	216	+ if ( strpos( $first_line, "<decompose>" ) !== FALSE ) {
193	217	$map['__decompose__'] = true;
194	218	$decompose = true;
195	219	}
196		~~- if ( strpos( $first_line, "<sensitive>" ) ) {~~
	220	+ if ( strpos( $first_line, "<sensitive>" ) !== FALSE ) {
197	221	$map['__sensitive__'] = true;
198	222	}
199	223	array_shift( $lines );
200	224	}
201	225
202	226	if ( count( $lines ) > $wgTransliteratorRuleCount )
203		~~- return wfMsgExt('transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );~~
	227	+ return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );
204	228
205	229	foreach ( $lines as $line ) {
206	230
207		~~- $pair = split( "=>", $line );~~
	231	+ $pair = preg_split( '/\s=>\s/', $line );
208	232
209		~~- if ( count($pair) != 2 )~~
210		~~- return wfMsg("transliterator-error-syntax", $line, $mappage);~~
	233	+ if ( count( $pair ) != 2 )
	234	+ return wfMsg( "transliterator-error-syntax", $line, $mappage );
211	235
212		~~- if ($decompose) // Undo the NFCing of MediaWiki~~
213		~~- $from = UtfNormal::toNFD( $pair[0] );~~
214		~~- else // substrings by NFC code-point are a superset of substrings by letters~~
215		~~- $from = $pair[0];~~
	236	+ $from = $pair[0];
	237	+ $to = html_entity_decode( $pair[1], ENT_QUOTES, 'UTF-8' );
216	238
217		~~- $to = $pair[1];~~
	239	+ // Convert the ^ and $ selectors into the DELIMITER so that it can be used with a negligible chance of conflict
	240	+ // Leave single ^ and $'s alone in case someone wants to use them
	241	+ // Still permits the creation of the rule "^$=>" that will never match, but hey
	242	+ $fromlast = strlen( $from ) - 1;
	243	+ if ( $fromlast > 0 ) {
	244	+ if ( $from[0] == "^" && $fromlast > 0)
	245	+ $from[0] = ExtTransliterator::DELIMITER;
218	246
	247	+ if ( $from[$fromlast] == "$")
	248	+ $from[$fromlast] = ExtTransliterator::DELIMITER;
	249	+ }
	250	+
	251	+ // Now we've looked at our syntax we can remove html escaping to reveal the true form
	252	+ $from = html_entity_decode( $from, ENT_QUOTES, 'UTF-8' );
	253	+ if ( $decompose ) // Undo the NFCing of MediaWiki
	254	+ $from = UtfNormal::toNFD( $from );
	255	+
	256	+ // If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule
219	257	if ( isset( $map[$from] ) ) {
220	258
221		~~- if ( is_string( $map[$from] ) )~~
	259	+ // Or a rule of the same length, i.e. the same rule.
	260	+ if ( is_string( $map[$from] ) && $to != $map[$from] )
222	261	return wfMsg("transliterator-error-ambiguous", $line, $mappage);
223	262
224	263	} else if ( strlen( $from ) > 1 ){
225		~~- // Fill in the blanks, so that we know when to stop looking while transliterating~~
226		~~- $to_fill = strlen( $from );~~
227	264
228		~~- if ( $to_fill > $wgTransliteratorRuleSize )~~
	265	+ // Bail if the left hand side is too long (has performance implications otherwise)
	266	+ $fromlen = strlen( $from );
	267	+ if ( $fromlen > $wgTransliteratorRuleSize )
229	268	return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize );
230		-
231		~~- for ( $i = 1; $i < $to_fill; $i++ ) {~~
	269	+
	270	+ // Fill in the blanks, so that we know when to stop looking while transliterating
	271	+ for ( $i = 1; $i < $fromlen; $i++ ) {
232	272	$substr = substr( $from, 0, $i );
233	273
234	274	if (! isset( $map[$substr] ) )
235	275	$map[$substr] = true;
236	276	}
237		~~- }~~
	277	+ } // else we have the default rule
238	278
239	279	$map[$from] = $to;
240	280	}
—	—	@@ -248,16 +288,18 @@
249	289	*/
250	290	function transliterate( $word, $map )
251	291	{
252		~~- $word = "^" . str_replace( " ", "$ ^", $word ) . "$";~~
253	292	if ( isset( $map["__decompose__"] ) ) {
254	293	$letters = $this->codepoints( $word );
255	294	} else {
256	295	$letters = $this->letters( $word );
257	296	}
258	297
259		~~- $sensitive = isset( $map["__sensitive__"] );~~
260		~~- $ucfirst = false;~~
	298	+ $alphamap = $this->alphamap( $letters );
261	299
	300	+ $sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
	301	+ $ucfirst = false; // We are in case-sensitive mode and the first character of the current match was upper-case originally
	302	+ $withstart = false; // Have we inserted a start character into the current $current
	303	+
262	304	$output = ""; // The output
263	305	$last_match = 0; // The position of the last character matched, or the first character of the current run
264	306	$last_trans = null; // The transliteration of the last character matched, or null if the first character of the current run
—	—	@@ -265,38 +307,56 @@
266	308	$count = count($letters); // The total number of characters in the string
267	309	$current = ""; // The substring that we are currently trying to find the longest match for.
268	310
	311	+ while ( $last_match < $count ) {
269	312
270		~~- while ( $i < $count ) {~~
	313	+ if ( $i < $count ) {
271	314
272		~~- $next = $current.$letters[$i];~~
	315	+ // if this is the start of a word, first try the form with the start indicator
	316	+ if ( $withstart ) {
	317	+ $withstart = false;
	318	+ } else if ( $alphamap[$i] && ($last_trans == null) && ( $i == 0 \|\| !$alphamap[$i - 1] ) ) {
	319	+ $current = ExtTransliterator::DELIMITER;
	320	+ $withstart = true;
	321	+ }
273	322
274		~~- // There may be a match longer than $current~~
275		~~- if ( isset( $map[$next] ) ) {~~
	323	+ $next = $current.$letters[$i];
276	324
277		~~- // In fact, $next is a match~~
278		~~- if ( is_string( $map[$next] ) ) {~~
279		~~- $last_match = $i;~~
280		~~- $last_trans = $map[$next];~~
	325	+ // There may be a match longer than $current
	326	+ if ( isset( $map[$next] ) ) {
	327	+
	328	+ // In fact, $next is a match
	329	+ if ( is_string( $map[$next] ) ) {
	330	+ $last_match = $i;
	331	+ $last_trans = $map[$next];
	332	+ }
	333	+
	334	+ $i++;
	335	+ $current = $next;
	336	+ continue;
281	337	}
	338	+ }
282	339
283		~~- $i++;~~
284		~~- $current = $next;~~
	340	+ // We had no match at all, pass through one character
	341	+ if ( is_null( $last_trans ) ) {
285	342
286		~~- // No more matching, go back to the last match and start from the character after~~
287		~~- } else {~~
	343	+ // This was a fake character that we inserted
	344	+ if ( $withstart ) {
	345	+ $current = "";
	346	+ continue;
288	347
289		~~- // We had no match at all, pass through one character~~
290		~~- if ( is_null( $last_trans ) ) {~~
	348	+ // It was a real character that we were supposed to transliterate
	349	+ } else {
291	350
292	351	$last_letter = $letters[$last_match];
293	352	$last_lower = $sensitive ? $last_letter : mb_strtolower( $last_letter );
294	353
	354	+ // If we are not being sensitive, we can try down-casing the previous letter
295	355	if ( $last_letter != $last_lower ) {
296	356	$ucfirst = true;
297	357	$letters[$last_match] = $last_lower;
298	358
299	359	// Might be nice to output a ? if we don't understand
300		~~- } else if ( isset( $map[''] ) && $last_letter != '^' && $last_letter != '$' ) {~~
	360	+ } else if ( isset( $map[''] ) ) {
301	361
302	362	if ( $ucfirst ) {
303	363	$output .= str_replace( '$1', mb_strtoupper( $last_letter ), $map[''] );
—	—	@@ -319,33 +379,32 @@
320	380	$i = ++$last_match;
321	381	$current = "";
322	382	}
	383	+ }
323	384
	385	+ // Output the previous match
	386	+ } else {
324	387
325		~~- // Output the previous match~~
326		~~- } else {~~
327		-
328		~~- if ( $ucfirst ) {~~
329		~~- $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );~~
330		~~- $ucfirst = false;~~
331		~~- } else {~~
332		~~- $output .= $last_trans;~~
	388	+ // If this match is at the end of a word, see whether we have a more specific rule
	389	+ if ( $alphamap[$i-1] && ( $i == $count \|\| !$alphamap[$i] ) ) {
	390	+ $try = $current . ExtTransliterator::DELIMITER;
	391	+ if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
	392	+ $last_trans = $map[$try];
333	393	}
334		~~- $i = ++$last_match;~~
335		~~- $last_trans = null;~~
336		~~- $current = "";~~
	394	+ }
337	395
	396	+ if ( $ucfirst ) {
	397	+ $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );
	398	+ $ucfirst = false;
	399	+ } else {
	400	+ $output .= $last_trans;
338	401	}
	402	+ $i = ++$last_match;
	403	+ $last_trans = null;
	404	+ $current = "";
	405	+
339	406	}
340	407	}
341		~~- if (! is_null( $last_trans ))~~
342		~~- if ( $ucfirst ) {~~
343		~~- $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );~~
344		~~- } else {~~
345		~~- $output .= $last_trans;~~
346		~~- }~~
347		-
348		~~- // Remove the beginnng and end markers~~
349		~~- return preg_replace('/^\^\|\$$\|\$(\s+)\^\|\$(\s+)\|(\s+)\^/',"$1", $output);~~
	408	+ return $output;
350	409	}
351	410
352	411	/**
—	—	@@ -385,8 +444,8 @@
386	445
387	446	} else { // A Map
388	447	$title = Title::newFromRow( $this->mPages[$mappage] );
389		~~- $output = UtfNormal::toNFC( $this->transliterate( $word, $map ) );~~
390		~~- $output = str_replace('$1', $output, $format);~~
	448	+ $output = UtfNormal::toNFC( $this->transliterate( html_entity_decode( $word, ENT_QUOTES, 'UTF-8' ), $map ) );
	449	+ $output = str_replace( '$1', $output, $format );
391	450
392	451	}
393	452	// Populate the dependency table so that we get re-rendered if the map changes.
—	—	@@ -395,8 +454,8 @@
396	455
397	456	return $output;
398	457	}
	458	+}
399	459
400		-}
401	460	function efTransliterator_Setup() {
402	461	global $wgParser;
403	462
—	—	@@ -408,6 +467,6 @@
409	468	function efTransliterator_Magic( &$magicWords, $langCode ) {
410	469	wfLoadExtensionMessages('Transliterator');
411	470
412		~~- $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg('transliterator-invoke') );~~
	471	+ $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg( 'transliterator-invoke' ) );
413	472	return true;
414	473	}

Status & tagging log

22:12, 25 May 2011 😂 (talk | contribs) changed the status of r54691 [removed: new added: deferred]
22:34, 6 May 2011 MarkAHershberger (talk | contribs) changed the status of r54691 [removed: deferred added: new]
06:21, 30 December 2009 MaxSem (talk | contribs) changed the status of r54691 [removed: new added: deferred]