r54691 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r54690‎ | r54691 | r54692 >
Date:23:42, 9 August 2009
Author:conrad
Status:deferred
Tags:
Comment:
General improvements, particularly to ^ and $ handling, bug fixes
Modified paths:
  • /trunk/extensions/Transliterator/Transliterator.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Transliterator/Transliterator.php
@@ -18,6 +18,8 @@
1919 * better i18n support, adjustable limits, minor formal adjustment.
2020 * @version 1.1.0
2121 * addition of answer parameter
 22+ * @version 1.2.0
 23+ * semi-case-sensitive by default, fix bugs with edge-detection and html-entities
2224 */
2325
2426 /**
@@ -38,6 +40,7 @@
3941 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
4042 */
4143
 44+error_reporting(E_ALL | E_WARNING | E_STRICT);
4245 if ( !defined( 'MEDIAWIKI' ) )
4346 {
4447 die( 'This file is a MediaWiki extension, not a valid entry point.' );
@@ -49,7 +52,7 @@
5053
5154 $wgExtensionCredits['parserhook'][] = array(
5255 'name' => 'Transliterator',
53 - 'version' => '1.1.0',
 56+ 'version' => '1.2.0',
5457 'descriptionmsg' => 'transliterator-desc',
5558 'author' => 'Conrad Irwin',
5659 'url' => 'http://www.mediawiki.org/wiki/Extension:Transliterator',
@@ -66,6 +69,7 @@
6770
6871 class ExtTransliterator {
6972
 73+ const DELIMITER = "\x1F"; // A character that will be inserted in places where the ^ and $ should match
7074 var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template.
7175 var $mMaps = array();// An Array of "$mapname" => The map parsed from that page.
7276
@@ -102,6 +106,23 @@
103107 }
104108
105109 /**
 110+ * Given a codepoints or letters array returns a list that contains 1 for every
 111+ * alphabetic character and accent, and 0 otherwise. This allows for edge-of-word
 112+ * detection.
 113+ */
 114+ function alphamap( $letters ) {
 115+
 116+ $output = Array();
 117+ $count = count($letters);
 118+
 119+ for ($i = 0; $i < $count; $i++) {
 120+ $output[] = preg_match( '/\pL/u', $letters[$i]) || isset( $utfCombiningClass[$letters[$i]] );
 121+ }
 122+
 123+ return $output;
 124+ }
 125+
 126+ /**
106127 * Get all the existing maps in one query, useful given that the default
107128 * behaviour of failing silently is designed to allow it to be used by
108129 * templates that don't know if a map exists, so may try far too often.
@@ -156,9 +177,8 @@
157178 *
158179 * Input syntax is a set of lines.
159180 * All " " are ignored.
160 - * Lines starting with # are ignored.
 181+ * Lines starting with # are ignored, remaining lines are split by =>
161182 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
162 - * Remaining lines are split by "=>".
163183 *
164184 * The map created is a set of "from" strings to "to" strings
165185 * With extra "from" => true for all substrings of "from" strings
@@ -174,66 +194,86 @@
175195 $map = array();
176196 $decompose = false;
177197
178 - // Split lines and remove comments and space
179 - $lines = split( "\n", html_entity_decode( preg_replace( '/^\s*(#.*)?(\n|$)| */m', '', $input ), ENT_NOQUOTES, "UTF-8" ) );
 198+ // Split lines and remove whitespace at beginning and end
 199+ $lines = preg_split( "/(^|\s*\n)(\s*(#[^\n]*)?\n)*\s*/", $input."\n" );
 200+ if ( $lines[0] == "" )
 201+ array_shift( $lines );
180202
181 - // If the last line was a comment then there will be an empty line at the end
182 - if ( $lines[count( $lines ) - 1] == "" ) {
 203+ if ( $lines[count( $lines ) - 1] == "" )
183204 array_pop( $lines );
184 - }
185205
 206+
 207+ // The first line can contain flags
186208 $first_line = $lines[0];
187209 if ( strpos( $first_line, "=>") === FALSE ) {
188 - # Empty page
 210+ // Or, could just signify that the message was blank
189211 if ( $first_line == "<$mappage>")
190212 return false;
 213+ else if ( preg_replace( '/<(decompose|sensitive)>/', '', $first_line ) != '')
 214+ return wfMsg( 'transliterator-error-syntax', $first_line, $mappage );
191215
192 - if ( strpos( $first_line, "<decompose>" ) ) {
 216+ if ( strpos( $first_line, "<decompose>" ) !== FALSE ) {
193217 $map['__decompose__'] = true;
194218 $decompose = true;
195219 }
196 - if ( strpos( $first_line, "<sensitive>" ) ) {
 220+ if ( strpos( $first_line, "<sensitive>" ) !== FALSE ) {
197221 $map['__sensitive__'] = true;
198222 }
199223 array_shift( $lines );
200224 }
201225
202226 if ( count( $lines ) > $wgTransliteratorRuleCount )
203 - return wfMsgExt('transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );
 227+ return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );
204228
205229 foreach ( $lines as $line ) {
206230
207 - $pair = split( "=>", $line );
 231+ $pair = preg_split( '/\s*=>\s*/', $line );
208232
209 - if ( count($pair) != 2 )
210 - return wfMsg("transliterator-error-syntax", $line, $mappage);
 233+ if ( count( $pair ) != 2 )
 234+ return wfMsg( "transliterator-error-syntax", $line, $mappage );
211235
212 - if ($decompose) // Undo the NFCing of MediaWiki
213 - $from = UtfNormal::toNFD( $pair[0] );
214 - else // substrings by NFC code-point are a superset of substrings by letters
215 - $from = $pair[0];
 236+ $from = $pair[0];
 237+ $to = html_entity_decode( $pair[1], ENT_QUOTES, 'UTF-8' );
216238
217 - $to = $pair[1];
 239+ // Convert the ^ and $ selectors into the DELIMITER so that it can be used with a negligible chance of conflict
 240+ // Leave single ^ and $'s alone in case someone wants to use them
 241+ // Still permits the creation of the rule "^$=>" that will never match, but hey
 242+ $fromlast = strlen( $from ) - 1;
 243+ if ( $fromlast > 0 ) {
 244+ if ( $from[0] == "^" && $fromlast > 0)
 245+ $from[0] = ExtTransliterator::DELIMITER;
218246
 247+ if ( $from[$fromlast] == "$")
 248+ $from[$fromlast] = ExtTransliterator::DELIMITER;
 249+ }
 250+
 251+ // Now we've looked at our syntax we can remove html escaping to reveal the true form
 252+ $from = html_entity_decode( $from, ENT_QUOTES, 'UTF-8' );
 253+ if ( $decompose ) // Undo the NFCing of MediaWiki
 254+ $from = UtfNormal::toNFD( $from );
 255+
 256+ // If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule
219257 if ( isset( $map[$from] ) ) {
220258
221 - if ( is_string( $map[$from] ) )
 259+ // Or a rule of the same length, i.e. the same rule.
 260+ if ( is_string( $map[$from] ) && $to != $map[$from] )
222261 return wfMsg("transliterator-error-ambiguous", $line, $mappage);
223262
224263 } else if ( strlen( $from ) > 1 ){
225 - // Fill in the blanks, so that we know when to stop looking while transliterating
226 - $to_fill = strlen( $from );
227264
228 - if ( $to_fill > $wgTransliteratorRuleSize )
 265+ // Bail if the left hand side is too long (has performance implications otherwise)
 266+ $fromlen = strlen( $from );
 267+ if ( $fromlen > $wgTransliteratorRuleSize )
229268 return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize );
230 -
231 - for ( $i = 1; $i < $to_fill; $i++ ) {
 269+
 270+ // Fill in the blanks, so that we know when to stop looking while transliterating
 271+ for ( $i = 1; $i < $fromlen; $i++ ) {
232272 $substr = substr( $from, 0, $i );
233273
234274 if (! isset( $map[$substr] ) )
235275 $map[$substr] = true;
236276 }
237 - }
 277+ } // else we have the default rule
238278
239279 $map[$from] = $to;
240280 }
@@ -248,16 +288,18 @@
249289 */
250290 function transliterate( $word, $map )
251291 {
252 - $word = "^" . str_replace( " ", "$ ^", $word ) . "$";
253292 if ( isset( $map["__decompose__"] ) ) {
254293 $letters = $this->codepoints( $word );
255294 } else {
256295 $letters = $this->letters( $word );
257296 }
258297
259 - $sensitive = isset( $map["__sensitive__"] );
260 - $ucfirst = false;
 298+ $alphamap = $this->alphamap( $letters );
261299
 300+ $sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
 301+ $ucfirst = false; // We are in case-sensitive mode and the first character of the current match was upper-case originally
 302+ $withstart = false; // Have we inserted a start character into the current $current
 303+
262304 $output = ""; // The output
263305 $last_match = 0; // The position of the last character matched, or the first character of the current run
264306 $last_trans = null; // The transliteration of the last character matched, or null if the first character of the current run
@@ -265,38 +307,56 @@
266308 $count = count($letters); // The total number of characters in the string
267309 $current = ""; // The substring that we are currently trying to find the longest match for.
268310
 311+ while ( $last_match < $count ) {
269312
270 - while ( $i < $count ) {
 313+ if ( $i < $count ) {
271314
272 - $next = $current.$letters[$i];
 315+ // if this is the start of a word, first try the form with the start indicator
 316+ if ( $withstart ) {
 317+ $withstart = false;
 318+ } else if ( $alphamap[$i] && ($last_trans == null) && ( $i == 0 || !$alphamap[$i - 1] ) ) {
 319+ $current = ExtTransliterator::DELIMITER;
 320+ $withstart = true;
 321+ }
273322
274 - // There may be a match longer than $current
275 - if ( isset( $map[$next] ) ) {
 323+ $next = $current.$letters[$i];
276324
277 - // In fact, $next is a match
278 - if ( is_string( $map[$next] ) ) {
279 - $last_match = $i;
280 - $last_trans = $map[$next];
 325+ // There may be a match longer than $current
 326+ if ( isset( $map[$next] ) ) {
 327+
 328+ // In fact, $next is a match
 329+ if ( is_string( $map[$next] ) ) {
 330+ $last_match = $i;
 331+ $last_trans = $map[$next];
 332+ }
 333+
 334+ $i++;
 335+ $current = $next;
 336+ continue;
281337 }
 338+ }
282339
283 - $i++;
284 - $current = $next;
 340+ // We had no match at all, pass through one character
 341+ if ( is_null( $last_trans ) ) {
285342
286 - // No more matching, go back to the last match and start from the character after
287 - } else {
 343+ // This was a fake character that we inserted
 344+ if ( $withstart ) {
 345+ $current = "";
 346+ continue;
288347
289 - // We had no match at all, pass through one character
290 - if ( is_null( $last_trans ) ) {
 348+ // It was a real character that we were supposed to transliterate
 349+ } else {
291350
292351 $last_letter = $letters[$last_match];
293352 $last_lower = $sensitive ? $last_letter : mb_strtolower( $last_letter );
294353
 354+ // If we are not being sensitive, we can try down-casing the previous letter
295355 if ( $last_letter != $last_lower ) {
296356 $ucfirst = true;
297357 $letters[$last_match] = $last_lower;
298358
299359 // Might be nice to output a ? if we don't understand
300 - } else if ( isset( $map[''] ) && $last_letter != '^' && $last_letter != '$' ) {
 360+ } else if ( isset( $map[''] ) ) {
301361
302362 if ( $ucfirst ) {
303363 $output .= str_replace( '$1', mb_strtoupper( $last_letter ), $map[''] );
@@ -319,33 +379,32 @@
320380 $i = ++$last_match;
321381 $current = "";
322382 }
 383+ }
323384
 385+ // Output the previous match
 386+ } else {
324387
325 - // Output the previous match
326 - } else {
327 -
328 - if ( $ucfirst ) {
329 - $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );
330 - $ucfirst = false;
331 - } else {
332 - $output .= $last_trans;
 388+ // If this match is at the end of a word, see whether we have a more specific rule
 389+ if ( $alphamap[$i-1] && ( $i == $count || !$alphamap[$i] ) ) {
 390+ $try = $current . ExtTransliterator::DELIMITER;
 391+ if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
 392+ $last_trans = $map[$try];
333393 }
334 - $i = ++$last_match;
335 - $last_trans = null;
336 - $current = "";
 394+ }
337395
 396+ if ( $ucfirst ) {
 397+ $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );
 398+ $ucfirst = false;
 399+ } else {
 400+ $output .= $last_trans;
338401 }
 402+ $i = ++$last_match;
 403+ $last_trans = null;
 404+ $current = "";
 405+
339406 }
340407 }
341 - if (! is_null( $last_trans ))
342 - if ( $ucfirst ) {
343 - $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );
344 - } else {
345 - $output .= $last_trans;
346 - }
347 -
348 - // Remove the beginning and end markers
349 - return preg_replace('/^\^|\$$|\$(\s+)\^|\$(\s+)|(\s+)\^/',"$1", $output);
 408+ return $output;
350409 }
351410
352411 /**
@@ -385,8 +444,8 @@
386445
387446 } else { // A Map
388447 $title = Title::newFromRow( $this->mPages[$mappage] );
389 - $output = UtfNormal::toNFC( $this->transliterate( $word, $map ) );
390 - $output = str_replace('$1', $output, $format);
 448+ $output = UtfNormal::toNFC( $this->transliterate( html_entity_decode( $word, ENT_QUOTES, 'UTF-8' ), $map ) );
 449+ $output = str_replace( '$1', $output, $format );
391450
392451 }
393452 // Populate the dependency table so that we get re-rendered if the map changes.
@@ -395,8 +454,8 @@
396455
397456 return $output;
398457 }
 458+}
399459
400 -}
401460 function efTransliterator_Setup() {
402461 global $wgParser;
403462
@@ -408,6 +467,6 @@
409468 function efTransliterator_Magic( &$magicWords, $langCode ) {
410469 wfLoadExtensionMessages('Transliterator');
411470
412 - $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg('transliterator-invoke') );
 471+ $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg( 'transliterator-invoke' ) );
413472 return true;
414473 }

Status & tagging log