Index: trunk/extensions/Transliterator/Transliterator.php |
— | — | @@ -18,6 +18,8 @@ |
19 | 19 | * better i18n support, adjustable limits, minor formal adjustment. |
20 | 20 | * @version 1.1.0 |
21 | 21 | * addition of answer parameter |
| 22 | + * @version 1.2.0 |
| 23 | + * semi-case-sensitive by default, fix bugs with edge-detection and html-entities |
22 | 24 | */ |
23 | 25 | |
24 | 26 | /** |
— | — | @@ -38,6 +40,7 @@ |
39 | 41 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
40 | 42 | */ |
41 | 43 | |
| 44 | +error_reporting(E_ALL | E_WARNING | E_STRICT); |
42 | 45 | if ( !defined( 'MEDIAWIKI' ) ) |
43 | 46 | { |
44 | 47 | die( 'This file is a MediaWiki extension, not a valid entry point.' ); |
— | — | @@ -49,7 +52,7 @@ |
50 | 53 | |
51 | 54 | $wgExtensionCredits['parserhook'][] = array( |
52 | 55 | 'name' => 'Transliterator', |
53 | | - 'version' => '1.1.0', |
| 56 | + 'version' => '1.2.0', |
54 | 57 | 'descriptionmsg' => 'transliterator-desc', |
55 | 58 | 'author' => 'Conrad Irwin', |
56 | 59 | 'url' => 'http://www.mediawiki.org/wiki/Extension:Transliterator', |
— | — | @@ -66,6 +69,7 @@ |
67 | 70 | |
68 | 71 | class ExtTransliterator { |
69 | 72 | |
| 73 | + const DELIMITER = "\x1F"; // A character that will be inserted in places where the ^ and $ should match |
70 | 74 | var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template. |
71 | 75 | var $mMaps = array();// An Array of "$mapname" => The map parsed from that page. |
72 | 76 | |
— | — | @@ -102,6 +106,23 @@ |
103 | 107 | } |
104 | 108 | |
105 | 109 | /** |
| 110 | + * Given a codepoints or letters array, returns a parallel list holding true for every entry that is an alphabetic |
| 111 | + * character (\pL) or a key of the global $utfCombiningClass table (presumably UtfNormal's combining-class data, which |
| 112 | + * must already be loaded - TODO confirm), and false otherwise. This allows for edge-of-word detection. |
| 113 | + */ |
| 114 | + function alphamap( $letters ) { |
| 115 | + global $utfCombiningClass; // Must be imported: as an undefined local the isset() below could never be true |
| 116 | + $output = Array(); |
| 117 | + $count = count($letters); |
| 118 | + |
| 119 | + for ($i = 0; $i < $count; $i++) { |
| 120 | + $output[] = preg_match( '/\pL/u', $letters[$i]) || isset( $utfCombiningClass[$letters[$i]] ); |
| 121 | + } |
| 122 | + |
| 123 | + return $output; |
| 124 | + } |
| 125 | + |
| 126 | + /** |
106 | 127 | * Get all the existing maps in one query, useful given that the default |
107 | 128 | * behaviour of failing silently is designed to allow it to be used by |
108 | 129 | * templates that don't know if a map exists, so may try far too often. |
— | — | @@ -156,9 +177,8 @@ |
157 | 178 | * |
158 | 179 | * Input syntax is a set of lines. |
159 | 180 | * All " " are ignored. |
160 | | - * Lines starting with # are ignored. |
| 181 | + * Lines starting with # are ignored, remaining lines are split by => |
161 | 182 | * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints) |
162 | | - * Remaining lines are split by "=>". |
163 | 183 | * |
164 | 184 | * The map created is a set of "from" strings to "to" strings |
165 | 185 | * With extra "from" => true for all substrings of "from" strings |
— | — | @@ -174,66 +194,86 @@ |
175 | 195 | $map = array(); |
176 | 196 | $decompose = false; |
177 | 197 | |
178 | | - // Split lines and remove comments and space |
179 | | - $lines = split( "\n", html_entity_decode( preg_replace( '/^\s*(#.*)?(\n|$)| */m', '', $input ), ENT_NOQUOTES, "UTF-8" ) ); |
| 198 | + // Split lines and remove whitespace at beginning and end |
| 199 | + $lines = preg_split( "/(^|\s*\n)(\s*(#[^\n]*)?\n)*\s*/", $input."\n" ); |
| 200 | + if ( $lines[0] == "" ) |
| 201 | + array_shift( $lines ); |
180 | 202 | |
181 | | - // If the last line was a comment then there will be an empty line at the end |
182 | | - if ( $lines[count( $lines ) - 1] == "" ) { |
| 203 | + if ( $lines[count( $lines ) - 1] == "" ) |
183 | 204 | array_pop( $lines ); |
184 | | - } |
185 | 205 | |
| 206 | + |
| 207 | + // The first line can contain flags |
186 | 208 | $first_line = $lines[0]; |
187 | 209 | if ( strpos( $first_line, "=>") === FALSE ) { |
188 | | - # Empty page |
| 210 | + // Or, could just signify that the message was blank |
189 | 211 | if ( $first_line == "<$mappage>") |
190 | 212 | return false; |
| 213 | + else if ( preg_replace( '/<(decompose|sensitive)>/', '', $first_line ) != '') |
| 214 | + return wfMsg( 'transliterator-error-syntax', $first_line, $mappage ); |
191 | 215 | |
192 | | - if ( strpos( $first_line, "<decompose>" ) ) { |
| 216 | + if ( strpos( $first_line, "<decompose>" ) !== FALSE ) { |
193 | 217 | $map['__decompose__'] = true; |
194 | 218 | $decompose = true; |
195 | 219 | } |
196 | | - if ( strpos( $first_line, "<sensitive>" ) ) { |
| 220 | + if ( strpos( $first_line, "<sensitive>" ) !== FALSE ) { |
197 | 221 | $map['__sensitive__'] = true; |
198 | 222 | } |
199 | 223 | array_shift( $lines ); |
200 | 224 | } |
201 | 225 | |
202 | 226 | if ( count( $lines ) > $wgTransliteratorRuleCount ) |
203 | | - return wfMsgExt('transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage ); |
| 227 | + return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage ); |
204 | 228 | |
205 | 229 | foreach ( $lines as $line ) { |
206 | 230 | |
207 | | - $pair = split( "=>", $line ); |
| 231 | + $pair = preg_split( '/\s*=>\s*/', $line ); |
208 | 232 | |
209 | | - if ( count($pair) != 2 ) |
210 | | - return wfMsg("transliterator-error-syntax", $line, $mappage); |
| 233 | + if ( count( $pair ) != 2 ) |
| 234 | + return wfMsg( "transliterator-error-syntax", $line, $mappage ); |
211 | 235 | |
212 | | - if ($decompose) // Undo the NFCing of MediaWiki |
213 | | - $from = UtfNormal::toNFD( $pair[0] ); |
214 | | - else // substrings by NFC code-point are a superset of substrings by letters |
215 | | - $from = $pair[0]; |
| 236 | + $from = $pair[0]; |
| 237 | + $to = html_entity_decode( $pair[1], ENT_QUOTES, 'UTF-8' ); |
216 | 238 | |
217 | | - $to = $pair[1]; |
| 239 | + // Convert the ^ and $ selectors into the DELIMITER so that it can be used with a negligible chance of conflict |
| 240 | + // Leave single ^ and $'s alone in case someone wants to use them |
| 241 | + // Still permits the creation of the rule "^$=>" that will never match, but hey |
| 242 | + $fromlast = strlen( $from ) - 1; |
| 243 | + if ( $fromlast > 0 ) { |
| 244 | + if ( $from[0] == "^" && $fromlast > 0) |
| 245 | + $from[0] = ExtTransliterator::DELIMITER; |
218 | 246 | |
| 247 | + if ( $from[$fromlast] == "$") |
| 248 | + $from[$fromlast] = ExtTransliterator::DELIMITER; |
| 249 | + } |
| 250 | + |
| 251 | + // Now we've looked at our syntax we can remove html escaping to reveal the true form |
| 252 | + $from = html_entity_decode( $from, ENT_QUOTES, 'UTF-8' ); |
| 253 | + if ( $decompose ) // Undo the NFCing of MediaWiki |
| 254 | + $from = UtfNormal::toNFD( $from ); |
| 255 | + |
| 256 | + // If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule |
219 | 257 | if ( isset( $map[$from] ) ) { |
220 | 258 | |
221 | | - if ( is_string( $map[$from] ) ) |
| 259 | + // Or a rule of the same length, i.e. the same rule. |
| 260 | + if ( is_string( $map[$from] ) && $to != $map[$from] ) |
222 | 261 | return wfMsg("transliterator-error-ambiguous", $line, $mappage); |
223 | 262 | |
224 | 263 | } else if ( strlen( $from ) > 1 ){ |
225 | | - // Fill in the blanks, so that we know when to stop looking while transliterating |
226 | | - $to_fill = strlen( $from ); |
227 | 264 | |
228 | | - if ( $to_fill > $wgTransliteratorRuleSize ) |
| 265 | + // Bail if the left hand side is too long (has performance implications otherwise) |
| 266 | + $fromlen = strlen( $from ); |
| 267 | + if ( $fromlen > $wgTransliteratorRuleSize ) |
229 | 268 | return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize ); |
230 | | - |
231 | | - for ( $i = 1; $i < $to_fill; $i++ ) { |
| 269 | + |
| 270 | + // Fill in the blanks, so that we know when to stop looking while transliterating |
| 271 | + for ( $i = 1; $i < $fromlen; $i++ ) { |
232 | 272 | $substr = substr( $from, 0, $i ); |
233 | 273 | |
234 | 274 | if (! isset( $map[$substr] ) ) |
235 | 275 | $map[$substr] = true; |
236 | 276 | } |
237 | | - } |
| 277 | + } // else we have the default rule |
238 | 278 | |
239 | 279 | $map[$from] = $to; |
240 | 280 | } |
— | — | @@ -248,16 +288,18 @@ |
249 | 289 | */ |
250 | 290 | function transliterate( $word, $map ) |
251 | 291 | { |
252 | | - $word = "^" . str_replace( " ", "$ ^", $word ) . "$"; |
253 | 292 | if ( isset( $map["__decompose__"] ) ) { |
254 | 293 | $letters = $this->codepoints( $word ); |
255 | 294 | } else { |
256 | 295 | $letters = $this->letters( $word ); |
257 | 296 | } |
258 | 297 | |
259 | | - $sensitive = isset( $map["__sensitive__"] ); |
260 | | - $ucfirst = false; |
| 298 | + $alphamap = $this->alphamap( $letters ); |
261 | 299 | |
| 300 | + $sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not |
| 301 | + $ucfirst = false; // We are in case-sensitive mode and the first character of the current match was upper-case originally |
| 302 | + $withstart = false; // Have we inserted a start character into the current $current |
| 303 | + |
262 | 304 | $output = ""; // The output |
263 | 305 | $last_match = 0; // The position of the last character matched, or the first character of the current run |
264 | 306 | $last_trans = null; // The transliteration of the last character matched, or null if the first character of the current run |
— | — | @@ -265,38 +307,56 @@ |
266 | 308 | $count = count($letters); // The total number of characters in the string |
267 | 309 | $current = ""; // The substring that we are currently trying to find the longest match for. |
268 | 310 | |
| 311 | + while ( $last_match < $count ) { |
269 | 312 | |
270 | | - while ( $i < $count ) { |
| 313 | + if ( $i < $count ) { |
271 | 314 | |
272 | | - $next = $current.$letters[$i]; |
| 315 | + // if this is the start of a word, first try the form with the start indicator |
| 316 | + if ( $withstart ) { |
| 317 | + $withstart = false; |
| 318 | + } else if ( $alphamap[$i] && ($last_trans == null) && ( $i == 0 || !$alphamap[$i - 1] ) ) { |
| 319 | + $current = ExtTransliterator::DELIMITER; |
| 320 | + $withstart = true; |
| 321 | + } |
273 | 322 | |
274 | | - // There may be a match longer than $current |
275 | | - if ( isset( $map[$next] ) ) { |
| 323 | + $next = $current.$letters[$i]; |
276 | 324 | |
277 | | - // In fact, $next is a match |
278 | | - if ( is_string( $map[$next] ) ) { |
279 | | - $last_match = $i; |
280 | | - $last_trans = $map[$next]; |
| 325 | + // There may be a match longer than $current |
| 326 | + if ( isset( $map[$next] ) ) { |
| 327 | + |
| 328 | + // In fact, $next is a match |
| 329 | + if ( is_string( $map[$next] ) ) { |
| 330 | + $last_match = $i; |
| 331 | + $last_trans = $map[$next]; |
| 332 | + } |
| 333 | + |
| 334 | + $i++; |
| 335 | + $current = $next; |
| 336 | + continue; |
281 | 337 | } |
| 338 | + } |
282 | 339 | |
283 | | - $i++; |
284 | | - $current = $next; |
| 340 | + // We had no match at all, pass through one character |
| 341 | + if ( is_null( $last_trans ) ) { |
285 | 342 | |
286 | | - // No more matching, go back to the last match and start from the character after |
287 | | - } else { |
| 343 | + // This was a fake character that we inserted |
| 344 | + if ( $withstart ) { |
| 345 | + $current = ""; |
| 346 | + continue; |
288 | 347 | |
289 | | - // We had no match at all, pass through one character |
290 | | - if ( is_null( $last_trans ) ) { |
| 348 | + // It was a real character that we were supposed to transliterate |
| 349 | + } else { |
291 | 350 | |
292 | 351 | $last_letter = $letters[$last_match]; |
293 | 352 | $last_lower = $sensitive ? $last_letter : mb_strtolower( $last_letter ); |
294 | 353 | |
| 354 | + // If we are not being sensitive, we can try down-casing the previous letter |
295 | 355 | if ( $last_letter != $last_lower ) { |
296 | 356 | $ucfirst = true; |
297 | 357 | $letters[$last_match] = $last_lower; |
298 | 358 | |
299 | 359 | // Might be nice to output a ? if we don't understand |
300 | | - } else if ( isset( $map[''] ) && $last_letter != '^' && $last_letter != '$' ) { |
| 360 | + } else if ( isset( $map[''] ) ) { |
301 | 361 | |
302 | 362 | if ( $ucfirst ) { |
303 | 363 | $output .= str_replace( '$1', mb_strtoupper( $last_letter ), $map[''] ); |
— | — | @@ -319,33 +379,32 @@ |
320 | 380 | $i = ++$last_match; |
321 | 381 | $current = ""; |
322 | 382 | } |
| 383 | + } |
323 | 384 | |
| 385 | + // Output the previous match |
| 386 | + } else { |
324 | 387 | |
325 | | - // Output the previous match |
326 | | - } else { |
327 | | - |
328 | | - if ( $ucfirst ) { |
329 | | - $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 ); |
330 | | - $ucfirst = false; |
331 | | - } else { |
332 | | - $output .= $last_trans; |
| 388 | + // If this match is at the end of a word, see whether we have a more specific rule |
| 389 | + if ( $alphamap[$i-1] && ( $i == $count || !$alphamap[$i] ) ) { |
| 390 | + $try = $current . ExtTransliterator::DELIMITER; |
| 391 | + if ( isset( $map[$try] ) && is_string( $map[$try] ) ) { |
| 392 | + $last_trans = $map[$try]; |
333 | 393 | } |
334 | | - $i = ++$last_match; |
335 | | - $last_trans = null; |
336 | | - $current = ""; |
| 394 | + } |
337 | 395 | |
| 396 | + if ( $ucfirst ) { |
| 397 | + $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 ); |
| 398 | + $ucfirst = false; |
| 399 | + } else { |
| 400 | + $output .= $last_trans; |
338 | 401 | } |
| 402 | + $i = ++$last_match; |
| 403 | + $last_trans = null; |
| 404 | + $current = ""; |
| 405 | + |
339 | 406 | } |
340 | 407 | } |
341 | | - if (! is_null( $last_trans )) |
342 | | - if ( $ucfirst ) { |
343 | | - $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 ); |
344 | | - } else { |
345 | | - $output .= $last_trans; |
346 | | - } |
347 | | - |
348 | | - // Remove the beginnng and end markers |
349 | | - return preg_replace('/^\^|\$$|\$(\s+)\^|\$(\s+)|(\s+)\^/',"$1", $output); |
| 408 | + return $output; |
350 | 409 | } |
351 | 410 | |
352 | 411 | /** |
— | — | @@ -385,8 +444,8 @@ |
386 | 445 | |
387 | 446 | } else { // A Map |
388 | 447 | $title = Title::newFromRow( $this->mPages[$mappage] ); |
389 | | - $output = UtfNormal::toNFC( $this->transliterate( $word, $map ) ); |
390 | | - $output = str_replace('$1', $output, $format); |
| 448 | + $output = UtfNormal::toNFC( $this->transliterate( html_entity_decode( $word, ENT_QUOTES, 'UTF-8' ), $map ) ); |
| 449 | + $output = str_replace( '$1', $output, $format ); |
391 | 450 | |
392 | 451 | } |
393 | 452 | // Populate the dependency table so that we get re-rendered if the map changes. |
— | — | @@ -395,8 +454,8 @@ |
396 | 455 | |
397 | 456 | return $output; |
398 | 457 | } |
| 458 | +} |
399 | 459 | |
400 | | -} |
401 | 460 | function efTransliterator_Setup() { |
402 | 461 | global $wgParser; |
403 | 462 | |
— | — | @@ -408,6 +467,6 @@ |
409 | 468 | function efTransliterator_Magic( &$magicWords, $langCode ) { |
410 | 469 | wfLoadExtensionMessages('Transliterator'); |
411 | 470 | |
412 | | - $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg('transliterator-invoke') ); |
| 471 | + $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg( 'transliterator-invoke' ) ); |
413 | 472 | return true; |
414 | 473 | } |