Index: trunk/extensions/Transliterator/Transliterator_body.php |
— | — | @@ -0,0 +1,475 @@ |
| 2 | +<?php |
| 3 | + |
| 4 | +if ( !defined( 'MEDIAWIKI' ) ) { |
| 5 | + die( 'This file is a MediaWiki extension, not a valid entry point.' ); |
| 6 | +} |
| 7 | + |
| 8 | +class ExtTransliterator { |
| 9 | + |
| 10 | + |
| 11 | + const FIRST = "\x1F"; // A character that will be appended when ^ should match at the start |
| 12 | + const LAST = "\x1E"; // A character that will be appended when $ should match at the end |
| 13 | + const CACHE_PREFIX = "extTransliterator.2"; // The prefix to use for cache items (the number should be incremented when the map format changes) |
| 14 | +	var $mPages = null; // An Array of map page title (DB key) => page id for every existing map page, or null until first loaded.
| 15 | + var $mMaps = array();// An Array of "$mapname" => The map parsed from that page. |
| 16 | + |
| 17 | +	/**
| 18 | +	 * Split a word into letters (not bytes or codepoints): each element is one character plus any combining marks that follow it. Implicitly in NFC due to MediaWiki.
| 19 | +	 */
| 20 | +	function letters( $word ) {
| 21 | +		global $utfCombiningClass;
| 22 | +		UtfNormal::loadData();
| 23 | +		// Start from one element per codepoint; combining marks are folded into their predecessor below
| 24 | +		$split = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
| 25 | +
| 26 | +		$i = 1;
| 27 | +		while ( $i < count( $split ) ) {
| 28 | +			if ( isset( $utfCombiningClass[$split[$i]] ) ) {
| 29 | +				$split[$i - 1] .= $split[$i];
| 30 | +				// Remove and re-index: unset() left a hole at $i, so later marks were skipped and callers saw gaps
| 31 | +				array_splice( $split, $i, 1 ); // stay on $i, the next element has moved into it
| 32 | +			} else {
| 33 | +				$i++;
| 34 | +
| 35 | +			}
| 36 | +		}
| 37 | +
| 38 | +		return $split;
| 39 | +	}
| 40 | + |
| 41 | +	/**
| 42 | +	 * Split a word into the NFD codepoints that make it up, one array element per codepoint.
| 43 | +	 */
| 44 | +	function codepoints( $word ) {
| 45 | +		$word = UtfNormal::toNFD( $word );
| 46 | +		return preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); // the u flag makes "." match a whole UTF-8 character rather than a byte
| 47 | +	}
| 48 | + |
| 49 | +	/**
| 50 | +	 * Given a codepoints or letters array returns a list that contains true for every
| 51 | +	 * alphabetic character and accent (combining mark), and false otherwise. This allows
| 52 | +	 * for edge-of-word detection.
| 53 | +	 */
| 54 | +	function alphamap( $letters ) {
| 55 | +		global $utfCombiningClass; // without this the isset() below tested an undefined local and was always false
| 56 | +		$output = Array();
| 57 | +		$count = count($letters);
| 58 | +		UtfNormal::loadData(); // same call letters() makes before reading $utfCombiningClass
| 59 | +		for ($i = 0; $i < $count; $i++) {
| 60 | +			$output[] = preg_match( '/\pL/u', $letters[$i]) || isset( $utfCombiningClass[$letters[$i]] );
| 61 | +		}
| 62 | +
| 63 | +		return $output;
| 64 | +	}
| 65 | + |
| 66 | +	/**
| 67 | +	 * Return page title => page id for every MediaWiki-namespace page whose title starts with $prefix,
| 68 | +	 * fetched in one query. Lookups fail silently so that templates which don't know if a map exists can
| 69 | +	 * try freely, which may happen far too often; the list is therefore kept in $this->mPages and in $wgMemc.
| 70 | +	 */
| 71 | +	function getExistingMapNames( $prefix ) {
| 72 | +		global $wgMemc;
| 73 | +
| 74 | +		// Have we used it on this page already?
| 75 | +		if ( ! is_null($this->mPages) )
| 76 | +			return $this->mPages;
| 77 | +
| 78 | +		// Have we used it recently?
| 79 | +		$cached = $wgMemc->get( wfMemcKey( self::CACHE_PREFIX, "__map_names__" ) );
| 80 | +		if ( $cached ) // any falsy value, including a cached empty array, is treated as a miss
| 81 | +			return $this->mPages = $cached;
| 82 | +
| 83 | +		$dbr = wfGetDB( DB_SLAVE );
| 84 | +		$res = $dbr->select( 'page',
| 85 | +			array( '*' ),
| 86 | +			array(
| 87 | +				'page_namespace' => NS_MEDIAWIKI,
| 88 | +				'page_title LIKE \'' . $dbr->escapeLike( $prefix ) .'%\''
| 89 | +			),
| 90 | +			__METHOD__
| 91 | +		);
| 92 | +
| 93 | +		$this->mPages = Array();
| 94 | +		// Only the title and id of each row are kept
| 95 | +		while ( $r = $res->fetchObject() ) {
| 96 | +			$this->mPages[$r->page_title] = $r->page_id;
| 97 | +		}
| 98 | +
| 99 | +		$wgMemc->set( wfMemcKey( self::CACHE_PREFIX, "__map_names__" ), $this->mPages );
| 100 | +		return $this->mPages;
| 101 | +	}
| 102 | +	/**
| 103 | +	 * Get a map, either from the per-request cache, from $wgMemc, or by parsing the page. Returns the map array, an error message string from readMap(), or false if there is no such map.
| 104 | +	 */
| 105 | +	function getMap( $prefix, $mappage ) {
| 106 | +		global $wgMemc;
| 107 | +
| 108 | +		// Have we used it on this page already?
| 109 | +		if ( isset( $this->mMaps[$mappage] ) ) {
| 110 | +			return $this->mMaps[$mappage];
| 111 | +		}
| 112 | +
| 113 | +		// Does it exist at all?
| 114 | +		$existing = $this->getExistingMapNames( $prefix );
| 115 | +		if ( isset( $existing[$mappage] ) ) {
| 116 | +
| 117 | +			// Have we used it recently?
| 118 | +			$map = $wgMemc->get( wfMemcKey( self::CACHE_PREFIX, $mappage ) );
| 119 | +			if (! $map ) {
| 120 | +				// Cache miss: parse the text of the message named by the page
| 121 | +				$map = $this->readMap( wfMsg( $mappage ), $mappage );
| 122 | +				// Anything truthy is cached, which includes error message strings; false is not
| 123 | +				if ( $map )
| 124 | +					$wgMemc->set( wfMemcKey( self::CACHE_PREFIX, $mappage ), $map);
| 125 | +			}
| 126 | +
| 127 | +		} else {
| 128 | +			$map = false;
| 129 | +		}
| 130 | +
| 131 | +		return $this->mMaps[$mappage] = $map;
| 132 | +	}
| 133 | + |
| 134 | +	/**
| 135 | +	 * Returns true if the line might contain something useful (it is neither empty nor a comment starting with a hash), false otherwise. Used by readMap() as its array_filter callback.
| 136 | +	 */
| 137 | +	static function is_useful_line( $line ) {
| 138 | +		return $line != "" && substr( $line, 0, 1 ) != '#';
| 139 | +	}
| 140 | + |
| 141 | +	/**
| 142 | +	 * Parse a map input syntax into a map. Returns the map array, false if there is nothing to parse, or an error message string.
| 143 | +	 *
| 144 | +	 * Input syntax is a set of lines.
| 145 | +	 * All " " are ignored.
| 146 | +	 * Lines starting with # are ignored, remaining lines are split by =>
| 147 | +	 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
| 148 | +	 * An optional first line without => may hold the flags <decompose> and/or <sensitive>.
| 149 | +	 * The map created is a set of "from" strings to "to" strings
| 150 | +	 * With extra "from" => true for all proper prefixes of "from" strings
| 151 | +	 * So that the transliteration algorithm knows when it has found the longest match
| 152 | +	 *
| 153 | +	 * $map[''] is used as the default fall through for any characters not in the map
| 154 | +	 * $map['__decompose__'] indicates that NFD should be used instead of characters
| 155 | +	 * $map['__sensitive__'] indicates that the automatic first-letter upper-case fall-through should not be tried
| 156 | +	 */
| 157 | +	function readMap( $input, $mappage ) {
| 158 | +		global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;
| 159 | +
| 160 | +		$map = array();
| 161 | +		$decompose = false;
| 162 | +
| 163 | +		// Split lines and remove whitespace at beginning and end
| 164 | +		$input = trim( $input );
| 165 | +		$lines = preg_split( "/\s*\n\s*/", $input );
| 166 | +		$lines = array_filter( $lines, 'ExtTransliterator::is_useful_line' );
| 167 | +		$lines = array_values( $lines ); // array_filter keeps the old keys; re-index so $lines[0] is the first useful line
| 168 | +
| 169 | +		$count = count( $lines );
| 170 | +
| 171 | +		// The only content was comments
| 172 | +		if ( $count == 0 )
| 173 | +			return false;
| 174 | +
| 175 | +		// The first line can contain flags
| 176 | +		$firstLine = $lines[0];
| 177 | +		if ( strpos( $firstLine, "=>") === FALSE ) {
| 178 | +			// Or, could just signify that the message was blank
| 179 | +			if ( $firstLine == "<$mappage>")
| 180 | +				return false;
| 181 | +			else if ( preg_replace( '/<(decompose|sensitive)>/', '', $firstLine ) != '')
| 182 | +				return wfMsg( 'transliterator-error-syntax', $firstLine, $mappage );
| 183 | +
| 184 | +			if ( strpos( $firstLine, "<decompose>" ) !== FALSE ) {
| 185 | +				$map['__decompose__'] = true;
| 186 | +				$decompose = true;
| 187 | +			}
| 188 | +			if ( strpos( $firstLine, "<sensitive>" ) !== FALSE ) {
| 189 | +				$map['__sensitive__'] = true;
| 190 | +			}
| 191 | +			array_shift( $lines ); // the flag line is not a rule
| 192 | +			$count--;
| 193 | +		}
| 194 | +
| 195 | +		if ( $count > $wgTransliteratorRuleCount )
| 196 | +			return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );
| 197 | +
| 198 | +		foreach ( $lines as $line ) {
| 199 | +			// Each rule must be exactly one "from => to" pair
| 200 | +			$pair = preg_split( '/\s*=>\s*/', $line );
| 201 | +
| 202 | +			if ( count( $pair ) != 2 )
| 203 | +				return wfMsg( "transliterator-error-syntax", $line, $mappage );
| 204 | +
| 205 | +			$from = $pair[0];
| 206 | +			$to = Sanitizer::decodeCharReferences( $pair[1], ENT_QUOTES, 'UTF-8' );
| 207 | +
| 208 | +			// Convert the ^ and $ selectors into special characters for matching
| 209 | +			// Leave single ^ and $'s alone in case someone wants to use them
| 210 | +			// Still permits the creation of the rule "^$=>" that will never match, but hey
| 211 | +			$fromlast = strlen( $from ) - 1;
| 212 | +			if ( $fromlast > 0 ) {
| 213 | +				if ( $from[0] == "^" ) {
| 214 | +					$from = substr( $from, 1 ) . self::FIRST; // the start marker is moved to the end of the key
| 215 | +					$fromlast--; // so the original last byte now sits one position earlier
| 216 | +				}
| 217 | +
| 218 | +				if ( $from[$fromlast] == "$")
| 219 | +					$from[$fromlast] = self::LAST;
| 220 | +			}
| 221 | +
| 222 | +			// Now we've looked at our syntax we can remove html escaping to reveal the true form
| 223 | +			$from = Sanitizer::decodeCharReferences( $from, ENT_QUOTES, 'UTF-8' );
| 224 | +			if ( $decompose ) { // Undo the NFCing of MediaWiki
| 225 | +				$from = UtfNormal::toNFD( $from );
| 226 | +			}
| 227 | +
| 228 | +			// If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule
| 229 | +			if ( isset( $map[$from] ) ) {
| 230 | +
| 231 | +				// Or a rule of the same length, i.e. the same rule.
| 232 | +				if ( is_string( $map[$from] ) && $to != $map[$from] )
| 233 | +					return wfMsg("transliterator-error-ambiguous", $line, $mappage);
| 234 | +
| 235 | +			} else if ( strlen( $from ) > 1 ){
| 236 | +
| 237 | +				// Bail if the left hand side is too long (has performance implications otherwise)
| 238 | +				$fromlen = strlen( $from ); // length in bytes, not characters
| 239 | +				if ( $fromlen > $wgTransliteratorRuleSize )
| 240 | +					return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize );
| 241 | +
| 242 | +				// Fill in the blanks, so that we know when to stop looking while transliterating
| 243 | +				for ( $i = 1; $i < $fromlen; $i++ ) {
| 244 | +					$substr = substr( $from, 0, $i );
| 245 | +
| 246 | +					if (! isset( $map[$substr] ) )
| 247 | +						$map[$substr] = true;
| 248 | +				}
| 249 | +			} // else we have the default rule
| 250 | +
| 251 | +			$map[$from] = $to;
| 252 | +		}
| 253 | +
| 254 | +		return $map;
| 255 | +	}
| 256 | + |
| 257 | +	/**
| 258 | +	 * Transliterate a word by iteratively finding the longest substring from
| 259 | +	 * the start of the untransliterated string that we have a rule for, and
| 260 | +	 * transliterating it. $map is a map array as built by readMap(); returns the output string.
| 261 | +	 */
| 262 | +	function transliterate( $word, $map )
| 263 | +	{
| 264 | +		if ( isset( $map["__decompose__"] ) ) {
| 265 | +			$letters = $this->codepoints( $word );
| 266 | +		} else {
| 267 | +			$letters = $this->letters( $word );
| 268 | +		}
| 269 | +
| 270 | +		$alphamap = $this->alphamap( $letters );
| 271 | +
| 272 | +		$sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
| 273 | +		$ucfirst = false;            // We are in case-insensitive mode and the first character of the current match was upper-case originally
| 274 | +		$lastUpper = null;          // We have lower-cased the current letter, but we need to keep track of the original (dotted I for example)
| 275 | +
| 276 | +		$output = "";               // The output
| 277 | +		$lastMatch = 0;            // The position of the last character matched, or the first character of the current run
| 278 | +		$lastTrans = null;         // The transliteration of the last character matched, or null if the first character of the current run
| 279 | +		$i = 0;                     // The current position in the string
| 280 | +		$count = count($letters);   // The total number of characters in the string
| 281 | +		$current = "";              // The substring that we are currently trying to find the longest match for.
| 282 | +		$currentStart = 0;         // The position that $current starts at
| 283 | +
| 284 | +		while ( $lastMatch < $count ) {
| 285 | +
| 286 | +			if ( $i < $count ) {
| 287 | +
| 288 | +				$next = $current.$letters[$i];
| 289 | +
| 290 | +				// There may be a match longer than $current
| 291 | +				if ( isset( $map[$next] ) ) {
| 292 | +
| 293 | +					// In fact, $next is a match
| 294 | +					if ( is_string( $map[$next] ) ) {
| 295 | +						$lastMatch = $i;
| 296 | +						$lastTrans = $map[$next];
| 297 | +					}
| 298 | +
| 299 | +					$i++;
| 300 | +					$current = $next;
| 301 | +					continue;
| 302 | +				}
| 303 | +			}
| 304 | +
| 305 | +			// NOTE(review): when nothing matched at position 0, $i is still 0 here and $alphamap[-1] is read below
| 306 | +			// If this match is at the end of a word, see whether we have a more specific rule
| 307 | +			if ( $alphamap[$i-1] && ( $i == $count || !$alphamap[$i] ) ) {
| 308 | +				$try = $current . self::LAST;
| 309 | +				if ( isset( $map[$try] ) ) {
| 310 | +					if ( is_string( $map[$try] ) ) {
| 311 | +						$lastTrans = $map[$try];
| 312 | +					}
| 313 | +					if ( isset( $map[$try . self::FIRST] ) ) { // a rule anchored at both ends may exist; let the start-of-word check below find it
| 314 | +						$current = $try;
| 315 | +					}
| 316 | +				}
| 317 | +			}
| 318 | +
| 319 | +			// If this match is at the start of a word, see whether we have a more specific rule
| 320 | +			if ( ( $currentStart == 0 || !$alphamap[$currentStart-1]) && $alphamap[$currentStart] ) {
| 321 | +				$try = $current . self::FIRST;
| 322 | +				if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
| 323 | +					$lastTrans = $map[$try];
| 324 | +				}
| 325 | +			}
| 326 | +
| 327 | +			// We had no match at all, pass through one character
| 328 | +			if ( is_null( $lastTrans ) ) {
| 329 | +
| 330 | +				$lastLetter = $letters[$lastMatch];
| 331 | +				$lastLower = $sensitive ? $lastLetter : mb_strtolower( $lastLetter );
| 332 | +
| 333 | +				// If we are not being sensitive, we can try down-casing the previous letter
| 334 | +				if ( $lastLetter != $lastLower ) {
| 335 | +					$ucfirst = true;
| 336 | +					$letters[$lastMatch] = $lastLower; // positions are not advanced, so the next pass retries this letter in lower case
| 337 | +					$lastUpper = $lastLetter;
| 338 | +
| 339 | +				// Might be nice to output a ? if we don't understand
| 340 | +				} else if ( isset( $map[''] ) ) {
| 341 | +
| 342 | +					if ( $ucfirst ) {
| 343 | +						$output .= str_replace( '$1', $lastUpper , $map[''] );
| 344 | +						$ucfirst = false;
| 345 | +					} else {
| 346 | +						$output .= str_replace( '$1', $lastLetter, $map[''] );
| 347 | +					}
| 348 | +					$i = $currentStart = ++$lastMatch;
| 349 | +					$current = "";
| 350 | +
| 351 | +				// Or the input if it's likely to be correct enough
| 352 | +				} else {
| 353 | +
| 354 | +					if ( $ucfirst ) {
| 355 | +						$output .= $lastUpper;
| 356 | +						$ucfirst = false;
| 357 | +					} else {
| 358 | +						$output .= $lastLetter;
| 359 | +					}
| 360 | +					$i = $currentStart = ++$lastMatch;
| 361 | +					$current = "";
| 362 | +				}
| 363 | +
| 364 | +			// Output the previous match
| 365 | +			} else {
| 366 | +
| 367 | +				if ( $ucfirst ) {
| 368 | +					$output .= mb_strtoupper( mb_substr( $lastTrans, 0, 1 ) ).mb_substr( $lastTrans, 1 );
| 369 | +					$ucfirst = false;
| 370 | +				} else {
| 371 | +					$output .= $lastTrans;
| 372 | +				}
| 373 | +				$i = $currentStart = ++$lastMatch; // restart just after the last matched character, re-scanning any look-ahead
| 374 | +				$lastTrans = null;
| 375 | +				$current = "";
| 376 | +
| 377 | +			}
| 378 | +		}
| 379 | +		return $output;
| 380 | +	}
| 381 | + |
| 382 | +	/**
| 383 | +	 * {{#transliterate:<mapname>|<word>[|<format>[|<answer>[|<onerror>]]]}}
| 384 | +	 *
| 385 | +	 * Direct usage will generally be of the form {{#transliterate:<mapname>|<word>}} while
| 386 | +	 * generic templates may find the latter three parameters invaluable for easy use.
| 387 | +	 *
| 388 | +	 * $mapname is the name of the transliteration map to find.
| 389 | +	 * $word is the string to transliterate (if the map was found)
| 390 | +	 * $format is a string containing $1 to be replaced by the transliteration if the map exists
| 391 | +	 * $answer allows for a user-specified transliteration to override the automatic one
| 392 | +	 * $other is an error message to display if $answer is blank and an invalid map is specified
| 393 | +	 */
| 394 | +	function render( &$parser, $mapname = '', $word = '', $format = '$1', $answer = '', $other = '' ) {
| 395 | +
| 396 | +		if ( trim( $format ) == '') { // Handle the case when people use {{#transliterate:<>|<>||<>}}
| 397 | +			$format = '$1';
| 398 | +		}
| 399 | +		// A user-supplied answer short-circuits everything else, including the map lookup
| 400 | +		if ( trim( $answer ) != '') {
| 401 | +			return str_replace('$1', $answer, $format);
| 402 | +		}
| 403 | +
| 404 | +		$prefix = wfMsg( 'transliterator-prefix' );
| 405 | +		$title = Title::newFromText( $prefix . $mapname, NS_MEDIAWIKI );
| 406 | +		// No usable title: echo the original call back inside $format unless an error text was supplied
| 407 | +		if (! $title ) {
| 408 | +			return $other == '' ? str_replace("$1", "{{#transliterate:$mapname|$word}}", $format) : $other;
| 409 | +		}
| 410 | +
| 411 | +		$mappage = $title->getDBkey();
| 412 | +
| 413 | +		$map = $this->getMap( $prefix, $mappage );
| 414 | +
| 415 | +		if ( !$map ) { // False if map was not found
| 416 | +			$output = $other;
| 417 | +
| 418 | +		} else if ( is_string( $map ) ) { // An error message
| 419 | +			$output = '<span class="transliterator error"> '.$map.' </span>';
| 420 | +
| 421 | +		} else { // A Map
| 422 | +			$trans = UtfNormal::toNFC( $this->transliterate( Sanitizer::decodeCharReferences( $word ), $map ) );
| 423 | +			$output = str_replace( '$1', $trans, $format );
| 424 | +		}
| 425 | +
| 426 | +		// Populate the dependency table so that we get re-rendered if the map changes.
| 427 | +		if ( isset( $this->mPages[$mappage] ) )
| 428 | +			$parser->mOutput->addTemplate( $title, $this->mPages[$mappage], null );
| 429 | +
| 430 | +		else
| 431 | +			$parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
| 432 | +
| 433 | +		return $output;
| 434 | +	}
| 435 | + |
| 436 | +	/** Called on ArticlePurge, ArticleDeleteComplete and NewRevisionFromEditComplete in order to purge cache; always returns true. */
| 437 | +	static function purgeArticle( &$article ) {
| 438 | +		// purgeTitle() takes its argument by reference, so it must be given a variable rather than a call result
| 439 | +		$title = $article->getTitle();
| 440 | +		return self::purgeTitle( $title );
| 441 | +	}
| 442 | + |
| 443 | +	/**
| 444 | +	 * Called on TitleMoveComplete; purges the cache entries for the destination title only.
| 445 | +	 */
| 446 | +	static function purgeNewTitle ( &$title, &$newtitle ) {
| 447 | +		return self::purgeTitle( $newtitle ); // the old $title is not purged here
| 448 | +	}
| 449 | + |
| 450 | +	/**
| 451 | +	 * Called on ArticleUndelete (and by other purge hook handlers). Deletes the cached map and the cached list of map names when $title is a map page; always returns true.
| 452 | +	 */
| 453 | +	static function purgeTitle( &$title ) {
| 454 | +		global $wgMemc;
| 455 | +		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
| 456 | +			$text = $title->getText();
| 457 | +			$prefix = wfMsg( 'transliterator-prefix' );
| 458 | +			if ( strpos( $text, $prefix ) === 0 ) { // only pages whose title starts with the map prefix
| 459 | +				$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, $title->getDBkey() ) );
| 460 | +				$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, "__map_names__" ) );
| 461 | +			}
| 462 | +		}
| 463 | +		return true;
| 464 | +
| 465 | +	}
| 466 | + |
| 467 | +	/**
| 468 | +	 * Called on first use to create singleton: registers one instance's render() as the handler for the 'transliterate' parser function. Always returns true.
| 469 | +	 */
| 470 | +	static function setup( &$parser ) {
| 471 | +		$trans = new ExtTransliterator;
| 472 | +		$parser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );
| 473 | +		return true;
| 474 | +	}
| 475 | +} |
| 476 | +?> |
Property changes on: trunk/extensions/Transliterator/Transliterator_body.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 477 | + native |
Index: trunk/extensions/Transliterator/Transliterator.php |
— | — | @@ -64,479 +64,14 @@ |
65 | 65 | 'path' => __FILE__, |
66 | 66 | ); |
67 | 67 | |
| 68 | +$wgAutoloadClasses['ExtTransliterator'] = dirname( __FILE__ ) . "/Transliterator_body.php"; |
| 69 | +$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__) . '/Transliterator.i18n.php'; |
| 70 | + |
68 | 71 | $wgHooks['ParserFirstCallInit'][] = 'ExtTransliterator::setup'; |
69 | | -$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__).'/Transliterator.i18n.php'; |
70 | 72 | $wgHooks['ArticleDeleteComplete'][] = 'ExtTransliterator::purgeArticle'; |
71 | 73 | $wgHooks['NewRevisionFromEditComplete'][] = 'ExtTransliterator::purgeArticle'; |
72 | 74 | $wgHooks['ArticlePurge'][] = 'ExtTransliterator::purgeArticle'; |
73 | 75 | $wgHooks['ArticleUndelete'][] = 'ExtTransliterator::purgeTitle'; |
74 | 76 | $wgHooks['TitleMoveComplete'][] = 'ExtTransliterator::purgeNewtitle'; |
75 | 77 | |
76 | | -class ExtTransliterator { |
77 | | - |
78 | | - const FIRST = "\x1F"; // A character that will be appended when ^ should match at the start |
79 | | - const LAST = "\x1E"; // A character that will be appended when $ should match at the end |
80 | | - const CACHE_PREFIX = "extTransliterator.2:"; // The prefix to use for cache items (the number should be incremented when the map format changes) |
81 | | - var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template. |
82 | | - var $mMaps = array();// An Array of "$mapname" => The map parsed from that page. |
83 | | - |
84 | | - /** |
85 | | - * Split a word into letters (not bytes or codepoints) implicitly in NFC due to MediaWiki. |
86 | | - */ |
87 | | - function letters( $word ) { |
88 | | - global $utfCombiningClass; |
89 | | - UtfNormal::loadData(); |
90 | | - |
91 | | - $split = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); |
92 | | - |
93 | | - $i = 1; |
94 | | - while ( $i < count( $split ) ) { |
95 | | - if ( isset( $utfCombiningClass[$split[$i]] ) ) { |
96 | | - $split[$i - 1] .= $split[$i]; |
97 | | - unset( $split[$i] ); |
98 | | - |
99 | | - } else { |
100 | | - $i++; |
101 | | - |
102 | | - } |
103 | | - } |
104 | | - |
105 | | - return $split; |
106 | | - } |
107 | | - |
108 | | - /** |
109 | | - * Split a word into the NFD codepoints that make it up. |
110 | | - */ |
111 | | - function codepoints( $word ) { |
112 | | - $word = UtfNormal::toNFD( $word ); |
113 | | - return preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ); |
114 | | - } |
115 | | - |
116 | | - /** |
117 | | - * Given a codepoints or letters array returns a list that contains 1 for every |
118 | | - * alphabetic character and accent, and 0 otherwise. This allows for edge-of-word |
119 | | - * detection. |
120 | | - */ |
121 | | - function alphamap( $letters ) { |
122 | | - |
123 | | - $output = Array(); |
124 | | - $count = count($letters); |
125 | | - |
126 | | - for ($i = 0; $i < $count; $i++) { |
127 | | - $output[] = preg_match( '/\pL/u', $letters[$i]) || isset( $utfCombiningClass[$letters[$i]] ); |
128 | | - } |
129 | | - |
130 | | - return $output; |
131 | | - } |
132 | | - |
133 | | - /** |
134 | | - * Get all the existing maps in one query, useful given that the default |
135 | | - * behaviour of failing silently is designed to allow it to be used by |
136 | | - * templates that don't know if a map exists, so may try far too often. |
137 | | - */ |
138 | | - function getExistingMapNames( $prefix ) { |
139 | | - global $wgMemc; |
140 | | - |
141 | | - // Have we used it on this page already? |
142 | | - if ( ! is_null($this->mPages) ) |
143 | | - return $this->mPages; |
144 | | - |
145 | | - // Have we used it recently? |
146 | | - $cached = $wgMemc->get( self::CACHE_PREFIX . "__map_names__" ); |
147 | | - if ( $cached ) |
148 | | - return $this->mPages = $cached; |
149 | | - |
150 | | - $dbr = wfGetDB( DB_SLAVE ); |
151 | | - $res = $dbr->select( 'page', |
152 | | - array( '*' ), |
153 | | - array( |
154 | | - 'page_namespace' => NS_MEDIAWIKI, |
155 | | - 'page_title LIKE \'' . $dbr->escapeLike( $prefix ) .'%\'' |
156 | | - ), |
157 | | - __METHOD__ |
158 | | - ); |
159 | | - |
160 | | - $this->mPages = Array(); |
161 | | - |
162 | | - while ( $r = $res->fetchObject() ) { |
163 | | - $this->mPages[$r->page_title] = $r->page_id; |
164 | | - } |
165 | | - |
166 | | - $wgMemc->set( self::CACHE_PREFIX . "__map_names__", $this->mPages ); |
167 | | - return $this->mPages; |
168 | | - } |
169 | | - /** |
170 | | - * Get a map function, either from the local cache or from the page, |
171 | | - */ |
172 | | - function getMap( $prefix, $mappage ) { |
173 | | - global $wgMemc; |
174 | | - |
175 | | - // Have we used it on this page already? |
176 | | - if ( isset( $this->mMaps[$mappage] ) ) { |
177 | | - return $this->mMaps[$mappage]; |
178 | | - } |
179 | | - |
180 | | - // Does it exist at all? |
181 | | - $existing = $this->getExistingMapNames( $prefix ); |
182 | | - if ( isset( $existing[$mappage] ) ) { |
183 | | - |
184 | | - // Have we used it recently? |
185 | | - $map = $wgMemc->get( self::CACHE_PREFIX . $mappage ); |
186 | | - if (! $map ) { |
187 | | - |
188 | | - $map = $this->readMap( wfMsg( $mappage ), $mappage ); |
189 | | - |
190 | | - if ( $map ) |
191 | | - $wgMemc->set( self::CACHE_PREFIX . $mappage, $map); |
192 | | - } |
193 | | - |
194 | | - } else { |
195 | | - $map = false; |
196 | | - } |
197 | | - |
198 | | - return $this->mMaps[$mappage] = $map; |
199 | | - } |
200 | | - |
201 | | - /** |
202 | | - * Parse a map input syntax into a map. |
203 | | - * |
204 | | - * Input syntax is a set of lines. |
205 | | - * All " " are ignored. |
206 | | - * Lines starting with # are ignored, remaining lines are split by => |
207 | | - * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints) |
208 | | - * |
209 | | - * The map created is a set of "from" strings to "to" strings |
210 | | - * With extra "from" => true for all substrings of "from" strings |
211 | | - * So that the transliteration algorithm knows when it has found the longest match |
212 | | - * |
213 | | - * $map[''] is used as the default fall through for any characters not in the map |
214 | | - * $map['__decompose__'] indicates that NFD should be used instead of characters |
215 | | - * $map['__sensitive__'] indicates that the automatic first-letter upper-case fall-through should not be tried |
216 | | - */ |
217 | | - function readMap( $input, $mappage ) { |
218 | | - global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize; |
219 | | - |
220 | | - $map = array(); |
221 | | - $decompose = false; |
222 | | - |
223 | | - // Split lines and remove whitespace at beginning and end |
224 | | - $lines = preg_split( "/(^|\s*\n)(\s*(#[^\n]*)?\n)*\s*/", $input."\n" ); |
225 | | - |
226 | | - $count = count( $lines ); |
227 | | - |
228 | | - if ( $count > 0 && $lines[0] == "" ) { |
229 | | - array_shift( $lines ); |
230 | | - $count--; |
231 | | - } |
232 | | - |
233 | | - if ( $count > 0 && $lines[$count - 1] == "" ) { |
234 | | - array_pop( $lines ); |
235 | | - $count--; |
236 | | - } |
237 | | - |
238 | | - // The only content was comments |
239 | | - if ( $count == 0 ) |
240 | | - return false; |
241 | | - |
242 | | - // The first line can contain flags |
243 | | - $first_line = $lines[0]; |
244 | | - if ( strpos( $first_line, "=>") === FALSE ) { |
245 | | - // Or, could just signify that the message was blank |
246 | | - if ( $first_line == "<$mappage>") |
247 | | - return false; |
248 | | - else if ( preg_replace( '/<(decompose|sensitive)>/', '', $first_line ) != '') |
249 | | - return wfMsg( 'transliterator-error-syntax', $first_line, $mappage ); |
250 | | - |
251 | | - if ( strpos( $first_line, "<decompose>" ) !== FALSE ) { |
252 | | - $map['__decompose__'] = true; |
253 | | - $decompose = true; |
254 | | - } |
255 | | - if ( strpos( $first_line, "<sensitive>" ) !== FALSE ) { |
256 | | - $map['__sensitive__'] = true; |
257 | | - } |
258 | | - array_shift( $lines ); |
259 | | - $count--; |
260 | | - } |
261 | | - |
262 | | - if ( $count > $wgTransliteratorRuleCount ) |
263 | | - return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage ); |
264 | | - |
265 | | - foreach ( $lines as $line ) { |
266 | | - |
267 | | - $pair = preg_split( '/\s*=>\s*/', $line ); |
268 | | - |
269 | | - if ( count( $pair ) != 2 ) |
270 | | - return wfMsg( "transliterator-error-syntax", $line, $mappage ); |
271 | | - |
272 | | - $from = $pair[0]; |
273 | | - $to = Sanitizer::decodeCharReferences( $pair[1], ENT_QUOTES, 'UTF-8' ); |
274 | | - |
275 | | - // Convert the ^ and $ selectors into special characters for matching |
276 | | - // Leave single ^ and $'s alone incase someone wants to use them |
277 | | - // Still permits the creation of the rule "^$=>" that will never match, but hey |
278 | | - $fromlast = strlen( $from ) - 1; |
279 | | - if ( $fromlast > 0 ) { |
280 | | - if ( $from[0] == "^" ) { |
281 | | - $from = substr( $from, 1 ) . self::FIRST; |
282 | | - $fromlast--; |
283 | | - } |
284 | | - |
285 | | - if ( $from[$fromlast] == "$") |
286 | | - $from[$fromlast] = self::LAST; |
287 | | - } |
288 | | - |
289 | | - // Now we've looked at our syntax we can remove html escaping to reveal the true form |
290 | | - $from = Sanitizer::decodeCharReferences( $from, ENT_QUOTES, 'UTF-8' ); |
291 | | - if ( $decompose ) { // Undo the NFCing of MediaWiki |
292 | | - $from = UtfNormal::toNFD( $from ); |
293 | | - } |
294 | | - |
295 | | - // If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule |
296 | | - if ( isset( $map[$from] ) ) { |
297 | | - |
298 | | - // Or a rule of the same length, i.e. the same rule. |
299 | | - if ( is_string( $map[$from] ) && $to != $map[$from] ) |
300 | | - return wfMsg("transliterator-error-ambiguous", $line, $mappage); |
301 | | - |
302 | | - } else if ( strlen( $from ) > 1 ){ |
303 | | - |
304 | | - // Bail if the left hand side is too long (has performance implications otherwise) |
305 | | - $fromlen = strlen( $from ); |
306 | | - if ( $fromlen > $wgTransliteratorRuleSize ) |
307 | | - return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize ); |
308 | | - |
309 | | - // Fill in the blanks, so that we know when to stop looking while transliterating |
310 | | - for ( $i = 1; $i < $fromlen; $i++ ) { |
311 | | - $substr = substr( $from, 0, $i ); |
312 | | - |
313 | | - if (! isset( $map[$substr] ) ) |
314 | | - $map[$substr] = true; |
315 | | - } |
316 | | - } // else we have the default rule |
317 | | - |
318 | | - $map[$from] = $to; |
319 | | - } |
320 | | - |
321 | | - return $map; |
322 | | - } |
323 | | - |
/**
 * Transliterate a word by iteratively finding the longest substring from
 * the start of the untransliterated string that we have a rule for, and
 * transliterating it.
 *
 * How $map is read by this method:
 *  - $map[<substring>] being a string means "transliterate <substring> to
 *    that string"; being set to anything else means only "keep looking, a
 *    longer key starting with this substring may exist".
 *  - $map[<substring> . self::LAST] and $map[<substring> . self::FIRST] are
 *    more specific rules that are consulted when the candidate sits at the
 *    end / start of a word, as judged by $this->alphamap().
 *  - $map[''] (optional) is the default rule used for letters without any
 *    rule; '$1' inside it is replaced by the letter.
 *  - $map['__decompose__'] (optional): split the word with codepoints()
 *    instead of letters().
 *  - $map['__sensitive__'] (optional): do not retry unmatched upper-case
 *    letters in lower case.
 *
 * @param $word String: the text to transliterate
 * @param $map Array: the rules, in the format described above
 * @return String: the transliterated text
 */
function transliterate( $word, $map )
{
	if ( isset( $map["__decompose__"] ) ) {
		$letters = $this->codepoints( $word );
	} else {
		$letters = $this->letters( $word );
	}

	// letters() drops merged combining marks with unset() and does not
	// reindex, so its result can have missing (or out-of-order) integer
	// keys. Everything below addresses $letters by position 0..count-1,
	// so normalise it to a plain list first.
	ksort( $letters );
	$letters = array_values( $letters );

	$alphamap = $this->alphamap( $letters );

	$sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
	$ucfirst = false;          // (Case-insensitive mode only) the first letter of the current run was upper-case originally
	$last_upper = null;        // We have lower-cased the current letter, but we need to keep track of the original (dotted I for example)

	$output = "";              // The output
	$last_match = 0;           // The position of the last character matched, or the first character of the current run
	$last_trans = null;        // The transliteration of the last character matched, or null if nothing has matched in the current run
	$i = 0;                    // The current position in the string
	$count = count( $letters ); // The total number of characters in the string
	$current = "";             // The substring that we are currently trying to find the longest match for.
	$current_start = 0;        // The position that $current starts at

	while ( $last_match < $count ) {

		if ( $i < $count ) {

			$next = $current . $letters[$i];

			// There may be a match longer than $current
			if ( isset( $map[$next] ) ) {

				// In fact, $next is a match
				if ( is_string( $map[$next] ) ) {
					$last_match = $i;
					$last_trans = $map[$next];
				}

				$i++;
				$current = $next;
				continue;
			}
		}

		// From here on $current cannot be extended any further.

		// If this match is at the end of a word, see whether we have a more specific rule.
		// $i is still 0 when the very first letter has no rule at all; there is
		// no preceding letter to inspect then, so do not read $alphamap[-1].
		// NOTE(review): the word-end / word-start overrides below replace
		// $last_trans but leave $last_match alone; if $current is longer than
		// the last plain match, only the shorter span is consumed afterwards.
		// Confirm the map builder guarantees that cannot happen.
		if ( $i > 0 && $alphamap[$i - 1] && ( $i == $count || !$alphamap[$i] ) ) {
			$try = $current . self::LAST;
			if ( isset( $map[$try] ) ) {
				if ( is_string( $map[$try] ) ) {
					$last_trans = $map[$try];
				}
				// A rule anchored at both ends exists: let the start-of-word
				// check below look it up as $try . self::FIRST.
				if ( isset( $map[$try . self::FIRST] ) ) {
					$current = $try;
				}
			}
		}

		// If this match is at the start of a word, see whether we have a more specific rule
		if ( ( $current_start == 0 || !$alphamap[$current_start - 1] ) && $alphamap[$current_start] ) {
			$try = $current . self::FIRST;
			if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
				$last_trans = $map[$try];
			}
		}

		if ( is_null( $last_trans ) ) {
			// We had no match at all for the letter at $last_match

			$last_letter = $letters[$last_match];
			$last_lower = $sensitive ? $last_letter : mb_strtolower( $last_letter );

			if ( $last_letter !== $last_lower ) {
				// We are not being sensitive, so retry this position with the
				// lower-cased letter and remember to re-capitalise the output.
				$ucfirst = true;
				$letters[$last_match] = $last_lower;
				$last_upper = $last_letter;

			} else {
				// Pass one character through, restoring the original case if
				// we lower-cased it on a previous iteration.
				$passthrough = $ucfirst ? $last_upper : $last_letter;
				$ucfirst = false;

				if ( isset( $map[''] ) ) {
					// The map has a default rule (e.g. to output a ? for
					// letters we do not understand)
					$output .= str_replace( '$1', $passthrough, $map[''] );
				} else {
					// Or the input if it's likely to be correct enough
					$output .= $passthrough;
				}

				$i = $current_start = ++$last_match;
				$current = "";
			}

		} else {
			// Output the previous match, capitalising its first character if
			// the source letter was upper-case.

			if ( $ucfirst ) {
				$output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ) . mb_substr( $last_trans, 1 );
				$ucfirst = false;
			} else {
				$output .= $last_trans;
			}

			// Restart matching right after the last letter that was consumed
			$i = $current_start = ++$last_match;
			$last_trans = null;
			$current = "";
		}
	}
	return $output;
}
448 | | - |
/**
 * Handler for the parser function
 * {{#transliterate:<mapname>|<word>[|<format>[|<answer>[|<other>]]]}}
 *
 * Direct usage will generally be of the form {{#transliterate:<mapname>|<word>}} while
 * generic templates may find the latter three parameters invaluable for easy use.
 *
 * @param $parser Parser: the parser; the map page is recorded on its output as a template
 * @param $mapname String: the name of the transliteration map to find
 * @param $word String: the string to transliterate (if the map was found)
 * @param $format String: a string containing $1 to be replaced by the transliteration;
 *                blank is treated as '$1'
 * @param $answer String: a user-specified transliteration that overrides the automatic one
 * @param $other String: what to output if $answer is blank and the map cannot be used
 * @return String: the text to substitute for the parser function call
 */
function render( &$parser, $mapname = '', $word = '', $format = '$1', $answer = '', $other = '' ) {

	// Handle the case when people use {{#transliterate:<>|<>||<>}}
	if ( trim( $format ) === '' ) {
		$format = '$1';
	}

	// A supplied answer wins outright; no map is looked up at all
	if ( trim( $answer ) !== '' ) {
		return str_replace( '$1', $answer, $format );
	}

	$prefix = wfMsg( 'transliterator-prefix' );
	$title = Title::newFromText( $prefix . $mapname, NS_MEDIAWIKI );

	// The map name does not form a valid title: show $other, or failing
	// that the original call formatted as if it were the result.
	if ( !$title ) {
		if ( $other == '' ) {
			return str_replace( "$1", "{{#transliterate:$mapname|$word}}", $format );
		}
		return $other;
	}

	$dbKey = $title->getDBkey();
	$mapData = $this->getMap( $prefix, $dbKey );

	if ( !$mapData ) {
		// No map was found
		$result = $other;

	} elseif ( is_string( $mapData ) ) {
		// getMap() handed back an error message instead of a map
		$result = '<span class="transliterator error"> '.$mapData.' </span>';

	} else {
		// A usable map: decode character references, transliterate, and
		// bring the result back to NFC before formatting it.
		$decoded = Sanitizer::decodeCharReferences( $word );
		$converted = UtfNormal::toNFC( $this->transliterate( $decoded, $mapData ) );
		$result = str_replace( '$1', $converted, $format );
	}

	// Populate the dependency table so that we get re-rendered if the map changes.
	// Prefer the value remembered in $this->mPages for this page, else ask the title.
	$pageId = isset( $this->mPages[$dbKey] )
		? $this->mPages[$dbKey]
		: $title->getArticleID();
	$parser->mOutput->addTemplate( $title, $pageId, null );

	return $result;
}
502 | | - |
/**
 * Called on ArticlePurge, ArticleDeleteComplete and NewRevisionFromEditComplete in order to purge cache
 *
 * @param $article Object: the article the hook fired for; only getTitle() is used
 * @param $a, $b, $c, $d Mixed: further hook arguments, ignored
 * @return Boolean: whatever purgeTitle() returns
 */
static function purgeArticle( &$article, $a=false, $b=false, $c=false, $d=false ) {
	// purgeTitle() takes its argument by reference, so it must be given a
	// variable rather than the result of a method call.
	$title = $article->getTitle();
	return self::purgeTitle( $title );
}
509 | | - |
/**
 * Called on TitleMoveComplete
 *
 * Purges the cache for the title the page was moved to; the old title
 * and the remaining hook arguments are not used.
 *
 * @return Boolean: whatever purgeTitle() returns
 */
static function purgeNewTitle ( &$title, &$newtitle, $a=false, $b=false, $c=false ) {
	$hookResult = self::purgeTitle( $newtitle );
	return $hookResult;
}
516 | | - |
/**
 * Called on ArticleUndelete (and by other purge hook handlers)
 *
 * If $title is in the MediaWiki namespace and its text starts with the
 * 'transliterator-prefix' message, delete the cache entry for that page
 * and the cached "__map_names__" entry. Any other title is left alone.
 *
 * @param $title Title: the page that changed
 * @param $a Mixed: further hook argument, ignored
 * @return Boolean: always true
 */
static function purgeTitle( &$title, $a=false ) {
	global $wgMemc;

	// Maps only live in the MediaWiki namespace
	if ( $title->getNamespace() != NS_MEDIAWIKI ) {
		return true;
	}

	$text = $title->getText();
	$prefix = wfMsg( 'transliterator-prefix' );

	// Not a transliteration map page: nothing of ours to purge
	if ( strpos( $text, $prefix ) !== 0 ) {
		return true;
	}

	// Drop the entry for this page, then the entry listing the map names
	foreach ( array( $title->getDBkey(), "__map_names__" ) as $suffix ) {
		$wgMemc->delete( self::CACHE_PREFIX . $suffix );
	}

	return true;
}
533 | | - |
/**
 * Called on first use to create singleton
 *
 * Creates an ExtTransliterator and registers its render() method as the
 * handler of the 'transliterate' parser function.
 *
 * @param $parser Parser: the parser to register the function hook on
 * @return Boolean: always true
 */
static function setup( &$parser ) {
	$handler = array( new self(), 'render' );
	$parser->setFunctionHook( 'transliterate', $handler );
	return true;
}
542 | | -} |
543 | | - |
| 78 | +?> |