r57423 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r57422‎ | r57423 | r57424 >
Date:11:10, 6 October 2009
Author:conrad
Status:deferred
Tags:
Comment:
Minor style issues
Modified paths:
  • /trunk/extensions/Transliterator/Transliterator.php (modified) (history)
  • /trunk/extensions/Transliterator/Transliterator_body.php (added) (history)

Diff [purge]

Index: trunk/extensions/Transliterator/Transliterator_body.php
@@ -0,0 +1,475 @@
 2+<?php
 3+
 4+if ( !defined( 'MEDIAWIKI' ) ) {
 5+ die( 'This file is a MediaWiki extension, not a valid entry point.' );
 6+}
 7+
/**
 * Implements the {{#transliterate:}} parser function.
 *
 * Transliteration maps are stored as pages in the MediaWiki namespace whose
 * titles start with the 'transliterator-prefix' message. Each map is parsed
 * by readMap() into a lookup table which transliterate() then applies using
 * a longest-match strategy. Parsed maps and the list of existing map pages
 * are cached both per request (in this object) and in $wgMemc.
 */
class ExtTransliterator {

	const FIRST = "\x1F"; // A character that will be appended when ^ should match at the start
	const LAST = "\x1E"; // A character that will be appended when $ should match at the end
	const CACHE_PREFIX = "extTransliterator.2"; // The prefix to use for cache items (the number should be incremented when the map format changes)

	public $mPages = null; // An array of map page title (DB key) => page id, or null if not loaded yet.
	public $mMaps = array(); // An array of map page title (DB key) => the map parsed from that page (false or an error string on failure).

	/**
	 * Split a word into letters (not bytes or codepoints) implicitly in NFC due to MediaWiki.
	 *
	 * Every codepoint found in $utfCombiningClass is glued onto the letter
	 * that precedes it. A combining codepoint at the very start of the word
	 * has nothing to attach to and is kept as a letter of its own.
	 *
	 * @param $word String to split
	 * @return Array list (indexed 0..n-1 without gaps) of letters
	 */
	function letters( $word ) {
		global $utfCombiningClass;
		UtfNormal::loadData();

		$codepoints = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
		if ( $codepoints === false ) {
			// preg_split() failed (e.g. invalid UTF-8): there is nothing we can split
			return array();
		}

		// Build a fresh list rather than unset()ing entries in place: removing
		// entries while iterating by index leaves holes and shortens count(),
		// which made later combining characters get skipped.
		$letters = array();
		$last = -1;
		foreach ( $codepoints as $codepoint ) {
			if ( $last >= 0 && isset( $utfCombiningClass[$codepoint] ) ) {
				$letters[$last] .= $codepoint;
			} else {
				$letters[] = $codepoint;
				$last++;
			}
		}

		return $letters;
	}

	/**
	 * Split a word into the NFD codepoints that make it up.
	 *
	 * @param $word String to split
	 * @return Array list of codepoints (each as a UTF-8 string)
	 */
	function codepoints( $word ) {
		$word = UtfNormal::toNFD( $word );
		$codepoints = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

		// preg_split() returns false on failure; callers expect an array
		return $codepoints === false ? array() : $codepoints;
	}

	/**
	 * Given a codepoints or letters array returns a parallel list that contains
	 * true for every entry that contains a letter or is a combining character,
	 * and false otherwise. This allows for edge-of-word detection.
	 *
	 * @param $letters Array as returned by letters() or codepoints()
	 * @return Array list of booleans, one per entry of $letters
	 */
	function alphamap( $letters ) {
		// Without this declaration the isset() below looks at an undefined
		// local variable and combining characters are never seen as alphabetic.
		global $utfCombiningClass;

		$output = array();

		foreach ( $letters as $letter ) {
			$output[] = preg_match( '/\pL/u', $letter ) || isset( $utfCombiningClass[$letter] );
		}

		return $output;
	}

	/**
	 * Get all the existing maps in one query, useful given that the default
	 * behaviour of failing silently is designed to allow it to be used by
	 * templates that don't know if a map exists, so may try far too often.
	 *
	 * @param $prefix String that the titles of map pages start with
	 * @return Array of page title (DB key) => page id
	 */
	function getExistingMapNames( $prefix ) {
		global $wgMemc;

		// Have we used it on this page already?
		if ( !is_null( $this->mPages ) ) {
			return $this->mPages;
		}

		// Have we used it recently? Test the type rather than the truthiness,
		// so that a cached empty list (no maps defined) also counts as a hit.
		$cacheKey = wfMemcKey( self::CACHE_PREFIX, "__map_names__" );
		$cached = $wgMemc->get( $cacheKey );
		if ( is_array( $cached ) ) {
			return $this->mPages = $cached;
		}

		$dbr = wfGetDB( DB_SLAVE );
		$res = $dbr->select( 'page',
			array( 'page_title', 'page_id' ), // the only two columns used below
			array(
				'page_namespace' => NS_MEDIAWIKI,
				'page_title LIKE \'' . $dbr->escapeLike( $prefix ) . '%\''
			),
			__METHOD__
		);

		$this->mPages = array();

		while ( $r = $res->fetchObject() ) {
			$this->mPages[$r->page_title] = $r->page_id;
		}

		$wgMemc->set( $cacheKey, $this->mPages );
		return $this->mPages;
	}

	/**
	 * Get a parsed map, either from the per-request cache, from $wgMemc, or
	 * by parsing the page.
	 *
	 * @param $prefix String that the titles of map pages start with
	 * @param $mappage String DB key of the map page
	 * @return Mixed the map (Array), an error message (String), or false if
	 *         the page does not exist or contains no rules
	 */
	function getMap( $prefix, $mappage ) {
		global $wgMemc;

		// Have we used it on this page already?
		if ( isset( $this->mMaps[$mappage] ) ) {
			return $this->mMaps[$mappage];
		}

		// Does it exist at all?
		$existing = $this->getExistingMapNames( $prefix );
		if ( isset( $existing[$mappage] ) ) {

			// Have we used it recently?
			$cacheKey = wfMemcKey( self::CACHE_PREFIX, $mappage );
			$map = $wgMemc->get( $cacheKey );
			if ( !$map ) {

				$map = $this->readMap( wfMsg( $mappage ), $mappage );

				if ( $map ) {
					$wgMemc->set( $cacheKey, $map );
				}
			}

		} else {
			$map = false;
		}

		return $this->mMaps[$mappage] = $map;
	}

	/**
	 * Returns true if the line might contain something useful, false otherwise.
	 * Empty lines and lines starting with # are not useful.
	 *
	 * @param $line String
	 * @return Boolean
	 */
	static function is_useful_line( $line ) {
		return $line != "" && substr( $line, 0, 1 ) != '#';
	}

	/**
	 * Parse a map input syntax into a map.
	 *
	 * Input syntax is a set of lines.
	 * Whitespace around each line and around => is ignored.
	 * Lines starting with # are ignored, remaining lines are split by =>
	 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
	 * The first line may instead hold the flags <decompose> and/or <sensitive>.
	 *
	 * The map created is a set of "from" strings to "to" strings
	 * With extra "from" => true for all leading substrings of "from" strings
	 * So that the transliteration algorithm knows when it has found the longest match
	 *
	 * $map[''] is used as the default fall through for any characters not in the map
	 * $map['__decompose__'] indicates that NFD should be used instead of characters
	 * $map['__sensitive__'] indicates that the automatic first-letter upper-case fall-through should not be tried
	 *
	 * @param $input String text of the map page
	 * @param $mappage String DB key of the map page, used in error messages
	 * @return Mixed the map (Array), an error message (String), or false if
	 *         the input holds no rules at all
	 */
	function readMap( $input, $mappage ) {
		global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;

		$map = array();
		$decompose = false;

		// Split lines and remove whitespace at beginning and end
		$lines = preg_split( "/\s*\n\s*/", trim( $input ) );
		$lines = array_values( array_filter( $lines, array( __CLASS__, 'is_useful_line' ) ) );

		$count = count( $lines );

		// The only content was comments
		if ( $count == 0 ) {
			return false;
		}

		// The first line can contain flags
		$firstLine = $lines[0];
		if ( strpos( $firstLine, "=>" ) === false ) {
			// Or, could just signify that the message was blank
			if ( $firstLine == "<$mappage>" ) {
				return false;
			} elseif ( preg_replace( '/<(decompose|sensitive)>/', '', $firstLine ) != '' ) {
				return wfMsg( 'transliterator-error-syntax', $firstLine, $mappage );
			}

			if ( strpos( $firstLine, "<decompose>" ) !== false ) {
				$map['__decompose__'] = true;
				$decompose = true;
			}
			if ( strpos( $firstLine, "<sensitive>" ) !== false ) {
				$map['__sensitive__'] = true;
			}
			array_shift( $lines );
			$count--;
		}

		if ( $count > $wgTransliteratorRuleCount ) {
			return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );
		}

		foreach ( $lines as $line ) {

			$pair = preg_split( '/\s*=>\s*/', $line );

			if ( count( $pair ) != 2 ) {
				return wfMsg( "transliterator-error-syntax", $line, $mappage );
			}

			$from = $pair[0];
			$to = Sanitizer::decodeCharReferences( $pair[1] );

			// Convert the ^ and $ selectors into special characters for matching
			// Leave single ^ and $'s alone in case someone wants to use them
			// Still permits the creation of the rule "^$=>" that will never match, but hey
			$fromlast = strlen( $from ) - 1;
			if ( $fromlast > 0 ) {
				if ( $from[0] == "^" ) {
					// The marker moves to the end, so the original last
					// character is now one position earlier.
					$from = substr( $from, 1 ) . self::FIRST;
					$fromlast--;
				}

				if ( $from[$fromlast] == "$" ) {
					$from[$fromlast] = self::LAST;
				}
			}

			// Now we've looked at our syntax we can remove html escaping to reveal the true form
			$from = Sanitizer::decodeCharReferences( $from );
			if ( $decompose ) { // Undo the NFCing of MediaWiki
				$from = UtfNormal::toNFD( $from );
			}

			// If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule
			if ( isset( $map[$from] ) ) {

				// Or a rule of the same length, i.e. the same rule.
				if ( is_string( $map[$from] ) && $to != $map[$from] ) {
					return wfMsg( "transliterator-error-ambiguous", $line, $mappage );
				}

			} elseif ( strlen( $from ) > 1 ) {

				// Bail if the left hand side is too long (has performance implications otherwise)
				$fromlen = strlen( $from );
				if ( $fromlen > $wgTransliteratorRuleSize ) {
					return wfMsgExt( 'transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize );
				}

				// Fill in the blanks, so that we know when to stop looking while transliterating
				for ( $i = 1; $i < $fromlen; $i++ ) {
					$substr = substr( $from, 0, $i );

					if ( !isset( $map[$substr] ) ) {
						$map[$substr] = true;
					}
				}
			} // else we have the default rule

			$map[$from] = $to;
		}

		return $map;
	}

	/**
	 * Transliterate a word by iteratively finding the longest substring from
	 * the start of the untransliterated string that we have a rule for, and
	 * transliterating it.
	 *
	 * @param $word String to transliterate
	 * @param $map Array as returned by readMap()
	 * @return String the transliteration
	 */
	function transliterate( $word, $map ) {
		if ( isset( $map["__decompose__"] ) ) {
			$letters = $this->codepoints( $word );
		} else {
			$letters = $this->letters( $word );
		}

		$alphamap = $this->alphamap( $letters );

		$sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
		$ucfirst = false;             // We are in case-insensitive mode and the first character of the current match was upper-case originally
		$lastUpper = null;            // We have lower-cased the current letter, but we need to keep track of the original (dotted I for example)

		$output = "";                 // The output
		$lastMatch = 0;               // The position of the last character matched, or the first character of the current run
		$lastTrans = null;            // The transliteration of the last character matched, or null if the first character of the current run
		$i = 0;                       // The current position in the string
		$count = count( $letters );   // The total number of characters in the string
		$current = "";                // The substring that we are currently trying to find the longest match for.
		$currentStart = 0;            // The position that $current starts at

		while ( $lastMatch < $count ) {

			if ( $i < $count ) {

				$next = $current . $letters[$i];

				// There may be a match longer than $current
				if ( isset( $map[$next] ) ) {

					// In fact, $next is a match
					if ( is_string( $map[$next] ) ) {
						$lastMatch = $i;
						$lastTrans = $map[$next];
					}

					$i++;
					$current = $next;
					continue;
				}
			}

			// If this match is at the end of a word, see whether we have a more specific rule.
			// $i is 0 when not even the first letter started a match; there is
			// then no previous letter to look at.
			if ( $i > 0 && $alphamap[$i - 1] && ( $i == $count || !$alphamap[$i] ) ) {
				$try = $current . self::LAST;
				if ( isset( $map[$try] ) ) {
					if ( is_string( $map[$try] ) ) {
						$lastTrans = $map[$try];
					}
					if ( isset( $map[$try . self::FIRST] ) ) {
						$current = $try;
					}
				}
			}

			// If this match is at the start of a word, see whether we have a more specific rule
			if ( ( $currentStart == 0 || !$alphamap[$currentStart - 1] ) && $alphamap[$currentStart] ) {
				$try = $current . self::FIRST;
				if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
					$lastTrans = $map[$try];
				}
			}

			// We had no match at all, pass through one character
			if ( is_null( $lastTrans ) ) {

				$lastLetter = $letters[$lastMatch];
				$lastLower = $sensitive ? $lastLetter : mb_strtolower( $lastLetter );

				// If we are not being sensitive, we can try down-casing the previous letter
				if ( $lastLetter != $lastLower ) {
					$ucfirst = true;
					$letters[$lastMatch] = $lastLower;
					$lastUpper = $lastLetter;

					// Start matching again from the letter we just changed.
					// $current and $i may have run ahead on a partial match
					// of the upper-case form, in which case the lower-case
					// form would otherwise never be looked up.
					$i = $lastMatch;
					$current = "";

				// Might be nice to output a ? if we don't understand
				} elseif ( isset( $map[''] ) ) {

					if ( $ucfirst ) {
						$output .= str_replace( '$1', $lastUpper, $map[''] );
						$ucfirst = false;
					} else {
						$output .= str_replace( '$1', $lastLetter, $map[''] );
					}
					$i = $currentStart = ++$lastMatch;
					$current = "";

				// Or the input if it's likely to be correct enough
				} else {

					if ( $ucfirst ) {
						$output .= $lastUpper;
						$ucfirst = false;
					} else {
						$output .= $lastLetter;
					}
					$i = $currentStart = ++$lastMatch;
					$current = "";
				}

			// Output the previous match
			} else {

				if ( $ucfirst ) {
					$output .= mb_strtoupper( mb_substr( $lastTrans, 0, 1 ) ) . mb_substr( $lastTrans, 1 );
					$ucfirst = false;
				} else {
					$output .= $lastTrans;
				}
				$i = $currentStart = ++$lastMatch;
				$lastTrans = null;
				$current = "";

			}
		}
		return $output;
	}

	/**
	 * {{#transliterate:<mapname>|<word>[|<format>[|<answer>[|<onerror>]]]}}
	 *
	 * Direct usage will generally be of the form {{#transliterate:<mapname>|<word>}} while
	 * generic templates may find the latter three parameters invaluable for easy use.
	 *
	 * $mapname is the name of the transliteration map to find.
	 * $word is the string to transliterate (if the map was found)
	 * $format is a string containing $1 to be replaced by the transliteration if the map exists
	 * $answer allows for a user-specified transliteration to override the automatic one
	 * $other is an error message to display if $answer is blank and an invalid map is specified
	 *
	 * @return String wikitext to insert in place of the parser function call
	 */
	function render( &$parser, $mapname = '', $word = '', $format = '$1', $answer = '', $other = '' ) {

		if ( trim( $format ) == '' ) { // Handle the case when people use {{#transliterate:<>|<>||<>}}
			$format = '$1';
		}

		if ( trim( $answer ) != '' ) {
			return str_replace( '$1', $answer, $format );
		}

		$prefix = wfMsg( 'transliterator-prefix' );
		$title = Title::newFromText( $prefix . $mapname, NS_MEDIAWIKI );

		if ( !$title ) {
			return $other == '' ? str_replace( '$1', "{{#transliterate:$mapname|$word}}", $format ) : $other;
		}

		$mappage = $title->getDBkey();

		$map = $this->getMap( $prefix, $mappage );

		if ( !$map ) { // False if map was not found
			$output = $other;

		} elseif ( is_string( $map ) ) { // An error message
			$output = '<span class="transliterator error"> '.$map.' </span>';

		} else { // A Map
			$trans = UtfNormal::toNFC( $this->transliterate( Sanitizer::decodeCharReferences( $word ), $map ) );
			$output = str_replace( '$1', $trans, $format );
		}

		// Populate the dependency table so that we get re-rendered if the map changes.
		if ( isset( $this->mPages[$mappage] ) ) {
			$parser->mOutput->addTemplate( $title, $this->mPages[$mappage], null );
		} else {
			$parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
		}

		return $output;
	}

	/**
	 * Called on ArticlePurge, ArticleDeleteComplete and NewRevisionFromEditComplete in order to purge cache
	 */
	static function purgeArticle( &$article ) {
		// purgeTitle() takes its argument by reference, so it needs a variable
		$title = $article->getTitle();
		return self::purgeTitle( $title );
	}

	/**
	 * Called on TitleMoveComplete
	 */
	static function purgeNewTitle( &$title, &$newtitle ) {
		return self::purgeTitle( $newtitle );
	}

	/**
	 * Called on ArticleUndelete (and by other purge hook handlers).
	 *
	 * If the title is a map page, drop both its parsed map and the list of
	 * existing map names from $wgMemc.
	 *
	 * @return Boolean always true, so that other hook handlers still run
	 */
	static function purgeTitle( &$title ) {
		global $wgMemc;

		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			$text = $title->getText();
			$prefix = wfMsg( 'transliterator-prefix' );
			if ( strpos( $text, $prefix ) === 0 ) {
				$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, $title->getDBkey() ) );
				$wgMemc->delete( wfMemcKey( self::CACHE_PREFIX, "__map_names__" ) );
			}
		}

		return true;
	}

	/**
	 * Called on first use to create singleton
	 */
	static function setup( &$parser ) {
		$trans = new ExtTransliterator;
		$parser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );
		return true;
	}
}
 476+?>
Property changes on: trunk/extensions/Transliterator/Transliterator_body.php
___________________________________________________________________
Name: svn:eol-style
1477 + native
Index: trunk/extensions/Transliterator/Transliterator.php
@@ -64,479 +64,14 @@
6565 'path' => __FILE__,
6666 );
6767
 68+$wgAutoloadClasses['ExtTransliterator'] = dirname( __FILE__ ) . "/Transliterator_body.php";
 69+$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__) . '/Transliterator.i18n.php';
 70+
6871 $wgHooks['ParserFirstCallInit'][] = 'ExtTransliterator::setup';
69 -$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__).'/Transliterator.i18n.php';
7072 $wgHooks['ArticleDeleteComplete'][] = 'ExtTransliterator::purgeArticle';
7173 $wgHooks['NewRevisionFromEditComplete'][] = 'ExtTransliterator::purgeArticle';
7274 $wgHooks['ArticlePurge'][] = 'ExtTransliterator::purgeArticle';
7375 $wgHooks['ArticleUndelete'][] = 'ExtTransliterator::purgeTitle';
7476 $wgHooks['TitleMoveComplete'][] = 'ExtTransliterator::purgeNewtitle';
7577
76 -class ExtTransliterator {
77 -
78 - const FIRST = "\x1F"; // A character that will be appended when ^ should match at the start
79 - const LAST = "\x1E"; // A character that will be appended when $ should match at the end
80 - const CACHE_PREFIX = "extTransliterator.2:"; // The prefix to use for cache items (the number should be incremented when the map format changes)
81 - var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template.
82 - var $mMaps = array();// An Array of "$mapname" => The map parsed from that page.
83 -
84 - /**
85 - * Split a word into letters (not bytes or codepoints) implicitly in NFC due to MediaWiki.
86 - */
87 - function letters( $word ) {
88 - global $utfCombiningClass;
89 - UtfNormal::loadData();
90 -
91 - $split = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
92 -
93 - $i = 1;
94 - while ( $i < count( $split ) ) {
95 - if ( isset( $utfCombiningClass[$split[$i]] ) ) {
96 - $split[$i - 1] .= $split[$i];
97 - unset( $split[$i] );
98 -
99 - } else {
100 - $i++;
101 -
102 - }
103 - }
104 -
105 - return $split;
106 - }
107 -
108 - /**
109 - * Split a word into the NFD codepoints that make it up.
110 - */
111 - function codepoints( $word ) {
112 - $word = UtfNormal::toNFD( $word );
113 - return preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
114 - }
115 -
116 - /**
117 - * Given a codepoints or letters array returns a list that contains 1 for every
118 - * alphabetic character and accent, and 0 otherwise. This allows for edge-of-word
119 - * detection.
120 - */
121 - function alphamap( $letters ) {
122 -
123 - $output = Array();
124 - $count = count($letters);
125 -
126 - for ($i = 0; $i < $count; $i++) {
127 - $output[] = preg_match( '/\pL/u', $letters[$i]) || isset( $utfCombiningClass[$letters[$i]] );
128 - }
129 -
130 - return $output;
131 - }
132 -
133 - /**
134 - * Get all the existing maps in one query, useful given that the default
135 - * behaviour of failing silently is designed to allow it to be used by
136 - * templates that don't know if a map exists, so may try far too often.
137 - */
138 - function getExistingMapNames( $prefix ) {
139 - global $wgMemc;
140 -
141 - // Have we used it on this page already?
142 - if ( ! is_null($this->mPages) )
143 - return $this->mPages;
144 -
145 - // Have we used it recently?
146 - $cached = $wgMemc->get( self::CACHE_PREFIX . "__map_names__" );
147 - if ( $cached )
148 - return $this->mPages = $cached;
149 -
150 - $dbr = wfGetDB( DB_SLAVE );
151 - $res = $dbr->select( 'page',
152 - array( '*' ),
153 - array(
154 - 'page_namespace' => NS_MEDIAWIKI,
155 - 'page_title LIKE \'' . $dbr->escapeLike( $prefix ) .'%\''
156 - ),
157 - __METHOD__
158 - );
159 -
160 - $this->mPages = Array();
161 -
162 - while ( $r = $res->fetchObject() ) {
163 - $this->mPages[$r->page_title] = $r->page_id;
164 - }
165 -
166 - $wgMemc->set( self::CACHE_PREFIX . "__map_names__", $this->mPages );
167 - return $this->mPages;
168 - }
169 - /**
170 - * Get a map function, either from the local cache or from the page,
171 - */
172 - function getMap( $prefix, $mappage ) {
173 - global $wgMemc;
174 -
175 - // Have we used it on this page already?
176 - if ( isset( $this->mMaps[$mappage] ) ) {
177 - return $this->mMaps[$mappage];
178 - }
179 -
180 - // Does it exist at all?
181 - $existing = $this->getExistingMapNames( $prefix );
182 - if ( isset( $existing[$mappage] ) ) {
183 -
184 - // Have we used it recently?
185 - $map = $wgMemc->get( self::CACHE_PREFIX . $mappage );
186 - if (! $map ) {
187 -
188 - $map = $this->readMap( wfMsg( $mappage ), $mappage );
189 -
190 - if ( $map )
191 - $wgMemc->set( self::CACHE_PREFIX . $mappage, $map);
192 - }
193 -
194 - } else {
195 - $map = false;
196 - }
197 -
198 - return $this->mMaps[$mappage] = $map;
199 - }
200 -
201 - /**
202 - * Parse a map input syntax into a map.
203 - *
204 - * Input syntax is a set of lines.
205 - * All " " are ignored.
206 - * Lines starting with # are ignored, remaining lines are split by =>
207 - * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
208 - *
209 - * The map created is a set of "from" strings to "to" strings
210 - * With extra "from" => true for all substrings of "from" strings
211 - * So that the transliteration algorithm knows when it has found the longest match
212 - *
213 - * $map[''] is used as the default fall through for any characters not in the map
214 - * $map['__decompose__'] indicates that NFD should be used instead of characters
215 - * $map['__sensitive__'] indicates that the automatic first-letter upper-case fall-through should not be tried
216 - */
217 - function readMap( $input, $mappage ) {
218 - global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;
219 -
220 - $map = array();
221 - $decompose = false;
222 -
223 - // Split lines and remove whitespace at beginning and end
224 - $lines = preg_split( "/(^|\s*\n)(\s*(#[^\n]*)?\n)*\s*/", $input."\n" );
225 -
226 - $count = count( $lines );
227 -
228 - if ( $count > 0 && $lines[0] == "" ) {
229 - array_shift( $lines );
230 - $count--;
231 - }
232 -
233 - if ( $count > 0 && $lines[$count - 1] == "" ) {
234 - array_pop( $lines );
235 - $count--;
236 - }
237 -
238 - // The only content was comments
239 - if ( $count == 0 )
240 - return false;
241 -
242 - // The first line can contain flags
243 - $first_line = $lines[0];
244 - if ( strpos( $first_line, "=>") === FALSE ) {
245 - // Or, could just signify that the message was blank
246 - if ( $first_line == "<$mappage>")
247 - return false;
248 - else if ( preg_replace( '/<(decompose|sensitive)>/', '', $first_line ) != '')
249 - return wfMsg( 'transliterator-error-syntax', $first_line, $mappage );
250 -
251 - if ( strpos( $first_line, "<decompose>" ) !== FALSE ) {
252 - $map['__decompose__'] = true;
253 - $decompose = true;
254 - }
255 - if ( strpos( $first_line, "<sensitive>" ) !== FALSE ) {
256 - $map['__sensitive__'] = true;
257 - }
258 - array_shift( $lines );
259 - $count--;
260 - }
261 -
262 - if ( $count > $wgTransliteratorRuleCount )
263 - return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );
264 -
265 - foreach ( $lines as $line ) {
266 -
267 - $pair = preg_split( '/\s*=>\s*/', $line );
268 -
269 - if ( count( $pair ) != 2 )
270 - return wfMsg( "transliterator-error-syntax", $line, $mappage );
271 -
272 - $from = $pair[0];
273 - $to = Sanitizer::decodeCharReferences( $pair[1], ENT_QUOTES, 'UTF-8' );
274 -
275 - // Convert the ^ and $ selectors into special characters for matching
276 - // Leave single ^ and $'s alone incase someone wants to use them
277 - // Still permits the creation of the rule "^$=>" that will never match, but hey
278 - $fromlast = strlen( $from ) - 1;
279 - if ( $fromlast > 0 ) {
280 - if ( $from[0] == "^" ) {
281 - $from = substr( $from, 1 ) . self::FIRST;
282 - $fromlast--;
283 - }
284 -
285 - if ( $from[$fromlast] == "$")
286 - $from[$fromlast] = self::LAST;
287 - }
288 -
289 - // Now we've looked at our syntax we can remove html escaping to reveal the true form
290 - $from = Sanitizer::decodeCharReferences( $from, ENT_QUOTES, 'UTF-8' );
291 - if ( $decompose ) { // Undo the NFCing of MediaWiki
292 - $from = UtfNormal::toNFD( $from );
293 - }
294 -
295 - // If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule
296 - if ( isset( $map[$from] ) ) {
297 -
298 - // Or a rule of the same length, i.e. the same rule.
299 - if ( is_string( $map[$from] ) && $to != $map[$from] )
300 - return wfMsg("transliterator-error-ambiguous", $line, $mappage);
301 -
302 - } else if ( strlen( $from ) > 1 ){
303 -
304 - // Bail if the left hand side is too long (has performance implications otherwise)
305 - $fromlen = strlen( $from );
306 - if ( $fromlen > $wgTransliteratorRuleSize )
307 - return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize );
308 -
309 - // Fill in the blanks, so that we know when to stop looking while transliterating
310 - for ( $i = 1; $i < $fromlen; $i++ ) {
311 - $substr = substr( $from, 0, $i );
312 -
313 - if (! isset( $map[$substr] ) )
314 - $map[$substr] = true;
315 - }
316 - } // else we have the default rule
317 -
318 - $map[$from] = $to;
319 - }
320 -
321 - return $map;
322 - }
323 -
324 - /**
325 - * Transliterate a word by iteratively finding the longest substring from
326 - * the start of the untransliterated string that we have a rule for, and
327 - * transliterating it.
328 - */
329 - function transliterate( $word, $map )
330 - {
331 - if ( isset( $map["__decompose__"] ) ) {
332 - $letters = $this->codepoints( $word );
333 - } else {
334 - $letters = $this->letters( $word );
335 - }
336 -
337 - $alphamap = $this->alphamap( $letters );
338 -
339 - $sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
340 - $ucfirst = false; // We are in case-sensitive mode and the first character of the current match was upper-case originally
341 - $last_upper = null; // We have lower-cased the current letter, but we need to keep track of the original (dotted I for example)
342 -
343 - $output = ""; // The output
344 - $last_match = 0; // The position of the last character matched, or the first character of the current run
345 - $last_trans = null; // The transliteration of the last character matched, or null if the first character of the current run
346 - $i = 0; // The current position in the string
347 - $count = count($letters); // The total number of characters in the string
348 - $current = ""; // The substring that we are currently trying to find the longest match for.
349 - $current_start = 0; // The position that $current starts at
350 -
351 - while ( $last_match < $count ) {
352 -
353 - if ( $i < $count ) {
354 -
355 - $next = $current.$letters[$i];
356 -
357 - // There may be a match longer than $current
358 - if ( isset( $map[$next] ) ) {
359 -
360 - // In fact, $next is a match
361 - if ( is_string( $map[$next] ) ) {
362 - $last_match = $i;
363 - $last_trans = $map[$next];
364 - }
365 -
366 - $i++;
367 - $current = $next;
368 - continue;
369 - }
370 - }
371 -
372 -
373 - // If this match is at the end of a word, see whether we have a more specific rule
374 - if ( $alphamap[$i-1] && ( $i == $count || !$alphamap[$i] ) ) {
375 - $try = $current . self::LAST;
376 - if ( isset( $map[$try] ) ) {
377 - if ( is_string( $map[$try] ) ) {
378 - $last_trans = $map[$try];
379 - }
380 - if ( isset( $map[$try . self::FIRST] ) ) {
381 - $current = $try;
382 - }
383 - }
384 - }
385 -
386 - // If this match is at the start of a word, see whether we have a more specific rule
387 - if ( ( $current_start == 0 || !$alphamap[$current_start-1]) && $alphamap[$current_start] ) {
388 - $try = $current . self::FIRST;
389 - if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
390 - $last_trans = $map[$try];
391 - }
392 - }
393 -
394 - // We had no match at all, pass through one character
395 - if ( is_null( $last_trans ) ) {
396 -
397 - $last_letter = $letters[$last_match];
398 - $last_lower = $sensitive ? $last_letter : mb_strtolower( $last_letter );
399 -
400 - // If we are not being sensitive, we can try down-casing the previous letter
401 - if ( $last_letter != $last_lower ) {
402 - $ucfirst = true;
403 - $letters[$last_match] = $last_lower;
404 - $last_upper = $last_letter;
405 -
406 - // Might be nice to output a ? if we don't understand
407 - } else if ( isset( $map[''] ) ) {
408 -
409 - if ( $ucfirst ) {
410 - $output .= str_replace( '$1', $last_upper , $map[''] );
411 - $ucfirst = false;
412 - } else {
413 - $output .= str_replace( '$1', $last_letter, $map[''] );
414 - }
415 - $i = $current_start = ++$last_match;
416 - $current = "";
417 -
418 - // Or the input if it's likely to be correct enough
419 - } else {
420 -
421 - if ( $ucfirst ) {
422 - $output .= $last_upper;
423 - $ucfirst = false;
424 - } else {
425 - $output .= $last_letter;
426 - }
427 - $i = $current_start = ++$last_match;
428 - $current = "";
429 - }
430 -
431 - // Output the previous match
432 - } else {
433 -
434 - if ( $ucfirst ) {
435 - $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );
436 - $ucfirst = false;
437 - } else {
438 - $output .= $last_trans;
439 - }
440 - $i = $current_start = ++$last_match;
441 - $last_trans = null;
442 - $current = "";
443 -
444 - }
445 - }
446 - return $output;
447 - }
448 -
449 - /**
450 - * {{#transliterate:<mapname>|<word>[|<format>[|<answer>[|<onerror>]]]}}
451 - *
452 - * Direct usage will generally be of the form {{#transilterate:<mapname>|<word>}} while
453 - * generic templates may find the latter three parameters invaluable for easy use.
454 - *
455 - * $mapname is the name of the transliteration map to find.
456 - * $word is the string to transliterate (if the map was found)
457 - * $format is a string containing $1 to be replaced by the transliteration if the map exists
458 - * $answer allows for a user-specified transliteration to override the automatic one
459 - * $other is an error messsage to display if $answer is blank and an invalid map is specified
460 - */
461 - function render( &$parser, $mapname = '', $word = '', $format = '$1', $answer = '', $other = '' ) {
462 -
463 - if ( trim( $format ) == '') { // Handle the case when people use {{#transliterate:<>|<>||<>}}
464 - $format = '$1';
465 - }
466 -
467 - if ( trim( $answer ) != '') {
468 - return str_replace('$1', $answer, $format);
469 - }
470 -
471 - $prefix = wfMsg( 'transliterator-prefix' );
472 - $title = Title::newFromText( $prefix . $mapname, NS_MEDIAWIKI );
473 -
474 - if (! $title ) {
475 - return $other == '' ? str_replace("$1", "{{#transliterate:$mapname|$word}}", $format) : $other;
476 - }
477 -
478 - $mappage = $title->getDBkey();
479 -
480 - $map = $this->getMap( $prefix, $mappage );
481 -
482 - if ( !$map ) { // False if map was not found
483 - $output = $other;
484 -
485 - } else if ( is_string( $map ) ) { // An error message
486 - $output = '<span class="transliterator error"> '.$map.' </span>';
487 -
488 - } else { // A Map
489 - $trans = UtfNormal::toNFC( $this->transliterate( Sanitizer::decodeCharReferences( $word ), $map ) );
490 - $output = str_replace( '$1', $trans, $format );
491 - }
492 -
493 - // Populate the dependency table so that we get re-rendered if the map changes.
494 - if ( isset( $this->mPages[$mappage] ) )
495 - $parser->mOutput->addTemplate( $title, $this->mPages[$mappage], null );
496 -
497 - else
498 - $parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
499 -
500 - return $output;
501 - }
502 -
503 - /**
504 - * Called on ArticlePurge, ArticleDeleteComplete and NewRevisionFromEditComplete in order to purge cache
505 - */
506 - static function purgeArticle( &$article, $a=false, $b=false, $c=false, $d=false ) {
507 - return self::purgeTitle( $article->getTitle() );
508 - }
509 -
510 - /**
511 - * Called on TitleMoveComplete
512 - */
513 - static function purgeNewTitle ( &$title, &$newtitle, $a=false, $b=false, $c=false ) {
514 - return self::purgeTitle( $newtitle );
515 - }
516 -
517 - /**
518 - * Called on ArticleUndelete (and by other purge hook handlers)
519 - */
520 - static function purgeTitle( &$title, $a=false ) {
521 - global $wgMemc;
522 - if ( $title->getNamespace() == NS_MEDIAWIKI ) {
523 - $text = $title->getText();
524 - $prefix = wfMsg( 'transliterator-prefix' );
525 - if ( strpos( $text, $prefix ) === 0 ) {
526 - $wgMemc->delete( self::CACHE_PREFIX . $title->getDBkey() );
527 - $wgMemc->delete( self::CACHE_PREFIX . "__map_names__" );
528 - }
529 - }
530 - return true;
531 -
532 - }
533 -
534 - /**
535 - * Called on first use to create singleton
536 - */
537 - static function setup( &$parser ) {
538 - $trans = new ExtTransliterator;
539 - $parser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );
540 - return true;
541 - }
542 -}
543 -
 78+?>

Status & tagging log