Index: trunk/extensions/Transliterator/Transliterator.i18n.php |
— | — | @@ -0,0 +1,21 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + * Internationalization file for Transliterator |
| 5 | + */ |
/**
 * Message table for the Transliterator extension, keyed by language code
 * and then by message key.
 *
 * Parameter conventions used by the error messages:
 *  - transliterator-error-ambiguous / -syntax:
 *        $1 = the offending line from the map (e.g. 'a => z'),
 *        $2 = the map page title, including the prefix.
 *  - transliterator-error-rulecount:
 *        $1 = the limit on the number of rules, $2 = the map page title.
 *  - transliterator-error-rulesize:
 *        $1 = the offending line, $2 = the limit on the length of the left
 *        hand side (e.g. 'alpha => beta' has 5), $3 = the map page title.
 */
$messages = array(
	/**
	 * English
	 */
	'en' => array(
		// Name of the parser function: {{#transliterate:blah}}
		'transliterator-invoke' => 'transliterate',
		// Title prefix of map pages: [[MediaWiki:Transliterator:blah]]
		// NOTE: changing this requires moving all maps
		'transliterator-prefix' => 'Transliterator:',
		'transliterator-error-ambiguous' => "Ambiguous rule '$1' in [[MediaWiki:$2]]",
		'transliterator-error-syntax' => "Invalid syntax '$1' in [[MediaWiki:$2]]",
		'transliterator-error-rulecount' => 'More than $1 rules in [[MediaWiki:$2]]',
		'transliterator-error-rulesize' => "Rule '$1' has more than $2 characters on the left in [[MediaWiki:$3]]",
		'transliterator-description' => 'Provides a configurable parser function for transliteration',
	),
);
Property changes on: trunk/extensions/Transliterator/Transliterator.i18n.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 23 | + native |
Index: trunk/extensions/Transliterator/Transliterator.php |
— | — | @@ -0,0 +1,316 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + Extension:Transliterator Copyright (C) 2009 Conrad.Irwin |
| 5 | + |
| 6 | + This program is free software; you can redistribute it and/or modify |
| 7 | + it under the terms of the GNU General Public License as published by |
| 8 | + the Free Software Foundation; either version 2 of the License, or |
| 9 | + (at your option) any later version. |
| 10 | + |
| 11 | + This program is distributed in the hope that it will be useful, |
| 12 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | + GNU General Public License for more details. |
| 15 | + |
| 16 | + You should have received a copy of the GNU General Public License |
| 17 | + along with this program; if not, write to the Free Software |
| 18 | + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
| 19 | +*/ |
| 20 | + |
// Credits entry for this extension, filed under the 'parserhook' type.
$wgExtensionCredits['parserhook'][] = array(
	'name'           => "Transliterator",
	'version'        => "1.0",
	'descriptionmsg' => "transliterator-description",
	'author'         => 'Conrad Irwin',
	'url'            => 'http://en.wiktionary.org/wiki/User:Conrad.Irwin/Transliterator.php',
);

// The message file lives next to this file.
$wgExtensionMessagesFiles['Transliterator'] = dirname( __FILE__ ) . '/Transliterator.i18n.php';

// Register the setup callback: via the ParserFirstCallInit hook when the
// MW_SUPPORTS_PARSERFIRSTCALLINIT constant is defined, otherwise as a plain
// extension function.
if ( !defined( 'MW_SUPPORTS_PARSERFIRSTCALLINIT' ) ) {
	$wgExtensionFunctions[] = 'efTransliterator_Setup';
} else {
	$wgHooks['ParserFirstCallInit'][] = 'efTransliterator_Setup';
}

// Supplies the magic word for the parser function.
$wgHooks['LanguageGetMagic'][] = 'efTransliterator_Magic';
/**
 * Implements the {{#transliterate:}} parser function.
 *
 * Maps are read from MediaWiki-namespace pages whose titles start with the
 * 'transliterator-prefix' message. Parsed maps and the list of existing map
 * pages are cached on the instance.
 */
class ExtTransliterator {

	/**
	 * Page title ("$prefix$mapname") => database row of that page, for every
	 * MediaWiki-namespace page whose title starts with the prefix.
	 * null until getExistingMapNames() has run.
	 */
	public $mPages = null;

	/**
	 * Page title ("$prefix$mapname") => result of loading that map:
	 * an array (the parsed map), a string (an error message) or
	 * false (no such page).
	 */
	public $mMaps = array();

	/**
	 * Split a word into letters (not bytes or codepoints): every codepoint
	 * listed in $utfCombiningClass is attached to the letter before it.
	 * The word is implicitly in NFC due to MediaWiki.
	 *
	 * @param $word String: the text to split
	 * @return Array: consecutively indexed list of letters
	 */
	function letters( $word ) {
		global $utfCombiningClass;
		UtfNormal::loadData();

		$chars = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

		// preg_split() returns false on failure (e.g. invalid UTF-8)
		if ( !is_array( $chars ) ) {
			return array();
		}

		// Build a fresh list rather than unset()ing entries in place: removing
		// entries while looping left holes in the array and ended the loop
		// early, which made transliterate() read undefined offsets.
		$letters = array();
		$last = -1; // Index of the most recently added letter

		foreach ( $chars as $char ) {
			if ( $last >= 0 && isset( $utfCombiningClass[$char] ) ) {
				$letters[$last] .= $char;
			} else {
				$letters[] = $char;
				$last++;
			}
		}

		return $letters;
	}

	/**
	 * Split a word into the NFD codepoints that make it up.
	 *
	 * @param $word String: the text to split
	 * @return Array: list of codepoints of the NFD form of $word
	 */
	function codepoints( $word ) {
		$word = UtfNormal::toNFD( $word );
		$chars = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

		// preg_split() returns false on failure (e.g. invalid UTF-8)
		return is_array( $chars ) ? $chars : array();
	}

	/**
	 * Get all the existing maps in one query, useful given that the default
	 * behaviour of failing silently is designed to allow it to be used by
	 * templates that don't know if a map exists, so may try far too often.
	 *
	 * The result is cached in $this->mPages after the first call; the cache is
	 * not keyed by $prefix, so later calls return the first result whatever
	 * prefix they pass.
	 *
	 * @param $prefix String: title prefix shared by all map pages
	 * @return Array: page title => database row
	 */
	function getExistingMapNames( $prefix ) {

		if ( !is_null( $this->mPages ) ) {
			return $this->mPages;
		}

		$dbr = wfGetDB( DB_SLAVE );
		$res = $dbr->select( 'page',
			array( '*' ),
			array(
				'page_namespace' => NS_MEDIAWIKI,
				'page_title LIKE \'' . $dbr->escapeLike( $prefix ) .'%\''
			),
			__METHOD__
		);

		$this->mPages = array();

		while ( $r = $res->fetchObject() ) {
			$this->mPages[$r->page_title] = $r;
		}

		return $this->mPages;
	}

	/**
	 * Get a map, either from the local cache or from the page.
	 * TODO: discuss whether memcache should be used in any of this.
	 *
	 * @param $prefix String: title prefix shared by all map pages
	 * @param $name String: name of the map, without the prefix
	 * @return Mixed: Array (the map), String (an error message from readMap())
	 *                or false if the map page does not exist
	 */
	function getMap( $prefix, $name ) {

		$mappage = $prefix.$name;

		// The cache must live on the instance: a local $mMaps is empty on every
		// call, so each use of the parser function re-parsed the map page.
		if ( isset( $this->mMaps[$mappage] ) ) {
			return $this->mMaps[$mappage];
		}

		$existing = $this->getExistingMapNames( $prefix );

		if ( !isset( $existing[$mappage] ) ) {
			$this->mMaps[$mappage] = false;
		} else {
			$this->mMaps[$mappage] = $this->readMap( wfMsg( $mappage ), $mappage );
		}

		return $this->mMaps[$mappage];
	}

	/**
	 * Parse a map input syntax into a map.
	 *
	 * Input syntax is a set of lines.
	 * All " " are ignored.
	 * Lines starting with # are ignored.
	 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
	 * Remaining lines are split by "=>".
	 *
	 * The map created is a set of "from" strings to "to" strings
	 * With extra "from" => true for all substrings of "from" strings
	 * So that the transliteration algorithm knows when it has found the longest match
	 *
	 * $map[''] is used as the default fall through for any characters not in the map
	 * $map['__decompose__'] indicates that NFD should be used instead of characters
	 *
	 * @param $input String: raw text of the map page
	 * @param $mappage String: title of the map page, only used in error messages
	 * @return Mixed: Array (the map) on success, String (an error message) on failure
	 */
	function readMap( $input, $mappage ) {

		$map = array();
		$decompose = false;

		// Split lines and remove comments and space.
		// explode() replaces split(), which was removed in PHP 7; both
		// delimiters are literal strings so the result is the same.
		$stripped = preg_replace( '/^(\s*#.*)?\n| */m', '', "$input" );
		$lines = explode( "\n", html_entity_decode( $stripped, ENT_NOQUOTES, "UTF-8" ) );

		if ( $lines[0] === "<decompose>" ) {
			$map['__decompose__'] = true;
			array_shift( $lines );
			$decompose = true;
		}

		if ( count( $lines ) > 255 ) {
			return wfMsg( "transliterator-error-rulecount", 255, $mappage );
		}

		foreach ( $lines as $line ) {

			$pair = explode( "=>", $line );

			if ( count( $pair ) != 2 ) {
				return wfMsg( "transliterator-error-syntax", $line, $mappage );
			}

			if ( $decompose ) { // Undo the NFCing of MediaWiki
				$from = UtfNormal::toNFD( $pair[0] );
			} else { // substrings by NFC code-point are a superset of substrings by letters
				$from = $pair[0];
			}

			$to = $pair[1];

			if ( isset( $map[$from] ) ) {

				// A string here means an earlier rule has the same left hand side;
				// true only marks a prefix of a longer rule and may be overwritten.
				if ( is_string( $map[$from] ) ) {
					return wfMsg( "transliterator-error-ambiguous", $line, $mappage );
				}

			} else if ( strlen( $from ) > 1 ) {
				// Fill in the blanks, so that we know when to stop looking while transliterating
				$to_fill = strlen( $from );

				// The limit is measured in bytes (strlen), as are the prefixes below
				if ( $to_fill > 10 ) {
					return wfMsg( 'transliterator-error-rulesize', $line, 10, $mappage );
				}

				for ( $i = 1; $i < $to_fill; $i++ ) {
					$substr = substr( $from, 0, $i );

					if ( !isset( $map[$substr] ) ) {
						$map[$substr] = true;
					}
				}
			}

			$map[$from] = $to;
		}

		return $map;
	}

	/**
	 * Transliterate a word by iteratively finding the longest substring from
	 * the start of the untransliterated string that we have a rule for, and
	 * transliterating it.
	 *
	 * Each space-separated word is wrapped in "^" and "$" before matching, so
	 * rules can refer to the start and end of a word; markers left in the
	 * output are stripped again before returning.
	 *
	 * @param $word String: the text to transliterate
	 * @param $map Array: a map as returned by readMap()
	 * @return String: the transliterated text
	 */
	function transliterate( $word, $map )
	{
		$word = "^" . str_replace( " ", "$ ^", $word ) . "$";
		if ( isset( $map["__decompose__"] ) ) {
			$letters = $this->codepoints( $word );
		} else {
			$letters = $this->letters( $word );
		}

		$output = "";             // The output
		$last_match = 0;          // The position of the last character matched, or the first character of the current run
		$last_trans = null;       // The transliteration of the last character matched, or null if the first character of the current run
		$i = 0;                   // The current position in the string
		$count = count( $letters ); // The total number of characters in the string
		$current = "";            // The substring that we are currently trying to find the longest match for.

		while ( $i < $count ) {

			$next = $current.$letters[$i];

			// There may be a match longer than $current
			if ( isset( $map[$next] ) ) {

				// In fact, $next is a match
				if ( is_string( $map[$next] ) ) {
					$last_match = $i;
					$last_trans = $map[$next];
				}

				$i++;
				$current = $next;

			// No more matching, go back to the last match and start from the character after
			} else {

				// We had no match at all, pass through one character
				if ( is_null( $last_trans ) ) {

					// Might be nice to output a ? if we don't understand
					if ( isset( $map[''] ) ) {
						$output .= $map[''];
					// Or the input if it's likely to be correct enough
					} else {
						$output .= $letters[$last_match];
					}

					$i = ++$last_match;

				// Output the previous match
				} else {

					$output .= $last_trans;
					$i = ++$last_match;
					$last_trans = null;

				}
				$current = "";
			}
		}
		// The input ended while a match was still pending
		if ( !is_null( $last_trans ) ) {
			$output .= $last_trans;
		}

		// Remove the beginning and end markers
		return preg_replace( '/^\^|\$$|\$(\s+)\^|\$(\s+)|(\s+)\^/', "$1", $output );
	}

	/**
	 * {{#transliterate:<mapname>|<word>[|<format>[|<onerror>]]}}
	 *
	 * It is envisaged that most usage is in the form {{#transliterate:<mapname>|<word>}}
	 * However, when in use in multi-purpose templates, it would be very ugly to have
	 * {{#if}}s around all calls to {{#transliterate}} to check whether the map
	 * exists. The further two arguments can thus give very flexible output with
	 * minimal hassle.
	 *
	 * @param $parser Parser: the parser the function was called from
	 * @param $mapname String: name of the map, without the prefix
	 * @param $word String: the text to transliterate
	 * @param $format String: output template; '$1' is replaced by the transliteration
	 * @param $other String: returned instead when the map does not exist
	 * @return String: the text to insert into the page
	 */
	function render( &$parser, $mapname = '', $word = '', $format = '$1', $other = '' ) {

		$prefix = wfMsg( 'transliterator-prefix' );
		$mappage = $prefix.$mapname;

		$map = $this->getMap( $prefix, $mapname );

		if ( !$map ) { // False if map was not found
			$title = Title::newFromText( $mappage, NS_MEDIAWIKI );
			$output = $other;

		} else if ( is_string( $map ) ) { // An error message
			$title = Title::newFromRow( $this->mPages[$mappage] );
			$output = '<span class="transliterator error"> '.$map.' </span>';

		} else { // A Map
			$title = Title::newFromRow( $this->mPages[$mappage] );
			$output = UtfNormal::toNFC( $this->transliterate( $word, $map ) );
			$output = str_replace( '$1', $output, $format );

		}
		// Populate the dependency table so that we get re-rendered if the map changes.
		if ( $title ) {
			$parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
		}

		return $output;
	}

}
/**
 * Register the {{#transliterate:}} function hook.
 *
 * This is called either through the ParserFirstCallInit hook, which passes
 * the Parser being initialised, or through $wgExtensionFunctions, which
 * passes nothing. The hook is set on the Parser that was passed in, so every
 * Parser instance that fires the hook gets the function, not only the global
 * one; the global $wgParser is used only when no parser is supplied.
 *
 * @param $parser Parser: the parser to register on, or null to use $wgParser
 * @return Boolean: always true, so that hook processing continues
 */
function efTransliterator_Setup( $parser = null ) {
	global $wgParser;

	if ( !is_object( $parser ) ) {
		$parser = $wgParser;
	}

	$trans = new ExtTransliterator;
	$parser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );
	return true;
}
| 311 | + |
/**
 * LanguageGetMagic hook handler: defines the 'transliterate' magic word.
 *
 * The synonyms are the literal 'transliterate' and the text of the
 * 'transliterator-invoke' message.
 * NOTE(review): $langCode is not used; the message is fetched with wfMsg(),
 * presumably in the current interface language - confirm this is intended.
 *
 * @param $magicWords Array: magic word definitions, modified in place
 * @param $langCode String: language code the magic words are requested for
 * @return Boolean: always true, so that hook processing continues
 */
function efTransliterator_Magic( &$magicWords, $langCode ) {
	// The extension's messages must be loaded before the lookup below
	wfLoadExtensionMessages( 'Transliterator' );

	$localName = wfMsg( 'transliterator-invoke' );
	$definition = array( 0, 'transliterate', $localName );

	$magicWords['transliterate'] = $definition;

	return true;
}
Property changes on: trunk/extensions/Transliterator/Transliterator.php |
___________________________________________________________________ |
Name: svn:eol-style |
1 | 318 | + native |