r53740 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r53739‎ | r53740 | r53741 >
Date:01:38, 25 July 2009
Author:demon
Status:deferred
Tags:
Comment:
Commit Transliterator extension for Cirwin.
Modified paths:
  • /trunk/extensions/Transliterator (added) (history)
  • /trunk/extensions/Transliterator/Transliterator.i18n.php (added) (history)
  • /trunk/extensions/Transliterator/Transliterator.php (added) (history)

Diff [purge]

Index: trunk/extensions/Transliterator/Transliterator.i18n.php
@@ -0,0 +1,21 @@
 2+<?php
 3+/**
 4+ * Internationalization file for Transliterator
 5+ */
 6+$messages = array();
 7+
 8+/**
 9+ * English
 10+ */
 11+$messages['en'] = array(
 12+ 'transliterator-invoke' => 'transliterate', // {{#transliterate:blah}}
 13+ 'transliterator-prefix' => 'Transliterator:', // [[MediaWiki:Transliterator:blah]] NOTE: changing this requires moving all maps
 14+ // $1 is the line from the map, 'a => z', $2 is the map-page including prefix.
 15+ 'transliterator-error-ambiguous' => "Ambiguous rule '$1' in [[MediaWiki:$2]]",
 16+ 'transliterator-error-syntax' => "Invalid syntax '$1' in [[MediaWiki:$2]]",
 17+ // $1 is the limit on number of rules, $2 is the map-page including prefix.
 18+ 'transliterator-error-rulecount' => "More than $1 rules in [[MediaWiki:$2]]",
 19+ // $1 is the line from the map, $2 is the limit on the length of the left hand side (e.g. 'alpha => beta' has 5), $3 is the map-page including prefix.
 20+ 'transliterator-error-rulesize' => "Rule '$1' has more than $2 characters on the left in [[MediaWiki:$3]]",
 21+ 'transliterator-description' => "Provides a configurable parser function for transliteration"
 22+);
Property changes on: trunk/extensions/Transliterator/Transliterator.i18n.php
___________________________________________________________________
Name: svn:eol-style
   + native
Index: trunk/extensions/Transliterator/Transliterator.php
@@ -0,0 +1,316 @@
 2+<?php
 3+/**
 4+ Extension:Transliterator Copyright (C) 2009 Conrad.Irwin
 5+
 6+ This program is free software; you can redistribute it and/or modify
 7+ it under the terms of the GNU General Public License as published by
 8+ the Free Software Foundation; either version 2 of the License, or
 9+ (at your option) any later version.
 10+
 11+ This program is distributed in the hope that it will be useful,
 12+ but WITHOUT ANY WARRANTY; without even the implied warranty of
 13+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 14+ GNU General Public License for more details.
 15+
 16+ You should have received a copy of the GNU General Public License
 17+ along with this program; if not, write to the Free Software
 18+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 19+*/
 20+
 21+$wgExtensionCredits['parserhook'][] = array(
 22+ 'name' => "Transliterator",
 23+ 'version' => "1.0",
 24+ 'descriptionmsg' => "transliterator-description",
 25+ 'author' => 'Conrad Irwin',
 26+ 'url' => 'http://en.wiktionary.org/wiki/User:Conrad.Irwin/Transliterator.php'
 27+);
 28+ // Register the setup callback on the ParserFirstCallInit hook when that constant is defined, else via $wgExtensionFunctions.
 29+if ( defined( 'MW_SUPPORTS_PARSERFIRSTCALLINIT' ) ) {
 30+ $wgHooks['ParserFirstCallInit'][] = 'efTransliterator_Setup';
 31+} else {
 32+ $wgExtensionFunctions[] = 'efTransliterator_Setup';
 33+}
 34+$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__).'/Transliterator.i18n.php'; // message file that sits next to this script
 35+$wgHooks['LanguageGetMagic'][] = 'efTransliterator_Magic'; // registers the 'transliterate' magic word
 36+
 37+class ExtTransliterator {
 38+
 39+ var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template.
 40+ var $mMaps = array();// An Array of "$mapname" => The map parsed from that page.
 41+
 42+ /**
 43+ * Split a word into letters (not bytes or codepoints) implicitly in NFC due to MediaWiki.
 44+ */
 45+ function letters( $word ) {
 46+ global $utfCombiningClass;
 47+ UtfNormal::loadData();
 48+
 49+ $split = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
 50+
 51+ $i = 1;
 52+ while ( $i < count( $split ) ) {
 53+ if ( isset( $utfCombiningClass[$split[$i]] ) ) {
 54+ $split[$i - 1] .= $split[$i];
 55+ unset( $split[$i] );
 56+
 57+ } else {
 58+ $i++;
 59+
 60+ }
 61+ }
 62+
 63+ return $split;
 64+ }
 65+
 66+ /**
 67+ * Split a word into the NFD codepoints that make it up: normalise with UtfNormal::toNFD(),
 68+ * then split into one string per character matched by '.' under the /u (UTF-8) flag. */
 69+ function codepoints( $word ) {
 70+ $word = UtfNormal::toNFD( $word );
 71+ return preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
 72+ }
 73+
 74+ /**
 75+ * Get all the existing maps in one query, useful given that the default
 76+ * behaviour of failing silently is designed to allow it to be used by
 77+ * templates that don't know if a map exists, so may try far too often.
 78+ * Returns an array of page_title => page row object, cached in $this->mPages. */
 79+ function getExistingMapNames( $prefix ) {
 80+ // Already fetched: reuse the list. Note the cached list is returned whatever $prefix is passed later.
 81+ if ( ! is_null($this->mPages) )
 82+ return $this->mPages;
 83+ // Select every row of the page table in the MediaWiki namespace whose title starts with $prefix.
 84+ $dbr = wfGetDB( DB_SLAVE );
 85+ $res = $dbr->select( 'page',
 86+ array( '*' ),
 87+ array(
 88+ 'page_namespace' => NS_MEDIAWIKI,
 89+ 'page_title LIKE \'' . $dbr->escapeLike( $prefix ) .'%\''
 90+ ),
 91+ __METHOD__
 92+ );
 93+
 94+ $this->mPages = Array();
 95+ // Index the rows by page_title so callers can isset() a map page name.
 96+ while ( $r = $res->fetchObject() ) {
 97+ $this->mPages[$r->page_title] = $r;
 98+ }
 99+
 100+ return $this->mPages;
 101+ }
 102+ /**
 103+ * Get a map function, either from the local cache or from the page,
 104+ * TODO: discuss whether memcache should be used in any of this.
 105+ */
 106+ function getMap( $prefix, $name ) {
 107+
 108+ $mappage = $prefix.$name;
 109+
 110+ if ( isset( $mMaps[$mappage] ) )
 111+ return $mMaps[$mappage];
 112+
 113+ $existing = $this->getExistingMapNames( $prefix );
 114+
 115+ if (! isset( $existing[$mappage] ) )
 116+ $mMaps[$mappage] = false;
 117+
 118+ else
 119+ $mMaps[$mappage] = $this->readMap( wfMsg( $mappage ), $mappage );
 120+
 121+ return $mMaps[$mappage];
 122+ }
 123+
 124+ /**
 125+ * Parse a map input syntax into a map.
 126+ *
 127+ * Input syntax is a set of lines.
 128+ * All " " are ignored.
 129+ * Lines starting with # are ignored.
 130+ * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
 131+ * Remaining lines are split by "=>".
 132+ *
 133+ * The map created is a set of "from" strings to "to" strings
 134+ * With extra "from" => true for all substrings of "from" strings
 135+ * So that the transliteration algorithm knows when it has found the longest match
 136+ *
 137+ * $map[''] is used as the default fall through for any characters not in the map
 138+ * $map['__decompose__'] indicates that NFD should be used instead of characters
 139+ */
 140+ function readMap( $input, $mappage ) {
 141+
 142+ $map = array();
 143+ $decompose = false;
 144+
 145+ // Split lines and remove comments and space
 146+ $lines = split( "\n", html_entity_decode( preg_replace( '/^(\s*#.*)?\n| */m', '', "$input" ), ENT_NOQUOTES, "UTF-8" ) );
 147+
 148+ if ( $lines[0] == "<decompose>" ) {
 149+ $map['__decompose__'] = true;
 150+ array_shift( $lines );
 151+ $decompose = true;
 152+ }
 153+
 154+ if ( count( $lines ) > 255 )
 155+ return wfMsg("transliterator-error-rulecount", 255, $mappage);
 156+
 157+ foreach ( $lines as $line ) {
 158+
 159+ $pair = split( "=>", $line );
 160+
 161+ if ( count($pair) != 2 )
 162+ return wfMsg("transliterator-error-syntax", $line, $mappage);
 163+
 164+ if ($decompose) // Undo the NFCing of MediaWiki
 165+ $from = UtfNormal::toNFD( $pair[0] );
 166+ else // substrings by NFC code-point are a superset of substrings by letters
 167+ $from = $pair[0];
 168+
 169+ $to = $pair[1];
 170+
 171+ if ( isset( $map[$from] ) ) {
 172+
 173+ if ( is_string( $map[$from] ) )
 174+ return wfMsg("transliterator-error-ambiguous", $line, $mappage);
 175+
 176+ } else if ( strlen( $from ) > 1 ){
 177+ // Fill in the blanks, so that we know when to stop looking while transliterating
 178+ $to_fill = strlen( $from );
 179+
 180+ if ( $to_fill > 10 )
 181+ return wfMsg('transliterator-error-rulesize', $line, 10, $mappage);
 182+
 183+ for ( $i = 1; $i < $to_fill; $i++ ) {
 184+ $substr = substr( $from, 0, $i );
 185+
 186+ if (! isset( $map[$substr] ) )
 187+ $map[$substr] = true;
 188+ }
 189+ }
 190+
 191+ $map[$from] = $to;
 192+ }
 193+
 194+ return $map;
 195+ }
 196+
 197+ /**
 198+ * Transliterate a word by iteratively finding the longest substring from
 199+ * the start of the untransliterated string that we have a rule for, and
 200+ * transliterating it.
 201+ * $map is a rule map as built by readMap(); returns the transliterated string. */
 202+ function transliterate( $word, $map )
 203+ {
 204+ $word = "^" . str_replace(" ", "$ ^", $word) . "$"; // Mark the start (^) and end ($) of every space-separated word; stripped again below
 205+ if ( isset( $map["__decompose__"] ) ) {
 206+ $letters = $this->codepoints( $word );
 207+ }else
 208+ $letters = $this->letters( $word );
 209+ // Entries of $letters are the units that get concatenated and looked up as map keys.
 210+ $output = ""; // The output
 211+ $last_match = 0; // The position of the last character matched, or the first character of the current run
 212+ $last_trans = null; // The transliteration of the last character matched, or null if the first character of the current run
 213+ $i = 0; // The current position in the string
 214+ $count = count($letters); // The total number of characters in the string
 215+ $current = ""; // The substring that we are currently trying to find the longest match for.
 216+
 217+ while ($i < $count) {
 218+ // Extend the current candidate by one more letter.
 219+ $next = $current.$letters[$i];
 220+
 221+ // There may be a match longer than $current
 222+ if ( isset( $map[$next] ) ) {
 223+
 224+ // In fact, $next is a match
 225+ if ( is_string( $map[$next] ) ) {
 226+ $last_match = $i;
 227+ $last_trans = $map[$next];
 228+ }
 229+
 230+ $i++;
 231+ $current = $next;
 232+
 233+ // No more matching, go back to the last match and start from the character after
 234+ } else {
 235+
 236+ // We had no match at all, pass through one character
 237+ if ( is_null( $last_trans ) ) {
 238+
 239+ // Might be nice to output a ? if we don't understand
 240+ if ( isset( $map[''] ) )
 241+ $output .= $map[''];
 242+ // Or the input if it's likely to be correct enough
 243+ else
 244+ $output .= $letters[$last_match];
 245+
 246+ $i = ++$last_match;
 247+
 248+ // Output the previous match
 249+ } else {
 250+
 251+ $output .= $last_trans;
 252+ $i = ++$last_match;
 253+ $last_trans = null;
 254+
 255+ }
 256+ $current = "";
 257+ }
 258+ }
 259+ if (! is_null( $last_trans ))
 260+ $output .= $last_trans;
 261+ // NOTE(review): "$1" is only set by the \$(\s+)\^ alternative; whitespace captured by the last two alternatives (groups 2 and 3) is dropped.
 262+ // Remove the beginning and end markers
 263+ return preg_replace('/^\^|\$$|\$(\s+)\^|\$(\s+)|(\s+)\^/',"$1", $output);
 264+ }
 265+
 266+ /**
 267+ * {{#transliterate:<mapname>|<word>[|<format>[|<onerror>]]}}
 268+ * Returns <format> with each '$1' replaced by the transliterated <word>, <onerror> when the map is missing, or an error <span> when the map is invalid.
 269+ * It is envisaged that most usage is in the form {{#transliterate:<mapname>|<word>}}
 270+ * However, when in use in multi-purpose templates, it would be very ugly to have
 271+ * {{#if}}s around all calls to {{#transliterate}} to check whether the map
 272+ * exists. The further two arguments can thus give very flexible output with
 273+ * minimal hassle.
 274+ */
 275+ function render( &$parser, $mapname = '', $word = '', $format = '$1', $other = '' ) {
 276+ // The map lives on the MediaWiki-namespace page named by the prefix message followed by $mapname.
 277+ $prefix = wfMsg('transliterator-prefix');
 278+ $mappage = $prefix.$mapname;
 279+
 280+ $map = $this->getMap( $prefix, $mapname );
 281+
 282+ if ( !$map ) { // False if map was not found
 283+ $title = Title::newFromText( $mappage, NS_MEDIAWIKI );
 284+ $output = $other;
 285+
 286+ } else if ( is_string( $map ) ) { // An error message
 287+ $title = Title::newFromRow( $this->mPages[$mappage] );
 288+ $output = '<span class="transliterator error"> '.$map.' </span>';
 289+
 290+ } else { // A Map
 291+ $title = Title::newFromRow( $this->mPages[$mappage] );
 292+ $output = UtfNormal::toNFC( $this->transliterate( $word, $map ) );
 293+ $output = str_replace('$1', $output, $format);
 294+
 295+ }
 296+ // Populate the dependency table so that we get re-rendered if the map changes.
 297+ if ($title)
 298+ $parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
 299+
 300+ return $output;
 301+ }
 302+
 303+}
 304+function efTransliterator_Setup() {
 305+ global $wgParser;
 306+ // A single ExtTransliterator instance serves as the callback for the 'transliterate' parser function.
 307+ $trans = new ExtTransliterator;
 308+ $wgParser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );
 309+ return true;
 310+}
 311+
 312+function efTransliterator_Magic( &$magicWords, $langCode ) {
 313+ wfLoadExtensionMessages('Transliterator');
 314+ // Synonyms: the literal 'transliterate' and the transliterator-invoke message; the leading 0 is presumably the case-sensitivity flag (confirm).
 315+ $magicWords['transliterate'] = array( 0, 'transliterate', wfMsg('transliterator-invoke') );
 316+ return true;
 317+}
Property changes on: trunk/extensions/Transliterator/Transliterator.php
___________________________________________________________________
Name: svn:eol-style
   + native

Status & tagging log