r53740 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r53739 | r53740 | r53741 >
Date:	01:38, 25 July 2009
Author:	demon
Status:	deferred
Tags:
Comment:	Commit Transliterator extension for Cirwin.
Modified paths:	/trunk/extensions/Transliterator (added) (history) /trunk/extensions/Transliterator/Transliterator.i18n.php (added) (history) /trunk/extensions/Transliterator/Transliterator.php (added) (history)

Diff [purge]

Index: trunk/extensions/Transliterator/Transliterator.i18n.php
—	—	@@ -0,0 +1,21 @@
	2	+<?php
	3	+/**
	4	+ * Internationalization file for Transliterator
	5	+ */
	/**
	 * Message table for the Transliterator extension, keyed by language code
	 * and then by message key.
	 */
	$messages = array(

		/**
		 * English
		 */
		'en' => array(
			// Name of the parser function, as in {{#transliterate:blah}}
			'transliterator-invoke' => 'transliterate',

			// Title prefix of map pages, as in [[MediaWiki:Transliterator:blah]]
			// NOTE: changing this requires moving all maps
			'transliterator-prefix' => 'Transliterator:',

			// $1 is the offending line from the map (e.g. 'a => z'),
			// $2 is the map page including the prefix.
			'transliterator-error-ambiguous' => 'Ambiguous rule \'$1\' in [[MediaWiki:$2]]',
			'transliterator-error-syntax' => 'Invalid syntax \'$1\' in [[MediaWiki:$2]]',

			// $1 is the limit on the number of rules, $2 is the map page.
			'transliterator-error-rulecount' => 'More than $1 rules in [[MediaWiki:$2]]',

			// $1 is the offending line, $2 is the limit on the length of the
			// left hand side (e.g. 'alpha => beta' has 5), $3 is the map page.
			'transliterator-error-rulesize' => 'Rule \'$1\' has more than $2 characters on the left in [[MediaWiki:$3]]',

			// Description of the extension (referenced as 'descriptionmsg' in the credits)
			'transliterator-description' => 'Provides a configurable parser function for transliteration',
		),
	);
Property changes on: trunk/extensions/Transliterator/Transliterator.i18n.php
___________________________________________________________________
Name: svn:eol-style
1	23	+ native
Index: trunk/extensions/Transliterator/Transliterator.php
—	—	@@ -0,0 +1,316 @@
	2	+<?php
	3	+/**
	4	+ Extension:Transliterator Copyright (C) 2009 Conrad.Irwin
	5	+
	6	+ This program is free software; you can redistribute it and/or modify
	7	+ it under the terms of the GNU General Public License as published by
	8	+ the Free Software Foundation; either version 2 of the License, or
	9	+ (at your option) any later version.
	10	+
	11	+ This program is distributed in the hope that it will be useful,
	12	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	+ GNU General Public License for more details.
	15	+
	16	+ You should have received a copy of the GNU General Public License
	17	+ along with this program; if not, write to the Free Software
	18	+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
	19	+*/
	20	+
	// Tell MediaWiki where this extension's message file lives.
	$wgExtensionMessagesFiles['Transliterator'] = dirname( __FILE__ ) . '/Transliterator.i18n.php';

	// Credits entry for the extension, filed under the 'parserhook' type.
	$wgExtensionCredits['parserhook'][] = array(
		'name' => 'Transliterator',
		'version' => '1.0',
		'descriptionmsg' => 'transliterator-description',
		'author' => 'Conrad Irwin',
		'url' => 'http://en.wiktionary.org/wiki/User:Conrad.Irwin/Transliterator.php',
	);

	// Register the setup callback: through $wgExtensionFunctions when the
	// MW_SUPPORTS_PARSERFIRSTCALLINIT constant is absent, otherwise through
	// the ParserFirstCallInit hook.
	if ( !defined( 'MW_SUPPORTS_PARSERFIRSTCALLINIT' ) ) {
		$wgExtensionFunctions[] = 'efTransliterator_Setup';
	} else {
		$wgHooks['ParserFirstCallInit'][] = 'efTransliterator_Setup';
	}

	// Callback that adds the 'transliterate' magic word.
	$wgHooks['LanguageGetMagic'][] = 'efTransliterator_Magic';
	36	+
	/**
	 * Implementation of the {{#transliterate:}} parser function.
	 *
	 * Maps are read from pages in the MediaWiki namespace whose title starts
	 * with the 'transliterator-prefix' message, parsed once per instance, and
	 * applied with a greedy longest-match algorithm.
	 */
	class ExtTransliterator {

		/**
		 * Rows of the page table for every existing map page, keyed by
		 * page_title. Stays null until getExistingMapNames() has run its query.
		 */
		public $mPages = null;

		/**
		 * Per-instance cache keyed by map page name (prefix + map name).
		 * Each value is a parsed map (array), an error message (string), or
		 * false when the map page does not exist.
		 */
		public $mMaps = array();

		/**
		 * Split a word into letters (not bytes or codepoints): every piece
		 * found in $utfCombiningClass is appended to the letter before it.
		 * The input is expected to be in NFC already (the original author
		 * notes MediaWiki does this implicitly).
		 *
		 * @param $word String to split
		 * @return Array gap-free, zero-based list of letters
		 */
		public function letters( $word ) {
			// NOTE(review): $utfCombiningClass is presumably populated by
			// UtfNormal::loadData() - confirm against UtfNormal.
			global $utfCombiningClass;
			UtfNormal::loadData();

			$pieces = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

			// Build a fresh list rather than unset()ing entries in place:
			// unset() leaves holes in the indexes and shrinks count(), which
			// made the caller read undefined offsets and lose trailing letters.
			$letters = array();
			$last = -1;
			foreach ( $pieces as $piece ) {
				if ( $last >= 0 && isset( $utfCombiningClass[$piece] ) ) {
					$letters[$last] .= $piece;
				} else {
					// A combining mark at the very start has nothing to attach
					// to and stays a letter of its own.
					$letters[++$last] = $piece;
				}
			}

			return $letters;
		}

		/**
		 * Split a word into the NFD codepoints that make it up.
		 *
		 * @param $word String to split
		 * @return Array list of codepoints of the NFD form of $word
		 */
		public function codepoints( $word ) {
			$word = UtfNormal::toNFD( $word );
			return preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
		}

		/**
		 * Get all the existing maps in one query, useful given that the default
		 * behaviour of failing silently is designed to allow it to be used by
		 * templates that don't know if a map exists, so may try far too often.
		 *
		 * The result is cached in $this->mPages after the first call; later
		 * calls return that cache whatever $prefix they pass.
		 *
		 * @param $prefix String title prefix shared by all map pages
		 * @return Array of page_title => page table row
		 */
		public function getExistingMapNames( $prefix ) {

			if ( !is_null( $this->mPages ) ) {
				return $this->mPages;
			}

			$dbr = wfGetDB( DB_SLAVE );
			$res = $dbr->select( 'page',
				array( '*' ),
				array(
					'page_namespace' => NS_MEDIAWIKI,
					'page_title LIKE \'' . $dbr->escapeLike( $prefix ) . '%\''
				),
				__METHOD__
			);

			$this->mPages = array();

			while ( $r = $res->fetchObject() ) {
				$this->mPages[$r->page_title] = $r;
			}

			return $this->mPages;
		}

		/**
		 * Get a map, either from the per-instance cache or from the page.
		 * TODO: discuss whether memcache should be used in any of this.
		 *
		 * @param $prefix String title prefix of map pages
		 * @param $name String name of the map, without the prefix
		 * @return Mixed the parsed map (array), an error message (string),
		 *         or false if the map page does not exist
		 */
		public function getMap( $prefix, $name ) {

			$mappage = $prefix . $name;

			// The cache has to be the instance property: a local $mMaps is
			// empty on every call, so nothing was ever cached.
			if ( isset( $this->mMaps[$mappage] ) ) {
				return $this->mMaps[$mappage];
			}

			$existing = $this->getExistingMapNames( $prefix );

			if ( !isset( $existing[$mappage] ) ) {
				$this->mMaps[$mappage] = false;
			} else {
				$this->mMaps[$mappage] = $this->readMap( wfMsg( $mappage ), $mappage );
			}

			return $this->mMaps[$mappage];
		}

		/**
		 * Parse a map input syntax into a map.
		 *
		 * Input syntax is a set of lines.
		 * All " " are ignored.
		 * Lines starting with # are ignored.
		 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
		 * Remaining lines are split by "=>".
		 *
		 * The map created is a set of "from" strings to "to" strings
		 * With extra "from" => true for all substrings of "from" strings
		 * So that the transliteration algorithm knows when it has found the longest match
		 *
		 * $map[''] is used as the default fall through for any characters not in the map
		 * $map['__decompose__'] indicates that NFD should be used instead of characters
		 *
		 * @param $input String text of the map page
		 * @param $mappage String name of the map page, used in error messages
		 * @return Mixed the map (array), or an error message (string) when the
		 *         input has more than 255 lines, a line that does not split into
		 *         exactly two parts, a duplicate rule, or a left hand side
		 *         longer than 10 bytes
		 */
		public function readMap( $input, $mappage ) {

			$map = array();
			$decompose = false;

			// Remove comment lines, empty lines and every space, then decode
			// entities and split into lines. explode() is used because split()
			// is deprecated since PHP 5.3 and removed in PHP 7; neither
			// separator contains a regex metacharacter, so the result is the same.
			$stripped = preg_replace( '/^(\s*#.*)?\n| */m', '', (string)$input );
			$lines = explode( "\n", html_entity_decode( $stripped, ENT_NOQUOTES, "UTF-8" ) );

			if ( $lines[0] == "<decompose>" ) {
				$map['__decompose__'] = true;
				array_shift( $lines );
				$decompose = true;
			}

			if ( count( $lines ) > 255 ) {
				return wfMsg( "transliterator-error-rulecount", 255, $mappage );
			}

			foreach ( $lines as $line ) {

				$pair = explode( "=>", $line );

				if ( count( $pair ) != 2 ) {
					return wfMsg( "transliterator-error-syntax", $line, $mappage );
				}

				if ( $decompose ) {
					// Undo the NFCing of MediaWiki
					$from = UtfNormal::toNFD( $pair[0] );
				} else {
					// Byte-wise prefixes (below) are a superset of the
					// letter-wise prefixes the transliterator will look up.
					$from = $pair[0];
				}

				$to = $pair[1];

				if ( isset( $map[$from] ) ) {

					// A string means this left hand side already has a rule.
					// true only marks it as a prefix of a longer rule, whose
					// own prefixes were filled in when that rule was read.
					if ( is_string( $map[$from] ) ) {
						return wfMsg( "transliterator-error-ambiguous", $line, $mappage );
					}

				} else if ( strlen( $from ) > 1 ) {
					// Fill in the blanks, so that we know when to stop looking while transliterating
					$to_fill = strlen( $from );

					if ( $to_fill > 10 ) {
						return wfMsg( 'transliterator-error-rulesize', $line, 10, $mappage );
					}

					for ( $i = 1; $i < $to_fill; $i++ ) {
						$substr = substr( $from, 0, $i );

						if ( !isset( $map[$substr] ) ) {
							$map[$substr] = true;
						}
					}
				}

				$map[$from] = $to;
			}

			return $map;
		}

		/**
		 * Transliterate a word by iteratively finding the longest substring from
		 * the start of the untransliterated string that we have a rule for, and
		 * transliterating it.
		 *
		 * The text is wrapped in "^" and "$" markers (also on both sides of
		 * every single space) so that rules can refer to word boundaries; the
		 * markers left in the output are removed before returning.
		 *
		 * @param $word String text to transliterate
		 * @param $map Array map as built by readMap()
		 * @return String transliterated text
		 */
		public function transliterate( $word, $map ) {
			$word = "^" . str_replace( " ", "$ ^", $word ) . "$";

			if ( isset( $map["__decompose__"] ) ) {
				$letters = $this->codepoints( $word );
			} else {
				$letters = $this->letters( $word );
			}

			$output = "";             // The output
			$last_match = 0;          // The position of the last character matched, or the first character of the current run
			$last_trans = null;       // The transliteration of the last match, or null if the current run has no match yet
			$i = 0;                   // The current position in the string
			$count = count( $letters ); // The total number of characters in the string
			$current = "";            // The substring that we are currently trying to find the longest match for

			// Keep going while a run is still open at the end of the input.
			// Otherwise a run that only matched rule prefixes was dropped.
			while ( $i < $count || $current !== "" ) {

				$next = ( $i < $count ) ? $current . $letters[$i] : null;

				// There may be a match longer than $current
				if ( $next !== null && isset( $map[$next] ) ) {

					// In fact, $next is a match
					if ( is_string( $map[$next] ) ) {
						$last_match = $i;
						$last_trans = $map[$next];
					}

					$i++;
					$current = $next;
					continue;
				}

				// No more matching, go back to the last match and start from the character after
				if ( is_null( $last_trans ) ) {
					// We had no match at all, pass through one character:
					// the map's default if it has one, otherwise the input.
					if ( isset( $map[''] ) ) {
						$output .= $map[''];
					} else {
						$output .= $letters[$last_match];
					}
				} else {
					// Output the previous match
					$output .= $last_trans;
					$last_trans = null;
				}

				$i = ++$last_match;
				$current = "";
			}

			// Remove the beginning and end markers. Whitespace next to a marker
			// is captured in group 1, 2 or 3 depending on the alternative, so all
			// three are put back; with only $1 the space was lost when a rule had
			// consumed the "^" or "$" on one side of it.
			return preg_replace( '/^\^|\$$|\$(\s+)\^|\$(\s+)|(\s+)\^/', '$1$2$3', $output );
		}

		/**
		 * {{#transliterate:<mapname>|<word>[|<format>[|<onerror>]]}}
		 *
		 * It is envisaged that most usage is in the form {{#transliterate:<mapname>|<word>}}
		 * However, when in use in multi-purpose templates, it would be very ugly to have
		 * {{#if}}s around all calls to {{#transliterate}} to check whether the map
		 * exists. The further two arguments can thus give very flexible output with
		 * minimal hassle.
		 *
		 * @param $parser Parser the function is called from
		 * @param $mapname String name of the map, without the prefix
		 * @param $word String text to transliterate
		 * @param $format String output template; '$1' is replaced by the result
		 * @param $other String returned instead when the map is missing (or empty)
		 * @return String
		 */
		public function render( &$parser, $mapname = '', $word = '', $format = '$1', $other = '' ) {

			$prefix = wfMsg( 'transliterator-prefix' );
			$mappage = $prefix . $mapname;

			$map = $this->getMap( $prefix, $mapname );

			if ( !$map ) { // False if map was not found
				$title = Title::newFromText( $mappage, NS_MEDIAWIKI );
				$output = $other;

			} else if ( is_string( $map ) ) { // An error message
				$title = Title::newFromRow( $this->mPages[$mappage] );
				$output = '<span class="transliterator error"> ' . $map . ' </span>';

			} else { // A Map
				$title = Title::newFromRow( $this->mPages[$mappage] );
				$output = UtfNormal::toNFC( $this->transliterate( $word, $map ) );
				$output = str_replace( '$1', $output, $format );

			}

			// Populate the dependency table so that we get re-rendered if the map changes.
			if ( $title ) {
				$parser->mOutput->addTemplate( $title, $title->getArticleID(), null );
			}

			return $output;
		}

	}
	/**
	 * Register the 'transliterate' parser function.
	 *
	 * This callback is registered either on the ParserFirstCallInit hook or in
	 * $wgExtensionFunctions. The hook passes the Parser being initialised, and
	 * the function hook is set on that instance. When called without an
	 * argument, the global $wgParser is used as before.
	 *
	 * @param $parser Parser to register on, or null to use $wgParser
	 * @return Boolean always true
	 */
	function efTransliterator_Setup( $parser = null ) {
		global $wgParser;

		if ( !is_object( $parser ) ) {
			$parser = $wgParser;
		}

		$trans = new ExtTransliterator;
		$parser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );
		return true;
	}
	311	+
	/**
	 * LanguageGetMagic callback: adds the 'transliterate' magic word, with
	 * both the fixed name 'transliterate' and the text of the
	 * 'transliterator-invoke' message as its synonyms.
	 *
	 * @param $magicWords Array of magic words, modified in place
	 * @param $langCode String language code (not used here)
	 * @return Boolean always true
	 */
	function efTransliterator_Magic( &$magicWords, $langCode ) {
		// The extension's messages must be loaded before the name is looked up.
		wfLoadExtensionMessages( 'Transliterator' );

		$localName = wfMsg( 'transliterator-invoke' );
		$synonyms = array( 0, 'transliterate', $localName );

		$magicWords['transliterate'] = $synonyms;
		return true;
	}
Property changes on: trunk/extensions/Transliterator/Transliterator.php
___________________________________________________________________
Name: svn:eol-style
1	318	+ native

Status & tagging log

20:19, 25 May 2011 Reedy (talk | contribs) changed the status of r53740 [removed: new added: deferred]
22:34, 6 May 2011 MarkAHershberger (talk | contribs) changed the status of r53740 [removed: deferred added: new]
12:29, 24 August 2009 😂 (talk | contribs) changed the status of r53740 [removed: new added: deferred]