r57423 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r57422‎ \| r57423 \| r57424 >
Date:	11:10, 6 October 2009
Author:	conrad
Status:	deferred
Tags:
Comment:	Minor style issues
Modified paths:	/trunk/extensions/Transliterator/Transliterator.php (modified) (history) /trunk/extensions/Transliterator/Transliterator_body.php (added) (history)

Diff [purge]

Index: trunk/extensions/Transliterator/Transliterator_body.php
—	—	@@ -0,0 +1,475 @@
	2	+<?php
	3	+
	// Refuse to run when the file is requested directly rather than loaded by MediaWiki.
	if ( !defined( 'MEDIAWIKI' ) ) {
		exit( 'This file is a MediaWiki extension, not a valid entry point.' );
	}
	7	+
	8	+class ExtTransliterator {
	9	+
	10	+
	11	+ const FIRST = "\x1F"; // A character that will be appended when ^ should match at the start
	12	+ const LAST = "\x1E"; // A character that will be appended when $ should match at the end
	13	+ const CACHE_PREFIX = "extTransliterator.2"; // The prefix to use for cache items (the number should be incremented when the map format changes)
	14	+ var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template.
	15	+ var $mMaps = array();// An Array of "$mapname" => The map parsed from that page.
	16	+
	/**
	 * Split a word into letters (not bytes or codepoints), implicitly in NFC due to MediaWiki.
	 *
	 * The word is first split into single codepoints; every codepoint that has an
	 * entry in $utfCombiningClass is then glued onto the letter in front of it.
	 * A combining codepoint at the very start of the word has nothing to attach
	 * to and is kept as a letter of its own.
	 *
	 * @param $word String: the text to split
	 * @return Array: the letters, indexed contiguously from 0
	 */
	function letters( $word ) {
		global $utfCombiningClass;
		UtfNormal::loadData();

		$codepoints = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

		// Build a fresh list instead of unset()ing entries in place: removing
		// entries while looping left holes in the keys and shrank count(), so
		// runs of combining marks were not merged and callers indexing the
		// result by position hit missing offsets.
		$letters = array();
		$last = -1; // Index of the most recently added letter, -1 while there is none

		foreach ( $codepoints as $codepoint ) {
			if ( $last >= 0 && isset( $utfCombiningClass[$codepoint] ) ) {
				$letters[$last] .= $codepoint;
			} else {
				$letters[] = $codepoint;
				$last++;
			}
		}

		return $letters;
	}
	40	+
	/**
	 * Break a word up into its individual codepoints after converting it to NFD.
	 *
	 * @param $word String: the text to split
	 * @return Array: one entry per codepoint of the decomposed text
	 */
	function codepoints( $word ) {
		$flags = PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY;
		$decomposed = UtfNormal::toNFD( $word );

		return preg_split( '/(.)/u', $decomposed, -1, $flags );
	}
	48	+
	/**
	 * Given a codepoints or letters array, returns a parallel list that holds
	 * true for every entry that contains a letter (\pL) or is a combining
	 * codepoint listed in $utfCombiningClass, and false otherwise. This allows
	 * for edge-of-word detection.
	 *
	 * @param $letters Array: list as returned by letters() or codepoints()
	 * @return Array: list of booleans, one per entry of $letters
	 */
	function alphamap( $letters ) {
		// Without this declaration $utfCombiningClass is an unset local and the
		// combining-mark test below can never succeed.
		global $utfCombiningClass;
		// Same call letters() makes before it reads the table.
		UtfNormal::loadData();

		$output = array();

		foreach ( $letters as $letter ) {
			$output[] = preg_match( '/\pL/u', $letter ) || isset( $utfCombiningClass[$letter] );
		}

		return $output;
	}
	65	+
	/**
	 * Get all the existing maps in one query, useful given that the default
	 * behaviour of failing silently is designed to allow it to be used by
	 * templates that don't know if a map exists, so may try far too often.
	 *
	 * The result is remembered in $this->mPages for the rest of the request and
	 * in $wgMemc under the "__map_names__" key.
	 *
	 * @param $prefix String: start of the page titles (in the MediaWiki namespace) to look for
	 * @return Array: page_title => page_id for every matching page
	 */
	function getExistingMapNames( $prefix ) {
		global $wgMemc;

		// Have we used it on this page already?
		if ( !is_null( $this->mPages ) ) {
			return $this->mPages;
		}

		// Have we used it recently? Test for an array rather than truthiness so
		// that a cached empty list (no maps exist) still counts as a hit.
		$cacheKey = wfMemcKey( self::CACHE_PREFIX, "__map_names__" );
		$cached = $wgMemc->get( $cacheKey );
		if ( is_array( $cached ) ) {
			return $this->mPages = $cached;
		}

		// Only the two columns that are read below are selected
		$dbr = wfGetDB( DB_SLAVE );
		$res = $dbr->select( 'page',
			array( 'page_title', 'page_id' ),
			array(
				'page_namespace' => NS_MEDIAWIKI,
				'page_title LIKE \'' . $dbr->escapeLike( $prefix ) . '%\''
			),
			__METHOD__
		);

		$this->mPages = array();

		while ( $r = $res->fetchObject() ) {
			$this->mPages[$r->page_title] = $r->page_id;
		}

		$wgMemc->set( $cacheKey, $this->mPages );
		return $this->mPages;
	}
	/**
	 * Fetch the parsed map for a map page.
	 *
	 * Looks in $this->mMaps first, then (for pages reported by
	 * getExistingMapNames()) in $wgMemc, and finally parses the page's message
	 * text with readMap(). Whatever is found is remembered in $this->mMaps.
	 *
	 * @param $prefix String: title prefix handed on to getExistingMapNames()
	 * @param $mappage String: title of the map page, used as array and cache key
	 * @return Mixed: whatever readMap() produced, or false if the page does not exist
	 */
	function getMap( $prefix, $mappage ) {
		global $wgMemc;

		// Already looked up during this request
		if ( isset( $this->mMaps[$mappage] ) ) {
			return $this->mMaps[$mappage];
		}

		$map = false;
		$known = $this->getExistingMapNames( $prefix );

		if ( isset( $known[$mappage] ) ) {
			$cacheKey = wfMemcKey( self::CACHE_PREFIX, $mappage );
			$map = $wgMemc->get( $cacheKey );

			// Nothing usable in the shared cache: parse the page and store the result
			if ( !$map ) {
				$map = $this->readMap( wfMsg( $mappage ), $mappage );

				if ( $map ) {
					$wgMemc->set( $cacheKey, $map );
				}
			}
		}

		$this->mMaps[$mappage] = $map;
		return $map;
	}
	133	+
	/**
	 * Tell whether a line of a map page needs parsing: empty lines and lines
	 * whose first character is # do not.
	 *
	 * @param $line String
	 * @return Boolean
	 */
	static function is_useful_line( $line ) {
		if ( $line == "" ) {
			return false;
		}

		return substr( $line, 0, 1 ) != '#';
	}
	140	+
	/**
	 * Parse a map input syntax into a map.
	 *
	 * Input syntax is a set of lines.
	 * Whitespace at either end of a line and around => is ignored.
	 * Lines starting with # are ignored, remaining lines are split by =>
	 * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)
	 * The first line may instead hold the flags <decompose> and/or <sensitive>.
	 *
	 * The map created is a set of "from" strings to "to" strings
	 * With extra "from" => true for all leading substrings of "from" strings
	 * So that the transliteration algorithm knows when it has found the longest match
	 *
	 * $map[''] is used as the default fall through for any characters not in the map
	 * $map['__decompose__'] indicates that NFD should be used instead of characters
	 * $map['__sensitive__'] indicates that the automatic first-letter upper-case fall-through should not be tried
	 *
	 * @param $input String: the text of the map page
	 * @param $mappage String: the title of the map page, used in error messages
	 * @return Mixed: the map (Array), an error message (String), or false if
	 *   the input holds no rules or is the "<$mappage>" placeholder
	 */
	function readMap( $input, $mappage ) {
		global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;

		$map = array();
		$decompose = false;

		// Split lines and remove whitespace at beginning and end. The \s* on
		// both sides lets any amount of whitespace (including none) surround
		// the line break.
		$input = trim( $input );
		$lines = preg_split( '/\s*\n\s*/', $input );
		$lines = array_filter( $lines, 'ExtTransliterator::is_useful_line' );
		$lines = array_values( $lines );

		$count = count( $lines );

		// The only content was comments
		if ( $count == 0 ) {
			return false;
		}

		// The first line can contain flags
		$firstLine = $lines[0];
		if ( strpos( $firstLine, "=>" ) === false ) {
			// Or, could just signify that the message was blank
			if ( $firstLine == "<$mappage>" ) {
				return false;
			} else if ( preg_replace( '/<(decompose|sensitive)>/', '', $firstLine ) != '' ) {
				return wfMsg( 'transliterator-error-syntax', $firstLine, $mappage );
			}

			if ( strpos( $firstLine, "<decompose>" ) !== false ) {
				$map['__decompose__'] = true;
				$decompose = true;
			}
			if ( strpos( $firstLine, "<sensitive>" ) !== false ) {
				$map['__sensitive__'] = true;
			}
			array_shift( $lines );
			$count--;
		}

		if ( $count > $wgTransliteratorRuleCount ) {
			return wfMsgExt( 'transliterator-error-rulecount', array( 'parsemag' ), $wgTransliteratorRuleCount, $mappage );
		}

		foreach ( $lines as $line ) {

			// Whitespace around the arrow is optional, so "a=>b" and "a => b" are the same rule
			$pair = preg_split( '/\s*=>\s*/', $line );

			if ( count( $pair ) != 2 ) {
				return wfMsg( "transliterator-error-syntax", $line, $mappage );
			}

			$from = $pair[0];
			$to = Sanitizer::decodeCharReferences( $pair[1] );

			// Convert the ^ and $ selectors into special characters for matching
			// Leave single ^ and $'s alone in case someone wants to use them
			// Still permits the creation of the rule "^$=>" that will never match, but hey
			$fromlast = strlen( $from ) - 1;
			if ( $fromlast > 0 ) {
				if ( $from[0] == "^" ) {
					// The marker goes on the END of the string; the length is
					// unchanged, so step back one to reach the original last character
					$from = substr( $from, 1 ) . self::FIRST;
					$fromlast--;
				}

				if ( $from[$fromlast] == "$" ) {
					$from[$fromlast] = self::LAST;
				}
			}

			// Now we've looked at our syntax we can remove html escaping to reveal the true form
			$from = Sanitizer::decodeCharReferences( $from );
			if ( $decompose ) { // Undo the NFCing of MediaWiki
				$from = UtfNormal::toNFD( $from );
			}

			// If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule
			if ( isset( $map[$from] ) ) {

				// Or a rule of the same length, i.e. the same rule.
				if ( is_string( $map[$from] ) && $to != $map[$from] ) {
					return wfMsg( "transliterator-error-ambiguous", $line, $mappage );
				}

			} else if ( strlen( $from ) > 1 ) {

				// Bail if the left hand side is too long (has performance implications otherwise)
				$fromlen = strlen( $from );
				if ( $fromlen > $wgTransliteratorRuleSize ) {
					return wfMsgExt( 'transliterator-error-rulesize', array( 'parsemag' ), $line, $mappage, $wgTransliteratorRuleSize );
				}

				// Fill in the blanks, so that we know when to stop looking while transliterating
				for ( $i = 1; $i < $fromlen; $i++ ) {
					$substr = substr( $from, 0, $i );

					if ( !isset( $map[$substr] ) ) {
						$map[$substr] = true;
					}
				}
			} // else we have the default rule

			$map[$from] = $to;
		}

		return $map;
	}
	256	+
	/**
	 * Transliterate a word by iteratively finding the longest substring from
	 * the start of the untransliterated string that we have a rule for, and
	 * transliterating it.
	 *
	 * @param $word String: the text to transliterate
	 * @param $map Array: a map as built by readMap()
	 * @return String: the transliterated text
	 */
	function transliterate( $word, $map ) {
		if ( isset( $map["__decompose__"] ) ) {
			$letters = $this->codepoints( $word );
		} else {
			$letters = $this->letters( $word );
		}

		$alphamap = $this->alphamap( $letters );

		$sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not
		$ucfirst = false;          // The letter at the start of the current run was upper-case and has been lower-cased (never set in case-sensitive mode)
		$lastUpper = null;         // We have lower-cased the current letter, but we need to keep track of the original (dotted I for example)

		$output = "";              // The output
		$lastMatch = 0;            // The position of the last character matched, or the first character of the current run
		$lastTrans = null;         // The transliteration of the last character matched, or null if the first character of the current run
		$i = 0;                    // The current position in the string
		$count = count( $letters ); // The total number of characters in the string
		$current = "";             // The substring that we are currently trying to find the longest match for.
		$currentStart = 0;         // The position that $current starts at

		while ( $lastMatch < $count ) {

			if ( $i < $count ) {

				$next = $current . $letters[$i];

				// There may be a match longer than $current
				if ( isset( $map[$next] ) ) {

					// In fact, $next is a match
					if ( is_string( $map[$next] ) ) {
						$lastMatch = $i;
						$lastTrans = $map[$next];
					}

					$i++;
					$current = $next;
					continue;
				}
			}

			// If this match is at the end of a word, see whether we have a more specific rule.
			// $i is still 0 when the very first letter has no rule at all; there is
			// no previous letter to inspect then, so skip the check instead of
			// reading offset -1.
			if ( $i > 0 && $alphamap[$i - 1] && ( $i == $count || !$alphamap[$i] ) ) {
				$try = $current . self::LAST;
				if ( isset( $map[$try] ) ) {
					if ( is_string( $map[$try] ) ) {
						$lastTrans = $map[$try];
					}
					// A rule anchored at both ends is stored as <text> LAST FIRST,
					// so keep the LAST marker for the start-of-word test below
					if ( isset( $map[$try . self::FIRST] ) ) {
						$current = $try;
					}
				}
			}

			// If this match is at the start of a word, see whether we have a more specific rule
			if ( ( $currentStart == 0 || !$alphamap[$currentStart - 1] ) && $alphamap[$currentStart] ) {
				$try = $current . self::FIRST;
				if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {
					$lastTrans = $map[$try];
				}
			}

			// We had no match at all, pass through one character
			if ( is_null( $lastTrans ) ) {

				$lastLetter = $letters[$lastMatch];
				$lastLower = $sensitive ? $lastLetter : mb_strtolower( $lastLetter );

				// If we are not being sensitive, we can try down-casing the previous letter
				if ( $lastLetter != $lastLower ) {
					$ucfirst = true;
					$letters[$lastMatch] = $lastLower;
					$lastUpper = $lastLetter;

				// Might be nice to output a ? if we don't understand
				} else if ( isset( $map[''] ) ) {

					if ( $ucfirst ) {
						$output .= str_replace( '$1', $lastUpper, $map[''] );
						$ucfirst = false;
					} else {
						$output .= str_replace( '$1', $lastLetter, $map[''] );
					}
					$i = $currentStart = ++$lastMatch;
					$current = "";

				// Or the input if it's likely to be correct enough
				} else {

					if ( $ucfirst ) {
						$output .= $lastUpper;
						$ucfirst = false;
					} else {
						$output .= $lastLetter;
					}
					$i = $currentStart = ++$lastMatch;
					$current = "";
				}

			// Output the previous match
			} else {

				if ( $ucfirst ) {
					// Restore the capital that was lower-cased to find this match
					$output .= mb_strtoupper( mb_substr( $lastTrans, 0, 1 ) ) . mb_substr( $lastTrans, 1 );
					$ucfirst = false;
				} else {
					$output .= $lastTrans;
				}
				$i = $currentStart = ++$lastMatch;
				$lastTrans = null;
				$current = "";

			}
		}
		return $output;
	}
	381	+
	/**
	 * {{#transliterate:<mapname>|<word>[|<format>[|<answer>[|<onerror>]]]}}
	 *
	 * Direct usage will generally be of the form {{#transliterate:<mapname>|<word>}} while
	 * generic templates may find the latter three parameters invaluable for easy use.
	 *
	 * $mapname is the name of the transliteration map to find.
	 * $word is the string to transliterate (if the map was found)
	 * $format is a string containing $1 to be replaced by the transliteration if the map exists
	 * $answer allows for a user-specified transliteration to override the automatic one
	 * $other is an error message to display if $answer is blank and an invalid map is specified
	 */
	function render( &$parser, $mapname = '', $word = '', $format = '$1', $answer = '', $other = '' ) {

		// Handle the case when people use {{#transliterate:<>|<>||<>}}
		if ( trim( $format ) == '' ) {
			$format = '$1';
		}

		// An explicit answer is used as it is, no map is consulted
		if ( trim( $answer ) != '' ) {
			return str_replace( '$1', $answer, $format );
		}

		$prefix = wfMsg( 'transliterator-prefix' );
		$title = Title::newFromText( $prefix . $mapname, NS_MEDIAWIKI );

		// No title could be built from the map name
		if ( !$title ) {
			if ( $other != '' ) {
				return $other;
			}
			return str_replace( '$1', "{{#transliterate:$mapname|$word}}", $format );
		}

		$mappage = $title->getDBkey();
		$map = $this->getMap( $prefix, $mappage );

		if ( !$map ) {
			// False if map was not found
			$output = $other;
		} else if ( is_string( $map ) ) {
			// An error message
			$output = '<span class="transliterator error"> ' . $map . ' </span>';
		} else {
			// A Map
			$decoded = Sanitizer::decodeCharReferences( $word );
			$trans = UtfNormal::toNFC( $this->transliterate( $decoded, $map ) );
			$output = str_replace( '$1', $trans, $format );
		}

		// Populate the dependency table so that we get re-rendered if the map changes.
		$pageId = isset( $this->mPages[$mappage] )
			? $this->mPages[$mappage]
			: $title->getArticleID();
		$parser->mOutput->addTemplate( $title, $pageId, null );

		return $output;
	}
	435	+
	/**
	 * Called on ArticlePurge, ArticleDeleteComplete and NewRevisionFromEditComplete in order to purge cache
	 *
	 * @param $article Object providing getTitle()
	 * @return Boolean: the result of purgeTitle()
	 */
	static function purgeArticle( &$article ) {
		// purgeTitle() takes its argument by reference, so hand it a variable
		// rather than the return value of getTitle()
		$title = $article->getTitle();
		return self::purgeTitle( $title );
	}
	442	+
	/**
	 * Called on TitleMoveComplete; only the second argument (the new title)
	 * is passed on to purgeTitle().
	 */
	static function purgeNewTitle( &$title, &$newtitle ) {
		$result = self::purgeTitle( $newtitle );
		return $result;
	}
	449	+
	/**
	 * Called on ArticleUndelete (and by other purge hook handlers)
	 *
	 * For a title in the MediaWiki namespace whose text starts with the
	 * 'transliterator-prefix' message, deletes both the cached map for that
	 * page and the cached list of map names. Always returns true.
	 */
	static function purgeTitle( &$title ) {
		global $wgMemc;

		// Titles in other namespaces are not touched
		if ( $title->getNamespace() != NS_MEDIAWIKI ) {
			return true;
		}

		$text = $title->getText();
		$prefix = wfMsg( 'transliterator-prefix' );

		if ( strpos( $text, $prefix ) === 0 ) {
			$mapKey = wfMemcKey( self::CACHE_PREFIX, $title->getDBkey() );
			$wgMemc->delete( $mapKey );

			$namesKey = wfMemcKey( self::CACHE_PREFIX, "__map_names__" );
			$wgMemc->delete( $namesKey );
		}

		return true;
	}
	466	+
	/**
	 * Called on first use to create singleton
	 *
	 * Registers the instance's render() method as the handler for the
	 * 'transliterate' parser function. Always returns true.
	 */
	static function setup( &$parser ) {
		$instance = new ExtTransliterator();
		$handler = array( $instance, 'render' );
		$parser->setFunctionHook( 'transliterate', $handler );

		return true;
	}
	475	+}
	476	+?>
Property changes on: trunk/extensions/Transliterator/Transliterator_body.php
___________________________________________________________________
Name: svn:eol-style
1	477	+ native
Index: trunk/extensions/Transliterator/Transliterator.php
—	—	@@ -64,479 +64,14 @@
65	65	'path' => __FILE__,
66	66	);
67	67
	68	+$wgAutoloadClasses['ExtTransliterator'] = dirname( __FILE__ ) . "/Transliterator_body.php";
	69	+$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__) . '/Transliterator.i18n.php';
	70	+
68	71	$wgHooks['ParserFirstCallInit'][] = 'ExtTransliterator::setup';
69		~~-$wgExtensionMessagesFiles['Transliterator'] = dirname(__FILE__).'/Transliterator.i18n.php';~~
70	72	$wgHooks['ArticleDeleteComplete'][] = 'ExtTransliterator::purgeArticle';
71	73	$wgHooks['NewRevisionFromEditComplete'][] = 'ExtTransliterator::purgeArticle';
72	74	$wgHooks['ArticlePurge'][] = 'ExtTransliterator::purgeArticle';
73	75	$wgHooks['ArticleUndelete'][] = 'ExtTransliterator::purgeTitle';
74	76	$wgHooks['TitleMoveComplete'][] = 'ExtTransliterator::purgeNewtitle';
75	77
76		~~-class ExtTransliterator {~~
77		-
78		~~- const FIRST = "\x1F"; // A character that will be appended when ^ should match at the start~~
79		~~- const LAST = "\x1E"; // A character that will be appended when $ should match at the end~~
80		~~- const CACHE_PREFIX = "extTransliterator.2:"; // The prefix to use for cache items (the number should be incremented when the map format changes)~~
81		~~- var $mPages = null; // An Array of "transliterator:$mapname" => The database row for that template.~~
82		~~- var $mMaps = array();// An Array of "$mapname" => The map parsed from that page.~~
83		-
84		- /**
85		~~- * Split a word into letters (not bytes or codepoints) implicitly in NFC due to MediaWiki.~~
86		~~- */~~
87		~~- function letters( $word ) {~~
88		~~- global $utfCombiningClass;~~
89		~~- UtfNormal::loadData();~~
90		-
91		~~- $split = preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE \| PREG_SPLIT_NO_EMPTY );~~
92		-
93		~~- $i = 1;~~
94		~~- while ( $i < count( $split ) ) {~~
95		~~- if ( isset( $utfCombiningClass[$split[$i]] ) ) {~~
96		~~- $split[$i - 1] .= $split[$i];~~
97		~~- unset( $split[$i] );~~
98		-
99		~~- } else {~~
100		~~- $i++;~~
101		-
102		~~- }~~
103		~~- }~~
104		-
105		~~- return $split;~~
106		~~- }~~
107		-
108		- /**
109		~~- * Split a word into the NFD codepoints that make it up.~~
110		~~- */~~
111		~~- function codepoints( $word ) {~~
112		~~- $word = UtfNormal::toNFD( $word );~~
113		~~- return preg_split( '/(.)/u', $word, -1, PREG_SPLIT_DELIM_CAPTURE \| PREG_SPLIT_NO_EMPTY );~~
114		~~- }~~
115		-
116		- /**
117		~~- * Given a codepoints or letters array returns a list that contains 1 for every~~
118		~~- * alphabetic character and accent, and 0 otherwise. This allows for edge-of-word~~
119		~~- * detection.~~
120		~~- */~~
121		~~- function alphamap( $letters ) {~~
122		-
123		~~- $output = Array();~~
124		~~- $count = count($letters);~~
125		-
126		~~- for ($i = 0; $i < $count; $i++) {~~
127		~~- $output[] = preg_match( '/\pL/u', $letters[$i]) \|\| isset( $utfCombiningClass[$letters[$i]] );~~
128		~~- }~~
129		-
130		~~- return $output;~~
131		~~- }~~
132		-
133		- /**
134		~~- * Get all the existing maps in one query, useful given that the default~~
135		~~- * behaviour of failing silently is designed to allow it to be used by~~
136		~~- * templates that don't know if a map exists, so may try far too often.~~
137		~~- */~~
138		~~- function getExistingMapNames( $prefix ) {~~
139		~~- global $wgMemc;~~
140		-
141		~~- // Have we used it on this page already?~~
142		~~- if ( ! is_null($this->mPages) )~~
143		~~- return $this->mPages;~~
144		-
145		~~- // Have we used it recently?~~
146		~~- $cached = $wgMemc->get( self::CACHE_PREFIX . "__map_names__" );~~
147		~~- if ( $cached )~~
148		~~- return $this->mPages = $cached;~~
149		-
150		~~- $dbr = wfGetDB( DB_SLAVE );~~
151		~~- $res = $dbr->select( 'page',~~
152		~~- array( '*' ),~~
153		~~- array(~~
154		~~- 'page_namespace' => NS_MEDIAWIKI,~~
155		~~- 'page_title LIKE \'' . $dbr->escapeLike( $prefix ) .'%\''~~
156		~~- ),~~
157		~~- __METHOD__~~
158		~~- );~~
159		-
160		~~- $this->mPages = Array();~~
161		-
162		~~- while ( $r = $res->fetchObject() ) {~~
163		~~- $this->mPages[$r->page_title] = $r->page_id;~~
164		~~- }~~
165		-
166		~~- $wgMemc->set( self::CACHE_PREFIX . "__map_names__", $this->mPages );~~
167		~~- return $this->mPages;~~
168		~~- }~~
169		- /**
170		~~- * Get a map function, either from the local cache or from the page,~~
171		~~- */~~
172		~~- function getMap( $prefix, $mappage ) {~~
173		~~- global $wgMemc;~~
174		-
175		~~- // Have we used it on this page already?~~
176		~~- if ( isset( $this->mMaps[$mappage] ) ) {~~
177		~~- return $this->mMaps[$mappage];~~
178		~~- }~~
179		-
180		~~- // Does it exist at all?~~
181		~~- $existing = $this->getExistingMapNames( $prefix );~~
182		~~- if ( isset( $existing[$mappage] ) ) {~~
183		-
184		~~- // Have we used it recently?~~
185		~~- $map = $wgMemc->get( self::CACHE_PREFIX . $mappage );~~
186		~~- if (! $map ) {~~
187		-
188		~~- $map = $this->readMap( wfMsg( $mappage ), $mappage );~~
189		-
190		~~- if ( $map )~~
191		~~- $wgMemc->set( self::CACHE_PREFIX . $mappage, $map);~~
192		~~- }~~
193		-
194		~~- } else {~~
195		~~- $map = false;~~
196		~~- }~~
197		-
198		~~- return $this->mMaps[$mappage] = $map;~~
199		~~- }~~
200		-
201		- /**
202		~~- * Parse a map input syntax into a map.~~
203		- *
204		~~- * Input syntax is a set of lines.~~
205		~~- * All " " are ignored.~~
206		~~- * Lines starting with # are ignored, remaining lines are split by =>~~
207		~~- * HTML entities are decoded (essential for sanity when trying to add rules for combining codepoints)~~
208		- *
209		~~- * The map created is a set of "from" strings to "to" strings~~
210		~~- * With extra "from" => true for all substrings of "from" strings~~
211		~~- * So that the transliteration algorithm knows when it has found the longest match~~
212		- *
213		~~- * $map[''] is used as the default fall through for any characters not in the map~~
214		~~- * $map['__decompose__'] indicates that NFD should be used instead of characters~~
215		~~- * $map['__sensitive__'] indicates that the automatic first-letter upper-case fall-through should not be tried~~
216		~~- */~~
217		~~- function readMap( $input, $mappage ) {~~
218		~~- global $wgTransliteratorRuleCount, $wgTransliteratorRuleSize;~~
219		-
220		~~- $map = array();~~
221		~~- $decompose = false;~~
222		-
223		~~- // Split lines and remove whitespace at beginning and end~~
224		~~- $lines = preg_split( "/(^\|\s\n)(\s(#[^\n])?\n)\s*/", $input."\n" );~~
225		-
226		~~- $count = count( $lines );~~
227		-
228		~~- if ( $count > 0 && $lines[0] == "" ) {~~
229		~~- array_shift( $lines );~~
230		~~- $count--;~~
231		~~- }~~
232		-
233		~~- if ( $count > 0 && $lines[$count - 1] == "" ) {~~
234		~~- array_pop( $lines );~~
235		~~- $count--;~~
236		~~- }~~
237		-
238		~~- // The only content was comments~~
239		~~- if ( $count == 0 )~~
240		~~- return false;~~
241		-
242		~~- // The first line can contain flags~~
243		~~- $first_line = $lines[0];~~
244		~~- if ( strpos( $first_line, "=>") === FALSE ) {~~
245		~~- // Or, could just signify that the message was blank~~
246		~~- if ( $first_line == "<$mappage>")~~
247		~~- return false;~~
248		~~- else if ( preg_replace( '/<(decompose\|sensitive)>/', '', $first_line ) != '')~~
249		~~- return wfMsg( 'transliterator-error-syntax', $first_line, $mappage );~~
250		-
251		~~- if ( strpos( $first_line, "<decompose>" ) !== FALSE ) {~~
252		~~- $map['__decompose__'] = true;~~
253		~~- $decompose = true;~~
254		~~- }~~
255		~~- if ( strpos( $first_line, "<sensitive>" ) !== FALSE ) {~~
256		~~- $map['__sensitive__'] = true;~~
257		~~- }~~
258		~~- array_shift( $lines );~~
259		~~- $count--;~~
260		~~- }~~
261		-
262		~~- if ( $count > $wgTransliteratorRuleCount )~~
263		~~- return wfMsgExt( 'transliterator-error-rulecount', array('parsemag'), $wgTransliteratorRuleCount, $mappage );~~
264		-
265		~~- foreach ( $lines as $line ) {~~
266		-
267		~~- $pair = preg_split( '/\s=>\s/', $line );~~
268		-
269		~~- if ( count( $pair ) != 2 )~~
270		~~- return wfMsg( "transliterator-error-syntax", $line, $mappage );~~
271		-
272		~~- $from = $pair[0];~~
273		~~- $to = Sanitizer::decodeCharReferences( $pair[1], ENT_QUOTES, 'UTF-8' );~~
274		-
275		~~- // Convert the ^ and $ selectors into special characters for matching~~
276		~~- // Leave single ^ and $'s alone incase someone wants to use them~~
277		~~- // Still permits the creation of the rule "^$=>" that will never match, but hey~~
278		~~- $fromlast = strlen( $from ) - 1;~~
279		~~- if ( $fromlast > 0 ) {~~
280		~~- if ( $from[0] == "^" ) {~~
281		~~- $from = substr( $from, 1 ) . self::FIRST;~~
282		~~- $fromlast--;~~
283		~~- }~~
284		-
285		~~- if ( $from[$fromlast] == "$")~~
286		~~- $from[$fromlast] = self::LAST;~~
287		~~- }~~
288		-
289		~~- // Now we've looked at our syntax we can remove html escaping to reveal the true form~~
290		~~- $from = Sanitizer::decodeCharReferences( $from, ENT_QUOTES, 'UTF-8' );~~
291		~~- if ( $decompose ) { // Undo the NFCing of MediaWiki~~
292		~~- $from = UtfNormal::toNFD( $from );~~
293		~~- }~~
294		-
295		~~- // If $map[$from] is set we can skip the filling in of sub-strings as there is a longer rule~~
296		~~- if ( isset( $map[$from] ) ) {~~
297		-
298		~~- // Or a rule of the same length, i.e. the same rule.~~
299		~~- if ( is_string( $map[$from] ) && $to != $map[$from] )~~
300		~~- return wfMsg("transliterator-error-ambiguous", $line, $mappage);~~
301		-
302		~~- } else if ( strlen( $from ) > 1 ){~~
303		-
304		~~- // Bail if the left hand side is too long (has performance implications otherwise)~~
305		~~- $fromlen = strlen( $from );~~
306		~~- if ( $fromlen > $wgTransliteratorRuleSize )~~
307		~~- return wfMsgExt('transliterator-error-rulesize', array('parsemag'), $line, $mappage, $wgTransliteratorRuleSize );~~
308		-
309		~~- // Fill in the blanks, so that we know when to stop looking while transliterating~~
310		~~- for ( $i = 1; $i < $fromlen; $i++ ) {~~
311		~~- $substr = substr( $from, 0, $i );~~
312		-
313		~~- if (! isset( $map[$substr] ) )~~
314		~~- $map[$substr] = true;~~
315		~~- }~~
316		~~- } // else we have the default rule~~
317		-
318		~~- $map[$from] = $to;~~
319		~~- }~~
320		-
321		~~- return $map;~~
322		~~- }~~
323		-
324		- /**
325		~~- * Transliterate a word by iteratively finding the longest substring from~~
326		~~- * the start of the untransliterated string that we have a rule for, and~~
327		~~- * transliterating it.~~
328		~~- */~~
329		~~- function transliterate( $word, $map )~~
330		~~- {~~
331		~~- if ( isset( $map["__decompose__"] ) ) {~~
332		~~- $letters = $this->codepoints( $word );~~
333		~~- } else {~~
334		~~- $letters = $this->letters( $word );~~
335		~~- }~~
336		-
337		~~- $alphamap = $this->alphamap( $letters );~~
338		-
339		~~- $sensitive = isset( $map["__sensitive__"] ); // Are we in case-sensitive mode, or not~~
340		~~- $ucfirst = false; // We are in case-sensitive mode and the first character of the current match was upper-case originally~~
341		~~- $last_upper = null; // We have lower-cased the current letter, but we need to keep track of the original (dotted I for example)~~
342		-
343		~~- $output = ""; // The output~~
344		~~- $last_match = 0; // The position of the last character matched, or the first character of the current run~~
345		~~- $last_trans = null; // The transliteration of the last character matched, or null if the first character of the current run~~
346		~~- $i = 0; // The current position in the string~~
347		~~- $count = count($letters); // The total number of characters in the string~~
348		~~- $current = ""; // The substring that we are currently trying to find the longest match for.~~
349		~~- $current_start = 0; // The position that $current starts at~~
350		-
351		~~- while ( $last_match < $count ) {~~
352		-
353		~~- if ( $i < $count ) {~~
354		-
355		~~- $next = $current.$letters[$i];~~
356		-
357		~~- // There may be a match longer than $current~~
358		~~- if ( isset( $map[$next] ) ) {~~
359		-
360		~~- // In fact, $next is a match~~
361		~~- if ( is_string( $map[$next] ) ) {~~
362		~~- $last_match = $i;~~
363		~~- $last_trans = $map[$next];~~
364		~~- }~~
365		-
366		~~- $i++;~~
367		~~- $current = $next;~~
368		~~- continue;~~
369		~~- }~~
370		~~- }~~
371		-
372		-
373		~~- // If this match is at the end of a word, see whether we have a more specific rule~~
374		~~- if ( $alphamap[$i-1] && ( $i == $count \|\| !$alphamap[$i] ) ) {~~
375		~~- $try = $current . self::LAST;~~
376		~~- if ( isset( $map[$try] ) ) {~~
377		~~- if ( is_string( $map[$try] ) ) {~~
378		~~- $last_trans = $map[$try];~~
379		~~- }~~
380		~~- if ( isset( $map[$try . self::FIRST] ) ) {~~
381		~~- $current = $try;~~
382		~~- }~~
383		~~- }~~
384		~~- }~~
385		-
386		~~- // If this match is at the start of a word, see whether we have a more specific rule~~
387		~~- if ( ( $current_start == 0 \|\| !$alphamap[$current_start-1]) && $alphamap[$current_start] ) {~~
388		~~- $try = $current . self::FIRST;~~
389		~~- if ( isset( $map[$try] ) && is_string( $map[$try] ) ) {~~
390		~~- $last_trans = $map[$try];~~
391		~~- }~~
392		~~- }~~
393		-
394		~~- // We had no match at all, pass through one character~~
395		~~- if ( is_null( $last_trans ) ) {~~
396		-
397		~~- $last_letter = $letters[$last_match];~~
398		~~- $last_lower = $sensitive ? $last_letter : mb_strtolower( $last_letter );~~
399		-
400		~~- // If we are not being sensitive, we can try down-casing the previous letter~~
401		~~- if ( $last_letter != $last_lower ) {~~
402		~~- $ucfirst = true;~~
403		~~- $letters[$last_match] = $last_lower;~~
404		~~- $last_upper = $last_letter;~~
405		-
406		~~- // Might be nice to output a ? if we don't understand~~
407		~~- } else if ( isset( $map[''] ) ) {~~
408		-
409		~~- if ( $ucfirst ) {~~
410		~~- $output .= str_replace( '$1', $last_upper , $map[''] );~~
411		~~- $ucfirst = false;~~
412		~~- } else {~~
413		~~- $output .= str_replace( '$1', $last_letter, $map[''] );~~
414		~~- }~~
415		~~- $i = $current_start = ++$last_match;~~
416		~~- $current = "";~~
417		-
418		~~- // Or the input if it's likely to be correct enough~~
419		~~- } else {~~
420		-
421		~~- if ( $ucfirst ) {~~
422		~~- $output .= $last_upper;~~
423		~~- $ucfirst = false;~~
424		~~- } else {~~
425		~~- $output .= $last_letter;~~
426		~~- }~~
427		~~- $i = $current_start = ++$last_match;~~
428		~~- $current = "";~~
429		~~- }~~
430		-
431		~~- // Output the previous match~~
432		~~- } else {~~
433		-
434		~~- if ( $ucfirst ) {~~
435		~~- $output .= mb_strtoupper( mb_substr( $last_trans, 0, 1 ) ).mb_substr( $last_trans, 1 );~~
436		~~- $ucfirst = false;~~
437		~~- } else {~~
438		~~- $output .= $last_trans;~~
439		~~- }~~
440		~~- $i = $current_start = ++$last_match;~~
441		~~- $last_trans = null;~~
442		~~- $current = "";~~
443		-
444		~~- }~~
445		~~- }~~
446		~~- return $output;~~
447		~~- }~~
448		-
449		- /**
450		~~- * {{#transliterate:<mapname>\|<word>[\|<format>[\|<answer>[\|<onerror>]]]}}~~
451		- *
452		~~- * Direct usage will generally be of the form {{#transliterate:<mapname>\|<word>}} while~~
453		~~- * generic templates may find the latter three parameters invaluable for easy use.~~
454		- *
455		~~- * $mapname is the name of the transliteration map to find.~~
456		~~- * $word is the string to transliterate (if the map was found)~~
457		~~- * $format is a string containing $1 to be replaced by the transliteration if the map exists~~
458		~~- * $answer allows for a user-specified transliteration to override the automatic one~~
459		~~- * $other is an error message to display if $answer is blank and an invalid map is specified~~
460		~~- */~~
461		~~- function render( &$parser, $mapname = '', $word = '', $format = '$1', $answer = '', $other = '' ) {~~
462		-
463		~~- if ( trim( $format ) == '') { // Handle the case when people use {{#transliterate:<>\|<>\|\|<>}}~~
464		~~- $format = '$1';~~
465		~~- }~~
466		-
467		~~- if ( trim( $answer ) != '') {~~
468		~~- return str_replace('$1', $answer, $format);~~
469		~~- }~~
470		-
471		~~- $prefix = wfMsg( 'transliterator-prefix' );~~
472		~~- $title = Title::newFromText( $prefix . $mapname, NS_MEDIAWIKI );~~
473		-
474		~~- if (! $title ) {~~
475		~~- return $other == '' ? str_replace("$1", "{{#transliterate:$mapname\|$word}}", $format) : $other;~~
476		~~- }~~
477		-
478		~~- $mappage = $title->getDBkey();~~
479		-
480		~~- $map = $this->getMap( $prefix, $mappage );~~
481		-
482		~~- if ( !$map ) { // False if map was not found~~
483		~~- $output = $other;~~
484		-
485		~~- } else if ( is_string( $map ) ) { // An error message~~
486		~~- $output = '<span class="transliterator error"> '.$map.' </span>';~~
487		-
488		~~- } else { // A Map~~
489		~~- $trans = UtfNormal::toNFC( $this->transliterate( Sanitizer::decodeCharReferences( $word ), $map ) );~~
490		~~- $output = str_replace( '$1', $trans, $format );~~
491		~~- }~~
492		-
493		~~- // Populate the dependency table so that we get re-rendered if the map changes.~~
494		~~- if ( isset( $this->mPages[$mappage] ) )~~
495		~~- $parser->mOutput->addTemplate( $title, $this->mPages[$mappage], null );~~
496		-
497		~~- else~~
498		~~- $parser->mOutput->addTemplate( $title, $title->getArticleID(), null );~~
499		-
500		~~- return $output;~~
501		~~- }~~
502		-
503		- /**
504		~~- * Called on ArticlePurge, ArticleDeleteComplete and NewRevisionFromEditComplete in order to purge cache~~
505		~~- */~~
506		~~- static function purgeArticle( &$article, $a=false, $b=false, $c=false, $d=false ) {~~
507		~~- return self::purgeTitle( $article->getTitle() );~~
508		~~- }~~
509		-
510		- /**
511		~~- * Called on TitleMoveComplete~~
512		~~- */~~
513		~~- static function purgeNewTitle ( &$title, &$newtitle, $a=false, $b=false, $c=false ) {~~
514		~~- return self::purgeTitle( $newtitle );~~
515		~~- }~~
516		-
517		- /**
518		~~- * Called on ArticleUndelete (and by other purge hook handlers)~~
519		~~- */~~
520		~~- static function purgeTitle( &$title, $a=false ) {~~
521		~~- global $wgMemc;~~
522		~~- if ( $title->getNamespace() == NS_MEDIAWIKI ) {~~
523		~~- $text = $title->getText();~~
524		~~- $prefix = wfMsg( 'transliterator-prefix' );~~
525		~~- if ( strpos( $text, $prefix ) === 0 ) {~~
526		~~- $wgMemc->delete( self::CACHE_PREFIX . $title->getDBkey() );~~
527		~~- $wgMemc->delete( self::CACHE_PREFIX . "__map_names__" );~~
528		~~- }~~
529		~~- }~~
530		~~- return true;~~
531		-
532		~~- }~~
533		-
534		- /**
535		~~- * Called on first use to create singleton~~
536		~~- */~~
537		~~- static function setup( &$parser ) {~~
538		~~- $trans = new ExtTransliterator;~~
539		~~- $parser->setFunctionHook( 'transliterate', array( $trans, 'render' ) );~~
540		~~- return true;~~
541		~~- }~~
542		-}
543		-
	78	+?>

Status & tagging log

22:12, 25 May 2011 😂 (talk | contribs) changed the status of r57423 [removed: new added: deferred]
22:33, 6 May 2011 MarkAHershberger (talk | contribs) changed the status of r57423 [removed: deferred added: new]
01:56, 11 December 2009 Tim Starling (talk | contribs) changed the status of r57423 [removed: new added: deferred]