r103327 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r103326‎ | r103327 | r103328 >
Date:15:12, 16 November 2011
Author:hashar
Status:ok (Comments)
Tags:
Comment:
bug 28643 improvement to serbian variants conversion

This patch is a PARTIAL merge of /branches/nikola/phase3 ::

r85224 avoid double conversion when text already uses the correct variant
r85239 minor fixes to previous
r85308 documentation (@since 1.18 update to 1.19)
r101359 guessVariant doc + boolean typecast
r101369 tests
r103131 additional test

Test plan:
==========
$ ./phpunit.php --filter LanguageSr
PHPUnit 3.6.3 by Sebastian Bergmann.

Configuration read from /srv/trunk/tests/phpunit/suite.xml

.....

Time: 1 second, Memory: 78.50Mb

OK (5 tests, 19 assertions)
$
Modified paths:
  • /trunk/phase3/RELEASE-NOTES-1.19 (modified) (history)
  • /trunk/phase3/languages/LanguageConverter.php (modified) (history)
  • /trunk/phase3/languages/classes/LanguageSr.php (modified) (history)
  • /trunk/phase3/tests/phpunit/languages/LanguageSrTest.php (added) (history)

Diff [purge]

Index: trunk/phase3/tests/phpunit/languages/LanguageSrTest.php
@@ -0,0 +1,165 @@
 2+<?php
 3+/**
 4+ * PHPUnit tests for the Serbian language.
 5+ * The language can be represented using two scripts:
 6+ * - Latin (SR_el)
 7+ * - Cyrillic (SR_ec)
 8+ * Both representations seem to be bijective, hence MediaWiki can convert
 9+ * from one script to the other.
 10+ *
 11+ * @author Antoine Musso <hashar at free dot fr>
 12+ * @copyright Copyright © 2011, Antoine Musso <hashar at free dot fr>
 13+ * @file
 14+ */
 15+
 16+require_once dirname(dirname(__FILE__)). '/bootstrap.php';
 17+
 18+/** Tests for MediaWiki languages/classes/LanguageSr.php */
 19+class LanguageSrTest extends MediaWikiTestCase {
 20+ /* Language object. Initialized before each test */
 21+ private $lang;
 22+
 23+ function setUp() {
 24+ $this->lang = Language::factory( 'Sr' );
 25+ }
 26+ function tearDown() {
 27+ unset( $this->lang );
 28+ }
 29+
 30+ ##### TESTS #######################################################
 31+
 32+ function testEasyConversions( ) {
 33+ $this->assertCyrillic(
 34+ 'шђчћжШЂЧЋЖ',
 35+ 'Cyrillic guessing characters'
 36+ );
 37+ $this->assertLatin(
 38+ 'šđč枊ĐČĆŽ',
 39+ 'Latin guessing characters'
 40+ );
 41+ }
 42+
 43+ function testMixedConversions() {
 44+ $this->assertCyrillic(
 45+ 'шђчћжШЂЧЋЖ - šđčćž',
 46+ 'Mostly cyrillic characters'
 47+ );
 48+ $this->assertLatin(
 49+ 'šđč枊ĐČĆŽ - шђчћж',
 50+ 'Mostly latin characters'
 51+ );
 52+ }
 53+
 54+ function testSameAmountOfLatinAndCyrillicGetConverted() {
 55+ $this->assertConverted(
 56+ '4 latin: šđčć | 4 cyrillic: шђчћ',
 57+ 'sr-ec'
 58+ );
 59+ $this->assertConverted(
 60+ '4 latin: šđčć | 4 cyrillic: шђчћ',
 61+ 'sr-el'
 62+ );
 63+ }
 64+
 65+ /**
 66+ * @author Nikola Smolenski
 67+ */
 68+ function testConversionToCyrillic() {
 69+ $this->assertEquals( 'абвг',
 70+ $this->convertToCyrillic( 'abvg' )
 71+ );
 72+ $this->assertEquals( 'абвг',
 73+ $this->convertToCyrillic( 'абвг' )
 74+ );
 75+ $this->assertEquals( 'abvgшђжчћ',
 76+ $this->convertToCyrillic( 'abvgшђжчћ' )
 77+ );
 78+ $this->assertEquals( 'абвгшђжчћ',
 79+ $this->convertToCyrillic( 'абвгšđžčć' )
 80+ );
 81+ //Roman numerals are not converted
 82+ $this->assertEquals( 'а I б II в III г IV шђжчћ',
 83+ $this->convertToCyrillic( 'a I b II v III g IV šđžčć' )
 84+ );
 85+ }
 86+
 87+ function testConversionToLatin() {
 88+ $this->assertEquals( 'abcd',
 89+ $this->convertToLatin( 'abcd' )
 90+ );
 91+ $this->assertEquals( 'abcd',
 92+ $this->convertToLatin( 'абцд' )
 93+ );
 94+ $this->assertEquals( 'abcdšđžčć',
 95+ $this->convertToLatin( 'abcdшђжчћ' )
 96+ );
 97+ $this->assertEquals( 'абцдšđžčć',
 98+ $this->convertToLatin( 'абцдšđžčć' )
 99+ );
 100+
 101+ }
 102+
 103+ ##### HELPERS #####################################################
 104+ /**
 105+ * Wrapper to verify text stays the same after applying conversion
 106+ * @param $text string Text to convert
 107+ * @param $variant string Language variant 'sr-ec' or 'sr-el'
 108+ * @param $msg string Optional message
 109+ */
 110+ function assertUnConverted( $text, $variant, $msg = '' ) {
 111+ $this->assertEquals(
 112+ $text,
 113+ $this->convertTo( $text, $variant ),
 114+ $msg
 115+ );
 116+ }
 117+ /**
 118+ * Wrapper to verify a text is different once converted to a variant.
 119+ * @param $text string Text to convert
 120+ * @param $variant string Language variant 'sr-ec' or 'sr-el'
 121+ * @param $msg string Optional message
 122+ */
 123+ function assertConverted( $text, $variant, $msg = '' ) {
 124+ $this->assertNotEquals(
 125+ $text,
 126+ $this->convertTo( $text, $variant ),
 127+ $msg
 128+ );
 129+ }
 130+
 131+ /**
 132+ * Verify the given Cyrillic text is not converted when using
 133+ * the Cyrillic variant and converted to Latin when using
 134+ * the Latin variant.
 135+ */
 136+ function assertCyrillic( $text, $msg = '' ) {
 137+ $this->assertUnConverted( $text, 'sr-ec', $msg );
 138+ $this->assertConverted( $text, 'sr-el', $msg );
 139+ }
 140+ /**
 141+ * Verify the given Latin text is not converted when using
 142+ * the Latin variant and converted to Cyrillic when using
 143+ * the Cyrillic variant.
 144+ */
 145+ function assertLatin( $text, $msg = '' ) {
 146+ $this->assertUnConverted( $text, 'sr-el', $msg );
 147+ $this->assertConverted( $text, 'sr-ec', $msg );
 148+ }
 149+
 150+
 151+ /** Wrapper for the language converter's convertTo() method */
 152+ function convertTo( $text, $variant ) {
 153+ return $this
 154+ ->lang
 155+ ->mConverter
 156+ ->convertTo(
 157+ $text, $variant
 158+ );
 159+ }
 160+ function convertToCyrillic( $text ) {
 161+ return $this->convertTo( $text, 'sr-ec' );
 162+ }
 163+ function convertToLatin( $text ) {
 164+ return $this->convertTo( $text, 'sr-el' );
 165+ }
 166+}
Property changes on: trunk/phase3/tests/phpunit/languages/LanguageSrTest.php
___________________________________________________________________
Added: svn:eol-style
1167 + native
Index: trunk/phase3/RELEASE-NOTES-1.19
@@ -166,6 +166,7 @@
167167 * (bug 30217) Make pt-br a fallback of pt.
168168 * (bug 31193) Set fallback language of Assamese from Bengali to English.
169169 * Update date format for dsb and hsb: month names need the genitive.
 170+* (bug 28643) Serbian variant conversion improvements (Nikola Smolenski)
170171
171172 === Other changes in 1.19 ===
172173 * jquery.mwPrototypes module was renamed to jquery.mwExtension.
Index: trunk/phase3/languages/classes/LanguageSr.php
@@ -173,6 +173,32 @@
174174
175175 return $ret;
176176 }
 177+
 178+ /**
 179+ * Guess if a text is written in Cyrillic or Latin.
 180+ * Overrides LanguageConverter::guessVariant()
 181+ *
 182+ * @param string $text The text to be checked
 183+ * @param string $variant Language code of the variant to be checked for
 184+ * @return bool true if $text appears to be written in $variant
 185+ *
 186+ * @author Nikola Smolenski <smolensk@eunet.rs>
 187+ * @since 1.19
 188+ */
 189+ public function guessVariant( $text, $variant ) {
 190+ $numCyrillic = preg_match_all("/[шђчћжШЂЧЋЖ]/u", $text, $dummy);
 191+ $numLatin = preg_match_all("/[šđč枊ĐČĆŽ]/u", $text, $dummy);
 192+
 193+ if( $variant == 'sr-ec' ) {
 194+ return (boolean) ($numCyrillic > $numLatin);
 195+ } else if( $variant == 'sr-el' ) {
 196+ return (boolean) ($numLatin > $numCyrillic);
 197+ } else {
 198+ return false;
 199+ }
 200+
 201+ }
 202+
177203 }
178204
179205 /**
Index: trunk/phase3/languages/LanguageConverter.php
@@ -322,6 +322,10 @@
323323 }
324324 }
325325
 326+ if( $this->guessVariant( $text, $toVariant ) ) {
 327+ return $text;
 328+ }
 329+
326330 /* we convert everything except:
327331 1. HTML markups (anything between < and >)
328332 2. HTML entities
@@ -571,7 +575,7 @@
572576 */
573577 public function convertTo( $text, $variant ) {
574578 global $wgDisableLangConversion;
575 - if ( $wgDisableLangConversion ) {
 579+ if ( $wgDisableLangConversion || $this->guessVariant( $text, $variant ) ) {
576580 return $text;
577581 }
578582 return $this->recursiveConvertTopLevel( $text, $variant );
@@ -773,6 +777,20 @@
774778 }
775779
776780 /**
 781+ * Guess if a text is written in a variant. This should be implemented in subclasses.
 782+ *
 783+ * @param string $text the text to be checked
 784+ * @param string $variant language code of the variant to be checked for
 785+ * @return bool true if $text appears to be written in $variant, false if not
 786+ *
 787+ * @author Nikola Smolenski <smolensk@eunet.rs>
 788+ * @since 1.19
 789+ */
 790+ public function guessVariant($text, $variant) {
 791+ return false;
 792+ }
 793+
 794+ /**
777795 * Load default conversion tables.
778796 * This method must be implemented in derived class.
779797 *
Property changes on: trunk/phase3/languages/LanguageConverter.php
___________________________________________________________________
Added: svn:mergeinfo
780798 Merged /branches/new-installer/phase3/languages/LanguageConverter.php:r43664-66004
781799 Merged /branches/REL1_15/phase3/languages/LanguageConverter.php:r51646
782800 Merged /branches/REL1_17/phase3/languages/LanguageConverter.php:r81445,81448
783801 Merged /branches/nikola/phase3/languages/LanguageConverter.php:r85106-103326
784802 Merged /branches/sqlite/languages/LanguageConverter.php:r58211-58321

Follow-up revisions

RevisionCommit summaryAuthorDate
r103924Even if guesses, they should profile out :)...platonides17:19, 22 November 2011

Past revisions this follows-up on

RevisionCommit summaryAuthorDate
r85224Don't convert text to a variant, if it is already written in that variant (to...nikola02:25, 3 April 2011
r85239Minor fixesnikola10:27, 3 April 2011
r85308@since 1.17 → @since 1.18nikola03:07, 4 April 2011
r86623(bug 28643) Merge Serbian language variant conversion improvements to trunk (...demon14:02, 21 April 2011
r101359guessVariant doc and explicit boolean typecasthashar14:51, 31 October 2011
r101369(bug 28643) tests for Serbian script conversions...hashar17:30, 31 October 2011
r103131Adding test for roman numeralsnikola06:18, 15 November 2011

Comments

#Comment by SPQRobin (talk | contribs)   02:31, 9 March 2012

guessVariant does not work well, see bug 35076

Status & tagging log