r110212 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r110211‎ | r110212 | r110213 >
Date:16:49, 28 January 2012
Author:nikerabbit
Status:ok
Tags:i18nreview, miscextensions 
Comment:
Core of the ported translation memory
Modified paths:
  • /trunk/extensions/Translate/Translate.php (modified) (history)
  • /trunk/extensions/Translate/_autoload.php (modified) (history)
  • /trunk/extensions/Translate/utils/TTMServer.php (added) (history)

Diff [purge]

Index: trunk/extensions/Translate/Translate.php
@@ -8,14 +8,14 @@
99 *
1010 * @author Niklas Laxström
1111 * @author Siebrand Mazeland
12 - * @copyright Copyright © 2006-2011, Niklas Laxström, Siebrand Mazeland
 12+ * @copyright Copyright © 2006-2012, Niklas Laxström, Siebrand Mazeland
1313 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
1414 */
1515
1616 /**
1717 * Version number used in extension credits and in other places where needed.
1818 */
19 -define( 'TRANSLATE_VERSION', '2012-01-13' );
 19+define( 'TRANSLATE_VERSION', '2012-01-28' );
2020
2121 /**
2222 * Extension credits properties.
Index: trunk/extensions/Translate/_autoload.php
@@ -121,7 +121,9 @@
122122 $wgAutoloadClasses['RevTag'] = $dir . 'utils/RevTag.php';
123123
124124 $wgAutoloadClasses['MessageHandle'] = $dir . 'utils/MessageHandle.php';
 125+$wgAutoloadClasses['TTMServer'] = $dir . 'utils/TTMServer.php';
125126
 127+
126128 /**@}*/
127129
128130 /**
Index: trunk/extensions/Translate/utils/TTMServer.php
@@ -0,0 +1,337 @@
 2+<?php
 3+/**
 4+ * TTMServer - The dead simple translation memory
 5+ *
 6+ * @file
 7+ * @author Niklas Laxström
 8+ * @copyright Copyright © 2012, Niklas Laxström
 9+ * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 10+ */
 11+
/**
 * Contract for a translation memory server: it can store source texts,
 * record translations for them, and suggest translations for a given text.
 *
 * @since 2012-01-28
 */
interface iTTMServer {

	/**
	 * Adds a new source message in the database. Note that update() does this
	 * for you automatically.
	 *
	 * @param $context Title: title of the source page
	 * @param $sourceLanguage String: language code for the provided text
	 * @param $text String: the source text to add.
	 * @return Integer: sid (source id)
	 */
	public function insertSource( Title $context, $sourceLanguage, $text );

	/**
	 * Shovels the new translation into translation memory.
	 *
	 * @param $handle MessageHandle: handle of the translated message
	 * @param $targetText String: the translation text to store
	 * @return Bool: success or failure
	 */
	public function update( MessageHandle $handle, $targetText );

	/**
	 * Fetches all relevant suggestions for given text.
	 *
	 * @param $sourceLanguage String: language code for the provided text
	 * @param $targetLanguage String: language code for the suggestions
	 * @param $text String: the text for which to search suggestions
	 * @return List: unordered suggestions, which each has fields:
	 * - source: String: the original text of the suggestion
	 * - target: String: the suggestion
	 * - context: String: title of the page where the suggestion comes from
	 * - quality: Float: the quality of suggestion, 1 is perfect match
	 */
	public function query( $sourceLanguage, $targetLanguage, $text );
}
 49+
/**
 * TTMServer is the simple translation memory that is just good enough for us.
 *
 * Tables and columns touched by this class:
 * - translate_tms: source texts (tms_sid, tms_lang, tms_len, tms_text, tms_context)
 * - translate_tmt: translations (tmt_sid, tmt_lang, tmt_text)
 * - translate_tmf: fulltext tokens of the source texts (tmf_sid, tmf_text)
 *
 * Configuration keys read from $config:
 * - database: passed to wfGetDB() as the wiki/database identifier
 * - cutoff: minimum quality for a suggestion to be returned
 *
 * @since 2012-01-28
 */
class TTMServer implements iTTMServer {
	/// Array: configuration as given to the constructor
	protected $config;

	/**
	 * @param $config Array: must provide the keys 'database' and 'cutoff'
	 */
	public function __construct( $config ) {
		$this->config = $config;
	}

	/**
	 * Returns a server instance, useful for chaining.
	 * A real TTMServer is returned when
	 * $wgTranslateTranslationServices['TTMServer'] is set, otherwise a
	 * FakeTTMServer which does nothing.
	 * @return iTTMServer
	 */
	public static function primary() {
		global $wgTranslateTranslationServices;
		if ( isset( $wgTranslateTranslationServices['TTMServer'] ) ) {
			return new TTMServer( $wgTranslateTranslationServices['TTMServer'] );
		} else {
			return new FakeTTMServer();
		}
	}

	/**
	 * Returns a connection to the configured translation memory database.
	 * @param $mode Integer: DB_SLAVE or DB_MASTER
	 * @return Database connection as returned by wfGetDB()
	 */
	public function getDB( $mode = DB_SLAVE ) {
		return wfGetDB( $mode, 'ttmserver', $this->config['database'] );
	}

	/**
	 * Stores the translation for the given message, replacing any earlier
	 * translation of the same message in the same language. The source
	 * text is added on demand if it is not yet known.
	 *
	 * @param $handle MessageHandle
	 * @param $targetText String
	 * @return Bool: true if the translation was stored, false if skipped
	 */
	public function update( MessageHandle $handle, $targetText ) {
		if ( !$handle->isValid() || $handle->getCode() === '' ) {
			return false;
		}

		$mkey = $handle->getKey();
		$group = $handle->getGroup();
		$targetLanguage = $handle->getCode();
		$sourceLanguage = $group->getSourceLanguage();
		$title = $handle->getTitle();

		// Skip definitions to not slow down mass imports etc.
		// These will be added when the first translation is made
		if ( $targetLanguage === $sourceLanguage ) {
			return false;
		}

		$definition = $group->getMessage( $mkey, $sourceLanguage );
		if ( !is_string( $definition ) || !strlen( trim( $definition ) ) ) {
			return false;
		}

		$dbw = $this->getDB( DB_MASTER );
		/* Check that the definition exists and fetch the sid. If not, add
		 * the definition and retrieve the sid. */
		$conds = array( 'tms_context' => $title->getPrefixedText() );
		$sid = $dbw->selectField( 'translate_tms', 'tms_sid', $conds, __METHOD__ );
		if ( $sid === false ) {
			$sid = $this->insertSource( $title, $sourceLanguage, $definition );
		}

		// Delete old translations for this message
		$deleteConds = array(
			'tmt_sid' => $sid,
			'tmt_lang' => $targetLanguage,
		);
		$dbw->delete( 'translate_tmt', $deleteConds, __METHOD__ );

		// Insert the new translation. The text column must be tmt_text,
		// which is the column query() reads the suggestions from.
		$row = $deleteConds + array(
			'tmt_text' => $targetText,
		);

		// insert() takes ( table, row, caller ). The old row was deleted
		// above, so a plain insert is enough and no unique index list
		// (as replace() would need) is passed.
		$dbw->insert( 'translate_tmt', $row, __METHOD__ );

		return true;
	}

	/**
	 * Adds a source text to translate_tms and, when the text yields enough
	 * useful tokens, its fulltext tokens to translate_tmf.
	 *
	 * @param $context Title: title of the source page
	 * @param $sourceLanguage String: language code of the text
	 * @param $text String: the source text
	 * @return Integer: sid (source id) of the inserted row
	 */
	public function insertSource( Title $context, $sourceLanguage, $text ) {
		$row = array(
			'tms_lang' => $sourceLanguage,
			'tms_len' => mb_strlen( $text ),
			'tms_text' => $text,
			'tms_context' => $context->getPrefixedText(),
		);

		$dbw = $this->getDB( DB_MASTER );
		$dbw->insert( 'translate_tms', $row, __METHOD__ );
		$sid = $dbw->insertId();

		// Fulltext
		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( count( $fulltext ) ) {
			$row = array(
				'tmf_sid' => $sid,
				'tmf_text' => implode( ' ', $fulltext ),
			);
			$dbw->insert( 'translate_tmf', $row, __METHOD__ );
		}

		return $sid;
	}

	/**
	 * Fetches suggestions whose source text is similar enough to $text.
	 * Candidates are narrowed down in the database by source length and,
	 * when the text has enough tokens, by a fulltext match. The final
	 * quality filtering and ordering happens in processQueryResults().
	 *
	 * @param $sourceLanguage String: language code for the provided text
	 * @param $targetLanguage String: language code for the suggestions
	 * @param $text String: the text for which to search suggestions
	 * @return Array: suggestions, best quality first
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		// Calculate the bounds of the string length which are able
		// to satisfy the cutoff percentage in edit distance.
		// NOTE(review): assumes the configured cutoff is greater than
		// zero; a zero cutoff would divide by zero here.
		$len = mb_strlen( $text );
		$min = (int)ceil( max( $len * $this->config['cutoff'], 2 ) );
		$max = (int)floor( $len / $this->config['cutoff'] );

		$dbr = $this->getDB( DB_SLAVE );
		$tables = array( 'translate_tmt', 'translate_tms' );
		$fields = array( 'tms_context', 'tms_text', 'tmt_lang', 'tmt_text' );
		$conds = array(
			'tms_lang' => $sourceLanguage,
			'tmt_lang' => $targetLanguage,
			"tms_len BETWEEN $min AND $max",
			'tms_sid = tmt_sid',
		);

		// Use the fulltext index to narrow the results further
		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( $fulltext ) {
			$tables[] = 'translate_tmf';
			// Let the database layer quote the list instead of
			// building the string literal by hand.
			$list = $dbr->addQuotes( implode( ' ', $fulltext ) );
			$conds[] = 'tmf_sid = tmt_sid';
			$conds[] = "MATCH(tmf_text) AGAINST( $list )";
		}

		$res = $dbr->select( $tables, $fields, $conds, __METHOD__ );
		return $this->processQueryResults( $res, $text );
	}

	/**
	 * Scores each candidate row against $text, drops those below the
	 * configured cutoff and sorts the rest by descending quality.
	 * Quality is 1 - ( edit distance / length of the shorter text ),
	 * with both measured in characters.
	 *
	 * @param $res Iterable of rows with tms_text, tmt_text and tms_context
	 * @param $text String: the text the suggestions are searched for
	 * @return Array: list of suggestions, see iTTMServer::query()
	 */
	protected function processQueryResults( $res, $text ) {
		$results = array();
		$textLen = mb_strlen( $text );

		foreach ( $res as $row ) {
			$candidate = $row->tms_text;
			$len = min( $textLen, mb_strlen( $candidate ) );
			if ( $len === 0 ) {
				// Nothing to compare against, and the quality
				// formula would divide by zero.
				continue;
			}

			$dist = self::editDistance( $text, $candidate );
			$quality = 1 - ( $dist / $len );

			if ( $quality >= $this->config['cutoff'] ) {
				$results[] = array(
					'source' => $row->tms_text,
					'target' => $row->tmt_text,
					'context' => $row->tms_context,
					'quality' => $quality,
				);
			}
		}
		usort( $results, array( $this, 'qualitySort' ) );
		return $results;
	}

	/**
	 * Returns the edit distance between two strings in characters.
	 * The native levenshtein() works on bytes and is limited to 255 of
	 * them, so it is only used for short single-byte strings; everything
	 * else goes through the multibyte aware levenshtein_php().
	 *
	 * @param $a String
	 * @param $b String
	 * @return Integer
	 */
	protected static function editDistance( $a, $b ) {
		$bytesA = strlen( $a );
		$bytesB = strlen( $b );
		$singleByte = $bytesA === mb_strlen( $a ) && $bytesB === mb_strlen( $b );

		if ( $singleByte && $bytesA <= 255 && $bytesB <= 255 ) {
			return levenshtein( $a, $b );
		}
		return self::levenshtein_php( $a, $b );
	}

	/**
	 * usort() callback which orders suggestions by descending quality.
	 * @param $a Array: suggestion with the key 'quality'
	 * @param $b Array: suggestion with the key 'quality'
	 * @return Integer: -1, 0 or 1
	 */
	protected function qualitySort( $a, $b ) {
		list( $c, $d ) = array( $a['quality'], $b['quality'] );
		if ( $c === $d ) {
			return 0;
		}
		// Descending sort
		return ( $c > $d ) ? -1 : 1;
	}

	/**
	 * Tokenizes the text for fulltext search.
	 * Tries to find the most useful tokens: texts with fewer than four
	 * words produce no tokens at all, and of the rest only words of
	 * 4 to 15 bytes are kept, deduplicated and limited to the first ten.
	 *
	 * @param $language String: language code used for word segmentation
	 * @param $input String: the text to tokenize
	 * @return Array: list of at most ten tokens, possibly empty
	 */
	protected function filterForFulltext( $language, $input ) {
		$lang = Language::factory( $language );

		$text = preg_replace( '/[^[:alnum:]]/u', ' ', $input );
		$text = $lang->segmentByWord( $text );
		$text = $lang->lc( $text );
		$segments = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
		if ( count( $segments ) < 4 ) {
			return array();
		}

		foreach ( $segments as $i => $segment ) {
			// Yes strlen: the limits are deliberately in bytes
			$len = strlen( $segment );
			if ( $len < 4 || $len > 15 ) {
				unset( $segments[$i] );
			}
		}

		$segments = array_unique( $segments );
		$segments = array_slice( $segments, 0, 10 );
		return $segments;
	}

	/**
	 * Multibyte aware Levenshtein distance without length limits.
	 * Based on an implementation from the PHP manual comments; needed
	 * because the native levenshtein() is limited to 255 bytes.
	 *
	 * @param $str1 String
	 * @param $str2 String
	 * @return Integer: number of character insertions, deletions and
	 *   substitutions needed to turn one string into the other
	 */
	public static function levenshtein_php( $str1, $str2 ) {
		$len1 = mb_strlen( $str1 );
		$len2 = mb_strlen( $str2 );

		// Strip the common prefix. The bounds are checked before each
		// comparison so that the lengths can never go below zero.
		$prefix = 0;
		$limit = min( $len1, $len2 );
		while ( $prefix < $limit
			&& mb_substr( $str1, $prefix, 1 ) === mb_substr( $str2, $prefix, 1 )
		) {
			$prefix++;
		}
		if ( $prefix > 0 ) {
			$str1 = mb_substr( $str1, $prefix );
			$str2 = mb_substr( $str2, $prefix );
			$len1 -= $prefix;
			$len2 -= $prefix;
		}

		// Strip the common suffix
		while ( $len1 > 0 && $len2 > 0
			&& mb_substr( $str1, $len1 - 1, 1 ) === mb_substr( $str2, $len2 - 1, 1 )
		) {
			$len1--;
			$len2--;
		}
		$str1 = mb_substr( $str1, 0, $len1 );
		$str2 = mb_substr( $str2, 0, $len2 );

		// One side fully consumed: the rest of the other side must be
		// inserted. This also covers identical strings (both zero).
		if ( $len1 === 0 ) {
			return $len2;
		}
		if ( $len2 === 0 ) {
			return $len1;
		}

		// Split the first string into characters once instead of
		// calling mb_substr on it for every cell of the matrix.
		$chars1 = array();
		for ( $j = 0; $j < $len1; $j++ ) {
			$chars1[$j] = mb_substr( $str1, $j, 1 );
		}

		// Two-row dynamic programming: $v0 is the previous row and
		// $v1 the row being filled.
		$v0 = range( 0, $len1 );
		$v1 = array();

		for ( $i = 1; $i <= $len2; $i++ ) {
			$v1[0] = $i;
			$char2 = mb_substr( $str2, $i - 1, 1 );

			for ( $j = 1; $j <= $len1; $j++ ) {
				$cost = ( $chars1[$j - 1] === $char2 ) ? 0 : 1;
				$v1[$j] = min(
					$v0[$j] + 1,
					$v1[$j - 1] + 1,
					$v0[$j - 1] + $cost
				);
			}

			$vTmp = $v0;
			$v0 = $v1;
			$v1 = $vTmp;
		}

		return $v0[$len1];
	}

}
 319+
/**
 * Do-nothing stand-in for TTMServer, used when the translation memory is
 * not configured. Thanks to it callers can write
 * TTMServer::primary()->update( ... );
 * without checking first whether the service is enabled.
 * @since 2012-01-28
 */
class FakeTTMServer implements iTTMServer {
	/**
	 * Stores nothing.
	 * @param $context Title: ignored
	 * @param $sourceLanguage String: ignored
	 * @param $text String: ignored
	 * @return Bool: always false
	 */
	public function insertSource( Title $context, $sourceLanguage, $text ) {
		return false;
	}

	/**
	 * Stores nothing.
	 * @param $handle MessageHandle: ignored
	 * @param $targetText String: ignored
	 * @return Bool: always false
	 */
	public function update( MessageHandle $handle, $targetText ) {
		return false;
	}

	/**
	 * Finds nothing.
	 * @param $sourceLanguage String: ignored
	 * @param $targetLanguage String: ignored
	 * @param $text String: ignored
	 * @return Array: always an empty list
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		return array();
	}
}
Property changes on: trunk/extensions/Translate/utils/TTMServer.php
___________________________________________________________________
Added: svn:eol-style
1339 + native

Status & tagging log