Index: trunk/extensions/Translate/Translate.php |
— | — | @@ -8,14 +8,14 @@ |
9 | 9 | * |
10 | 10 | * @author Niklas Laxström |
11 | 11 | * @author Siebrand Mazeland |
12 | | - * @copyright Copyright © 2006-2011, Niklas Laxström, Siebrand Mazeland |
| 12 | + * @copyright Copyright © 2006-2012, Niklas Laxström, Siebrand Mazeland |
13 | 13 | * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later |
14 | 14 | */ |
15 | 15 | |
16 | 16 | /** |
17 | 17 | * Version number used in extension credits and in other places where needed. |
18 | 18 | */ |
19 | | -define( 'TRANSLATE_VERSION', '2012-01-13' ); |
| 19 | +define( 'TRANSLATE_VERSION', '2012-01-28' ); |
20 | 20 | |
21 | 21 | /** |
22 | 22 | * Extension credits properties. |
Index: trunk/extensions/Translate/_autoload.php |
— | — | @@ -121,7 +121,9 @@ |
122 | 122 | $wgAutoloadClasses['RevTag'] = $dir . 'utils/RevTag.php'; |
123 | 123 | |
124 | 124 | $wgAutoloadClasses['MessageHandle'] = $dir . 'utils/MessageHandle.php'; |
| 125 | +$wgAutoloadClasses['TTMServer'] = $dir . 'utils/TTMServer.php'; |
125 | 126 | |
| 127 | + |
126 | 128 | /**@}*/ |
127 | 129 | |
128 | 130 | /** |
Index: trunk/extensions/Translate/utils/TTMServer.php |
— | — | @@ -0,0 +1,337 @@ |
| 2 | +<?php |
| 3 | +/** |
| 4 | + * TTMServer - The dead simple translation memory |
| 5 | + * |
| 6 | + * @file |
| 7 | + * @author Niklas Laxström |
| 8 | + * @copyright Copyright © 2012, Niklas Laxström |
| 9 | + * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later |
| 10 | + */ |
| 11 | + |
/// @since 2012-01-28
interface iTTMServer {

	/**
	 * Stores a new source message in the translation memory.
	 * Implementations of update() are expected to call this on demand,
	 * so callers normally do not need to do it themselves.
	 *
	 * @param $context Title: title of the source page
	 * @param $sourceLanguage String: language code of the provided text
	 * @param $text String: the source text to add
	 * @return Integer: sid (source id)
	 */
	public function insertSource( Title $context, $sourceLanguage, $text );

	/**
	 * Stores a new translation in the translation memory.
	 *
	 * @param $handle MessageHandle
	 * @param $targetText String
	 * @return Bool: success or failure
	 */
	public function update( MessageHandle $handle, $targetText );

	/**
	 * Fetches all relevant suggestions for the given text.
	 *
	 * @param $sourceLanguage String: language code of the provided text
	 * @param $targetLanguage String: language code for the suggestions
	 * @param $text String: the text for which to search suggestions
	 * @return List: unordered suggestions, each of which has the fields:
	 * - source: String: the original text of the suggestion
	 * - target: String: the suggestion
	 * - context: String: title of the page where the suggestion comes from
	 * - quality: Float: the quality of the suggestion, 1 is a perfect match
	 */
	public function query( $sourceLanguage, $targetLanguage, $text );
}
| 49 | + |
/**
 * TTMServer is the simple translation memory that is just good enough for us.
 *
 * Source texts are stored in translate_tms, translations in translate_tmt and
 * the fulltext tokens of the source texts in translate_tmf.
 * @since 2012-01-28
 */
class TTMServer implements iTTMServer {
	/// Array: configuration; this class reads the keys 'database' and 'cutoff'.
	protected $config;

	/**
	 * @param $config Array: configuration, see $config.
	 */
	public function __construct( $config ) {
		$this->config = $config;
	}

	/**
	 * Returns a server instance, useful for chaining.
	 * A FakeTTMServer is returned when no 'TTMServer' entry is present in
	 * $wgTranslateTranslationServices.
	 * @return iTTMServer
	 */
	public static function primary() {
		global $wgTranslateTranslationServices;
		if ( isset( $wgTranslateTranslationServices['TTMServer'] ) ) {
			return new TTMServer( $wgTranslateTranslationServices['TTMServer'] );
		} else {
			return new FakeTTMServer();
		}
	}

	/**
	 * Returns a database connection for the configured database.
	 * @param $mode Integer: DB_SLAVE or DB_MASTER
	 * @return Database connection as returned by wfGetDB()
	 */
	public function getDB( $mode = DB_SLAVE ) {
		return wfGetDB( $mode, 'ttmserver', $this->config['database'] );
	}

	/**
	 * Shovels the new translation into translation memory, replacing any
	 * earlier translation of the same message in the same language.
	 *
	 * @param $handle MessageHandle
	 * @param $targetText String
	 * @return Bool: false if the handle is invalid or has no language code,
	 * if the translation is in the source language or if the definition is
	 * empty; true once the translation has been stored.
	 */
	public function update( MessageHandle $handle, $targetText ) {
		if ( !$handle->isValid() || $handle->getCode() === '' ) {
			return false;
		}

		$mkey = $handle->getKey();
		$group = $handle->getGroup();
		$targetLanguage = $handle->getCode();
		$sourceLanguage = $group->getSourceLanguage();
		$title = $handle->getTitle();

		// Skip definitions to not slow down mass imports etc.
		// These will be added when the first translation is made
		if ( $targetLanguage === $sourceLanguage ) {
			return false;
		}

		$definition = $group->getMessage( $mkey, $sourceLanguage );
		if ( !is_string( $definition ) || !strlen( trim( $definition ) ) ) {
			return false;
		}

		$dbw = $this->getDB( DB_MASTER );
		/* Check that the definition exists and fetch the sid. If not, add
		 * the definition and retrieve the sid. */
		$conds = array( 'tms_context' => $title->getPrefixedText() );
		$sid = $dbw->selectField( 'translate_tms', 'tms_sid', $conds, __METHOD__ );
		if ( $sid === false ) {
			$sid = $this->insertSource( $title, $sourceLanguage, $definition );
		}

		// Delete old translations for this message
		$deleteConds = array(
			'tmt_sid' => $sid,
			'tmt_lang' => $targetLanguage,
		);
		$dbw->delete( 'translate_tmt', $deleteConds, __METHOD__ );

		// Insert the new translation. The column is tmt_text, which is what
		// query() reads back. The old row was deleted above, so a plain
		// insert with the ( table, row, caller ) signature is sufficient.
		$row = $deleteConds + array(
			'tmt_text' => $targetText,
		);
		$dbw->insert( 'translate_tmt', $row, __METHOD__ );

		return true;
	}

	/**
	 * Adds a new source message in the database, together with its fulltext
	 * tokens when filterForFulltext() produces any.
	 *
	 * @param $context Title: title of the source page
	 * @param $sourceLanguage String: language code of the provided text
	 * @param $text String: the source text to add.
	 * @return Integer: sid (source id)
	 */
	public function insertSource( Title $context, $sourceLanguage, $text ) {
		$row = array(
			'tms_lang' => $sourceLanguage,
			'tms_len' => mb_strlen( $text ),
			'tms_text' => $text,
			'tms_context' => $context->getPrefixedText(),
		);

		$dbw = $this->getDB( DB_MASTER );
		$dbw->insert( 'translate_tms', $row, __METHOD__ );
		$sid = $dbw->insertId();

		// Fulltext
		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( count( $fulltext ) ) {
			$row = array(
				'tmf_sid' => $sid,
				'tmf_text' => implode( ' ', $fulltext ),
			);
			$dbw->insert( 'translate_tmf', $row, __METHOD__ );
		}

		return $sid;
	}

	/**
	 * Fetches all relevant suggestions for given text.
	 *
	 * @param $sourceLanguage String: language code of the provided text
	 * @param $targetLanguage String: language code for the suggestions
	 * @param $text String: the text for which to search suggestions
	 * @return List: suggestions with the fields source, target, context and
	 * quality, sorted by descending quality.
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		// Calculate the bounds of the string length which are able
		// to satisfy the cutoff percentage in edit distance.
		$len = mb_strlen( $text );
		$cutoff = $this->config['cutoff'];
		// Cast to integers so that only plain numbers end up in the SQL.
		$min = (int)ceil( max( $len * $cutoff, 2 ) );
		$max = (int)floor( $len / $cutoff );

		$dbr = $this->getDB( DB_SLAVE );
		$tables = array( 'translate_tmt', 'translate_tms' );
		$fields = array( 'tms_context', 'tms_text', 'tmt_lang', 'tmt_text' );
		$conds = array(
			'tms_lang' => $sourceLanguage,
			'tmt_lang' => $targetLanguage,
			"tms_len BETWEEN $min AND $max",
			'tms_sid = tmt_sid',
		);

		// Use the fulltext index to narrow the results further when the
		// text yields usable tokens.
		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( $fulltext ) {
			$tables[] = 'translate_tmf';
			$list = implode( ' ', $fulltext );
			$conds[] = 'tmf_sid = tmt_sid';
			// Let the database layer quote the token list instead of
			// splicing it between hand-written quotes.
			$conds[] = 'MATCH(tmf_text) AGAINST( ' . $dbr->addQuotes( $list ) . ' )';
		}

		$res = $dbr->select( $tables, $fields, $conds, __METHOD__ );
		return $this->processQueryResults( $res, $text );
	}

	/**
	 * Scores each candidate row against the queried text and keeps those
	 * whose quality reaches the configured cutoff.
	 *
	 * Quality is 1 - ( edit distance / length of the shorter text ).
	 *
	 * @param $res Iterable of rows with tms_text, tmt_text and tms_context
	 * @param $text String: the text that was queried
	 * @return List: suggestions sorted by descending quality
	 */
	protected function processQueryResults( $res, $text ) {
		$results = array();
		foreach ( $res as $row ) {
			$a = $text;
			$b = $row->tms_text;
			$len = min( mb_strlen( $a ), mb_strlen( $b ) );
			if ( $len === 0 ) {
				// Nothing to compare against; also avoids division by zero.
				continue;
			}

			// NOTE(review): the native levenshtein() works on bytes while
			// $len counts characters, so non-ASCII text shorter than 256
			// bytes gets a pessimistic quality. Kept as is for speed.
			if ( strlen( $a ) > 255 || strlen( $b ) > 255 ) {
				$dist = self::levenshtein_php( $a, $b );
			} else {
				$dist = levenshtein( $a, $b );
			}
			$quality = 1 - ( $dist / $len );

			if ( $quality >= $this->config['cutoff'] ) {
				$results[] = array(
					'source' => $row->tms_text,
					'target' => $row->tmt_text,
					'context' => $row->tms_context,
					'quality' => $quality,
				);
			}
		}
		usort( $results, array( $this, 'qualitySort' ) );
		return $results;
	}

	/**
	 * usort() callback which orders suggestions by descending quality.
	 * @param $a Array: suggestion with a 'quality' field
	 * @param $b Array: suggestion with a 'quality' field
	 * @return Integer: -1, 0 or 1
	 */
	protected function qualitySort( $a, $b ) {
		list( $c, $d ) = array( $a['quality'], $b['quality'] );
		if ( $c === $d ) {
			return 0;
		}
		// Descending sort
		return ( $c > $d ) ? -1 : 1;
	}

	/**
	 * Tokenizes the text for fulltext search.
	 * Tries to find the most useful tokens.
	 *
	 * @param $language String: language code of the text
	 * @param $input String: the text to tokenize
	 * @return Array: at most ten unique lowercased tokens, or an empty array
	 * if the text has fewer than four words.
	 */
	protected function filterForFulltext( $language, $input ) {
		$lang = Language::factory( $language );

		$text = preg_replace( '/[^[:alnum:]]/u', ' ', $input );
		$text = $lang->segmentByWord( $text );
		$text = $lang->lc( $text );
		$segments = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
		if ( count( $segments ) < 4 ) {
			return array();
		}

		foreach ( $segments as $i => $segment ) {
			// Yes strlen: the byte length is used on purpose here
			// (presumably to match byte-based fulltext limits; TODO confirm).
			$len = strlen( $segment );
			if ( $len < 4 || $len > 15 ) {
				unset( $segments[$i] );
			}
		}

		$segments = array_unique( $segments );
		$segments = array_slice( $segments, 0, 10 );
		return $segments;
	}

	/**
	 * Multibyte aware Levenshtein distance without a length limit.
	 * The native levenshtein is limited to 255 bytes.
	 * Based on an implementation from the PHP manual comments.
	 *
	 * @param $str1 String
	 * @param $str2 String
	 * @return Integer: number of single character insertions, deletions and
	 * substitutions needed to turn $str1 into $str2.
	 */
	public static function levenshtein_php( $str1, $str2 ) {
		$len1 = mb_strlen( $str1 );
		$len2 = mb_strlen( $str2 );

		// Strip the common prefix. The bounds are checked before each
		// comparison so that the lengths can never drop below zero, which
		// happened with identical or empty strings.
		$prefix = 0;
		while ( $prefix < $len1 && $prefix < $len2
			&& mb_substr( $str1, $prefix, 1 ) === mb_substr( $str2, $prefix, 1 )
		) {
			$prefix++;
		}
		if ( $prefix > 0 ) {
			$str1 = mb_substr( $str1, $prefix );
			$str2 = mb_substr( $str2, $prefix );
			$len1 -= $prefix;
			$len2 -= $prefix;
		}

		// Strip the common suffix
		$suffix = 0;
		while ( $len1 > 0 && $len2 > 0
			&& mb_substr( $str1, $len1 - 1, 1 ) === mb_substr( $str2, $len2 - 1, 1 )
		) {
			$suffix++;
			$len1--;
			$len2--;
		}
		if ( $suffix > 0 ) {
			$str1 = mb_substr( $str1, 0, $len1 );
			$str2 = mb_substr( $str2, 0, $len2 );
		}

		// If one side is used up, the rest of the other side must be
		// inserted (or deleted) as a whole.
		if ( $len1 === 0 ) {
			return $len2;
		}
		if ( $len2 === 0 ) {
			return $len1;
		}

		// Two-row dynamic programming: $v0 is the previous row and $v1 the
		// row being filled in.
		$v0 = range( 0, $len1 );
		$v1 = array();

		for ( $i = 1; $i <= $len2; $i++ ) {
			$v1[0] = $i;
			$str2j = mb_substr( $str2, $i - 1, 1 );

			for ( $j = 1; $j <= $len1; $j++ ) {
				$cost = ( mb_substr( $str1, $j - 1, 1 ) === $str2j ) ? 0 : 1;

				// Cheapest of deletion, insertion and substitution
				$v1[$j] = min( $v0[$j] + 1, $v1[$j - 1] + 1, $v0[$j - 1] + $cost );
			}

			$vTmp = $v0;
			$v0 = $v1;
			$v1 = $vTmp;
		}

		return $v0[$len1];
	}

}
| 319 | + |
/**
 * Do-nothing stand-in for TTMServer, used when the translation memory is
 * not configured. It lets callers write
 * TTMServer::primary()->update( ... );
 * without first checking whether the service is enabled.
 * @since 2012-01-28
 */
class FakeTTMServer implements iTTMServer {
	/// Stores nothing; always returns false.
	public function insertSource( Title $context, $sourceLanguage, $text ) {
		return false;
	}

	/// Stores nothing; always returns false.
	public function update( MessageHandle $handle, $targetText ) {
		return false;
	}

	/// Never has suggestions; always returns an empty list.
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		return array();
	}
}
Property changes on: trunk/extensions/Translate/utils/TTMServer.php |
___________________________________________________________________ |
Added: svn:eol-style |
1 | 339 | + native |