Index: trunk/extensions/Translate/ffs/Gettext.php |
— | — | @@ -29,136 +29,21 @@ |
30 | 30 | } |
31 | 31 | |
32 | 32 | public function parseFile() { |
33 | | - $data = file_get_contents( $this->filename ); |
34 | | - $data = str_replace( "\r\n", "\n", $data ); |
35 | | - |
36 | | - $pluralForms = false; |
37 | | - |
38 | | - $matches = array(); |
39 | | - if ( preg_match( '/X-Language-Code:\s+([a-zA-Z-_]+)/', $data, $matches ) ) { |
40 | | - $code = $matches[1]; |
| 33 | + if ( $this->filename === false ) { |
| 34 | + return array(); |
41 | 35 | } |
42 | | - |
43 | | - if ( preg_match( '/X-Message-Group:\s+([a-zA-Z0-9-._\|]+)/', $data, $matches ) ) { |
44 | | - $groupId = $matches[1]; |
45 | | - } |
46 | | - |
47 | | - if ( preg_match( '/Plural-Forms:\s+nplurals=([0-9]+).*;/', $data, $matches ) ) { |
48 | | - $pluralForms = $matches; |
49 | | - } |
50 | | - |
51 | | - $useCtxtAsKey = false; |
52 | | - |
53 | | - $poformat = '".*"\n?(^".*"$\n?)*'; |
54 | | - $quotePattern = '/(^"|"$\n?)/m'; |
55 | | - |
56 | | - $sections = preg_split( '/\n{2,}/', $data ); |
57 | | - array_shift( $sections ); // First isn't an actual message |
58 | | - $changes = array(); |
59 | | - |
60 | | - foreach ( $sections as $section ) { |
61 | | - if ( trim( $section ) === '' ) continue; |
62 | | - |
63 | | - $item = array( |
64 | | - 'ctxt' => '', |
65 | | - 'id' => '', |
66 | | - 'str' => '', |
67 | | - 'flags' => array(), |
68 | | - 'comments' => array(), |
69 | | - ); |
70 | | - |
71 | | - $matches = array(); |
72 | | - if ( preg_match( "/^msgctxt\s($poformat)/mx", $section, $matches ) ) { |
73 | | - // Remove quoting |
74 | | - $item['ctxt'] = GettextFFS::formatForWiki( $matches[1] ); |
75 | | - } elseif ( $useCtxtAsKey ) { |
76 | | - // Invalid message |
77 | | - continue; |
78 | | - } |
79 | | - |
80 | | - $matches = array(); |
81 | | - if ( preg_match( "/^msgid\s($poformat)/mx", $section, $matches ) ) { |
82 | | - $item['id'] = GettextFFS::formatForWiki( $matches[1] ); |
83 | | - } else { |
84 | | - # echo "Definition not found!\n$section"; |
85 | | - continue; |
86 | | - } |
87 | | - |
88 | | - $pluralMessage = false; |
89 | | - $matches = array(); |
90 | | - if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) { |
91 | | - $pluralMessage = true; |
92 | | - $plural = GettextFFS::formatForWiki( $matches[1] ); |
93 | | - $item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}"; |
94 | | - } |
95 | | - |
96 | | - if ( $pluralMessage ) { |
97 | | - |
98 | | - $actualForms = array(); |
99 | | - for ( $i = 0; $i < $pluralForms[1]; $i++ ) { |
100 | | - $matches = array(); |
101 | | - if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) { |
102 | | - continue; // Skip |
103 | | - } |
104 | | - if ( preg_match( "/^msgstr\[$i\]\s($poformat)/mx", $section, $matches ) ) { |
105 | | - $actualForms[] = GettextFFS::formatForWiki( $matches[1] ); |
106 | | - } else { |
107 | | - throw new MWException( "Plural not found, expecting $i for: $section" ); |
108 | | - } |
109 | | - } |
110 | | - |
111 | | - $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}'; |
112 | | - } else { |
113 | | - |
114 | | - $matches = array(); |
115 | | - if ( preg_match( "/^msgstr\s($poformat)/mx", $section, $matches ) ) { |
116 | | - $item['str'] = GettextFFS::formatForWiki( $matches[1] ); |
117 | | - } else { |
118 | | - # echo "Translation not found!\n"; |
119 | | - continue; |
120 | | - } |
121 | | - } |
122 | | - |
123 | | - // Parse flags |
124 | | - $matches = array(); |
125 | | - if ( preg_match( '/^#,(.*)$/mu', $section, $matches ) ) { |
126 | | - $flags = array_map( 'trim', explode( ',', $matches[1] ) ); |
127 | | - foreach ( $flags as $key => $flag ) { |
128 | | - if ( $flag === 'fuzzy' ) { |
129 | | - $item['str'] = TRANSLATE_FUZZY . $item['str']; |
130 | | - unset( $flags[$key] ); |
131 | | - } |
132 | | - } |
133 | | - $item['flags'] = $flags; |
134 | | - } |
135 | | - |
136 | | - $matches = array(); |
137 | | - if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) { |
138 | | - foreach ( $matches as $match ) { |
139 | | - if ( $match[1] !== ',' ) { |
140 | | - $item['comments'][$match[1]][] = $match[2]; |
141 | | - } |
142 | | - } |
143 | | - } |
144 | | - |
145 | | - $lang = Language::factory( 'en' ); |
146 | | - if ( $useCtxtAsKey ) { |
147 | | - $key = $item['ctxt']; |
148 | | - } else { |
149 | | - $key = GettextFFS::generateKeyFromItem( $item ); |
150 | | - } |
151 | | - |
152 | | - $changes[$key] = $item; |
153 | | - |
154 | | - } |
155 | | - $changes['PLURAL'] = $pluralForms; |
156 | | - return $changes; |
| 36 | + $data = file_get_contents( $this->filename ); |
| 37 | + $parse = GettextFFS::parseGettextData( $data ); |
| 38 | + // Ugly ugly hack! part 1 |
| 39 | + $parse['TEMPLATE']['HEADERS'] = $parse['HEADERS']; |
| 40 | + return $parse['TEMPLATE']; |
157 | 41 | } |
158 | 42 | |
159 | 43 | |
160 | 44 | public function parseMessages( StringMangler $mangler ) { |
161 | 45 | $defs = $this->parseFile(); |
162 | | - unset($defs['PLURAL']); |
| 46 | + // Ugly ugly hack! part 2 |
| 47 | + unset( $defs['HEADERS'] ); |
163 | 48 | $messages = array(); |
164 | 49 | foreach ( $defs as $key => $def ) { |
165 | 50 | if ( $this->pot ) { |
— | — | @@ -184,8 +69,9 @@ |
185 | 70 | if ( $reader instanceof GettextFormatReader ) { |
186 | 71 | $this->addAuthors( $reader->parseAuthors(), $code ); |
187 | 72 | $this->staticHeader = $reader->parseStaticHeader(); |
188 | | - $data = $reader->parseFile(); |
189 | | - $this->plural = $data['PLURAL']; |
| 73 | + $this->owndata = $reader->parseFile(); |
| 74 | + // Ugly ugly hack! part 3 |
| 75 | + $this->headers = $this->owndata['HEADERS']; |
190 | 76 | } |
191 | 77 | if ( $readerEn instanceof GettextFormatReader ) { |
192 | 78 | $this->data = $readerEn->parseFile(); |
— | — | @@ -205,14 +91,15 @@ |
206 | 92 | $label = $this->group->getLabel(); |
207 | 93 | $languageName = TranslateUtils::getLanguageName( $code ); |
208 | 94 | |
209 | | - $headers = array(); |
| 95 | + $headers = $this->headers; |
210 | 96 | $headers['Project-Id-Version'] = $label; |
211 | 97 | // TODO: make this customisable or something |
212 | 98 | // $headers['Report-Msgid-Bugs-To'] = $wgServer; |
213 | 99 | // TODO: sprintfDate doesn't support any time zone flags |
214 | 100 | // $headers['POT-Creation-Date'] |
215 | 101 | $headers['PO-Revision-Date'] = $lang->sprintfDate( 'xnY-xnm-xnd xnH:xni:xns+0000', $now ); |
216 | | - $headers['Language-Team'] = $languageName; |
| 102 | + // Link to portal pages?? |
| 103 | + //$headers['Language-Team'] = $languageName; |
217 | 104 | $headers['Content-Type'] = 'text/plain; charset=UTF-8'; |
218 | 105 | $headers['Content-Transfer-Encoding'] = '8bit'; |
219 | 106 | |
— | — | @@ -222,10 +109,6 @@ |
223 | 110 | $headers['X-Translation-Project'] = "$wgSitename at $wgServer"; |
224 | 111 | $headers['X-Language-Code'] = $code; |
225 | 112 | $headers['X-Message-Group'] = $this->group->getId(); |
226 | | - if( $this->plural[0] ) { |
227 | | - list( $header, $rest ) = explode( ':', $this->plural[0] ); |
228 | | - $headers[$header] = trim($rest); |
229 | | - } |
230 | 113 | |
231 | 114 | $headerlines = array( '' ); |
232 | 115 | foreach ( $headers as $key => $value ) { |
— | — | @@ -251,7 +134,7 @@ |
252 | 135 | # CASE3: optional messages; accept only if different |
253 | 136 | if ( $m->hasTag( 'optional') ) $flags[] = 'x-optional'; |
254 | 137 | |
255 | | - # Remove fuzzy markings before export |
| 138 | + # Remove explicit fuzzy markings from the translation before export |
256 | 139 | $flags = array(); |
257 | 140 | $comments = array(); |
258 | 141 | if ( isset( $this->data[$key]['flags'] ) ) { |
— | — | @@ -381,6 +264,7 @@ |
382 | 265 | |
383 | 266 | return $splitPlurals; |
384 | 267 | } |
| 268 | + |
385 | 269 | } |
386 | 270 | |
387 | 271 | class GettextFFS extends SimpleFFS { |
— | — | @@ -406,34 +290,54 @@ |
407 | 291 | } |
408 | 292 | |
409 | 293 | public function parseGettext( $data ) { |
| 294 | + $useCtxtAsKey = isset($this->extra['CtxtAsKey']) && $this->extra['CtxtAsKey']; |
| 295 | + return self::parseGettextData( $data, $useCtxtAsKey ); |
| 296 | + } |
| 297 | + |
| 298 | + // Ugly hack to avoid code duplication between old and new style FFS |
| 299 | + public static function parseGettextData( $data, $useCtxtAsKey = false ) { |
| 300 | + // Normalise newlines, to make processing easier later |
410 | 301 | $data = str_replace( "\r\n", "\n", $data ); |
411 | | - $messages = $template = $metadata = array(); |
412 | 302 | |
413 | | - // Defined only once. Be sure to *not* use it without match, or you might get old data |
414 | | - $matches = array(); |
| 303 | + /* Delimit the file into sections, which are separated by two newlines. |
| 304 | + * We are permissive and accept more than two. This parsing method isn't |
| 305 | + * efficient wrt memory, but was easy to implement */ |
| 306 | + $sections = preg_split( '/\n{2,}/', $data ); |
415 | 307 | |
416 | | - if ( preg_match( '/X-Language-Code:\s+([a-zA-Z-_]+)/', $data, $matches ) ) { |
417 | | - $metadata['code'] = $matches[1]; |
418 | | - } |
| 308 | + /* First one isn't an actual message. We'll handle it specially below */ |
| 309 | + $headerSection = array_shift( $sections ); |
419 | 310 | |
420 | | - if ( preg_match( '/X-Message-Group:\s+([a-zA-Z0-9-._\|]+)/', $data, $matches ) ) { |
421 | | - $metadata['group'] = $matches[1]; |
| 311 | + /* Since this is the header section, we are only interested in the tags |
| 312 | + * and msgid is empty. Somewhere we should extract the header comments |
| 313 | + * too */ |
| 314 | + $match = self::expectKeyword( 'msgstr', $headerSection ); |
| 315 | + if ( $match !== null ) { |
| 316 | + $headerBlock = self::formatForWiki( $match, 'trim' ); |
| 317 | + $headers = self::parseHeaderTags( $headerBlock ); |
| 318 | + } else { |
| 319 | + throw new MWException( "Gettext file header was not found:\n\n$data" ); |
422 | 320 | } |
423 | 321 | |
424 | | - $pluralForms = false; |
425 | | - if ( preg_match( '/Plural-Forms:\s+nplurals=([0-9]+).*;/', $data, $matches ) ) { |
426 | | - $metadata['plurals'] = $matches; |
427 | | - $pluralForms = $matches; |
| 322 | + /* At this stage we are only interested in how many plural forms we should |
| 323 | + * be expecting when parsing the rest of this file. */ |
| 324 | + $pluralCount = false; |
| 325 | + if ( isset($headers['Plural-Forms']) ) { |
| 326 | + if ( preg_match( '/nplurals=([0-9]+).*;/', $headers['Plural-Forms'], $matches ) ) { |
| 327 | + $pluralCount = $matches[1]; |
| 328 | + } |
428 | 329 | } |
429 | 330 | |
430 | | - $useCtxtAsKey = isset($this->extra['CtxtAsKey']) && $this->extra['CtxtAsKey']; |
| 331 | + // Extract some metadata from headers for easier use |
| 332 | + $metadata = array(); |
| 333 | + if ( isset($headers['X-Language-Code']) ) { |
| 334 | + $metadata['code'] = $headers['X-Language-Code']; |
| 335 | + } |
431 | 336 | |
432 | | - $poformat = '".*"\n?(^".*"$\n?)*'; |
433 | | - $quotePattern = '/(^"|"$\n?)/m'; |
| 337 | + if ( isset($headers['X-Message-Group']) ) { |
| 338 | + $metadata['group'] = $headers['X-Message-Group']; |
| 339 | + } |
434 | 340 | |
435 | | - $sections = preg_split( '/\n{2,}/', $data ); |
436 | | - array_shift( $sections ); // First isn't an actual message |
437 | | - |
| 341 | + // Then parse the messages |
438 | 342 | foreach ( $sections as $section ) { |
439 | 343 | if ( trim( $section ) === '' ) continue; |
440 | 344 | |
— | — | @@ -445,44 +349,49 @@ |
446 | 350 | 'comments' => array(), |
447 | 351 | ); |
448 | 352 | |
449 | | - $matches = array(); |
450 | | - if ( preg_match( "/^msgid\s($poformat)/mx", $section, $matches ) ) { |
451 | | - $item['id'] = self::formatForWiki( $matches[1] ); |
| 353 | + $match = self::expectKeyword( 'msgid', $section ); |
| 354 | + if ( $match !== null ) { |
| 355 | + $item['id'] = self::formatForWiki( $match ); |
452 | 356 | } else { |
453 | 357 | throw new MWException( "Unable to parse msgid:\n\n$section" ); |
454 | 358 | } |
455 | 359 | |
456 | | - if ( preg_match( "/^msgctxt\s($poformat)/mx", $section, $matches ) ) { |
457 | | - $item['ctxt'] = self::formatForWiki( $matches[1] ); |
| 360 | + $match = self::expectKeyword( 'msgctxt', $section ); |
| 361 | + if ( $match !== null ) { |
| 362 | + $item['ctxt'] = self::formatForWiki( $match ); |
458 | 363 | } elseif ( $useCtxtAsKey ) { // Invalid message |
459 | 364 | $metadata['warnings'][] = "Ctxt missing for {$item['id']}"; |
| 365 | + error_log( "Ctxt missing for {$item['id']}" ); |
460 | 366 | } |
461 | 367 | |
462 | 368 | |
463 | 369 | $pluralMessage = false; |
464 | | - if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) { |
| 370 | + $match = self::expectKeyword( 'msgid_plural', $section ); |
| 371 | + if ( $match !== null ) { |
465 | 372 | $pluralMessage = true; |
466 | | - $plural = self::formatForWiki( $matches[1] ); |
| 373 | + $plural = self::formatForWiki( $match ); |
467 | 374 | $item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}"; |
468 | 375 | } |
469 | 376 | |
470 | 377 | if ( $pluralMessage ) { |
471 | | - |
472 | 378 | $actualForms = array(); |
473 | | - for ( $i = 0; $i < $pluralForms[1]; $i++ ) { |
474 | | - if ( preg_match( "/^msgstr\[$i\]\s($poformat)/mx", $section, $matches ) ) { |
475 | | - $actualForms[] = self::formatForWiki( $matches[1] ); |
| 379 | + for ( $i = 0; $i < $pluralCount; $i++ ) { |
| 380 | + $match = self::expectKeyword( "msgstr\\[$i\\]", $section ); |
| 381 | + if ( $match !== null ) { |
| 382 | + $actualForms[] = self::formatForWiki( $match ); |
476 | 383 | } else { |
477 | | - throw new MWException( "Plural not found, expecting $i" ); |
| 384 | + throw new MWException( "Plural $i not found, expecting total of $pluralCount" ); |
478 | 385 | } |
479 | 386 | } |
480 | 387 | |
481 | | - $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}'; |
| 388 | + // Keep the translation empty if no form has translation |
| 389 | + if ( array_sum( array_map( 'strlen', $actualForms ) ) > 0 ) { |
| 390 | + $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}'; |
| 391 | + } |
482 | 392 | } else { |
483 | | - |
484 | | - $matches = array(); |
485 | | - if ( preg_match( "/^msgstr\s($poformat)/mx", $section, $matches ) ) { |
486 | | - $item['str'] = self::formatForWiki( $matches[1] ); |
| 393 | + $match = self::expectKeyword( 'msgstr', $section ); |
| 394 | + if ( $match !== null ) { |
| 395 | + $item['str'] = self::formatForWiki( $match ); |
487 | 396 | } else { |
488 | 397 | throw new MWException( "Unable to parse msgstr:\n\n$section" ); |
489 | 398 | } |
— | — | @@ -519,16 +428,34 @@ |
520 | 429 | |
521 | 430 | $messages[$key] = $item['str']; |
522 | 431 | $template[$key] = $item; |
523 | | - |
524 | 432 | } |
525 | 433 | |
526 | 434 | return array( |
527 | 435 | 'MESSAGES' => $messages, |
528 | 436 | 'TEMPLATE' => $template, |
529 | 437 | 'METADATA' => $metadata, |
| 438 | + 'HEADERS' => $headers |
530 | 439 | ); |
531 | 440 | } |
532 | 441 | |
| 442 | + public static function expectKeyword( $name, $section ) { |
| 443 | + /* Catches the multiline textblock that comes after keywords msgid, |
| 444 | + * msgstr, msgid_plural, msgctxt. |
| 445 | + */ |
| 446 | + $poformat = '".*"\n?(^".*"$\n?)*'; |
| 447 | + |
| 448 | + $matches = array(); |
| 449 | + if ( preg_match( "/^$name\s($poformat)/mx", $section, $matches ) ) { |
| 450 | + return $matches[1]; |
| 451 | + } else { |
| 452 | + return null; |
| 453 | + } |
| 454 | + } |
| 455 | + |
| 456 | + /** |
| 457 | + * Generates unique key for each message. Changing this WILL BREAK ALL |
| 458 | + * existing pages! |
| 459 | + */ |
533 | 460 | public static function generateKeyFromItem( $item ) { |
534 | 461 | $lang = Language::factory( 'en' ); |
535 | 462 | global $wgLegalTitleChars; |
— | — | @@ -542,16 +469,37 @@ |
543 | 470 | return "$hash-$snippet"; |
544 | 471 | } |
545 | 472 | |
546 | | - public static function formatForWiki( $data ) { |
| 473 | + /** |
| 474 | + * This parses the Gettext text block format. Since trailing whitespace is |
| 475 | + * not allowed in MediaWiki pages, the default action is to append |
| 476 | + * \-character at the end of the message. You can also choose to ignore it |
| 477 | + * and use the trim action instead. |
| 478 | + */ |
| 479 | + public static function formatForWiki( $data, $whitespace = 'mark' ) { |
547 | 480 | $quotePattern = '/(^"|"$\n?)/m'; |
548 | 481 | $data = preg_replace( $quotePattern, '', $data ); |
549 | 482 | $data = stripcslashes( $data ); |
550 | 483 | if ( preg_match( '/\s$/', $data ) ) { |
551 | | - $data .= '\\'; |
| 484 | + if ( $whitespace === 'mark' ) |
| 485 | + $data .= '\\'; |
| 486 | + elseif ( $whitespace === 'trim' ) |
| 487 | + $data = rtrim($data); |
| 488 | + else |
| 489 | + // FIXME: only triggered if there is trailing whitespace |
| 490 | + throw new MWException( 'Unknown action for whitespace' ); |
552 | 491 | } |
553 | 492 | return $data; |
554 | 493 | } |
555 | 494 | |
| 495 | + public static function parseHeaderTags( $headers ) { |
| 496 | + $tags = array(); |
| 497 | + foreach ( explode("\n", $headers ) as $line ) { |
| 498 | + list( $key, $value ) = explode( ': ', $line, 2 ); |
| 499 | + $tags[$key] = $value; |
| 500 | + } |
| 501 | + return $tags; |
| 502 | + } |
| 503 | + |
556 | 504 | // |
557 | 505 | // WRITE |
558 | 506 | // |