r59335 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r59334‎ | r59335 | r59336 >
Date:14:18, 22 November 2009
Author:nikerabbit
Status:ok
Tags:
Comment:
* Remove code duplication
* Add some comments
* Preserve header tags on export (should fix one of those plural support issues)
Modified paths:
  • /trunk/extensions/Translate/ffs/Gettext.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Translate/ffs/Gettext.php
@@ -29,136 +29,21 @@
3030 }
3131
3232 public function parseFile() {
33 - $data = file_get_contents( $this->filename );
34 - $data = str_replace( "\r\n", "\n", $data );
35 -
36 - $pluralForms = false;
37 -
38 - $matches = array();
39 - if ( preg_match( '/X-Language-Code:\s+([a-zA-Z-_]+)/', $data, $matches ) ) {
40 - $code = $matches[1];
 33+ if ( $this->filename === false ) {
 34+ return array();
4135 }
42 -
43 - if ( preg_match( '/X-Message-Group:\s+([a-zA-Z0-9-._\|]+)/', $data, $matches ) ) {
44 - $groupId = $matches[1];
45 - }
46 -
47 - if ( preg_match( '/Plural-Forms:\s+nplurals=([0-9]+).*;/', $data, $matches ) ) {
48 - $pluralForms = $matches;
49 - }
50 -
51 - $useCtxtAsKey = false;
52 -
53 - $poformat = '".*"\n?(^".*"$\n?)*';
54 - $quotePattern = '/(^"|"$\n?)/m';
55 -
56 - $sections = preg_split( '/\n{2,}/', $data );
57 - array_shift( $sections ); // First isn't an actual message
58 - $changes = array();
59 -
60 - foreach ( $sections as $section ) {
61 - if ( trim( $section ) === '' ) continue;
62 -
63 - $item = array(
64 - 'ctxt' => '',
65 - 'id' => '',
66 - 'str' => '',
67 - 'flags' => array(),
68 - 'comments' => array(),
69 - );
70 -
71 - $matches = array();
72 - if ( preg_match( "/^msgctxt\s($poformat)/mx", $section, $matches ) ) {
73 - // Remove quoting
74 - $item['ctxt'] = GettextFFS::formatForWiki( $matches[1] );
75 - } elseif ( $useCtxtAsKey ) {
76 - // Invalid message
77 - continue;
78 - }
79 -
80 - $matches = array();
81 - if ( preg_match( "/^msgid\s($poformat)/mx", $section, $matches ) ) {
82 - $item['id'] = GettextFFS::formatForWiki( $matches[1] );
83 - } else {
84 - # echo "Definition not found!\n$section";
85 - continue;
86 - }
87 -
88 - $pluralMessage = false;
89 - $matches = array();
90 - if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) {
91 - $pluralMessage = true;
92 - $plural = GettextFFS::formatForWiki( $matches[1] );
93 - $item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}";
94 - }
95 -
96 - if ( $pluralMessage ) {
97 -
98 - $actualForms = array();
99 - for ( $i = 0; $i < $pluralForms[1]; $i++ ) {
100 - $matches = array();
101 - if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) {
102 - continue; // Skip
103 - }
104 - if ( preg_match( "/^msgstr\[$i\]\s($poformat)/mx", $section, $matches ) ) {
105 - $actualForms[] = GettextFFS::formatForWiki( $matches[1] );
106 - } else {
107 - throw new MWException( "Plural not found, expecting $i for: $section" );
108 - }
109 - }
110 -
111 - $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}';
112 - } else {
113 -
114 - $matches = array();
115 - if ( preg_match( "/^msgstr\s($poformat)/mx", $section, $matches ) ) {
116 - $item['str'] = GettextFFS::formatForWiki( $matches[1] );
117 - } else {
118 - # echo "Translation not found!\n";
119 - continue;
120 - }
121 - }
122 -
123 - // Parse flags
124 - $matches = array();
125 - if ( preg_match( '/^#,(.*)$/mu', $section, $matches ) ) {
126 - $flags = array_map( 'trim', explode( ',', $matches[1] ) );
127 - foreach ( $flags as $key => $flag ) {
128 - if ( $flag === 'fuzzy' ) {
129 - $item['str'] = TRANSLATE_FUZZY . $item['str'];
130 - unset( $flags[$key] );
131 - }
132 - }
133 - $item['flags'] = $flags;
134 - }
135 -
136 - $matches = array();
137 - if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) {
138 - foreach ( $matches as $match ) {
139 - if ( $match[1] !== ',' ) {
140 - $item['comments'][$match[1]][] = $match[2];
141 - }
142 - }
143 - }
144 -
145 - $lang = Language::factory( 'en' );
146 - if ( $useCtxtAsKey ) {
147 - $key = $item['ctxt'];
148 - } else {
149 - $key = GettextFFS::generateKeyFromItem( $item );
150 - }
151 -
152 - $changes[$key] = $item;
153 -
154 - }
155 - $changes['PLURAL'] = $pluralForms;
156 - return $changes;
 36+ $data = file_get_contents( $this->filename );
 37+ $parse = GettextFFS::parseGettextData( $data );
 38+ // Ugly ugly hack! part 1
 39+ $parse['TEMPLATE']['HEADERS'] = $parse['HEADERS'];
 40+ return $parse['TEMPLATE'];
15741 }
15842
15943
16044 public function parseMessages( StringMangler $mangler ) {
16145 $defs = $this->parseFile();
162 - unset($defs['PLURAL']);
 46+ // Ugly ugly hack! part 2
 47+ unset( $defs['HEADERS'] );
16348 $messages = array();
16449 foreach ( $defs as $key => $def ) {
16550 if ( $this->pot ) {
@@ -184,8 +69,9 @@
18570 if ( $reader instanceof GettextFormatReader ) {
18671 $this->addAuthors( $reader->parseAuthors(), $code );
18772 $this->staticHeader = $reader->parseStaticHeader();
188 - $data = $reader->parseFile();
189 - $this->plural = $data['PLURAL'];
 73+ $this->owndata = $reader->parseFile();
 74+ // Ugly ugly hack! part 3
 75+ $this->headers = $this->owndata['HEADERS'];
19076 }
19177 if ( $readerEn instanceof GettextFormatReader ) {
19278 $this->data = $readerEn->parseFile();
@@ -205,14 +91,15 @@
20692 $label = $this->group->getLabel();
20793 $languageName = TranslateUtils::getLanguageName( $code );
20894
209 - $headers = array();
 95+ $headers = $this->headers;
21096 $headers['Project-Id-Version'] = $label;
21197 // TODO: make this customisable or something
21298 // $headers['Report-Msgid-Bugs-To'] = $wgServer;
21399 // TODO: sprintfDate doesn't support any time zone flags
214100 // $headers['POT-Creation-Date']
215101 $headers['PO-Revision-Date'] = $lang->sprintfDate( 'xnY-xnm-xnd xnH:xni:xns+0000', $now );
216 - $headers['Language-Team'] = $languageName;
 102+ // Link to portal pages??
 103+ //$headers['Language-Team'] = $languageName;
217104 $headers['Content-Type'] = 'text/plain; charset=UTF-8';
218105 $headers['Content-Transfer-Encoding'] = '8bit';
219106
@@ -222,10 +109,6 @@
223110 $headers['X-Translation-Project'] = "$wgSitename at $wgServer";
224111 $headers['X-Language-Code'] = $code;
225112 $headers['X-Message-Group'] = $this->group->getId();
226 - if( $this->plural[0] ) {
227 - list( $header, $rest ) = explode( ':', $this->plural[0] );
228 - $headers[$header] = trim($rest);
229 - }
230113
231114 $headerlines = array( '' );
232115 foreach ( $headers as $key => $value ) {
@@ -251,7 +134,7 @@
252135 # CASE3: optional messages; accept only if different
253136 if ( $m->hasTag( 'optional') ) $flags[] = 'x-optional';
254137
255 - # Remove fuzzy markings before export
 138+ # Remove explicit fuzzy markings from the translation before export
256139 $flags = array();
257140 $comments = array();
258141 if ( isset( $this->data[$key]['flags'] ) ) {
@@ -381,6 +264,7 @@
382265
383266 return $splitPlurals;
384267 }
 268+
385269 }
386270
387271 class GettextFFS extends SimpleFFS {
@@ -406,34 +290,54 @@
407291 }
408292
409293 public function parseGettext( $data ) {
 294+ $useCtxtAsKey = isset($this->extra['CtxtAsKey']) && $this->extra['CtxtAsKey'];
 295+ return self::parseGettextData( $data, $useCtxtAsKey );
 296+ }
 297+
 298+ // Ugly hack to avoid code duplication between old and new style FFS
 299+ public static function parseGettextData( $data, $useCtxtAsKey = false ) {
 300+ // Normalise newlines, to make processing easier later
410301 $data = str_replace( "\r\n", "\n", $data );
411 - $messages = $template = $metadata = array();
412302
413 - // Defined only once. Be sure to *not* use it without match, or you might get old data
414 - $matches = array();
 303+ /* Delimit the file into sections, which are separated by two newlines.
 304+ * We are permissive and accept more than two. This parsing method isn't
 305+ * efficient wrt memory, but was easy to implement */
 306+ $sections = preg_split( '/\n{2,}/', $data );
415307
416 - if ( preg_match( '/X-Language-Code:\s+([a-zA-Z-_]+)/', $data, $matches ) ) {
417 - $metadata['code'] = $matches[1];
418 - }
 308+ /* First one isn't an actual message. We'll handle it specially below */
 309+ $headerSection = array_shift( $sections );
419310
420 - if ( preg_match( '/X-Message-Group:\s+([a-zA-Z0-9-._\|]+)/', $data, $matches ) ) {
421 - $metadata['group'] = $matches[1];
 311+ /* Since this is the header section, we are only interested in the tags
 312+ * and msgid is empty. Somewhere we should extract the header comments
 313+ * too */
 314+ $match = self::expectKeyword( 'msgstr', $headerSection );
 315+ if ( $match !== null ) {
 316+ $headerBlock = self::formatForWiki( $match, 'trim' );
 317+ $headers = self::parseHeaderTags( $headerBlock );
 318+ } else {
 319+ throw new MWException( "Gettext file header was not found:\n\n$data" );
422320 }
423321
424 - $pluralForms = false;
425 - if ( preg_match( '/Plural-Forms:\s+nplurals=([0-9]+).*;/', $data, $matches ) ) {
426 - $metadata['plurals'] = $matches;
427 - $pluralForms = $matches;
 322+ /* At this stage we are only interested in how many plural forms we should
 323+ * be expecting when parsing the rest of this file. */
 324+ $pluralCount = false;
 325+ if ( isset($headers['Plural-Forms']) ) {
 326+ if ( preg_match( '/nplurals=([0-9]+).*;/', $headers['Plural-Forms'], $matches ) ) {
 327+ $pluralCount = $matches[1];
 328+ }
428329 }
429330
430 - $useCtxtAsKey = isset($this->extra['CtxtAsKey']) && $this->extra['CtxtAsKey'];
 331+ // Extract some metadata from headers for easier use
 332+ $metadata = array();
 333+ if ( isset($headers['X-Language-Code']) ) {
 334+ $metadata['code'] = $headers['X-Language-Code'];
 335+ }
431336
432 - $poformat = '".*"\n?(^".*"$\n?)*';
433 - $quotePattern = '/(^"|"$\n?)/m';
 337+ if ( isset($headers['X-Message-Group']) ) {
 338+ $metadata['group'] = $headers['X-Message-Group'];
 339+ }
434340
435 - $sections = preg_split( '/\n{2,}/', $data );
436 - array_shift( $sections ); // First isn't an actual message
437 -
 341+ // Then parse the messages
438342 foreach ( $sections as $section ) {
439343 if ( trim( $section ) === '' ) continue;
440344
@@ -445,44 +349,49 @@
446350 'comments' => array(),
447351 );
448352
449 - $matches = array();
450 - if ( preg_match( "/^msgid\s($poformat)/mx", $section, $matches ) ) {
451 - $item['id'] = self::formatForWiki( $matches[1] );
 353+ $match = self::expectKeyword( 'msgid', $section );
 354+ if ( $match !== null ) {
 355+ $item['id'] = self::formatForWiki( $match );
452356 } else {
453357 throw new MWException( "Unable to parse msgid:\n\n$section" );
454358 }
455359
456 - if ( preg_match( "/^msgctxt\s($poformat)/mx", $section, $matches ) ) {
457 - $item['ctxt'] = self::formatForWiki( $matches[1] );
 360+ $match = self::expectKeyword( 'msgctxt', $section );
 361+ if ( $match !== null ) {
 362+ $item['ctxt'] = self::formatForWiki( $match );
458363 } elseif ( $useCtxtAsKey ) { // Invalid message
459364 $metadata['warnings'][] = "Ctxt missing for {$item['id']}";
 365+ error_log( "Ctxt missing for {$item['id']}" );
460366 }
461367
462368
463369 $pluralMessage = false;
464 - if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) {
 370+ $match = self::expectKeyword( 'msgid_plural', $section );
 371+ if ( $match !== null ) {
465372 $pluralMessage = true;
466 - $plural = self::formatForWiki( $matches[1] );
 373+ $plural = self::formatForWiki( $match );
467374 $item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}";
468375 }
469376
470377 if ( $pluralMessage ) {
471 -
472378 $actualForms = array();
473 - for ( $i = 0; $i < $pluralForms[1]; $i++ ) {
474 - if ( preg_match( "/^msgstr\[$i\]\s($poformat)/mx", $section, $matches ) ) {
475 - $actualForms[] = self::formatForWiki( $matches[1] );
 379+ for ( $i = 0; $i < $pluralCount; $i++ ) {
 380+ $match = self::expectKeyword( "msgstr\\[$i\\]", $section );
 381+ if ( $match !== null ) {
 382+ $actualForms[] = self::formatForWiki( $match );
476383 } else {
477 - throw new MWException( "Plural not found, expecting $i" );
 384+ throw new MWException( "Plural $i not found, expecting total of $pluralCount" );
478385 }
479386 }
480387
481 - $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}';
 388+ // Keep the translation empty if no form has translation
 389+ if ( array_sum( array_map( 'strlen', $actualForms ) ) > 0 ) {
 390+ $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}';
 391+ }
482392 } else {
483 -
484 - $matches = array();
485 - if ( preg_match( "/^msgstr\s($poformat)/mx", $section, $matches ) ) {
486 - $item['str'] = self::formatForWiki( $matches[1] );
 393+ $match = self::expectKeyword( 'msgstr', $section );
 394+ if ( $match !== null ) {
 395+ $item['str'] = self::formatForWiki( $match );
487396 } else {
488397 throw new MWException( "Unable to parse msgstr:\n\n$section" );
489398 }
@@ -519,16 +428,34 @@
520429
521430 $messages[$key] = $item['str'];
522431 $template[$key] = $item;
523 -
524432 }
525433
526434 return array(
527435 'MESSAGES' => $messages,
528436 'TEMPLATE' => $template,
529437 'METADATA' => $metadata,
 438+ 'HEADERS' => $headers
530439 );
531440 }
532441
 442+ public static function expectKeyword( $name, $section ) {
 443+ /* Catches the multiline textblock that comes after keywords msgid,
 444+ * msgstr, msgid_plural, msgctxt.
 445+ */
 446+ $poformat = '".*"\n?(^".*"$\n?)*';
 447+
 448+ $matches = array();
 449+ if ( preg_match( "/^$name\s($poformat)/mx", $section, $matches ) ) {
 450+ return $matches[1];
 451+ } else {
 452+ return null;
 453+ }
 454+ }
 455+
 456+ /**
 457+ * Generates unique key for each message. Changing this WILL BREAK ALL
 458+ * existing pages!
 459+ */
533460 public static function generateKeyFromItem( $item ) {
534461 $lang = Language::factory( 'en' );
535462 global $wgLegalTitleChars;
@@ -542,16 +469,37 @@
543470 return "$hash-$snippet";
544471 }
545472
546 - public static function formatForWiki( $data ) {
 473+ /**
 474+ * This parses the Gettext text block format. Since trailing whitespace is
 476+ * not allowed in MediaWiki pages, the default action is to append a
 476+ * \-character at the end of the message. You can also choose to ignore it
 477+ * and use the trim action instead.
 478+ */
 479+ public static function formatForWiki( $data, $whitespace = 'mark' ) {
547480 $quotePattern = '/(^"|"$\n?)/m';
548481 $data = preg_replace( $quotePattern, '', $data );
549482 $data = stripcslashes( $data );
550483 if ( preg_match( '/\s$/', $data ) ) {
551 - $data .= '\\';
 484+ if ( $whitespace === 'mark' )
 485+ $data .= '\\';
 486+ elseif ( $whitespace === 'trim' )
 487+ $data = rtrim($data);
 488+ else
 489+ // FIXME: only triggered if there is trailing whitespace
 490+ throw new MWException( 'Unknown action for whitespace' );
552491 }
553492 return $data;
554493 }
555494
 495+ public static function parseHeaderTags( $headers ) {
 496+ $tags = array();
 497+ foreach ( explode("\n", $headers ) as $line ) {
 498+ list( $key, $value ) = explode( ': ', $line, 2 );
 499+ $tags[$key] = $value;
 500+ }
 501+ return $tags;
 502+ }
 503+
556504 //
557505 // WRITE
558506 //

Status & tagging log