r59335 MediaWiki - Code Review archive

Repository:	MediaWiki
Revision:	< r59334 | r59335 | r59336 >
Date:	14:18, 22 November 2009
Author:	nikerabbit
Status:	ok
Tags:
Comment:	* Remove code duplication * Add some comments * Preserve header tags on export (should fix one of those plural support issues)
Modified paths:	/trunk/extensions/Translate/ffs/Gettext.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Translate/ffs/Gettext.php
—	—	@@ -29,136 +29,21 @@
30	30	}
31	31
32	32	public function parseFile() {
33		~~- $data = file_get_contents( $this->filename );~~
34		~~- $data = str_replace( "\r\n", "\n", $data );~~
35		-
36		~~- $pluralForms = false;~~
37		-
38		~~- $matches = array();~~
39		~~- if ( preg_match( '/X-Language-Code:\s+([a-zA-Z-_]+)/', $data, $matches ) ) {~~
40		~~- $code = $matches[1];~~
	33	+ if ( $this->filename === false ) {
	34	+ return array();
41	35	}
42		-
43		~~- if ( preg_match( '/X-Message-Group:\s+([a-zA-Z0-9-._\\|]+)/', $data, $matches ) ) {~~
44		~~- $groupId = $matches[1];~~
45		~~- }~~
46		-
47		~~- if ( preg_match( '/Plural-Forms:\s+nplurals=([0-9]+).*;/', $data, $matches ) ) {~~
48		~~- $pluralForms = $matches;~~
49		~~- }~~
50		-
51		~~- $useCtxtAsKey = false;~~
52		-
53		~~- $poformat = '".*"\n?(^".*"$\n?)*';~~
54		~~- $quotePattern = '/(^"\|"$\n?)/m';~~
55		-
56		~~- $sections = preg_split( '/\n{2,}/', $data );~~
57		~~- array_shift( $sections ); // First isn't an actual message~~
58		~~- $changes = array();~~
59		-
60		~~- foreach ( $sections as $section ) {~~
61		~~- if ( trim( $section ) === '' ) continue;~~
62		-
63		~~- $item = array(~~
64		~~- 'ctxt' => '',~~
65		~~- 'id' => '',~~
66		~~- 'str' => '',~~
67		~~- 'flags' => array(),~~
68		~~- 'comments' => array(),~~
69		~~- );~~
70		-
71		~~- $matches = array();~~
72		~~- if ( preg_match( "/^msgctxt\s($poformat)/mx", $section, $matches ) ) {~~
73		~~- // Remove quoting~~
74		~~- $item['ctxt'] = GettextFFS::formatForWiki( $matches[1] );~~
75		~~- } elseif ( $useCtxtAsKey ) {~~
76		~~- // Invalid message~~
77		~~- continue;~~
78		~~- }~~
79		-
80		~~- $matches = array();~~
81		~~- if ( preg_match( "/^msgid\s($poformat)/mx", $section, $matches ) ) {~~
82		~~- $item['id'] = GettextFFS::formatForWiki( $matches[1] );~~
83		~~- } else {~~
84		~~- # echo "Definition not found!\n$section";~~
85		~~- continue;~~
86		~~- }~~
87		-
88		~~- $pluralMessage = false;~~
89		~~- $matches = array();~~
90		~~- if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) {~~
91		~~- $pluralMessage = true;~~
92		~~- $plural = GettextFFS::formatForWiki( $matches[1] );~~
93		~~- $item['id'] = "{{PLURAL:GETTEXT\|{$item['id']}\|$plural}}";~~
94		~~- }~~
95		-
96		~~- if ( $pluralMessage ) {~~
97		-
98		~~- $actualForms = array();~~
99		~~- for ( $i = 0; $i < $pluralForms[1]; $i++ ) {~~
100		~~- $matches = array();~~
101		~~- if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) {~~
102		~~- continue; // Skip~~
103		~~- }~~
104		~~- if ( preg_match( "/^msgstr\[$i\]\s($poformat)/mx", $section, $matches ) ) {~~
105		~~- $actualForms[] = GettextFFS::formatForWiki( $matches[1] );~~
106		~~- } else {~~
107		~~- throw new MWException( "Plural not found, expecting $i for: $section" );~~
108		~~- }~~
109		~~- }~~
110		-
111		~~- $item['str'] = '{{PLURAL:GETTEXT\|' . implode( '\|', $actualForms ) . '}}';~~
112		~~- } else {~~
113		-
114		~~- $matches = array();~~
115		~~- if ( preg_match( "/^msgstr\s($poformat)/mx", $section, $matches ) ) {~~
116		~~- $item['str'] = GettextFFS::formatForWiki( $matches[1] );~~
117		~~- } else {~~
118		~~- # echo "Translation not found!\n";~~
119		~~- continue;~~
120		~~- }~~
121		~~- }~~
122		-
123		~~- // Parse flags~~
124		~~- $matches = array();~~
125		~~- if ( preg_match( '/^#,(.*)$/mu', $section, $matches ) ) {~~
126		~~- $flags = array_map( 'trim', explode( ',', $matches[1] ) );~~
127		~~- foreach ( $flags as $key => $flag ) {~~
128		~~- if ( $flag === 'fuzzy' ) {~~
129		~~- $item['str'] = TRANSLATE_FUZZY . $item['str'];~~
130		~~- unset( $flags[$key] );~~
131		~~- }~~
132		~~- }~~
133		~~- $item['flags'] = $flags;~~
134		~~- }~~
135		-
136		~~- $matches = array();~~
137		~~- if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) {~~
138		~~- foreach ( $matches as $match ) {~~
139		~~- if ( $match[1] !== ',' ) {~~
140		~~- $item['comments'][$match[1]][] = $match[2];~~
141		~~- }~~
142		~~- }~~
143		~~- }~~
144		-
145		~~- $lang = Language::factory( 'en' );~~
146		~~- if ( $useCtxtAsKey ) {~~
147		~~- $key = $item['ctxt'];~~
148		~~- } else {~~
149		~~- $key = GettextFFS::generateKeyFromItem( $item );~~
150		~~- }~~
151		-
152		~~- $changes[$key] = $item;~~
153		-
154		~~- }~~
155		~~- $changes['PLURAL'] = $pluralForms;~~
156		~~- return $changes;~~
	36	+ $data = file_get_contents( $this->filename );
	37	+ $parse = GettextFFS::parseGettextData( $data );
	38	+ // Ugly ugly hack! part 1
	39	+ $parse['TEMPLATE']['HEADERS'] = $parse['HEADERS'];
	40	+ return $parse['TEMPLATE'];
157	41	}
158	42
159	43
160	44	public function parseMessages( StringMangler $mangler ) {
161	45	$defs = $this->parseFile();
162		~~- unset($defs['PLURAL']);~~
	46	+ // Ugly ugly hack! part 2
	47	+ unset( $defs['HEADERS'] );
163	48	$messages = array();
164	49	foreach ( $defs as $key => $def ) {
165	50	if ( $this->pot ) {
—	—	@@ -184,8 +69,9 @@
185	70	if ( $reader instanceof GettextFormatReader ) {
186	71	$this->addAuthors( $reader->parseAuthors(), $code );
187	72	$this->staticHeader = $reader->parseStaticHeader();
188		~~- $data = $reader->parseFile();~~
189		~~- $this->plural = $data['PLURAL'];~~
	73	+ $this->owndata = $reader->parseFile();
	74	+ // Ugly ugly hack! part 3
	75	+ $this->headers = $this->owndata['HEADERS'];
190	76	}
191	77	if ( $readerEn instanceof GettextFormatReader ) {
192	78	$this->data = $readerEn->parseFile();
—	—	@@ -205,14 +91,15 @@
206	92	$label = $this->group->getLabel();
207	93	$languageName = TranslateUtils::getLanguageName( $code );
208	94
209		~~- $headers = array();~~
	95	+ $headers = $this->headers;
210	96	$headers['Project-Id-Version'] = $label;
211	97	// TODO: make this customisable or something
212	98	// $headers['Report-Msgid-Bugs-To'] = $wgServer;
213	99	// TODO: sprintfDate doesn't support any time zone flags
214	100	// $headers['POT-Creation-Date']
215	101	$headers['PO-Revision-Date'] = $lang->sprintfDate( 'xnY-xnm-xnd xnH:xni:xns+0000', $now );
216		~~- $headers['Language-Team'] = $languageName;~~
	102	+ // Link to portal pages??
	103	+ //$headers['Language-Team'] = $languageName;
217	104	$headers['Content-Type'] = 'text/plain; charset=UTF-8';
218	105	$headers['Content-Transfer-Encoding'] = '8bit';
219	106
—	—	@@ -222,10 +109,6 @@
223	110	$headers['X-Translation-Project'] = "$wgSitename at $wgServer";
224	111	$headers['X-Language-Code'] = $code;
225	112	$headers['X-Message-Group'] = $this->group->getId();
226		~~- if( $this->plural[0] ) {~~
227		~~- list( $header, $rest ) = explode( ':', $this->plural[0] );~~
228		~~- $headers[$header] = trim($rest);~~
229		~~- }~~
230	113
231	114	$headerlines = array( '' );
232	115	foreach ( $headers as $key => $value ) {
—	—	@@ -251,7 +134,7 @@
252	135	# CASE3: optional messages; accept only if different
253	136	if ( $m->hasTag( 'optional') ) $flags[] = 'x-optional';
254	137
255		~~- # Remove fuzzy markings before export~~
	138	+ # Remove explicit fuzzy markings from the translation before export
256	139	$flags = array();
257	140	$comments = array();
258	141	if ( isset( $this->data[$key]['flags'] ) ) {
—	—	@@ -381,6 +264,7 @@
382	265
383	266	return $splitPlurals;
384	267	}
	268	+
385	269	}
386	270
387	271	class GettextFFS extends SimpleFFS {
—	—	@@ -406,34 +290,54 @@
407	291	}
408	292
409	293	public function parseGettext( $data ) {
	294	+ $useCtxtAsKey = isset($this->extra['CtxtAsKey']) && $this->extra['CtxtAsKey'];
	295	+ return self::parseGettextData( $data, $useCtxtAsKey );
	296	+ }
	297	+
	298	+ // Ugly hack to avoid code duplication between old and new style FFS
	299	+ public static function parseGettextData( $data, $useCtxtAsKey = false ) {
	300	+ // Normalise newlines, to make processing easier later
410	301	$data = str_replace( "\r\n", "\n", $data );
411		~~- $messages = $template = $metadata = array();~~
412	302
413		~~- // Defined only once. Be sure to not use it without match, or you might get old data~~
414		~~- $matches = array();~~
	303	+ /* Delimit the file into sections, which are separated by two newlines.
	304	+ * We are permissive and accept more than two. This parsing method isn't
	305	+ * efficient wrt memory, but was easy to implement */
	306	+ $sections = preg_split( '/\n{2,}/', $data );
415	307
416		~~- if ( preg_match( '/X-Language-Code:\s+([a-zA-Z-_]+)/', $data, $matches ) ) {~~
417		~~- $metadata['code'] = $matches[1];~~
418		~~- }~~
	308	+ /* First one isn't an actual message. We'll handle it specially below */
	309	+ $headerSection = array_shift( $sections );
419	310
420		~~- if ( preg_match( '/X-Message-Group:\s+([a-zA-Z0-9-._\\|]+)/', $data, $matches ) ) {~~
421		~~- $metadata['group'] = $matches[1];~~
	311	+ /* Since this is the header section, we are only interested in the tags
	312	+ * and msgid is empty. Somewhere we should extract the header comments
	313	+ * too */
	314	+ $match = self::expectKeyword( 'msgstr', $headerSection );
	315	+ if ( $match !== null ) {
	316	+ $headerBlock = self::formatForWiki( $match, 'trim' );
	317	+ $headers = self::parseHeaderTags( $headerBlock );
	318	+ } else {
	319	+ throw new MWException( "Gettext file header was not found:\n\n$data" );
422	320	}
423	321
424		~~- $pluralForms = false;~~
425		~~- if ( preg_match( '/Plural-Forms:\s+nplurals=([0-9]+).*;/', $data, $matches ) ) {~~
426		~~- $metadata['plurals'] = $matches;~~
427		~~- $pluralForms = $matches;~~
	322	+ /* At this stage we are only interested in how many plural forms we should
	323	+ * be expecting when parsing the rest of this file. */
	324	+ $pluralCount = false;
	325	+ if ( isset($headers['Plural-Forms']) ) {
	326	+ if ( preg_match( '/nplurals=([0-9]+).*;/', $headers['Plural-Forms'], $matches ) ) {
	327	+ $pluralCount = $matches[1];
	328	+ }
428	329	}
429	330
430		~~- $useCtxtAsKey = isset($this->extra['CtxtAsKey']) && $this->extra['CtxtAsKey'];~~
	331	+ // Extract some metadata from headers for easier use
	332	+ $metadata = array();
	333	+ if ( isset($headers['X-Language-Code']) ) {
	334	+ $metadata['code'] = $headers['X-Language-Code'];
	335	+ }
431	336
432		~~- $poformat = '".*"\n?(^".*"$\n?)*';~~
433		~~- $quotePattern = '/(^"\|"$\n?)/m';~~
	337	+ if ( isset($headers['X-Message-Group']) ) {
	338	+ $metadata['group'] = $headers['X-Message-Group'];
	339	+ }
434	340
435		~~- $sections = preg_split( '/\n{2,}/', $data );~~
436		~~- array_shift( $sections ); // First isn't an actual message~~
437		-
	341	+ // Then parse the messages
438	342	foreach ( $sections as $section ) {
439	343	if ( trim( $section ) === '' ) continue;
440	344
—	—	@@ -445,44 +349,49 @@
446	350	'comments' => array(),
447	351	);
448	352
449		~~- $matches = array();~~
450		~~- if ( preg_match( "/^msgid\s($poformat)/mx", $section, $matches ) ) {~~
451		~~- $item['id'] = self::formatForWiki( $matches[1] );~~
	353	+ $match = self::expectKeyword( 'msgid', $section );
	354	+ if ( $match !== null ) {
	355	+ $item['id'] = self::formatForWiki( $match );
452	356	} else {
453	357	throw new MWException( "Unable to parse msgid:\n\n$section" );
454	358	}
455	359
456		~~- if ( preg_match( "/^msgctxt\s($poformat)/mx", $section, $matches ) ) {~~
457		~~- $item['ctxt'] = self::formatForWiki( $matches[1] );~~
	360	+ $match = self::expectKeyword( 'msgctxt', $section );
	361	+ if ( $match !== null ) {
	362	+ $item['ctxt'] = self::formatForWiki( $match );
458	363	} elseif ( $useCtxtAsKey ) { // Invalid message
459	364	$metadata['warnings'][] = "Ctxt missing for {$item['id']}";
	365	+ error_log( "Ctxt missing for {$item['id']}" );
460	366	}
461	367
462	368
463	369	$pluralMessage = false;
464		~~- if ( preg_match( "/^msgid_plural\s($poformat)/mx", $section, $matches ) ) {~~
	370	+ $match = self::expectKeyword( 'msgid_plural', $section );
	371	+ if ( $match !== null ) {
465	372	$pluralMessage = true;
466		~~- $plural = self::formatForWiki( $matches[1] );~~
	373	+ $plural = self::formatForWiki( $match );
467	374	$item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}";
468	375	}
469	376
470	377	if ( $pluralMessage ) {
471		-
472	378	$actualForms = array();
473		~~- for ( $i = 0; $i < $pluralForms[1]; $i++ ) {~~
474		~~- if ( preg_match( "/^msgstr\[$i\]\s($poformat)/mx", $section, $matches ) ) {~~
475		~~- $actualForms[] = self::formatForWiki( $matches[1] );~~
	379	+ for ( $i = 0; $i < $pluralCount; $i++ ) {
	380	+ $match = self::expectKeyword( "msgstr\\[$i\\]", $section );
	381	+ if ( $match !== null ) {
	382	+ $actualForms[] = self::formatForWiki( $match );
476	383	} else {
477		~~- throw new MWException( "Plural not found, expecting $i" );~~
	384	+ throw new MWException( "Plural $i not found, expecting total of $pluralCount" );
478	385	}
479	386	}
480	387
481		~~- $item['str'] = '{{PLURAL:GETTEXT\|' . implode( '\|', $actualForms ) . '}}';~~
	388	+ // Keep the translation empty if no form has translation
	389	+ if ( array_sum( array_map( 'strlen', $actualForms ) ) > 0 ) {
	390	+ $item['str'] = '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}';
	391	+ }
482	392	} else {
483		-
484		~~- $matches = array();~~
485		~~- if ( preg_match( "/^msgstr\s($poformat)/mx", $section, $matches ) ) {~~
486		~~- $item['str'] = self::formatForWiki( $matches[1] );~~
	393	+ $match = self::expectKeyword( 'msgstr', $section );
	394	+ if ( $match !== null ) {
	395	+ $item['str'] = self::formatForWiki( $match );
487	396	} else {
488	397	throw new MWException( "Unable to parse msgstr:\n\n$section" );
489	398	}
—	—	@@ -519,16 +428,34 @@
520	429
521	430	$messages[$key] = $item['str'];
522	431	$template[$key] = $item;
523		-
524	432	}
525	433
526	434	return array(
527	435	'MESSAGES' => $messages,
528	436	'TEMPLATE' => $template,
529	437	'METADATA' => $metadata,
	438	+ 'HEADERS' => $headers
530	439	);
531	440	}
532	441
	442	+ public static function expectKeyword( $name, $section ) {
	443	+ /* Catches the multiline textblock that comes after keywords msgid,
	444	+ * msgstr, msgid_plural, msgctxt.
	445	+ */
	446	+ $poformat = '".*"\n?(^".*"$\n?)*';
	447	+
	448	+ $matches = array();
	449	+ if ( preg_match( "/^$name\s($poformat)/mx", $section, $matches ) ) {
	450	+ return $matches[1];
	451	+ } else {
	452	+ return null;
	453	+ }
	454	+ }
	455	+
	456	+ /**
	457	+ * Generates unique key for each message. Changing this WILL BREAK ALL
	458	+ * existing pages!
	459	+ */
533	460	public static function generateKeyFromItem( $item ) {
534	461	$lang = Language::factory( 'en' );
535	462	global $wgLegalTitleChars;
—	—	@@ -542,16 +469,37 @@
543	470	return "$hash-$snippet";
544	471	}
545	472
546		~~- public static function formatForWiki( $data ) {~~
	473	+ /**
	474	+ * This parses the Gettext text block format. Since trailing whitespace is
	475	+ * not allowed in MediaWiki pages, the default action is to append
	476	+ * \-character at the end of the message. You can also choose to ignore it
	477	+ * and use the trim action instead.
	478	+ */
	479	+ public static function formatForWiki( $data, $whitespace = 'mark' ) {
547	480	$quotePattern = '/(^"|"$\n?)/m';
548	481	$data = preg_replace( $quotePattern, '', $data );
549	482	$data = stripcslashes( $data );
550	483	if ( preg_match( '/\s$/', $data ) ) {
551		~~- $data .= '\\';~~
	484	+ if ( $whitespace === 'mark' )
	485	+ $data .= '\\';
	486	+ elseif ( $whitespace === 'trim' )
	487	+ $data = rtrim($data);
	488	+ else
	489	+ // FIXME: only triggered if there is trailing whitespace
	490	+ throw new MWException( 'Unknown action for whitespace' );
552	491	}
553	492	return $data;
554	493	}
555	494
	495	+ public static function parseHeaderTags( $headers ) {
	496	+ $tags = array();
	497	+ foreach ( explode("\n", $headers ) as $line ) {
	498	+ list( $key, $value ) = explode( ': ', $line, 2 );
	499	+ $tags[$key] = $value;
	500	+ }
	501	+ return $tags;
	502	+ }
	503	+
556	504	//
557	505	// WRITE
558	506	//

Status & tagging log

20:20, 1 January 2010 Siebrand (talk | contribs) changed the status of r59335 [removed: deferred added: ok]
09:28, 23 November 2009 Bryan (talk | contribs) changed the status of r59335 [removed: new added: deferred]