r56768 MediaWiki - Code Review archive

Repository:MediaWiki
Revision:r56767‎ | r56768 | r56769 >
Date:16:51, 22 September 2009
Author:avar
Status:deferred
Tags:
Comment:
After we de-serialize the PHP data on disk we end up with a Perl hash
with UTF-8 data but none of the scalars have UTF-8 markers indicating
that.

That results in YAML::Syck dumping ugly YAML because it thinks we have
raw bytes instead of proper UTF-8.

Recurse through the hash and fix that. To reproduce this issue try doing:

# php -r '$a = array("a key" => "a value", "a kíe" => "a valjúe"); echo serialize($a), "\n";' > serialized
# ^^ this yields qq[a:2:{s:5:"a key";s:7:"a value";s:6:"a kíe";s:9:"a valjúe";}] in 'serialized'
# cat serialized | perl load-and-dump.pl
# SAME: # cat serialized | perl -Cio load-and-dump.pl

use feature ':5.10';
use strict;
use warnings;

use YAML::Syck 'Dump';
use PHP::Serialization 'unserialize';
use File::Slurp 'slurp';
use Data::Dump 'dump';
use Encode 'decode';

# Read the whole PHP-serialized payload from standard input.
my $serialized   = join '', <STDIN>;
my $unserialized = unserialize($serialized);

# Build a second hash whose keys and values have been run through
# Encode's decode() as UTF-8. Only the top level is handled here.
my $unserialized_utf8;
for my $key (keys %$unserialized) {
    my $decoded_key   = decode("utf-8", $key);
    my $decoded_value = decode("utf-8", $unserialized->{$key});

    $unserialized_utf8->{$decoded_key} = $decoded_value;
}

# The raw serialized string, dumped as a single YAML scalar.
say Dump($serialized);
# "a:2:{s:5:\"a key\";s:7:\"a value\";s:6:\"a kíe\";s:9:\"a valjúe\";}\n"

# The unserialized hash as-is: non-ASCII bytes come out escaped.
say Dump($unserialized);
# a key: a value
# "a k\xC3\xADe": "a valj\xC3\xBAe"

# The decoded copy: non-ASCII characters come out readable.
say Dump($unserialized_utf8);
# a key: a value
# a kíe: a valjúe

Or use this version, which iterates recursively:

# perl load-and-dump.pl serialized

use feature ':5.10';
use strict;
use warnings;

use YAML::Syck 'Dump';
use PHP::Serialization 'unserialize';
use File::Slurp 'slurp';

# The file to read is the first command-line argument.
my $path = shift @ARGV;

# slurp() is called in scalar context so the file arrives as one string.
my $serialized   = slurp($path);
my $unserialized = unserialize($serialized);

# Decode the whole structure with deutf8() before handing it to YAML::Syck.
my $unserialized_utf8 = deutf8($unserialized);

say Dump($unserialized_utf8);

# Walks a nested data structure and decodes its string values from UTF-8
# octets to Perl characters *in place*, using utf8::decode.
#
# Hash keys are left untouched; only values are decoded. Nested hashes and
# arrays are descended into. Undefined values and references of any other
# kind are skipped, because utf8::decode is meant for plain strings and
# handing it a reference or undef can clobber the value.
#
# Arguments:
#   $hash - hash reference to process (modified in place)
#   @path - optional list of keys leading to $hash; only extended and passed
#           along during recursion, never used for the decoding itself
#
# Returns nothing.
sub iterate_and_mark_utf8
{
    my ($hash, @path) = @_;

    # keys() rather than each(): each() resumes from wherever the hash's
    # iterator was last left, which can silently skip entries.
    for my $key (keys %$hash)
    {
        _mark_utf8_value(\$hash->{$key}, @path, $key);
    }

    return;
}

# Decodes the single value that $slot (a scalar reference) points at,
# descending into hashes and arrays.
sub _mark_utf8_value
{
    my ($slot, @path) = @_;
    my $type = ref $$slot;

    if ($type eq 'HASH')
    {
        iterate_and_mark_utf8($$slot, @path);
    }
    elsif ($type eq 'ARRAY')
    {
        my $index = 0;
        # $_ aliases each array element, so \$_ lets the element itself
        # be decoded in place.
        _mark_utf8_value(\$_, @path, $index++) for @{ $$slot };
    }
    elsif (!$type && defined $$slot)
    {
        utf8::decode($$slot);
    }

    return;
}

# Returns a copy of the given data with every string decoded from UTF-8
# octets to Perl characters (via utf8::decode).
#
# - Hash references yield a new hash; both keys and values are decoded,
#   recursively.
# - Array references yield a new array with each element decoded,
#   recursively.
# - Any other reference and undef are returned unchanged: utf8::decode is
#   meant for plain strings, and handing it a reference or undef can
#   replace the value with a string.
# - Plain scalars are copied and the copy is decoded, so the caller's
#   value is never modified.
#
# Takes a single argument and always returns exactly one scalar, which is
# what keeps the key/value pairing intact inside the map calls below.
sub deutf8 {
    my ($data) = @_;
    my $type = ref $data;

    if ($type eq "HASH") {
        # Flattening the hash feeds keys and values alike through deutf8.
        return { map { deutf8($_) } %{$data} };
    }
    if ($type eq "ARRAY") {
        return [ map { deutf8($_) } @{$data} ];
    }

    # Other references (objects, code refs, ...) and undef pass through.
    return $data if $type or not defined $data;

    my $s = $data;
    utf8::decode($s);
    return $s;
}
Modified paths:
  • /trunk/extensions/Translate/utils/TranslateYaml.php (modified) (history)

Diff [purge]

Index: trunk/extensions/Translate/utils/TranslateYaml.php
@@ -69,9 +69,23 @@
7070 file_put_contents( $tf, $sdata );
7171
7272 $cmd = "perl -MYAML::Syck=DumpFile -MPHP::Serialization=unserialize -MFile::Slurp=slurp -wle '" .
73 - "my \$serialized = slurp(\"$tf\");" .
74 - "my \$unserialized = unserialize(\$serialized);" .
75 - "DumpFile(q[$tf.yaml], \$unserialized);' 2>&1";
 73+ '$YAML::Syck::Headless = 1;' .
 74+ '$YAML::Syck::SortKeys = 1;' .
 75+ 'my $tf = q[' . $tf . '];' .
 76+ 'my $serialized = slurp($tf);' .
 77+ 'my $unserialized = unserialize($serialized);' .
 78+ 'my $unserialized_utf8 deutf8($unserialized);' .
 79+ 'DumpFile(qq[$tf.yaml], $unserialized_utf8);' .
 80+ 'sub deutf8 {' .
 81+ 'if(ref($_[0]) eq "HASH") {' .
 82+ 'return { map { deutf8($_) } %{$_[0]} };' .
 83+ '} else {' .
 84+ 'my $s = $_[0];' .
 85+ 'utf8::decode($s);' .
 86+ 'return $s;' .
 87+ '}' .
 88+ '}' .
 89+ ' 2>&1';
7690 $out = wfShellExec( $cmd, &$ret );
7791 if ( $ret != 0 ) {
7892 wfDebugDieBacktrace("The command '$cmd' died in execution with exit code '$ret': $out");

Status & tagging log