Comment: | After we deserialize the PHP data on disk, we end up with a Perl hash
that holds UTF-8 data, but none of its scalars carry the UTF-8 flag that
would indicate this.
As a result, YAML::Syck dumps ugly YAML, because it thinks we have
raw bytes instead of proper UTF-8.
Recurse through the hash and fix that. To reproduce the issue, try:
# php -r '$a = array("a key" => "a value", "a kíe" => "a valjúe"); echo serialize($a), "\n";' > serialized
# ^^ this yields qq[a:2:{s:5:"a key";s:7:"a value";s:6:"a kíe";s:9:"a valjúe";}] in 'serialized'
# cat serialized | perl load-and-dump.pl
# SAME: # cat serialized | perl -Cio load-and-dump.pl
use feature ':5.10';
use strict;
use warnings;
use YAML::Syck 'Dump';
use PHP::Serialization 'unserialize';
use File::Slurp 'slurp';
use Data::Dump 'dump';
use Encode 'decode';
# Read the complete serialized PHP payload from STDIN and unserialize it.
my $php_text = join '', <STDIN>;
my $raw_data = unserialize($php_text);

# Build a second hash in which each key and each value of the unserialized
# hash has been run through decode("utf-8", ...). Every value is handed to
# decode() directly; nothing is descended into.
my $decoded_data;
while (my ($raw_key, $raw_value) = each %$raw_data) {
    my $key_chars   = decode('utf-8', $raw_key);
    my $value_chars = decode('utf-8', $raw_value);
    $decoded_data->{$key_chars} = $value_chars;
}

# 1) The serialized input string itself:
say Dump($php_text);
# "a:2:{s:5:\"a key\";s:7:\"a value\";s:6:\"a kíe\";s:9:\"a valjúe\";}\n"

# 2) The unserialized hash before decoding (UTF-8 octets get escaped):
say Dump($raw_data);
# a key: a value
# "a k\xC3\xADe": "a valj\xC3\xBAe"

# 3) The decoded copy:
say Dump($decoded_data);
# a key: a value
# a kíe: a valjúe
Or use this version, which iterates recursively:
# perl load-and-dump.pl serialized
use feature ':5.10';
use strict;
use warnings;
use YAML::Syck 'Dump';
use PHP::Serialization 'unserialize';
use File::Slurp 'slurp';
# The first command-line argument names the file holding the serialized
# PHP data; read it in as a single string.
my $input_path = shift;
my $php_text   = slurp($input_path);

# Unserialize it, run the result through deutf8(), and print it as YAML.
my $raw_data     = unserialize($php_text);
my $decoded_data = deutf8($raw_data);
say Dump($decoded_data);
# Decodes the string values of a (possibly nested) hash from UTF-8 octets
# into Perl character strings, in place and recursively.
#
#   $hash - hash reference to walk; its values are modified in place
#   @path - optional list of keys leading to $hash; it is only extended and
#           handed down to the recursive calls, nothing reads it
#
# Nested hashes and arrays are descended into.  Any other reference and
# undef are left alone, because utf8::decode() would first force them into
# plain strings (e.g. "ARRAY(0x...)" or "") and so destroy the data.
# Hash keys are never touched.  Returns nothing.
sub iterate_and_mark_utf8
{
    my ($hash, @path) = @_;

    # keys() takes a snapshot of the key list, so the walk does not depend
    # on the hash's shared each() iterator.
    for my $k (keys %$hash)
    {
        _mark_value_utf8(\$hash->{$k}, @path, $k);
    }

    return;
}

# Decodes the single value behind $value_ref in place, recursing into
# hashes and arrays; @path is passed along as in iterate_and_mark_utf8().
sub _mark_value_utf8
{
    my ($value_ref, @path) = @_;
    my $type = ref ${$value_ref};

    if ($type eq 'HASH')
    {
        iterate_and_mark_utf8(${$value_ref}, @path);
    }
    elsif ($type eq 'ARRAY')
    {
        my $array = ${$value_ref};
        for my $i (0 .. $#{$array})
        {
            _mark_value_utf8(\$array->[$i], @path, $i);
        }
    }
    elsif ($type eq '' && defined ${$value_ref})
    {
        # Plain defined scalar: the only kind of value that is safe to decode.
        utf8::decode(${$value_ref});
    }

    return;
}
# Returns a decoded deep copy of the given value; the input is not modified.
#
#   hash ref       -> new hash ref with every key and value passed through
#                     deutf8()
#   array ref      -> new array ref with every element passed through deutf8()
#   defined scalar -> copy of it after utf8::decode()
#   anything else  -> returned unchanged (other references, undef), because
#                     utf8::decode() would first force it into a plain string
#                     (e.g. "CODE(0x...)" or "") and so destroy the data
#
# Always returns exactly one value, so it is safe to call in list context.
sub deutf8 {
    my ($data) = @_;
    my $type = ref $data;

    if ($type eq 'HASH') {
        my %decoded;
        for my $key (keys %{$data}) {
            $decoded{ deutf8($key) } = deutf8($data->{$key});
        }
        return \%decoded;
    }

    if ($type eq 'ARRAY') {
        return [ map { deutf8($_) } @{$data} ];
    }

    # Not safe to decode: hand back as-is.
    return $data if $type ne '' || !defined $data;

    # Decode a copy so the caller's scalar keeps its original octets.
    my $string = $data;
    utf8::decode($string);
    return $string;
} # end of deutf8 |