First, you need to change from `read` to `sysread`. `read` reads until it has the requested number of chars (or reaches end of file), while `sysread` returns as soon as data are available.
But returning data as soon as it arrives means you might have an incomplete UTF-8 character at the end of the buffer, so you'll have to decode only the characters that were fully received and buffer the rest.
# Decodes as much well-formed UTF-8 as possible from the front of a byte
# buffer.
#
# The argument is modified in place (through the @_ alias): the bytes that
# were decoded are removed from it, and whatever could not be decoded yet is
# left behind so the caller can append more input and call again.
#
# Returns:
#   - the decoded text, when at least one character was decoded;
#   - the empty string, when the buffer is empty or holds only the beginning
#     of a character whose remaining bytes have not arrived yet;
#   - undef, when the buffer starts with bytes that can never become a
#     well-formed character, whatever bytes might follow.
#
# Malformed bytes that come after some decodable text are not reported by
# the call that returns that text; they are reported by the next call.
sub decode_utf8_partial {
    # FB_QUIET makes decode() stop silently at the first byte it cannot
    # handle and leave the unprocessed remainder in $_[0].
    my $s = decode('UTF-8', $_[0], Encode::FB_QUIET);

    # Either some text was decoded or nothing is pending: nothing to diagnose.
    return $s if length($s) || !length($_[0]);

    # Nothing was decoded although bytes are pending. That is acceptable only
    # if the whole buffer is a proper prefix of one well-formed sequence
    # (no overlong forms, no surrogates, nothing above U+10FFFF).
    return $s
        if $_[0] =~ /
            \A
            (?: [\xC2-\xDF]
            |   \xE0                (?: [\xA0-\xBF] )?
            |   [\xE1-\xEC\xEE\xEF] (?: [\x80-\xBF] )?
            |   \xED                (?: [\x80-\x9F] )?
            |   \xF0                (?: [\x90-\xBF] [\x80-\xBF]? )?
            |   [\xF1-\xF3]         (?: [\x80-\xBF] [\x80-\xBF]? )?
            |   \xF4                (?: [\x80-\x8F] [\x80-\xBF]? )?
            )
            \z
        /x;

    # Anything else can never be completed into a valid character. An explicit
    # undef is returned so the sub always yields exactly one scalar.
    return undef;
}
# Read raw bytes: decoding is done by hand below, so the handle must not
# translate the data on the way in.
binmode($fh);

# Bytes received but not decoded yet. It starts as an empty string rather
# than undef because its length is used as the sysread() offset.
my $buf = '';

READ:
while (1) {
    # Append whatever is available (at most 64 KiB) after the pending bytes.
    my $rv = sysread($fh, $buf, 64*1024, length($buf));
    die $! if !defined($rv);
    last READ if !$rv;    # 0 bytes read: end of file

    # decode_utf8_partial() reports malformed bytes only when nothing could
    # be decoded in front of them, so keep calling it until it returns an
    # empty string (need more input) or undef (bad input).
    DECODE:
    while (1) {
        # Leaves undecoded part in $buf
        my $s = decode_utf8_partial($buf);
        die "Bad UTF-8" if !defined($s);
        last DECODE if !length($s);

        ... do something with $s ...
    }
}

# Bytes still pending at end of file are a character that was cut short.
die "Truncated UTF-8 at end of input" if length($buf);