The following code uses Ruby's low-level encoding functions to rewrite double-encoded UTF-8 (UTF-8 bytes that were misread as CP1252 and then encoded as UTF-8 a second time) back into normal UTF-8.
#!/usr/bin/env ruby
# frozen_string_literal: true

# Repairs "double-encoded" UTF-8: text whose UTF-8 bytes were interpreted as
# CP1252 characters and then encoded to UTF-8 a second time.  Reads the damaged
# bytes from standard input and writes the repaired bytes to standard output.

# Lead byte of a two-byte UTF-8 sequence (0xC2 0xNN) whose code point equals
# its second byte.  Only such sequences may be passed through verbatim when
# CP1252 has no mapping for the code point.
PASSTHROUGH_LEAD_BYTE = 0xC2

# Undoes one layer of UTF-8 encoding by converting +data+ from UTF-8 to CP1252
# one byte at a time and collecting the resulting bytes.
#
# @param data [String] the double-encoded input; only its bytes are used, the
#   string's own encoding tag is ignored
# @return [String] the repaired bytes, tagged as binary (ASCII-8BIT)
# @raise [ArgumentError] if a character has no CP1252 mapping and is not a
#   0xC2-prefixed two-byte sequence, if the input is not valid UTF-8, or if it
#   ends in the middle of a character
def fix_double_utf8(data)
  converter = Encoding::Converter.new(Encoding::UTF_8, Encoding::CP1252)
  output = String.new.force_encoding(Encoding::BINARY)
  prev_byte = nil

  data.each_byte.with_index do |byte, offset|
    # primitive_convert consumes its source buffer, so hand it a fresh
    # one-byte string on every iteration.
    status = converter.primitive_convert(byte.chr, output, nil, nil, Encoding::Converter::PARTIAL_INPUT)

    case status
    when :undefined_conversion
      # Compare integer byte values: comparing one-byte strings is unreliable
      # because strings holding non-ASCII bytes under different encodings are
      # never equal.
      unless prev_byte == PASSTHROUGH_LEAD_BYTE
        raise ArgumentError,
              "ERROR found byte #{byte.chr.dump} in stream (prev #{(prev_byte ? prev_byte.chr : '').dump})"
      end

      # The converter tags the destination buffer as CP1252 after each call;
      # switch to binary so the raw byte can be appended without an encoding
      # compatibility error.
      output.force_encoding(Encoding::BINARY) << byte.chr
    when :invalid_byte_sequence
      # Without this check the converter discards the offending bytes silently.
      raise ArgumentError, "ERROR invalid UTF-8 byte sequence at byte offset #{offset}"
    end

    prev_byte = byte
  end

  # A final call without PARTIAL_INPUT reveals a character cut off at the end.
  if converter.primitive_convert(nil, output) == :incomplete_input
    raise ArgumentError, 'ERROR input ends in the middle of a UTF-8 character'
  end

  output.force_encoding(Encoding::BINARY)
end

if __FILE__ == $PROGRAM_NAME
  begin
    # Binary mode on both ends and write (not puts) keep the output
    # byte-exact: no newline translation and no newline appended.
    $stdout.binmode
    $stdout.write(fix_double_utf8($stdin.binmode.read))
  rescue ArgumentError => e
    # abort prints the message to stderr and exits with a non-zero status so
    # a failure is visible to the surrounding pipeline.
    abort(e.message)
  end
end
It is meant to be used in a pipeline:
cat $PROBLEMATIC_FILE | ./fix-double-utf8-encoding.rb > $CORRECTED_FILE