Question

I select data from MySQL, and the database is not in UTF-8 (Unicode text was saved as Latin, so for example the Unicode string Đỗ Tiến (correct form) is stored as Äá»— Tiến). If I use PHP to echo it into HTML and simply set <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />, the web page displays correctly. If I do not set the meta tag, Chrome detects the page as windows-1258 encoded when opening it; after manually changing the encoding to Unicode (UTF-8), the web page displays correctly.

The problem is: when I select data from mysql using jdbc I convert like this:

    // The mis-decoded text is spelled with Unicode escapes because it contains U+0090,
    // an invisible C1 control character that copy/paste and re-encoding easily drop.
    // It is the UTF-8 encoding of "Đỗ tiến" (bytes C4 90 E1 BB 97 20 74 69 E1 BA BF 6E)
    // read back one byte per character.
    String mojibake = "\u00C4\u0090\u00E1\u00BB\u2014 ti\u00E1\u00BA\u00BFn";
    // Neither charset below can encode every character of the string (ISO-8859-1 has
    // no U+2014; Cp1258 presumably has no U+0090), so one byte is replaced by '?' and
    // the UTF-8 decoding breaks at that position.
    byte[] asciiBytes1 = mojibake.getBytes("Cp1258");
    byte[] asciiBytes2 = mojibake.getBytes("ISO-8859-1");
    String unicode1 = new String(asciiBytes1, "UTF-8");
    String unicode2 = new String(asciiBytes2, "UTF-8");
    System.out.println(unicode1);//�?ỗ tiến
    System.out.println(unicode2);//Đ�? tiến

As a result, Java does not convert the text properly. I tried many of the encodings listed at http://docs.oracle.com/javase/1.4.2/docs/guide/intl/encoding.doc.html, not only Cp1258 and ISO-8859-1, but none of them works. The two simple ways to convert are to use an HTML file containing the Äá»— tiến string as I mentioned before, or to use Notepad++: set the encoding to ANSI, paste the Äá»— tiến string, then change the encoding to UTF-8, and it displays Đỗ Tiến (the correct string I want).

Was it helpful?

Solution

That's kind of complicated: the text is in a modified Windows-1252 in which the bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which are normally unassigned, are mapped to the corresponding C1 control characters. It seems Java doesn't take this into account by default when using Windows-1252.

It is easiest to just fix your database and use UTF-8 everywhere.

Here's the code anyway

/**
 * Encodes a string as "modified Windows-1252": standard Windows-1252, except that the
 * five bytes it leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) stand for the C1
 * control characters with the same code (U+0081, U+008D, U+008F, U+0090, U+009D).
 *
 * <p>Each UTF-16 code unit of the input produces exactly one output byte. Code units
 * that have no byte in this encoding (including both halves of a surrogate pair) are
 * replaced by {@code '?'} (0x3F).
 *
 * @param str the text to encode; must not be {@code null}
 * @return a new array of {@code str.length()} bytes
 * @throws NullPointerException if {@code str} is {@code null}
 */
public static byte[] getBytesModifiedW1252( String str ) {
    // Characters assigned to bytes 0x80..0x9F, in byte order. This is the only range
    // where the byte value differs from the code point, so it is the only table
    // needed; as a compile-time constant it costs nothing per call.
    final String highRange =
            "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F"
            + "\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178";
    final byte replacement = (byte) '?';

    final int length = str.length();
    final byte[] ret = new byte[length];

    for( int i = 0; i < length; ++i ) {
        final char c = str.charAt(i);
        if( c < 0x80 || ( c >= 0xA0 && c <= 0xFF ) ) {
            // ASCII and the upper Latin-1 half map to the byte with the same value.
            ret[i] = (byte) c;
        } else {
            // Either a character from the 0x80..0x9F row or one that cannot be encoded.
            final int offset = highRange.indexOf(c);
            ret[i] = offset < 0 ? replacement : (byte) (0x80 + offset);
        }
    }

    return ret;
}

/**
 * Demo: repairs UTF-8 text that was mis-decoded as modified Windows-1252 by turning
 * the characters back into their original bytes and decoding those as UTF-8.
 */
public static void main(String args[]) throws UnsupportedEncodingException {
    // The mojibake is spelled with Unicode escapes because it contains U+0090, an
    // invisible C1 control character that is easily lost when the literal is copied.
    String mojibake = "\u00C4\u0090\u00E1\u00BB\u2014 ti\u00E1\u00BA\u00BFn";
    byte[] bytes = getBytesModifiedW1252( mojibake );
    System.out.println(new String(bytes, "UTF-8"));
    //Đỗ tiến
}

Here's the opposite:

/**
 * Decodes bytes as "modified Windows-1252": standard Windows-1252, except that the
 * five bytes it leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) decode to the C1
 * control characters with the same code (U+0081, U+008D, U+008F, U+0090, U+009D).
 *
 * <p>Every byte yields exactly one character, so the result has {@code bytes.length}
 * characters.
 *
 * @param bytes the bytes to decode; must not be {@code null}
 * @return the decoded string
 * @throws NullPointerException if {@code bytes} is {@code null}
 */
public static String getStringModifiedW1252( byte[] bytes ) {
    // Characters assigned to bytes 0x80..0x9F, in byte order. Every other byte decodes
    // to the code point with the same value, so no larger table is needed.
    final String highRange =
            "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F"
            + "\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178";

    final StringBuilder ret = new StringBuilder(bytes.length);

    for( final byte b : bytes ) {
        // Java bytes are signed; mask to get the unsigned value 0..255.
        final int unsigned = b & 0xFF;
        final boolean remapped = unsigned >= 0x80 && unsigned <= 0x9F;
        ret.append( remapped ? highRange.charAt(unsigned - 0x80) : (char) unsigned );
    }

    return ret.toString();
}

/**
 * Demo of the reverse direction: prints what correct UTF-8 text looks like after its
 * bytes have been decoded as modified Windows-1252.
 */
public static void main(String args[]) throws UnsupportedEncodingException {
    // "Đỗ tiến", escaped so the result does not depend on the source-file encoding.
    String str = "\u0110\u1ED7 ti\u1EBFn";
    String w1252 = getStringModifiedW1252( str.getBytes("UTF-8"));
    System.out.println(w1252);
    // Prints 12 characters, one per UTF-8 byte: U+00C4 U+0090 U+00E1 U+00BB U+2014,
    // space, 't', 'i', U+00E1 U+00BA U+00BF, 'n' -- that is, Äá»— tiáº¿n with an
    // invisible U+0090 right after the Ä.
}

You probably want to stash the map and the array somewhere (for example in static final fields) instead of creating them every time the methods are called.

OTHER TIPS

try this

// NOTE(review): each string is encoded and then decoded with the SAME charset
// (Cp1258 -> Cp1258, ISO-8859-1 -> ISO-8859-1), so nothing here reinterprets the
// bytes as UTF-8. The trailing output comments are identical to those in the
// question's snippet -- verify them before relying on this suggestion.
byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
String unicode1 = new String(asciiBytes1, "Cp1258");
String unicode2 = new String(asciiBytes2, "ISO-8859-1");
System.out.println(unicode1);//�?ỗ tiến
System.out.println(unicode2);//Đ�? tiến
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top