Question

NEW EDIT: Basically I've provided an example that isn't correct. In my real application the string will of course not always be "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt". Instead I will have an input window in Java, and then I will "escape" the Unicode characters to universal character names. The string will then be "unescaped" in C (I do this to avoid problems with passing multibyte characters from Java to C). So here is an example where I actually ask the user to input a string (filename):

#include <stdio.h>
#include <string.h>

int func(const char *fname);

int main()
{
   char src[100];
   scanf("%99s", src); /* src, not &src; %99s keeps the input inside the buffer */
   printf("%s\n", src);
   int exists = func((const char*) src);
   printf("Does the file exist? %d\n", exists);
   return exists;
}

int func(const char *fname)
{
    FILE *file;
    if ((file = fopen(fname, "r")) != NULL)
    {
        fclose(file);
        return 1;
    }
    return 0;
}

And now the program will think the universal character names are just part of the actual filename. So how do I "unescape" the universal character names included in the input?

FIRST EDIT: So I compile this example like this: "gcc -std=c99 read.c", where 'read.c' is my source file. I need the -std=c99 parameter because I'm using the prefix '\u' for my universal character name. If I change it to '\x' it works fine, and I can remove the -std=c99 parameter. But in my real application the input will not use the prefix '\x'; it will use the prefix '\u'. So how do I work around this?

This code gives the desired result but for my real application I can't really use '\x':

#include <stdio.h>
#include <string.h>

int func(const char *fname);

int main()
{
   char *src = "C:/Users/Familjen-Styren/Documents/V\x00E5gformer/20140104-0002/text.txt";
   int exists = func((const char*) src);
   printf("Does the file exist? %d\n", exists);
   return exists;
}

int func(const char *fname)
{
    FILE *file;
    if ((file = fopen(fname, "r")) != NULL)
    {
        fclose(file);
        return 1;
    }
    return 0;
}

ORIGINAL: I've found a few examples of how to do this in other programming languages like JavaScript, but I couldn't find any example of how to do this in C. Here is a sample code which produces the same error:

#include <stdio.h>
#include <string.h>

int func(const char *fname);

int main()
{
   char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";
   int len = strlen(src); /* This returns 68. */
   char fname[len];
   sprintf(fname,"%s", src);
   int exists = func((const char*) src);
   printf("%s\n", fname);
   printf("Does the file exist? %d\n", exists); /* Outputs 'Does the file exist? 0' which means it doesn't exist. */
   return exists;
}

int func(const char *fname)
{
    FILE *file;
    if ((file = fopen(fname, "r")) != NULL)
    {
        fclose(file);
        return 1;
    }
    return 0;
}

If I instead use the same string without universal character names:

#include <stdio.h>
#include <string.h>

int func(const char *fname);

int main()
{
   char *src = "C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text.txt";
   int exists = func((const char*) src);
   printf("Does the file exist? %d\n", exists); /* Outputs 'Does the file exist? 1' which means it does exist. */
   return exists;
}

int func(const char *fname)
{
    FILE *file;
    if ((file = fopen(fname, "r")) != NULL)
    {
        fclose(file);
        return 1;
    }
    return 0;
}

it will output 'Does the file exist? 1', which means it does indeed exist. But the problem is I need to be able to handle universal character names. So how do I unescape a string which contains universal character names?

Thanks in advance.

Solution

I'm re-editing the answer in the hope of making it clearer. First of all, I'm assuming you are familiar with this: http://www.joelonsoftware.com/articles/Unicode.html. It is required background knowledge when dealing with character encoding.

Now I'm starting with a simple test program I typed on my Linux machine, test.c:

#include <stdio.h>
#include <string.h>
#include <wchar.h>
#define BUF_SZ 255
void test_fwrite_universal(const char *fname)
{
    printf("test_fwrite_universal on %s\n", fname);

    /* Dump the raw bytes of the file name; strlen() returns a size_t, hence %zu. */
    printf("In memory we have %zu bytes: ", strlen(fname));
    for (unsigned i = 0; i < strlen(fname); ++i) {
        printf("%x ", (unsigned char)fname[i]);
    }
    printf("\n");

    /* Create a file with that name and write the name itself into it. */
    FILE *file = fopen(fname, "w");
    if (file) {
        fwrite((const void *)fname, 1, strlen(fname), file);
        fclose(file);
        file = NULL;
        printf("Wrote to file successfully\n");
    }
}

int main()
{
    test_fwrite_universal("file_\u00e5.txt");
    test_fwrite_universal("file_å.txt");   
    test_fwrite_universal("file_\u0436.txt");   
    return 0;
}

The source file is encoded as UTF-8. On my Linux machine my locale is en_US.UTF-8, so I compile and run the program like this:

gcc -std=c99 test.c -fexec-charset=UTF-8 -o test

./test

test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74 
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74 
Wrote to file successfully
test_fwrite_universal on file_ж.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74 
Wrote to file successfully

The source file is in UTF-8, my locale works in UTF-8, and the execution character set for char is UTF-8. In main I call test_fwrite_universal three times with character strings. The function prints each string byte by byte, then creates a file with that name and writes the string into it.

We can see that "file_\u00e5.txt" and "file_å.txt" are the same: 66 69 6c 65 5f c3 a5 2e 74 78 74. And sure enough (http://www.fileformat.info/info/unicode/char/e5/index.htm) the UTF-8 representation of code point U+00E5 is c3 a5. In the last example I used \u0436, which is the Russian character ж (UTF-8: d0 b6).
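
This byte-level view also shows what "unescaping" has to do at run time: user input that arrives as the literal characters \, u, 0, 0, e, 5 is never touched by the compiler (universal character names are only translated inside source-code literals), so the program has to parse the hex digits itself and emit the corresponding UTF-8 bytes. Below is a minimal sketch of such a decoder, assuming every escape has exactly four hex digits, only BMP code points occur, and the target encoding is UTF-8; the helper names unescape_ucn and encode_utf8 are my own, not anything standard:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Encode a BMP code point (U+0000..U+FFFF, surrogates not handled) as UTF-8.
   Returns the number of bytes written (1 to 3). */
static size_t encode_utf8(unsigned cp, char *out)
{
    if (cp < 0x80) {
        out[0] = (char)cp;
        return 1;
    } else if (cp < 0x800) {
        out[0] = (char)(0xC0 | (cp >> 6));
        out[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    } else {
        out[0] = (char)(0xE0 | (cp >> 12));
        out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
}

/* Replace every \uXXXX (exactly four hex digits assumed) in 'in' with its
   UTF-8 bytes; everything else is copied through unchanged. */
static void unescape_ucn(const char *in, char *out, size_t out_sz)
{
    size_t o = 0;
    while (*in && o + 4 < out_sz) {   /* worst case: 3 UTF-8 bytes + '\0' */
        if (in[0] == '\\' && in[1] == 'u' &&
            isxdigit((unsigned char)in[2]) && isxdigit((unsigned char)in[3]) &&
            isxdigit((unsigned char)in[4]) && isxdigit((unsigned char)in[5])) {
            char hex[5] = { in[2], in[3], in[4], in[5], '\0' };
            o += encode_utf8((unsigned)strtoul(hex, NULL, 16), out + o);
            in += 6;
        } else {
            out[o++] = *in++;
        }
    }
    out[o] = '\0';
}

int main(void)
{
    char fname[256];
    /* "\\u00e5" in the literal produces a real backslash followed by u00e5,
       i.e. exactly what the program would receive at run time. */
    unescape_ucn("file_\\u00e5.txt", fname, sizeof fname);
    printf("%s\n", fname);   /* prints file_å.txt on a UTF-8 terminal */
    return 0;
}

On a UTF-8 locale (the Linux case above) the decoded string can be passed straight to fopen; on Windows you still need the wide-character conversion shown further down.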

Now let's try the same on my Windows machine. Here I use MinGW and execute the same code:

C:\test>gcc -std=c99 test.c -fexec-charset=UTF-8 -o test.exe

C:\test>test

test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_╨╢.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74
Wrote to file successfully

So it looks like something went horribly wrong: printf is not writing the characters properly, and the files on disk also look wrong. Two things are worth noting: in terms of byte values the file name is the same on both Linux and Windows, and the content of the file is also correct when opened with something like Notepad++.

The reason for the problem is the C standard library on Windows and the locale. Where on Linux the system locale is UTF-8, on Windows my default locale is CP-437. And when I call functions such as printf or fopen, they assume the input is in CP-437, where c3 a5 is actually two separate characters.
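
If you want to check which code pages your program is actually running against, the Win32 calls GetACP and GetConsoleOutputCP report them. A small diagnostic sketch (Windows only, purely for inspection, not part of the fix):

#include <stdio.h>
#include <windows.h>

int main(void)
{
    /* GetACP(): the ANSI code page the narrow (char-based) CRT functions assume.
       GetConsoleOutputCP(): the code page the console uses to render output. */
    printf("ANSI code page:           %u\n", GetACP());
    printf("Console output code page: %u\n", GetConsoleOutputCP());
    return 0;
}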

Before we look at a proper Windows solution, let's try to explain why you get different results for file_å.txt vs. file_\u00e5.txt. I believe the key is the encoding of your source file. If I write the same test.c in CP-437:

C:\test>iconv -f UTF-8 -t cp437 test.c > test_lcl.c

C:\test>gcc -std=c99 test_lcl.c -fexec-charset=UTF-8 -o test_lcl.exe

C:\test>test_lcl

test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 10 bytes: 66 69 6c 65 5f 86 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_╨╢.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74
Wrote to file successfully

I now get a difference between file_å and file_\u00e5. The character å in the source file is now encoded as 0x86. Notice that this time the second string is 10 bytes long, not 11. If we look at the file name and tell Notepad++ to use UTF-8 we will see a funny result. The same goes for the actual data written to the file.
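
The same mapping can be reproduced programmatically: converting U+00E5 with WideCharToMultiByte yields the single byte 86 under code page 437 and the two bytes c3 a5 under CP_UTF8. A short sketch, again Windows only; the code point is written numerically to keep the source encoding out of the picture:

#include <stdio.h>
#include <windows.h>

int main(void)
{
    wchar_t wname[2] = { 0x00E5, L'\0' };   /* U+00E5, i.e. å */
    char out[8] = {0};

    /* Convert the single character to CP437 and to UTF-8 and dump the bytes. */
    int n437 = WideCharToMultiByte(437, 0, wname, 1, out, (int)sizeof out, NULL, NULL);
    printf("CP437 : %d byte(s):", n437);
    for (int i = 0; i < n437; ++i) printf(" %02x", (unsigned char)out[i]);
    printf("\n");

    int nutf8 = WideCharToMultiByte(CP_UTF8, 0, wname, 1, out, (int)sizeof out, NULL, NULL);
    printf("UTF-8 : %d byte(s):", nutf8);
    for (int i = 0; i < nutf8; ++i) printf(" %02x", (unsigned char)out[i]);
    printf("\n");
    return 0;
}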

Finally, how to get the damn thing working on Windows. Unfortunately, it seems that it is impossible to use the standard library with UTF-8-encoded strings; on Windows you can't set the C locale to UTF-8. See: What is the Windows equivalent for en_US.UTF-8 locale?

However, we can work around this with wide characters:

#include <stdio.h>
#include <string.h>
#include <windows.h>
#define BUF_SZ 255
void test_fopen_windows(const char *fname)
{
    wchar_t buf[BUF_SZ] = {0};

    /* Convert the UTF-8 name to UTF-16; buf is zero-initialized, so the result
       stays null-terminated as long as fewer than BUF_SZ-1 characters are written. */
    int sz = MultiByteToWideChar(CP_UTF8, 0, fname, strlen(fname), buf, BUF_SZ - 1);
    wprintf(L"converted %d characters\n", sz);
    wprintf(L"Converting to wide characters %ls\n", buf);

    FILE *file = _wfopen(buf, L"w");
    if (file) {
        fwrite((const void *)fname, 1, strlen(fname), file);
        fclose(file);
        wprintf(L"Wrote file %ls successfully\n", buf);
    }
}


int main()
{
    test_fopen_windows("file_\u00e5.txt");
    return 0;
}

To compile use:

gcc -std=gnu99 -fexec-charset=UTF-8 test_wide.c -o test_wide.exe

_wfopen is not ANSI compliant, and -std=c99 actually implies __STRICT_ANSI__, so you should use -std=gnu99 to have that function available.
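
To connect this back to the func() from the question, the same conversion works for a plain existence check. A sketch along the lines of test_fopen_windows above, assuming the incoming name is UTF-8 (func_w is my name for the variant, nothing standard):

#include <stdio.h>
#include <windows.h>
#define BUF_SZ 255

/* Wide-character variant of the question's func(): returns 1 if the file can
   be opened for reading, 0 otherwise. Assumes fname is UTF-8 encoded. */
int func_w(const char *fname)
{
    wchar_t wname[BUF_SZ];
    /* -1 tells MultiByteToWideChar to convert up to and including the '\0'. */
    if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, BUF_SZ) == 0)
        return 0;   /* conversion failed or the name is too long */
    FILE *file = _wfopen(wname, L"r");
    if (file) {
        fclose(file);
        return 1;
    }
    return 0;
}

Compile it the same way (with -std=gnu99) and it can stand in for func() on Windows.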

OTHER TIPS

Wrong array size (you forgot the ".txt", the terminating '\0', and that an encoded non-ASCII character takes up more than 1 byte):

// length of the string without the universal character name. 
// C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text
// 123456789012345678901234567890123456789012345678901234567890123
//          1         2         3         4         5         6
// int len = 63;

// C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text.txt
int len = 100;


char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";
char fname[len];
// or if you can use VLA
char fname[strlen(src)+1];

sprintf(fname, "%s", src);
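
If you would rather not count characters at all, sizing the buffer from strlen and copying with snprintf avoids both the overflow and the off-by-one. A small sketch reusing the question's string (compile with -std=c99 as before):

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";

    char fname[strlen(src) + 1];              /* +1 for the terminating '\0' */
    snprintf(fname, sizeof fname, "%s", src); /* never writes past the buffer */
    printf("%s\n", fname);
    return 0;
}
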
Licensed under: CC-BY-SA with attribution