using wregex creates garbage output?

https://stackoverflow.com/questions/17770000

03-06-2022
|

Вопрос

I created a simple program using regex to tokenize a file. For nonUnicode content it works fine. For a Unicode based content I made a wregex version but this version creates garbage output!

I am trying to output Unicode characters or strings on a console screen instead I stored them in a map<wstring,int> and a file of type wostream so that the values are intact and correct. After running the application the file containing the extracted tokens, contain just garbage!!!

Whats wrong with this program and how can I fix it?

#include "stdafx.h"

#include <iostream>
#include <regex>
#include <fstream>
#include <string>
#include <map>
using namespace std;

int main()
{
    string path="";    

    map<wstring, int> container;
    wifstream file("ftest.txt"); 
    wregex reg(_T("\\w+"));
    wstring s=_T("");
    while (file.good())
    {
        file>>s;
        for ( wsregex_iterator it (s.begin(), s.end(), reg),it_end; it != it_end; ++it)
        {
            container[(wstring)(*it)[0]]++ ;
        }

    }

    cout <<"\nDone..."<< endl;
    wofstream output("list.txt",ios::app);
    for (auto item : container)
    {
        //cout<<item.first<<" : "<<item.second<<endl;
        output<<item.first<<" : "<<item.second<<endl;
    }
    system("pause");
    return 0;
}

This is the content of ftest.txt:

بسم الله الرحمن الرحیم 
واشنگتن پست طی گزارشی اعلام کرد کنگره آمریکا برخلاف رویه سابق، ارسال مصوبه سالانه خود در زمینه تحریم های ایران به کاخ سفید را به تاخیر انداخت و به نظر می رسد انتخاب حسن روحانی به عنوان رئیس جمهوری جدید ایران علت این امر بوده است.
0 0 0 نظر
[-]     اندازه متن  [+]


به دنبال انتخاب حسن روحانی به عنوان رئیس جمهوری جدید ایران، کنگره آمریکا بر اساس برخی ملاحظات ارسال مصوبه سالانه خود در زمینه تحریم های ایران به کاخ سفید را به تاخیر انداخت.

And this is the garbage output inside list.txt

0 : 3
1 : 1
14 : 1
16 : 1
26 : 1
27 : 1
5 : 2
50 : 1
6 : 1
7 : 1
ط : 475
طھ : 12
طھط : 20
طھطµظ : 1
طھظ : 10
طھغ : 2
ط² : 6
ط²ط : 6
ط²ظ : 6
ط³ : 5
ط³ط : 12
ط³طھ : 8
ط³طھط : 4
ط³طھظ : 2
ط³ظ : 10
ط³غ : 1
طµ : 1
طµط : 1
طµظ : 6
ط¹ط : 1
ط¹ظ : 8
ظ : 291
ع : 54
غ : 95
ï : 1

Решение

This link Solved my problem.:) for a portable solution check this link out.

And this is the final code which works flawlessly :) :

#include "stdafx.h"
#include <iostream>
#include <regex>
#include <fstream>
#include <string>
#include <map>
#include <fcntl.h> // for _wfopen_s
#include <io.h> //for _setmode


using namespace std;

int main()
{
    string path = "";    

    map<wstring, int> container;

     FILE* fp;
    _wfopen_s (&fp, L"ftest.txt", L"r");
    _setmode (_fileno (fp), _O_U8TEXT);

    wifstream file(fp);
    wregex reg(L"\\w+");

    wstring s = L"";

    while (file.good())
    {
        getline(file,s);    
        for ( wsregex_iterator it (s.begin(), s.end(), reg), it_end ; it != it_end ; ++it)
        {
            container[(wstring)(*it)[0]]++ ;
        }
    }

    cout <<"\nDone..."<< endl;

    fclose(fp);

    _wfopen_s (&fp, L"list.txt", L"w");
    _setmode (_fileno (fp), _O_U8TEXT);
    wofstream output(fp);

    for (auto item : container)
    {
        wcout<<item.first <<" : "<<item.second <<endl;
        //write output to list.txt
        output<<item.first <<" : "<<item.second <<endl;
    }
    fclose(fp);
    system("pause");
    return 0;
}

Другие советы

You need to convert from the UTF8 encoding of the file to the UTF16 encoding that std::wregex uses.

With C++11 you can do it using std::codecvt_utf8_utf16:

std::wifstream file("ftest.txt"); 
file.imbue(std::locale(file.getloc(), new std::codecvt_utf8_utf16<wchar_t>());
// "file" will now read UTF8 and output UTF16.

Pre C++11 you can use boost::locale to convert:

e.g.

auto w_s = boost::locale::utf_to_utf<char>(s);

Лицензировано под: CC-BY-SA с атрибуция

Не связан с StackOverflow