Index and histogram of Unicode characters in input using C++

https://stackoverflow.com/questions/11128893

16-06-2021
|

Question

count the occurrences of each symbol and the location they appear in the text, word or line I have a list of words like so in many languages.

What I am trying to do it count the occurrences of each character and the position in the text where they are, or are common. also if it's possible to count the common number of syllables that would also be helpful.

sommige
disa
بَعْض - ba'th
mi qani - մի քանի
bəzi
batzuk
nyeykі/nyeykaya/nyeykaye/nyeykіya - нейкі/нейкая/нейкае/нейкія
kisu - কিসু
afouhe - بعض
neki
alguns
njakoj - някой
一些
algú/alguns/alguna/algunes
neki
někteří
nogle
berekhey āz - برخی از
een paar
kam - كام
some
iuj
mõned
berekhey āz - برخی از
ilan
joitakin
sommige
certains
algúns
ramdenime - რამდენიმე
einige
peripou - περίπου
keṭelāk - કેટલાક 
wasu
kèk
khemeh - כמה
kuch - कुछ
néhány
sumir
beberapa
roinnt
alcuni
ikutsu ka no - いくつかの
kelavu
មួយចំនួន
조금 - jo geum
هەندێک
aliquis
daži
keli
nekoi - некои
misy
beberapa
ഏതാനും
xi
yī xiē  - 一些
kaahi - कांही
neki
shwiya - بعض
kehi - केही
enkelte
gari
berekhey āz - برخی از
b'eda - بعضی
kilka
ਕਈ
alguns
câţiva/câteva
некоторые - nekotorыe

some
neki - неки
samahara - සමහර
niektorí
nekaj
algunos
baadhi
några
ilan
yakchand - якчанд
konjam - கொஞ்சம்
yan
konni - కొన్ని
บาง - baang
bazı
dejakі - деякі
chened - چند
ba'zi, qandaydir
một số
rhai
עטלעכע
die
okumbalwa

this is the current code sehe made it work with unicode

//#define PREFER_BOOST
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <istream>
#include <algorithm>
#include <iterator>
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif

using namespace std;

std::map<wchar_t, int> letterCount;
struct Counter
{
    void operator()(wchar_t  item) 
    { 
        if ( !std::isspace(item) )
            ++letterCount[std::tolower(item)]; //remove tolower if you want case-sensitive solution!
    }
};

int main()
{
    std::setlocale(LC_ALL, "en_US.UTF-8");
    wifstream input("input.txt");

#ifdef PREFER_BOOST 
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8"); 
#else
    std::locale loc("en_US.UTF-8");
#endif
    input.imbue(loc);
    wcout.imbue(loc);

    istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        wcout << it->first <<" : "<< it->second << endl;
    }
}

this was my original code

 #include <iostream>
  #include <cctype>
 #include <fstream>
#include <string>
 #include <map>
  #include <istream>
   #include <vector>
 #include <list>
 #include <algorithm>
#include <iterator>


using namespace std;
 struct letter_only: std::ctype<char> 
 {
    letter_only(): std::ctype<char>(get_table()) {}

    static std::ctype_base::mask const* get_table()
    {
       static std::vector<std::ctype_base::mask> 
             rc(std::ctype<char>::table_size,std::ctype_base::space);

       std::fill(&rc['A'], &rc['z'+1], std::ctype_base::alpha);
       return &rc[0];
    }
 };

struct Counter
{
    std::map<char, int> letterCount;
    void operator()(char  item) 
    { 
       if ( item != std::ctype_base::space)
         ++letterCount[tolower(item)]; //remove tolower if you want case-sensitive solution!
    }
    operator std::map<char, int>() { return letterCount ; }
};

int main()
{
     ifstream input;
     input.imbue(std::locale(std::locale(), new letter_only())); //enable reading only leters only!
     input.open("T");
     istream_iterator<char> start(input);
     istream_iterator<char> end;
     std::map<char, int> letterCount = std::for_each(start, end, Counter());
     for (std::map<char, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
     {
          cout << it->first <<" : "<< it->second << endl;
     }
 }

Example of what I am trying to get as out put

к : 10 (2,5) (1,5,8) (2,7) (1,3,5)

the letter that is found K then the number of occurrences it was found 10 then the locations in each word where it was found as mentioned before.

Solution

Here's what I got, and it seems to works quite well on my machine¹.

//#define PREFER_BOOST
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <istream>
#include <algorithm>
#include <iterator>
#ifdef PREFER_BOOST
#include <boost/locale.hpp>
#endif

using namespace std;

std::map<wchar_t, int> letterCount;
struct Counter
{
    void operator()(wchar_t  item) 
    { 
        if ( !std::isspace(item) )
            ++letterCount[std::tolower(item)]; //remove tolower if you want case-sensitive solution!
    }
};

int main()
{
    std::setlocale(LC_ALL, "en_US.UTF-8");
    wifstream input("input.txt");

#ifdef PREFER_BOOST 
    boost::locale::generator gen;
    std::locale loc = gen("en_US.UTF-8"); 
#else
    std::locale loc("en_US.UTF-8");
#endif
    input.imbue(loc);
    wcout.imbue(loc);

    istreambuf_iterator<wchar_t> start(input), end;
    std::for_each(start, end, Counter());

    for (std::map<wchar_t, int>::iterator it = letterCount.begin(); it != letterCount.end(); ++it)
    {
        wcout << it->first <<" : "<< it->second << endl;
    }
}

^{If you prefer the boost locale library, you need to link to boost_system, boost_locale and boost_thread; I didn't see any noticeable difference in behaviour}

Output:

' : 3 , : 1 - : 32 / : 10 a : 67 b : 16 c : 7
d : 12 e : 61 f : 1 g : 16 h : 17 i : 46 j : 8
k : 41 l : 19 m : 19 n : 47 o : 20 p : 5 q : 3
r : 18 s : 21 t : 12 u : 21 v : 3 w : 3 x : 2
y : 21 z : 7 á : 1 â : 2 å : 1 è : 1 é : 1
í : 2 õ : 1 ú : 2 ā : 4 ē : 1 ě : 1 ī : 1
ı : 1 ř : 1 ţ : 1 ž : 1 ə : 1 ί : 1 ε : 1
ο : 1 π : 2 ρ : 1 υ : 1 а : 3 д : 2 е : 10
и : 2 й : 5 к : 10 н : 9 о : 4 р : 1 т : 1
ч : 1 ы : 2 я : 5 і : 6 ա : 1 ի : 2 մ : 1
ն : 1 ք : 1 ה : 1 ט : 1 כ : 2 ל : 1 מ : 1
ע : 3 ا : 4 ب : 7 خ : 3 د : 2 ر : 3 ز : 3
ض : 4 ع : 4 ك : 1 م : 1 ن : 2 ه : 1 َ : 1
 : 1 چ : 1 ک : 1 ی : 4 ێ : 1 ە : 1 ं : 1
ी : 2 ु : 1 े : 1 ক : 1 ি : 1 ু : 1 ਕ : 1
ક : 2 ટ : 1 ે : 1 க : 1 ் : 2 క : 1 ి : 1
 : 1 ഏ : 1 ു : 1 ර : 1 ස : 1 ง : 1 า : 1
ა : 1 დ : 1 ე : 2 ი : 1 მ : 2 ნ : 1 რ : 1
ច : 1 ន : 2 ម : 1 យ : 1 ួ : 2 ំ : 1 ố : 1
ộ : 1 い : 1 か : 1 く : 1 の : 1 一 : 2 些 : 2
금 : 1

¹. I might not get all characters displayed, but it might be due to my terminal font.

OTHER TIPS

Here is what I got it to do, with the help of all the others, thank you.

#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <istream>
#include <algorithm>
#include <iterator>
#include <sstream>
using namespace std;

struct op {
    int O;
    wstring P;
};

template <typename T>
wstring NumberToString ( T Number )
{
    wstringstream ss;
    ss << Number;
    ss<<",";
    return ss.str();
}

std::map<wchar_t, struct op> letterCount;
void Counter(wchar_t  item) {
    if ( !std::isspace(item) ) {
        ++letterCount[std::tolower(item)].O;    //remove tolower if you want case-sensitive solution!
    }
}

int main()
{
    wchar_t * cline;
    wstring line;
    wchar_t const* B = L"(";
    wchar_t const* E = L")";
    std::setlocale(LC_ALL, "en_US.UTF-8");
    wifstream input("T");
    std::locale loc("en_US.UTF-8");
    input.imbue(loc);
    wcout.imbue(loc);

    if (input.is_open()) {
        while ( !input.eof() ) {
            wstring check;
            getline (input,line);
            wcout << line << endl;
            cline = new wchar_t [line.size()+1];
            wcscpy (cline, line.c_str());

            for (int i = 0; i  < line.size()+1; ++i) {
                Counter(cline[i]);
                if(check.find(cline[i]) == string::npos)
                    for (int j=0; j<line.size()+1; j++) {
                        if (j == 0) {
                            letterCount[std::tolower(cline[i])].P+= B;
                        }
                        if (j == line.size()) {
                            letterCount[std::tolower(cline[i])].P+= E;
                        }
                        if(cline[i]==cline[j]) {
                            letterCount[std::tolower(cline[i])].P+= NumberToString(j) ;
                            check +=cline[i];
                        }
                    }
            }
        }
        input.close();
    }

    for (std::map<wchar_t, struct op>::iterator it = letterCount.begin(); it != letterCount.end(); ++it) {
        wcout << it->first <<" : "<< it->second.O << "@" << it->second.P<< endl;
    }
}

output:

н : 9@(36,42,49,56,)(9,)(8,)(0,)(7,)(15,)
о : 4@(12,)(11,)(3,5,)
р : 1@(6,)
т : 1@(4,)
ч : 1@(13,)
ы : 2@(7,19,)
я : 5@(47,61,)(10,)(11,)(11,)
і : 6@(5,30,40,60,)(5,13,)
ա : 1@(14,)
ի : 2@(11,16,)
մ : 1@(10,)
ն : 1@(15,)
ք : 1@(13,)

#include <iostream>
#include <stdio>
#include <conio>

main()
{
 char name[1000],temp[1000];
 int i,j,n,present,count=0;
 clrscr();
 cout<<"Enter your char length:-";
 cin>>n;
 cout<<"\nEnter your text below:-\n\n";

 //get the text
 for(i=0;i<n;i++)
 {
  name[i]=getchar();
  temp[i]='\0';        //clear temp array
 }

 //extracting unique characters to temp[]
 temp[0]=name[0];
 for(i=1;i<n;i++)
 {
    present=0;
    for(j=0;j<strlen(temp);j++)
    {
      if(name[i]==temp[j])
      {
        present=1;
        break;
      }
    }
    if(present==0)
    {
     count++;
     temp[count]=name[i];
    }

 }

//counting char occurance
for (i=0;i<strlen(temp); i++)
{
   int count=0;
   cout<<"\n(";
   for (int j=0;j<n; j++)
   {
      if(temp[i]==name[j])
      {
        count++;
        cout<<j<<",";
      }
   }
   cout<<")\t\t"<<temp[i]<<":"<<count;
}
getch();
}

You should investigate Huffman coding: NIST on Huffman Coding

That way you will not only get all occurrences of a letter, but also common number of syllables (if i understand correctly what is meant by that).

The Huffman algorithm is typically used for compression and search trees, but it will solve your problem in drive by (as this is exactly what Huffman does).

There is already a C++ implementation of that available on Codeproject: Huffman in C++ You will only need a part of it, as you are probably not interested in compression.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow