C++에서 문자열을 어떻게 토큰화합니까?

https://stackoverflow.com/questions/53849

09-06-2019
|

문제

Java에는 편리한 분할 방법이 있습니다.

// Java reference point for the question: split a sentence into an array of
// words using a single space as the separator.
String str = "The quick brown fox";
String[] results = str.split(" ");

C++에서 이 작업을 수행하는 쉬운 방법이 있습니까?

해결책

간단한 경우는 std::string::find 메서드를 사용하여 쉽게 구현할 수 있습니다. 하지만 Boost.Tokenizer도 살펴보세요. 훌륭합니다. Boost에는 일반적으로 매우 멋진 문자열 도구가 있습니다.

다른 팁

Boost tokenizer 클래스를 사용하면 이런 종류의 작업을 매우 간단하게 처리할 수 있습니다.

#include <iostream>
#include <string>
#include <boost/foreach.hpp>
#include <boost/tokenizer.hpp>

using namespace std;
using namespace boost;

int main(int, char**)
{
    string text = "token, test   string";

    char_separator<char> sep(", ");
    tokenizer< char_separator<char> > tokens(text, sep);
    BOOST_FOREACH (const string& t, tokens) {
        cout << t << "." << endl;
    }
}

C++11용으로 업데이트되었습니다.

#include <iostream>
#include <string>
#include <boost/tokenizer.hpp>

using namespace std;
using namespace boost;

int main(int, char**)
{
    string text = "token, test   string";

    char_separator<char> sep(", ");
    tokenizer<char_separator<char>> tokens(text, sep);
    for (const auto& t : tokens) {
        cout << t << "." << endl;
    }
}

다음은 정말 간단한 것입니다:

#include <vector>
#include <string>
using namespace std;

// Splits the NUL-terminated string `str` at every occurrence of `c`.
//
// Every separator produces a boundary, so consecutive, leading or trailing
// separators yield empty strings, and an empty input yields a single empty
// string. `str` must not be null.
vector<string> split(const char *str, char c = ' ')
{
    vector<string> pieces;
    const char *cursor = str;

    for (;;)
    {
        const char *token_start = cursor;

        // Advance to the next separator or to the terminating NUL.
        while (*cursor != '\0' && *cursor != c)
            ++cursor;

        pieces.push_back(string(token_start, cursor));

        if (*cursor == '\0')
            break;

        ++cursor; // step over the separator
    }

    return pieces;
}

strtok을 사용하세요.제 생각에는 strtok이 필요한 것을 제공하지 않는 한 토큰화에 관한 클래스를 구축할 필요가 없습니다.그렇지 않을 수도 있지만 15년 이상 C와 C++로 다양한 구문 분석 코드를 작성하면서 저는 항상 strtok를 사용해 왔습니다.여기에 예가 있습니다.

// The text lives in a writable array because strtok() overwrites each
// delimiter it finds with a string terminator.
char myString[] = "The quick brown fox";
// The first call receives the buffer to scan.
char *p = strtok(myString, " ");
while (p) {
    printf ("Token: %s\n", p);
    // Passing NULL continues scanning the buffer given to the first call.
    p = strtok(NULL, " ");
}

몇 가지 주의 사항(귀하의 요구 사항에 맞지 않을 수 있음)이 과정에서 문자열이 "파괴"됩니다. 즉, EOS 문자가 구분 기호 지점에 인라인으로 배치됩니다.올바르게 사용하려면 문자열의 비-const 버전을 만들어야 할 수도 있습니다.구문 분석 중에 구분 기호 목록을 변경할 수도 있습니다.

내 생각에는 위의 코드가 별도의 클래스를 작성하는 것보다 훨씬 간단하고 사용하기 쉽습니다.나에게 이것은 언어가 제공하는 기능 중 하나이며 훌륭하고 깔끔하게 수행됩니다.이는 단순히 "C 기반" 솔루션입니다.적절하고 쉬우며 추가 코드를 많이 작성할 필요가 없습니다. :-)

또 다른 빠른 방법은 다음을 사용하는 것입니다. getline.다음과 같은 것 :

// Wrap the text in a stream so getline() can read it piece by piece.
stringstream ss("bla bla");
string s;

// Each getline() call reads up to the next ' ' (the delimiter is consumed
// and not stored in s); the loop ends when the stream is exhausted.
while (getline(ss, s, ' ')) {
 cout << s << endl;
}

원하시면 간단하게 만들어 드실 수 있어요 split() 반환하는 메서드 vector<string>, 정말 유용합니다.

스트림, 반복자 및 복사 알고리즘을 사용하여 이 작업을 상당히 직접적으로 수행할 수 있습니다.

#include <string>
#include <vector>
#include <iostream>
#include <istream>
#include <ostream>
#include <iterator>
#include <sstream>
#include <algorithm>

// Splits a sentence into whitespace-separated words using stream iterators
// and prints one word per line.
int main()
{
  const std::string str = "The quick brown fox";

  // construct a stream from the string
  std::stringstream strstr(str);

  // use stream iterators to copy the stream to the vector as whitespace separated strings
  std::istream_iterator<std::string> it(strstr);
  std::istream_iterator<std::string> end;
  std::vector<std::string> results(it, end);

  // send the vector to stdout, one token per line. Without the delimiter
  // argument the tokens would be written back-to-back with nothing between
  // them, hiding the result of the split.
  std::ostream_iterator<std::string> oit(std::cout, "\n");
  std::copy(results.begin(), results.end(), oit);
}

화내지 마세요 여러분, 하지만 이렇게 간단한 문제에 대해 여러분은 일을 너무 복잡하게 만들고 있습니다. Boost를 사용할 이유는 많습니다. 하지만 이렇게 간단한 일에 Boost를 쓰는 것은 20파운드짜리 큰 망치로 파리를 잡는 것과 같습니다.

// Splits theString at every occurrence of the (possibly multi-character)
// substring theDelimiter and APPENDS the pieces to theStringVector.
//
// - The vector is not cleared first, so it can be reused across calls.
// - Adjacent, leading or trailing delimiters produce empty strings, and an
//   empty input produces one empty string.
// - An empty delimiter cannot split anything: the whole input is appended as
//   a single token (searching for "" would otherwise never advance).
void
split( std::vector<std::string> & theStringVector,  /* Altered/returned value */
       const  std::string  & theString,
       const  std::string  & theDelimiter)
{
    if (theDelimiter.empty())
    {
        theStringVector.push_back( theString);
        return;
    }

    std::string::size_type  start = 0;

    for (;;)
    {
        const std::string::size_type end = theString.find( theDelimiter, start);

        if (end == std::string::npos)
        {
            // No further delimiter: the remainder is the last token.
            theStringVector.push_back( theString.substr( start));
            break;
        }

        theStringVector.push_back( theString.substr( start, end - start));

        // Resume just past the delimiter that was found.
        start = end + theDelimiter.size();
    }
}

예를 들어 (Doug의 경우)

// Prints one line of the form `[I]<tab> <X as written> = "<value of X>"`.
// `# X` stringizes the argument exactly as spelled at the call site, so the
// printed label is the source text of the expression.
#define SHOW(I,X)   cout << "[" << (I) << "]\t " # X " = \"" << (X) << "\"" << endl

// Demo: splits a ':'-separated record and prints every field with its index.
int
main()
{
    vector<string> v;

    split( v, "A:PEP:909:Inventory Item", ":" );

    for (unsigned int i = 0;  i < v.size();   i++)
        SHOW( i, v[i] );
}

그렇습니다. split()이 벡터를 전달받는 대신 새 벡터를 반환하도록 할 수도 있습니다. 그렇게 래핑하고 오버로드하는 것은 간단합니다. 하지만 제가 하는 일에 따라서는 항상 새로운 객체를 만드는 것보다 기존 객체를 재사용하는 것이 더 나은 경우가 많습니다. (호출 사이에 벡터를 비우는 것을 잊지 않는 한!)

참조: http://www.cplusplus.com/reference/string/string/.

(저는 원래 Doug의 질문에 대한 답변을 쓰고 있었습니다. 구분 기호를 기반으로 C++ 문자열 수정 및 추출(닫기).하지만 마틴 요크가 여기에 포인터를 얹어 그 질문을 마무리한 이후로...내 코드를 일반화하겠습니다.)

Boost에는 강력한 분할 함수인 boost::algorithm::split이 있습니다.

샘플 프로그램:

#include <iostream>
#include <string>
#include <vector>
#include <boost/algorithm/string.hpp>

// Demo of boost::split: splits a comma-separated line and prints every field
// in double quotes, one per line.
int main() {
    // Held as a std::string so the input range is explicit rather than a bare
    // character pointer.
    const std::string s = "a,b, c ,,e,f,";
    std::vector<std::string> fields;
    boost::split(fields, s, boost::is_any_of(","));
    for (const auto& field : fields)
        std::cout << "\"" << field << "\"\n";
    return 0;
}

산출:

"a"
"b"
" c "
""
"e"
"f"
""

regex_token_iterator를 사용하는 솔루션:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

using namespace std;

// Splits a sentence on runs of whitespace with sregex_token_iterator and
// prints one token per line.
int main()
{
    const string str("The quick brown fox");

    // One or more whitespace characters act as the separator.
    const regex reg("\\s+");

    // Submatch index -1 selects the text BETWEEN matches, i.e. the tokens.
    sregex_token_iterator iter(str.begin(), str.end(), reg, -1);
    sregex_token_iterator end;

    vector<string> vec(iter, end);

    // Iterate by const reference to avoid copying every token.
    for (const auto& a : vec)
    {
        cout << a << endl;
    }
}

귀하가 C++ 솔루션을 요청했다는 것을 알고 있지만 다음 내용이 도움이 될 수 있습니다.

#include <QString>

...

// Qt equivalent of the Java snippet: split the sentence on single spaces.
QString str = "The quick brown fox"; 
QStringList results = str.split(" ");

이 예에서 Boost에 비해 장점은 게시물 코드에 대한 일대일 직접 매핑이라는 것입니다.

자세한 내용은 다음에서 확인하세요. Qt 문서

다음은 원하는 작업을 수행할 수 있는 샘플 토크나이저 클래스입니다.

//Header file
// Pull-style tokenizer over a private copy of a string.
//
// Call NextToken() repeatedly; while it returns true, GetToken() yields the
// token just found. Runs of delimiter characters are treated as a single
// separator, so empty tokens are never produced.
class Tokenizer 
{
    public:
        // Default delimiter set: space, tab, newline, carriage return.
        static const std::string DELIMITERS;
        // Tokenizes `str` using DELIMITERS.
        Tokenizer(const std::string& str);
        // Tokenizes `str` using every character of `delimiters` as a separator.
        Tokenizer(const std::string& str, const std::string& delimiters);
        // Advances to the next token using the delimiter set given at
        // construction. Returns false when no token is left.
        bool NextToken();
        // Same as NextToken(), but with a delimiter set for this call only.
        bool NextToken(const std::string& delimiters);
        // Returns the token found by the last successful NextToken() call.
        const std::string GetToken() const;
        // Rewinds to the beginning of the string and forgets the current token.
        void Reset();
    protected:
        size_t m_offset;            // position where the next search starts
        const std::string m_string; // text being tokenized
        std::string m_token;        // most recently extracted token
        std::string m_delimiters;   // default delimiter set for NextToken()
};

//CPP file
const std::string Tokenizer::DELIMITERS(" \t\n\r");

// Initializers are listed in member declaration order, which is the order in
// which they actually run.
Tokenizer::Tokenizer(const std::string& s) :
    m_offset(0), 
    m_string(s), 
    m_delimiters(DELIMITERS) {}

Tokenizer::Tokenizer(const std::string& s, const std::string& delimiters) :
    m_offset(0), 
    m_string(s), 
    m_delimiters(delimiters) {}

bool Tokenizer::NextToken() 
{
    return NextToken(m_delimiters);
}

bool Tokenizer::NextToken(const std::string& delimiters) 
{
    // Skip any delimiters in front of the next token.
    const size_t i = m_string.find_first_not_of(delimiters, m_offset);
    if (std::string::npos == i) 
    {
        // Only delimiters (or nothing) remain.
        m_offset = m_string.length();
        return false;
    }

    // The token extends to the next delimiter or to the end of the string.
    const size_t j = m_string.find_first_of(delimiters, i);
    if (std::string::npos == j) 
    {
        m_token = m_string.substr(i);
        m_offset = m_string.length();
        return true;
    }

    m_token = m_string.substr(i, j - i);
    m_offset = j;
    return true;
}

// Declared in the class but previously never defined, which made any use of
// GetToken() fail at link time.
const std::string Tokenizer::GetToken() const
{
    return m_token;
}

// Declared in the class but previously never defined.
void Tokenizer::Reset()
{
    m_offset = 0;
    m_token.clear();
}

예:

// Collect every space-separated token of the sentence into v.
std::vector <std::string> v;
Tokenizer s("split this string", " ");
// NextToken() returns false once no token is left.
while (s.NextToken())
{
    v.push_back(s.GetToken());
}

이것은 다음을 사용하는 간단한 STL 전용 솔루션(~5줄!)입니다. std::find 그리고 std::find_first_not_of 구분 기호(예: 공백 또는 마침표)의 반복과 선행 및 후행 구분 기호를 처리합니다.

#include <string>
#include <vector>

// Appends the tokens of `str` to `token_v`.
//
// Every character in `delimiters` is a separator. Runs of separators, as well
// as leading and trailing ones, are skipped, so no empty tokens are produced.
// `token_v` is not cleared first.
//
// `delimiters` replaces the previously undefined DELIMITER symbol; the default
// of a single space is an assumption - pass the set you need explicitly.
void tokenize(const std::string& str, std::vector<std::string> &token_v,
              const std::string& delimiters = " "){
    std::string::size_type start = str.find_first_not_of(delimiters);

    while (start != std::string::npos){
        // Find the first delimiter character after the token start. Using
        // find_first_of keeps this consistent with find_first_not_of: both
        // treat `delimiters` as a character set, not as a substring.
        const std::string::size_type end = str.find_first_of(delimiters, start);
        if (end == std::string::npos){
            // Last token runs to the end of the string.
            token_v.push_back(str.substr(start));
            break;
        }
        token_v.push_back(str.substr(start, end - start));
        // Skip all delimiter characters to find the next token start.
        start = str.find_first_not_of(delimiters, end);
    }
}

라이브로 사용해 보세요!

pystring은 split 메서드를 포함하여 Python의 여러 문자열 함수를 구현하는 작은 라이브러리입니다.

#include <string>
#include <vector>
#include "pystring.h"

// Result container that pystring::split fills in.
std::vector<std::string> chunks;
// Two-argument form: no separator is given (presumably splits on whitespace,
// like Python's str.split() - confirm against the pystring documentation).
pystring::split("this string", chunks);

// also can specify a separator
pystring::split("this-string", chunks, "-");

비슷한 질문에 대한 답변을 게시했습니다.
바퀴를 재발명하지 마세요.나는 여러 라이브러리를 사용해 보았고 내가 본 것 중 가장 빠르고 유연한 것은 다음과 같습니다. C++ 문자열 툴킷 라이브러리.

다음은 제가 stackoverflow의 다른 곳에 게시한 사용 방법의 예입니다.

#include <iostream>
#include <vector>
#include <string>
#include <strtk.hpp>

// Delimiter sets passed to strtk::parse below.
const char *whitespace  = " \t\r\n\f";
const char *whitespace_and_punctuation  = " \t\r\n\f;,=";

// Demonstrates three uses of strtk::parse: into a vector of strings, into a
// vector of floats, and into individually typed variables.
int main()
{
    {   // normal parsing of a string into a vector of strings
       std::string s("Somewhere down the road");
       std::vector<std::string> result;
       if( strtk::parse( s, whitespace, result ) )
       {
           for(size_t i = 0; i < result.size(); ++i )
            std::cout << result[i] << std::endl;
       }
    }

    {  // parsing a string into a vector of floats with other separators
       // besides spaces

       std::string t("3.0, 3.14; 4.0");
       std::vector<float> values;
       // Parse `t`: the previous code referred to `s`, which is out of scope
       // in this block.
       if( strtk::parse( t, whitespace_and_punctuation, values ) )
       {
           for(size_t i = 0; i < values.size(); ++i )
            std::cout << values[i] << std::endl;
       }
    }

    {  // parsing a string into specific variables

       std::string u("angle = 45; radius = 9.9");
       std::string w1, w2;
       float v1, v2;
       // Parse `u`: the previous code referred to `s`, which is out of scope
       // in this block.
       if( strtk::parse( u, whitespace_and_punctuation, w1, v1, w2, v2) )
       {
           std::cout << "word " << w1 << ", value " << v1 << std::endl;
           std::cout << "word " << w2 << ", value " << v2 << std::endl;
       }
    }

    return 0;
}

이 예를 확인하세요.도움이 될 수도 있겠네요..

#include <iostream>
#include <sstream>
#include <string>

using namespace std;

// Prints every whitespace-separated word of a fixed sentence on its own line.
int main ()
{
    string tmps;
    istringstream is ("the dellimiter is the space");
    // Use the extraction itself as the loop condition. Checking is.good()
    // BEFORE extracting would run the body once more after a failed read
    // (e.g. when the text ends in whitespace) and print the last word twice.
    while (is >> tmps) {
        cout << tmps << "\n";
    }
    return 0;
}

MFC/ATL에는 매우 훌륭한 토크나이저가 있습니다.MSDN에서:

// Sample input; '%', ' ' and '#' are the delimiter characters used below.
CAtlString str( "%First Second#Third" );
CAtlString resToken;
// Scan position, passed to Tokenize() on every call and initialised to the
// start of the string.
int curPos= 0;

resToken= str.Tokenize("% #",curPos);
// The loop stops when Tokenize() hands back an empty string.
while (resToken != "")
{
   printf("Resulting token: %s\n", resToken);
   resToken= str.Tokenize("% #",curPos);
};

Output

Resulting Token: First
Resulting Token: Second
Resulting Token: Third

당신은 단순히 정규식 라이브러리 정규식을 사용하여 문제를 해결하세요.

표현식(\w+)과 \1(또는 정규식의 라이브러리 구현에 따라 $1)의 변수를 사용합니다.

C를 사용하고 싶다면 다음을 사용할 수 있습니다. strtok 기능.사용 시 멀티스레딩 문제에 주의해야 합니다.

간단한 작업에는 다음을 사용합니다.

// Splits i_source at every character contained in i_seperators.
//
// o_tokens is cleared and then filled with the tokens in order of appearance.
// When i_discard_empty_tokens is true, empty tokens (between adjacent
// separators or before a leading separator) are dropped. An empty piece after
// the final separator is never emitted, regardless of the flag.
//
// Returns the number of tokens stored in o_tokens.
unsigned TokenizeString(const std::string& i_source,
                        const std::string& i_seperators,
                        bool i_discard_empty_tokens,
                        std::vector<std::string>& o_tokens)
{
    // Positions must be std::string::size_type: when stored in a narrower
    // `unsigned`, a truncated npos never compares equal to std::string::npos
    // and the loop would not terminate.
    typedef std::string::size_type size_type;

    o_tokens.clear();

    size_type token_begin = 0;
    size_type sep_pos = i_source.find_first_of(i_seperators, token_begin);
    while (sep_pos != std::string::npos)
    {
        if (!i_discard_empty_tokens || sep_pos != token_begin)
        {
            o_tokens.push_back(i_source.substr(token_begin, sep_pos - token_begin));
        }

        // Continue right after the separator.
        token_begin = sep_pos + 1;
        sep_pos = i_source.find_first_of(i_seperators, token_begin);
    }

    // Remainder after the last separator (if any text is left).
    if (token_begin < i_source.length())
    {
        o_tokens.push_back(i_source.substr(token_begin));
    }

    return static_cast<unsigned>(o_tokens.size());
}

비겁한 면책조항:저는 바이너리 파일, 소켓 또는 일부 API 호출(I/O 카드, 카메라)을 통해 데이터가 들어오는 실시간 데이터 처리 소프트웨어를 작성합니다.나는 시작 시 외부 구성 파일을 읽는 것보다 더 복잡하거나 시간이 중요한 작업에는 이 기능을 사용하지 않습니다.

여기에는 지나치게 복잡한 제안이 많이 있습니다.다음과 같은 간단한 std::string 솔루션을 사용해 보세요.

using namespace std;

// Placeholder: the text to tokenize is supplied by the reader.
string someText = ...

// tokenOff: start of the current token; sepOff: position of the next space.
string::size_type tokenOff = 0, sepOff = tokenOff;
while (sepOff != string::npos)
{
    sepOff = someText.find(' ', sepOff);
    // No more spaces: take the rest of the string (npos as length).
    // Otherwise the length is the distance to the space, and the
    // post-increment moves sepOff past that space for the next search.
    string::size_type tokenLen = (sepOff == string::npos) ? sepOff : sepOff++ - tokenOff;
    string token = someText.substr(tokenOff, tokenLen);
    // Adjacent spaces produce empty tokens, which are skipped here.
    if (!token.empty())
        /* do something with token */;
    tokenOff = sepOff;
}

나는 그것이 무엇인지 생각했다. >> 문자열 스트림의 연산자는 다음과 같습니다.

// Reads one word from the stream into `word`; `sin` is presumably a string
// stream declared elsewhere - it is not defined in this snippet.
string word; sin >> word;

아담 피어스의 답변 손으로 회전하는 토크나이저를 제공합니다. const char*.반복자와 관련된 것은 좀 더 문제가 있습니다. 증가 string의 끝 반복자가 정의되지 않았습니다..즉, 주어진 string str{ "The quick brown fox" } 우리는 확실히 이것을 달성할 수 있습니다:

// Locate the first space; everything before it is the first token.
auto start = find(cbegin(str), cend(str), ' ');
vector<string> tokens{ string(cbegin(str), start) };

// `start` points at a space here, never at the end iterator, so the
// pre-increment inside the loop is always valid.
while (start != cend(str)) {
    // Step over the space and search for the next one.
    const auto finish = find(++start, cend(str), ' ');

    tokens.push_back(string(start, finish));
    start = finish;
}

실제 예시

다음과 같이 표준 기능을 사용하여 복잡성을 추상화하려는 경우 프로인트에서는 제안한다 strtok 간단한 옵션입니다:

vector<string> tokens;

// strtok() modifies the buffer it scans, so it is handed the writable storage
// of str via data(str); each returned C string is copied into a std::string
// inside the vector. The loop ends when strtok() returns nullptr.
for (auto i = strtok(data(str), " "); i != nullptr; i = strtok(nullptr, " ")) tokens.push_back(i);

C++17에 액세스할 수 없다면 다음으로 대체해야 합니다. data(str) 이 예에서와 같이: http://ideone.com/8kAGoa

예시에서는 보여주지 않았지만, strtok 각 토큰에 동일한 구분 기호를 사용할 필요는 없습니다.하지만 이러한 장점과 함께 몇 가지 단점도 있습니다.

strtok 여러 개에 사용할 수 없습니다 strings 동시에:어느 쪽이든 nullptr 현재 토큰화를 계속하려면 통과되어야 합니다. string 아니면 새로운 char* 토큰화하려면 전달되어야 합니다(그러나 이를 지원하는 다음과 같은 일부 비표준 구현이 있습니다: strtok_s)
같은 이유로 strtok 여러 스레드에서 동시에 사용할 수 없습니다(그러나 이는 구현에 따라 정의될 수 있습니다. 예: Visual Studio의 구현은 스레드로부터 안전합니다.)
부름 strtok 수정합니다 string 에서 작동 중이므로 사용할 수 없습니다. const string에스, const char*s 또는 리터럴 문자열을 사용하여 이들 중 하나를 토큰화합니다. strtok 또는 string 누구의 콘텐츠를 보존해야 하는지, str 복사해야 할 경우 해당 복사본을 다른 곳에서 사용할 수 있습니다.

이전 방법 모두 토큰화된 토큰을 생성할 수 없습니다. vector 내부, 즉 도우미 함수로 추상화하지 않으면 초기화할 수 없음을 의미합니다. const vector<string> tokens.그 기능 그리고 받아들이는 능력 어느 공백 구분 기호는 다음을 사용하여 활용할 수 있습니다. istream_iterator.예를 들어 다음과 같습니다. const string str{ "The quick \tbrown \nfox" } 우리는 할 수있어:

// Wrap the text in an input stream so it can be read word by word.
istringstream is{ str };
// Build the vector directly from the iterator range: the default-constructed
// istream_iterator marks the end of the stream. Because the vector is filled
// during construction, it can be declared const.
const vector<string> tokens{ istream_iterator<string>(is), istream_iterator<string>() };

실제 사례

필요한 건설 istringstream 이 옵션의 경우 이전 2개 옵션보다 비용이 훨씬 높지만 일반적으로 이 비용은 다음 비용에 숨겨져 있습니다. string 배당.

위의 옵션 중 어느 것도 토큰화 요구 사항에 충분히 유연하지 않은 경우 가장 유연한 옵션은 다음을 사용하는 것입니다. regex_token_iterator 물론 이러한 유연성으로 인해 비용이 더 많이 들지만 이는 다시 한 번 숨겨져 있을 가능성이 높습니다. string 할당 비용.예를 들어 다음 입력이 주어지면 이스케이프되지 않은 쉼표를 기반으로 토큰화하고 공백도 사용한다고 가정해 보겠습니다. const string str{ "The ,qu\\,ick ,\tbrown, fox" } 우리는 할 수있어:

// Pattern, piece by piece:
//   \s*                    leading whitespace (outside the capture group)
//   ((?:[^\\,]|\\.)*?)     group 1: the token - characters other than
//                          backslash/comma, or a backslash followed by any
//                          character (an escape), matched non-greedily
//   \s*(?:,|$)             trailing whitespace, then a comma or end of input
const regex re{ "\\s*((?:[^\\\\,]|\\\\.)*?)\\s*(?:,|$)" };
// Submatch index 1 makes the iterator yield capture group 1 of every match.
const vector<string> tokens{ sregex_token_iterator(cbegin(str), cend(str), re, 1), sregex_token_iterator() };

실제 사례

다음은 빈 토큰이 포함되는지(예: strsep) 또는 제외되는지(예: strtok)를 제어할 수 있는 접근 방식입니다.

#include <string.h> // for strchr and strlen

/*
 * want_empty_tokens==true  : include empty tokens, like strsep()
 * want_empty_tokens==false : exclude empty tokens, like strtok()
 */
// Splits the NUL-terminated string `src` at every occurrence of `delim`.
//
// want_empty_tokens == true  keeps empty tokens (adjacent, leading or
//                            trailing delimiters), like strsep().
// want_empty_tokens == false drops them, like strtok().
//
// A null or empty `src` yields an empty vector in both modes.
std::vector<std::string> tokenize(const char* src,
                                  char delim,
                                  bool want_empty_tokens)
{
  std::vector<std::string> pieces;

  // Nothing to do for a missing or empty input.
  if (src == nullptr || *src == '\0')
    return pieces;

  const char* cursor = src;
  for (;;) {
    const char* hit = strchr(cursor, delim);
    // Token length: up to the delimiter, or the whole remainder.
    const size_t span = hit ? static_cast<size_t>(hit - cursor) : strlen(cursor);

    if (want_empty_tokens || span != 0)
      pieces.push_back( std::string(cursor, span) );

    if (hit == nullptr)
      break;

    cursor = hit + 1; // continue right after the delimiter
  }

  return pieces;
}

여기 SO에서 속도에 민감한 우리 모두와 함께 구분 기호에 대해 컴파일 시간에 생성된 조회 테이블을 사용하는 버전을 제시한 사람이 아무도 없다는 것이 이상하게 보입니다(구현 예는 아래에 있음).조회 테이블과 반복자를 사용하면 효율성 측면에서 std::regex보다 우수합니다. 정규식을 이길 필요가 없다면 C++11의 표준이자 매우 유연한 정규식을 사용하세요.

일부는 이미 정규식을 제안했지만 멍청한 사람들을 위해 여기에 OP가 기대하는 것을 정확하게 수행해야 하는 패키지 예제가 있습니다.

// Returns every non-empty match of `e` in the range [it, end), in order.
//
// Note that `e` describes the TOKENS, not the separators; the default "\w+"
// extracts runs of word characters.
//
// A pattern that can match the empty string (e.g. "x*") used to stall the
// search at the same position forever. Empty matches are now skipped and the
// search resumes one character further on.
std::vector<std::string> split(std::string::const_iterator it, std::string::const_iterator end, std::regex e = std::regex{"\\w+"}){
    std::smatch m{};
    std::vector<std::string> ret{};
    while (it != end && std::regex_search (it,end,m,e)) {
        if (m.length() == 0) {
            // Zero-length match: step past its position to guarantee progress.
            it = m[0].second;
            if (it == end) {
                break;
            }
            ++it;
            continue;
        }
        ret.emplace_back(m.str());
        it = m[0].second; //next start position = end of the current match
    }
    return ret;
}
// Convenience overload: applies the iterator version to the whole string.
std::vector<std::string> split(const std::string &s, std::regex e = std::regex{"\\w+"}){  //comfort version calls flexible version
    return split(s.cbegin(), s.cend(), std::move(e));
}
// Demo: prints the words of a sentence, then the pieces obtained when every
// run of characters other than 'e' counts as a token.
int main ()
{
    const std::string sentence {"Some people, excluding those present, have been compile time constants - since puberty."};

    // Writes each token on its own line.
    const auto print_tokens = [](const std::vector<std::string>& tokens){
        for (std::vector<std::string>::size_type i = 0; i < tokens.size(); ++i) {
            std::cout << tokens[i] << std::endl;
        }
    };

    print_tokens(split(sentence));

    std::cout << "crazy version:" << std::endl;
    print_tokens(split(sentence, std::regex{"[^e]+"}));  //using e as delim shows flexibility

    return 0;
}

더 빠른 속도가 필요하고 모든 문자가 8비트여야 한다는 제약 조건을 수락해야 하는 경우 메타 프로그래밍을 사용하여 컴파일 타임에 조회 테이블을 만들 수 있습니다.

// Compile-time construction of a 256-entry delimiter lookup table.
//
//   Contains<CharSequence<...>, C>::value  - true when C is in the sequence
//   MakeSequence<256, BoolSequence<>, U>   - builds BoolSequence<b0..b255>
//                                            with b[i] = Contains<U, char(i)>
//   BoolASCIITable<BoolSequence<...>>      - exposes the table at run time
template<bool...> struct BoolSequence{};        //just here to hold bools
template<char...> struct CharSequence{};        //just here to hold chars
template<typename T, char C> struct Contains;   //generic
template<char First, char... Cs, char Match>    //not first specialization
struct Contains<CharSequence<First, Cs...>,Match> :
    Contains<CharSequence<Cs...>, Match>{};     //strip first and keep searching
template<char First, char... Cs>                //is first specialization
struct Contains<CharSequence<First, Cs...>,First>: std::true_type {}; 
template<char Match>                            //not found specialization
struct Contains<CharSequence<>,Match>: std::false_type{};

template<int I, typename T, typename U> 
struct MakeSequence;                            //generic
template<int I, bool... Bs, typename U> 
struct MakeSequence<I,BoolSequence<Bs...>, U>:  //not last
    // The explicit cast avoids an ill-formed narrowing int -> char conversion
    // in the template argument for indices that do not fit a signed char.
    MakeSequence<I-1, BoolSequence<Contains<U,static_cast<char>(I-1)>::value,Bs...>, U>{};
template<bool... Bs, typename U> 
struct MakeSequence<0,BoolSequence<Bs...>,U>{   //last  
    using Type = BoolSequence<Bs...>;
};
template<typename T> struct BoolASCIITable;
template<bool... Bs> struct BoolASCIITable<BoolSequence<Bs...>>{
    // Lets the table be used with legacy function-object adaptors.
    using argument_type = char;

    /* could be made constexpr but not yet supported by MSVC */
    // Returns true when c is one of the configured delimiters.
    static bool isDelim(const char c){
        static const bool table[256] = {Bs...};
        // Index through unsigned char: where plain char is signed, a
        // character above 0x7F would otherwise give a negative index.
        return table[static_cast<unsigned char>(c)];
    }   

    // Call operator so an instance can be passed directly as a predicate to
    // standard algorithms.
    bool operator()(const char c) const {
        return isDelim(c);
    }
};
using Delims = CharSequence<'.',',',' ',':','\n'>;  //list your custom delimiters here
using Table = BoolASCIITable<typename MakeSequence<256,BoolSequence<>,Delims>::Type>;

그 자리에서 getNextToken 기능은 쉽습니다:

template<typename T_It>
std::pair<T_It,T_It> getNextToken(T_It begin,T_It end){
    begin = std::find_if(begin,end,std::not1(Table{})); //find first non delim or end
    auto second = std::find_if(begin,end,Table{});      //find first delim or end
    return std::make_pair(begin,second);
}

사용하는 것도 쉽습니다.

// Demo: prints every token of the sentence on its own line.
int main() {
    const std::string s{"Some people, excluding those present, have been compile time constants - since puberty."};
    auto it = std::begin(s);
    const auto end = std::end(s);
    while(it != end){
        const auto token = getNextToken(it,end);
        // When the input ends with delimiters the final call returns an
        // empty range; skip it instead of printing a blank line.
        if(token.first != token.second){
            std::cout << std::string(token.first,token.second) << std::endl;
        }
        it = token.second;
    }
    return 0;
}

다음은 실제 예입니다. http://ideone.com/GKtkLQ

이 질문에 대한 답변은 이미 알고 있지만 기여하고 싶습니다.어쩌면 내 솔루션은 약간 간단할 수도 있지만 이것이 내가 생각해낸 것입니다.

// Returns the space-separated words of `text`, in order.
//
// Runs of spaces, as well as leading and trailing spaces, are ignored, so the
// result never contains empty strings; an empty or all-space input yields an
// empty vector. (Previously a trailing space or an empty input produced a
// stray "" entry even though empty pieces elsewhere were skipped.)
//
// The input is scanned in place by position rather than being re-copied after
// every word.
std::vector<std::string> get_words(std::string const& text)
{
    std::vector<std::string> result;

    std::string::size_type word_begin = 0;
    while (word_begin < text.size())
    {
        const std::string::size_type space_pos = text.find(' ', word_begin);
        const std::string::size_type word_end =
            (space_pos == std::string::npos) ? text.size() : space_pos;

        if (word_end > word_begin)
        {
            result.push_back(text.substr(word_begin, word_end - word_begin));
        }

        // Continue after the space (or past the end, which ends the loop).
        word_begin = word_end + 1;
    }

    return result;
}

내 코드에 더 나은 접근 방식이 있거나 뭔가 잘못된 경우 의견을 말해주세요.

이를 수행할 수 있는 직접적인 방법은 없습니다.나타내다 이 코드 프로젝트 소스 코드 이를 위한 클래스를 만드는 방법을 알아보세요.

Boost::make_find_iterator를 활용할 수 있습니다.이것과 비슷한 것 :

// Splits Input at every occurrence of the substring Delimiter, using a Boost
// find_iterator to locate the delimiters.
//
// remove_empty_token == true drops empty pieces that sit in front of a
// delimiter. Text after the last delimiter is appended only when non-empty,
// regardless of the flag.
template<typename CH>
inline vector< basic_string<CH> > tokenize(
    const basic_string<CH> &Input,
    const basic_string<CH> &Delimiter,
    bool remove_empty_token
    ) {

    typedef basic_string<CH> string_t;
    typedef typename string_t::const_iterator string_iterator_t;
    typedef boost::find_iterator< string_iterator_t > string_find_iterator_t;

    vector< string_t > Result;
    string_iterator_t token_begin = Input.begin();
    const string_iterator_t input_end = Input.end();
    const string_find_iterator_t no_more_matches;

    string_find_iterator_t match =
        boost::make_find_iterator(Input, boost::first_finder(Delimiter, boost::is_equal()));
    while(match != no_more_matches) {
        // The current token ends where this delimiter match begins.
        const bool token_is_empty = (token_begin == match->begin());
        if(!(remove_empty_token && token_is_empty))
            Result.push_back(string_t(token_begin, match->begin()));

        // The next token starts right after the delimiter.
        token_begin = match->end();
        ++match;
    }

    if(token_begin != input_end)
        Result.push_back(string_t(token_begin, input_end));

    return Result;
}

토큰화할 입력 문자열의 최대 길이를 알고 있는 경우 이를 활용하여 매우 빠른 버전을 구현할 수 있습니다. 아래에는 strtok()와 Jon Bentley의 "Programming Pearls" 2판 15장에 설명된 "접미사 배열" 데이터 구조에서 영감을 받은 기본 아이디어를 스케치했습니다. 이 경우 C++ 클래스는 약간의 구조화와 사용 편의성만 제공합니다. 표시된 구현은 토큰의 선행 및 후행 공백 문자를 제거하도록 쉽게 확장할 수 있습니다.

기본적으로 구분 기호 문자를 문자열 종료 '\0' 문자로 바꾸고 수정된 문자열이 포함된 토큰에 대한 포인터를 설정할 수 있습니다.문자열이 구분 기호로만 구성된 극단적인 경우에는 문자열 길이에 1을 더한 빈 토큰이 생성됩니다.수정할 문자열을 복제하는 것이 실용적입니다.

헤더 파일:

// Splits a text line at a single separator character into C-string tokens.
//
// The line is copied into an internal buffer in which every separator is
// replaced by '\0'; mTokens then points at the start of each token inside
// that buffer. Tokens stay valid until the next SplitLine() call or until the
// object is destroyed.
class TextLineSplitter
{
public:

    // max_line_len: longest line (in characters) the splitter must hold.
    TextLineSplitter( const size_t max_line_len );

    ~TextLineSplitter();

    // The object owns raw heap buffers; a member-wise copy would free them
    // twice, so copying is disabled.
    TextLineSplitter( const TextLineSplitter & ) = delete;
    TextLineSplitter & operator=( const TextLineSplitter & ) = delete;

    // Splits `line` at every `sep_char`, replacing any previous result.
    // (The parameter list must not end in a trailing comma - that is a
    // syntax error.)
    void            SplitLine( const char *line,
                               const char sep_char = ','
                             );

    // Number of tokens found by the last SplitLine() call.
    inline size_t   NumTokens( void ) const
    {
        return mNumTokens;
    }

    // Returns token number token_idx (0-based); token_idx must be smaller
    // than NumTokens().
    const char *    GetToken( const size_t token_idx ) const
    {
        assert( token_idx < mNumTokens );
        return mTokens[ token_idx ];
    }

private:
    const size_t    mStorageSize;   // element count of mBuff and of mTokens

    char           *mBuff;          // working copy of the line being split
    char          **mTokens;        // start of each token inside mBuff
    size_t          mNumTokens;     // number of valid entries in mTokens

    // Zeroes both buffers and the token counter.
    inline void     ResetContent( void )
    {
        memset( mBuff, 0, mStorageSize );
        // mark all items as empty:
        memset( mTokens, 0, mStorageSize * sizeof( char* ) );
        // reset counter for found items:
        mNumTokens = 0;
    }
};

구현 파일:

// Allocates storage for lines of up to max_line_len characters plus the
// terminating '\0'. The token table gets the same number of entries, which
// covers the worst case of a line made up of separators only.
TextLineSplitter::TextLineSplitter( const size_t max_line_len ):
    mStorageSize ( max_line_len + 1 )
{
    // allocate memory
    mBuff   = new char  [ mStorageSize ];
    mTokens = new char* [ mStorageSize ];

    ResetContent();
}

// Releases both buffers.
TextLineSplitter::~TextLineSplitter()
{
    delete [] mBuff;
    delete [] mTokens;
}


// Splits `line` at every `sep_char`. Afterwards NumTokens()/GetToken() expose
// the result; adjacent separators and an empty line produce empty tokens.
//
// Input longer than the capacity given to the constructor is truncated to
// that capacity instead of overrunning the internal buffers.
void TextLineSplitter::SplitLine( const char *line,
                                  const char sep_char   /* = ',' */
                                )
{
    assert( line != nullptr );
    assert( sep_char != '\0' );

    ResetContent();
    // Copy at most mStorageSize - 1 characters. ResetContent() zero-filled
    // the buffer, so the copy is always '\0'-terminated.
    strncpy( mBuff, line, mStorageSize - 1 );

    size_t idx = 0; // running index for characters

    // Scan the bounded copy rather than `line`, so the index can never leave
    // the buffer even when the input was truncated.
    for( ;; )
    {
        const char chr = mBuff[ idx ]; // retrieve current character

        // First character of a new token: remember where it starts.
        if( mTokens[ mNumTokens ] == nullptr )
        {
            mTokens[ mNumTokens ] = &mBuff[ idx ];
        } // if

        if( chr == sep_char || chr == '\0' )
        { // item or line finished
            // overwrite separator with a 0-terminating character:
            mBuff[ idx ] = '\0';
            // count-up items:
            mNumTokens ++;
        } // if

        if( chr == '\0' )
        {
            break; // end of line reached
        } // if

        ++idx;
    }
}

사용 시나리오는 다음과 같습니다.

// create an instance capable of splitting strings up to 1000 chars long:
TextLineSplitter spl( 1000 );
// No separator argument is given, so the default ',' is used; the two
// adjacent commas produce an empty token.
spl.SplitLine( "Item1,,Item2,Item3" );
// Print every token on its own line.
for( size_t i = 0; i < spl.NumTokens(); i++ )
{
    printf( "%s\n", spl.GetToken( i ) );
}

산출:

Item1

Item2
Item3

boost::tokenizer 당신의 친구이지만 국제화(i18n) 문제를 참조하여 코드를 이식 가능하게 만드는 것을 고려하십시오. wstring/wchar_t 유산 대신 string/char 유형.

#include <iostream>
#include <boost/tokenizer.hpp>
#include <string>

using namespace std;
using namespace boost;

typedef tokenizer<char_separator<wchar_t>,
                  wstring::const_iterator, wstring> Tok;

int main()
{
  wstring s;
  while (getline(wcin, s)) {
    char_separator<wchar_t> sep(L" "); // list of separator characters
    Tok tok(s, sep);
    for (Tok::iterator beg = tok.begin(); beg != tok.end(); ++beg) {
      wcout << *beg << L"\t"; // output (or store in vector)
    }
    wcout << L"\n";
  }
  return 0;
}

간단한 C++ 코드(표준 C++98)는 여러 구분 기호(std::string에 지정됨)를 허용하고 벡터, 문자열 및 반복자만 사용합니다.

#include <iostream>
#include <vector>
#include <string>
#include <stdexcept> 

// Splits str into tokens; every character of delim is a separator.
//
// Runs of separators, as well as leading and trailing ones, are skipped, so
// the result never contains empty tokens.
//
// Throws std::runtime_error when str is empty.
std::vector<std::string> 
split(const std::string& str, const std::string& delim){
    if (str.empty())
        throw std::runtime_error("Can not tokenize an empty string!");

    std::vector<std::string> result;
    std::string::const_iterator str_it = str.begin();
    const std::string::const_iterator str_end = str.end();

    // In both inner loops the end check comes FIRST: dereferencing the end
    // iterator is undefined behaviour.
    while (str_it != str_end) {
        // ignore any delimiters in front of the next token
        while (str_it != str_end && delim.find(*str_it) != std::string::npos)
            ++str_it;

        // advance to the next delimiter (or the end) to find the token's end
        const std::string::const_iterator begin = str_it;
        while (str_it != str_end && delim.find(*str_it) == std::string::npos)
            ++str_it;

        if (begin != str_it) // nothing left after trailing delimiters
            result.push_back(std::string(begin, str_it));
    }
    return result;
}

// Demo: tokenizes a string that mixes several delimiter characters and prints
// one token per line.
int main() {
    const std::string delim = "; ./"; // string containing the delimiters
    const std::string test_string = ".this is.a.../.simple;;test;;;END";

    const std::vector<std::string> tokens = split(test_string, delim);

    for (std::vector<std::string>::size_type i = 0; i < tokens.size(); ++i)
        std::cout << tokens[i] << std::endl;
}

라이센스 : CC-BY-SA ~와 함께 속성

제휴하지 않습니다 StackOverflow