Question

I am learning to use boost::spirit. To do that, I wanted to create some simple lexers, combine them, and then start parsing using Spirit. But the result is quite confusing:

Here's the lexer:

// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>


#include <iostream>
#include <string>

using namespace boost::spirit;
using namespace boost::spirit::ascii;


// Token identifiers used by the lexer. Ids start above lex::min_token_id
// (with a gap of 10) so they never collide with ids the lexer library
// assigns internally to unnamed token definitions.
enum tokenids
{
  IDANY = lex::min_token_id + 10,  // catch-all token for any single character
  T_USER,                          // matches the {USER} pattern
  T_DOMAINLABEL,                   // matches the {DOMAINLABEL} pattern
  T_CRLF                           // matches the "\r\n" line terminator
};


// Lexer definition for a (simplified) SIP-style token stream.
//
// NOTE(review): token definitions added via self.add are tried in order and
// the lexer is eager (no cross-token backtracking). Because every string
// that matches {DOMAINLABEL} also matches {USER}, and {USER} is added
// first, T_DOMAINLABEL can never be produced -- this is exactly the
// behavior the question below asks about.
template <typename Lexer>
struct sip_token : lex::lexer<Lexer>
{
  sip_token()
  {
    // Named sub-patterns; later patterns may reference earlier ones
    // with the {NAME} syntax, so the order of add_pattern calls matters.
    this->self.add_pattern
      ("ALPHANUM", "[0-9a-zA-Z]")
      ("MARK", "[-_.!~*'()]")           
      ("UNRESERVED","{ALPHANUM}|{MARK}")            
      ("USER", "({UNRESERVED})+" ) 
      ("DOMAINLABEL", "({ALPHANUM})+")
      // ("DOMAINLABEL", "{ALPHANUM}|({ALPHANUM}({ALPHANUM}|-)*{ALPHANUM})") 
      ;     

    // Actual token definitions, tried in this order on each match attempt.
    this->self.add
      ("{USER}",T_USER)
      ("{DOMAINLABEL}", T_DOMAINLABEL)          
      ("\r\n", T_CRLF)
      (".", IDANY)    // string literals will not be escaped by the library
      ;
  } 
};


// Qi grammar driven by the token stream produced by sip_token.
// Expects a single T_DOMAINLABEL token followed by a T_CRLF token,
// incrementing the counters c/l (on the label) and w (on the CRLF)
// as semantic actions.
template <typename Iterator>
struct sip_grammar : qi::grammar<Iterator>
// struct sip_grammar : qi::grammar<Iterator>
{
  // TokenDef is the lexer type; it is accepted but not used here because
  // the rule refers to tokens by their numeric ids via qi::token().
  template <typename TokenDef>
  sip_grammar(TokenDef const& tok)
    : sip_grammar::base_type(start)
    , c(0), w(0), l(0)
  {
    using boost::phoenix::ref;
    using boost::phoenix::size;
    using boost::spirit::qi::eol;


    // NOTE(review): this fails on input that tokenizes as T_USER, since
    // the lexer never emits T_DOMAINLABEL (see sip_token above).
    start =  (      
      (qi::token(T_DOMAINLABEL))[++ref(c), ++ref(l)]
      >>   qi::token(T_CRLF) [++ref(w)]
      ) 
      ;
  }

  std::size_t c, w, l;   // counters updated by the semantic actions
  qi::rule<Iterator> start; 
};



// Reads lines from stdin, appends CRLF to each, and runs the combined
// lexer + grammar over the line. An empty line or one starting with
// 'q'/'Q' terminates the loop.
int main(int argc, char* argv[])
{
  // Token type over raw character pointers; attributes may hold a string.
  typedef lex::lexertl::token<
  char const*, boost::mpl::vector<std::string>
  > token_type;

  // NOTE(review): str_iterator_type is declared but never used below.
  typedef std::string::const_iterator str_iterator_type;
  typedef lex::lexertl::lexer<token_type> lexer_type;
  typedef sip_token<lexer_type>::iterator_type iterator_type;

  std::string str;
  while (std::getline(std::cin, str))
  {
    if (str.empty() || str[0] == 'q' || str[0] == 'Q')
      break;        
    else
      str += "\r\n";   // re-add the line terminator getline stripped

    sip_token<lexer_type> siplexer;
    sip_grammar<iterator_type > g(siplexer);

    char const* first = str.c_str();
    char const* last = &first[str.size()];

    /*<  Parsing is done based on the token stream, not the character
    stream read from the input. The function `tokenize_and_parse()` wraps
    the passed iterator range `[first, last)` by the lexical analyzer and
    uses its exposed iterators to parse the token stream.
    >*/  
    unsigned result = 0;
    bool r = lex::tokenize_and_parse(first, last, siplexer, g);     

    if (r) {
      std::cout << "Parsing OK" << g.l << ", " << g.w
        << ", " << g.c << "\n";
    }
    else {
      // On failure, `first` points at where tokenizing/parsing stopped.
      std::string rest(first, last);
      std::cerr << "Parsing failed\n" << "stopped at: \""
        << rest << "\"\n";
    }

  }
  return 0;
}
//]
//]

In the code, I add "T_DOMAINLABEL" after "T_USER", and T_DOMAINLABEL always gets a parsing failure. It seems the lexer will match T_USER first. Why is that? Does it mean I can't add such similar patterns together?

Was it helpful?

Solution

Well, T_USER matches:

  ("{USER}",T_USER)

  // which is defined as
  ("USER", "({UNRESERVED})+" ) 

  // which is defined as
  ("UNRESERVED","{ALPHANUM}|{MARK}")            

So, it takes any series of alphanumeric characters (as well as 'marks', which is irrelevant now)

T_DOMAINLABEL matches:

  ("{DOMAINLABEL}", T_DOMAINLABEL)          

  // which is defined as
  ("DOMAINLABEL", "({ALPHANUM})+")

As you can see, any T_DOMAINLABEL token is always a valid T_USER token. So, there is no way it would ever get a T_DOMAINLABEL.

This is not because of "the token not matching", it's a result of tokenizing being eager and not doing backtracking (outside a single token).

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top