Boost.Spirit: состояния лексера влияют друг на друга («перекрёстное опыление»)

Question

Boost.Spirit: состояния лексера влияют друг на друга («перекрёстное опыление»)

Я пытаюсь использовать состояния лексера для контекстно-зависимого разбора, но, похоже, разные состояния лексера влияют друг на друга. Вот очень простой пример:

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_container.hpp>

#include <iostream>
#include <string>

using namespace boost::spirit;

/// Token definitions for the comment-stripping example.
///
/// `ccomment` and `hello` are registered in the lexer's default state,
/// while `endcomment` is registered in the separate "COMMENT" state.
template <typename Lexer>
struct strip_comments_tokens : lex::lexer<Lexer>
{
    strip_comments_tokens()
      : strip_comments_tokens::base_type(lex::match_flags::match_default)
      , ccomment("\\/\\*")        // the opening "/*"
      , endcomment(".*\\*\\/")    // any characters followed by "*/"
      , hello("hello")
    {
        // Tokens registered in the default lexer state.
        this->self.add(ccomment)(hello);

        // Token registered in the "COMMENT" lexer state.
        this->self("COMMENT").add(endcomment);
    }

    lex::token_def<> ccomment;
    lex::token_def<> endcomment;
    lex::token_def<std::string> hello;
};

template <typename Iterator>
struct strip_comments_grammar : qi::grammar<Iterator>
{
template <typename TokenDef>
strip_comments_grammar(TokenDef const& tok)
: strip_comments_grammar::base_type(start)
{
start =  *(   tok.ccomment
>>  qi::in_state("COMMENT")
[
tok.endcomment
]
|   tok.hello [ std::cout << _1 ]
);
}

qi::rule<Iterator> start;
};int main(int argc, char* argv[])
{
typedef std::string::iterator base_iterator_type;

typedef
lex::lexertl::lexer<lex::lexertl::token<base_iterator_type> >
lexer_type;

typedef strip_comments_tokens<lexer_type>::iterator_type iterator_type;

strip_comments_tokens<lexer_type> strip_comments;           // Our lexer
strip_comments_grammar<iterator_type> g (strip_comments);   // Our parser

std::string str("hello/*hello*/hello");
base_iterator_type first = str.begin();

bool r = lex::tokenize_and_parse(first, str.end(), strip_comments, g);

return 0;
}

Я ожидаю, что вход

"hello/*hello*/hello"

будет разбит на токены hello ccomment endcomment hello. Но на самом деле вход разбивается на токены hello hello hello, поэтому грамматика перестаёт работать. Если изменить вход на

"hello/*anything else*/hello"

все работает как положено.

Есть идеи?

2

boost boost-spirit boost-spirit-lex boost-spirit-qi c++

Решение

Другие решения

Источник

Accepted Answer

Вы нигде не меняете состояние лексера, поэтому он всегда остаётся в состоянии "INITIAL".

Переключать состояние лексера нужно на стадии лексера (по моему опыту и после долгих экспериментов: надёжного способа получить обратную связь от стадии парсера нет).

Поэтому вам нужно перейти на actor_lexer и присоединить семантические действия к объектам token_def, добавляемым в таблицы лексера:

// Lexer type based on actor_lexer, so that semantic actions can be attached
// to token definitions.
typedef
lex::lexertl::actor_lexer<lex::lexertl::token<base_iterator_type> >
lexer_type;

А также

// Default state: matching `ccomment` sets the lexer state to "COMMENT".
this->self +=
ccomment [ lex::_state = "COMMENT" ]
| hello;

// "COMMENT" state: matching `endcomment` sets the lexer state back to "INITIAL".
this->self("COMMENT") +=
endcomment [ lex::_state = "INITIAL" ];

Тем не менее, я полагаю, гораздо проще просто пропускать такие токены. Если вы действительно хотите знать, как использовать состояния лексера для пропуска, смотрите:

Ошибка грамматики / лексера Boost.Spirit SQL

Однако я бы предложил подход «упростить и получить выгоду» с использованием lex::_pass = lex::pass_flags::pass_ignore:

Вот мой вариант:

Живой пример на Coliru (Live On Coliru)

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/qi.hpp> // for the parser expression *strip_comments.hello

namespace lex = boost::spirit::lex;
namespace phx = boost::phoenix;

/// Token definitions that match a whole C-style comment as one token.
///
/// A lexer semantic action marks every `ccomment` match as pass_ignore;
/// `hello` is added to the same lexer state without an action.
template <typename Lexer>
struct strip_comments_tokens : lex::lexer<Lexer> {
    strip_comments_tokens()
      : strip_comments_tokens::base_type(lex::match_flags::match_default)
      , ccomment("\\/\\*.*\\*\\/")   // "/*", any characters, "*/"
      , hello("hello")               // why not "."?
    {
        this->self +=
              ccomment [ lex::_pass = lex::pass_flags::pass_ignore ]
            // IDEA: | lex::token_def<char>(".") // to just accept anything
            | hello
            ;
    }

    lex::token_def<lex::omit>   ccomment;  // no attribute is stored for comments
    lex::token_def<std::string> hello;
};

/// Strips C-style comments from a sample string and prints what is left.
///
/// Prints "Stripped: '...'" when the whole input was tokenized and parsed,
/// otherwise prints "Failed: '...'" with the unconsumed tail of the input.
int main() {
    typedef std::string::const_iterator base_iterator_type;
    typedef lex::lexertl::actor_lexer<
                lex::lexertl::token<base_iterator_type/*, boost::mpl::vector<char, std::string>, boost::mpl::false_*/>
            > lexer_type;

    strip_comments_tokens<lexer_type> strip_comments;         // Our lexer

    std::string const str("hello/*hello*/hello");
    std::string stripped;

    base_iterator_type first = str.begin();
    bool r = lex::tokenize_and_parse(first, str.end(), strip_comments, *strip_comments.hello, stripped);

    // `*hello` matches zero or more tokens, so `r` alone stays true even when
    // the lexer stops at input it cannot match; also require that the whole
    // input has been consumed before reporting success.
    if (r && first == str.end())
        std::cout << "\nStripped: '" << stripped << "'\n";
    else
        std::cout << "Failed: '" << std::string(first, str.end()) << "'\n";
}

2