Boost Spirit сообщает об успешном разборе, несмотря на то, что токен неполон

Question

Boost Spirit сообщает об успешном разборе, несмотря на то, что токен неполон

У меня есть очень простая конструкция пути, которую я пытаюсь проанализировать с помощью boost spirit.lex.

У нас есть следующая грамматика:

token := [a-z]+
path := (token : path) | (token)

Итак, речь идёт просто о строках из строчных букв ASCII, разделённых двоеточиями.

У меня есть три примера: «xyz», «abc:xyz», «abc:xyz:».

Первые два должны считаться допустимыми. Третий, с двоеточием в конце, допустимым считаться не должен. К сожалению, мой анализатор распознаёт все три как допустимые. Грамматика не должна допускать пустого токена, но, судя по всему, Spirit делает именно это. Что я упускаю — как добиться, чтобы третий пример был отклонён?

Кроме того, если вы посмотрите на код ниже, в комментариях есть другая версия парсера, которая требует, чтобы все пути заканчивались точкой с запятой. Когда я активирую эти строки, я получаю нужное поведение (то есть «abc:xyz:;» отклоняется), но это не совсем то, что мне нужно.

У кого-нибудь есть идеи?

Благодарю.

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <string>

using namespace boost::spirit;
using boost::phoenix::val;

/// Lexer definition for colon-separated paths.
///
/// Registers three tokens: a lower-case identifier, the ':' separator and a
/// bare ';' character (the latter is referenced only by the commented-out
/// grammar alternative that terminates paths with a semicolon).
template <typename Lexer>
struct PathTokens : boost::spirit::lex::lexer<Lexer>
{
    PathTokens()
        : identifier("[a-z]+")
        , separator(":")
    {
        // Registration order is kept: identifier, separator, then ';'.
        this->self.add(identifier)(separator)(';');
    }

    // Token definitions referenced by the grammar; both carry a std::string.
    boost::spirit::lex::token_def<std::string> identifier, separator;
};
template <typename Iterator>
// Grammar over the token stream:  path := (token ':' path) | token
struct PathGrammar : boost::spirit::qi::grammar<Iterator>
{
    /// Builds the rules from the token definitions held by `tok`
    /// (`tok.identifier` and `tok.separator` are used).
    template <typename TokenDef>
    PathGrammar(TokenDef const& tok)
        : PathGrammar::base_type(path)
    {
        using boost::spirit::_val;

        // A single identifier; its value becomes the attribute of `token`.
        token = tok.identifier[_val = _1];

        // Each alternative streams the placeholder _1 to stderr when it matches.
        path =
                (token >> tok.separator >> path)[std::cerr << _1 << "\n"]
            |
                //(token >> ';')[std::cerr << _1 << "\n"]
                token[std::cerr << _1 << "\n"]
            ;
    }

    boost::spirit::qi::rule<Iterator> path;                  // start rule, no attribute
    boost::spirit::qi::rule<Iterator, std::string()> token;  // exposes std::string
};
int main()
{
    // Type plumbing: character iterator -> token type -> lexer -> token iterator.
    typedef std::string::iterator BaseIteratorType;
    typedef boost::spirit::lex::lexertl::token<
        BaseIteratorType, boost::mpl::vector<std::string> > TokenType;
    typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
    typedef PathTokens<LexerType>::iterator_type TokensIterator;
    typedef std::vector<std::string> Tests;

    // Inputs to try.
    Tests paths;
    paths.push_back("abc");
    paths.push_back("abc:xyz");
    paths.push_back("abc:xyz:");
    /*
    paths.clear();
    paths.push_back("abc;");
    paths.push_back("abc:xyz;");
    paths.push_back("abc:xyz:;");
    */

    for (Tests::iterator it = paths.begin(); it != paths.end(); ++it)
    {
        // Work on a copy so that mutable iterators can be taken from it.
        std::string input = *it;
        std::cerr << "*****" << input << "*****\n";

        // A fresh lexer and grammar for every input.
        PathTokens<LexerType> lexer;
        PathGrammar<TokensIterator> parser(lexer);

        BaseIteratorType first = input.begin();
        BaseIteratorType last = input.end();

        // Lex and parse in a single call.
        bool ok = boost::spirit::lex::tokenize_and_parse(first, last, lexer, parser);

        // Prints the parse result and whether the character iterators met.
        std::cerr << ok << " " << (first==last) << "\n";
    }
}

4

boost boost-spirit boost-spirit-lex c++

Решение

Другие решения

В дополнение к тому, что уже сказал llonesmiz, вот приём с использованием qi::eoi, который я иногда применяю:

// Both alternatives are grouped and followed by eoi (end of input).
path = (
(token >> tok.separator >> path) [std::cerr << _1 << "\n"]
| token                           [std::cerr << _1 << "\n"]
) >> eoi;

Это заставляет грамматику требовать eoi (конец ввода) в конце успешного совпадения. В результате получается желаемый вывод:

http://liveworkspace.org/code/23a7adb11889bbb2825097d7c553f71d

*****abc*****
abc
1 1
*****abc:xyz*****
xyz
abc
1 1
*****abc:xyz:*****
xyz
abc
0 1

5

Вот к чему я в итоге пришёл. Здесь использованы предложения как @sehe, так и @llonesmiz. Обратите внимание на переход на std::wstring и на использование действий в определении грамматики, которых не было в исходном посте.

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/bind.hpp>

#include <iostream>
#include <string>

//
// This example uses boost spirit to parse a simple
// colon-delimited grammar.
//
// The grammar we want to recognize is:
//    identifier := [a-z]+
//    separator = :
//    path= (identifier separator path) | identifier
//
// From the boost spirit perspective this example shows
// a few things I found hard to come by when building my
// first parser.
//    1. How to flag an incomplete token at the end of input
//       as an error. (use of boost::spirit::eoi)
//    2. How to bind an action on an instance of an object
//       that is taken as input to the parser.
//    3. Use of std::wstring.
//    4. Use of the lexer iterator.
//

// This using directive will cause issues with boost::bind
// when referencing placeholders such as _1.
// using namespace boost::spirit;

//! Lexer definition: recognises identifier and separator tokens in
//! wide-character input.
template <typename Lexer>
struct Tokens : boost::spirit::lex::lexer<Lexer>
{
    Tokens()
        : identifier(L"[a-z]+")
        , separator(L":")
    {
        // Registration order is kept: identifier first, then separator.
        this->self.add(identifier)(separator);
    }

    // Token definitions referenced by the grammar; both carry a std::wstring.
    boost::spirit::lex::token_def<std::wstring, wchar_t> identifier, separator;
};

//! Callback object whose echo() writes a received string to stderr.
struct Echo
{
    //! Writes element 0 of `t`, followed by a newline, to std::wcerr.
    void echo(boost::fusion::vector<std::wstring> const& t) const
    {
        std::wcerr << boost::fusion::at_c<0>(t) << "\n";
    }
};
//! The definition of our grammar, as described above.
template <typename Iterator>
struct Grammar : boost::spirit::qi::grammar<Iterator>
{
    //! Builds the rules.
    //!
    //! \param tok  token definitions; `tok.identifier` and `tok.separator` are used.
    //! \param e    callback object whose echo() is invoked for every matched
    //!             alternative. Only its address is stored in the semantic
    //!             actions, so `e` must stay alive for as long as the grammar
    //!             is used.
    template <typename TokenDef>
    Grammar(TokenDef const& tok, Echo const& e)
        : Grammar::base_type(path)
    {
        using boost::spirit::_val;

        // Both alternatives bind the same Echo instance by address. The
        // original bound a copy of `e` in the first alternative and a pointer
        // in the second, so the two actions ran on different objects.
        path
            = (   (token >> tok.separator >> path)[boost::bind(&Echo::echo, &e, ::_1)]
              |
                  (token)[boost::bind(&Echo::echo, &e, ::_1)]
              ) >> boost::spirit::eoi; // Look for end of input.

        // A single identifier; its value becomes the attribute of `token`.
        token
            = (tok.identifier) [_val=boost::spirit::qi::_1]
            ;
    }

    boost::spirit::qi::rule<Iterator> path;                   // start rule, no attribute
    boost::spirit::qi::rule<Iterator, std::wstring()> token;  // exposes std::wstring
};
int main()
{
    // Type plumbing: character iterator -> token type -> lexer -> iterators.
    // See the Boost.Spirit documentation/examples for the details.
    typedef std::wstring::iterator BaseIteratorType;
    typedef boost::spirit::lex::lexertl::token<
        BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
    typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
    typedef Tokens<LexerType>::iterator_type TokensIterator;
    typedef LexerType::iterator_type LexerIterator;

    // Inputs to try.
    typedef std::vector<std::wstring> Tests;
    Tests paths;
    paths.push_back(L"abc");
    paths.push_back(L"abc:xyz");
    paths.push_back(L"abc:xyz:");
    paths.push_back(L":");

    for (Tests::iterator it = paths.begin(); it != paths.end(); ++it)
    {
        // Work on a copy so that mutable iterators can be taken from it.
        std::wstring input = *it;
        std::wcerr << L"*****" << input << L"*****\n";

        // The callback object is declared before the grammar that refers to it.
        Echo printer;
        Tokens<LexerType> lexer;
        Grammar<TokensIterator> parser(lexer, printer);

        BaseIteratorType first = input.begin();
        BaseIteratorType last = input.end();

        // Step 1: set up the token range over the input.
        LexerIterator tokFirst = lexer.begin(first, last);
        LexerIterator tokLast = lexer.end();

        // Step 2: run the grammar over the token range.
        bool ok = boost::spirit::qi::parse(tokFirst, tokLast, parser);

        // Print the status and whether or not the whole token range
        // was consumed.
        std::wcerr << ok << L" " << (tokFirst==tokLast) << L"\n";
    }
}

2

Источник

Accepted Answer

Проблема заключается в том, что означают first и last после вызова tokenize_and_parse: сравнение first==last проверяет лишь, была ли строка полностью токенизирована, и по нему нельзя судить о том, соответствует ли ввод грамматике. Если выполнить синтаксический анализ отдельно, как показано ниже, вы получите ожидаемый результат:

  PathTokens<LexerType> tokens;
  PathGrammar<TokensIterator> grammar(tokens);

  BaseIteratorType first = str.begin();
  BaseIteratorType last = str.end();

  // Obtain the token range explicitly instead of calling tokenize_and_parse,
  // so that the token iterators are available after parsing.
  LexerType::iterator_type lexfirst = tokens.begin(first, last);
  LexerType::iterator_type lexlast = tokens.end();

  // Run the grammar over the token range.
  bool r = parse(lexfirst, lexlast, grammar);

  // The comparison is made on the token iterators, not on first/last.
  std::cerr << r << " " << (lexfirst==lexlast) << "\n";

3