Ошибка грамматики / лексера Boost.Spirit SQL

У меня две проблемы со следующей грамматикой SQL:

#define BOOST_SPIRIT_QI_DEBUG

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/karma.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <boost/lexical_cast.hpp>

#include <iostream>
#include <fstream>
#include <string>
#include <set>
#include <utility>

namespace bs = boost::spirit;
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;

// Token definition base, defines all tokens for the base grammar below
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
public:
// Tokens with no attributes.
lex::token_def<lex::omit> type_smallint, type_int, type_varchar, type_text, type_date;
lex::token_def<lex::omit> kw_not_null, kw_auto_increment, kw_unique, kw_default, kw_create,
kw_table, kw_constraint, kw_primary_key;

// Attributed tokens. (If you add a new type, don't forget to add it to the lex::lexertl::token definition too).
lex::token_def<int> signed_digit;
lex::token_def<std::size_t> unsigned_digit;
lex::token_def<std::string> identifier;
lex::token_def<std::string> quoted_string;

sql_tokens()
{
// Column data types.
type_smallint = "(?i:smallint)";
type_int = "(?i:int)";
type_varchar = "(?i:varchar)";
type_text = "(?i:text)";
type_date = "(?i:date)";

// Keywords.
kw_not_null = "(?i:not +null)";
kw_auto_increment = "(?i:auto_increment)";
kw_unique = "(?i:unique)";
kw_default = "(?i:default)";
kw_create = "(?i:create)";
kw_table = "(?i:table)";
kw_constraint = "(?i:constraint)";
kw_primary_key = "(?i:primary +key)";

// Values.
signed_digit = "[+-]?[0-9]+";
unsigned_digit = "[0-9]+";
quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\"
// Identifier.
identifier = "[a-zA-Z][a-zA-Z0-9_]*";

// The token must be added in priority order.
this->self += lex::token_def<>('(') | ')' | ',' | ';';
this->self += type_smallint | type_int | type_varchar | type_text |
type_date;
this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default |
kw_create | kw_table | kw_constraint | kw_primary_key;
this->self += identifier | unsigned_digit | signed_digit | quoted_string;

// define the whitespace to ignore.
this->self("WS")
=       lex::token_def<>("[ \\t\\n]+")
|       "--[^\\n]*\\n"  // Single line comments with --
|       "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" // C-style comments
;
}
};

// Grammar definition, define a little part of the SQL language.
template <typename Iterator, typename Lexer>
struct sql_grammar
: qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
template <typename TokenDef>
sql_grammar(TokenDef const& tok)
: sql_grammar::base_type(program, "program")
{
program
=  (statement % ';') >> *qi::lit(';')
;

statement
=   create_statement.alias()
;

create_statement
=   tok.kw_create >> create_table
;

create_table
=   tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')'
;

table_constraints
=   constraint_definition % ','
;

constraint_definition
= tok.kw_constraint >> tok.identifier >> primary_key_constraint
;

primary_key_constraint
= tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')'
;

create_table_columns
=   column_definition % ','
;

column_definition
=   tok.identifier >> column_type >> *type_constraint
;

type_constraint
=   tok.kw_not_null
|   tok.kw_auto_increment
|   tok.kw_unique
|   default_value
;

default_value
=   tok.kw_default > tok.quoted_string
;

column_type
=   tok.type_smallint
|   tok.type_int
|   (tok.type_varchar > '(' > tok.unsigned_digit > ')')
|   tok.type_text
|   tok.type_date
;

program.name("program");
statement.name("statement");
create_statement.name("create statement");
create_table.name("create table");
create_table_columns.name("create table columns");
column_definition.name("column definition");
column_type.name("column type");
default_value.name("default value");
type_constraint.name("type constraint");
table_constraints.name("table constraints");
constraint_definition.name("constraint definition");
primary_key_constraint.name("primary key constraint");

BOOST_SPIRIT_DEBUG_NODE(program);
BOOST_SPIRIT_DEBUG_NODE(statement);
BOOST_SPIRIT_DEBUG_NODE(create_statement);
BOOST_SPIRIT_DEBUG_NODE(create_table);
BOOST_SPIRIT_DEBUG_NODE(create_table_columns);
BOOST_SPIRIT_DEBUG_NODE(column_definition);
BOOST_SPIRIT_DEBUG_NODE(column_type);
BOOST_SPIRIT_DEBUG_NODE(default_value);
BOOST_SPIRIT_DEBUG_NODE(type_constraint);
BOOST_SPIRIT_DEBUG_NODE(table_constraints);
BOOST_SPIRIT_DEBUG_NODE(constraint_definition);
BOOST_SPIRIT_DEBUG_NODE(primary_key_constraint);

using namespace qi::labels;
qi::on_error<qi::fail>
(
program,
std::cout
<< phx::val("Error! Expecting ")
<< bs::_4                               // what failed?
<< phx::val(" here: \"")
<< phx::construct<std::string>(bs::_3, bs::_2)   // iterators to error-pos, end
<< phx::val("\"")
<< std::endl
);
}

private:
typedef qi::in_state_skipper<Lexer> skipper_type;
typedef qi::rule<Iterator, skipper_type> simple_rule;

simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition;
simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type;
};

std::string file2string(const std::string& filename)
{
std::ifstream s(filename.c_str(), std::ios_base::binary);
std::stringstream ss;
ss << s.rdbuf();
return ss.str();
}

int main(int argc, char* argv[])
{
if(argc != 2)
{
std::cerr << "usage: " << argv[0] << " schema_filename\n";
return 1;
}

// iterator type used to expose the underlying input stream
typedef std::string::iterator base_iterator_type;

// This is the lexer token type to use.
typedef lex::lexertl::token<
base_iterator_type, boost::mpl::vector<int, std::size_t, std::string>
> token_type;

// Here we use the lexertl based lexer engine.
typedef lex::lexertl::lexer<token_type> lexer_type;

// This is the token definition type (derived from the given lexer type).
typedef sql_tokens<lexer_type> sql_tokens;

// this is the iterator type exposed by the lexer
typedef sql_tokens::iterator_type iterator_type;

// this is the type of the grammar to parse
typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar;

// now we use the types defined above to create the lexer and grammar
// object instances needed to invoke the parsing process
sql_tokens tokens;                         // Our lexer
sql_grammar sql(tokens);                  // Our parser

std::string str(file2string(argv[1]));

// At this point we generate the iterator pair used to expose the
// tokenized input stream.
base_iterator_type it = str.begin();
iterator_type iter = tokens.begin(it, str.end());
iterator_type end = tokens.end();

// Parsing is done based on the the token stream, not the character
// stream read from the input.
// Note how we use the lexer defined above as the skip parser. It must
// be explicitly wrapped inside a state directive, switching the lexer
// state for the duration of skipping whitespace.
std::string ws("WS");
bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]);

if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << "-------------------------\n";
}
else
{
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
return 0;
}

Проблема 1: Начните с комментариев

Когда файл начинается с комментария, синтаксический анализ немедленно завершается неудачно:

/* bouh */

CREATE TABLE mytable (
id int NOT NULL AUTO_INCREMENT
);

С этим провальным деревом:

<program>
<try>[/]</try>
<statement>
<try>[/]</try>
<create_statement>
<try>[/]</try>
<fail/>
</create_statement>
<fail/>
</statement>
<fail/>
</program>

Но если я добавлю строку return прямо перед этим, это сработает. Оба типа комментариев («-» и «/ ** /») терпят неудачу.

Проблема 2: уникальное ключевое слово не распознано

Сбой разбора в очень специфических условиях с уникальным ключевым словом. Это не работает, когда уникально в верхнем регистре и сразу после запятой.

Все следующие случаи успешны:

-- Success
CREATE TABLE Addon (
id int NOT NULL AUTO_INCREMENT,
u smallint NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
id int NOT NULL AUTO_INCREMENT,
u smallint NOT NULL unique,
s int NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
id int NOT NULL AUTO_INCREMENT,
u smallint NOT NULL UNIQUE ,
s int NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
id int NOT NULL AUTO_INCREMENT,
u smallint UNIQUE NOT NULL,
s int NOT NULL UNIQUE
);

Но этот не делает:

-- Fail
CREATE TABLE Addon (
id int NOT NULL AUTO_INCREMENT,
u smallint NOT NULL UNIQUE,
s int NOT NULL
);

У вас есть идеи, что не так?
Спасибо!

3

Решение

Что касается пропуска пробелов, я могу только сделать вывод, что предварительный пропуск изначально не выполняется (возможно, состояние переключено неправильно).

Конечно, вы можете попытаться исправить это, используя lex::tokenize_and_parse API (передавая начальное состояние как «WS»). Я неправильно запомнил API, Вы можете сделать это только с помощью ручного токенизации, который в первую очередь исключает переключение состояний с помощью Qi.

Однако я склоняюсь к тому, чтобы пропустить ответственность лексера:

ws = "[ \\t\\n]+";
comment = "--[^\\n]*\\n";  // Single line comments with --
cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // C-style comments

this->self += ws              [ lex::_pass = lex::pass_flags::pass_ignore ]
| comment         [ lex::_pass = lex::pass_flags::pass_ignore ]
| cstyle_comment  [ lex::_pass = lex::pass_flags::pass_ignore ]
;

Теперь вообще не нужно использовать шкипер, и это позволяет разобрать первую проблему (начиная с комментария).

Полный код: Жить на Колиру

Ищу #ifdef STATE_WS

//#define BOOST_SPIRIT_QI_DEBUG
//#define STATE_WS

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/karma.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <boost/lexical_cast.hpp>

#include <iostream>
#include <fstream>
#include <string>
#include <set>
#include <utility>

namespace bs  = boost::spirit;
namespace lex = boost::spirit::lex;
namespace qi  = boost::spirit::qi;
namespace phx = boost::phoenix;

// Token definition base, defines all tokens for the base grammar below
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
public:
// Tokens with no attributes.
lex::token_def<lex::omit> type_smallint;
lex::token_def<lex::omit> type_int;
lex::token_def<lex::omit> type_varchar;
lex::token_def<lex::omit> type_text;
lex::token_def<lex::omit> type_date;
lex::token_def<lex::omit> kw_not_null;
lex::token_def<lex::omit> kw_auto_increment;
lex::token_def<lex::omit> kw_unique;
lex::token_def<lex::omit> kw_default;
lex::token_def<lex::omit> kw_create;
lex::token_def<lex::omit> kw_table;
lex::token_def<lex::omit> kw_constraint;
lex::token_def<lex::omit> kw_primary_key;

// Attributed tokens. (If you add a new type, don't forget to add it to the lex::lexertl::token definition too).
lex::token_def<int>         signed_digit;
lex::token_def<std::size_t> unsigned_digit;
lex::token_def<std::string> identifier;
lex::token_def<std::string> quoted_string;

lex::token_def<lex::omit>   ws, comment, cstyle_comment;

sql_tokens()
{
// Column data types.
type_smallint     = "(?i:smallint)";
type_int          = "(?i:int)";
type_varchar      = "(?i:varchar)";
type_text         = "(?i:text)";
type_date         = "(?i:date)";

// Keywords.
kw_not_null       = "(?i:not +null)";
kw_auto_increment = "(?i:auto_increment)";
kw_unique         = "(?i:unique)";
kw_default        = "(?i:default)";
kw_create         = "(?i:create)";
kw_table          = "(?i:table)";
kw_constraint     = "(?i:constraint)";
kw_primary_key    = "(?i:primary +key)";

// Values.
signed_digit      = "[+-]?[0-9]+";
unsigned_digit    = "[0-9]+";
quoted_string     = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\"
// Identifier.
identifier        = "[a-zA-Z][a-zA-Z0-9_]*";

// The token must be added in priority order.
this->self += lex::token_def<>('(') | ')' | ',' | ';';
this->self += type_smallint | type_int | type_varchar | type_text |
type_date;
this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default |
kw_create | kw_table | kw_constraint | kw_primary_key;
this->self += identifier | unsigned_digit | signed_digit | quoted_string;

#ifdef STATE_WS
// define the whitespace to ignore.
this->self("WS")
=       ws
|       comment
|       cstyle_comment
;
#else
ws = "[ \\t\\n]+";
comment = "--[^\\n]*\\n";  // Single line comments with --
cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // C-style comments

this->self += ws              [ lex::_pass = lex::pass_flags::pass_ignore ]
| comment         [ lex::_pass = lex::pass_flags::pass_ignore ]
| cstyle_comment  [ lex::_pass = lex::pass_flags::pass_ignore ]
;
#endif
}
};

// Grammar definition, define a little part of the SQL language.
template <typename Iterator, typename Lexer>
struct sql_grammar
#ifdef STATE_WS
: qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
#else
: qi::grammar<Iterator>
#endif
{
template <typename TokenDef>
sql_grammar(TokenDef const& tok)
: sql_grammar::base_type(program, "program")
{
program
=  (statement % ';') >> *qi::lit(';')
;

statement
=   create_statement.alias()
;

create_statement
=   tok.kw_create >> create_table
;

create_table
=   tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')'
;

table_constraints
=   constraint_definition % ','
;

constraint_definition
= tok.kw_constraint >> tok.identifier >> primary_key_constraint
;

primary_key_constraint
= tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')'
;

create_table_columns
=   column_definition % ','
;

column_definition
=   tok.identifier >> column_type >> *type_constraint
;

type_constraint
=   tok.kw_not_null
|   tok.kw_auto_increment
|   tok.kw_unique
|   default_value
;

default_value
=   tok.kw_default > tok.quoted_string
;

column_type
=   tok.type_smallint
|   tok.type_int
|   (tok.type_varchar > '(' > tok.unsigned_digit > ')')
|   tok.type_text
|   tok.type_date
;

program.name("program");
statement.name("statement");
create_statement.name("create statement");
create_table.name("create table");
create_table_columns.name("create table columns");
column_definition.name("column definition");
column_type.name("column type");
default_value.name("default value");
type_constraint.name("type constraint");
table_constraints.name("table constraints");
constraint_definition.name("constraint definition");
primary_key_constraint.name("primary key constraint");

BOOST_SPIRIT_DEBUG_NODE(program);
BOOST_SPIRIT_DEBUG_NODE(statement);
BOOST_SPIRIT_DEBUG_NODE(create_statement);
BOOST_SPIRIT_DEBUG_NODE(create_table);
BOOST_SPIRIT_DEBUG_NODE(create_table_columns);
BOOST_SPIRIT_DEBUG_NODE(column_definition);
BOOST_SPIRIT_DEBUG_NODE(column_type);
BOOST_SPIRIT_DEBUG_NODE(default_value);
BOOST_SPIRIT_DEBUG_NODE(type_constraint);
BOOST_SPIRIT_DEBUG_NODE(table_constraints);
BOOST_SPIRIT_DEBUG_NODE(constraint_definition);
BOOST_SPIRIT_DEBUG_NODE(primary_key_constraint);

using namespace qi::labels;
qi::on_error<qi::fail>
(
program,
std::cout
<< phx::val("Error! Expecting ")
<< bs::_4                               // what failed?
<< phx::val(" here: \"")
<< phx::construct<std::string>(bs::_3, bs::_2)   // iterators to error-pos, end
<< phx::val("\"")
<< std::endl
);
}

private:
#ifdef STATE_WS
typedef qi::in_state_skipper<Lexer> skipper_type;
#else
typedef qi::unused_type skipper_type;
#endif
typedef qi::rule<Iterator, skipper_type> simple_rule;

simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition;
simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type;
};

std::string cin2string()
{
std::istreambuf_iterator<char> f(std::cin), l;
std::string result;
std::copy(f, l, std::back_inserter(result));
return result;
}

int main(int argc, char* argv[])
{
// iterator type used to expose the underlying input stream
typedef std::string::const_iterator base_iterator_type;

// This is the lexer token type to use.
typedef lex::lexertl::token<
base_iterator_type, boost::mpl::vector<int, std::size_t, std::string>
> token_type;

#ifdef STATE_WS
typedef lex::lexertl::lexer<token_type> lexer_type;
#else
typedef lex::lexertl::actor_lexer<token_type> lexer_type;
#endif

// This is the token definition type (derived from the given lexer type).
typedef sql_tokens<lexer_type> sql_tokens;

// this is the iterator type exposed by the lexer
typedef sql_tokens::iterator_type iterator_type;

// this is the type of the grammar to parse
typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar;

// now we use the types defined above to create the lexer and grammar
// object instances needed to invoke the parsing process
sql_tokens tokens;                         // Our lexer
sql_grammar sql(tokens);                  // Our parser

const std::string str = cin2string();

// At this point we generate the iterator pair used to expose the
// tokenized input stream.
base_iterator_type it = str.begin();
iterator_type iter = tokens.begin(it, str.end());
iterator_type end = tokens.end();

// Parsing is done based on the the token stream, not the character
// stream read from the input.
// Note how we use the lexer defined above as the skip parser. It must
// be explicitly wrapped inside a state directive, switching the lexer
// state for the duration of skipping whitespace.
#ifdef STATE_WS
std::string ws("WS");
bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]);
#else
bool r = qi::parse(iter, end, sql);
#endif

if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << "-------------------------\n";
}
else
{
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
return 0;
}
3

Другие решения

Других решений пока нет …

По вопросам рекламы [email protected]