// tokeniser.hpp
|
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
|
//
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
#ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HPP
|
#define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HPP
|
|
// memcpy()
|
#include <cstring>
|
#include <map>
|
#include "num_token.hpp"
|
#include "../../runtime_error.hpp"
|
#include "../../size_t.hpp"
|
#include <sstream>
|
#include "../../string_token.hpp"
|
#include "re_tokeniser_helper.hpp"
|
|
namespace boost
|
{
|
namespace lexer
|
{
|
namespace detail
|
{
|
template<typename CharT>
|
class basic_re_tokeniser
|
{
|
public:
|
typedef basic_num_token<CharT> num_token;
|
typedef basic_re_tokeniser_state<CharT> state;
|
typedef basic_string_token<CharT> string_token;
|
typedef typename string_token::string string;
|
typedef std::map<string_token, std::size_t> token_map;
|
typedef std::pair<string_token, std::size_t> token_pair;
|
|
static void next (state &state_, token_map &map_, num_token &token_)
|
{
|
CharT ch_ = 0;
|
bool eos_ = state_.next (ch_);
|
|
token_.min_max (0, false, 0);
|
|
while (!eos_ && ch_ == '"')
|
{
|
state_._in_string ^= 1;
|
eos_ = state_.next (ch_);
|
}
|
|
if (eos_)
|
{
|
if (state_._in_string)
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing '\"').");
|
}
|
|
if (state_._paren_count)
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing ')').");
|
}
|
|
token_.set (num_token::END, null_token);
|
}
|
else
|
{
|
if (ch_ == '\\')
|
{
|
// Even if we are in a string, respect escape sequences...
|
escape (state_, map_, token_);
|
}
|
else if (state_._in_string)
|
{
|
// All other meta characters lose their special meaning
|
// inside a string.
|
create_charset_token (string (1, ch_), false, map_, token_);
|
}
|
else
|
{
|
// Not an escape sequence and not inside a string, so
|
// check for meta characters.
|
switch (ch_)
|
{
|
case '(':
|
token_.set (num_token::OPENPAREN, null_token);
|
++state_._paren_count;
|
read_options (state_);
|
break;
|
case ')':
|
--state_._paren_count;
|
|
if (state_._paren_count < 0)
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Number of open parenthesis < 0 at index " <<
|
state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
token_.set (num_token::CLOSEPAREN, null_token);
|
|
if (!state_._flags_stack.empty ())
|
{
|
state_._flags = state_._flags_stack.top ();
|
state_._flags_stack.pop ();
|
}
|
break;
|
case '?':
|
if (!state_.eos () && *state_._curr == '?')
|
{
|
token_.set (num_token::AOPT, null_token);
|
state_.increment ();
|
}
|
else
|
{
|
token_.set (num_token::OPT, null_token);
|
}
|
|
break;
|
case '*':
|
if (!state_.eos () && *state_._curr == '?')
|
{
|
token_.set (num_token::AZEROORMORE, null_token);
|
state_.increment ();
|
}
|
else
|
{
|
token_.set (num_token::ZEROORMORE, null_token);
|
}
|
|
break;
|
case '+':
|
if (!state_.eos () && *state_._curr == '?')
|
{
|
token_.set (num_token::AONEORMORE, null_token);
|
state_.increment ();
|
}
|
else
|
{
|
token_.set (num_token::ONEORMORE, null_token);
|
}
|
|
break;
|
case '{':
|
open_curly (state_, token_);
|
break;
|
case '|':
|
token_.set (num_token::OR, null_token);
|
break;
|
case '^':
|
if (state_._curr - 1 == state_._start)
|
{
|
token_.set (num_token::CHARSET, bol_token);
|
state_._seen_BOL_assertion = true;
|
}
|
else
|
{
|
create_charset_token (string (1, ch_), false,
|
map_, token_);
|
}
|
|
break;
|
case '$':
|
if (state_._curr == state_._end)
|
{
|
token_.set (num_token::CHARSET, eol_token);
|
state_._seen_EOL_assertion = true;
|
}
|
else
|
{
|
create_charset_token (string (1, ch_), false,
|
map_, token_);
|
}
|
|
break;
|
case '.':
|
{
|
string dot_;
|
|
if (state_._flags & dot_not_newline)
|
{
|
dot_ = '\n';
|
}
|
|
create_charset_token (dot_, true, map_, token_);
|
break;
|
}
|
case '[':
|
{
|
charset (state_, map_, token_);
|
break;
|
}
|
case '/':
|
throw runtime_error("Lookahead ('/') is not supported yet.");
|
break;
|
default:
|
if ((state_._flags & icase) &&
|
(std::isupper (ch_, state_._locale) ||
|
std::islower (ch_, state_._locale)))
|
{
|
CharT upper_ = std::toupper (ch_, state_._locale);
|
CharT lower_ = std::tolower (ch_, state_._locale);
|
|
string str_ (1, upper_);
|
|
str_ += lower_;
|
create_charset_token (str_, false, map_, token_);
|
}
|
else
|
{
|
create_charset_token (string (1, ch_), false,
|
map_, token_);
|
}
|
|
break;
|
}
|
}
|
}
|
}
|
|
private:
|
typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
|
|
static void read_options (state &state_)
|
{
|
if (!state_.eos () && *state_._curr == '?')
|
{
|
CharT ch_ = 0;
|
bool eos_ = false;
|
bool negate_ = false;
|
|
state_.increment ();
|
eos_ = state_.next (ch_);
|
state_._flags_stack.push (state_._flags);
|
|
while (!eos_ && ch_ != ':')
|
{
|
switch (ch_)
|
{
|
case '-':
|
negate_ ^= 1;
|
break;
|
case 'i':
|
if (negate_)
|
{
|
state_._flags = static_cast<regex_flags>
|
(state_._flags & ~icase);
|
}
|
else
|
{
|
state_._flags = static_cast<regex_flags>
|
(state_._flags | icase);
|
}
|
|
negate_ = false;
|
break;
|
case 's':
|
if (negate_)
|
{
|
state_._flags = static_cast<regex_flags>
|
(state_._flags | dot_not_newline);
|
}
|
else
|
{
|
state_._flags = static_cast<regex_flags>
|
(state_._flags & ~dot_not_newline);
|
}
|
|
negate_ = false;
|
break;
|
default:
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Unknown option at index " <<
|
state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
}
|
|
eos_ = state_.next (ch_);
|
}
|
|
// End of string handler will handle early termination
|
}
|
else if (!state_._flags_stack.empty ())
|
{
|
state_._flags_stack.push (state_._flags);
|
}
|
}
|
|
static void escape (state &state_, token_map &map_, num_token &token_)
|
{
|
CharT ch_ = 0;
|
std::size_t str_len_ = 0;
|
const CharT *str_ = tokeniser_helper::escape_sequence (state_,
|
ch_, str_len_);
|
|
if (str_)
|
{
|
state state2_ (str_ + 1, str_ + str_len_, state_._flags,
|
state_._locale);
|
|
charset (state2_, map_, token_);
|
}
|
else
|
{
|
create_charset_token (string (1, ch_), false, map_, token_);
|
}
|
}
|
|
static void charset (state &state_, token_map &map_, num_token &token_)
|
{
|
string chars_;
|
bool negated_ = false;
|
|
tokeniser_helper::charset (state_, chars_, negated_);
|
create_charset_token (chars_, negated_, map_, token_);
|
}
|
|
static void create_charset_token (const string &charset_,
|
const bool negated_, token_map &map_, num_token &token_)
|
{
|
std::size_t id_ = null_token;
|
string_token stok_ (negated_, charset_);
|
|
stok_.remove_duplicates ();
|
stok_.normalise ();
|
|
typename token_map::const_iterator iter_ = map_.find (stok_);
|
|
if (iter_ == map_.end ())
|
{
|
id_ = map_.size ();
|
map_.insert (token_pair (stok_, id_));
|
}
|
else
|
{
|
id_ = iter_->second;
|
}
|
|
token_.set (num_token::CHARSET, id_);
|
}
|
|
static void open_curly (state &state_, num_token &token_)
|
{
|
if (state_.eos ())
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing '}').");
|
}
|
else if (*state_._curr >= '0' && *state_._curr <= '9')
|
{
|
repeat_n (state_, token_);
|
|
if (!state_.eos () && *state_._curr == '?')
|
{
|
token_._type = num_token::AREPEATN;
|
state_.increment ();
|
}
|
}
|
else
|
{
|
macro (state_, token_);
|
}
|
}
|
|
// SYNTAX:
|
// {n[,[n]]}
|
// SEMANTIC RULES:
|
// {0} - INVALID (throw exception)
|
// {0,} = *
|
// {0,0} - INVALID (throw exception)
|
// {0,1} = ?
|
// {1,} = +
|
// {min,max} where min == max - {min}
|
// {min,max} where max < min - INVALID (throw exception)
|
static void repeat_n (state &state_, num_token &token_)
|
{
|
CharT ch_ = 0;
|
bool eos_ = state_.next (ch_);
|
|
while (!eos_ && ch_ >= '0' && ch_ <= '9')
|
{
|
token_._min *= 10;
|
token_._min += ch_ - '0';
|
eos_ = state_.next (ch_);
|
}
|
|
if (eos_)
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing '}').");
|
}
|
|
bool min_max_ = false;
|
bool repeatn_ = true;
|
|
token_._comma = ch_ == ',';
|
|
if (token_._comma)
|
{
|
eos_ = state_.next (ch_);
|
|
if (eos_)
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing '}').");
|
}
|
|
if (ch_ == '}')
|
{
|
// Small optimisation: Check for '*' equivalency.
|
if (token_._min == 0)
|
{
|
token_.set (num_token::ZEROORMORE, null_token);
|
repeatn_ = false;
|
}
|
// Small optimisation: Check for '+' equivalency.
|
else if (token_._min == 1)
|
{
|
token_.set (num_token::ONEORMORE, null_token);
|
repeatn_ = false;
|
}
|
}
|
else
|
{
|
if (ch_ < '0' || ch_ > '9')
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Missing '}' at index " <<
|
state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
min_max_ = true;
|
|
do
|
{
|
token_._max *= 10;
|
token_._max += ch_ - '0';
|
eos_ = state_.next (ch_);
|
} while (!eos_ && ch_ >= '0' && ch_ <= '9');
|
|
if (eos_)
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing '}').");
|
}
|
|
// Small optimisation: Check for '?' equivalency.
|
if (token_._min == 0 && token_._max == 1)
|
{
|
token_.set (num_token::OPT, null_token);
|
repeatn_ = false;
|
}
|
// Small optimisation: if min == max, then min.
|
else if (token_._min == token_._max)
|
{
|
token_._comma = false;
|
min_max_ = false;
|
token_._max = 0;
|
}
|
}
|
}
|
|
if (ch_ != '}')
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
if (repeatn_)
|
{
|
// SEMANTIC VALIDATION follows:
|
// NOTE: {0,} has already become *
|
// therefore we don't check for a comma.
|
if (token_._min == 0 && token_._max == 0)
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Cannot have exactly zero repeats preceding index " <<
|
state_.index () << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
if (min_max_ && token_._max < token_._min)
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Max less than min preceding index " <<
|
state_.index () << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
token_.set (num_token::REPEATN, null_token);
|
}
|
}
|
|
static void macro (state &state_, num_token &token_)
|
{
|
CharT ch_ = 0;
|
bool eos_ = false;
|
const CharT *start_ = state_._curr;
|
|
state_.next (ch_);
|
|
if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
|
!(ch_ >= 'a' && ch_ <= 'z'))
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Invalid MACRO name at index " <<
|
state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
do
|
{
|
eos_ = state_.next (ch_);
|
|
if (eos_)
|
{
|
throw runtime_error ("Unexpected end of regex "
|
"(missing '}').");
|
}
|
} while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
|
(ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
|
|
if (ch_ != '}')
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
std::size_t len_ = state_._curr - 1 - start_;
|
|
if (len_ > max_macro_len)
|
{
|
std::basic_stringstream<CharT> ss_;
|
std::ostringstream os_;
|
|
os_ << "MACRO name '";
|
|
while (len_)
|
{
|
os_ << ss_.narrow (*start_++, ' ');
|
--len_;
|
}
|
|
os_ << "' too long.";
|
throw runtime_error (os_.str ());
|
}
|
|
token_.set (num_token::MACRO, null_token);
|
|
// Some systems have memcpy in namespace std.
|
using namespace std;
|
|
memcpy (token_._macro, start_, len_ * sizeof (CharT));
|
token_._macro[len_] = 0;
|
}
|
};
|
}
|
}
|
}
|
|
#endif
|