// tokeniser_helper.hpp
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HELPER_HPP
|
#define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HELPER_HPP
|
|
// strlen(), wcslen()
#include <cstring>
#include <cwchar>
#include <locale>
#include <sstream>
#include <string>
#include "../../char_traits.hpp"
#include "../../size_t.hpp"
#include "re_tokeniser_state.hpp"
|
|
namespace boost
|
{
|
namespace lexer
|
{
|
namespace detail
|
{
|
template<typename CharT, typename Traits = char_traits<CharT> >
|
class basic_re_tokeniser_helper
|
{
|
public:
|
typedef basic_re_tokeniser_state<CharT> state;
|
typedef std::basic_string<CharT> string;
|
|
// Decodes the escape whose first character is at the current position
// of state_ (the caller has already consumed the backslash).
//
// If that character names a charset shortcut (\d, \D, \s, \S, \w, \W) the
// equivalent bracket expression is returned, its length is stored in
// str_len_, the shortcut character is consumed and ch_ is left untouched.
// Otherwise the escape is decoded by chr () into ch_, str_len_ is set to 0
// and a null pointer is returned.
//
// Throws runtime_error if state_ is already at the end of the regex.
static const CharT *escape_sequence (state &state_, CharT &ch_,
    std::size_t &str_len_)
{
    if (state_.eos ())
    {
        throw runtime_error ("Unexpected end of regex "
            "following '\\'.");
    }

    const CharT * const shortcut_ =
        charset_shortcut (*state_._curr, str_len_);

    if (!shortcut_)
    {
        // Not a shortcut: chr () consumes the escape and yields one char.
        ch_ = chr (state_);
        return 0;
    }

    // Step past the single character that selected the shortcut.
    state_.increment ();
    return shortcut_;
}
|
|
// This function can call itself.
|
// Reads the body of a bracket expression from state_ up to the closing
// ']', appending every member character to chars_ and reporting through
// negated_ whether the body started with '^'.
//
// Escapes are decoded via escape_sequence (). A charset shortcut such as
// \d is expanded by recursing over its bracket-expression text; its
// negation must match that of the enclosing charset. A '-' following an
// item hands over to charset_range (). When the icase flag is set, cased
// characters are added in both upper and lower case.
//
// Throws runtime_error on premature end of input, on a negation mismatch
// and on an empty, non-negated charset.
//
// TODO: POSIX charsets ("[:name:]") are not handled.
static void charset (state &state_, string &chars_, bool &negated_)
{
    CharT ch_ = 0;
    bool at_end_ = state_.next (ch_);

    if (at_end_)
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex "
            "following '['.");
    }

    negated_ = ch_ == '^';

    if (negated_)
    {
        at_end_ = state_.next (ch_);

        if (at_end_)
        {
            // Pointless returning index if at end of string
            throw runtime_error ("Unexpected end of regex "
                "following '^'.");
        }
    }

    // True while the most recent item was a charset shortcut (\d etc.).
    bool is_shortcut_ = false;
    // Most recent single character, not yet added to chars_.
    CharT pending_ = 0;

    while (ch_ != ']')
    {
        if (ch_ != '\\')
        {
            is_shortcut_ = false;
            pending_ = ch_;
        }
        else
        {
            std::size_t shortcut_len_ = 0;
            const CharT *shortcut_ =
                escape_sequence (state_, pending_, shortcut_len_);

            is_shortcut_ = shortcut_ != 0;

            if (is_shortcut_)
            {
                // Skip the shortcut's leading '[' and parse the remainder
                // as a charset in its own right.
                state nested_state_ (shortcut_ + 1,
                    shortcut_ + shortcut_len_,
                    state_._flags, state_._locale);
                string nested_chars_;
                bool nested_negated_ = false;

                charset (nested_state_, nested_chars_, nested_negated_);

                if (negated_ != nested_negated_)
                {
                    std::ostringstream ss_;

                    ss_ << "Mismatch in charset negation preceding "
                        "index " << state_.index () << '.';
                    throw runtime_error (ss_.str ().c_str ());
                }

                chars_ += nested_chars_;
            }
        }

        at_end_ = state_.next (ch_);

        // Applies whichever branch was taken above.
        if (at_end_)
        {
            // Pointless returning index if at end of string
            throw runtime_error ("Unexpected end of regex "
                "(missing ']').");
        }

        if (ch_ == '-')
        {
            charset_range (is_shortcut_, state_, at_end_, ch_, pending_,
                chars_);
        }
        else if (!is_shortcut_)
        {
            const bool fold_case_ = (state_._flags & icase) &&
                (std::isupper (pending_, state_._locale) ||
                std::islower (pending_, state_._locale));

            if (fold_case_)
            {
                chars_ += std::toupper (pending_, state_._locale);
                chars_ += std::tolower (pending_, state_._locale);
            }
            else
            {
                chars_ += pending_;
            }
        }
    }

    if (chars_.empty () && !negated_)
    {
        throw runtime_error ("Empty charsets not allowed.");
    }
}
|
|
// Decodes a single-character escape. The caller must already have
// verified that state_ is not at the end of the regex.
//
// Octal (\0 - \7), control (\c) and hex (\x) escapes are delegated to their
// dedicated decoders, which consume their own input. Every other escape
// consumes exactly one character: the named escapes (\a \b \e \f \n \r \t
// \v) map to their control codes and anything else stands for itself.
static CharT chr (state &state_)
{
    // eos_ has already been checked for.
    const CharT escape_ = *state_._curr;
    CharT result_ = 0;

    switch (escape_)
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            return decode_octal (state_);
        case 'c':
            return decode_control_char (state_);
        case 'x':
            return decode_hex (state_);
        case 'a':
            result_ = '\a';
            break;
        case 'b':
            result_ = '\b';
            break;
        case 'e':
            result_ = 27; // '\e' not recognised by compiler
            break;
        case 'f':
            result_ = '\f';
            break;
        case 'n':
            result_ = '\n';
            break;
        case 'r':
            result_ = '\r';
            break;
        case 't':
            result_ = '\t';
            break;
        case 'v':
            result_ = '\v';
            break;
        default:
            result_ = escape_;
            break;
    }

    // All non-delegating escapes are exactly one character long.
    state_.increment ();
    return result_;
}
|
|
private:
|
// Maps a charset shortcut letter to the bracket expression it stands for
// (narrow character version).
//
// ch_      - the character following the backslash.
// str_len_ - receives the length of the returned string, or 0 when ch_ is
//            not a shortcut.
//
// Returns a pointer to a string literal for d, D, s, S, w and W, and a null
// pointer for any other character.
static const char *charset_shortcut (const char ch_,
    std::size_t &str_len_)
{
    const char *str_ = 0;

    switch (ch_)
    {
        case 'd':
            str_ = "[0-9]";
            break;
        case 'D':
            str_ = "[^0-9]";
            break;
        case 's':
            str_ = "[ \t\n\r\f\v]";
            break;
        case 'S':
            str_ = "[^ \t\n\r\f\v]";
            break;
        case 'w':
            str_ = "[_0-9A-Za-z]";
            break;
        case 'W':
            str_ = "[^_0-9A-Za-z]";
            break;
    }

    // <cstring> declares std::strlen, so no using-directive is required.
    str_len_ = str_ ? std::strlen (str_) : 0;
    return str_;
}
|
|
// Maps a charset shortcut letter to the bracket expression it stands for
// (wide character version).
//
// ch_      - the character following the backslash.
// str_len_ - receives the length of the returned string, or 0 when ch_ is
//            not a shortcut.
//
// Returns a pointer to a wide string literal for d, D, s, S, w and W, and a
// null pointer for any other character.
static const wchar_t *charset_shortcut (const wchar_t ch_,
    std::size_t &str_len_)
{
    const wchar_t *str_ = 0;

    switch (ch_)
    {
        case 'd':
            str_ = L"[0-9]";
            break;
        case 'D':
            str_ = L"[^0-9]";
            break;
        case 's':
            str_ = L"[ \t\n\r\f\v]";
            break;
        case 'S':
            str_ = L"[^ \t\n\r\f\v]";
            break;
        case 'w':
            str_ = L"[_0-9A-Za-z]";
            break;
        case 'W':
            str_ = L"[^_0-9A-Za-z]";
            break;
    }

    // std::wcslen is declared by <cwchar>.
    str_len_ = str_ ? std::wcslen (str_) : 0;
    return str_;
}
|
|
// Decodes an octal escape of one to three digits. The caller guarantees
// that the current character of state_ is an octal digit.
//
// Digits are consumed until three have been read, the input ends, or a
// non-octal character is met (which is left unconsumed). The accumulated
// value is converted to CharT with static_cast.
static CharT decode_octal (state &state_)
{
    std::size_t value_ = 0;
    unsigned short remaining_ = 3;
    CharT digit_ = *state_._curr;

    do
    {
        value_ = value_ * 8 + (digit_ - '0');
        --remaining_;
        state_.increment ();

        const bool at_end_ = state_.eos ();

        if (remaining_ == 0 || at_end_) break;

        // Peek only: an invalid char must not be consumed.
        digit_ = *state_._curr;
    } while (digit_ >= '0' && digit_ <= '7');

    return static_cast<CharT> (value_);
}
|
|
// Decodes a \c control escape. On entry the current character of state_ is
// the 'c'; it and the following character are consumed.
//
// Letters of either case map to the codes 1..26 ('a'/'A' -> 1) and '@'
// maps to 0. Throws runtime_error if the regex ends after \c or the
// following character is anything else.
static CharT decode_control_char (state &state_)
{
    // Skip over 'c'
    state_.increment ();

    CharT ch_ = 0;

    if (state_.next (ch_))
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex following \\c.");
    }

    if (ch_ >= 'a' && ch_ <= 'z')
    {
        return static_cast<CharT> (ch_ - ('a' - 1));
    }

    if (ch_ >= 'A' && ch_ <= 'Z')
    {
        return static_cast<CharT> (ch_ - ('A' - 1));
    }

    if (ch_ == '@')
    {
        // Apparently...
        return 0;
    }

    std::ostringstream ss_;

    ss_ << "Invalid control char at index " <<
        state_.index () - 1 << '.';
    throw runtime_error (ss_.str ().c_str ());
}
|
|
// Decodes a \x hexadecimal escape. On entry the current character of
// state_ is the 'x'.
//
// At least one hex digit must follow; further hex digits are consumed for
// as long as they appear (there is no upper limit on their number). The
// first non-hex character is left unconsumed. The accumulated value is
// converted to CharT with static_cast.
//
// Throws runtime_error if the regex ends after \x or the next character is
// not a hex digit.
static CharT decode_hex (state &state_)
{
    // Skip over 'x'
    state_.increment ();

    CharT ch_ = 0;

    if (state_.next (ch_))
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex following \\x.");
    }

    const bool first_is_hex_ = (ch_ >= '0' && ch_ <= '9') ||
        (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F');

    if (!first_is_hex_)
    {
        std::ostringstream ss_;

        ss_ << "Illegal char following \\x at index " <<
            state_.index () - 1 << '.';
        throw runtime_error (ss_.str ().c_str ());
    }

    std::size_t value_ = 0;

    for (;;)
    {
        // ch_ is known to be a hex digit at this point.
        std::size_t digit_ = 0;

        if (ch_ >= '0' && ch_ <= '9')
        {
            digit_ = ch_ - '0';
        }
        else if (ch_ >= 'a' && ch_ <= 'f')
        {
            digit_ = 10 + (ch_ - 'a');
        }
        else
        {
            digit_ = 10 + (ch_ - 'A');
        }

        value_ = value_ * 16 + digit_;

        if (state_.eos ()) break;

        ch_ = *state_._curr;

        // Don't consume invalid chars!
        if (!((ch_ >= '0' && ch_ <= '9') ||
            (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F')))
        {
            break;
        }

        state_.increment ();
    }

    return static_cast<CharT> (value_);
}
|
|
static void charset_range (const bool chset_, state &state_, bool &eos_,
|
CharT &ch_, const CharT prev_, string &chars_)
|
{
|
if (chset_)
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Charset cannot form start of range preceding "
|
"index " << state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
eos_ = state_.next (ch_);
|
|
if (eos_)
|
{
|
// Pointless returning index if at end of string
|
throw runtime_error ("Unexpected end of regex "
|
"following '-'.");
|
}
|
|
CharT curr_ = 0;
|
|
if (ch_ == '\\')
|
{
|
std::size_t str_len_ = 0;
|
|
if (escape_sequence (state_, curr_, str_len_))
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Charset cannot form end of range preceding index "
|
<< state_.index () << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
}
|
/*
|
else if (ch_ == '[' && !state_.eos () && *state_._curr == ':')
|
{
|
std::ostringstream ss_;
|
|
ss_ << "POSIX char class cannot form end of range at "
|
"index " << state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
*/
|
else
|
{
|
curr_ = ch_;
|
}
|
|
eos_ = state_.next (ch_);
|
|
// Covers preceding if and else
|
if (eos_)
|
{
|
// Pointless returning index if at end of string
|
throw runtime_error ("Unexpected end of regex "
|
"(missing ']').");
|
}
|
|
std::size_t start_ = static_cast<typename Traits::index_type> (prev_);
|
std::size_t end_ = static_cast<typename Traits::index_type> (curr_);
|
|
// Semanic check
|
if (end_ < start_)
|
{
|
std::ostringstream ss_;
|
|
ss_ << "Invalid range in charset preceding index " <<
|
state_.index () - 1 << '.';
|
throw runtime_error (ss_.str ().c_str ());
|
}
|
|
chars_.reserve (chars_.size () + (end_ + 1 - start_));
|
|
for (; start_ <= end_; ++start_)
|
{
|
CharT ch_ = static_cast<CharT> (start_);
|
|
if ((state_._flags & icase) &&
|
(std::isupper (ch_, state_._locale) ||
|
std::islower (ch_, state_._locale)))
|
{
|
CharT upper_ = std::toupper (ch_, state_._locale);
|
CharT lower_ = std::tolower (ch_, state_._locale);
|
|
chars_ += (upper_);
|
chars_ += (lower_);
|
}
|
else
|
{
|
chars_ += (ch_);
|
}
|
}
|
}
|
};
|
}
|
}
|
}
|
|
#endif
|