///////////////////////////////////////////////////////////////////////////////
// detail/dynamic/parser_traits.hpp
//
//  Copyright 2008 Eric Niebler. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8 #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
9 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
11 // MS compatible compilers support #pragma once
12 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
18 #include <boost/assert.hpp>
19 #include <boost/throw_exception.hpp>
20 #include <boost/xpressive/regex_error.hpp>
21 #include <boost/xpressive/regex_traits.hpp>
22 #include <boost/xpressive/detail/detail_fwd.hpp>
23 #include <boost/xpressive/detail/dynamic/matchable.hpp>
24 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
25 #include <boost/xpressive/detail/utility/literals.hpp>
26 #include <boost/xpressive/detail/utility/algorithm.hpp>
28 namespace boost { namespace xpressive
31 ///////////////////////////////////////////////////////////////////////////////
33 // this works for char and wchar_t. it must be specialized for anything else.
35 template<typename RegexTraits>
36 struct compiler_traits
38 typedef RegexTraits regex_traits;
39 typedef typename regex_traits::char_type char_type;
40 typedef typename regex_traits::string_type string_type;
41 typedef typename regex_traits::locale_type locale_type;
43 ///////////////////////////////////////////////////////////////////////////////
45 explicit compiler_traits(RegexTraits const &traits = RegexTraits())
47 , flags_(regex_constants::ECMAScript)
48 , space_(lookup_classname(traits_, "space"))
49 , alnum_(lookup_classname(traits_, "alnum"))
53 ///////////////////////////////////////////////////////////////////////////////
55 regex_constants::syntax_option_type flags() const
60 ///////////////////////////////////////////////////////////////////////////////
62 void flags(regex_constants::syntax_option_type flags)
67 ///////////////////////////////////////////////////////////////////////////////
69 regex_traits &traits()
74 regex_traits const &traits() const
79 ///////////////////////////////////////////////////////////////////////////////
81 locale_type imbue(locale_type const &loc)
83 locale_type oldloc = this->traits().imbue(loc);
84 this->space_ = lookup_classname(this->traits(), "space");
85 this->alnum_ = lookup_classname(this->traits(), "alnum");
89 ///////////////////////////////////////////////////////////////////////////////
91 locale_type getloc() const
93 return this->traits().getloc();
96 ///////////////////////////////////////////////////////////////////////////////
98 // get a token and advance the iterator
99 template<typename FwdIter>
100 regex_constants::compiler_token_type get_token(FwdIter &begin, FwdIter end)
102 using namespace regex_constants;
103 if(this->eat_ws_(begin, end) == end)
105 return regex_constants::token_end_of_pattern;
110 case BOOST_XPR_CHAR_(char_type, '\\'): return this->get_escape_token(++begin, end);
111 case BOOST_XPR_CHAR_(char_type, '.'): ++begin; return token_any;
112 case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_assert_begin_line;
113 case BOOST_XPR_CHAR_(char_type, '$'): ++begin; return token_assert_end_line;
114 case BOOST_XPR_CHAR_(char_type, '('): ++begin; return token_group_begin;
115 case BOOST_XPR_CHAR_(char_type, ')'): ++begin; return token_group_end;
116 case BOOST_XPR_CHAR_(char_type, '|'): ++begin; return token_alternate;
117 case BOOST_XPR_CHAR_(char_type, '['): ++begin; return token_charset_begin;
119 case BOOST_XPR_CHAR_(char_type, '*'):
120 case BOOST_XPR_CHAR_(char_type, '+'):
121 case BOOST_XPR_CHAR_(char_type, '?'):
122 return token_invalid_quantifier;
124 case BOOST_XPR_CHAR_(char_type, ']'):
125 case BOOST_XPR_CHAR_(char_type, '{'):
127 return token_literal;
131 ///////////////////////////////////////////////////////////////////////////////
133 template<typename FwdIter>
134 bool get_quant_spec(FwdIter &begin, FwdIter end, detail::quant_spec &spec)
136 using namespace regex_constants;
139 if(this->eat_ws_(begin, end) == end)
146 case BOOST_XPR_CHAR_(char_type, '*'):
148 spec.max_ = (std::numeric_limits<unsigned int>::max)();
151 case BOOST_XPR_CHAR_(char_type, '+'):
153 spec.max_ = (std::numeric_limits<unsigned int>::max)();
156 case BOOST_XPR_CHAR_(char_type, '?'):
161 case BOOST_XPR_CHAR_(char_type, '{'):
162 old_begin = this->eat_ws_(++begin, end);
163 spec.min_ = spec.max_ = detail::toi(begin, end, this->traits());
166 begin != old_begin && begin != end, error_brace, "invalid quantifier"
169 if(*begin == BOOST_XPR_CHAR_(char_type, ','))
171 old_begin = this->eat_ws_(++begin, end);
172 spec.max_ = detail::toi(begin, end, this->traits());
175 begin != end && BOOST_XPR_CHAR_(char_type, '}') == *begin
176 , error_brace, "invalid quantifier"
179 if(begin == old_begin)
181 spec.max_ = (std::numeric_limits<unsigned int>::max)();
187 spec.min_ <= spec.max_, error_badbrace, "invalid quantification range"
195 BOOST_XPR_CHAR_(char_type, '}') == *begin, error_brace, "invalid quantifier"
205 if(this->eat_ws_(++begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin)
208 spec.greedy_ = false;
214 ///////////////////////////////////////////////////////////////////////////
216 template<typename FwdIter>
217 regex_constants::compiler_token_type get_group_type(FwdIter &begin, FwdIter end, string_type &name)
219 using namespace regex_constants;
220 if(this->eat_ws_(begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin)
222 this->eat_ws_(++begin, end);
223 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
227 case BOOST_XPR_CHAR_(char_type, ':'): ++begin; return token_no_mark;
228 case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_independent_sub_expression;
229 case BOOST_XPR_CHAR_(char_type, '#'): ++begin; return token_comment;
230 case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookahead;
231 case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookahead;
232 case BOOST_XPR_CHAR_(char_type, 'R'): ++begin; return token_recurse;
233 case BOOST_XPR_CHAR_(char_type, '$'):
234 this->get_name_(++begin, end, name);
235 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
236 if(BOOST_XPR_CHAR_(char_type, '=') == *begin)
239 return token_rule_assign;
241 return token_rule_ref;
243 case BOOST_XPR_CHAR_(char_type, '<'):
244 this->eat_ws_(++begin, end);
245 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
248 case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookbehind;
249 case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookbehind;
251 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
254 case BOOST_XPR_CHAR_(char_type, 'P'):
255 this->eat_ws_(++begin, end);
256 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
259 case BOOST_XPR_CHAR_(char_type, '<'):
260 this->get_name_(++begin, end, name);
261 BOOST_XPR_ENSURE_(begin != end && BOOST_XPR_CHAR_(char_type, '>') == *begin++, error_paren, "incomplete extension");
262 return token_named_mark;
263 case BOOST_XPR_CHAR_(char_type, '='):
264 this->get_name_(++begin, end, name);
265 BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
266 return token_named_mark_ref;
268 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
271 case BOOST_XPR_CHAR_(char_type, 'i'):
272 case BOOST_XPR_CHAR_(char_type, 'm'):
273 case BOOST_XPR_CHAR_(char_type, 's'):
274 case BOOST_XPR_CHAR_(char_type, 'x'):
275 case BOOST_XPR_CHAR_(char_type, '-'):
276 return this->parse_mods_(begin, end);
279 BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
283 return token_literal;
286 //////////////////////////////////////////////////////////////////////////
288 // NOTE: white-space is *never* ignored in a charset.
289 template<typename FwdIter>
290 regex_constants::compiler_token_type get_charset_token(FwdIter &begin, FwdIter end)
292 using namespace regex_constants;
293 BOOST_ASSERT(begin != end);
296 case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_charset_invert;
297 case BOOST_XPR_CHAR_(char_type, '-'): ++begin; return token_charset_hyphen;
298 case BOOST_XPR_CHAR_(char_type, ']'): ++begin; return token_charset_end;
299 case BOOST_XPR_CHAR_(char_type, '['):
301 FwdIter next = begin; ++next;
305 *next != BOOST_XPR_CHAR_(char_type, '=')
307 , "equivalence classes are not yet supported"
311 *next != BOOST_XPR_CHAR_(char_type, '.')
313 , "collation sequences are not yet supported"
316 if(*next == BOOST_XPR_CHAR_(char_type, ':'))
319 return token_posix_charset_begin;
324 case BOOST_XPR_CHAR_(char_type, ':'):
326 FwdIter next = begin; ++next;
327 if(next != end && *next == BOOST_XPR_CHAR_(char_type, ']'))
330 return token_posix_charset_end;
334 case BOOST_XPR_CHAR_(char_type, '\\'):
339 case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_charset_backspace;
346 return token_literal;
349 //////////////////////////////////////////////////////////////////////////
351 template<typename FwdIter>
352 regex_constants::compiler_token_type get_escape_token(FwdIter &begin, FwdIter end)
354 using namespace regex_constants;
359 //case BOOST_XPR_CHAR_(char_type, 'a'): ++begin; return token_escape_bell;
360 //case BOOST_XPR_CHAR_(char_type, 'c'): ++begin; return token_escape_control;
361 //case BOOST_XPR_CHAR_(char_type, 'e'): ++begin; return token_escape_escape;
362 //case BOOST_XPR_CHAR_(char_type, 'f'): ++begin; return token_escape_formfeed;
363 //case BOOST_XPR_CHAR_(char_type, 'n'): ++begin; return token_escape_newline;
364 //case BOOST_XPR_CHAR_(char_type, 't'): ++begin; return token_escape_horizontal_tab;
365 //case BOOST_XPR_CHAR_(char_type, 'v'): ++begin; return token_escape_vertical_tab;
366 case BOOST_XPR_CHAR_(char_type, 'A'): ++begin; return token_assert_begin_sequence;
367 case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_assert_word_boundary;
368 case BOOST_XPR_CHAR_(char_type, 'B'): ++begin; return token_assert_not_word_boundary;
369 case BOOST_XPR_CHAR_(char_type, 'E'): ++begin; return token_quote_meta_end;
370 case BOOST_XPR_CHAR_(char_type, 'Q'): ++begin; return token_quote_meta_begin;
371 case BOOST_XPR_CHAR_(char_type, 'Z'): ++begin; return token_assert_end_sequence;
372 // Non-standard extension to ECMAScript syntax
373 case BOOST_XPR_CHAR_(char_type, '<'): ++begin; return token_assert_word_begin;
374 case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_assert_word_end;
375 default:; // fall-through
384 //////////////////////////////////////////////////////////////////////////
386 template<typename FwdIter>
387 regex_constants::compiler_token_type parse_mods_(FwdIter &begin, FwdIter end)
389 using namespace regex_constants;
393 case BOOST_XPR_CHAR_(char_type, 'i'): this->flag_(set, icase_); break;
394 case BOOST_XPR_CHAR_(char_type, 'm'): this->flag_(!set, single_line); break;
395 case BOOST_XPR_CHAR_(char_type, 's'): this->flag_(!set, not_dot_newline); break;
396 case BOOST_XPR_CHAR_(char_type, 'x'): this->flag_(set, ignore_white_space); break;
397 case BOOST_XPR_CHAR_(char_type, ':'): ++begin; // fall-through
398 case BOOST_XPR_CHAR_(char_type, ')'): return token_no_mark;
399 case BOOST_XPR_CHAR_(char_type, '-'): if(false == (set = !set)) break; // else fall-through
400 default: BOOST_THROW_EXCEPTION(regex_error(error_paren, "unknown pattern modifier"));
402 while(BOOST_XPR_ENSURE_(++begin != end, error_paren, "incomplete extension"));
403 // this return is technically unreachable, but this must
404 // be here to work around a bug in gcc 4.0
405 return token_no_mark;
408 ///////////////////////////////////////////////////////////////////////////////
410 void flag_(bool set, regex_constants::syntax_option_type flag)
412 this->flags_ = set ? (this->flags_ | flag) : (this->flags_ & ~flag);
415 ///////////////////////////////////////////////////////////////////////////
417 bool is_space_(char_type ch) const
419 return 0 != this->space_ && this->traits().isctype(ch, this->space_);
422 ///////////////////////////////////////////////////////////////////////////
424 bool is_alnum_(char_type ch) const
426 return 0 != this->alnum_ && this->traits().isctype(ch, this->alnum_);
429 ///////////////////////////////////////////////////////////////////////////
431 template<typename FwdIter>
432 void get_name_(FwdIter &begin, FwdIter end, string_type &name)
434 this->eat_ws_(begin, end);
435 for(name.clear(); begin != end && this->is_alnum_(*begin); ++begin)
437 name.push_back(*begin);
439 this->eat_ws_(begin, end);
440 BOOST_XPR_ENSURE_(!name.empty(), regex_constants::error_paren, "incomplete extension");
443 ///////////////////////////////////////////////////////////////////////////////
445 template<typename FwdIter>
446 FwdIter &eat_ws_(FwdIter &begin, FwdIter end)
448 if(0 != (regex_constants::ignore_white_space & this->flags()))
450 while(end != begin && (BOOST_XPR_CHAR_(char_type, '#') == *begin || this->is_space_(*begin)))
452 if(BOOST_XPR_CHAR_(char_type, '#') == *begin++)
454 while(end != begin && BOOST_XPR_CHAR_(char_type, '\n') != *begin++) {}
458 for(; end != begin && this->is_space_(*begin); ++begin) {}
466 regex_traits traits_;
467 regex_constants::syntax_option_type flags_;
468 typename regex_traits::char_class_type space_;
469 typename regex_traits::char_class_type alnum_;
472 }} // namespace boost::xpressive