1 /*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
 *   FILE         basic_regex_parser.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Declares template class basic_regex_parser.
17 */
18
19 #ifndef BOOST_REGEX_V5_BASIC_REGEX_PARSER_HPP
20 #define BOOST_REGEX_V5_BASIC_REGEX_PARSER_HPP
21
22 namespace boost{
23 namespace BOOST_REGEX_DETAIL_NS{
24
25 #ifdef BOOST_REGEX_MSVC
26 #pragma warning(push)
27 #pragma warning(disable:4244 4459)
28 #if BOOST_REGEX_MSVC < 1910
29 #pragma warning(disable:4800)
30 #endif
31 #endif
32
// Overload chosen by umax() when std::intmax_t has fewer value bits than
// std::size_t: yields the largest std::intmax_t, or INT_MAX should
// numeric_limits not be specialized for std::intmax_t.
inline std::intmax_t umax(std::integral_constant<bool, false> const&)
{
   typedef std::numeric_limits<std::intmax_t> limits_type;
   if(limits_type::is_specialized)
      return (limits_type::max)();
   // Get out clause, just in case numeric_limits is unspecialized:
   return INT_MAX;
}
// Overload chosen by umax() when std::intmax_t has at least as many value
// bits as std::size_t: yields the largest std::size_t value, converted to
// std::intmax_t.
inline std::intmax_t umax(std::integral_constant<bool, true> const&)
{
   const std::size_t largest_size = (std::numeric_limits<std::size_t>::max)();
   return static_cast<std::intmax_t>(largest_size);
}
42
umax()43 inline std::intmax_t umax()
44 {
45 return umax(std::integral_constant<bool, std::numeric_limits<std::intmax_t>::digits >= std::numeric_limits<std::size_t>::digits>());
46 }
47
//
// basic_regex_parser: walks the character range handed to parse() and emits
// the corresponding states through the interface inherited from
// basic_regex_creator (append_state, append_literal, append_set, ...).
// Errors are reported through the fail() overloads.
//
template <class charT, class traits>
class basic_regex_parser : public basic_regex_creator<charT, traits>
{
public:
   // Forwards data to the basic_regex_creator base and zero-initializes the
   // parser's own bookkeeping.
   basic_regex_parser(regex_data<charT, traits>* data);
   // Entry point: parses [p1, p2) with the syntax selected by flags.
   void parse(const charT* p1, const charT* p2, unsigned flags);
   // Reports error_code at position with the traits class' default message.
   void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
   // Reports error_code with a custom message; [start_pos, position) is the
   // text quoted before the error marker.  Passing start_pos == position
   // asks for a default window of up to 10 characters before position.
   void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
   // Reports error_code with a custom message and the default context window.
   void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
   {
      fail(error_code, position, message, position);
   }

   // Repeatedly invokes m_parser_proc until it returns false or the input
   // is exhausted.
   bool parse_all();
   // The three possible values of m_parser_proc (selected in parse()):
   bool parse_basic();
   bool parse_extended();
   bool parse_literal();
   // Handlers for individual syntax elements; each returns false to stop
   // the parse_all() loop:
   bool parse_open_paren();
   bool parse_basic_escape();
   bool parse_extended_escape();
   bool parse_match_any();
   bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
   bool parse_repeat_range(bool isbasic);
   bool parse_alt();
   bool parse_set();
   bool parse_backref();
   void parse_set_literal(basic_char_set<charT, traits>& char_set);
   bool parse_inner_set(basic_char_set<charT, traits>& char_set);
   bool parse_QE();
   bool parse_perl_extension();
   bool parse_perl_verb();
   bool match_verb(const char*);
   bool add_emacs_code(bool negate);
   bool unwind_alts(std::ptrdiff_t last_paren_start);
   digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
   charT unescape_character();
   regex_constants::syntax_option_type parse_options();

private:
   typedef bool (basic_regex_parser::*parser_proc_type)();
   typedef typename traits::string_type string_type;
   typedef typename traits::char_class_type char_class_type;
   parser_proc_type           m_parser_proc;    // the main parser to use
   const charT*               m_base;           // the start of the string being parsed
   const charT*               m_end;            // the end of the string being parsed
   const charT*               m_position;       // our current parser position
   unsigned                   m_mark_count;     // how many sub-expressions we have
   int                        m_mark_reset;     // used to indicate that we're inside a (?|...) block.
   unsigned                   m_max_mark;       // largest mark count seen inside a (?|...) block.
   std::ptrdiff_t             m_paren_start;    // where the last seen ')' began (where repeats are inserted).
   std::ptrdiff_t             m_alt_insert_point; // where to insert the next alternative
   bool                       m_has_case_change; // true if somewhere in the current block the case has changed
   unsigned                   m_recursion_count; // How many times we've called parse_all.
   unsigned                   m_max_backref;     // Largest index of any backref.
#if defined(BOOST_REGEX_MSVC) && defined(_M_IX86)
   // This is an ugly warning suppression workaround (for warnings *inside* std::vector
   // that can not otherwise be suppressed)...
   static_assert(sizeof(long) >= sizeof(void*), "Long isn't long enough!");
   std::vector<long>           m_alt_jumps;      // list of alternative in the current scope.
#else
   std::vector<std::ptrdiff_t> m_alt_jumps;      // list of alternative in the current scope.
#endif

   // Declared private to make the parser non-copyable (no definitions are
   // visible in this part of the file):
   basic_regex_parser& operator=(const basic_regex_parser&);
   basic_regex_parser(const basic_regex_parser&);
};
114
115 template <class charT, class traits>
basic_regex_parser(regex_data<charT,traits> * data)116 basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
117 : basic_regex_creator<charT, traits>(data), m_parser_proc(), m_base(0), m_end(0), m_position(0),
118 m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0), m_max_backref(0)
119 {
120 }
121
122 template <class charT, class traits>
parse(const charT * p1,const charT * p2,unsigned l_flags)123 void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
124 {
125 // pass l_flags on to base class:
126 this->init(l_flags);
127 // set up pointers:
128 m_position = m_base = p1;
129 m_end = p2;
130 // empty strings are errors:
131 if((p1 == p2) &&
132 (
133 ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
134 || (l_flags & regbase::no_empty_expressions)
135 )
136 )
137 {
138 fail(regex_constants::error_empty, 0);
139 return;
140 }
141 // select which parser to use:
142 switch(l_flags & regbase::main_option_type)
143 {
144 case regbase::perl_syntax_group:
145 {
146 m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
147 //
148 // Add a leading paren with index zero to give recursions a target:
149 //
150 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
151 br->index = 0;
152 br->icase = this->flags() & regbase::icase;
153 break;
154 }
155 case regbase::basic_syntax_group:
156 m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
157 break;
158 case regbase::literal:
159 m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
160 break;
161 default:
162 // Oops, someone has managed to set more than one of the main option flags,
163 // so this must be an error:
164 fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
165 return;
166 }
167
168 // parse all our characters:
169 bool result = parse_all();
170 //
171 // Unwind our alternatives:
172 //
173 unwind_alts(-1);
174 // reset l_flags as a global scope (?imsx) may have altered them:
175 this->flags(l_flags);
176 // if we haven't gobbled up all the characters then we must
177 // have had an unexpected ')' :
178 if(!result)
179 {
180 fail(regex_constants::error_paren, std::distance(m_base, m_position), "Found a closing ) with no corresponding opening parenthesis.");
181 return;
182 }
183 // if an error has been set then give up now:
184 if(this->m_pdata->m_status)
185 return;
186 // fill in our sub-expression count:
187 this->m_pdata->m_mark_count = 1u + (std::size_t)m_mark_count;
188 //
189 // Check we don't have backreferences to sub-expressions which don't exist:
190 //
191 if (m_max_backref > m_mark_count)
192 {
193 fail(regex_constants::error_backref, std::distance(m_base, m_position), "Found a backreference to a non-existant sub-expression.");
194 }
195 this->finalize(p1, p2);
196 }
197
198 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position)199 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
200 {
201 // get the error message:
202 std::string message = this->m_pdata->m_ptraits->error_string(error_code);
203 fail(error_code, position, message);
204 }
205
206 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position,std::string message,std::ptrdiff_t start_pos)207 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
208 {
209 if(0 == this->m_pdata->m_status) // update the error code if not already set
210 this->m_pdata->m_status = error_code;
211 m_position = m_end; // don't bother parsing anything else
212
213 //
214 // Augment error message with the regular expression text:
215 //
216 if(start_pos == position)
217 start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
218 std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
219 if(error_code != regex_constants::error_empty)
220 {
221 if((start_pos != 0) || (end_pos != (m_end - m_base)))
222 message += " The error occurred while parsing the regular expression fragment: '";
223 else
224 message += " The error occurred while parsing the regular expression: '";
225 if(start_pos != end_pos)
226 {
227 message += std::string(m_base + start_pos, m_base + position);
228 message += ">>>HERE>>>";
229 message += std::string(m_base + position, m_base + end_pos);
230 }
231 message += "'.";
232 }
233
234 #ifndef BOOST_NO_EXCEPTIONS
235 if(0 == (this->flags() & regex_constants::no_except))
236 {
237 boost::regex_error e(message, error_code, position);
238 e.raise();
239 }
240 #else
241 (void)position; // suppress warnings.
242 #endif
243 }
244
245 template <class charT, class traits>
parse_all()246 bool basic_regex_parser<charT, traits>::parse_all()
247 {
248 if (++m_recursion_count > 400)
249 {
250 // exceeded internal limits
251 fail(boost::regex_constants::error_complexity, m_position - m_base, "Exceeded nested brace limit.");
252 }
253 bool result = true;
254 while(result && (m_position != m_end))
255 {
256 result = (this->*m_parser_proc)();
257 }
258 --m_recursion_count;
259 return result;
260 }
261
262 #ifdef BOOST_REGEX_MSVC
263 #pragma warning(push)
264 #pragma warning(disable:4702)
265 #endif
266 template <class charT, class traits>
parse_basic()267 bool basic_regex_parser<charT, traits>::parse_basic()
268 {
269 switch(this->m_traits.syntax_type(*m_position))
270 {
271 case regex_constants::syntax_escape:
272 return parse_basic_escape();
273 case regex_constants::syntax_dot:
274 return parse_match_any();
275 case regex_constants::syntax_caret:
276 ++m_position;
277 this->append_state(syntax_element_start_line);
278 break;
279 case regex_constants::syntax_dollar:
280 ++m_position;
281 this->append_state(syntax_element_end_line);
282 break;
283 case regex_constants::syntax_star:
284 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
285 return parse_literal();
286 else
287 {
288 ++m_position;
289 return parse_repeat();
290 }
291 case regex_constants::syntax_plus:
292 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
293 return parse_literal();
294 else
295 {
296 ++m_position;
297 return parse_repeat(1);
298 }
299 case regex_constants::syntax_question:
300 if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
301 return parse_literal();
302 else
303 {
304 ++m_position;
305 return parse_repeat(0, 1);
306 }
307 case regex_constants::syntax_open_set:
308 return parse_set();
309 case regex_constants::syntax_newline:
310 if(this->flags() & regbase::newline_alt)
311 return parse_alt();
312 else
313 return parse_literal();
314 default:
315 return parse_literal();
316 }
317 return true;
318 }
319
320 #ifdef BOOST_REGEX_MSVC
321 # pragma warning(push)
322 #if BOOST_REGEX_MSVC >= 1800
323 #pragma warning(disable:26812)
324 #endif
325 #endif
326 template <class charT, class traits>
parse_extended()327 bool basic_regex_parser<charT, traits>::parse_extended()
328 {
329 bool result = true;
330 switch(this->m_traits.syntax_type(*m_position))
331 {
332 case regex_constants::syntax_open_mark:
333 return parse_open_paren();
334 case regex_constants::syntax_close_mark:
335 return false;
336 case regex_constants::syntax_escape:
337 return parse_extended_escape();
338 case regex_constants::syntax_dot:
339 return parse_match_any();
340 case regex_constants::syntax_caret:
341 ++m_position;
342 this->append_state(
343 (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
344 break;
345 case regex_constants::syntax_dollar:
346 ++m_position;
347 this->append_state(
348 (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
349 break;
350 case regex_constants::syntax_star:
351 if(m_position == this->m_base)
352 {
353 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
354 return false;
355 }
356 ++m_position;
357 return parse_repeat();
358 case regex_constants::syntax_question:
359 if(m_position == this->m_base)
360 {
361 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
362 return false;
363 }
364 ++m_position;
365 return parse_repeat(0,1);
366 case regex_constants::syntax_plus:
367 if(m_position == this->m_base)
368 {
369 fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
370 return false;
371 }
372 ++m_position;
373 return parse_repeat(1);
374 case regex_constants::syntax_open_brace:
375 ++m_position;
376 return parse_repeat_range(false);
377 case regex_constants::syntax_close_brace:
378 if((this->flags() & regbase::no_perl_ex) == regbase::no_perl_ex)
379 {
380 fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
381 return false;
382 }
383 result = parse_literal();
384 break;
385 case regex_constants::syntax_or:
386 return parse_alt();
387 case regex_constants::syntax_open_set:
388 return parse_set();
389 case regex_constants::syntax_newline:
390 if(this->flags() & regbase::newline_alt)
391 return parse_alt();
392 else
393 return parse_literal();
394 case regex_constants::syntax_hash:
395 //
396 // If we have a mod_x flag set, then skip until
397 // we get to a newline character:
398 //
399 if((this->flags()
400 & (regbase::no_perl_ex|regbase::mod_x))
401 == regbase::mod_x)
402 {
403 while((m_position != m_end) && !is_separator(*m_position++)){}
404 return true;
405 }
406 BOOST_REGEX_FALLTHROUGH;
407 default:
408 result = parse_literal();
409 break;
410 }
411 return result;
412 }
413 #ifdef BOOST_REGEX_MSVC
414 # pragma warning(pop)
415 #endif
416 #ifdef BOOST_REGEX_MSVC
417 #pragma warning(pop)
418 #endif
419
420 template <class charT, class traits>
parse_literal()421 bool basic_regex_parser<charT, traits>::parse_literal()
422 {
423 // append this as a literal provided it's not a space character
424 // or the perl option regbase::mod_x is not set:
425 if(
426 ((this->flags()
427 & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
428 != regbase::mod_x)
429 || !this->m_traits.isctype(*m_position, this->m_mask_space))
430 this->append_literal(*m_position);
431 ++m_position;
432 return true;
433 }
434
//
// Parses a parenthesised group.  On entry m_position refers to the opening
// parenthesis (it is skipped first).
//
// (?...) and (*...) forms are handed to parse_perl_extension() and
// parse_perl_verb() when the flags allow them.  Otherwise a startmark state
// is appended, the group's contents are parsed by a nested parse_all(), and
// a matching endmark state is appended once the closing parenthesis has
// been consumed.
//
// Returns true when a complete group was parsed; false after an error has
// been reported, when unwind_alts() reports failure, or when the nested
// parse stopped on something other than a closing parenthesis.
//
template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_open_paren()
{
   //
   // skip the '(' and error check:
   //
   if(++m_position == m_end)
   {
      fail(regex_constants::error_paren, m_position - m_base);
      return false;
   }
   //
   // begin by checking for a perl-style (?...) extension:
   // (allowed for Perl syntax without no_perl_ex, and for basic syntax
   // with the emacs extensions)
   //
   if(
         ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
         || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
     )
   {
      if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
         return parse_perl_extension();
      if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_star)
         return parse_perl_verb();
   }
   //
   // update our mark count, and append the required state:
   // (markid stays 0 when regbase::nosubs is set)
   //
   unsigned markid = 0;
   if(0 == (this->flags() & regbase::nosubs))
   {
      markid = ++m_mark_count;
      // record where the '(' was; the end offset is filled in below once
      // the matching ')' has been found:
      if(this->flags() & regbase::save_subexpression_location)
         this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
   }
   re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
   pb->index = markid;
   pb->icase = this->flags() & regbase::icase;
   // remember the offset of the startmark: it is passed to unwind_alts()
   // and later stored in m_paren_start:
   std::ptrdiff_t last_paren_start = this->getoffset(pb);
   // back up insertion point for alternations, and set new point:
   std::ptrdiff_t last_alt_point = m_alt_insert_point;
   this->m_pdata->m_data.align();
   m_alt_insert_point = this->m_pdata->m_data.size();
   //
   // back up the current flags in case we have a nested (?imsx) group:
   //
   regex_constants::syntax_option_type opts = this->flags();
   bool old_case_change = m_has_case_change;
   m_has_case_change = false; // no changes to this scope as yet...
   //
   // Back up branch reset data in case we have a nested (?|...)
   //
   int mark_reset = m_mark_reset;
   m_mark_reset = -1;
   //
   // now recursively add more states, this will terminate when we get to a
   // matching ')' :
   // (the return value is not used: the position is inspected below instead)
   //
   parse_all();
   //
   // Unwind pushed alternatives:
   //
   if(0 == unwind_alts(last_paren_start))
      return false;
   //
   // restore flags:
   //
   if(m_has_case_change)
   {
      // the case has changed in one or more of the alternatives
      // within the scoped (...) block: we have to add a state
      // to reset the case sensitivity:
      static_cast<re_case*>(
         this->append_state(syntax_element_toggle_case, sizeof(re_case))
         )->icase = opts & regbase::icase;
   }
   this->flags(opts);
   m_has_case_change = old_case_change;
   //
   // restore branch reset:
   //
   m_mark_reset = mark_reset;
   //
   // we either have a ')' or we have run out of characters prematurely:
   //
   if(m_position == m_end)
   {
      this->fail(regex_constants::error_paren, std::distance(m_base, m_end));
      return false;
   }
   if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
      return false;
   // complete the location record started above with the offset of the ')':
   if(markid && (this->flags() & regbase::save_subexpression_location))
      this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
   ++m_position;
   //
   // append closing parenthesis state:
   //
   pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
   pb->index = markid;
   pb->icase = this->flags() & regbase::icase;
   // parse_repeat() inserts a repeat at m_paren_start when the last state
   // is an endmark:
   this->m_paren_start = last_paren_start;
   //
   // restore the alternate insertion point:
   //
   this->m_alt_insert_point = last_alt_point;

   return true;
}
543
544 template <class charT, class traits>
parse_basic_escape()545 bool basic_regex_parser<charT, traits>::parse_basic_escape()
546 {
547 if(++m_position == m_end)
548 {
549 fail(regex_constants::error_paren, m_position - m_base);
550 return false;
551 }
552 bool result = true;
553 switch(this->m_traits.escape_syntax_type(*m_position))
554 {
555 case regex_constants::syntax_open_mark:
556 return parse_open_paren();
557 case regex_constants::syntax_close_mark:
558 return false;
559 case regex_constants::syntax_plus:
560 if(this->flags() & regex_constants::bk_plus_qm)
561 {
562 ++m_position;
563 return parse_repeat(1);
564 }
565 else
566 return parse_literal();
567 case regex_constants::syntax_question:
568 if(this->flags() & regex_constants::bk_plus_qm)
569 {
570 ++m_position;
571 return parse_repeat(0, 1);
572 }
573 else
574 return parse_literal();
575 case regex_constants::syntax_open_brace:
576 if(this->flags() & regbase::no_intervals)
577 return parse_literal();
578 ++m_position;
579 return parse_repeat_range(true);
580 case regex_constants::syntax_close_brace:
581 if(this->flags() & regbase::no_intervals)
582 return parse_literal();
583 fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
584 return false;
585 case regex_constants::syntax_or:
586 if(this->flags() & regbase::bk_vbar)
587 return parse_alt();
588 else
589 result = parse_literal();
590 break;
591 case regex_constants::syntax_digit:
592 return parse_backref();
593 case regex_constants::escape_type_start_buffer:
594 if(this->flags() & regbase::emacs_ex)
595 {
596 ++m_position;
597 this->append_state(syntax_element_buffer_start);
598 }
599 else
600 result = parse_literal();
601 break;
602 case regex_constants::escape_type_end_buffer:
603 if(this->flags() & regbase::emacs_ex)
604 {
605 ++m_position;
606 this->append_state(syntax_element_buffer_end);
607 }
608 else
609 result = parse_literal();
610 break;
611 case regex_constants::escape_type_word_assert:
612 if(this->flags() & regbase::emacs_ex)
613 {
614 ++m_position;
615 this->append_state(syntax_element_word_boundary);
616 }
617 else
618 result = parse_literal();
619 break;
620 case regex_constants::escape_type_not_word_assert:
621 if(this->flags() & regbase::emacs_ex)
622 {
623 ++m_position;
624 this->append_state(syntax_element_within_word);
625 }
626 else
627 result = parse_literal();
628 break;
629 case regex_constants::escape_type_left_word:
630 if(this->flags() & regbase::emacs_ex)
631 {
632 ++m_position;
633 this->append_state(syntax_element_word_start);
634 }
635 else
636 result = parse_literal();
637 break;
638 case regex_constants::escape_type_right_word:
639 if(this->flags() & regbase::emacs_ex)
640 {
641 ++m_position;
642 this->append_state(syntax_element_word_end);
643 }
644 else
645 result = parse_literal();
646 break;
647 default:
648 if(this->flags() & regbase::emacs_ex)
649 {
650 bool negate = true;
651 switch(*m_position)
652 {
653 case 'w':
654 negate = false;
655 BOOST_REGEX_FALLTHROUGH;
656 case 'W':
657 {
658 basic_char_set<charT, traits> char_set;
659 if(negate)
660 char_set.negate();
661 char_set.add_class(this->m_word_mask);
662 if(0 == this->append_set(char_set))
663 {
664 fail(regex_constants::error_ctype, m_position - m_base);
665 return false;
666 }
667 ++m_position;
668 return true;
669 }
670 case 's':
671 negate = false;
672 BOOST_REGEX_FALLTHROUGH;
673 case 'S':
674 return add_emacs_code(negate);
675 case 'c':
676 case 'C':
677 // not supported yet:
678 fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
679 return false;
680 default:
681 break;
682 }
683 }
684 result = parse_literal();
685 break;
686 }
687 return result;
688 }
689
//
// Parses an escape sequence in the Perl / extended syntax group (called
// from parse_extended() when the escape character is seen).  On entry
// m_position refers to the escape character, which is skipped first.
//
// The escaped character's escape_syntax_type selects the handling below.
// Several Perl-only escapes test
//    0 == (flags() & (regbase::main_option_type | regbase::no_perl_ex))
// i.e. Perl syntax with Perl extensions enabled; when that does not hold
// they jump to the escape_type_class_jump label and are treated as a
// possible single-character class name, or failing that as a literal.
//
// Returns false after reporting an error through fail(); the result of the
// delegated call for back-references, \Q...\E, the escape_type_C case and
// the line-ending escape; true otherwise.
//
template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_extended_escape()
{
   ++m_position;
   if(m_position == m_end)
   {
      fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
      return false;
   }
   bool negate = false; // in case this is a character class escape: \w \d etc
   switch(this->m_traits.escape_syntax_type(*m_position))
   {
   case regex_constants::escape_type_not_class:
      negate = true;
      BOOST_REGEX_FALLTHROUGH;
   case regex_constants::escape_type_class:
      {
escape_type_class_jump:
         // Shared by the class escapes and, via goto, by the Perl-only
         // escapes below when they are not active: look the single escaped
         // character up as a class name.
         typedef typename traits::char_class_type m_type;
         m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
         if(m != 0)
         {
            basic_char_set<charT, traits> char_set;
            if(negate)
               char_set.negate();
            char_set.add_class(m);
            if(0 == this->append_set(char_set))
            {
               fail(regex_constants::error_ctype, m_position - m_base);
               return false;
            }
            ++m_position;
            return true;
         }
         //
         // not a class, just a regular unknown escape:
         //
         this->append_literal(unescape_character());
         break;
      }
   case regex_constants::syntax_digit:
      return parse_backref();
   //
   // Simple assertions: consume the escaped character and append the
   // corresponding state.
   //
   case regex_constants::escape_type_left_word:
      ++m_position;
      this->append_state(syntax_element_word_start);
      break;
   case regex_constants::escape_type_right_word:
      ++m_position;
      this->append_state(syntax_element_word_end);
      break;
   case regex_constants::escape_type_start_buffer:
      ++m_position;
      this->append_state(syntax_element_buffer_start);
      break;
   case regex_constants::escape_type_end_buffer:
      ++m_position;
      this->append_state(syntax_element_buffer_end);
      break;
   case regex_constants::escape_type_word_assert:
      ++m_position;
      this->append_state(syntax_element_word_boundary);
      break;
   case regex_constants::escape_type_not_word_assert:
      ++m_position;
      this->append_state(syntax_element_within_word);
      break;
   case regex_constants::escape_type_Z:
      ++m_position;
      this->append_state(syntax_element_soft_buffer_end);
      break;
   case regex_constants::escape_type_Q:
      return parse_QE();
   case regex_constants::escape_type_C:
      // handled exactly like an unescaped '.':
      return parse_match_any();
   case regex_constants::escape_type_X:
      ++m_position;
      this->append_state(syntax_element_combining);
      break;
   case regex_constants::escape_type_G:
      ++m_position;
      this->append_state(syntax_element_restart_continue);
      break;
   case regex_constants::escape_type_not_property:
      negate = true;
      BOOST_REGEX_FALLTHROUGH;
   case regex_constants::escape_type_property:
      {
         // Property escape: the class name is either a brace-enclosed
         // string or the single character following the escape.
         ++m_position;
         char_class_type m;
         if(m_position == m_end)
         {
            fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
            return false;
         }
         // maybe have \p{ddd}
         if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
         {
            const charT* base = m_position;
            // skip forward until we find enclosing brace:
            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
               ++m_position;
            if(m_position == m_end)
            {
               fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
               return false;
            }
            // look up the text between the braces, then step past the '}':
            m = this->m_traits.lookup_classname(++base, m_position++);
         }
         else
         {
            m = this->m_traits.lookup_classname(m_position, m_position+1);
            ++m_position;
         }
         if(m != 0)
         {
            basic_char_set<charT, traits> char_set;
            if(negate)
               char_set.negate();
            char_set.add_class(m);
            if(0 == this->append_set(char_set))
            {
               fail(regex_constants::error_ctype, m_position - m_base);
               return false;
            }
            return true;
         }
         fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
         return false;
      }
   case regex_constants::escape_type_reset_start_mark:
      if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
      {
         // A startmark state with the special index -5 is appended.
         // NOTE(review): presumably the matcher treats index -5 as
         // "reset the start of the match" - confirm against the matcher.
         re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
         pb->index = -5;
         pb->icase = this->flags() & regbase::icase;
         this->m_pdata->m_data.align();
         ++m_position;
         return true;
      }
      goto escape_type_class_jump;
   case regex_constants::escape_type_line_ending:
      if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
      {
         // Expand the escape by parsing the replacement expression
         // returned by get_escape_R_string(): the parser's input pointers
         // are temporarily redirected to that string and restored
         // afterwards, resuming just past the escaped character.
         const charT* e = get_escape_R_string<charT>();
         const charT* old_position = m_position;
         const charT* old_end = m_end;
         const charT* old_base = m_base;
         m_position = e;
         m_base = e;
         m_end = e + traits::length(e);
         bool r = parse_all();
         m_position = ++old_position;
         m_end = old_end;
         m_base = old_base;
         return r;
      }
      goto escape_type_class_jump;
   case regex_constants::escape_type_extended_backref:
      if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
      {
         // \g back-reference: an optionally delimited, optionally negative
         // number, or (when delimited) a capture name.
         bool have_brace = false;
         bool negative = false;
         static const char incomplete_message[] = "Incomplete \\g escape found.";
         if(++m_position == m_end)
         {
            fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
            return false;
         }
         // maybe have \g{ddd}
         regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
         regex_constants::syntax_type syn_end = 0;
         if((syn == regex_constants::syntax_open_brace)
            || (syn == regex_constants::escape_type_left_word)
            || (syn == regex_constants::escape_type_end_buffer))
         {
            if(++m_position == m_end)
            {
               fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
               return false;
            }
            have_brace = true;
            // work out which syntax type closes the delimiter just opened:
            switch(syn)
            {
            case regex_constants::syntax_open_brace:
               syn_end = regex_constants::syntax_close_brace;
               break;
            case regex_constants::escape_type_left_word:
               syn_end = regex_constants::escape_type_right_word;
               break;
            default:
               syn_end = regex_constants::escape_type_end_buffer;
               break;
            }
         }
         negative = (*m_position == static_cast<charT>('-'));
         if((negative) && (++m_position == m_end))
         {
            fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
            return false;
         }
         // pc is advanced by toi() past whatever it consumed; m_position is
         // only moved once the reference has been validated:
         const charT* pc = m_position;
         std::intmax_t i = this->m_traits.toi(pc, m_end, 10);
         if((i < 0) && syn_end)
         {
            // Check for a named capture, get the leftmost one if there is more than one:
            const charT* base = m_position;
            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
            {
               ++m_position;
            }
            i = hash_value_from_capture_name(base, m_position);
            pc = m_position;
         }
         // a negative reference counts backwards from the current mark count:
         if(negative)
            i = 1 + (static_cast<std::intmax_t>(m_mark_count) - i);
         // accept a positive numeric index below hash_value_mask, or a
         // name hash (>= hash_value_mask) that get_id() resolves:
         if(((i < hash_value_mask) && (i > 0)) || ((i >= hash_value_mask) && (this->m_pdata->get_id((int)i) > 0)))
         {
            m_position = pc;
            re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
            pb->index = (int)i;
            pb->icase = this->flags() & regbase::icase;
            // track the largest numeric reference; parse() compares it
            // against the final mark count:
            if ((i > m_max_backref) && (i < hash_value_mask))
               m_max_backref = i;
         }
         else
         {
            fail(regex_constants::error_backref, m_position - m_base);
            return false;
         }
         m_position = pc;
         if(have_brace)
         {
            // the closing delimiter must match the opening one:
            if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
            {
               fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
               return false;
            }
            ++m_position;
         }
         return true;
      }
      goto escape_type_class_jump;
   case regex_constants::escape_type_control_v:
      // with Perl extensions active this is tried as a class name first;
      // otherwise it is an ordinary escaped character:
      if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
         goto escape_type_class_jump;
      BOOST_REGEX_FALLTHROUGH;
   default:
      this->append_literal(unescape_character());
      break;
   }
   return true;
}
942
943 template <class charT, class traits>
parse_match_any()944 bool basic_regex_parser<charT, traits>::parse_match_any()
945 {
946 //
947 // we have a '.' that can match any character:
948 //
949 ++m_position;
950 static_cast<re_dot*>(
951 this->append_state(syntax_element_wild, sizeof(re_dot))
952 )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
953 ? BOOST_REGEX_DETAIL_NS::force_not_newline
954 : this->flags() & regbase::mod_s ?
955 BOOST_REGEX_DETAIL_NS::force_newline : BOOST_REGEX_DETAIL_NS::dont_care);
956 return true;
957 }
958
959 template <class charT, class traits>
parse_repeat(std::size_t low,std::size_t high)960 bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
961 {
962 bool greedy = true;
963 bool possessive = false;
964 std::size_t insert_point;
965 //
966 // when we get to here we may have a non-greedy ? mark still to come:
967 //
968 if((m_position != m_end)
969 && (
970 (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
971 || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
972 )
973 )
974 {
975 // OK we have a perl or emacs regex, check for a '?':
976 if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
977 {
978 // whitespace skip:
979 while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
980 ++m_position;
981 }
982 if((m_position != m_end) && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question))
983 {
984 greedy = false;
985 ++m_position;
986 }
987 // for perl regexes only check for possessive ++ repeats.
988 if((m_position != m_end)
989 && (0 == (this->flags() & regbase::main_option_type))
990 && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
991 {
992 possessive = true;
993 ++m_position;
994 }
995 }
996 if(0 == this->m_last_state)
997 {
998 fail(regex_constants::error_badrepeat, std::distance(m_base, m_position), "Nothing to repeat.");
999 return false;
1000 }
1001 if(this->m_last_state->type == syntax_element_endmark)
1002 {
1003 // insert a repeat before the '(' matching the last ')':
1004 insert_point = this->m_paren_start;
1005 }
1006 else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
1007 {
1008 // the last state was a literal with more than one character, split it in two:
1009 re_literal* lit = static_cast<re_literal*>(this->m_last_state);
1010 charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
1011 lit->length -= 1;
1012 // now append new state:
1013 lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
1014 lit->length = 1;
1015 (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
1016 insert_point = this->getoffset(this->m_last_state);
1017 }
1018 else
1019 {
1020 // repeat the last state whatever it was, need to add some error checking here:
1021 switch(this->m_last_state->type)
1022 {
1023 case syntax_element_start_line:
1024 case syntax_element_end_line:
1025 case syntax_element_word_boundary:
1026 case syntax_element_within_word:
1027 case syntax_element_word_start:
1028 case syntax_element_word_end:
1029 case syntax_element_buffer_start:
1030 case syntax_element_buffer_end:
1031 case syntax_element_alt:
1032 case syntax_element_soft_buffer_end:
1033 case syntax_element_restart_continue:
1034 case syntax_element_jump:
1035 case syntax_element_startmark:
1036 case syntax_element_backstep:
1037 case syntax_element_toggle_case:
1038 // can't legally repeat any of the above:
1039 fail(regex_constants::error_badrepeat, m_position - m_base);
1040 return false;
1041 default:
1042 // do nothing...
1043 break;
1044 }
1045 insert_point = this->getoffset(this->m_last_state);
1046 }
1047 //
1048 // OK we now know what to repeat, so insert the repeat around it:
1049 //
1050 re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
1051 rep->min = low;
1052 rep->max = high;
1053 rep->greedy = greedy;
1054 rep->leading = false;
1055 // store our repeater position for later:
1056 std::ptrdiff_t rep_off = this->getoffset(rep);
1057 // and append a back jump to the repeat:
1058 re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
1059 jmp->alt.i = rep_off - this->getoffset(jmp);
1060 this->m_pdata->m_data.align();
1061 // now fill in the alt jump for the repeat:
1062 rep = static_cast<re_repeat*>(this->getaddress(rep_off));
1063 rep->alt.i = this->m_pdata->m_data.size() - rep_off;
1064 //
1065 // If the repeat is possessive then bracket the repeat with a (?>...)
1066 // independent sub-expression construct:
1067 //
1068 if(possessive)
1069 {
1070 if(m_position != m_end)
1071 {
1072 //
1073 // Check for illegal following quantifier, we have to do this here, because
1074 // the extra states we insert below circumvents our usual error checking :-(
1075 //
1076 bool contin = false;
1077 do
1078 {
1079 if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
1080 {
1081 // whitespace skip:
1082 while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1083 ++m_position;
1084 }
1085 if (m_position != m_end)
1086 {
1087 switch (this->m_traits.syntax_type(*m_position))
1088 {
1089 case regex_constants::syntax_star:
1090 case regex_constants::syntax_plus:
1091 case regex_constants::syntax_question:
1092 case regex_constants::syntax_open_brace:
1093 fail(regex_constants::error_badrepeat, m_position - m_base);
1094 return false;
1095 case regex_constants::syntax_open_mark:
1096 // Do we have a comment? If so we need to skip it here...
1097 if ((m_position + 2 < m_end) && this->m_traits.syntax_type(*(m_position + 1)) == regex_constants::syntax_question
1098 && this->m_traits.syntax_type(*(m_position + 2)) == regex_constants::syntax_hash)
1099 {
1100 while ((m_position != m_end)
1101 && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark)) {
1102 }
1103 contin = true;
1104 }
1105 else
1106 contin = false;
1107 break;
1108 default:
1109 contin = false;
1110 }
1111 }
1112 else
1113 contin = false;
1114 } while (contin);
1115 }
1116 re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
1117 pb->index = -3;
1118 pb->icase = this->flags() & regbase::icase;
1119 jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
1120 this->m_pdata->m_data.align();
1121 jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1122 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1123 pb->index = -3;
1124 pb->icase = this->flags() & regbase::icase;
1125 }
1126 return true;
1127 }
1128
1129 template <class charT, class traits>
parse_repeat_range(bool isbasic)1130 bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
1131 {
1132 static const char incomplete_message[] = "Missing } in quantified repetition.";
1133 //
1134 // parse a repeat-range:
1135 //
1136 std::size_t min, max;
1137 std::intmax_t v;
1138 // skip whitespace:
1139 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1140 ++m_position;
1141 if(this->m_position == this->m_end)
1142 {
1143 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1144 {
1145 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1146 return false;
1147 }
1148 // Treat the opening '{' as a literal character, rewind to start of error:
1149 --m_position;
1150 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1151 return parse_literal();
1152 }
1153 // get min:
1154 v = this->m_traits.toi(m_position, m_end, 10);
1155 // skip whitespace:
1156 if((v < 0) || (v > umax()))
1157 {
1158 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1159 {
1160 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1161 return false;
1162 }
1163 // Treat the opening '{' as a literal character, rewind to start of error:
1164 --m_position;
1165 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1166 return parse_literal();
1167 }
1168 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1169 ++m_position;
1170 if(this->m_position == this->m_end)
1171 {
1172 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1173 {
1174 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1175 return false;
1176 }
1177 // Treat the opening '{' as a literal character, rewind to start of error:
1178 --m_position;
1179 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1180 return parse_literal();
1181 }
1182 min = static_cast<std::size_t>(v);
1183 // see if we have a comma:
1184 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
1185 {
1186 // move on and error check:
1187 ++m_position;
1188 // skip whitespace:
1189 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1190 ++m_position;
1191 if(this->m_position == this->m_end)
1192 {
1193 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1194 {
1195 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1196 return false;
1197 }
1198 // Treat the opening '{' as a literal character, rewind to start of error:
1199 --m_position;
1200 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1201 return parse_literal();
1202 }
1203 // get the value if any:
1204 v = this->m_traits.toi(m_position, m_end, 10);
1205 max = ((v >= 0) && (v < umax())) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
1206 }
1207 else
1208 {
1209 // no comma, max = min:
1210 max = min;
1211 }
1212 // skip whitespace:
1213 while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1214 ++m_position;
1215 // OK now check trailing }:
1216 if(this->m_position == this->m_end)
1217 {
1218 if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1219 {
1220 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1221 return false;
1222 }
1223 // Treat the opening '{' as a literal character, rewind to start of error:
1224 --m_position;
1225 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1226 return parse_literal();
1227 }
1228 if(isbasic)
1229 {
1230 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
1231 {
1232 ++m_position;
1233 if(this->m_position == this->m_end)
1234 {
1235 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1236 return false;
1237 }
1238 }
1239 else
1240 {
1241 fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1242 return false;
1243 }
1244 }
1245 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
1246 ++m_position;
1247 else
1248 {
1249 // Treat the opening '{' as a literal character, rewind to start of error:
1250 --m_position;
1251 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1252 return parse_literal();
1253 }
1254 //
1255 // finally go and add the repeat, unless error:
1256 //
1257 if(min > max)
1258 {
1259 // Backtrack to error location:
1260 m_position -= 2;
1261 while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
1262 ++m_position;
1263 fail(regex_constants::error_badbrace, m_position - m_base);
1264 return false;
1265 }
1266 return parse_repeat(min, max);
1267 }
1268
1269 template <class charT, class traits>
parse_alt()1270 bool basic_regex_parser<charT, traits>::parse_alt()
1271 {
1272 //
1273 // error check: if there have been no previous states,
1274 // or if the last state was a '(' then error:
1275 //
1276 if(
1277 ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
1278 &&
1279 !(
1280 ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
1281 &&
1282 ((this->flags() & regbase::no_empty_expressions) == 0)
1283 )
1284 )
1285 {
1286 fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression cannot start with the alternation operator |.");
1287 return false;
1288 }
1289 //
1290 // Reset mark count if required:
1291 //
1292 if(m_max_mark < m_mark_count)
1293 m_max_mark = m_mark_count;
1294 if(m_mark_reset >= 0)
1295 m_mark_count = m_mark_reset;
1296
1297 ++m_position;
1298 //
1299 // we need to append a trailing jump:
1300 //
1301 re_syntax_base* pj = this->append_state(BOOST_REGEX_DETAIL_NS::syntax_element_jump, sizeof(re_jump));
1302 std::ptrdiff_t jump_offset = this->getoffset(pj);
1303 //
1304 // now insert the alternative:
1305 //
1306 re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
1307 jump_offset += re_alt_size;
1308 this->m_pdata->m_data.align();
1309 palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
1310 //
1311 // update m_alt_insert_point so that the next alternate gets
1312 // inserted at the start of the second of the two we've just created:
1313 //
1314 this->m_alt_insert_point = this->m_pdata->m_data.size();
1315 //
1316 // the start of this alternative must have a case changes state
1317 // if the current block has messed around with case changes:
1318 //
1319 if(m_has_case_change)
1320 {
1321 static_cast<re_case*>(
1322 this->append_state(syntax_element_toggle_case, sizeof(re_case))
1323 )->icase = this->m_icase;
1324 }
1325 //
1326 // push the alternative onto our stack, a recursive
1327 // implementation here is easier to understand (and faster
1328 // as it happens), but causes all kinds of stack overflow problems
1329 // on programs with small stacks (COM+).
1330 //
1331 m_alt_jumps.push_back(jump_offset);
1332 return true;
1333 }
1334
1335 template <class charT, class traits>
parse_set()1336 bool basic_regex_parser<charT, traits>::parse_set()
1337 {
1338 static const char incomplete_message[] = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1339 ++m_position;
1340 if(m_position == m_end)
1341 {
1342 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1343 return false;
1344 }
1345 basic_char_set<charT, traits> char_set;
1346
1347 const charT* base = m_position; // where the '[' was
1348 const charT* item_base = m_position; // where the '[' or '^' was
1349
1350 while(m_position != m_end)
1351 {
1352 switch(this->m_traits.syntax_type(*m_position))
1353 {
1354 case regex_constants::syntax_caret:
1355 if(m_position == base)
1356 {
1357 char_set.negate();
1358 ++m_position;
1359 item_base = m_position;
1360 }
1361 else
1362 parse_set_literal(char_set);
1363 break;
1364 case regex_constants::syntax_close_set:
1365 if(m_position == item_base)
1366 {
1367 parse_set_literal(char_set);
1368 break;
1369 }
1370 else
1371 {
1372 ++m_position;
1373 if(0 == this->append_set(char_set))
1374 {
1375 fail(regex_constants::error_ctype, m_position - m_base);
1376 return false;
1377 }
1378 }
1379 return true;
1380 case regex_constants::syntax_open_set:
1381 if(parse_inner_set(char_set))
1382 break;
1383 return true;
1384 case regex_constants::syntax_escape:
1385 {
1386 //
1387 // look ahead and see if this is a character class shortcut
1388 // \d \w \s etc...
1389 //
1390 ++m_position;
1391 if(this->m_traits.escape_syntax_type(*m_position)
1392 == regex_constants::escape_type_class)
1393 {
1394 char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1395 if(m != 0)
1396 {
1397 char_set.add_class(m);
1398 ++m_position;
1399 break;
1400 }
1401 }
1402 else if(this->m_traits.escape_syntax_type(*m_position)
1403 == regex_constants::escape_type_not_class)
1404 {
1405 // negated character class:
1406 char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1407 if(m != 0)
1408 {
1409 char_set.add_negated_class(m);
1410 ++m_position;
1411 break;
1412 }
1413 }
1414 // not a character class, just a regular escape:
1415 --m_position;
1416 parse_set_literal(char_set);
1417 break;
1418 }
1419 default:
1420 parse_set_literal(char_set);
1421 break;
1422 }
1423 }
1424 return m_position != m_end;
1425 }
1426
1427 template <class charT, class traits>
parse_inner_set(basic_char_set<charT,traits> & char_set)1428 bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1429 {
1430 static const char incomplete_message[] = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1431 //
1432 // we have either a character class [:name:]
1433 // a collating element [.name.]
1434 // or an equivalence class [=name=]
1435 //
1436 if(m_end == ++m_position)
1437 {
1438 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1439 return false;
1440 }
1441 switch(this->m_traits.syntax_type(*m_position))
1442 {
1443 case regex_constants::syntax_dot:
1444 //
1445 // a collating element is treated as a literal:
1446 //
1447 --m_position;
1448 parse_set_literal(char_set);
1449 return true;
1450 case regex_constants::syntax_colon:
1451 {
1452 // check that character classes are actually enabled:
1453 if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
1454 == (regbase::basic_syntax_group | regbase::no_char_classes))
1455 {
1456 --m_position;
1457 parse_set_literal(char_set);
1458 return true;
1459 }
1460 // skip the ':'
1461 if(m_end == ++m_position)
1462 {
1463 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1464 return false;
1465 }
1466 const charT* name_first = m_position;
1467 // skip at least one character, then find the matching ':]'
1468 if(m_end == ++m_position)
1469 {
1470 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1471 return false;
1472 }
1473 while((m_position != m_end)
1474 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1475 ++m_position;
1476 const charT* name_last = m_position;
1477 if(m_end == m_position)
1478 {
1479 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1480 return false;
1481 }
1482 if((m_end == ++m_position)
1483 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1484 {
1485 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1486 return false;
1487 }
1488 //
1489 // check for negated class:
1490 //
1491 bool negated = false;
1492 if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1493 {
1494 ++name_first;
1495 negated = true;
1496 }
1497 typedef typename traits::char_class_type m_type;
1498 m_type m = this->m_traits.lookup_classname(name_first, name_last);
1499 if(m == 0)
1500 {
1501 if(char_set.empty() && (name_last - name_first == 1))
1502 {
1503 // maybe a special case:
1504 ++m_position;
1505 if( (m_position != m_end)
1506 && (this->m_traits.syntax_type(*m_position)
1507 == regex_constants::syntax_close_set))
1508 {
1509 if(this->m_traits.escape_syntax_type(*name_first)
1510 == regex_constants::escape_type_left_word)
1511 {
1512 ++m_position;
1513 this->append_state(syntax_element_word_start);
1514 return false;
1515 }
1516 if(this->m_traits.escape_syntax_type(*name_first)
1517 == regex_constants::escape_type_right_word)
1518 {
1519 ++m_position;
1520 this->append_state(syntax_element_word_end);
1521 return false;
1522 }
1523 }
1524 }
1525 fail(regex_constants::error_ctype, name_first - m_base);
1526 return false;
1527 }
1528 if(!negated)
1529 char_set.add_class(m);
1530 else
1531 char_set.add_negated_class(m);
1532 ++m_position;
1533 break;
1534 }
1535 case regex_constants::syntax_equal:
1536 {
1537 // skip the '='
1538 if(m_end == ++m_position)
1539 {
1540 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1541 return false;
1542 }
1543 const charT* name_first = m_position;
1544 // skip at least one character, then find the matching '=]'
1545 if(m_end == ++m_position)
1546 {
1547 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1548 return false;
1549 }
1550 while((m_position != m_end)
1551 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1552 ++m_position;
1553 const charT* name_last = m_position;
1554 if(m_end == m_position)
1555 {
1556 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1557 return false;
1558 }
1559 if((m_end == ++m_position)
1560 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1561 {
1562 fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1563 return false;
1564 }
1565 string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1566 if(m.empty() || (m.size() > 2))
1567 {
1568 fail(regex_constants::error_collate, name_first - m_base);
1569 return false;
1570 }
1571 digraph<charT> d;
1572 d.first = m[0];
1573 if(m.size() > 1)
1574 d.second = m[1];
1575 else
1576 d.second = 0;
1577 char_set.add_equivalent(d);
1578 ++m_position;
1579 break;
1580 }
1581 default:
1582 --m_position;
1583 parse_set_literal(char_set);
1584 break;
1585 }
1586 return true;
1587 }
1588
1589 template <class charT, class traits>
parse_set_literal(basic_char_set<charT,traits> & char_set)1590 void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1591 {
1592 digraph<charT> start_range(get_next_set_literal(char_set));
1593 if(m_end == m_position)
1594 {
1595 fail(regex_constants::error_brack, m_position - m_base);
1596 return;
1597 }
1598 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1599 {
1600 // we have a range:
1601 if(m_end == ++m_position)
1602 {
1603 fail(regex_constants::error_brack, m_position - m_base);
1604 return;
1605 }
1606 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1607 {
1608 digraph<charT> end_range = get_next_set_literal(char_set);
1609 char_set.add_range(start_range, end_range);
1610 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1611 {
1612 if(m_end == ++m_position)
1613 {
1614 fail(regex_constants::error_brack, m_position - m_base);
1615 return;
1616 }
1617 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1618 {
1619 // trailing - :
1620 --m_position;
1621 return;
1622 }
1623 fail(regex_constants::error_range, m_position - m_base);
1624 return;
1625 }
1626 return;
1627 }
1628 --m_position;
1629 }
1630 char_set.add_single(start_range);
1631 }
1632
1633 template <class charT, class traits>
get_next_set_literal(basic_char_set<charT,traits> & char_set)1634 digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1635 {
1636 digraph<charT> result;
1637 switch(this->m_traits.syntax_type(*m_position))
1638 {
1639 case regex_constants::syntax_dash:
1640 if(!char_set.empty())
1641 {
1642 // see if we are at the end of the set:
1643 if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1644 {
1645 fail(regex_constants::error_range, m_position - m_base);
1646 return result;
1647 }
1648 --m_position;
1649 }
1650 result.first = *m_position++;
1651 return result;
1652 case regex_constants::syntax_escape:
1653 // check to see if escapes are supported first:
1654 if(this->flags() & regex_constants::no_escape_in_lists)
1655 {
1656 result = *m_position++;
1657 break;
1658 }
1659 ++m_position;
1660 result = unescape_character();
1661 break;
1662 case regex_constants::syntax_open_set:
1663 {
1664 if(m_end == ++m_position)
1665 {
1666 fail(regex_constants::error_collate, m_position - m_base);
1667 return result;
1668 }
1669 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1670 {
1671 --m_position;
1672 result.first = *m_position;
1673 ++m_position;
1674 return result;
1675 }
1676 if(m_end == ++m_position)
1677 {
1678 fail(regex_constants::error_collate, m_position - m_base);
1679 return result;
1680 }
1681 const charT* name_first = m_position;
1682 // skip at least one character, then find the matching ':]'
1683 if(m_end == ++m_position)
1684 {
1685 fail(regex_constants::error_collate, name_first - m_base);
1686 return result;
1687 }
1688 while((m_position != m_end)
1689 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1690 ++m_position;
1691 const charT* name_last = m_position;
1692 if(m_end == m_position)
1693 {
1694 fail(regex_constants::error_collate, name_first - m_base);
1695 return result;
1696 }
1697 if((m_end == ++m_position)
1698 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1699 {
1700 fail(regex_constants::error_collate, name_first - m_base);
1701 return result;
1702 }
1703 ++m_position;
1704 string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1705 if(s.empty() || (s.size() > 2))
1706 {
1707 fail(regex_constants::error_collate, name_first - m_base);
1708 return result;
1709 }
1710 result.first = s[0];
1711 if(s.size() > 1)
1712 result.second = s[1];
1713 else
1714 result.second = 0;
1715 return result;
1716 }
1717 default:
1718 result = *m_position++;
1719 }
1720 return result;
1721 }
1722
//
// Does the value v fit in the specified charT type?
//
// The two argument form is the entry point; it dispatches on whether charT
// is narrower than std::intmax_t.  The charT argument is only used to select
// the type, its value is ignored.
//
template <class charT>
bool valid_value(charT, std::intmax_t v, const std::integral_constant<bool, true>&)
{
   // charT is narrower than intmax_t: v fits only if nothing is
   // left once the bits that charT can hold have been shifted out.
   const std::size_t char_bits = sizeof(charT) * CHAR_BIT;
   const std::intmax_t excess = v >> char_bits;
   return 0 == excess;
}
template <class charT>
bool valid_value(charT, std::intmax_t, const std::integral_constant<bool, false>&)
{
   // charT is at least as wide as intmax_t, so v will always fit in a charT
   return true;
}
template <class charT>
bool valid_value(charT c, std::intmax_t v)
{
   typedef std::integral_constant<bool, (sizeof(charT) < sizeof(std::intmax_t))> is_narrower;
   return valid_value(c, v, is_narrower());
}
1741
1742 template <class charT, class traits>
1743 charT basic_regex_parser<charT, traits>::unescape_character()
1744 {
1745 #ifdef BOOST_REGEX_MSVC
1746 #pragma warning(push)
1747 #pragma warning(disable:4127)
1748 #endif
1749 charT result(0);
1750 if(m_position == m_end)
1751 {
1752 fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
1753 return false;
1754 }
1755 switch(this->m_traits.escape_syntax_type(*m_position))
1756 {
1757 case regex_constants::escape_type_control_a:
1758 result = charT('\a');
1759 break;
1760 case regex_constants::escape_type_e:
1761 result = charT(27);
1762 break;
1763 case regex_constants::escape_type_control_f:
1764 result = charT('\f');
1765 break;
1766 case regex_constants::escape_type_control_n:
1767 result = charT('\n');
1768 break;
1769 case regex_constants::escape_type_control_r:
1770 result = charT('\r');
1771 break;
1772 case regex_constants::escape_type_control_t:
1773 result = charT('\t');
1774 break;
1775 case regex_constants::escape_type_control_v:
1776 result = charT('\v');
1777 break;
1778 case regex_constants::escape_type_word_assert:
1779 result = charT('\b');
1780 break;
1781 case regex_constants::escape_type_ascii_control:
1782 ++m_position;
1783 if(m_position == m_end)
1784 {
1785 // Rewind to start of escape:
1786 --m_position;
1787 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1788 fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
1789 return result;
1790 }
1791 result = static_cast<charT>(*m_position % 32);
1792 break;
1793 case regex_constants::escape_type_hex:
1794 ++m_position;
1795 if(m_position == m_end)
1796 {
1797 // Rewind to start of escape:
1798 --m_position;
1799 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1800 fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
1801 return result;
1802 }
1803 // maybe have \x{ddd}
1804 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1805 {
1806 ++m_position;
1807 if(m_position == m_end)
1808 {
1809 // Rewind to start of escape:
1810 --m_position;
1811 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1812 fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
1813 return result;
1814 }
1815 std::intmax_t i = this->m_traits.toi(m_position, m_end, 16);
1816 if((m_position == m_end)
1817 || (i < 0)
1818 || ((std::numeric_limits<charT>::is_specialized) && (i > (std::intmax_t)(std::numeric_limits<charT>::max)()))
1819 || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1820 {
1821 // Rewind to start of escape:
1822 --m_position;
1823 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1824 fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
1825 return result;
1826 }
1827 ++m_position;
1828 result = charT(i);
1829 }
1830 else
1831 {
1832 std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
1833 std::intmax_t i = this->m_traits.toi(m_position, m_position + len, 16);
1834 if((i < 0)
1835 || !valid_value(charT(0), i))
1836 {
1837 // Rewind to start of escape:
1838 --m_position;
1839 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1840 fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
1841 return result;
1842 }
1843 result = charT(i);
1844 }
1845 return result;
1846 case regex_constants::syntax_digit:
1847 {
1848 // an octal escape sequence, the first character must be a zero
1849 // followed by up to 3 octal digits:
1850 std::ptrdiff_t len = (std::min)(std::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1851 const charT* bp = m_position;
1852 std::intmax_t val = this->m_traits.toi(bp, bp + 1, 8);
1853 if(val != 0)
1854 {
1855 // Rewind to start of escape:
1856 --m_position;
1857 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1858 // Oops not an octal escape after all:
1859 fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
1860 return result;
1861 }
1862 val = this->m_traits.toi(m_position, m_position + len, 8);
1863 if((val < 0) || (val > (std::intmax_t)(std::numeric_limits<charT>::max)()))
1864 {
1865 // Rewind to start of escape:
1866 --m_position;
1867 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1868 fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
1869 return result;
1870 }
1871 return static_cast<charT>(val);
1872 }
1873 case regex_constants::escape_type_named_char:
1874 {
1875 ++m_position;
1876 if(m_position == m_end)
1877 {
1878 // Rewind to start of escape:
1879 --m_position;
1880 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1881 fail(regex_constants::error_escape, m_position - m_base);
1882 return false;
1883 }
1884 // maybe have \N{name}
1885 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1886 {
1887 const charT* base = m_position;
1888 // skip forward until we find enclosing brace:
1889 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1890 ++m_position;
1891 if(m_position == m_end)
1892 {
1893 // Rewind to start of escape:
1894 --m_position;
1895 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1896 fail(regex_constants::error_escape, m_position - m_base);
1897 return false;
1898 }
1899 string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1900 if(s.empty())
1901 {
1902 // Rewind to start of escape:
1903 --m_position;
1904 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1905 fail(regex_constants::error_collate, m_position - m_base);
1906 return false;
1907 }
1908 if(s.size() == 1)
1909 {
1910 return s[0];
1911 }
1912 }
1913 // fall through is a failure:
1914 // Rewind to start of escape:
1915 --m_position;
1916 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1917 fail(regex_constants::error_escape, m_position - m_base);
1918 return false;
1919 }
1920 default:
1921 result = *m_position;
1922 break;
1923 }
1924 ++m_position;
1925 return result;
1926 #ifdef BOOST_REGEX_MSVC
1927 #pragma warning(pop)
1928 #endif
1929 }
1930
1931 template <class charT, class traits>
parse_backref()1932 bool basic_regex_parser<charT, traits>::parse_backref()
1933 {
1934 BOOST_REGEX_ASSERT(m_position != m_end);
1935 const charT* pc = m_position;
1936 std::intmax_t i = this->m_traits.toi(pc, pc + 1, 10);
1937 if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1938 {
1939 // not a backref at all but an octal escape sequence:
1940 charT c = unescape_character();
1941 this->append_literal(c);
1942 }
1943 else if((i > 0))
1944 {
1945 m_position = pc;
1946 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1947 pb->index = (int)i;
1948 pb->icase = this->flags() & regbase::icase;
1949 if(i > m_max_backref)
1950 m_max_backref = i;
1951 }
1952 else
1953 {
1954 // Rewind to start of escape:
1955 --m_position;
1956 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1957 fail(regex_constants::error_backref, m_position - m_base);
1958 return false;
1959 }
1960 return true;
1961 }
1962
1963 template <class charT, class traits>
parse_QE()1964 bool basic_regex_parser<charT, traits>::parse_QE()
1965 {
1966 #ifdef BOOST_REGEX_MSVC
1967 #pragma warning(push)
1968 #pragma warning(disable:4127)
1969 #endif
1970 //
1971 // parse a \Q...\E sequence:
1972 //
1973 ++m_position; // skip the Q
1974 const charT* start = m_position;
1975 const charT* end;
1976 do
1977 {
1978 while((m_position != m_end)
1979 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1980 ++m_position;
1981 if(m_position == m_end)
1982 {
1983 // a \Q...\E sequence may terminate with the end of the expression:
1984 end = m_position;
1985 break;
1986 }
1987 if(++m_position == m_end) // skip the escape
1988 {
1989 fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
1990 return false;
1991 }
1992 // check to see if it's a \E:
1993 if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
1994 {
1995 ++m_position;
1996 end = m_position - 2;
1997 break;
1998 }
1999 // otherwise go round again:
2000 }while(true);
2001 //
2002 // now add all the character between the two escapes as literals:
2003 //
2004 while(start != end)
2005 {
2006 this->append_literal(*start);
2007 ++start;
2008 }
2009 return true;
2010 #ifdef BOOST_REGEX_MSVC
2011 #pragma warning(pop)
2012 #endif
2013 }
2014
2015 template <class charT, class traits>
parse_perl_extension()2016 bool basic_regex_parser<charT, traits>::parse_perl_extension()
2017 {
2018 if(++m_position == m_end)
2019 {
2020 // Rewind to start of (? sequence:
2021 --m_position;
2022 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2023 fail(regex_constants::error_perl_extension, m_position - m_base);
2024 return false;
2025 }
2026 //
2027 // treat comments as a special case, as these
2028 // are the only ones that don't start with a leading
2029 // startmark state:
2030 //
2031 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
2032 {
2033 while((m_position != m_end)
2034 && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
2035 {}
2036 return true;
2037 }
2038 //
2039 // backup some state, and prepare the way:
2040 //
2041 int markid = 0;
2042 std::ptrdiff_t jump_offset = 0;
2043 re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
2044 pb->icase = this->flags() & regbase::icase;
2045 std::ptrdiff_t last_paren_start = this->getoffset(pb);
2046 // back up insertion point for alternations, and set new point:
2047 std::ptrdiff_t last_alt_point = m_alt_insert_point;
2048 this->m_pdata->m_data.align();
2049 m_alt_insert_point = this->m_pdata->m_data.size();
2050 std::ptrdiff_t expected_alt_point = m_alt_insert_point;
2051 bool restore_flags = true;
2052 regex_constants::syntax_option_type old_flags = this->flags();
2053 bool old_case_change = m_has_case_change;
2054 m_has_case_change = false;
2055 charT name_delim;
2056 int mark_reset = m_mark_reset;
2057 int max_mark = m_max_mark;
2058 m_mark_reset = -1;
2059 m_max_mark = m_mark_count;
2060 std::intmax_t v;
2061 //
2062 // select the actual extension used:
2063 //
2064 switch(this->m_traits.syntax_type(*m_position))
2065 {
2066 case regex_constants::syntax_or:
2067 m_mark_reset = m_mark_count;
2068 BOOST_REGEX_FALLTHROUGH;
2069 case regex_constants::syntax_colon:
2070 //
2071 // a non-capturing mark:
2072 //
2073 pb->index = markid = 0;
2074 ++m_position;
2075 break;
2076 case regex_constants::syntax_digit:
2077 {
2078 //
2079 // a recursive subexpression:
2080 //
2081 v = this->m_traits.toi(m_position, m_end, 10);
2082 if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2083 {
2084 // Rewind to start of (? sequence:
2085 --m_position;
2086 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2087 fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
2088 return false;
2089 }
2090 insert_recursion:
2091 pb->index = markid = 0;
2092 re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
2093 pr->alt.i = (std::ptrdiff_t)v;
2094 pr->state_id = 0;
2095 static_cast<re_case*>(
2096 this->append_state(syntax_element_toggle_case, sizeof(re_case))
2097 )->icase = this->flags() & regbase::icase;
2098 break;
2099 }
2100 case regex_constants::syntax_plus:
2101 //
2102 // A forward-relative recursive subexpression:
2103 //
2104 ++m_position;
2105 v = this->m_traits.toi(m_position, m_end, 10);
2106 if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2107 {
2108 // Rewind to start of (? sequence:
2109 --m_position;
2110 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2111 fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2112 return false;
2113 }
2114 if ((std::numeric_limits<std::intmax_t>::max)() - m_mark_count < v)
2115 {
2116 fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2117 return false;
2118 }
2119 v += m_mark_count;
2120 goto insert_recursion;
2121 case regex_constants::syntax_dash:
2122 //
2123 // Possibly a backward-relative recursive subexpression:
2124 //
2125 ++m_position;
2126 v = this->m_traits.toi(m_position, m_end, 10);
2127 if(v <= 0)
2128 {
2129 --m_position;
2130 // Oops not a relative recursion at all, but a (?-imsx) group:
2131 goto option_group_jump;
2132 }
2133 v = static_cast<std::intmax_t>(m_mark_count) + 1 - v;
2134 if(v <= 0)
2135 {
2136 // Rewind to start of (? sequence:
2137 --m_position;
2138 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2139 fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2140 return false;
2141 }
2142 goto insert_recursion;
2143 case regex_constants::syntax_equal:
2144 pb->index = markid = -1;
2145 ++m_position;
2146 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2147 this->m_pdata->m_data.align();
2148 m_alt_insert_point = this->m_pdata->m_data.size();
2149 break;
2150 case regex_constants::syntax_not:
2151 pb->index = markid = -2;
2152 ++m_position;
2153 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2154 this->m_pdata->m_data.align();
2155 m_alt_insert_point = this->m_pdata->m_data.size();
2156 break;
2157 case regex_constants::escape_type_left_word:
2158 {
2159 // a lookbehind assertion:
2160 if(++m_position == m_end)
2161 {
2162 // Rewind to start of (? sequence:
2163 --m_position;
2164 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2165 fail(regex_constants::error_perl_extension, m_position - m_base);
2166 return false;
2167 }
2168 regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
2169 if(t == regex_constants::syntax_not)
2170 pb->index = markid = -2;
2171 else if(t == regex_constants::syntax_equal)
2172 pb->index = markid = -1;
2173 else
2174 {
2175 // Probably a named capture which also starts (?< :
2176 name_delim = '>';
2177 --m_position;
2178 goto named_capture_jump;
2179 }
2180 ++m_position;
2181 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2182 this->append_state(syntax_element_backstep, sizeof(re_brace));
2183 this->m_pdata->m_data.align();
2184 m_alt_insert_point = this->m_pdata->m_data.size();
2185 break;
2186 }
2187 case regex_constants::escape_type_right_word:
2188 //
2189 // an independent sub-expression:
2190 //
2191 pb->index = markid = -3;
2192 ++m_position;
2193 jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2194 this->m_pdata->m_data.align();
2195 m_alt_insert_point = this->m_pdata->m_data.size();
2196 break;
2197 case regex_constants::syntax_open_mark:
2198 {
2199 // a conditional expression:
2200 pb->index = markid = -4;
2201 if(++m_position == m_end)
2202 {
2203 // Rewind to start of (? sequence:
2204 --m_position;
2205 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2206 fail(regex_constants::error_perl_extension, m_position - m_base);
2207 return false;
2208 }
2209 v = this->m_traits.toi(m_position, m_end, 10);
2210 if(m_position == m_end)
2211 {
2212 // Rewind to start of (? sequence:
2213 --m_position;
2214 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2215 fail(regex_constants::error_perl_extension, m_position - m_base);
2216 return false;
2217 }
2218 if(*m_position == charT('R'))
2219 {
2220 if(++m_position == m_end)
2221 {
2222 // Rewind to start of (? sequence:
2223 --m_position;
2224 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2225 fail(regex_constants::error_perl_extension, m_position - m_base);
2226 return false;
2227 }
2228 if(*m_position == charT('&'))
2229 {
2230 const charT* base = ++m_position;
2231 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2232 ++m_position;
2233 if(m_position == m_end)
2234 {
2235 // Rewind to start of (? sequence:
2236 --m_position;
2237 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2238 fail(regex_constants::error_perl_extension, m_position - m_base);
2239 return false;
2240 }
2241 v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
2242 }
2243 else
2244 {
2245 v = -this->m_traits.toi(m_position, m_end, 10);
2246 }
2247 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2248 br->index = v < 0 ? (int)(v - 1) : 0;
2249 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2250 {
2251 // Rewind to start of (? sequence:
2252 --m_position;
2253 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2254 fail(regex_constants::error_perl_extension, m_position - m_base);
2255 return false;
2256 }
2257 if(++m_position == m_end)
2258 {
2259 // Rewind to start of (? sequence:
2260 --m_position;
2261 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2262 fail(regex_constants::error_perl_extension, m_position - m_base);
2263 return false;
2264 }
2265 }
2266 else if((*m_position == charT('\'')) || (*m_position == charT('<')))
2267 {
2268 const charT* base = ++m_position;
2269 while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
2270 ++m_position;
2271 if(m_position == m_end)
2272 {
2273 // Rewind to start of (? sequence:
2274 --m_position;
2275 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2276 fail(regex_constants::error_perl_extension, m_position - m_base);
2277 return false;
2278 }
2279 v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2280 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2281 br->index = (int)v;
2282 if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
2283 {
2284 // Rewind to start of (? sequence:
2285 --m_position;
2286 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2287 fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
2288 return false;
2289 }
2290 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2291 {
2292 // Rewind to start of (? sequence:
2293 --m_position;
2294 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2295 fail(regex_constants::error_perl_extension, m_position - m_base);
2296 return false;
2297 }
2298 if(++m_position == m_end)
2299 {
2300 // Rewind to start of (? sequence:
2301 --m_position;
2302 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2303 fail(regex_constants::error_perl_extension, m_position - m_base);
2304 return false;
2305 }
2306 }
2307 else if(*m_position == charT('D'))
2308 {
2309 const char* def = "DEFINE";
2310 while(*def && (m_position != m_end) && (*m_position == charT(*def)))
2311 ++m_position, ++def;
2312 if((m_position == m_end) || *def)
2313 {
2314 // Rewind to start of (? sequence:
2315 --m_position;
2316 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2317 fail(regex_constants::error_perl_extension, m_position - m_base);
2318 return false;
2319 }
2320 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2321 br->index = 9999; // special magic value!
2322 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2323 {
2324 // Rewind to start of (? sequence:
2325 --m_position;
2326 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2327 fail(regex_constants::error_perl_extension, m_position - m_base);
2328 return false;
2329 }
2330 if(++m_position == m_end)
2331 {
2332 // Rewind to start of (? sequence:
2333 --m_position;
2334 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2335 fail(regex_constants::error_perl_extension, m_position - m_base);
2336 return false;
2337 }
2338 }
2339 else if(v > 0)
2340 {
2341 re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2342 br->index = (int)v;
2343 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2344 {
2345 // Rewind to start of (? sequence:
2346 --m_position;
2347 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2348 fail(regex_constants::error_perl_extension, m_position - m_base);
2349 return false;
2350 }
2351 if(++m_position == m_end)
2352 {
2353 // Rewind to start of (? sequence:
2354 --m_position;
2355 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2356 fail(regex_constants::error_perl_extension, m_position - m_base);
2357 return false;
2358 }
2359 }
2360 else
2361 {
2362 // verify that we have a lookahead or lookbehind assert:
2363 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
2364 {
2365 // Rewind to start of (? sequence:
2366 --m_position;
2367 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2368 fail(regex_constants::error_perl_extension, m_position - m_base);
2369 return false;
2370 }
2371 if(++m_position == m_end)
2372 {
2373 // Rewind to start of (? sequence:
2374 --m_position;
2375 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2376 fail(regex_constants::error_perl_extension, m_position - m_base);
2377 return false;
2378 }
2379 if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
2380 {
2381 if(++m_position == m_end)
2382 {
2383 // Rewind to start of (? sequence:
2384 --m_position;
2385 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2386 fail(regex_constants::error_perl_extension, m_position - m_base);
2387 return false;
2388 }
2389 if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2390 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2391 {
2392 // Rewind to start of (? sequence:
2393 --m_position;
2394 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2395 fail(regex_constants::error_perl_extension, m_position - m_base);
2396 return false;
2397 }
2398 m_position -= 3;
2399 }
2400 else
2401 {
2402 if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2403 && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2404 {
2405 // Rewind to start of (? sequence:
2406 --m_position;
2407 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2408 fail(regex_constants::error_perl_extension, m_position - m_base);
2409 return false;
2410 }
2411 m_position -= 2;
2412 }
2413 }
2414 break;
2415 }
2416 case regex_constants::syntax_close_mark:
2417 // Rewind to start of (? sequence:
2418 --m_position;
2419 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2420 fail(regex_constants::error_perl_extension, m_position - m_base);
2421 return false;
2422 case regex_constants::escape_type_end_buffer:
2423 {
2424 name_delim = *m_position;
2425 named_capture_jump:
2426 markid = 0;
2427 if(0 == (this->flags() & regbase::nosubs))
2428 {
2429 markid = ++m_mark_count;
2430 if(this->flags() & regbase::save_subexpression_location)
2431 this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
2432 }
2433 pb->index = markid;
2434 const charT* base = ++m_position;
2435 if(m_position == m_end)
2436 {
2437 // Rewind to start of (? sequence:
2438 --m_position;
2439 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2440 fail(regex_constants::error_perl_extension, m_position - m_base);
2441 return false;
2442 }
2443 while((m_position != m_end) && (*m_position != name_delim))
2444 ++m_position;
2445 if(m_position == m_end)
2446 {
2447 // Rewind to start of (? sequence:
2448 --m_position;
2449 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2450 fail(regex_constants::error_perl_extension, m_position - m_base);
2451 return false;
2452 }
2453 this->m_pdata->set_name(base, m_position, markid);
2454 ++m_position;
2455 break;
2456 }
2457 default:
2458 if(*m_position == charT('R'))
2459 {
2460 ++m_position;
2461 v = 0;
2462 if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2463 {
2464 // Rewind to start of (? sequence:
2465 --m_position;
2466 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2467 fail(regex_constants::error_perl_extension, m_position - m_base);
2468 return false;
2469 }
2470 goto insert_recursion;
2471 }
2472 if(*m_position == charT('&'))
2473 {
2474 ++m_position;
2475 const charT* base = m_position;
2476 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2477 ++m_position;
2478 if(m_position == m_end)
2479 {
2480 // Rewind to start of (? sequence:
2481 --m_position;
2482 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2483 fail(regex_constants::error_perl_extension, m_position - m_base);
2484 return false;
2485 }
2486 v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2487 goto insert_recursion;
2488 }
2489 if(*m_position == charT('P'))
2490 {
2491 ++m_position;
2492 if(m_position == m_end)
2493 {
2494 // Rewind to start of (? sequence:
2495 --m_position;
2496 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2497 fail(regex_constants::error_perl_extension, m_position - m_base);
2498 return false;
2499 }
2500 if(*m_position == charT('>'))
2501 {
2502 ++m_position;
2503 const charT* base = m_position;
2504 while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2505 ++m_position;
2506 if(m_position == m_end)
2507 {
2508 // Rewind to start of (? sequence:
2509 --m_position;
2510 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2511 fail(regex_constants::error_perl_extension, m_position - m_base);
2512 return false;
2513 }
2514 v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2515 goto insert_recursion;
2516 }
2517 }
2518 //
2519 // lets assume that we have a (?imsx) group and try and parse it:
2520 //
2521 option_group_jump:
2522 regex_constants::syntax_option_type opts = parse_options();
2523 if(m_position == m_end)
2524 {
2525 // Rewind to start of (? sequence:
2526 --m_position;
2527 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2528 fail(regex_constants::error_perl_extension, m_position - m_base);
2529 return false;
2530 }
2531 // make a note of whether we have a case change:
2532 m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
2533 pb->index = markid = 0;
2534 if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
2535 {
2536 // update flags and carry on as normal:
2537 this->flags(opts);
2538 restore_flags = false;
2539 old_case_change |= m_has_case_change; // defer end of scope by one ')'
2540 }
2541 else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
2542 {
2543 // update flags and carry on until the matching ')' is found:
2544 this->flags(opts);
2545 ++m_position;
2546 }
2547 else
2548 {
2549 // Rewind to start of (? sequence:
2550 --m_position;
2551 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2552 fail(regex_constants::error_perl_extension, m_position - m_base);
2553 return false;
2554 }
2555
2556 // finally append a case change state if we need it:
2557 if(m_has_case_change)
2558 {
2559 static_cast<re_case*>(
2560 this->append_state(syntax_element_toggle_case, sizeof(re_case))
2561 )->icase = opts & regbase::icase;
2562 }
2563
2564 }
2565 //
2566 // now recursively add more states, this will terminate when we get to a
2567 // matching ')' :
2568 //
2569 parse_all();
2570 //
2571 // Unwind alternatives:
2572 //
2573 if(0 == unwind_alts(last_paren_start))
2574 {
2575 // Rewind to start of (? sequence:
2576 --m_position;
2577 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2578 fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
2579 return false;
2580 }
2581 //
2582 // we either have a ')' or we have run out of characters prematurely:
2583 //
2584 if(m_position == m_end)
2585 {
2586 // Rewind to start of (? sequence:
2587 --m_position;
2588 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2589 this->fail(regex_constants::error_paren, std::distance(m_base, m_end));
2590 return false;
2591 }
2592 BOOST_REGEX_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
2593 ++m_position;
2594 //
2595 // restore the flags:
2596 //
2597 if(restore_flags)
2598 {
2599 // append a case change state if we need it:
2600 if(m_has_case_change)
2601 {
2602 static_cast<re_case*>(
2603 this->append_state(syntax_element_toggle_case, sizeof(re_case))
2604 )->icase = old_flags & regbase::icase;
2605 }
2606 this->flags(old_flags);
2607 }
2608 //
2609 // set up the jump pointer if we have one:
2610 //
2611 if(jump_offset)
2612 {
2613 this->m_pdata->m_data.align();
2614 re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
2615 jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
2616 if((this->m_last_state == jmp) && (markid != -2))
2617 {
2618 // Oops... we didn't have anything inside the assertion.
2619 // Note we don't get here for negated forward lookahead as (?!)
2620 // does have some uses.
2621 // Rewind to start of (? sequence:
2622 --m_position;
2623 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2624 fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
2625 return false;
2626 }
2627 }
2628 //
2629 // verify that if this is conditional expression, that we do have
2630 // an alternative, if not add one:
2631 //
2632 if(markid == -4)
2633 {
2634 re_syntax_base* b = this->getaddress(expected_alt_point);
2635 // Make sure we have exactly one alternative following this state:
2636 if(b->type != syntax_element_alt)
2637 {
2638 re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
2639 alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
2640 }
2641 else if(((std::ptrdiff_t)this->m_pdata->m_data.size() > (static_cast<re_alt*>(b)->alt.i + this->getoffset(b))) && (static_cast<re_alt*>(b)->alt.i > 0) && this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
2642 {
2643 // Can't have seen more than one alternative:
2644 // Rewind to start of (? sequence:
2645 --m_position;
2646 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2647 fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
2648 return false;
2649 }
2650 else
2651 {
2652 // We must *not* have seen an alternative inside a (DEFINE) block:
2653 b = this->getaddress(b->next.i, b);
2654 if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
2655 {
2656 // Rewind to start of (? sequence:
2657 --m_position;
2658 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2659 fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
2660 return false;
2661 }
2662 }
2663 // check for invalid repetition of next state:
2664 b = this->getaddress(expected_alt_point);
2665 b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
2666 if((b->type != syntax_element_assert_backref)
2667 && (b->type != syntax_element_startmark))
2668 {
2669 // Rewind to start of (? sequence:
2670 --m_position;
2671 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2672 fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
2673 return false;
2674 }
2675 }
2676 //
2677 // append closing parenthesis state:
2678 //
2679 pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
2680 pb->index = markid;
2681 pb->icase = this->flags() & regbase::icase;
2682 this->m_paren_start = last_paren_start;
2683 //
2684 // restore the alternate insertion point:
2685 //
2686 this->m_alt_insert_point = last_alt_point;
2687 //
2688 // and the case change data:
2689 //
2690 m_has_case_change = old_case_change;
2691 //
2692 // And the mark_reset data:
2693 //
2694 if(m_max_mark > m_mark_count)
2695 {
2696 m_mark_count = m_max_mark;
2697 }
2698 m_mark_reset = mark_reset;
2699 m_max_mark = max_mark;
2700
2701
2702 if(markid > 0)
2703 {
2704 if(this->flags() & regbase::save_subexpression_location)
2705 this->m_pdata->m_subs.at((std::size_t)markid - 1).second = std::distance(m_base, m_position) - 1;
2706 }
2707 return true;
2708 }
2709
2710 template <class charT, class traits>
match_verb(const char * verb)2711 bool basic_regex_parser<charT, traits>::match_verb(const char* verb)
2712 {
2713 while(*verb)
2714 {
2715 if(static_cast<charT>(*verb) != *m_position)
2716 {
2717 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2718 fail(regex_constants::error_perl_extension, m_position - m_base);
2719 return false;
2720 }
2721 if(++m_position == m_end)
2722 {
2723 --m_position;
2724 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2725 fail(regex_constants::error_perl_extension, m_position - m_base);
2726 return false;
2727 }
2728 ++verb;
2729 }
2730 return true;
2731 }
2732
2733 #ifdef BOOST_REGEX_MSVC
2734 # pragma warning(push)
2735 #if BOOST_REGEX_MSVC >= 1800
2736 #pragma warning(disable:26812)
2737 #endif
2738 #endif
2739 template <class charT, class traits>
parse_perl_verb()2740 bool basic_regex_parser<charT, traits>::parse_perl_verb()
2741 {
2742 if(++m_position == m_end)
2743 {
2744 // Rewind to start of (* sequence:
2745 --m_position;
2746 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2747 fail(regex_constants::error_perl_extension, m_position - m_base);
2748 return false;
2749 }
2750 switch(*m_position)
2751 {
2752 case 'F':
2753 if(++m_position == m_end)
2754 {
2755 // Rewind to start of (* sequence:
2756 --m_position;
2757 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2758 fail(regex_constants::error_perl_extension, m_position - m_base);
2759 return false;
2760 }
2761 if((this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) || match_verb("AIL"))
2762 {
2763 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2764 {
2765 // Rewind to start of (* sequence:
2766 --m_position;
2767 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2768 fail(regex_constants::error_perl_extension, m_position - m_base);
2769 return false;
2770 }
2771 ++m_position;
2772 this->append_state(syntax_element_fail);
2773 return true;
2774 }
2775 break;
2776 case 'A':
2777 if(++m_position == m_end)
2778 {
2779 // Rewind to start of (* sequence:
2780 --m_position;
2781 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2782 fail(regex_constants::error_perl_extension, m_position - m_base);
2783 return false;
2784 }
2785 if(match_verb("CCEPT"))
2786 {
2787 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2788 {
2789 // Rewind to start of (* sequence:
2790 --m_position;
2791 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2792 fail(regex_constants::error_perl_extension, m_position - m_base);
2793 return false;
2794 }
2795 ++m_position;
2796 this->append_state(syntax_element_accept);
2797 return true;
2798 }
2799 break;
2800 case 'C':
2801 if(++m_position == m_end)
2802 {
2803 // Rewind to start of (* sequence:
2804 --m_position;
2805 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2806 fail(regex_constants::error_perl_extension, m_position - m_base);
2807 return false;
2808 }
2809 if(match_verb("OMMIT"))
2810 {
2811 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2812 {
2813 // Rewind to start of (* sequence:
2814 --m_position;
2815 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2816 fail(regex_constants::error_perl_extension, m_position - m_base);
2817 return false;
2818 }
2819 ++m_position;
2820 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_commit;
2821 this->m_pdata->m_disable_match_any = true;
2822 return true;
2823 }
2824 break;
2825 case 'P':
2826 if(++m_position == m_end)
2827 {
2828 // Rewind to start of (* sequence:
2829 --m_position;
2830 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2831 fail(regex_constants::error_perl_extension, m_position - m_base);
2832 return false;
2833 }
2834 if(match_verb("RUNE"))
2835 {
2836 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2837 {
2838 // Rewind to start of (* sequence:
2839 --m_position;
2840 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2841 fail(regex_constants::error_perl_extension, m_position - m_base);
2842 return false;
2843 }
2844 ++m_position;
2845 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_prune;
2846 this->m_pdata->m_disable_match_any = true;
2847 return true;
2848 }
2849 break;
2850 case 'S':
2851 if(++m_position == m_end)
2852 {
2853 // Rewind to start of (* sequence:
2854 --m_position;
2855 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2856 fail(regex_constants::error_perl_extension, m_position - m_base);
2857 return false;
2858 }
2859 if(match_verb("KIP"))
2860 {
2861 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2862 {
2863 // Rewind to start of (* sequence:
2864 --m_position;
2865 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2866 fail(regex_constants::error_perl_extension, m_position - m_base);
2867 return false;
2868 }
2869 ++m_position;
2870 static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_skip;
2871 this->m_pdata->m_disable_match_any = true;
2872 return true;
2873 }
2874 break;
2875 case 'T':
2876 if(++m_position == m_end)
2877 {
2878 // Rewind to start of (* sequence:
2879 --m_position;
2880 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2881 fail(regex_constants::error_perl_extension, m_position - m_base);
2882 return false;
2883 }
2884 if(match_verb("HEN"))
2885 {
2886 if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2887 {
2888 // Rewind to start of (* sequence:
2889 --m_position;
2890 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2891 fail(regex_constants::error_perl_extension, m_position - m_base);
2892 return false;
2893 }
2894 ++m_position;
2895 this->append_state(syntax_element_then);
2896 this->m_pdata->m_disable_match_any = true;
2897 return true;
2898 }
2899 break;
2900 }
2901 // Rewind to start of (* sequence:
2902 --m_position;
2903 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2904 fail(regex_constants::error_perl_extension, m_position - m_base);
2905 return false;
2906 }
2907 #ifdef BOOST_REGEX_MSVC
2908 # pragma warning(pop)
2909 #endif
2910
2911 template <class charT, class traits>
add_emacs_code(bool negate)2912 bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
2913 {
2914 //
2915 // parses an emacs style \sx or \Sx construct.
2916 //
2917 if(++m_position == m_end)
2918 {
2919 // Rewind to start of sequence:
2920 --m_position;
2921 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
2922 fail(regex_constants::error_escape, m_position - m_base);
2923 return false;
2924 }
2925 basic_char_set<charT, traits> char_set;
2926 if(negate)
2927 char_set.negate();
2928
2929 static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
2930
2931 switch(*m_position)
2932 {
2933 case 's':
2934 case ' ':
2935 char_set.add_class(this->m_mask_space);
2936 break;
2937 case 'w':
2938 char_set.add_class(this->m_word_mask);
2939 break;
2940 case '_':
2941 char_set.add_single(digraph<charT>(charT('$')));
2942 char_set.add_single(digraph<charT>(charT('&')));
2943 char_set.add_single(digraph<charT>(charT('*')));
2944 char_set.add_single(digraph<charT>(charT('+')));
2945 char_set.add_single(digraph<charT>(charT('-')));
2946 char_set.add_single(digraph<charT>(charT('_')));
2947 char_set.add_single(digraph<charT>(charT('<')));
2948 char_set.add_single(digraph<charT>(charT('>')));
2949 break;
2950 case '.':
2951 char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
2952 break;
2953 case '(':
2954 char_set.add_single(digraph<charT>(charT('(')));
2955 char_set.add_single(digraph<charT>(charT('[')));
2956 char_set.add_single(digraph<charT>(charT('{')));
2957 break;
2958 case ')':
2959 char_set.add_single(digraph<charT>(charT(')')));
2960 char_set.add_single(digraph<charT>(charT(']')));
2961 char_set.add_single(digraph<charT>(charT('}')));
2962 break;
2963 case '"':
2964 char_set.add_single(digraph<charT>(charT('"')));
2965 char_set.add_single(digraph<charT>(charT('\'')));
2966 char_set.add_single(digraph<charT>(charT('`')));
2967 break;
2968 case '\'':
2969 char_set.add_single(digraph<charT>(charT('\'')));
2970 char_set.add_single(digraph<charT>(charT(',')));
2971 char_set.add_single(digraph<charT>(charT('#')));
2972 break;
2973 case '<':
2974 char_set.add_single(digraph<charT>(charT(';')));
2975 break;
2976 case '>':
2977 char_set.add_single(digraph<charT>(charT('\n')));
2978 char_set.add_single(digraph<charT>(charT('\f')));
2979 break;
2980 default:
2981 fail(regex_constants::error_ctype, m_position - m_base);
2982 return false;
2983 }
2984 if(0 == this->append_set(char_set))
2985 {
2986 fail(regex_constants::error_ctype, m_position - m_base);
2987 return false;
2988 }
2989 ++m_position;
2990 return true;
2991 }
2992
2993 template <class charT, class traits>
parse_options()2994 regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
2995 {
2996 // we have a (?imsx-imsx) group, convert it into a set of flags:
2997 regex_constants::syntax_option_type f = this->flags();
2998 bool breakout = false;
2999 do
3000 {
3001 switch(*m_position)
3002 {
3003 case 's':
3004 f |= regex_constants::mod_s;
3005 f &= ~regex_constants::no_mod_s;
3006 break;
3007 case 'm':
3008 f &= ~regex_constants::no_mod_m;
3009 break;
3010 case 'i':
3011 f |= regex_constants::icase;
3012 break;
3013 case 'x':
3014 f |= regex_constants::mod_x;
3015 break;
3016 default:
3017 breakout = true;
3018 continue;
3019 }
3020 if(++m_position == m_end)
3021 {
3022 // Rewind to start of (? sequence:
3023 --m_position;
3024 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3025 fail(regex_constants::error_paren, m_position - m_base);
3026 return false;
3027 }
3028 }
3029 while(!breakout);
3030
3031 breakout = false;
3032
3033 if(*m_position == static_cast<charT>('-'))
3034 {
3035 if(++m_position == m_end)
3036 {
3037 // Rewind to start of (? sequence:
3038 --m_position;
3039 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3040 fail(regex_constants::error_paren, m_position - m_base);
3041 return false;
3042 }
3043 do
3044 {
3045 switch(*m_position)
3046 {
3047 case 's':
3048 f &= ~regex_constants::mod_s;
3049 f |= regex_constants::no_mod_s;
3050 break;
3051 case 'm':
3052 f |= regex_constants::no_mod_m;
3053 break;
3054 case 'i':
3055 f &= ~regex_constants::icase;
3056 break;
3057 case 'x':
3058 f &= ~regex_constants::mod_x;
3059 break;
3060 default:
3061 breakout = true;
3062 continue;
3063 }
3064 if(++m_position == m_end)
3065 {
3066 // Rewind to start of (? sequence:
3067 --m_position;
3068 while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3069 fail(regex_constants::error_paren, m_position - m_base);
3070 return false;
3071 }
3072 }
3073 while(!breakout);
3074 }
3075 return f;
3076 }
3077
3078 template <class charT, class traits>
unwind_alts(std::ptrdiff_t last_paren_start)3079 bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
3080 {
3081 //
3082 // If we didn't actually add any states after the last
3083 // alternative then that's an error:
3084 //
3085 if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
3086 && (!m_alt_jumps.empty()) && (m_alt_jumps.back() > last_paren_start)
3087 &&
3088 !(
3089 ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
3090 &&
3091 ((this->flags() & regbase::no_empty_expressions) == 0)
3092 )
3093 )
3094 {
3095 fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
3096 return false;
3097 }
3098 //
3099 // Fix up our alternatives:
3100 //
3101 while((!m_alt_jumps.empty()) && (m_alt_jumps.back() > last_paren_start))
3102 {
3103 //
3104 // fix up the jump to point to the end of the states
3105 // that we've just added:
3106 //
3107 std::ptrdiff_t jump_offset = m_alt_jumps.back();
3108 m_alt_jumps.pop_back();
3109 this->m_pdata->m_data.align();
3110 re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
3111 if (jmp->type != syntax_element_jump)
3112 {
3113 // Something really bad happened, this used to be an assert,
3114 // but we'll make it an error just in case we should ever get here.
3115 fail(regex_constants::error_unknown, this->m_position - this->m_base, "Internal logic failed while compiling the expression, probably you added a repeat to something non-repeatable!");
3116 return false;
3117 }
3118 jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
3119 }
3120 return true;
3121 }
3122
3123 #ifdef BOOST_REGEX_MSVC
3124 #pragma warning(pop)
3125 #endif
3126
3127 } // namespace BOOST_REGEX_DETAIL_NS
3128 } // namespace boost
3129
3130 #endif
3131