1 /*
2  *
3  * Copyright (c) 2004
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
  *   FILE         basic_regex_parser.hpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Declares template class basic_regex_parser.
17   */
18 
19 #ifndef BOOST_REGEX_V5_BASIC_REGEX_PARSER_HPP
20 #define BOOST_REGEX_V5_BASIC_REGEX_PARSER_HPP
21 
22 namespace boost{
23 namespace BOOST_REGEX_DETAIL_NS{
24 
25 #ifdef BOOST_REGEX_MSVC
26 #pragma warning(push)
27 #pragma warning(disable:4244 4459)
28 #if BOOST_REGEX_MSVC < 1910
29 #pragma warning(disable:4800)
30 #endif
31 #endif
32 
// Overload of umax() selected by the "false" tag: yields the largest value of
// std::intmax_t, falling back to INT_MAX if numeric_limits has no specialization
// for that type.
inline std::intmax_t umax(std::integral_constant<bool, false> const&)
{
   typedef std::numeric_limits<std::intmax_t> limits_type;
   if(limits_type::is_specialized)
      return (limits_type::max)();
   // Get out clause, just in case numeric_limits is unspecialized:
   return INT_MAX;
}
// Overload of umax() selected by the "true" tag: yields the largest value of
// std::size_t, converted to std::intmax_t.
inline std::intmax_t umax(std::integral_constant<bool, true> const&)
{
   const std::size_t largest = (std::numeric_limits<std::size_t>::max)();
   return static_cast<std::intmax_t>(largest);
}
42 
umax()43 inline std::intmax_t umax()
44 {
45    return umax(std::integral_constant<bool, std::numeric_limits<std::intmax_t>::digits >= std::numeric_limits<std::size_t>::digits>());
46 }
47 
48 template <class charT, class traits>
49 class basic_regex_parser : public basic_regex_creator<charT, traits>
50 {
51 public:
52    basic_regex_parser(regex_data<charT, traits>* data);
53    void parse(const charT* p1, const charT* p2, unsigned flags);
54    void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
55    void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
fail(regex_constants::error_type error_code,std::ptrdiff_t position,const std::string & message)56    void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
57    {
58       fail(error_code, position, message, position);
59    }
60 
61    bool parse_all();
62    bool parse_basic();
63    bool parse_extended();
64    bool parse_literal();
65    bool parse_open_paren();
66    bool parse_basic_escape();
67    bool parse_extended_escape();
68    bool parse_match_any();
69    bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
70    bool parse_repeat_range(bool isbasic);
71    bool parse_alt();
72    bool parse_set();
73    bool parse_backref();
74    void parse_set_literal(basic_char_set<charT, traits>& char_set);
75    bool parse_inner_set(basic_char_set<charT, traits>& char_set);
76    bool parse_QE();
77    bool parse_perl_extension();
78    bool parse_perl_verb();
79    bool match_verb(const char*);
80    bool add_emacs_code(bool negate);
81    bool unwind_alts(std::ptrdiff_t last_paren_start);
82    digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
83    charT unescape_character();
84    regex_constants::syntax_option_type parse_options();
85 
86 private:
87    typedef bool (basic_regex_parser::*parser_proc_type)();
88    typedef typename traits::string_type string_type;
89    typedef typename traits::char_class_type char_class_type;
90    parser_proc_type           m_parser_proc;    // the main parser to use
91    const charT*               m_base;           // the start of the string being parsed
92    const charT*               m_end;            // the end of the string being parsed
93    const charT*               m_position;       // our current parser position
94    unsigned                   m_mark_count;     // how many sub-expressions we have
95    int                        m_mark_reset;     // used to indicate that we're inside a (?|...) block.
96    unsigned                   m_max_mark;       // largest mark count seen inside a (?|...) block.
97    std::ptrdiff_t             m_paren_start;    // where the last seen ')' began (where repeats are inserted).
98    std::ptrdiff_t             m_alt_insert_point; // where to insert the next alternative
99    bool                       m_has_case_change; // true if somewhere in the current block the case has changed
100    unsigned                   m_recursion_count; // How many times we've called parse_all.
101    unsigned                   m_max_backref;     // Largest index of any backref.
102 #if defined(BOOST_REGEX_MSVC) && defined(_M_IX86)
103    // This is an ugly warning suppression workaround (for warnings *inside* std::vector
104    // that can not otherwise be suppressed)...
105    static_assert(sizeof(long) >= sizeof(void*), "Long isn't long enough!");
106    std::vector<long>           m_alt_jumps;      // list of alternative in the current scope.
107 #else
108    std::vector<std::ptrdiff_t> m_alt_jumps;      // list of alternative in the current scope.
109 #endif
110 
111    basic_regex_parser& operator=(const basic_regex_parser&);
112    basic_regex_parser(const basic_regex_parser&);
113 };
114 
115 template <class charT, class traits>
basic_regex_parser(regex_data<charT,traits> * data)116 basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
117    : basic_regex_creator<charT, traits>(data), m_parser_proc(), m_base(0), m_end(0), m_position(0),
118    m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0), m_max_backref(0)
119 {
120 }
121 
122 template <class charT, class traits>
parse(const charT * p1,const charT * p2,unsigned l_flags)123 void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
124 {
125    // pass l_flags on to base class:
126    this->init(l_flags);
127    // set up pointers:
128    m_position = m_base = p1;
129    m_end = p2;
130    // empty strings are errors:
131    if((p1 == p2) &&
132       (
133          ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
134          || (l_flags & regbase::no_empty_expressions)
135       )
136      )
137    {
138       fail(regex_constants::error_empty, 0);
139       return;
140    }
141    // select which parser to use:
142    switch(l_flags & regbase::main_option_type)
143    {
144    case regbase::perl_syntax_group:
145       {
146          m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
147          //
148          // Add a leading paren with index zero to give recursions a target:
149          //
150          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
151          br->index = 0;
152          br->icase = this->flags() & regbase::icase;
153          break;
154       }
155    case regbase::basic_syntax_group:
156       m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
157       break;
158    case regbase::literal:
159       m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
160       break;
161    default:
162       // Oops, someone has managed to set more than one of the main option flags,
163       // so this must be an error:
164       fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
165       return;
166    }
167 
168    // parse all our characters:
169    bool result = parse_all();
170    //
171    // Unwind our alternatives:
172    //
173    unwind_alts(-1);
174    // reset l_flags as a global scope (?imsx) may have altered them:
175    this->flags(l_flags);
176    // if we haven't gobbled up all the characters then we must
177    // have had an unexpected ')' :
178    if(!result)
179    {
180       fail(regex_constants::error_paren, std::distance(m_base, m_position), "Found a closing ) with no corresponding opening parenthesis.");
181       return;
182    }
183    // if an error has been set then give up now:
184    if(this->m_pdata->m_status)
185       return;
186    // fill in our sub-expression count:
187    this->m_pdata->m_mark_count = 1u + (std::size_t)m_mark_count;
188    //
189    // Check we don't have backreferences to sub-expressions which don't exist:
190    //
191    if (m_max_backref > m_mark_count)
192    {
193       fail(regex_constants::error_backref, std::distance(m_base, m_position), "Found a backreference to a non-existant sub-expression.");
194    }
195    this->finalize(p1, p2);
196 }
197 
198 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position)199 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
200 {
201    // get the error message:
202    std::string message = this->m_pdata->m_ptraits->error_string(error_code);
203    fail(error_code, position, message);
204 }
205 
206 template <class charT, class traits>
fail(regex_constants::error_type error_code,std::ptrdiff_t position,std::string message,std::ptrdiff_t start_pos)207 void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
208 {
209    if(0 == this->m_pdata->m_status) // update the error code if not already set
210       this->m_pdata->m_status = error_code;
211    m_position = m_end; // don't bother parsing anything else
212 
213    //
214    // Augment error message with the regular expression text:
215    //
216    if(start_pos == position)
217       start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
218    std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
219    if(error_code != regex_constants::error_empty)
220    {
221       if((start_pos != 0) || (end_pos != (m_end - m_base)))
222          message += "  The error occurred while parsing the regular expression fragment: '";
223       else
224          message += "  The error occurred while parsing the regular expression: '";
225       if(start_pos != end_pos)
226       {
227          message += std::string(m_base + start_pos, m_base + position);
228          message += ">>>HERE>>>";
229          message += std::string(m_base + position, m_base + end_pos);
230       }
231       message += "'.";
232    }
233 
234 #ifndef BOOST_NO_EXCEPTIONS
235    if(0 == (this->flags() & regex_constants::no_except))
236    {
237       boost::regex_error e(message, error_code, position);
238       e.raise();
239    }
240 #else
241    (void)position; // suppress warnings.
242 #endif
243 }
244 
245 template <class charT, class traits>
parse_all()246 bool basic_regex_parser<charT, traits>::parse_all()
247 {
248    if (++m_recursion_count > 400)
249    {
250       // exceeded internal limits
251       fail(boost::regex_constants::error_complexity, m_position - m_base, "Exceeded nested brace limit.");
252    }
253    bool result = true;
254    while(result && (m_position != m_end))
255    {
256       result = (this->*m_parser_proc)();
257    }
258    --m_recursion_count;
259    return result;
260 }
261 
262 #ifdef BOOST_REGEX_MSVC
263 #pragma warning(push)
264 #pragma warning(disable:4702)
265 #endif
266 template <class charT, class traits>
parse_basic()267 bool basic_regex_parser<charT, traits>::parse_basic()
268 {
269    switch(this->m_traits.syntax_type(*m_position))
270    {
271    case regex_constants::syntax_escape:
272       return parse_basic_escape();
273    case regex_constants::syntax_dot:
274       return parse_match_any();
275    case regex_constants::syntax_caret:
276       ++m_position;
277       this->append_state(syntax_element_start_line);
278       break;
279    case regex_constants::syntax_dollar:
280       ++m_position;
281       this->append_state(syntax_element_end_line);
282       break;
283    case regex_constants::syntax_star:
284       if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
285          return parse_literal();
286       else
287       {
288          ++m_position;
289          return parse_repeat();
290       }
291    case regex_constants::syntax_plus:
292       if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
293          return parse_literal();
294       else
295       {
296          ++m_position;
297          return parse_repeat(1);
298       }
299    case regex_constants::syntax_question:
300       if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
301          return parse_literal();
302       else
303       {
304          ++m_position;
305          return parse_repeat(0, 1);
306       }
307    case regex_constants::syntax_open_set:
308       return parse_set();
309    case regex_constants::syntax_newline:
310       if(this->flags() & regbase::newline_alt)
311          return parse_alt();
312       else
313          return parse_literal();
314    default:
315       return parse_literal();
316    }
317    return true;
318 }
319 
320 #ifdef BOOST_REGEX_MSVC
321 #  pragma warning(push)
322 #if BOOST_REGEX_MSVC >= 1800
323 #pragma warning(disable:26812)
324 #endif
325 #endif
326 template <class charT, class traits>
parse_extended()327 bool basic_regex_parser<charT, traits>::parse_extended()
328 {
329    bool result = true;
330    switch(this->m_traits.syntax_type(*m_position))
331    {
332    case regex_constants::syntax_open_mark:
333       return parse_open_paren();
334    case regex_constants::syntax_close_mark:
335       return false;
336    case regex_constants::syntax_escape:
337       return parse_extended_escape();
338    case regex_constants::syntax_dot:
339       return parse_match_any();
340    case regex_constants::syntax_caret:
341       ++m_position;
342       this->append_state(
343          (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
344       break;
345    case regex_constants::syntax_dollar:
346       ++m_position;
347       this->append_state(
348          (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
349       break;
350    case regex_constants::syntax_star:
351       if(m_position == this->m_base)
352       {
353          fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
354          return false;
355       }
356       ++m_position;
357       return parse_repeat();
358    case regex_constants::syntax_question:
359       if(m_position == this->m_base)
360       {
361          fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
362          return false;
363       }
364       ++m_position;
365       return parse_repeat(0,1);
366    case regex_constants::syntax_plus:
367       if(m_position == this->m_base)
368       {
369          fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
370          return false;
371       }
372       ++m_position;
373       return parse_repeat(1);
374    case regex_constants::syntax_open_brace:
375       ++m_position;
376       return parse_repeat_range(false);
377    case regex_constants::syntax_close_brace:
378       if((this->flags() & regbase::no_perl_ex) == regbase::no_perl_ex)
379       {
380          fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
381          return false;
382       }
383       result = parse_literal();
384       break;
385    case regex_constants::syntax_or:
386       return parse_alt();
387    case regex_constants::syntax_open_set:
388       return parse_set();
389    case regex_constants::syntax_newline:
390       if(this->flags() & regbase::newline_alt)
391          return parse_alt();
392       else
393          return parse_literal();
394    case regex_constants::syntax_hash:
395       //
396       // If we have a mod_x flag set, then skip until
397       // we get to a newline character:
398       //
399       if((this->flags()
400          & (regbase::no_perl_ex|regbase::mod_x))
401          == regbase::mod_x)
402       {
403          while((m_position != m_end) && !is_separator(*m_position++)){}
404          return true;
405       }
406       BOOST_REGEX_FALLTHROUGH;
407    default:
408       result = parse_literal();
409       break;
410    }
411    return result;
412 }
413 #ifdef BOOST_REGEX_MSVC
414 #  pragma warning(pop)
415 #endif
416 #ifdef BOOST_REGEX_MSVC
417 #pragma warning(pop)
418 #endif
419 
420 template <class charT, class traits>
parse_literal()421 bool basic_regex_parser<charT, traits>::parse_literal()
422 {
423    // append this as a literal provided it's not a space character
424    // or the perl option regbase::mod_x is not set:
425    if(
426       ((this->flags()
427          & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
428             != regbase::mod_x)
429       || !this->m_traits.isctype(*m_position, this->m_mask_space))
430          this->append_literal(*m_position);
431    ++m_position;
432    return true;
433 }
434 
435 template <class charT, class traits>
parse_open_paren()436 bool basic_regex_parser<charT, traits>::parse_open_paren()
437 {
438    //
439    // skip the '(' and error check:
440    //
441    if(++m_position == m_end)
442    {
443       fail(regex_constants::error_paren, m_position - m_base);
444       return false;
445    }
446    //
447    // begin by checking for a perl-style (?...) extension:
448    //
449    if(
450          ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
451          || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
452      )
453    {
454       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
455          return parse_perl_extension();
456       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_star)
457          return parse_perl_verb();
458    }
459    //
460    // update our mark count, and append the required state:
461    //
462    unsigned markid = 0;
463    if(0 == (this->flags() & regbase::nosubs))
464    {
465       markid = ++m_mark_count;
466       if(this->flags() & regbase::save_subexpression_location)
467          this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
468    }
469    re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
470    pb->index = markid;
471    pb->icase = this->flags() & regbase::icase;
472    std::ptrdiff_t last_paren_start = this->getoffset(pb);
473    // back up insertion point for alternations, and set new point:
474    std::ptrdiff_t last_alt_point = m_alt_insert_point;
475    this->m_pdata->m_data.align();
476    m_alt_insert_point = this->m_pdata->m_data.size();
477    //
478    // back up the current flags in case we have a nested (?imsx) group:
479    //
480    regex_constants::syntax_option_type opts = this->flags();
481    bool old_case_change = m_has_case_change;
482    m_has_case_change = false; // no changes to this scope as yet...
483    //
484    // Back up branch reset data in case we have a nested (?|...)
485    //
486    int mark_reset = m_mark_reset;
487    m_mark_reset = -1;
488    //
489    // now recursively add more states, this will terminate when we get to a
490    // matching ')' :
491    //
492    parse_all();
493    //
494    // Unwind pushed alternatives:
495    //
496    if(0 == unwind_alts(last_paren_start))
497       return false;
498    //
499    // restore flags:
500    //
501    if(m_has_case_change)
502    {
503       // the case has changed in one or more of the alternatives
504       // within the scoped (...) block: we have to add a state
505       // to reset the case sensitivity:
506       static_cast<re_case*>(
507          this->append_state(syntax_element_toggle_case, sizeof(re_case))
508          )->icase = opts & regbase::icase;
509    }
510    this->flags(opts);
511    m_has_case_change = old_case_change;
512    //
513    // restore branch reset:
514    //
515    m_mark_reset = mark_reset;
516    //
517    // we either have a ')' or we have run out of characters prematurely:
518    //
519    if(m_position == m_end)
520    {
521       this->fail(regex_constants::error_paren, std::distance(m_base, m_end));
522       return false;
523    }
524    if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
525       return false;
526    if(markid && (this->flags() & regbase::save_subexpression_location))
527       this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
528    ++m_position;
529    //
530    // append closing parenthesis state:
531    //
532    pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
533    pb->index = markid;
534    pb->icase = this->flags() & regbase::icase;
535    this->m_paren_start = last_paren_start;
536    //
537    // restore the alternate insertion point:
538    //
539    this->m_alt_insert_point = last_alt_point;
540 
541    return true;
542 }
543 
544 template <class charT, class traits>
parse_basic_escape()545 bool basic_regex_parser<charT, traits>::parse_basic_escape()
546 {
547    if(++m_position == m_end)
548    {
549       fail(regex_constants::error_paren, m_position - m_base);
550       return false;
551    }
552    bool result = true;
553    switch(this->m_traits.escape_syntax_type(*m_position))
554    {
555    case regex_constants::syntax_open_mark:
556       return parse_open_paren();
557    case regex_constants::syntax_close_mark:
558       return false;
559    case regex_constants::syntax_plus:
560       if(this->flags() & regex_constants::bk_plus_qm)
561       {
562          ++m_position;
563          return parse_repeat(1);
564       }
565       else
566          return parse_literal();
567    case regex_constants::syntax_question:
568       if(this->flags() & regex_constants::bk_plus_qm)
569       {
570          ++m_position;
571          return parse_repeat(0, 1);
572       }
573       else
574          return parse_literal();
575    case regex_constants::syntax_open_brace:
576       if(this->flags() & regbase::no_intervals)
577          return parse_literal();
578       ++m_position;
579       return parse_repeat_range(true);
580    case regex_constants::syntax_close_brace:
581       if(this->flags() & regbase::no_intervals)
582          return parse_literal();
583       fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
584       return false;
585    case regex_constants::syntax_or:
586       if(this->flags() & regbase::bk_vbar)
587          return parse_alt();
588       else
589          result = parse_literal();
590       break;
591    case regex_constants::syntax_digit:
592       return parse_backref();
593    case regex_constants::escape_type_start_buffer:
594       if(this->flags() & regbase::emacs_ex)
595       {
596          ++m_position;
597          this->append_state(syntax_element_buffer_start);
598       }
599       else
600          result = parse_literal();
601       break;
602    case regex_constants::escape_type_end_buffer:
603       if(this->flags() & regbase::emacs_ex)
604       {
605          ++m_position;
606          this->append_state(syntax_element_buffer_end);
607       }
608       else
609          result = parse_literal();
610       break;
611    case regex_constants::escape_type_word_assert:
612       if(this->flags() & regbase::emacs_ex)
613       {
614          ++m_position;
615          this->append_state(syntax_element_word_boundary);
616       }
617       else
618          result = parse_literal();
619       break;
620    case regex_constants::escape_type_not_word_assert:
621       if(this->flags() & regbase::emacs_ex)
622       {
623          ++m_position;
624          this->append_state(syntax_element_within_word);
625       }
626       else
627          result = parse_literal();
628       break;
629    case regex_constants::escape_type_left_word:
630       if(this->flags() & regbase::emacs_ex)
631       {
632          ++m_position;
633          this->append_state(syntax_element_word_start);
634       }
635       else
636          result = parse_literal();
637       break;
638    case regex_constants::escape_type_right_word:
639       if(this->flags() & regbase::emacs_ex)
640       {
641          ++m_position;
642          this->append_state(syntax_element_word_end);
643       }
644       else
645          result = parse_literal();
646       break;
647    default:
648       if(this->flags() & regbase::emacs_ex)
649       {
650          bool negate = true;
651          switch(*m_position)
652          {
653          case 'w':
654             negate = false;
655             BOOST_REGEX_FALLTHROUGH;
656          case 'W':
657             {
658             basic_char_set<charT, traits> char_set;
659             if(negate)
660                char_set.negate();
661             char_set.add_class(this->m_word_mask);
662             if(0 == this->append_set(char_set))
663             {
664                fail(regex_constants::error_ctype, m_position - m_base);
665                return false;
666             }
667             ++m_position;
668             return true;
669             }
670          case 's':
671             negate = false;
672             BOOST_REGEX_FALLTHROUGH;
673          case 'S':
674             return add_emacs_code(negate);
675          case 'c':
676          case 'C':
677             // not supported yet:
678             fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
679             return false;
680          default:
681             break;
682          }
683       }
684       result = parse_literal();
685       break;
686    }
687    return result;
688 }
689 
690 template <class charT, class traits>
parse_extended_escape()691 bool basic_regex_parser<charT, traits>::parse_extended_escape()
692 {
693    ++m_position;
694    if(m_position == m_end)
695    {
696       fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
697       return false;
698    }
699    bool negate = false; // in case this is a character class escape: \w \d etc
700    switch(this->m_traits.escape_syntax_type(*m_position))
701    {
702    case regex_constants::escape_type_not_class:
703       negate = true;
704       BOOST_REGEX_FALLTHROUGH;
705    case regex_constants::escape_type_class:
706       {
707 escape_type_class_jump:
708          typedef typename traits::char_class_type m_type;
709          m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
710          if(m != 0)
711          {
712             basic_char_set<charT, traits> char_set;
713             if(negate)
714                char_set.negate();
715             char_set.add_class(m);
716             if(0 == this->append_set(char_set))
717             {
718                fail(regex_constants::error_ctype, m_position - m_base);
719                return false;
720             }
721             ++m_position;
722             return true;
723          }
724          //
725          // not a class, just a regular unknown escape:
726          //
727          this->append_literal(unescape_character());
728          break;
729       }
730    case regex_constants::syntax_digit:
731       return parse_backref();
732    case regex_constants::escape_type_left_word:
733       ++m_position;
734       this->append_state(syntax_element_word_start);
735       break;
736    case regex_constants::escape_type_right_word:
737       ++m_position;
738       this->append_state(syntax_element_word_end);
739       break;
740    case regex_constants::escape_type_start_buffer:
741       ++m_position;
742       this->append_state(syntax_element_buffer_start);
743       break;
744    case regex_constants::escape_type_end_buffer:
745       ++m_position;
746       this->append_state(syntax_element_buffer_end);
747       break;
748    case regex_constants::escape_type_word_assert:
749       ++m_position;
750       this->append_state(syntax_element_word_boundary);
751       break;
752    case regex_constants::escape_type_not_word_assert:
753       ++m_position;
754       this->append_state(syntax_element_within_word);
755       break;
756    case regex_constants::escape_type_Z:
757       ++m_position;
758       this->append_state(syntax_element_soft_buffer_end);
759       break;
760    case regex_constants::escape_type_Q:
761       return parse_QE();
762    case regex_constants::escape_type_C:
763       return parse_match_any();
764    case regex_constants::escape_type_X:
765       ++m_position;
766       this->append_state(syntax_element_combining);
767       break;
768    case regex_constants::escape_type_G:
769       ++m_position;
770       this->append_state(syntax_element_restart_continue);
771       break;
772    case regex_constants::escape_type_not_property:
773       negate = true;
774       BOOST_REGEX_FALLTHROUGH;
775    case regex_constants::escape_type_property:
776       {
777          ++m_position;
778          char_class_type m;
779          if(m_position == m_end)
780          {
781             fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
782             return false;
783          }
784          // maybe have \p{ddd}
785          if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
786          {
787             const charT* base = m_position;
788             // skip forward until we find enclosing brace:
789             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
790                ++m_position;
791             if(m_position == m_end)
792             {
793                fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
794                return false;
795             }
796             m = this->m_traits.lookup_classname(++base, m_position++);
797          }
798          else
799          {
800             m = this->m_traits.lookup_classname(m_position, m_position+1);
801             ++m_position;
802          }
803          if(m != 0)
804          {
805             basic_char_set<charT, traits> char_set;
806             if(negate)
807                char_set.negate();
808             char_set.add_class(m);
809             if(0 == this->append_set(char_set))
810             {
811                fail(regex_constants::error_ctype, m_position - m_base);
812                return false;
813             }
814             return true;
815          }
816          fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
817          return false;
818       }
819    case regex_constants::escape_type_reset_start_mark:
820       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
821       {
822          re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
823          pb->index = -5;
824          pb->icase = this->flags() & regbase::icase;
825          this->m_pdata->m_data.align();
826          ++m_position;
827          return true;
828       }
829       goto escape_type_class_jump;
830    case regex_constants::escape_type_line_ending:
831       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
832       {
833          const charT* e = get_escape_R_string<charT>();
834          const charT* old_position = m_position;
835          const charT* old_end = m_end;
836          const charT* old_base = m_base;
837          m_position = e;
838          m_base = e;
839          m_end = e + traits::length(e);
840          bool r = parse_all();
841          m_position = ++old_position;
842          m_end = old_end;
843          m_base = old_base;
844          return r;
845       }
846       goto escape_type_class_jump;
847    case regex_constants::escape_type_extended_backref:
848       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
849       {
850          bool have_brace = false;
851          bool negative = false;
852          static const char incomplete_message[] = "Incomplete \\g escape found.";
853          if(++m_position == m_end)
854          {
855             fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
856             return false;
857          }
858          // maybe have \g{ddd}
859          regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
860          regex_constants::syntax_type syn_end = 0;
861          if((syn == regex_constants::syntax_open_brace)
862             || (syn == regex_constants::escape_type_left_word)
863             || (syn == regex_constants::escape_type_end_buffer))
864          {
865             if(++m_position == m_end)
866             {
867                fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
868                return false;
869             }
870             have_brace = true;
871             switch(syn)
872             {
873             case regex_constants::syntax_open_brace:
874                syn_end = regex_constants::syntax_close_brace;
875                break;
876             case regex_constants::escape_type_left_word:
877                syn_end = regex_constants::escape_type_right_word;
878                break;
879             default:
880                syn_end = regex_constants::escape_type_end_buffer;
881                break;
882             }
883          }
884          negative = (*m_position == static_cast<charT>('-'));
885          if((negative) && (++m_position == m_end))
886          {
887             fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
888             return false;
889          }
890          const charT* pc = m_position;
891          std::intmax_t i = this->m_traits.toi(pc, m_end, 10);
892          if((i < 0) && syn_end)
893          {
894             // Check for a named capture, get the leftmost one if there is more than one:
895             const charT* base = m_position;
896             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
897             {
898                ++m_position;
899             }
900             i = hash_value_from_capture_name(base, m_position);
901             pc = m_position;
902          }
903          if(negative)
904             i = 1 + (static_cast<std::intmax_t>(m_mark_count) - i);
905          if(((i < hash_value_mask) && (i > 0)) || ((i >= hash_value_mask) && (this->m_pdata->get_id((int)i) > 0)))
906          {
907             m_position = pc;
908             re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
909             pb->index = (int)i;
910             pb->icase = this->flags() & regbase::icase;
911             if ((i > m_max_backref) && (i < hash_value_mask))
912                m_max_backref = i;
913          }
914          else
915          {
916             fail(regex_constants::error_backref, m_position - m_base);
917             return false;
918          }
919          m_position = pc;
920          if(have_brace)
921          {
922             if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
923             {
924                fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
925                return false;
926             }
927             ++m_position;
928          }
929          return true;
930       }
931       goto escape_type_class_jump;
932    case regex_constants::escape_type_control_v:
933       if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
934          goto escape_type_class_jump;
935       BOOST_REGEX_FALLTHROUGH;
936    default:
937       this->append_literal(unescape_character());
938       break;
939    }
940    return true;
941 }
942 
943 template <class charT, class traits>
parse_match_any()944 bool basic_regex_parser<charT, traits>::parse_match_any()
945 {
946    //
947    // we have a '.' that can match any character:
948    //
949    ++m_position;
950    static_cast<re_dot*>(
951       this->append_state(syntax_element_wild, sizeof(re_dot))
952       )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
953       ? BOOST_REGEX_DETAIL_NS::force_not_newline
954          : this->flags() & regbase::mod_s ?
955             BOOST_REGEX_DETAIL_NS::force_newline : BOOST_REGEX_DETAIL_NS::dont_care);
956    return true;
957 }
958 
959 template <class charT, class traits>
parse_repeat(std::size_t low,std::size_t high)960 bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
961 {
962    bool greedy = true;
963    bool possessive = false;
964    std::size_t insert_point;
965    //
966    // when we get to here we may have a non-greedy ? mark still to come:
967    //
968    if((m_position != m_end)
969       && (
970             (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
971             || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
972          )
973       )
974    {
975       // OK we have a perl or emacs regex, check for a '?':
976       if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
977       {
978          // whitespace skip:
979          while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
980             ++m_position;
981       }
982       if((m_position != m_end) && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question))
983       {
984          greedy = false;
985          ++m_position;
986       }
987       // for perl regexes only check for possessive ++ repeats.
988       if((m_position != m_end)
989          && (0 == (this->flags() & regbase::main_option_type))
990          && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
991       {
992          possessive = true;
993          ++m_position;
994       }
995    }
996    if(0 == this->m_last_state)
997    {
998       fail(regex_constants::error_badrepeat, std::distance(m_base, m_position), "Nothing to repeat.");
999       return false;
1000    }
1001    if(this->m_last_state->type == syntax_element_endmark)
1002    {
1003       // insert a repeat before the '(' matching the last ')':
1004       insert_point = this->m_paren_start;
1005    }
1006    else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
1007    {
1008       // the last state was a literal with more than one character, split it in two:
1009       re_literal* lit = static_cast<re_literal*>(this->m_last_state);
1010       charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
1011       lit->length -= 1;
1012       // now append new state:
1013       lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
1014       lit->length = 1;
1015       (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
1016       insert_point = this->getoffset(this->m_last_state);
1017    }
1018    else
1019    {
1020       // repeat the last state whatever it was, need to add some error checking here:
1021       switch(this->m_last_state->type)
1022       {
1023       case syntax_element_start_line:
1024       case syntax_element_end_line:
1025       case syntax_element_word_boundary:
1026       case syntax_element_within_word:
1027       case syntax_element_word_start:
1028       case syntax_element_word_end:
1029       case syntax_element_buffer_start:
1030       case syntax_element_buffer_end:
1031       case syntax_element_alt:
1032       case syntax_element_soft_buffer_end:
1033       case syntax_element_restart_continue:
1034       case syntax_element_jump:
1035       case syntax_element_startmark:
1036       case syntax_element_backstep:
1037       case syntax_element_toggle_case:
1038          // can't legally repeat any of the above:
1039          fail(regex_constants::error_badrepeat, m_position - m_base);
1040          return false;
1041       default:
1042          // do nothing...
1043          break;
1044       }
1045       insert_point = this->getoffset(this->m_last_state);
1046    }
1047    //
1048    // OK we now know what to repeat, so insert the repeat around it:
1049    //
1050    re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
1051    rep->min = low;
1052    rep->max = high;
1053    rep->greedy = greedy;
1054    rep->leading = false;
1055    // store our repeater position for later:
1056    std::ptrdiff_t rep_off = this->getoffset(rep);
1057    // and append a back jump to the repeat:
1058    re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
1059    jmp->alt.i = rep_off - this->getoffset(jmp);
1060    this->m_pdata->m_data.align();
1061    // now fill in the alt jump for the repeat:
1062    rep = static_cast<re_repeat*>(this->getaddress(rep_off));
1063    rep->alt.i = this->m_pdata->m_data.size() - rep_off;
1064    //
1065    // If the repeat is possessive then bracket the repeat with a (?>...)
1066    // independent sub-expression construct:
1067    //
1068    if(possessive)
1069    {
1070       if(m_position != m_end)
1071       {
1072          //
1073          // Check for illegal following quantifier, we have to do this here, because
1074          // the extra states we insert below circumvents our usual error checking :-(
1075          //
1076          bool contin = false;
1077          do
1078          {
1079             if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
1080             {
1081                // whitespace skip:
1082                while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1083                   ++m_position;
1084             }
1085             if (m_position != m_end)
1086             {
1087                switch (this->m_traits.syntax_type(*m_position))
1088                {
1089                case regex_constants::syntax_star:
1090                case regex_constants::syntax_plus:
1091                case regex_constants::syntax_question:
1092                case regex_constants::syntax_open_brace:
1093                   fail(regex_constants::error_badrepeat, m_position - m_base);
1094                   return false;
1095                case regex_constants::syntax_open_mark:
1096                   // Do we have a comment?  If so we need to skip it here...
1097                   if ((m_position + 2 < m_end) && this->m_traits.syntax_type(*(m_position + 1)) == regex_constants::syntax_question
1098                      && this->m_traits.syntax_type(*(m_position + 2)) == regex_constants::syntax_hash)
1099                   {
1100                      while ((m_position != m_end)
1101                         && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark)) {
1102                      }
1103                      contin = true;
1104                   }
1105                   else
1106                      contin = false;
1107                   break;
1108                default:
1109                   contin = false;
1110                }
1111             }
1112             else
1113                contin = false;
1114          } while (contin);
1115       }
1116       re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
1117       pb->index = -3;
1118       pb->icase = this->flags() & regbase::icase;
1119       jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
1120       this->m_pdata->m_data.align();
1121       jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1122       pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1123       pb->index = -3;
1124       pb->icase = this->flags() & regbase::icase;
1125    }
1126    return true;
1127 }
1128 
1129 template <class charT, class traits>
parse_repeat_range(bool isbasic)1130 bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
1131 {
1132    static const char incomplete_message[] = "Missing } in quantified repetition.";
1133    //
1134    // parse a repeat-range:
1135    //
1136    std::size_t min, max;
1137    std::intmax_t v;
1138    // skip whitespace:
1139    while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1140       ++m_position;
1141    if(this->m_position == this->m_end)
1142    {
1143       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1144       {
1145          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1146          return false;
1147       }
1148       // Treat the opening '{' as a literal character, rewind to start of error:
1149       --m_position;
1150       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1151       return parse_literal();
1152    }
1153    // get min:
1154    v = this->m_traits.toi(m_position, m_end, 10);
1155    // skip whitespace:
1156    if((v < 0) || (v > umax()))
1157    {
1158       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1159       {
1160          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1161          return false;
1162       }
1163       // Treat the opening '{' as a literal character, rewind to start of error:
1164       --m_position;
1165       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1166       return parse_literal();
1167    }
1168    while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1169       ++m_position;
1170    if(this->m_position == this->m_end)
1171    {
1172       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1173       {
1174          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1175          return false;
1176       }
1177       // Treat the opening '{' as a literal character, rewind to start of error:
1178       --m_position;
1179       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1180       return parse_literal();
1181    }
1182    min = static_cast<std::size_t>(v);
1183    // see if we have a comma:
1184    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
1185    {
1186       // move on and error check:
1187       ++m_position;
1188       // skip whitespace:
1189       while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1190          ++m_position;
1191       if(this->m_position == this->m_end)
1192       {
1193          if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1194          {
1195             fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1196             return false;
1197          }
1198          // Treat the opening '{' as a literal character, rewind to start of error:
1199          --m_position;
1200          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1201          return parse_literal();
1202       }
1203       // get the value if any:
1204       v = this->m_traits.toi(m_position, m_end, 10);
1205       max = ((v >= 0) && (v < umax())) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
1206    }
1207    else
1208    {
1209       // no comma, max = min:
1210       max = min;
1211    }
1212    // skip whitespace:
1213    while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
1214       ++m_position;
1215    // OK now check trailing }:
1216    if(this->m_position == this->m_end)
1217    {
1218       if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
1219       {
1220          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1221          return false;
1222       }
1223       // Treat the opening '{' as a literal character, rewind to start of error:
1224       --m_position;
1225       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1226       return parse_literal();
1227    }
1228    if(isbasic)
1229    {
1230       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
1231       {
1232          ++m_position;
1233          if(this->m_position == this->m_end)
1234          {
1235             fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1236             return false;
1237          }
1238       }
1239       else
1240       {
1241          fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
1242          return false;
1243       }
1244    }
1245    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
1246       ++m_position;
1247    else
1248    {
1249       // Treat the opening '{' as a literal character, rewind to start of error:
1250       --m_position;
1251       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
1252       return parse_literal();
1253    }
1254    //
1255    // finally go and add the repeat, unless error:
1256    //
1257    if(min > max)
1258    {
1259       // Backtrack to error location:
1260       m_position -= 2;
1261       while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
1262          ++m_position;
1263       fail(regex_constants::error_badbrace, m_position - m_base);
1264       return false;
1265    }
1266    return parse_repeat(min, max);
1267 }
1268 
1269 template <class charT, class traits>
parse_alt()1270 bool basic_regex_parser<charT, traits>::parse_alt()
1271 {
1272    //
1273    // error check: if there have been no previous states,
1274    // or if the last state was a '(' then error:
1275    //
1276    if(
1277       ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
1278       &&
1279       !(
1280          ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
1281            &&
1282          ((this->flags() & regbase::no_empty_expressions) == 0)
1283         )
1284       )
1285    {
1286       fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression cannot start with the alternation operator |.");
1287       return false;
1288    }
1289    //
1290    // Reset mark count if required:
1291    //
1292    if(m_max_mark < m_mark_count)
1293       m_max_mark = m_mark_count;
1294    if(m_mark_reset >= 0)
1295       m_mark_count = m_mark_reset;
1296 
1297    ++m_position;
1298    //
1299    // we need to append a trailing jump:
1300    //
1301    re_syntax_base* pj = this->append_state(BOOST_REGEX_DETAIL_NS::syntax_element_jump, sizeof(re_jump));
1302    std::ptrdiff_t jump_offset = this->getoffset(pj);
1303    //
1304    // now insert the alternative:
1305    //
1306    re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
1307    jump_offset += re_alt_size;
1308    this->m_pdata->m_data.align();
1309    palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
1310    //
1311    // update m_alt_insert_point so that the next alternate gets
1312    // inserted at the start of the second of the two we've just created:
1313    //
1314    this->m_alt_insert_point = this->m_pdata->m_data.size();
1315    //
1316    // the start of this alternative must have a case changes state
1317    // if the current block has messed around with case changes:
1318    //
1319    if(m_has_case_change)
1320    {
1321       static_cast<re_case*>(
1322          this->append_state(syntax_element_toggle_case, sizeof(re_case))
1323          )->icase = this->m_icase;
1324    }
1325    //
1326    // push the alternative onto our stack, a recursive
1327    // implementation here is easier to understand (and faster
1328    // as it happens), but causes all kinds of stack overflow problems
1329    // on programs with small stacks (COM+).
1330    //
1331    m_alt_jumps.push_back(jump_offset);
1332    return true;
1333 }
1334 
1335 template <class charT, class traits>
parse_set()1336 bool basic_regex_parser<charT, traits>::parse_set()
1337 {
1338    static const char incomplete_message[] = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1339    ++m_position;
1340    if(m_position == m_end)
1341    {
1342       fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1343       return false;
1344    }
1345    basic_char_set<charT, traits> char_set;
1346 
1347    const charT* base = m_position;  // where the '[' was
1348    const charT* item_base = m_position;  // where the '[' or '^' was
1349 
1350    while(m_position != m_end)
1351    {
1352       switch(this->m_traits.syntax_type(*m_position))
1353       {
1354       case regex_constants::syntax_caret:
1355          if(m_position == base)
1356          {
1357             char_set.negate();
1358             ++m_position;
1359             item_base = m_position;
1360          }
1361          else
1362             parse_set_literal(char_set);
1363          break;
1364       case regex_constants::syntax_close_set:
1365          if(m_position == item_base)
1366          {
1367             parse_set_literal(char_set);
1368             break;
1369          }
1370          else
1371          {
1372             ++m_position;
1373             if(0 == this->append_set(char_set))
1374             {
1375                fail(regex_constants::error_ctype, m_position - m_base);
1376                return false;
1377             }
1378          }
1379          return true;
1380       case regex_constants::syntax_open_set:
1381          if(parse_inner_set(char_set))
1382             break;
1383          return true;
1384       case regex_constants::syntax_escape:
1385          {
1386             //
1387             // look ahead and see if this is a character class shortcut
1388             // \d \w \s etc...
1389             //
1390             ++m_position;
1391             if(this->m_traits.escape_syntax_type(*m_position)
1392                == regex_constants::escape_type_class)
1393             {
1394                char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1395                if(m != 0)
1396                {
1397                   char_set.add_class(m);
1398                   ++m_position;
1399                   break;
1400                }
1401             }
1402             else if(this->m_traits.escape_syntax_type(*m_position)
1403                == regex_constants::escape_type_not_class)
1404             {
1405                // negated character class:
1406                char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1407                if(m != 0)
1408                {
1409                   char_set.add_negated_class(m);
1410                   ++m_position;
1411                   break;
1412                }
1413             }
1414             // not a character class, just a regular escape:
1415             --m_position;
1416             parse_set_literal(char_set);
1417             break;
1418          }
1419       default:
1420          parse_set_literal(char_set);
1421          break;
1422       }
1423    }
1424    return m_position != m_end;
1425 }
1426 
1427 template <class charT, class traits>
parse_inner_set(basic_char_set<charT,traits> & char_set)1428 bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1429 {
1430    static const char incomplete_message[] = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
1431    //
1432    // we have either a character class [:name:]
1433    // a collating element [.name.]
1434    // or an equivalence class [=name=]
1435    //
1436    if(m_end == ++m_position)
1437    {
1438       fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1439       return false;
1440    }
1441    switch(this->m_traits.syntax_type(*m_position))
1442    {
1443    case regex_constants::syntax_dot:
1444       //
1445       // a collating element is treated as a literal:
1446       //
1447       --m_position;
1448       parse_set_literal(char_set);
1449       return true;
1450    case regex_constants::syntax_colon:
1451       {
1452       // check that character classes are actually enabled:
1453       if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
1454          == (regbase::basic_syntax_group  | regbase::no_char_classes))
1455       {
1456          --m_position;
1457          parse_set_literal(char_set);
1458          return true;
1459       }
1460       // skip the ':'
1461       if(m_end == ++m_position)
1462       {
1463          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1464          return false;
1465       }
1466       const charT* name_first = m_position;
1467       // skip at least one character, then find the matching ':]'
1468       if(m_end == ++m_position)
1469       {
1470          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1471          return false;
1472       }
1473       while((m_position != m_end)
1474          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1475          ++m_position;
1476       const charT* name_last = m_position;
1477       if(m_end == m_position)
1478       {
1479          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1480          return false;
1481       }
1482       if((m_end == ++m_position)
1483          || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1484       {
1485          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1486          return false;
1487       }
1488       //
1489       // check for negated class:
1490       //
1491       bool negated = false;
1492       if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1493       {
1494          ++name_first;
1495          negated = true;
1496       }
1497       typedef typename traits::char_class_type m_type;
1498       m_type m = this->m_traits.lookup_classname(name_first, name_last);
1499       if(m == 0)
1500       {
1501          if(char_set.empty() && (name_last - name_first == 1))
1502          {
1503             // maybe a special case:
1504             ++m_position;
1505             if( (m_position != m_end)
1506                && (this->m_traits.syntax_type(*m_position)
1507                   == regex_constants::syntax_close_set))
1508             {
1509                if(this->m_traits.escape_syntax_type(*name_first)
1510                   == regex_constants::escape_type_left_word)
1511                {
1512                   ++m_position;
1513                   this->append_state(syntax_element_word_start);
1514                   return false;
1515                }
1516                if(this->m_traits.escape_syntax_type(*name_first)
1517                   == regex_constants::escape_type_right_word)
1518                {
1519                   ++m_position;
1520                   this->append_state(syntax_element_word_end);
1521                   return false;
1522                }
1523             }
1524          }
1525          fail(regex_constants::error_ctype, name_first - m_base);
1526          return false;
1527       }
1528       if(!negated)
1529          char_set.add_class(m);
1530       else
1531          char_set.add_negated_class(m);
1532       ++m_position;
1533       break;
1534    }
1535    case regex_constants::syntax_equal:
1536       {
1537       // skip the '='
1538       if(m_end == ++m_position)
1539       {
1540          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1541          return false;
1542       }
1543       const charT* name_first = m_position;
1544       // skip at least one character, then find the matching '=]'
1545       if(m_end == ++m_position)
1546       {
1547          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1548          return false;
1549       }
1550       while((m_position != m_end)
1551          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1552          ++m_position;
1553       const charT* name_last = m_position;
1554       if(m_end == m_position)
1555       {
1556          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1557          return false;
1558       }
1559       if((m_end == ++m_position)
1560          || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1561       {
1562          fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
1563          return false;
1564       }
1565       string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1566       if(m.empty() || (m.size() > 2))
1567       {
1568          fail(regex_constants::error_collate, name_first - m_base);
1569          return false;
1570       }
1571       digraph<charT> d;
1572       d.first = m[0];
1573       if(m.size() > 1)
1574          d.second = m[1];
1575       else
1576          d.second = 0;
1577       char_set.add_equivalent(d);
1578       ++m_position;
1579       break;
1580    }
1581    default:
1582       --m_position;
1583       parse_set_literal(char_set);
1584       break;
1585    }
1586    return true;
1587 }
1588 
1589 template <class charT, class traits>
parse_set_literal(basic_char_set<charT,traits> & char_set)1590 void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1591 {
1592    digraph<charT> start_range(get_next_set_literal(char_set));
1593    if(m_end == m_position)
1594    {
1595       fail(regex_constants::error_brack, m_position - m_base);
1596       return;
1597    }
1598    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1599    {
1600       // we have a range:
1601       if(m_end == ++m_position)
1602       {
1603          fail(regex_constants::error_brack, m_position - m_base);
1604          return;
1605       }
1606       if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1607       {
1608          digraph<charT> end_range = get_next_set_literal(char_set);
1609          char_set.add_range(start_range, end_range);
1610          if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1611          {
1612             if(m_end == ++m_position)
1613             {
1614                fail(regex_constants::error_brack, m_position - m_base);
1615                return;
1616             }
1617             if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1618             {
1619                // trailing - :
1620                --m_position;
1621                return;
1622             }
1623             fail(regex_constants::error_range, m_position - m_base);
1624             return;
1625          }
1626          return;
1627       }
1628       --m_position;
1629    }
1630    char_set.add_single(start_range);
1631 }
1632 
1633 template <class charT, class traits>
get_next_set_literal(basic_char_set<charT,traits> & char_set)1634 digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1635 {
1636    digraph<charT> result;
1637    switch(this->m_traits.syntax_type(*m_position))
1638    {
1639    case regex_constants::syntax_dash:
1640       if(!char_set.empty())
1641       {
1642          // see if we are at the end of the set:
1643          if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1644          {
1645             fail(regex_constants::error_range, m_position - m_base);
1646             return result;
1647          }
1648          --m_position;
1649       }
1650       result.first = *m_position++;
1651       return result;
1652    case regex_constants::syntax_escape:
1653       // check to see if escapes are supported first:
1654       if(this->flags() & regex_constants::no_escape_in_lists)
1655       {
1656          result = *m_position++;
1657          break;
1658       }
1659       ++m_position;
1660       result = unescape_character();
1661       break;
1662    case regex_constants::syntax_open_set:
1663    {
1664       if(m_end == ++m_position)
1665       {
1666          fail(regex_constants::error_collate, m_position - m_base);
1667          return result;
1668       }
1669       if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1670       {
1671          --m_position;
1672          result.first = *m_position;
1673          ++m_position;
1674          return result;
1675       }
1676       if(m_end == ++m_position)
1677       {
1678          fail(regex_constants::error_collate, m_position - m_base);
1679          return result;
1680       }
1681       const charT* name_first = m_position;
1682       // skip at least one character, then find the matching ':]'
1683       if(m_end == ++m_position)
1684       {
1685          fail(regex_constants::error_collate, name_first - m_base);
1686          return result;
1687       }
1688       while((m_position != m_end)
1689          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1690          ++m_position;
1691       const charT* name_last = m_position;
1692       if(m_end == m_position)
1693       {
1694          fail(regex_constants::error_collate, name_first - m_base);
1695          return result;
1696       }
1697       if((m_end == ++m_position)
1698          || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1699       {
1700          fail(regex_constants::error_collate, name_first - m_base);
1701          return result;
1702       }
1703       ++m_position;
1704       string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1705       if(s.empty() || (s.size() > 2))
1706       {
1707          fail(regex_constants::error_collate, name_first - m_base);
1708          return result;
1709       }
1710       result.first = s[0];
1711       if(s.size() > 1)
1712          result.second = s[1];
1713       else
1714          result.second = 0;
1715       return result;
1716    }
1717    default:
1718       result = *m_position++;
1719    }
1720    return result;
1721 }
1722 
//
// Does the value v fit in the specified charT type?
//
// The two tagged overloads are selected by the two-argument front end at the
// bottom: the "true" one is used when charT is narrower than std::intmax_t,
// the "false" one otherwise.
//
template <class charT>
bool valid_value(charT, std::intmax_t v, const std::integral_constant<bool, true>&)
{
   // charT is narrower than v: the value fits only if nothing is left once
   // the low sizeof(charT) * CHAR_BIT bits have been shifted away.
   const std::size_t char_bits = sizeof(charT) * CHAR_BIT;
   const std::intmax_t excess = v >> char_bits;
   return excess == 0;
}
template <class charT>
bool valid_value(charT, std::intmax_t, const std::integral_constant<bool, false>&)
{
   // charT is at least as wide as std::intmax_t, so v will always fit.
   return true;
}
template <class charT>
bool valid_value(charT c, std::intmax_t v)
{
   // Dispatch on the relative sizes so that the shift above is never
   // instantiated with a count as large as the width of std::intmax_t.
   typedef std::integral_constant<bool, (sizeof(charT) < sizeof(std::intmax_t))> narrower_than_intmax;
   return valid_value(c, v, narrower_than_intmax());
}
1741 
1742 template <class charT, class traits>
1743 charT basic_regex_parser<charT, traits>::unescape_character()
1744 {
1745 #ifdef BOOST_REGEX_MSVC
1746 #pragma warning(push)
1747 #pragma warning(disable:4127)
1748 #endif
1749    charT result(0);
1750    if(m_position == m_end)
1751    {
1752       fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
1753       return false;
1754    }
1755    switch(this->m_traits.escape_syntax_type(*m_position))
1756    {
1757    case regex_constants::escape_type_control_a:
1758       result = charT('\a');
1759       break;
1760    case regex_constants::escape_type_e:
1761       result = charT(27);
1762       break;
1763    case regex_constants::escape_type_control_f:
1764       result = charT('\f');
1765       break;
1766    case regex_constants::escape_type_control_n:
1767       result = charT('\n');
1768       break;
1769    case regex_constants::escape_type_control_r:
1770       result = charT('\r');
1771       break;
1772    case regex_constants::escape_type_control_t:
1773       result = charT('\t');
1774       break;
1775    case regex_constants::escape_type_control_v:
1776       result = charT('\v');
1777       break;
1778    case regex_constants::escape_type_word_assert:
1779       result = charT('\b');
1780       break;
1781    case regex_constants::escape_type_ascii_control:
1782       ++m_position;
1783       if(m_position == m_end)
1784       {
1785          // Rewind to start of escape:
1786          --m_position;
1787          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1788          fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
1789          return result;
1790       }
1791       result = static_cast<charT>(*m_position % 32);
1792       break;
1793    case regex_constants::escape_type_hex:
1794       ++m_position;
1795       if(m_position == m_end)
1796       {
1797          // Rewind to start of escape:
1798          --m_position;
1799          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1800          fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
1801          return result;
1802       }
1803       // maybe have \x{ddd}
1804       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1805       {
1806          ++m_position;
1807          if(m_position == m_end)
1808          {
1809             // Rewind to start of escape:
1810             --m_position;
1811             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1812             fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
1813             return result;
1814          }
1815          std::intmax_t i = this->m_traits.toi(m_position, m_end, 16);
1816          if((m_position == m_end)
1817             || (i < 0)
1818             || ((std::numeric_limits<charT>::is_specialized) && (i > (std::intmax_t)(std::numeric_limits<charT>::max)()))
1819             || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1820          {
1821             // Rewind to start of escape:
1822             --m_position;
1823             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1824             fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
1825             return result;
1826          }
1827          ++m_position;
1828          result = charT(i);
1829       }
1830       else
1831       {
1832          std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
1833          std::intmax_t i = this->m_traits.toi(m_position, m_position + len, 16);
1834          if((i < 0)
1835             || !valid_value(charT(0), i))
1836          {
1837             // Rewind to start of escape:
1838             --m_position;
1839             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1840             fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
1841             return result;
1842          }
1843          result = charT(i);
1844       }
1845       return result;
1846    case regex_constants::syntax_digit:
1847       {
1848       // an octal escape sequence, the first character must be a zero
1849       // followed by up to 3 octal digits:
1850       std::ptrdiff_t len = (std::min)(std::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1851       const charT* bp = m_position;
1852       std::intmax_t val = this->m_traits.toi(bp, bp + 1, 8);
1853       if(val != 0)
1854       {
1855          // Rewind to start of escape:
1856          --m_position;
1857          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1858          // Oops not an octal escape after all:
1859          fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
1860          return result;
1861       }
1862       val = this->m_traits.toi(m_position, m_position + len, 8);
1863       if((val < 0) || (val > (std::intmax_t)(std::numeric_limits<charT>::max)()))
1864       {
1865          // Rewind to start of escape:
1866          --m_position;
1867          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1868          fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
1869          return result;
1870       }
1871       return static_cast<charT>(val);
1872       }
1873    case regex_constants::escape_type_named_char:
1874       {
1875          ++m_position;
1876          if(m_position == m_end)
1877          {
1878             // Rewind to start of escape:
1879             --m_position;
1880             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1881             fail(regex_constants::error_escape, m_position - m_base);
1882             return false;
1883          }
1884          // maybe have \N{name}
1885          if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1886          {
1887             const charT* base = m_position;
1888             // skip forward until we find enclosing brace:
1889             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1890                ++m_position;
1891             if(m_position == m_end)
1892             {
1893                // Rewind to start of escape:
1894                --m_position;
1895                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1896                fail(regex_constants::error_escape, m_position - m_base);
1897                return false;
1898             }
1899             string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1900             if(s.empty())
1901             {
1902                // Rewind to start of escape:
1903                --m_position;
1904                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1905                fail(regex_constants::error_collate, m_position - m_base);
1906                return false;
1907             }
1908             if(s.size() == 1)
1909             {
1910                return s[0];
1911             }
1912          }
1913          // fall through is a failure:
1914          // Rewind to start of escape:
1915          --m_position;
1916          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1917          fail(regex_constants::error_escape, m_position - m_base);
1918          return false;
1919       }
1920    default:
1921       result = *m_position;
1922       break;
1923    }
1924    ++m_position;
1925    return result;
1926 #ifdef BOOST_REGEX_MSVC
1927 #pragma warning(pop)
1928 #endif
1929 }
1930 
1931 template <class charT, class traits>
parse_backref()1932 bool basic_regex_parser<charT, traits>::parse_backref()
1933 {
1934    BOOST_REGEX_ASSERT(m_position != m_end);
1935    const charT* pc = m_position;
1936    std::intmax_t i = this->m_traits.toi(pc, pc + 1, 10);
1937    if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1938    {
1939       // not a backref at all but an octal escape sequence:
1940       charT c = unescape_character();
1941       this->append_literal(c);
1942    }
1943    else if((i > 0))
1944    {
1945       m_position = pc;
1946       re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1947       pb->index = (int)i;
1948       pb->icase = this->flags() & regbase::icase;
1949       if(i > m_max_backref)
1950          m_max_backref = i;
1951    }
1952    else
1953    {
1954       // Rewind to start of escape:
1955       --m_position;
1956       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
1957       fail(regex_constants::error_backref, m_position - m_base);
1958       return false;
1959    }
1960    return true;
1961 }
1962 
1963 template <class charT, class traits>
parse_QE()1964 bool basic_regex_parser<charT, traits>::parse_QE()
1965 {
1966 #ifdef BOOST_REGEX_MSVC
1967 #pragma warning(push)
1968 #pragma warning(disable:4127)
1969 #endif
1970    //
1971    // parse a \Q...\E sequence:
1972    //
1973    ++m_position; // skip the Q
1974    const charT* start = m_position;
1975    const charT* end;
1976    do
1977    {
1978       while((m_position != m_end)
1979          && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1980          ++m_position;
1981       if(m_position == m_end)
1982       {
1983          //  a \Q...\E sequence may terminate with the end of the expression:
1984          end = m_position;
1985          break;
1986       }
1987       if(++m_position == m_end) // skip the escape
1988       {
1989          fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
1990          return false;
1991       }
1992       // check to see if it's a \E:
1993       if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
1994       {
1995          ++m_position;
1996          end = m_position - 2;
1997          break;
1998       }
1999       // otherwise go round again:
2000    }while(true);
2001    //
2002    // now add all the character between the two escapes as literals:
2003    //
2004    while(start != end)
2005    {
2006       this->append_literal(*start);
2007       ++start;
2008    }
2009    return true;
2010 #ifdef BOOST_REGEX_MSVC
2011 #pragma warning(pop)
2012 #endif
2013 }
2014 
2015 template <class charT, class traits>
parse_perl_extension()2016 bool basic_regex_parser<charT, traits>::parse_perl_extension()
2017 {
2018    if(++m_position == m_end)
2019    {
2020       // Rewind to start of (? sequence:
2021       --m_position;
2022       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2023       fail(regex_constants::error_perl_extension, m_position - m_base);
2024       return false;
2025    }
2026    //
2027    // treat comments as a special case, as these
2028    // are the only ones that don't start with a leading
2029    // startmark state:
2030    //
2031    if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
2032    {
2033       while((m_position != m_end)
2034          && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
2035       {}
2036       return true;
2037    }
2038    //
2039    // backup some state, and prepare the way:
2040    //
2041    int markid = 0;
2042    std::ptrdiff_t jump_offset = 0;
2043    re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
2044    pb->icase = this->flags() & regbase::icase;
2045    std::ptrdiff_t last_paren_start = this->getoffset(pb);
2046    // back up insertion point for alternations, and set new point:
2047    std::ptrdiff_t last_alt_point = m_alt_insert_point;
2048    this->m_pdata->m_data.align();
2049    m_alt_insert_point = this->m_pdata->m_data.size();
2050    std::ptrdiff_t expected_alt_point = m_alt_insert_point;
2051    bool restore_flags = true;
2052    regex_constants::syntax_option_type old_flags = this->flags();
2053    bool old_case_change = m_has_case_change;
2054    m_has_case_change = false;
2055    charT name_delim;
2056    int mark_reset = m_mark_reset;
2057    int max_mark = m_max_mark;
2058    m_mark_reset = -1;
2059    m_max_mark = m_mark_count;
2060    std::intmax_t v;
2061    //
2062    // select the actual extension used:
2063    //
2064    switch(this->m_traits.syntax_type(*m_position))
2065    {
2066    case regex_constants::syntax_or:
2067       m_mark_reset = m_mark_count;
2068       BOOST_REGEX_FALLTHROUGH;
2069    case regex_constants::syntax_colon:
2070       //
2071       // a non-capturing mark:
2072       //
2073       pb->index = markid = 0;
2074       ++m_position;
2075       break;
2076    case regex_constants::syntax_digit:
2077       {
2078       //
2079       // a recursive subexpression:
2080       //
2081       v = this->m_traits.toi(m_position, m_end, 10);
2082       if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2083       {
2084          // Rewind to start of (? sequence:
2085          --m_position;
2086          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2087          fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
2088          return false;
2089       }
2090 insert_recursion:
2091       pb->index = markid = 0;
2092       re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
2093       pr->alt.i = (std::ptrdiff_t)v;
2094       pr->state_id = 0;
2095       static_cast<re_case*>(
2096             this->append_state(syntax_element_toggle_case, sizeof(re_case))
2097             )->icase = this->flags() & regbase::icase;
2098       break;
2099       }
2100    case regex_constants::syntax_plus:
2101       //
2102       // A forward-relative recursive subexpression:
2103       //
2104       ++m_position;
2105       v = this->m_traits.toi(m_position, m_end, 10);
2106       if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2107       {
2108          // Rewind to start of (? sequence:
2109          --m_position;
2110          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2111          fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2112          return false;
2113       }
2114       if ((std::numeric_limits<std::intmax_t>::max)() - m_mark_count < v)
2115       {
2116          fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2117          return false;
2118       }
2119       v += m_mark_count;
2120       goto insert_recursion;
2121    case regex_constants::syntax_dash:
2122       //
2123       // Possibly a backward-relative recursive subexpression:
2124       //
2125       ++m_position;
2126       v = this->m_traits.toi(m_position, m_end, 10);
2127       if(v <= 0)
2128       {
2129          --m_position;
2130          // Oops not a relative recursion at all, but a (?-imsx) group:
2131          goto option_group_jump;
2132       }
2133       v = static_cast<std::intmax_t>(m_mark_count) + 1 - v;
2134       if(v <= 0)
2135       {
2136          // Rewind to start of (? sequence:
2137          --m_position;
2138          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2139          fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
2140          return false;
2141       }
2142       goto insert_recursion;
2143    case regex_constants::syntax_equal:
2144       pb->index = markid = -1;
2145       ++m_position;
2146       jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2147       this->m_pdata->m_data.align();
2148       m_alt_insert_point = this->m_pdata->m_data.size();
2149       break;
2150    case regex_constants::syntax_not:
2151       pb->index = markid = -2;
2152       ++m_position;
2153       jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2154       this->m_pdata->m_data.align();
2155       m_alt_insert_point = this->m_pdata->m_data.size();
2156       break;
2157    case regex_constants::escape_type_left_word:
2158       {
2159          // a lookbehind assertion:
2160          if(++m_position == m_end)
2161          {
2162             // Rewind to start of (? sequence:
2163             --m_position;
2164             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2165             fail(regex_constants::error_perl_extension, m_position - m_base);
2166             return false;
2167          }
2168          regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
2169          if(t == regex_constants::syntax_not)
2170             pb->index = markid = -2;
2171          else if(t == regex_constants::syntax_equal)
2172             pb->index = markid = -1;
2173          else
2174          {
2175             // Probably a named capture which also starts (?< :
2176             name_delim = '>';
2177             --m_position;
2178             goto named_capture_jump;
2179          }
2180          ++m_position;
2181          jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2182          this->append_state(syntax_element_backstep, sizeof(re_brace));
2183          this->m_pdata->m_data.align();
2184          m_alt_insert_point = this->m_pdata->m_data.size();
2185          break;
2186       }
2187    case regex_constants::escape_type_right_word:
2188       //
2189       // an independent sub-expression:
2190       //
2191       pb->index = markid = -3;
2192       ++m_position;
2193       jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
2194       this->m_pdata->m_data.align();
2195       m_alt_insert_point = this->m_pdata->m_data.size();
2196       break;
2197    case regex_constants::syntax_open_mark:
2198       {
2199       // a conditional expression:
2200       pb->index = markid = -4;
2201       if(++m_position == m_end)
2202       {
2203          // Rewind to start of (? sequence:
2204          --m_position;
2205          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2206          fail(regex_constants::error_perl_extension, m_position - m_base);
2207          return false;
2208       }
2209       v = this->m_traits.toi(m_position, m_end, 10);
2210       if(m_position == m_end)
2211       {
2212          // Rewind to start of (? sequence:
2213          --m_position;
2214          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2215          fail(regex_constants::error_perl_extension, m_position - m_base);
2216          return false;
2217       }
2218       if(*m_position == charT('R'))
2219       {
2220          if(++m_position == m_end)
2221          {
2222             // Rewind to start of (? sequence:
2223             --m_position;
2224             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2225             fail(regex_constants::error_perl_extension, m_position - m_base);
2226             return false;
2227          }
2228          if(*m_position == charT('&'))
2229          {
2230             const charT* base = ++m_position;
2231             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2232                ++m_position;
2233             if(m_position == m_end)
2234             {
2235                // Rewind to start of (? sequence:
2236                --m_position;
2237                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2238                fail(regex_constants::error_perl_extension, m_position - m_base);
2239                return false;
2240             }
2241             v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
2242          }
2243          else
2244          {
2245             v = -this->m_traits.toi(m_position, m_end, 10);
2246          }
2247          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2248          br->index = v < 0 ? (int)(v - 1) : 0;
2249          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2250          {
2251             // Rewind to start of (? sequence:
2252             --m_position;
2253             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2254             fail(regex_constants::error_perl_extension, m_position - m_base);
2255             return false;
2256          }
2257          if(++m_position == m_end)
2258          {
2259             // Rewind to start of (? sequence:
2260             --m_position;
2261             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2262             fail(regex_constants::error_perl_extension, m_position - m_base);
2263             return false;
2264          }
2265       }
2266       else if((*m_position == charT('\'')) || (*m_position == charT('<')))
2267       {
2268          const charT* base = ++m_position;
2269          while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
2270             ++m_position;
2271          if(m_position == m_end)
2272          {
2273             // Rewind to start of (? sequence:
2274             --m_position;
2275             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2276             fail(regex_constants::error_perl_extension, m_position - m_base);
2277             return false;
2278          }
2279          v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2280          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2281          br->index = (int)v;
2282          if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
2283          {
2284             // Rewind to start of (? sequence:
2285             --m_position;
2286             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2287             fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
2288             return false;
2289          }
2290          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2291          {
2292             // Rewind to start of (? sequence:
2293             --m_position;
2294             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2295             fail(regex_constants::error_perl_extension, m_position - m_base);
2296             return false;
2297          }
2298          if(++m_position == m_end)
2299          {
2300             // Rewind to start of (? sequence:
2301             --m_position;
2302             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2303             fail(regex_constants::error_perl_extension, m_position - m_base);
2304             return false;
2305          }
2306       }
2307       else if(*m_position == charT('D'))
2308       {
2309          const char* def = "DEFINE";
2310          while(*def && (m_position != m_end) && (*m_position == charT(*def)))
2311             ++m_position, ++def;
2312          if((m_position == m_end) || *def)
2313          {
2314             // Rewind to start of (? sequence:
2315             --m_position;
2316             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2317             fail(regex_constants::error_perl_extension, m_position - m_base);
2318             return false;
2319          }
2320          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2321          br->index = 9999; // special magic value!
2322          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2323          {
2324             // Rewind to start of (? sequence:
2325             --m_position;
2326             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2327             fail(regex_constants::error_perl_extension, m_position - m_base);
2328             return false;
2329          }
2330          if(++m_position == m_end)
2331          {
2332             // Rewind to start of (? sequence:
2333             --m_position;
2334             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2335             fail(regex_constants::error_perl_extension, m_position - m_base);
2336             return false;
2337          }
2338       }
2339       else if(v > 0)
2340       {
2341          re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
2342          br->index = (int)v;
2343          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2344          {
2345             // Rewind to start of (? sequence:
2346             --m_position;
2347             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2348             fail(regex_constants::error_perl_extension, m_position - m_base);
2349             return false;
2350          }
2351          if(++m_position == m_end)
2352          {
2353             // Rewind to start of (? sequence:
2354             --m_position;
2355             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2356             fail(regex_constants::error_perl_extension, m_position - m_base);
2357             return false;
2358          }
2359       }
2360       else
2361       {
2362          // verify that we have a lookahead or lookbehind assert:
2363          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
2364          {
2365             // Rewind to start of (? sequence:
2366             --m_position;
2367             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2368             fail(regex_constants::error_perl_extension, m_position - m_base);
2369             return false;
2370          }
2371          if(++m_position == m_end)
2372          {
2373             // Rewind to start of (? sequence:
2374             --m_position;
2375             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2376             fail(regex_constants::error_perl_extension, m_position - m_base);
2377             return false;
2378          }
2379          if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
2380          {
2381             if(++m_position == m_end)
2382             {
2383                // Rewind to start of (? sequence:
2384                --m_position;
2385                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2386                fail(regex_constants::error_perl_extension, m_position - m_base);
2387                return false;
2388             }
2389             if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2390                && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2391             {
2392                // Rewind to start of (? sequence:
2393                --m_position;
2394                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2395                fail(regex_constants::error_perl_extension, m_position - m_base);
2396                return false;
2397             }
2398             m_position -= 3;
2399          }
2400          else
2401          {
2402             if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
2403                && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
2404             {
2405                // Rewind to start of (? sequence:
2406                --m_position;
2407                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2408                fail(regex_constants::error_perl_extension, m_position - m_base);
2409                return false;
2410             }
2411             m_position -= 2;
2412          }
2413       }
2414       break;
2415       }
2416    case regex_constants::syntax_close_mark:
2417       // Rewind to start of (? sequence:
2418       --m_position;
2419       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2420       fail(regex_constants::error_perl_extension, m_position - m_base);
2421       return false;
2422    case regex_constants::escape_type_end_buffer:
2423       {
2424       name_delim = *m_position;
2425 named_capture_jump:
2426       markid = 0;
2427       if(0 == (this->flags() & regbase::nosubs))
2428       {
2429          markid = ++m_mark_count;
2430          if(this->flags() & regbase::save_subexpression_location)
2431             this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
2432       }
2433       pb->index = markid;
2434       const charT* base = ++m_position;
2435       if(m_position == m_end)
2436       {
2437          // Rewind to start of (? sequence:
2438          --m_position;
2439          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2440          fail(regex_constants::error_perl_extension, m_position - m_base);
2441          return false;
2442       }
2443       while((m_position != m_end) && (*m_position != name_delim))
2444          ++m_position;
2445       if(m_position == m_end)
2446       {
2447          // Rewind to start of (? sequence:
2448          --m_position;
2449          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2450          fail(regex_constants::error_perl_extension, m_position - m_base);
2451          return false;
2452       }
2453       this->m_pdata->set_name(base, m_position, markid);
2454       ++m_position;
2455       break;
2456       }
2457    default:
2458       if(*m_position == charT('R'))
2459       {
2460          ++m_position;
2461          v = 0;
2462          if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
2463          {
2464             // Rewind to start of (? sequence:
2465             --m_position;
2466             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2467             fail(regex_constants::error_perl_extension, m_position - m_base);
2468             return false;
2469          }
2470          goto insert_recursion;
2471       }
2472       if(*m_position == charT('&'))
2473       {
2474          ++m_position;
2475          const charT* base = m_position;
2476          while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2477             ++m_position;
2478          if(m_position == m_end)
2479          {
2480             // Rewind to start of (? sequence:
2481             --m_position;
2482             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2483             fail(regex_constants::error_perl_extension, m_position - m_base);
2484             return false;
2485          }
2486          v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2487          goto insert_recursion;
2488       }
2489       if(*m_position == charT('P'))
2490       {
2491          ++m_position;
2492          if(m_position == m_end)
2493          {
2494             // Rewind to start of (? sequence:
2495             --m_position;
2496             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2497             fail(regex_constants::error_perl_extension, m_position - m_base);
2498             return false;
2499          }
2500          if(*m_position == charT('>'))
2501          {
2502             ++m_position;
2503             const charT* base = m_position;
2504             while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2505                ++m_position;
2506             if(m_position == m_end)
2507             {
2508                // Rewind to start of (? sequence:
2509                --m_position;
2510                while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2511                fail(regex_constants::error_perl_extension, m_position - m_base);
2512                return false;
2513             }
2514             v = static_cast<int>(hash_value_from_capture_name(base, m_position));
2515             goto insert_recursion;
2516          }
2517       }
2518       //
2519       // lets assume that we have a (?imsx) group and try and parse it:
2520       //
2521 option_group_jump:
2522       regex_constants::syntax_option_type opts = parse_options();
2523       if(m_position == m_end)
2524       {
2525          // Rewind to start of (? sequence:
2526          --m_position;
2527          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2528          fail(regex_constants::error_perl_extension, m_position - m_base);
2529          return false;
2530       }
2531       // make a note of whether we have a case change:
2532       m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
2533       pb->index = markid = 0;
2534       if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
2535       {
2536          // update flags and carry on as normal:
2537          this->flags(opts);
2538          restore_flags = false;
2539          old_case_change |= m_has_case_change; // defer end of scope by one ')'
2540       }
2541       else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
2542       {
2543          // update flags and carry on until the matching ')' is found:
2544          this->flags(opts);
2545          ++m_position;
2546       }
2547       else
2548       {
2549          // Rewind to start of (? sequence:
2550          --m_position;
2551          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2552          fail(regex_constants::error_perl_extension, m_position - m_base);
2553          return false;
2554       }
2555 
2556       // finally append a case change state if we need it:
2557       if(m_has_case_change)
2558       {
2559          static_cast<re_case*>(
2560             this->append_state(syntax_element_toggle_case, sizeof(re_case))
2561             )->icase = opts & regbase::icase;
2562       }
2563 
2564    }
2565    //
2566    // now recursively add more states, this will terminate when we get to a
2567    // matching ')' :
2568    //
2569    parse_all();
2570    //
2571    // Unwind alternatives:
2572    //
2573    if(0 == unwind_alts(last_paren_start))
2574    {
2575       // Rewind to start of (? sequence:
2576       --m_position;
2577       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2578       fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
2579       return false;
2580    }
2581    //
2582    // we either have a ')' or we have run out of characters prematurely:
2583    //
2584    if(m_position == m_end)
2585    {
2586       // Rewind to start of (? sequence:
2587       --m_position;
2588       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2589       this->fail(regex_constants::error_paren, std::distance(m_base, m_end));
2590       return false;
2591    }
2592    BOOST_REGEX_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
2593    ++m_position;
2594    //
2595    // restore the flags:
2596    //
2597    if(restore_flags)
2598    {
2599       // append a case change state if we need it:
2600       if(m_has_case_change)
2601       {
2602          static_cast<re_case*>(
2603             this->append_state(syntax_element_toggle_case, sizeof(re_case))
2604             )->icase = old_flags & regbase::icase;
2605       }
2606       this->flags(old_flags);
2607    }
2608    //
2609    // set up the jump pointer if we have one:
2610    //
2611    if(jump_offset)
2612    {
2613       this->m_pdata->m_data.align();
2614       re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
2615       jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
2616       if((this->m_last_state == jmp) && (markid != -2))
2617       {
2618          // Oops... we didn't have anything inside the assertion.
2619          // Note we don't get here for negated forward lookahead as (?!)
2620          // does have some uses.
2621          // Rewind to start of (? sequence:
2622          --m_position;
2623          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2624          fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
2625          return false;
2626       }
2627    }
2628    //
2629    // verify that if this is conditional expression, that we do have
2630    // an alternative, if not add one:
2631    //
2632    if(markid == -4)
2633    {
2634       re_syntax_base* b = this->getaddress(expected_alt_point);
2635       // Make sure we have exactly one alternative following this state:
2636       if(b->type != syntax_element_alt)
2637       {
2638          re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
2639          alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
2640       }
2641       else if(((std::ptrdiff_t)this->m_pdata->m_data.size() > (static_cast<re_alt*>(b)->alt.i + this->getoffset(b))) && (static_cast<re_alt*>(b)->alt.i > 0) && this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
2642       {
2643          // Can't have seen more than one alternative:
2644          // Rewind to start of (? sequence:
2645          --m_position;
2646          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2647          fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
2648          return false;
2649       }
2650       else
2651       {
2652          // We must *not* have seen an alternative inside a (DEFINE) block:
2653          b = this->getaddress(b->next.i, b);
2654          if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
2655          {
2656             // Rewind to start of (? sequence:
2657             --m_position;
2658             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2659             fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
2660             return false;
2661          }
2662       }
2663       // check for invalid repetition of next state:
2664       b = this->getaddress(expected_alt_point);
2665       b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
2666       if((b->type != syntax_element_assert_backref)
2667          && (b->type != syntax_element_startmark))
2668       {
2669          // Rewind to start of (? sequence:
2670          --m_position;
2671          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2672          fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
2673          return false;
2674       }
2675    }
2676    //
2677    // append closing parenthesis state:
2678    //
2679    pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
2680    pb->index = markid;
2681    pb->icase = this->flags() & regbase::icase;
2682    this->m_paren_start = last_paren_start;
2683    //
2684    // restore the alternate insertion point:
2685    //
2686    this->m_alt_insert_point = last_alt_point;
2687    //
2688    // and the case change data:
2689    //
2690    m_has_case_change = old_case_change;
2691    //
2692    // And the mark_reset data:
2693    //
2694    if(m_max_mark > m_mark_count)
2695    {
2696       m_mark_count = m_max_mark;
2697    }
2698    m_mark_reset = mark_reset;
2699    m_max_mark = max_mark;
2700 
2701 
2702    if(markid > 0)
2703    {
2704       if(this->flags() & regbase::save_subexpression_location)
2705          this->m_pdata->m_subs.at((std::size_t)markid - 1).second = std::distance(m_base, m_position) - 1;
2706    }
2707    return true;
2708 }
2709 
2710 template <class charT, class traits>
match_verb(const char * verb)2711 bool basic_regex_parser<charT, traits>::match_verb(const char* verb)
2712 {
2713    while(*verb)
2714    {
2715       if(static_cast<charT>(*verb) != *m_position)
2716       {
2717          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2718          fail(regex_constants::error_perl_extension, m_position - m_base);
2719          return false;
2720       }
2721       if(++m_position == m_end)
2722       {
2723          --m_position;
2724          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2725          fail(regex_constants::error_perl_extension, m_position - m_base);
2726          return false;
2727       }
2728       ++verb;
2729    }
2730    return true;
2731 }
2732 
2733 #ifdef BOOST_REGEX_MSVC
2734 #  pragma warning(push)
2735 #if BOOST_REGEX_MSVC >= 1800
2736 #pragma warning(disable:26812)
2737 #endif
2738 #endif
2739 template <class charT, class traits>
parse_perl_verb()2740 bool basic_regex_parser<charT, traits>::parse_perl_verb()
2741 {
2742    if(++m_position == m_end)
2743    {
2744       // Rewind to start of (* sequence:
2745       --m_position;
2746       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2747       fail(regex_constants::error_perl_extension, m_position - m_base);
2748       return false;
2749    }
2750    switch(*m_position)
2751    {
2752    case 'F':
2753       if(++m_position == m_end)
2754       {
2755          // Rewind to start of (* sequence:
2756          --m_position;
2757          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2758          fail(regex_constants::error_perl_extension, m_position - m_base);
2759          return false;
2760       }
2761       if((this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) || match_verb("AIL"))
2762       {
2763          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2764          {
2765             // Rewind to start of (* sequence:
2766             --m_position;
2767             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2768             fail(regex_constants::error_perl_extension, m_position - m_base);
2769             return false;
2770          }
2771          ++m_position;
2772          this->append_state(syntax_element_fail);
2773          return true;
2774       }
2775       break;
2776    case 'A':
2777       if(++m_position == m_end)
2778       {
2779          // Rewind to start of (* sequence:
2780          --m_position;
2781          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2782          fail(regex_constants::error_perl_extension, m_position - m_base);
2783          return false;
2784       }
2785       if(match_verb("CCEPT"))
2786       {
2787          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2788          {
2789             // Rewind to start of (* sequence:
2790             --m_position;
2791             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2792             fail(regex_constants::error_perl_extension, m_position - m_base);
2793             return false;
2794          }
2795          ++m_position;
2796          this->append_state(syntax_element_accept);
2797          return true;
2798       }
2799       break;
2800    case 'C':
2801       if(++m_position == m_end)
2802       {
2803          // Rewind to start of (* sequence:
2804          --m_position;
2805          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2806          fail(regex_constants::error_perl_extension, m_position - m_base);
2807          return false;
2808       }
2809       if(match_verb("OMMIT"))
2810       {
2811          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2812          {
2813             // Rewind to start of (* sequence:
2814             --m_position;
2815             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2816             fail(regex_constants::error_perl_extension, m_position - m_base);
2817             return false;
2818          }
2819          ++m_position;
2820          static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_commit;
2821          this->m_pdata->m_disable_match_any = true;
2822          return true;
2823       }
2824       break;
2825    case 'P':
2826       if(++m_position == m_end)
2827       {
2828          // Rewind to start of (* sequence:
2829          --m_position;
2830          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2831          fail(regex_constants::error_perl_extension, m_position - m_base);
2832          return false;
2833       }
2834       if(match_verb("RUNE"))
2835       {
2836          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2837          {
2838             // Rewind to start of (* sequence:
2839             --m_position;
2840             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2841             fail(regex_constants::error_perl_extension, m_position - m_base);
2842             return false;
2843          }
2844          ++m_position;
2845          static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_prune;
2846          this->m_pdata->m_disable_match_any = true;
2847          return true;
2848       }
2849       break;
2850    case 'S':
2851       if(++m_position == m_end)
2852       {
2853          // Rewind to start of (* sequence:
2854          --m_position;
2855          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2856          fail(regex_constants::error_perl_extension, m_position - m_base);
2857          return false;
2858       }
2859       if(match_verb("KIP"))
2860       {
2861          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2862          {
2863             // Rewind to start of (* sequence:
2864             --m_position;
2865             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2866             fail(regex_constants::error_perl_extension, m_position - m_base);
2867             return false;
2868          }
2869          ++m_position;
2870          static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_skip;
2871          this->m_pdata->m_disable_match_any = true;
2872          return true;
2873       }
2874       break;
2875    case 'T':
2876       if(++m_position == m_end)
2877       {
2878          // Rewind to start of (* sequence:
2879          --m_position;
2880          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2881          fail(regex_constants::error_perl_extension, m_position - m_base);
2882          return false;
2883       }
2884       if(match_verb("HEN"))
2885       {
2886          if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
2887          {
2888             // Rewind to start of (* sequence:
2889             --m_position;
2890             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2891             fail(regex_constants::error_perl_extension, m_position - m_base);
2892             return false;
2893          }
2894          ++m_position;
2895          this->append_state(syntax_element_then);
2896          this->m_pdata->m_disable_match_any = true;
2897          return true;
2898       }
2899       break;
2900    }
2901    // Rewind to start of (* sequence:
2902    --m_position;
2903    while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
2904    fail(regex_constants::error_perl_extension, m_position - m_base);
2905    return false;
2906 }
2907 #ifdef BOOST_REGEX_MSVC
2908 #  pragma warning(pop)
2909 #endif
2910 
2911 template <class charT, class traits>
add_emacs_code(bool negate)2912 bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
2913 {
2914    //
2915    // parses an emacs style \sx or \Sx construct.
2916    //
2917    if(++m_position == m_end)
2918    {
2919       // Rewind to start of sequence:
2920       --m_position;
2921       while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
2922       fail(regex_constants::error_escape, m_position - m_base);
2923       return false;
2924    }
2925    basic_char_set<charT, traits> char_set;
2926    if(negate)
2927       char_set.negate();
2928 
2929    static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
2930 
2931    switch(*m_position)
2932    {
2933    case 's':
2934    case ' ':
2935       char_set.add_class(this->m_mask_space);
2936       break;
2937    case 'w':
2938       char_set.add_class(this->m_word_mask);
2939       break;
2940    case '_':
2941       char_set.add_single(digraph<charT>(charT('$')));
2942       char_set.add_single(digraph<charT>(charT('&')));
2943       char_set.add_single(digraph<charT>(charT('*')));
2944       char_set.add_single(digraph<charT>(charT('+')));
2945       char_set.add_single(digraph<charT>(charT('-')));
2946       char_set.add_single(digraph<charT>(charT('_')));
2947       char_set.add_single(digraph<charT>(charT('<')));
2948       char_set.add_single(digraph<charT>(charT('>')));
2949       break;
2950    case '.':
2951       char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
2952       break;
2953    case '(':
2954       char_set.add_single(digraph<charT>(charT('(')));
2955       char_set.add_single(digraph<charT>(charT('[')));
2956       char_set.add_single(digraph<charT>(charT('{')));
2957       break;
2958    case ')':
2959       char_set.add_single(digraph<charT>(charT(')')));
2960       char_set.add_single(digraph<charT>(charT(']')));
2961       char_set.add_single(digraph<charT>(charT('}')));
2962       break;
2963    case '"':
2964       char_set.add_single(digraph<charT>(charT('"')));
2965       char_set.add_single(digraph<charT>(charT('\'')));
2966       char_set.add_single(digraph<charT>(charT('`')));
2967       break;
2968    case '\'':
2969       char_set.add_single(digraph<charT>(charT('\'')));
2970       char_set.add_single(digraph<charT>(charT(',')));
2971       char_set.add_single(digraph<charT>(charT('#')));
2972       break;
2973    case '<':
2974       char_set.add_single(digraph<charT>(charT(';')));
2975       break;
2976    case '>':
2977       char_set.add_single(digraph<charT>(charT('\n')));
2978       char_set.add_single(digraph<charT>(charT('\f')));
2979       break;
2980    default:
2981       fail(regex_constants::error_ctype, m_position - m_base);
2982       return false;
2983    }
2984    if(0 == this->append_set(char_set))
2985    {
2986       fail(regex_constants::error_ctype, m_position - m_base);
2987       return false;
2988    }
2989    ++m_position;
2990    return true;
2991 }
2992 
2993 template <class charT, class traits>
parse_options()2994 regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
2995 {
2996    // we have a (?imsx-imsx) group, convert it into a set of flags:
2997    regex_constants::syntax_option_type f = this->flags();
2998    bool breakout = false;
2999    do
3000    {
3001       switch(*m_position)
3002       {
3003       case 's':
3004          f |= regex_constants::mod_s;
3005          f &= ~regex_constants::no_mod_s;
3006          break;
3007       case 'm':
3008          f &= ~regex_constants::no_mod_m;
3009          break;
3010       case 'i':
3011          f |= regex_constants::icase;
3012          break;
3013       case 'x':
3014          f |= regex_constants::mod_x;
3015          break;
3016       default:
3017          breakout = true;
3018          continue;
3019       }
3020       if(++m_position == m_end)
3021       {
3022          // Rewind to start of (? sequence:
3023          --m_position;
3024          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3025          fail(regex_constants::error_paren, m_position - m_base);
3026          return false;
3027       }
3028    }
3029    while(!breakout);
3030 
3031    breakout = false;
3032 
3033    if(*m_position == static_cast<charT>('-'))
3034    {
3035       if(++m_position == m_end)
3036       {
3037          // Rewind to start of (? sequence:
3038          --m_position;
3039          while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3040          fail(regex_constants::error_paren, m_position - m_base);
3041          return false;
3042       }
3043       do
3044       {
3045          switch(*m_position)
3046          {
3047          case 's':
3048             f &= ~regex_constants::mod_s;
3049             f |= regex_constants::no_mod_s;
3050             break;
3051          case 'm':
3052             f |= regex_constants::no_mod_m;
3053             break;
3054          case 'i':
3055             f &= ~regex_constants::icase;
3056             break;
3057          case 'x':
3058             f &= ~regex_constants::mod_x;
3059             break;
3060          default:
3061             breakout = true;
3062             continue;
3063          }
3064          if(++m_position == m_end)
3065          {
3066             // Rewind to start of (? sequence:
3067             --m_position;
3068             while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
3069             fail(regex_constants::error_paren, m_position - m_base);
3070             return false;
3071          }
3072       }
3073       while(!breakout);
3074    }
3075    return f;
3076 }
3077 
3078 template <class charT, class traits>
unwind_alts(std::ptrdiff_t last_paren_start)3079 bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
3080 {
3081    //
3082    // If we didn't actually add any states after the last
3083    // alternative then that's an error:
3084    //
3085    if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
3086       && (!m_alt_jumps.empty()) && (m_alt_jumps.back() > last_paren_start)
3087       &&
3088       !(
3089          ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
3090            &&
3091          ((this->flags() & regbase::no_empty_expressions) == 0)
3092         )
3093       )
3094    {
3095       fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
3096       return false;
3097    }
3098    //
3099    // Fix up our alternatives:
3100    //
3101    while((!m_alt_jumps.empty()) && (m_alt_jumps.back() > last_paren_start))
3102    {
3103       //
3104       // fix up the jump to point to the end of the states
3105       // that we've just added:
3106       //
3107       std::ptrdiff_t jump_offset = m_alt_jumps.back();
3108       m_alt_jumps.pop_back();
3109       this->m_pdata->m_data.align();
3110       re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
3111       if (jmp->type != syntax_element_jump)
3112       {
3113          // Something really bad happened, this used to be an assert,
3114          // but we'll make it an error just in case we should ever get here.
3115          fail(regex_constants::error_unknown, this->m_position - this->m_base, "Internal logic failed while compiling the expression, probably you added a repeat to something non-repeatable!");
3116          return false;
3117       }
3118       jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
3119    }
3120    return true;
3121 }
3122 
3123 #ifdef BOOST_REGEX_MSVC
3124 #pragma warning(pop)
3125 #endif
3126 
3127 } // namespace BOOST_REGEX_DETAIL_NS
3128 } // namespace boost
3129 
3130 #endif
3131