1 /*
2  *
3  * Copyright (c) 2002
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12 #ifndef BOOST_REGEX_MATCHER_HPP
13 #define BOOST_REGEX_MATCHER_HPP
14 
15 #include <boost/regex/v4/iterator_category.hpp>
16 
17 #ifdef BOOST_MSVC
18 #pragma warning(push)
19 #pragma warning(disable: 4103)
20 #endif
21 #ifdef BOOST_HAS_ABI_HEADERS
22 #  include BOOST_ABI_PREFIX
23 #endif
24 #ifdef BOOST_MSVC
25 #pragma warning(pop)
26 #endif
27 
28 #ifdef BOOST_MSVC
29 #  pragma warning(push)
30 #pragma warning(disable : 4251)
31 #if BOOST_MSVC < 1700
32 #     pragma warning(disable : 4231)
33 #endif
34 #  if BOOST_MSVC < 1600
35 #     pragma warning(disable : 4660)
36 #  endif
37 #if BOOST_MSVC < 1910
38 #pragma warning(disable:4800)
39 #endif
40 #endif
41 
42 namespace boost{
43 namespace BOOST_REGEX_DETAIL_NS{
44 
45 //
46 // error checking API:
47 //
verify_options(boost::regex_constants::syntax_option_type,match_flag_type mf)48 inline void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type, match_flag_type mf)
49 {
50    //
51    // can't mix match_extra with POSIX matching rules:
52    //
53    if ((mf & match_extra) && (mf & match_posix))
54    {
55       std::logic_error msg("Usage Error: Can't mix regular expression captures with POSIX matching rules");
56       throw_exception(msg);
57    }
58 }
59 //
60 // function can_start:
61 //
//
// Generic overload: values that fall outside the range covered by the
// lookup table (negative, or >= 1 << CHAR_BIT) are always reported as
// possible start characters; everything else is looked up in map and
// tested against mask.
//
template <class charT>
inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
{
   if(c < static_cast<charT>(0))
      return true;
   if(c >= static_cast<charT>(1 << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
//
// Plain char overload: the character is reinterpreted as unsigned char
// before indexing map, then the entry is tested against mask.
//
inline bool can_start(char c, const unsigned char* map, unsigned char mask)
{
   const unsigned char slot = static_cast<unsigned char>(c);
   return (map[slot] & mask) != 0;
}
//
// signed char overload: the character is reinterpreted as unsigned char
// before indexing map, then the entry is tested against mask.
//
inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
{
   const unsigned char slot = static_cast<unsigned char>(c);
   return (map[slot] & mask) != 0;
}
//
// unsigned char overload: c indexes map directly and the entry is
// tested against mask.
//
inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
{
   const unsigned char entry = map[c];
   return (entry & mask) != 0;
}
//
// unsigned short overload: values of 1 << CHAR_BIT and above are not
// covered by map and are always reported as possible start characters.
//
inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
{
   if(c >= (1 << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
83 #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
84 #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
//
// wchar_t overload: values of 1 << CHAR_BIT and above are not covered
// by map and are always reported as possible start characters.
//
inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
{
   if(c >= static_cast<wchar_t>(1u << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
89 #endif
90 #endif
91 #if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
//
// unsigned int overload: values of 1 << CHAR_BIT and above are not
// covered by map and are always reported as possible start characters.
//
inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
{
   if(c >= static_cast<unsigned int>(1u << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
96 #endif
97 
98 
99 //
// Unfortunately Rogue Wave's standard library appears to have a bug
// in std::basic_string::compare that results in erroneous answers
// in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
// 0x020101). The test case was:
// {39135,0} < {0xff,0}
// which succeeds when it should not.
106 //
107 #ifndef _RWSTD_VER
//
// Three-way comparison of a std::basic_string against a null
// terminated string. An empty p compares equal both to an empty s and
// to an s consisting of exactly one null character; every other case
// is delegated to basic_string::compare.
//
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   const bool p_is_empty = (0 == *p);
   if(p_is_empty && (s.empty() || ((s.size() == 1) && (s[0] == 0))))
      return 0;
   return s.compare(p);
}
118 #else
//
// Generic basic_string overload used when building against the Rogue
// Wave library. An empty p compares equal both to an empty s and to an
// s consisting of exactly one null character; every other case is
// delegated to basic_string::compare.
//
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   if(0 != *p)
      return s.compare(p);
   if(s.empty())
      return 0;
   if((s.size() == 1) && (s[0] == 0))
      return 0;
   return s.compare(p);
}
//
// Narrow string overload: compares via std::strcmp on s.c_str(), so
// the comparison stops at the first null character in either operand.
//
inline int string_compare(const std::string& s, const char* p)
{
   const char* lhs = s.c_str();
   return std::strcmp(lhs, p);
}
131 # ifndef BOOST_NO_WREGEX
//
// Wide string overload: compares via std::wcscmp on s.c_str(), so the
// comparison stops at the first null character in either operand.
//
inline int string_compare(const std::wstring& s, const wchar_t* p)
{
   const wchar_t* lhs = s.c_str();
   return std::wcscmp(lhs, p);
}
134 #endif
135 #endif
//
// Fallback for arbitrary sequences offering size() and operator[].
// Walks both operands while they agree. If s is exhausted the result is
// the negated next character of p (0 when p ends there too), otherwise
// it is the difference of the first pair of characters that disagree.
//
template <class Seq, class C>
inline int string_compare(const Seq& s, const C* p)
{
   std::size_t pos = 0;
   for(; pos < s.size(); ++pos)
   {
      if(!(p[pos] == s[pos]))
         break;
   }
   if(pos == s.size())
      return -static_cast<int>(p[pos]);
   return static_cast<int>(s[pos]) - static_cast<int>(p[pos]);
}
146 # define STR_COMP(s,p) string_compare(s,p)
147 
//
// Returns a pointer to the character immediately following the first
// null terminator found at or after p.
//
template<class charT>
inline const charT* re_skip_past_null(const charT* p)
{
   for(; *p != static_cast<charT>(0); ++p)
   {
      // keep scanning for the terminator
   }
   return p + 1;
}
154 
155 template <class iterator, class charT, class traits_type, class char_classT>
156 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
157                           iterator last,
158                           const re_set_long<char_classT>* set_,
159                           const regex_data<charT, traits_type>& e, bool icase)
160 {
161    const charT* p = reinterpret_cast<const charT*>(set_+1);
162    iterator ptr;
163    unsigned int i;
164    //bool icase = e.m_flags & regex_constants::icase;
165 
166    if(next == last) return next;
167 
168    typedef typename traits_type::string_type traits_string_type;
169    const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
170 
171    // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
172    // referenced
173    (void)traits_inst;
174 
175    // try and match a single character, could be a multi-character
176    // collating element...
177    for(i = 0; i < set_->csingles; ++i)
178    {
179       ptr = next;
180       if(*p == static_cast<charT>(0))
181       {
182          // treat null string as special case:
183          if(traits_inst.translate(*ptr, icase))
184          {
185             ++p;
186             continue;
187          }
188          return set_->isnot ? next : (ptr == next) ? ++next : ptr;
189       }
190       else
191       {
192          while(*p && (ptr != last))
193          {
194             if(traits_inst.translate(*ptr, icase) != *p)
195                break;
196             ++p;
197             ++ptr;
198          }
199 
200          if(*p == static_cast<charT>(0)) // if null we've matched
201             return set_->isnot ? next : (ptr == next) ? ++next : ptr;
202 
203          p = re_skip_past_null(p);     // skip null
204       }
205    }
206 
207    charT col = traits_inst.translate(*next, icase);
208 
209 
210    if(set_->cranges || set_->cequivalents)
211    {
212       traits_string_type s1;
213       //
214       // try and match a range, NB only a single character can match
215       if(set_->cranges)
216       {
217          if((e.m_flags & regex_constants::collate) == 0)
218             s1.assign(1, col);
219          else
220          {
221             charT a[2] = { col, charT(0), };
222             s1 = traits_inst.transform(a, a + 1);
223          }
224          for(i = 0; i < set_->cranges; ++i)
225          {
226             if(STR_COMP(s1, p) >= 0)
227             {
228                do{ ++p; }while(*p);
229                ++p;
230                if(STR_COMP(s1, p) <= 0)
231                   return set_->isnot ? next : ++next;
232             }
233             else
234             {
235                // skip first string
236                do{ ++p; }while(*p);
237                ++p;
238             }
239             // skip second string
240             do{ ++p; }while(*p);
241             ++p;
242          }
243       }
244       //
245       // try and match an equivalence class, NB only a single character can match
246       if(set_->cequivalents)
247       {
248          charT a[2] = { col, charT(0), };
249          s1 = traits_inst.transform_primary(a, a +1);
250          for(i = 0; i < set_->cequivalents; ++i)
251          {
252             if(STR_COMP(s1, p) == 0)
253                return set_->isnot ? next : ++next;
254             // skip string
255             do{ ++p; }while(*p);
256             ++p;
257          }
258       }
259    }
260    if(traits_inst.isctype(col, set_->cclasses) == true)
261       return set_->isnot ? next : ++next;
262    if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
263       return set_->isnot ? next : ++next;
264    return set_->isnot ? ++next : next;
265 }
266 
//
// Scoped counter for one repeat in the state machine.
//
// Objects link themselves into a chain on construction (the new object
// becomes *stack and remembers the previous head in next) and restore
// the previous head on destruction. A new counter whose id is not
// greater than a non-negative id at the head of the chain searches the
// chain for an earlier counter with the same id and, if one is found,
// resumes from its count and start position.
//
template <class BidiIterator>
class repeater_count
{
   repeater_count** stack;   // location of the pointer heading the chain
   repeater_count* next;     // previous head of the chain, 0 for the root object
   int state_id;             // id of the repeat being counted, -1 for the root object
   std::size_t count;        // the number of iterations so far
   BidiIterator start_pos;   // where the last repeat started

   //
   // Follows the chain from p looking for an entry whose id equals n.
   // Gives up with 0 on meeting an entry whose id is
   // -2 - current_recursion_id, or on running off the end of the chain.
   //
   repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
   {
      while(p)
      {
         if(p->state_id == n)
            break;
         if(p->state_id == -2 - current_recursion_id)
            return 0;
         p = p->next;
         if(p && (p->state_id < 0))
         {
            p = unwind_until(p->state_id, p, current_recursion_id);
            if(!p)
               return 0;
            p = p->next;
         }
      }
      return p;
   }

public:
   // root object: records s but does not link itself into the chain.
   repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {}

   // pushes a counter for repeat i onto the chain headed by *s.
   repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
      : stack(s), next(*s), state_id(i), count(0), start_pos(start)
   {
      *stack = this;

      // a greater id than a non-negative predecessor starts from zero:
      const bool starts_fresh = (state_id > next->state_id) && (next->state_id >= 0);
      if(starts_fresh)
         return;

      // otherwise pick up where an earlier counter with our id left off:
      repeater_count* earlier = unwind_until(state_id, next, current_recursion_id);
      if(earlier)
      {
         count = earlier->count;
         start_pos = earlier->start_pos;
      }
   }

   // pops this object off the chain; the root object (next == 0) leaves it alone.
   ~repeater_count()
   {
      if(next)
         *stack = next;
   }

   std::size_t get_count() { return count; }
   int get_id() { return state_id; }
   std::size_t operator++() { return ++count; }

   //
   // Called when a new repeat is about to start. If at least one
   // iteration has been counted and pos equals the saved start position
   // (the last iteration was NULL) the count jumps to max and true is
   // returned; otherwise pos is saved and false is returned.
   //
   bool check_null_repeat(const BidiIterator& pos, std::size_t max)
   {
      const bool last_was_null = (count != 0) && (pos == start_pos);
      if(last_was_null)
         count = max;
      else
         start_pos = pos;
      return last_was_null;
   }
};
339 
340 struct saved_state;
341 
//
// Numeric tags for the different kinds of saved state.
// NOTE(review): presumably each tag selects one of the unwind_*
// handlers declared in perl_matcher - confirm against the
// non-recursive implementation. saved_state_count is one more than
// the largest tag.
//
enum saved_state_type
{
   saved_type_end                     = 0,
   saved_type_paren                   = 1,
   saved_type_recurse                 = 2,
   saved_type_assertion               = 3,
   saved_state_alt                    = 4,
   saved_state_repeater_count         = 5,
   saved_state_extra_block            = 6,
   saved_state_greedy_single_repeat   = 7,
   saved_state_rep_slow_dot           = 8,
   saved_state_rep_fast_dot           = 9,
   saved_state_rep_char               = 10,
   saved_state_rep_short_set          = 11,
   saved_state_rep_long_set           = 12,
   saved_state_non_greedy_long_repeat = 13,
   saved_state_count                  = 14
};
360 
361 #ifdef BOOST_MSVC
362 #  pragma warning(push)
363 #if BOOST_MSVC >= 1800
364 #pragma warning(disable:26495)
365 #endif
366 #endif
367 template <class Results>
368 struct recursion_info
369 {
370    typedef typename Results::value_type value_type;
371    typedef typename value_type::iterator iterator;
372    int idx;
373    const re_syntax_base* preturn_address;
374    Results results;
375    repeater_count<iterator>* repeater_stack;
376    iterator location_of_start;
377 };
378 #ifdef BOOST_MSVC
379 #  pragma warning(pop)
380 #endif
381 
382 template <class BidiIterator, class Allocator, class traits>
383 class perl_matcher
384 {
385 public:
386    typedef typename traits::char_type char_type;
387    typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
388    typedef bool (self_type::*matcher_proc_type)();
389    typedef std::size_t traits_size_type;
390    typedef typename is_byte<char_type>::width_type width_type;
391    typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
392    typedef match_results<BidiIterator, Allocator> results_type;
393 
perl_matcher(BidiIterator first,BidiIterator end,match_results<BidiIterator,Allocator> & what,const basic_regex<char_type,traits> & e,match_flag_type f,BidiIterator l_base)394    perl_matcher(BidiIterator first, BidiIterator end,
395       match_results<BidiIterator, Allocator>& what,
396       const basic_regex<char_type, traits>& e,
397       match_flag_type f,
398       BidiIterator l_base)
399       :  m_result(what), base(first), last(end),
400          position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
401          m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
402 #ifdef BOOST_REGEX_NON_RECURSIVE
403       , m_recursions(0)
404 #endif
405    {
406       construct_init(e, f);
407    }
408 
409    bool match();
410    bool find();
411 
setf(match_flag_type f)412    void setf(match_flag_type f)
413    { m_match_flags |= f; }
unsetf(match_flag_type f)414    void unsetf(match_flag_type f)
415    { m_match_flags &= ~f; }
416 
417 private:
418    void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
419 
420    bool find_imp();
421    bool match_imp();
422 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
423    typedef bool (perl_matcher::*protected_proc_type)();
424    bool protected_call(protected_proc_type);
425 #endif
426    void estimate_max_state_count(std::random_access_iterator_tag*);
427    void estimate_max_state_count(void*);
428    bool match_prefix();
429    bool match_all_states();
430 
431    // match procs, stored in s_match_vtable:
432    bool match_startmark();
433    bool match_endmark();
434    bool match_literal();
435    bool match_start_line();
436    bool match_end_line();
437    bool match_wild();
438    bool match_match();
439    bool match_word_boundary();
440    bool match_within_word();
441    bool match_word_start();
442    bool match_word_end();
443    bool match_buffer_start();
444    bool match_buffer_end();
445    bool match_backref();
446    bool match_long_set();
447    bool match_set();
448    bool match_jump();
449    bool match_alt();
450    bool match_rep();
451    bool match_combining();
452    bool match_soft_buffer_end();
453    bool match_restart_continue();
454    bool match_long_set_repeat();
455    bool match_set_repeat();
456    bool match_char_repeat();
457    bool match_dot_repeat_fast();
458    bool match_dot_repeat_slow();
match_dot_repeat_dispatch()459    bool match_dot_repeat_dispatch()
460    {
461       return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
462    }
463    bool match_backstep();
464    bool match_assert_backref();
465    bool match_toggle_case();
466 #ifdef BOOST_REGEX_RECURSIVE
467    bool backtrack_till_match(std::size_t count);
468 #endif
469    bool match_recursion();
470    bool match_fail();
471    bool match_accept();
472    bool match_commit();
473    bool match_then();
474    bool skip_until_paren(int index, bool match = true);
475 
476    // find procs stored in s_find_vtable:
477    bool find_restart_any();
478    bool find_restart_word();
479    bool find_restart_line();
480    bool find_restart_buf();
481    bool find_restart_lit();
482 
483 private:
484    // final result structure to be filled in:
485    match_results<BidiIterator, Allocator>& m_result;
486    // temporary result for POSIX matches:
487    scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
488    // pointer to actual result structure to fill in:
489    match_results<BidiIterator, Allocator>* m_presult;
490    // start of sequence being searched:
491    BidiIterator base;
492    // end of sequence being searched:
493    BidiIterator last;
494    // current character being examined:
495    BidiIterator position;
496    // where to restart next search after failed match attempt:
497    BidiIterator restart;
498    // where the current search started from, acts as base for $` during grep:
499    BidiIterator search_base;
500    // how far we can go back when matching lookbehind:
501    BidiIterator backstop;
502    // the expression being examined:
503    const basic_regex<char_type, traits>& re;
504    // the expression's traits class:
505    const ::boost::regex_traits_wrapper<traits>& traits_inst;
506    // the next state in the machine being matched:
507    const re_syntax_base* pstate;
508    // matching flags in use:
509    match_flag_type m_match_flags;
510    // how many states we have examined so far:
511    std::ptrdiff_t state_count;
512    // max number of states to examine before giving up:
513    std::ptrdiff_t max_state_count;
514    // whether we should ignore case or not:
515    bool icase;
516    // set to true when (position == last), indicates that we may have a partial match:
517    bool m_has_partial_match;
518    // set to true whenever we get a match:
519    bool m_has_found_match;
520    // set to true whenever we're inside an independent sub-expression:
521    bool m_independent;
522    // the current repeat being examined:
523    repeater_count<BidiIterator>* next_count;
524    // the first repeat being examined (top of linked list):
525    repeater_count<BidiIterator> rep_obj;
526    // the mask to pass when matching word boundaries:
527    typename traits::char_class_type m_word_mask;
528    // the bitmask to use when determining whether a match_any matches a newline or not:
529    unsigned char match_any_mask;
530    // recursion information:
531    std::vector<recursion_info<results_type> > recursion_stack;
532 #ifdef BOOST_REGEX_RECURSIVE
533    // Set to false by a (*COMMIT):
534    bool m_can_backtrack;
535    bool m_have_accept;
536    bool m_have_then;
537 #endif
538 #ifdef BOOST_REGEX_NON_RECURSIVE
539    //
540    // additional members for non-recursive version:
541    //
542    typedef bool (self_type::*unwind_proc_type)(bool);
543 
544    void extend_stack();
545    bool unwind(bool);
546    bool unwind_end(bool);
547    bool unwind_paren(bool);
548    bool unwind_recursion_stopper(bool);
549    bool unwind_assertion(bool);
550    bool unwind_alt(bool);
551    bool unwind_repeater_counter(bool);
552    bool unwind_extra_block(bool);
553    bool unwind_greedy_single_repeat(bool);
554    bool unwind_slow_dot_repeat(bool);
555    bool unwind_fast_dot_repeat(bool);
556    bool unwind_char_repeat(bool);
557    bool unwind_short_set_repeat(bool);
558    bool unwind_long_set_repeat(bool);
559    bool unwind_non_greedy_repeat(bool);
560    bool unwind_recursion(bool);
561    bool unwind_recursion_pop(bool);
562    bool unwind_commit(bool);
563    bool unwind_then(bool);
564    bool unwind_case(bool);
565    void destroy_single_repeat();
566    void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
567    void push_recursion_stopper();
568    void push_assertion(const re_syntax_base* ps, bool positive);
569    void push_alt(const re_syntax_base* ps);
570    void push_repeater_count(int i, repeater_count<BidiIterator>** s);
571    void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
572    void push_non_greedy_repeat(const re_syntax_base* ps);
573    void push_recursion(int idx, const re_syntax_base* p, results_type* presults, results_type* presults2);
574    void push_recursion_pop();
575    void push_case_change(bool);
576 
577    // pointer to base of stack:
578    saved_state* m_stack_base;
579    // pointer to current stack position:
580    saved_state* m_backup_state;
581    // how many memory blocks have we used up?:
582    unsigned used_block_count;
583    // determines what value to return when unwinding from recursion,
584    // allows for mixed recursive/non-recursive algorithm:
585    bool m_recursive_result;
586    // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
587    bool m_unwound_lookahead;
588    // We have unwound to an alternative, used by THEN:
589    bool m_unwound_alt;
590    // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
591    //bool m_unwind_commit;
592    // Recursion limit:
593    unsigned m_recursions;
594 #endif
595 
596 #ifdef BOOST_MSVC
597 #  pragma warning(push)
598 #if BOOST_MSVC >= 1800
599 #pragma warning(disable:26495)
600 #endif
601 #endif
602    // these operations aren't allowed, so are declared private,
603    // bodies are provided to keep explicit-instantiation requests happy:
operator =(const perl_matcher &)604    perl_matcher& operator=(const perl_matcher&)
605    {
606       return *this;
607    }
perl_matcher(const perl_matcher & that)608    perl_matcher(const perl_matcher& that)
609       : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
610 #ifdef BOOST_MSVC
611 #  pragma warning(pop)
612 #endif
613 };
614 
615 } // namespace BOOST_REGEX_DETAIL_NS
616 
617 #ifdef BOOST_MSVC
618 #  pragma warning(pop)
619 #endif
620 
621 #ifdef BOOST_MSVC
622 #pragma warning(push)
623 #pragma warning(disable: 4103)
624 #endif
625 #ifdef BOOST_HAS_ABI_HEADERS
626 #  include BOOST_ABI_SUFFIX
627 #endif
628 #ifdef BOOST_MSVC
629 #pragma warning(pop)
630 #endif
631 
632 } // namespace boost
633 
634 //
635 // include the implementation of perl_matcher:
636 //
637 #ifdef BOOST_REGEX_RECURSIVE
638 #include <boost/regex/v4/perl_matcher_recursive.hpp>
639 #else
640 #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
641 #endif
642 // this one has to be last:
643 #include <boost/regex/v4/perl_matcher_common.hpp>
644 
645 #endif
646