1 /*
2  *
3  * Copyright (c) 2002
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12 #ifndef BOOST_REGEX_MATCHER_HPP
13 #define BOOST_REGEX_MATCHER_HPP
14 
15 #include <boost/regex/v4/iterator_category.hpp>
16 
17 #ifdef BOOST_MSVC
18 #pragma warning(push)
19 #pragma warning(disable: 4103)
20 #endif
21 #ifdef BOOST_HAS_ABI_HEADERS
22 #  include BOOST_ABI_PREFIX
23 #endif
24 #ifdef BOOST_MSVC
25 #pragma warning(pop)
26 #endif
27 
28 #ifdef BOOST_MSVC
29 #  pragma warning(push)
30 #pragma warning(disable : 4251)
31 #if BOOST_MSVC < 1700
32 #     pragma warning(disable : 4231)
33 #endif
34 #  if BOOST_MSVC < 1600
35 #     pragma warning(disable : 4660)
36 #  endif
37 #if BOOST_MSVC < 1910
38 #pragma warning(disable:4800)
39 #endif
40 #endif
41 
42 namespace boost{
43 namespace BOOST_REGEX_DETAIL_NS{
44 
45 //
46 // error checking API:
47 //
48 BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
//
// function can_start:
//
// Generic overload.  Returns true when character c may be accepted according
// to the lookup table "map": a value outside the range [0, 1 << CHAR_BIT) has
// no slot in the table and is always reported as true; otherwise the result
// is whether the table entry for c has any bit in common with "mask".
//
template <class charT>
inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
{
   // negative values cannot index the table:
   if(c < static_cast<charT>(0))
      return true;
   // neither can values past the end of the table:
   if(c >= static_cast<charT>(1 << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
// char overload: the value is reinterpreted as unsigned char before indexing,
// so a negative char never produces a negative table index.
inline bool can_start(char c, const unsigned char* map, unsigned char mask)
{
   return (map[static_cast<unsigned char>(c)] & mask) != 0;
}
// signed char overload: the value is reinterpreted as unsigned char before
// indexing, so a negative value never produces a negative table index.
inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
{
   return (map[static_cast<unsigned char>(c)] & mask) != 0;
}
// unsigned char overload: every possible value is a valid table index, so no
// range check is needed.
inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
{
   return (map[c] & mask) != 0;
}
// unsigned short overload: values at or beyond 1 << CHAR_BIT have no table
// slot and are always reported as true; smaller values are looked up.
inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
{
   return (c >= (1 << CHAR_BIT)) || ((map[c] & mask) != 0);
}
#if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
#if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
// wchar_t overload, only provided when wchar_t is a distinct unsigned type
// (WCHAR_MIN == 0): values at or beyond 1 << CHAR_BIT have no table slot and
// are always reported as true; smaller values are looked up.
inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
{
   return (c >= static_cast<wchar_t>(1u << CHAR_BIT)) || ((map[c] & mask) != 0);
}
#endif
#endif
#if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
// unsigned int overload: values at or beyond 1 << CHAR_BIT have no table slot
// and are always reported as true; smaller values are looked up.
inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
{
   return (c >= static_cast<unsigned int>(1u << CHAR_BIT)) || ((map[c] & mask) != 0);
}
#endif
87 
88 
//
// Unfortunately Rogue Wave's standard library appears to have a bug
// in std::basic_string::compare that results in erroneous answers
// in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
// 0x020101); the test case was:
// {39135,0} < {0xff,0}
// which succeeds when it should not.
//
// string_compare(s, p) compares the string s with the null-terminated
// sequence p and returns a value <0, 0 or >0 in the manner of
// std::basic_string::compare.  A string that is empty, or that consists of
// exactly one null character, is treated as equal to an empty p.
//
// The generic overload is the same whichever standard library is in use, so
// it is defined once; the narrow/wide overloads that bypass
// basic_string::compare are only added for Rogue Wave (_RWSTD_VER).
//
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   // special case: an empty p matches both "" and a lone null character
   if((0 == *p) && (s.empty() || ((s.size() == 1) && (s[0] == 0))))
      return 0;
   return s.compare(p);
}
#ifdef _RWSTD_VER
inline int string_compare(const std::string& s, const char* p)
{ return std::strcmp(s.c_str(), p); }
# ifndef BOOST_NO_WREGEX
inline int string_compare(const std::wstring& s, const wchar_t* p)
{ return std::wcscmp(s.c_str(), p); }
# endif
#endif
// Fallback for any sequence type that offers size() and operator[]:
// walks s and the null-terminated sequence p in step while they agree.
// If all of s was consumed the result is minus the next character of p
// (0 when p ends there too); otherwise it is the difference of the first
// pair of characters that disagree.
template <class Seq, class C>
inline int string_compare(const Seq& s, const C* p)
{
   const std::size_t len = s.size();
   std::size_t i = 0;
   while((i < len) && (p[i] == s[i]))
      ++i;
   if(i == len)
      return -static_cast<int>(p[i]);
   return static_cast<int>(s[i]) - static_cast<int>(p[i]);
}
# define STR_COMP(s,p) string_compare(s,p)
137 
// Advances p to the terminating null of the string it points into and returns
// the position one past that null, i.e. the start of whatever follows.
template<class charT>
inline const charT* re_skip_past_null(const charT* p)
{
   while(*p != static_cast<charT>(0))
      ++p;
   return p + 1;
}
144 
145 template <class iterator, class charT, class traits_type, class char_classT>
146 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
147                           iterator last,
148                           const re_set_long<char_classT>* set_,
149                           const regex_data<charT, traits_type>& e, bool icase)
150 {
151    const charT* p = reinterpret_cast<const charT*>(set_+1);
152    iterator ptr;
153    unsigned int i;
154    //bool icase = e.m_flags & regex_constants::icase;
155 
156    if(next == last) return next;
157 
158    typedef typename traits_type::string_type traits_string_type;
159    const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
160 
161    // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
162    // referenced
163    (void)traits_inst;
164 
165    // try and match a single character, could be a multi-character
166    // collating element...
167    for(i = 0; i < set_->csingles; ++i)
168    {
169       ptr = next;
170       if(*p == static_cast<charT>(0))
171       {
172          // treat null string as special case:
173          if(traits_inst.translate(*ptr, icase))
174          {
175             ++p;
176             continue;
177          }
178          return set_->isnot ? next : (ptr == next) ? ++next : ptr;
179       }
180       else
181       {
182          while(*p && (ptr != last))
183          {
184             if(traits_inst.translate(*ptr, icase) != *p)
185                break;
186             ++p;
187             ++ptr;
188          }
189 
190          if(*p == static_cast<charT>(0)) // if null we've matched
191             return set_->isnot ? next : (ptr == next) ? ++next : ptr;
192 
193          p = re_skip_past_null(p);     // skip null
194       }
195    }
196 
197    charT col = traits_inst.translate(*next, icase);
198 
199 
200    if(set_->cranges || set_->cequivalents)
201    {
202       traits_string_type s1;
203       //
204       // try and match a range, NB only a single character can match
205       if(set_->cranges)
206       {
207          if((e.m_flags & regex_constants::collate) == 0)
208             s1.assign(1, col);
209          else
210          {
211             charT a[2] = { col, charT(0), };
212             s1 = traits_inst.transform(a, a + 1);
213          }
214          for(i = 0; i < set_->cranges; ++i)
215          {
216             if(STR_COMP(s1, p) >= 0)
217             {
218                do{ ++p; }while(*p);
219                ++p;
220                if(STR_COMP(s1, p) <= 0)
221                   return set_->isnot ? next : ++next;
222             }
223             else
224             {
225                // skip first string
226                do{ ++p; }while(*p);
227                ++p;
228             }
229             // skip second string
230             do{ ++p; }while(*p);
231             ++p;
232          }
233       }
234       //
235       // try and match an equivalence class, NB only a single character can match
236       if(set_->cequivalents)
237       {
238          charT a[2] = { col, charT(0), };
239          s1 = traits_inst.transform_primary(a, a +1);
240          for(i = 0; i < set_->cequivalents; ++i)
241          {
242             if(STR_COMP(s1, p) == 0)
243                return set_->isnot ? next : ++next;
244             // skip string
245             do{ ++p; }while(*p);
246             ++p;
247          }
248       }
249    }
250    if(traits_inst.isctype(col, set_->cclasses) == true)
251       return set_->isnot ? next : ++next;
252    if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
253       return set_->isnot ? next : ++next;
254    return set_->isnot ? ++next : next;
255 }
256 
//
// repeater_count:
// One node in an intrusive singly-linked list of repeat counters.  The head
// of the list is stored in the pointer that "stack" refers to: constructing
// a counter with an id pushes it onto that list, destroying it restores the
// previous head.
//
template <class BidiIterator>
class repeater_count
{
   repeater_count** stack;   // where the head-of-list pointer lives
   repeater_count* next;     // the head that was in place before this node (0 for the sentinel)
   int state_id;             // id of this counter; the sentinel uses -1
   std::size_t count;        // the number of iterations so far
   BidiIterator start_pos;   // where the last repeat started

   // Walks the list from p looking for a node whose id is n.  A node with a
   // negative id encountered on the way causes a nested search for that id
   // and the walk resumes after the node found.  Returns 0 if the list is
   // exhausted or a node with id (-2 - current_recursion_id) is reached
   // first.
   // NOTE(review): the -2 - id encoding looks like a per-recursion boundary
   // marker - confirm against the code that pushes such nodes.
   repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
   {
      while(p && (p->state_id != n))
      {
         if(-2 - current_recursion_id == p->state_id)
            return 0;
         p = p->next;
         if(p && (p->state_id < 0))
         {
            p = unwind_until(p->state_id, p, current_recursion_id);
            if(!p)
               return p;
            p = p->next;
         }
      }
      return p;
   }
public:
   // Builds the sentinel node: id -1, zero count, no predecessor.  It does
   // not modify *s.
   repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {}

   // Pushes a counter with id i onto the list headed by *s (which must be
   // non-null).  If the previous head is a real counter (id >= 0) with a
   // smaller id the count starts at zero; otherwise the list is searched for
   // an earlier counter with the same id and, if one is found, its count and
   // start position are inherited.
   repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
      : stack(s), next(*s), state_id(i), count(0), start_pos(start)
   {
      *stack = this;
      if((next->state_id < 0) || (state_id <= next->state_id))
      {
         repeater_count* p = unwind_until(state_id, next, current_recursion_id);
         if(p)
         {
            count = p->count;
            start_pos = p->start_pos;
         }
      }
   }
   // Pops this node by restoring the previous head; the sentinel (next == 0)
   // leaves the head untouched.
   ~repeater_count()
   {
      if(next)
         *stack = next;
   }
   std::size_t get_count() const { return count; }
   int get_id() const { return state_id; }
   std::size_t operator++() { return ++count; }
   bool check_null_repeat(const BidiIterator& pos, std::size_t max)
   {
      // this is called when we are about to start a new repeat,
      // if the last one was NULL move our count to max,
      // otherwise save the current position.
      const bool result = (count != 0) && (pos == start_pos);
      if(result)
         count = max;
      else
         start_pos = pos;
      return result;
   }
};
329 
// forward declaration only, the definition is not part of this header:
struct saved_state;

//
// Tags identifying the kinds of saved state.  The values are consecutive
// from 0, so the final enumerator, saved_state_count, equals the number of
// real tags that precede it.
// NOTE(review): presumably these index a table of unwind handlers in the
// non-recursive matcher - confirm against perl_matcher_non_recursive.hpp.
//
enum saved_state_type
{
   saved_type_end = 0,
   saved_type_paren = 1,
   saved_type_recurse = 2,
   saved_type_assertion = 3,
   saved_state_alt = 4,
   saved_state_repeater_count = 5,
   saved_state_extra_block = 6,
   saved_state_greedy_single_repeat = 7,
   saved_state_rep_slow_dot = 8,
   saved_state_rep_fast_dot = 9,
   saved_state_rep_char = 10,
   saved_state_rep_short_set = 11,
   saved_state_rep_long_set = 12,
   saved_state_non_greedy_long_repeat = 13,
   saved_state_count = 14
};
350 
351 #ifdef BOOST_MSVC
352 #  pragma warning(push)
353 #if BOOST_MSVC >= 1800
354 #pragma warning(disable:26495)
355 #endif
356 #endif
357 template <class Results>
358 struct recursion_info
359 {
360    typedef typename Results::value_type value_type;
361    typedef typename value_type::iterator iterator;
362    int idx;
363    const re_syntax_base* preturn_address;
364    Results results;
365    repeater_count<iterator>* repeater_stack;
366    iterator location_of_start;
367 };
368 #ifdef BOOST_MSVC
369 #  pragma warning(pop)
370 #endif
371 
372 template <class BidiIterator, class Allocator, class traits>
373 class perl_matcher
374 {
375 public:
376    typedef typename traits::char_type char_type;
377    typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
378    typedef bool (self_type::*matcher_proc_type)(void);
379    typedef std::size_t traits_size_type;
380    typedef typename is_byte<char_type>::width_type width_type;
381    typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
382    typedef match_results<BidiIterator, Allocator> results_type;
383 
perl_matcher(BidiIterator first,BidiIterator end,match_results<BidiIterator,Allocator> & what,const basic_regex<char_type,traits> & e,match_flag_type f,BidiIterator l_base)384    perl_matcher(BidiIterator first, BidiIterator end,
385       match_results<BidiIterator, Allocator>& what,
386       const basic_regex<char_type, traits>& e,
387       match_flag_type f,
388       BidiIterator l_base)
389       :  m_result(what), base(first), last(end),
390          position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
391          m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
392 #ifdef BOOST_REGEX_NON_RECURSIVE
393       , m_recursions(0)
394 #endif
395    {
396       construct_init(e, f);
397    }
398 
399    bool match();
400    bool find();
401 
setf(match_flag_type f)402    void setf(match_flag_type f)
403    { m_match_flags |= f; }
unsetf(match_flag_type f)404    void unsetf(match_flag_type f)
405    { m_match_flags &= ~f; }
406 
407 private:
408    void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
409 
410    bool find_imp();
411    bool match_imp();
412 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
413    typedef bool (perl_matcher::*protected_proc_type)();
414    bool protected_call(protected_proc_type);
415 #endif
416    void estimate_max_state_count(std::random_access_iterator_tag*);
417    void estimate_max_state_count(void*);
418    bool match_prefix();
419    bool match_all_states();
420 
421    // match procs, stored in s_match_vtable:
422    bool match_startmark();
423    bool match_endmark();
424    bool match_literal();
425    bool match_start_line();
426    bool match_end_line();
427    bool match_wild();
428    bool match_match();
429    bool match_word_boundary();
430    bool match_within_word();
431    bool match_word_start();
432    bool match_word_end();
433    bool match_buffer_start();
434    bool match_buffer_end();
435    bool match_backref();
436    bool match_long_set();
437    bool match_set();
438    bool match_jump();
439    bool match_alt();
440    bool match_rep();
441    bool match_combining();
442    bool match_soft_buffer_end();
443    bool match_restart_continue();
444    bool match_long_set_repeat();
445    bool match_set_repeat();
446    bool match_char_repeat();
447    bool match_dot_repeat_fast();
448    bool match_dot_repeat_slow();
match_dot_repeat_dispatch()449    bool match_dot_repeat_dispatch()
450    {
451       return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
452    }
453    bool match_backstep();
454    bool match_assert_backref();
455    bool match_toggle_case();
456 #ifdef BOOST_REGEX_RECURSIVE
457    bool backtrack_till_match(std::size_t count);
458 #endif
459    bool match_recursion();
460    bool match_fail();
461    bool match_accept();
462    bool match_commit();
463    bool match_then();
464    bool skip_until_paren(int index, bool match = true);
465 
466    // find procs stored in s_find_vtable:
467    bool find_restart_any();
468    bool find_restart_word();
469    bool find_restart_line();
470    bool find_restart_buf();
471    bool find_restart_lit();
472 
473 private:
474    // final result structure to be filled in:
475    match_results<BidiIterator, Allocator>& m_result;
476    // temporary result for POSIX matches:
477    scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
478    // pointer to actual result structure to fill in:
479    match_results<BidiIterator, Allocator>* m_presult;
480    // start of sequence being searched:
481    BidiIterator base;
482    // end of sequence being searched:
483    BidiIterator last;
484    // current character being examined:
485    BidiIterator position;
486    // where to restart next search after failed match attempt:
487    BidiIterator restart;
488    // where the current search started from, acts as base for $` during grep:
489    BidiIterator search_base;
490    // how far we can go back when matching lookbehind:
491    BidiIterator backstop;
492    // the expression being examined:
493    const basic_regex<char_type, traits>& re;
494    // the expression's traits class:
495    const ::boost::regex_traits_wrapper<traits>& traits_inst;
496    // the next state in the machine being matched:
497    const re_syntax_base* pstate;
498    // matching flags in use:
499    match_flag_type m_match_flags;
500    // how many states we have examined so far:
501    std::ptrdiff_t state_count;
502    // max number of states to examine before giving up:
503    std::ptrdiff_t max_state_count;
504    // whether we should ignore case or not:
505    bool icase;
506    // set to true when (position == last), indicates that we may have a partial match:
507    bool m_has_partial_match;
508    // set to true whenever we get a match:
509    bool m_has_found_match;
510    // set to true whenever we're inside an independent sub-expression:
511    bool m_independent;
512    // the current repeat being examined:
513    repeater_count<BidiIterator>* next_count;
514    // the first repeat being examined (top of linked list):
515    repeater_count<BidiIterator> rep_obj;
516    // the mask to pass when matching word boundaries:
517    typename traits::char_class_type m_word_mask;
518    // the bitmask to use when determining whether a match_any matches a newline or not:
519    unsigned char match_any_mask;
520    // recursion information:
521    std::vector<recursion_info<results_type> > recursion_stack;
522 #ifdef BOOST_REGEX_RECURSIVE
523    // Set to false by a (*COMMIT):
524    bool m_can_backtrack;
525    bool m_have_accept;
526    bool m_have_then;
527 #endif
528 #ifdef BOOST_REGEX_NON_RECURSIVE
529    //
530    // additional members for non-recursive version:
531    //
532    typedef bool (self_type::*unwind_proc_type)(bool);
533 
534    void extend_stack();
535    bool unwind(bool);
536    bool unwind_end(bool);
537    bool unwind_paren(bool);
538    bool unwind_recursion_stopper(bool);
539    bool unwind_assertion(bool);
540    bool unwind_alt(bool);
541    bool unwind_repeater_counter(bool);
542    bool unwind_extra_block(bool);
543    bool unwind_greedy_single_repeat(bool);
544    bool unwind_slow_dot_repeat(bool);
545    bool unwind_fast_dot_repeat(bool);
546    bool unwind_char_repeat(bool);
547    bool unwind_short_set_repeat(bool);
548    bool unwind_long_set_repeat(bool);
549    bool unwind_non_greedy_repeat(bool);
550    bool unwind_recursion(bool);
551    bool unwind_recursion_pop(bool);
552    bool unwind_commit(bool);
553    bool unwind_then(bool);
554    bool unwind_case(bool);
555    void destroy_single_repeat();
556    void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
557    void push_recursion_stopper();
558    void push_assertion(const re_syntax_base* ps, bool positive);
559    void push_alt(const re_syntax_base* ps);
560    void push_repeater_count(int i, repeater_count<BidiIterator>** s);
561    void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
562    void push_non_greedy_repeat(const re_syntax_base* ps);
563    void push_recursion(int idx, const re_syntax_base* p, results_type* presults, results_type* presults2);
564    void push_recursion_pop();
565    void push_case_change(bool);
566 
567    // pointer to base of stack:
568    saved_state* m_stack_base;
569    // pointer to current stack position:
570    saved_state* m_backup_state;
571    // how many memory blocks have we used up?:
572    unsigned used_block_count;
573    // determines what value to return when unwinding from recursion,
574    // allows for mixed recursive/non-recursive algorithm:
575    bool m_recursive_result;
576    // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
577    bool m_unwound_lookahead;
578    // We have unwound to an alternative, used by THEN:
579    bool m_unwound_alt;
580    // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
581    //bool m_unwind_commit;
582    // Recursion limit:
583    unsigned m_recursions;
584 #endif
585 
586 #ifdef BOOST_MSVC
587 #  pragma warning(push)
588 #if BOOST_MSVC >= 1800
589 #pragma warning(disable:26495)
590 #endif
591 #endif
592    // these operations aren't allowed, so are declared private,
593    // bodies are provided to keep explicit-instantiation requests happy:
operator =(const perl_matcher &)594    perl_matcher& operator=(const perl_matcher&)
595    {
596       return *this;
597    }
perl_matcher(const perl_matcher & that)598    perl_matcher(const perl_matcher& that)
599       : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
600 #ifdef BOOST_MSVC
601 #  pragma warning(pop)
602 #endif
603 };
604 
605 } // namespace BOOST_REGEX_DETAIL_NS
606 
607 #ifdef BOOST_MSVC
608 #  pragma warning(pop)
609 #endif
610 
611 #ifdef BOOST_MSVC
612 #pragma warning(push)
613 #pragma warning(disable: 4103)
614 #endif
615 #ifdef BOOST_HAS_ABI_HEADERS
616 #  include BOOST_ABI_SUFFIX
617 #endif
618 #ifdef BOOST_MSVC
619 #pragma warning(pop)
620 #endif
621 
622 } // namespace boost
623 
624 //
625 // include the implementation of perl_matcher:
626 //
627 #ifdef BOOST_REGEX_RECURSIVE
628 #include <boost/regex/v4/perl_matcher_recursive.hpp>
629 #else
630 #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
631 #endif
632 // this one has to be last:
633 #include <boost/regex/v4/perl_matcher_common.hpp>
634 
635 #endif
636 
637