/*
 *
 * Copyright (c) 2002
 * John Maddock
 *
 * Use, modification and distribution are subject to the
 * Boost Software License, Version 1.0. (See accompanying file
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 *
 */
11 
12 #ifndef BOOST_REGEX_MATCHER_HPP
13 #define BOOST_REGEX_MATCHER_HPP
14 
15 #include <boost/regex/v4/iterator_category.hpp>
16 
17 #ifdef BOOST_MSVC
18 #pragma warning(push)
19 #pragma warning(disable: 4103)
20 #endif
21 #ifdef BOOST_HAS_ABI_HEADERS
22 #  include BOOST_ABI_PREFIX
23 #endif
24 #ifdef BOOST_MSVC
25 #pragma warning(pop)
26 #endif
27 
28 #ifdef BOOST_MSVC
29 #  pragma warning(push)
30 #  pragma warning(disable: 4800)
31 #endif
32 
33 namespace boost{
34 namespace BOOST_REGEX_DETAIL_NS{
35 
36 //
37 // error checking API:
38 //
39 BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
40 //
41 // function can_start:
42 //
//
// Generic start-map test, used for character types that have no dedicated
// overload.
//
// c:    the character to test.
// map:  lookup table indexed by character value; it is read at index c, so
//       it must have an entry for every value in [0, 1 << CHAR_BIT).
// mask: the bit(s) of the map entry to test.
//
// Returns true when c lies outside [0, 1 << CHAR_BIT) (such values have no
// slot in the map), otherwise whether any bit of mask is set in map[c].
//
template <class charT>
inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
{
   // out-of-table values are always reported as possible starts:
   if((c < static_cast<charT>(0)) || (c >= static_cast<charT>(1 << CHAR_BIT)))
      return true;
   return (map[c] & mask) != 0;
}
// Overload for plain char: c is converted to unsigned char before indexing,
// so a negative char selects a map entry in the upper half of the table.
// Returns whether any bit of mask is set in that entry.
inline bool can_start(char c, const unsigned char* map, unsigned char mask)
{
   return (map[static_cast<unsigned char>(c)] & mask) != 0;
}
// Overload for signed char: c is converted to unsigned char before indexing,
// so negative values select map entries in the upper half of the table.
// Returns whether any bit of mask is set in that entry.
inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
{
   return (map[static_cast<unsigned char>(c)] & mask) != 0;
}
// Overload for unsigned char: c indexes the map directly.
// Returns whether any bit of mask is set in map[c].
inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
{
   return (map[c] & mask) != 0;
}
// Overload for unsigned short: values at or above 1 << CHAR_BIT have no map
// entry and always return true; smaller values are looked up in the map and
// tested against mask.
inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
{
   if(c >= (1 << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
64 #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
65 #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
// Overload for wchar_t: values at or above 1 << CHAR_BIT have no map entry
// and always return true; smaller values are looked up in the map and tested
// against mask.  There is no check for negative values here.
inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
{
   if(c >= static_cast<wchar_t>(1u << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
70 #endif
71 #endif
72 #if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
// Overload for unsigned int: values at or above 1 << CHAR_BIT have no map
// entry and always return true; smaller values are looked up in the map and
// tested against mask.
inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
{
   if(c >= static_cast<unsigned int>(1u << CHAR_BIT))
      return true;
   return (map[c] & mask) != 0;
}
77 #endif
78 
79 
//
// Unfortunately Rogue Wave's standard library appears to have a bug
// in std::basic_string::compare that results in erroneous answers
// in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
// 0x020101). The test case was:
// {39135,0} < {0xff,0}
// which succeeds when it should not.
//
88 #ifndef _RWSTD_VER
//
// Three-way comparison of a std::basic_string against a null-terminated
// character array.
//
// When p is the empty string, s compares equal if it is empty or consists of
// exactly one null character.  In every other case the result is whatever
// s.compare(p) returns (negative, zero or positive).
//
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   if(0 == *p)
   {
      if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
         return 0;
   }
   return s.compare(p);
}
99 #else
//
// Three-way comparison of a std::basic_string against a null-terminated
// character array.
//
// When p is the empty string, s compares equal if it is empty or consists of
// exactly one null character.  In every other case the result is whatever
// s.compare(p) returns (negative, zero or positive).
//
template <class C, class T, class A>
inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
{
   if(0 == *p)
   {
      if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
         return 0;
   }
   return s.compare(p);
}
// Narrow-character overload that compares via std::strcmp instead of
// std::basic_string::compare.  Because strcmp stops at the first null,
// only the part of s up to its first null character takes part.
inline int string_compare(const std::string& s, const char* p)
{ return std::strcmp(s.c_str(), p); }
112 # ifndef BOOST_NO_WREGEX
// Wide-character overload that compares via std::wcscmp instead of
// std::basic_string::compare.  Because wcscmp stops at the first null,
// only the part of s up to its first null character takes part.
inline int string_compare(const std::wstring& s, const wchar_t* p)
{ return std::wcscmp(s.c_str(), p); }
115 #endif
116 #endif
//
// Three-way comparison of an arbitrary indexable sequence against a
// null-terminated character array.
//
// Seq must provide size() and operator[].  Elements are compared pairwise
// until they differ or s is exhausted.  If s is exhausted the result is
// -p[i] (zero when p ends at the same place, negative when p is longer,
// assuming its next character is positive); otherwise the result is the
// difference s[i] - p[i] of the first mismatching pair.
//
template <class Seq, class C>
inline int string_compare(const Seq& s, const C* p)
{
   std::size_t i = 0;
   while((i < s.size()) && (p[i] == s[i]))
   {
      ++i;
   }
   return (i == s.size()) ? -p[i] : s[i] - p[i];
}
// Shorthand used by the set-matching code: forwards to whichever
// string_compare overload matches the argument types.
# define STR_COMP(s,p) string_compare(s,p)
128 
//
// Advances p to the first null character at or after p, and returns a
// pointer to the element immediately following that null.
// p must point into a null-terminated array.
//
template<class charT>
inline const charT* re_skip_past_null(const charT* p)
{
  while (*p != static_cast<charT>(0)) ++p;
  return ++p;
}
135 
136 template <class iterator, class charT, class traits_type, class char_classT>
137 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
138                           iterator last,
139                           const re_set_long<char_classT>* set_,
140                           const regex_data<charT, traits_type>& e, bool icase)
141 {
142    const charT* p = reinterpret_cast<const charT*>(set_+1);
143    iterator ptr;
144    unsigned int i;
145    //bool icase = e.m_flags & regex_constants::icase;
146 
147    if(next == last) return next;
148 
149    typedef typename traits_type::string_type traits_string_type;
150    const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
151 
152    // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
153    // referenced
154    (void)traits_inst;
155 
156    // try and match a single character, could be a multi-character
157    // collating element...
158    for(i = 0; i < set_->csingles; ++i)
159    {
160       ptr = next;
161       if(*p == static_cast<charT>(0))
162       {
163          // treat null string as special case:
164          if(traits_inst.translate(*ptr, icase) != *p)
165          {
166             while(*p == static_cast<charT>(0))++p;
167             continue;
168          }
169          return set_->isnot ? next : (ptr == next) ? ++next : ptr;
170       }
171       else
172       {
173          while(*p && (ptr != last))
174          {
175             if(traits_inst.translate(*ptr, icase) != *p)
176                break;
177             ++p;
178             ++ptr;
179          }
180 
181          if(*p == static_cast<charT>(0)) // if null we've matched
182             return set_->isnot ? next : (ptr == next) ? ++next : ptr;
183 
184          p = re_skip_past_null(p);     // skip null
185       }
186    }
187 
188    charT col = traits_inst.translate(*next, icase);
189 
190 
191    if(set_->cranges || set_->cequivalents)
192    {
193       traits_string_type s1;
194       //
195       // try and match a range, NB only a single character can match
196       if(set_->cranges)
197       {
198          if((e.m_flags & regex_constants::collate) == 0)
199             s1.assign(1, col);
200          else
201          {
202             charT a[2] = { col, charT(0), };
203             s1 = traits_inst.transform(a, a + 1);
204          }
205          for(i = 0; i < set_->cranges; ++i)
206          {
207             if(STR_COMP(s1, p) >= 0)
208             {
209                do{ ++p; }while(*p);
210                ++p;
211                if(STR_COMP(s1, p) <= 0)
212                   return set_->isnot ? next : ++next;
213             }
214             else
215             {
216                // skip first string
217                do{ ++p; }while(*p);
218                ++p;
219             }
220             // skip second string
221             do{ ++p; }while(*p);
222             ++p;
223          }
224       }
225       //
226       // try and match an equivalence class, NB only a single character can match
227       if(set_->cequivalents)
228       {
229          charT a[2] = { col, charT(0), };
230          s1 = traits_inst.transform_primary(a, a +1);
231          for(i = 0; i < set_->cequivalents; ++i)
232          {
233             if(STR_COMP(s1, p) == 0)
234                return set_->isnot ? next : ++next;
235             // skip string
236             do{ ++p; }while(*p);
237             ++p;
238          }
239       }
240    }
241    if(traits_inst.isctype(col, set_->cclasses) == true)
242       return set_->isnot ? next : ++next;
243    if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
244       return set_->isnot ? next : ++next;
245    return set_->isnot ? ++next : next;
246 }
247 
//
// One entry in an intrusive, singly linked stack of repeat counters.
//
// Each entry records the id of the repeat state it belongs to, how many
// iterations have been counted, and the position at which the last iteration
// started.  The stack head is an external pointer (*stack); the pushing
// constructor links the new entry in front of the current head and the
// destructor restores the previous head, so entries must be destroyed in
// reverse order of construction.
//
template <class BidiIterator>
class repeater_count
{
   repeater_count** stack;   // address of the external "top of stack" pointer
   repeater_count* next;     // entry that was on top when this one was pushed
   int state_id;             // id of the owning repeat; -1 for the base entry
   std::size_t count;        // the number of iterations so far
   BidiIterator start_pos;   // where the last repeat started

   // Walks the chain starting at p looking for an entry whose state_id is n.
   // Returns that entry, or 0 when the chain is exhausted or when an entry
   // whose state_id equals (-2 - current_recursion_id) is reached first.
   // After each step, if the walk lands on an entry with a negative state_id,
   // the search recurses to find another entry carrying that same id and then
   // continues from the entry after it.
   repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
   {
      while(p && (p->state_id != n))
      {
         if(-2 - current_recursion_id == p->state_id)
            return 0;
         p = p->next;
         if(p && (p->state_id < 0))
         {
            p = unwind_until(p->state_id, p, current_recursion_id);
            if(!p)
               return p;
            p = p->next;
         }
      }
      return p;
   }
public:
   // Creates the base entry: id -1, zero count, no predecessor.  It is not
   // linked into *s here; the caller is expected to point *s at it.
   repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {}

   // Pushes a new entry for repeat state i onto the stack headed by *s.
   // *s must be non-null on entry (it is dereferenced unconditionally).
   //
   // The count starts at zero when i is greater than the non-negative id of
   // the entry that was on top.  Otherwise the chain is searched (see
   // unwind_until) for an earlier entry with the same id; if one is found its
   // count and start position are inherited, else the count starts at zero.
   repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
      : stack(s), next(*s), state_id(i), count(0), start_pos(start)
   {
      *stack = this;
      if((state_id > next->state_id) && (next->state_id >= 0))
         return;   // nested inside a lower-numbered repeat: start from zero
      repeater_count* p = unwind_until(state_id, next, current_recursion_id);
      if(p)
      {
         count = p->count;
         start_pos = p->start_pos;
      }
   }
   // Pops this entry by restoring the previous head; the base entry (which
   // has no predecessor) leaves *stack untouched.
   ~repeater_count()
   {
      if(next)
         *stack = next;
   }
   std::size_t get_count() const { return count; }
   int get_id() const { return state_id; }
   std::size_t operator++() { return ++count; }
   bool check_null_repeat(const BidiIterator& pos, std::size_t max)
   {
      // this is called when we are about to start a new repeat,
      // if the last one was NULL move our count to max,
      // otherwise save the current position.
      bool result = (count == 0) ? false : (pos == start_pos);
      if(result)
         count = max;
      else
         start_pos = pos;
      return result;
   }
};
320 
// Forward declaration only; saved_state is defined elsewhere.
struct saved_state;

//
// Numeric tags for the different kinds of saved state.  The values are
// consecutive from 0; saved_state_count is one past the last real tag and so
// equals the number of tags.
//
enum saved_state_type
{
   saved_type_end = 0,
   saved_type_paren = 1,
   saved_type_recurse = 2,
   saved_type_assertion = 3,
   saved_state_alt = 4,
   saved_state_repeater_count = 5,
   saved_state_extra_block = 6,
   saved_state_greedy_single_repeat = 7,
   saved_state_rep_slow_dot = 8,
   saved_state_rep_fast_dot = 9,
   saved_state_rep_char = 10,
   saved_state_rep_short_set = 11,
   saved_state_rep_long_set = 12,
   saved_state_non_greedy_long_repeat = 13,
   saved_state_count = 14
};
341 
342 template <class Results>
343 struct recursion_info
344 {
345    typedef typename Results::value_type value_type;
346    typedef typename value_type::iterator iterator;
347    int idx;
348    const re_syntax_base* preturn_address;
349    Results results;
350    repeater_count<iterator>* repeater_stack;
351 };
352 
353 #ifdef BOOST_MSVC
354 #pragma warning(push)
355 #pragma warning(disable : 4251 4231)
356 #  if BOOST_MSVC < 1600
357 #     pragma warning(disable : 4660)
358 #  endif
359 #endif
360 
361 template <class BidiIterator, class Allocator, class traits>
362 class perl_matcher
363 {
364 public:
365    typedef typename traits::char_type char_type;
366    typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
367    typedef bool (self_type::*matcher_proc_type)(void);
368    typedef std::size_t traits_size_type;
369    typedef typename is_byte<char_type>::width_type width_type;
370    typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
371    typedef match_results<BidiIterator, Allocator> results_type;
372 
perl_matcher(BidiIterator first,BidiIterator end,match_results<BidiIterator,Allocator> & what,const basic_regex<char_type,traits> & e,match_flag_type f,BidiIterator l_base)373    perl_matcher(BidiIterator first, BidiIterator end,
374       match_results<BidiIterator, Allocator>& what,
375       const basic_regex<char_type, traits>& e,
376       match_flag_type f,
377       BidiIterator l_base)
378       :  m_result(what), base(first), last(end),
379          position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
380          m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
381    {
382       construct_init(e, f);
383    }
384 
385    bool match();
386    bool find();
387 
setf(match_flag_type f)388    void setf(match_flag_type f)
389    { m_match_flags |= f; }
unsetf(match_flag_type f)390    void unsetf(match_flag_type f)
391    { m_match_flags &= ~f; }
392 
393 private:
394    void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
395 
396    bool find_imp();
397    bool match_imp();
398 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
399    typedef bool (perl_matcher::*protected_proc_type)();
400    bool protected_call(protected_proc_type);
401 #endif
402    void estimate_max_state_count(std::random_access_iterator_tag*);
403    void estimate_max_state_count(void*);
404    bool match_prefix();
405    bool match_all_states();
406 
407    // match procs, stored in s_match_vtable:
408    bool match_startmark();
409    bool match_endmark();
410    bool match_literal();
411    bool match_start_line();
412    bool match_end_line();
413    bool match_wild();
414    bool match_match();
415    bool match_word_boundary();
416    bool match_within_word();
417    bool match_word_start();
418    bool match_word_end();
419    bool match_buffer_start();
420    bool match_buffer_end();
421    bool match_backref();
422    bool match_long_set();
423    bool match_set();
424    bool match_jump();
425    bool match_alt();
426    bool match_rep();
427    bool match_combining();
428    bool match_soft_buffer_end();
429    bool match_restart_continue();
430    bool match_long_set_repeat();
431    bool match_set_repeat();
432    bool match_char_repeat();
433    bool match_dot_repeat_fast();
434    bool match_dot_repeat_slow();
match_dot_repeat_dispatch()435    bool match_dot_repeat_dispatch()
436    {
437       return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
438    }
439    bool match_backstep();
440    bool match_assert_backref();
441    bool match_toggle_case();
442 #ifdef BOOST_REGEX_RECURSIVE
443    bool backtrack_till_match(std::size_t count);
444 #endif
445    bool match_recursion();
446    bool match_fail();
447    bool match_accept();
448    bool match_commit();
449    bool match_then();
450    bool skip_until_paren(int index, bool match = true);
451 
452    // find procs stored in s_find_vtable:
453    bool find_restart_any();
454    bool find_restart_word();
455    bool find_restart_line();
456    bool find_restart_buf();
457    bool find_restart_lit();
458 
459 private:
460    // final result structure to be filled in:
461    match_results<BidiIterator, Allocator>& m_result;
462    // temporary result for POSIX matches:
463    scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
464    // pointer to actual result structure to fill in:
465    match_results<BidiIterator, Allocator>* m_presult;
466    // start of sequence being searched:
467    BidiIterator base;
468    // end of sequence being searched:
469    BidiIterator last;
470    // current character being examined:
471    BidiIterator position;
472    // where to restart next search after failed match attempt:
473    BidiIterator restart;
474    // where the current search started from, acts as base for $` during grep:
475    BidiIterator search_base;
476    // how far we can go back when matching lookbehind:
477    BidiIterator backstop;
478    // the expression being examined:
479    const basic_regex<char_type, traits>& re;
480    // the expression's traits class:
481    const ::boost::regex_traits_wrapper<traits>& traits_inst;
482    // the next state in the machine being matched:
483    const re_syntax_base* pstate;
484    // matching flags in use:
485    match_flag_type m_match_flags;
486    // how many states we have examined so far:
487    std::ptrdiff_t state_count;
488    // max number of states to examine before giving up:
489    std::ptrdiff_t max_state_count;
490    // whether we should ignore case or not:
491    bool icase;
492    // set to true when (position == last), indicates that we may have a partial match:
493    bool m_has_partial_match;
494    // set to true whenever we get a match:
495    bool m_has_found_match;
496    // set to true whenever we're inside an independent sub-expression:
497    bool m_independent;
498    // the current repeat being examined:
499    repeater_count<BidiIterator>* next_count;
500    // the first repeat being examined (top of linked list):
501    repeater_count<BidiIterator> rep_obj;
502    // the mask to pass when matching word boundaries:
503    typename traits::char_class_type m_word_mask;
504    // the bitmask to use when determining whether a match_any matches a newline or not:
505    unsigned char match_any_mask;
506    // recursion information:
507    std::vector<recursion_info<results_type> > recursion_stack;
508 #ifdef BOOST_REGEX_RECURSIVE
509    // Set to false by a (*COMMIT):
510    bool m_can_backtrack;
511    bool m_have_accept;
512    bool m_have_then;
513 #endif
514 #ifdef BOOST_REGEX_NON_RECURSIVE
515    //
516    // additional members for non-recursive version:
517    //
518    typedef bool (self_type::*unwind_proc_type)(bool);
519 
520    void extend_stack();
521    bool unwind(bool);
522    bool unwind_end(bool);
523    bool unwind_paren(bool);
524    bool unwind_recursion_stopper(bool);
525    bool unwind_assertion(bool);
526    bool unwind_alt(bool);
527    bool unwind_repeater_counter(bool);
528    bool unwind_extra_block(bool);
529    bool unwind_greedy_single_repeat(bool);
530    bool unwind_slow_dot_repeat(bool);
531    bool unwind_fast_dot_repeat(bool);
532    bool unwind_char_repeat(bool);
533    bool unwind_short_set_repeat(bool);
534    bool unwind_long_set_repeat(bool);
535    bool unwind_non_greedy_repeat(bool);
536    bool unwind_recursion(bool);
537    bool unwind_recursion_pop(bool);
538    bool unwind_commit(bool);
539    bool unwind_then(bool);
540    bool unwind_case(bool);
541    void destroy_single_repeat();
542    void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
543    void push_recursion_stopper();
544    void push_assertion(const re_syntax_base* ps, bool positive);
545    void push_alt(const re_syntax_base* ps);
546    void push_repeater_count(int i, repeater_count<BidiIterator>** s);
547    void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
548    void push_non_greedy_repeat(const re_syntax_base* ps);
549    void push_recursion(int idx, const re_syntax_base* p, results_type* presults);
550    void push_recursion_pop();
551    void push_case_change(bool);
552 
553    // pointer to base of stack:
554    saved_state* m_stack_base;
555    // pointer to current stack position:
556    saved_state* m_backup_state;
557    // how many memory blocks have we used up?:
558    unsigned used_block_count;
559    // determines what value to return when unwinding from recursion,
560    // allows for mixed recursive/non-recursive algorithm:
561    bool m_recursive_result;
562    // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
563    bool m_unwound_lookahead;
564    // We have unwound to an alternative, used by THEN:
565    bool m_unwound_alt;
566    // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
567    //bool m_unwind_commit;
568 #endif
569 
570    // these operations aren't allowed, so are declared private,
571    // bodies are provided to keep explicit-instantiation requests happy:
operator =(const perl_matcher &)572    perl_matcher& operator=(const perl_matcher&)
573    {
574       return *this;
575    }
perl_matcher(const perl_matcher & that)576    perl_matcher(const perl_matcher& that)
577       : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
578 };
579 
580 #ifdef BOOST_MSVC
581 #pragma warning(pop)
582 #endif
583 
584 } // namespace BOOST_REGEX_DETAIL_NS
585 
586 #ifdef BOOST_MSVC
587 #pragma warning(push)
588 #pragma warning(disable: 4103)
589 #endif
590 #ifdef BOOST_HAS_ABI_HEADERS
591 #  include BOOST_ABI_SUFFIX
592 #endif
593 #ifdef BOOST_MSVC
594 #pragma warning(pop)
595 #endif
596 
597 } // namespace boost
598 
599 #ifdef BOOST_MSVC
600 #  pragma warning(pop)
601 #endif
602 
603 //
604 // include the implementation of perl_matcher:
605 //
606 #ifdef BOOST_REGEX_RECURSIVE
607 #include <boost/regex/v4/perl_matcher_recursive.hpp>
608 #else
609 #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
610 #endif
611 // this one has to be last:
612 #include <boost/regex/v4/perl_matcher_common.hpp>
613 
614 #endif
615 
616