1 /*
2  *
3  * Copyright (c) 2002
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12 #ifndef BOOST_REGEX_MATCHER_HPP
13 #define BOOST_REGEX_MATCHER_HPP
14 
15 #include <boost/regex/v4/iterator_category.hpp>
16 
17 #ifdef BOOST_HAS_ABI_HEADERS
18 #  include BOOST_ABI_PREFIX
19 #endif
20 
21 namespace boost{
22 namespace re_detail{
23 
24 //
25 // error checking API:
26 //
27 BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
28 //
29 // function can_start:
30 //
31 template <class charT>
can_start(charT c,const unsigned char * map,unsigned char mask)32 bool can_start(charT c, const unsigned char* map, unsigned char mask)
33 {
34    return ((c < static_cast<charT>(0)) ? true : ((c >= static_cast<charT>(1 << CHAR_BIT)) ? true : map[c] & mask));
35 }
can_start(char c,const unsigned char * map,unsigned char mask)36 inline bool can_start(char c, const unsigned char* map, unsigned char mask)
37 {
38    return map[(unsigned char)c] & mask;
39 }
can_start(signed char c,const unsigned char * map,unsigned char mask)40 inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
41 {
42    return map[(unsigned char)c] & mask;
43 }
can_start(unsigned char c,const unsigned char * map,unsigned char mask)44 inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
45 {
46    return map[c] & mask;
47 }
can_start(unsigned short c,const unsigned char * map,unsigned char mask)48 inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
49 {
50    return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask);
51 }
52 #if !defined(__HP_aCC)
53 #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
can_start(wchar_t c,const unsigned char * map,unsigned char mask)54 inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
55 {
56    return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask);
57 }
58 #endif
59 #endif
60 
61 
62 //
63 // Unfortunately Rogue Waves standard library appears to have a bug
64 // in std::basic_string::compare that results in eroneous answers
65 // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
66 // 0x020101) the test case was:
67 // {39135,0} < {0xff,0}
68 // which succeeds when it should not.
69 //
70 #ifndef _RWSTD_VER
71 #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310)
72 template <class C, class T, class A>
string_compare(const std::basic_string<C,T,A> & s,const C * p)73 inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
74 {
75    if(0 == *p)
76    {
77       if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
78          return 0;
79    }
80    return s.compare(p);
81 }
82 #endif
83 #else
84 #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310)
85 template <class C, class T, class A>
string_compare(const std::basic_string<C,T,A> & s,const C * p)86 inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
87 {
88    if(0 == *p)
89    {
90       if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
91          return 0;
92    }
93    return s.compare(p);
94 }
95 #endif
string_compare(const std::string & s,const char * p)96 inline int string_compare(const std::string& s, const char* p)
97 { return std::strcmp(s.c_str(), p); }
98 # ifndef BOOST_NO_WREGEX
string_compare(const std::wstring & s,const wchar_t * p)99 inline int string_compare(const std::wstring& s, const wchar_t* p)
100 { return std::wcscmp(s.c_str(), p); }
101 #endif
102 #endif
103 template <class Seq, class C>
string_compare(const Seq & s,const C * p)104 inline int string_compare(const Seq& s, const C* p)
105 {
106    std::size_t i = 0;
107    while((i < s.size()) && (p[i] == s[i]))
108    {
109       ++i;
110    }
111    return (i == s.size()) ? -p[i] : s[i] - p[i];
112 }
113 # define STR_COMP(s,p) string_compare(s,p)
114 
115 template<class charT>
re_skip_past_null(const charT * p)116 inline const charT* re_skip_past_null(const charT* p)
117 {
118   while (*p != static_cast<charT>(0)) ++p;
119   return ++p;
120 }
121 
122 template <class iterator, class charT, class traits_type, class char_classT>
123 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
124                           iterator last,
125                           const re_set_long<char_classT>* set_,
126                           const regex_data<charT, traits_type>& e, bool icase)
127 {
128    const charT* p = reinterpret_cast<const charT*>(set_+1);
129    iterator ptr;
130    unsigned int i;
131    //bool icase = e.m_flags & regex_constants::icase;
132 
133    if(next == last) return next;
134 
135    typedef typename traits_type::string_type traits_string_type;
136    const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
137 
138    // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
139    // referenced
140    (void)traits_inst;
141 
142    // try and match a single character, could be a multi-character
143    // collating element...
144    for(i = 0; i < set_->csingles; ++i)
145    {
146       ptr = next;
147       if(*p == static_cast<charT>(0))
148       {
149          // treat null string as special case:
150          if(traits_inst.translate(*ptr, icase) != *p)
151          {
152             while(*p == static_cast<charT>(0))++p;
153             continue;
154          }
155          return set_->isnot ? next : (ptr == next) ? ++next : ptr;
156       }
157       else
158       {
159          while(*p && (ptr != last))
160          {
161             if(traits_inst.translate(*ptr, icase) != *p)
162                break;
163             ++p;
164             ++ptr;
165          }
166 
167          if(*p == static_cast<charT>(0)) // if null we've matched
168             return set_->isnot ? next : (ptr == next) ? ++next : ptr;
169 
170          p = re_skip_past_null(p);     // skip null
171       }
172    }
173 
174    charT col = traits_inst.translate(*next, icase);
175 
176 
177    if(set_->cranges || set_->cequivalents)
178    {
179       traits_string_type s1;
180       //
181       // try and match a range, NB only a single character can match
182       if(set_->cranges)
183       {
184          if((e.m_flags & regex_constants::collate) == 0)
185             s1.assign(1, col);
186          else
187          {
188             charT a[2] = { col, charT(0), };
189             s1 = traits_inst.transform(a, a + 1);
190          }
191          for(i = 0; i < set_->cranges; ++i)
192          {
193             if(STR_COMP(s1, p) >= 0)
194             {
195                do{ ++p; }while(*p);
196                ++p;
197                if(STR_COMP(s1, p) <= 0)
198                   return set_->isnot ? next : ++next;
199             }
200             else
201             {
202                // skip first string
203                do{ ++p; }while(*p);
204                ++p;
205             }
206             // skip second string
207             do{ ++p; }while(*p);
208             ++p;
209          }
210       }
211       //
212       // try and match an equivalence class, NB only a single character can match
213       if(set_->cequivalents)
214       {
215          charT a[2] = { col, charT(0), };
216          s1 = traits_inst.transform_primary(a, a +1);
217          for(i = 0; i < set_->cequivalents; ++i)
218          {
219             if(STR_COMP(s1, p) == 0)
220                return set_->isnot ? next : ++next;
221             // skip string
222             do{ ++p; }while(*p);
223             ++p;
224          }
225       }
226    }
227    if(traits_inst.isctype(col, set_->cclasses) == true)
228       return set_->isnot ? next : ++next;
229    if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
230       return set_->isnot ? next : ++next;
231    return set_->isnot ? ++next : next;
232 }
233 
234 template <class BidiIterator>
235 class repeater_count
236 {
237    repeater_count** stack;
238    repeater_count* next;
239    int id;
240    std::size_t count;        // the number of iterations so far
241    BidiIterator start_pos;   // where the last repeat started
242 public:
repeater_count(repeater_count ** s)243    repeater_count(repeater_count** s)
244    {
245       stack = s;
246       next = 0;
247       id = -1;
248       count = 0;
249    }
repeater_count(int i,repeater_count ** s,BidiIterator start)250    repeater_count(int i, repeater_count** s, BidiIterator start)
251       : start_pos(start)
252    {
253       id = i;
254       stack = s;
255       next = *stack;
256       *stack = this;
257       if(id > next->id)
258          count = 0;
259       else
260       {
261          repeater_count* p = next;
262          while(p->id != id)
263             p = p->next;
264          count = p->count;
265          start_pos = p->start_pos;
266       }
267    }
~repeater_count()268    ~repeater_count()
269    {
270       *stack = next;
271    }
get_count()272    std::size_t get_count() { return count; }
get_id()273    int get_id() { return id; }
operator ++()274    std::size_t operator++() { return ++count; }
check_null_repeat(const BidiIterator & pos,std::size_t max)275    bool check_null_repeat(const BidiIterator& pos, std::size_t max)
276    {
277       // this is called when we are about to start a new repeat,
278       // if the last one was NULL move our count to max,
279       // otherwise save the current position.
280       bool result = (count == 0) ? false : (pos == start_pos);
281       if(result)
282          count = max;
283       else
284          start_pos = pos;
285       return result;
286    }
287 };
288 
289 struct saved_state;
290 
291 enum saved_state_type
292 {
293    saved_type_end = 0,
294    saved_type_paren = 1,
295    saved_type_recurse = 2,
296    saved_type_assertion = 3,
297    saved_state_alt = 4,
298    saved_state_repeater_count = 5,
299    saved_state_extra_block = 6,
300    saved_state_greedy_single_repeat = 7,
301    saved_state_rep_slow_dot = 8,
302    saved_state_rep_fast_dot = 9,
303    saved_state_rep_char = 10,
304    saved_state_rep_short_set = 11,
305    saved_state_rep_long_set = 12,
306    saved_state_non_greedy_long_repeat = 13,
307    saved_state_count = 14
308 };
309 
310 #ifdef BOOST_MSVC
311 #pragma warning(push)
312 #pragma warning(disable : 4251 4231 4660)
313 #endif
314 
315 template <class BidiIterator, class Allocator, class traits>
316 class perl_matcher
317 {
318 public:
319    typedef typename traits::char_type char_type;
320    typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
321    typedef bool (self_type::*matcher_proc_type)(void);
322    typedef typename traits::size_type traits_size_type;
323    typedef typename is_byte<char_type>::width_type width_type;
324    typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
325 
326    perl_matcher(BidiIterator first, BidiIterator end,
327       match_results<BidiIterator, Allocator>& what,
328       const basic_regex<char_type, traits>& e,
329       match_flag_type f,
330       BidiIterator base);
331 
332    bool match();
333    bool find();
334 
setf(match_flag_type f)335    void setf(match_flag_type f)
336    { m_match_flags |= f; }
unsetf(match_flag_type f)337    void unsetf(match_flag_type f)
338    { m_match_flags &= ~f; }
339 
340 private:
341    void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
342    bool find_imp();
343    bool match_imp();
344 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
345    typedef bool (perl_matcher::*protected_proc_type)();
346    bool protected_call(protected_proc_type);
347 #endif
348    void estimate_max_state_count(std::random_access_iterator_tag*);
349    void estimate_max_state_count(void*);
350    bool match_prefix();
351    bool match_all_states();
352 
353    // match procs, stored in s_match_vtable:
354    bool match_startmark();
355    bool match_endmark();
356    bool match_literal();
357    bool match_start_line();
358    bool match_end_line();
359    bool match_wild();
360    bool match_match();
361    bool match_word_boundary();
362    bool match_within_word();
363    bool match_word_start();
364    bool match_word_end();
365    bool match_buffer_start();
366    bool match_buffer_end();
367    bool match_backref();
368    bool match_long_set();
369    bool match_set();
370    bool match_jump();
371    bool match_alt();
372    bool match_rep();
373    bool match_combining();
374    bool match_soft_buffer_end();
375    bool match_restart_continue();
376    bool match_long_set_repeat();
377    bool match_set_repeat();
378    bool match_char_repeat();
379    bool match_dot_repeat_fast();
380    bool match_dot_repeat_slow();
381    bool match_backstep();
382    bool match_assert_backref();
383    bool match_toggle_case();
384 #ifdef BOOST_REGEX_RECURSIVE
385    bool backtrack_till_match(std::size_t count);
386 #endif
387 
388    // find procs stored in s_find_vtable:
389    bool find_restart_any();
390    bool find_restart_word();
391    bool find_restart_line();
392    bool find_restart_buf();
393    bool find_restart_lit();
394 
395 private:
396    // final result structure to be filled in:
397    match_results<BidiIterator, Allocator>& m_result;
398    // temporary result for POSIX matches:
399    scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
400    // pointer to actual result structure to fill in:
401    match_results<BidiIterator, Allocator>* m_presult;
402    // start of sequence being searched:
403    BidiIterator base;
404    // end of sequence being searched:
405    BidiIterator last;
406    // current character being examined:
407    BidiIterator position;
408    // where to restart next search after failed match attempt:
409    BidiIterator restart;
410    // where the current search started from, acts as base for $` during grep:
411    BidiIterator search_base;
412    // how far we can go back when matching lookbehind:
413    BidiIterator backstop;
414    // the expression being examined:
415    const basic_regex<char_type, traits>& re;
416    // the expression's traits class:
417    const ::boost::regex_traits_wrapper<traits>& traits_inst;
418    // the next state in the machine being matched:
419    const re_syntax_base* pstate;
420    // matching flags in use:
421    match_flag_type m_match_flags;
422    // how many states we have examined so far:
423    difference_type state_count;
424    // max number of states to examine before giving up:
425    difference_type max_state_count;
426    // whether we should ignore case or not:
427    bool icase;
428    // set to true when (position == last), indicates that we may have a partial match:
429    bool m_has_partial_match;
430    // set to true whenever we get a match:
431    bool m_has_found_match;
432    // set to true whenever we're inside an independent sub-expression:
433    bool m_independent;
434    // the current repeat being examined:
435    repeater_count<BidiIterator>* next_count;
436    // the first repeat being examined (top of linked list):
437    repeater_count<BidiIterator> rep_obj;
438    // the mask to pass when matching word boundaries:
439    typename traits::char_class_type m_word_mask;
440    // the bitmask to use when determining whether a match_any matches a newline or not:
441    unsigned char match_any_mask;
442 
443 #ifdef BOOST_REGEX_NON_RECURSIVE
444    //
445    // additional members for non-recursive version:
446    //
447    typedef bool (self_type::*unwind_proc_type)(bool);
448 
449    void extend_stack();
450    bool unwind(bool);
451    bool unwind_end(bool);
452    bool unwind_paren(bool);
453    bool unwind_recursion_stopper(bool);
454    bool unwind_assertion(bool);
455    bool unwind_alt(bool);
456    bool unwind_repeater_counter(bool);
457    bool unwind_extra_block(bool);
458    bool unwind_greedy_single_repeat(bool);
459    bool unwind_slow_dot_repeat(bool);
460    bool unwind_fast_dot_repeat(bool);
461    bool unwind_char_repeat(bool);
462    bool unwind_short_set_repeat(bool);
463    bool unwind_long_set_repeat(bool);
464    bool unwind_non_greedy_repeat(bool);
465    void destroy_single_repeat();
466    void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
467    void push_recursion_stopper();
468    void push_assertion(const re_syntax_base* ps, bool positive);
469    void push_alt(const re_syntax_base* ps);
470    void push_repeater_count(int i, repeater_count<BidiIterator>** s);
471    void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int id);
472    void push_non_greedy_repeat(const re_syntax_base* ps);
473 
474 
475    // pointer to base of stack:
476    saved_state* m_stack_base;
477    // pointer to current stack position:
478    saved_state* m_backup_state;
479    // determines what value to return when unwinding from recursion,
480    // allows for mixed recursive/non-recursive algorithm:
481    bool m_recursive_result;
482    // how many memory blocks have we used up?:
483    unsigned used_block_count;
484 #endif
485 
486    // these operations aren't allowed, so are declared private,
487    // bodies are provided to keep explicit-instantiation requests happy:
operator =(const perl_matcher &)488    perl_matcher& operator=(const perl_matcher&)
489    {
490       return *this;
491    }
perl_matcher(const perl_matcher & that)492    perl_matcher(const perl_matcher& that)
493       : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
494 };
495 
496 #ifdef BOOST_MSVC
497 #pragma warning(pop)
498 #endif
499 
500 } // namespace re_detail
501 
502 #ifdef BOOST_HAS_ABI_HEADERS
503 #  include BOOST_ABI_SUFFIX
504 #endif
505 
506 } // namespace boost
507 
508 //
509 // include the implementation of perl_matcher:
510 //
511 #ifdef BOOST_REGEX_RECURSIVE
512 #include <boost/regex/v4/perl_matcher_recursive.hpp>
513 #else
514 #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
515 #endif
516 // this one has to be last:
517 #include <boost/regex/v4/perl_matcher_common.hpp>
518 
519 #endif
520 
521