1 /*
2  *
3  * Copyright (c) 2004
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
14   *   FILE         basic_regex_creator.cpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Declares template class basic_regex_creator which fills in
17   *                the data members of a regex_data object.
18   */
19 
20 #ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
21 #define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
22 
23 #ifdef BOOST_MSVC
24 #pragma warning(push)
25 #pragma warning(disable: 4103)
26 #endif
27 #ifdef BOOST_HAS_ABI_HEADERS
28 #  include BOOST_ABI_PREFIX
29 #endif
30 #ifdef BOOST_MSVC
31 #pragma warning(pop)
32 #endif
33 
34 #ifdef BOOST_MSVC
35 #  pragma warning(push)
36 #if BOOST_MSVC < 1910
37 #pragma warning(disable:4800)
38 #endif
39 #endif
40 
41 namespace boost{
42 
43 namespace BOOST_REGEX_DETAIL_NS{
44 
45 template <class charT>
46 struct digraph : public std::pair<charT, charT>
47 {
digraphboost::BOOST_REGEX_DETAIL_NS::digraph48    digraph() : std::pair<charT, charT>(charT(0), charT(0)){}
digraphboost::BOOST_REGEX_DETAIL_NS::digraph49    digraph(charT c1) : std::pair<charT, charT>(c1, charT(0)){}
digraphboost::BOOST_REGEX_DETAIL_NS::digraph50    digraph(charT c1, charT c2) : std::pair<charT, charT>(c1, c2)
51    {}
digraphboost::BOOST_REGEX_DETAIL_NS::digraph52    digraph(const digraph<charT>& d) : std::pair<charT, charT>(d.first, d.second){}
53    template <class Seq>
digraphboost::BOOST_REGEX_DETAIL_NS::digraph54    digraph(const Seq& s) : std::pair<charT, charT>()
55    {
56       BOOST_ASSERT(s.size() <= 2);
57       BOOST_ASSERT(s.size());
58       this->first = s[0];
59       this->second = (s.size() > 1) ? s[1] : 0;
60    }
61 };
62 
63 template <class charT, class traits>
64 class basic_char_set
65 {
66 public:
67    typedef digraph<charT>                   digraph_type;
68    typedef typename traits::string_type     string_type;
69    typedef typename traits::char_class_type m_type;
70 
basic_char_set()71    basic_char_set()
72    {
73       m_negate = false;
74       m_has_digraphs = false;
75       m_classes = 0;
76       m_negated_classes = 0;
77       m_empty = true;
78    }
79 
add_single(const digraph_type & s)80    void add_single(const digraph_type& s)
81    {
82       m_singles.insert(s);
83       if(s.second)
84          m_has_digraphs = true;
85       m_empty = false;
86    }
add_range(const digraph_type & first,const digraph_type & end)87    void add_range(const digraph_type& first, const digraph_type& end)
88    {
89       m_ranges.push_back(first);
90       m_ranges.push_back(end);
91       if(first.second)
92       {
93          m_has_digraphs = true;
94          add_single(first);
95       }
96       if(end.second)
97       {
98          m_has_digraphs = true;
99          add_single(end);
100       }
101       m_empty = false;
102    }
add_class(m_type m)103    void add_class(m_type m)
104    {
105       m_classes |= m;
106       m_empty = false;
107    }
add_negated_class(m_type m)108    void add_negated_class(m_type m)
109    {
110       m_negated_classes |= m;
111       m_empty = false;
112    }
add_equivalent(const digraph_type & s)113    void add_equivalent(const digraph_type& s)
114    {
115       m_equivalents.insert(s);
116       if(s.second)
117       {
118          m_has_digraphs = true;
119          add_single(s);
120       }
121       m_empty = false;
122    }
negate()123    void negate()
124    {
125       m_negate = true;
126       //m_empty = false;
127    }
128 
129    //
130    // accessor functions:
131    //
has_digraphs() const132    bool has_digraphs()const
133    {
134       return m_has_digraphs;
135    }
is_negated() const136    bool is_negated()const
137    {
138       return m_negate;
139    }
140    typedef typename std::vector<digraph_type>::const_iterator  list_iterator;
141    typedef typename std::set<digraph_type>::const_iterator     set_iterator;
singles_begin() const142    set_iterator singles_begin()const
143    {
144       return m_singles.begin();
145    }
singles_end() const146    set_iterator singles_end()const
147    {
148       return m_singles.end();
149    }
ranges_begin() const150    list_iterator ranges_begin()const
151    {
152       return m_ranges.begin();
153    }
ranges_end() const154    list_iterator ranges_end()const
155    {
156       return m_ranges.end();
157    }
equivalents_begin() const158    set_iterator equivalents_begin()const
159    {
160       return m_equivalents.begin();
161    }
equivalents_end() const162    set_iterator equivalents_end()const
163    {
164       return m_equivalents.end();
165    }
classes() const166    m_type classes()const
167    {
168       return m_classes;
169    }
negated_classes() const170    m_type negated_classes()const
171    {
172       return m_negated_classes;
173    }
empty() const174    bool empty()const
175    {
176       return m_empty;
177    }
178 private:
179    std::set<digraph_type>    m_singles;         // a list of single characters to match
180    std::vector<digraph_type> m_ranges;          // a list of end points of our ranges
181    bool                      m_negate;          // true if the set is to be negated
182    bool                      m_has_digraphs;    // true if we have digraphs present
183    m_type                    m_classes;         // character classes to match
184    m_type                    m_negated_classes; // negated character classes to match
185    bool                      m_empty;           // whether we've added anything yet
186    std::set<digraph_type>    m_equivalents;     // a list of equivalence classes
187 };
188 
189 template <class charT, class traits>
190 class basic_regex_creator
191 {
192 public:
193    basic_regex_creator(regex_data<charT, traits>* data);
getoffset(void * addr)194    std::ptrdiff_t getoffset(void* addr)
195    {
196       return getoffset(addr, m_pdata->m_data.data());
197    }
getoffset(const void * addr,const void * base)198    std::ptrdiff_t getoffset(const void* addr, const void* base)
199    {
200       return static_cast<const char*>(addr) - static_cast<const char*>(base);
201    }
getaddress(std::ptrdiff_t off)202    re_syntax_base* getaddress(std::ptrdiff_t off)
203    {
204       return getaddress(off, m_pdata->m_data.data());
205    }
getaddress(std::ptrdiff_t off,void * base)206    re_syntax_base* getaddress(std::ptrdiff_t off, void* base)
207    {
208       return static_cast<re_syntax_base*>(static_cast<void*>(static_cast<char*>(base) + off));
209    }
init(unsigned l_flags)210    void init(unsigned l_flags)
211    {
212       m_pdata->m_flags = l_flags;
213       m_icase = l_flags & regex_constants::icase;
214    }
flags()215    regbase::flag_type flags()
216    {
217       return m_pdata->m_flags;
218    }
flags(regbase::flag_type f)219    void flags(regbase::flag_type f)
220    {
221       m_pdata->m_flags = f;
222       if(m_icase != static_cast<bool>(f & regbase::icase))
223       {
224          m_icase = static_cast<bool>(f & regbase::icase);
225       }
226    }
227    re_syntax_base* append_state(syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
228    re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
229    re_literal* append_literal(charT c);
230    re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set);
231    re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, mpl::false_*);
232    re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, mpl::true_*);
233    void finalize(const charT* p1, const charT* p2);
234 protected:
235    regex_data<charT, traits>*    m_pdata;              // pointer to the basic_regex_data struct we are filling in
236    const ::boost::regex_traits_wrapper<traits>&
237                                  m_traits;             // convenience reference to traits class
238    re_syntax_base*               m_last_state;         // the last state we added
239    bool                          m_icase;              // true for case insensitive matches
240    unsigned                      m_repeater_id;        // the state_id of the next repeater
241    bool                          m_has_backrefs;       // true if there are actually any backrefs
242    unsigned                      m_backrefs;           // bitmask of permitted backrefs
243    boost::uintmax_t              m_bad_repeats;        // bitmask of repeats we can't deduce a startmap for;
244    bool                          m_has_recursions;     // set when we have recursive expresisons to fixup
245    std::vector<unsigned char>    m_recursion_checks;   // notes which recursions we've followed while analysing this expression
246    typename traits::char_class_type m_word_mask;       // mask used to determine if a character is a word character
247    typename traits::char_class_type m_mask_space;      // mask used to determine if a character is a word character
248    typename traits::char_class_type m_lower_mask;       // mask used to determine if a character is a lowercase character
249    typename traits::char_class_type m_upper_mask;      // mask used to determine if a character is an uppercase character
250    typename traits::char_class_type m_alpha_mask;      // mask used to determine if a character is an alphabetic character
251 private:
252    basic_regex_creator& operator=(const basic_regex_creator&);
253    basic_regex_creator(const basic_regex_creator&);
254 
255    void fixup_pointers(re_syntax_base* state);
256    void fixup_recursions(re_syntax_base* state);
257    void create_startmaps(re_syntax_base* state);
258    int calculate_backstep(re_syntax_base* state);
259    void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask);
260    unsigned get_restart_type(re_syntax_base* state);
261    void set_all_masks(unsigned char* bits, unsigned char);
262    bool is_bad_repeat(re_syntax_base* pt);
263    void set_bad_repeat(re_syntax_base* pt);
264    syntax_element_type get_repeat_type(re_syntax_base* state);
265    void probe_leading_repeat(re_syntax_base* state);
266 };
267 
268 template <class charT, class traits>
basic_regex_creator(regex_data<charT,traits> * data)269 basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
270    : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0), m_has_recursions(false)
271 {
272    m_pdata->m_data.clear();
273    m_pdata->m_status = ::boost::regex_constants::error_ok;
274    static const charT w = 'w';
275    static const charT s = 's';
276    static const charT l[5] = { 'l', 'o', 'w', 'e', 'r', };
277    static const charT u[5] = { 'u', 'p', 'p', 'e', 'r', };
278    static const charT a[5] = { 'a', 'l', 'p', 'h', 'a', };
279    m_word_mask = m_traits.lookup_classname(&w, &w +1);
280    m_mask_space = m_traits.lookup_classname(&s, &s +1);
281    m_lower_mask = m_traits.lookup_classname(l, l + 5);
282    m_upper_mask = m_traits.lookup_classname(u, u + 5);
283    m_alpha_mask = m_traits.lookup_classname(a, a + 5);
284    m_pdata->m_word_mask = m_word_mask;
285    BOOST_ASSERT(m_word_mask != 0);
286    BOOST_ASSERT(m_mask_space != 0);
287    BOOST_ASSERT(m_lower_mask != 0);
288    BOOST_ASSERT(m_upper_mask != 0);
289    BOOST_ASSERT(m_alpha_mask != 0);
290 }
291 
292 template <class charT, class traits>
append_state(syntax_element_type t,std::size_t s)293 re_syntax_base* basic_regex_creator<charT, traits>::append_state(syntax_element_type t, std::size_t s)
294 {
295    // if the state is a backref then make a note of it:
296    if(t == syntax_element_backref)
297       this->m_has_backrefs = true;
298    // append a new state, start by aligning our last one:
299    m_pdata->m_data.align();
300    // set the offset to the next state in our last one:
301    if(m_last_state)
302       m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
303    // now actually extent our data:
304    m_last_state = static_cast<re_syntax_base*>(m_pdata->m_data.extend(s));
305    // fill in boilerplate options in the new state:
306    m_last_state->next.i = 0;
307    m_last_state->type = t;
308    return m_last_state;
309 }
310 
311 template <class charT, class traits>
insert_state(std::ptrdiff_t pos,syntax_element_type t,std::size_t s)312 re_syntax_base* basic_regex_creator<charT, traits>::insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s)
313 {
314    // append a new state, start by aligning our last one:
315    m_pdata->m_data.align();
316    // set the offset to the next state in our last one:
317    if(m_last_state)
318       m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
319    // remember the last state position:
320    std::ptrdiff_t off = getoffset(m_last_state) + s;
321    // now actually insert our data:
322    re_syntax_base* new_state = static_cast<re_syntax_base*>(m_pdata->m_data.insert(pos, s));
323    // fill in boilerplate options in the new state:
324    new_state->next.i = s;
325    new_state->type = t;
326    m_last_state = getaddress(off);
327    return new_state;
328 }
329 
330 template <class charT, class traits>
append_literal(charT c)331 re_literal* basic_regex_creator<charT, traits>::append_literal(charT c)
332 {
333    re_literal* result;
334    // start by seeing if we have an existing re_literal we can extend:
335    if((0 == m_last_state) || (m_last_state->type != syntax_element_literal))
336    {
337       // no existing re_literal, create a new one:
338       result = static_cast<re_literal*>(append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
339       result->length = 1;
340       *static_cast<charT*>(static_cast<void*>(result+1)) = m_traits.translate(c, m_icase);
341    }
342    else
343    {
344       // we have an existing re_literal, extend it:
345       std::ptrdiff_t off = getoffset(m_last_state);
346       m_pdata->m_data.extend(sizeof(charT));
347       m_last_state = result = static_cast<re_literal*>(getaddress(off));
348       charT* characters = static_cast<charT*>(static_cast<void*>(result+1));
349       characters[result->length] = m_traits.translate(c, m_icase);
350       result->length += 1;
351    }
352    return result;
353 }
354 
355 template <class charT, class traits>
append_set(const basic_char_set<charT,traits> & char_set)356 inline re_syntax_base* basic_regex_creator<charT, traits>::append_set(
357    const basic_char_set<charT, traits>& char_set)
358 {
359    typedef mpl::bool_< (sizeof(charT) == 1) > truth_type;
360    return char_set.has_digraphs()
361       ? append_set(char_set, static_cast<mpl::false_*>(0))
362       : append_set(char_set, static_cast<truth_type*>(0));
363 }
364 
365 template <class charT, class traits>
append_set(const basic_char_set<charT,traits> & char_set,mpl::false_ *)366 re_syntax_base* basic_regex_creator<charT, traits>::append_set(
367    const basic_char_set<charT, traits>& char_set, mpl::false_*)
368 {
369    typedef typename traits::string_type string_type;
370    typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
371    typedef typename basic_char_set<charT, traits>::set_iterator  set_iterator;
372    typedef typename traits::char_class_type m_type;
373 
374    re_set_long<m_type>* result = static_cast<re_set_long<m_type>*>(append_state(syntax_element_long_set, sizeof(re_set_long<m_type>)));
375    //
376    // fill in the basics:
377    //
378    result->csingles = static_cast<unsigned int>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.singles_begin(), char_set.singles_end()));
379    result->cranges = static_cast<unsigned int>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.ranges_begin(), char_set.ranges_end())) / 2;
380    result->cequivalents = static_cast<unsigned int>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.equivalents_begin(), char_set.equivalents_end()));
381    result->cclasses = char_set.classes();
382    result->cnclasses = char_set.negated_classes();
383    if(flags() & regbase::icase)
384    {
385       // adjust classes as needed:
386       if(((result->cclasses & m_lower_mask) == m_lower_mask) || ((result->cclasses & m_upper_mask) == m_upper_mask))
387          result->cclasses |= m_alpha_mask;
388       if(((result->cnclasses & m_lower_mask) == m_lower_mask) || ((result->cnclasses & m_upper_mask) == m_upper_mask))
389          result->cnclasses |= m_alpha_mask;
390    }
391 
392    result->isnot = char_set.is_negated();
393    result->singleton = !char_set.has_digraphs();
394    //
395    // remember where the state is for later:
396    //
397    std::ptrdiff_t offset = getoffset(result);
398    //
399    // now extend with all the singles:
400    //
401    item_iterator first, last;
402    set_iterator sfirst, slast;
403    sfirst = char_set.singles_begin();
404    slast = char_set.singles_end();
405    while(sfirst != slast)
406    {
407       charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (sfirst->first == static_cast<charT>(0) ? 1 : sfirst->second ? 3 : 2)));
408       p[0] = m_traits.translate(sfirst->first, m_icase);
409       if(sfirst->first == static_cast<charT>(0))
410       {
411          p[0] = 0;
412       }
413       else if(sfirst->second)
414       {
415          p[1] = m_traits.translate(sfirst->second, m_icase);
416          p[2] = 0;
417       }
418       else
419          p[1] = 0;
420       ++sfirst;
421    }
422    //
423    // now extend with all the ranges:
424    //
425    first = char_set.ranges_begin();
426    last = char_set.ranges_end();
427    while(first != last)
428    {
429       // first grab the endpoints of the range:
430       digraph<charT> c1 = *first;
431       c1.first = this->m_traits.translate(c1.first, this->m_icase);
432       c1.second = this->m_traits.translate(c1.second, this->m_icase);
433       ++first;
434       digraph<charT> c2 = *first;
435       c2.first = this->m_traits.translate(c2.first, this->m_icase);
436       c2.second = this->m_traits.translate(c2.second, this->m_icase);
437       ++first;
438       string_type s1, s2;
439       // different actions now depending upon whether collation is turned on:
440       if(flags() & regex_constants::collate)
441       {
442          // we need to transform our range into sort keys:
443          charT a1[3] = { c1.first, c1.second, charT(0), };
444          charT a2[3] = { c2.first, c2.second, charT(0), };
445          s1 = this->m_traits.transform(a1, (a1[1] ? a1+2 : a1+1));
446          s2 = this->m_traits.transform(a2, (a2[1] ? a2+2 : a2+1));
447          if(s1.size() == 0)
448             s1 = string_type(1, charT(0));
449          if(s2.size() == 0)
450             s2 = string_type(1, charT(0));
451       }
452       else
453       {
454          if(c1.second)
455          {
456             s1.insert(s1.end(), c1.first);
457             s1.insert(s1.end(), c1.second);
458          }
459          else
460             s1 = string_type(1, c1.first);
461          if(c2.second)
462          {
463             s2.insert(s2.end(), c2.first);
464             s2.insert(s2.end(), c2.second);
465          }
466          else
467             s2.insert(s2.end(), c2.first);
468       }
469       if(s1 > s2)
470       {
471          // Oops error:
472          return 0;
473       }
474       charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s1.size() + s2.size() + 2) ) );
475       BOOST_REGEX_DETAIL_NS::copy(s1.begin(), s1.end(), p);
476       p[s1.size()] = charT(0);
477       p += s1.size() + 1;
478       BOOST_REGEX_DETAIL_NS::copy(s2.begin(), s2.end(), p);
479       p[s2.size()] = charT(0);
480    }
481    //
482    // now process the equivalence classes:
483    //
484    sfirst = char_set.equivalents_begin();
485    slast = char_set.equivalents_end();
486    while(sfirst != slast)
487    {
488       string_type s;
489       if(sfirst->second)
490       {
491          charT cs[3] = { sfirst->first, sfirst->second, charT(0), };
492          s = m_traits.transform_primary(cs, cs+2);
493       }
494       else
495          s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1);
496       if(s.empty())
497          return 0;  // invalid or unsupported equivalence class
498       charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) );
499       BOOST_REGEX_DETAIL_NS::copy(s.begin(), s.end(), p);
500       p[s.size()] = charT(0);
501       ++sfirst;
502    }
503    //
504    // finally reset the address of our last state:
505    //
506    m_last_state = result = static_cast<re_set_long<m_type>*>(getaddress(offset));
507    return result;
508 }
509 
510 template<class T>
char_less(T t1,T t2)511 inline bool char_less(T t1, T t2)
512 {
513    return t1 < t2;
514 }
char_less(char t1,char t2)515 inline bool char_less(char t1, char t2)
516 {
517    return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
518 }
char_less(signed char t1,signed char t2)519 inline bool char_less(signed char t1, signed char t2)
520 {
521    return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
522 }
523 
524 template <class charT, class traits>
append_set(const basic_char_set<charT,traits> & char_set,mpl::true_ *)525 re_syntax_base* basic_regex_creator<charT, traits>::append_set(
526    const basic_char_set<charT, traits>& char_set, mpl::true_*)
527 {
528    typedef typename traits::string_type string_type;
529    typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
530    typedef typename basic_char_set<charT, traits>::set_iterator set_iterator;
531 
532    re_set* result = static_cast<re_set*>(append_state(syntax_element_set, sizeof(re_set)));
533    bool negate = char_set.is_negated();
534    std::memset(result->_map, 0, sizeof(result->_map));
535    //
536    // handle singles first:
537    //
538    item_iterator first, last;
539    set_iterator sfirst, slast;
540    sfirst = char_set.singles_begin();
541    slast = char_set.singles_end();
542    while(sfirst != slast)
543    {
544       for(unsigned int i = 0; i < (1 << CHAR_BIT); ++i)
545       {
546          if(this->m_traits.translate(static_cast<charT>(i), this->m_icase)
547             == this->m_traits.translate(sfirst->first, this->m_icase))
548             result->_map[i] = true;
549       }
550       ++sfirst;
551    }
552    //
553    // OK now handle ranges:
554    //
555    first = char_set.ranges_begin();
556    last = char_set.ranges_end();
557    while(first != last)
558    {
559       // first grab the endpoints of the range:
560       charT c1 = this->m_traits.translate(first->first, this->m_icase);
561       ++first;
562       charT c2 = this->m_traits.translate(first->first, this->m_icase);
563       ++first;
564       // different actions now depending upon whether collation is turned on:
565       if(flags() & regex_constants::collate)
566       {
567          // we need to transform our range into sort keys:
568          charT c3[2] = { c1, charT(0), };
569          string_type s1 = this->m_traits.transform(c3, c3+1);
570          c3[0] = c2;
571          string_type s2 = this->m_traits.transform(c3, c3+1);
572          if(s1 > s2)
573          {
574             // Oops error:
575             return 0;
576          }
577          BOOST_ASSERT(c3[1] == charT(0));
578          for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
579          {
580             c3[0] = static_cast<charT>(i);
581             string_type s3 = this->m_traits.transform(c3, c3 +1);
582             if((s1 <= s3) && (s3 <= s2))
583                result->_map[i] = true;
584          }
585       }
586       else
587       {
588          if(char_less(c2, c1))
589          {
590             // Oops error:
591             return 0;
592          }
593          // everything in range matches:
594          std::memset(result->_map + static_cast<unsigned char>(c1), true, 1 + static_cast<unsigned char>(c2) - static_cast<unsigned char>(c1));
595       }
596    }
597    //
598    // and now the classes:
599    //
600    typedef typename traits::char_class_type m_type;
601    m_type m = char_set.classes();
602    if(flags() & regbase::icase)
603    {
604       // adjust m as needed:
605       if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
606          m |= m_alpha_mask;
607    }
608    if(m != 0)
609    {
610       for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
611       {
612          if(this->m_traits.isctype(static_cast<charT>(i), m))
613             result->_map[i] = true;
614       }
615    }
616    //
617    // and now the negated classes:
618    //
619    m = char_set.negated_classes();
620    if(flags() & regbase::icase)
621    {
622       // adjust m as needed:
623       if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
624          m |= m_alpha_mask;
625    }
626    if(m != 0)
627    {
628       for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
629       {
630          if(0 == this->m_traits.isctype(static_cast<charT>(i), m))
631             result->_map[i] = true;
632       }
633    }
634    //
635    // now process the equivalence classes:
636    //
637    sfirst = char_set.equivalents_begin();
638    slast = char_set.equivalents_end();
639    while(sfirst != slast)
640    {
641       string_type s;
642       BOOST_ASSERT(static_cast<charT>(0) == sfirst->second);
643       s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1);
644       if(s.empty())
645          return 0;  // invalid or unsupported equivalence class
646       for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
647       {
648          charT c[2] = { (static_cast<charT>(i)), charT(0), };
649          string_type s2 = this->m_traits.transform_primary(c, c+1);
650          if(s == s2)
651             result->_map[i] = true;
652       }
653       ++sfirst;
654    }
655    if(negate)
656    {
657       for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
658       {
659          result->_map[i] = !(result->_map[i]);
660       }
661    }
662    return result;
663 }
664 
665 template <class charT, class traits>
finalize(const charT * p1,const charT * p2)666 void basic_regex_creator<charT, traits>::finalize(const charT* p1, const charT* p2)
667 {
668    if(this->m_pdata->m_status)
669       return;
670    // we've added all the states we need, now finish things off.
671    // start by adding a terminating state:
672    append_state(syntax_element_match);
673    // extend storage to store original expression:
674    std::ptrdiff_t len = p2 - p1;
675    m_pdata->m_expression_len = len;
676    charT* ps = static_cast<charT*>(m_pdata->m_data.extend(sizeof(charT) * (1 + (p2 - p1))));
677    m_pdata->m_expression = ps;
678    BOOST_REGEX_DETAIL_NS::copy(p1, p2, ps);
679    ps[p2 - p1] = 0;
680    // fill in our other data...
681    // successful parsing implies a zero status:
682    m_pdata->m_status = 0;
683    // get the first state of the machine:
684    m_pdata->m_first_state = static_cast<re_syntax_base*>(m_pdata->m_data.data());
685    // fixup pointers in the machine:
686    fixup_pointers(m_pdata->m_first_state);
687    if(m_has_recursions)
688    {
689       m_pdata->m_has_recursions = true;
690       fixup_recursions(m_pdata->m_first_state);
691       if(this->m_pdata->m_status)
692          return;
693    }
694    else
695       m_pdata->m_has_recursions = false;
696    // create nested startmaps:
697    create_startmaps(m_pdata->m_first_state);
698    // create main startmap:
699    std::memset(m_pdata->m_startmap, 0, sizeof(m_pdata->m_startmap));
700    m_pdata->m_can_be_null = 0;
701 
702    m_bad_repeats = 0;
703    if(m_has_recursions)
704       m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
705    create_startmap(m_pdata->m_first_state, m_pdata->m_startmap, &(m_pdata->m_can_be_null), mask_all);
706    // get the restart type:
707    m_pdata->m_restart_type = get_restart_type(m_pdata->m_first_state);
708    // optimise a leading repeat if there is one:
709    probe_leading_repeat(m_pdata->m_first_state);
710 }
711 
712 template <class charT, class traits>
fixup_pointers(re_syntax_base * state)713 void basic_regex_creator<charT, traits>::fixup_pointers(re_syntax_base* state)
714 {
715    while(state)
716    {
717       switch(state->type)
718       {
719       case syntax_element_recurse:
720          m_has_recursions = true;
721          if(state->next.i)
722             state->next.p = getaddress(state->next.i, state);
723          else
724             state->next.p = 0;
725          break;
726       case syntax_element_rep:
727       case syntax_element_dot_rep:
728       case syntax_element_char_rep:
729       case syntax_element_short_set_rep:
730       case syntax_element_long_set_rep:
731          // set the state_id of this repeat:
732          static_cast<re_repeat*>(state)->state_id = m_repeater_id++;
733          BOOST_FALLTHROUGH;
734       case syntax_element_alt:
735          std::memset(static_cast<re_alt*>(state)->_map, 0, sizeof(static_cast<re_alt*>(state)->_map));
736          static_cast<re_alt*>(state)->can_be_null = 0;
737          BOOST_FALLTHROUGH;
738       case syntax_element_jump:
739          static_cast<re_jump*>(state)->alt.p = getaddress(static_cast<re_jump*>(state)->alt.i, state);
740          BOOST_FALLTHROUGH;
741       default:
742          if(state->next.i)
743             state->next.p = getaddress(state->next.i, state);
744          else
745             state->next.p = 0;
746       }
747       state = state->next.p;
748    }
749 }
750 
751 template <class charT, class traits>
fixup_recursions(re_syntax_base * state)752 void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
753 {
754    re_syntax_base* base = state;
755    while(state)
756    {
757       switch(state->type)
758       {
759       case syntax_element_assert_backref:
760          {
761             // just check that the index is valid:
762             int idx = static_cast<const re_brace*>(state)->index;
763             if(idx < 0)
764             {
765                idx = -idx-1;
766                if(idx >= 10000)
767                {
768                   idx = m_pdata->get_id(idx);
769                   if(idx <= 0)
770                   {
771                      // check of sub-expression that doesn't exist:
772                      if(0 == this->m_pdata->m_status) // update the error code if not already set
773                         this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
774                      //
775                      // clear the expression, we should be empty:
776                      //
777                      this->m_pdata->m_expression = 0;
778                      this->m_pdata->m_expression_len = 0;
779                      //
780                      // and throw if required:
781                      //
782                      if(0 == (this->flags() & regex_constants::no_except))
783                      {
784                         std::string message = "Encountered a forward reference to a marked sub-expression that does not exist.";
785                         boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
786                         e.raise();
787                      }
788                   }
789                }
790             }
791          }
792          break;
793       case syntax_element_recurse:
794          {
795             bool ok = false;
796             re_syntax_base* p = base;
797             std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
798             if(idx > 10000)
799             {
800                //
801                // There may be more than one capture group with this hash, just do what Perl
802                // does and recurse to the leftmost:
803                //
804                idx = m_pdata->get_id(static_cast<int>(idx));
805             }
806             if(idx < 0)
807             {
808                ok = false;
809             }
810             else
811             {
812                while(p)
813                {
814                   if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx))
815                   {
816                      //
817                      // We've found the target of the recursion, set the jump target:
818                      //
819                      static_cast<re_jump*>(state)->alt.p = p;
820                      ok = true;
821                      //
822                      // Now scan the target for nested repeats:
823                      //
824                      p = p->next.p;
825                      int next_rep_id = 0;
826                      while(p)
827                      {
828                         switch(p->type)
829                         {
830                         case syntax_element_rep:
831                         case syntax_element_dot_rep:
832                         case syntax_element_char_rep:
833                         case syntax_element_short_set_rep:
834                         case syntax_element_long_set_rep:
835                            next_rep_id = static_cast<re_repeat*>(p)->state_id;
836                            break;
837                         case syntax_element_endmark:
838                            if(static_cast<const re_brace*>(p)->index == idx)
839                               next_rep_id = -1;
840                            break;
841                         default:
842                            break;
843                         }
844                         if(next_rep_id)
845                            break;
846                         p = p->next.p;
847                      }
848                      if(next_rep_id > 0)
849                      {
850                         static_cast<re_recurse*>(state)->state_id = next_rep_id - 1;
851                      }
852 
853                      break;
854                   }
855                   p = p->next.p;
856                }
857             }
858             if(!ok)
859             {
860                // recursion to sub-expression that doesn't exist:
861                if(0 == this->m_pdata->m_status) // update the error code if not already set
862                   this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
863                //
864                // clear the expression, we should be empty:
865                //
866                this->m_pdata->m_expression = 0;
867                this->m_pdata->m_expression_len = 0;
868                //
869                // and throw if required:
870                //
871                if(0 == (this->flags() & regex_constants::no_except))
872                {
873                   std::string message = "Encountered a forward reference to a recursive sub-expression that does not exist.";
874                   boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
875                   e.raise();
876                }
877             }
878          }
879          break;
880       default:
881          break;
882       }
883       state = state->next.p;
884    }
885 }
886 
887 template <class charT, class traits>
create_startmaps(re_syntax_base * state)888 void basic_regex_creator<charT, traits>::create_startmaps(re_syntax_base* state)
889 {
890    // non-recursive implementation:
891    // create the last map in the machine first, so that earlier maps
892    // can make use of the result...
893    //
894    // This was originally a recursive implementation, but that caused stack
895    // overflows with complex expressions on small stacks (think COM+).
896 
897    // start by saving the case setting:
898    bool l_icase = m_icase;
899    std::vector<std::pair<bool, re_syntax_base*> > v;
900 
901    while(state)
902    {
903       switch(state->type)
904       {
905       case syntax_element_toggle_case:
906          // we need to track case changes here:
907          m_icase = static_cast<re_case*>(state)->icase;
908          state = state->next.p;
909          continue;
910       case syntax_element_alt:
911       case syntax_element_rep:
912       case syntax_element_dot_rep:
913       case syntax_element_char_rep:
914       case syntax_element_short_set_rep:
915       case syntax_element_long_set_rep:
916          // just push the state onto our stack for now:
917          v.push_back(std::pair<bool, re_syntax_base*>(m_icase, state));
918          state = state->next.p;
919          break;
920       case syntax_element_backstep:
921          // we need to calculate how big the backstep is:
922          static_cast<re_brace*>(state)->index
923             = this->calculate_backstep(state->next.p);
924          if(static_cast<re_brace*>(state)->index < 0)
925          {
926             // Oops error:
927             if(0 == this->m_pdata->m_status) // update the error code if not already set
928                this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
929             //
930             // clear the expression, we should be empty:
931             //
932             this->m_pdata->m_expression = 0;
933             this->m_pdata->m_expression_len = 0;
934             //
935             // and throw if required:
936             //
937             if(0 == (this->flags() & regex_constants::no_except))
938             {
939                std::string message = "Invalid lookbehind assertion encountered in the regular expression.";
940                boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
941                e.raise();
942             }
943          }
944          BOOST_FALLTHROUGH;
945       default:
946          state = state->next.p;
947       }
948    }
949 
950    // now work through our list, building all the maps as we go:
951    while(v.size())
952    {
953       // Initialize m_recursion_checks if we need it:
954       if(m_has_recursions)
955          m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
956 
957       const std::pair<bool, re_syntax_base*>& p = v.back();
958       m_icase = p.first;
959       state = p.second;
960       v.pop_back();
961 
962       // Build maps:
963       m_bad_repeats = 0;
964       create_startmap(state->next.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_take);
965       m_bad_repeats = 0;
966 
967       if(m_has_recursions)
968          m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
969       create_startmap(static_cast<re_alt*>(state)->alt.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_skip);
970       // adjust the type of the state to allow for faster matching:
971       state->type = this->get_repeat_type(state);
972    }
973    // restore case sensitivity:
974    m_icase = l_icase;
975 }
976 
977 template <class charT, class traits>
calculate_backstep(re_syntax_base * state)978 int basic_regex_creator<charT, traits>::calculate_backstep(re_syntax_base* state)
979 {
980    typedef typename traits::char_class_type m_type;
981    int result = 0;
982    while(state)
983    {
984       switch(state->type)
985       {
986       case syntax_element_startmark:
987          if((static_cast<re_brace*>(state)->index == -1)
988             || (static_cast<re_brace*>(state)->index == -2))
989          {
990             state = static_cast<re_jump*>(state->next.p)->alt.p->next.p;
991             continue;
992          }
993          else if(static_cast<re_brace*>(state)->index == -3)
994          {
995             state = state->next.p->next.p;
996             continue;
997          }
998          break;
999       case syntax_element_endmark:
1000          if((static_cast<re_brace*>(state)->index == -1)
1001             || (static_cast<re_brace*>(state)->index == -2))
1002             return result;
1003          break;
1004       case syntax_element_literal:
1005          result += static_cast<re_literal*>(state)->length;
1006          break;
1007       case syntax_element_wild:
1008       case syntax_element_set:
1009          result += 1;
1010          break;
1011       case syntax_element_dot_rep:
1012       case syntax_element_char_rep:
1013       case syntax_element_short_set_rep:
1014       case syntax_element_backref:
1015       case syntax_element_rep:
1016       case syntax_element_combining:
1017       case syntax_element_long_set_rep:
1018       case syntax_element_backstep:
1019          {
1020             re_repeat* rep = static_cast<re_repeat *>(state);
1021             // adjust the type of the state to allow for faster matching:
1022             state->type = this->get_repeat_type(state);
1023             if((state->type == syntax_element_dot_rep)
1024                || (state->type == syntax_element_char_rep)
1025                || (state->type == syntax_element_short_set_rep))
1026             {
1027                if(rep->max != rep->min)
1028                   return -1;
1029                result += static_cast<int>(rep->min);
1030                state = rep->alt.p;
1031                continue;
1032             }
1033             else if(state->type == syntax_element_long_set_rep)
1034             {
1035                BOOST_ASSERT(rep->next.p->type == syntax_element_long_set);
1036                if(static_cast<re_set_long<m_type>*>(rep->next.p)->singleton == 0)
1037                   return -1;
1038                if(rep->max != rep->min)
1039                   return -1;
1040                result += static_cast<int>(rep->min);
1041                state = rep->alt.p;
1042                continue;
1043             }
1044          }
1045          return -1;
1046       case syntax_element_long_set:
1047          if(static_cast<re_set_long<m_type>*>(state)->singleton == 0)
1048             return -1;
1049          result += 1;
1050          break;
1051       case syntax_element_jump:
1052          state = static_cast<re_jump*>(state)->alt.p;
1053          continue;
1054       case syntax_element_alt:
1055          {
1056             int r1 = calculate_backstep(state->next.p);
1057             int r2 = calculate_backstep(static_cast<re_alt*>(state)->alt.p);
1058             if((r1 < 0) || (r1 != r2))
1059                return -1;
1060             return result + r1;
1061          }
1062       default:
1063          break;
1064       }
1065       state = state->next.p;
1066    }
1067    return -1;
1068 }
1069 
1070 template <class charT, class traits>
create_startmap(re_syntax_base * state,unsigned char * l_map,unsigned int * pnull,unsigned char mask)1071 void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask)
1072 {
1073    int not_last_jump = 1;
1074    re_syntax_base* recursion_start = 0;
1075    int recursion_sub = 0;
1076    re_syntax_base* recursion_restart = 0;
1077 
1078    // track case sensitivity:
1079    bool l_icase = m_icase;
1080 
1081    while(state)
1082    {
1083       switch(state->type)
1084       {
1085       case syntax_element_toggle_case:
1086          l_icase = static_cast<re_case*>(state)->icase;
1087          state = state->next.p;
1088          break;
1089       case syntax_element_literal:
1090       {
1091          // don't set anything in *pnull, set each element in l_map
1092          // that could match the first character in the literal:
1093          if(l_map)
1094          {
1095             l_map[0] |= mask_init;
1096             charT first_char = *static_cast<charT*>(static_cast<void*>(static_cast<re_literal*>(state) + 1));
1097             for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1098             {
1099                if(m_traits.translate(static_cast<charT>(i), l_icase) == first_char)
1100                   l_map[i] |= mask;
1101             }
1102          }
1103          return;
1104       }
1105       case syntax_element_end_line:
1106       {
1107          // next character must be a line separator (if there is one):
1108          if(l_map)
1109          {
1110             l_map[0] |= mask_init;
1111             l_map[static_cast<unsigned>('\n')] |= mask;
1112             l_map[static_cast<unsigned>('\r')] |= mask;
1113             l_map[static_cast<unsigned>('\f')] |= mask;
1114             l_map[0x85] |= mask;
1115          }
1116          // now figure out if we can match a NULL string at this point:
1117          if(pnull)
1118             create_startmap(state->next.p, 0, pnull, mask);
1119          return;
1120       }
1121       case syntax_element_recurse:
1122          {
1123             BOOST_ASSERT(static_cast<const re_jump*>(state)->alt.p->type == syntax_element_startmark);
1124             recursion_sub = static_cast<re_brace*>(static_cast<const re_jump*>(state)->alt.p)->index;
1125             if(m_recursion_checks[recursion_sub] & 1u)
1126             {
1127                // Infinite recursion!!
1128                if(0 == this->m_pdata->m_status) // update the error code if not already set
1129                   this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
1130                //
1131                // clear the expression, we should be empty:
1132                //
1133                this->m_pdata->m_expression = 0;
1134                this->m_pdata->m_expression_len = 0;
1135                //
1136                // and throw if required:
1137                //
1138                if(0 == (this->flags() & regex_constants::no_except))
1139                {
1140                   std::string message = "Encountered an infinite recursion.";
1141                   boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
1142                   e.raise();
1143                }
1144             }
1145             else if(recursion_start == 0)
1146             {
1147                recursion_start = state;
1148                recursion_restart = state->next.p;
1149                state = static_cast<re_jump*>(state)->alt.p;
1150                m_recursion_checks[recursion_sub] |= 1u;
1151                break;
1152             }
1153             m_recursion_checks[recursion_sub] |= 1u;
1154             // can't handle nested recursion here...
1155             BOOST_FALLTHROUGH;
1156          }
1157       case syntax_element_backref:
1158          // can be null, and any character can match:
1159          if(pnull)
1160             *pnull |= mask;
1161          BOOST_FALLTHROUGH;
1162       case syntax_element_wild:
1163       {
1164          // can't be null, any character can match:
1165          set_all_masks(l_map, mask);
1166          return;
1167       }
1168       case syntax_element_accept:
1169       case syntax_element_match:
1170       {
1171          // must be null, any character can match:
1172          set_all_masks(l_map, mask);
1173          if(pnull)
1174             *pnull |= mask;
1175          return;
1176       }
1177       case syntax_element_word_start:
1178       {
1179          // recurse, then AND with all the word characters:
1180          create_startmap(state->next.p, l_map, pnull, mask);
1181          if(l_map)
1182          {
1183             l_map[0] |= mask_init;
1184             for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1185             {
1186                if(!m_traits.isctype(static_cast<charT>(i), m_word_mask))
1187                   l_map[i] &= static_cast<unsigned char>(~mask);
1188             }
1189          }
1190          return;
1191       }
1192       case syntax_element_word_end:
1193       {
1194          // recurse, then AND with all the word characters:
1195          create_startmap(state->next.p, l_map, pnull, mask);
1196          if(l_map)
1197          {
1198             l_map[0] |= mask_init;
1199             for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1200             {
1201                if(m_traits.isctype(static_cast<charT>(i), m_word_mask))
1202                   l_map[i] &= static_cast<unsigned char>(~mask);
1203             }
1204          }
1205          return;
1206       }
1207       case syntax_element_buffer_end:
1208       {
1209          // we *must be null* :
1210          if(pnull)
1211             *pnull |= mask;
1212          return;
1213       }
1214       case syntax_element_long_set:
1215          if(l_map)
1216          {
1217             typedef typename traits::char_class_type m_type;
1218             if(static_cast<re_set_long<m_type>*>(state)->singleton)
1219             {
1220                l_map[0] |= mask_init;
1221                for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1222                {
1223                   charT c = static_cast<charT>(i);
1224                   if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<m_type>*>(state), *m_pdata, l_icase))
1225                      l_map[i] |= mask;
1226                }
1227             }
1228             else
1229                set_all_masks(l_map, mask);
1230          }
1231          return;
1232       case syntax_element_set:
1233          if(l_map)
1234          {
1235             l_map[0] |= mask_init;
1236             for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1237             {
1238                if(static_cast<re_set*>(state)->_map[
1239                   static_cast<unsigned char>(m_traits.translate(static_cast<charT>(i), l_icase))])
1240                   l_map[i] |= mask;
1241             }
1242          }
1243          return;
1244       case syntax_element_jump:
1245          // take the jump:
1246          state = static_cast<re_alt*>(state)->alt.p;
1247          not_last_jump = -1;
1248          break;
1249       case syntax_element_alt:
1250       case syntax_element_rep:
1251       case syntax_element_dot_rep:
1252       case syntax_element_char_rep:
1253       case syntax_element_short_set_rep:
1254       case syntax_element_long_set_rep:
1255          {
1256             re_alt* rep = static_cast<re_alt*>(state);
1257             if(rep->_map[0] & mask_init)
1258             {
1259                if(l_map)
1260                {
1261                   // copy previous results:
1262                   l_map[0] |= mask_init;
1263                   for(unsigned int i = 0; i <= UCHAR_MAX; ++i)
1264                   {
1265                      if(rep->_map[i] & mask_any)
1266                         l_map[i] |= mask;
1267                   }
1268                }
1269                if(pnull)
1270                {
1271                   if(rep->can_be_null & mask_any)
1272                      *pnull |= mask;
1273                }
1274             }
1275             else
1276             {
1277                // we haven't created a startmap for this alternative yet
1278                // so take the union of the two options:
1279                if(is_bad_repeat(state))
1280                {
1281                   set_all_masks(l_map, mask);
1282                   if(pnull)
1283                      *pnull |= mask;
1284                   return;
1285                }
1286                set_bad_repeat(state);
1287                create_startmap(state->next.p, l_map, pnull, mask);
1288                if((state->type == syntax_element_alt)
1289                   || (static_cast<re_repeat*>(state)->min == 0)
1290                   || (not_last_jump == 0))
1291                   create_startmap(rep->alt.p, l_map, pnull, mask);
1292             }
1293          }
1294          return;
1295       case syntax_element_soft_buffer_end:
1296          // match newline or null:
1297          if(l_map)
1298          {
1299             l_map[0] |= mask_init;
1300             l_map[static_cast<unsigned>('\n')] |= mask;
1301             l_map[static_cast<unsigned>('\r')] |= mask;
1302          }
1303          if(pnull)
1304             *pnull |= mask;
1305          return;
1306       case syntax_element_endmark:
1307          // need to handle independent subs as a special case:
1308          if(static_cast<re_brace*>(state)->index < 0)
1309          {
1310             // can be null, any character can match:
1311             set_all_masks(l_map, mask);
1312             if(pnull)
1313                *pnull |= mask;
1314             return;
1315          }
1316          else if(recursion_start && (recursion_sub != 0) && (recursion_sub == static_cast<re_brace*>(state)->index))
1317          {
1318             // recursion termination:
1319             recursion_start = 0;
1320             state = recursion_restart;
1321             break;
1322          }
1323 
1324          //
1325          // Normally we just go to the next state... but if this sub-expression is
1326          // the target of a recursion, then we might be ending a recursion, in which
1327          // case we should check whatever follows that recursion, as well as whatever
1328          // follows this state:
1329          //
1330          if(m_pdata->m_has_recursions && static_cast<re_brace*>(state)->index)
1331          {
1332             bool ok = false;
1333             re_syntax_base* p = m_pdata->m_first_state;
1334             while(p)
1335             {
1336                if(p->type == syntax_element_recurse)
1337                {
1338                   re_brace* p2 = static_cast<re_brace*>(static_cast<re_jump*>(p)->alt.p);
1339                   if((p2->type == syntax_element_startmark) && (p2->index == static_cast<re_brace*>(state)->index))
1340                   {
1341                      ok = true;
1342                      break;
1343                   }
1344                }
1345                p = p->next.p;
1346             }
1347             if(ok && ((m_recursion_checks[static_cast<re_brace*>(state)->index] & 2u) == 0))
1348             {
1349                m_recursion_checks[static_cast<re_brace*>(state)->index] |= 2u;
1350                create_startmap(p->next.p, l_map, pnull, mask);
1351             }
1352          }
1353          state = state->next.p;
1354          break;
1355 
1356       case syntax_element_commit:
1357          set_all_masks(l_map, mask);
1358          // Continue scanning so we can figure out whether we can be null:
1359          state = state->next.p;
1360          break;
1361       case syntax_element_startmark:
1362          // need to handle independent subs as a special case:
1363          if(static_cast<re_brace*>(state)->index == -3)
1364          {
1365             state = state->next.p->next.p;
1366             break;
1367          }
1368          BOOST_FALLTHROUGH;
1369       default:
1370          state = state->next.p;
1371       }
1372       ++not_last_jump;
1373    }
1374 }
1375 
1376 template <class charT, class traits>
get_restart_type(re_syntax_base * state)1377 unsigned basic_regex_creator<charT, traits>::get_restart_type(re_syntax_base* state)
1378 {
1379    //
1380    // find out how the machine starts, so we can optimise the search:
1381    //
1382    while(state)
1383    {
1384       switch(state->type)
1385       {
1386       case syntax_element_startmark:
1387       case syntax_element_endmark:
1388          state = state->next.p;
1389          continue;
1390       case syntax_element_start_line:
1391          return regbase::restart_line;
1392       case syntax_element_word_start:
1393          return regbase::restart_word;
1394       case syntax_element_buffer_start:
1395          return regbase::restart_buf;
1396       case syntax_element_restart_continue:
1397          return regbase::restart_continue;
1398       default:
1399          state = 0;
1400          continue;
1401       }
1402    }
1403    return regbase::restart_any;
1404 }
1405 
1406 template <class charT, class traits>
set_all_masks(unsigned char * bits,unsigned char mask)1407 void basic_regex_creator<charT, traits>::set_all_masks(unsigned char* bits, unsigned char mask)
1408 {
1409    //
1410    // set mask in all of bits elements,
1411    // if bits[0] has mask_init not set then we can
1412    // optimise this to a call to memset:
1413    //
1414    if(bits)
1415    {
1416       if(bits[0] == 0)
1417          (std::memset)(bits, mask, 1u << CHAR_BIT);
1418       else
1419       {
1420          for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
1421             bits[i] |= mask;
1422       }
1423       bits[0] |= mask_init;
1424    }
1425 }
1426 
1427 template <class charT, class traits>
is_bad_repeat(re_syntax_base * pt)1428 bool basic_regex_creator<charT, traits>::is_bad_repeat(re_syntax_base* pt)
1429 {
1430    switch(pt->type)
1431    {
1432    case syntax_element_rep:
1433    case syntax_element_dot_rep:
1434    case syntax_element_char_rep:
1435    case syntax_element_short_set_rep:
1436    case syntax_element_long_set_rep:
1437       {
1438          unsigned state_id = static_cast<re_repeat*>(pt)->state_id;
1439          if(state_id >= sizeof(m_bad_repeats) * CHAR_BIT)
1440             return true;  // run out of bits, assume we can't traverse this one.
1441          static const boost::uintmax_t one = 1uL;
1442          return m_bad_repeats & (one << state_id);
1443       }
1444    default:
1445       return false;
1446    }
1447 }
1448 
1449 template <class charT, class traits>
set_bad_repeat(re_syntax_base * pt)1450 void basic_regex_creator<charT, traits>::set_bad_repeat(re_syntax_base* pt)
1451 {
1452    switch(pt->type)
1453    {
1454    case syntax_element_rep:
1455    case syntax_element_dot_rep:
1456    case syntax_element_char_rep:
1457    case syntax_element_short_set_rep:
1458    case syntax_element_long_set_rep:
1459       {
1460          unsigned state_id = static_cast<re_repeat*>(pt)->state_id;
1461          static const boost::uintmax_t one = 1uL;
1462          if(state_id <= sizeof(m_bad_repeats) * CHAR_BIT)
1463             m_bad_repeats |= (one << state_id);
1464       }
1465       break;
1466    default:
1467       break;
1468    }
1469 }
1470 
1471 template <class charT, class traits>
get_repeat_type(re_syntax_base * state)1472 syntax_element_type basic_regex_creator<charT, traits>::get_repeat_type(re_syntax_base* state)
1473 {
1474    typedef typename traits::char_class_type m_type;
1475    if(state->type == syntax_element_rep)
1476    {
1477       // check to see if we are repeating a single state:
1478       if(state->next.p->next.p->next.p == static_cast<re_alt*>(state)->alt.p)
1479       {
1480          switch(state->next.p->type)
1481          {
1482          case BOOST_REGEX_DETAIL_NS::syntax_element_wild:
1483             return BOOST_REGEX_DETAIL_NS::syntax_element_dot_rep;
1484          case BOOST_REGEX_DETAIL_NS::syntax_element_literal:
1485             return BOOST_REGEX_DETAIL_NS::syntax_element_char_rep;
1486          case BOOST_REGEX_DETAIL_NS::syntax_element_set:
1487             return BOOST_REGEX_DETAIL_NS::syntax_element_short_set_rep;
1488          case BOOST_REGEX_DETAIL_NS::syntax_element_long_set:
1489             if(static_cast<BOOST_REGEX_DETAIL_NS::re_set_long<m_type>*>(state->next.p)->singleton)
1490                return BOOST_REGEX_DETAIL_NS::syntax_element_long_set_rep;
1491             break;
1492          default:
1493             break;
1494          }
1495       }
1496    }
1497    return state->type;
1498 }
1499 
1500 template <class charT, class traits>
probe_leading_repeat(re_syntax_base * state)1501 void basic_regex_creator<charT, traits>::probe_leading_repeat(re_syntax_base* state)
1502 {
1503    // enumerate our states, and see if we have a leading repeat
1504    // for which failed search restarts can be optimised;
1505    do
1506    {
1507       switch(state->type)
1508       {
1509       case syntax_element_startmark:
1510          if(static_cast<re_brace*>(state)->index >= 0)
1511          {
1512             state = state->next.p;
1513             continue;
1514          }
1515          if((static_cast<re_brace*>(state)->index == -1)
1516             || (static_cast<re_brace*>(state)->index == -2))
1517          {
1518             // skip past the zero width assertion:
1519             state = static_cast<const re_jump*>(state->next.p)->alt.p->next.p;
1520             continue;
1521          }
1522          if(static_cast<re_brace*>(state)->index == -3)
1523          {
1524             // Have to skip the leading jump state:
1525             state = state->next.p->next.p;
1526             continue;
1527          }
1528          return;
1529       case syntax_element_endmark:
1530       case syntax_element_start_line:
1531       case syntax_element_end_line:
1532       case syntax_element_word_boundary:
1533       case syntax_element_within_word:
1534       case syntax_element_word_start:
1535       case syntax_element_word_end:
1536       case syntax_element_buffer_start:
1537       case syntax_element_buffer_end:
1538       case syntax_element_restart_continue:
1539          state = state->next.p;
1540          break;
1541       case syntax_element_dot_rep:
1542       case syntax_element_char_rep:
1543       case syntax_element_short_set_rep:
1544       case syntax_element_long_set_rep:
1545          if(this->m_has_backrefs == 0)
1546             static_cast<re_repeat*>(state)->leading = true;
1547          BOOST_FALLTHROUGH;
1548       default:
1549          return;
1550       }
1551    }while(state);
1552 }
1553 
1554 
1555 } // namespace BOOST_REGEX_DETAIL_NS
1556 
1557 } // namespace boost
1558 
1559 #ifdef BOOST_MSVC
1560 #  pragma warning(pop)
1561 #endif
1562 
1563 #ifdef BOOST_MSVC
1564 #pragma warning(push)
1565 #pragma warning(disable: 4103)
1566 #endif
1567 #ifdef BOOST_HAS_ABI_HEADERS
1568 #  include BOOST_ABI_SUFFIX
1569 #endif
1570 #ifdef BOOST_MSVC
1571 #pragma warning(pop)
1572 #endif
1573 
1574 #endif
1575 
1576