1 /******************************************************************************\
2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved.    *
3 *                                                                              *
4 * Redistribution and use in source and binary forms, with or without           *
5 * modification, are permitted provided that the following conditions are met:  *
6 *                                                                              *
7 *   (1) Redistributions of source code must retain the above copyright notice, *
8 *       this list of conditions and the following disclaimer.                  *
9 *                                                                              *
10 *   (2) Redistributions in binary form must reproduce the above copyright      *
11 *       notice, this list of conditions and the following disclaimer in the    *
12 *       documentation and/or other materials provided with the distribution.   *
13 *                                                                              *
14 *   (3) The name of the author may not be used to endorse or promote products  *
15 *       derived from this software without specific prior written permission.  *
16 *                                                                              *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF         *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO   *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;  *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,     *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR      *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF       *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                   *
27 \******************************************************************************/
28 
29 /**
30 @file      pattern.cpp
31 @brief     RE/flex regular expression pattern compiler
32 @author    Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 */
36 
37 #include <reflex/pattern.h>
38 #include <reflex/timer.h>
39 #include <cstdlib>
40 #include <cerrno>
41 #include <cmath>
42 
43 /// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression.
44 /** Edge compression reorders edges to produce fewer tests when executed in the compacted order.
45     For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges:
46     c1 = m.FSM_CHAR();
47     if ('x' <= c1 && c1 <= 'z') goto S3;
48     if ('a' <= c1 && c1 <= 'k') goto S3;
49     return m.FSM_HALT(c1);
50 */
51 #define WITH_COMPACT_DFA -1
52 
53 #ifdef DEBUG
54 # define DBGLOGPOS(p) \
55   if ((p).accept()) \
56   { \
57     DBGLOGA(" (%u)", (p).accepts()); \
58     if ((p).lazy()) \
59       DBGLOGA("?%u", (p).lazy()); \
60     if ((p).greedy()) \
61       DBGLOGA("!"); \
62   } \
63   else \
64   { \
65     DBGLOGA(" "); \
66     if ((p).iter()) \
67       DBGLOGA("%u.", (p).iter()); \
68     DBGLOGA("%u", (p).loc()); \
69     if ((p).lazy()) \
70       DBGLOGA("?%u", (p).lazy()); \
71     if ((p).anchor()) \
72       DBGLOGA("^"); \
73     if ((p).greedy()) \
74       DBGLOGA("!"); \
75     if ((p).ticked()) \
76       DBGLOGA("'"); \
77     if ((p).negate()) \
78       DBGLOGA("-"); \
79   }
80 #endif
81 
82 namespace reflex {
83 
84 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__) && !defined(__MINGW32__) && !defined(__MINGW64__)
fopen_s(FILE ** file,const char * name,const char * mode)85 inline int fopen_s(FILE **file, const char *name, const char *mode) { return ::fopen_s(file, name, mode); }
86 #else
87 inline int fopen_s(FILE **file, const char *name, const char *mode) { return (*file = ::fopen(name, mode)) ? 0 : errno; }
88 #endif
89 
print_char(FILE * file,int c,bool h=false)90 static void print_char(FILE *file, int c, bool h = false)
91 {
92   if (c >= '\a' && c <= '\r')
93     ::fprintf(file, "'\\%c'", "abtnvfr"[c - '\a']);
94   else if (c == '\\')
95     ::fprintf(file, "'\\\\'");
96   else if (c == '\'')
97     ::fprintf(file, "'\\''");
98   else if (std::isprint(c))
99     ::fprintf(file, "'%c'", c);
100   else if (h)
101     ::fprintf(file, "%02x", c);
102   else
103     ::fprintf(file, "%u", c);
104 }
105 
106 static const char *posix_class[] = {
107   "ASCII",
108   "Space",
109   "XDigit",
110   "Cntrl",
111   "Print",
112   "Alnum",
113   "Alpha",
114   "Blank",
115   "Digit",
116   "Graph",
117   "Lower",
118   "Punct",
119   "Upper",
120   "Word",
121 };
122 
123 static const char *meta_label[] = {
124   NULL,
125   "NWB",
126   "NWE",
127   "BWB",
128   "EWB",
129   "BWE",
130   "EWE",
131   "BOL",
132   "EOL",
133   "BOB",
134   "EOB",
135   "UND",
136   "IND",
137   "DED",
138 };
139 
operator [](Accept choice) const140 const std::string Pattern::operator[](Accept choice) const
141 {
142   if (choice == 0)
143     return rex_;
144   if (choice <= size())
145   {
146     Location loc = end_.at(choice - 1);
147     Location prev = 0;
148     if (choice >= 2)
149       prev = end_.at(choice - 2) + 1;
150     return rex_.substr(prev, loc - prev);
151   }
152   return "";
153 }
154 
error(regex_error_type code,size_t pos) const155 void Pattern::error(regex_error_type code, size_t pos) const
156 {
157   regex_error err(code, rex_, pos);
158   if (opt_.w)
159     std::cerr << err.what();
160   if (code == regex_error::exceeds_limits || opt_.r)
161     throw err;
162 }
163 
init(const char * options,const uint8_t * pred)164 void Pattern::init(const char *options, const uint8_t *pred)
165 {
166   init_options(options);
167   nop_ = 0;
168   len_ = 0;
169   min_ = 0;
170   one_ = false;
171   if (opc_ != NULL || fsm_ != NULL )
172   {
173     if (pred != NULL)
174     {
175       len_ = pred[0];
176       min_ = pred[1] & 0x0f;
177       one_ = pred[1] & 0x10;
178       memcpy(pre_, pred + 2, len_);
179       if (min_ > 0)
180       {
181         size_t n = len_ + 2;
182         if (min_ > 1 && len_ == 0)
183         {
184           for (size_t i = 0; i < 256; ++i)
185             bit_[i] = ~pred[i + n];
186           n += 256;
187         }
188         if (min_ >= 4)
189         {
190           for (size_t i = 0; i < Const::HASH; ++i)
191             pmh_[i] = ~pred[i + n];
192         }
193         else
194         {
195           for (size_t i = 0; i < Const::HASH; ++i)
196             pma_[i] = ~pred[i + n];
197         }
198       }
199     }
200   }
201   else
202   {
203     Positions startpos;
204     Follow    followpos;
205     Map       modifiers;
206     Map       lookahead;
207     // parse the regex pattern to construct the followpos NFA without epsilon transitions
208     parse(startpos, followpos, modifiers, lookahead);
209     // start state = startpos = firstpost of the followpos NFA, also merge the tree DFA root when non-NULL
210     DFA::State *start = dfa_.state(tfa_.tree, startpos);
211     // compile the NFA into a DFA
212     compile(start, followpos, modifiers, lookahead);
213     // assemble DFA opcode tables or direct code
214     assemble(start);
215     // delete the DFA
216     dfa_.clear();
217   }
218 }
219 
init_options(const char * options)220 void Pattern::init_options(const char *options)
221 {
222   opt_.b = false;
223   opt_.i = false;
224   opt_.m = false;
225   opt_.o = false;
226   opt_.p = false;
227   opt_.q = false;
228   opt_.r = false;
229   opt_.s = false;
230   opt_.w = false;
231   opt_.x = false;
232   opt_.e = '\\';
233   if (options != NULL)
234   {
235     for (const char *s = options; *s != '\0'; ++s)
236     {
237       switch (*s)
238       {
239         case 'b':
240           opt_.b = true;
241           break;
242         case 'e':
243           opt_.e = (*(s += (s[1] == '=') + 1) == ';' || *s == '\0' ? 256 : *s++);
244           --s;
245           break;
246         case 'p':
247           opt_.p = true;
248           break;
249         case 'i':
250           opt_.i = true;
251           break;
252         case 'm':
253           opt_.m = true;
254           break;
255         case 'o':
256           opt_.o = true;
257           break;
258         case 'q':
259           opt_.q = true;
260           break;
261         case 'r':
262           opt_.r = true;
263           break;
264         case 's':
265           opt_.s = true;
266           break;
267         case 'w':
268           opt_.w = true;
269           break;
270         case 'x':
271           opt_.x = true;
272           break;
273         case 'z':
274             for (const char *t = s += (s[1] == '='); *s != ';' && *s != '\0'; ++t)
275             {
276               if (std::isspace(*t) || *t == ';' || *t == '\0')
277               {
278                 if (t > s + 1)
279                   opt_.z = std::string(s + 1, t - s - 1);
280                 s = t;
281               }
282             }
283             --s;
284             break;
285         case 'f':
286         case 'n':
287           for (const char *t = s += (s[1] == '='); *s != ';' && *s != '\0'; ++t)
288           {
289             if (*t == ',' || std::isspace(*t) || *t == ';' || *t == '\0')
290             {
291               if (t > s + 1)
292               {
293                 std::string name(s + 1, t - s - 1);
294                 if (name.find('.') == std::string::npos)
295                   opt_.n = name;
296                 else
297                   opt_.f.push_back(name);
298               }
299               s = t;
300             }
301           }
302           --s;
303           break;
304       }
305     }
306   }
307 }
308 
parse(Positions & startpos,Follow & followpos,Map & modifiers,Map & lookahead)309 void Pattern::parse(
310     Positions& startpos,
311     Follow&    followpos,
312     Map&       modifiers,
313     Map&       lookahead)
314 {
315   DBGLOG("BEGIN parse()");
316   if (rex_.size() > Position::MAXLOC)
317     throw regex_error(regex_error::exceeds_length, rex_, Position::MAXLOC);
318   Location   len = static_cast<Location>(rex_.size());
319   Location   loc = 0;
320   Accept     choice = 1;
321   Lazy       lazyidx = 0;
322   Positions  firstpos;
323   Positions  lastpos;
324   bool       nullable;
325   Iter       iter;
326   timer_type t;
327   timer_start(t);
328   if (at(0) == '(' && at(1) == '?')
329   {
330     loc = 2;
331     while (at(loc) == '-' || std::isalnum(at(loc)))
332       ++loc;
333     if (at(loc) == ')')
334     {
335       bool active = true;
336       loc = 2;
337       Char c;
338       while ((c = at(loc)) != ')')
339       {
340         c = at(loc);
341         if (c == '-')
342           active = false;
343         else if (c == 'i')
344           opt_.i = active;
345         else if (c == 'm')
346           opt_.m = active;
347         else if (c == 'q')
348           opt_.q = active;
349         else if (c == 's')
350           opt_.s = active;
351         else if (c == 'x')
352           opt_.x = active;
353         else
354           error(regex_error::invalid_modifier, loc);
355         ++loc;
356       }
357       ++loc;
358     }
359     else
360     {
361       loc = 0;
362     }
363   }
364   do
365   {
366     Location end = loc;
367     if (!opt_.q && !opt_.x)
368     {
369       while (true)
370       {
371         Char c = at(end);
372         if (c == '\0' || c == '|')
373           break;
374         if (c == '.' || c == '^' || c == '$' || c == '(' || c == ')' || c == '[' || c == '{' || c == '?' || c == '*' || c == '+')
375         {
376           end = loc;
377           break;
378         }
379         if (c == opt_.e)
380         {
381           c = at(++end);
382           if (c == '\0' || std::strchr("0123456789<>ABDHLNPSUWXbcdehijklpsuwxz", c) != NULL)
383           {
384             end = loc;
385             break;
386           }
387           if (c == 'Q')
388           {
389             while ((c = at(++end)) != '\0')
390               if (c == opt_.e && at(end + 1) == 'E')
391                 break;
392           }
393         }
394         ++end;
395       }
396     }
397     if (loc < end)
398     {
399       // string pattern found w/o regex metas: merge string into the tree DFA
400       bool quote = false;
401       Tree::Node *t = tfa_.root();
402       while (loc < end)
403       {
404         Char c = at(loc++);
405         if (c == opt_.e)
406         {
407           if (at(loc) == 'Q')
408           {
409             quote = true;
410             ++loc;
411             continue;
412           }
413           if (at(loc) == 'E')
414           {
415             quote = false;
416             ++loc;
417             continue;
418           }
419           if (!quote)
420           {
421             static const char abtnvfr[] = "abtnvfr";
422             c = at(loc++);
423             const char *s = std::strchr(abtnvfr, c);
424             if (s != NULL)
425               c = static_cast<Char>(s - abtnvfr + '\a');
426           }
427         }
428         else if (c >= 'A' && c <= 'Z' && opt_.i)
429         {
430           c = lowercase(c);
431         }
432         t = tfa_.edge(t, c);
433       }
434       if (t->accept == 0)
435         t->accept = choice;
436     }
437     else
438     {
439       Lazyset lazyset;
440       parse2(
441           true,
442           loc,
443           firstpos,
444           lastpos,
445           nullable,
446           followpos,
447           lazyidx,
448           lazyset,
449           modifiers,
450           lookahead[choice],
451           iter);
452       end_.push_back(loc);
453       set_insert(startpos, firstpos);
454       if (nullable)
455       {
456         if (lazyset.empty())
457         {
458           startpos.insert(Position(choice).accept(true));
459         }
460         else
461         {
462           for (Lazyset::const_iterator l = lazyset.begin(); l != lazyset.end(); ++l)
463             startpos.insert(Position(choice).accept(true).lazy(*l));
464         }
465       }
466       for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
467       {
468         if (lazyset.empty())
469         {
470           followpos[p->pos()].insert(Position(choice).accept(true));
471         }
472         else
473         {
474           for (Lazyset::const_iterator l = lazyset.begin(); l != lazyset.end(); ++l)
475             followpos[p->pos()].insert(Position(choice).accept(true).lazy(*l));
476         }
477       }
478     }
479     if (++choice == 0)
480       error(regex_error::exceeds_limits, loc); // overflow: too many top-level alternations (should never happen)
481   } while (at(loc++) == '|');
482   --loc;
483   if (at(loc) == ')')
484     error(regex_error::mismatched_parens, loc);
485   else if (at(loc) != 0)
486     error(regex_error::invalid_syntax, loc);
487   if (opt_.i)
488     update_modified('i', modifiers, 0, len - 1);
489   if (opt_.m)
490     update_modified('m', modifiers, 0, len - 1);
491   if (opt_.s)
492     update_modified('s', modifiers, 0, len - 1);
493   pms_ = timer_elapsed(t);
494 #ifdef DEBUG
495   DBGLOGN("startpos = {");
496   for (Positions::const_iterator p = startpos.begin(); p != startpos.end(); ++p)
497     DBGLOGPOS(*p);
498   DBGLOGA(" }");
499   for (Follow::const_iterator fp = followpos.begin(); fp != followpos.end(); ++fp)
500   {
501     DBGLOGN("followpos(");
502     DBGLOGPOS(fp->first);
503     DBGLOGA(" ) = {");
504     for (Positions::const_iterator p = fp->second.begin(); p != fp->second.end(); ++p)
505       DBGLOGPOS(*p);
506     DBGLOGA(" }");
507   }
508 #endif
509   DBGLOG("END parse()");
510 }
511 
parse1(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)512 void Pattern::parse1(
513     bool       begin,
514     Location&  loc,
515     Positions& firstpos,
516     Positions& lastpos,
517     bool&      nullable,
518     Follow&    followpos,
519     Lazy&      lazyidx,
520     Lazyset&   lazyset,
521     Map&       modifiers,
522     Locations& lookahead,
523     Iter&      iter)
524 {
525   DBGLOG("BEGIN parse1(%u)", loc);
526   parse2(
527       begin,
528       loc,
529       firstpos,
530       lastpos,
531       nullable,
532       followpos,
533       lazyidx,
534       lazyset,
535       modifiers,
536       lookahead,
537       iter);
538   Positions firstpos1;
539   Positions lastpos1;
540   bool      nullable1;
541   Lazyset   lazyset1;
542   Iter      iter1;
543   while (at(loc) == '|')
544   {
545     ++loc;
546     parse2(
547         begin,
548         loc,
549         firstpos1,
550         lastpos1,
551         nullable1,
552         followpos,
553         lazyidx,
554         lazyset1,
555         modifiers,
556         lookahead,
557         iter1);
558     set_insert(firstpos, firstpos1);
559     set_insert(lastpos, lastpos1);
560     set_insert(lazyset, lazyset1);
561     if (nullable1)
562       nullable = true;
563     if (iter1 > iter)
564       iter = iter1;
565   }
566   DBGLOG("END parse1()");
567 }
568 
parse2(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)569 void Pattern::parse2(
570     bool       begin,
571     Location&  loc,
572     Positions& firstpos,
573     Positions& lastpos,
574     bool&      nullable,
575     Follow&    followpos,
576     Lazy&      lazyidx,
577     Lazyset&   lazyset,
578     Map&       modifiers,
579     Locations& lookahead,
580     Iter&      iter)
581 {
582   DBGLOG("BEGIN parse2(%u)", loc);
583   Positions a_pos;
584   Char      c;
585   if (begin)
586   {
587     while (true)
588     {
589       if (opt_.x)
590         while (std::isspace(at(loc)))
591           ++loc;
592       if (at(loc) == '^')
593       {
594         a_pos.insert(Position(loc++));
595         begin = false; // CHECKED algorithmic options: 7/29 but does not allow ^ as a pattern
596       }
597       else if (escapes_at(loc, "ABb<>"))
598       {
599         a_pos.insert(Position(loc));
600         loc += 2;
601         begin = false; // CHECKED algorithmic options: 7/29 but does not allow \b as a pattern
602       }
603       else
604       {
605         if (escapes_at(loc, "ij"))
606           begin = false;
607         break;
608       }
609     }
610   }
611   if (begin || ((c = at(loc)) != '\0' && c != '|' && c != ')'))
612   {
613     parse3(
614         begin,
615         loc,
616         firstpos,
617         lastpos,
618         nullable,
619         followpos,
620         lazyidx,
621         lazyset,
622         modifiers,
623         lookahead,
624         iter);
625     Positions firstpos1;
626     Positions lastpos1;
627     bool      nullable1;
628     Lazyset   lazyset1;
629     Iter      iter1;
630     while ((c = at(loc)) != '\0' && c != '|' && c != ')')
631     {
632       parse3(
633           false,
634           loc,
635           firstpos1,
636           lastpos1,
637           nullable1,
638           followpos,
639           lazyidx,
640           lazyset1,
641           modifiers,
642           lookahead,
643           iter1);
644       if (!lazyset.empty()) // CHECKED this is an extra rule for + only and (may) not be needed for *
645       {
646         // CHECKED algorithmic options: lazy(lazyset, firstpos1); does not work for (a|b)*?a*b+, below works
647         Positions firstpos2;
648         lazy(lazyset, firstpos1, firstpos2);
649         set_insert(firstpos1, firstpos2);
650         // if (lazyset1.empty())
651         // greedy(firstpos1); // CHECKED algorithmic options: 8/1 works except fails for ((a|b)*?b){2} and (a|b)??(a|b)??aa
652       }
653       if (nullable)
654         set_insert(firstpos, firstpos1);
655       for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
656         set_insert(followpos[p->pos()], firstpos1);
657       if (nullable1)
658       {
659         set_insert(lastpos, lastpos1);
660         set_insert(lazyset, lazyset1); // CHECKED 10/21
661       }
662       else
663       {
664         lastpos.swap(lastpos1);
665         lazyset.swap(lazyset1); // CHECKED 10/21
666         nullable = false;
667       }
668       // CHECKED 10/21 set_insert(lazyset, lazyset1);
669       if (iter1 > iter)
670         iter = iter1;
671     }
672   }
673   for (Positions::const_iterator p = a_pos.begin(); p != a_pos.end(); ++p)
674   {
675     for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
676       if (at(k->loc()) == ')' && lookahead.find(k->loc()) != lookahead.end())
677         followpos[p->pos()].insert(*k);
678     for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
679       followpos[k->pos()].insert(p->anchor(!nullable || k->pos() != p->pos()));
680     lastpos.clear();
681     lastpos.insert(*p);
682     if (nullable || firstpos.empty())
683     {
684       firstpos.insert(*p);
685       nullable = false;
686     }
687   }
688   DBGLOG("END parse2()");
689 }
690 
parse3(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)691 void Pattern::parse3(
692     bool       begin,
693     Location&  loc,
694     Positions& firstpos,
695     Positions& lastpos,
696     bool&      nullable,
697     Follow&    followpos,
698     Lazy&      lazyidx,
699     Lazyset&   lazyset,
700     Map&       modifiers,
701     Locations& lookahead,
702     Iter&      iter)
703 {
704   DBGLOG("BEGIN parse3(%u)", loc);
705   Position b_pos(loc);
706   parse4(
707       begin,
708       loc,
709       firstpos,
710       lastpos,
711       nullable,
712       followpos,
713       lazyidx,
714       lazyset,
715       modifiers,
716       lookahead,
717       iter);
718   Char c = at(loc);
719   if (opt_.x)
720     while (std::isspace(c))
721       c = at(++loc);
722   while (true)
723   {
724     if (c == '*' || c == '+' || c == '?')
725     {
726       if (c == '*' || c == '?')
727         nullable = true;
728       if (at(++loc) == '?')
729       {
730         if (++lazyidx == 0)
731           error(regex_error::exceeds_limits, loc); // overflow: exceeds max 255 lazy quantifiers
732         lazyset.insert(lazyidx);
733         if (nullable)
734           lazy(lazyset, firstpos);
735         ++loc;
736       }
737       else
738       {
739         // CHECKED algorithmic options: 7/30 if (!nullable)
740         // CHECKED algorithmic options: 7/30   lazyset.clear();
741         greedy(firstpos);
742       }
743       if (c == '+' && !nullable && !lazyset.empty())
744       {
745         Positions firstpos1;
746         lazy(lazyset, firstpos, firstpos1);
747         for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
748           set_insert(followpos[p->pos()], firstpos1);
749         set_insert(firstpos, firstpos1);
750       }
751       else if (c == '*' || c == '+')
752       {
753         for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
754           set_insert(followpos[p->pos()], firstpos);
755       }
756     }
757     else if (c == '{') // {n,m} repeat min n times to max m
758     {
759       size_t k = 0;
760       for (Location i = 0; i < 7 && std::isdigit(c = at(++loc)); ++i)
761         k = 10 * k + (c - '0');
762       if (k > Position::MAXITER)
763         error(regex_error::exceeds_limits, loc);
764       Iter n = static_cast<Iter>(k);
765       Iter m = n;
766       bool unlimited = false;
767       if (at(loc) == ',')
768       {
769         if (std::isdigit(at(loc + 1)))
770         {
771           m = 0;
772           for (Location i = 0; i < 7 && std::isdigit(c = at(++loc)); ++i)
773             m = 10 * m + (c - '0');
774         }
775         else
776         {
777           unlimited = true;
778           ++loc;
779         }
780       }
781       if (at(loc) == '}')
782       {
783         bool nullable1 = nullable;
784         if (n == 0)
785           nullable = true;
786         if (n > m)
787           error(regex_error::invalid_repeat, loc);
788         if (at(++loc) == '?')
789         {
790           if (++lazyidx == 0)
791             error(regex_error::exceeds_limits, loc); // overflow: exceeds max 255 lazy quantifiers
792           lazyset.insert(lazyidx);
793           if (nullable)
794             lazy(lazyset, firstpos);
795           /* CHECKED algorithmic options: 8/1 else
796              {
797              lazy(lazyset, firstpos, firstpos1);
798              set_insert(firstpos, firstpos1);
799              pfirstpos = &firstpos1;
800              } */
801           ++loc;
802         }
803         else
804         {
805           // CHECKED algorithmic options 7/30 if (!nullable)
806           // CHECKED algorithmic options 7/30   lazyset.clear();
807           if (n < m && lazyset.empty())
808             greedy(firstpos);
809         }
810         // CHECKED added pfirstpos to point to updated firstpos with lazy quants
811         Positions firstpos1, *pfirstpos = &firstpos;
812         if (!nullable && !lazyset.empty()) // CHECKED algorithmic options 8/1 added to make ((a|b)*?b){2} work
813         {
814           lazy(lazyset, firstpos, firstpos1);
815           pfirstpos = &firstpos1;
816         }
817         if (nullable && unlimited) // {0,} == *
818         {
819           for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
820             set_insert(followpos[p->pos()], *pfirstpos);
821         }
822         else if (m > 0)
823         {
824           if (iter * m > Position::MAXITER)
825             error(regex_error::exceeds_limits, loc);
826           // update followpos by virtually repeating sub-regex m-1 times
827           Follow followpos1;
828           for (Follow::const_iterator fp = followpos.begin(); fp != followpos.end(); ++fp)
829             if (fp->first.loc() >= b_pos)
830               for (Iter i = 0; i < m - 1; ++i)
831                 for (Positions::const_iterator p = fp->second.begin(); p != fp->second.end(); ++p)
832                   followpos1[fp->first.iter(iter * (i + 1))].insert(p->iter(iter * (i + 1)));
833           for (Follow::const_iterator fp = followpos1.begin(); fp != followpos1.end(); ++fp)
834             set_insert(followpos[fp->first], fp->second);
835           // add m-1 times virtual concatenation (by indexed positions k.i)
836           for (Iter i = 0; i < m - 1; ++i)
837             for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
838               for (Positions::const_iterator j = pfirstpos->begin(); j != pfirstpos->end(); ++j)
839                 followpos[k->pos().iter(iter * i)].insert(j->iter(iter * i + iter));
840           if (unlimited)
841             for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
842               for (Positions::const_iterator j = pfirstpos->begin(); j != pfirstpos->end(); ++j)
843                 followpos[k->pos().iter(iter * (m - 1))].insert(j->iter(iter * (m - 1)));
844           if (nullable1)
845           {
846             // extend firstpos when sub-regex is nullable
847             Positions firstpos1 = *pfirstpos;
848             for (Iter i = 1; i <= m - 1; ++i)
849               for (Positions::const_iterator k = firstpos1.begin(); k != firstpos1.end(); ++k)
850                 firstpos.insert(k->iter(iter * i));
851           }
852           // n to m-1 are optional with all 0 to m-1 are optional when nullable
853           Positions lastpos1;
854           for (Iter i = (nullable ? 0 : n - 1); i <= m - 1; ++i)
855             for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
856               lastpos1.insert(k->iter(iter * i));
857           lastpos.swap(lastpos1);
858           iter *= m;
859         }
860         else // zero range {0}
861         {
862           firstpos.clear();
863           lastpos.clear();
864           lazyset.clear();
865         }
866       }
867       else
868       {
869         error(regex_error::invalid_repeat, loc);
870       }
871     }
872     else
873     {
874       break;
875     }
876     c = at(loc);
877   }
878   DBGLOG("END parse3()");
879 }
880 
parse4(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)881 void Pattern::parse4(
882     bool       begin,
883     Location&  loc,
884     Positions& firstpos,
885     Positions& lastpos,
886     bool&      nullable,
887     Follow&    followpos,
888     Lazy&      lazyidx,
889     Lazyset&   lazyset,
890     Map&       modifiers,
891     Locations& lookahead,
892     Iter&      iter)
893 {
894   DBGLOG("BEGIN parse4(%u)", loc);
895   firstpos.clear();
896   lastpos.clear();
897   nullable = true;
898   lazyset.clear();
899   iter = 1;
900   Char c = at(loc);
901   if (c == '(')
902   {
903     if (at(++loc) == '?')
904     {
905       c = at(++loc);
906       if (c == '#') // (?# comment
907       {
908         while ((c = at(++loc)) != '\0' && c != ')')
909           continue;
910         if (c == ')')
911           ++loc;
912       }
913       else if (c == '^') // (?^ negative pattern to be ignored (new mode), producing a redo match
914       {
915         Positions firstpos1;
916         ++loc;
917         parse1(
918             begin,
919             loc,
920             firstpos1,
921             lastpos,
922             nullable,
923             followpos,
924             lazyidx,
925             lazyset,
926             modifiers,
927             lookahead,
928             iter);
929         for (Positions::iterator p = firstpos1.begin(); p != firstpos1.end(); ++p)
930           firstpos.insert(p->negate(true));
931       }
932       else if (c == '=') // (?= lookahead
933       {
934         Position l_pos(loc++ - 2); // lookahead at (
935         parse1(
936             begin,
937             loc,
938             firstpos,
939             lastpos,
940             nullable,
941             followpos,
942             lazyidx,
943             lazyset,
944             modifiers,
945             lookahead,
946             iter);
947         firstpos.insert(l_pos);
948         if (nullable)
949           lastpos.insert(l_pos);
950         if (lookahead.find(l_pos.loc(), loc) == lookahead.end()) // do not permit nested lookaheads
951           lookahead.insert(l_pos.loc(), loc); // lookstop at )
952         for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
953           followpos[p->pos()].insert(Position(loc).ticked(true));
954         lastpos.insert(Position(loc).ticked(true));
955         if (nullable)
956         {
957           firstpos.insert(Position(loc).ticked(true));
958           lastpos.insert(l_pos);
959         }
960       }
961       else if (c == ':')
962       {
963         ++loc;
964         parse1(
965             begin,
966             loc,
967             firstpos,
968             lastpos,
969             nullable,
970             followpos,
971             lazyidx,
972             lazyset,
973             modifiers,
974             lookahead,
975             iter);
976       }
977       else
978       {
979         Location m_loc = loc;
980         bool opt_q = opt_.q;
981         bool opt_x = opt_.x;
982         bool active = true;
983         do
984         {
985           if (c == '-')
986             active = false;
987           else if (c == 'q')
988             opt_.q = active;
989           else if (c == 'x')
990             opt_.x = active;
991           else if (c != 'i' && c != 'm' && c != 's')
992             error(regex_error::invalid_modifier, loc);
993           c = at(++loc);
994         } while (c != '\0' && c != ':' && c != ')');
995         if (c != '\0')
996           ++loc;
997         // enforce (?imqsx) modes
998         parse1(
999             begin,
1000             loc,
1001             firstpos,
1002             lastpos,
1003             nullable,
1004             followpos,
1005             lazyidx,
1006             lazyset,
1007             modifiers,
1008             lookahead,
1009             iter);
1010         active = true;
1011         do
1012         {
1013           c = at(m_loc++);
1014           if (c == '-')
1015           {
1016             active = false;
1017           }
1018           else if (c != '\0' && c != 'q' && c != 'x' && c != ':' && c != ')')
1019           {
1020             if (active)
1021               update_modified(c, modifiers, m_loc, loc);
1022             else
1023               update_modified(uppercase(c), modifiers, m_loc, loc);
1024           }
1025         } while (c != '\0' && c != ':' && c != ')');
1026         opt_.q = opt_q;
1027         opt_.x = opt_x;
1028       }
1029     }
1030     else
1031     {
1032       parse1(
1033           begin,
1034           loc,
1035           firstpos,
1036           lastpos,
1037           nullable,
1038           followpos,
1039           lazyidx,
1040           lazyset,
1041           modifiers,
1042           lookahead,
1043           iter);
1044     }
1045     if (c != ')')
1046     {
1047       if (at(loc) == ')')
1048         ++loc;
1049       else
1050         error(regex_error::mismatched_parens, loc);
1051     }
1052   }
1053   else if (c == '[')
1054   {
1055     firstpos.insert(loc);
1056     lastpos.insert(loc);
1057     nullable = false;
1058     if ((c = at(++loc)) == '^')
1059       c = at(++loc);
1060     while (c != '\0')
1061     {
1062       if (c == '[' && at(loc + 1) == ':')
1063       {
1064         size_t c_loc = find_at(loc + 2, ':');
1065         if (c_loc != std::string::npos && at(static_cast<Location>(c_loc + 1)) == ']')
1066           loc = static_cast<Location>(c_loc + 1);
1067       }
1068       else if (c == opt_.e && !opt_.b)
1069       {
1070         ++loc;
1071       }
1072       if ((c = at(++loc)) == ']')
1073         break;
1074     }
1075     if (c == '\0')
1076       error(regex_error::mismatched_brackets, loc);
1077     ++loc;
1078   }
1079   else if ((c == '"' && opt_.q) || escape_at(loc) == 'Q')
1080   {
1081     bool quoted = (c == '"');
1082     if (!quoted)
1083       ++loc;
1084     Location q_loc = ++loc;
1085     c = at(loc);
1086     if (c != '\0' && (quoted ? c != '"' : c != opt_.e || at(loc + 1) != 'E'))
1087     {
1088       firstpos.insert(loc);
1089       Position p;
1090       do
1091       {
1092         if (quoted && c == opt_.e && at(loc + 1) == '"')
1093           ++loc;
1094         if (p != Position::NPOS)
1095           followpos[p.pos()].insert(loc);
1096         p = loc++;
1097         c = at(loc);
1098       } while (c != '\0' && (!quoted || c != '"') && (quoted || c != opt_.e || at(loc + 1) != 'E'));
1099       lastpos.insert(p);
1100       nullable = false;
1101       modifiers['q'].insert(q_loc, loc - 1);
1102     }
1103     if (!quoted && at(loc) != '\0')
1104       ++loc;
1105     if (at(loc) != '\0')
1106       ++loc;
1107     else
1108       error(regex_error::mismatched_quotation, loc);
1109   }
1110   else if (c == '#' && opt_.x)
1111   {
1112     ++loc;
1113     while ((c = at(loc)) != '\0' && c != '\n')
1114       ++loc;
1115     if (c == '\n')
1116       ++loc;
1117   }
1118   else if (std::isspace(c) && opt_.x)
1119   {
1120     ++loc;
1121   }
1122   else if (c == ')')
1123   {
1124     if (begin)
1125       error(regex_error::empty_expression, loc++);
1126     else
1127       error(regex_error::mismatched_parens, loc++);
1128   }
1129   else if (c == '}')
1130   {
1131     error(regex_error::mismatched_braces, loc++);
1132   }
1133   else if (c != '\0' && c != '|' && c != '?' && c != '*' && c != '+')
1134   {
1135     firstpos.insert(loc);
1136     lastpos.insert(loc);
1137     nullable = false;
1138     if (c == opt_.e)
1139       (void)parse_esc(loc);
1140     else
1141       ++loc;
1142   }
1143   else if (begin && c != '\0') // permits empty regex pattern but not empty subpatterns
1144   {
1145     error(regex_error::empty_expression, loc);
1146   }
1147   DBGLOG("END parse4()");
1148 }
1149 
parse_esc(Location & loc,Chars * chars) const1150 Pattern::Char Pattern::parse_esc(Location& loc, Chars *chars) const
1151 {
1152   Char c = at(++loc);
1153   if (c == '0')
1154   {
1155     c = 0;
1156     int d = at(++loc);
1157     if (d >= '0' && d <= '7')
1158     {
1159       c = d - '0';
1160       d = at(++loc);
1161       if (d >= '0' && d <= '7')
1162       {
1163         c = (c << 3) + d - '0';
1164         d = at(++loc);
1165         if (c < 32 && d >= '0' && d <= '7')
1166         {
1167           c = (c << 3) + d - '0';
1168           ++loc;
1169         }
1170       }
1171     }
1172   }
1173   else if ((c == 'x' || c == 'u') && at(loc + 1) == '{')
1174   {
1175     c = 0;
1176     loc += 2;
1177     int d = at(loc);
1178     if (std::isxdigit(d))
1179     {
1180       c = (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1181       d = at(++loc);
1182       if (std::isxdigit(d))
1183       {
1184         c = (c << 4) + (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1185         ++loc;
1186       }
1187     }
1188     if (at(loc) == '}')
1189       ++loc;
1190     else
1191       error(regex_error::invalid_escape, loc);
1192   }
1193   else if (c == 'x' && std::isxdigit(at(loc + 1)))
1194   {
1195     int d = at(++loc);
1196     c = (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1197     d = at(++loc);
1198     if (std::isxdigit(d))
1199     {
1200       c = (c << 4) + (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1201       ++loc;
1202     }
1203   }
1204   else if (c == 'c')
1205   {
1206     c = at(++loc) % 32;
1207     ++loc;
1208   }
1209   else if (c == 'e')
1210   {
1211     c = 0x1B;
1212     ++loc;
1213   }
1214   else if (c == 'N')
1215   {
1216     if (chars != NULL)
1217     {
1218       chars->insert(0, 9);
1219       chars->insert(11, 255);
1220     }
1221     ++loc;
1222     c = META_EOL;
1223   }
1224   else if ((c == 'p' || c == 'P') && at(loc + 1) == '{')
1225   {
1226     loc += 2;
1227     if (chars != NULL)
1228     {
1229       size_t i;
1230       for (i = 0; i < 14; ++i)
1231         if (eq_at(loc, posix_class[i]))
1232           break;
1233       if (i < 14)
1234         posix(i, *chars);
1235       else
1236         error(regex_error::invalid_class, loc);
1237       if (c == 'P')
1238         flip(*chars);
1239       loc += static_cast<Location>(strlen(posix_class[i]));
1240       if (at(loc) == '}')
1241         ++loc;
1242       else
1243         error(regex_error::invalid_escape, loc);
1244     }
1245     else
1246     {
1247       while ((c = at(++loc)) != '\0' && c != '}')
1248         continue;
1249       if (c == '}')
1250         ++loc;
1251       else
1252         error(regex_error::invalid_escape, loc);
1253     }
1254     c = META_EOL;
1255   }
1256   else if (c != '_')
1257   {
1258     static const char abtnvfr[] = "abtnvfr";
1259     const char *s = std::strchr(abtnvfr, c);
1260     if (s != NULL)
1261     {
1262       c = static_cast<Char>(s - abtnvfr + '\a');
1263     }
1264     else
1265     {
1266       static const char escapes[] = "__sSxX________hHdD__lL__uUwW";
1267       s = std::strchr(escapes, c);
1268       if (s != NULL)
1269       {
1270         if (chars != NULL)
1271         {
1272           posix((s - escapes) / 2, *chars);
1273           if ((s - escapes) % 2)
1274             flip(*chars);
1275         }
1276         c = META_EOL;
1277       }
1278     }
1279     ++loc;
1280   }
1281   if (c <= 0xFF && chars != NULL)
1282     chars->insert(c);
1283   return c;
1284 }
1285 
compile(DFA::State * start,Follow & followpos,const Map & modifiers,const Map & lookahead)1286 void Pattern::compile(
1287     DFA::State *start,
1288     Follow&     followpos,
1289     const Map&  modifiers,
1290     const Map&  lookahead)
1291 {
1292   DBGLOG("BEGIN compile()");
1293   // init stats and timers
1294   vno_ = 0;
1295   eno_ = 0;
1296   ems_ = 0.0;
1297   timer_type vt, et;
1298   timer_start(vt);
1299   // construct the DFA
1300   acc_.resize(end_.size(), false);
1301   trim_lazy(start);
1302   // hash table with 64K entries (uint16_t indexed)
1303   DFA::State **table = new DFA::State*[65536];
1304   for (int i = 0; i < 65536; ++i)
1305     table[i] = NULL;
1306   // start state should only be discoverable (to possibly cycle back to) if no tree DFA was constructed
1307   if (start->tnode == NULL)
1308     table[hash_pos(start)] = start;
1309   // last added state
1310   DFA::State *last_state = start;
1311   for (DFA::State *state = start; state; state = state->next)
1312   {
1313     Moves moves;
1314     timer_start(et);
1315     // use the tree DFA accept state, if present
1316     if (state->tnode != NULL && state->tnode->accept > 0)
1317       state->accept = state->tnode->accept;
1318     compile_transition(
1319         state,
1320         followpos,
1321         modifiers,
1322         lookahead,
1323         moves);
1324     if (state->tnode != NULL)
1325     {
1326       // merge tree DFA transitions into the final DFA transitions to target states
1327       if (moves.empty())
1328       {
1329         // no DFA transitions: the final DFA transitions are the tree DFA transitions to target states
1330         for (Char c = 0; c < 256; ++c)
1331         {
1332           if (state->tnode->edge[c] != NULL)
1333           {
1334             DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c]);
1335             if (opt_.i && std::isalpha(c))
1336             {
1337               state->edges[lowercase(c)] = std::pair<Char,DFA::State*>(lowercase(c), target_state);
1338               state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
1339               eno_ += 2;
1340             }
1341             else
1342             {
1343               state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1344               ++eno_;
1345             }
1346           }
1347         }
1348       }
1349       else
1350       {
1351         // combine the tree DFA transitions with the regex DFA transition moves
1352         Chars chars;
1353         for (Char c = 0; c < 256; ++c)
1354           if (state->tnode->edge[c] != NULL)
1355             chars.insert(c);
1356         if (opt_.i)
1357           for (Char c = 'a'; c <= 'z'; ++c)
1358             if (state->tnode->edge[c] != NULL)
1359               chars.insert(uppercase(c));
1360         Moves::iterator i = moves.begin();
1361         Moves::iterator end = moves.end();
1362         while (i != end)
1363         {
1364           if (chars.intersects(i->first))
1365           {
1366             // tree DFA transitions intersect with this DFA transition move
1367             Chars common = chars & i->first;
1368             chars -= common;
1369             Char lo = common.lo();
1370             Char hi = common.hi();
1371             for (Char c = lo; c <= hi; ++c)
1372             {
1373               if (common.contains(c))
1374               {
1375                 Positions pos(i->second);
1376                 if (opt_.i && std::isalpha(c))
1377                 {
1378                   if (c >= 'a' && c <= 'z')
1379                   {
1380                     DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c], pos);
1381                     state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1382                     state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
1383                     eno_ += 2;
1384                   }
1385                 }
1386                 else
1387                 {
1388                   DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c], pos);
1389                   state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1390                   ++eno_;
1391                 }
1392               }
1393             }
1394             i->first -= common;
1395             if (i->first.any())
1396               ++i;
1397             else
1398               moves.erase(i++);
1399           }
1400           else
1401           {
1402             ++i;
1403           }
1404         }
1405         if (opt_.i)
1406         {
1407           // normalize by removing upper case if option i (case insensitivem matching) is enabled
1408           static const uint64_t upper[5] = { 0x0000000000000000, 0x0000000007FFFFFE, 0, 0, 0 };
1409           chars -= Chars(upper);
1410         }
1411         if (chars.any())
1412         {
1413           Char lo = chars.lo();
1414           Char hi = chars.hi();
1415           for (Char c = lo; c <= hi; ++c)
1416           {
1417             if (chars.contains(c))
1418             {
1419               DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c]);
1420               if (opt_.i && std::isalpha(c))
1421               {
1422                 state->edges[lowercase(c)] = std::pair<Char,DFA::State*>(lowercase(c), target_state);
1423                 state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
1424                 eno_ += 2;
1425               }
1426               else
1427               {
1428                 state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1429                 ++eno_;
1430               }
1431             }
1432           }
1433         }
1434       }
1435     }
1436     ems_ += timer_elapsed(et);
1437     Moves::iterator end = moves.end();
1438     for (Moves::iterator i = moves.begin(); i != end; ++i)
1439     {
1440       Positions& pos = i->second;
1441       if (!pos.empty())
1442       {
1443         uint16_t h = hash_pos(&pos);
1444         DFA::State **branch_ptr = &table[h];
1445         DFA::State *target_state = *branch_ptr;
1446         // binary search the target state for a possible matching state in the hash table overflow tree
1447         while (target_state != NULL)
1448         {
1449           if (pos < *target_state)
1450             target_state = *(branch_ptr = &target_state->left);
1451           else if (pos > *target_state)
1452             target_state = *(branch_ptr = &target_state->right);
1453           else
1454             break;
1455         }
1456         if (target_state == NULL)
1457         {
1458           target_state = last_state = last_state->next = dfa_.state(NULL, pos);
1459           if (branch_ptr != NULL)
1460             *branch_ptr = target_state;
1461           else
1462             table[h] = target_state;
1463         }
1464         Char lo = i->first.lo();
1465         Char max = i->first.hi();
1466 #ifdef DEBUG
1467         DBGLOGN("from state %p on %02x-%02x move to {", state, lo, max);
1468         for (Positions::const_iterator p = pos.begin(); p != pos.end(); ++p)
1469           DBGLOGPOS(*p);
1470         DBGLOGN(" } = state %p", target_state);
1471 #endif
1472         while (lo <= max)
1473         {
1474           if (i->first.contains(lo))
1475           {
1476             Char hi = lo + 1;
1477             while (hi <= max && i->first.contains(hi))
1478               ++hi;
1479             --hi;
1480 #if WITH_COMPACT_DFA == -1
1481             state->edges[lo] = std::pair<Char,DFA::State*>(hi, target_state);
1482 #else
1483             state->edges[hi] = std::pair<Char,DFA::State*>(lo, target_state);
1484 #endif
1485             eno_ += hi - lo + 1;
1486             lo = hi + 1;
1487           }
1488           ++lo;
1489         }
1490       }
1491     }
1492     if (state->accept > 0 && state->accept <= end_.size())
1493       acc_[state->accept - 1] = true;
1494     ++vno_;
1495   }
1496   delete[] table;
1497   tfa_.clear();
1498   vms_ = timer_elapsed(vt) - ems_;
1499   DBGLOG("END compile()");
1500 }
1501 
lazy(const Lazyset & lazyset,Positions & pos) const1502 void Pattern::lazy(
1503     const Lazyset& lazyset,
1504     Positions&     pos) const
1505 {
1506   if (!lazyset.empty())
1507   {
1508     Positions pos1;
1509     lazy(lazyset, pos, pos1);
1510     pos.swap(pos1);
1511   }
1512 }
1513 
lazy(const Lazyset & lazyset,const Positions & pos,Positions & pos1) const1514 void Pattern::lazy(
1515     const Lazyset&   lazyset,
1516     const Positions& pos,
1517     Positions&       pos1) const
1518 {
1519   for (Positions::const_iterator p = pos.begin(); p != pos.end(); ++p)
1520     for (Lazyset::const_iterator l = lazyset.begin(); l != lazyset.end(); ++l)
1521       // pos1.insert(p->lazy() ? *p : p->lazy(*l)); // CHECKED algorithmic options: only if p is not already lazy??
1522       pos1.insert(p->lazy(*l)); // overrides lazyness even when p is already lazy
1523 }
1524 
greedy(Positions & pos) const1525 void Pattern::greedy(Positions& pos) const
1526 {
1527   Positions pos1;
1528   for (Positions::const_iterator p = pos.begin(); p != pos.end(); ++p)
1529     pos1.insert(p->lazy() ? *p : p->greedy(true)); // CHECKED algorithmic options: 7/29 guard added: p->lazy() ? *p : p->greedy(true)
1530     // CHECKED 10/21 pos1.insert(p->lazy(0).greedy(true));
1531   pos.swap(pos1);
1532 }
1533 
trim_anchors(Positions & follow,const Position p) const1534 void Pattern::trim_anchors(Positions& follow, const Position p) const
1535 {
1536 #ifdef DEBUG
1537   DBGLOG("trim_anchors({");
1538   for (Positions::const_iterator q = follow.begin(); q != follow.end(); ++q)
1539     DBGLOGPOS(*q);
1540   DBGLOGA(" }, %u)", p.loc());
1541 #endif
1542   Positions::iterator q = follow.begin();
1543   Positions::iterator end = follow.end();
1544   // check if we follow into an accepting state, if so trim follow state to remove back edges and cyclic anchors e.g. (^$)*
1545   while (q != end && !q->accept())
1546     ++q;
1547   if (q != end)
1548   {
1549     q = follow.begin();
1550     if (p.anchor())
1551     {
1552       while (q != end)
1553       {
1554         // erase if not accepting and not a begin anchor and not a ) lookahead tail
1555         if (!q->accept() && !q->anchor() && at(q->loc()) != ')')
1556           follow.erase(q++);
1557         else
1558           ++q;
1559       }
1560     }
1561     else
1562     {
1563       Location loc = p.loc();
1564       while (q != end)
1565       {
1566         // erase if not accepting and not a begin anchor and back edge
1567         if (!q->accept() && !q->anchor() && q->loc() <= loc)
1568           follow.erase(q++);
1569         else
1570           ++q;
1571       }
1572     }
1573   }
1574 #ifdef DEBUG
1575   DBGLOGA(" = {");
1576   for (Positions::const_iterator q = follow.begin(); q != follow.end(); ++q)
1577     DBGLOGPOS(*q);
1578   DBGLOG(" }");
1579 #endif
1580 }
1581 
trim_lazy(Positions * pos) const1582 void Pattern::trim_lazy(Positions *pos) const
1583 {
1584 #ifdef DEBUG
1585   DBGLOG("BEGIN trim_lazy({");
1586   for (Positions::const_iterator q = pos->begin(); q != pos->end(); ++q)
1587     DBGLOGPOS(*q);
1588   DBGLOGA(" })");
1589 #endif
1590   Positions::reverse_iterator p = pos->rbegin();
1591   while (p != pos->rend() && p->lazy())
1592   {
1593     Location l = p->lazy();
1594     if (p->accept() || p->anchor()) // CHECKED algorithmic options: 7/28 added p->anchor()
1595     {
1596       pos->insert(p->lazy(0)); // make lazy accept/anchor a non-lazy accept/anchor
1597       pos->erase(--p.base());
1598       while (p != pos->rend() && !p->accept() && p->lazy() == l)
1599       {
1600 #if 0 // CHECKED algorithmic options: set to 1 to turn lazy trimming off
1601         ++p;
1602 #else
1603         pos->erase(--p.base());
1604 #endif
1605       }
1606     }
1607     else
1608     {
1609 #if 0 // CHECKED algorithmic options: 7/31
1610       if (p->greedy())
1611       {
1612         pos->insert(p->lazy(0).greedy(false));
1613         pos->erase(--p.base());
1614       }
1615       else
1616       {
1617         break; // ++p;
1618       }
1619 #else
1620       if (!p->greedy()) // stop here, greedy bit is 0 from here on
1621         break;
1622       pos->insert(p->lazy(0));
1623       pos->erase(--p.base()); // CHECKED 10/21 ++p;
1624 #endif
1625     }
1626   }
1627 #if 0 // CHECKED algorithmic options: 7/31 but results in more states
1628   while (p != pos->rend() && p->greedy())
1629   {
1630     pos->insert(p->greedy(false));
1631     pos->erase(--p.base());
1632   }
1633 #endif
1634   // trims accept positions keeping the first only
1635   Positions::iterator q = pos->begin(), a = pos->end();
1636   while (q != pos->end())
1637   {
1638     if (q->accept() && !q->negate())
1639     {
1640       if (a == pos->end())
1641         a = q++;
1642       else
1643         pos->erase(q++);
1644     }
1645     else
1646     {
1647       ++q;
1648     }
1649   }
1650 #ifdef DEBUG
1651   DBGLOG("END trim_lazy({");
1652   for (Positions::const_iterator q = pos->begin(); q != pos->end(); ++q)
1653     DBGLOGPOS(*q);
1654   DBGLOGA(" })");
1655 #endif
1656 }
1657 
compile_transition(DFA::State * state,Follow & followpos,const Map & modifiers,const Map & lookahead,Moves & moves) const1658 void Pattern::compile_transition(
1659     DFA::State *state,
1660     Follow&     followpos,
1661     const Map&  modifiers,
1662     const Map&  lookahead,
1663     Moves&      moves) const
1664 {
1665   DBGLOG("BEGIN compile_transition()");
1666   Positions::const_iterator end = state->end();
1667   for (Positions::const_iterator k = state->begin(); k != end; ++k)
1668   {
1669     if (k->accept())
1670     {
1671       Accept accept = k->accepts();
1672       if (state->accept == 0 || accept < state->accept)
1673         state->accept = accept;
1674       if (k->negate())
1675         state->redo = true;
1676     }
1677     else
1678     {
1679       Location loc = k->loc();
1680       Char c = at(loc);
1681       DBGLOGN("At %u: %c", loc, c);
1682       bool literal = is_modified('q', modifiers, loc);
1683       if (c == '(' && !literal)
1684       {
1685         Lookahead n = 0;
1686         DBGLOG("LOOKAHEAD HEAD");
1687         for (Map::const_iterator i = lookahead.begin(); i != lookahead.end(); ++i)
1688         {
1689           Locations::const_iterator j = i->second.find(loc);
1690           DBGLOGN("%d %d (%d) %u", state->accept, i->first, j != i->second.end(), n);
1691           if (j != i->second.end())
1692           {
1693             Lookahead l = n + static_cast<Lookahead>(std::distance(i->second.begin(), j));
1694             if (l < n)
1695               error(regex_error::exceeds_limits, loc);
1696             state->heads.insert(l);
1697           }
1698           Lookahead k = n;
1699           n += static_cast<Lookahead>(i->second.size());
1700           if (n < k)
1701             error(regex_error::exceeds_limits, loc);
1702         }
1703       }
1704       else if (c == ')' && !literal)
1705       {
1706         /* CHECKED algorithmic options: 7/18 do no longer check for accept state, assume we are at an accept state
1707         if (state->accept > 0)
1708         */
1709         {
1710           Lookahead n = 0;
1711           DBGLOG("LOOKAHEAD TAIL");
1712           for (Map::const_iterator i = lookahead.begin(); i != lookahead.end(); ++i)
1713           {
1714             Locations::const_iterator j = i->second.find(loc);
1715             DBGLOGN("%d %d (%d) %u", state->accept, i->first, j != i->second.end(), n);
1716             if (j != i->second.end() /* CHECKED algorithmic options: 7/18 && state->accept == i->first */ ) // only add lookstop when part of the proper accept state
1717             {
1718               Lookahead l = n + static_cast<Lookahead>(std::distance(i->second.begin(), j));
1719               if (l < n)
1720                 error(regex_error::exceeds_limits, loc);
1721               state->tails.insert(l);
1722             }
1723             Lookahead k = n;
1724             n += static_cast<Lookahead>(i->second.size());
1725             if (n < k)
1726               error(regex_error::exceeds_limits, loc);
1727           }
1728         }
1729       }
1730       else
1731       {
1732         Follow::iterator i = followpos.find(k->pos());
1733         if (i != followpos.end())
1734         {
1735           if (k->negate())
1736           {
1737             Positions::const_iterator b = i->second.begin();
1738             if (b != i->second.end() && !b->negate())
1739             {
1740               Positions to;
1741               for (Positions::const_iterator p = b; p != i->second.end(); ++p)
1742                 to.insert(p->negate(true));
1743               i->second.swap(to);
1744             }
1745           }
1746           if (k->lazy())
1747           {
1748 #if 1 // CHECKED algorithmic options: 7/31 this optimization works fine when trim_lazy adds non-lazy greedy state, but may increase the total number of states:
1749             if (k->greedy())
1750               continue;
1751 #endif
1752             Follow::iterator j = followpos.find(*k);
1753             if (j == followpos.end())
1754             {
1755               // followpos is not defined for lazy pos yet, so add lazy followpos (memoization)
1756               j = followpos.insert(std::pair<Position,Positions>(*k, Positions())).first;
1757               for (Positions::const_iterator p = i->second.begin(); p != i->second.end(); ++p)
1758                 j->second.insert(/* p->lazy() || CHECKED algorithmic options: 7/31 */ p->ticked() ? *p : /* CHECKED algorithmic options: 7/31 adds too many states p->greedy() ? p->lazy(0).greedy(false) : */ p->lazy(k->lazy())); // CHECKED algorithmic options: 7/18 ticked() preserves lookahead tail at '/' and ')'
1759 #ifdef DEBUG
1760               DBGLOGN("lazy followpos(");
1761               DBGLOGPOS(*k);
1762               DBGLOGA(" ) = {");
1763               for (Positions::const_iterator q = j->second.begin(); q != j->second.end(); ++q)
1764                 DBGLOGPOS(*q);
1765               DBGLOGA(" }");
1766 #endif
1767             }
1768             i = j;
1769           }
1770           Positions &follow = i->second;
1771           Chars chars;
1772           if (literal)
1773           {
1774             if (std::isalpha(c) && is_modified('i', modifiers, loc))
1775             {
1776               chars.insert(uppercase(c));
1777               chars.insert(lowercase(c));
1778             }
1779             else
1780             {
1781               chars.insert(c);
1782             }
1783           }
1784           else
1785           {
1786             switch (c)
1787             {
1788               case '.':
1789                 if (is_modified('s', modifiers, loc))
1790                 {
1791                   static const uint64_t dot[5] = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0 };
1792                   chars |= Chars(dot);
1793                 }
1794                 else
1795                 {
1796                   static const uint64_t dot[5] = { 0xFFFFFFFFFFFFFBFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0 };
1797                   chars |= Chars(dot);
1798                 }
1799                 break;
1800               case '^':
1801                 chars.insert(is_modified('m', modifiers, loc) ? META_BOL : META_BOB);
1802                 trim_anchors(follow, *k);
1803                 break;
1804               case '$':
1805                 chars.insert(is_modified('m', modifiers, loc) ? META_EOL : META_EOB);
1806                 trim_anchors(follow, *k);
1807                 break;
1808               default:
1809                 if (c == '[')
1810                 {
1811                   compile_list(loc + 1, chars, modifiers);
1812                 }
1813                 else
1814                 {
1815                   switch (escape_at(loc))
1816                   {
1817                     case '\0': // no escape at current loc
1818                       if (std::isalpha(c) && is_modified('i', modifiers, loc))
1819                       {
1820                         chars.insert(uppercase(c));
1821                         chars.insert(lowercase(c));
1822                       }
1823                       else
1824                       {
1825                         chars.insert(c);
1826                       }
1827                       break;
1828                     case 'i':
1829                       chars.insert(META_IND);
1830                       break;
1831                     case 'j':
1832                       chars.insert(META_DED);
1833                       break;
1834                     case 'k':
1835                       chars.insert(META_UND);
1836                       break;
1837                     case 'A':
1838                       chars.insert(META_BOB);
1839                       trim_anchors(follow, *k);
1840                       break;
1841                     case 'z':
1842                       chars.insert(META_EOB);
1843                       trim_anchors(follow, *k);
1844                       break;
1845                     case 'B':
1846                       chars.insert(k->anchor() ? META_NWB : META_NWE);
1847                       trim_anchors(follow, *k);
1848                       break;
1849                     case 'b':
1850                       if (k->anchor())
1851                         chars.insert(META_BWB, META_EWB);
1852                       else
1853                         chars.insert(META_BWE, META_EWE);
1854                       trim_anchors(follow, *k);
1855                       break;
1856                     case '<':
1857                       chars.insert(k->anchor() ? META_BWB : META_BWE);
1858                       trim_anchors(follow, *k);
1859                       break;
1860                     case '>':
1861                       chars.insert(k->anchor() ? META_EWB : META_EWE);
1862                       trim_anchors(follow, *k);
1863                       break;
1864                     default:
1865                       c = parse_esc(loc, &chars);
1866                       if (c <= 'z' && std::isalpha(c) && is_modified('i', modifiers, loc))
1867                       {
1868                         chars.insert(uppercase(c));
1869                         chars.insert(lowercase(c));
1870                       }
1871                   }
1872                 }
1873             }
1874           }
1875           transition(moves, chars, follow);
1876         }
1877       }
1878     }
1879   }
1880   Moves::iterator i = moves.begin();
1881   Moves::iterator e = moves.end();
1882   while (i != e)
1883   {
1884     trim_lazy(&i->second);
1885     if (i->second.empty())
1886       moves.erase(i++);
1887     else
1888       ++i;
1889   }
1890   DBGLOG("END compile_transition()");
1891 }
1892 
transition(Moves & moves,Chars & chars,const Positions & follow) const1893 void Pattern::transition(
1894     Moves&           moves,
1895     Chars&           chars,
1896     const Positions& follow) const
1897 {
1898   Moves::iterator i = moves.begin();
1899   Moves::iterator end = moves.end();
1900   while (i != end)
1901   {
1902     if (i->second == follow)
1903     {
1904       chars += i->first;
1905       moves.erase(i++);
1906     }
1907     else
1908     {
1909       ++i;
1910     }
1911   }
1912   for (i = moves.begin(); i != end; ++i)
1913   {
1914     if (chars.intersects(i->first))
1915     {
1916       if (is_subset(follow, i->second))
1917       {
1918         chars -= i->first;
1919       }
1920       else
1921       {
1922         if (chars.contains(i->first))
1923         {
1924           chars -= i->first;
1925           set_insert(i->second, follow);
1926         }
1927         else
1928         {
1929           Move back(chars & i->first, i->second);
1930           set_insert(back.second, follow);
1931           chars -= back.first;
1932           i->first -= back.first;
1933           moves.push_back(back);
1934         }
1935       }
1936       if (!chars.any())
1937         return;
1938     }
1939   }
1940   if (chars.any())
1941     moves.push_back(Move(chars, follow));
1942 }
1943 
compile_list(Location loc,Chars & chars,const Map & modifiers) const1944 void Pattern::compile_list(Location loc, Chars& chars, const Map& modifiers) const
1945 {
1946   bool complement = (at(loc) == '^');
1947   if (complement)
1948     ++loc;
1949   Char prev = META_BOL;
1950   Char lo = META_EOL;
1951   for (Char c = at(loc); c != '\0' && (c != ']' || prev == META_BOL); c = at(++loc))
1952   {
1953     if (c == '-' && !is_meta(prev) && is_meta(lo))
1954     {
1955       lo = prev;
1956     }
1957     else
1958     {
1959       size_t c_loc;
1960       if (c == '[' && at(loc + 1) == ':' && (c_loc = find_at(loc + 2, ':')) != std::string::npos && at(static_cast<Location>(c_loc + 1)) == ']')
1961       {
1962         if (c_loc == loc + 3)
1963         {
1964           ++loc;
1965           c = parse_esc(loc, &chars);
1966         }
1967         else
1968         {
1969           size_t i;
1970           for (i = 0; i < 14; ++i)
1971             if (eq_at(loc + 4, posix_class[i] + 2)) // ignore first two letters (upper/lower) when matching
1972               break;
1973           if (i < 14)
1974             posix(i, chars);
1975           else
1976             error(regex_error::invalid_class, loc);
1977           c = META_EOL;
1978         }
1979         loc = static_cast<Location>(c_loc + 1);
1980       }
1981       else if (c == opt_.e && !opt_.b)
1982       {
1983         c = parse_esc(loc, &chars);
1984         --loc;
1985       }
1986       if (!is_meta(c))
1987       {
1988         if (!is_meta(lo))
1989         {
1990           if (lo <= c)
1991             chars.insert(lo, c);
1992           else
1993             error(regex_error::invalid_class_range, loc);
1994           if (is_modified('i', modifiers, loc))
1995           {
1996             for (Char a = lo; a <= c; ++a)
1997             {
1998               if (std::isupper(a))
1999                 chars.insert(lowercase(a));
2000               else if (std::islower(a))
2001                 chars.insert(uppercase(a));
2002             }
2003           }
2004           c = META_EOL;
2005         }
2006         else
2007         {
2008           if (std::isalpha(c) && is_modified('i', modifiers, loc))
2009           {
2010             chars.insert(uppercase(c));
2011             chars.insert(lowercase(c));
2012           }
2013           else
2014           {
2015             chars.insert(c);
2016           }
2017         }
2018       }
2019       prev = c;
2020       lo = META_EOL;
2021     }
2022   }
2023   if (!is_meta(lo))
2024     chars.insert('-');
2025   if (complement)
2026     flip(chars);
2027 }
2028 
posix(size_t index,Chars & chars) const2029 void Pattern::posix(size_t index, Chars& chars) const
2030 {
2031   DBGLOG("posix(%lu)", index);
2032   static const uint64_t posix_chars[14][5] = {
2033     { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0ULL, 0ULL, 0ULL }, // ASCII
2034     { 0x0000000100003E00ULL, 0x0000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Space: \t-\r, ' '
2035     { 0x03FF000000000000ULL, 0x0000007E0000007EULL, 0ULL, 0ULL, 0ULL }, // XDigit: 0-9, A-F, a-f
2036     { 0x00000000FFFFFFFFULL, 0x8000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Cntrl: \x00-0x1F, \0x7F
2037     { 0xFFFFFFFF00000000ULL, 0x7FFFFFFFFFFFFFFFULL, 0ULL, 0ULL, 0ULL }, // Print: ' '-'~'
2038     { 0x03FF000000000000ULL, 0x07FFFFFE07FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Alnum: 0-9, A-Z, a-z
2039     { 0x0000000000000000ULL, 0x07FFFFFE07FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Alpha: A-Z, a-z
2040     { 0x0000000100000200ULL, 0x0000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Blank: \t, ' '
2041     { 0x03FF000000000000ULL, 0x0000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Digit: 0-9
2042     { 0xFFFFFFFE00000000ULL, 0x7FFFFFFFFFFFFFFFULL, 0ULL, 0ULL, 0ULL }, // Graph: '!'-'~'
2043     { 0x0000000000000000ULL, 0x07FFFFFE00000000ULL, 0ULL, 0ULL, 0ULL }, // Lower: a-z
2044     { 0xFC00FFFE00000000ULL, 0x78000001F8000001ULL, 0ULL, 0ULL, 0ULL }, // Punct: '!'-'/', ':'-'@', '['-'`', '{'-'~'
2045     { 0x0000000000000000ULL, 0x0000000007FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Upper: A-Z
2046     { 0x03FF000000000000ULL, 0x07FFFFFE87FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Word: 0-9, A-Z, a-z, _
2047   };
2048   chars |= Chars(posix_chars[index]);
2049 }
2050 
flip(Chars & chars) const2051 void Pattern::flip(Chars& chars) const
2052 {
2053   chars.flip256();
2054 }
2055 
assemble(DFA::State * start)2056 void Pattern::assemble(DFA::State *start)
2057 {
2058   DBGLOG("BEGIN assemble()");
2059   timer_type t;
2060   timer_start(t);
2061   predict_match_dfa(start);
2062   export_dfa(start);
2063   compact_dfa(start);
2064   encode_dfa(start);
2065   wms_ = timer_elapsed(t);
2066   gencode_dfa(start);
2067   export_code();
2068   DBGLOG("END assemble()");
2069 }
2070 
compact_dfa(DFA::State * start)2071 void Pattern::compact_dfa(DFA::State *start)
2072 {
2073 #if WITH_COMPACT_DFA == -1
2074   // edge compaction in reverse order
2075   for (DFA::State *state = start; state; state = state->next)
2076   {
2077     for (DFA::State::Edges::iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2078     {
2079       Char hi = i->second.first;
2080       if (hi >= 0xFF)
2081         break;
2082       DFA::State::Edges::iterator j = i;
2083       ++j;
2084       while (j != state->edges.end() && j->first <= hi + 1)
2085       {
2086         hi = j->second.first;
2087         if (j->second.second == i->second.second)
2088         {
2089           i->second.first = hi;
2090           state->edges.erase(j++);
2091         }
2092         else
2093         {
2094           ++j;
2095         }
2096       }
2097     }
2098   }
2099 #elif WITH_COMPACT_DFA == 1
2100   // edge compaction
2101   for (DFA::State *state = start; state; state = state->next)
2102   {
2103     for (DFA::State::Edges::reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2104     {
2105       Char lo = i->second.first;
2106       if (lo <= 0x00)
2107         break;
2108       DFA::State::Edges::reverse_iterator j = i;
2109       ++j;
2110       while (j != state->edges.rend() && j->first >= lo - 1)
2111       {
2112         lo = j->second.first;
2113         if (j->second.second == i->second.second)
2114         {
2115           i->second.first = lo;
2116           state->edges.erase(--j.base());
2117         }
2118         else
2119         {
2120           ++j;
2121         }
2122       }
2123     }
2124   }
2125 #else
2126   (void)start;
2127 #endif
2128 }
2129 
encode_dfa(DFA::State * start)2130 void Pattern::encode_dfa(DFA::State *start)
2131 {
2132   nop_ = 0;
2133   for (DFA::State *state = start; state; state = state->next)
2134   {
2135     if (state->accept > Const::AMAX)
2136       state->accept = Const::AMAX;
2137     state->first = state->index = nop_;
2138 #if WITH_COMPACT_DFA == -1
2139     Char hi = 0x00;
2140     for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2141     {
2142       Char lo = i->first;
2143       if (lo == hi)
2144         hi = i->second.first + 1;
2145       ++nop_;
2146       if (is_meta(lo))
2147         nop_ += i->second.first - lo;
2148     }
2149     // add final dead state (HALT opcode) only when needed, i.e. skip dead state if all chars 0-255 are already covered
2150     if (hi <= 0xFF)
2151     {
2152       state->edges[hi] = std::pair<Char,DFA::State*>(0xFF, static_cast<DFA::State*>(NULL)); // cast to appease MSVC 2010
2153       ++nop_;
2154     }
2155 #else
2156     Char lo = 0xFF;
2157     bool covered = false;
2158     for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2159     {
2160       Char hi = i->first;
2161       if (lo == hi)
2162       {
2163         if (i->second.first == 0x00)
2164           covered = true;
2165         else
2166           lo = i->second.first - 1;
2167       }
2168       ++nop_;
2169       if (is_meta(lo))
2170         nop_ += hi - i->second.first;
2171     }
2172     // add final dead state (HALT opcode) only when needed, i.e. skip dead state if all chars 0-255 are already covered
2173     if (!covered)
2174     {
2175       state->edges[lo] = std::pair<Char,DFA::State*>(0x00, static_cast<DFA::State*>(NULL)); // cast to appease MSVC 2010
2176       ++nop_;
2177     }
2178 #endif
2179     nop_ += static_cast<Index>(state->heads.size() + state->tails.size() + (state->accept > 0 || state->redo));
2180     if (!valid_goto_index(nop_))
2181       throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2182   }
2183   if (nop_ > Const::LONG)
2184   {
2185     // over 64K opcodes: use 64-bit GOTO LONG opcodes
2186     nop_ = 0;
2187     for (DFA::State *state = start; state; state = state->next)
2188     {
2189       state->index = nop_;
2190 #if WITH_COMPACT_DFA == -1
2191       Char hi = 0x00;
2192       for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2193       {
2194         Char lo = i->first;
2195         if (lo == hi)
2196           hi = i->second.first + 1;
2197         // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2198         if (i->second.second != NULL &&
2199             ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2200              i->second.second->index >= Const::LONG))
2201           nop_ += 2;
2202         else
2203           ++nop_;
2204         if (is_meta(lo))
2205         {
2206           // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2207           if (i->second.second != NULL &&
2208             ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2209              i->second.second->index >= Const::LONG))
2210             nop_ += 2 * (i->second.first - lo);
2211           else
2212             nop_ += i->second.first - lo;
2213         }
2214       }
2215 #else
2216       Char lo = 0xFF;
2217       for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2218       {
2219         Char hi = i->first;
2220         if (lo == hi)
2221           lo = i->second.first - 1;
2222         // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2223         if (i->second.second != NULL &&
2224             ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2225              i->second.second->index >= Const::LONG))
2226           nop_ += 2;
2227         else
2228           ++nop_;
2229         if (is_meta(lo))
2230         {
2231           // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2232           if (i->second.second != NULL &&
2233             ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2234              i->second.second->index >= Const::LONG))
2235             nop_ += 2 * (hi - i->second.first);
2236           else
2237             nop_ += hi - i->second.first;
2238         }
2239       }
2240 #endif
2241       nop_ += static_cast<Index>(state->heads.size() + state->tails.size() + (state->accept > 0 || state->redo));
2242       if (!valid_goto_index(nop_))
2243         throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2244     }
2245   }
2246   Opcode *opcode = new Opcode[nop_];
2247   opc_ = opcode;
2248   Index pc = 0;
2249   for (const DFA::State *state = start; state; state = state->next)
2250   {
2251     if (state->redo)
2252     {
2253       opcode[pc++] = opcode_redo();
2254     }
2255     else if (state->accept > 0)
2256     {
2257       opcode[pc++] = opcode_take(state->accept);
2258     }
2259     for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2260     {
2261       if (!valid_lookahead_index(static_cast<Index>(*i)))
2262         throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2263       opcode[pc++] = opcode_tail(static_cast<Index>(*i));
2264     }
2265     for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2266     {
2267       if (!valid_lookahead_index(static_cast<Index>(*i)))
2268         throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2269       opcode[pc++] = opcode_head(static_cast<Index>(*i));
2270     }
2271 #if WITH_COMPACT_DFA == -1
2272     for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2273     {
2274       Char lo = i->first;
2275       Char hi = i->second.first;
2276       Index target_first = i->second.second != NULL ? i->second.second->first : Const::IMAX;
2277       Index target_index = i->second.second != NULL ? i->second.second->index : Const::IMAX;
2278       if (!is_meta(lo))
2279       {
2280         if (target_index == Const::IMAX)
2281         {
2282           opcode[pc++] = opcode_goto(lo, hi, Const::HALT);
2283         }
2284         else if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2285         {
2286           opcode[pc++] = opcode_goto(lo, hi, Const::LONG);
2287           opcode[pc++] = opcode_long(target_index);
2288         }
2289         else
2290         {
2291           opcode[pc++] = opcode_goto(lo, hi, target_index);
2292         }
2293       }
2294       else
2295       {
2296         do
2297         {
2298           if (target_index == Const::IMAX)
2299           {
2300             opcode[pc++] = opcode_goto(lo, lo, Const::HALT);
2301           }
2302           else if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2303           {
2304             opcode[pc++] = opcode_goto(lo, lo, Const::LONG);
2305             opcode[pc++] = opcode_long(target_index);
2306           }
2307           else
2308           {
2309             opcode[pc++] = opcode_goto(lo, lo, target_index);
2310           }
2311         } while (++lo <= hi);
2312       }
2313     }
2314 #else
2315     for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2316     {
2317       Char hi = i->first;
2318       Char lo = i->second.first;
2319       if (is_meta(lo))
2320       {
2321         Index target_first = i->second.second != NULL ? i->second.second->first : Const::IMAX;
2322         Index target_index = i->second.second != NULL ? i->second.second->index : Const::IMAX;
2323         do
2324         {
2325           if (target_index == Const::IMAX)
2326           {
2327             opcode[pc++] = opcode_goto(lo, lo, Const::HALT);
2328           }
2329           else if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2330           {
2331             opcode[pc++] = opcode_goto(lo, lo, Const::LONG);
2332             opcode[pc++] = opcode_long(target_index);
2333           }
2334           else
2335           {
2336             opcode[pc++] = opcode_goto(lo, lo, target_index);
2337           }
2338         } while (++lo <= hi);
2339       }
2340     }
2341     for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2342     {
2343       Char lo = i->second.first;
2344       if (!is_meta(lo))
2345       {
2346         Char hi = i->first;
2347         if (i->second.second != NULL)
2348         {
2349           Index target_first = i->second.second->first;
2350           Index target_index = i->second.second->index;
2351           if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2352           {
2353             opcode[pc++] = opcode_goto(lo, hi, Const::LONG);
2354             opcode[pc++] = opcode_long(target_index);
2355           }
2356           else
2357           {
2358             opcode[pc++] = opcode_goto(lo, hi, target_index);
2359           }
2360         }
2361         else
2362         {
2363           opcode[pc++] = opcode_goto(lo, hi, Const::HALT);
2364         }
2365       }
2366     }
2367 #endif
2368   }
2369 }
2370 
gencode_dfa(const DFA::State * start) const2371 void Pattern::gencode_dfa(const DFA::State *start) const
2372 {
2373   if (!opt_.o)
2374     return;
2375   for (std::vector<std::string>::const_iterator i = opt_.f.begin(); i != opt_.f.end(); ++i)
2376   {
2377     const std::string& filename = *i;
2378     size_t len = filename.length();
2379     if ((len > 2 && filename.compare(len - 2, 2, ".h"  ) == 0)
2380      || (len > 4 && filename.compare(len - 4, 4, ".hpp") == 0)
2381      || (len > 4 && filename.compare(len - 4, 4, ".cpp") == 0)
2382      || (len > 3 && filename.compare(len - 3, 3, ".cc" ) == 0))
2383     {
2384       FILE *file = NULL;
2385       int err = 0;
2386       if (filename.compare(0, 7, "stdout.") == 0)
2387         file = stdout;
2388       else if (filename.at(0) == '+')
2389         err = reflex::fopen_s(&file, filename.c_str() + 1, "a");
2390       else
2391         err = reflex::fopen_s(&file, filename.c_str(), "w");
2392       if (!err && file)
2393       {
2394         ::fprintf(file,
2395             "#include <reflex/matcher.h>\n\n"
2396             "#if defined(OS_WIN)\n"
2397             "#pragma warning(disable:4101 4102)\n"
2398             "#elif defined(__GNUC__)\n"
2399             "#pragma GCC diagnostic ignored \"-Wunused-variable\"\n"
2400             "#pragma GCC diagnostic ignored \"-Wunused-label\"\n"
2401             "#elif defined(__clang__)\n"
2402             "#pragma clang diagnostic ignored \"-Wunused-variable\"\n"
2403             "#pragma clang diagnostic ignored \"-Wunused-label\"\n"
2404             "#endif\n\n");
2405         write_namespace_open(file);
2406         ::fprintf(file,
2407             "void reflex_code_%s(reflex::Matcher& m)\n"
2408             "{\n"
2409             "  int c0 = 0, c1 = 0;\n"
2410             "  m.FSM_INIT(c1);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
2411         for (const DFA::State *state = start; state; state = state->next)
2412         {
2413           ::fprintf(file, "\nS%u:\n", state->index);
2414           if (state == start)
2415             ::fprintf(file, "  m.FSM_FIND();\n");
2416           if (state->redo)
2417             ::fprintf(file, "  m.FSM_REDO();\n");
2418           else if (state->accept > 0)
2419             ::fprintf(file, "  m.FSM_TAKE(%u);\n", state->accept);
2420           for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2421             ::fprintf(file, "  m.FSM_TAIL(%u);\n", *i);
2422           for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2423             ::fprintf(file, "  m.FSM_HEAD(%u);\n", *i);
2424           if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED)
2425             ::fprintf(file, "  if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index);
2426           bool peek = false; // if we need to read a character into c1
2427           bool prev = false; // if we need to keep the previous character in c0
2428           for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2429           {
2430 #if WITH_COMPACT_DFA == -1
2431             Char lo = i->first;
2432             Char hi = i->second.first;
2433 #else
2434             Char hi = i->first;
2435             Char lo = i->second.first;
2436 #endif
2437             if (!is_meta(lo))
2438             {
2439               Index target_index = Const::IMAX;
2440               if (i->second.second != NULL)
2441                 target_index = i->second.second->index;
2442               DFA::State::Edges::const_reverse_iterator j = i;
2443               if (target_index == Const::IMAX && (++j == state->edges.rend() || is_meta(j->second.first)))
2444                 break;
2445               peek = true;
2446             }
2447             else
2448             {
2449               do
2450               {
2451                 if (lo == META_EOB || lo == META_EOL)
2452                   peek = true;
2453                 else if (lo == META_EWE || lo == META_BWE || lo == META_NWE)
2454                   prev = peek = true;
2455                 if (prev && peek)
2456                   break;
2457                 check_dfa_closure(i->second.second, 1, peek, prev);
2458               } while (++lo <= hi);
2459             }
2460           }
2461           bool read = peek;
2462           bool elif = false;
2463 #if WITH_COMPACT_DFA == -1
2464           for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2465           {
2466             Char lo = i->first;
2467             Char hi = i->second.first;
2468             Index target_index = Const::IMAX;
2469             if (i->second.second != NULL)
2470               target_index = i->second.second->index;
2471             if (read)
2472             {
2473               if (prev)
2474                 ::fprintf(file, "  c0 = c1, c1 = m.FSM_CHAR();\n");
2475               else
2476                 ::fprintf(file, "  c1 = m.FSM_CHAR();\n");
2477               read = false;
2478             }
2479             if (!is_meta(lo))
2480             {
2481               DFA::State::Edges::const_reverse_iterator j = i;
2482               if (target_index == Const::IMAX && (++j == state->edges.rend() || is_meta(j->second.first)))
2483                 break;
2484               if (lo == hi)
2485               {
2486                 ::fprintf(file, "  if (c1 == ");
2487                 print_char(file, lo);
2488                 ::fprintf(file, ")");
2489               }
2490               else if (hi == 0xFF)
2491               {
2492                 ::fprintf(file, "  if (");
2493                 print_char(file, lo);
2494                 ::fprintf(file, " <= c1)");
2495               }
2496               else
2497               {
2498                 ::fprintf(file, "  if (");
2499                 print_char(file, lo);
2500                 ::fprintf(file, " <= c1 && c1 <= ");
2501                 print_char(file, hi);
2502                 ::fprintf(file, ")");
2503               }
2504               if (target_index == Const::IMAX)
2505               {
2506                 if (peek)
2507                   ::fprintf(file, " return m.FSM_HALT(c1);\n");
2508                 else
2509                   ::fprintf(file, " return m.FSM_HALT();\n");
2510               }
2511               else
2512               {
2513                 ::fprintf(file, " goto S%u;\n", target_index);
2514               }
2515             }
2516             else
2517             {
2518               do
2519               {
2520                 switch (lo)
2521                 {
2522                   case META_EOB:
2523                   case META_EOL:
2524                     ::fprintf(file, "  ");
2525                     if (elif)
2526                       ::fprintf(file, "else ");
2527                     ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
2528                     gencode_dfa_closure(file, i->second.second, 2, peek);
2529                     ::fprintf(file, "  }\n");
2530                     elif = true;
2531                     break;
2532                   case META_EWE:
2533                   case META_BWE:
2534                   case META_NWE:
2535                     ::fprintf(file, "  ");
2536                     if (elif)
2537                       ::fprintf(file, "else ");
2538                     ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
2539                     gencode_dfa_closure(file, i->second.second, 2, peek);
2540                     ::fprintf(file, "  }\n");
2541                     elif = true;
2542                     break;
2543                   default:
2544                     ::fprintf(file, "  ");
2545                     if (elif)
2546                       ::fprintf(file, "else ");
2547                     ::fprintf(file, "if (m.FSM_META_%s()) {\n", meta_label[lo - META_MIN]);
2548                     gencode_dfa_closure(file, i->second.second, 2, peek);
2549                     ::fprintf(file, "  }\n");
2550                     elif = true;
2551                 }
2552               } while (++lo <= hi);
2553             }
2554           }
2555 #else
2556           for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2557           {
2558             Char hi = i->first;
2559             Char lo = i->second.first;
2560             if (is_meta(lo))
2561             {
2562               if (read)
2563               {
2564                 if (prev)
2565                   ::fprintf(file, "  c0 = c1, c1 = m.FSM_CHAR();\n");
2566                 else
2567                   ::fprintf(file, "  c1 = m.FSM_CHAR();\n");
2568                 read = false;
2569               }
2570               do
2571               {
2572                 switch (lo)
2573                 {
2574                   case META_EOB:
2575                   case META_EOL:
2576                     ::fprintf(file, "  ");
2577                     if (elif)
2578                       ::fprintf(file, "else ");
2579                     ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
2580                     gencode_dfa_closure(file, i->second.second, 2, peek);
2581                     ::fprintf(file, "  }\n");
2582                     elif = true;
2583                     break;
2584                   case META_EWE:
2585                   case META_BWE:
2586                   case META_NWE:
2587                     ::fprintf(file, "  ");
2588                     if (elif)
2589                       ::fprintf(file, "else ");
2590                     ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
2591                     gencode_dfa_closure(file, i->second.second, 2, peek);
2592                     ::fprintf(file, "  }\n");
2593                     elif = true;
2594                     break;
2595                   default:
2596                     ::fprintf(file, "  ");
2597                     if (elif)
2598                       ::fprintf(file, "else ");
2599                     ::fprintf(file, "if (m.FSM_META_%s()) {\n", meta_label[lo - META_MIN]);
2600                     gencode_dfa_closure(file, i->second.second, 2, peek);
2601                     ::fprintf(file, "  }\n");
2602                     elif = true;
2603                 }
2604               } while (++lo <= hi);
2605             }
2606           }
2607           for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2608           {
2609             Char hi = i->first;
2610             Char lo = i->second.first;
2611             Index target_index = Const::IMAX;
2612             if (i->second.second != NULL)
2613               target_index = i->second.second->index;
2614             if (read)
2615             {
2616               if (prev)
2617                 ::fprintf(file, "  c0 = c1, c1 = m.FSM_CHAR();\n");
2618               else
2619                 ::fprintf(file, "  c1 = m.FSM_CHAR();\n");
2620               read = false;
2621             }
2622             if (!is_meta(lo))
2623             {
2624               DFA::State::Edges::const_iterator j = i;
2625               if (target_index == Const::IMAX && (++j == state->edges.end() || is_meta(j->second.first)))
2626                 break;
2627               if (lo == hi)
2628               {
2629                 ::fprintf(file, "  if (c1 == ");
2630                 print_char(file, lo);
2631                 ::fprintf(file, ")");
2632               }
2633               else if (hi == 0xFF)
2634               {
2635                 ::fprintf(file, "  if (");
2636                 print_char(file, lo);
2637                 ::fprintf(file, " <= c1)");
2638               }
2639               else
2640               {
2641                 ::fprintf(file, "  if (");
2642                 print_char(file, lo);
2643                 ::fprintf(file, " <= c1 && c1 <= ");
2644                 print_char(file, hi);
2645                 ::fprintf(file, ")");
2646               }
2647               if (target_index == Const::IMAX)
2648               {
2649                 if (peek)
2650                   ::fprintf(file, " return m.FSM_HALT(c1);\n");
2651                 else
2652                   ::fprintf(file, " return m.FSM_HALT();\n");
2653               }
2654               else
2655               {
2656                 ::fprintf(file, " goto S%u;\n", target_index);
2657               }
2658             }
2659           }
2660 #endif
2661           if (peek)
2662             ::fprintf(file, "  return m.FSM_HALT(c1);\n");
2663           else
2664             ::fprintf(file, "  return m.FSM_HALT();\n");
2665         }
2666         ::fprintf(file, "}\n\n");
2667         if (opt_.p)
2668           write_predictor(file);
2669         write_namespace_close(file);
2670         if (file != stdout)
2671           ::fclose(file);
2672       }
2673     }
2674   }
2675 }
2676 
check_dfa_closure(const DFA::State * state,int nest,bool & peek,bool & prev) const2677 void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, bool& prev) const
2678 {
2679   if (nest > 4)
2680     return;
2681   for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2682   {
2683 #if WITH_COMPACT_DFA == -1
2684     Char lo = i->first;
2685     Char hi = i->second.first;
2686 #else
2687     Char hi = i->first;
2688     Char lo = i->second.first;
2689 #endif
2690     if (is_meta(lo))
2691     {
2692       do
2693       {
2694         if (lo == META_EOB || lo == META_EOL)
2695           peek = true;
2696         else if (lo == META_EWE || lo == META_BWE || lo == META_NWE)
2697           prev = peek = true;
2698         if (prev && peek)
2699           break;
2700         check_dfa_closure(i->second.second, nest + 1, peek, prev);
2701       } while (++lo <= hi);
2702     }
2703   }
2704 }
2705 
gencode_dfa_closure(FILE * file,const DFA::State * state,int nest,bool peek) const2706 void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, bool peek) const
2707 {
2708   bool elif = false;
2709   if (state->redo)
2710   {
2711     if (peek)
2712       ::fprintf(file, "%*sm.FSM_REDO(c1);\n", 2*nest, "");
2713     else
2714       ::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, "");
2715   }
2716   else if (state->accept > 0)
2717   {
2718     if (peek)
2719       ::fprintf(file, "%*sm.FSM_TAKE(%u, c1);\n", 2*nest, "", state->accept);
2720     else
2721       ::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept);
2722   }
2723   for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2724     ::fprintf(file, "%*sm.FSM_TAIL(%u);\n", 2*nest, "", *i);
2725   if (nest > 5)
2726     return;
2727   for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2728   {
2729 #if WITH_COMPACT_DFA == -1
2730     Char lo = i->first;
2731     Char hi = i->second.first;
2732 #else
2733     Char hi = i->first;
2734     Char lo = i->second.first;
2735 #endif
2736     if (is_meta(lo))
2737     {
2738       do
2739       {
2740         switch (lo)
2741         {
2742           case META_EOB:
2743           case META_EOL:
2744             ::fprintf(file, "%*s", 2*nest, "");
2745             if (elif)
2746               ::fprintf(file, "else ");
2747             ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
2748             gencode_dfa_closure(file, i->second.second, nest + 1, peek);
2749             ::fprintf(file, "%*s}\n", 2*nest, "");
2750             elif = true;
2751             break;
2752           case META_EWE:
2753           case META_BWE:
2754           case META_NWE:
2755             ::fprintf(file, "%*s", 2*nest, "");
2756             if (elif)
2757               ::fprintf(file, "else ");
2758             ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
2759             gencode_dfa_closure(file, i->second.second, nest + 1, peek);
2760             ::fprintf(file, "%*s}\n", 2*nest, "");
2761             elif = true;
2762             break;
2763           default:
2764             ::fprintf(file, "%*s", 2*nest, "");
2765             if (elif)
2766               ::fprintf(file, "else ");
2767             ::fprintf(file, "if (m.FSM_META_%s()) {\n", meta_label[lo - META_MIN]);
2768             gencode_dfa_closure(file, i->second.second, nest + 1, peek);
2769             ::fprintf(file, "%*s}\n", 2*nest, "");
2770             elif = true;
2771         }
2772       } while (++lo <= hi);
2773     }
2774   }
2775 }
2776 
export_dfa(const DFA::State * start) const2777 void Pattern::export_dfa(const DFA::State *start) const
2778 {
2779   for (std::vector<std::string>::const_iterator i = opt_.f.begin(); i != opt_.f.end(); ++i)
2780   {
2781     const std::string& filename = *i;
2782     size_t len = filename.length();
2783     if (len > 3 && filename.compare(len - 3, 3, ".gv") == 0)
2784     {
2785       FILE *file = NULL;
2786       int err = 0;
2787       if (filename.compare(0, 7, "stdout.") == 0)
2788         file = stdout;
2789       else if (filename.at(0) == '+')
2790         err = reflex::fopen_s(&file, filename.c_str() + 1, "a");
2791       else
2792         err = reflex::fopen_s(&file, filename.c_str(), "w");
2793       if (!err && file)
2794       {
2795         ::fprintf(file, "digraph %s {\n\t\trankdir=LR;\n\t\tconcentrate=true;\n\t\tnode [fontname=\"ArialNarrow\"];\n\t\tedge [fontname=\"Courier\"];\n\n\t\tinit [root=true,peripheries=0,label=\"%s\",fontname=\"Courier\"];\n\t\tinit -> N%p;\n", opt_.n.empty() ? "FSM" : opt_.n.c_str(), opt_.n.c_str(), (void*)start);
2796         for (const DFA::State *state = start; state; state = state->next)
2797         {
2798           if (state == start)
2799             ::fprintf(file, "\n/*START*/\t");
2800           if (state->redo)
2801             ::fprintf(file, "\n/*REDO*/\t");
2802           else if (state->accept)
2803             ::fprintf(file, "\n/*ACCEPT %u*/\t", state->accept);
2804           for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2805             ::fprintf(file, "\n/*HEAD %u*/\t", *i);
2806           for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2807             ::fprintf(file, "\n/*TAIL %u*/\t", *i);
2808           if (state != start && !state->accept && state->heads.empty() && state->tails.empty())
2809             ::fprintf(file, "\n/*STATE*/\t");
2810           ::fprintf(file, "N%p [label=\"", (void*)state);
2811 #ifdef DEBUG
2812           size_t k = 1;
2813           size_t n = std::sqrt(state->size()) + 0.5;
2814           const char *sep = "";
2815           for (Positions::const_iterator i = state->begin(); i != state->end(); ++i)
2816           {
2817             ::fprintf(file, "%s", sep);
2818             if (i->accept())
2819             {
2820               ::fprintf(file, "(%u)", i->accepts());
2821             }
2822             else
2823             {
2824               if (i->iter())
2825                 ::fprintf(file, "%u.", i->iter());
2826               ::fprintf(file, "%u", i->loc());
2827             }
2828             if (i->lazy())
2829               ::fprintf(file, "?%u", i->lazy());
2830             if (i->anchor())
2831               ::fprintf(file, "^");
2832             if (i->greedy())
2833               ::fprintf(file, "!");
2834             if (i->ticked())
2835               ::fprintf(file, "'");
2836             if (i->negate())
2837               ::fprintf(file, "-");
2838             if (k++ % n)
2839               sep = " ";
2840             else
2841               sep = "\\n";
2842           }
2843           if ((state->accept && !state->redo) || !state->heads.empty() || !state->tails.empty())
2844             ::fprintf(file, "\\n");
2845 #endif
2846           if (state->accept > 0 && !state->redo)
2847             ::fprintf(file, "[%u]", state->accept);
2848           for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2849             ::fprintf(file, "%u>", *i);
2850           for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2851             ::fprintf(file, "<%u", *i);
2852           if (state->redo)
2853             ::fprintf(file, "\",style=dashed,peripheries=1];\n");
2854           else if (state->accept > 0)
2855             ::fprintf(file, "\",peripheries=2];\n");
2856           else if (!state->heads.empty())
2857             ::fprintf(file, "\",style=dashed,peripheries=2];\n");
2858           else
2859             ::fprintf(file, "\"];\n");
2860           for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2861           {
2862 #if WITH_COMPACT_DFA == -1
2863             Char lo = i->first;
2864             Char hi = i->second.first;
2865 #else
2866             Char hi = i->first;
2867             Char lo = i->second.first;
2868 #endif
2869             if (!is_meta(lo))
2870             {
2871               ::fprintf(file, "\t\tN%p -> N%p [label=\"", (void*)state, (void*)i->second.second);
2872               if (lo >= '\a' && lo <= '\r')
2873                 ::fprintf(file, "\\\\%c", "abtnvfr"[lo - '\a']);
2874               else if (lo == '"')
2875                 ::fprintf(file, "\\\"");
2876               else if (lo == '\\')
2877                 ::fprintf(file, "\\\\");
2878               else if (std::isgraph(lo))
2879                 ::fprintf(file, "%c", lo);
2880               else if (lo < 8)
2881                 ::fprintf(file, "\\\\%u", lo);
2882               else
2883                 ::fprintf(file, "\\\\x%02x", lo);
2884               if (lo != hi)
2885               {
2886                 ::fprintf(file, "-");
2887                 if (hi >= '\a' && hi <= '\r')
2888                   ::fprintf(file, "\\\\%c", "abtnvfr"[hi - '\a']);
2889                 else if (hi == '"')
2890                   ::fprintf(file, "\\\"");
2891                 else if (hi == '\\')
2892                   ::fprintf(file, "\\\\");
2893                 else if (std::isgraph(hi))
2894                   ::fprintf(file, "%c", hi);
2895                 else if (hi < 8)
2896                   ::fprintf(file, "\\\\%u", hi);
2897                 else
2898                   ::fprintf(file, "\\\\x%02x", hi);
2899               }
2900               ::fprintf(file, "\"];\n");
2901             }
2902             else
2903             {
2904               do
2905               {
2906                 ::fprintf(file, "\t\tN%p -> N%p [label=\"%s\",style=\"dashed\"];\n", (void*)state, (void*)i->second.second, meta_label[lo - META_MIN]);
2907               } while (++lo <= hi);
2908             }
2909           }
2910           if (state->redo)
2911             ::fprintf(file, "\t\tN%p -> R%p;\n\t\tR%p [peripheries=0,label=\"redo\"];\n", (void*)state, (void*)state, (void*)state);
2912         }
2913         ::fprintf(file, "}\n");
2914         if (file != stdout)
2915           ::fclose(file);
2916       }
2917     }
2918   }
2919 }
2920 
export_code() const2921 void Pattern::export_code() const
2922 {
2923   if (nop_ == 0)
2924     return;
2925   if (opt_.o)
2926     return;
2927   for (std::vector<std::string>::const_iterator i = opt_.f.begin(); i != opt_.f.end(); ++i)
2928   {
2929     const std::string& filename = *i;
2930     size_t len = filename.length();
2931     if ((len > 2 && filename.compare(len - 2, 2, ".h"  ) == 0)
2932      || (len > 4 && filename.compare(len - 4, 4, ".hpp") == 0)
2933      || (len > 4 && filename.compare(len - 4, 4, ".cpp") == 0)
2934      || (len > 3 && filename.compare(len - 3, 3, ".cc" ) == 0))
2935     {
2936       FILE *file = NULL;
2937       int err = 0;
2938       if (filename.compare(0, 7, "stdout.") == 0)
2939         file = stdout;
2940       else if (filename.at(0) == '+')
2941         err = reflex::fopen_s(&file, filename.c_str() + 1, "a");
2942       else
2943         err = reflex::fopen_s(&file, filename.c_str(), "w");
2944       if (!err && file)
2945       {
2946         ::fprintf(file, "#ifndef REFLEX_CODE_DECL\n#include <reflex/pattern.h>\n#define REFLEX_CODE_DECL const reflex::Pattern::Opcode\n#endif\n\n");
2947         write_namespace_open(file);
2948         ::fprintf(file, "extern REFLEX_CODE_DECL reflex_code_%s[%u] =\n{\n", opt_.n.empty() ? "FSM" : opt_.n.c_str(), nop_);
2949         for (Index i = 0; i < nop_; ++i)
2950         {
2951           Opcode opcode = opc_[i];
2952           Char lo = lo_of(opcode);
2953           Char hi = hi_of(opcode);
2954           ::fprintf(file, "  0x%08X, // %u: ", opcode, i);
2955           if (is_opcode_redo(opcode))
2956           {
2957             ::fprintf(file, "REDO\n");
2958           }
2959           else if (is_opcode_take(opcode))
2960           {
2961             ::fprintf(file, "TAKE %u\n", long_index_of(opcode));
2962           }
2963           else if (is_opcode_tail(opcode))
2964           {
2965             ::fprintf(file, "TAIL %u\n", long_index_of(opcode));
2966           }
2967           else if (is_opcode_head(opcode))
2968           {
2969             ::fprintf(file, "HEAD %u\n", long_index_of(opcode));
2970           }
2971           else if (is_opcode_halt(opcode))
2972           {
2973             ::fprintf(file, "HALT\n");
2974           }
2975           else
2976           {
2977             Index index = index_of(opcode);
2978             if (index == Const::HALT)
2979             {
2980               ::fprintf(file, "HALT ON ");
2981             }
2982             else
2983             {
2984               if (index == Const::LONG)
2985               {
2986                 opcode = opc_[++i];
2987                 index = long_index_of(opcode);
2988                 ::fprintf(file, "GOTO\n  0x%08X, // %u:  FAR %u ON ", opcode, i, index);
2989               }
2990               else
2991               {
2992                 ::fprintf(file, "GOTO %u ON ", index);
2993               }
2994             }
2995             if (!is_meta(lo))
2996             {
2997               print_char(file, lo, true);
2998               if (lo != hi)
2999               {
3000                 ::fprintf(file, "-");
3001                 print_char(file, hi, true);
3002               }
3003             }
3004             else
3005             {
3006               ::fprintf(file, "%s", meta_label[lo - META_MIN]);
3007             }
3008             ::fprintf(file, "\n");
3009           }
3010         }
3011         ::fprintf(file, "};\n\n");
3012         if (opt_.p)
3013           write_predictor(file);
3014         write_namespace_close(file);
3015         if (file != stdout)
3016           ::fclose(file);
3017       }
3018     }
3019   }
3020 }
3021 
predict_match_dfa(DFA::State * start)3022 void Pattern::predict_match_dfa(DFA::State *start)
3023 {
3024   DBGLOG("BEGIN Pattern::predict_match_dfa()");
3025   DFA::State *state = start;
3026   one_ = true;
3027   while (state->accept == 0)
3028   {
3029     if (state->edges.size() != 1)
3030     {
3031       one_ = false;
3032       break;
3033     }
3034     Char lo = state->edges.begin()->first;
3035     if (!is_meta(lo) && lo == state->edges.begin()->second.first)
3036     {
3037       if (lo != state->edges.begin()->second.first)
3038         break;
3039       if (len_ >= 255)
3040       {
3041         one_ = false;
3042         break;
3043       }
3044       pre_[len_++] = static_cast<uint8_t>(lo);
3045     }
3046     else
3047     {
3048       one_ = false;
3049       break;
3050     }
3051     DFA::State *next = state->edges.begin()->second.second;
3052     if (next == NULL)
3053     {
3054       one_ = false;
3055       break;
3056     }
3057     state = next;
3058   }
3059   if (state != NULL && state->accept > 0 && !state->edges.empty())
3060     one_ = false;
3061   min_ = 0;
3062   std::memset(bit_, 0xFF, sizeof(bit_));
3063   std::memset(pmh_, 0xFF, sizeof(pmh_));
3064   std::memset(pma_, 0xFF, sizeof(pma_));
3065   if (state != NULL && state->accept == 0)
3066   {
3067     gen_predict_match(state);
3068 #ifdef DEBUG
3069     for (Char i = 0; i < 256; ++i)
3070     {
3071       if (bit_[i] != 0xFF)
3072       {
3073         if (isprint(i))
3074           DBGLOGN("bit['%c'] = %02x", i, bit_[i]);
3075         else
3076           DBGLOGN("bit[%3d] = %02x", i, bit_[i]);
3077       }
3078     }
3079     for (Hash i = 0; i < Const::HASH; ++i)
3080     {
3081       if (pmh_[i] != 0xFF)
3082       {
3083         if (isprint(pmh_[i]))
3084           DBGLOGN("pmh['%c'] = %02x", i, pmh_[i]);
3085         else
3086           DBGLOGN("pmh[%3d] = %02x", i, pmh_[i]);
3087       }
3088     }
3089     for (Hash i = 0; i < Const::HASH; ++i)
3090     {
3091       if (pma_[i] != 0xFF)
3092       {
3093         if (isprint(pma_[i]))
3094           DBGLOGN("pma['%c'] = %02x", i, pma_[i]);
3095         else
3096           DBGLOGN("pma[%3d] = %02x", i, pma_[i]);
3097       }
3098     }
3099 #endif
3100   }
3101   DBGLOGN("min = %zu", min_);
3102   DBGLOG("END Pattern::predict_match_dfa()");
3103 }
3104 
gen_predict_match(DFA::State * state)3105 void Pattern::gen_predict_match(DFA::State *state)
3106 {
3107   min_ = 8;
3108   std::map<DFA::State*,ORanges<Hash> > states[8];
3109   gen_predict_match_transitions(state, states[0]);
3110   for (int level = 1; level < 8; ++level)
3111     for (std::map<DFA::State*,ORanges<Hash> >::iterator from = states[level - 1].begin(); from != states[level - 1].end(); ++from)
3112       gen_predict_match_transitions(level, from->first, from->second, states[level]);
3113   for (Char i = 0; i < 256; ++i)
3114     bit_[i] &= (1 << min_) - 1;
3115 }
3116 
gen_predict_match_transitions(DFA::State * state,std::map<DFA::State *,ORanges<Hash>> & states)3117 void Pattern::gen_predict_match_transitions(DFA::State *state, std::map<DFA::State*,ORanges<Hash> >& states)
3118 {
3119   for (DFA::State::Edges::const_iterator edge = state->edges.begin(); edge != state->edges.end(); ++edge)
3120   {
3121     Char lo = edge->first;
3122     if (is_meta(lo))
3123     {
3124       min_ = 0;
3125       break;
3126     }
3127     DFA::State *next = edge->second.second;
3128     bool accept = (next == NULL || next->accept > 0);
3129     if (!accept)
3130     {
3131       for (DFA::State::Edges::const_iterator e = next->edges.begin(); e != next->edges.end(); ++e)
3132       {
3133         if (is_meta(e->first))
3134         {
3135           if (e == next->edges.begin())
3136             next = NULL;
3137           accept = true;
3138           break;
3139         }
3140       }
3141     }
3142     else if (next != NULL && next->edges.empty())
3143     {
3144       next = NULL;
3145     }
3146     if (accept)
3147       min_ = 1;
3148     Char hi = edge->second.first;
3149     while (lo <= hi)
3150     {
3151       bit_[lo] &= ~1;
3152       pmh_[lo] &= ~1;
3153       if (accept)
3154         pma_[lo] &= ~(1 << 7);
3155       pma_[lo] &= ~(1 << 6);
3156       if (next != NULL)
3157         states[next].insert(hash(lo));
3158       ++lo;
3159     }
3160   }
3161 }
3162 
gen_predict_match_transitions(size_t level,DFA::State * state,ORanges<Hash> & labels,std::map<DFA::State *,ORanges<Hash>> & states)3163 void Pattern::gen_predict_match_transitions(size_t level, DFA::State *state, ORanges<Hash>& labels, std::map<DFA::State*,ORanges<Hash> >& states)
3164 {
3165   for (DFA::State::Edges::const_iterator edge = state->edges.begin(); edge != state->edges.end(); ++edge)
3166   {
3167     Char lo = edge->first;
3168     if (is_meta(lo))
3169       break;
3170     DFA::State *next = level < 7 ? edge->second.second : NULL;
3171     bool accept = next == NULL || next->accept > 0;
3172     if (!accept)
3173     {
3174       for (DFA::State::Edges::const_iterator e = next->edges.begin(); e != next->edges.end(); ++e)
3175       {
3176         if (is_meta(e->first))
3177         {
3178           if (e == next->edges.begin())
3179             next = NULL;
3180           accept = true;
3181           break;
3182         }
3183       }
3184     }
3185     else if (next != NULL && next->edges.empty())
3186     {
3187       next = NULL;
3188     }
3189     if (accept && min_ > level)
3190       min_ = level + 1;
3191     if (level < 4 || level <= min_)
3192     {
3193       Char hi = edge->second.first;
3194       if (level <= min_)
3195         while (lo <= hi)
3196           bit_[lo++] &= ~(1 << level);
3197       for (ORanges<Hash>::const_iterator label = labels.begin(); label != labels.end(); ++label)
3198       {
3199         Hash label_hi = label->second - 1;
3200         for (Hash label_lo = label->first; label_lo <= label_hi; ++label_lo)
3201         {
3202           for (lo = edge->first; lo <= hi; ++lo)
3203           {
3204             Hash h = hash(label_lo, static_cast<uint8_t>(lo));
3205             pmh_[h] &= ~(1 << level);
3206             if (level < 4)
3207             {
3208               if (level == 3 || accept)
3209                 pma_[h] &= ~(1 << (7 - 2 * level));
3210               pma_[h] &= ~(1 << (6 - 2 * level));
3211             }
3212             if (next != NULL)
3213               states[next].insert(hash(h));
3214           }
3215         }
3216       }
3217     }
3218   }
3219 }
3220 
write_predictor(FILE * file) const3221 void Pattern::write_predictor(FILE *file) const
3222 {
3223   ::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (min_ > 1 && len_ == 0) * 256 + (min_ > 0) * Const::HASH);
3224   ::fprintf(file, "\n  %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4))));
3225   for (size_t i = 0; i < len_; ++i)
3226     ::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n  ", static_cast<uint8_t>(pre_[i]));
3227   if (min_ > 0)
3228   {
3229     if (min_ > 1 && len_ == 0)
3230     {
3231       for (Char i = 0; i < 256; ++i)
3232         ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n  ", static_cast<uint8_t>(~bit_[i]));
3233     }
3234     if (min_ >= 4)
3235     {
3236       for (Hash i = 0; i < Const::HASH; ++i)
3237         ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n  ", static_cast<uint8_t>(~pmh_[i]));
3238     }
3239     else
3240     {
3241       for (Hash i = 0; i < Const::HASH; ++i)
3242         ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n  ", static_cast<uint8_t>(~pma_[i]));
3243     }
3244   }
3245   ::fprintf(file, "\n};\n\n");
3246 }
3247 
write_namespace_open(FILE * file) const3248 void Pattern::write_namespace_open(FILE *file) const
3249 {
3250   if (opt_.z.empty())
3251     return;
3252 
3253   const std::string& s = opt_.z;
3254   size_t i = 0, j;
3255   while ((j = s.find("::", i)) != std::string::npos)
3256   {
3257     ::fprintf(file, "namespace %s {\n", s.substr(i, j - i).c_str());
3258     i = j + 2;
3259   }
3260   ::fprintf(file, "namespace %s {\n\n", s.substr(i).c_str());
3261 }
3262 
write_namespace_close(FILE * file) const3263 void Pattern::write_namespace_close(FILE *file) const
3264 {
3265   if (opt_.z.empty())
3266     return;
3267 
3268   const std::string& s = opt_.z;
3269   size_t i = 0, j;
3270   while ((j = s.find("::", i)) != std::string::npos)
3271   {
3272     ::fprintf(file, "} // namespace %s\n\n", s.substr(i, j - i).c_str());
3273     i = j + 2;
3274   }
3275   ::fprintf(file, "} // namespace %s\n\n", s.substr(i).c_str());
3276 }
3277 
3278 } // namespace reflex
3279