1 /******************************************************************************\
2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. *
3 * *
4 * Redistribution and use in source and binary forms, with or without *
5 * modification, are permitted provided that the following conditions are met: *
6 * *
7 * (1) Redistributions of source code must retain the above copyright notice, *
8 * this list of conditions and the following disclaimer. *
9 * *
10 * (2) Redistributions in binary form must reproduce the above copyright *
11 * notice, this list of conditions and the following disclaimer in the *
12 * documentation and/or other materials provided with the distribution. *
13 * *
14 * (3) The name of the author may not be used to endorse or promote products *
15 * derived from this software without specific prior written permission. *
16 * *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
27 \******************************************************************************/
28
29 /**
30 @file pattern.cpp
31 @brief RE/flex regular expression pattern compiler
32 @author Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 */
36
37 #include <reflex/pattern.h>
38 #include <reflex/timer.h>
39 #include <cstdlib>
40 #include <cerrno>
41 #include <cmath>
42
43 /// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression.
44 /** Edge compression reorders edges to produce fewer tests when executed in the compacted order.
45 For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges:
46 c1 = m.FSM_CHAR();
47 if ('x' <= c1 && c1 <= 'z') goto S3;
48 if ('a' <= c1 && c1 <= 'k') goto S3;
49 return m.FSM_HALT(c1);
50 */
51 #define WITH_COMPACT_DFA -1
52
53 #ifdef DEBUG
54 # define DBGLOGPOS(p) \
55 if ((p).accept()) \
56 { \
57 DBGLOGA(" (%u)", (p).accepts()); \
58 if ((p).lazy()) \
59 DBGLOGA("?%u", (p).lazy()); \
60 if ((p).greedy()) \
61 DBGLOGA("!"); \
62 } \
63 else \
64 { \
65 DBGLOGA(" "); \
66 if ((p).iter()) \
67 DBGLOGA("%u.", (p).iter()); \
68 DBGLOGA("%u", (p).loc()); \
69 if ((p).lazy()) \
70 DBGLOGA("?%u", (p).lazy()); \
71 if ((p).anchor()) \
72 DBGLOGA("^"); \
73 if ((p).greedy()) \
74 DBGLOGA("!"); \
75 if ((p).ticked()) \
76 DBGLOGA("'"); \
77 if ((p).negate()) \
78 DBGLOGA("-"); \
79 }
80 #endif
81
82 namespace reflex {
83
84 #if (defined(__WIN32__) || defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(__BORLANDC__)) && !defined(__CYGWIN__) && !defined(__MINGW32__) && !defined(__MINGW64__)
fopen_s(FILE ** file,const char * name,const char * mode)85 inline int fopen_s(FILE **file, const char *name, const char *mode) { return ::fopen_s(file, name, mode); }
86 #else
87 inline int fopen_s(FILE **file, const char *name, const char *mode) { return (*file = ::fopen(name, mode)) ? 0 : errno; }
88 #endif
89
print_char(FILE * file,int c,bool h=false)90 static void print_char(FILE *file, int c, bool h = false)
91 {
92 if (c >= '\a' && c <= '\r')
93 ::fprintf(file, "'\\%c'", "abtnvfr"[c - '\a']);
94 else if (c == '\\')
95 ::fprintf(file, "'\\\\'");
96 else if (c == '\'')
97 ::fprintf(file, "'\\''");
98 else if (std::isprint(c))
99 ::fprintf(file, "'%c'", c);
100 else if (h)
101 ::fprintf(file, "%02x", c);
102 else
103 ::fprintf(file, "%u", c);
104 }
105
106 static const char *posix_class[] = {
107 "ASCII",
108 "Space",
109 "XDigit",
110 "Cntrl",
111 "Print",
112 "Alnum",
113 "Alpha",
114 "Blank",
115 "Digit",
116 "Graph",
117 "Lower",
118 "Punct",
119 "Upper",
120 "Word",
121 };
122
123 static const char *meta_label[] = {
124 NULL,
125 "NWB",
126 "NWE",
127 "BWB",
128 "EWB",
129 "BWE",
130 "EWE",
131 "BOL",
132 "EOL",
133 "BOB",
134 "EOB",
135 "UND",
136 "IND",
137 "DED",
138 };
139
operator [](Accept choice) const140 const std::string Pattern::operator[](Accept choice) const
141 {
142 if (choice == 0)
143 return rex_;
144 if (choice <= size())
145 {
146 Location loc = end_.at(choice - 1);
147 Location prev = 0;
148 if (choice >= 2)
149 prev = end_.at(choice - 2) + 1;
150 return rex_.substr(prev, loc - prev);
151 }
152 return "";
153 }
154
error(regex_error_type code,size_t pos) const155 void Pattern::error(regex_error_type code, size_t pos) const
156 {
157 regex_error err(code, rex_, pos);
158 if (opt_.w)
159 std::cerr << err.what();
160 if (code == regex_error::exceeds_limits || opt_.r)
161 throw err;
162 }
163
init(const char * options,const uint8_t * pred)164 void Pattern::init(const char *options, const uint8_t *pred)
165 {
166 init_options(options);
167 nop_ = 0;
168 len_ = 0;
169 min_ = 0;
170 one_ = false;
171 if (opc_ != NULL || fsm_ != NULL )
172 {
173 if (pred != NULL)
174 {
175 len_ = pred[0];
176 min_ = pred[1] & 0x0f;
177 one_ = pred[1] & 0x10;
178 memcpy(pre_, pred + 2, len_);
179 if (min_ > 0)
180 {
181 size_t n = len_ + 2;
182 if (min_ > 1 && len_ == 0)
183 {
184 for (size_t i = 0; i < 256; ++i)
185 bit_[i] = ~pred[i + n];
186 n += 256;
187 }
188 if (min_ >= 4)
189 {
190 for (size_t i = 0; i < Const::HASH; ++i)
191 pmh_[i] = ~pred[i + n];
192 }
193 else
194 {
195 for (size_t i = 0; i < Const::HASH; ++i)
196 pma_[i] = ~pred[i + n];
197 }
198 }
199 }
200 }
201 else
202 {
203 Positions startpos;
204 Follow followpos;
205 Map modifiers;
206 Map lookahead;
207 // parse the regex pattern to construct the followpos NFA without epsilon transitions
208 parse(startpos, followpos, modifiers, lookahead);
209 // start state = startpos = firstpost of the followpos NFA, also merge the tree DFA root when non-NULL
210 DFA::State *start = dfa_.state(tfa_.tree, startpos);
211 // compile the NFA into a DFA
212 compile(start, followpos, modifiers, lookahead);
213 // assemble DFA opcode tables or direct code
214 assemble(start);
215 // delete the DFA
216 dfa_.clear();
217 }
218 }
219
init_options(const char * options)220 void Pattern::init_options(const char *options)
221 {
222 opt_.b = false;
223 opt_.i = false;
224 opt_.m = false;
225 opt_.o = false;
226 opt_.p = false;
227 opt_.q = false;
228 opt_.r = false;
229 opt_.s = false;
230 opt_.w = false;
231 opt_.x = false;
232 opt_.e = '\\';
233 if (options != NULL)
234 {
235 for (const char *s = options; *s != '\0'; ++s)
236 {
237 switch (*s)
238 {
239 case 'b':
240 opt_.b = true;
241 break;
242 case 'e':
243 opt_.e = (*(s += (s[1] == '=') + 1) == ';' || *s == '\0' ? 256 : *s++);
244 --s;
245 break;
246 case 'p':
247 opt_.p = true;
248 break;
249 case 'i':
250 opt_.i = true;
251 break;
252 case 'm':
253 opt_.m = true;
254 break;
255 case 'o':
256 opt_.o = true;
257 break;
258 case 'q':
259 opt_.q = true;
260 break;
261 case 'r':
262 opt_.r = true;
263 break;
264 case 's':
265 opt_.s = true;
266 break;
267 case 'w':
268 opt_.w = true;
269 break;
270 case 'x':
271 opt_.x = true;
272 break;
273 case 'z':
274 for (const char *t = s += (s[1] == '='); *s != ';' && *s != '\0'; ++t)
275 {
276 if (std::isspace(*t) || *t == ';' || *t == '\0')
277 {
278 if (t > s + 1)
279 opt_.z = std::string(s + 1, t - s - 1);
280 s = t;
281 }
282 }
283 --s;
284 break;
285 case 'f':
286 case 'n':
287 for (const char *t = s += (s[1] == '='); *s != ';' && *s != '\0'; ++t)
288 {
289 if (*t == ',' || std::isspace(*t) || *t == ';' || *t == '\0')
290 {
291 if (t > s + 1)
292 {
293 std::string name(s + 1, t - s - 1);
294 if (name.find('.') == std::string::npos)
295 opt_.n = name;
296 else
297 opt_.f.push_back(name);
298 }
299 s = t;
300 }
301 }
302 --s;
303 break;
304 }
305 }
306 }
307 }
308
parse(Positions & startpos,Follow & followpos,Map & modifiers,Map & lookahead)309 void Pattern::parse(
310 Positions& startpos,
311 Follow& followpos,
312 Map& modifiers,
313 Map& lookahead)
314 {
315 DBGLOG("BEGIN parse()");
316 if (rex_.size() > Position::MAXLOC)
317 throw regex_error(regex_error::exceeds_length, rex_, Position::MAXLOC);
318 Location len = static_cast<Location>(rex_.size());
319 Location loc = 0;
320 Accept choice = 1;
321 Lazy lazyidx = 0;
322 Positions firstpos;
323 Positions lastpos;
324 bool nullable;
325 Iter iter;
326 timer_type t;
327 timer_start(t);
328 if (at(0) == '(' && at(1) == '?')
329 {
330 loc = 2;
331 while (at(loc) == '-' || std::isalnum(at(loc)))
332 ++loc;
333 if (at(loc) == ')')
334 {
335 bool active = true;
336 loc = 2;
337 Char c;
338 while ((c = at(loc)) != ')')
339 {
340 c = at(loc);
341 if (c == '-')
342 active = false;
343 else if (c == 'i')
344 opt_.i = active;
345 else if (c == 'm')
346 opt_.m = active;
347 else if (c == 'q')
348 opt_.q = active;
349 else if (c == 's')
350 opt_.s = active;
351 else if (c == 'x')
352 opt_.x = active;
353 else
354 error(regex_error::invalid_modifier, loc);
355 ++loc;
356 }
357 ++loc;
358 }
359 else
360 {
361 loc = 0;
362 }
363 }
364 do
365 {
366 Location end = loc;
367 if (!opt_.q && !opt_.x)
368 {
369 while (true)
370 {
371 Char c = at(end);
372 if (c == '\0' || c == '|')
373 break;
374 if (c == '.' || c == '^' || c == '$' || c == '(' || c == ')' || c == '[' || c == '{' || c == '?' || c == '*' || c == '+')
375 {
376 end = loc;
377 break;
378 }
379 if (c == opt_.e)
380 {
381 c = at(++end);
382 if (c == '\0' || std::strchr("0123456789<>ABDHLNPSUWXbcdehijklpsuwxz", c) != NULL)
383 {
384 end = loc;
385 break;
386 }
387 if (c == 'Q')
388 {
389 while ((c = at(++end)) != '\0')
390 if (c == opt_.e && at(end + 1) == 'E')
391 break;
392 }
393 }
394 ++end;
395 }
396 }
397 if (loc < end)
398 {
399 // string pattern found w/o regex metas: merge string into the tree DFA
400 bool quote = false;
401 Tree::Node *t = tfa_.root();
402 while (loc < end)
403 {
404 Char c = at(loc++);
405 if (c == opt_.e)
406 {
407 if (at(loc) == 'Q')
408 {
409 quote = true;
410 ++loc;
411 continue;
412 }
413 if (at(loc) == 'E')
414 {
415 quote = false;
416 ++loc;
417 continue;
418 }
419 if (!quote)
420 {
421 static const char abtnvfr[] = "abtnvfr";
422 c = at(loc++);
423 const char *s = std::strchr(abtnvfr, c);
424 if (s != NULL)
425 c = static_cast<Char>(s - abtnvfr + '\a');
426 }
427 }
428 else if (c >= 'A' && c <= 'Z' && opt_.i)
429 {
430 c = lowercase(c);
431 }
432 t = tfa_.edge(t, c);
433 }
434 if (t->accept == 0)
435 t->accept = choice;
436 }
437 else
438 {
439 Lazyset lazyset;
440 parse2(
441 true,
442 loc,
443 firstpos,
444 lastpos,
445 nullable,
446 followpos,
447 lazyidx,
448 lazyset,
449 modifiers,
450 lookahead[choice],
451 iter);
452 end_.push_back(loc);
453 set_insert(startpos, firstpos);
454 if (nullable)
455 {
456 if (lazyset.empty())
457 {
458 startpos.insert(Position(choice).accept(true));
459 }
460 else
461 {
462 for (Lazyset::const_iterator l = lazyset.begin(); l != lazyset.end(); ++l)
463 startpos.insert(Position(choice).accept(true).lazy(*l));
464 }
465 }
466 for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
467 {
468 if (lazyset.empty())
469 {
470 followpos[p->pos()].insert(Position(choice).accept(true));
471 }
472 else
473 {
474 for (Lazyset::const_iterator l = lazyset.begin(); l != lazyset.end(); ++l)
475 followpos[p->pos()].insert(Position(choice).accept(true).lazy(*l));
476 }
477 }
478 }
479 if (++choice == 0)
480 error(regex_error::exceeds_limits, loc); // overflow: too many top-level alternations (should never happen)
481 } while (at(loc++) == '|');
482 --loc;
483 if (at(loc) == ')')
484 error(regex_error::mismatched_parens, loc);
485 else if (at(loc) != 0)
486 error(regex_error::invalid_syntax, loc);
487 if (opt_.i)
488 update_modified('i', modifiers, 0, len - 1);
489 if (opt_.m)
490 update_modified('m', modifiers, 0, len - 1);
491 if (opt_.s)
492 update_modified('s', modifiers, 0, len - 1);
493 pms_ = timer_elapsed(t);
494 #ifdef DEBUG
495 DBGLOGN("startpos = {");
496 for (Positions::const_iterator p = startpos.begin(); p != startpos.end(); ++p)
497 DBGLOGPOS(*p);
498 DBGLOGA(" }");
499 for (Follow::const_iterator fp = followpos.begin(); fp != followpos.end(); ++fp)
500 {
501 DBGLOGN("followpos(");
502 DBGLOGPOS(fp->first);
503 DBGLOGA(" ) = {");
504 for (Positions::const_iterator p = fp->second.begin(); p != fp->second.end(); ++p)
505 DBGLOGPOS(*p);
506 DBGLOGA(" }");
507 }
508 #endif
509 DBGLOG("END parse()");
510 }
511
parse1(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)512 void Pattern::parse1(
513 bool begin,
514 Location& loc,
515 Positions& firstpos,
516 Positions& lastpos,
517 bool& nullable,
518 Follow& followpos,
519 Lazy& lazyidx,
520 Lazyset& lazyset,
521 Map& modifiers,
522 Locations& lookahead,
523 Iter& iter)
524 {
525 DBGLOG("BEGIN parse1(%u)", loc);
526 parse2(
527 begin,
528 loc,
529 firstpos,
530 lastpos,
531 nullable,
532 followpos,
533 lazyidx,
534 lazyset,
535 modifiers,
536 lookahead,
537 iter);
538 Positions firstpos1;
539 Positions lastpos1;
540 bool nullable1;
541 Lazyset lazyset1;
542 Iter iter1;
543 while (at(loc) == '|')
544 {
545 ++loc;
546 parse2(
547 begin,
548 loc,
549 firstpos1,
550 lastpos1,
551 nullable1,
552 followpos,
553 lazyidx,
554 lazyset1,
555 modifiers,
556 lookahead,
557 iter1);
558 set_insert(firstpos, firstpos1);
559 set_insert(lastpos, lastpos1);
560 set_insert(lazyset, lazyset1);
561 if (nullable1)
562 nullable = true;
563 if (iter1 > iter)
564 iter = iter1;
565 }
566 DBGLOG("END parse1()");
567 }
568
parse2(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)569 void Pattern::parse2(
570 bool begin,
571 Location& loc,
572 Positions& firstpos,
573 Positions& lastpos,
574 bool& nullable,
575 Follow& followpos,
576 Lazy& lazyidx,
577 Lazyset& lazyset,
578 Map& modifiers,
579 Locations& lookahead,
580 Iter& iter)
581 {
582 DBGLOG("BEGIN parse2(%u)", loc);
583 Positions a_pos;
584 Char c;
585 if (begin)
586 {
587 while (true)
588 {
589 if (opt_.x)
590 while (std::isspace(at(loc)))
591 ++loc;
592 if (at(loc) == '^')
593 {
594 a_pos.insert(Position(loc++));
595 begin = false; // CHECKED algorithmic options: 7/29 but does not allow ^ as a pattern
596 }
597 else if (escapes_at(loc, "ABb<>"))
598 {
599 a_pos.insert(Position(loc));
600 loc += 2;
601 begin = false; // CHECKED algorithmic options: 7/29 but does not allow \b as a pattern
602 }
603 else
604 {
605 if (escapes_at(loc, "ij"))
606 begin = false;
607 break;
608 }
609 }
610 }
611 if (begin || ((c = at(loc)) != '\0' && c != '|' && c != ')'))
612 {
613 parse3(
614 begin,
615 loc,
616 firstpos,
617 lastpos,
618 nullable,
619 followpos,
620 lazyidx,
621 lazyset,
622 modifiers,
623 lookahead,
624 iter);
625 Positions firstpos1;
626 Positions lastpos1;
627 bool nullable1;
628 Lazyset lazyset1;
629 Iter iter1;
630 while ((c = at(loc)) != '\0' && c != '|' && c != ')')
631 {
632 parse3(
633 false,
634 loc,
635 firstpos1,
636 lastpos1,
637 nullable1,
638 followpos,
639 lazyidx,
640 lazyset1,
641 modifiers,
642 lookahead,
643 iter1);
644 if (!lazyset.empty()) // CHECKED this is an extra rule for + only and (may) not be needed for *
645 {
646 // CHECKED algorithmic options: lazy(lazyset, firstpos1); does not work for (a|b)*?a*b+, below works
647 Positions firstpos2;
648 lazy(lazyset, firstpos1, firstpos2);
649 set_insert(firstpos1, firstpos2);
650 // if (lazyset1.empty())
651 // greedy(firstpos1); // CHECKED algorithmic options: 8/1 works except fails for ((a|b)*?b){2} and (a|b)??(a|b)??aa
652 }
653 if (nullable)
654 set_insert(firstpos, firstpos1);
655 for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
656 set_insert(followpos[p->pos()], firstpos1);
657 if (nullable1)
658 {
659 set_insert(lastpos, lastpos1);
660 set_insert(lazyset, lazyset1); // CHECKED 10/21
661 }
662 else
663 {
664 lastpos.swap(lastpos1);
665 lazyset.swap(lazyset1); // CHECKED 10/21
666 nullable = false;
667 }
668 // CHECKED 10/21 set_insert(lazyset, lazyset1);
669 if (iter1 > iter)
670 iter = iter1;
671 }
672 }
673 for (Positions::const_iterator p = a_pos.begin(); p != a_pos.end(); ++p)
674 {
675 for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
676 if (at(k->loc()) == ')' && lookahead.find(k->loc()) != lookahead.end())
677 followpos[p->pos()].insert(*k);
678 for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
679 followpos[k->pos()].insert(p->anchor(!nullable || k->pos() != p->pos()));
680 lastpos.clear();
681 lastpos.insert(*p);
682 if (nullable || firstpos.empty())
683 {
684 firstpos.insert(*p);
685 nullable = false;
686 }
687 }
688 DBGLOG("END parse2()");
689 }
690
parse3(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)691 void Pattern::parse3(
692 bool begin,
693 Location& loc,
694 Positions& firstpos,
695 Positions& lastpos,
696 bool& nullable,
697 Follow& followpos,
698 Lazy& lazyidx,
699 Lazyset& lazyset,
700 Map& modifiers,
701 Locations& lookahead,
702 Iter& iter)
703 {
704 DBGLOG("BEGIN parse3(%u)", loc);
705 Position b_pos(loc);
706 parse4(
707 begin,
708 loc,
709 firstpos,
710 lastpos,
711 nullable,
712 followpos,
713 lazyidx,
714 lazyset,
715 modifiers,
716 lookahead,
717 iter);
718 Char c = at(loc);
719 if (opt_.x)
720 while (std::isspace(c))
721 c = at(++loc);
722 while (true)
723 {
724 if (c == '*' || c == '+' || c == '?')
725 {
726 if (c == '*' || c == '?')
727 nullable = true;
728 if (at(++loc) == '?')
729 {
730 if (++lazyidx == 0)
731 error(regex_error::exceeds_limits, loc); // overflow: exceeds max 255 lazy quantifiers
732 lazyset.insert(lazyidx);
733 if (nullable)
734 lazy(lazyset, firstpos);
735 ++loc;
736 }
737 else
738 {
739 // CHECKED algorithmic options: 7/30 if (!nullable)
740 // CHECKED algorithmic options: 7/30 lazyset.clear();
741 greedy(firstpos);
742 }
743 if (c == '+' && !nullable && !lazyset.empty())
744 {
745 Positions firstpos1;
746 lazy(lazyset, firstpos, firstpos1);
747 for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
748 set_insert(followpos[p->pos()], firstpos1);
749 set_insert(firstpos, firstpos1);
750 }
751 else if (c == '*' || c == '+')
752 {
753 for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
754 set_insert(followpos[p->pos()], firstpos);
755 }
756 }
757 else if (c == '{') // {n,m} repeat min n times to max m
758 {
759 size_t k = 0;
760 for (Location i = 0; i < 7 && std::isdigit(c = at(++loc)); ++i)
761 k = 10 * k + (c - '0');
762 if (k > Position::MAXITER)
763 error(regex_error::exceeds_limits, loc);
764 Iter n = static_cast<Iter>(k);
765 Iter m = n;
766 bool unlimited = false;
767 if (at(loc) == ',')
768 {
769 if (std::isdigit(at(loc + 1)))
770 {
771 m = 0;
772 for (Location i = 0; i < 7 && std::isdigit(c = at(++loc)); ++i)
773 m = 10 * m + (c - '0');
774 }
775 else
776 {
777 unlimited = true;
778 ++loc;
779 }
780 }
781 if (at(loc) == '}')
782 {
783 bool nullable1 = nullable;
784 if (n == 0)
785 nullable = true;
786 if (n > m)
787 error(regex_error::invalid_repeat, loc);
788 if (at(++loc) == '?')
789 {
790 if (++lazyidx == 0)
791 error(regex_error::exceeds_limits, loc); // overflow: exceeds max 255 lazy quantifiers
792 lazyset.insert(lazyidx);
793 if (nullable)
794 lazy(lazyset, firstpos);
795 /* CHECKED algorithmic options: 8/1 else
796 {
797 lazy(lazyset, firstpos, firstpos1);
798 set_insert(firstpos, firstpos1);
799 pfirstpos = &firstpos1;
800 } */
801 ++loc;
802 }
803 else
804 {
805 // CHECKED algorithmic options 7/30 if (!nullable)
806 // CHECKED algorithmic options 7/30 lazyset.clear();
807 if (n < m && lazyset.empty())
808 greedy(firstpos);
809 }
810 // CHECKED added pfirstpos to point to updated firstpos with lazy quants
811 Positions firstpos1, *pfirstpos = &firstpos;
812 if (!nullable && !lazyset.empty()) // CHECKED algorithmic options 8/1 added to make ((a|b)*?b){2} work
813 {
814 lazy(lazyset, firstpos, firstpos1);
815 pfirstpos = &firstpos1;
816 }
817 if (nullable && unlimited) // {0,} == *
818 {
819 for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
820 set_insert(followpos[p->pos()], *pfirstpos);
821 }
822 else if (m > 0)
823 {
824 if (iter * m > Position::MAXITER)
825 error(regex_error::exceeds_limits, loc);
826 // update followpos by virtually repeating sub-regex m-1 times
827 Follow followpos1;
828 for (Follow::const_iterator fp = followpos.begin(); fp != followpos.end(); ++fp)
829 if (fp->first.loc() >= b_pos)
830 for (Iter i = 0; i < m - 1; ++i)
831 for (Positions::const_iterator p = fp->second.begin(); p != fp->second.end(); ++p)
832 followpos1[fp->first.iter(iter * (i + 1))].insert(p->iter(iter * (i + 1)));
833 for (Follow::const_iterator fp = followpos1.begin(); fp != followpos1.end(); ++fp)
834 set_insert(followpos[fp->first], fp->second);
835 // add m-1 times virtual concatenation (by indexed positions k.i)
836 for (Iter i = 0; i < m - 1; ++i)
837 for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
838 for (Positions::const_iterator j = pfirstpos->begin(); j != pfirstpos->end(); ++j)
839 followpos[k->pos().iter(iter * i)].insert(j->iter(iter * i + iter));
840 if (unlimited)
841 for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
842 for (Positions::const_iterator j = pfirstpos->begin(); j != pfirstpos->end(); ++j)
843 followpos[k->pos().iter(iter * (m - 1))].insert(j->iter(iter * (m - 1)));
844 if (nullable1)
845 {
846 // extend firstpos when sub-regex is nullable
847 Positions firstpos1 = *pfirstpos;
848 for (Iter i = 1; i <= m - 1; ++i)
849 for (Positions::const_iterator k = firstpos1.begin(); k != firstpos1.end(); ++k)
850 firstpos.insert(k->iter(iter * i));
851 }
852 // n to m-1 are optional with all 0 to m-1 are optional when nullable
853 Positions lastpos1;
854 for (Iter i = (nullable ? 0 : n - 1); i <= m - 1; ++i)
855 for (Positions::const_iterator k = lastpos.begin(); k != lastpos.end(); ++k)
856 lastpos1.insert(k->iter(iter * i));
857 lastpos.swap(lastpos1);
858 iter *= m;
859 }
860 else // zero range {0}
861 {
862 firstpos.clear();
863 lastpos.clear();
864 lazyset.clear();
865 }
866 }
867 else
868 {
869 error(regex_error::invalid_repeat, loc);
870 }
871 }
872 else
873 {
874 break;
875 }
876 c = at(loc);
877 }
878 DBGLOG("END parse3()");
879 }
880
parse4(bool begin,Location & loc,Positions & firstpos,Positions & lastpos,bool & nullable,Follow & followpos,Lazy & lazyidx,Lazyset & lazyset,Map & modifiers,Locations & lookahead,Iter & iter)881 void Pattern::parse4(
882 bool begin,
883 Location& loc,
884 Positions& firstpos,
885 Positions& lastpos,
886 bool& nullable,
887 Follow& followpos,
888 Lazy& lazyidx,
889 Lazyset& lazyset,
890 Map& modifiers,
891 Locations& lookahead,
892 Iter& iter)
893 {
894 DBGLOG("BEGIN parse4(%u)", loc);
895 firstpos.clear();
896 lastpos.clear();
897 nullable = true;
898 lazyset.clear();
899 iter = 1;
900 Char c = at(loc);
901 if (c == '(')
902 {
903 if (at(++loc) == '?')
904 {
905 c = at(++loc);
906 if (c == '#') // (?# comment
907 {
908 while ((c = at(++loc)) != '\0' && c != ')')
909 continue;
910 if (c == ')')
911 ++loc;
912 }
913 else if (c == '^') // (?^ negative pattern to be ignored (new mode), producing a redo match
914 {
915 Positions firstpos1;
916 ++loc;
917 parse1(
918 begin,
919 loc,
920 firstpos1,
921 lastpos,
922 nullable,
923 followpos,
924 lazyidx,
925 lazyset,
926 modifiers,
927 lookahead,
928 iter);
929 for (Positions::iterator p = firstpos1.begin(); p != firstpos1.end(); ++p)
930 firstpos.insert(p->negate(true));
931 }
932 else if (c == '=') // (?= lookahead
933 {
934 Position l_pos(loc++ - 2); // lookahead at (
935 parse1(
936 begin,
937 loc,
938 firstpos,
939 lastpos,
940 nullable,
941 followpos,
942 lazyidx,
943 lazyset,
944 modifiers,
945 lookahead,
946 iter);
947 firstpos.insert(l_pos);
948 if (nullable)
949 lastpos.insert(l_pos);
950 if (lookahead.find(l_pos.loc(), loc) == lookahead.end()) // do not permit nested lookaheads
951 lookahead.insert(l_pos.loc(), loc); // lookstop at )
952 for (Positions::const_iterator p = lastpos.begin(); p != lastpos.end(); ++p)
953 followpos[p->pos()].insert(Position(loc).ticked(true));
954 lastpos.insert(Position(loc).ticked(true));
955 if (nullable)
956 {
957 firstpos.insert(Position(loc).ticked(true));
958 lastpos.insert(l_pos);
959 }
960 }
961 else if (c == ':')
962 {
963 ++loc;
964 parse1(
965 begin,
966 loc,
967 firstpos,
968 lastpos,
969 nullable,
970 followpos,
971 lazyidx,
972 lazyset,
973 modifiers,
974 lookahead,
975 iter);
976 }
977 else
978 {
979 Location m_loc = loc;
980 bool opt_q = opt_.q;
981 bool opt_x = opt_.x;
982 bool active = true;
983 do
984 {
985 if (c == '-')
986 active = false;
987 else if (c == 'q')
988 opt_.q = active;
989 else if (c == 'x')
990 opt_.x = active;
991 else if (c != 'i' && c != 'm' && c != 's')
992 error(regex_error::invalid_modifier, loc);
993 c = at(++loc);
994 } while (c != '\0' && c != ':' && c != ')');
995 if (c != '\0')
996 ++loc;
997 // enforce (?imqsx) modes
998 parse1(
999 begin,
1000 loc,
1001 firstpos,
1002 lastpos,
1003 nullable,
1004 followpos,
1005 lazyidx,
1006 lazyset,
1007 modifiers,
1008 lookahead,
1009 iter);
1010 active = true;
1011 do
1012 {
1013 c = at(m_loc++);
1014 if (c == '-')
1015 {
1016 active = false;
1017 }
1018 else if (c != '\0' && c != 'q' && c != 'x' && c != ':' && c != ')')
1019 {
1020 if (active)
1021 update_modified(c, modifiers, m_loc, loc);
1022 else
1023 update_modified(uppercase(c), modifiers, m_loc, loc);
1024 }
1025 } while (c != '\0' && c != ':' && c != ')');
1026 opt_.q = opt_q;
1027 opt_.x = opt_x;
1028 }
1029 }
1030 else
1031 {
1032 parse1(
1033 begin,
1034 loc,
1035 firstpos,
1036 lastpos,
1037 nullable,
1038 followpos,
1039 lazyidx,
1040 lazyset,
1041 modifiers,
1042 lookahead,
1043 iter);
1044 }
1045 if (c != ')')
1046 {
1047 if (at(loc) == ')')
1048 ++loc;
1049 else
1050 error(regex_error::mismatched_parens, loc);
1051 }
1052 }
1053 else if (c == '[')
1054 {
1055 firstpos.insert(loc);
1056 lastpos.insert(loc);
1057 nullable = false;
1058 if ((c = at(++loc)) == '^')
1059 c = at(++loc);
1060 while (c != '\0')
1061 {
1062 if (c == '[' && at(loc + 1) == ':')
1063 {
1064 size_t c_loc = find_at(loc + 2, ':');
1065 if (c_loc != std::string::npos && at(static_cast<Location>(c_loc + 1)) == ']')
1066 loc = static_cast<Location>(c_loc + 1);
1067 }
1068 else if (c == opt_.e && !opt_.b)
1069 {
1070 ++loc;
1071 }
1072 if ((c = at(++loc)) == ']')
1073 break;
1074 }
1075 if (c == '\0')
1076 error(regex_error::mismatched_brackets, loc);
1077 ++loc;
1078 }
1079 else if ((c == '"' && opt_.q) || escape_at(loc) == 'Q')
1080 {
1081 bool quoted = (c == '"');
1082 if (!quoted)
1083 ++loc;
1084 Location q_loc = ++loc;
1085 c = at(loc);
1086 if (c != '\0' && (quoted ? c != '"' : c != opt_.e || at(loc + 1) != 'E'))
1087 {
1088 firstpos.insert(loc);
1089 Position p;
1090 do
1091 {
1092 if (quoted && c == opt_.e && at(loc + 1) == '"')
1093 ++loc;
1094 if (p != Position::NPOS)
1095 followpos[p.pos()].insert(loc);
1096 p = loc++;
1097 c = at(loc);
1098 } while (c != '\0' && (!quoted || c != '"') && (quoted || c != opt_.e || at(loc + 1) != 'E'));
1099 lastpos.insert(p);
1100 nullable = false;
1101 modifiers['q'].insert(q_loc, loc - 1);
1102 }
1103 if (!quoted && at(loc) != '\0')
1104 ++loc;
1105 if (at(loc) != '\0')
1106 ++loc;
1107 else
1108 error(regex_error::mismatched_quotation, loc);
1109 }
1110 else if (c == '#' && opt_.x)
1111 {
1112 ++loc;
1113 while ((c = at(loc)) != '\0' && c != '\n')
1114 ++loc;
1115 if (c == '\n')
1116 ++loc;
1117 }
1118 else if (std::isspace(c) && opt_.x)
1119 {
1120 ++loc;
1121 }
1122 else if (c == ')')
1123 {
1124 if (begin)
1125 error(regex_error::empty_expression, loc++);
1126 else
1127 error(regex_error::mismatched_parens, loc++);
1128 }
1129 else if (c == '}')
1130 {
1131 error(regex_error::mismatched_braces, loc++);
1132 }
1133 else if (c != '\0' && c != '|' && c != '?' && c != '*' && c != '+')
1134 {
1135 firstpos.insert(loc);
1136 lastpos.insert(loc);
1137 nullable = false;
1138 if (c == opt_.e)
1139 (void)parse_esc(loc);
1140 else
1141 ++loc;
1142 }
1143 else if (begin && c != '\0') // permits empty regex pattern but not empty subpatterns
1144 {
1145 error(regex_error::empty_expression, loc);
1146 }
1147 DBGLOG("END parse4()");
1148 }
1149
parse_esc(Location & loc,Chars * chars) const1150 Pattern::Char Pattern::parse_esc(Location& loc, Chars *chars) const
1151 {
1152 Char c = at(++loc);
1153 if (c == '0')
1154 {
1155 c = 0;
1156 int d = at(++loc);
1157 if (d >= '0' && d <= '7')
1158 {
1159 c = d - '0';
1160 d = at(++loc);
1161 if (d >= '0' && d <= '7')
1162 {
1163 c = (c << 3) + d - '0';
1164 d = at(++loc);
1165 if (c < 32 && d >= '0' && d <= '7')
1166 {
1167 c = (c << 3) + d - '0';
1168 ++loc;
1169 }
1170 }
1171 }
1172 }
1173 else if ((c == 'x' || c == 'u') && at(loc + 1) == '{')
1174 {
1175 c = 0;
1176 loc += 2;
1177 int d = at(loc);
1178 if (std::isxdigit(d))
1179 {
1180 c = (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1181 d = at(++loc);
1182 if (std::isxdigit(d))
1183 {
1184 c = (c << 4) + (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1185 ++loc;
1186 }
1187 }
1188 if (at(loc) == '}')
1189 ++loc;
1190 else
1191 error(regex_error::invalid_escape, loc);
1192 }
1193 else if (c == 'x' && std::isxdigit(at(loc + 1)))
1194 {
1195 int d = at(++loc);
1196 c = (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1197 d = at(++loc);
1198 if (std::isxdigit(d))
1199 {
1200 c = (c << 4) + (d > '9' ? (d | 0x20) - ('a' - 10) : d - '0');
1201 ++loc;
1202 }
1203 }
1204 else if (c == 'c')
1205 {
1206 c = at(++loc) % 32;
1207 ++loc;
1208 }
1209 else if (c == 'e')
1210 {
1211 c = 0x1B;
1212 ++loc;
1213 }
1214 else if (c == 'N')
1215 {
1216 if (chars != NULL)
1217 {
1218 chars->insert(0, 9);
1219 chars->insert(11, 255);
1220 }
1221 ++loc;
1222 c = META_EOL;
1223 }
1224 else if ((c == 'p' || c == 'P') && at(loc + 1) == '{')
1225 {
1226 loc += 2;
1227 if (chars != NULL)
1228 {
1229 size_t i;
1230 for (i = 0; i < 14; ++i)
1231 if (eq_at(loc, posix_class[i]))
1232 break;
1233 if (i < 14)
1234 posix(i, *chars);
1235 else
1236 error(regex_error::invalid_class, loc);
1237 if (c == 'P')
1238 flip(*chars);
1239 loc += static_cast<Location>(strlen(posix_class[i]));
1240 if (at(loc) == '}')
1241 ++loc;
1242 else
1243 error(regex_error::invalid_escape, loc);
1244 }
1245 else
1246 {
1247 while ((c = at(++loc)) != '\0' && c != '}')
1248 continue;
1249 if (c == '}')
1250 ++loc;
1251 else
1252 error(regex_error::invalid_escape, loc);
1253 }
1254 c = META_EOL;
1255 }
1256 else if (c != '_')
1257 {
1258 static const char abtnvfr[] = "abtnvfr";
1259 const char *s = std::strchr(abtnvfr, c);
1260 if (s != NULL)
1261 {
1262 c = static_cast<Char>(s - abtnvfr + '\a');
1263 }
1264 else
1265 {
1266 static const char escapes[] = "__sSxX________hHdD__lL__uUwW";
1267 s = std::strchr(escapes, c);
1268 if (s != NULL)
1269 {
1270 if (chars != NULL)
1271 {
1272 posix((s - escapes) / 2, *chars);
1273 if ((s - escapes) % 2)
1274 flip(*chars);
1275 }
1276 c = META_EOL;
1277 }
1278 }
1279 ++loc;
1280 }
1281 if (c <= 0xFF && chars != NULL)
1282 chars->insert(c);
1283 return c;
1284 }
1285
compile(DFA::State * start,Follow & followpos,const Map & modifiers,const Map & lookahead)1286 void Pattern::compile(
1287 DFA::State *start,
1288 Follow& followpos,
1289 const Map& modifiers,
1290 const Map& lookahead)
1291 {
1292 DBGLOG("BEGIN compile()");
1293 // init stats and timers
1294 vno_ = 0;
1295 eno_ = 0;
1296 ems_ = 0.0;
1297 timer_type vt, et;
1298 timer_start(vt);
1299 // construct the DFA
1300 acc_.resize(end_.size(), false);
1301 trim_lazy(start);
1302 // hash table with 64K entries (uint16_t indexed)
1303 DFA::State **table = new DFA::State*[65536];
1304 for (int i = 0; i < 65536; ++i)
1305 table[i] = NULL;
1306 // start state should only be discoverable (to possibly cycle back to) if no tree DFA was constructed
1307 if (start->tnode == NULL)
1308 table[hash_pos(start)] = start;
1309 // last added state
1310 DFA::State *last_state = start;
1311 for (DFA::State *state = start; state; state = state->next)
1312 {
1313 Moves moves;
1314 timer_start(et);
1315 // use the tree DFA accept state, if present
1316 if (state->tnode != NULL && state->tnode->accept > 0)
1317 state->accept = state->tnode->accept;
1318 compile_transition(
1319 state,
1320 followpos,
1321 modifiers,
1322 lookahead,
1323 moves);
1324 if (state->tnode != NULL)
1325 {
1326 // merge tree DFA transitions into the final DFA transitions to target states
1327 if (moves.empty())
1328 {
1329 // no DFA transitions: the final DFA transitions are the tree DFA transitions to target states
1330 for (Char c = 0; c < 256; ++c)
1331 {
1332 if (state->tnode->edge[c] != NULL)
1333 {
1334 DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c]);
1335 if (opt_.i && std::isalpha(c))
1336 {
1337 state->edges[lowercase(c)] = std::pair<Char,DFA::State*>(lowercase(c), target_state);
1338 state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
1339 eno_ += 2;
1340 }
1341 else
1342 {
1343 state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1344 ++eno_;
1345 }
1346 }
1347 }
1348 }
1349 else
1350 {
1351 // combine the tree DFA transitions with the regex DFA transition moves
1352 Chars chars;
1353 for (Char c = 0; c < 256; ++c)
1354 if (state->tnode->edge[c] != NULL)
1355 chars.insert(c);
1356 if (opt_.i)
1357 for (Char c = 'a'; c <= 'z'; ++c)
1358 if (state->tnode->edge[c] != NULL)
1359 chars.insert(uppercase(c));
1360 Moves::iterator i = moves.begin();
1361 Moves::iterator end = moves.end();
1362 while (i != end)
1363 {
1364 if (chars.intersects(i->first))
1365 {
1366 // tree DFA transitions intersect with this DFA transition move
1367 Chars common = chars & i->first;
1368 chars -= common;
1369 Char lo = common.lo();
1370 Char hi = common.hi();
1371 for (Char c = lo; c <= hi; ++c)
1372 {
1373 if (common.contains(c))
1374 {
1375 Positions pos(i->second);
1376 if (opt_.i && std::isalpha(c))
1377 {
1378 if (c >= 'a' && c <= 'z')
1379 {
1380 DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c], pos);
1381 state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1382 state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
1383 eno_ += 2;
1384 }
1385 }
1386 else
1387 {
1388 DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c], pos);
1389 state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1390 ++eno_;
1391 }
1392 }
1393 }
1394 i->first -= common;
1395 if (i->first.any())
1396 ++i;
1397 else
1398 moves.erase(i++);
1399 }
1400 else
1401 {
1402 ++i;
1403 }
1404 }
1405 if (opt_.i)
1406 {
1407 // normalize by removing upper case if option i (case insensitivem matching) is enabled
1408 static const uint64_t upper[5] = { 0x0000000000000000, 0x0000000007FFFFFE, 0, 0, 0 };
1409 chars -= Chars(upper);
1410 }
1411 if (chars.any())
1412 {
1413 Char lo = chars.lo();
1414 Char hi = chars.hi();
1415 for (Char c = lo; c <= hi; ++c)
1416 {
1417 if (chars.contains(c))
1418 {
1419 DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edge[c]);
1420 if (opt_.i && std::isalpha(c))
1421 {
1422 state->edges[lowercase(c)] = std::pair<Char,DFA::State*>(lowercase(c), target_state);
1423 state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
1424 eno_ += 2;
1425 }
1426 else
1427 {
1428 state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
1429 ++eno_;
1430 }
1431 }
1432 }
1433 }
1434 }
1435 }
1436 ems_ += timer_elapsed(et);
1437 Moves::iterator end = moves.end();
1438 for (Moves::iterator i = moves.begin(); i != end; ++i)
1439 {
1440 Positions& pos = i->second;
1441 if (!pos.empty())
1442 {
1443 uint16_t h = hash_pos(&pos);
1444 DFA::State **branch_ptr = &table[h];
1445 DFA::State *target_state = *branch_ptr;
1446 // binary search the target state for a possible matching state in the hash table overflow tree
1447 while (target_state != NULL)
1448 {
1449 if (pos < *target_state)
1450 target_state = *(branch_ptr = &target_state->left);
1451 else if (pos > *target_state)
1452 target_state = *(branch_ptr = &target_state->right);
1453 else
1454 break;
1455 }
1456 if (target_state == NULL)
1457 {
1458 target_state = last_state = last_state->next = dfa_.state(NULL, pos);
1459 if (branch_ptr != NULL)
1460 *branch_ptr = target_state;
1461 else
1462 table[h] = target_state;
1463 }
1464 Char lo = i->first.lo();
1465 Char max = i->first.hi();
1466 #ifdef DEBUG
1467 DBGLOGN("from state %p on %02x-%02x move to {", state, lo, max);
1468 for (Positions::const_iterator p = pos.begin(); p != pos.end(); ++p)
1469 DBGLOGPOS(*p);
1470 DBGLOGN(" } = state %p", target_state);
1471 #endif
1472 while (lo <= max)
1473 {
1474 if (i->first.contains(lo))
1475 {
1476 Char hi = lo + 1;
1477 while (hi <= max && i->first.contains(hi))
1478 ++hi;
1479 --hi;
1480 #if WITH_COMPACT_DFA == -1
1481 state->edges[lo] = std::pair<Char,DFA::State*>(hi, target_state);
1482 #else
1483 state->edges[hi] = std::pair<Char,DFA::State*>(lo, target_state);
1484 #endif
1485 eno_ += hi - lo + 1;
1486 lo = hi + 1;
1487 }
1488 ++lo;
1489 }
1490 }
1491 }
1492 if (state->accept > 0 && state->accept <= end_.size())
1493 acc_[state->accept - 1] = true;
1494 ++vno_;
1495 }
1496 delete[] table;
1497 tfa_.clear();
1498 vms_ = timer_elapsed(vt) - ems_;
1499 DBGLOG("END compile()");
1500 }
1501
lazy(const Lazyset & lazyset,Positions & pos) const1502 void Pattern::lazy(
1503 const Lazyset& lazyset,
1504 Positions& pos) const
1505 {
1506 if (!lazyset.empty())
1507 {
1508 Positions pos1;
1509 lazy(lazyset, pos, pos1);
1510 pos.swap(pos1);
1511 }
1512 }
1513
lazy(const Lazyset & lazyset,const Positions & pos,Positions & pos1) const1514 void Pattern::lazy(
1515 const Lazyset& lazyset,
1516 const Positions& pos,
1517 Positions& pos1) const
1518 {
1519 for (Positions::const_iterator p = pos.begin(); p != pos.end(); ++p)
1520 for (Lazyset::const_iterator l = lazyset.begin(); l != lazyset.end(); ++l)
1521 // pos1.insert(p->lazy() ? *p : p->lazy(*l)); // CHECKED algorithmic options: only if p is not already lazy??
1522 pos1.insert(p->lazy(*l)); // overrides lazyness even when p is already lazy
1523 }
1524
greedy(Positions & pos) const1525 void Pattern::greedy(Positions& pos) const
1526 {
1527 Positions pos1;
1528 for (Positions::const_iterator p = pos.begin(); p != pos.end(); ++p)
1529 pos1.insert(p->lazy() ? *p : p->greedy(true)); // CHECKED algorithmic options: 7/29 guard added: p->lazy() ? *p : p->greedy(true)
1530 // CHECKED 10/21 pos1.insert(p->lazy(0).greedy(true));
1531 pos.swap(pos1);
1532 }
1533
trim_anchors(Positions & follow,const Position p) const1534 void Pattern::trim_anchors(Positions& follow, const Position p) const
1535 {
1536 #ifdef DEBUG
1537 DBGLOG("trim_anchors({");
1538 for (Positions::const_iterator q = follow.begin(); q != follow.end(); ++q)
1539 DBGLOGPOS(*q);
1540 DBGLOGA(" }, %u)", p.loc());
1541 #endif
1542 Positions::iterator q = follow.begin();
1543 Positions::iterator end = follow.end();
1544 // check if we follow into an accepting state, if so trim follow state to remove back edges and cyclic anchors e.g. (^$)*
1545 while (q != end && !q->accept())
1546 ++q;
1547 if (q != end)
1548 {
1549 q = follow.begin();
1550 if (p.anchor())
1551 {
1552 while (q != end)
1553 {
1554 // erase if not accepting and not a begin anchor and not a ) lookahead tail
1555 if (!q->accept() && !q->anchor() && at(q->loc()) != ')')
1556 follow.erase(q++);
1557 else
1558 ++q;
1559 }
1560 }
1561 else
1562 {
1563 Location loc = p.loc();
1564 while (q != end)
1565 {
1566 // erase if not accepting and not a begin anchor and back edge
1567 if (!q->accept() && !q->anchor() && q->loc() <= loc)
1568 follow.erase(q++);
1569 else
1570 ++q;
1571 }
1572 }
1573 }
1574 #ifdef DEBUG
1575 DBGLOGA(" = {");
1576 for (Positions::const_iterator q = follow.begin(); q != follow.end(); ++q)
1577 DBGLOGPOS(*q);
1578 DBGLOG(" }");
1579 #endif
1580 }
1581
trim_lazy(Positions * pos) const1582 void Pattern::trim_lazy(Positions *pos) const
1583 {
1584 #ifdef DEBUG
1585 DBGLOG("BEGIN trim_lazy({");
1586 for (Positions::const_iterator q = pos->begin(); q != pos->end(); ++q)
1587 DBGLOGPOS(*q);
1588 DBGLOGA(" })");
1589 #endif
1590 Positions::reverse_iterator p = pos->rbegin();
1591 while (p != pos->rend() && p->lazy())
1592 {
1593 Location l = p->lazy();
1594 if (p->accept() || p->anchor()) // CHECKED algorithmic options: 7/28 added p->anchor()
1595 {
1596 pos->insert(p->lazy(0)); // make lazy accept/anchor a non-lazy accept/anchor
1597 pos->erase(--p.base());
1598 while (p != pos->rend() && !p->accept() && p->lazy() == l)
1599 {
1600 #if 0 // CHECKED algorithmic options: set to 1 to turn lazy trimming off
1601 ++p;
1602 #else
1603 pos->erase(--p.base());
1604 #endif
1605 }
1606 }
1607 else
1608 {
1609 #if 0 // CHECKED algorithmic options: 7/31
1610 if (p->greedy())
1611 {
1612 pos->insert(p->lazy(0).greedy(false));
1613 pos->erase(--p.base());
1614 }
1615 else
1616 {
1617 break; // ++p;
1618 }
1619 #else
1620 if (!p->greedy()) // stop here, greedy bit is 0 from here on
1621 break;
1622 pos->insert(p->lazy(0));
1623 pos->erase(--p.base()); // CHECKED 10/21 ++p;
1624 #endif
1625 }
1626 }
1627 #if 0 // CHECKED algorithmic options: 7/31 but results in more states
1628 while (p != pos->rend() && p->greedy())
1629 {
1630 pos->insert(p->greedy(false));
1631 pos->erase(--p.base());
1632 }
1633 #endif
1634 // trims accept positions keeping the first only
1635 Positions::iterator q = pos->begin(), a = pos->end();
1636 while (q != pos->end())
1637 {
1638 if (q->accept() && !q->negate())
1639 {
1640 if (a == pos->end())
1641 a = q++;
1642 else
1643 pos->erase(q++);
1644 }
1645 else
1646 {
1647 ++q;
1648 }
1649 }
1650 #ifdef DEBUG
1651 DBGLOG("END trim_lazy({");
1652 for (Positions::const_iterator q = pos->begin(); q != pos->end(); ++q)
1653 DBGLOGPOS(*q);
1654 DBGLOGA(" })");
1655 #endif
1656 }
1657
compile_transition(DFA::State * state,Follow & followpos,const Map & modifiers,const Map & lookahead,Moves & moves) const1658 void Pattern::compile_transition(
1659 DFA::State *state,
1660 Follow& followpos,
1661 const Map& modifiers,
1662 const Map& lookahead,
1663 Moves& moves) const
1664 {
1665 DBGLOG("BEGIN compile_transition()");
1666 Positions::const_iterator end = state->end();
1667 for (Positions::const_iterator k = state->begin(); k != end; ++k)
1668 {
1669 if (k->accept())
1670 {
1671 Accept accept = k->accepts();
1672 if (state->accept == 0 || accept < state->accept)
1673 state->accept = accept;
1674 if (k->negate())
1675 state->redo = true;
1676 }
1677 else
1678 {
1679 Location loc = k->loc();
1680 Char c = at(loc);
1681 DBGLOGN("At %u: %c", loc, c);
1682 bool literal = is_modified('q', modifiers, loc);
1683 if (c == '(' && !literal)
1684 {
1685 Lookahead n = 0;
1686 DBGLOG("LOOKAHEAD HEAD");
1687 for (Map::const_iterator i = lookahead.begin(); i != lookahead.end(); ++i)
1688 {
1689 Locations::const_iterator j = i->second.find(loc);
1690 DBGLOGN("%d %d (%d) %u", state->accept, i->first, j != i->second.end(), n);
1691 if (j != i->second.end())
1692 {
1693 Lookahead l = n + static_cast<Lookahead>(std::distance(i->second.begin(), j));
1694 if (l < n)
1695 error(regex_error::exceeds_limits, loc);
1696 state->heads.insert(l);
1697 }
1698 Lookahead k = n;
1699 n += static_cast<Lookahead>(i->second.size());
1700 if (n < k)
1701 error(regex_error::exceeds_limits, loc);
1702 }
1703 }
1704 else if (c == ')' && !literal)
1705 {
1706 /* CHECKED algorithmic options: 7/18 do no longer check for accept state, assume we are at an accept state
1707 if (state->accept > 0)
1708 */
1709 {
1710 Lookahead n = 0;
1711 DBGLOG("LOOKAHEAD TAIL");
1712 for (Map::const_iterator i = lookahead.begin(); i != lookahead.end(); ++i)
1713 {
1714 Locations::const_iterator j = i->second.find(loc);
1715 DBGLOGN("%d %d (%d) %u", state->accept, i->first, j != i->second.end(), n);
1716 if (j != i->second.end() /* CHECKED algorithmic options: 7/18 && state->accept == i->first */ ) // only add lookstop when part of the proper accept state
1717 {
1718 Lookahead l = n + static_cast<Lookahead>(std::distance(i->second.begin(), j));
1719 if (l < n)
1720 error(regex_error::exceeds_limits, loc);
1721 state->tails.insert(l);
1722 }
1723 Lookahead k = n;
1724 n += static_cast<Lookahead>(i->second.size());
1725 if (n < k)
1726 error(regex_error::exceeds_limits, loc);
1727 }
1728 }
1729 }
1730 else
1731 {
1732 Follow::iterator i = followpos.find(k->pos());
1733 if (i != followpos.end())
1734 {
1735 if (k->negate())
1736 {
1737 Positions::const_iterator b = i->second.begin();
1738 if (b != i->second.end() && !b->negate())
1739 {
1740 Positions to;
1741 for (Positions::const_iterator p = b; p != i->second.end(); ++p)
1742 to.insert(p->negate(true));
1743 i->second.swap(to);
1744 }
1745 }
1746 if (k->lazy())
1747 {
1748 #if 1 // CHECKED algorithmic options: 7/31 this optimization works fine when trim_lazy adds non-lazy greedy state, but may increase the total number of states:
1749 if (k->greedy())
1750 continue;
1751 #endif
1752 Follow::iterator j = followpos.find(*k);
1753 if (j == followpos.end())
1754 {
1755 // followpos is not defined for lazy pos yet, so add lazy followpos (memoization)
1756 j = followpos.insert(std::pair<Position,Positions>(*k, Positions())).first;
1757 for (Positions::const_iterator p = i->second.begin(); p != i->second.end(); ++p)
1758 j->second.insert(/* p->lazy() || CHECKED algorithmic options: 7/31 */ p->ticked() ? *p : /* CHECKED algorithmic options: 7/31 adds too many states p->greedy() ? p->lazy(0).greedy(false) : */ p->lazy(k->lazy())); // CHECKED algorithmic options: 7/18 ticked() preserves lookahead tail at '/' and ')'
1759 #ifdef DEBUG
1760 DBGLOGN("lazy followpos(");
1761 DBGLOGPOS(*k);
1762 DBGLOGA(" ) = {");
1763 for (Positions::const_iterator q = j->second.begin(); q != j->second.end(); ++q)
1764 DBGLOGPOS(*q);
1765 DBGLOGA(" }");
1766 #endif
1767 }
1768 i = j;
1769 }
1770 Positions &follow = i->second;
1771 Chars chars;
1772 if (literal)
1773 {
1774 if (std::isalpha(c) && is_modified('i', modifiers, loc))
1775 {
1776 chars.insert(uppercase(c));
1777 chars.insert(lowercase(c));
1778 }
1779 else
1780 {
1781 chars.insert(c);
1782 }
1783 }
1784 else
1785 {
1786 switch (c)
1787 {
1788 case '.':
1789 if (is_modified('s', modifiers, loc))
1790 {
1791 static const uint64_t dot[5] = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0 };
1792 chars |= Chars(dot);
1793 }
1794 else
1795 {
1796 static const uint64_t dot[5] = { 0xFFFFFFFFFFFFFBFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0 };
1797 chars |= Chars(dot);
1798 }
1799 break;
1800 case '^':
1801 chars.insert(is_modified('m', modifiers, loc) ? META_BOL : META_BOB);
1802 trim_anchors(follow, *k);
1803 break;
1804 case '$':
1805 chars.insert(is_modified('m', modifiers, loc) ? META_EOL : META_EOB);
1806 trim_anchors(follow, *k);
1807 break;
1808 default:
1809 if (c == '[')
1810 {
1811 compile_list(loc + 1, chars, modifiers);
1812 }
1813 else
1814 {
1815 switch (escape_at(loc))
1816 {
1817 case '\0': // no escape at current loc
1818 if (std::isalpha(c) && is_modified('i', modifiers, loc))
1819 {
1820 chars.insert(uppercase(c));
1821 chars.insert(lowercase(c));
1822 }
1823 else
1824 {
1825 chars.insert(c);
1826 }
1827 break;
1828 case 'i':
1829 chars.insert(META_IND);
1830 break;
1831 case 'j':
1832 chars.insert(META_DED);
1833 break;
1834 case 'k':
1835 chars.insert(META_UND);
1836 break;
1837 case 'A':
1838 chars.insert(META_BOB);
1839 trim_anchors(follow, *k);
1840 break;
1841 case 'z':
1842 chars.insert(META_EOB);
1843 trim_anchors(follow, *k);
1844 break;
1845 case 'B':
1846 chars.insert(k->anchor() ? META_NWB : META_NWE);
1847 trim_anchors(follow, *k);
1848 break;
1849 case 'b':
1850 if (k->anchor())
1851 chars.insert(META_BWB, META_EWB);
1852 else
1853 chars.insert(META_BWE, META_EWE);
1854 trim_anchors(follow, *k);
1855 break;
1856 case '<':
1857 chars.insert(k->anchor() ? META_BWB : META_BWE);
1858 trim_anchors(follow, *k);
1859 break;
1860 case '>':
1861 chars.insert(k->anchor() ? META_EWB : META_EWE);
1862 trim_anchors(follow, *k);
1863 break;
1864 default:
1865 c = parse_esc(loc, &chars);
1866 if (c <= 'z' && std::isalpha(c) && is_modified('i', modifiers, loc))
1867 {
1868 chars.insert(uppercase(c));
1869 chars.insert(lowercase(c));
1870 }
1871 }
1872 }
1873 }
1874 }
1875 transition(moves, chars, follow);
1876 }
1877 }
1878 }
1879 }
1880 Moves::iterator i = moves.begin();
1881 Moves::iterator e = moves.end();
1882 while (i != e)
1883 {
1884 trim_lazy(&i->second);
1885 if (i->second.empty())
1886 moves.erase(i++);
1887 else
1888 ++i;
1889 }
1890 DBGLOG("END compile_transition()");
1891 }
1892
transition(Moves & moves,Chars & chars,const Positions & follow) const1893 void Pattern::transition(
1894 Moves& moves,
1895 Chars& chars,
1896 const Positions& follow) const
1897 {
1898 Moves::iterator i = moves.begin();
1899 Moves::iterator end = moves.end();
1900 while (i != end)
1901 {
1902 if (i->second == follow)
1903 {
1904 chars += i->first;
1905 moves.erase(i++);
1906 }
1907 else
1908 {
1909 ++i;
1910 }
1911 }
1912 for (i = moves.begin(); i != end; ++i)
1913 {
1914 if (chars.intersects(i->first))
1915 {
1916 if (is_subset(follow, i->second))
1917 {
1918 chars -= i->first;
1919 }
1920 else
1921 {
1922 if (chars.contains(i->first))
1923 {
1924 chars -= i->first;
1925 set_insert(i->second, follow);
1926 }
1927 else
1928 {
1929 Move back(chars & i->first, i->second);
1930 set_insert(back.second, follow);
1931 chars -= back.first;
1932 i->first -= back.first;
1933 moves.push_back(back);
1934 }
1935 }
1936 if (!chars.any())
1937 return;
1938 }
1939 }
1940 if (chars.any())
1941 moves.push_back(Move(chars, follow));
1942 }
1943
compile_list(Location loc,Chars & chars,const Map & modifiers) const1944 void Pattern::compile_list(Location loc, Chars& chars, const Map& modifiers) const
1945 {
1946 bool complement = (at(loc) == '^');
1947 if (complement)
1948 ++loc;
1949 Char prev = META_BOL;
1950 Char lo = META_EOL;
1951 for (Char c = at(loc); c != '\0' && (c != ']' || prev == META_BOL); c = at(++loc))
1952 {
1953 if (c == '-' && !is_meta(prev) && is_meta(lo))
1954 {
1955 lo = prev;
1956 }
1957 else
1958 {
1959 size_t c_loc;
1960 if (c == '[' && at(loc + 1) == ':' && (c_loc = find_at(loc + 2, ':')) != std::string::npos && at(static_cast<Location>(c_loc + 1)) == ']')
1961 {
1962 if (c_loc == loc + 3)
1963 {
1964 ++loc;
1965 c = parse_esc(loc, &chars);
1966 }
1967 else
1968 {
1969 size_t i;
1970 for (i = 0; i < 14; ++i)
1971 if (eq_at(loc + 4, posix_class[i] + 2)) // ignore first two letters (upper/lower) when matching
1972 break;
1973 if (i < 14)
1974 posix(i, chars);
1975 else
1976 error(regex_error::invalid_class, loc);
1977 c = META_EOL;
1978 }
1979 loc = static_cast<Location>(c_loc + 1);
1980 }
1981 else if (c == opt_.e && !opt_.b)
1982 {
1983 c = parse_esc(loc, &chars);
1984 --loc;
1985 }
1986 if (!is_meta(c))
1987 {
1988 if (!is_meta(lo))
1989 {
1990 if (lo <= c)
1991 chars.insert(lo, c);
1992 else
1993 error(regex_error::invalid_class_range, loc);
1994 if (is_modified('i', modifiers, loc))
1995 {
1996 for (Char a = lo; a <= c; ++a)
1997 {
1998 if (std::isupper(a))
1999 chars.insert(lowercase(a));
2000 else if (std::islower(a))
2001 chars.insert(uppercase(a));
2002 }
2003 }
2004 c = META_EOL;
2005 }
2006 else
2007 {
2008 if (std::isalpha(c) && is_modified('i', modifiers, loc))
2009 {
2010 chars.insert(uppercase(c));
2011 chars.insert(lowercase(c));
2012 }
2013 else
2014 {
2015 chars.insert(c);
2016 }
2017 }
2018 }
2019 prev = c;
2020 lo = META_EOL;
2021 }
2022 }
2023 if (!is_meta(lo))
2024 chars.insert('-');
2025 if (complement)
2026 flip(chars);
2027 }
2028
posix(size_t index,Chars & chars) const2029 void Pattern::posix(size_t index, Chars& chars) const
2030 {
2031 DBGLOG("posix(%lu)", index);
2032 static const uint64_t posix_chars[14][5] = {
2033 { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, 0ULL, 0ULL, 0ULL }, // ASCII
2034 { 0x0000000100003E00ULL, 0x0000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Space: \t-\r, ' '
2035 { 0x03FF000000000000ULL, 0x0000007E0000007EULL, 0ULL, 0ULL, 0ULL }, // XDigit: 0-9, A-F, a-f
2036 { 0x00000000FFFFFFFFULL, 0x8000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Cntrl: \x00-0x1F, \0x7F
2037 { 0xFFFFFFFF00000000ULL, 0x7FFFFFFFFFFFFFFFULL, 0ULL, 0ULL, 0ULL }, // Print: ' '-'~'
2038 { 0x03FF000000000000ULL, 0x07FFFFFE07FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Alnum: 0-9, A-Z, a-z
2039 { 0x0000000000000000ULL, 0x07FFFFFE07FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Alpha: A-Z, a-z
2040 { 0x0000000100000200ULL, 0x0000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Blank: \t, ' '
2041 { 0x03FF000000000000ULL, 0x0000000000000000ULL, 0ULL, 0ULL, 0ULL }, // Digit: 0-9
2042 { 0xFFFFFFFE00000000ULL, 0x7FFFFFFFFFFFFFFFULL, 0ULL, 0ULL, 0ULL }, // Graph: '!'-'~'
2043 { 0x0000000000000000ULL, 0x07FFFFFE00000000ULL, 0ULL, 0ULL, 0ULL }, // Lower: a-z
2044 { 0xFC00FFFE00000000ULL, 0x78000001F8000001ULL, 0ULL, 0ULL, 0ULL }, // Punct: '!'-'/', ':'-'@', '['-'`', '{'-'~'
2045 { 0x0000000000000000ULL, 0x0000000007FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Upper: A-Z
2046 { 0x03FF000000000000ULL, 0x07FFFFFE87FFFFFEULL, 0ULL, 0ULL, 0ULL }, // Word: 0-9, A-Z, a-z, _
2047 };
2048 chars |= Chars(posix_chars[index]);
2049 }
2050
flip(Chars & chars) const2051 void Pattern::flip(Chars& chars) const
2052 {
2053 chars.flip256();
2054 }
2055
assemble(DFA::State * start)2056 void Pattern::assemble(DFA::State *start)
2057 {
2058 DBGLOG("BEGIN assemble()");
2059 timer_type t;
2060 timer_start(t);
2061 predict_match_dfa(start);
2062 export_dfa(start);
2063 compact_dfa(start);
2064 encode_dfa(start);
2065 wms_ = timer_elapsed(t);
2066 gencode_dfa(start);
2067 export_code();
2068 DBGLOG("END assemble()");
2069 }
2070
compact_dfa(DFA::State * start)2071 void Pattern::compact_dfa(DFA::State *start)
2072 {
2073 #if WITH_COMPACT_DFA == -1
2074 // edge compaction in reverse order
2075 for (DFA::State *state = start; state; state = state->next)
2076 {
2077 for (DFA::State::Edges::iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2078 {
2079 Char hi = i->second.first;
2080 if (hi >= 0xFF)
2081 break;
2082 DFA::State::Edges::iterator j = i;
2083 ++j;
2084 while (j != state->edges.end() && j->first <= hi + 1)
2085 {
2086 hi = j->second.first;
2087 if (j->second.second == i->second.second)
2088 {
2089 i->second.first = hi;
2090 state->edges.erase(j++);
2091 }
2092 else
2093 {
2094 ++j;
2095 }
2096 }
2097 }
2098 }
2099 #elif WITH_COMPACT_DFA == 1
2100 // edge compaction
2101 for (DFA::State *state = start; state; state = state->next)
2102 {
2103 for (DFA::State::Edges::reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2104 {
2105 Char lo = i->second.first;
2106 if (lo <= 0x00)
2107 break;
2108 DFA::State::Edges::reverse_iterator j = i;
2109 ++j;
2110 while (j != state->edges.rend() && j->first >= lo - 1)
2111 {
2112 lo = j->second.first;
2113 if (j->second.second == i->second.second)
2114 {
2115 i->second.first = lo;
2116 state->edges.erase(--j.base());
2117 }
2118 else
2119 {
2120 ++j;
2121 }
2122 }
2123 }
2124 }
2125 #else
2126 (void)start;
2127 #endif
2128 }
2129
encode_dfa(DFA::State * start)2130 void Pattern::encode_dfa(DFA::State *start)
2131 {
2132 nop_ = 0;
2133 for (DFA::State *state = start; state; state = state->next)
2134 {
2135 if (state->accept > Const::AMAX)
2136 state->accept = Const::AMAX;
2137 state->first = state->index = nop_;
2138 #if WITH_COMPACT_DFA == -1
2139 Char hi = 0x00;
2140 for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2141 {
2142 Char lo = i->first;
2143 if (lo == hi)
2144 hi = i->second.first + 1;
2145 ++nop_;
2146 if (is_meta(lo))
2147 nop_ += i->second.first - lo;
2148 }
2149 // add final dead state (HALT opcode) only when needed, i.e. skip dead state if all chars 0-255 are already covered
2150 if (hi <= 0xFF)
2151 {
2152 state->edges[hi] = std::pair<Char,DFA::State*>(0xFF, static_cast<DFA::State*>(NULL)); // cast to appease MSVC 2010
2153 ++nop_;
2154 }
2155 #else
2156 Char lo = 0xFF;
2157 bool covered = false;
2158 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2159 {
2160 Char hi = i->first;
2161 if (lo == hi)
2162 {
2163 if (i->second.first == 0x00)
2164 covered = true;
2165 else
2166 lo = i->second.first - 1;
2167 }
2168 ++nop_;
2169 if (is_meta(lo))
2170 nop_ += hi - i->second.first;
2171 }
2172 // add final dead state (HALT opcode) only when needed, i.e. skip dead state if all chars 0-255 are already covered
2173 if (!covered)
2174 {
2175 state->edges[lo] = std::pair<Char,DFA::State*>(0x00, static_cast<DFA::State*>(NULL)); // cast to appease MSVC 2010
2176 ++nop_;
2177 }
2178 #endif
2179 nop_ += static_cast<Index>(state->heads.size() + state->tails.size() + (state->accept > 0 || state->redo));
2180 if (!valid_goto_index(nop_))
2181 throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2182 }
2183 if (nop_ > Const::LONG)
2184 {
2185 // over 64K opcodes: use 64-bit GOTO LONG opcodes
2186 nop_ = 0;
2187 for (DFA::State *state = start; state; state = state->next)
2188 {
2189 state->index = nop_;
2190 #if WITH_COMPACT_DFA == -1
2191 Char hi = 0x00;
2192 for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2193 {
2194 Char lo = i->first;
2195 if (lo == hi)
2196 hi = i->second.first + 1;
2197 // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2198 if (i->second.second != NULL &&
2199 ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2200 i->second.second->index >= Const::LONG))
2201 nop_ += 2;
2202 else
2203 ++nop_;
2204 if (is_meta(lo))
2205 {
2206 // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2207 if (i->second.second != NULL &&
2208 ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2209 i->second.second->index >= Const::LONG))
2210 nop_ += 2 * (i->second.first - lo);
2211 else
2212 nop_ += i->second.first - lo;
2213 }
2214 }
2215 #else
2216 Char lo = 0xFF;
2217 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2218 {
2219 Char hi = i->first;
2220 if (lo == hi)
2221 lo = i->second.first - 1;
2222 // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2223 if (i->second.second != NULL &&
2224 ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2225 i->second.second->index >= Const::LONG))
2226 nop_ += 2;
2227 else
2228 ++nop_;
2229 if (is_meta(lo))
2230 {
2231 // use 64-bit jump opcode if forward jump determined by previous loop is beyond 32K or backward jump is beyond 64K
2232 if (i->second.second != NULL &&
2233 ((i->second.second->first > state->first && i->second.second->first >= Const::LONG / 2) ||
2234 i->second.second->index >= Const::LONG))
2235 nop_ += 2 * (hi - i->second.first);
2236 else
2237 nop_ += hi - i->second.first;
2238 }
2239 }
2240 #endif
2241 nop_ += static_cast<Index>(state->heads.size() + state->tails.size() + (state->accept > 0 || state->redo));
2242 if (!valid_goto_index(nop_))
2243 throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2244 }
2245 }
2246 Opcode *opcode = new Opcode[nop_];
2247 opc_ = opcode;
2248 Index pc = 0;
2249 for (const DFA::State *state = start; state; state = state->next)
2250 {
2251 if (state->redo)
2252 {
2253 opcode[pc++] = opcode_redo();
2254 }
2255 else if (state->accept > 0)
2256 {
2257 opcode[pc++] = opcode_take(state->accept);
2258 }
2259 for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2260 {
2261 if (!valid_lookahead_index(static_cast<Index>(*i)))
2262 throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2263 opcode[pc++] = opcode_tail(static_cast<Index>(*i));
2264 }
2265 for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2266 {
2267 if (!valid_lookahead_index(static_cast<Index>(*i)))
2268 throw regex_error(regex_error::exceeds_limits, rex_, rex_.size());
2269 opcode[pc++] = opcode_head(static_cast<Index>(*i));
2270 }
2271 #if WITH_COMPACT_DFA == -1
2272 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2273 {
2274 Char lo = i->first;
2275 Char hi = i->second.first;
2276 Index target_first = i->second.second != NULL ? i->second.second->first : Const::IMAX;
2277 Index target_index = i->second.second != NULL ? i->second.second->index : Const::IMAX;
2278 if (!is_meta(lo))
2279 {
2280 if (target_index == Const::IMAX)
2281 {
2282 opcode[pc++] = opcode_goto(lo, hi, Const::HALT);
2283 }
2284 else if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2285 {
2286 opcode[pc++] = opcode_goto(lo, hi, Const::LONG);
2287 opcode[pc++] = opcode_long(target_index);
2288 }
2289 else
2290 {
2291 opcode[pc++] = opcode_goto(lo, hi, target_index);
2292 }
2293 }
2294 else
2295 {
2296 do
2297 {
2298 if (target_index == Const::IMAX)
2299 {
2300 opcode[pc++] = opcode_goto(lo, lo, Const::HALT);
2301 }
2302 else if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2303 {
2304 opcode[pc++] = opcode_goto(lo, lo, Const::LONG);
2305 opcode[pc++] = opcode_long(target_index);
2306 }
2307 else
2308 {
2309 opcode[pc++] = opcode_goto(lo, lo, target_index);
2310 }
2311 } while (++lo <= hi);
2312 }
2313 }
2314 #else
2315 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2316 {
2317 Char hi = i->first;
2318 Char lo = i->second.first;
2319 if (is_meta(lo))
2320 {
2321 Index target_first = i->second.second != NULL ? i->second.second->first : Const::IMAX;
2322 Index target_index = i->second.second != NULL ? i->second.second->index : Const::IMAX;
2323 do
2324 {
2325 if (target_index == Const::IMAX)
2326 {
2327 opcode[pc++] = opcode_goto(lo, lo, Const::HALT);
2328 }
2329 else if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2330 {
2331 opcode[pc++] = opcode_goto(lo, lo, Const::LONG);
2332 opcode[pc++] = opcode_long(target_index);
2333 }
2334 else
2335 {
2336 opcode[pc++] = opcode_goto(lo, lo, target_index);
2337 }
2338 } while (++lo <= hi);
2339 }
2340 }
2341 for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2342 {
2343 Char lo = i->second.first;
2344 if (!is_meta(lo))
2345 {
2346 Char hi = i->first;
2347 if (i->second.second != NULL)
2348 {
2349 Index target_first = i->second.second->first;
2350 Index target_index = i->second.second->index;
2351 if (nop_ > Const::LONG && ((target_first > state->first && target_first >= Const::LONG / 2) || target_index >= Const::LONG))
2352 {
2353 opcode[pc++] = opcode_goto(lo, hi, Const::LONG);
2354 opcode[pc++] = opcode_long(target_index);
2355 }
2356 else
2357 {
2358 opcode[pc++] = opcode_goto(lo, hi, target_index);
2359 }
2360 }
2361 else
2362 {
2363 opcode[pc++] = opcode_goto(lo, hi, Const::HALT);
2364 }
2365 }
2366 }
2367 #endif
2368 }
2369 }
2370
gencode_dfa(const DFA::State * start) const2371 void Pattern::gencode_dfa(const DFA::State *start) const
2372 {
2373 if (!opt_.o)
2374 return;
2375 for (std::vector<std::string>::const_iterator i = opt_.f.begin(); i != opt_.f.end(); ++i)
2376 {
2377 const std::string& filename = *i;
2378 size_t len = filename.length();
2379 if ((len > 2 && filename.compare(len - 2, 2, ".h" ) == 0)
2380 || (len > 4 && filename.compare(len - 4, 4, ".hpp") == 0)
2381 || (len > 4 && filename.compare(len - 4, 4, ".cpp") == 0)
2382 || (len > 3 && filename.compare(len - 3, 3, ".cc" ) == 0))
2383 {
2384 FILE *file = NULL;
2385 int err = 0;
2386 if (filename.compare(0, 7, "stdout.") == 0)
2387 file = stdout;
2388 else if (filename.at(0) == '+')
2389 err = reflex::fopen_s(&file, filename.c_str() + 1, "a");
2390 else
2391 err = reflex::fopen_s(&file, filename.c_str(), "w");
2392 if (!err && file)
2393 {
2394 ::fprintf(file,
2395 "#include <reflex/matcher.h>\n\n"
2396 "#if defined(OS_WIN)\n"
2397 "#pragma warning(disable:4101 4102)\n"
2398 "#elif defined(__GNUC__)\n"
2399 "#pragma GCC diagnostic ignored \"-Wunused-variable\"\n"
2400 "#pragma GCC diagnostic ignored \"-Wunused-label\"\n"
2401 "#elif defined(__clang__)\n"
2402 "#pragma clang diagnostic ignored \"-Wunused-variable\"\n"
2403 "#pragma clang diagnostic ignored \"-Wunused-label\"\n"
2404 "#endif\n\n");
2405 write_namespace_open(file);
2406 ::fprintf(file,
2407 "void reflex_code_%s(reflex::Matcher& m)\n"
2408 "{\n"
2409 " int c0 = 0, c1 = 0;\n"
2410 " m.FSM_INIT(c1);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str());
2411 for (const DFA::State *state = start; state; state = state->next)
2412 {
2413 ::fprintf(file, "\nS%u:\n", state->index);
2414 if (state == start)
2415 ::fprintf(file, " m.FSM_FIND();\n");
2416 if (state->redo)
2417 ::fprintf(file, " m.FSM_REDO();\n");
2418 else if (state->accept > 0)
2419 ::fprintf(file, " m.FSM_TAKE(%u);\n", state->accept);
2420 for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2421 ::fprintf(file, " m.FSM_TAIL(%u);\n", *i);
2422 for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2423 ::fprintf(file, " m.FSM_HEAD(%u);\n", *i);
2424 if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED)
2425 ::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index);
2426 bool peek = false; // if we need to read a character into c1
2427 bool prev = false; // if we need to keep the previous character in c0
2428 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2429 {
2430 #if WITH_COMPACT_DFA == -1
2431 Char lo = i->first;
2432 Char hi = i->second.first;
2433 #else
2434 Char hi = i->first;
2435 Char lo = i->second.first;
2436 #endif
2437 if (!is_meta(lo))
2438 {
2439 Index target_index = Const::IMAX;
2440 if (i->second.second != NULL)
2441 target_index = i->second.second->index;
2442 DFA::State::Edges::const_reverse_iterator j = i;
2443 if (target_index == Const::IMAX && (++j == state->edges.rend() || is_meta(j->second.first)))
2444 break;
2445 peek = true;
2446 }
2447 else
2448 {
2449 do
2450 {
2451 if (lo == META_EOB || lo == META_EOL)
2452 peek = true;
2453 else if (lo == META_EWE || lo == META_BWE || lo == META_NWE)
2454 prev = peek = true;
2455 if (prev && peek)
2456 break;
2457 check_dfa_closure(i->second.second, 1, peek, prev);
2458 } while (++lo <= hi);
2459 }
2460 }
2461 bool read = peek;
2462 bool elif = false;
2463 #if WITH_COMPACT_DFA == -1
2464 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2465 {
2466 Char lo = i->first;
2467 Char hi = i->second.first;
2468 Index target_index = Const::IMAX;
2469 if (i->second.second != NULL)
2470 target_index = i->second.second->index;
2471 if (read)
2472 {
2473 if (prev)
2474 ::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
2475 else
2476 ::fprintf(file, " c1 = m.FSM_CHAR();\n");
2477 read = false;
2478 }
2479 if (!is_meta(lo))
2480 {
2481 DFA::State::Edges::const_reverse_iterator j = i;
2482 if (target_index == Const::IMAX && (++j == state->edges.rend() || is_meta(j->second.first)))
2483 break;
2484 if (lo == hi)
2485 {
2486 ::fprintf(file, " if (c1 == ");
2487 print_char(file, lo);
2488 ::fprintf(file, ")");
2489 }
2490 else if (hi == 0xFF)
2491 {
2492 ::fprintf(file, " if (");
2493 print_char(file, lo);
2494 ::fprintf(file, " <= c1)");
2495 }
2496 else
2497 {
2498 ::fprintf(file, " if (");
2499 print_char(file, lo);
2500 ::fprintf(file, " <= c1 && c1 <= ");
2501 print_char(file, hi);
2502 ::fprintf(file, ")");
2503 }
2504 if (target_index == Const::IMAX)
2505 {
2506 if (peek)
2507 ::fprintf(file, " return m.FSM_HALT(c1);\n");
2508 else
2509 ::fprintf(file, " return m.FSM_HALT();\n");
2510 }
2511 else
2512 {
2513 ::fprintf(file, " goto S%u;\n", target_index);
2514 }
2515 }
2516 else
2517 {
2518 do
2519 {
2520 switch (lo)
2521 {
2522 case META_EOB:
2523 case META_EOL:
2524 ::fprintf(file, " ");
2525 if (elif)
2526 ::fprintf(file, "else ");
2527 ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
2528 gencode_dfa_closure(file, i->second.second, 2, peek);
2529 ::fprintf(file, " }\n");
2530 elif = true;
2531 break;
2532 case META_EWE:
2533 case META_BWE:
2534 case META_NWE:
2535 ::fprintf(file, " ");
2536 if (elif)
2537 ::fprintf(file, "else ");
2538 ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
2539 gencode_dfa_closure(file, i->second.second, 2, peek);
2540 ::fprintf(file, " }\n");
2541 elif = true;
2542 break;
2543 default:
2544 ::fprintf(file, " ");
2545 if (elif)
2546 ::fprintf(file, "else ");
2547 ::fprintf(file, "if (m.FSM_META_%s()) {\n", meta_label[lo - META_MIN]);
2548 gencode_dfa_closure(file, i->second.second, 2, peek);
2549 ::fprintf(file, " }\n");
2550 elif = true;
2551 }
2552 } while (++lo <= hi);
2553 }
2554 }
2555 #else
2556 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2557 {
2558 Char hi = i->first;
2559 Char lo = i->second.first;
2560 if (is_meta(lo))
2561 {
2562 if (read)
2563 {
2564 if (prev)
2565 ::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
2566 else
2567 ::fprintf(file, " c1 = m.FSM_CHAR();\n");
2568 read = false;
2569 }
2570 do
2571 {
2572 switch (lo)
2573 {
2574 case META_EOB:
2575 case META_EOL:
2576 ::fprintf(file, " ");
2577 if (elif)
2578 ::fprintf(file, "else ");
2579 ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
2580 gencode_dfa_closure(file, i->second.second, 2, peek);
2581 ::fprintf(file, " }\n");
2582 elif = true;
2583 break;
2584 case META_EWE:
2585 case META_BWE:
2586 case META_NWE:
2587 ::fprintf(file, " ");
2588 if (elif)
2589 ::fprintf(file, "else ");
2590 ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
2591 gencode_dfa_closure(file, i->second.second, 2, peek);
2592 ::fprintf(file, " }\n");
2593 elif = true;
2594 break;
2595 default:
2596 ::fprintf(file, " ");
2597 if (elif)
2598 ::fprintf(file, "else ");
2599 ::fprintf(file, "if (m.FSM_META_%s()) {\n", meta_label[lo - META_MIN]);
2600 gencode_dfa_closure(file, i->second.second, 2, peek);
2601 ::fprintf(file, " }\n");
2602 elif = true;
2603 }
2604 } while (++lo <= hi);
2605 }
2606 }
2607 for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2608 {
2609 Char hi = i->first;
2610 Char lo = i->second.first;
2611 Index target_index = Const::IMAX;
2612 if (i->second.second != NULL)
2613 target_index = i->second.second->index;
2614 if (read)
2615 {
2616 if (prev)
2617 ::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n");
2618 else
2619 ::fprintf(file, " c1 = m.FSM_CHAR();\n");
2620 read = false;
2621 }
2622 if (!is_meta(lo))
2623 {
2624 DFA::State::Edges::const_iterator j = i;
2625 if (target_index == Const::IMAX && (++j == state->edges.end() || is_meta(j->second.first)))
2626 break;
2627 if (lo == hi)
2628 {
2629 ::fprintf(file, " if (c1 == ");
2630 print_char(file, lo);
2631 ::fprintf(file, ")");
2632 }
2633 else if (hi == 0xFF)
2634 {
2635 ::fprintf(file, " if (");
2636 print_char(file, lo);
2637 ::fprintf(file, " <= c1)");
2638 }
2639 else
2640 {
2641 ::fprintf(file, " if (");
2642 print_char(file, lo);
2643 ::fprintf(file, " <= c1 && c1 <= ");
2644 print_char(file, hi);
2645 ::fprintf(file, ")");
2646 }
2647 if (target_index == Const::IMAX)
2648 {
2649 if (peek)
2650 ::fprintf(file, " return m.FSM_HALT(c1);\n");
2651 else
2652 ::fprintf(file, " return m.FSM_HALT();\n");
2653 }
2654 else
2655 {
2656 ::fprintf(file, " goto S%u;\n", target_index);
2657 }
2658 }
2659 }
2660 #endif
2661 if (peek)
2662 ::fprintf(file, " return m.FSM_HALT(c1);\n");
2663 else
2664 ::fprintf(file, " return m.FSM_HALT();\n");
2665 }
2666 ::fprintf(file, "}\n\n");
2667 if (opt_.p)
2668 write_predictor(file);
2669 write_namespace_close(file);
2670 if (file != stdout)
2671 ::fclose(file);
2672 }
2673 }
2674 }
2675 }
2676
check_dfa_closure(const DFA::State * state,int nest,bool & peek,bool & prev) const2677 void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, bool& prev) const
2678 {
2679 if (nest > 4)
2680 return;
2681 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2682 {
2683 #if WITH_COMPACT_DFA == -1
2684 Char lo = i->first;
2685 Char hi = i->second.first;
2686 #else
2687 Char hi = i->first;
2688 Char lo = i->second.first;
2689 #endif
2690 if (is_meta(lo))
2691 {
2692 do
2693 {
2694 if (lo == META_EOB || lo == META_EOL)
2695 peek = true;
2696 else if (lo == META_EWE || lo == META_BWE || lo == META_NWE)
2697 prev = peek = true;
2698 if (prev && peek)
2699 break;
2700 check_dfa_closure(i->second.second, nest + 1, peek, prev);
2701 } while (++lo <= hi);
2702 }
2703 }
2704 }
2705
gencode_dfa_closure(FILE * file,const DFA::State * state,int nest,bool peek) const2706 void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, bool peek) const
2707 {
2708 bool elif = false;
2709 if (state->redo)
2710 {
2711 if (peek)
2712 ::fprintf(file, "%*sm.FSM_REDO(c1);\n", 2*nest, "");
2713 else
2714 ::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, "");
2715 }
2716 else if (state->accept > 0)
2717 {
2718 if (peek)
2719 ::fprintf(file, "%*sm.FSM_TAKE(%u, c1);\n", 2*nest, "", state->accept);
2720 else
2721 ::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept);
2722 }
2723 for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2724 ::fprintf(file, "%*sm.FSM_TAIL(%u);\n", 2*nest, "", *i);
2725 if (nest > 5)
2726 return;
2727 for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i)
2728 {
2729 #if WITH_COMPACT_DFA == -1
2730 Char lo = i->first;
2731 Char hi = i->second.first;
2732 #else
2733 Char hi = i->first;
2734 Char lo = i->second.first;
2735 #endif
2736 if (is_meta(lo))
2737 {
2738 do
2739 {
2740 switch (lo)
2741 {
2742 case META_EOB:
2743 case META_EOL:
2744 ::fprintf(file, "%*s", 2*nest, "");
2745 if (elif)
2746 ::fprintf(file, "else ");
2747 ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]);
2748 gencode_dfa_closure(file, i->second.second, nest + 1, peek);
2749 ::fprintf(file, "%*s}\n", 2*nest, "");
2750 elif = true;
2751 break;
2752 case META_EWE:
2753 case META_BWE:
2754 case META_NWE:
2755 ::fprintf(file, "%*s", 2*nest, "");
2756 if (elif)
2757 ::fprintf(file, "else ");
2758 ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]);
2759 gencode_dfa_closure(file, i->second.second, nest + 1, peek);
2760 ::fprintf(file, "%*s}\n", 2*nest, "");
2761 elif = true;
2762 break;
2763 default:
2764 ::fprintf(file, "%*s", 2*nest, "");
2765 if (elif)
2766 ::fprintf(file, "else ");
2767 ::fprintf(file, "if (m.FSM_META_%s()) {\n", meta_label[lo - META_MIN]);
2768 gencode_dfa_closure(file, i->second.second, nest + 1, peek);
2769 ::fprintf(file, "%*s}\n", 2*nest, "");
2770 elif = true;
2771 }
2772 } while (++lo <= hi);
2773 }
2774 }
2775 }
2776
export_dfa(const DFA::State * start) const2777 void Pattern::export_dfa(const DFA::State *start) const
2778 {
2779 for (std::vector<std::string>::const_iterator i = opt_.f.begin(); i != opt_.f.end(); ++i)
2780 {
2781 const std::string& filename = *i;
2782 size_t len = filename.length();
2783 if (len > 3 && filename.compare(len - 3, 3, ".gv") == 0)
2784 {
2785 FILE *file = NULL;
2786 int err = 0;
2787 if (filename.compare(0, 7, "stdout.") == 0)
2788 file = stdout;
2789 else if (filename.at(0) == '+')
2790 err = reflex::fopen_s(&file, filename.c_str() + 1, "a");
2791 else
2792 err = reflex::fopen_s(&file, filename.c_str(), "w");
2793 if (!err && file)
2794 {
2795 ::fprintf(file, "digraph %s {\n\t\trankdir=LR;\n\t\tconcentrate=true;\n\t\tnode [fontname=\"ArialNarrow\"];\n\t\tedge [fontname=\"Courier\"];\n\n\t\tinit [root=true,peripheries=0,label=\"%s\",fontname=\"Courier\"];\n\t\tinit -> N%p;\n", opt_.n.empty() ? "FSM" : opt_.n.c_str(), opt_.n.c_str(), (void*)start);
2796 for (const DFA::State *state = start; state; state = state->next)
2797 {
2798 if (state == start)
2799 ::fprintf(file, "\n/*START*/\t");
2800 if (state->redo)
2801 ::fprintf(file, "\n/*REDO*/\t");
2802 else if (state->accept)
2803 ::fprintf(file, "\n/*ACCEPT %u*/\t", state->accept);
2804 for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2805 ::fprintf(file, "\n/*HEAD %u*/\t", *i);
2806 for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2807 ::fprintf(file, "\n/*TAIL %u*/\t", *i);
2808 if (state != start && !state->accept && state->heads.empty() && state->tails.empty())
2809 ::fprintf(file, "\n/*STATE*/\t");
2810 ::fprintf(file, "N%p [label=\"", (void*)state);
2811 #ifdef DEBUG
2812 size_t k = 1;
2813 size_t n = std::sqrt(state->size()) + 0.5;
2814 const char *sep = "";
2815 for (Positions::const_iterator i = state->begin(); i != state->end(); ++i)
2816 {
2817 ::fprintf(file, "%s", sep);
2818 if (i->accept())
2819 {
2820 ::fprintf(file, "(%u)", i->accepts());
2821 }
2822 else
2823 {
2824 if (i->iter())
2825 ::fprintf(file, "%u.", i->iter());
2826 ::fprintf(file, "%u", i->loc());
2827 }
2828 if (i->lazy())
2829 ::fprintf(file, "?%u", i->lazy());
2830 if (i->anchor())
2831 ::fprintf(file, "^");
2832 if (i->greedy())
2833 ::fprintf(file, "!");
2834 if (i->ticked())
2835 ::fprintf(file, "'");
2836 if (i->negate())
2837 ::fprintf(file, "-");
2838 if (k++ % n)
2839 sep = " ";
2840 else
2841 sep = "\\n";
2842 }
2843 if ((state->accept && !state->redo) || !state->heads.empty() || !state->tails.empty())
2844 ::fprintf(file, "\\n");
2845 #endif
2846 if (state->accept > 0 && !state->redo)
2847 ::fprintf(file, "[%u]", state->accept);
2848 for (Lookaheads::const_iterator i = state->tails.begin(); i != state->tails.end(); ++i)
2849 ::fprintf(file, "%u>", *i);
2850 for (Lookaheads::const_iterator i = state->heads.begin(); i != state->heads.end(); ++i)
2851 ::fprintf(file, "<%u", *i);
2852 if (state->redo)
2853 ::fprintf(file, "\",style=dashed,peripheries=1];\n");
2854 else if (state->accept > 0)
2855 ::fprintf(file, "\",peripheries=2];\n");
2856 else if (!state->heads.empty())
2857 ::fprintf(file, "\",style=dashed,peripheries=2];\n");
2858 else
2859 ::fprintf(file, "\"];\n");
2860 for (DFA::State::Edges::const_iterator i = state->edges.begin(); i != state->edges.end(); ++i)
2861 {
2862 #if WITH_COMPACT_DFA == -1
2863 Char lo = i->first;
2864 Char hi = i->second.first;
2865 #else
2866 Char hi = i->first;
2867 Char lo = i->second.first;
2868 #endif
2869 if (!is_meta(lo))
2870 {
2871 ::fprintf(file, "\t\tN%p -> N%p [label=\"", (void*)state, (void*)i->second.second);
2872 if (lo >= '\a' && lo <= '\r')
2873 ::fprintf(file, "\\\\%c", "abtnvfr"[lo - '\a']);
2874 else if (lo == '"')
2875 ::fprintf(file, "\\\"");
2876 else if (lo == '\\')
2877 ::fprintf(file, "\\\\");
2878 else if (std::isgraph(lo))
2879 ::fprintf(file, "%c", lo);
2880 else if (lo < 8)
2881 ::fprintf(file, "\\\\%u", lo);
2882 else
2883 ::fprintf(file, "\\\\x%02x", lo);
2884 if (lo != hi)
2885 {
2886 ::fprintf(file, "-");
2887 if (hi >= '\a' && hi <= '\r')
2888 ::fprintf(file, "\\\\%c", "abtnvfr"[hi - '\a']);
2889 else if (hi == '"')
2890 ::fprintf(file, "\\\"");
2891 else if (hi == '\\')
2892 ::fprintf(file, "\\\\");
2893 else if (std::isgraph(hi))
2894 ::fprintf(file, "%c", hi);
2895 else if (hi < 8)
2896 ::fprintf(file, "\\\\%u", hi);
2897 else
2898 ::fprintf(file, "\\\\x%02x", hi);
2899 }
2900 ::fprintf(file, "\"];\n");
2901 }
2902 else
2903 {
2904 do
2905 {
2906 ::fprintf(file, "\t\tN%p -> N%p [label=\"%s\",style=\"dashed\"];\n", (void*)state, (void*)i->second.second, meta_label[lo - META_MIN]);
2907 } while (++lo <= hi);
2908 }
2909 }
2910 if (state->redo)
2911 ::fprintf(file, "\t\tN%p -> R%p;\n\t\tR%p [peripheries=0,label=\"redo\"];\n", (void*)state, (void*)state, (void*)state);
2912 }
2913 ::fprintf(file, "}\n");
2914 if (file != stdout)
2915 ::fclose(file);
2916 }
2917 }
2918 }
2919 }
2920
export_code() const2921 void Pattern::export_code() const
2922 {
2923 if (nop_ == 0)
2924 return;
2925 if (opt_.o)
2926 return;
2927 for (std::vector<std::string>::const_iterator i = opt_.f.begin(); i != opt_.f.end(); ++i)
2928 {
2929 const std::string& filename = *i;
2930 size_t len = filename.length();
2931 if ((len > 2 && filename.compare(len - 2, 2, ".h" ) == 0)
2932 || (len > 4 && filename.compare(len - 4, 4, ".hpp") == 0)
2933 || (len > 4 && filename.compare(len - 4, 4, ".cpp") == 0)
2934 || (len > 3 && filename.compare(len - 3, 3, ".cc" ) == 0))
2935 {
2936 FILE *file = NULL;
2937 int err = 0;
2938 if (filename.compare(0, 7, "stdout.") == 0)
2939 file = stdout;
2940 else if (filename.at(0) == '+')
2941 err = reflex::fopen_s(&file, filename.c_str() + 1, "a");
2942 else
2943 err = reflex::fopen_s(&file, filename.c_str(), "w");
2944 if (!err && file)
2945 {
2946 ::fprintf(file, "#ifndef REFLEX_CODE_DECL\n#include <reflex/pattern.h>\n#define REFLEX_CODE_DECL const reflex::Pattern::Opcode\n#endif\n\n");
2947 write_namespace_open(file);
2948 ::fprintf(file, "extern REFLEX_CODE_DECL reflex_code_%s[%u] =\n{\n", opt_.n.empty() ? "FSM" : opt_.n.c_str(), nop_);
2949 for (Index i = 0; i < nop_; ++i)
2950 {
2951 Opcode opcode = opc_[i];
2952 Char lo = lo_of(opcode);
2953 Char hi = hi_of(opcode);
2954 ::fprintf(file, " 0x%08X, // %u: ", opcode, i);
2955 if (is_opcode_redo(opcode))
2956 {
2957 ::fprintf(file, "REDO\n");
2958 }
2959 else if (is_opcode_take(opcode))
2960 {
2961 ::fprintf(file, "TAKE %u\n", long_index_of(opcode));
2962 }
2963 else if (is_opcode_tail(opcode))
2964 {
2965 ::fprintf(file, "TAIL %u\n", long_index_of(opcode));
2966 }
2967 else if (is_opcode_head(opcode))
2968 {
2969 ::fprintf(file, "HEAD %u\n", long_index_of(opcode));
2970 }
2971 else if (is_opcode_halt(opcode))
2972 {
2973 ::fprintf(file, "HALT\n");
2974 }
2975 else
2976 {
2977 Index index = index_of(opcode);
2978 if (index == Const::HALT)
2979 {
2980 ::fprintf(file, "HALT ON ");
2981 }
2982 else
2983 {
2984 if (index == Const::LONG)
2985 {
2986 opcode = opc_[++i];
2987 index = long_index_of(opcode);
2988 ::fprintf(file, "GOTO\n 0x%08X, // %u: FAR %u ON ", opcode, i, index);
2989 }
2990 else
2991 {
2992 ::fprintf(file, "GOTO %u ON ", index);
2993 }
2994 }
2995 if (!is_meta(lo))
2996 {
2997 print_char(file, lo, true);
2998 if (lo != hi)
2999 {
3000 ::fprintf(file, "-");
3001 print_char(file, hi, true);
3002 }
3003 }
3004 else
3005 {
3006 ::fprintf(file, "%s", meta_label[lo - META_MIN]);
3007 }
3008 ::fprintf(file, "\n");
3009 }
3010 }
3011 ::fprintf(file, "};\n\n");
3012 if (opt_.p)
3013 write_predictor(file);
3014 write_namespace_close(file);
3015 if (file != stdout)
3016 ::fclose(file);
3017 }
3018 }
3019 }
3020 }
3021
predict_match_dfa(DFA::State * start)3022 void Pattern::predict_match_dfa(DFA::State *start)
3023 {
3024 DBGLOG("BEGIN Pattern::predict_match_dfa()");
3025 DFA::State *state = start;
3026 one_ = true;
3027 while (state->accept == 0)
3028 {
3029 if (state->edges.size() != 1)
3030 {
3031 one_ = false;
3032 break;
3033 }
3034 Char lo = state->edges.begin()->first;
3035 if (!is_meta(lo) && lo == state->edges.begin()->second.first)
3036 {
3037 if (lo != state->edges.begin()->second.first)
3038 break;
3039 if (len_ >= 255)
3040 {
3041 one_ = false;
3042 break;
3043 }
3044 pre_[len_++] = static_cast<uint8_t>(lo);
3045 }
3046 else
3047 {
3048 one_ = false;
3049 break;
3050 }
3051 DFA::State *next = state->edges.begin()->second.second;
3052 if (next == NULL)
3053 {
3054 one_ = false;
3055 break;
3056 }
3057 state = next;
3058 }
3059 if (state != NULL && state->accept > 0 && !state->edges.empty())
3060 one_ = false;
3061 min_ = 0;
3062 std::memset(bit_, 0xFF, sizeof(bit_));
3063 std::memset(pmh_, 0xFF, sizeof(pmh_));
3064 std::memset(pma_, 0xFF, sizeof(pma_));
3065 if (state != NULL && state->accept == 0)
3066 {
3067 gen_predict_match(state);
3068 #ifdef DEBUG
3069 for (Char i = 0; i < 256; ++i)
3070 {
3071 if (bit_[i] != 0xFF)
3072 {
3073 if (isprint(i))
3074 DBGLOGN("bit['%c'] = %02x", i, bit_[i]);
3075 else
3076 DBGLOGN("bit[%3d] = %02x", i, bit_[i]);
3077 }
3078 }
3079 for (Hash i = 0; i < Const::HASH; ++i)
3080 {
3081 if (pmh_[i] != 0xFF)
3082 {
3083 if (isprint(pmh_[i]))
3084 DBGLOGN("pmh['%c'] = %02x", i, pmh_[i]);
3085 else
3086 DBGLOGN("pmh[%3d] = %02x", i, pmh_[i]);
3087 }
3088 }
3089 for (Hash i = 0; i < Const::HASH; ++i)
3090 {
3091 if (pma_[i] != 0xFF)
3092 {
3093 if (isprint(pma_[i]))
3094 DBGLOGN("pma['%c'] = %02x", i, pma_[i]);
3095 else
3096 DBGLOGN("pma[%3d] = %02x", i, pma_[i]);
3097 }
3098 }
3099 #endif
3100 }
3101 DBGLOGN("min = %zu", min_);
3102 DBGLOG("END Pattern::predict_match_dfa()");
3103 }
3104
gen_predict_match(DFA::State * state)3105 void Pattern::gen_predict_match(DFA::State *state)
3106 {
3107 min_ = 8;
3108 std::map<DFA::State*,ORanges<Hash> > states[8];
3109 gen_predict_match_transitions(state, states[0]);
3110 for (int level = 1; level < 8; ++level)
3111 for (std::map<DFA::State*,ORanges<Hash> >::iterator from = states[level - 1].begin(); from != states[level - 1].end(); ++from)
3112 gen_predict_match_transitions(level, from->first, from->second, states[level]);
3113 for (Char i = 0; i < 256; ++i)
3114 bit_[i] &= (1 << min_) - 1;
3115 }
3116
gen_predict_match_transitions(DFA::State * state,std::map<DFA::State *,ORanges<Hash>> & states)3117 void Pattern::gen_predict_match_transitions(DFA::State *state, std::map<DFA::State*,ORanges<Hash> >& states)
3118 {
3119 for (DFA::State::Edges::const_iterator edge = state->edges.begin(); edge != state->edges.end(); ++edge)
3120 {
3121 Char lo = edge->first;
3122 if (is_meta(lo))
3123 {
3124 min_ = 0;
3125 break;
3126 }
3127 DFA::State *next = edge->second.second;
3128 bool accept = (next == NULL || next->accept > 0);
3129 if (!accept)
3130 {
3131 for (DFA::State::Edges::const_iterator e = next->edges.begin(); e != next->edges.end(); ++e)
3132 {
3133 if (is_meta(e->first))
3134 {
3135 if (e == next->edges.begin())
3136 next = NULL;
3137 accept = true;
3138 break;
3139 }
3140 }
3141 }
3142 else if (next != NULL && next->edges.empty())
3143 {
3144 next = NULL;
3145 }
3146 if (accept)
3147 min_ = 1;
3148 Char hi = edge->second.first;
3149 while (lo <= hi)
3150 {
3151 bit_[lo] &= ~1;
3152 pmh_[lo] &= ~1;
3153 if (accept)
3154 pma_[lo] &= ~(1 << 7);
3155 pma_[lo] &= ~(1 << 6);
3156 if (next != NULL)
3157 states[next].insert(hash(lo));
3158 ++lo;
3159 }
3160 }
3161 }
3162
gen_predict_match_transitions(size_t level,DFA::State * state,ORanges<Hash> & labels,std::map<DFA::State *,ORanges<Hash>> & states)3163 void Pattern::gen_predict_match_transitions(size_t level, DFA::State *state, ORanges<Hash>& labels, std::map<DFA::State*,ORanges<Hash> >& states)
3164 {
3165 for (DFA::State::Edges::const_iterator edge = state->edges.begin(); edge != state->edges.end(); ++edge)
3166 {
3167 Char lo = edge->first;
3168 if (is_meta(lo))
3169 break;
3170 DFA::State *next = level < 7 ? edge->second.second : NULL;
3171 bool accept = next == NULL || next->accept > 0;
3172 if (!accept)
3173 {
3174 for (DFA::State::Edges::const_iterator e = next->edges.begin(); e != next->edges.end(); ++e)
3175 {
3176 if (is_meta(e->first))
3177 {
3178 if (e == next->edges.begin())
3179 next = NULL;
3180 accept = true;
3181 break;
3182 }
3183 }
3184 }
3185 else if (next != NULL && next->edges.empty())
3186 {
3187 next = NULL;
3188 }
3189 if (accept && min_ > level)
3190 min_ = level + 1;
3191 if (level < 4 || level <= min_)
3192 {
3193 Char hi = edge->second.first;
3194 if (level <= min_)
3195 while (lo <= hi)
3196 bit_[lo++] &= ~(1 << level);
3197 for (ORanges<Hash>::const_iterator label = labels.begin(); label != labels.end(); ++label)
3198 {
3199 Hash label_hi = label->second - 1;
3200 for (Hash label_lo = label->first; label_lo <= label_hi; ++label_lo)
3201 {
3202 for (lo = edge->first; lo <= hi; ++lo)
3203 {
3204 Hash h = hash(label_lo, static_cast<uint8_t>(lo));
3205 pmh_[h] &= ~(1 << level);
3206 if (level < 4)
3207 {
3208 if (level == 3 || accept)
3209 pma_[h] &= ~(1 << (7 - 2 * level));
3210 pma_[h] &= ~(1 << (6 - 2 * level));
3211 }
3212 if (next != NULL)
3213 states[next].insert(hash(h));
3214 }
3215 }
3216 }
3217 }
3218 }
3219 }
3220
write_predictor(FILE * file) const3221 void Pattern::write_predictor(FILE *file) const
3222 {
3223 ::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (min_ > 1 && len_ == 0) * 256 + (min_ > 0) * Const::HASH);
3224 ::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4))));
3225 for (size_t i = 0; i < len_; ++i)
3226 ::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(pre_[i]));
3227 if (min_ > 0)
3228 {
3229 if (min_ > 1 && len_ == 0)
3230 {
3231 for (Char i = 0; i < 256; ++i)
3232 ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~bit_[i]));
3233 }
3234 if (min_ >= 4)
3235 {
3236 for (Hash i = 0; i < Const::HASH; ++i)
3237 ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pmh_[i]));
3238 }
3239 else
3240 {
3241 for (Hash i = 0; i < Const::HASH; ++i)
3242 ::fprintf(file, "%s%3hhu,", (i & 0xF) ? "" : "\n ", static_cast<uint8_t>(~pma_[i]));
3243 }
3244 }
3245 ::fprintf(file, "\n};\n\n");
3246 }
3247
write_namespace_open(FILE * file) const3248 void Pattern::write_namespace_open(FILE *file) const
3249 {
3250 if (opt_.z.empty())
3251 return;
3252
3253 const std::string& s = opt_.z;
3254 size_t i = 0, j;
3255 while ((j = s.find("::", i)) != std::string::npos)
3256 {
3257 ::fprintf(file, "namespace %s {\n", s.substr(i, j - i).c_str());
3258 i = j + 2;
3259 }
3260 ::fprintf(file, "namespace %s {\n\n", s.substr(i).c_str());
3261 }
3262
write_namespace_close(FILE * file) const3263 void Pattern::write_namespace_close(FILE *file) const
3264 {
3265 if (opt_.z.empty())
3266 return;
3267
3268 const std::string& s = opt_.z;
3269 size_t i = 0, j;
3270 while ((j = s.find("::", i)) != std::string::npos)
3271 {
3272 ::fprintf(file, "} // namespace %s\n\n", s.substr(i, j - i).c_str());
3273 i = j + 2;
3274 }
3275 ::fprintf(file, "} // namespace %s\n\n", s.substr(i).c_str());
3276 }
3277
3278 } // namespace reflex
3279