1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7 
8 #include "re2/prog.h"
9 
10 #include <stdint.h>
11 #include <string.h>
12 #include <algorithm>
13 #include <memory>
14 #include <utility>
15 
16 #include "util/util.h"
17 #include "util/logging.h"
18 #include "util/strutil.h"
19 #include "re2/bitmap256.h"
20 #include "re2/stringpiece.h"
21 
22 namespace re2 {
23 
24 // Constructors per Inst opcode
25 
InitAlt(uint32_t out,uint32_t out1)26 void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
27   DCHECK_EQ(out_opcode_, 0);
28   set_out_opcode(out, kInstAlt);
29   out1_ = out1;
30 }
31 
InitByteRange(int lo,int hi,int foldcase,uint32_t out)32 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
33   DCHECK_EQ(out_opcode_, 0);
34   set_out_opcode(out, kInstByteRange);
35   lo_ = lo & 0xFF;
36   hi_ = hi & 0xFF;
37   hint_foldcase_ = foldcase&1;
38 }
39 
InitCapture(int cap,uint32_t out)40 void Prog::Inst::InitCapture(int cap, uint32_t out) {
41   DCHECK_EQ(out_opcode_, 0);
42   set_out_opcode(out, kInstCapture);
43   cap_ = cap;
44 }
45 
InitEmptyWidth(EmptyOp empty,uint32_t out)46 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
47   DCHECK_EQ(out_opcode_, 0);
48   set_out_opcode(out, kInstEmptyWidth);
49   empty_ = empty;
50 }
51 
InitMatch(int32_t id)52 void Prog::Inst::InitMatch(int32_t id) {
53   DCHECK_EQ(out_opcode_, 0);
54   set_opcode(kInstMatch);
55   match_id_ = id;
56 }
57 
InitNop(uint32_t out)58 void Prog::Inst::InitNop(uint32_t out) {
59   DCHECK_EQ(out_opcode_, 0);
60   set_opcode(kInstNop);
61 }
62 
InitFail()63 void Prog::Inst::InitFail() {
64   DCHECK_EQ(out_opcode_, 0);
65   set_opcode(kInstFail);
66 }
67 
Dump()68 std::string Prog::Inst::Dump() {
69   switch (opcode()) {
70     default:
71       return StringPrintf("opcode %d", static_cast<int>(opcode()));
72 
73     case kInstAlt:
74       return StringPrintf("alt -> %d | %d", out(), out1_);
75 
76     case kInstAltMatch:
77       return StringPrintf("altmatch -> %d | %d", out(), out1_);
78 
79     case kInstByteRange:
80       return StringPrintf("byte%s [%02x-%02x] %d -> %d",
81                           foldcase() ? "/i" : "",
82                           lo_, hi_, hint(), out());
83 
84     case kInstCapture:
85       return StringPrintf("capture %d -> %d", cap_, out());
86 
87     case kInstEmptyWidth:
88       return StringPrintf("emptywidth %#x -> %d",
89                           static_cast<int>(empty_), out());
90 
91     case kInstMatch:
92       return StringPrintf("match! %d", match_id());
93 
94     case kInstNop:
95       return StringPrintf("nop -> %d", out());
96 
97     case kInstFail:
98       return StringPrintf("fail");
99   }
100 }
101 
Prog()102 Prog::Prog()
103   : anchor_start_(false),
104     anchor_end_(false),
105     reversed_(false),
106     did_flatten_(false),
107     did_onepass_(false),
108     start_(0),
109     start_unanchored_(0),
110     size_(0),
111     bytemap_range_(0),
112     first_byte_(-1),
113     list_count_(0),
114     dfa_mem_(0),
115     dfa_first_(NULL),
116     dfa_longest_(NULL) {
117 }
118 
~Prog()119 Prog::~Prog() {
120   DeleteDFA(dfa_longest_);
121   DeleteDFA(dfa_first_);
122 }
123 
124 typedef SparseSet Workq;
125 
AddToQueue(Workq * q,int id)126 static inline void AddToQueue(Workq* q, int id) {
127   if (id != 0)
128     q->insert(id);
129 }
130 
ProgToString(Prog * prog,Workq * q)131 static std::string ProgToString(Prog* prog, Workq* q) {
132   std::string s;
133   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
134     int id = *i;
135     Prog::Inst* ip = prog->inst(id);
136     s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
137     AddToQueue(q, ip->out());
138     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
139       AddToQueue(q, ip->out1());
140   }
141   return s;
142 }
143 
FlattenedProgToString(Prog * prog,int start)144 static std::string FlattenedProgToString(Prog* prog, int start) {
145   std::string s;
146   for (int id = start; id < prog->size(); id++) {
147     Prog::Inst* ip = prog->inst(id);
148     if (ip->last())
149       s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
150     else
151       s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
152   }
153   return s;
154 }
155 
Dump()156 std::string Prog::Dump() {
157   if (did_flatten_)
158     return FlattenedProgToString(this, start_);
159 
160   Workq q(size_);
161   AddToQueue(&q, start_);
162   return ProgToString(this, &q);
163 }
164 
DumpUnanchored()165 std::string Prog::DumpUnanchored() {
166   if (did_flatten_)
167     return FlattenedProgToString(this, start_unanchored_);
168 
169   Workq q(size_);
170   AddToQueue(&q, start_unanchored_);
171   return ProgToString(this, &q);
172 }
173 
DumpByteMap()174 std::string Prog::DumpByteMap() {
175   std::string map;
176   for (int c = 0; c < 256; c++) {
177     int b = bytemap_[c];
178     int lo = c;
179     while (c < 256-1 && bytemap_[c+1] == b)
180       c++;
181     int hi = c;
182     map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
183   }
184   return map;
185 }
186 
187 // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)188 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
189   for (;;) {
190     switch (ip->opcode()) {
191       default:
192         LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
193         return false;
194 
195       case kInstAlt:
196       case kInstAltMatch:
197       case kInstByteRange:
198       case kInstFail:
199       case kInstEmptyWidth:
200         return false;
201 
202       case kInstCapture:
203       case kInstNop:
204         ip = prog->inst(ip->out());
205         break;
206 
207       case kInstMatch:
208         return true;
209     }
210   }
211 }
212 
213 // Peep-hole optimizer.
Optimize()214 void Prog::Optimize() {
215   Workq q(size_);
216 
217   // Eliminate nops.  Most are taken out during compilation
218   // but a few are hard to avoid.
219   q.clear();
220   AddToQueue(&q, start_);
221   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
222     int id = *i;
223 
224     Inst* ip = inst(id);
225     int j = ip->out();
226     Inst* jp;
227     while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
228       j = jp->out();
229     }
230     ip->set_out(j);
231     AddToQueue(&q, ip->out());
232 
233     if (ip->opcode() == kInstAlt) {
234       j = ip->out1();
235       while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
236         j = jp->out();
237       }
238       ip->out1_ = j;
239       AddToQueue(&q, ip->out1());
240     }
241   }
242 
243   // Insert kInstAltMatch instructions
244   // Look for
245   //   ip: Alt -> j | k
246   //	  j: ByteRange [00-FF] -> ip
247   //    k: Match
248   // or the reverse (the above is the greedy one).
249   // Rewrite Alt to AltMatch.
250   q.clear();
251   AddToQueue(&q, start_);
252   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
253     int id = *i;
254     Inst* ip = inst(id);
255     AddToQueue(&q, ip->out());
256     if (ip->opcode() == kInstAlt)
257       AddToQueue(&q, ip->out1());
258 
259     if (ip->opcode() == kInstAlt) {
260       Inst* j = inst(ip->out());
261       Inst* k = inst(ip->out1());
262       if (j->opcode() == kInstByteRange && j->out() == id &&
263           j->lo() == 0x00 && j->hi() == 0xFF &&
264           IsMatch(this, k)) {
265         ip->set_opcode(kInstAltMatch);
266         continue;
267       }
268       if (IsMatch(this, j) &&
269           k->opcode() == kInstByteRange && k->out() == id &&
270           k->lo() == 0x00 && k->hi() == 0xFF) {
271         ip->set_opcode(kInstAltMatch);
272       }
273     }
274   }
275 }
276 
EmptyFlags(const StringPiece & text,const char * p)277 uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
278   int flags = 0;
279 
280   // ^ and \A
281   if (p == text.data())
282     flags |= kEmptyBeginText | kEmptyBeginLine;
283   else if (p[-1] == '\n')
284     flags |= kEmptyBeginLine;
285 
286   // $ and \z
287   if (p == text.data() + text.size())
288     flags |= kEmptyEndText | kEmptyEndLine;
289   else if (p < text.data() + text.size() && p[0] == '\n')
290     flags |= kEmptyEndLine;
291 
292   // \b and \B
293   if (p == text.data() && p == text.data() + text.size()) {
294     // no word boundary here
295   } else if (p == text.data()) {
296     if (IsWordChar(p[0]))
297       flags |= kEmptyWordBoundary;
298   } else if (p == text.data() + text.size()) {
299     if (IsWordChar(p[-1]))
300       flags |= kEmptyWordBoundary;
301   } else {
302     if (IsWordChar(p[-1]) != IsWordChar(p[0]))
303       flags |= kEmptyWordBoundary;
304   }
305   if (!(flags & kEmptyWordBoundary))
306     flags |= kEmptyNonWordBoundary;
307 
308   return flags;
309 }
310 
311 // ByteMapBuilder implements a coloring algorithm.
312 //
313 // The first phase is a series of "mark and merge" batches: we mark one or more
314 // [lo-hi] ranges, then merge them into our internal state. Batching is not for
315 // performance; rather, it means that the ranges are treated indistinguishably.
316 //
317 // Internally, the ranges are represented using a bitmap that stores the splits
318 // and a vector that stores the colors; both of them are indexed by the ranges'
319 // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
320 // hi (if not already split), then recolor each range in between. The color map
321 // (i.e. from the old color to the new color) is maintained for the lifetime of
322 // the batch and so underpins this somewhat obscure approach to set operations.
323 //
324 // The second phase builds the bytemap from our internal state: we recolor each
325 // range, then store the new color (which is now the byte class) in each of the
326 // corresponding array elements. Finally, we output the number of byte classes.
327 class ByteMapBuilder {
328  public:
ByteMapBuilder()329   ByteMapBuilder() {
330     // Initial state: the [0-255] range has color 256.
331     // This will avoid problems during the second phase,
332     // in which we assign byte classes numbered from 0.
333     splits_.Set(255);
334     colors_[255] = 256;
335     nextcolor_ = 257;
336   }
337 
338   void Mark(int lo, int hi);
339   void Merge();
340   void Build(uint8_t* bytemap, int* bytemap_range);
341 
342  private:
343   int Recolor(int oldcolor);
344 
345   Bitmap256 splits_;
346   int colors_[256];
347   int nextcolor_;
348   std::vector<std::pair<int, int>> colormap_;
349   std::vector<std::pair<int, int>> ranges_;
350 
351   ByteMapBuilder(const ByteMapBuilder&) = delete;
352   ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
353 };
354 
Mark(int lo,int hi)355 void ByteMapBuilder::Mark(int lo, int hi) {
356   DCHECK_GE(lo, 0);
357   DCHECK_GE(hi, 0);
358   DCHECK_LE(lo, 255);
359   DCHECK_LE(hi, 255);
360   DCHECK_LE(lo, hi);
361 
362   // Ignore any [0-255] ranges. They cause us to recolor every range, which
363   // has no effect on the eventual result and is therefore a waste of time.
364   if (lo == 0 && hi == 255)
365     return;
366 
367   ranges_.emplace_back(lo, hi);
368 }
369 
Merge()370 void ByteMapBuilder::Merge() {
371   for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
372        it != ranges_.end();
373        ++it) {
374     int lo = it->first-1;
375     int hi = it->second;
376 
377     if (0 <= lo && !splits_.Test(lo)) {
378       splits_.Set(lo);
379       int next = splits_.FindNextSetBit(lo+1);
380       colors_[lo] = colors_[next];
381     }
382     if (!splits_.Test(hi)) {
383       splits_.Set(hi);
384       int next = splits_.FindNextSetBit(hi+1);
385       colors_[hi] = colors_[next];
386     }
387 
388     int c = lo+1;
389     while (c < 256) {
390       int next = splits_.FindNextSetBit(c);
391       colors_[next] = Recolor(colors_[next]);
392       if (next == hi)
393         break;
394       c = next+1;
395     }
396   }
397   colormap_.clear();
398   ranges_.clear();
399 }
400 
Build(uint8_t * bytemap,int * bytemap_range)401 void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
402   // Assign byte classes numbered from 0.
403   nextcolor_ = 0;
404 
405   int c = 0;
406   while (c < 256) {
407     int next = splits_.FindNextSetBit(c);
408     uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
409     while (c <= next) {
410       bytemap[c] = b;
411       c++;
412     }
413   }
414 
415   *bytemap_range = nextcolor_;
416 }
417 
Recolor(int oldcolor)418 int ByteMapBuilder::Recolor(int oldcolor) {
419   // Yes, this is a linear search. There can be at most 256
420   // colors and there will typically be far fewer than that.
421   // Also, we need to consider keys *and* values in order to
422   // avoid recoloring a given range more than once per batch.
423   std::vector<std::pair<int, int>>::const_iterator it =
424       std::find_if(colormap_.begin(), colormap_.end(),
425                    [=](const std::pair<int, int>& kv) -> bool {
426                      return kv.first == oldcolor || kv.second == oldcolor;
427                    });
428   if (it != colormap_.end())
429     return it->second;
430   int newcolor = nextcolor_;
431   nextcolor_++;
432   colormap_.emplace_back(oldcolor, newcolor);
433   return newcolor;
434 }
435 
ComputeByteMap()436 void Prog::ComputeByteMap() {
437   // Fill in bytemap with byte classes for the program.
438   // Ranges of bytes that are treated indistinguishably
439   // will be mapped to a single byte class.
440   ByteMapBuilder builder;
441 
442   // Don't repeat the work for ^ and $.
443   bool marked_line_boundaries = false;
444   // Don't repeat the work for \b and \B.
445   bool marked_word_boundaries = false;
446 
447   for (int id = 0; id < size(); id++) {
448     Inst* ip = inst(id);
449     if (ip->opcode() == kInstByteRange) {
450       int lo = ip->lo();
451       int hi = ip->hi();
452       builder.Mark(lo, hi);
453       if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
454         int foldlo = lo;
455         int foldhi = hi;
456         if (foldlo < 'a')
457           foldlo = 'a';
458         if (foldhi > 'z')
459           foldhi = 'z';
460         if (foldlo <= foldhi) {
461           foldlo += 'A' - 'a';
462           foldhi += 'A' - 'a';
463           builder.Mark(foldlo, foldhi);
464         }
465       }
466       // If this Inst is not the last Inst in its list AND the next Inst is
467       // also a ByteRange AND the Insts have the same out, defer the merge.
468       if (!ip->last() &&
469           inst(id+1)->opcode() == kInstByteRange &&
470           ip->out() == inst(id+1)->out())
471         continue;
472       builder.Merge();
473     } else if (ip->opcode() == kInstEmptyWidth) {
474       if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
475           !marked_line_boundaries) {
476         builder.Mark('\n', '\n');
477         builder.Merge();
478         marked_line_boundaries = true;
479       }
480       if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
481           !marked_word_boundaries) {
482         // We require two batches here: the first for ranges that are word
483         // characters, the second for ranges that are not word characters.
484         for (bool isword : {true, false}) {
485           int j;
486           for (int i = 0; i < 256; i = j) {
487             for (j = i + 1; j < 256 &&
488                             Prog::IsWordChar(static_cast<uint8_t>(i)) ==
489                                 Prog::IsWordChar(static_cast<uint8_t>(j));
490                  j++)
491               ;
492             if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
493               builder.Mark(i, j - 1);
494           }
495           builder.Merge();
496         }
497         marked_word_boundaries = true;
498       }
499     }
500   }
501 
502   builder.Build(bytemap_, &bytemap_range_);
503 
504   if (0) {  // For debugging, use trivial bytemap.
505     LOG(ERROR) << "Using trivial bytemap.";
506     for (int i = 0; i < 256; i++)
507       bytemap_[i] = static_cast<uint8_t>(i);
508     bytemap_range_ = 256;
509   }
510 }
511 
512 // Prog::Flatten() implements a graph rewriting algorithm.
513 //
514 // The overall process is similar to epsilon removal, but retains some epsilon
515 // transitions: those from Capture and EmptyWidth instructions; and those from
516 // nullable subexpressions. (The latter avoids quadratic blowup in transitions
517 // in the worst case.) It might be best thought of as Alt instruction elision.
518 //
519 // In conceptual terms, it divides the Prog into "trees" of instructions, then
520 // traverses the "trees" in order to produce "lists" of instructions. A "tree"
521 // is one or more instructions that grow from one "root" instruction to one or
522 // more "leaf" instructions; if a "tree" has exactly one instruction, then the
523 // "root" is also the "leaf". In most cases, a "root" is the successor of some
524 // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
525 // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
526 // EmptyWidth or Match instruction. However, this is insufficient for handling
527 // nested nullable subexpressions correctly, so in some cases, a "root" is the
528 // dominator of the instructions reachable from some "successor root" (i.e. it
529 // has an unreachable predecessor) and is considered a "dominator root". Since
530 // only Alt instructions can be "dominator roots" (other instructions would be
531 // "leaves"), only Alt instructions are required to be marked as predecessors.
532 //
533 // Dividing the Prog into "trees" comprises two passes: marking the "successor
534 // roots" and the predecessors; and marking the "dominator roots". Sorting the
535 // "successor roots" by their bytecode offsets enables iteration in order from
536 // greatest to least during the second pass; by working backwards in this case
537 // and flooding the graph no further than "leaves" and already marked "roots",
538 // it becomes possible to mark "dominator roots" without doing excessive work.
539 //
540 // Traversing the "trees" is just iterating over the "roots" in order of their
541 // marking and flooding the graph no further than "leaves" and "roots". When a
542 // "leaf" is reached, the instruction is copied with its successor remapped to
543 // its "root" number. When a "root" is reached, a Nop instruction is generated
544 // with its successor remapped similarly. As each "list" is produced, its last
545 // instruction is marked as such. After all of the "lists" have been produced,
546 // a pass over their instructions remaps their successors to bytecode offsets.
Flatten()547 void Prog::Flatten() {
548   if (did_flatten_)
549     return;
550   did_flatten_ = true;
551 
552   // Scratch structures. It's important that these are reused by functions
553   // that we call in loops because they would thrash the heap otherwise.
554   SparseSet reachable(size());
555   std::vector<int> stk;
556   stk.reserve(size());
557 
558   // First pass: Marks "successor roots" and predecessors.
559   // Builds the mapping from inst-ids to root-ids.
560   SparseArray<int> rootmap(size());
561   SparseArray<int> predmap(size());
562   std::vector<std::vector<int>> predvec;
563   MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
564 
565   // Second pass: Marks "dominator roots".
566   SparseArray<int> sorted(rootmap);
567   std::sort(sorted.begin(), sorted.end(), sorted.less);
568   for (SparseArray<int>::const_iterator i = sorted.end() - 1;
569        i != sorted.begin();
570        --i) {
571     if (i->index() != start_unanchored() && i->index() != start())
572       MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
573   }
574 
575   // Third pass: Emits "lists". Remaps outs to root-ids.
576   // Builds the mapping from root-ids to flat-ids.
577   std::vector<int> flatmap(rootmap.size());
578   std::vector<Inst> flat;
579   flat.reserve(size());
580   for (SparseArray<int>::const_iterator i = rootmap.begin();
581        i != rootmap.end();
582        ++i) {
583     flatmap[i->value()] = static_cast<int>(flat.size());
584     EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
585     flat.back().set_last();
586     // We have the bounds of the "list", so this is the
587     // most convenient point at which to compute hints.
588     ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
589   }
590 
591   list_count_ = static_cast<int>(flatmap.size());
592   for (int i = 0; i < kNumInst; i++)
593     inst_count_[i] = 0;
594 
595   // Fourth pass: Remaps outs to flat-ids.
596   // Counts instructions by opcode.
597   for (int id = 0; id < static_cast<int>(flat.size()); id++) {
598     Inst* ip = &flat[id];
599     if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
600       ip->set_out(flatmap[ip->out()]);
601     inst_count_[ip->opcode()]++;
602   }
603 
604   int total = 0;
605   for (int i = 0; i < kNumInst; i++)
606     total += inst_count_[i];
607   DCHECK_EQ(total, static_cast<int>(flat.size()));
608 
609   // Remap start_unanchored and start.
610   if (start_unanchored() == 0) {
611     DCHECK_EQ(start(), 0);
612   } else if (start_unanchored() == start()) {
613     set_start_unanchored(flatmap[1]);
614     set_start(flatmap[1]);
615   } else {
616     set_start_unanchored(flatmap[1]);
617     set_start(flatmap[2]);
618   }
619 
620   // Finally, replace the old instructions with the new instructions.
621   size_ = static_cast<int>(flat.size());
622   inst_ = PODArray<Inst>(size_);
623   memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
624 
625   // Populate the list heads for BitState.
626   // 512 instructions limits the memory footprint to 1KiB.
627   if (size_ <= 512) {
628     list_heads_ = PODArray<uint16_t>(size_);
629     // 0xFF makes it more obvious if we try to look up a non-head.
630     memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
631     for (int i = 0; i < list_count_; ++i)
632       list_heads_[flatmap[i]] = i;
633   }
634 }
635 
MarkSuccessors(SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)636 void Prog::MarkSuccessors(SparseArray<int>* rootmap,
637                           SparseArray<int>* predmap,
638                           std::vector<std::vector<int>>* predvec,
639                           SparseSet* reachable, std::vector<int>* stk) {
640   // Mark the kInstFail instruction.
641   rootmap->set_new(0, rootmap->size());
642 
643   // Mark the start_unanchored and start instructions.
644   if (!rootmap->has_index(start_unanchored()))
645     rootmap->set_new(start_unanchored(), rootmap->size());
646   if (!rootmap->has_index(start()))
647     rootmap->set_new(start(), rootmap->size());
648 
649   reachable->clear();
650   stk->clear();
651   stk->push_back(start_unanchored());
652   while (!stk->empty()) {
653     int id = stk->back();
654     stk->pop_back();
655   Loop:
656     if (reachable->contains(id))
657       continue;
658     reachable->insert_new(id);
659 
660     Inst* ip = inst(id);
661     switch (ip->opcode()) {
662       default:
663         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
664         break;
665 
666       case kInstAltMatch:
667       case kInstAlt:
668         // Mark this instruction as a predecessor of each out.
669         for (int out : {ip->out(), ip->out1()}) {
670           if (!predmap->has_index(out)) {
671             predmap->set_new(out, static_cast<int>(predvec->size()));
672             predvec->emplace_back();
673           }
674           (*predvec)[predmap->get_existing(out)].emplace_back(id);
675         }
676         stk->push_back(ip->out1());
677         id = ip->out();
678         goto Loop;
679 
680       case kInstByteRange:
681       case kInstCapture:
682       case kInstEmptyWidth:
683         // Mark the out of this instruction as a "root".
684         if (!rootmap->has_index(ip->out()))
685           rootmap->set_new(ip->out(), rootmap->size());
686         id = ip->out();
687         goto Loop;
688 
689       case kInstNop:
690         id = ip->out();
691         goto Loop;
692 
693       case kInstMatch:
694       case kInstFail:
695         break;
696     }
697   }
698 }
699 
MarkDominator(int root,SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)700 void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
701                          SparseArray<int>* predmap,
702                          std::vector<std::vector<int>>* predvec,
703                          SparseSet* reachable, std::vector<int>* stk) {
704   reachable->clear();
705   stk->clear();
706   stk->push_back(root);
707   while (!stk->empty()) {
708     int id = stk->back();
709     stk->pop_back();
710   Loop:
711     if (reachable->contains(id))
712       continue;
713     reachable->insert_new(id);
714 
715     if (id != root && rootmap->has_index(id)) {
716       // We reached another "tree" via epsilon transition.
717       continue;
718     }
719 
720     Inst* ip = inst(id);
721     switch (ip->opcode()) {
722       default:
723         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
724         break;
725 
726       case kInstAltMatch:
727       case kInstAlt:
728         stk->push_back(ip->out1());
729         id = ip->out();
730         goto Loop;
731 
732       case kInstByteRange:
733       case kInstCapture:
734       case kInstEmptyWidth:
735         break;
736 
737       case kInstNop:
738         id = ip->out();
739         goto Loop;
740 
741       case kInstMatch:
742       case kInstFail:
743         break;
744     }
745   }
746 
747   for (SparseSet::const_iterator i = reachable->begin();
748        i != reachable->end();
749        ++i) {
750     int id = *i;
751     if (predmap->has_index(id)) {
752       for (int pred : (*predvec)[predmap->get_existing(id)]) {
753         if (!reachable->contains(pred)) {
754           // id has a predecessor that cannot be reached from root!
755           // Therefore, id must be a "root" too - mark it as such.
756           if (!rootmap->has_index(id))
757             rootmap->set_new(id, rootmap->size());
758         }
759       }
760     }
761   }
762 }
763 
EmitList(int root,SparseArray<int> * rootmap,std::vector<Inst> * flat,SparseSet * reachable,std::vector<int> * stk)764 void Prog::EmitList(int root, SparseArray<int>* rootmap,
765                     std::vector<Inst>* flat,
766                     SparseSet* reachable, std::vector<int>* stk) {
767   reachable->clear();
768   stk->clear();
769   stk->push_back(root);
770   while (!stk->empty()) {
771     int id = stk->back();
772     stk->pop_back();
773   Loop:
774     if (reachable->contains(id))
775       continue;
776     reachable->insert_new(id);
777 
778     if (id != root && rootmap->has_index(id)) {
779       // We reached another "tree" via epsilon transition. Emit a kInstNop
780       // instruction so that the Prog does not become quadratically larger.
781       flat->emplace_back();
782       flat->back().set_opcode(kInstNop);
783       flat->back().set_out(rootmap->get_existing(id));
784       continue;
785     }
786 
787     Inst* ip = inst(id);
788     switch (ip->opcode()) {
789       default:
790         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
791         break;
792 
793       case kInstAltMatch:
794         flat->emplace_back();
795         flat->back().set_opcode(kInstAltMatch);
796         flat->back().set_out(static_cast<int>(flat->size()));
797         flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
798         FALLTHROUGH_INTENDED;
799 
800       case kInstAlt:
801         stk->push_back(ip->out1());
802         id = ip->out();
803         goto Loop;
804 
805       case kInstByteRange:
806       case kInstCapture:
807       case kInstEmptyWidth:
808         flat->emplace_back();
809         memmove(&flat->back(), ip, sizeof *ip);
810         flat->back().set_out(rootmap->get_existing(ip->out()));
811         break;
812 
813       case kInstNop:
814         id = ip->out();
815         goto Loop;
816 
817       case kInstMatch:
818       case kInstFail:
819         flat->emplace_back();
820         memmove(&flat->back(), ip, sizeof *ip);
821         break;
822     }
823   }
824 }
825 
826 // For each ByteRange instruction in [begin, end), computes a hint to execution
827 // engines: the delta to the next instruction (in flat) worth exploring iff the
828 // current instruction matched.
829 //
830 // Implements a coloring algorithm related to ByteMapBuilder, but in this case,
831 // colors are instructions and recoloring ranges precisely identifies conflicts
832 // between instructions. Iterating backwards over [begin, end) is guaranteed to
833 // identify the nearest conflict (if any) with only linear complexity.
ComputeHints(std::vector<Inst> * flat,int begin,int end)834 void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
835   Bitmap256 splits;
836   int colors[256];
837 
838   bool dirty = false;
839   for (int id = end; id >= begin; --id) {
840     if (id == end ||
841         (*flat)[id].opcode() != kInstByteRange) {
842       if (dirty) {
843         dirty = false;
844         splits.Clear();
845       }
846       splits.Set(255);
847       colors[255] = id;
848       // At this point, the [0-255] range is colored with id.
849       // Thus, hints cannot point beyond id; and if id == end,
850       // hints that would have pointed to id will be 0 instead.
851       continue;
852     }
853     dirty = true;
854 
855     // We recolor the [lo-hi] range with id. Note that first ratchets backwards
856     // from end to the nearest conflict (if any) during recoloring.
857     int first = end;
858     auto Recolor = [&](int lo, int hi) {
859       // Like ByteMapBuilder, we split at lo-1 and at hi.
860       --lo;
861 
862       if (0 <= lo && !splits.Test(lo)) {
863         splits.Set(lo);
864         int next = splits.FindNextSetBit(lo+1);
865         colors[lo] = colors[next];
866       }
867       if (!splits.Test(hi)) {
868         splits.Set(hi);
869         int next = splits.FindNextSetBit(hi+1);
870         colors[hi] = colors[next];
871       }
872 
873       int c = lo+1;
874       while (c < 256) {
875         int next = splits.FindNextSetBit(c);
876         // Ratchet backwards...
877         first = std::min(first, colors[next]);
878         // Recolor with id - because it's the new nearest conflict!
879         colors[next] = id;
880         if (next == hi)
881           break;
882         c = next+1;
883       }
884     };
885 
886     Inst* ip = &(*flat)[id];
887     int lo = ip->lo();
888     int hi = ip->hi();
889     Recolor(lo, hi);
890     if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
891       int foldlo = lo;
892       int foldhi = hi;
893       if (foldlo < 'a')
894         foldlo = 'a';
895       if (foldhi > 'z')
896         foldhi = 'z';
897       if (foldlo <= foldhi) {
898         foldlo += 'A' - 'a';
899         foldhi += 'A' - 'a';
900         Recolor(foldlo, foldhi);
901       }
902     }
903 
904     if (first != end) {
905       uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
906       ip->hint_foldcase_ |= hint<<1;
907     }
908   }
909 }
910 
911 }  // namespace re2
912