// Copyright 2007 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compile regular expression to Prog.
//
// Prog and Inst are defined in prog.h.
// This file's external interface is just Regexp::CompileToProg.
// The Compiler class defined in this file is private.

#include <stdint.h>
#include <string.h>
#include <unordered_map>
#include <utility>

#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"

namespace re2 {

// List of pointers to Inst* that need to be filled in (patched).
// Because the Inst* haven't been filled in yet,
// we can use the Inst* word to hold the list's "next" pointer.
// It's kind of sleazy, but it works well in practice.
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// Because the out and out1 fields in Inst are no longer pointers,
// we can't use pointers directly here either.  Instead, head refers
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
// head == 0 represents the NULL list.  This is okay because instruction #0
// is always the fail instruction, which never appears on a list.
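// For example, head == (5<<1)|1 == 11 refers to inst_[5].out1,
// while head == (5<<1) == 10 refers to inst_[5].out.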
struct PatchList {
  // Returns patch list containing just p.
  static PatchList Mk(uint32_t p) {
    return {p, p};
  }

  // Patches all the entries on l to have value p.
  // Caller must not ever use patch list again.
  static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
    while (l.head != 0) {
      Prog::Inst* ip = &inst0[l.head>>1];
      if (l.head&1) {
        l.head = ip->out1();
        ip->out1_ = p;
      } else {
        l.head = ip->out();
        ip->set_out(p);
      }
    }
  }

  // Appends two patch lists and returns result.
  static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
    if (l1.head == 0)
      return l2;
    if (l2.head == 0)
      return l1;
    Prog::Inst* ip = &inst0[l1.tail>>1];
    if (l1.tail&1)
      ip->out1_ = l2.head;
    else
      ip->set_out(l2.head);
    return {l1.head, l2.tail};
  }

  uint32_t head;
  uint32_t tail;  // for constant-time append
};

static const PatchList kNullPatchList = {0, 0};

// Compiled program fragment.
struct Frag {
  uint32_t begin;
  PatchList end;
  bool nullable;

  Frag() : begin(0), end(kNullPatchList), nullable(false) {}
  Frag(uint32_t begin, PatchList end, bool nullable)
      : begin(begin), end(end), nullable(nullable) {}
};

// Input encodings.
enum Encoding {
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
  kEncodingLatin1,    // Latin-1 (0-FF)
};

class Compiler : public Regexp::Walker<Frag> {
 public:
  explicit Compiler();
  ~Compiler();

  // Compiles Regexp to a new Prog.
  // Caller is responsible for deleting Prog when finished with it.
  // If reversed is true, compiles for walking over the input
  // string backward (reverses all concatenations).
  static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
  // Compiles the alternation of all the regexps in re to a new Prog.
  // Each regexp matches with an id equal to its index in the vector.
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);

  // Interface for Regexp::Walker, which helps traverse the Regexp.
  // The walk is purely post-recursive: given the machines for the
  // children, PostVisit combines them to create the machine for
  // the current node.  The child_args are Frags.
  // The Compiler traverses the Regexp parse tree, visiting
  // each node in depth-first order.  It invokes PreVisit before
  // visiting the node's children and PostVisit after visiting
  // the children.
  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
                 int nchild_args);
  Frag ShortVisit(Regexp* re, Frag parent_arg);
  Frag Copy(Frag arg);

  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
  Frag Plus(Frag a, bool nongreedy);
  Frag Star(Frag a, bool nongreedy);
  Frag Quest(Frag a, bool nongreedy);

  // Given fragment a, returns (a) capturing as \n.
  Frag Capture(Frag a, int n);

  // Given fragments a and b, returns ab; a|b
  Frag Cat(Frag a, Frag b);
  Frag Alt(Frag a, Frag b);

  // Returns a fragment that can't match anything.
  Frag NoMatch();

  // Returns a fragment that matches the empty string.
  Frag Match(int32_t id);

  // Returns a no-op fragment.
  Frag Nop();

  // Returns a fragment matching the byte range lo-hi.
  Frag ByteRange(int lo, int hi, bool foldcase);

  // Returns a fragment matching an empty-width special op.
  Frag EmptyWidth(EmptyOp op);

  // Adds n instructions to the program.
  // Returns the index of the first one.
  // Returns -1 if no more instructions are available.
  int AllocInst(int n);

  // Rune range compiler.

  // Begins a new alternation.
  void BeginRange();

  // Adds a fragment matching the rune range lo-hi.
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  void Add_80_10ffff();

  // New suffix that matches the byte range lo-hi, then goes to next.
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);

  // Returns true iff the suffix is cached.
  bool IsCachedRuneByteSuffix(int id);

  // Adds a suffix to alternation.
  void AddSuffix(int id);

  // Adds a suffix to the trie starting from the given root node.
  // Returns zero iff allocating an instruction fails. Otherwise, returns
  // the current root node, which might be different from what was given.
  int AddSuffixRecursive(int root, int id);

  // Finds the trie node for the given suffix. Returns a Frag in order to
  // distinguish between pointing at the root node directly (end.head == 0)
  // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
  Frag FindByteRange(int root, int id);

  // Compares two ByteRanges and returns true iff they are equal.
  bool ByteRangeEqual(int id1, int id2);

  // Returns the alternation of all the added suffixes.
  Frag EndRange();

  // Single rune.
  Frag Literal(Rune r, bool foldcase);

  void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
  Prog* Finish(Regexp* re);

  // Returns .* where dot = any byte
  Frag DotStar();

 private:
  Prog* prog_;         // Program being built.
  bool failed_;        // Did we give up compiling?
  Encoding encoding_;  // Input encoding
  bool reversed_;      // Should program run backward over text?

  PODArray<Prog::Inst> inst_;
  int ninst_;          // Number of instructions used.
  int max_ninst_;      // Maximum number of instructions.

  int64_t max_mem_;    // Total memory budget.

  std::unordered_map<uint64_t, int> rune_cache_;
  Frag rune_range_;

  RE2::Anchor anchor_;  // anchor mode for RE2::Set

  Compiler(const Compiler&) = delete;
  Compiler& operator=(const Compiler&) = delete;
};

Compiler::Compiler() {
  prog_ = new Prog();
  failed_ = false;
  encoding_ = kEncodingUTF8;
  reversed_ = false;
  ninst_ = 0;
  max_ninst_ = 1;  // make AllocInst for fail instruction okay
  max_mem_ = 0;
  int fail = AllocInst(1);
  inst_[fail].InitFail();
  max_ninst_ = 0;  // Caller must change
}

Compiler::~Compiler() {
  delete prog_;
}

int Compiler::AllocInst(int n) {
  if (failed_ || ninst_ + n > max_ninst_) {
    failed_ = true;
    return -1;
  }

  if (ninst_ + n > inst_.size()) {
    int cap = inst_.size();
    if (cap == 0)
      cap = 8;
    while (ninst_ + n > cap)
      cap *= 2;
    PODArray<Prog::Inst> inst(cap);
    if (inst_.data() != NULL)
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
    inst_ = std::move(inst);
  }
  int id = ninst_;
  ninst_ += n;
  return id;
}

// These routines are somewhat hard to visualize in text --
// see http://swtch.com/~rsc/regexp/regexp1.html for
// pictures explaining what is going on here.

// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
  return Frag();
}

// Is a an unmatchable fragment?
static bool IsNoMatch(Frag a) {
  return a.begin == 0;
}

// Given fragments a and b, returns fragment for ab.
Frag Compiler::Cat(Frag a, Frag b) {
  if (IsNoMatch(a) || IsNoMatch(b))
    return NoMatch();

  // Elide no-op.
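  // (The test below identifies a as a single Nop instruction whose out_
  // has not been patched yet.)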
  Prog::Inst* begin = &inst_[a.begin];
  if (begin->opcode() == kInstNop &&
      a.end.head == (a.begin << 1) &&
      begin->out() == 0) {
    // Patch anyway, in case something else still holds a reference to a.
    PatchList::Patch(inst_.data(), a.end, b.begin);
    return b;
  }

  // To run backward over string, reverse all concatenations.
  if (reversed_) {
    PatchList::Patch(inst_.data(), b.end, a.begin);
    return Frag(b.begin, a.end, b.nullable && a.nullable);
  }

  PatchList::Patch(inst_.data(), a.end, b.begin);
  return Frag(a.begin, b.end, a.nullable && b.nullable);
}

// Given fragments for a and b, returns fragment for a|b.
Frag Compiler::Alt(Frag a, Frag b) {
  // Special case for convenience in loops.
  if (IsNoMatch(a))
    return b;
  if (IsNoMatch(b))
    return a;

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();

  inst_[id].InitAlt(a.begin, b.begin);
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
              a.nullable || b.nullable);
}

// When capturing submatches in Perl-like mode, a kInstAlt Inst
// treats out_ as the first choice, out1_ as the second.
//
// For *, +, and ?, if out_ causes another repetition,
// then the operator is greedy.  If out1_ is the repetition
// (and out_ moves forward), then the operator is non-greedy.
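//
// For example, greedy a+ compiles to an Alt whose out_ loops back to a
// and whose out1_ is left on the patch list as the exit;
// non-greedy a+? swaps the two.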

// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  PatchList::Patch(inst_.data(), a.end, id);
  return Frag(a.begin, pl, a.nullable);
}

// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
  // When the subexpression is nullable, one Alt isn't enough to guarantee
  // correct priority ordering within the transitive closure. The simplest
  // solution is to handle it as (a+)? instead, which adds the second Alt.
  if (a.nullable)
    return Quest(Plus(a, nongreedy), nongreedy);

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  PatchList::Patch(inst_.data(), a.end, id);
  return Frag(id, pl, true);
}

// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Frag Compiler::Quest(Frag a, bool nongreedy) {
  if (IsNoMatch(a))
    return Nop();
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
}

// Returns a fragment for the byte range lo-hi.
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
  return Frag(id, PatchList::Mk(id << 1), false);
}

// Returns a no-op fragment.  Sometimes unavoidable.
Frag Compiler::Nop() {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitNop(0);
  return Frag(id, PatchList::Mk(id << 1), true);
}

// Returns a fragment that signals a match.
Frag Compiler::Match(int32_t match_id) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitMatch(match_id);
  return Frag(id, kNullPatchList, false);
}

// Returns a fragment matching a particular empty-width op (like ^ or $)
Frag Compiler::EmptyWidth(EmptyOp empty) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitEmptyWidth(empty, 0);
  return Frag(id, PatchList::Mk(id << 1), true);
}

// Given a fragment a, returns a fragment with capturing parens around a.
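// Capture slots 2*n and 2*n+1 record where group n starts and ends.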
Frag Compiler::Capture(Frag a, int n) {
  if (IsNoMatch(a))
    return NoMatch();
  int id = AllocInst(2);
  if (id < 0)
    return NoMatch();
  inst_[id].InitCapture(2*n, a.begin);
  inst_[id+1].InitCapture(2*n+1, 0);
  PatchList::Patch(inst_.data(), a.end, id+1);

  return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
}

// A Rune is a name for a Unicode code point.
// Returns maximum rune encoded by UTF-8 sequence of length len.
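// For example, MaxRune(1) == 0x7F (7 bits), MaxRune(2) == 0x7FF (11 bits)
// and MaxRune(3) == 0xFFFF (16 bits).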
static int MaxRune(int len) {
  int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
  if (len == 1)
    b = 7;
  else
    b = 8-(len+1) + 6*(len-1);
  return (1<<b) - 1;   // maximum Rune for b bits.
}

// The rune range compiler caches common suffix fragments,
// which are very common in UTF-8 (e.g., [80-bf]).
// The fragment suffixes are identified by their start
// instructions.  NULL denotes the eventual end match.
// The Frag accumulates in rune_range_.  Caching common
// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
// and it reduces the corresponding one-pass NFA from 16 nodes to 8.

void Compiler::BeginRange() {
  rune_cache_.clear();
  rune_range_.begin = 0;
  rune_range_.end = kNullPatchList;
}

int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                     int next) {
  Frag f = ByteRange(lo, hi, foldcase);
  if (next != 0) {
    PatchList::Patch(inst_.data(), f.end, next);
  } else {
    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
  }
  return f.begin;
}

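// Packs (lo, hi, foldcase, next) into a single map key: bit 0 holds foldcase,
// bits 1-8 hold hi, bits 9-16 hold lo and the remaining bits hold next.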
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  return (uint64_t)next << 17 |
         (uint64_t)lo   <<  9 |
         (uint64_t)hi   <<  1 |
         (uint64_t)foldcase;
}

int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                   int next) {
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
  if (it != rune_cache_.end())
    return it->second;
  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
  rune_cache_[key] = id;
  return id;
}

bool Compiler::IsCachedRuneByteSuffix(int id) {
  uint8_t lo = inst_[id].lo_;
  uint8_t hi = inst_[id].hi_;
  bool foldcase = inst_[id].foldcase() != 0;
  int next = inst_[id].out();

  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  return rune_cache_.find(key) != rune_cache_.end();
}

void Compiler::AddSuffix(int id) {
  if (failed_)
    return;

  if (rune_range_.begin == 0) {
    rune_range_.begin = id;
    return;
  }

  if (encoding_ == kEncodingUTF8) {
    // Build a trie in order to reduce fanout.
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
    return;
  }

  int alt = AllocInst(1);
  if (alt < 0) {
    rune_range_.begin = 0;
    return;
  }
  inst_[alt].InitAlt(rune_range_.begin, id);
  rune_range_.begin = alt;
}

int Compiler::AddSuffixRecursive(int root, int id) {
  DCHECK(inst_[root].opcode() == kInstAlt ||
         inst_[root].opcode() == kInstByteRange);

  Frag f = FindByteRange(root, id);
  if (IsNoMatch(f)) {
    int alt = AllocInst(1);
    if (alt < 0)
      return 0;
    inst_[alt].InitAlt(root, id);
    return alt;
  }

  int br;
  if (f.end.head == 0)
    br = root;
  else if (f.end.head&1)
    br = inst_[f.begin].out1();
  else
    br = inst_[f.begin].out();

  if (IsCachedRuneByteSuffix(br)) {
    // We can't fiddle with cached suffixes, so make a clone of the head.
    int byterange = AllocInst(1);
    if (byterange < 0)
      return 0;
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
                                   inst_[br].foldcase(), inst_[br].out());

    // Ensure that the parent points to the clone, not to the original.
    // Note that this could leave the head unreachable except via the cache.
    br = byterange;
    if (f.end.head == 0)
      root = br;
    else if (f.end.head&1)
      inst_[f.begin].out1_ = br;
    else
      inst_[f.begin].set_out(br);
  }

  int out = inst_[id].out();
  if (!IsCachedRuneByteSuffix(id)) {
    // The head should be the instruction most recently allocated, so free it
    // instead of leaving it unreachable.
    DCHECK_EQ(id, ninst_-1);
    inst_[id].out_opcode_ = 0;
    inst_[id].out1_ = 0;
    ninst_--;
  }

  out = AddSuffixRecursive(inst_[br].out(), out);
  if (out == 0)
    return 0;

  inst_[br].set_out(out);
  return root;
}

bool Compiler::ByteRangeEqual(int id1, int id2) {
  return inst_[id1].lo() == inst_[id2].lo() &&
         inst_[id1].hi() == inst_[id2].hi() &&
         inst_[id1].foldcase() == inst_[id2].foldcase();
}

Frag Compiler::FindByteRange(int root, int id) {
  if (inst_[root].opcode() == kInstByteRange) {
    if (ByteRangeEqual(root, id))
      return Frag(root, kNullPatchList, false);
    else
      return NoMatch();
  }

  while (inst_[root].opcode() == kInstAlt) {
    int out1 = inst_[root].out1();
    if (ByteRangeEqual(out1, id))
      return Frag(root, PatchList::Mk((root << 1) | 1), false);

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    int out = inst_[root].out();
    if (inst_[out].opcode() == kInstAlt)
      root = out;
    else if (ByteRangeEqual(out, id))
      return Frag(root, PatchList::Mk(root << 1), false);
    else
      return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}

Frag Compiler::EndRange() {
  return rune_range_;
}

// Converts rune range lo-hi into a fragment that recognizes
// the bytes that would make up those runes in the current
// encoding (Latin 1 or UTF-8).
// This lets the machine work byte-by-byte even when
// using multibyte encodings.
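// For example, in UTF-8 the rune range 0080-07FF becomes the
// byte ranges [C2-DF][80-BF].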

void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
  switch (encoding_) {
    default:
    case kEncodingUTF8:
      AddRuneRangeUTF8(lo, hi, foldcase);
      break;
    case kEncodingLatin1:
      AddRuneRangeLatin1(lo, hi, foldcase);
      break;
  }
}

void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
  // Latin-1 is easy: runes *are* bytes.
  if (lo > hi || lo > 0xFF)
    return;
  if (hi > 0xFF)
    hi = 0xFF;
  AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                   static_cast<uint8_t>(hi), foldcase, 0));
}

void Compiler::Add_80_10ffff() {
  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
  // permitting overlong encodings in E0 and F0 sequences and code points
  // over 10FFFF in F4 sequences, the size of the bytecode and the number
  // of equivalence classes are reduced significantly.
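  // Concretely, the simplified forms are [C2-DF][80-BF], [E0-EF][80-BF]{2}
  // and [F0-F4][80-BF]{3}, which are exactly the suffixes built below.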
  int id;
  if (reversed_) {
    // Prefix factoring matters, but we don't have to handle it here
    // because the rune range trie logic takes care of that already.
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);
  } else {
    // Suffix factoring matters - and we do have to handle it here.
    int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
    AddSuffix(id);

    int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
    AddSuffix(id);

    int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
    AddSuffix(id);
  }
}

void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case.
  if (lo == 0x80 && hi == 0x10ffff) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  for (int i = 1; i < UTFmax; i++) {
    Rune max = MaxRune(i);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max+1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                     static_cast<uint8_t>(hi), foldcase, 0));
    return;
  }

  // Split range into sections that agree on leading bytes.
  for (int i = 1; i < UTFmax; i++) {
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
    if ((lo & ~m) != (hi & ~m)) {
      if ((lo & m) != 0) {
        AddRuneRangeUTF8(lo, lo|m, foldcase);
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
        return;
      }
      if ((hi & m) != m) {
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
        return;
      }
    }
  }

  // Finally.  Generate byte matching equivalent for lo-hi.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
  (void)m;  // USED(m)
  DCHECK_EQ(n, m);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
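  //
  // For example, the rune range 0800-0FFF encodes as E0 [A0-BF] [80-BF]:
  // in forward mode, the trailing [80-BF] and the middle [A0-BF] get cached
  // while the leading E0 does not.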
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < n; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = n-1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}

// Should not be called.
Frag Compiler::Copy(Frag arg) {
  // We're using WalkExponential; there should be no copying.
  LOG(DFATAL) << "Compiler::Copy called!";
  failed_ = true;
  return NoMatch();
}

// Visits a node quickly; called once WalkExponential has
// decided to cut this walk short.
Frag Compiler::ShortVisit(Regexp* re, Frag) {
  failed_ = true;
  return NoMatch();
}

// Called before traversing a node's children during the walk.
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
  // Cut off walk if we've already failed.
  if (failed_)
    *stop = true;

  return Frag();  // not used by caller
}

Frag Compiler::Literal(Rune r, bool foldcase) {
  switch (encoding_) {
    default:
      return Frag();

    case kEncodingLatin1:
      return ByteRange(r, r, foldcase);

    case kEncodingUTF8: {
      if (r < Runeself)  // Make common case fast.
        return ByteRange(r, r, foldcase);
      uint8_t buf[UTFmax];
      int n = runetochar(reinterpret_cast<char*>(buf), &r);
      Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
      for (int i = 1; i < n; i++)
        f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
      return f;
    }
  }
}

// Called after traversing the node's children during the walk.
// Given their frags, build and return the frag for this re.
Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
                         int nchild_frags) {
  // If a child failed, don't bother going forward, especially
  // since the child_frags might contain Frags with NULLs in them.
  if (failed_)
    return NoMatch();

  // Given the child fragments, return the fragment for this node.
  switch (re->op()) {
    case kRegexpRepeat:
      // Should not see; code at bottom of function will print error
      break;

    case kRegexpNoMatch:
      return NoMatch();

    case kRegexpEmptyMatch:
      return Nop();

    case kRegexpHaveMatch: {
      Frag f = Match(re->match_id());
      if (anchor_ == RE2::ANCHOR_BOTH) {
        // Append \z or else the subexpression will effectively be unanchored.
        // Complemented by the UNANCHORED case in CompileSet().
        f = Cat(EmptyWidth(kEmptyEndText), f);
      }
      return f;
    }

    case kRegexpConcat: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Cat(f, child_frags[i]);
      return f;
    }

    case kRegexpAlternate: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Alt(f, child_frags[i]);
      return f;
    }

    case kRegexpStar:
      return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpPlus:
      return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpQuest:
      return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpLiteral:
      return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

    case kRegexpLiteralString: {
      // Concatenation of literals.
      if (re->nrunes() == 0)
        return Nop();
      Frag f;
      for (int i = 0; i < re->nrunes(); i++) {
        Frag f1 = Literal(re->runes()[i],
                          (re->parse_flags()&Regexp::FoldCase) != 0);
        if (i == 0)
          f = f1;
        else
          f = Cat(f, f1);
      }
      return f;
    }

    case kRegexpAnyChar:
      BeginRange();
      AddRuneRange(0, Runemax, false);
      return EndRange();

    case kRegexpAnyByte:
      return ByteRange(0x00, 0xFF, false);

    case kRegexpCharClass: {
      CharClass* cc = re->cc();
      if (cc->empty()) {
        // This can't happen.
        LOG(DFATAL) << "No ranges in char class";
        failed_ = true;
        return NoMatch();
      }

      // ASCII case-folding optimization: if the char class
      // behaves the same on A-Z as it does on a-z,
      // discard any ranges wholly contained in A-Z
      // and mark the other ranges as foldascii.
      // This reduces the size of a program for
      // (?i)abc from 3 insts per letter to 1 per letter.
      bool foldascii = cc->FoldsASCII();

      // Character class is just a big OR of the different
      // character ranges in the class.
      BeginRange();
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
        // ASCII case-folding optimization (see above).
        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
          continue;

        // If this range contains all of A-Za-z or none of it,
        // the fold flag is unnecessary; don't bother.
        bool fold = foldascii;
        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
            ('Z' < i->lo && i->hi < 'a'))
          fold = false;

        AddRuneRange(i->lo, i->hi, fold);
      }
      return EndRange();
    }

    case kRegexpCapture:
      // If this is a non-capturing parenthesis -- (?:foo) --
      // just use the inner expression.
      if (re->cap() < 0)
        return child_frags[0];
      return Capture(child_frags[0], re->cap());

    case kRegexpBeginLine:
      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);

    case kRegexpEndLine:
      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);

    case kRegexpBeginText:
      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);

    case kRegexpEndText:
      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);

    case kRegexpWordBoundary:
      return EmptyWidth(kEmptyWordBoundary);

    case kRegexpNoWordBoundary:
      return EmptyWidth(kEmptyNonWordBoundary);
  }
  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
  failed_ = true;
  return NoMatch();
}

// Is this regexp required to start at the beginning of the text?
// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
// but handles (\A(a|b)).  Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp.  As the comment
  // above says, IsAnchorStart is conservative, so returning
  // a false negative is okay.  The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      if (re->nsub() > 0) {
        sub = re->sub()[0]->Incref();
        if (IsAnchorStart(&sub, depth+1)) {
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[0] = sub;  // already have reference
          for (int i = 1; i < re->nsub(); i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorStart(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpBeginText:
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}

// Is this regexp required to start at the end of the text?
// Only approximate; can return false for complicated regexps like (a\z|b\z),
// but handles ((a|b)\z).  Could use the Walker to write a more exact one.
static bool IsAnchorEnd(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp.  As the comment
  // above says, IsAnchorEnd is conservative, so returning
  // a false negative is okay.  The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      if (re->nsub() > 0) {
        sub = re->sub()[re->nsub() - 1]->Incref();
        if (IsAnchorEnd(&sub, depth+1)) {
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[re->nsub() - 1] = sub;  // already have reference
          for (int i = 0; i < re->nsub() - 1; i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorEnd(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpEndText:
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}

void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
                     RE2::Anchor anchor) {
  if (flags & Regexp::Latin1)
    encoding_ = kEncodingLatin1;
  max_mem_ = max_mem;
  if (max_mem <= 0) {
    max_ninst_ = 100000;  // more than enough
  } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
    // No room for anything.
    max_ninst_ = 0;
  } else {
    int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
    // Limit instruction count so that inst->id() fits nicely in an int.
    // SparseArray also assumes that the indices (inst->id()) are ints.
    // The call to WalkExponential uses 2*max_ninst_ below,
    // and other places in the code use 2 or 3 * prog->size().
    // Limiting to 2^24 should avoid overflow in those places.
    // (The point of allowing more than 32 bits of memory is to
    // have plenty of room for the DFA states, not to use it up
    // on the program.)
    if (m >= 1<<24)
      m = 1<<24;
    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
    if (m > Prog::Inst::kMaxInst)
      m = Prog::Inst::kMaxInst;
    max_ninst_ = static_cast<int>(m);
  }
  anchor_ = anchor;
}

// Compiles re, returning program.
// Caller is responsible for deleting prog_.
// If reversed is true, compiles a program that expects
// to run over the input string backward (reverses all concatenations).
// The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
  c.reversed_ = reversed;

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Record whether prog is anchored, removing the anchors.
  // (They get in the way of other optimizations.)
  bool is_anchor_start = IsAnchorStart(&sre, 0);
  bool is_anchor_end = IsAnchorEnd(&sre, 0);

  // Generate fragment for entire regexp.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Success!  Finish by putting Match node at end, and record start.
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
  // to behave normally.
  c.reversed_ = false;
  all = c.Cat(all, c.Match(0));

  c.prog_->set_reversed(reversed);
  if (c.prog_->reversed()) {
    c.prog_->set_anchor_start(is_anchor_end);
    c.prog_->set_anchor_end(is_anchor_start);
  } else {
    c.prog_->set_anchor_start(is_anchor_start);
    c.prog_->set_anchor_end(is_anchor_end);
  }

  c.prog_->set_start(all.begin);
  if (!c.prog_->anchor_start()) {
    // Also create unanchored version, which starts with a .*? loop.
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start_unanchored(all.begin);

  // Hand ownership of prog_ to caller.
  return c.Finish(re);
}

Prog* Compiler::Finish(Regexp* re) {
  if (failed_)
    return NULL;

  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
    // No possible matches; keep Fail instruction only.
    ninst_ = 1;
  }

  // Hand off the array to Prog.
  prog_->inst_ = std::move(inst_);
  prog_->size_ = ninst_;

  prog_->Optimize();
  prog_->Flatten();
  prog_->ComputeByteMap();

  if (!prog_->reversed()) {
    std::string prefix;
    bool prefix_foldcase;
    if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
      prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
  }

  // Record remaining memory for DFA.
  if (max_mem_ <= 0) {
    prog_->set_dfa_mem(1<<20);
  } else {
    int64_t m = max_mem_ - sizeof(Prog);
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
    if (prog_->CanBitState())
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
    if (m < 0)
      m = 0;
    prog_->set_dfa_mem(m);
  }

  Prog* p = prog_;
  prog_ = NULL;
  return p;
}

// Converts Regexp to Prog.
Prog* Regexp::CompileToProg(int64_t max_mem) {
  return Compiler::Compile(this, false, max_mem);
}

Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
  return Compiler::Compile(this, true, max_mem);
}

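// Returns a non-greedy .*? loop over any byte, used to build the
// unanchored starting points in Compile() and CompileSet().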
Frag Compiler::DotStar() {
  return Star(ByteRange(0x00, 0xff, false), true);
}

// Compiles RE set to Prog.
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, anchor);

  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  c.prog_->set_anchor_start(true);
  c.prog_->set_anchor_end(true);

  if (anchor == RE2::UNANCHORED) {
    // Prepend .* or else the expression will effectively be anchored.
    // Complemented by the ANCHOR_BOTH case in PostVisit().
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start(all.begin);
  c.prog_->set_start_unanchored(all.begin);

  Prog* prog = c.Finish(re);
  if (prog == NULL)
    return NULL;

  // Make sure DFA has enough memory to operate,
  // since we're not going to fall back to the NFA.
  bool dfa_failed = false;
  StringPiece sp = "hello, world";
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
                  NULL, &dfa_failed, NULL);
  if (dfa_failed) {
    delete prog;
    return NULL;
  }

  return prog;
}

Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  return Compiler::CompileSet(re, anchor, max_mem);
}

}  // namespace re2