1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10 
11 #include <stdint.h>
12 #include <string.h>
13 #include <unordered_map>
14 #include <utility>
15 
16 #include "util/logging.h"
17 #include "util/utf.h"
18 #include "re2/pod_array.h"
19 #include "re2/prog.h"
20 #include "re2/re2.h"
21 #include "re2/regexp.h"
22 #include "re2/walker-inl.h"
23 
24 namespace re2 {
25 
26 // List of pointers to Inst* that need to be filled in (patched).
27 // Because the Inst* haven't been filled in yet,
28 // we can use the Inst* word to hold the list's "next" pointer.
29 // It's kind of sleazy, but it works well in practice.
30 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31 //
32 // Because the out and out1 fields in Inst are no longer pointers,
33 // we can't use pointers directly here either.  Instead, head refers
34 // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
35 // head == 0 represents the NULL list.  This is okay because instruction #0
36 // is always the fail instruction, which never appears on a list.
struct PatchList {
  // Returns patch list containing just p.
  static PatchList Mk(uint32_t p) {
    return {p, p};
  }

  // Patches all the entries on l to have value p.
  // Caller must not ever use patch list again.
  // Walks the list through the out/out1 slots themselves: each slot
  // holds the "next" list entry until it is overwritten with p here.
  static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
    while (l.head != 0) {
      Prog::Inst* ip = &inst0[l.head>>1];
      if (l.head&1) {
        // Odd entry: this node's out1 slot is on the list.
        l.head = ip->out1();
        ip->out1_ = p;
      } else {
        // Even entry: this node's out slot is on the list.
        l.head = ip->out();
        ip->set_out(p);
      }
    }
  }

  // Appends two patch lists and returns result.
  // Constant time: links l1's tail slot to l2's head.
  static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
    if (l1.head == 0)
      return l2;
    if (l2.head == 0)
      return l1;
    Prog::Inst* ip = &inst0[l1.tail>>1];
    if (l1.tail&1)
      ip->out1_ = l2.head;
    else
      ip->set_out(l2.head);
    return {l1.head, l2.tail};
  }

  uint32_t head;  // first entry: (inst index << 1) | (1 if out1, 0 if out)
  uint32_t tail;  // last entry; kept for constant-time append
};
75 
// The empty patch list.  head == 0 denotes the NULL list because
// instruction #0 is always the fail instruction, which never appears
// on a patch list (see the PatchList comment above).
static const PatchList kNullPatchList = {0, 0};
77 
78 // Compiled program fragment.
79 struct Frag {
80   uint32_t begin;
81   PatchList end;
82 
Fragre2::Frag83   Frag() : begin(0) { end.head = 0; }  // needed so Frag can go in vector
Fragre2::Frag84   Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
85 };
86 
87 // Input encodings.
// Input encodings.
enum Encoding {
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF); runes above 7F compile to multi-byte sequences
  kEncodingLatin1,    // Latin-1 (0-FF); every rune is a single byte
};
92 
93 class Compiler : public Regexp::Walker<Frag> {
94  public:
95   explicit Compiler();
96   ~Compiler();
97 
98   // Compiles Regexp to a new Prog.
99   // Caller is responsible for deleting Prog when finished with it.
100   // If reversed is true, compiles for walking over the input
101   // string backward (reverses all concatenations).
102   static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
103 
104   // Compiles alternation of all the re to a new Prog.
105   // Each re has a match with an id equal to its index in the vector.
106   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
107 
108   // Interface for Regexp::Walker, which helps traverse the Regexp.
109   // The walk is purely post-recursive: given the machines for the
110   // children, PostVisit combines them to create the machine for
111   // the current node.  The child_args are Frags.
112   // The Compiler traverses the Regexp parse tree, visiting
113   // each node in depth-first order.  It invokes PreVisit before
114   // visiting the node's children and PostVisit after visiting
115   // the children.
116   Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
117   Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
118                  int nchild_args);
119   Frag ShortVisit(Regexp* re, Frag parent_arg);
120   Frag Copy(Frag arg);
121 
122   // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
123   Frag Plus(Frag a, bool nongreedy);
124   Frag Star(Frag a, bool nongreedy);
125   Frag Quest(Frag a, bool nongreedy);
126 
127   // Given fragment a, returns (a) capturing as \n.
128   Frag Capture(Frag a, int n);
129 
130   // Given fragments a and b, returns ab; a|b
131   Frag Cat(Frag a, Frag b);
132   Frag Alt(Frag a, Frag b);
133 
134   // Returns a fragment that can't match anything.
135   Frag NoMatch();
136 
137   // Returns a fragment that matches the empty string.
138   Frag Match(int32_t id);
139 
140   // Returns a no-op fragment.
141   Frag Nop();
142 
143   // Returns a fragment matching the byte range lo-hi.
144   Frag ByteRange(int lo, int hi, bool foldcase);
145 
146   // Returns a fragment matching an empty-width special op.
147   Frag EmptyWidth(EmptyOp op);
148 
149   // Adds n instructions to the program.
150   // Returns the index of the first one.
151   // Returns -1 if no more instructions are available.
152   int AllocInst(int n);
153 
154   // Rune range compiler.
155 
156   // Begins a new alternation.
157   void BeginRange();
158 
159   // Adds a fragment matching the rune range lo-hi.
160   void AddRuneRange(Rune lo, Rune hi, bool foldcase);
161   void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
162   void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
163   void Add_80_10ffff();
164 
165   // New suffix that matches the byte range lo-hi, then goes to next.
166   int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
167   int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
168 
169   // Returns true iff the suffix is cached.
170   bool IsCachedRuneByteSuffix(int id);
171 
172   // Adds a suffix to alternation.
173   void AddSuffix(int id);
174 
175   // Adds a suffix to the trie starting from the given root node.
176   // Returns zero iff allocating an instruction fails. Otherwise, returns
177   // the current root node, which might be different from what was given.
178   int AddSuffixRecursive(int root, int id);
179 
180   // Finds the trie node for the given suffix. Returns a Frag in order to
181   // distinguish between pointing at the root node directly (end.head == 0)
182   // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
183   Frag FindByteRange(int root, int id);
184 
185   // Compares two ByteRanges and returns true iff they are equal.
186   bool ByteRangeEqual(int id1, int id2);
187 
188   // Returns the alternation of all the added suffixes.
189   Frag EndRange();
190 
191   // Single rune.
192   Frag Literal(Rune r, bool foldcase);
193 
194   void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
195   Prog* Finish(Regexp* re);
196 
197   // Returns .* where dot = any byte
198   Frag DotStar();
199 
200  private:
201   Prog* prog_;         // Program being built.
202   bool failed_;        // Did we give up compiling?
203   Encoding encoding_;  // Input encoding
204   bool reversed_;      // Should program run backward over text?
205 
206   PODArray<Prog::Inst> inst_;
207   int ninst_;          // Number of instructions used.
208   int max_ninst_;      // Maximum number of instructions.
209 
210   int64_t max_mem_;    // Total memory budget.
211 
212   std::unordered_map<uint64_t, int> rune_cache_;
213   Frag rune_range_;
214 
215   RE2::Anchor anchor_;  // anchor mode for RE2::Set
216 
217   Compiler(const Compiler&) = delete;
218   Compiler& operator=(const Compiler&) = delete;
219 };
220 
Compiler()221 Compiler::Compiler() {
222   prog_ = new Prog();
223   failed_ = false;
224   encoding_ = kEncodingUTF8;
225   reversed_ = false;
226   ninst_ = 0;
227   max_ninst_ = 1;  // make AllocInst for fail instruction okay
228   max_mem_ = 0;
229   int fail = AllocInst(1);
230   inst_[fail].InitFail();
231   max_ninst_ = 0;  // Caller must change
232 }
233 
// Frees the program being built if it is still owned by the compiler.
Compiler::~Compiler() {
  delete prog_;
}
237 
AllocInst(int n)238 int Compiler::AllocInst(int n) {
239   if (failed_ || ninst_ + n > max_ninst_) {
240     failed_ = true;
241     return -1;
242   }
243 
244   if (ninst_ + n > inst_.size()) {
245     int cap = inst_.size();
246     if (cap == 0)
247       cap = 8;
248     while (ninst_ + n > cap)
249       cap *= 2;
250     PODArray<Prog::Inst> inst(cap);
251     if (inst_.data() != NULL)
252       memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
253     memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
254     inst_ = std::move(inst);
255   }
256   int id = ninst_;
257   ninst_ += n;
258   return id;
259 }
260 
261 // These routines are somewhat hard to visualize in text --
262 // see http://swtch.com/~rsc/regexp/regexp1.html for
263 // pictures explaining what is going on here.
264 
265 // Returns an unmatchable fragment.
// Returns an unmatchable fragment: begin == 0 points at instruction #0,
// the fail instruction, and the patch list is empty.
Frag Compiler::NoMatch() {
  return Frag(0, kNullPatchList);
}
269 
270 // Is a an unmatchable fragment?
// Is a an unmatchable fragment?
// True iff the fragment starts at instruction #0 (the fail instruction).
static bool IsNoMatch(Frag a) {
  return a.begin == 0;
}
274 
275 // Given fragments a and b, returns fragment for ab.
// Given fragments a and b, returns fragment for ab.
Frag Compiler::Cat(Frag a, Frag b) {
  if (IsNoMatch(a) || IsNoMatch(b))
    return NoMatch();

  // Elide no-op.
  // a is a lone, still-unpatched Nop iff its only dangling arrow is its
  // own out slot (a.end.head == a.begin<<1) and that slot is still 0.
  Prog::Inst* begin = &inst_[a.begin];
  if (begin->opcode() == kInstNop &&
      a.end.head == (a.begin << 1) &&
      begin->out() == 0) {
    // in case refs to a somewhere
    PatchList::Patch(inst_.data(), a.end, b.begin);
    return b;
  }

  // To run backward over string, reverse all concatenations.
  if (reversed_) {
    PatchList::Patch(inst_.data(), b.end, a.begin);
    return Frag(b.begin, a.end);
  }

  // Forward: a's dangling arrows flow into b; result dangles where b did.
  PatchList::Patch(inst_.data(), a.end, b.begin);
  return Frag(a.begin, b.end);
}
299 
300 // Given fragments for a and b, returns fragment for a|b.
// Given fragments for a and b, returns fragment for a|b.
Frag Compiler::Alt(Frag a, Frag b) {
  // Special case for convenience in loops.
  if (IsNoMatch(a))
    return b;
  if (IsNoMatch(b))
    return a;

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();

  // New Alt tries a first (out), then b (out1); the combined patch
  // list carries both fragments' dangling arrows.
  inst_[id].InitAlt(a.begin, b.begin);
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
}
315 
316 // When capturing submatches in like-Perl mode, a kOpAlt Inst
317 // treats out_ as the first choice, out1_ as the second.
318 //
319 // For *, +, and ?, if out_ causes another repetition,
320 // then the operator is greedy.  If out1_ is the repetition
321 // (and out_ moves forward), then the operator is non-greedy.
322 
323 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitAlt(0, 0);
  // Loop back: a's dangling arrows return to the new Alt.
  PatchList::Patch(inst_.data(), a.end, id);
  if (nongreedy) {
    // out1 repeats a; out (patch entry id<<1) is the preferred exit.
    inst_[id].out1_ = a.begin;
    return Frag(id, PatchList::Mk(id << 1));
  } else {
    // out repeats a first; out1 (patch entry (id<<1)|1) is the exit.
    inst_[id].set_out(a.begin);
    return Frag(id, PatchList::Mk((id << 1) | 1));
  }
}
338 
339 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
  // a+ is just a* with a different entry point.
  // Entering at a.begin forces at least one pass through a before the
  // Star's Alt can choose to exit.
  Frag f = Star(a, nongreedy);
  return Frag(a.begin, f.end);
}
345 
346 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Frag Compiler::Quest(Frag a, bool nongreedy) {
  if (IsNoMatch(a))
    return Nop();
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    // Prefer skipping a: out (to be patched) exits, out1 enters a.
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    // Prefer taking a: out enters a, out1 (to be patched) exits.
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  // The result dangles at both the skip arrow and a's own exits.
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
}
363 
364 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)365 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
366   int id = AllocInst(1);
367   if (id < 0)
368     return NoMatch();
369   inst_[id].InitByteRange(lo, hi, foldcase, 0);
370   return Frag(id, PatchList::Mk(id << 1));
371 }
372 
373 // Returns a no-op fragment.  Sometimes unavoidable.
Nop()374 Frag Compiler::Nop() {
375   int id = AllocInst(1);
376   if (id < 0)
377     return NoMatch();
378   inst_[id].InitNop(0);
379   return Frag(id, PatchList::Mk(id << 1));
380 }
381 
382 // Returns a fragment that signals a match.
Match(int32_t match_id)383 Frag Compiler::Match(int32_t match_id) {
384   int id = AllocInst(1);
385   if (id < 0)
386     return NoMatch();
387   inst_[id].InitMatch(match_id);
388   return Frag(id, kNullPatchList);
389 }
390 
391 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)392 Frag Compiler::EmptyWidth(EmptyOp empty) {
393   int id = AllocInst(1);
394   if (id < 0)
395     return NoMatch();
396   inst_[id].InitEmptyWidth(empty, 0);
397   return Frag(id, PatchList::Mk(id << 1));
398 }
399 
400 // Given a fragment a, returns a fragment with capturing parens around a.
// Given a fragment a, returns a fragment with capturing parens around a.
Frag Compiler::Capture(Frag a, int n) {
  if (IsNoMatch(a))
    return NoMatch();
  int id = AllocInst(2);
  if (id < 0)
    return NoMatch();
  // Capture slot 2*n records the open paren, slot 2*n+1 the close.
  inst_[id].InitCapture(2*n, a.begin);
  inst_[id+1].InitCapture(2*n+1, 0);
  // a's exits flow into the closing capture.
  PatchList::Patch(inst_.data(), a.end, id+1);

  return Frag(id, PatchList::Mk((id+1) << 1));
}
413 
414 // A Rune is a name for a Unicode code point.
415 // Returns maximum rune encoded by UTF-8 sequence of length len.
// A Rune is a name for a Unicode code point.
// Returns maximum rune encoded by UTF-8 sequence of length len
// (len < UTFmax).  A 1-byte sequence carries 7 bits of rune; an
// n-byte sequence (n > 1) carries 8-(n+1) bits in the leading byte
// plus 6 bits per continuation byte.
static int MaxRune(int len) {
  int bits = (len == 1) ? 7 : 8 - (len + 1) + 6 * (len - 1);
  return (1 << bits) - 1;  // maximum Rune for that many bits
}
424 
425 // The rune range compiler caches common suffix fragments,
426 // which are very common in UTF-8 (e.g., [80-bf]).
427 // The fragment suffixes are identified by their start
428 // instructions.  NULL denotes the eventual end match.
429 // The Frag accumulates in rune_range_.  Caching common
430 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
431 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
432 
// Begins a new alternation: clears the suffix cache and resets the
// accumulated rune-range fragment to "nothing added yet".
void Compiler::BeginRange() {
  rune_cache_.clear();
  rune_range_.begin = 0;
  rune_range_.end = kNullPatchList;
}
438 
UncachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)439 int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
440                                      int next) {
441   Frag f = ByteRange(lo, hi, foldcase);
442   if (next != 0) {
443     PatchList::Patch(inst_.data(), f.end, next);
444   } else {
445     rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
446   }
447   return f.begin;
448 }
449 
// Packs (lo, hi, foldcase, next) into one 64-bit suffix-cache key.
// Layout: next in bits 17 and up, lo in bits 9-16, hi in bits 1-8,
// foldcase in bit 0.
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  uint64_t key = foldcase ? 1 : 0;
  key |= (uint64_t)hi << 1;
  key |= (uint64_t)lo << 9;
  key |= (uint64_t)next << 17;
  return key;
}
457 
CachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)458 int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
459                                    int next) {
460   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
461   std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
462   if (it != rune_cache_.end())
463     return it->second;
464   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
465   rune_cache_[key] = id;
466   return id;
467 }
468 
IsCachedRuneByteSuffix(int id)469 bool Compiler::IsCachedRuneByteSuffix(int id) {
470   uint8_t lo = inst_[id].lo_;
471   uint8_t hi = inst_[id].hi_;
472   bool foldcase = inst_[id].foldcase() != 0;
473   int next = inst_[id].out();
474 
475   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
476   return rune_cache_.find(key) != rune_cache_.end();
477 }
478 
// Adds a suffix (the start instruction of a byte-sequence fragment)
// to the alternation being accumulated in rune_range_.
void Compiler::AddSuffix(int id) {
  if (failed_)
    return;

  // First suffix becomes the range's entry point directly.
  if (rune_range_.begin == 0) {
    rune_range_.begin = id;
    return;
  }

  if (encoding_ == kEncodingUTF8) {
    // Build a trie in order to reduce fanout.
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
    return;
  }

  // Latin-1: simple chain of Alts, new suffix on out1.
  int alt = AllocInst(1);
  if (alt < 0) {
    rune_range_.begin = 0;  // mark the range unusable on allocation failure
    return;
  }
  inst_[alt].InitAlt(rune_range_.begin, id);
  rune_range_.begin = alt;
}
502 
// Adds suffix id into the trie rooted at root, sharing a common leading
// byte range when one exists.  Returns the (possibly new) root node, or
// zero iff allocating an instruction fails.
int Compiler::AddSuffixRecursive(int root, int id) {
  DCHECK(inst_[root].opcode() == kInstAlt ||
         inst_[root].opcode() == kInstByteRange);

  Frag f = FindByteRange(root, id);
  if (IsNoMatch(f)) {
    // No existing branch shares id's first byte range:
    // prepend a new Alt in front of the old root.
    int alt = AllocInst(1);
    if (alt < 0)
      return 0;
    inst_[alt].InitAlt(root, id);
    return alt;
  }

  // Resolve f to the matching ByteRange instruction br
  // (root itself, or the out1/out target of the Alt f points at).
  int br;
  if (f.end.head == 0)
    br = root;
  else if (f.end.head&1)
    br = inst_[f.begin].out1();
  else
    br = inst_[f.begin].out();

  if (IsCachedRuneByteSuffix(br)) {
    // We can't fiddle with cached suffixes, so make a clone of the head.
    int byterange = AllocInst(1);
    if (byterange < 0)
      return 0;
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
                                   inst_[br].foldcase(), inst_[br].out());

    // Ensure that the parent points to the clone, not to the original.
    // Note that this could leave the head unreachable except via the cache.
    br = byterange;
    if (f.end.head == 0)
      root = br;
    else if (f.end.head&1)
      inst_[f.begin].out1_ = br;
    else
      inst_[f.begin].set_out(br);
  }

  int out = inst_[id].out();
  if (!IsCachedRuneByteSuffix(id)) {
    // The head should be the instruction most recently allocated, so free it
    // instead of leaving it unreachable.
    DCHECK_EQ(id, ninst_-1);
    inst_[id].out_opcode_ = 0;
    inst_[id].out1_ = 0;
    ninst_--;
  }

  // Merge the remainder of id's suffix below the shared byte range.
  out = AddSuffixRecursive(inst_[br].out(), out);
  if (out == 0)
    return 0;

  inst_[br].set_out(out);
  return root;
}
560 
ByteRangeEqual(int id1,int id2)561 bool Compiler::ByteRangeEqual(int id1, int id2) {
562   return inst_[id1].lo() == inst_[id2].lo() &&
563          inst_[id1].hi() == inst_[id2].hi() &&
564          inst_[id1].foldcase() == inst_[id2].foldcase();
565 }
566 
// Finds the trie node whose byte range equals id's.  Returns a Frag in
// order to distinguish pointing at the root node directly (end.head == 0)
// from pointing at an Alt's out1 or out (end.head&1 == 1 or 0).
Frag Compiler::FindByteRange(int root, int id) {
  // A single-entry trie level: root itself may be the ByteRange.
  if (inst_[root].opcode() == kInstByteRange) {
    if (ByteRangeEqual(root, id))
      return Frag(root, kNullPatchList);
    else
      return NoMatch();
  }

  // Otherwise walk the chain of Alts, checking each out1 branch.
  while (inst_[root].opcode() == kInstAlt) {
    int out1 = inst_[root].out1();
    if (ByteRangeEqual(out1, id))
      return Frag(root, PatchList::Mk((root << 1) | 1));

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    int out = inst_[root].out();
    if (inst_[out].opcode() == kInstAlt)
      root = out;
    else if (ByteRangeEqual(out, id))
      return Frag(root, PatchList::Mk(root << 1));
    else
      return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}
598 
// Returns the alternation of all the suffixes added since BeginRange.
Frag Compiler::EndRange() {
  return rune_range_;
}
602 
603 // Converts rune range lo-hi into a fragment that recognizes
604 // the bytes that would make up those runes in the current
605 // encoding (Latin 1 or UTF-8).
606 // This lets the machine work byte-by-byte even when
607 // using multibyte encodings.
608 
AddRuneRange(Rune lo,Rune hi,bool foldcase)609 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
610   switch (encoding_) {
611     default:
612     case kEncodingUTF8:
613       AddRuneRangeUTF8(lo, hi, foldcase);
614       break;
615     case kEncodingLatin1:
616       AddRuneRangeLatin1(lo, hi, foldcase);
617       break;
618   }
619 }
620 
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)621 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
622   // Latin-1 is easy: runes *are* bytes.
623   if (lo > hi || lo > 0xFF)
624     return;
625   if (hi > 0xFF)
626     hi = 0xFF;
627   AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
628                                    static_cast<uint8_t>(hi), foldcase, 0));
629 }
630 
void Compiler::Add_80_10ffff() {
  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
  // permitting overlong encodings in E0 and F0 sequences and code points
  // over 10FFFF in F4 sequences, the size of the bytecode and the number
  // of equivalence classes are reduced significantly.
  int id;
  if (reversed_) {
    // Prefix factoring matters, but we don't have to handle it here
    // because the rune range trie logic takes care of that already.
    // Two-byte sequences (bytes emitted last-to-first in reverse mode).
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    // Three-byte sequences: E0-EF plus two continuation bytes.
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    // Four-byte sequences: F0-F4 plus three continuation bytes.
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);
  } else {
    // Suffix factoring matters - and we do have to handle it here.
    // cont1/cont2/cont3 are shared tails of 1, 2 and 3 continuation bytes.
    int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
    AddSuffix(id);

    int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
    AddSuffix(id);

    int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
    AddSuffix(id);
  }
}
670 
// Compiles the rune range lo-hi into UTF-8 byte-range suffixes,
// recursively splitting until lo and hi have UTF-8 encodings that
// differ only in trailing continuation bytes.
void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case.
  if (lo == 0x80 && hi == 0x10ffff) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  // (Each piece then encodes to a fixed number of UTF-8 bytes.)
  for (int i = 1; i < UTFmax; i++) {
    Rune max = MaxRune(i);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max+1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                     static_cast<uint8_t>(hi), foldcase, 0));
    return;
  }

  // Split range into sections that agree on leading bytes.
  for (int i = 1; i < UTFmax; i++) {
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
    if ((lo & ~m) != (hi & ~m)) {
      if ((lo & m) != 0) {
        AddRuneRangeUTF8(lo, lo|m, foldcase);
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
        return;
      }
      if ((hi & m) != m) {
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
        return;
      }
    }
  }

  // Finally.  Generate byte matching equivalent for lo-hi.
  // At this point lo and hi encode to the same length n and differ
  // only where a per-byte range [ulo[i], uhi[i]] is exact.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
  (void)m;  // USED(m)
  DCHECK_EQ(n, m);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < n; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = n-1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}
768 
769 // Should not be called.
// Should not be called.
Frag Compiler::Copy(Frag arg) {
  // We're using WalkExponential; there should be no copying.
  // Mark the compilation failed so the caller discards the result.
  LOG(DFATAL) << "Compiler::Copy called!";
  failed_ = true;
  return NoMatch();
}
776 
777 // Visits a node quickly; called once WalkExponential has
778 // decided to cut this walk short.
// Visits a node quickly; called once WalkExponential has
// decided to cut this walk short.  Treated as a compile failure.
Frag Compiler::ShortVisit(Regexp* re, Frag) {
  failed_ = true;
  return NoMatch();
}
783 
784 // Called before traversing a node's children during the walk.
// Called before traversing a node's children during the walk.
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
  // Cut off walk if we've already failed.
  if (failed_)
    *stop = true;

  return Frag();  // not used by caller
}
792 
Literal(Rune r,bool foldcase)793 Frag Compiler::Literal(Rune r, bool foldcase) {
794   switch (encoding_) {
795     default:
796       return Frag();
797 
798     case kEncodingLatin1:
799       return ByteRange(r, r, foldcase);
800 
801     case kEncodingUTF8: {
802       if (r < Runeself)  // Make common case fast.
803         return ByteRange(r, r, foldcase);
804       uint8_t buf[UTFmax];
805       int n = runetochar(reinterpret_cast<char*>(buf), &r);
806       Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
807       for (int i = 1; i < n; i++)
808         f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
809       return f;
810     }
811   }
812 }
813 
814 // Called after traversing the node's children during the walk.
815 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)816 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
817                          int nchild_frags) {
818   // If a child failed, don't bother going forward, especially
819   // since the child_frags might contain Frags with NULLs in them.
820   if (failed_)
821     return NoMatch();
822 
823   // Given the child fragments, return the fragment for this node.
824   switch (re->op()) {
825     case kRegexpRepeat:
826       // Should not see; code at bottom of function will print error
827       break;
828 
829     case kRegexpNoMatch:
830       return NoMatch();
831 
832     case kRegexpEmptyMatch:
833       return Nop();
834 
835     case kRegexpHaveMatch: {
836       Frag f = Match(re->match_id());
837       if (anchor_ == RE2::ANCHOR_BOTH) {
838         // Append \z or else the subexpression will effectively be unanchored.
839         // Complemented by the UNANCHORED case in CompileSet().
840         f = Cat(EmptyWidth(kEmptyEndText), f);
841       }
842       return f;
843     }
844 
845     case kRegexpConcat: {
846       Frag f = child_frags[0];
847       for (int i = 1; i < nchild_frags; i++)
848         f = Cat(f, child_frags[i]);
849       return f;
850     }
851 
852     case kRegexpAlternate: {
853       Frag f = child_frags[0];
854       for (int i = 1; i < nchild_frags; i++)
855         f = Alt(f, child_frags[i]);
856       return f;
857     }
858 
859     case kRegexpStar:
860       return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
861 
862     case kRegexpPlus:
863       return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
864 
865     case kRegexpQuest:
866       return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
867 
868     case kRegexpLiteral:
869       return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
870 
871     case kRegexpLiteralString: {
872       // Concatenation of literals.
873       if (re->nrunes() == 0)
874         return Nop();
875       Frag f;
876       for (int i = 0; i < re->nrunes(); i++) {
877         Frag f1 = Literal(re->runes()[i],
878                           (re->parse_flags()&Regexp::FoldCase) != 0);
879         if (i == 0)
880           f = f1;
881         else
882           f = Cat(f, f1);
883       }
884       return f;
885     }
886 
887     case kRegexpAnyChar:
888       BeginRange();
889       AddRuneRange(0, Runemax, false);
890       return EndRange();
891 
892     case kRegexpAnyByte:
893       return ByteRange(0x00, 0xFF, false);
894 
895     case kRegexpCharClass: {
896       CharClass* cc = re->cc();
897       if (cc->empty()) {
898         // This can't happen.
899         LOG(DFATAL) << "No ranges in char class";
900         failed_ = true;
901         return NoMatch();
902       }
903 
904       // ASCII case-folding optimization: if the char class
905       // behaves the same on A-Z as it does on a-z,
906       // discard any ranges wholly contained in A-Z
907       // and mark the other ranges as foldascii.
908       // This reduces the size of a program for
909       // (?i)abc from 3 insts per letter to 1 per letter.
910       bool foldascii = cc->FoldsASCII();
911 
912       // Character class is just a big OR of the different
913       // character ranges in the class.
914       BeginRange();
915       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
916         // ASCII case-folding optimization (see above).
917         if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
918           continue;
919 
920         // If this range contains all of A-Za-z or none of it,
921         // the fold flag is unnecessary; don't bother.
922         bool fold = foldascii;
923         if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
924             ('Z' < i->lo && i->hi < 'a'))
925           fold = false;
926 
927         AddRuneRange(i->lo, i->hi, fold);
928       }
929       return EndRange();
930     }
931 
932     case kRegexpCapture:
933       // If this is a non-capturing parenthesis -- (?:foo) --
934       // just use the inner expression.
935       if (re->cap() < 0)
936         return child_frags[0];
937       return Capture(child_frags[0], re->cap());
938 
939     case kRegexpBeginLine:
940       return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
941 
942     case kRegexpEndLine:
943       return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
944 
945     case kRegexpBeginText:
946       return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
947 
948     case kRegexpEndText:
949       return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
950 
951     case kRegexpWordBoundary:
952       return EmptyWidth(kEmptyWordBoundary);
953 
954     case kRegexpNoWordBoundary:
955       return EmptyWidth(kEmptyNonWordBoundary);
956   }
957   LOG(DFATAL) << "Missing case in Compiler: " << re->op();
958   failed_ = true;
959   return NoMatch();
960 }
961 
962 // Is this regexp required to start at the beginning of the text?
963 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
964 // but handles (\A(a|b)).  Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp.  As the comment
  // above says, IsAnchorStart is conservative, so returning
  // a false negative is okay.  The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      // A concatenation is start-anchored iff its first element is.
      if (re->nsub() > 0) {
        sub = re->sub()[0]->Incref();
        if (IsAnchorStart(&sub, depth+1)) {
          // Rebuild the concatenation with the rewritten
          // (anchor-stripped) first element; the remaining children
          // just gain a reference each for the new node.
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[0] = sub;  // already have reference
          for (int i = 1; i < re->nsub(); i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();  // drop the caller's reference to the old tree
          return true;
        }
        sub->Decref();  // not anchored: release the extra reference
      }
      break;
    case kRegexpCapture:
      // Look through a capture group, rebuilding it (with the same
      // capture index) around the rewritten subexpression on success.
      sub = re->sub()[0]->Incref();
      if (IsAnchorStart(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpBeginText:
      // Found \A itself: replace it with an empty literal string,
      // since the caller will record the anchoring separately.
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}
1008 
1009 // Is this regexp required to start at the end of the text?
1010 // Only approximate; can return false for complicated regexps like (a\z|b\z),
1011 // but handles ((a|b)\z).  Could use the Walker to write a more exact one.
static bool IsAnchorEnd(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp.  As the comment
  // above says, IsAnchorEnd is conservative, so returning
  // a false negative is okay.  The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      // A concatenation is end-anchored iff its last element is.
      if (re->nsub() > 0) {
        sub = re->sub()[re->nsub() - 1]->Incref();
        if (IsAnchorEnd(&sub, depth+1)) {
          // Rebuild the concatenation with the rewritten
          // (anchor-stripped) last element; the remaining children
          // just gain a reference each for the new node.
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[re->nsub() - 1] = sub;  // already have reference
          for (int i = 0; i < re->nsub() - 1; i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();  // drop the caller's reference to the old tree
          return true;
        }
        sub->Decref();  // not anchored: release the extra reference
      }
      break;
    case kRegexpCapture:
      // Look through a capture group, rebuilding it (with the same
      // capture index) around the rewritten subexpression on success.
      sub = re->sub()[0]->Incref();
      if (IsAnchorEnd(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpEndText:
      // Found \z itself: replace it with an empty literal string,
      // since the caller will record the anchoring separately.
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}
1055 
Setup(Regexp::ParseFlags flags,int64_t max_mem,RE2::Anchor anchor)1056 void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1057                      RE2::Anchor anchor) {
1058   if (flags & Regexp::Latin1)
1059     encoding_ = kEncodingLatin1;
1060   max_mem_ = max_mem;
1061   if (max_mem <= 0) {
1062     max_ninst_ = 100000;  // more than enough
1063   } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1064     // No room for anything.
1065     max_ninst_ = 0;
1066   } else {
1067     int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1068     // Limit instruction count so that inst->id() fits nicely in an int.
1069     // SparseArray also assumes that the indices (inst->id()) are ints.
1070     // The call to WalkExponential uses 2*max_ninst_ below,
1071     // and other places in the code use 2 or 3 * prog->size().
1072     // Limiting to 2^24 should avoid overflow in those places.
1073     // (The point of allowing more than 32 bits of memory is to
1074     // have plenty of room for the DFA states, not to use it up
1075     // on the program.)
1076     if (m >= 1<<24)
1077       m = 1<<24;
1078     // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1079     if (m > Prog::Inst::kMaxInst)
1080       m = Prog::Inst::kMaxInst;
1081     max_ninst_ = static_cast<int>(m);
1082   }
1083   anchor_ = anchor;
1084 }
1085 
1086 // Compiles re, returning program.
1087 // Caller is responsible for deleting prog_.
1088 // If reversed is true, compiles a program that expects
1089 // to run over the input string backward (reverses all concatenations).
1090 // The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
  Compiler c;
  // Anchor mode is only consulted by CompileSet, hence "unused" here.
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
  c.reversed_ = reversed;

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Record whether prog is anchored, removing the anchors.
  // (They get in the way of other optimizations.)
  bool is_anchor_start = IsAnchorStart(&sre, 0);
  bool is_anchor_end = IsAnchorEnd(&sre, 0);

  // Generate fragment for entire regexp.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Success!  Finish by putting Match node at end, and record start.
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
  // to behave normally.
  c.reversed_ = false;
  all = c.Cat(all, c.Match(0));

  // In a reversed program the text anchors swap roles: what anchored
  // the end of the pattern constrains the start of the backward scan.
  c.prog_->set_reversed(reversed);
  if (c.prog_->reversed()) {
    c.prog_->set_anchor_start(is_anchor_end);
    c.prog_->set_anchor_end(is_anchor_start);
  } else {
    c.prog_->set_anchor_start(is_anchor_start);
    c.prog_->set_anchor_end(is_anchor_end);
  }

  c.prog_->set_start(all.begin);
  if (!c.prog_->anchor_start()) {
    // Also create unanchored version, which starts with a .*? loop.
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start_unanchored(all.begin);

  // Hand ownership of prog_ to caller.
  return c.Finish(re);
}
1138 
// Finalizes prog_: hands off the instruction array, runs optimization
// and flattening, records prefix-acceleration data and the remaining
// DFA memory budget, then transfers ownership of prog_ to the caller.
// Returns NULL if compilation failed.
Prog* Compiler::Finish(Regexp* re) {
  if (failed_)
    return NULL;

  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
    // No possible matches; keep Fail instruction only.
    ninst_ = 1;
  }

  // Hand off the array to Prog.
  prog_->inst_ = std::move(inst_);
  prog_->size_ = ninst_;

  prog_->Optimize();
  prog_->Flatten();
  prog_->ComputeByteMap();

  if (!prog_->reversed()) {
    // Record a required literal prefix (first/last byte and length) so
    // forward searches can skip ahead quickly; case-folded prefixes are
    // not recorded because they have no single byte pair to scan for.
    std::string prefix;
    bool prefix_foldcase;
    if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) &&
        !prefix_foldcase) {
      prog_->prefix_size_ = prefix.size();
      prog_->prefix_front_ = prefix.front();
      prog_->prefix_back_ = prefix.back();
    }
  }

  // Record remaining memory for DFA.
  if (max_mem_ <= 0) {
    // Unlimited budget: give the DFA a fixed 1MB default.
    prog_->set_dfa_mem(1<<20);
  } else {
    int64_t m = max_mem_ - sizeof(Prog);
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
    if (prog_->CanBitState())
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
    if (m < 0)
      m = 0;
    prog_->set_dfa_mem(m);
  }

  // Hand ownership of prog_ to the caller; clear our pointer so the
  // Compiler destructor doesn't free it.
  Prog* p = prog_;
  prog_ = NULL;
  return p;
}
1184 
1185 // Converts Regexp to Prog.
CompileToProg(int64_t max_mem)1186 Prog* Regexp::CompileToProg(int64_t max_mem) {
1187   return Compiler::Compile(this, false, max_mem);
1188 }
1189 
CompileToReverseProg(int64_t max_mem)1190 Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1191   return Compiler::Compile(this, true, max_mem);
1192 }
1193 
DotStar()1194 Frag Compiler::DotStar() {
1195   return Star(ByteRange(0x00, 0xff, false), true);
1196 }
1197 
1198 // Compiles RE set to Prog.
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, anchor);

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Generate fragment for the entire set of alternatives.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Set programs are compiled with explicit anchoring baked in
  // (see the UNANCHORED handling below), so both flags are set.
  c.prog_->set_anchor_start(true);
  c.prog_->set_anchor_end(true);

  if (anchor == RE2::UNANCHORED) {
    // Prepend .* or else the expression will effectively be anchored.
    // Complemented by the ANCHOR_BOTH case in PostVisit().
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start(all.begin);
  c.prog_->set_start_unanchored(all.begin);

  Prog* prog = c.Finish(re);
  if (prog == NULL)
    return NULL;

  // Make sure DFA has enough memory to operate,
  // since we're not going to fall back to the NFA.
  // A tiny probe search is enough to force DFA construction.
  bool dfa_failed = false;
  StringPiece sp = "hello, world";
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
                  NULL, &dfa_failed, NULL);
  if (dfa_failed) {
    // DFA could not be built within the memory budget; the set is unusable.
    delete prog;
    return NULL;
  }

  return prog;
}
1240 
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1241 Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1242   return Compiler::CompileSet(re, anchor, max_mem);
1243 }
1244 
1245 }  // namespace re2
1246