1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10
11 #include <stdint.h>
12 #include <string.h>
13 #include <unordered_map>
14 #include <utility>
15
16 #include "util/logging.h"
17 #include "util/utf.h"
18 #include "re2/pod_array.h"
19 #include "re2/prog.h"
20 #include "re2/re2.h"
21 #include "re2/regexp.h"
22 #include "re2/walker-inl.h"
23
24 namespace re2 {
25
26 // List of pointers to Inst* that need to be filled in (patched).
27 // Because the Inst* haven't been filled in yet,
28 // we can use the Inst* word to hold the list's "next" pointer.
29 // It's kind of sleazy, but it works well in practice.
30 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31 //
32 // Because the out and out1 fields in Inst are no longer pointers,
33 // we can't use pointers directly here either. Instead, head refers
34 // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
35 // head == 0 represents the NULL list. This is okay because instruction #0
36 // is always the fail instruction, which never appears on a list.
37 struct PatchList {
38 // Returns patch list containing just p.
Mkre2::PatchList39 static PatchList Mk(uint32_t p) {
40 return {p, p};
41 }
42
43 // Patches all the entries on l to have value p.
44 // Caller must not ever use patch list again.
Patchre2::PatchList45 static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
46 while (l.head != 0) {
47 Prog::Inst* ip = &inst0[l.head>>1];
48 if (l.head&1) {
49 l.head = ip->out1();
50 ip->out1_ = p;
51 } else {
52 l.head = ip->out();
53 ip->set_out(p);
54 }
55 }
56 }
57
58 // Appends two patch lists and returns result.
Appendre2::PatchList59 static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
60 if (l1.head == 0)
61 return l2;
62 if (l2.head == 0)
63 return l1;
64 Prog::Inst* ip = &inst0[l1.tail>>1];
65 if (l1.tail&1)
66 ip->out1_ = l2.head;
67 else
68 ip->set_out(l2.head);
69 return {l1.head, l2.tail};
70 }
71
72 uint32_t head;
73 uint32_t tail; // for constant-time append
74 };
75
76 static const PatchList kNullPatchList = {0, 0};
77
78 // Compiled program fragment.
79 struct Frag {
80 uint32_t begin;
81 PatchList end;
82 bool nullable;
83
Fragre2::Frag84 Frag() : begin(0), end(kNullPatchList), nullable(false) {}
Fragre2::Frag85 Frag(uint32_t begin, PatchList end, bool nullable)
86 : begin(begin), end(end), nullable(nullable) {}
87 };
88
89 // Input encodings.
90 enum Encoding {
91 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
92 kEncodingLatin1, // Latin-1 (0-FF)
93 };
94
95 class Compiler : public Regexp::Walker<Frag> {
96 public:
97 explicit Compiler();
98 ~Compiler();
99
100 // Compiles Regexp to a new Prog.
101 // Caller is responsible for deleting Prog when finished with it.
102 // If reversed is true, compiles for walking over the input
103 // string backward (reverses all concatenations).
104 static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
105
106 // Compiles alternation of all the re to a new Prog.
107 // Each re has a match with an id equal to its index in the vector.
108 static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
109
110 // Interface for Regexp::Walker, which helps traverse the Regexp.
111 // The walk is purely post-recursive: given the machines for the
112 // children, PostVisit combines them to create the machine for
113 // the current node. The child_args are Frags.
114 // The Compiler traverses the Regexp parse tree, visiting
115 // each node in depth-first order. It invokes PreVisit before
116 // visiting the node's children and PostVisit after visiting
117 // the children.
118 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
119 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
120 int nchild_args);
121 Frag ShortVisit(Regexp* re, Frag parent_arg);
122 Frag Copy(Frag arg);
123
124 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
125 Frag Plus(Frag a, bool nongreedy);
126 Frag Star(Frag a, bool nongreedy);
127 Frag Quest(Frag a, bool nongreedy);
128
129 // Given fragment a, returns (a) capturing as \n.
130 Frag Capture(Frag a, int n);
131
132 // Given fragments a and b, returns ab; a|b
133 Frag Cat(Frag a, Frag b);
134 Frag Alt(Frag a, Frag b);
135
136 // Returns a fragment that can't match anything.
137 Frag NoMatch();
138
139 // Returns a fragment that matches the empty string.
140 Frag Match(int32_t id);
141
142 // Returns a no-op fragment.
143 Frag Nop();
144
145 // Returns a fragment matching the byte range lo-hi.
146 Frag ByteRange(int lo, int hi, bool foldcase);
147
148 // Returns a fragment matching an empty-width special op.
149 Frag EmptyWidth(EmptyOp op);
150
151 // Adds n instructions to the program.
152 // Returns the index of the first one.
153 // Returns -1 if no more instructions are available.
154 int AllocInst(int n);
155
156 // Rune range compiler.
157
158 // Begins a new alternation.
159 void BeginRange();
160
161 // Adds a fragment matching the rune range lo-hi.
162 void AddRuneRange(Rune lo, Rune hi, bool foldcase);
163 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
164 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
165 void Add_80_10ffff();
166
167 // New suffix that matches the byte range lo-hi, then goes to next.
168 int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
169 int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
170
171 // Returns true iff the suffix is cached.
172 bool IsCachedRuneByteSuffix(int id);
173
174 // Adds a suffix to alternation.
175 void AddSuffix(int id);
176
177 // Adds a suffix to the trie starting from the given root node.
178 // Returns zero iff allocating an instruction fails. Otherwise, returns
179 // the current root node, which might be different from what was given.
180 int AddSuffixRecursive(int root, int id);
181
182 // Finds the trie node for the given suffix. Returns a Frag in order to
183 // distinguish between pointing at the root node directly (end.head == 0)
184 // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
185 Frag FindByteRange(int root, int id);
186
187 // Compares two ByteRanges and returns true iff they are equal.
188 bool ByteRangeEqual(int id1, int id2);
189
190 // Returns the alternation of all the added suffixes.
191 Frag EndRange();
192
193 // Single rune.
194 Frag Literal(Rune r, bool foldcase);
195
196 void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
197 Prog* Finish(Regexp* re);
198
199 // Returns .* where dot = any byte
200 Frag DotStar();
201
202 private:
203 Prog* prog_; // Program being built.
204 bool failed_; // Did we give up compiling?
205 Encoding encoding_; // Input encoding
206 bool reversed_; // Should program run backward over text?
207
208 PODArray<Prog::Inst> inst_;
209 int ninst_; // Number of instructions used.
210 int max_ninst_; // Maximum number of instructions.
211
212 int64_t max_mem_; // Total memory budget.
213
214 std::unordered_map<uint64_t, int> rune_cache_;
215 Frag rune_range_;
216
217 RE2::Anchor anchor_; // anchor mode for RE2::Set
218
219 Compiler(const Compiler&) = delete;
220 Compiler& operator=(const Compiler&) = delete;
221 };
222
Compiler()223 Compiler::Compiler() {
224 prog_ = new Prog();
225 failed_ = false;
226 encoding_ = kEncodingUTF8;
227 reversed_ = false;
228 ninst_ = 0;
229 max_ninst_ = 1; // make AllocInst for fail instruction okay
230 max_mem_ = 0;
231 int fail = AllocInst(1);
232 inst_[fail].InitFail();
233 max_ninst_ = 0; // Caller must change
234 }
235
~Compiler()236 Compiler::~Compiler() {
237 delete prog_;
238 }
239
AllocInst(int n)240 int Compiler::AllocInst(int n) {
241 if (failed_ || ninst_ + n > max_ninst_) {
242 failed_ = true;
243 return -1;
244 }
245
246 if (ninst_ + n > inst_.size()) {
247 int cap = inst_.size();
248 if (cap == 0)
249 cap = 8;
250 while (ninst_ + n > cap)
251 cap *= 2;
252 PODArray<Prog::Inst> inst(cap);
253 if (inst_.data() != NULL)
254 memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
255 memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
256 inst_ = std::move(inst);
257 }
258 int id = ninst_;
259 ninst_ += n;
260 return id;
261 }
262
263 // These routines are somewhat hard to visualize in text --
264 // see http://swtch.com/~rsc/regexp/regexp1.html for
265 // pictures explaining what is going on here.
266
267 // Returns an unmatchable fragment.
NoMatch()268 Frag Compiler::NoMatch() {
269 return Frag();
270 }
271
272 // Is a an unmatchable fragment?
IsNoMatch(Frag a)273 static bool IsNoMatch(Frag a) {
274 return a.begin == 0;
275 }
276
277 // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)278 Frag Compiler::Cat(Frag a, Frag b) {
279 if (IsNoMatch(a) || IsNoMatch(b))
280 return NoMatch();
281
282 // Elide no-op.
283 Prog::Inst* begin = &inst_[a.begin];
284 if (begin->opcode() == kInstNop &&
285 a.end.head == (a.begin << 1) &&
286 begin->out() == 0) {
287 // in case refs to a somewhere
288 PatchList::Patch(inst_.data(), a.end, b.begin);
289 return b;
290 }
291
292 // To run backward over string, reverse all concatenations.
293 if (reversed_) {
294 PatchList::Patch(inst_.data(), b.end, a.begin);
295 return Frag(b.begin, a.end, b.nullable && a.nullable);
296 }
297
298 PatchList::Patch(inst_.data(), a.end, b.begin);
299 return Frag(a.begin, b.end, a.nullable && b.nullable);
300 }
301
302 // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)303 Frag Compiler::Alt(Frag a, Frag b) {
304 // Special case for convenience in loops.
305 if (IsNoMatch(a))
306 return b;
307 if (IsNoMatch(b))
308 return a;
309
310 int id = AllocInst(1);
311 if (id < 0)
312 return NoMatch();
313
314 inst_[id].InitAlt(a.begin, b.begin);
315 return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
316 a.nullable || b.nullable);
317 }
318
319 // When capturing submatches in like-Perl mode, a kOpAlt Inst
320 // treats out_ as the first choice, out1_ as the second.
321 //
322 // For *, +, and ?, if out_ causes another repetition,
323 // then the operator is greedy. If out1_ is the repetition
324 // (and out_ moves forward), then the operator is non-greedy.
325
326 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)327 Frag Compiler::Plus(Frag a, bool nongreedy) {
328 int id = AllocInst(1);
329 if (id < 0)
330 return NoMatch();
331 PatchList pl;
332 if (nongreedy) {
333 inst_[id].InitAlt(0, a.begin);
334 pl = PatchList::Mk(id << 1);
335 } else {
336 inst_[id].InitAlt(a.begin, 0);
337 pl = PatchList::Mk((id << 1) | 1);
338 }
339 PatchList::Patch(inst_.data(), a.end, id);
340 return Frag(a.begin, pl, a.nullable);
341 }
342
343 // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)344 Frag Compiler::Star(Frag a, bool nongreedy) {
345 // When the subexpression is nullable, one Alt isn't enough to guarantee
346 // correct priority ordering within the transitive closure. The simplest
347 // solution is to handle it as (a+)? instead, which adds the second Alt.
348 if (a.nullable)
349 return Quest(Plus(a, nongreedy), nongreedy);
350
351 int id = AllocInst(1);
352 if (id < 0)
353 return NoMatch();
354 PatchList pl;
355 if (nongreedy) {
356 inst_[id].InitAlt(0, a.begin);
357 pl = PatchList::Mk(id << 1);
358 } else {
359 inst_[id].InitAlt(a.begin, 0);
360 pl = PatchList::Mk((id << 1) | 1);
361 }
362 PatchList::Patch(inst_.data(), a.end, id);
363 return Frag(id, pl, true);
364 }
365
366 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)367 Frag Compiler::Quest(Frag a, bool nongreedy) {
368 if (IsNoMatch(a))
369 return Nop();
370 int id = AllocInst(1);
371 if (id < 0)
372 return NoMatch();
373 PatchList pl;
374 if (nongreedy) {
375 inst_[id].InitAlt(0, a.begin);
376 pl = PatchList::Mk(id << 1);
377 } else {
378 inst_[id].InitAlt(a.begin, 0);
379 pl = PatchList::Mk((id << 1) | 1);
380 }
381 return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
382 }
383
384 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)385 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
386 int id = AllocInst(1);
387 if (id < 0)
388 return NoMatch();
389 inst_[id].InitByteRange(lo, hi, foldcase, 0);
390 return Frag(id, PatchList::Mk(id << 1), false);
391 }
392
393 // Returns a no-op fragment. Sometimes unavoidable.
Nop()394 Frag Compiler::Nop() {
395 int id = AllocInst(1);
396 if (id < 0)
397 return NoMatch();
398 inst_[id].InitNop(0);
399 return Frag(id, PatchList::Mk(id << 1), true);
400 }
401
402 // Returns a fragment that signals a match.
Match(int32_t match_id)403 Frag Compiler::Match(int32_t match_id) {
404 int id = AllocInst(1);
405 if (id < 0)
406 return NoMatch();
407 inst_[id].InitMatch(match_id);
408 return Frag(id, kNullPatchList, false);
409 }
410
411 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)412 Frag Compiler::EmptyWidth(EmptyOp empty) {
413 int id = AllocInst(1);
414 if (id < 0)
415 return NoMatch();
416 inst_[id].InitEmptyWidth(empty, 0);
417 return Frag(id, PatchList::Mk(id << 1), true);
418 }
419
420 // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)421 Frag Compiler::Capture(Frag a, int n) {
422 if (IsNoMatch(a))
423 return NoMatch();
424 int id = AllocInst(2);
425 if (id < 0)
426 return NoMatch();
427 inst_[id].InitCapture(2*n, a.begin);
428 inst_[id+1].InitCapture(2*n+1, 0);
429 PatchList::Patch(inst_.data(), a.end, id+1);
430
431 return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
432 }
433
434 // A Rune is a name for a Unicode code point.
435 // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)436 static int MaxRune(int len) {
437 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
438 if (len == 1)
439 b = 7;
440 else
441 b = 8-(len+1) + 6*(len-1);
442 return (1<<b) - 1; // maximum Rune for b bits.
443 }
444
445 // The rune range compiler caches common suffix fragments,
446 // which are very common in UTF-8 (e.g., [80-bf]).
447 // The fragment suffixes are identified by their start
448 // instructions. NULL denotes the eventual end match.
449 // The Frag accumulates in rune_range_. Caching common
450 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
451 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
452
BeginRange()453 void Compiler::BeginRange() {
454 rune_cache_.clear();
455 rune_range_.begin = 0;
456 rune_range_.end = kNullPatchList;
457 }
458
UncachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)459 int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
460 int next) {
461 Frag f = ByteRange(lo, hi, foldcase);
462 if (next != 0) {
463 PatchList::Patch(inst_.data(), f.end, next);
464 } else {
465 rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
466 }
467 return f.begin;
468 }
469
MakeRuneCacheKey(uint8_t lo,uint8_t hi,bool foldcase,int next)470 static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
471 int next) {
472 return (uint64_t)next << 17 |
473 (uint64_t)lo << 9 |
474 (uint64_t)hi << 1 |
475 (uint64_t)foldcase;
476 }
477
CachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)478 int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
479 int next) {
480 uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
481 std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
482 if (it != rune_cache_.end())
483 return it->second;
484 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
485 rune_cache_[key] = id;
486 return id;
487 }
488
IsCachedRuneByteSuffix(int id)489 bool Compiler::IsCachedRuneByteSuffix(int id) {
490 uint8_t lo = inst_[id].lo_;
491 uint8_t hi = inst_[id].hi_;
492 bool foldcase = inst_[id].foldcase() != 0;
493 int next = inst_[id].out();
494
495 uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
496 return rune_cache_.find(key) != rune_cache_.end();
497 }
498
AddSuffix(int id)499 void Compiler::AddSuffix(int id) {
500 if (failed_)
501 return;
502
503 if (rune_range_.begin == 0) {
504 rune_range_.begin = id;
505 return;
506 }
507
508 if (encoding_ == kEncodingUTF8) {
509 // Build a trie in order to reduce fanout.
510 rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
511 return;
512 }
513
514 int alt = AllocInst(1);
515 if (alt < 0) {
516 rune_range_.begin = 0;
517 return;
518 }
519 inst_[alt].InitAlt(rune_range_.begin, id);
520 rune_range_.begin = alt;
521 }
522
AddSuffixRecursive(int root,int id)523 int Compiler::AddSuffixRecursive(int root, int id) {
524 DCHECK(inst_[root].opcode() == kInstAlt ||
525 inst_[root].opcode() == kInstByteRange);
526
527 Frag f = FindByteRange(root, id);
528 if (IsNoMatch(f)) {
529 int alt = AllocInst(1);
530 if (alt < 0)
531 return 0;
532 inst_[alt].InitAlt(root, id);
533 return alt;
534 }
535
536 int br;
537 if (f.end.head == 0)
538 br = root;
539 else if (f.end.head&1)
540 br = inst_[f.begin].out1();
541 else
542 br = inst_[f.begin].out();
543
544 if (IsCachedRuneByteSuffix(br)) {
545 // We can't fiddle with cached suffixes, so make a clone of the head.
546 int byterange = AllocInst(1);
547 if (byterange < 0)
548 return 0;
549 inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
550 inst_[br].foldcase(), inst_[br].out());
551
552 // Ensure that the parent points to the clone, not to the original.
553 // Note that this could leave the head unreachable except via the cache.
554 br = byterange;
555 if (f.end.head == 0)
556 root = br;
557 else if (f.end.head&1)
558 inst_[f.begin].out1_ = br;
559 else
560 inst_[f.begin].set_out(br);
561 }
562
563 int out = inst_[id].out();
564 if (!IsCachedRuneByteSuffix(id)) {
565 // The head should be the instruction most recently allocated, so free it
566 // instead of leaving it unreachable.
567 DCHECK_EQ(id, ninst_-1);
568 inst_[id].out_opcode_ = 0;
569 inst_[id].out1_ = 0;
570 ninst_--;
571 }
572
573 out = AddSuffixRecursive(inst_[br].out(), out);
574 if (out == 0)
575 return 0;
576
577 inst_[br].set_out(out);
578 return root;
579 }
580
ByteRangeEqual(int id1,int id2)581 bool Compiler::ByteRangeEqual(int id1, int id2) {
582 return inst_[id1].lo() == inst_[id2].lo() &&
583 inst_[id1].hi() == inst_[id2].hi() &&
584 inst_[id1].foldcase() == inst_[id2].foldcase();
585 }
586
FindByteRange(int root,int id)587 Frag Compiler::FindByteRange(int root, int id) {
588 if (inst_[root].opcode() == kInstByteRange) {
589 if (ByteRangeEqual(root, id))
590 return Frag(root, kNullPatchList, false);
591 else
592 return NoMatch();
593 }
594
595 while (inst_[root].opcode() == kInstAlt) {
596 int out1 = inst_[root].out1();
597 if (ByteRangeEqual(out1, id))
598 return Frag(root, PatchList::Mk((root << 1) | 1), false);
599
600 // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
601 // what we're looking for, then we can stop immediately. Unfortunately, we
602 // can't short-circuit the search in reverse mode.
603 if (!reversed_)
604 return NoMatch();
605
606 int out = inst_[root].out();
607 if (inst_[out].opcode() == kInstAlt)
608 root = out;
609 else if (ByteRangeEqual(out, id))
610 return Frag(root, PatchList::Mk(root << 1), false);
611 else
612 return NoMatch();
613 }
614
615 LOG(DFATAL) << "should never happen";
616 return NoMatch();
617 }
618
EndRange()619 Frag Compiler::EndRange() {
620 return rune_range_;
621 }
622
623 // Converts rune range lo-hi into a fragment that recognizes
624 // the bytes that would make up those runes in the current
625 // encoding (Latin 1 or UTF-8).
626 // This lets the machine work byte-by-byte even when
627 // using multibyte encodings.
628
AddRuneRange(Rune lo,Rune hi,bool foldcase)629 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
630 switch (encoding_) {
631 default:
632 case kEncodingUTF8:
633 AddRuneRangeUTF8(lo, hi, foldcase);
634 break;
635 case kEncodingLatin1:
636 AddRuneRangeLatin1(lo, hi, foldcase);
637 break;
638 }
639 }
640
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)641 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
642 // Latin-1 is easy: runes *are* bytes.
643 if (lo > hi || lo > 0xFF)
644 return;
645 if (hi > 0xFF)
646 hi = 0xFF;
647 AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
648 static_cast<uint8_t>(hi), foldcase, 0));
649 }
650
Add_80_10ffff()651 void Compiler::Add_80_10ffff() {
652 // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
653 // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
654 // permitting overlong encodings in E0 and F0 sequences and code points
655 // over 10FFFF in F4 sequences, the size of the bytecode and the number
656 // of equivalence classes are reduced significantly.
657 int id;
658 if (reversed_) {
659 // Prefix factoring matters, but we don't have to handle it here
660 // because the rune range trie logic takes care of that already.
661 id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
662 id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
663 AddSuffix(id);
664
665 id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
666 id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
667 id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
668 AddSuffix(id);
669
670 id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
671 id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
672 id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
673 id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
674 AddSuffix(id);
675 } else {
676 // Suffix factoring matters - and we do have to handle it here.
677 int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
678 id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
679 AddSuffix(id);
680
681 int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
682 id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
683 AddSuffix(id);
684
685 int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
686 id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
687 AddSuffix(id);
688 }
689 }
690
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)691 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
692 if (lo > hi)
693 return;
694
695 // Pick off 80-10FFFF as a common special case.
696 if (lo == 0x80 && hi == 0x10ffff) {
697 Add_80_10ffff();
698 return;
699 }
700
701 // Split range into same-length sized ranges.
702 for (int i = 1; i < UTFmax; i++) {
703 Rune max = MaxRune(i);
704 if (lo <= max && max < hi) {
705 AddRuneRangeUTF8(lo, max, foldcase);
706 AddRuneRangeUTF8(max+1, hi, foldcase);
707 return;
708 }
709 }
710
711 // ASCII range is always a special case.
712 if (hi < Runeself) {
713 AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
714 static_cast<uint8_t>(hi), foldcase, 0));
715 return;
716 }
717
718 // Split range into sections that agree on leading bytes.
719 for (int i = 1; i < UTFmax; i++) {
720 uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
721 if ((lo & ~m) != (hi & ~m)) {
722 if ((lo & m) != 0) {
723 AddRuneRangeUTF8(lo, lo|m, foldcase);
724 AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
725 return;
726 }
727 if ((hi & m) != m) {
728 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
729 AddRuneRangeUTF8(hi&~m, hi, foldcase);
730 return;
731 }
732 }
733 }
734
735 // Finally. Generate byte matching equivalent for lo-hi.
736 uint8_t ulo[UTFmax], uhi[UTFmax];
737 int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
738 int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
739 (void)m; // USED(m)
740 DCHECK_EQ(n, m);
741
742 // The logic below encodes this thinking:
743 //
744 // 1. When we have built the whole suffix, we know that it cannot
745 // possibly be a suffix of anything longer: in forward mode, nothing
746 // else can occur before the leading byte; in reverse mode, nothing
747 // else can occur after the last continuation byte or else the leading
748 // byte would have to change. Thus, there is no benefit to caching
749 // the first byte of the suffix whereas there is a cost involved in
750 // cloning it if it begins a common prefix, which is fairly likely.
751 //
752 // 2. Conversely, the last byte of the suffix cannot possibly be a
753 // prefix of anything because next == 0, so we will never want to
754 // clone it, but it is fairly likely to be a common suffix. Perhaps
755 // more so in reverse mode than in forward mode because the former is
756 // "converging" towards lower entropy, but caching is still worthwhile
757 // for the latter in cases such as 80-BF.
758 //
759 // 3. Handling the bytes between the first and the last is less
760 // straightforward and, again, the approach depends on whether we are
761 // "converging" towards lower entropy: in forward mode, a single byte
762 // is unlikely to be part of a common suffix whereas a byte range
763 // is more likely so; in reverse mode, a byte range is unlikely to
764 // be part of a common suffix whereas a single byte is more likely
765 // so. The same benefit versus cost argument applies here.
766 int id = 0;
767 if (reversed_) {
768 for (int i = 0; i < n; i++) {
769 // In reverse UTF-8 mode: cache the leading byte; don't cache the last
770 // continuation byte; cache anything else iff it's a single byte (XX-XX).
771 if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
772 id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
773 else
774 id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
775 }
776 } else {
777 for (int i = n-1; i >= 0; i--) {
778 // In forward UTF-8 mode: don't cache the leading byte; cache the last
779 // continuation byte; cache anything else iff it's a byte range (XX-YY).
780 if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
781 id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
782 else
783 id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
784 }
785 }
786 AddSuffix(id);
787 }
788
789 // Should not be called.
Copy(Frag arg)790 Frag Compiler::Copy(Frag arg) {
791 // We're using WalkExponential; there should be no copying.
792 LOG(DFATAL) << "Compiler::Copy called!";
793 failed_ = true;
794 return NoMatch();
795 }
796
797 // Visits a node quickly; called once WalkExponential has
798 // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)799 Frag Compiler::ShortVisit(Regexp* re, Frag) {
800 failed_ = true;
801 return NoMatch();
802 }
803
804 // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)805 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
806 // Cut off walk if we've already failed.
807 if (failed_)
808 *stop = true;
809
810 return Frag(); // not used by caller
811 }
812
Literal(Rune r,bool foldcase)813 Frag Compiler::Literal(Rune r, bool foldcase) {
814 switch (encoding_) {
815 default:
816 return Frag();
817
818 case kEncodingLatin1:
819 return ByteRange(r, r, foldcase);
820
821 case kEncodingUTF8: {
822 if (r < Runeself) // Make common case fast.
823 return ByteRange(r, r, foldcase);
824 uint8_t buf[UTFmax];
825 int n = runetochar(reinterpret_cast<char*>(buf), &r);
826 Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
827 for (int i = 1; i < n; i++)
828 f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
829 return f;
830 }
831 }
832 }
833
834 // Called after traversing the node's children during the walk.
835 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)836 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
837 int nchild_frags) {
838 // If a child failed, don't bother going forward, especially
839 // since the child_frags might contain Frags with NULLs in them.
840 if (failed_)
841 return NoMatch();
842
843 // Given the child fragments, return the fragment for this node.
844 switch (re->op()) {
845 case kRegexpRepeat:
846 // Should not see; code at bottom of function will print error
847 break;
848
849 case kRegexpNoMatch:
850 return NoMatch();
851
852 case kRegexpEmptyMatch:
853 return Nop();
854
855 case kRegexpHaveMatch: {
856 Frag f = Match(re->match_id());
857 if (anchor_ == RE2::ANCHOR_BOTH) {
858 // Append \z or else the subexpression will effectively be unanchored.
859 // Complemented by the UNANCHORED case in CompileSet().
860 f = Cat(EmptyWidth(kEmptyEndText), f);
861 }
862 return f;
863 }
864
865 case kRegexpConcat: {
866 Frag f = child_frags[0];
867 for (int i = 1; i < nchild_frags; i++)
868 f = Cat(f, child_frags[i]);
869 return f;
870 }
871
872 case kRegexpAlternate: {
873 Frag f = child_frags[0];
874 for (int i = 1; i < nchild_frags; i++)
875 f = Alt(f, child_frags[i]);
876 return f;
877 }
878
879 case kRegexpStar:
880 return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
881
882 case kRegexpPlus:
883 return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
884
885 case kRegexpQuest:
886 return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
887
888 case kRegexpLiteral:
889 return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
890
891 case kRegexpLiteralString: {
892 // Concatenation of literals.
893 if (re->nrunes() == 0)
894 return Nop();
895 Frag f;
896 for (int i = 0; i < re->nrunes(); i++) {
897 Frag f1 = Literal(re->runes()[i],
898 (re->parse_flags()&Regexp::FoldCase) != 0);
899 if (i == 0)
900 f = f1;
901 else
902 f = Cat(f, f1);
903 }
904 return f;
905 }
906
907 case kRegexpAnyChar:
908 BeginRange();
909 AddRuneRange(0, Runemax, false);
910 return EndRange();
911
912 case kRegexpAnyByte:
913 return ByteRange(0x00, 0xFF, false);
914
915 case kRegexpCharClass: {
916 CharClass* cc = re->cc();
917 if (cc->empty()) {
918 // This can't happen.
919 LOG(DFATAL) << "No ranges in char class";
920 failed_ = true;
921 return NoMatch();
922 }
923
924 // ASCII case-folding optimization: if the char class
925 // behaves the same on A-Z as it does on a-z,
926 // discard any ranges wholly contained in A-Z
927 // and mark the other ranges as foldascii.
928 // This reduces the size of a program for
929 // (?i)abc from 3 insts per letter to 1 per letter.
930 bool foldascii = cc->FoldsASCII();
931
932 // Character class is just a big OR of the different
933 // character ranges in the class.
934 BeginRange();
935 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
936 // ASCII case-folding optimization (see above).
937 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
938 continue;
939
940 // If this range contains all of A-Za-z or none of it,
941 // the fold flag is unnecessary; don't bother.
942 bool fold = foldascii;
943 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
944 ('Z' < i->lo && i->hi < 'a'))
945 fold = false;
946
947 AddRuneRange(i->lo, i->hi, fold);
948 }
949 return EndRange();
950 }
951
952 case kRegexpCapture:
953 // If this is a non-capturing parenthesis -- (?:foo) --
954 // just use the inner expression.
955 if (re->cap() < 0)
956 return child_frags[0];
957 return Capture(child_frags[0], re->cap());
958
959 case kRegexpBeginLine:
960 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
961
962 case kRegexpEndLine:
963 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
964
965 case kRegexpBeginText:
966 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
967
968 case kRegexpEndText:
969 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
970
971 case kRegexpWordBoundary:
972 return EmptyWidth(kEmptyWordBoundary);
973
974 case kRegexpNoWordBoundary:
975 return EmptyWidth(kEmptyNonWordBoundary);
976 }
977 LOG(DFATAL) << "Missing case in Compiler: " << re->op();
978 failed_ = true;
979 return NoMatch();
980 }
981
982 // Is this regexp required to start at the beginning of the text?
983 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
984 // but handles (\A(a|b)). Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)985 static bool IsAnchorStart(Regexp** pre, int depth) {
986 Regexp* re = *pre;
987 Regexp* sub;
988 // The depth limit makes sure that we don't overflow
989 // the stack on a deeply nested regexp. As the comment
990 // above says, IsAnchorStart is conservative, so returning
991 // a false negative is okay. The exact limit is somewhat arbitrary.
992 if (re == NULL || depth >= 4)
993 return false;
994 switch (re->op()) {
995 default:
996 break;
997 case kRegexpConcat:
998 if (re->nsub() > 0) {
999 sub = re->sub()[0]->Incref();
1000 if (IsAnchorStart(&sub, depth+1)) {
1001 PODArray<Regexp*> subcopy(re->nsub());
1002 subcopy[0] = sub; // already have reference
1003 for (int i = 1; i < re->nsub(); i++)
1004 subcopy[i] = re->sub()[i]->Incref();
1005 *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1006 re->Decref();
1007 return true;
1008 }
1009 sub->Decref();
1010 }
1011 break;
1012 case kRegexpCapture:
1013 sub = re->sub()[0]->Incref();
1014 if (IsAnchorStart(&sub, depth+1)) {
1015 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1016 re->Decref();
1017 return true;
1018 }
1019 sub->Decref();
1020 break;
1021 case kRegexpBeginText:
1022 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1023 re->Decref();
1024 return true;
1025 }
1026 return false;
1027 }
1028
1029 // Is this regexp required to start at the end of the text?
1030 // Only approximate; can return false for complicated regexps like (a\z|b\z),
1031 // but handles ((a|b)\z). Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)1032 static bool IsAnchorEnd(Regexp** pre, int depth) {
1033 Regexp* re = *pre;
1034 Regexp* sub;
1035 // The depth limit makes sure that we don't overflow
1036 // the stack on a deeply nested regexp. As the comment
1037 // above says, IsAnchorEnd is conservative, so returning
1038 // a false negative is okay. The exact limit is somewhat arbitrary.
1039 if (re == NULL || depth >= 4)
1040 return false;
1041 switch (re->op()) {
1042 default:
1043 break;
1044 case kRegexpConcat:
1045 if (re->nsub() > 0) {
1046 sub = re->sub()[re->nsub() - 1]->Incref();
1047 if (IsAnchorEnd(&sub, depth+1)) {
1048 PODArray<Regexp*> subcopy(re->nsub());
1049 subcopy[re->nsub() - 1] = sub; // already have reference
1050 for (int i = 0; i < re->nsub() - 1; i++)
1051 subcopy[i] = re->sub()[i]->Incref();
1052 *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1053 re->Decref();
1054 return true;
1055 }
1056 sub->Decref();
1057 }
1058 break;
1059 case kRegexpCapture:
1060 sub = re->sub()[0]->Incref();
1061 if (IsAnchorEnd(&sub, depth+1)) {
1062 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1063 re->Decref();
1064 return true;
1065 }
1066 sub->Decref();
1067 break;
1068 case kRegexpEndText:
1069 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1070 re->Decref();
1071 return true;
1072 }
1073 return false;
1074 }
1075
Setup(Regexp::ParseFlags flags,int64_t max_mem,RE2::Anchor anchor)1076 void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1077 RE2::Anchor anchor) {
1078 if (flags & Regexp::Latin1)
1079 encoding_ = kEncodingLatin1;
1080 max_mem_ = max_mem;
1081 if (max_mem <= 0) {
1082 max_ninst_ = 100000; // more than enough
1083 } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1084 // No room for anything.
1085 max_ninst_ = 0;
1086 } else {
1087 int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1088 // Limit instruction count so that inst->id() fits nicely in an int.
1089 // SparseArray also assumes that the indices (inst->id()) are ints.
1090 // The call to WalkExponential uses 2*max_ninst_ below,
1091 // and other places in the code use 2 or 3 * prog->size().
1092 // Limiting to 2^24 should avoid overflow in those places.
1093 // (The point of allowing more than 32 bits of memory is to
1094 // have plenty of room for the DFA states, not to use it up
1095 // on the program.)
1096 if (m >= 1<<24)
1097 m = 1<<24;
1098 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1099 if (m > Prog::Inst::kMaxInst)
1100 m = Prog::Inst::kMaxInst;
1101 max_ninst_ = static_cast<int>(m);
1102 }
1103 anchor_ = anchor;
1104 }
1105
1106 // Compiles re, returning program.
1107 // Caller is responsible for deleting prog_.
1108 // If reversed is true, compiles a program that expects
1109 // to run over the input string backward (reverses all concatenations).
1110 // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64_t max_mem)1111 Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
1112 Compiler c;
1113 c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
1114 c.reversed_ = reversed;
1115
1116 // Simplify to remove things like counted repetitions
1117 // and character classes like \d.
1118 Regexp* sre = re->Simplify();
1119 if (sre == NULL)
1120 return NULL;
1121
1122 // Record whether prog is anchored, removing the anchors.
1123 // (They get in the way of other optimizations.)
1124 bool is_anchor_start = IsAnchorStart(&sre, 0);
1125 bool is_anchor_end = IsAnchorEnd(&sre, 0);
1126
1127 // Generate fragment for entire regexp.
1128 Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1129 sre->Decref();
1130 if (c.failed_)
1131 return NULL;
1132
1133 // Success! Finish by putting Match node at end, and record start.
1134 // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1135 // to behave normally.
1136 c.reversed_ = false;
1137 all = c.Cat(all, c.Match(0));
1138
1139 c.prog_->set_reversed(reversed);
1140 if (c.prog_->reversed()) {
1141 c.prog_->set_anchor_start(is_anchor_end);
1142 c.prog_->set_anchor_end(is_anchor_start);
1143 } else {
1144 c.prog_->set_anchor_start(is_anchor_start);
1145 c.prog_->set_anchor_end(is_anchor_end);
1146 }
1147
1148 c.prog_->set_start(all.begin);
1149 if (!c.prog_->anchor_start()) {
1150 // Also create unanchored version, which starts with a .*? loop.
1151 all = c.Cat(c.DotStar(), all);
1152 }
1153 c.prog_->set_start_unanchored(all.begin);
1154
1155 // Hand ownership of prog_ to caller.
1156 return c.Finish(re);
1157 }
1158
Finish(Regexp * re)1159 Prog* Compiler::Finish(Regexp* re) {
1160 if (failed_)
1161 return NULL;
1162
1163 if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1164 // No possible matches; keep Fail instruction only.
1165 ninst_ = 1;
1166 }
1167
1168 // Hand off the array to Prog.
1169 prog_->inst_ = std::move(inst_);
1170 prog_->size_ = ninst_;
1171
1172 prog_->Optimize();
1173 prog_->Flatten();
1174 prog_->ComputeByteMap();
1175
1176 if (!prog_->reversed()) {
1177 std::string prefix;
1178 bool prefix_foldcase;
1179 if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
1180 prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
1181 }
1182
1183 // Record remaining memory for DFA.
1184 if (max_mem_ <= 0) {
1185 prog_->set_dfa_mem(1<<20);
1186 } else {
1187 int64_t m = max_mem_ - sizeof(Prog);
1188 m -= prog_->size_*sizeof(Prog::Inst); // account for inst_
1189 if (prog_->CanBitState())
1190 m -= prog_->size_*sizeof(uint16_t); // account for list_heads_
1191 if (m < 0)
1192 m = 0;
1193 prog_->set_dfa_mem(m);
1194 }
1195
1196 Prog* p = prog_;
1197 prog_ = NULL;
1198 return p;
1199 }
1200
1201 // Converts Regexp to Prog.
CompileToProg(int64_t max_mem)1202 Prog* Regexp::CompileToProg(int64_t max_mem) {
1203 return Compiler::Compile(this, false, max_mem);
1204 }
1205
CompileToReverseProg(int64_t max_mem)1206 Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1207 return Compiler::Compile(this, true, max_mem);
1208 }
1209
DotStar()1210 Frag Compiler::DotStar() {
1211 return Star(ByteRange(0x00, 0xff, false), true);
1212 }
1213
1214 // Compiles RE set to Prog.
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1215 Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1216 Compiler c;
1217 c.Setup(re->parse_flags(), max_mem, anchor);
1218
1219 Regexp* sre = re->Simplify();
1220 if (sre == NULL)
1221 return NULL;
1222
1223 Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1224 sre->Decref();
1225 if (c.failed_)
1226 return NULL;
1227
1228 c.prog_->set_anchor_start(true);
1229 c.prog_->set_anchor_end(true);
1230
1231 if (anchor == RE2::UNANCHORED) {
1232 // Prepend .* or else the expression will effectively be anchored.
1233 // Complemented by the ANCHOR_BOTH case in PostVisit().
1234 all = c.Cat(c.DotStar(), all);
1235 }
1236 c.prog_->set_start(all.begin);
1237 c.prog_->set_start_unanchored(all.begin);
1238
1239 Prog* prog = c.Finish(re);
1240 if (prog == NULL)
1241 return NULL;
1242
1243 // Make sure DFA has enough memory to operate,
1244 // since we're not going to fall back to the NFA.
1245 bool dfa_failed = false;
1246 StringPiece sp = "hello, world";
1247 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1248 NULL, &dfa_failed, NULL);
1249 if (dfa_failed) {
1250 delete prog;
1251 return NULL;
1252 }
1253
1254 return prog;
1255 }
1256
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1257 Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1258 return Compiler::CompileSet(re, anchor, max_mem);
1259 }
1260
1261 } // namespace re2
1262