1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7
8 #include "re2/prog.h"
9
10 #if defined(__AVX2__)
11 #include <immintrin.h>
12 #ifdef _MSC_VER
13 #include <intrin.h>
14 #endif
15 #endif
16 #include <stdint.h>
17 #include <string.h>
18 #include <algorithm>
19 #include <memory>
20 #include <utility>
21
22 #include "util/util.h"
23 #include "util/logging.h"
24 #include "util/strutil.h"
25 #include "re2/bitmap256.h"
26 #include "re2/stringpiece.h"
27
28 namespace re2 {
29
30 // Constructors per Inst opcode
31
// Initializes this Inst as an Alt with two successor branches, out and out1.
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_out_opcode(out, kInstAlt);
  out1_ = out1;
}
37
// Initializes this Inst to match one byte in [lo, hi] and then jump to out.
// foldcase requests ASCII case folding; only its low bit is stored.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_out_opcode(out, kInstByteRange);
  lo_ = lo & 0xFF;  // keep only the low byte
  hi_ = hi & 0xFF;
  hint_foldcase_ = foldcase&1;  // hint bits stay 0 until ComputeHints() runs
}
45
// Initializes this Inst to record capture slot cap and then jump to out.
void Prog::Inst::InitCapture(int cap, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_out_opcode(out, kInstCapture);
  cap_ = cap;
}
51
// Initializes this Inst to assert the empty-width conditions in empty
// (^, $, \b, \B, \A, \z) and then jump to out.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_out_opcode(out, kInstEmptyWidth);
  empty_ = empty;
}
57
// Initializes this Inst as a Match carrying the given match id.
void Prog::Inst::InitMatch(int32_t id) {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_opcode(kInstMatch);
  match_id_ = id;
}
63
// Initializes this Inst as a Nop. Note that the out parameter is ignored
// here; the successor is patched separately via set_out().
void Prog::Inst::InitNop(uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_opcode(kInstNop);
}
68
// Initializes this Inst as a Fail (a dead end with no successors).
void Prog::Inst::InitFail() {
  DCHECK_EQ(out_opcode_, 0);  // an Inst must be initialized exactly once
  set_opcode(kInstFail);
}
73
// Returns a one-line, human-readable rendering of this instruction,
// used by the Prog dumpers below for debugging.
std::string Prog::Inst::Dump() {
  switch (opcode()) {
    default:
      return StringPrintf("opcode %d", static_cast<int>(opcode()));

    case kInstAlt:
      return StringPrintf("alt -> %d | %d", out(), out1_);

    case kInstAltMatch:
      return StringPrintf("altmatch -> %d | %d", out(), out1_);

    case kInstByteRange:
      // "/i" marks case-folding; the hint precedes the successor.
      return StringPrintf("byte%s [%02x-%02x] %d -> %d",
                          foldcase() ? "/i" : "",
                          lo_, hi_, hint(), out());

    case kInstCapture:
      return StringPrintf("capture %d -> %d", cap_, out());

    case kInstEmptyWidth:
      return StringPrintf("emptywidth %#x -> %d",
                          static_cast<int>(empty_), out());

    case kInstMatch:
      return StringPrintf("match! %d", match_id());

    case kInstNop:
      return StringPrintf("nop -> %d", out());

    case kInstFail:
      return StringPrintf("fail");
  }
}
107
// Constructs an empty Prog; fields are filled in by the compiler
// (see compile.cc) and by Flatten()/ComputeByteMap() below.
Prog::Prog()
  : anchor_start_(false),
    anchor_end_(false),
    reversed_(false),
    did_flatten_(false),
    did_onepass_(false),
    start_(0),
    start_unanchored_(0),
    size_(0),
    bytemap_range_(0),
    prefix_foldcase_(false),
    prefix_size_(0),
    list_count_(0),
    bit_state_text_max_size_(0),
    dfa_mem_(0),
    dfa_first_(NULL),
    dfa_longest_(NULL) {
}
126
Prog::~Prog() {
  DeleteDFA(dfa_longest_);
  DeleteDFA(dfa_first_);
  // prefix_dfa_ is only allocated (by BuildShiftDFA()) when the
  // case-folding prefix accelerator was configured.
  if (prefix_foldcase_)
    delete[] prefix_dfa_;
}
133
134 typedef SparseSet Workq;
135
AddToQueue(Workq * q,int id)136 static inline void AddToQueue(Workq* q, int id) {
137 if (id != 0)
138 q->insert(id);
139 }
140
// Dumps all instructions reachable from the ids already in q, appending
// newly discovered successors to q as it iterates (SparseSet iteration
// remains valid across insertion).
static std::string ProgToString(Prog* prog, Workq* q) {
  std::string s;
  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
    int id = *i;
    Prog::Inst* ip = prog->inst(id);
    s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
    AddToQueue(q, ip->out());
    // Only Alt/AltMatch have a second successor.
    if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
      AddToQueue(q, ip->out1());
  }
  return s;
}
153
FlattenedProgToString(Prog * prog,int start)154 static std::string FlattenedProgToString(Prog* prog, int start) {
155 std::string s;
156 for (int id = start; id < prog->size(); id++) {
157 Prog::Inst* ip = prog->inst(id);
158 if (ip->last())
159 s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
160 else
161 s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
162 }
163 return s;
164 }
165
// Returns a human-readable dump of the program starting at start_.
// Uses the linear dumper after flattening, the graph dumper before.
std::string Prog::Dump() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_);

  Workq q(size_);
  AddToQueue(&q, start_);
  return ProgToString(this, &q);
}
174
// Like Dump(), but starts from the unanchored start instruction.
std::string Prog::DumpUnanchored() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_unanchored_);

  Workq q(size_);
  AddToQueue(&q, start_unanchored_);
  return ProgToString(this, &q);
}
183
// Returns a human-readable dump of the byte class map, coalescing
// consecutive bytes that share a class into a single [lo-hi] line.
std::string Prog::DumpByteMap() {
  std::string map;
  for (int c = 0; c < 256; c++) {
    int b = bytemap_[c];
    int lo = c;
    // Advance c to the last byte of this run of equal classes; the
    // outer for-loop's c++ then moves on to the next run.
    while (c < 256-1 && bytemap_[c+1] == b)
      c++;
    int hi = c;
    map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
  }
  return map;
}
196
// Is ip a guaranteed match at end of text, perhaps after some capturing?
// Walks forward through Capture/Nop instructions (which consume no input)
// until reaching an instruction that decides the answer.
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  for (;;) {
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
        return false;

      case kInstAlt:
      case kInstAltMatch:
      case kInstByteRange:
      case kInstFail:
      case kInstEmptyWidth:
        // Any of these might consume input or fail, so no guarantee.
        return false;

      case kInstCapture:
      case kInstNop:
        // Pure epsilon transitions: follow out and keep looking.
        ip = prog->inst(ip->out());
        break;

      case kInstMatch:
        return true;
    }
  }
}
222
// Peep-hole optimizer.
void Prog::Optimize() {
  Workq q(size_);

  // Eliminate nops.  Most are taken out during compilation
  // but a few are hard to avoid.
  q.clear();
  AddToQueue(&q, start_);
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    int id = *i;

    Inst* ip = inst(id);
    // Short-circuit chains of Nops on the primary successor.
    int j = ip->out();
    Inst* jp;
    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
      j = jp->out();
    }
    ip->set_out(j);
    AddToQueue(&q, ip->out());

    // Do the same for the second successor of Alt instructions.
    if (ip->opcode() == kInstAlt) {
      j = ip->out1();
      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
        j = jp->out();
      }
      ip->out1_ = j;
      AddToQueue(&q, ip->out1());
    }
  }

  // Insert kInstAltMatch instructions
  // Look for
  //   ip: Alt -> j | k
  //	  j: ByteRange [00-FF] -> ip
  //    k: Match
  // or the reverse (the above is the greedy one).
  // Rewrite Alt to AltMatch.
  q.clear();
  AddToQueue(&q, start_);
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    int id = *i;
    Inst* ip = inst(id);
    AddToQueue(&q, ip->out());
    if (ip->opcode() == kInstAlt)
      AddToQueue(&q, ip->out1());

    if (ip->opcode() == kInstAlt) {
      Inst* j = inst(ip->out());
      Inst* k = inst(ip->out1());
      // Greedy form: ".*" loop first, then the match.
      if (j->opcode() == kInstByteRange && j->out() == id &&
          j->lo() == 0x00 && j->hi() == 0xFF &&
          IsMatch(this, k)) {
        ip->set_opcode(kInstAltMatch);
        continue;
      }
      // Non-greedy form: match first, then the ".*" loop.
      if (IsMatch(this, j) &&
          k->opcode() == kInstByteRange && k->out() == id &&
          k->lo() == 0x00 && k->hi() == 0xFF) {
        ip->set_opcode(kInstAltMatch);
      }
    }
  }
}
286
// Computes the bitmask of empty-width conditions that hold at position p
// within text, for evaluating kInstEmptyWidth instructions. p must point
// into text or one past its end.
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
  int flags = 0;

  // ^ and \A
  if (p == text.data())
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (p == text.data() + text.size())
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.data() + text.size() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B
  if (p == text.data() && p == text.data() + text.size()) {
    // no word boundary here: text is empty
  } else if (p == text.data()) {
    if (IsWordChar(p[0]))
      flags |= kEmptyWordBoundary;
  } else if (p == text.data() + text.size()) {
    if (IsWordChar(p[-1]))
      flags |= kEmptyWordBoundary;
  } else {
    if (IsWordChar(p[-1]) != IsWordChar(p[0]))
      flags |= kEmptyWordBoundary;
  }
  // \b and \B are exact complements.
  if (!(flags & kEmptyWordBoundary))
    flags |= kEmptyNonWordBoundary;

  return flags;
}
320
321 // ByteMapBuilder implements a coloring algorithm.
322 //
323 // The first phase is a series of "mark and merge" batches: we mark one or more
324 // [lo-hi] ranges, then merge them into our internal state. Batching is not for
325 // performance; rather, it means that the ranges are treated indistinguishably.
326 //
327 // Internally, the ranges are represented using a bitmap that stores the splits
328 // and a vector that stores the colors; both of them are indexed by the ranges'
329 // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
330 // hi (if not already split), then recolor each range in between. The color map
331 // (i.e. from the old color to the new color) is maintained for the lifetime of
332 // the batch and so underpins this somewhat obscure approach to set operations.
333 //
334 // The second phase builds the bytemap from our internal state: we recolor each
335 // range, then store the new color (which is now the byte class) in each of the
336 // corresponding array elements. Finally, we output the number of byte classes.
class ByteMapBuilder {
 public:
  ByteMapBuilder() {
    // Initial state: the [0-255] range has color 256.
    // This will avoid problems during the second phase,
    // in which we assign byte classes numbered from 0.
    splits_.Set(255);
    colors_[255] = 256;
    nextcolor_ = 257;
  }

  // Records a [lo-hi] byte range in the current batch.
  void Mark(int lo, int hi);
  // Merges the current batch of ranges into the internal state.
  void Merge();
  // Emits the final byte classes into bytemap and their count
  // into *bytemap_range.
  void Build(uint8_t* bytemap, int* bytemap_range);

 private:
  // Maps oldcolor to its replacement for the current batch,
  // allocating a fresh color on first sight.
  int Recolor(int oldcolor);

  Bitmap256 splits_;  // bit set at the last byte of each range
  int colors_[256];   // color of each range, indexed by its last byte
  int nextcolor_;     // next color to allocate
  std::vector<std::pair<int, int>> colormap_;  // per-batch old -> new colors
  std::vector<std::pair<int, int>> ranges_;    // ranges marked this batch

  ByteMapBuilder(const ByteMapBuilder&) = delete;
  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
};
364
// Records the byte range [lo-hi] for the current batch; lo and hi are
// inclusive and must satisfy 0 <= lo <= hi <= 255.
void ByteMapBuilder::Mark(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  // Ignore any [0-255] ranges. They cause us to recolor every range, which
  // has no effect on the eventual result and is therefore a waste of time.
  if (lo == 0 && hi == 255)
    return;

  ranges_.emplace_back(lo, hi);
}
379
// Merges the batch of ranges recorded by Mark() into the internal state:
// for each range, split at lo-1 and hi (if not already split), then
// recolor every subrange in between.
void ByteMapBuilder::Merge() {
  for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
       it != ranges_.end();
       ++it) {
    int lo = it->first-1;
    int hi = it->second;

    // Ensure a split ends exactly at lo-1; a newly created split point
    // inherits the color of the range that previously enclosed it.
    if (0 <= lo && !splits_.Test(lo)) {
      splits_.Set(lo);
      int next = splits_.FindNextSetBit(lo+1);
      colors_[lo] = colors_[next];
    }
    // Likewise ensure a split ends exactly at hi.
    if (!splits_.Test(hi)) {
      splits_.Set(hi);
      int next = splits_.FindNextSetBit(hi+1);
      colors_[hi] = colors_[next];
    }

    // Recolor every subrange between the two splits.
    int c = lo+1;
    while (c < 256) {
      int next = splits_.FindNextSetBit(c);
      colors_[next] = Recolor(colors_[next]);
      if (next == hi)
        break;
      c = next+1;
    }
  }
  // The color map is only meaningful within a single batch.
  colormap_.clear();
  ranges_.clear();
}
410
// Second phase: writes the final byte class of each byte into bytemap
// and the number of distinct classes into *bytemap_range.
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
  // Assign byte classes numbered from 0.
  nextcolor_ = 0;

  int c = 0;
  while (c < 256) {
    int next = splits_.FindNextSetBit(c);
    // Recolor() compacts the internal colors into dense class numbers.
    uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
    while (c <= next) {
      bytemap[c] = b;
      c++;
    }
  }

  *bytemap_range = nextcolor_;
}
427
// Returns the replacement color for oldcolor within the current batch,
// allocating a new color the first time oldcolor (or its replacement)
// is seen.
int ByteMapBuilder::Recolor(int oldcolor) {
  // Yes, this is a linear search. There can be at most 256
  // colors and there will typically be far fewer than that.
  // Also, we need to consider keys *and* values in order to
  // avoid recoloring a given range more than once per batch.
  std::vector<std::pair<int, int>>::const_iterator it =
      std::find_if(colormap_.begin(), colormap_.end(),
                   [=](const std::pair<int, int>& kv) -> bool {
                     return kv.first == oldcolor || kv.second == oldcolor;
                   });
  if (it != colormap_.end())
    return it->second;
  int newcolor = nextcolor_;
  nextcolor_++;
  colormap_.emplace_back(oldcolor, newcolor);
  return newcolor;
}
445
void Prog::ComputeByteMap() {
  // Fill in bytemap with byte classes for the program.
  // Ranges of bytes that are treated indistinguishably
  // will be mapped to a single byte class.
  ByteMapBuilder builder;

  // Don't repeat the work for ^ and $.
  bool marked_line_boundaries = false;
  // Don't repeat the work for \b and \B.
  bool marked_word_boundaries = false;

  for (int id = 0; id < size(); id++) {
    Inst* ip = inst(id);
    if (ip->opcode() == kInstByteRange) {
      int lo = ip->lo();
      int hi = ip->hi();
      builder.Mark(lo, hi);
      // For case-folded ranges, also mark the uppercase image of the
      // overlap with [a-z], since those bytes match too.
      if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
        int foldlo = lo;
        int foldhi = hi;
        if (foldlo < 'a')
          foldlo = 'a';
        if (foldhi > 'z')
          foldhi = 'z';
        if (foldlo <= foldhi) {
          foldlo += 'A' - 'a';
          foldhi += 'A' - 'a';
          builder.Mark(foldlo, foldhi);
        }
      }
      // If this Inst is not the last Inst in its list AND the next Inst is
      // also a ByteRange AND the Insts have the same out, defer the merge.
      if (!ip->last() &&
          inst(id+1)->opcode() == kInstByteRange &&
          ip->out() == inst(id+1)->out())
        continue;
      builder.Merge();
    } else if (ip->opcode() == kInstEmptyWidth) {
      // ^ and $ only need to distinguish '\n' from everything else.
      if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
          !marked_line_boundaries) {
        builder.Mark('\n', '\n');
        builder.Merge();
        marked_line_boundaries = true;
      }
      if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
          !marked_word_boundaries) {
        // We require two batches here: the first for ranges that are word
        // characters, the second for ranges that are not word characters.
        for (bool isword : {true, false}) {
          int j;
          // Walk [0-255] as maximal runs of same word-char-ness.
          for (int i = 0; i < 256; i = j) {
            for (j = i + 1; j < 256 &&
                            Prog::IsWordChar(static_cast<uint8_t>(i)) ==
                                Prog::IsWordChar(static_cast<uint8_t>(j));
                 j++)
              ;
            if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
              builder.Mark(i, j - 1);
          }
          builder.Merge();
        }
        marked_word_boundaries = true;
      }
    }
  }

  builder.Build(bytemap_, &bytemap_range_);

  if (0) {  // For debugging, use trivial bytemap.
    LOG(ERROR) << "Using trivial bytemap.";
    for (int i = 0; i < 256; i++)
      bytemap_[i] = static_cast<uint8_t>(i);
    bytemap_range_ = 256;
  }
}
521
522 // Prog::Flatten() implements a graph rewriting algorithm.
523 //
524 // The overall process is similar to epsilon removal, but retains some epsilon
525 // transitions: those from Capture and EmptyWidth instructions; and those from
526 // nullable subexpressions. (The latter avoids quadratic blowup in transitions
527 // in the worst case.) It might be best thought of as Alt instruction elision.
528 //
529 // In conceptual terms, it divides the Prog into "trees" of instructions, then
530 // traverses the "trees" in order to produce "lists" of instructions. A "tree"
531 // is one or more instructions that grow from one "root" instruction to one or
532 // more "leaf" instructions; if a "tree" has exactly one instruction, then the
533 // "root" is also the "leaf". In most cases, a "root" is the successor of some
534 // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535 // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536 // EmptyWidth or Match instruction. However, this is insufficient for handling
537 // nested nullable subexpressions correctly, so in some cases, a "root" is the
538 // dominator of the instructions reachable from some "successor root" (i.e. it
539 // has an unreachable predecessor) and is considered a "dominator root". Since
540 // only Alt instructions can be "dominator roots" (other instructions would be
541 // "leaves"), only Alt instructions are required to be marked as predecessors.
542 //
543 // Dividing the Prog into "trees" comprises two passes: marking the "successor
544 // roots" and the predecessors; and marking the "dominator roots". Sorting the
545 // "successor roots" by their bytecode offsets enables iteration in order from
546 // greatest to least during the second pass; by working backwards in this case
547 // and flooding the graph no further than "leaves" and already marked "roots",
548 // it becomes possible to mark "dominator roots" without doing excessive work.
549 //
550 // Traversing the "trees" is just iterating over the "roots" in order of their
551 // marking and flooding the graph no further than "leaves" and "roots". When a
552 // "leaf" is reached, the instruction is copied with its successor remapped to
553 // its "root" number. When a "root" is reached, a Nop instruction is generated
554 // with its successor remapped similarly. As each "list" is produced, its last
555 // instruction is marked as such. After all of the "lists" have been produced,
556 // a pass over their instructions remaps their successors to bytecode offsets.
void Prog::Flatten() {
  if (did_flatten_)
    return;
  did_flatten_ = true;

  // Scratch structures. It's important that these are reused by functions
  // that we call in loops because they would thrash the heap otherwise.
  SparseSet reachable(size());
  std::vector<int> stk;
  stk.reserve(size());

  // First pass: Marks "successor roots" and predecessors.
  // Builds the mapping from inst-ids to root-ids.
  SparseArray<int> rootmap(size());
  SparseArray<int> predmap(size());
  std::vector<std::vector<int>> predvec;
  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);

  // Second pass: Marks "dominator roots". Iterating from the greatest to
  // the least bytecode offset (skipping the start instructions) bounds the
  // work done by each MarkDominator() flood.
  SparseArray<int> sorted(rootmap);
  std::sort(sorted.begin(), sorted.end(), sorted.less);
  for (SparseArray<int>::const_iterator i = sorted.end() - 1;
       i != sorted.begin();
       --i) {
    if (i->index() != start_unanchored() && i->index() != start())
      MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
  }

  // Third pass: Emits "lists". Remaps outs to root-ids.
  // Builds the mapping from root-ids to flat-ids.
  std::vector<int> flatmap(rootmap.size());
  std::vector<Inst> flat;
  flat.reserve(size());
  for (SparseArray<int>::const_iterator i = rootmap.begin();
       i != rootmap.end();
       ++i) {
    flatmap[i->value()] = static_cast<int>(flat.size());
    EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
    flat.back().set_last();
    // We have the bounds of the "list", so this is the
    // most convenient point at which to compute hints.
    ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
  }

  list_count_ = static_cast<int>(flatmap.size());
  for (int i = 0; i < kNumInst; i++)
    inst_count_[i] = 0;

  // Fourth pass: Remaps outs to flat-ids.
  // Counts instructions by opcode.
  for (int id = 0; id < static_cast<int>(flat.size()); id++) {
    Inst* ip = &flat[id];
    if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
      ip->set_out(flatmap[ip->out()]);
    inst_count_[ip->opcode()]++;
  }

  int total = 0;
  for (int i = 0; i < kNumInst; i++)
    total += inst_count_[i];
  DCHECK_EQ(total, static_cast<int>(flat.size()));

  // Remap start_unanchored and start.
  // MarkSuccessors() assigned root-id 0 to kInstFail, then root-ids 1 and 2
  // to start_unanchored and start (which coincide when they are equal).
  if (start_unanchored() == 0) {
    DCHECK_EQ(start(), 0);
  } else if (start_unanchored() == start()) {
    set_start_unanchored(flatmap[1]);
    set_start(flatmap[1]);
  } else {
    set_start_unanchored(flatmap[1]);
    set_start(flatmap[2]);
  }

  // Finally, replace the old instructions with the new instructions.
  size_ = static_cast<int>(flat.size());
  inst_ = PODArray<Inst>(size_);
  memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);

  // Populate the list heads for BitState.
  // 512 instructions limits the memory footprint to 1KiB.
  if (size_ <= 512) {
    list_heads_ = PODArray<uint16_t>(size_);
    // 0xFF makes it more obvious if we try to look up a non-head.
    memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
    for (int i = 0; i < list_count_; ++i)
      list_heads_[flatmap[i]] = i;
  }

  // BitState allocates a bitmap of size list_count_ * (text.size()+1)
  // for tracking pairs of possibilities that it has already explored.
  const size_t kBitStateBitmapMaxSize = 256*1024;  // max size in bits
  bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1;
}
650
// First pass of Flatten(): depth-first search from start_unanchored that
// marks the out of every ByteRange/Capture/EmptyWidth instruction as a
// "successor root" and records each Alt/AltMatch as a predecessor of both
// of its outs. Root ids are assigned in order of discovery.
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Mark the kInstFail instruction.
  rootmap->set_new(0, rootmap->size());

  // Mark the start_unanchored and start instructions.
  if (!rootmap->has_index(start_unanchored()))
    rootmap->set_new(start_unanchored(), rootmap->size());
  if (!rootmap->has_index(start()))
    rootmap->set_new(start(), rootmap->size());

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    // The goto Loop below follows an instruction's out without the cost
    // of pushing and popping it on the explicit stack.
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
      case kInstAlt:
        // Mark this instruction as a predecessor of each out.
        for (int out : {ip->out(), ip->out1()}) {
          if (!predmap->has_index(out)) {
            predmap->set_new(out, static_cast<int>(predvec->size()));
            predvec->emplace_back();
          }
          (*predvec)[predmap->get_existing(out)].emplace_back(id);
        }
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        // Mark the out of this instruction as a "root".
        if (!rootmap->has_index(ip->out()))
          rootmap->set_new(ip->out(), rootmap->size());
        id = ip->out();
        goto Loop;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        // No successors to follow.
        break;
    }
  }
}
714
// Second pass of Flatten(): floods the graph from root, stopping at
// "leaves" and at other already-marked "roots". Any reached instruction
// with a predecessor outside the flooded set must be dominated from
// elsewhere too, so it is marked as a "dominator root".
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
                         SparseArray<int>* predmap,
                         std::vector<std::vector<int>>* predvec,
                         SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    // goto Loop follows an out without touching the explicit stack.
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition.
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
      case kInstAlt:
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        // "Leaves": do not flood past them.
        break;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        break;
    }
  }

  // Any flooded instruction with a predecessor outside the flood is
  // reachable from some other "tree" and therefore starts its own.
  for (SparseSet::const_iterator i = reachable->begin();
       i != reachable->end();
       ++i) {
    int id = *i;
    if (predmap->has_index(id)) {
      for (int pred : (*predvec)[predmap->get_existing(id)]) {
        if (!reachable->contains(pred)) {
          // id has a predecessor that cannot be reached from root!
          // Therefore, id must be a "root" too - mark it as such.
          if (!rootmap->has_index(id))
            rootmap->set_new(id, rootmap->size());
        }
      }
    }
  }
}
778
// Third pass of Flatten(): floods the "tree" rooted at root and appends a
// copy of each "leaf" to flat with its out remapped to a root-id. Reaching
// another "root" emits a Nop pointing at that root instead of inlining it.
void Prog::EmitList(int root, SparseArray<int>* rootmap,
                    std::vector<Inst>* flat,
                    SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    // goto Loop follows an out without touching the explicit stack.
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition. Emit a kInstNop
      // instruction so that the Prog does not become quadratically larger.
      flat->emplace_back();
      flat->back().set_opcode(kInstNop);
      flat->back().set_out(rootmap->get_existing(id));
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
        // AltMatch's outs are already flat-ids (the next two emitted
        // instructions), so the fourth pass of Flatten() skips them.
        flat->emplace_back();
        flat->back().set_opcode(kInstAltMatch);
        flat->back().set_out(static_cast<int>(flat->size()));
        flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
        FALLTHROUGH_INTENDED;

      case kInstAlt:
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        // Copy the "leaf", remapping its out to a root-id.
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        flat->back().set_out(rootmap->get_existing(ip->out()));
        break;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        // Copy verbatim; these have no out to remap.
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        break;
    }
  }
}
840
841 // For each ByteRange instruction in [begin, end), computes a hint to execution
842 // engines: the delta to the next instruction (in flat) worth exploring iff the
843 // current instruction matched.
844 //
845 // Implements a coloring algorithm related to ByteMapBuilder, but in this case,
846 // colors are instructions and recoloring ranges precisely identifies conflicts
847 // between instructions. Iterating backwards over [begin, end) is guaranteed to
848 // identify the nearest conflict (if any) with only linear complexity.
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
  Bitmap256 splits;
  int colors[256];

  bool dirty = false;
  for (int id = end; id >= begin; --id) {
    if (id == end ||
        (*flat)[id].opcode() != kInstByteRange) {
      // Reset the coloring state at end and at every non-ByteRange
      // instruction; only clear the bitmap if it was actually used.
      if (dirty) {
        dirty = false;
        splits.Clear();
      }
      splits.Set(255);
      colors[255] = id;
      // At this point, the [0-255] range is colored with id.
      // Thus, hints cannot point beyond id; and if id == end,
      // hints that would have pointed to id will be 0 instead.
      continue;
    }
    dirty = true;

    // We recolor the [lo-hi] range with id. Note that first ratchets backwards
    // from end to the nearest conflict (if any) during recoloring.
    int first = end;
    auto Recolor = [&](int lo, int hi) {
      // Like ByteMapBuilder, we split at lo-1 and at hi.
      --lo;

      if (0 <= lo && !splits.Test(lo)) {
        splits.Set(lo);
        int next = splits.FindNextSetBit(lo+1);
        colors[lo] = colors[next];
      }
      if (!splits.Test(hi)) {
        splits.Set(hi);
        int next = splits.FindNextSetBit(hi+1);
        colors[hi] = colors[next];
      }

      int c = lo+1;
      while (c < 256) {
        int next = splits.FindNextSetBit(c);
        // Ratchet backwards...
        first = std::min(first, colors[next]);
        // Recolor with id - because it's the new nearest conflict!
        colors[next] = id;
        if (next == hi)
          break;
        c = next+1;
      }
    };

    Inst* ip = &(*flat)[id];
    int lo = ip->lo();
    int hi = ip->hi();
    Recolor(lo, hi);
    // Case-folded ranges also conflict on the uppercase image of their
    // overlap with [a-z], mirroring ComputeByteMap().
    if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
      int foldlo = lo;
      int foldhi = hi;
      if (foldlo < 'a')
        foldlo = 'a';
      if (foldhi > 'z')
        foldhi = 'z';
      if (foldlo <= foldhi) {
        foldlo += 'A' - 'a';
        foldhi += 'A' - 'a';
        Recolor(foldlo, foldhi);
      }
    }

    if (first != end) {
      // The hint shares storage with the foldcase bit, hence the shift;
      // 32767 is the largest delta that fits in the remaining 15 bits.
      uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
      ip->hint_foldcase_ |= hint<<1;
    }
  }
}
925
// The final state will always be this, which frees up a register for the hot
// loop and thus avoids the spilling that can occur when building with Clang.
static const size_t kShiftDFAFinal = 9;

// This function takes the prefix as std::string (i.e. not const std::string&
// as normal) because it's going to clobber it, so a temporary is convenient.
// Returns a 256-entry transition table (owned by the caller, delete[]'d in
// ~Prog()) for matching the prefix case-insensitively.
static uint64_t* BuildShiftDFA(std::string prefix) {
  // This constant is for convenience now and also for correctness later when
  // we clobber the prefix, but still need to know how long it was initially.
  const size_t size = prefix.size();

  // Construct the NFA.
  // The table is indexed by input byte; each element is a bitfield of states
  // reachable by the input byte. Given a bitfield of the current states, the
  // bitfield of states reachable from those is - for this specific purpose -
  // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives
  // the bitfield of the next states reached by stepping over the input byte.
  // Credits for this technique: the Hyperscan paper by Geoff Langdale et al.
  uint16_t nfa[256]{};
  for (size_t i = 0; i < size; ++i) {
    uint8_t b = prefix[i];
    nfa[b] |= 1 << (i+1);
  }
  // This is the `\C*?` for unanchored search.
  for (int b = 0; b < 256; ++b)
    nfa[b] |= 1;

  // This maps from DFA state to NFA states; the reverse mapping is used when
  // recording transitions and gets implemented with plain old linear search.
  // The "Shift DFA" technique limits this to ten states when using uint64_t;
  // to allow for the initial state, we use at most nine bytes of the prefix.
  // That same limit is also why uint16_t is sufficient for the NFA bitfield.
  uint16_t states[kShiftDFAFinal+1]{};
  states[0] = 1;
  for (size_t dcurr = 0; dcurr < size; ++dcurr) {
    uint8_t b = prefix[dcurr];
    uint16_t ncurr = states[dcurr];
    uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
    size_t dnext = dcurr+1;
    if (dnext == size)
      dnext = kShiftDFAFinal;
    states[dnext] = nnext;
  }

  // Sort and unique the bytes of the prefix to avoid repeating work while we
  // record transitions. This clobbers the prefix, but it's no longer needed.
  std::sort(prefix.begin(), prefix.end());
  prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());

  // Construct the DFA.
  // The table is indexed by input byte; each element is effectively a packed
  // array of uint6_t; each array value will be multiplied by six in order to
  // avoid having to do so later in the hot loop as well as masking/shifting.
  // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen.
  uint64_t* dfa = new uint64_t[256]{};
  // Record a transition from each state for each of the bytes of the prefix.
  // Note that all other input bytes go back to the initial state by default.
  for (size_t dcurr = 0; dcurr < size; ++dcurr) {
    for (uint8_t b : prefix) {
      uint16_t ncurr = states[dcurr];
      uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
      // Reverse-map the NFA state bitfield to its DFA state number.
      size_t dnext = 0;
      while (states[dnext] != nnext)
        ++dnext;
      dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
      // Convert ASCII letters to uppercase and record the extra transitions.
      // Note that ASCII letters are guaranteed to be lowercase at this point
      // because that's how the parser normalises them. #FunFact: 'k' and 's'
      // match U+212A and U+017F, respectively, so they won't occur here when
      // using UTF-8 encoding because the parser will emit character classes.
      if ('a' <= b && b <= 'z') {
        b -= 'a' - 'A';
        dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
      }
    }
  }
  // This lets the final state "saturate", which will matter for performance:
  // in the hot loop, we check for a match only at the end of each iteration,
  // so we must keep signalling the match until we get around to checking it.
  for (int b = 0; b < 256; ++b)
    dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);

  return dfa;
}
1010
ConfigurePrefixAccel(const std::string & prefix,bool prefix_foldcase)1011 void Prog::ConfigurePrefixAccel(const std::string& prefix,
1012 bool prefix_foldcase) {
1013 prefix_foldcase_ = prefix_foldcase;
1014 prefix_size_ = prefix.size();
1015 if (prefix_foldcase_) {
1016 // Use PrefixAccel_ShiftDFA().
1017 // ... and no more than nine bytes of the prefix. (See above for details.)
1018 prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
1019 prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
1020 } else if (prefix_size_ != 1) {
1021 // Use PrefixAccel_FrontAndBack().
1022 prefix_front_ = prefix.front();
1023 prefix_back_ = prefix.back();
1024 } else {
1025 // Use memchr(3).
1026 prefix_front_ = prefix.front();
1027 }
1028 }
1029
// Case-insensitive prefix search over [data, data+size) using the packed
// "Shift DFA" table built by BuildShiftDFA() above. Returns a pointer to the
// first byte of the first occurrence of the prefix, or NULL if none occurs.
//
// The current DFA state lives in the low six bits of `curr`, premultiplied
// by six; dfa[b] packs every state's next state (also premultiplied by six)
// into one uint64_t, so `next >> (curr & 63)` performs the state transition
// with a single shift.
const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {
  // The prefix can't possibly fit in fewer than prefix_size_ bytes.
  if (size < prefix_size_)
    return NULL;

  // State 0 is the initial DFA state.
  uint64_t curr = 0;

  // At the time of writing, rough benchmarks on a Broadwell machine showed
  // that this unroll factor (i.e. eight) achieves a speedup factor of two.
  if (size >= 8) {
    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
    // Process size rounded down to a multiple of eight; the remainder is
    // handled by the scalar loop below.
    const uint8_t* endp = p + (size&~7);
    do {
      // Load the eight input bytes and their transition words up front so
      // the loads can issue independently; only the shift chain below is
      // serially dependent.
      uint8_t b0 = p[0];
      uint8_t b1 = p[1];
      uint8_t b2 = p[2];
      uint8_t b3 = p[3];
      uint8_t b4 = p[4];
      uint8_t b5 = p[5];
      uint8_t b6 = p[6];
      uint8_t b7 = p[7];

      uint64_t next0 = prefix_dfa_[b0];
      uint64_t next1 = prefix_dfa_[b1];
      uint64_t next2 = prefix_dfa_[b2];
      uint64_t next3 = prefix_dfa_[b3];
      uint64_t next4 = prefix_dfa_[b4];
      uint64_t next5 = prefix_dfa_[b5];
      uint64_t next6 = prefix_dfa_[b6];
      uint64_t next7 = prefix_dfa_[b7];

      // currN holds the DFA state (times six, in the low six bits) after
      // consuming byte bN.
      uint64_t curr0 = next0 >> (curr & 63);
      uint64_t curr1 = next1 >> (curr0 & 63);
      uint64_t curr2 = next2 >> (curr1 & 63);
      uint64_t curr3 = next3 >> (curr2 & 63);
      uint64_t curr4 = next4 >> (curr3 & 63);
      uint64_t curr5 = next5 >> (curr4 & 63);
      uint64_t curr6 = next6 >> (curr5 & 63);
      uint64_t curr7 = next7 >> (curr6 & 63);

      // Because the final state saturates (see BuildShiftDFA()), checking
      // only curr7 suffices: if any byte in this group of eight reached the
      // final state, curr7 still signals it. Then find the earliest byte
      // whose state already equalled the final state to locate the match.
      if ((curr7 & 63) == kShiftDFAFinal * 6) {
        // At the time of writing, using the same masking subexpressions from
        // the preceding lines caused Clang to clutter the hot loop computing
        // them - even though they aren't actually needed for shifting! Hence
        // these rewritten conditions, which achieve a speedup factor of two.
        if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;
        if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
        if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
        if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
        if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
        if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
        if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
        if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;
      }

      curr = curr7;
      p += 8;
    } while (p != endp);
    // Fall through to the scalar loop for the remaining 0-7 bytes.
    data = p;
    size = size&7;
  }

  // Scalar loop: one byte, one transition, one final-state check at a time.
  const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
  const uint8_t* endp = p + size;
  while (p != endp) {
    uint8_t b = *p++;
    uint64_t next = prefix_dfa_[b];
    curr = next >> (curr & 63);
    if ((curr & 63) == kShiftDFAFinal * 6)
      return p-prefix_size_;
  }
  return NULL;
}
1102
1103 #if defined(__AVX2__)
1104 // Finds the least significant non-zero bit in n.
FindLSBSet(uint32_t n)1105 static int FindLSBSet(uint32_t n) {
1106 DCHECK_NE(n, 0);
1107 #if defined(__GNUC__)
1108 return __builtin_ctz(n);
1109 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
1110 unsigned long c;
1111 _BitScanForward(&c, n);
1112 return static_cast<int>(c);
1113 #else
1114 int c = 31;
1115 for (int shift = 1 << 4; shift != 0; shift >>= 1) {
1116 uint32_t word = n << shift;
1117 if (word != 0) {
1118 n = word;
1119 c -= shift;
1120 }
1121 }
1122 return c;
1123 #endif
1124 }
1125 #endif
1126
// Case-sensitive prefix search for prefixes of length >= 2: scans for a
// position where the first byte equals prefix_front_ AND the byte at offset
// prefix_size_-1 equals prefix_back_. Returns a pointer to that candidate
// position, or NULL. Note that the bytes in between are NOT verified here;
// the caller is responsible for confirming the full prefix match.
const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
  DCHECK_GE(prefix_size_, 2);
  if (size < prefix_size_)
    return NULL;
  // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
  // This also means that probing for prefix_back_ doesn't go out of bounds.
  size -= prefix_size_-1;

#if defined(__AVX2__)
  // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
  if (size >= sizeof(__m256i)) {
    // fp scans for the front byte; bp scans the same positions shifted by
    // prefix_size_-1, so lane i of both compares refers to one candidate.
    const __m256i* fp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data));
    const __m256i* bp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data) + prefix_size_-1);
    const __m256i* endfp = fp + size/sizeof(__m256i);
    const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
    const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
    do {
      const __m256i f_loadu = _mm256_loadu_si256(fp++);
      const __m256i b_loadu = _mm256_loadu_si256(bp++);
      const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
      const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
      // testz is 0 iff some lane matched both front and back bytes.
      const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
      if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
        const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
        const int fb_movemask = _mm256_movemask_epi8(fb_and);
        // Lowest set bit = earliest candidate in this 32-byte chunk.
        // fp was already advanced, hence fp-1.
        const int fb_ctz = FindLSBSet(fb_movemask);
        return reinterpret_cast<const char*>(fp-1) + fb_ctz;
      }
    } while (fp != endfp);
    // Fall through to the scalar loop for the remaining 0-31 positions.
    data = fp;
    size = size%sizeof(__m256i);
  }
#endif

  // Scalar fallback: memchr(3) for the front byte, then probe the back byte.
  const char* p0 = reinterpret_cast<const char*>(data);
  for (const char* p = p0;; p++) {
    DCHECK_GE(size, static_cast<size_t>(p-p0));
    p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
    if (p == NULL || p[prefix_size_-1] == prefix_back_)
      return p;
  }
}
1171
1172 } // namespace re2
1173