1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchOnePass is an efficient implementation of
8 // regular expression search with submatch tracking for
9 // what I call "one-pass regular expressions".  (An alternate
10 // name might be "backtracking-free regular expressions".)
11 //
12 // One-pass regular expressions have the property that
13 // at each input byte during an anchored match, there may be
14 // multiple alternatives but only one can proceed for any
15 // given input byte.
16 //
17 // For example, the regexp /x*yx*/ is one-pass: you read
18 // x's until a y, then you read the y, then you keep reading x's.
19 // At no point do you have to guess what to do or back up
20 // and try a different guess.
21 //
22 // On the other hand, /x*x/ is not one-pass: when you're
23 // looking at an input "x", it's not clear whether you should
24 // use it to extend the x* or as the final x.
25 //
26 // More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
27 // /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
28 //
29 // A simple intuition for identifying one-pass regular expressions
30 // is that it's always immediately obvious when a repetition ends.
31 // It must also be immediately obvious which branch of an | to take:
32 //
33 // /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
34 //
35 // The NFA-based search in nfa.cc does some bookkeeping to
36 // avoid the need for backtracking and its associated exponential blowup.
37 // But if we have a one-pass regular expression, there is no
38 // possibility of backtracking, so there is no need for the
39 // extra bookkeeping.  Hence, this code.
40 //
41 // On a one-pass regular expression, the NFA code in nfa.cc
42 // runs at about 1/20 of the backtracking-based PCRE speed.
43 // In contrast, the code in this file runs at about the same
44 // speed as PCRE.
45 //
46 // One-pass regular expressions get used a lot when RE is
47 // used for parsing simple strings, so it pays off to
48 // notice them and handle them efficiently.
49 //
50 // See also Anne Brüggemann-Klein and Derick Wood,
51 // "One-unambiguous regular languages", Information and Computation 142(2).
52 
53 #include <string.h>
54 #include <map>
55 #include "util/util.h"
56 #include "util/arena.h"
57 #include "util/sparse_set.h"
58 #include "re2/prog.h"
59 #include "re2/stringpiece.h"
60 
61 namespace re2 {
62 
63 static const int Debug = 0;
64 
65 // The key insight behind this implementation is that the
66 // non-determinism in an NFA for a one-pass regular expression
67 // is contained.  To explain what that means, first a
68 // refresher about what regular expression programs look like
69 // and how the usual NFA execution runs.
70 //
71 // In a regular expression program, only the kInstByteRange
72 // instruction processes an input byte c and moves on to the
73 // next byte in the string (it does so if c is in the given range).
74 // The kInstByteRange instructions correspond to literal characters
75 // and character classes in the regular expression.
76 //
77 // The kInstAlt instructions are used as wiring to connect the
78 // kInstByteRange instructions together in interesting ways when
79 // implementing | + and *.
80 // The kInstAlt instruction forks execution, like a goto that
81 // jumps to ip->out() and ip->out1() in parallel.  Each of the
82 // resulting computation paths is called a thread.
83 //
84 // The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
85 // are interesting in their own right but like kInstAlt they don't
86 // advance the input pointer.  Only kInstByteRange does.
87 //
88 // The automaton execution in nfa.cc runs all the possible
89 // threads of execution in lock-step over the input.  To process
90 // a particular byte, each thread gets run until it either dies
91 // or finds a kInstByteRange instruction matching the byte.
92 // If the latter happens, the thread stops just past the
93 // kInstByteRange instruction (at ip->out()) and waits for
94 // the other threads to finish processing the input byte.
95 // Then, once all the threads have processed that input byte,
96 // the whole process repeats.  The kInstAlt state instruction
97 // might create new threads during input processing, but no
98 // matter what, all the threads stop after a kInstByteRange
99 // and wait for the other threads to "catch up".
100 // Running in lock step like this ensures that the NFA reads
101 // the input string only once.
102 //
103 // Each thread maintains its own set of capture registers
104 // (the string positions at which it executed the kInstCapture
105 // instructions corresponding to capturing parentheses in the
106 // regular expression).  Repeated copying of the capture registers
107 // is the main performance bottleneck in the NFA implementation.
108 //
109 // A regular expression program is "one-pass" if, no matter what
110 // the input string, there is only one thread that makes it
111 // past a kInstByteRange instruction at each input byte.  This means
112 // that there is in some sense only one active thread throughout
113 // the execution.  Other threads might be created during the
114 // processing of an input byte, but they are ephemeral: only one
115 // thread is left to start processing the next input byte.
116 // This is what I meant above when I said the non-determinism
117 // was "contained".
118 //
119 // To execute a one-pass regular expression program, we can build
120 // a DFA (no non-determinism) that has at most as many states as
121 // the NFA (compare this to the possibly exponential number of states
122 // in the general case).  Each state records, for each possible
123 // input byte, the next state along with the conditions required
124 // before entering that state -- empty-width flags that must be true
125 // and capture operations that must be performed.  It also records
126 // whether a set of conditions required to finish a match at that
127 // point in the input rather than process the next byte.
128 
129 // A state in the one-pass NFA (aka DFA) - just an array of actions.
130 struct OneState;
131 
132 // A state in the one-pass NFA - just an array of actions indexed
133 // by the bytemap_[] of the next input byte.  (The bytemap
134 // maps next input bytes into equivalence classes, to reduce
135 // the memory footprint.)
136 struct OneState {
137   uint32 matchcond;   // conditions to match right now.
138   uint32 action[1];
139 };
140 
141 // The uint32 conditions in the action are a combination of
142 // condition and capture bits and the next state.  The bottom 16 bits
143 // are the condition and capture bits, and the top 16 are the index of
144 // the next state.
145 //
146 // Bits 0-5 are the empty-width flags from prog.h.
147 // Bit 6 is kMatchWins, which means the match takes
148 // priority over moving to next in a first-match search.
149 // The remaining bits mark capture registers that should
150 // be set to the current input position.  The capture bits
151 // start at index 2, since the search loop can take care of
152 // cap[0], cap[1] (the overall match position).
153 // That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
154 // No input position can satisfy both kEmptyWordBoundary
155 // and kEmptyNonWordBoundary, so we can use that as a sentinel
156 // instead of needing an extra bit.
157 
158 static const int    kIndexShift    = 16;  // number of bits below index
159 static const int    kEmptyShift   = 6;  // number of empty flags in prog.h
160 static const int    kRealCapShift = kEmptyShift + 1;
161 static const int    kRealMaxCap   = (kIndexShift - kRealCapShift) / 2 * 2;
162 
163 // Parameters used to skip over cap[0], cap[1].
164 static const int    kCapShift     = kRealCapShift - 2;
165 static const int    kMaxCap       = kRealMaxCap + 2;
166 
167 static const uint32 kMatchWins    = 1 << kEmptyShift;
168 static const uint32 kCapMask      = ((1 << kRealMaxCap) - 1) << kRealCapShift;
169 
170 static const uint32 kImpossible   = kEmptyWordBoundary | kEmptyNonWordBoundary;
171 
172 // Check, at compile time, that prog.h agrees with math above.
173 // This function is never called.
OnePass_Checks()174 void OnePass_Checks() {
175   COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
176                  kEmptyShift_disagrees_with_kEmptyAllFlags);
177   // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
178   COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
179                  kMaxCap_disagrees_with_kMaxOnePassCapture);
180 }
181 
Satisfy(uint32 cond,const StringPiece & context,const char * p)182 static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
183   uint32 satisfied = Prog::EmptyFlags(context, p);
184   if (cond & kEmptyAllFlags & ~satisfied)
185     return false;
186   return true;
187 }
188 
189 // Apply the capture bits in cond, saving p to the appropriate
190 // locations in cap[].
ApplyCaptures(uint32 cond,const char * p,const char ** cap,int ncap)191 static void ApplyCaptures(uint32 cond, const char* p,
192                           const char** cap, int ncap) {
193   for (int i = 2; i < ncap; i++)
194     if (cond & (1 << kCapShift << i))
195       cap[i] = p;
196 }
197 
198 // Compute a node pointer.
199 // Basically (OneState*)(nodes + statesize*nodeindex)
200 // but the version with the C++ casts overflows 80 characters (and is ugly).
IndexToNode(volatile uint8 * nodes,int statesize,int nodeindex)201 static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
202                                     int nodeindex) {
203   return reinterpret_cast<OneState*>(
204     const_cast<uint8*>(nodes + statesize*nodeindex));
205 }
206 
SearchOnePass(const StringPiece & text,const StringPiece & const_context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)207 bool Prog::SearchOnePass(const StringPiece& text,
208                          const StringPiece& const_context,
209                          Anchor anchor, MatchKind kind,
210                          StringPiece* match, int nmatch) {
211   if (anchor != kAnchored && kind != kFullMatch) {
212     LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
213     return false;
214   }
215 
216   // Make sure we have at least cap[1],
217   // because we use it to tell if we matched.
218   int ncap = 2*nmatch;
219   if (ncap < 2)
220     ncap = 2;
221 
222   const char* cap[kMaxCap];
223   for (int i = 0; i < ncap; i++)
224     cap[i] = NULL;
225 
226   const char* matchcap[kMaxCap];
227   for (int i = 0; i < ncap; i++)
228     matchcap[i] = NULL;
229 
230   StringPiece context = const_context;
231   if (context.begin() == NULL)
232     context = text;
233   if (anchor_start() && context.begin() != text.begin())
234     return false;
235   if (anchor_end() && context.end() != text.end())
236     return false;
237   if (anchor_end())
238     kind = kFullMatch;
239 
240   // State and act are marked volatile to
241   // keep the compiler from re-ordering the
242   // memory accesses walking over the NFA.
243   // This is worth about 5%.
244   volatile OneState* state = onepass_start_;
245   volatile uint8* nodes = onepass_nodes_;
246   volatile uint32 statesize = onepass_statesize_;
247   uint8* bytemap = bytemap_;
248   const char* bp = text.begin();
249   const char* ep = text.end();
250   const char* p;
251   bool matched = false;
252   matchcap[0] = bp;
253   cap[0] = bp;
254   uint32 nextmatchcond = state->matchcond;
255   for (p = bp; p < ep; p++) {
256     int c = bytemap[*p & 0xFF];
257     uint32 matchcond = nextmatchcond;
258     uint32 cond = state->action[c];
259 
260     // Determine whether we can reach act->next.
261     // If so, advance state and nextmatchcond.
262     if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
263       uint32 nextindex = cond >> kIndexShift;
264       state = IndexToNode(nodes, statesize, nextindex);
265       nextmatchcond = state->matchcond;
266     } else {
267       state = NULL;
268       nextmatchcond = kImpossible;
269     }
270 
271     // This code section is carefully tuned.
272     // The goto sequence is about 10% faster than the
273     // obvious rewrite as a large if statement in the
274     // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
275 
276     // Saving the match capture registers is expensive.
277     // Is this intermediate match worth thinking about?
278 
279     // Not if we want a full match.
280     if (kind == kFullMatch)
281       goto skipmatch;
282 
283     // Not if it's impossible.
284     if (matchcond == kImpossible)
285       goto skipmatch;
286 
287     // Not if the possible match is beaten by the certain
288     // match at the next byte.  When this test is useless
289     // (e.g., HTTPPartialMatchRE2) it slows the loop by
290     // about 10%, but when it avoids work (e.g., DotMatchRE2),
291     // it cuts the loop execution by about 45%.
292     if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
293       goto skipmatch;
294 
295     // Finally, the match conditions must be satisfied.
296     if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
297       for (int i = 2; i < 2*nmatch; i++)
298         matchcap[i] = cap[i];
299       if (nmatch > 1 && (matchcond & kCapMask))
300         ApplyCaptures(matchcond, p, matchcap, ncap);
301       matchcap[1] = p;
302       matched = true;
303 
304       // If we're in longest match mode, we have to keep
305       // going and see if we find a longer match.
306       // In first match mode, we can stop if the match
307       // takes priority over the next state for this input byte.
308       // That bit is per-input byte and thus in cond, not matchcond.
309       if (kind == kFirstMatch && (cond & kMatchWins))
310         goto done;
311     }
312 
313   skipmatch:
314     if (state == NULL)
315       goto done;
316     if ((cond & kCapMask) && nmatch > 1)
317       ApplyCaptures(cond, p, cap, ncap);
318   }
319 
320   // Look for match at end of input.
321   {
322     uint32 matchcond = state->matchcond;
323     if (matchcond != kImpossible &&
324         ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
325       if (nmatch > 1 && (matchcond & kCapMask))
326         ApplyCaptures(matchcond, p, cap, ncap);
327       for (int i = 2; i < ncap; i++)
328         matchcap[i] = cap[i];
329       matchcap[1] = p;
330       matched = true;
331     }
332   }
333 
334 done:
335   if (!matched)
336     return false;
337   for (int i = 0; i < nmatch; i++)
338     match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
339   return true;
340 }
341 
342 
343 // Analysis to determine whether a given regexp program is one-pass.
344 
345 // If ip is not on workq, adds ip to work queue and returns true.
346 // If ip is already on work queue, does nothing and returns false.
347 // If ip is NULL, does nothing and returns true (pretends to add it).
348 typedef SparseSet Instq;
AddQ(Instq * q,int id)349 static bool AddQ(Instq *q, int id) {
350   if (id == 0)
351     return true;
352   if (q->contains(id))
353     return false;
354   q->insert(id);
355   return true;
356 }
357 
358 struct InstCond {
359   int id;
360   uint32 cond;
361 };
362 
363 // Returns whether this is a one-pass program; that is,
364 // returns whether it is safe to use SearchOnePass on this program.
365 // These conditions must be true for any instruction ip:
366 //
367 //   (1) for any other Inst nip, there is at most one input-free
368 //       path from ip to nip.
369 //   (2) there is at most one kInstByte instruction reachable from
370 //       ip that matches any particular byte c.
371 //   (3) there is at most one input-free path from ip to a kInstMatch
372 //       instruction.
373 //
374 // This is actually just a conservative approximation: it might
375 // return false when the answer is true, when kInstEmptyWidth
376 // instructions are involved.
377 // Constructs and saves corresponding one-pass NFA on success.
IsOnePass()378 bool Prog::IsOnePass() {
379   if (did_onepass_)
380     return onepass_start_ != NULL;
381   did_onepass_ = true;
382 
383   if (start() == 0)  // no match
384     return false;
385 
386   // Steal memory for the one-pass NFA from the overall DFA budget.
387   // Willing to use at most 1/4 of the DFA budget (heuristic).
388   // Limit max node count to 65000 as a conservative estimate to
389   // avoid overflowing 16-bit node index in encoding.
390   int maxnodes = 2 + byte_inst_count_;
391   int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
392   if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
393     return false;
394 
395   // Flood the graph starting at the start state, and check
396   // that in each reachable state, each possible byte leads
397   // to a unique next state.
398   int size = this->size();
399   InstCond *stack = new InstCond[size];
400 
401   int* nodebyid = new int[size];  // indexed by ip
402   memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
403 
404   uint8* nodes = new uint8[maxnodes*statesize];
405   uint8* nodep = nodes;
406 
407   Instq tovisit(size), workq(size);
408   AddQ(&tovisit, start());
409   nodebyid[start()] = 0;
410   nodep += statesize;
411   int nalloc = 1;
412   for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
413     int id = *it;
414     int nodeindex = nodebyid[id];
415     OneState* node = IndexToNode(nodes, statesize, nodeindex);
416 
417     // Flood graph using manual stack, filling in actions as found.
418     // Default is none.
419     for (int b = 0; b < bytemap_range_; b++)
420       node->action[b] = kImpossible;
421     node->matchcond = kImpossible;
422 
423     workq.clear();
424     bool matched = false;
425     int nstack = 0;
426     stack[nstack].id = id;
427     stack[nstack++].cond = 0;
428     while (nstack > 0) {
429       int id = stack[--nstack].id;
430       Prog::Inst* ip = inst(id);
431       uint32 cond = stack[nstack].cond;
432       switch (ip->opcode()) {
433         case kInstAltMatch:
434           // TODO(rsc): Ignoring kInstAltMatch optimization.
435           // Should implement it in this engine, but it's subtle.
436           // Fall through.
437         case kInstAlt:
438           // If already on work queue, (1) is violated: bail out.
439           if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
440             goto fail;
441           stack[nstack].id = ip->out1();
442           stack[nstack++].cond = cond;
443           stack[nstack].id = ip->out();
444           stack[nstack++].cond = cond;
445           break;
446 
447         case kInstByteRange: {
448           int nextindex = nodebyid[ip->out()];
449           if (nextindex == -1) {
450             if (nalloc >= maxnodes) {
451               if (Debug)
452                 LOG(ERROR)
453                   << StringPrintf("Not OnePass: hit node limit %d > %d",
454                                   nalloc, maxnodes);
455               goto fail;
456             }
457             nextindex = nalloc;
458             nodep += statesize;
459             nodebyid[ip->out()] = nextindex;
460             nalloc++;
461             AddQ(&tovisit, ip->out());
462           }
463           if (matched)
464             cond |= kMatchWins;
465           for (int c = ip->lo(); c <= ip->hi(); c++) {
466             int b = bytemap_[c];
467             c = unbytemap_[b];  // last c in byte class
468             uint32 act = node->action[b];
469             uint32 newact = (nextindex << kIndexShift) | cond;
470             if ((act & kImpossible) == kImpossible) {
471               node->action[b] = newact;
472             } else if (act != newact) {
473               if (Debug) {
474                 LOG(ERROR)
475                   << StringPrintf("Not OnePass: conflict on byte "
476                                   "%#x at state %d",
477                                   c, *it);
478               }
479               goto fail;
480             }
481           }
482           if (ip->foldcase()) {
483             Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
484             Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
485             for (int c = lo; c <= hi; c++) {
486               int b = bytemap_[c];
487               c = unbytemap_[b];  // last c in class
488               uint32 act = node->action[b];
489               uint32 newact = (nextindex << kIndexShift) | cond;
490               if ((act & kImpossible) == kImpossible) {
491                 node->action[b] = newact;
492               } else if (act != newact) {
493                 if (Debug) {
494                   LOG(ERROR)
495                     << StringPrintf("Not OnePass: conflict on byte "
496                                     "%#x at state %d",
497                                     c, *it);
498                 }
499                 goto fail;
500               }
501             }
502           }
503           break;
504         }
505 
506         case kInstCapture:
507           if (ip->cap() < kMaxCap)
508             cond |= (1 << kCapShift) << ip->cap();
509           goto QueueEmpty;
510 
511         case kInstEmptyWidth:
512           cond |= ip->empty();
513           goto QueueEmpty;
514 
515         case kInstNop:
516         QueueEmpty:
517           // kInstCapture and kInstNop always proceed to ip->out().
518           // kInstEmptyWidth only sometimes proceeds to ip->out(),
519           // but as a conservative approximation we assume it always does.
520           // We could be a little more precise by looking at what c
521           // is, but that seems like overkill.
522 
523           // If already on work queue, (1) is violated: bail out.
524           if (!AddQ(&workq, ip->out())) {
525             if (Debug) {
526               LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
527                                          " %d -> %d\n",
528                                          *it, ip->out());
529             }
530             goto fail;
531           }
532           stack[nstack].id = ip->out();
533           stack[nstack++].cond = cond;
534           break;
535 
536         case kInstMatch:
537           if (matched) {
538             // (3) is violated
539             if (Debug) {
540               LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
541                                          " from %d\n", *it);
542             }
543             goto fail;
544           }
545           matched = true;
546           node->matchcond = cond;
547           break;
548 
549         case kInstFail:
550           break;
551       }
552     }
553   }
554 
555   if (Debug) {  // For debugging, dump one-pass NFA to LOG(ERROR).
556     string dump = "prog dump:\n" + Dump() + "node dump\n";
557     map<int, int> idmap;
558     for (int i = 0; i < size; i++)
559       if (nodebyid[i] != -1)
560         idmap[nodebyid[i]] = i;
561 
562     StringAppendF(&dump, "byte ranges:\n");
563     int i = 0;
564     for (int b = 0; b < bytemap_range_; b++) {
565       int lo = i;
566       while (bytemap_[i] == b)
567         i++;
568       StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
569     }
570 
571     for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
572       int id = *it;
573       int nodeindex = nodebyid[id];
574       if (nodeindex == -1)
575       	continue;
576       OneState* node = IndexToNode(nodes, statesize, nodeindex);
577       string s;
578       StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
579                     nodeindex, id, node->matchcond);
580       for (int i = 0; i < bytemap_range_; i++) {
581         if ((node->action[i] & kImpossible) == kImpossible)
582           continue;
583         StringAppendF(&dump, "  %d cond %#x -> %d id=%d\n",
584                       i, node->action[i] & 0xFFFF,
585                       node->action[i] >> kIndexShift,
586                       idmap[node->action[i] >> kIndexShift]);
587       }
588     }
589     LOG(ERROR) << dump;
590   }
591 
592   // Overallocated earlier; cut down to actual size.
593   nodep = new uint8[nalloc*statesize];
594   memmove(nodep, nodes, nalloc*statesize);
595   delete[] nodes;
596   nodes = nodep;
597 
598   onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
599   onepass_nodes_ = nodes;
600   onepass_statesize_ = statesize;
601   dfa_mem_ -= nalloc*statesize;
602 
603   delete[] stack;
604   delete[] nodebyid;
605   return true;
606 
607 fail:
608   delete[] stack;
609   delete[] nodebyid;
610   delete[] nodes;
611   return false;
612 }
613 
614 }  // namespace re2
615