1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // A DFA (deterministic finite automaton)-based regular expression search.
6 //
7 // The DFA search has two main parts: the construction of the automaton,
8 // which is represented by a graph of State structures, and the execution
9 // of the automaton over a given input string.
10 //
11 // The basic idea is that the State graph is constructed so that the
12 // execution can simply start with a state s, and then for each byte c in
13 // the input string, execute "s = s->next[c]", checking at each point whether
14 // the current s represents a matching state.
15 //
16 // The simple explanation just given does convey the essence of this code,
17 // but it omits the details of how the State graph gets constructed as well
18 // as some performance-driven optimizations to the execution of the automaton.
19 // All these details are explained in the comments for the code following
20 // the definition of class DFA.
21 //
22 // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
23 
24 #include <stddef.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <algorithm>
29 #include <atomic>
30 #include <deque>
31 #include <mutex>
32 #include <new>
33 #include <string>
34 #include <unordered_map>
35 #include <unordered_set>
36 #include <utility>
37 #include <vector>
38 
39 #include "util/logging.h"
40 #include "util/mix.h"
41 #include "util/mutex.h"
42 #include "util/strutil.h"
43 #include "re2/pod_array.h"
44 #include "re2/prog.h"
45 #include "re2/re2.h"
46 #include "re2/sparse_set.h"
47 #include "re2/stringpiece.h"
48 
49 // Silence "zero-sized array in struct/union" warning for DFA::State::next_.
50 #ifdef _MSC_VER
51 #pragma warning(disable: 4200)
52 #endif
53 
54 namespace re2 {
55 
56 // Controls whether the DFA should bail out early if the NFA would be faster.
57 static bool dfa_should_bail_when_slow = true;
58 
59 // Changing this to true compiles in prints that trace execution of the DFA.
60 // Generates a lot of output -- only useful for debugging.
61 static const bool ExtraDebug = false;
62 
63 // A DFA implementation of a regular expression program.
64 // Since this is entirely a forward declaration mandated by C++,
65 // some of the comments here are better understood after reading
66 // the comments in the sections that follow the DFA definition.
67 class DFA {
68  public:
69   DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem);
70   ~DFA();
ok() const71   bool ok() const { return !init_failed_; }
kind()72   Prog::MatchKind kind() { return kind_; }
73 
74   // Searches for the regular expression in text, which is considered
75   // as a subsection of context for the purposes of interpreting flags
76   // like ^ and $ and \A and \z.
77   // Returns whether a match was found.
78   // If a match is found, sets *ep to the end point of the best match in text.
79   // If "anchored", the match must begin at the start of text.
80   // If "want_earliest_match", the match that ends first is used, not
81   //   necessarily the best one.
82   // If "run_forward" is true, the DFA runs from text.begin() to text.end().
83   //   If it is false, the DFA runs from text.end() to text.begin(),
84   //   returning the leftmost end of the match instead of the rightmost one.
85   // If the DFA cannot complete the search (for example, if it is out of
86   //   memory), it sets *failed and returns false.
87   bool Search(const StringPiece& text, const StringPiece& context,
88               bool anchored, bool want_earliest_match, bool run_forward,
89               bool* failed, const char** ep, SparseSet* matches);
90 
91   // Builds out all states for the entire DFA.
92   // If cb is not empty, it receives one callback per state built.
93   // Returns the number of states built.
94   // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
95   int BuildAllStates(const Prog::DFAStateCallback& cb);
96 
97   // Computes min and max for matching strings.  Won't return strings
98   // bigger than maxlen.
99   bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
100 
101   // These data structures are logically private, but C++ makes it too
102   // difficult to mark them as such.
103   class RWLocker;
104   class StateSaver;
105   class Workq;
106 
107   // A single DFA state.  The DFA is represented as a graph of these
108   // States, linked by the next_ pointers.  If in state s and reading
109   // byte c, the next state should be s->next_[c].
110   struct State {
IsMatchre2::DFA::State111     inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
112 
113     int* inst_;         // Instruction pointers in the state.
114     int ninst_;         // # of inst_ pointers.
115     uint32_t flag_;     // Empty string bitfield flags in effect on the way
116                         // into this state, along with kFlagMatch if this
117                         // is a matching state.
118 
119 // Work around the bug affecting flexible array members in GCC 6.x (for x >= 1).
120 // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932)
121 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1
122     std::atomic<State*> next_[0];   // Outgoing arrows from State,
123 #else
124     std::atomic<State*> next_[];    // Outgoing arrows from State,
125 #endif
126 
127                         // one per input byte class
128   };
129 
130   enum {
131     kByteEndText = 256,         // imaginary byte at end of text
132 
133     kFlagEmptyMask = 0xFF,      // State.flag_: bits holding kEmptyXXX flags
134     kFlagMatch = 0x0100,        // State.flag_: this is a matching state
135     kFlagLastWord = 0x0200,     // State.flag_: last byte was a word char
136     kFlagNeedShift = 16,        // needed kEmpty bits are or'ed in shifted left
137   };
138 
139   struct StateHash {
operator ()re2::DFA::StateHash140     size_t operator()(const State* a) const {
141       DCHECK(a != NULL);
142       HashMix mix(a->flag_);
143       for (int i = 0; i < a->ninst_; i++)
144         mix.Mix(a->inst_[i]);
145       mix.Mix(0);
146       return mix.get();
147     }
148   };
149 
150   struct StateEqual {
operator ()re2::DFA::StateEqual151     bool operator()(const State* a, const State* b) const {
152       DCHECK(a != NULL);
153       DCHECK(b != NULL);
154       if (a == b)
155         return true;
156       if (a->flag_ != b->flag_)
157         return false;
158       if (a->ninst_ != b->ninst_)
159         return false;
160       for (int i = 0; i < a->ninst_; i++)
161         if (a->inst_[i] != b->inst_[i])
162           return false;
163       return true;
164     }
165   };
166 
167   typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
168 
169  private:
170   enum {
171     // Indices into start_ for unanchored searches.
172     // Add kStartAnchored for anchored searches.
173     kStartBeginText = 0,          // text at beginning of context
174     kStartBeginLine = 2,          // text at beginning of line
175     kStartAfterWordChar = 4,      // text follows a word character
176     kStartAfterNonWordChar = 6,   // text follows non-word character
177     kMaxStart = 8,
178 
179     kStartAnchored = 1,
180   };
181 
182   // Resets the DFA State cache, flushing all saved State* information.
183   // Releases and reacquires cache_mutex_ via cache_lock, so any
184   // State* existing before the call are not valid after the call.
185   // Use a StateSaver to preserve important states across the call.
186   // cache_mutex_.r <= L < mutex_
187   // After: cache_mutex_.w <= L < mutex_
188   void ResetCache(RWLocker* cache_lock);
189 
190   // Looks up and returns the State corresponding to a Workq.
191   // L >= mutex_
192   State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
193 
194   // Looks up and returns a State matching the inst, ninst, and flag.
195   // L >= mutex_
196   State* CachedState(int* inst, int ninst, uint32_t flag);
197 
198   // Clear the cache entirely.
199   // Must hold cache_mutex_.w or be in destructor.
200   void ClearCache();
201 
202   // Converts a State into a Workq: the opposite of WorkqToCachedState.
203   // L >= mutex_
204   void StateToWorkq(State* s, Workq* q);
205 
206   // Runs a State on a given byte, returning the next state.
207   State* RunStateOnByteUnlocked(State*, int);  // cache_mutex_.r <= L < mutex_
208   State* RunStateOnByte(State*, int);          // L >= mutex_
209 
210   // Runs a Workq on a given byte followed by a set of empty-string flags,
211   // producing a new Workq in nq.  If a match instruction is encountered,
212   // sets *ismatch to true.
213   // L >= mutex_
214   void RunWorkqOnByte(Workq* q, Workq* nq,
215                       int c, uint32_t flag, bool* ismatch);
216 
217   // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
218   // L >= mutex_
219   void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag);
220 
221   // Adds the instruction id to the Workq, following empty arrows
222   // according to flag.
223   // L >= mutex_
224   void AddToQueue(Workq* q, int id, uint32_t flag);
225 
226   // For debugging, returns a text representation of State.
227   static std::string DumpState(State* state);
228 
229   // For debugging, returns a text representation of a Workq.
230   static std::string DumpWorkq(Workq* q);
231 
232   // Search parameters
233   struct SearchParams {
SearchParamsre2::DFA::SearchParams234     SearchParams(const StringPiece& text, const StringPiece& context,
235                  RWLocker* cache_lock)
236       : text(text),
237         context(context),
238         anchored(false),
239         can_prefix_accel(false),
240         want_earliest_match(false),
241         run_forward(false),
242         start(NULL),
243         cache_lock(cache_lock),
244         failed(false),
245         ep(NULL),
246         matches(NULL) {}
247 
248     StringPiece text;
249     StringPiece context;
250     bool anchored;
251     bool can_prefix_accel;
252     bool want_earliest_match;
253     bool run_forward;
254     State* start;
255     RWLocker* cache_lock;
256     bool failed;     // "out" parameter: whether search gave up
257     const char* ep;  // "out" parameter: end pointer for match
258     SparseSet* matches;
259 
260    private:
261     SearchParams(const SearchParams&) = delete;
262     SearchParams& operator=(const SearchParams&) = delete;
263   };
264 
265   // Before each search, the parameters to Search are analyzed by
266   // AnalyzeSearch to determine the state in which to start.
267   struct StartInfo {
StartInfore2::DFA::StartInfo268     StartInfo() : start(NULL) {}
269     std::atomic<State*> start;
270   };
271 
272   // Fills in params->start and params->can_prefix_accel using
273   // the other search parameters.  Returns true on success,
274   // false on failure.
275   // cache_mutex_.r <= L < mutex_
276   bool AnalyzeSearch(SearchParams* params);
277   bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
278                            uint32_t flags);
279 
280   // The generic search loop, inlined to create specialized versions.
281   // cache_mutex_.r <= L < mutex_
282   // Might unlock and relock cache_mutex_ via params->cache_lock.
283   template <bool can_prefix_accel,
284             bool want_earliest_match,
285             bool run_forward>
286   inline bool InlinedSearchLoop(SearchParams* params);
287 
288   // The specialized versions of InlinedSearchLoop.  The three letters
289   // at the ends of the name denote the true/false values used as the
290   // last three parameters of InlinedSearchLoop.
291   // cache_mutex_.r <= L < mutex_
292   // Might unlock and relock cache_mutex_ via params->cache_lock.
293   bool SearchFFF(SearchParams* params);
294   bool SearchFFT(SearchParams* params);
295   bool SearchFTF(SearchParams* params);
296   bool SearchFTT(SearchParams* params);
297   bool SearchTFF(SearchParams* params);
298   bool SearchTFT(SearchParams* params);
299   bool SearchTTF(SearchParams* params);
300   bool SearchTTT(SearchParams* params);
301 
302   // The main search loop: calls an appropriate specialized version of
303   // InlinedSearchLoop.
304   // cache_mutex_.r <= L < mutex_
305   // Might unlock and relock cache_mutex_ via params->cache_lock.
306   bool FastSearchLoop(SearchParams* params);
307 
308 
309   // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
ByteMap(int c)310   int ByteMap(int c) {
311     if (c == kByteEndText)
312       return prog_->bytemap_range();
313     return prog_->bytemap()[c];
314   }
315 
316   // Constant after initialization.
317   Prog* prog_;              // The regular expression program to run.
318   Prog::MatchKind kind_;    // The kind of DFA.
319   bool init_failed_;        // initialization failed (out of memory)
320 
321   Mutex mutex_;  // mutex_ >= cache_mutex_.r
322 
323   // Scratch areas, protected by mutex_.
324   Workq* q0_;             // Two pre-allocated work queues.
325   Workq* q1_;
326   PODArray<int> stack_;   // Pre-allocated stack for AddToQueue
327 
328   // State* cache.  Many threads use and add to the cache simultaneously,
329   // holding cache_mutex_ for reading and mutex_ (above) when adding.
330   // If the cache fills and needs to be discarded, the discarding is done
331   // while holding cache_mutex_ for writing, to avoid interrupting other
332   // readers.  Any State* pointers are only valid while cache_mutex_
333   // is held.
334   Mutex cache_mutex_;
335   int64_t mem_budget_;     // Total memory budget for all States.
336   int64_t state_budget_;   // Amount of memory remaining for new States.
337   StateSet state_cache_;   // All States computed so far.
338   StartInfo start_[kMaxStart];
339 
340   DFA(const DFA&) = delete;
341   DFA& operator=(const DFA&) = delete;
342 };
343 
344 // Shorthand for casting to uint8_t*.
BytePtr(const void * v)345 static inline const uint8_t* BytePtr(const void* v) {
346   return reinterpret_cast<const uint8_t*>(v);
347 }
348 
349 // Work queues
350 
351 // Marks separate thread groups of different priority
352 // in the work queue when in leftmost-longest matching mode.
353 #define Mark (-1)
354 
355 // Separates the match IDs from the instructions in inst_.
356 // Used only for "many match" DFA states.
357 #define MatchSep (-2)
358 
359 // Internally, the DFA uses a sparse array of
360 // program instruction pointers as a work queue.
361 // In leftmost longest mode, marks separate sections
362 // of workq that started executing at different
363 // locations in the string (earlier locations first).
364 class DFA::Workq : public SparseSet {
365  public:
366   // Constructor: n is number of normal slots, maxmark number of mark slots.
Workq(int n,int maxmark)367   Workq(int n, int maxmark) :
368     SparseSet(n+maxmark),
369     n_(n),
370     maxmark_(maxmark),
371     nextmark_(n),
372     last_was_mark_(true) {
373   }
374 
is_mark(int i)375   bool is_mark(int i) { return i >= n_; }
376 
maxmark()377   int maxmark() { return maxmark_; }
378 
clear()379   void clear() {
380     SparseSet::clear();
381     nextmark_ = n_;
382   }
383 
mark()384   void mark() {
385     if (last_was_mark_)
386       return;
387     last_was_mark_ = false;
388     SparseSet::insert_new(nextmark_++);
389   }
390 
size()391   int size() {
392     return n_ + maxmark_;
393   }
394 
insert(int id)395   void insert(int id) {
396     if (contains(id))
397       return;
398     insert_new(id);
399   }
400 
insert_new(int id)401   void insert_new(int id) {
402     last_was_mark_ = false;
403     SparseSet::insert_new(id);
404   }
405 
406  private:
407   int n_;                // size excluding marks
408   int maxmark_;          // maximum number of marks
409   int nextmark_;         // id of next mark
410   bool last_was_mark_;   // last inserted was mark
411 
412   Workq(const Workq&) = delete;
413   Workq& operator=(const Workq&) = delete;
414 };
415 
DFA(Prog * prog,Prog::MatchKind kind,int64_t max_mem)416 DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
417   : prog_(prog),
418     kind_(kind),
419     init_failed_(false),
420     q0_(NULL),
421     q1_(NULL),
422     mem_budget_(max_mem) {
423   if (ExtraDebug)
424     fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str());
425   int nmark = 0;
426   if (kind_ == Prog::kLongestMatch)
427     nmark = prog_->size();
428   // See DFA::AddToQueue() for why this is so.
429   int nstack = prog_->inst_count(kInstCapture) +
430                prog_->inst_count(kInstEmptyWidth) +
431                prog_->inst_count(kInstNop) +
432                nmark + 1;  // + 1 for start inst
433 
434   // Account for space needed for DFA, q0, q1, stack.
435   mem_budget_ -= sizeof(DFA);
436   mem_budget_ -= (prog_->size() + nmark) *
437                  (sizeof(int)+sizeof(int)) * 2;  // q0, q1
438   mem_budget_ -= nstack * sizeof(int);  // stack
439   if (mem_budget_ < 0) {
440     init_failed_ = true;
441     return;
442   }
443 
444   state_budget_ = mem_budget_;
445 
446   // Make sure there is a reasonable amount of working room left.
447   // At minimum, the search requires room for two states in order
448   // to limp along, restarting frequently.  We'll get better performance
449   // if there is room for a larger number of states, say 20.
450   // Note that a state stores list heads only, so we use the program
451   // list count for the upper bound, not the program size.
452   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
453   int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
454                       (prog_->list_count()+nmark)*sizeof(int);
455   if (state_budget_ < 20*one_state) {
456     init_failed_ = true;
457     return;
458   }
459 
460   q0_ = new Workq(prog_->size(), nmark);
461   q1_ = new Workq(prog_->size(), nmark);
462   stack_ = PODArray<int>(nstack);
463 }
464 
~DFA()465 DFA::~DFA() {
466   delete q0_;
467   delete q1_;
468   ClearCache();
469 }
470 
471 // In the DFA state graph, s->next[c] == NULL means that the
472 // state has not yet been computed and needs to be.  We need
473 // a different special value to signal that s->next[c] is a
474 // state that can never lead to a match (and thus the search
475 // can be called off).  Hence DeadState.
476 #define DeadState reinterpret_cast<State*>(1)
477 
478 // Signals that the rest of the string matches no matter what it is.
479 #define FullMatchState reinterpret_cast<State*>(2)
480 
481 #define SpecialStateMax FullMatchState
482 
483 // Debugging printouts
484 
485 // For debugging, returns a string representation of the work queue.
DumpWorkq(Workq * q)486 std::string DFA::DumpWorkq(Workq* q) {
487   std::string s;
488   const char* sep = "";
489   for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
490     if (q->is_mark(*it)) {
491       s += "|";
492       sep = "";
493     } else {
494       s += StringPrintf("%s%d", sep, *it);
495       sep = ",";
496     }
497   }
498   return s;
499 }
500 
501 // For debugging, returns a string representation of the state.
DumpState(State * state)502 std::string DFA::DumpState(State* state) {
503   if (state == NULL)
504     return "_";
505   if (state == DeadState)
506     return "X";
507   if (state == FullMatchState)
508     return "*";
509   std::string s;
510   const char* sep = "";
511   s += StringPrintf("(%p)", state);
512   for (int i = 0; i < state->ninst_; i++) {
513     if (state->inst_[i] == Mark) {
514       s += "|";
515       sep = "";
516     } else if (state->inst_[i] == MatchSep) {
517       s += "||";
518       sep = "";
519     } else {
520       s += StringPrintf("%s%d", sep, state->inst_[i]);
521       sep = ",";
522     }
523   }
524   s += StringPrintf(" flag=%#x", state->flag_);
525   return s;
526 }
527 
528 //////////////////////////////////////////////////////////////////////
529 //
530 // DFA state graph construction.
531 //
532 // The DFA state graph is a heavily-linked collection of State* structures.
533 // The state_cache_ is a set of all the State structures ever allocated,
534 // so that if the same state is reached by two different paths,
535 // the same State structure can be used.  This reduces allocation
536 // requirements and also avoids duplication of effort across the two
537 // identical states.
538 //
539 // A State is defined by an ordered list of instruction ids and a flag word.
540 //
541 // The choice of an ordered list of instructions differs from a typical
542 // textbook DFA implementation, which would use an unordered set.
543 // Textbook descriptions, however, only care about whether
544 // the DFA matches, not where it matches in the text.  To decide where the
545 // DFA matches, we need to mimic the behavior of the dominant backtracking
546 // implementations like PCRE, which try one possible regular expression
547 // execution, then another, then another, stopping when one of them succeeds.
548 // The DFA execution tries these many executions in parallel, representing
549 // each by an instruction id.  These pointers are ordered in the State.inst_
550 // list in the same order that the executions would happen in a backtracking
551 // search: if a match is found during execution of inst_[2], inst_[i] for i>=3
552 // can be discarded.
553 //
554 // Textbooks also typically do not consider context-aware empty string operators
555 // like ^ or $.  These are handled by the flag word, which specifies the set
556 // of empty-string operators that should be matched when executing at the
557 // current text position.  These flag bits are defined in prog.h.
558 // The flag word also contains two DFA-specific bits: kFlagMatch if the state
559 // is a matching state (one that reached a kInstMatch in the program)
560 // and kFlagLastWord if the last processed byte was a word character, for the
561 // implementation of \B and \b.
562 //
563 // The flag word also contains, shifted up 16 bits, the bits looked for by
564 // any kInstEmptyWidth instructions in the state.  These provide a useful
565 // summary indicating when new flags might be useful.
566 //
567 // The permanent representation of a State's instruction ids is just an array,
568 // but while a state is being analyzed, these instruction ids are represented
569 // as a Workq, which is an array that allows iteration in insertion order.
570 
571 // NOTE(rsc): The choice of State construction determines whether the DFA
572 // mimics backtracking implementations (so-called leftmost first matching) or
573 // traditional DFA implementations (so-called leftmost longest matching as
574 // prescribed by POSIX).  This implementation chooses to mimic the
575 // backtracking implementations, because we want to replace PCRE.  To get
576 // POSIX behavior, the states would need to be considered not as a simple
577 // ordered list of instruction ids, but as a list of unordered sets of instruction
578 // ids.  A match by a state in one set would inhibit the running of sets
579 // farther down the list but not other instruction ids in the same set.  Each
580 // set would correspond to matches beginning at a given point in the string.
581 // This is implemented by separating different sets with Mark pointers.
582 
583 // Looks in the State cache for a State matching q, flag.
584 // If one is found, returns it.  If one is not found, allocates one,
585 // inserts it in the cache, and returns it.
586 // If mq is not null, MatchSep and the match IDs in mq will be appended
587 // to the State.
WorkqToCachedState(Workq * q,Workq * mq,uint32_t flag)588 DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
589   //mutex_.AssertHeld();
590 
591   // Construct array of instruction ids for the new state.
592   // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
593   // those are the only operators with any effect in
594   // RunWorkqOnEmptyString or RunWorkqOnByte.
595   PODArray<int> inst(q->size());
596   int n = 0;
597   uint32_t needflags = 0;  // flags needed by kInstEmptyWidth instructions
598   bool sawmatch = false;   // whether queue contains guaranteed kInstMatch
599   bool sawmark = false;    // whether queue contains a Mark
600   if (ExtraDebug)
601     fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
602   for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
603     int id = *it;
604     if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
605       break;
606     if (q->is_mark(id)) {
607       if (n > 0 && inst[n-1] != Mark) {
608         sawmark = true;
609         inst[n++] = Mark;
610       }
611       continue;
612     }
613     Prog::Inst* ip = prog_->inst(id);
614     switch (ip->opcode()) {
615       case kInstAltMatch:
616         // This state will continue to a match no matter what
617         // the rest of the input is.  If it is the highest priority match
618         // being considered, return the special FullMatchState
619         // to indicate that it's all matches from here out.
620         if (kind_ != Prog::kManyMatch &&
621             (kind_ != Prog::kFirstMatch ||
622              (it == q->begin() && ip->greedy(prog_))) &&
623             (kind_ != Prog::kLongestMatch || !sawmark) &&
624             (flag & kFlagMatch)) {
625           if (ExtraDebug)
626             fprintf(stderr, " -> FullMatchState\n");
627           return FullMatchState;
628         }
629         FALLTHROUGH_INTENDED;
630       default:
631         // Record iff id is the head of its list, which must
632         // be the case if id-1 is the last of *its* list. :)
633         if (prog_->inst(id-1)->last())
634           inst[n++] = *it;
635         if (ip->opcode() == kInstEmptyWidth)
636           needflags |= ip->empty();
637         if (ip->opcode() == kInstMatch && !prog_->anchor_end())
638           sawmatch = true;
639         break;
640     }
641   }
642   DCHECK_LE(n, q->size());
643   if (n > 0 && inst[n-1] == Mark)
644     n--;
645 
646   // If there are no empty-width instructions waiting to execute,
647   // then the extra flag bits will not be used, so there is no
648   // point in saving them.  (Discarding them reduces the number
649   // of distinct states.)
650   if (needflags == 0)
651     flag &= kFlagMatch;
652 
653   // NOTE(rsc): The code above cannot do flag &= needflags,
654   // because if the right flags were present to pass the current
655   // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
656   // might be reached that in turn need different flags.
657   // The only sure thing is that if there are no kInstEmptyWidth
658   // instructions at all, no flags will be needed.
659   // We could do the extra work to figure out the full set of
660   // possibly needed flags by exploring past the kInstEmptyWidth
661   // instructions, but the check above -- are any flags needed
662   // at all? -- handles the most common case.  More fine-grained
663   // analysis can only be justified by measurements showing that
664   // too many redundant states are being allocated.
665 
666   // If there are no Insts in the list, it's a dead state,
667   // which is useful to signal with a special pointer so that
668   // the execution loop can stop early.  This is only okay
669   // if the state is *not* a matching state.
670   if (n == 0 && flag == 0) {
671     if (ExtraDebug)
672       fprintf(stderr, " -> DeadState\n");
673     return DeadState;
674   }
675 
676   // If we're in longest match mode, the state is a sequence of
677   // unordered state sets separated by Marks.  Sort each set
678   // to canonicalize, to reduce the number of distinct sets stored.
679   if (kind_ == Prog::kLongestMatch) {
680     int* ip = inst.data();
681     int* ep = ip + n;
682     while (ip < ep) {
683       int* markp = ip;
684       while (markp < ep && *markp != Mark)
685         markp++;
686       std::sort(ip, markp);
687       if (markp < ep)
688         markp++;
689       ip = markp;
690     }
691   }
692 
693   // If we're in many match mode, canonicalize for similar reasons:
694   // we have an unordered set of states (i.e. we don't have Marks)
695   // and sorting will reduce the number of distinct sets stored.
696   if (kind_ == Prog::kManyMatch) {
697     int* ip = inst.data();
698     int* ep = ip + n;
699     std::sort(ip, ep);
700   }
701 
702   // Append MatchSep and the match IDs in mq if necessary.
703   if (mq != NULL) {
704     inst[n++] = MatchSep;
705     for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) {
706       int id = *i;
707       Prog::Inst* ip = prog_->inst(id);
708       if (ip->opcode() == kInstMatch)
709         inst[n++] = ip->match_id();
710     }
711   }
712 
713   // Save the needed empty-width flags in the top bits for use later.
714   flag |= needflags << kFlagNeedShift;
715 
716   State* state = CachedState(inst.data(), n, flag);
717   return state;
718 }
719 
720 // Looks in the State cache for a State matching inst, ninst, flag.
721 // If one is found, returns it.  If one is not found, allocates one,
722 // inserts it in the cache, and returns it.
CachedState(int * inst,int ninst,uint32_t flag)723 DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
724   //mutex_.AssertHeld();
725 
726   // Look in the cache for a pre-existing state.
727   // We have to initialise the struct like this because otherwise
728   // MSVC will complain about the flexible array member. :(
729   State state;
730   state.inst_ = inst;
731   state.ninst_ = ninst;
732   state.flag_ = flag;
733   StateSet::iterator it = state_cache_.find(&state);
734   if (it != state_cache_.end()) {
735     if (ExtraDebug)
736       fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
737     return *it;
738   }
739 
740   // Must have enough memory for new state.
741   // In addition to what we're going to allocate,
742   // the state cache hash table seems to incur about 40 bytes per
743   // State*, empirically.
744   const int kStateCacheOverhead = 40;
745   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
746   int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
747             ninst*sizeof(int);
748   if (mem_budget_ < mem + kStateCacheOverhead) {
749     mem_budget_ = -1;
750     return NULL;
751   }
752   mem_budget_ -= mem + kStateCacheOverhead;
753 
754   // Allocate new state along with room for next_ and inst_.
755   char* space = std::allocator<char>().allocate(mem);
756   State* s = new (space) State;
757   (void) new (s->next_) std::atomic<State*>[nnext];
758   // Work around a unfortunate bug in older versions of libstdc++.
759   // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
760   for (int i = 0; i < nnext; i++)
761     (void) new (s->next_ + i) std::atomic<State*>(NULL);
762   s->inst_ = new (s->next_ + nnext) int[ninst];
763   memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
764   s->ninst_ = ninst;
765   s->flag_ = flag;
766   if (ExtraDebug)
767     fprintf(stderr, " -> %s\n", DumpState(s).c_str());
768 
769   // Put state in cache and return it.
770   state_cache_.insert(s);
771   return s;
772 }
773 
774 // Clear the cache.  Must hold cache_mutex_.w or be in destructor.
ClearCache()775 void DFA::ClearCache() {
776   StateSet::iterator begin = state_cache_.begin();
777   StateSet::iterator end = state_cache_.end();
778   while (begin != end) {
779     StateSet::iterator tmp = begin;
780     ++begin;
781     // Deallocate the blob of memory that we allocated in DFA::CachedState().
782     // We recompute mem in order to benefit from sized delete where possible.
783     int ninst = (*tmp)->ninst_;
784     int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
785     int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
786               ninst*sizeof(int);
787     std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem);
788   }
789   state_cache_.clear();
790 }
791 
792 // Copies insts in state s to the work queue q.
StateToWorkq(State * s,Workq * q)793 void DFA::StateToWorkq(State* s, Workq* q) {
794   q->clear();
795   for (int i = 0; i < s->ninst_; i++) {
796     if (s->inst_[i] == Mark) {
797       q->mark();
798     } else if (s->inst_[i] == MatchSep) {
799       // Nothing after this is an instruction!
800       break;
801     } else {
802       // Explore from the head of the list.
803       AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
804     }
805   }
806 }
807 
808 // Adds ip to the work queue, following empty arrows according to flag.
AddToQueue(Workq * q,int id,uint32_t flag)809 void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
810 
811   // Use stack_ to hold our stack of instructions yet to process.
812   // It was preallocated as follows:
813   //   one entry per Capture;
814   //   one entry per EmptyWidth; and
815   //   one entry per Nop.
816   // This reflects the maximum number of stack pushes that each can
817   // perform. (Each instruction can be processed at most once.)
818   // When using marks, we also added nmark == prog_->size().
819   // (Otherwise, nmark == 0.)
820   int* stk = stack_.data();
821   int nstk = 0;
822 
823   stk[nstk++] = id;
824   while (nstk > 0) {
825     DCHECK_LE(nstk, stack_.size());
826     id = stk[--nstk];
827 
828   Loop:
829     if (id == Mark) {
830       q->mark();
831       continue;
832     }
833 
834     if (id == 0)
835       continue;
836 
837     // If ip is already on the queue, nothing to do.
838     // Otherwise add it.  We don't actually keep all the
839     // ones that get added, but adding all of them here
840     // increases the likelihood of q->contains(id),
841     // reducing the amount of duplicated work.
842     if (q->contains(id))
843       continue;
844     q->insert_new(id);
845 
846     // Process instruction.
847     Prog::Inst* ip = prog_->inst(id);
848     switch (ip->opcode()) {
849       default:
850         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
851         break;
852 
853       case kInstByteRange:  // just save these on the queue
854       case kInstMatch:
855         if (ip->last())
856           break;
857         id = id+1;
858         goto Loop;
859 
860       case kInstCapture:    // DFA treats captures as no-ops.
861       case kInstNop:
862         if (!ip->last())
863           stk[nstk++] = id+1;
864 
865         // If this instruction is the [00-FF]* loop at the beginning of
866         // a leftmost-longest unanchored search, separate with a Mark so
867         // that future threads (which will start farther to the right in
868         // the input string) are lower priority than current threads.
869         if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
870             id == prog_->start_unanchored() && id != prog_->start())
871           stk[nstk++] = Mark;
872         id = ip->out();
873         goto Loop;
874 
875       case kInstAltMatch:
876         DCHECK(!ip->last());
877         id = id+1;
878         goto Loop;
879 
880       case kInstEmptyWidth:
881         if (!ip->last())
882           stk[nstk++] = id+1;
883 
884         // Continue on if we have all the right flag bits.
885         if (ip->empty() & ~flag)
886           break;
887         id = ip->out();
888         goto Loop;
889     }
890   }
891 }
892 
893 // Running of work queues.  In the work queue, order matters:
894 // the queue is sorted in priority order.  If instruction i comes before j,
895 // then the instructions that i produces during the run must come before
896 // the ones that j produces.  In order to keep this invariant, all the
897 // work queue runners have to take an old queue to process and then
898 // also a new queue to fill in.  It's not acceptable to add to the end of
899 // an existing queue, because new instructions will not end up in the
900 // correct position.
901 
902 // Runs the work queue, processing the empty strings indicated by flag.
903 // For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
904 // both ^ and $.  It is important that callers pass all flags at once:
905 // processing both ^ and $ is not the same as first processing only ^
906 // and then processing only $.  Doing the two-step sequence won't match
907 // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
908 // exhibited by existing implementations).
RunWorkqOnEmptyString(Workq * oldq,Workq * newq,uint32_t flag)909 void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
910   newq->clear();
911   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
912     if (oldq->is_mark(*i))
913       AddToQueue(newq, Mark, flag);
914     else
915       AddToQueue(newq, *i, flag);
916   }
917 }
918 
919 // Runs the work queue, processing the single byte c followed by any empty
920 // strings indicated by flag.  For example, c == 'a' and flag == kEmptyEndLine,
921 // means to match c$.  Sets the bool *ismatch to true if the end of the
922 // regular expression program has been reached (the regexp has matched).
RunWorkqOnByte(Workq * oldq,Workq * newq,int c,uint32_t flag,bool * ismatch)923 void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
924                          int c, uint32_t flag, bool* ismatch) {
925   //mutex_.AssertHeld();
926 
927   newq->clear();
928   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
929     if (oldq->is_mark(*i)) {
930       if (*ismatch)
931         return;
932       newq->mark();
933       continue;
934     }
935     int id = *i;
936     Prog::Inst* ip = prog_->inst(id);
937     switch (ip->opcode()) {
938       default:
939         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
940         break;
941 
942       case kInstFail:        // never succeeds
943       case kInstCapture:     // already followed
944       case kInstNop:         // already followed
945       case kInstAltMatch:    // already followed
946       case kInstEmptyWidth:  // already followed
947         break;
948 
949       case kInstByteRange:   // can follow if c is in range
950         if (!ip->Matches(c))
951           break;
952         AddToQueue(newq, ip->out(), flag);
953         if (ip->hint() != 0) {
954           // We have a hint, but we must cancel out the
955           // increment that will occur after the break.
956           i += ip->hint() - 1;
957         } else {
958           // We have no hint, so we must find the end
959           // of the current list and then skip to it.
960           Prog::Inst* ip0 = ip;
961           while (!ip->last())
962             ++ip;
963           i += ip - ip0;
964         }
965         break;
966 
967       case kInstMatch:
968         if (prog_->anchor_end() && c != kByteEndText &&
969             kind_ != Prog::kManyMatch)
970           break;
971         *ismatch = true;
972         if (kind_ == Prog::kFirstMatch) {
973           // Can stop processing work queue since we found a match.
974           return;
975         }
976         break;
977     }
978   }
979 
980   if (ExtraDebug)
981     fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n",
982             DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch);
983 }
984 
985 // Processes input byte c in state, returning new state.
986 // Caller does not hold mutex.
RunStateOnByteUnlocked(State * state,int c)987 DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
988   // Keep only one RunStateOnByte going
989   // even if the DFA is being run by multiple threads.
990   MutexLock l(&mutex_);
991   return RunStateOnByte(state, c);
992 }
993 
994 // Processes input byte c in state, returning new state.
RunStateOnByte(State * state,int c)995 DFA::State* DFA::RunStateOnByte(State* state, int c) {
996   //mutex_.AssertHeld();
997 
998   if (state <= SpecialStateMax) {
999     if (state == FullMatchState) {
1000       // It is convenient for routines like PossibleMatchRange
1001       // if we implement RunStateOnByte for FullMatchState:
1002       // once you get into this state you never get out,
1003       // so it's pretty easy.
1004       return FullMatchState;
1005     }
1006     if (state == DeadState) {
1007       LOG(DFATAL) << "DeadState in RunStateOnByte";
1008       return NULL;
1009     }
1010     if (state == NULL) {
1011       LOG(DFATAL) << "NULL state in RunStateOnByte";
1012       return NULL;
1013     }
1014     LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
1015     return NULL;
1016   }
1017 
1018   // If someone else already computed this, return it.
1019   State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed);
1020   if (ns != NULL)
1021     return ns;
1022 
1023   // Convert state into Workq.
1024   StateToWorkq(state, q0_);
1025 
1026   // Flags marking the kinds of empty-width things (^ $ etc)
1027   // around this byte.  Before the byte we have the flags recorded
1028   // in the State structure itself.  After the byte we have
1029   // nothing yet (but that will change: read on).
1030   uint32_t needflag = state->flag_ >> kFlagNeedShift;
1031   uint32_t beforeflag = state->flag_ & kFlagEmptyMask;
1032   uint32_t oldbeforeflag = beforeflag;
1033   uint32_t afterflag = 0;
1034 
1035   if (c == '\n') {
1036     // Insert implicit $ and ^ around \n
1037     beforeflag |= kEmptyEndLine;
1038     afterflag |= kEmptyBeginLine;
1039   }
1040 
1041   if (c == kByteEndText) {
1042     // Insert implicit $ and \z before the fake "end text" byte.
1043     beforeflag |= kEmptyEndLine | kEmptyEndText;
1044   }
1045 
1046   // The state flag kFlagLastWord says whether the last
1047   // byte processed was a word character.  Use that info to
1048   // insert empty-width (non-)word boundaries.
1049   bool islastword = (state->flag_ & kFlagLastWord) != 0;
1050   bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c));
1051   if (isword == islastword)
1052     beforeflag |= kEmptyNonWordBoundary;
1053   else
1054     beforeflag |= kEmptyWordBoundary;
1055 
1056   // Okay, finally ready to run.
1057   // Only useful to rerun on empty string if there are new, useful flags.
1058   if (beforeflag & ~oldbeforeflag & needflag) {
1059     RunWorkqOnEmptyString(q0_, q1_, beforeflag);
1060     using std::swap;
1061     swap(q0_, q1_);
1062   }
1063   bool ismatch = false;
1064   RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
1065   using std::swap;
1066   swap(q0_, q1_);
1067 
1068   // Save afterflag along with ismatch and isword in new state.
1069   uint32_t flag = afterflag;
1070   if (ismatch)
1071     flag |= kFlagMatch;
1072   if (isword)
1073     flag |= kFlagLastWord;
1074 
1075   if (ismatch && kind_ == Prog::kManyMatch)
1076     ns = WorkqToCachedState(q0_, q1_, flag);
1077   else
1078     ns = WorkqToCachedState(q0_, NULL, flag);
1079 
1080   // Flush ns before linking to it.
1081   // Write barrier before updating state->next_ so that the
1082   // main search loop can proceed without any locking, for speed.
1083   // (Otherwise it would need one mutex operation per input byte.)
1084   state->next_[ByteMap(c)].store(ns, std::memory_order_release);
1085   return ns;
1086 }
1087 
1088 
1089 //////////////////////////////////////////////////////////////////////
1090 // DFA cache reset.
1091 
1092 // Reader-writer lock helper.
1093 //
1094 // The DFA uses a reader-writer mutex to protect the state graph itself.
1095 // Traversing the state graph requires holding the mutex for reading,
1096 // and discarding the state graph and starting over requires holding the
1097 // lock for writing.  If a search needs to expand the graph but is out
1098 // of memory, it will need to drop its read lock and then acquire the
1099 // write lock.  Since it cannot then atomically downgrade from write lock
1100 // to read lock, it runs the rest of the search holding the write lock.
1101 // (This probably helps avoid repeated contention, but really the decision
1102 // is forced by the Mutex interface.)  It's a bit complicated to keep
1103 // track of whether the lock is held for reading or writing and thread
1104 // that through the search, so instead we encapsulate it in the RWLocker
1105 // and pass that around.
1106 
1107 class DFA::RWLocker {
1108  public:
1109   explicit RWLocker(Mutex* mu);
1110   ~RWLocker();
1111 
1112   // If the lock is only held for reading right now,
1113   // drop the read lock and re-acquire for writing.
1114   // Subsequent calls to LockForWriting are no-ops.
1115   // Notice that the lock is *released* temporarily.
1116   void LockForWriting();
1117 
1118  private:
1119   Mutex* mu_;
1120   bool writing_;
1121 
1122   RWLocker(const RWLocker&) = delete;
1123   RWLocker& operator=(const RWLocker&) = delete;
1124 };
1125 
RWLocker(Mutex * mu)1126 DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) {
1127   mu_->ReaderLock();
1128 }
1129 
1130 // This function is marked as NO_THREAD_SAFETY_ANALYSIS because
1131 // the annotations don't support lock upgrade.
LockForWriting()1132 void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
1133   if (!writing_) {
1134     mu_->ReaderUnlock();
1135     mu_->WriterLock();
1136     writing_ = true;
1137   }
1138 }
1139 
~RWLocker()1140 DFA::RWLocker::~RWLocker() {
1141   if (!writing_)
1142     mu_->ReaderUnlock();
1143   else
1144     mu_->WriterUnlock();
1145 }
1146 
1147 
1148 // When the DFA's State cache fills, we discard all the states in the
1149 // cache and start over.  Many threads can be using and adding to the
1150 // cache at the same time, so we synchronize using the cache_mutex_
1151 // to keep from stepping on other threads.  Specifically, all the
1152 // threads using the current cache hold cache_mutex_ for reading.
1153 // When a thread decides to flush the cache, it drops cache_mutex_
1154 // and then re-acquires it for writing.  That ensures there are no
1155 // other threads accessing the cache anymore.  The rest of the search
1156 // runs holding cache_mutex_ for writing, avoiding any contention
1157 // with or cache pollution caused by other threads.
1158 
ResetCache(RWLocker * cache_lock)1159 void DFA::ResetCache(RWLocker* cache_lock) {
1160   // Re-acquire the cache_mutex_ for writing (exclusive use).
1161   cache_lock->LockForWriting();
1162 
1163   hooks::GetDFAStateCacheResetHook()({
1164       state_budget_,
1165       state_cache_.size(),
1166   });
1167 
1168   // Clear the cache, reset the memory budget.
1169   for (int i = 0; i < kMaxStart; i++)
1170     start_[i].start.store(NULL, std::memory_order_relaxed);
1171   ClearCache();
1172   mem_budget_ = state_budget_;
1173 }
1174 
1175 // Typically, a couple States do need to be preserved across a cache
1176 // reset, like the State at the current point in the search.
1177 // The StateSaver class helps keep States across cache resets.
1178 // It makes a copy of the state's guts outside the cache (before the reset)
1179 // and then can be asked, after the reset, to recreate the State
1180 // in the new cache.  For example, in a DFA method ("this" is a DFA):
1181 //
1182 //   StateSaver saver(this, s);
1183 //   ResetCache(cache_lock);
1184 //   s = saver.Restore();
1185 //
1186 // The saver should always have room in the cache to re-create the state,
1187 // because resetting the cache locks out all other threads, and the cache
1188 // is known to have room for at least a couple states (otherwise the DFA
1189 // constructor fails).
1190 
1191 class DFA::StateSaver {
1192  public:
1193   explicit StateSaver(DFA* dfa, State* state);
1194   ~StateSaver();
1195 
1196   // Recreates and returns a state equivalent to the
1197   // original state passed to the constructor.
1198   // Returns NULL if the cache has filled, but
1199   // since the DFA guarantees to have room in the cache
1200   // for a couple states, should never return NULL
1201   // if used right after ResetCache.
1202   State* Restore();
1203 
1204  private:
1205   DFA* dfa_;         // the DFA to use
1206   int* inst_;        // saved info from State
1207   int ninst_;
1208   uint32_t flag_;
1209   bool is_special_;  // whether original state was special
1210   State* special_;   // if is_special_, the original state
1211 
1212   StateSaver(const StateSaver&) = delete;
1213   StateSaver& operator=(const StateSaver&) = delete;
1214 };
1215 
StateSaver(DFA * dfa,State * state)1216 DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
1217   dfa_ = dfa;
1218   if (state <= SpecialStateMax) {
1219     inst_ = NULL;
1220     ninst_ = 0;
1221     flag_ = 0;
1222     is_special_ = true;
1223     special_ = state;
1224     return;
1225   }
1226   is_special_ = false;
1227   special_ = NULL;
1228   flag_ = state->flag_;
1229   ninst_ = state->ninst_;
1230   inst_ = new int[ninst_];
1231   memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
1232 }
1233 
~StateSaver()1234 DFA::StateSaver::~StateSaver() {
1235   if (!is_special_)
1236     delete[] inst_;
1237 }
1238 
Restore()1239 DFA::State* DFA::StateSaver::Restore() {
1240   if (is_special_)
1241     return special_;
1242   MutexLock l(&dfa_->mutex_);
1243   State* s = dfa_->CachedState(inst_, ninst_, flag_);
1244   if (s == NULL)
1245     LOG(DFATAL) << "StateSaver failed to restore state.";
1246   return s;
1247 }
1248 
1249 
1250 //////////////////////////////////////////////////////////////////////
1251 //
1252 // DFA execution.
1253 //
1254 // The basic search loop is easy: start in a state s and then for each
1255 // byte c in the input, s = s->next[c].
1256 //
1257 // This simple description omits a few efficiency-driven complications.
1258 //
1259 // First, the State graph is constructed incrementally: it is possible
1260 // that s->next[c] is null, indicating that that state has not been
1261 // fully explored.  In this case, RunStateOnByte must be invoked to
1262 // determine the next state, which is cached in s->next[c] to save
1263 // future effort.  An alternative reason for s->next[c] to be null is
1264 // that the DFA has reached a so-called "dead state", in which any match
1265 // is no longer possible.  In this case RunStateOnByte will return NULL
1266 // and the processing of the string can stop early.
1267 //
1268 // Second, a 256-element pointer array for s->next_ makes each State
1269 // quite large (2kB on 64-bit machines).  Instead, dfa->bytemap_[]
1270 // maps from bytes to "byte classes" and then next_ only needs to have
1271 // as many pointers as there are byte classes.  A byte class is simply a
1272 // range of bytes that the regexp never distinguishes between.
1273 // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
1274 // 'a', 'b' to 'c', and 'c' to 0xFF.  The bytemap slows us a little bit
1275 // but in exchange we typically cut the size of a State (and thus our
1276 // memory footprint) by about 5-10x.  The comments still refer to
1277 // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
1278 //
1279 // Third, it is common for a DFA for an unanchored match to begin in a
1280 // state in which only one particular byte value can take the DFA to a
1281 // different state.  That is, s->next[c] != s for only one c.  In this
1282 // situation, the DFA can do better than executing the simple loop.
1283 // Instead, it can call memchr to search very quickly for the byte c.
1284 // Whether the start state has this property is determined during a
1285 // pre-compilation pass and the "can_prefix_accel" argument is set.
1286 //
1287 // Fourth, the desired behavior is to search for the leftmost-best match
1288 // (approximately, the same one that Perl would find), which is not
1289 // necessarily the match ending earliest in the string.  Each time a
1290 // match is found, it must be noted, but the DFA must continue on in
1291 // hope of finding a higher-priority match.  In some cases, the caller only
1292 // cares whether there is any match at all, not which one is found.
1293 // The "want_earliest_match" flag causes the search to stop at the first
1294 // match found.
1295 //
1296 // Fifth, one algorithm that uses the DFA needs it to run over the
1297 // input string backward, beginning at the end and ending at the beginning.
1298 // Passing false for the "run_forward" flag causes the DFA to run backward.
1299 //
1300 // The checks for these last three cases, which in a naive implementation
1301 // would be performed once per input byte, slow the general loop enough
1302 // to merit specialized versions of the search loop for each of the
1303 // eight possible settings of the three booleans.  Rather than write
1304 // eight different functions, we write one general implementation and then
1305 // inline it to create the specialized ones.
1306 //
1307 // Note that matches are delayed by one byte, to make it easier to
1308 // accomodate match conditions depending on the next input byte (like $ and \b).
1309 // When s->next[c]->IsMatch(), it means that there is a match ending just
1310 // *before* byte c.
1311 
1312 // The generic search loop.  Searches text for a match, returning
1313 // the pointer to the end of the chosen match, or NULL if no match.
1314 // The bools are equal to the same-named variables in params, but
1315 // making them function arguments lets the inliner specialize
1316 // this function to each combination (see two paragraphs above).
1317 template <bool can_prefix_accel,
1318           bool want_earliest_match,
1319           bool run_forward>
InlinedSearchLoop(SearchParams * params)1320 inline bool DFA::InlinedSearchLoop(SearchParams* params) {
1321   State* start = params->start;
1322   const uint8_t* bp = BytePtr(params->text.data());  // start of text
1323   const uint8_t* p = bp;                             // text scanning point
1324   const uint8_t* ep = BytePtr(params->text.data() +
1325                               params->text.size());  // end of text
1326   const uint8_t* resetp = NULL;                      // p at last cache reset
1327   if (!run_forward) {
1328     using std::swap;
1329     swap(p, ep);
1330   }
1331 
1332   const uint8_t* bytemap = prog_->bytemap();
1333   const uint8_t* lastmatch = NULL;   // most recent matching position in text
1334   bool matched = false;
1335 
1336   State* s = start;
1337   if (ExtraDebug)
1338     fprintf(stderr, "@stx: %s\n", DumpState(s).c_str());
1339 
1340   if (s->IsMatch()) {
1341     matched = true;
1342     lastmatch = p;
1343     if (ExtraDebug)
1344       fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str());
1345     if (params->matches != NULL && kind_ == Prog::kManyMatch) {
1346       for (int i = s->ninst_ - 1; i >= 0; i--) {
1347         int id = s->inst_[i];
1348         if (id == MatchSep)
1349           break;
1350         params->matches->insert(id);
1351       }
1352     }
1353     if (want_earliest_match) {
1354       params->ep = reinterpret_cast<const char*>(lastmatch);
1355       return true;
1356     }
1357   }
1358 
1359   while (p != ep) {
1360     if (ExtraDebug)
1361       fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
1362 
1363     if (can_prefix_accel && s == start) {
1364       // In start state, only way out is to find the prefix,
1365       // so we use prefix accel (e.g. memchr) to skip ahead.
1366       // If not found, we can skip to the end of the string.
1367       p = BytePtr(prog_->PrefixAccel(p, ep - p));
1368       if (p == NULL) {
1369         p = ep;
1370         break;
1371       }
1372     }
1373 
1374     int c;
1375     if (run_forward)
1376       c = *p++;
1377     else
1378       c = *--p;
1379 
1380     // Note that multiple threads might be consulting
1381     // s->next_[bytemap[c]] simultaneously.
1382     // RunStateOnByte takes care of the appropriate locking,
1383     // including a memory barrier so that the unlocked access
1384     // (sometimes known as "double-checked locking") is safe.
1385     // The alternative would be either one DFA per thread
1386     // or one mutex operation per input byte.
1387     //
1388     // ns == DeadState means the state is known to be dead
1389     // (no more matches are possible).
1390     // ns == NULL means the state has not yet been computed
1391     // (need to call RunStateOnByteUnlocked).
1392     // RunStateOnByte returns ns == NULL if it is out of memory.
1393     // ns == FullMatchState means the rest of the string matches.
1394     //
1395     // Okay to use bytemap[] not ByteMap() here, because
1396     // c is known to be an actual byte and not kByteEndText.
1397 
1398     State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire);
1399     if (ns == NULL) {
1400       ns = RunStateOnByteUnlocked(s, c);
1401       if (ns == NULL) {
1402         // After we reset the cache, we hold cache_mutex exclusively,
1403         // so if resetp != NULL, it means we filled the DFA state
1404         // cache with this search alone (without any other threads).
1405         // Benchmarks show that doing a state computation on every
1406         // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
1407         // same at about 2 MB/s.  Unless we're processing an average
1408         // of 10 bytes per state computation, fail so that RE2 can
1409         // fall back to the NFA.  However, RE2::Set cannot fall back,
1410         // so we just have to keep on keeping on in that case.
1411         if (dfa_should_bail_when_slow && resetp != NULL &&
1412             static_cast<size_t>(p - resetp) < 10*state_cache_.size() &&
1413             kind_ != Prog::kManyMatch) {
1414           params->failed = true;
1415           return false;
1416         }
1417         resetp = p;
1418 
1419         // Prepare to save start and s across the reset.
1420         StateSaver save_start(this, start);
1421         StateSaver save_s(this, s);
1422 
1423         // Discard all the States in the cache.
1424         ResetCache(params->cache_lock);
1425 
1426         // Restore start and s so we can continue.
1427         if ((start = save_start.Restore()) == NULL ||
1428             (s = save_s.Restore()) == NULL) {
1429           // Restore already did LOG(DFATAL).
1430           params->failed = true;
1431           return false;
1432         }
1433         ns = RunStateOnByteUnlocked(s, c);
1434         if (ns == NULL) {
1435           LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
1436           params->failed = true;
1437           return false;
1438         }
1439       }
1440     }
1441     if (ns <= SpecialStateMax) {
1442       if (ns == DeadState) {
1443         params->ep = reinterpret_cast<const char*>(lastmatch);
1444         return matched;
1445       }
1446       // FullMatchState
1447       params->ep = reinterpret_cast<const char*>(ep);
1448       return true;
1449     }
1450 
1451     s = ns;
1452     if (s->IsMatch()) {
1453       matched = true;
1454       // The DFA notices the match one byte late,
1455       // so adjust p before using it in the match.
1456       if (run_forward)
1457         lastmatch = p - 1;
1458       else
1459         lastmatch = p + 1;
1460       if (ExtraDebug)
1461         fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str());
1462       if (params->matches != NULL && kind_ == Prog::kManyMatch) {
1463         for (int i = s->ninst_ - 1; i >= 0; i--) {
1464           int id = s->inst_[i];
1465           if (id == MatchSep)
1466             break;
1467           params->matches->insert(id);
1468         }
1469       }
1470       if (want_earliest_match) {
1471         params->ep = reinterpret_cast<const char*>(lastmatch);
1472         return true;
1473       }
1474     }
1475   }
1476 
1477   // Process one more byte to see if it triggers a match.
1478   // (Remember, matches are delayed one byte.)
1479   if (ExtraDebug)
1480     fprintf(stderr, "@etx: %s\n", DumpState(s).c_str());
1481 
1482   int lastbyte;
1483   if (run_forward) {
1484     if (params->text.end() == params->context.end())
1485       lastbyte = kByteEndText;
1486     else
1487       lastbyte = params->text.end()[0] & 0xFF;
1488   } else {
1489     if (params->text.begin() == params->context.begin())
1490       lastbyte = kByteEndText;
1491     else
1492       lastbyte = params->text.begin()[-1] & 0xFF;
1493   }
1494 
1495   State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
1496   if (ns == NULL) {
1497     ns = RunStateOnByteUnlocked(s, lastbyte);
1498     if (ns == NULL) {
1499       StateSaver save_s(this, s);
1500       ResetCache(params->cache_lock);
1501       if ((s = save_s.Restore()) == NULL) {
1502         params->failed = true;
1503         return false;
1504       }
1505       ns = RunStateOnByteUnlocked(s, lastbyte);
1506       if (ns == NULL) {
1507         LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
1508         params->failed = true;
1509         return false;
1510       }
1511     }
1512   }
1513   if (ns <= SpecialStateMax) {
1514     if (ns == DeadState) {
1515       params->ep = reinterpret_cast<const char*>(lastmatch);
1516       return matched;
1517     }
1518     // FullMatchState
1519     params->ep = reinterpret_cast<const char*>(ep);
1520     return true;
1521   }
1522 
1523   s = ns;
1524   if (s->IsMatch()) {
1525     matched = true;
1526     lastmatch = p;
1527     if (ExtraDebug)
1528       fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str());
1529     if (params->matches != NULL && kind_ == Prog::kManyMatch) {
1530       for (int i = s->ninst_ - 1; i >= 0; i--) {
1531         int id = s->inst_[i];
1532         if (id == MatchSep)
1533           break;
1534         params->matches->insert(id);
1535       }
1536     }
1537   }
1538 
1539   params->ep = reinterpret_cast<const char*>(lastmatch);
1540   return matched;
1541 }
1542 
1543 // Inline specializations of the general loop.
SearchFFF(SearchParams * params)1544 bool DFA::SearchFFF(SearchParams* params) {
1545   return InlinedSearchLoop<false, false, false>(params);
1546 }
SearchFFT(SearchParams * params)1547 bool DFA::SearchFFT(SearchParams* params) {
1548   return InlinedSearchLoop<false, false, true>(params);
1549 }
SearchFTF(SearchParams * params)1550 bool DFA::SearchFTF(SearchParams* params) {
1551   return InlinedSearchLoop<false, true, false>(params);
1552 }
SearchFTT(SearchParams * params)1553 bool DFA::SearchFTT(SearchParams* params) {
1554   return InlinedSearchLoop<false, true, true>(params);
1555 }
SearchTFF(SearchParams * params)1556 bool DFA::SearchTFF(SearchParams* params) {
1557   return InlinedSearchLoop<true, false, false>(params);
1558 }
SearchTFT(SearchParams * params)1559 bool DFA::SearchTFT(SearchParams* params) {
1560   return InlinedSearchLoop<true, false, true>(params);
1561 }
SearchTTF(SearchParams * params)1562 bool DFA::SearchTTF(SearchParams* params) {
1563   return InlinedSearchLoop<true, true, false>(params);
1564 }
SearchTTT(SearchParams * params)1565 bool DFA::SearchTTT(SearchParams* params) {
1566   return InlinedSearchLoop<true, true, true>(params);
1567 }
1568 
1569 // For performance, calls the appropriate specialized version
1570 // of InlinedSearchLoop.
FastSearchLoop(SearchParams * params)1571 bool DFA::FastSearchLoop(SearchParams* params) {
1572   // Because the methods are private, the Searches array
1573   // cannot be declared at top level.
1574   static bool (DFA::*Searches[])(SearchParams*) = {
1575     &DFA::SearchFFF,
1576     &DFA::SearchFFT,
1577     &DFA::SearchFTF,
1578     &DFA::SearchFTT,
1579     &DFA::SearchTFF,
1580     &DFA::SearchTFT,
1581     &DFA::SearchTTF,
1582     &DFA::SearchTTT,
1583   };
1584 
1585   int index = 4 * params->can_prefix_accel +
1586               2 * params->want_earliest_match +
1587               1 * params->run_forward;
1588   return (this->*Searches[index])(params);
1589 }
1590 
1591 
1592 // The discussion of DFA execution above ignored the question of how
1593 // to determine the initial state for the search loop.  There are two
1594 // factors that influence the choice of start state.
1595 //
1596 // The first factor is whether the search is anchored or not.
1597 // The regexp program (Prog*) itself has
1598 // two different entry points: one for anchored searches and one for
1599 // unanchored searches.  (The unanchored version starts with a leading ".*?"
1600 // and then jumps to the anchored one.)
1601 //
1602 // The second factor is where text appears in the larger context, which
1603 // determines which empty-string operators can be matched at the beginning
1604 // of execution.  If text is at the very beginning of context, \A and ^ match.
1605 // Otherwise if text is at the beginning of a line, then ^ matches.
1606 // Otherwise it matters whether the character before text is a word character
1607 // or a non-word character.
1608 //
1609 // The two cases (unanchored vs not) and four cases (empty-string flags)
1610 // combine to make the eight cases recorded in the DFA's begin_text_[2],
1611 // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
1612 // StartInfos.  The start state for each is filled in the first time it
1613 // is used for an actual search.
1614 
1615 // Examines text, context, and anchored to determine the right start
1616 // state for the DFA search loop.  Fills in params and returns true on success.
1617 // Returns false on failure.
AnalyzeSearch(SearchParams * params)1618 bool DFA::AnalyzeSearch(SearchParams* params) {
1619   const StringPiece& text = params->text;
1620   const StringPiece& context = params->context;
1621 
1622   // Sanity check: make sure that text lies within context.
1623   if (text.begin() < context.begin() || text.end() > context.end()) {
1624     LOG(DFATAL) << "context does not contain text";
1625     params->start = DeadState;
1626     return true;
1627   }
1628 
1629   // Determine correct search type.
1630   int start;
1631   uint32_t flags;
1632   if (params->run_forward) {
1633     if (text.begin() == context.begin()) {
1634       start = kStartBeginText;
1635       flags = kEmptyBeginText|kEmptyBeginLine;
1636     } else if (text.begin()[-1] == '\n') {
1637       start = kStartBeginLine;
1638       flags = kEmptyBeginLine;
1639     } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {
1640       start = kStartAfterWordChar;
1641       flags = kFlagLastWord;
1642     } else {
1643       start = kStartAfterNonWordChar;
1644       flags = 0;
1645     }
1646   } else {
1647     if (text.end() == context.end()) {
1648       start = kStartBeginText;
1649       flags = kEmptyBeginText|kEmptyBeginLine;
1650     } else if (text.end()[0] == '\n') {
1651       start = kStartBeginLine;
1652       flags = kEmptyBeginLine;
1653     } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {
1654       start = kStartAfterWordChar;
1655       flags = kFlagLastWord;
1656     } else {
1657       start = kStartAfterNonWordChar;
1658       flags = 0;
1659     }
1660   }
1661   if (params->anchored)
1662     start |= kStartAnchored;
1663   StartInfo* info = &start_[start];
1664 
1665   // Try once without cache_lock for writing.
1666   // Try again after resetting the cache
1667   // (ResetCache will relock cache_lock for writing).
1668   if (!AnalyzeSearchHelper(params, info, flags)) {
1669     ResetCache(params->cache_lock);
1670     if (!AnalyzeSearchHelper(params, info, flags)) {
1671       LOG(DFATAL) << "Failed to analyze start state.";
1672       params->failed = true;
1673       return false;
1674     }
1675   }
1676 
1677   params->start = info->start.load(std::memory_order_acquire);
1678 
1679   // Even if we could prefix accel, we cannot do so when anchored and,
1680   // less obviously, we cannot do so when we are going to need flags.
1681   // This trick works only when there is a single byte that leads to a
1682   // different state!
1683   if (prog_->can_prefix_accel() &&
1684       !params->anchored &&
1685       params->start > SpecialStateMax &&
1686       params->start->flag_ >> kFlagNeedShift == 0)
1687     params->can_prefix_accel = true;
1688 
1689   if (ExtraDebug)
1690     fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
1691             params->anchored, params->run_forward, flags,
1692             DumpState(params->start).c_str(), params->can_prefix_accel);
1693 
1694   return true;
1695 }
1696 
1697 // Fills in info if needed.  Returns true on success, false on failure.
AnalyzeSearchHelper(SearchParams * params,StartInfo * info,uint32_t flags)1698 bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
1699                               uint32_t flags) {
1700   // Quick check.
1701   State* start = info->start.load(std::memory_order_acquire);
1702   if (start != NULL)
1703     return true;
1704 
1705   MutexLock l(&mutex_);
1706   start = info->start.load(std::memory_order_relaxed);
1707   if (start != NULL)
1708     return true;
1709 
1710   q0_->clear();
1711   AddToQueue(q0_,
1712              params->anchored ? prog_->start() : prog_->start_unanchored(),
1713              flags);
1714   start = WorkqToCachedState(q0_, NULL, flags);
1715   if (start == NULL)
1716     return false;
1717 
1718   // Synchronize with "quick check" above.
1719   info->start.store(start, std::memory_order_release);
1720   return true;
1721 }
1722 
1723 // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool want_earliest_match,bool run_forward,bool * failed,const char ** epp,SparseSet * matches)1724 bool DFA::Search(const StringPiece& text,
1725                  const StringPiece& context,
1726                  bool anchored,
1727                  bool want_earliest_match,
1728                  bool run_forward,
1729                  bool* failed,
1730                  const char** epp,
1731                  SparseSet* matches) {
1732   *epp = NULL;
1733   if (!ok()) {
1734     *failed = true;
1735     return false;
1736   }
1737   *failed = false;
1738 
1739   if (ExtraDebug) {
1740     fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
1741     fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
1742             std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_);
1743   }
1744 
1745   RWLocker l(&cache_mutex_);
1746   SearchParams params(text, context, &l);
1747   params.anchored = anchored;
1748   params.want_earliest_match = want_earliest_match;
1749   params.run_forward = run_forward;
1750   params.matches = matches;
1751 
1752   if (!AnalyzeSearch(&params)) {
1753     *failed = true;
1754     return false;
1755   }
1756   if (params.start == DeadState)
1757     return false;
1758   if (params.start == FullMatchState) {
1759     if (run_forward == want_earliest_match)
1760       *epp = text.data();
1761     else
1762       *epp = text.data() + text.size();
1763     return true;
1764   }
1765   if (ExtraDebug)
1766     fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
1767   bool ret = FastSearchLoop(&params);
1768   if (params.failed) {
1769     *failed = true;
1770     return false;
1771   }
1772   *epp = params.ep;
1773   return ret;
1774 }
1775 
GetDFA(MatchKind kind)1776 DFA* Prog::GetDFA(MatchKind kind) {
1777   // For a forward DFA, half the memory goes to each DFA.
1778   // However, if it is a "many match" DFA, then there is
1779   // no counterpart with which the memory must be shared.
1780   //
1781   // For a reverse DFA, all the memory goes to the
1782   // "longest match" DFA, because RE2 never does reverse
1783   // "first match" searches.
1784   if (kind == kFirstMatch) {
1785     std::call_once(dfa_first_once_, [](Prog* prog) {
1786       prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
1787     }, this);
1788     return dfa_first_;
1789   } else if (kind == kManyMatch) {
1790     std::call_once(dfa_first_once_, [](Prog* prog) {
1791       prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_);
1792     }, this);
1793     return dfa_first_;
1794   } else {
1795     std::call_once(dfa_longest_once_, [](Prog* prog) {
1796       if (!prog->reversed_)
1797         prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2);
1798       else
1799         prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_);
1800     }, this);
1801     return dfa_longest_;
1802   }
1803 }
1804 
DeleteDFA(DFA * dfa)1805 void Prog::DeleteDFA(DFA* dfa) {
1806   delete dfa;
1807 }
1808 
1809 // Executes the regexp program to search in text,
1810 // which itself is inside the larger context.  (As a convenience,
1811 // passing a NULL context is equivalent to passing text.)
1812 // Returns true if a match is found, false if not.
1813 // If a match is found, fills in match0->end() to point at the end of the match
1814 // and sets match0->begin() to text.begin(), since the DFA can't track
1815 // where the match actually began.
1816 //
1817 // This is the only external interface (class DFA only exists in this file).
1818 //
SearchDFA(const StringPiece & text,const StringPiece & const_context,Anchor anchor,MatchKind kind,StringPiece * match0,bool * failed,SparseSet * matches)1819 bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
1820                      Anchor anchor, MatchKind kind, StringPiece* match0,
1821                      bool* failed, SparseSet* matches) {
1822   *failed = false;
1823 
1824   StringPiece context = const_context;
1825   if (context.data() == NULL)
1826     context = text;
1827   bool caret = anchor_start();
1828   bool dollar = anchor_end();
1829   if (reversed_) {
1830     using std::swap;
1831     swap(caret, dollar);
1832   }
1833   if (caret && context.begin() != text.begin())
1834     return false;
1835   if (dollar && context.end() != text.end())
1836     return false;
1837 
1838   // Handle full match by running an anchored longest match
1839   // and then checking if it covers all of text.
1840   bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
1841   bool endmatch = false;
1842   if (kind == kManyMatch) {
1843     // This is split out in order to avoid clobbering kind.
1844   } else if (kind == kFullMatch || anchor_end()) {
1845     endmatch = true;
1846     kind = kLongestMatch;
1847   }
1848 
1849   // If the caller doesn't care where the match is (just whether one exists),
1850   // then we can stop at the very first match we find, the so-called
1851   // "earliest match".
1852   bool want_earliest_match = false;
1853   if (kind == kManyMatch) {
1854     // This is split out in order to avoid clobbering kind.
1855     if (matches == NULL) {
1856       want_earliest_match = true;
1857     }
1858   } else if (match0 == NULL && !endmatch) {
1859     want_earliest_match = true;
1860     kind = kLongestMatch;
1861   }
1862 
1863   DFA* dfa = GetDFA(kind);
1864   const char* ep;
1865   bool matched = dfa->Search(text, context, anchored,
1866                              want_earliest_match, !reversed_,
1867                              failed, &ep, matches);
1868   if (*failed) {
1869     hooks::GetDFASearchFailureHook()({
1870         // Nothing yet...
1871     });
1872     return false;
1873   }
1874   if (!matched)
1875     return false;
1876   if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size()))
1877     return false;
1878 
1879   // If caller cares, record the boundary of the match.
1880   // We only know where it ends, so use the boundary of text
1881   // as the beginning.
1882   if (match0) {
1883     if (reversed_)
1884       *match0 =
1885           StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep));
1886     else
1887       *match0 =
1888           StringPiece(text.data(), static_cast<size_t>(ep - text.data()));
1889   }
1890   return true;
1891 }
1892 
1893 // Build out all states in DFA.  Returns number of states.
BuildAllStates(const Prog::DFAStateCallback & cb)1894 int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
1895   if (!ok())
1896     return 0;
1897 
1898   // Pick out start state for unanchored search
1899   // at beginning of text.
1900   RWLocker l(&cache_mutex_);
1901   SearchParams params(StringPiece(), StringPiece(), &l);
1902   params.anchored = false;
1903   if (!AnalyzeSearch(&params) ||
1904       params.start == NULL ||
1905       params.start == DeadState)
1906     return 0;
1907 
1908   // Add start state to work queue.
1909   // Note that any State* that we handle here must point into the cache,
1910   // so we can simply depend on pointer-as-a-number hashing and equality.
1911   std::unordered_map<State*, int> m;
1912   std::deque<State*> q;
1913   m.emplace(params.start, static_cast<int>(m.size()));
1914   q.push_back(params.start);
1915 
1916   // Compute the input bytes needed to cover all of the next pointers.
1917   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
1918   std::vector<int> input(nnext);
1919   for (int c = 0; c < 256; c++) {
1920     int b = prog_->bytemap()[c];
1921     while (c < 256-1 && prog_->bytemap()[c+1] == b)
1922       c++;
1923     input[b] = c;
1924   }
1925   input[prog_->bytemap_range()] = kByteEndText;
1926 
1927   // Scratch space for the output.
1928   std::vector<int> output(nnext);
1929 
1930   // Flood to expand every state.
1931   bool oom = false;
1932   while (!q.empty()) {
1933     State* s = q.front();
1934     q.pop_front();
1935     for (int c : input) {
1936       State* ns = RunStateOnByteUnlocked(s, c);
1937       if (ns == NULL) {
1938         oom = true;
1939         break;
1940       }
1941       if (ns == DeadState) {
1942         output[ByteMap(c)] = -1;
1943         continue;
1944       }
1945       if (m.find(ns) == m.end()) {
1946         m.emplace(ns, static_cast<int>(m.size()));
1947         q.push_back(ns);
1948       }
1949       output[ByteMap(c)] = m[ns];
1950     }
1951     if (cb)
1952       cb(oom ? NULL : output.data(),
1953          s == FullMatchState || s->IsMatch());
1954     if (oom)
1955       break;
1956   }
1957 
1958   return static_cast<int>(m.size());
1959 }
1960 
1961 // Build out all states in DFA for kind.  Returns number of states.
BuildEntireDFA(MatchKind kind,const DFAStateCallback & cb)1962 int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
1963   return GetDFA(kind)->BuildAllStates(cb);
1964 }
1965 
TEST_dfa_should_bail_when_slow(bool b)1966 void Prog::TEST_dfa_should_bail_when_slow(bool b) {
1967   dfa_should_bail_when_slow = b;
1968 }
1969 
1970 // Computes min and max for matching string.
1971 // Won't return strings bigger than maxlen.
PossibleMatchRange(std::string * min,std::string * max,int maxlen)1972 bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
1973   if (!ok())
1974     return false;
1975 
1976   // NOTE: if future users of PossibleMatchRange want more precision when
1977   // presented with infinitely repeated elements, consider making this a
1978   // parameter to PossibleMatchRange.
1979   static int kMaxEltRepetitions = 0;
1980 
1981   // Keep track of the number of times we've visited states previously. We only
1982   // revisit a given state if it's part of a repeated group, so if the value
1983   // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
1984   // |*max| to |PrefixSuccessor(*max)|.
1985   //
1986   // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
1987   // tradition, implicitly insert a '0' value at first use. We take advantage
1988   // of that property below.
1989   std::unordered_map<State*, int> previously_visited_states;
1990 
1991   // Pick out start state for anchored search at beginning of text.
1992   RWLocker l(&cache_mutex_);
1993   SearchParams params(StringPiece(), StringPiece(), &l);
1994   params.anchored = true;
1995   if (!AnalyzeSearch(&params))
1996     return false;
1997   if (params.start == DeadState) {  // No matching strings
1998     *min = "";
1999     *max = "";
2000     return true;
2001   }
2002   if (params.start == FullMatchState)  // Every string matches: no max
2003     return false;
2004 
2005   // The DFA is essentially a big graph rooted at params.start,
2006   // and paths in the graph correspond to accepted strings.
2007   // Each node in the graph has potentially 256+1 arrows
2008   // coming out, one for each byte plus the magic end of
2009   // text character kByteEndText.
2010 
2011   // To find the smallest possible prefix of an accepted
2012   // string, we just walk the graph preferring to follow
2013   // arrows with the lowest bytes possible.  To find the
2014   // largest possible prefix, we follow the largest bytes
2015   // possible.
2016 
2017   // The test for whether there is an arrow from s on byte j is
2018   //    ns = RunStateOnByteUnlocked(s, j);
2019   //    if (ns == NULL)
2020   //      return false;
2021   //    if (ns != DeadState && ns->ninst > 0)
2022   // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
2023   // It returns NULL only if the DFA has run out of memory,
2024   // in which case we can't be sure of anything.
2025   // The second check sees whether there was graph built
2026   // and whether it is interesting graph.  Nodes might have
2027   // ns->ninst == 0 if they exist only to represent the fact
2028   // that a match was found on the previous byte.
2029 
2030   // Build minimum prefix.
2031   State* s = params.start;
2032   min->clear();
2033   MutexLock lock(&mutex_);
2034   for (int i = 0; i < maxlen; i++) {
2035     if (previously_visited_states[s] > kMaxEltRepetitions)
2036       break;
2037     previously_visited_states[s]++;
2038 
2039     // Stop if min is a match.
2040     State* ns = RunStateOnByte(s, kByteEndText);
2041     if (ns == NULL)  // DFA out of memory
2042       return false;
2043     if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
2044       break;
2045 
2046     // Try to extend the string with low bytes.
2047     bool extended = false;
2048     for (int j = 0; j < 256; j++) {
2049       ns = RunStateOnByte(s, j);
2050       if (ns == NULL)  // DFA out of memory
2051         return false;
2052       if (ns == FullMatchState ||
2053           (ns > SpecialStateMax && ns->ninst_ > 0)) {
2054         extended = true;
2055         min->append(1, static_cast<char>(j));
2056         s = ns;
2057         break;
2058       }
2059     }
2060     if (!extended)
2061       break;
2062   }
2063 
2064   // Build maximum prefix.
2065   previously_visited_states.clear();
2066   s = params.start;
2067   max->clear();
2068   for (int i = 0; i < maxlen; i++) {
2069     if (previously_visited_states[s] > kMaxEltRepetitions)
2070       break;
2071     previously_visited_states[s] += 1;
2072 
2073     // Try to extend the string with high bytes.
2074     bool extended = false;
2075     for (int j = 255; j >= 0; j--) {
2076       State* ns = RunStateOnByte(s, j);
2077       if (ns == NULL)
2078         return false;
2079       if (ns == FullMatchState ||
2080           (ns > SpecialStateMax && ns->ninst_ > 0)) {
2081         extended = true;
2082         max->append(1, static_cast<char>(j));
2083         s = ns;
2084         break;
2085       }
2086     }
2087     if (!extended) {
2088       // Done, no need for PrefixSuccessor.
2089       return true;
2090     }
2091   }
2092 
2093   // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
2094   PrefixSuccessor(max);
2095 
2096   // If there are no bytes left, we have no way to say "there is no maximum
2097   // string".  We could make the interface more complicated and be able to
2098   // return "there is no maximum but here is a minimum", but that seems like
2099   // overkill -- the most common no-max case is all possible strings, so not
2100   // telling the caller that the empty string is the minimum match isn't a
2101   // great loss.
2102   if (max->empty())
2103     return false;
2104 
2105   return true;
2106 }
2107 
2108 // PossibleMatchRange for a Prog.
PossibleMatchRange(std::string * min,std::string * max,int maxlen)2109 bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
2110   // Have to use dfa_longest_ to get all strings for full matches.
2111   // For example, (a|aa) never matches aa in first-match mode.
2112   return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
2113 }
2114 
2115 }  // namespace re2
2116