1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // A DFA (deterministic finite automaton)-based regular expression search.
6 //
7 // The DFA search has two main parts: the construction of the automaton,
8 // which is represented by a graph of State structures, and the execution
9 // of the automaton over a given input string.
10 //
11 // The basic idea is that the State graph is constructed so that the
12 // execution can simply start with a state s, and then for each byte c in
13 // the input string, execute "s = s->next[c]", checking at each point whether
14 // the current s represents a matching state.
15 //
16 // The simple explanation just given does convey the essence of this code,
17 // but it omits the details of how the State graph gets constructed as well
18 // as some performance-driven optimizations to the execution of the automaton.
19 // All these details are explained in the comments for the code following
20 // the definition of class DFA.
21 //
22 // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
23 
24 #include <stddef.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <algorithm>
29 #include <atomic>
30 #include <deque>
31 #include <mutex>
32 #include <new>
33 #include <string>
34 #include <unordered_map>
35 #include <unordered_set>
36 #include <utility>
37 #include <vector>
38 
39 #include "util/logging.h"
40 #include "util/mix.h"
41 #include "util/mutex.h"
42 #include "util/strutil.h"
43 #include "re2/pod_array.h"
44 #include "re2/prog.h"
45 #include "re2/re2.h"
46 #include "re2/sparse_set.h"
47 #include "re2/stringpiece.h"
48 
49 // Silence "zero-sized array in struct/union" warning for DFA::State::next_.
50 #ifdef _MSC_VER
51 #pragma warning(disable: 4200)
52 #endif
53 
54 namespace re2 {
55 
56 // Controls whether the DFA should bail out early if the NFA would be faster.
57 static bool dfa_should_bail_when_slow = true;
58 
TESTING_ONLY_set_dfa_should_bail_when_slow(bool b)59 void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {
60   dfa_should_bail_when_slow = b;
61 }
62 
63 // Changing this to true compiles in prints that trace execution of the DFA.
64 // Generates a lot of output -- only useful for debugging.
65 static const bool ExtraDebug = false;
66 
67 // A DFA implementation of a regular expression program.
68 // Since this is entirely a forward declaration mandated by C++,
69 // some of the comments here are better understood after reading
70 // the comments in the sections that follow the DFA definition.
71 class DFA {
72  public:
73   DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem);
74   ~DFA();
ok() const75   bool ok() const { return !init_failed_; }
kind()76   Prog::MatchKind kind() { return kind_; }
77 
78   // Searches for the regular expression in text, which is considered
79   // as a subsection of context for the purposes of interpreting flags
80   // like ^ and $ and \A and \z.
81   // Returns whether a match was found.
82   // If a match is found, sets *ep to the end point of the best match in text.
83   // If "anchored", the match must begin at the start of text.
84   // If "want_earliest_match", the match that ends first is used, not
85   //   necessarily the best one.
86   // If "run_forward" is true, the DFA runs from text.begin() to text.end().
87   //   If it is false, the DFA runs from text.end() to text.begin(),
88   //   returning the leftmost end of the match instead of the rightmost one.
89   // If the DFA cannot complete the search (for example, if it is out of
90   //   memory), it sets *failed and returns false.
91   bool Search(const StringPiece& text, const StringPiece& context,
92               bool anchored, bool want_earliest_match, bool run_forward,
93               bool* failed, const char** ep, SparseSet* matches);
94 
95   // Builds out all states for the entire DFA.
96   // If cb is not empty, it receives one callback per state built.
97   // Returns the number of states built.
98   // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
99   int BuildAllStates(const Prog::DFAStateCallback& cb);
100 
101   // Computes min and max for matching strings.  Won't return strings
102   // bigger than maxlen.
103   bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
104 
105   // These data structures are logically private, but C++ makes it too
106   // difficult to mark them as such.
107   class RWLocker;
108   class StateSaver;
109   class Workq;
110 
111   // A single DFA state.  The DFA is represented as a graph of these
112   // States, linked by the next_ pointers.  If in state s and reading
113   // byte c, the next state should be s->next_[c].
114   struct State {
IsMatchre2::DFA::State115     inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
116 
117     int* inst_;         // Instruction pointers in the state.
118     int ninst_;         // # of inst_ pointers.
119     uint32_t flag_;     // Empty string bitfield flags in effect on the way
120                         // into this state, along with kFlagMatch if this
121                         // is a matching state.
122 
123 // Work around the bug affecting flexible array members in GCC 6.x (for x >= 1).
124 // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932)
125 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1
126     std::atomic<State*> next_[0];   // Outgoing arrows from State,
127 #else
128     std::atomic<State*> next_[];    // Outgoing arrows from State,
129 #endif
130 
131                         // one per input byte class
132   };
133 
134   enum {
135     kByteEndText = 256,         // imaginary byte at end of text
136 
137     kFlagEmptyMask = 0xFF,      // State.flag_: bits holding kEmptyXXX flags
138     kFlagMatch = 0x0100,        // State.flag_: this is a matching state
139     kFlagLastWord = 0x0200,     // State.flag_: last byte was a word char
140     kFlagNeedShift = 16,        // needed kEmpty bits are or'ed in shifted left
141   };
142 
143   struct StateHash {
operator ()re2::DFA::StateHash144     size_t operator()(const State* a) const {
145       DCHECK(a != NULL);
146       HashMix mix(a->flag_);
147       for (int i = 0; i < a->ninst_; i++)
148         mix.Mix(a->inst_[i]);
149       mix.Mix(0);
150       return mix.get();
151     }
152   };
153 
154   struct StateEqual {
operator ()re2::DFA::StateEqual155     bool operator()(const State* a, const State* b) const {
156       DCHECK(a != NULL);
157       DCHECK(b != NULL);
158       if (a == b)
159         return true;
160       if (a->flag_ != b->flag_)
161         return false;
162       if (a->ninst_ != b->ninst_)
163         return false;
164       for (int i = 0; i < a->ninst_; i++)
165         if (a->inst_[i] != b->inst_[i])
166           return false;
167       return true;
168     }
169   };
170 
171   typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
172 
173  private:
174   // Make it easier to swap in a scalable reader-writer mutex.
175   using CacheMutex = Mutex;
176 
177   enum {
178     // Indices into start_ for unanchored searches.
179     // Add kStartAnchored for anchored searches.
180     kStartBeginText = 0,          // text at beginning of context
181     kStartBeginLine = 2,          // text at beginning of line
182     kStartAfterWordChar = 4,      // text follows a word character
183     kStartAfterNonWordChar = 6,   // text follows non-word character
184     kMaxStart = 8,
185 
186     kStartAnchored = 1,
187   };
188 
189   // Resets the DFA State cache, flushing all saved State* information.
190   // Releases and reacquires cache_mutex_ via cache_lock, so any
191   // State* existing before the call are not valid after the call.
192   // Use a StateSaver to preserve important states across the call.
193   // cache_mutex_.r <= L < mutex_
194   // After: cache_mutex_.w <= L < mutex_
195   void ResetCache(RWLocker* cache_lock);
196 
197   // Looks up and returns the State corresponding to a Workq.
198   // L >= mutex_
199   State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
200 
201   // Looks up and returns a State matching the inst, ninst, and flag.
202   // L >= mutex_
203   State* CachedState(int* inst, int ninst, uint32_t flag);
204 
205   // Clear the cache entirely.
206   // Must hold cache_mutex_.w or be in destructor.
207   void ClearCache();
208 
209   // Converts a State into a Workq: the opposite of WorkqToCachedState.
210   // L >= mutex_
211   void StateToWorkq(State* s, Workq* q);
212 
213   // Runs a State on a given byte, returning the next state.
214   State* RunStateOnByteUnlocked(State*, int);  // cache_mutex_.r <= L < mutex_
215   State* RunStateOnByte(State*, int);          // L >= mutex_
216 
217   // Runs a Workq on a given byte followed by a set of empty-string flags,
218   // producing a new Workq in nq.  If a match instruction is encountered,
219   // sets *ismatch to true.
220   // L >= mutex_
221   void RunWorkqOnByte(Workq* q, Workq* nq,
222                       int c, uint32_t flag, bool* ismatch);
223 
224   // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
225   // L >= mutex_
226   void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag);
227 
228   // Adds the instruction id to the Workq, following empty arrows
229   // according to flag.
230   // L >= mutex_
231   void AddToQueue(Workq* q, int id, uint32_t flag);
232 
233   // For debugging, returns a text representation of State.
234   static std::string DumpState(State* state);
235 
236   // For debugging, returns a text representation of a Workq.
237   static std::string DumpWorkq(Workq* q);
238 
239   // Search parameters
240   struct SearchParams {
SearchParamsre2::DFA::SearchParams241     SearchParams(const StringPiece& text, const StringPiece& context,
242                  RWLocker* cache_lock)
243       : text(text),
244         context(context),
245         anchored(false),
246         can_prefix_accel(false),
247         want_earliest_match(false),
248         run_forward(false),
249         start(NULL),
250         cache_lock(cache_lock),
251         failed(false),
252         ep(NULL),
253         matches(NULL) {}
254 
255     StringPiece text;
256     StringPiece context;
257     bool anchored;
258     bool can_prefix_accel;
259     bool want_earliest_match;
260     bool run_forward;
261     State* start;
262     RWLocker* cache_lock;
263     bool failed;     // "out" parameter: whether search gave up
264     const char* ep;  // "out" parameter: end pointer for match
265     SparseSet* matches;
266 
267    private:
268     SearchParams(const SearchParams&) = delete;
269     SearchParams& operator=(const SearchParams&) = delete;
270   };
271 
272   // Before each search, the parameters to Search are analyzed by
273   // AnalyzeSearch to determine the state in which to start.
274   struct StartInfo {
StartInfore2::DFA::StartInfo275     StartInfo() : start(NULL) {}
276     std::atomic<State*> start;
277   };
278 
279   // Fills in params->start and params->can_prefix_accel using
280   // the other search parameters.  Returns true on success,
281   // false on failure.
282   // cache_mutex_.r <= L < mutex_
283   bool AnalyzeSearch(SearchParams* params);
284   bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
285                            uint32_t flags);
286 
287   // The generic search loop, inlined to create specialized versions.
288   // cache_mutex_.r <= L < mutex_
289   // Might unlock and relock cache_mutex_ via params->cache_lock.
290   template <bool can_prefix_accel,
291             bool want_earliest_match,
292             bool run_forward>
293   inline bool InlinedSearchLoop(SearchParams* params);
294 
295   // The specialized versions of InlinedSearchLoop.  The three letters
296   // at the ends of the name denote the true/false values used as the
297   // last three parameters of InlinedSearchLoop.
298   // cache_mutex_.r <= L < mutex_
299   // Might unlock and relock cache_mutex_ via params->cache_lock.
300   bool SearchFFF(SearchParams* params);
301   bool SearchFFT(SearchParams* params);
302   bool SearchFTF(SearchParams* params);
303   bool SearchFTT(SearchParams* params);
304   bool SearchTFF(SearchParams* params);
305   bool SearchTFT(SearchParams* params);
306   bool SearchTTF(SearchParams* params);
307   bool SearchTTT(SearchParams* params);
308 
309   // The main search loop: calls an appropriate specialized version of
310   // InlinedSearchLoop.
311   // cache_mutex_.r <= L < mutex_
312   // Might unlock and relock cache_mutex_ via params->cache_lock.
313   bool FastSearchLoop(SearchParams* params);
314 
315 
316   // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
ByteMap(int c)317   int ByteMap(int c) {
318     if (c == kByteEndText)
319       return prog_->bytemap_range();
320     return prog_->bytemap()[c];
321   }
322 
323   // Constant after initialization.
324   Prog* prog_;              // The regular expression program to run.
325   Prog::MatchKind kind_;    // The kind of DFA.
326   bool init_failed_;        // initialization failed (out of memory)
327 
328   Mutex mutex_;  // mutex_ >= cache_mutex_.r
329 
330   // Scratch areas, protected by mutex_.
331   Workq* q0_;             // Two pre-allocated work queues.
332   Workq* q1_;
333   PODArray<int> stack_;   // Pre-allocated stack for AddToQueue
334 
335   // State* cache.  Many threads use and add to the cache simultaneously,
336   // holding cache_mutex_ for reading and mutex_ (above) when adding.
337   // If the cache fills and needs to be discarded, the discarding is done
338   // while holding cache_mutex_ for writing, to avoid interrupting other
339   // readers.  Any State* pointers are only valid while cache_mutex_
340   // is held.
341   CacheMutex cache_mutex_;
342   int64_t mem_budget_;     // Total memory budget for all States.
343   int64_t state_budget_;   // Amount of memory remaining for new States.
344   StateSet state_cache_;   // All States computed so far.
345   StartInfo start_[kMaxStart];
346 
347   DFA(const DFA&) = delete;
348   DFA& operator=(const DFA&) = delete;
349 };
350 
351 // Shorthand for casting to uint8_t*.
BytePtr(const void * v)352 static inline const uint8_t* BytePtr(const void* v) {
353   return reinterpret_cast<const uint8_t*>(v);
354 }
355 
356 // Work queues
357 
358 // Marks separate thread groups of different priority
359 // in the work queue when in leftmost-longest matching mode.
360 #define Mark (-1)
361 
362 // Separates the match IDs from the instructions in inst_.
363 // Used only for "many match" DFA states.
364 #define MatchSep (-2)
365 
366 // Internally, the DFA uses a sparse array of
367 // program instruction pointers as a work queue.
368 // In leftmost longest mode, marks separate sections
369 // of workq that started executing at different
370 // locations in the string (earlier locations first).
371 class DFA::Workq : public SparseSet {
372  public:
373   // Constructor: n is number of normal slots, maxmark number of mark slots.
Workq(int n,int maxmark)374   Workq(int n, int maxmark) :
375     SparseSet(n+maxmark),
376     n_(n),
377     maxmark_(maxmark),
378     nextmark_(n),
379     last_was_mark_(true) {
380   }
381 
is_mark(int i)382   bool is_mark(int i) { return i >= n_; }
383 
maxmark()384   int maxmark() { return maxmark_; }
385 
clear()386   void clear() {
387     SparseSet::clear();
388     nextmark_ = n_;
389   }
390 
mark()391   void mark() {
392     if (last_was_mark_)
393       return;
394     last_was_mark_ = false;
395     SparseSet::insert_new(nextmark_++);
396   }
397 
size()398   int size() {
399     return n_ + maxmark_;
400   }
401 
insert(int id)402   void insert(int id) {
403     if (contains(id))
404       return;
405     insert_new(id);
406   }
407 
insert_new(int id)408   void insert_new(int id) {
409     last_was_mark_ = false;
410     SparseSet::insert_new(id);
411   }
412 
413  private:
414   int n_;                // size excluding marks
415   int maxmark_;          // maximum number of marks
416   int nextmark_;         // id of next mark
417   bool last_was_mark_;   // last inserted was mark
418 
419   Workq(const Workq&) = delete;
420   Workq& operator=(const Workq&) = delete;
421 };
422 
DFA(Prog * prog,Prog::MatchKind kind,int64_t max_mem)423 DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
424   : prog_(prog),
425     kind_(kind),
426     init_failed_(false),
427     q0_(NULL),
428     q1_(NULL),
429     mem_budget_(max_mem) {
430   if (ExtraDebug)
431     fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str());
432   int nmark = 0;
433   if (kind_ == Prog::kLongestMatch)
434     nmark = prog_->size();
435   // See DFA::AddToQueue() for why this is so.
436   int nstack = prog_->inst_count(kInstCapture) +
437                prog_->inst_count(kInstEmptyWidth) +
438                prog_->inst_count(kInstNop) +
439                nmark + 1;  // + 1 for start inst
440 
441   // Account for space needed for DFA, q0, q1, stack.
442   mem_budget_ -= sizeof(DFA);
443   mem_budget_ -= (prog_->size() + nmark) *
444                  (sizeof(int)+sizeof(int)) * 2;  // q0, q1
445   mem_budget_ -= nstack * sizeof(int);  // stack
446   if (mem_budget_ < 0) {
447     init_failed_ = true;
448     return;
449   }
450 
451   state_budget_ = mem_budget_;
452 
453   // Make sure there is a reasonable amount of working room left.
454   // At minimum, the search requires room for two states in order
455   // to limp along, restarting frequently.  We'll get better performance
456   // if there is room for a larger number of states, say 20.
457   // Note that a state stores list heads only, so we use the program
458   // list count for the upper bound, not the program size.
459   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
460   int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
461                       (prog_->list_count()+nmark)*sizeof(int);
462   if (state_budget_ < 20*one_state) {
463     init_failed_ = true;
464     return;
465   }
466 
467   q0_ = new Workq(prog_->size(), nmark);
468   q1_ = new Workq(prog_->size(), nmark);
469   stack_ = PODArray<int>(nstack);
470 }
471 
~DFA()472 DFA::~DFA() {
473   delete q0_;
474   delete q1_;
475   ClearCache();
476 }
477 
478 // In the DFA state graph, s->next[c] == NULL means that the
479 // state has not yet been computed and needs to be.  We need
480 // a different special value to signal that s->next[c] is a
481 // state that can never lead to a match (and thus the search
482 // can be called off).  Hence DeadState.
483 #define DeadState reinterpret_cast<State*>(1)
484 
485 // Signals that the rest of the string matches no matter what it is.
486 #define FullMatchState reinterpret_cast<State*>(2)
487 
488 #define SpecialStateMax FullMatchState
489 
490 // Debugging printouts
491 
492 // For debugging, returns a string representation of the work queue.
DumpWorkq(Workq * q)493 std::string DFA::DumpWorkq(Workq* q) {
494   std::string s;
495   const char* sep = "";
496   for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
497     if (q->is_mark(*it)) {
498       s += "|";
499       sep = "";
500     } else {
501       s += StringPrintf("%s%d", sep, *it);
502       sep = ",";
503     }
504   }
505   return s;
506 }
507 
508 // For debugging, returns a string representation of the state.
DumpState(State * state)509 std::string DFA::DumpState(State* state) {
510   if (state == NULL)
511     return "_";
512   if (state == DeadState)
513     return "X";
514   if (state == FullMatchState)
515     return "*";
516   std::string s;
517   const char* sep = "";
518   s += StringPrintf("(%p)", state);
519   for (int i = 0; i < state->ninst_; i++) {
520     if (state->inst_[i] == Mark) {
521       s += "|";
522       sep = "";
523     } else if (state->inst_[i] == MatchSep) {
524       s += "||";
525       sep = "";
526     } else {
527       s += StringPrintf("%s%d", sep, state->inst_[i]);
528       sep = ",";
529     }
530   }
531   s += StringPrintf(" flag=%#x", state->flag_);
532   return s;
533 }
534 
535 //////////////////////////////////////////////////////////////////////
536 //
537 // DFA state graph construction.
538 //
539 // The DFA state graph is a heavily-linked collection of State* structures.
540 // The state_cache_ is a set of all the State structures ever allocated,
541 // so that if the same state is reached by two different paths,
542 // the same State structure can be used.  This reduces allocation
543 // requirements and also avoids duplication of effort across the two
544 // identical states.
545 //
546 // A State is defined by an ordered list of instruction ids and a flag word.
547 //
548 // The choice of an ordered list of instructions differs from a typical
549 // textbook DFA implementation, which would use an unordered set.
550 // Textbook descriptions, however, only care about whether
551 // the DFA matches, not where it matches in the text.  To decide where the
552 // DFA matches, we need to mimic the behavior of the dominant backtracking
553 // implementations like PCRE, which try one possible regular expression
554 // execution, then another, then another, stopping when one of them succeeds.
555 // The DFA execution tries these many executions in parallel, representing
556 // each by an instruction id.  These pointers are ordered in the State.inst_
557 // list in the same order that the executions would happen in a backtracking
558 // search: if a match is found during execution of inst_[2], inst_[i] for i>=3
559 // can be discarded.
560 //
561 // Textbooks also typically do not consider context-aware empty string operators
562 // like ^ or $.  These are handled by the flag word, which specifies the set
563 // of empty-string operators that should be matched when executing at the
564 // current text position.  These flag bits are defined in prog.h.
565 // The flag word also contains two DFA-specific bits: kFlagMatch if the state
566 // is a matching state (one that reached a kInstMatch in the program)
567 // and kFlagLastWord if the last processed byte was a word character, for the
568 // implementation of \B and \b.
569 //
570 // The flag word also contains, shifted up 16 bits, the bits looked for by
571 // any kInstEmptyWidth instructions in the state.  These provide a useful
572 // summary indicating when new flags might be useful.
573 //
574 // The permanent representation of a State's instruction ids is just an array,
575 // but while a state is being analyzed, these instruction ids are represented
576 // as a Workq, which is an array that allows iteration in insertion order.
577 
578 // NOTE(rsc): The choice of State construction determines whether the DFA
579 // mimics backtracking implementations (so-called leftmost first matching) or
580 // traditional DFA implementations (so-called leftmost longest matching as
581 // prescribed by POSIX).  This implementation chooses to mimic the
582 // backtracking implementations, because we want to replace PCRE.  To get
583 // POSIX behavior, the states would need to be considered not as a simple
584 // ordered list of instruction ids, but as a list of unordered sets of instruction
585 // ids.  A match by a state in one set would inhibit the running of sets
586 // farther down the list but not other instruction ids in the same set.  Each
587 // set would correspond to matches beginning at a given point in the string.
588 // This is implemented by separating different sets with Mark pointers.
589 
590 // Looks in the State cache for a State matching q, flag.
591 // If one is found, returns it.  If one is not found, allocates one,
592 // inserts it in the cache, and returns it.
593 // If mq is not null, MatchSep and the match IDs in mq will be appended
594 // to the State.
WorkqToCachedState(Workq * q,Workq * mq,uint32_t flag)595 DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
596   //mutex_.AssertHeld();
597 
598   // Construct array of instruction ids for the new state.
599   // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
600   // those are the only operators with any effect in
601   // RunWorkqOnEmptyString or RunWorkqOnByte.
602   PODArray<int> inst(q->size());
603   int n = 0;
604   uint32_t needflags = 0;  // flags needed by kInstEmptyWidth instructions
605   bool sawmatch = false;   // whether queue contains guaranteed kInstMatch
606   bool sawmark = false;    // whether queue contains a Mark
607   if (ExtraDebug)
608     fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
609   for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
610     int id = *it;
611     if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
612       break;
613     if (q->is_mark(id)) {
614       if (n > 0 && inst[n-1] != Mark) {
615         sawmark = true;
616         inst[n++] = Mark;
617       }
618       continue;
619     }
620     Prog::Inst* ip = prog_->inst(id);
621     switch (ip->opcode()) {
622       case kInstAltMatch:
623         // This state will continue to a match no matter what
624         // the rest of the input is.  If it is the highest priority match
625         // being considered, return the special FullMatchState
626         // to indicate that it's all matches from here out.
627         if (kind_ != Prog::kManyMatch &&
628             (kind_ != Prog::kFirstMatch ||
629              (it == q->begin() && ip->greedy(prog_))) &&
630             (kind_ != Prog::kLongestMatch || !sawmark) &&
631             (flag & kFlagMatch)) {
632           if (ExtraDebug)
633             fprintf(stderr, " -> FullMatchState\n");
634           return FullMatchState;
635         }
636         FALLTHROUGH_INTENDED;
637       default:
638         // Record iff id is the head of its list, which must
639         // be the case if id-1 is the last of *its* list. :)
640         if (prog_->inst(id-1)->last())
641           inst[n++] = *it;
642         if (ip->opcode() == kInstEmptyWidth)
643           needflags |= ip->empty();
644         if (ip->opcode() == kInstMatch && !prog_->anchor_end())
645           sawmatch = true;
646         break;
647     }
648   }
649   DCHECK_LE(n, q->size());
650   if (n > 0 && inst[n-1] == Mark)
651     n--;
652 
653   // If there are no empty-width instructions waiting to execute,
654   // then the extra flag bits will not be used, so there is no
655   // point in saving them.  (Discarding them reduces the number
656   // of distinct states.)
657   if (needflags == 0)
658     flag &= kFlagMatch;
659 
660   // NOTE(rsc): The code above cannot do flag &= needflags,
661   // because if the right flags were present to pass the current
662   // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
663   // might be reached that in turn need different flags.
664   // The only sure thing is that if there are no kInstEmptyWidth
665   // instructions at all, no flags will be needed.
666   // We could do the extra work to figure out the full set of
667   // possibly needed flags by exploring past the kInstEmptyWidth
668   // instructions, but the check above -- are any flags needed
669   // at all? -- handles the most common case.  More fine-grained
670   // analysis can only be justified by measurements showing that
671   // too many redundant states are being allocated.
672 
673   // If there are no Insts in the list, it's a dead state,
674   // which is useful to signal with a special pointer so that
675   // the execution loop can stop early.  This is only okay
676   // if the state is *not* a matching state.
677   if (n == 0 && flag == 0) {
678     if (ExtraDebug)
679       fprintf(stderr, " -> DeadState\n");
680     return DeadState;
681   }
682 
683   // If we're in longest match mode, the state is a sequence of
684   // unordered state sets separated by Marks.  Sort each set
685   // to canonicalize, to reduce the number of distinct sets stored.
686   if (kind_ == Prog::kLongestMatch) {
687     int* ip = inst.data();
688     int* ep = ip + n;
689     while (ip < ep) {
690       int* markp = ip;
691       while (markp < ep && *markp != Mark)
692         markp++;
693       std::sort(ip, markp);
694       if (markp < ep)
695         markp++;
696       ip = markp;
697     }
698   }
699 
700   // If we're in many match mode, canonicalize for similar reasons:
701   // we have an unordered set of states (i.e. we don't have Marks)
702   // and sorting will reduce the number of distinct sets stored.
703   if (kind_ == Prog::kManyMatch) {
704     int* ip = inst.data();
705     int* ep = ip + n;
706     std::sort(ip, ep);
707   }
708 
709   // Append MatchSep and the match IDs in mq if necessary.
710   if (mq != NULL) {
711     inst[n++] = MatchSep;
712     for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) {
713       int id = *i;
714       Prog::Inst* ip = prog_->inst(id);
715       if (ip->opcode() == kInstMatch)
716         inst[n++] = ip->match_id();
717     }
718   }
719 
720   // Save the needed empty-width flags in the top bits for use later.
721   flag |= needflags << kFlagNeedShift;
722 
723   State* state = CachedState(inst.data(), n, flag);
724   return state;
725 }
726 
727 // Looks in the State cache for a State matching inst, ninst, flag.
728 // If one is found, returns it.  If one is not found, allocates one,
729 // inserts it in the cache, and returns it.
CachedState(int * inst,int ninst,uint32_t flag)730 DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
731   //mutex_.AssertHeld();
732 
733   // Look in the cache for a pre-existing state.
734   // We have to initialise the struct like this because otherwise
735   // MSVC will complain about the flexible array member. :(
736   State state;
737   state.inst_ = inst;
738   state.ninst_ = ninst;
739   state.flag_ = flag;
740   StateSet::iterator it = state_cache_.find(&state);
741   if (it != state_cache_.end()) {
742     if (ExtraDebug)
743       fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
744     return *it;
745   }
746 
747   // Must have enough memory for new state.
748   // In addition to what we're going to allocate,
749   // the state cache hash table seems to incur about 40 bytes per
750   // State*, empirically.
751   const int kStateCacheOverhead = 40;
752   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
753   int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
754             ninst*sizeof(int);
755   if (mem_budget_ < mem + kStateCacheOverhead) {
756     mem_budget_ = -1;
757     return NULL;
758   }
759   mem_budget_ -= mem + kStateCacheOverhead;
760 
761   // Allocate new state along with room for next_ and inst_.
762   char* space = std::allocator<char>().allocate(mem);
763   State* s = new (space) State;
764   (void) new (s->next_) std::atomic<State*>[nnext];
765   // Work around a unfortunate bug in older versions of libstdc++.
766   // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
767   for (int i = 0; i < nnext; i++)
768     (void) new (s->next_ + i) std::atomic<State*>(NULL);
769   s->inst_ = new (s->next_ + nnext) int[ninst];
770   memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
771   s->ninst_ = ninst;
772   s->flag_ = flag;
773   if (ExtraDebug)
774     fprintf(stderr, " -> %s\n", DumpState(s).c_str());
775 
776   // Put state in cache and return it.
777   state_cache_.insert(s);
778   return s;
779 }
780 
781 // Clear the cache.  Must hold cache_mutex_.w or be in destructor.
ClearCache()782 void DFA::ClearCache() {
783   StateSet::iterator begin = state_cache_.begin();
784   StateSet::iterator end = state_cache_.end();
785   while (begin != end) {
786     StateSet::iterator tmp = begin;
787     ++begin;
788     // Deallocate the blob of memory that we allocated in DFA::CachedState().
789     // We recompute mem in order to benefit from sized delete where possible.
790     int ninst = (*tmp)->ninst_;
791     int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
792     int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
793               ninst*sizeof(int);
794     std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem);
795   }
796   state_cache_.clear();
797 }
798 
799 // Copies insts in state s to the work queue q.
StateToWorkq(State * s,Workq * q)800 void DFA::StateToWorkq(State* s, Workq* q) {
801   q->clear();
802   for (int i = 0; i < s->ninst_; i++) {
803     if (s->inst_[i] == Mark) {
804       q->mark();
805     } else if (s->inst_[i] == MatchSep) {
806       // Nothing after this is an instruction!
807       break;
808     } else {
809       // Explore from the head of the list.
810       AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
811     }
812   }
813 }
814 
815 // Adds ip to the work queue, following empty arrows according to flag.
AddToQueue(Workq * q,int id,uint32_t flag)816 void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
817 
818   // Use stack_ to hold our stack of instructions yet to process.
819   // It was preallocated as follows:
820   //   one entry per Capture;
821   //   one entry per EmptyWidth; and
822   //   one entry per Nop.
823   // This reflects the maximum number of stack pushes that each can
824   // perform. (Each instruction can be processed at most once.)
825   // When using marks, we also added nmark == prog_->size().
826   // (Otherwise, nmark == 0.)
827   int* stk = stack_.data();
828   int nstk = 0;
829 
830   stk[nstk++] = id;
831   while (nstk > 0) {
832     DCHECK_LE(nstk, stack_.size());
833     id = stk[--nstk];
834 
835   Loop:
836     if (id == Mark) {
837       q->mark();
838       continue;
839     }
840 
841     if (id == 0)
842       continue;
843 
844     // If ip is already on the queue, nothing to do.
845     // Otherwise add it.  We don't actually keep all the
846     // ones that get added, but adding all of them here
847     // increases the likelihood of q->contains(id),
848     // reducing the amount of duplicated work.
849     if (q->contains(id))
850       continue;
851     q->insert_new(id);
852 
853     // Process instruction.
854     Prog::Inst* ip = prog_->inst(id);
855     switch (ip->opcode()) {
856       default:
857         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
858         break;
859 
860       case kInstByteRange:  // just save these on the queue
861       case kInstMatch:
862         if (ip->last())
863           break;
864         id = id+1;
865         goto Loop;
866 
867       case kInstCapture:    // DFA treats captures as no-ops.
868       case kInstNop:
869         if (!ip->last())
870           stk[nstk++] = id+1;
871 
872         // If this instruction is the [00-FF]* loop at the beginning of
873         // a leftmost-longest unanchored search, separate with a Mark so
874         // that future threads (which will start farther to the right in
875         // the input string) are lower priority than current threads.
876         if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
877             id == prog_->start_unanchored() && id != prog_->start())
878           stk[nstk++] = Mark;
879         id = ip->out();
880         goto Loop;
881 
882       case kInstAltMatch:
883         DCHECK(!ip->last());
884         id = id+1;
885         goto Loop;
886 
887       case kInstEmptyWidth:
888         if (!ip->last())
889           stk[nstk++] = id+1;
890 
891         // Continue on if we have all the right flag bits.
892         if (ip->empty() & ~flag)
893           break;
894         id = ip->out();
895         goto Loop;
896     }
897   }
898 }
899 
900 // Running of work queues.  In the work queue, order matters:
901 // the queue is sorted in priority order.  If instruction i comes before j,
902 // then the instructions that i produces during the run must come before
903 // the ones that j produces.  In order to keep this invariant, all the
904 // work queue runners have to take an old queue to process and then
905 // also a new queue to fill in.  It's not acceptable to add to the end of
906 // an existing queue, because new instructions will not end up in the
907 // correct position.
908 
909 // Runs the work queue, processing the empty strings indicated by flag.
910 // For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
911 // both ^ and $.  It is important that callers pass all flags at once:
912 // processing both ^ and $ is not the same as first processing only ^
913 // and then processing only $.  Doing the two-step sequence won't match
914 // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
915 // exhibited by existing implementations).
RunWorkqOnEmptyString(Workq * oldq,Workq * newq,uint32_t flag)916 void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
917   newq->clear();
918   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
919     if (oldq->is_mark(*i))
920       AddToQueue(newq, Mark, flag);
921     else
922       AddToQueue(newq, *i, flag);
923   }
924 }
925 
926 // Runs the work queue, processing the single byte c followed by any empty
927 // strings indicated by flag.  For example, c == 'a' and flag == kEmptyEndLine,
928 // means to match c$.  Sets the bool *ismatch to true if the end of the
929 // regular expression program has been reached (the regexp has matched).
RunWorkqOnByte(Workq * oldq,Workq * newq,int c,uint32_t flag,bool * ismatch)930 void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
931                          int c, uint32_t flag, bool* ismatch) {
932   //mutex_.AssertHeld();
933 
934   newq->clear();
935   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
936     if (oldq->is_mark(*i)) {
937       if (*ismatch)
938         return;
939       newq->mark();
940       continue;
941     }
942     int id = *i;
943     Prog::Inst* ip = prog_->inst(id);
944     switch (ip->opcode()) {
945       default:
946         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
947         break;
948 
949       case kInstFail:        // never succeeds
950       case kInstCapture:     // already followed
951       case kInstNop:         // already followed
952       case kInstAltMatch:    // already followed
953       case kInstEmptyWidth:  // already followed
954         break;
955 
956       case kInstByteRange:   // can follow if c is in range
957         if (!ip->Matches(c))
958           break;
959         AddToQueue(newq, ip->out(), flag);
960         if (ip->hint() != 0) {
961           // We have a hint, but we must cancel out the
962           // increment that will occur after the break.
963           i += ip->hint() - 1;
964         } else {
965           // We have no hint, so we must find the end
966           // of the current list and then skip to it.
967           Prog::Inst* ip0 = ip;
968           while (!ip->last())
969             ++ip;
970           i += ip - ip0;
971         }
972         break;
973 
974       case kInstMatch:
975         if (prog_->anchor_end() && c != kByteEndText &&
976             kind_ != Prog::kManyMatch)
977           break;
978         *ismatch = true;
979         if (kind_ == Prog::kFirstMatch) {
980           // Can stop processing work queue since we found a match.
981           return;
982         }
983         break;
984     }
985   }
986 
987   if (ExtraDebug)
988     fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n",
989             DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch);
990 }
991 
992 // Processes input byte c in state, returning new state.
993 // Caller does not hold mutex.
RunStateOnByteUnlocked(State * state,int c)994 DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
995   // Keep only one RunStateOnByte going
996   // even if the DFA is being run by multiple threads.
997   MutexLock l(&mutex_);
998   return RunStateOnByte(state, c);
999 }
1000 
1001 // Processes input byte c in state, returning new state.
RunStateOnByte(State * state,int c)1002 DFA::State* DFA::RunStateOnByte(State* state, int c) {
1003   //mutex_.AssertHeld();
1004 
1005   if (state <= SpecialStateMax) {
1006     if (state == FullMatchState) {
1007       // It is convenient for routines like PossibleMatchRange
1008       // if we implement RunStateOnByte for FullMatchState:
1009       // once you get into this state you never get out,
1010       // so it's pretty easy.
1011       return FullMatchState;
1012     }
1013     if (state == DeadState) {
1014       LOG(DFATAL) << "DeadState in RunStateOnByte";
1015       return NULL;
1016     }
1017     if (state == NULL) {
1018       LOG(DFATAL) << "NULL state in RunStateOnByte";
1019       return NULL;
1020     }
1021     LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
1022     return NULL;
1023   }
1024 
1025   // If someone else already computed this, return it.
1026   State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed);
1027   if (ns != NULL)
1028     return ns;
1029 
1030   // Convert state into Workq.
1031   StateToWorkq(state, q0_);
1032 
1033   // Flags marking the kinds of empty-width things (^ $ etc)
1034   // around this byte.  Before the byte we have the flags recorded
1035   // in the State structure itself.  After the byte we have
1036   // nothing yet (but that will change: read on).
1037   uint32_t needflag = state->flag_ >> kFlagNeedShift;
1038   uint32_t beforeflag = state->flag_ & kFlagEmptyMask;
1039   uint32_t oldbeforeflag = beforeflag;
1040   uint32_t afterflag = 0;
1041 
1042   if (c == '\n') {
1043     // Insert implicit $ and ^ around \n
1044     beforeflag |= kEmptyEndLine;
1045     afterflag |= kEmptyBeginLine;
1046   }
1047 
1048   if (c == kByteEndText) {
1049     // Insert implicit $ and \z before the fake "end text" byte.
1050     beforeflag |= kEmptyEndLine | kEmptyEndText;
1051   }
1052 
1053   // The state flag kFlagLastWord says whether the last
1054   // byte processed was a word character.  Use that info to
1055   // insert empty-width (non-)word boundaries.
1056   bool islastword = (state->flag_ & kFlagLastWord) != 0;
1057   bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c));
1058   if (isword == islastword)
1059     beforeflag |= kEmptyNonWordBoundary;
1060   else
1061     beforeflag |= kEmptyWordBoundary;
1062 
1063   // Okay, finally ready to run.
1064   // Only useful to rerun on empty string if there are new, useful flags.
1065   if (beforeflag & ~oldbeforeflag & needflag) {
1066     RunWorkqOnEmptyString(q0_, q1_, beforeflag);
1067     using std::swap;
1068     swap(q0_, q1_);
1069   }
1070   bool ismatch = false;
1071   RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
1072   using std::swap;
1073   swap(q0_, q1_);
1074 
1075   // Save afterflag along with ismatch and isword in new state.
1076   uint32_t flag = afterflag;
1077   if (ismatch)
1078     flag |= kFlagMatch;
1079   if (isword)
1080     flag |= kFlagLastWord;
1081 
1082   if (ismatch && kind_ == Prog::kManyMatch)
1083     ns = WorkqToCachedState(q0_, q1_, flag);
1084   else
1085     ns = WorkqToCachedState(q0_, NULL, flag);
1086 
1087   // Flush ns before linking to it.
1088   // Write barrier before updating state->next_ so that the
1089   // main search loop can proceed without any locking, for speed.
1090   // (Otherwise it would need one mutex operation per input byte.)
1091   state->next_[ByteMap(c)].store(ns, std::memory_order_release);
1092   return ns;
1093 }
1094 
1095 
1096 //////////////////////////////////////////////////////////////////////
1097 // DFA cache reset.
1098 
1099 // Reader-writer lock helper.
1100 //
1101 // The DFA uses a reader-writer mutex to protect the state graph itself.
1102 // Traversing the state graph requires holding the mutex for reading,
1103 // and discarding the state graph and starting over requires holding the
1104 // lock for writing.  If a search needs to expand the graph but is out
1105 // of memory, it will need to drop its read lock and then acquire the
1106 // write lock.  Since it cannot then atomically downgrade from write lock
1107 // to read lock, it runs the rest of the search holding the write lock.
1108 // (This probably helps avoid repeated contention, but really the decision
1109 // is forced by the Mutex interface.)  It's a bit complicated to keep
1110 // track of whether the lock is held for reading or writing and thread
1111 // that through the search, so instead we encapsulate it in the RWLocker
1112 // and pass that around.
1113 
1114 class DFA::RWLocker {
1115  public:
1116   explicit RWLocker(CacheMutex* mu);
1117   ~RWLocker();
1118 
1119   // If the lock is only held for reading right now,
1120   // drop the read lock and re-acquire for writing.
1121   // Subsequent calls to LockForWriting are no-ops.
1122   // Notice that the lock is *released* temporarily.
1123   void LockForWriting();
1124 
1125  private:
1126   CacheMutex* mu_;
1127   bool writing_;
1128 
1129   RWLocker(const RWLocker&) = delete;
1130   RWLocker& operator=(const RWLocker&) = delete;
1131 };
1132 
RWLocker(CacheMutex * mu)1133 DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
1134   mu_->ReaderLock();
1135 }
1136 
1137 // This function is marked as NO_THREAD_SAFETY_ANALYSIS because
1138 // the annotations don't support lock upgrade.
LockForWriting()1139 void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
1140   if (!writing_) {
1141     mu_->ReaderUnlock();
1142     mu_->WriterLock();
1143     writing_ = true;
1144   }
1145 }
1146 
~RWLocker()1147 DFA::RWLocker::~RWLocker() {
1148   if (!writing_)
1149     mu_->ReaderUnlock();
1150   else
1151     mu_->WriterUnlock();
1152 }
1153 
1154 
1155 // When the DFA's State cache fills, we discard all the states in the
1156 // cache and start over.  Many threads can be using and adding to the
1157 // cache at the same time, so we synchronize using the cache_mutex_
1158 // to keep from stepping on other threads.  Specifically, all the
1159 // threads using the current cache hold cache_mutex_ for reading.
1160 // When a thread decides to flush the cache, it drops cache_mutex_
1161 // and then re-acquires it for writing.  That ensures there are no
1162 // other threads accessing the cache anymore.  The rest of the search
1163 // runs holding cache_mutex_ for writing, avoiding any contention
1164 // with or cache pollution caused by other threads.
1165 
ResetCache(RWLocker * cache_lock)1166 void DFA::ResetCache(RWLocker* cache_lock) {
1167   // Re-acquire the cache_mutex_ for writing (exclusive use).
1168   cache_lock->LockForWriting();
1169 
1170   hooks::GetDFAStateCacheResetHook()({
1171       state_budget_,
1172       state_cache_.size(),
1173   });
1174 
1175   // Clear the cache, reset the memory budget.
1176   for (int i = 0; i < kMaxStart; i++)
1177     start_[i].start.store(NULL, std::memory_order_relaxed);
1178   ClearCache();
1179   mem_budget_ = state_budget_;
1180 }
1181 
1182 // Typically, a couple States do need to be preserved across a cache
1183 // reset, like the State at the current point in the search.
1184 // The StateSaver class helps keep States across cache resets.
1185 // It makes a copy of the state's guts outside the cache (before the reset)
1186 // and then can be asked, after the reset, to recreate the State
1187 // in the new cache.  For example, in a DFA method ("this" is a DFA):
1188 //
1189 //   StateSaver saver(this, s);
1190 //   ResetCache(cache_lock);
1191 //   s = saver.Restore();
1192 //
1193 // The saver should always have room in the cache to re-create the state,
1194 // because resetting the cache locks out all other threads, and the cache
1195 // is known to have room for at least a couple states (otherwise the DFA
1196 // constructor fails).
1197 
1198 class DFA::StateSaver {
1199  public:
1200   explicit StateSaver(DFA* dfa, State* state);
1201   ~StateSaver();
1202 
1203   // Recreates and returns a state equivalent to the
1204   // original state passed to the constructor.
1205   // Returns NULL if the cache has filled, but
1206   // since the DFA guarantees to have room in the cache
1207   // for a couple states, should never return NULL
1208   // if used right after ResetCache.
1209   State* Restore();
1210 
1211  private:
1212   DFA* dfa_;         // the DFA to use
1213   int* inst_;        // saved info from State
1214   int ninst_;
1215   uint32_t flag_;
1216   bool is_special_;  // whether original state was special
1217   State* special_;   // if is_special_, the original state
1218 
1219   StateSaver(const StateSaver&) = delete;
1220   StateSaver& operator=(const StateSaver&) = delete;
1221 };
1222 
StateSaver(DFA * dfa,State * state)1223 DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
1224   dfa_ = dfa;
1225   if (state <= SpecialStateMax) {
1226     inst_ = NULL;
1227     ninst_ = 0;
1228     flag_ = 0;
1229     is_special_ = true;
1230     special_ = state;
1231     return;
1232   }
1233   is_special_ = false;
1234   special_ = NULL;
1235   flag_ = state->flag_;
1236   ninst_ = state->ninst_;
1237   inst_ = new int[ninst_];
1238   memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
1239 }
1240 
~StateSaver()1241 DFA::StateSaver::~StateSaver() {
1242   if (!is_special_)
1243     delete[] inst_;
1244 }
1245 
Restore()1246 DFA::State* DFA::StateSaver::Restore() {
1247   if (is_special_)
1248     return special_;
1249   MutexLock l(&dfa_->mutex_);
1250   State* s = dfa_->CachedState(inst_, ninst_, flag_);
1251   if (s == NULL)
1252     LOG(DFATAL) << "StateSaver failed to restore state.";
1253   return s;
1254 }
1255 
1256 
1257 //////////////////////////////////////////////////////////////////////
1258 //
1259 // DFA execution.
1260 //
1261 // The basic search loop is easy: start in a state s and then for each
1262 // byte c in the input, s = s->next[c].
1263 //
1264 // This simple description omits a few efficiency-driven complications.
1265 //
1266 // First, the State graph is constructed incrementally: it is possible
1267 // that s->next[c] is null, indicating that that state has not been
1268 // fully explored.  In this case, RunStateOnByte must be invoked to
1269 // determine the next state, which is cached in s->next[c] to save
1270 // future effort.  An alternative reason for s->next[c] to be null is
1271 // that the DFA has reached a so-called "dead state", in which any match
1272 // is no longer possible.  In this case RunStateOnByte will return NULL
1273 // and the processing of the string can stop early.
1274 //
1275 // Second, a 256-element pointer array for s->next_ makes each State
1276 // quite large (2kB on 64-bit machines).  Instead, dfa->bytemap_[]
1277 // maps from bytes to "byte classes" and then next_ only needs to have
1278 // as many pointers as there are byte classes.  A byte class is simply a
1279 // range of bytes that the regexp never distinguishes between.
1280 // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
1281 // 'a', 'b' to 'c', and 'c' to 0xFF.  The bytemap slows us a little bit
1282 // but in exchange we typically cut the size of a State (and thus our
1283 // memory footprint) by about 5-10x.  The comments still refer to
1284 // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
1285 //
1286 // Third, it is common for a DFA for an unanchored match to begin in a
1287 // state in which only one particular byte value can take the DFA to a
1288 // different state.  That is, s->next[c] != s for only one c.  In this
1289 // situation, the DFA can do better than executing the simple loop.
1290 // Instead, it can call memchr to search very quickly for the byte c.
1291 // Whether the start state has this property is determined during a
1292 // pre-compilation pass and the "can_prefix_accel" argument is set.
1293 //
1294 // Fourth, the desired behavior is to search for the leftmost-best match
1295 // (approximately, the same one that Perl would find), which is not
1296 // necessarily the match ending earliest in the string.  Each time a
1297 // match is found, it must be noted, but the DFA must continue on in
1298 // hope of finding a higher-priority match.  In some cases, the caller only
1299 // cares whether there is any match at all, not which one is found.
1300 // The "want_earliest_match" flag causes the search to stop at the first
1301 // match found.
1302 //
1303 // Fifth, one algorithm that uses the DFA needs it to run over the
1304 // input string backward, beginning at the end and ending at the beginning.
1305 // Passing false for the "run_forward" flag causes the DFA to run backward.
1306 //
1307 // The checks for these last three cases, which in a naive implementation
1308 // would be performed once per input byte, slow the general loop enough
1309 // to merit specialized versions of the search loop for each of the
1310 // eight possible settings of the three booleans.  Rather than write
1311 // eight different functions, we write one general implementation and then
1312 // inline it to create the specialized ones.
1313 //
1314 // Note that matches are delayed by one byte, to make it easier to
1315 // accomodate match conditions depending on the next input byte (like $ and \b).
1316 // When s->next[c]->IsMatch(), it means that there is a match ending just
1317 // *before* byte c.
1318 
1319 // The generic search loop.  Searches text for a match, returning
1320 // the pointer to the end of the chosen match, or NULL if no match.
1321 // The bools are equal to the same-named variables in params, but
1322 // making them function arguments lets the inliner specialize
1323 // this function to each combination (see two paragraphs above).
1324 template <bool can_prefix_accel,
1325           bool want_earliest_match,
1326           bool run_forward>
InlinedSearchLoop(SearchParams * params)1327 inline bool DFA::InlinedSearchLoop(SearchParams* params) {
1328   State* start = params->start;
1329   const uint8_t* bp = BytePtr(params->text.data());  // start of text
1330   const uint8_t* p = bp;                             // text scanning point
1331   const uint8_t* ep = BytePtr(params->text.data() +
1332                               params->text.size());  // end of text
1333   const uint8_t* resetp = NULL;                      // p at last cache reset
1334   if (!run_forward) {
1335     using std::swap;
1336     swap(p, ep);
1337   }
1338 
1339   const uint8_t* bytemap = prog_->bytemap();
1340   const uint8_t* lastmatch = NULL;   // most recent matching position in text
1341   bool matched = false;
1342 
1343   State* s = start;
1344   if (ExtraDebug)
1345     fprintf(stderr, "@stx: %s\n", DumpState(s).c_str());
1346 
1347   if (s->IsMatch()) {
1348     matched = true;
1349     lastmatch = p;
1350     if (ExtraDebug)
1351       fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str());
1352     if (params->matches != NULL && kind_ == Prog::kManyMatch) {
1353       for (int i = s->ninst_ - 1; i >= 0; i--) {
1354         int id = s->inst_[i];
1355         if (id == MatchSep)
1356           break;
1357         params->matches->insert(id);
1358       }
1359     }
1360     if (want_earliest_match) {
1361       params->ep = reinterpret_cast<const char*>(lastmatch);
1362       return true;
1363     }
1364   }
1365 
1366   while (p != ep) {
1367     if (ExtraDebug)
1368       fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
1369 
1370     if (can_prefix_accel && s == start) {
1371       // In start state, only way out is to find the prefix,
1372       // so we use prefix accel (e.g. memchr) to skip ahead.
1373       // If not found, we can skip to the end of the string.
1374       p = BytePtr(prog_->PrefixAccel(p, ep - p));
1375       if (p == NULL) {
1376         p = ep;
1377         break;
1378       }
1379     }
1380 
1381     int c;
1382     if (run_forward)
1383       c = *p++;
1384     else
1385       c = *--p;
1386 
1387     // Note that multiple threads might be consulting
1388     // s->next_[bytemap[c]] simultaneously.
1389     // RunStateOnByte takes care of the appropriate locking,
1390     // including a memory barrier so that the unlocked access
1391     // (sometimes known as "double-checked locking") is safe.
1392     // The alternative would be either one DFA per thread
1393     // or one mutex operation per input byte.
1394     //
1395     // ns == DeadState means the state is known to be dead
1396     // (no more matches are possible).
1397     // ns == NULL means the state has not yet been computed
1398     // (need to call RunStateOnByteUnlocked).
1399     // RunStateOnByte returns ns == NULL if it is out of memory.
1400     // ns == FullMatchState means the rest of the string matches.
1401     //
1402     // Okay to use bytemap[] not ByteMap() here, because
1403     // c is known to be an actual byte and not kByteEndText.
1404 
1405     State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire);
1406     if (ns == NULL) {
1407       ns = RunStateOnByteUnlocked(s, c);
1408       if (ns == NULL) {
1409         // After we reset the cache, we hold cache_mutex exclusively,
1410         // so if resetp != NULL, it means we filled the DFA state
1411         // cache with this search alone (without any other threads).
1412         // Benchmarks show that doing a state computation on every
1413         // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
1414         // same at about 2 MB/s.  Unless we're processing an average
1415         // of 10 bytes per state computation, fail so that RE2 can
1416         // fall back to the NFA.  However, RE2::Set cannot fall back,
1417         // so we just have to keep on keeping on in that case.
1418         if (dfa_should_bail_when_slow && resetp != NULL &&
1419             static_cast<size_t>(p - resetp) < 10*state_cache_.size() &&
1420             kind_ != Prog::kManyMatch) {
1421           params->failed = true;
1422           return false;
1423         }
1424         resetp = p;
1425 
1426         // Prepare to save start and s across the reset.
1427         StateSaver save_start(this, start);
1428         StateSaver save_s(this, s);
1429 
1430         // Discard all the States in the cache.
1431         ResetCache(params->cache_lock);
1432 
1433         // Restore start and s so we can continue.
1434         if ((start = save_start.Restore()) == NULL ||
1435             (s = save_s.Restore()) == NULL) {
1436           // Restore already did LOG(DFATAL).
1437           params->failed = true;
1438           return false;
1439         }
1440         ns = RunStateOnByteUnlocked(s, c);
1441         if (ns == NULL) {
1442           LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
1443           params->failed = true;
1444           return false;
1445         }
1446       }
1447     }
1448     if (ns <= SpecialStateMax) {
1449       if (ns == DeadState) {
1450         params->ep = reinterpret_cast<const char*>(lastmatch);
1451         return matched;
1452       }
1453       // FullMatchState
1454       params->ep = reinterpret_cast<const char*>(ep);
1455       return true;
1456     }
1457 
1458     s = ns;
1459     if (s->IsMatch()) {
1460       matched = true;
1461       // The DFA notices the match one byte late,
1462       // so adjust p before using it in the match.
1463       if (run_forward)
1464         lastmatch = p - 1;
1465       else
1466         lastmatch = p + 1;
1467       if (ExtraDebug)
1468         fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str());
1469       if (params->matches != NULL && kind_ == Prog::kManyMatch) {
1470         for (int i = s->ninst_ - 1; i >= 0; i--) {
1471           int id = s->inst_[i];
1472           if (id == MatchSep)
1473             break;
1474           params->matches->insert(id);
1475         }
1476       }
1477       if (want_earliest_match) {
1478         params->ep = reinterpret_cast<const char*>(lastmatch);
1479         return true;
1480       }
1481     }
1482   }
1483 
1484   // Process one more byte to see if it triggers a match.
1485   // (Remember, matches are delayed one byte.)
1486   if (ExtraDebug)
1487     fprintf(stderr, "@etx: %s\n", DumpState(s).c_str());
1488 
1489   int lastbyte;
1490   if (run_forward) {
1491     if (params->text.end() == params->context.end())
1492       lastbyte = kByteEndText;
1493     else
1494       lastbyte = params->text.end()[0] & 0xFF;
1495   } else {
1496     if (params->text.begin() == params->context.begin())
1497       lastbyte = kByteEndText;
1498     else
1499       lastbyte = params->text.begin()[-1] & 0xFF;
1500   }
1501 
1502   State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
1503   if (ns == NULL) {
1504     ns = RunStateOnByteUnlocked(s, lastbyte);
1505     if (ns == NULL) {
1506       StateSaver save_s(this, s);
1507       ResetCache(params->cache_lock);
1508       if ((s = save_s.Restore()) == NULL) {
1509         params->failed = true;
1510         return false;
1511       }
1512       ns = RunStateOnByteUnlocked(s, lastbyte);
1513       if (ns == NULL) {
1514         LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
1515         params->failed = true;
1516         return false;
1517       }
1518     }
1519   }
1520   if (ns <= SpecialStateMax) {
1521     if (ns == DeadState) {
1522       params->ep = reinterpret_cast<const char*>(lastmatch);
1523       return matched;
1524     }
1525     // FullMatchState
1526     params->ep = reinterpret_cast<const char*>(ep);
1527     return true;
1528   }
1529 
1530   s = ns;
1531   if (s->IsMatch()) {
1532     matched = true;
1533     lastmatch = p;
1534     if (ExtraDebug)
1535       fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str());
1536     if (params->matches != NULL && kind_ == Prog::kManyMatch) {
1537       for (int i = s->ninst_ - 1; i >= 0; i--) {
1538         int id = s->inst_[i];
1539         if (id == MatchSep)
1540           break;
1541         params->matches->insert(id);
1542       }
1543     }
1544   }
1545 
1546   params->ep = reinterpret_cast<const char*>(lastmatch);
1547   return matched;
1548 }
1549 
1550 // Inline specializations of the general loop.
SearchFFF(SearchParams * params)1551 bool DFA::SearchFFF(SearchParams* params) {
1552   return InlinedSearchLoop<false, false, false>(params);
1553 }
SearchFFT(SearchParams * params)1554 bool DFA::SearchFFT(SearchParams* params) {
1555   return InlinedSearchLoop<false, false, true>(params);
1556 }
SearchFTF(SearchParams * params)1557 bool DFA::SearchFTF(SearchParams* params) {
1558   return InlinedSearchLoop<false, true, false>(params);
1559 }
SearchFTT(SearchParams * params)1560 bool DFA::SearchFTT(SearchParams* params) {
1561   return InlinedSearchLoop<false, true, true>(params);
1562 }
SearchTFF(SearchParams * params)1563 bool DFA::SearchTFF(SearchParams* params) {
1564   return InlinedSearchLoop<true, false, false>(params);
1565 }
SearchTFT(SearchParams * params)1566 bool DFA::SearchTFT(SearchParams* params) {
1567   return InlinedSearchLoop<true, false, true>(params);
1568 }
SearchTTF(SearchParams * params)1569 bool DFA::SearchTTF(SearchParams* params) {
1570   return InlinedSearchLoop<true, true, false>(params);
1571 }
SearchTTT(SearchParams * params)1572 bool DFA::SearchTTT(SearchParams* params) {
1573   return InlinedSearchLoop<true, true, true>(params);
1574 }
1575 
1576 // For performance, calls the appropriate specialized version
1577 // of InlinedSearchLoop.
FastSearchLoop(SearchParams * params)1578 bool DFA::FastSearchLoop(SearchParams* params) {
1579   // Because the methods are private, the Searches array
1580   // cannot be declared at top level.
1581   static bool (DFA::*Searches[])(SearchParams*) = {
1582     &DFA::SearchFFF,
1583     &DFA::SearchFFT,
1584     &DFA::SearchFTF,
1585     &DFA::SearchFTT,
1586     &DFA::SearchTFF,
1587     &DFA::SearchTFT,
1588     &DFA::SearchTTF,
1589     &DFA::SearchTTT,
1590   };
1591 
1592   int index = 4 * params->can_prefix_accel +
1593               2 * params->want_earliest_match +
1594               1 * params->run_forward;
1595   return (this->*Searches[index])(params);
1596 }
1597 
1598 
1599 // The discussion of DFA execution above ignored the question of how
1600 // to determine the initial state for the search loop.  There are two
1601 // factors that influence the choice of start state.
1602 //
1603 // The first factor is whether the search is anchored or not.
1604 // The regexp program (Prog*) itself has
1605 // two different entry points: one for anchored searches and one for
1606 // unanchored searches.  (The unanchored version starts with a leading ".*?"
1607 // and then jumps to the anchored one.)
1608 //
1609 // The second factor is where text appears in the larger context, which
1610 // determines which empty-string operators can be matched at the beginning
1611 // of execution.  If text is at the very beginning of context, \A and ^ match.
1612 // Otherwise if text is at the beginning of a line, then ^ matches.
1613 // Otherwise it matters whether the character before text is a word character
1614 // or a non-word character.
1615 //
1616 // The two cases (unanchored vs not) and four cases (empty-string flags)
1617 // combine to make the eight cases recorded in the DFA's begin_text_[2],
1618 // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
1619 // StartInfos.  The start state for each is filled in the first time it
1620 // is used for an actual search.
1621 
1622 // Examines text, context, and anchored to determine the right start
1623 // state for the DFA search loop.  Fills in params and returns true on success.
1624 // Returns false on failure.
AnalyzeSearch(SearchParams * params)1625 bool DFA::AnalyzeSearch(SearchParams* params) {
1626   const StringPiece& text = params->text;
1627   const StringPiece& context = params->context;
1628 
1629   // Sanity check: make sure that text lies within context.
1630   if (text.begin() < context.begin() || text.end() > context.end()) {
1631     LOG(DFATAL) << "context does not contain text";
1632     params->start = DeadState;
1633     return true;
1634   }
1635 
1636   // Determine correct search type.
1637   int start;
1638   uint32_t flags;
1639   if (params->run_forward) {
1640     if (text.begin() == context.begin()) {
1641       start = kStartBeginText;
1642       flags = kEmptyBeginText|kEmptyBeginLine;
1643     } else if (text.begin()[-1] == '\n') {
1644       start = kStartBeginLine;
1645       flags = kEmptyBeginLine;
1646     } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {
1647       start = kStartAfterWordChar;
1648       flags = kFlagLastWord;
1649     } else {
1650       start = kStartAfterNonWordChar;
1651       flags = 0;
1652     }
1653   } else {
1654     if (text.end() == context.end()) {
1655       start = kStartBeginText;
1656       flags = kEmptyBeginText|kEmptyBeginLine;
1657     } else if (text.end()[0] == '\n') {
1658       start = kStartBeginLine;
1659       flags = kEmptyBeginLine;
1660     } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {
1661       start = kStartAfterWordChar;
1662       flags = kFlagLastWord;
1663     } else {
1664       start = kStartAfterNonWordChar;
1665       flags = 0;
1666     }
1667   }
1668   if (params->anchored)
1669     start |= kStartAnchored;
1670   StartInfo* info = &start_[start];
1671 
1672   // Try once without cache_lock for writing.
1673   // Try again after resetting the cache
1674   // (ResetCache will relock cache_lock for writing).
1675   if (!AnalyzeSearchHelper(params, info, flags)) {
1676     ResetCache(params->cache_lock);
1677     if (!AnalyzeSearchHelper(params, info, flags)) {
1678       LOG(DFATAL) << "Failed to analyze start state.";
1679       params->failed = true;
1680       return false;
1681     }
1682   }
1683 
1684   params->start = info->start.load(std::memory_order_acquire);
1685 
1686   // Even if we could prefix accel, we cannot do so when anchored and,
1687   // less obviously, we cannot do so when we are going to need flags.
1688   // This trick works only when there is a single byte that leads to a
1689   // different state!
1690   if (prog_->can_prefix_accel() &&
1691       !params->anchored &&
1692       params->start > SpecialStateMax &&
1693       params->start->flag_ >> kFlagNeedShift == 0)
1694     params->can_prefix_accel = true;
1695 
1696   if (ExtraDebug)
1697     fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
1698             params->anchored, params->run_forward, flags,
1699             DumpState(params->start).c_str(), params->can_prefix_accel);
1700 
1701   return true;
1702 }
1703 
1704 // Fills in info if needed.  Returns true on success, false on failure.
AnalyzeSearchHelper(SearchParams * params,StartInfo * info,uint32_t flags)1705 bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
1706                               uint32_t flags) {
1707   // Quick check.
1708   State* start = info->start.load(std::memory_order_acquire);
1709   if (start != NULL)
1710     return true;
1711 
1712   MutexLock l(&mutex_);
1713   start = info->start.load(std::memory_order_relaxed);
1714   if (start != NULL)
1715     return true;
1716 
1717   q0_->clear();
1718   AddToQueue(q0_,
1719              params->anchored ? prog_->start() : prog_->start_unanchored(),
1720              flags);
1721   start = WorkqToCachedState(q0_, NULL, flags);
1722   if (start == NULL)
1723     return false;
1724 
1725   // Synchronize with "quick check" above.
1726   info->start.store(start, std::memory_order_release);
1727   return true;
1728 }
1729 
1730 // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool want_earliest_match,bool run_forward,bool * failed,const char ** epp,SparseSet * matches)1731 bool DFA::Search(const StringPiece& text,
1732                  const StringPiece& context,
1733                  bool anchored,
1734                  bool want_earliest_match,
1735                  bool run_forward,
1736                  bool* failed,
1737                  const char** epp,
1738                  SparseSet* matches) {
1739   *epp = NULL;
1740   if (!ok()) {
1741     *failed = true;
1742     return false;
1743   }
1744   *failed = false;
1745 
1746   if (ExtraDebug) {
1747     fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
1748     fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
1749             std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_);
1750   }
1751 
1752   RWLocker l(&cache_mutex_);
1753   SearchParams params(text, context, &l);
1754   params.anchored = anchored;
1755   params.want_earliest_match = want_earliest_match;
1756   params.run_forward = run_forward;
1757   params.matches = matches;
1758 
1759   if (!AnalyzeSearch(&params)) {
1760     *failed = true;
1761     return false;
1762   }
1763   if (params.start == DeadState)
1764     return false;
1765   if (params.start == FullMatchState) {
1766     if (run_forward == want_earliest_match)
1767       *epp = text.data();
1768     else
1769       *epp = text.data() + text.size();
1770     return true;
1771   }
1772   if (ExtraDebug)
1773     fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
1774   bool ret = FastSearchLoop(&params);
1775   if (params.failed) {
1776     *failed = true;
1777     return false;
1778   }
1779   *epp = params.ep;
1780   return ret;
1781 }
1782 
GetDFA(MatchKind kind)1783 DFA* Prog::GetDFA(MatchKind kind) {
1784   // For a forward DFA, half the memory goes to each DFA.
1785   // However, if it is a "many match" DFA, then there is
1786   // no counterpart with which the memory must be shared.
1787   //
1788   // For a reverse DFA, all the memory goes to the
1789   // "longest match" DFA, because RE2 never does reverse
1790   // "first match" searches.
1791   if (kind == kFirstMatch) {
1792     std::call_once(dfa_first_once_, [](Prog* prog) {
1793       prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
1794     }, this);
1795     return dfa_first_;
1796   } else if (kind == kManyMatch) {
1797     std::call_once(dfa_first_once_, [](Prog* prog) {
1798       prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_);
1799     }, this);
1800     return dfa_first_;
1801   } else {
1802     std::call_once(dfa_longest_once_, [](Prog* prog) {
1803       if (!prog->reversed_)
1804         prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2);
1805       else
1806         prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_);
1807     }, this);
1808     return dfa_longest_;
1809   }
1810 }
1811 
DeleteDFA(DFA * dfa)1812 void Prog::DeleteDFA(DFA* dfa) {
1813   delete dfa;
1814 }
1815 
1816 // Executes the regexp program to search in text,
1817 // which itself is inside the larger context.  (As a convenience,
1818 // passing a NULL context is equivalent to passing text.)
1819 // Returns true if a match is found, false if not.
1820 // If a match is found, fills in match0->end() to point at the end of the match
1821 // and sets match0->begin() to text.begin(), since the DFA can't track
1822 // where the match actually began.
1823 //
1824 // This is the only external interface (class DFA only exists in this file).
1825 //
SearchDFA(const StringPiece & text,const StringPiece & const_context,Anchor anchor,MatchKind kind,StringPiece * match0,bool * failed,SparseSet * matches)1826 bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
1827                      Anchor anchor, MatchKind kind, StringPiece* match0,
1828                      bool* failed, SparseSet* matches) {
1829   *failed = false;
1830 
1831   StringPiece context = const_context;
1832   if (context.data() == NULL)
1833     context = text;
1834   bool caret = anchor_start();
1835   bool dollar = anchor_end();
1836   if (reversed_) {
1837     using std::swap;
1838     swap(caret, dollar);
1839   }
1840   if (caret && context.begin() != text.begin())
1841     return false;
1842   if (dollar && context.end() != text.end())
1843     return false;
1844 
1845   // Handle full match by running an anchored longest match
1846   // and then checking if it covers all of text.
1847   bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
1848   bool endmatch = false;
1849   if (kind == kManyMatch) {
1850     // This is split out in order to avoid clobbering kind.
1851   } else if (kind == kFullMatch || anchor_end()) {
1852     endmatch = true;
1853     kind = kLongestMatch;
1854   }
1855 
1856   // If the caller doesn't care where the match is (just whether one exists),
1857   // then we can stop at the very first match we find, the so-called
1858   // "earliest match".
1859   bool want_earliest_match = false;
1860   if (kind == kManyMatch) {
1861     // This is split out in order to avoid clobbering kind.
1862     if (matches == NULL) {
1863       want_earliest_match = true;
1864     }
1865   } else if (match0 == NULL && !endmatch) {
1866     want_earliest_match = true;
1867     kind = kLongestMatch;
1868   }
1869 
1870   DFA* dfa = GetDFA(kind);
1871   const char* ep;
1872   bool matched = dfa->Search(text, context, anchored,
1873                              want_earliest_match, !reversed_,
1874                              failed, &ep, matches);
1875   if (*failed) {
1876     hooks::GetDFASearchFailureHook()({
1877         // Nothing yet...
1878     });
1879     return false;
1880   }
1881   if (!matched)
1882     return false;
1883   if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size()))
1884     return false;
1885 
1886   // If caller cares, record the boundary of the match.
1887   // We only know where it ends, so use the boundary of text
1888   // as the beginning.
1889   if (match0) {
1890     if (reversed_)
1891       *match0 =
1892           StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep));
1893     else
1894       *match0 =
1895           StringPiece(text.data(), static_cast<size_t>(ep - text.data()));
1896   }
1897   return true;
1898 }
1899 
1900 // Build out all states in DFA.  Returns number of states.
BuildAllStates(const Prog::DFAStateCallback & cb)1901 int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
1902   if (!ok())
1903     return 0;
1904 
1905   // Pick out start state for unanchored search
1906   // at beginning of text.
1907   RWLocker l(&cache_mutex_);
1908   SearchParams params(StringPiece(), StringPiece(), &l);
1909   params.anchored = false;
1910   if (!AnalyzeSearch(&params) ||
1911       params.start == NULL ||
1912       params.start == DeadState)
1913     return 0;
1914 
1915   // Add start state to work queue.
1916   // Note that any State* that we handle here must point into the cache,
1917   // so we can simply depend on pointer-as-a-number hashing and equality.
1918   std::unordered_map<State*, int> m;
1919   std::deque<State*> q;
1920   m.emplace(params.start, static_cast<int>(m.size()));
1921   q.push_back(params.start);
1922 
1923   // Compute the input bytes needed to cover all of the next pointers.
1924   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
1925   std::vector<int> input(nnext);
1926   for (int c = 0; c < 256; c++) {
1927     int b = prog_->bytemap()[c];
1928     while (c < 256-1 && prog_->bytemap()[c+1] == b)
1929       c++;
1930     input[b] = c;
1931   }
1932   input[prog_->bytemap_range()] = kByteEndText;
1933 
1934   // Scratch space for the output.
1935   std::vector<int> output(nnext);
1936 
1937   // Flood to expand every state.
1938   bool oom = false;
1939   while (!q.empty()) {
1940     State* s = q.front();
1941     q.pop_front();
1942     for (int c : input) {
1943       State* ns = RunStateOnByteUnlocked(s, c);
1944       if (ns == NULL) {
1945         oom = true;
1946         break;
1947       }
1948       if (ns == DeadState) {
1949         output[ByteMap(c)] = -1;
1950         continue;
1951       }
1952       if (m.find(ns) == m.end()) {
1953         m.emplace(ns, static_cast<int>(m.size()));
1954         q.push_back(ns);
1955       }
1956       output[ByteMap(c)] = m[ns];
1957     }
1958     if (cb)
1959       cb(oom ? NULL : output.data(),
1960          s == FullMatchState || s->IsMatch());
1961     if (oom)
1962       break;
1963   }
1964 
1965   return static_cast<int>(m.size());
1966 }
1967 
1968 // Build out all states in DFA for kind.  Returns number of states.
BuildEntireDFA(MatchKind kind,const DFAStateCallback & cb)1969 int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
1970   return GetDFA(kind)->BuildAllStates(cb);
1971 }
1972 
1973 // Computes min and max for matching string.
1974 // Won't return strings bigger than maxlen.
PossibleMatchRange(std::string * min,std::string * max,int maxlen)1975 bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
1976   if (!ok())
1977     return false;
1978 
1979   // NOTE: if future users of PossibleMatchRange want more precision when
1980   // presented with infinitely repeated elements, consider making this a
1981   // parameter to PossibleMatchRange.
1982   static int kMaxEltRepetitions = 0;
1983 
1984   // Keep track of the number of times we've visited states previously. We only
1985   // revisit a given state if it's part of a repeated group, so if the value
1986   // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
1987   // |*max| to |PrefixSuccessor(*max)|.
1988   //
1989   // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
1990   // tradition, implicitly insert a '0' value at first use. We take advantage
1991   // of that property below.
1992   std::unordered_map<State*, int> previously_visited_states;
1993 
1994   // Pick out start state for anchored search at beginning of text.
1995   RWLocker l(&cache_mutex_);
1996   SearchParams params(StringPiece(), StringPiece(), &l);
1997   params.anchored = true;
1998   if (!AnalyzeSearch(&params))
1999     return false;
2000   if (params.start == DeadState) {  // No matching strings
2001     *min = "";
2002     *max = "";
2003     return true;
2004   }
2005   if (params.start == FullMatchState)  // Every string matches: no max
2006     return false;
2007 
2008   // The DFA is essentially a big graph rooted at params.start,
2009   // and paths in the graph correspond to accepted strings.
2010   // Each node in the graph has potentially 256+1 arrows
2011   // coming out, one for each byte plus the magic end of
2012   // text character kByteEndText.
2013 
2014   // To find the smallest possible prefix of an accepted
2015   // string, we just walk the graph preferring to follow
2016   // arrows with the lowest bytes possible.  To find the
2017   // largest possible prefix, we follow the largest bytes
2018   // possible.
2019 
2020   // The test for whether there is an arrow from s on byte j is
2021   //    ns = RunStateOnByteUnlocked(s, j);
2022   //    if (ns == NULL)
2023   //      return false;
2024   //    if (ns != DeadState && ns->ninst > 0)
2025   // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
2026   // It returns NULL only if the DFA has run out of memory,
2027   // in which case we can't be sure of anything.
2028   // The second check sees whether there was graph built
2029   // and whether it is interesting graph.  Nodes might have
2030   // ns->ninst == 0 if they exist only to represent the fact
2031   // that a match was found on the previous byte.
2032 
2033   // Build minimum prefix.
2034   State* s = params.start;
2035   min->clear();
2036   MutexLock lock(&mutex_);
2037   for (int i = 0; i < maxlen; i++) {
2038     if (previously_visited_states[s] > kMaxEltRepetitions)
2039       break;
2040     previously_visited_states[s]++;
2041 
2042     // Stop if min is a match.
2043     State* ns = RunStateOnByte(s, kByteEndText);
2044     if (ns == NULL)  // DFA out of memory
2045       return false;
2046     if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
2047       break;
2048 
2049     // Try to extend the string with low bytes.
2050     bool extended = false;
2051     for (int j = 0; j < 256; j++) {
2052       ns = RunStateOnByte(s, j);
2053       if (ns == NULL)  // DFA out of memory
2054         return false;
2055       if (ns == FullMatchState ||
2056           (ns > SpecialStateMax && ns->ninst_ > 0)) {
2057         extended = true;
2058         min->append(1, static_cast<char>(j));
2059         s = ns;
2060         break;
2061       }
2062     }
2063     if (!extended)
2064       break;
2065   }
2066 
2067   // Build maximum prefix.
2068   previously_visited_states.clear();
2069   s = params.start;
2070   max->clear();
2071   for (int i = 0; i < maxlen; i++) {
2072     if (previously_visited_states[s] > kMaxEltRepetitions)
2073       break;
2074     previously_visited_states[s] += 1;
2075 
2076     // Try to extend the string with high bytes.
2077     bool extended = false;
2078     for (int j = 255; j >= 0; j--) {
2079       State* ns = RunStateOnByte(s, j);
2080       if (ns == NULL)
2081         return false;
2082       if (ns == FullMatchState ||
2083           (ns > SpecialStateMax && ns->ninst_ > 0)) {
2084         extended = true;
2085         max->append(1, static_cast<char>(j));
2086         s = ns;
2087         break;
2088       }
2089     }
2090     if (!extended) {
2091       // Done, no need for PrefixSuccessor.
2092       return true;
2093     }
2094   }
2095 
2096   // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
2097   PrefixSuccessor(max);
2098 
2099   // If there are no bytes left, we have no way to say "there is no maximum
2100   // string".  We could make the interface more complicated and be able to
2101   // return "there is no maximum but here is a minimum", but that seems like
2102   // overkill -- the most common no-max case is all possible strings, so not
2103   // telling the caller that the empty string is the minimum match isn't a
2104   // great loss.
2105   if (max->empty())
2106     return false;
2107 
2108   return true;
2109 }
2110 
2111 // PossibleMatchRange for a Prog.
PossibleMatchRange(std::string * min,std::string * max,int maxlen)2112 bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
2113   // Have to use dfa_longest_ to get all strings for full matches.
2114   // For example, (a|aa) never matches aa in first-match mode.
2115   return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
2116 }
2117 
2118 }  // namespace re2
2119