1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 //
7 // Prog::BadSearchBacktrack is a backtracking regular expression search,
8 // except that it remembers where it has been, trading a lot of
9 // memory for a lot of time. It exists only for testing purposes.
10 //
11 // Let me repeat that.
12 //
13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14 //   - It uses a ton of memory.
15 //   - It uses a ton of stack.
16 //   - It uses CHECK and LOG(FATAL).
17 //   - It implements unanchored search by repeated anchored search.
18 //
19 // On the other hand, it is very simple and a good reference
20 // implementation for the more complicated regexp packages.
21 //
22 // In BUILD, this file is linked into the ":testing" library,
23 // not the main library, in order to make it harder to pick up
24 // accidentally.
25 
26 #include "util/util.h"
27 #include "re2/prog.h"
28 #include "re2/regexp.h"
29 
30 namespace re2 {
31 
32 // Backtracker holds the state for a backtracking search.
33 //
34 // Excluding the search parameters, the main search state
35 // is just the "capture registers", which record, for the
36 // current execution, the string position at which each
37 // parenthesis was passed.  cap_[0] and cap_[1] are the
38 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
39 //
40 // To avoid infinite loops during backtracking on expressions
41 // like (a*)*, the visited_[] bitmap marks the (state, string-position)
42 // pairs that have already been explored and are thus not worth
43 // re-exploring if we get there via another path.  Modern backtracking
44 // libraries engineer their program representation differently, to make
45 // such infinite loops possible to avoid without keeping a giant visited_
46 // bitmap, but visited_ works fine for a reference implementation
47 // and it has the nice benefit of making the search run in linear time.
48 class Backtracker {
49  public:
50   explicit Backtracker(Prog* prog);
51   ~Backtracker();
52 
53   bool Search(const StringPiece& text, const StringPiece& context,
54               bool anchored, bool longest,
55               StringPiece* submatch, int nsubmatch);
56 
57  private:
58   // Explores from instruction ip at string position p looking for a match.
59   // Returns true if found (so that caller can stop trying other possibilities).
60   bool Visit(int id, const char* p);
61 
62   // Search parameters
63   Prog* prog_;              // program being run
64   StringPiece text_;        // text being searched
65   StringPiece context_;     // greater context of text being searched
66   bool anchored_;           // whether search is anchored at text.begin()
67   bool longest_;            // whether search wants leftmost-longest match
68   bool endmatch_;           // whether search must end at text.end()
69   StringPiece *submatch_;   // submatches to fill in
70   int nsubmatch_;           //   # of submatches to fill in
71 
72   // Search state
73   const char* cap_[64];     // capture registers
74   uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
75   int nvisited_;            //   # of words in bitmap
76 };
77 
Backtracker(Prog * prog)78 Backtracker::Backtracker(Prog* prog)
79   : prog_(prog),
80     anchored_(false),
81     longest_(false),
82     endmatch_(false),
83     submatch_(NULL),
84     nsubmatch_(0),
85     visited_(NULL),
86     nvisited_(0) {
87 }
88 
~Backtracker()89 Backtracker::~Backtracker() {
90   delete[] visited_;
91 }
92 
93 // Runs a backtracking search.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)94 bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
95                          bool anchored, bool longest,
96                          StringPiece* submatch, int nsubmatch) {
97   text_ = text;
98   context_ = context;
99   if (context_.begin() == NULL)
100     context_ = text;
101   if (prog_->anchor_start() && text.begin() > context_.begin())
102     return false;
103   if (prog_->anchor_end() && text.end() < context_.end())
104     return false;
105   anchored_ = anchored | prog_->anchor_start();
106   longest_ = longest | prog_->anchor_end();
107   endmatch_ = prog_->anchor_end();
108   submatch_ = submatch;
109   nsubmatch_ = nsubmatch;
110   CHECK(2*nsubmatch_ < arraysize(cap_));
111   memset(cap_, 0, sizeof cap_);
112 
113   // We use submatch_[0] for our own bookkeeping,
114   // so it had better exist.
115   StringPiece sp0;
116   if (nsubmatch < 1) {
117     submatch_ = &sp0;
118     nsubmatch_ = 1;
119   }
120   submatch_[0] = NULL;
121 
122   // Allocate new visited_ bitmap -- size is proportional
123   // to text, so have to reallocate on each call to Search.
124   delete[] visited_;
125   nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
126   visited_ = new uint32[nvisited_];
127   memset(visited_, 0, nvisited_*sizeof visited_[0]);
128 
129   // Anchored search must start at text.begin().
130   if (anchored_) {
131     cap_[0] = text.begin();
132     return Visit(prog_->start(), text.begin());
133   }
134 
135   // Unanchored search, starting from each possible text position.
136   // Notice that we have to try the empty string at the end of
137   // the text, so the loop condition is p <= text.end(), not p < text.end().
138   for (const char* p = text.begin(); p <= text.end(); p++) {
139     cap_[0] = p;
140     if (Visit(prog_->start(), p))  // Match must be leftmost; done.
141       return true;
142   }
143   return false;
144 }
145 
146 // Explores from instruction ip at string position p looking for a match.
147 // Return true if found (so that caller can stop trying other possibilities).
Visit(int id,const char * p)148 bool Backtracker::Visit(int id, const char* p) {
149   // Check bitmap.  If we've already explored from here,
150   // either it didn't match or it did but we're hoping for a better match.
151   // Either way, don't go down that road again.
152   CHECK(p <= text_.end());
153   int n = id*(text_.size()+1) + (p - text_.begin());
154   CHECK_LT(n/32, nvisited_);
155   if (visited_[n/32] & (1 << (n&31)))
156     return false;
157   visited_[n/32] |= 1 << (n&31);
158 
159   // Pick out byte at current position.  If at end of string,
160   // have to explore in hope of finishing a match.  Use impossible byte -1.
161   int c = -1;
162   if (p < text_.end())
163     c = *p & 0xFF;
164 
165   Prog::Inst* ip = prog_->inst(id);
166   switch (ip->opcode()) {
167     default:
168       LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
169       return false;  // not reached
170 
171     case kInstAlt:
172     case kInstAltMatch:
173       // Try both possible next states: out is preferred to out1.
174       if (Visit(ip->out(), p)) {
175         if (longest_)
176           Visit(ip->out1(), p);
177         return true;
178       }
179       return Visit(ip->out1(), p);
180 
181     case kInstByteRange:
182       if (ip->Matches(c))
183         return Visit(ip->out(), p+1);
184       return false;
185 
186     case kInstCapture:
187       if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
188         // Capture p to register, but save old value.
189         const char* q = cap_[ip->cap()];
190         cap_[ip->cap()] = p;
191         bool ret = Visit(ip->out(), p);
192         // Restore old value as we backtrack.
193         cap_[ip->cap()] = q;
194         return ret;
195       }
196       return Visit(ip->out(), p);
197 
198     case kInstEmptyWidth:
199       if (ip->empty() & ~Prog::EmptyFlags(context_, p))
200         return false;
201       return Visit(ip->out(), p);
202 
203     case kInstNop:
204       return Visit(ip->out(), p);
205 
206     case kInstMatch:
207       // We found a match.  If it's the best so far, record the
208       // parameters in the caller's submatch_ array.
209       if (endmatch_ && p != context_.end())
210         return false;
211       cap_[1] = p;
212       if (submatch_[0].data() == NULL ||           // First match so far ...
213           (longest_ && p > submatch_[0].end())) {  // ... or better match
214         for (int i = 0; i < nsubmatch_; i++)
215           submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
216       }
217       return true;
218 
219     case kInstFail:
220       return false;
221   }
222 }
223 
224 // Runs a backtracking search.
UnsafeSearchBacktrack(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)225 bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
226                                  const StringPiece& context,
227                                  Anchor anchor,
228                                  MatchKind kind,
229                                  StringPiece* match,
230                                  int nmatch) {
231   // If full match, we ask for an anchored longest match
232   // and then check that match[0] == text.
233   // So make sure match[0] exists.
234   StringPiece sp0;
235   if (kind == kFullMatch) {
236     anchor = kAnchored;
237     if (nmatch < 1) {
238       match = &sp0;
239       nmatch = 1;
240     }
241   }
242 
243   // Run the search.
244   Backtracker b(this);
245   bool anchored = anchor == kAnchored;
246   bool longest = kind != kFirstMatch;
247   if (!b.Search(text, context, anchored, longest, match, nmatch))
248     return false;
249   if (kind == kFullMatch && match[0].end() != text.end())
250     return false;
251   return true;
252 }
253 
254 }  // namespace re2
255