1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compiled representation of regular expressions.
6 // See regexp.h for the Regexp class, which represents a regular
7 // expression symbolically.
8 
9 #ifndef RE2_PROG_H__
10 #define RE2_PROG_H__
11 
12 #include "util/util.h"
13 #include "re2/re2.h"
14 
15 namespace re2 {
16 
17 // Simple fixed-size bitmap.
18 template<int Bits>
19 class Bitmap {
20  public:
Bitmap()21   Bitmap() { Reset(); }
Size()22   int Size() { return Bits; }
23 
Reset()24   void Reset() {
25     for (int i = 0; i < Words; i++)
26       w_[i] = 0;
27   }
Get(int k)28   bool Get(int k) const {
29     return w_[k >> WordLog] & (1<<(k & 31));
30   }
Set(int k)31   void Set(int k) {
32     w_[k >> WordLog] |= 1<<(k & 31);
33   }
Clear(int k)34   void Clear(int k) {
35     w_[k >> WordLog] &= ~(1<<(k & 31));
36   }
Word(int i)37   uint32 Word(int i) const {
38     return w_[i];
39   }
40 
41  private:
42   static const int WordLog = 5;
43   static const int Words = (Bits+31)/32;
44   uint32 w_[Words];
45   DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
46 };
47 
48 
49 // Opcodes for Inst
50 enum InstOp {
51   kInstAlt = 0,      // choose between out_ and out1_
52   kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
53   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
54   kInstCapture,      // capturing parenthesis number cap_
55   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
56   kInstMatch,        // found a match!
57   kInstNop,          // no-op; occasionally unavoidable
58   kInstFail,         // never match; occasionally unavoidable
59 };
60 
61 // Bit flags for empty-width specials
62 enum EmptyOp {
63   kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
64   kEmptyEndLine          = 1<<1,      // $ - end of line
65   kEmptyBeginText        = 1<<2,      // \A - beginning of text
66   kEmptyEndText          = 1<<3,      // \z - end of text
67   kEmptyWordBoundary     = 1<<4,      // \b - word boundary
68   kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
69   kEmptyAllFlags         = (1<<6)-1,
70 };
71 
72 class Regexp;
73 
74 class DFA;
75 struct OneState;
76 
77 // Compiled form of regexp program.
78 class Prog {
79  public:
80   Prog();
81   ~Prog();
82 
83   // Single instruction in regexp program.
84   class Inst {
85    public:
Inst()86     Inst() : out_opcode_(0), out1_(0) { }
87 
88     // Constructors per opcode
89     void InitAlt(uint32 out, uint32 out1);
90     void InitByteRange(int lo, int hi, int foldcase, uint32 out);
91     void InitCapture(int cap, uint32 out);
92     void InitEmptyWidth(EmptyOp empty, uint32 out);
93     void InitMatch(int id);
94     void InitNop(uint32 out);
95     void InitFail();
96 
97     // Getters
id(Prog * p)98     int id(Prog* p) { return this - p->inst_; }
opcode()99     InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
out()100     int out()     { return out_opcode_>>3; }
out1()101     int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
cap()102     int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
lo()103     int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
hi()104     int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
foldcase()105     int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
match_id()106     int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
empty()107     EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
greedy(Prog * p)108     bool greedy(Prog *p) {
109       DCHECK_EQ(opcode(), kInstAltMatch);
110       return p->inst(out())->opcode() == kInstByteRange;
111     }
112 
113     // Does this inst (an kInstByteRange) match c?
Matches(int c)114     inline bool Matches(int c) {
115       DCHECK_EQ(opcode(), kInstByteRange);
116       if (foldcase_ && 'A' <= c && c <= 'Z')
117         c += 'a' - 'A';
118       return lo_ <= c && c <= hi_;
119     }
120 
121     // Returns string representation for debugging.
122     string Dump();
123 
124     // Maximum instruction id.
125     // (Must fit in out_opcode_, and PatchList steals another bit.)
126     static const int kMaxInst = (1<<28) - 1;
127 
128    private:
set_opcode(InstOp opcode)129     void set_opcode(InstOp opcode) {
130       out_opcode_ = (out()<<3) | opcode;
131     }
132 
set_out(int out)133     void set_out(int out) {
134       out_opcode_ = (out<<3) | opcode();
135     }
136 
set_out_opcode(int out,InstOp opcode)137     void set_out_opcode(int out, InstOp opcode) {
138       out_opcode_ = (out<<3) | opcode;
139     }
140 
141     uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
142     union {              // additional instruction arguments:
143       uint32 out1_;      // opcode == kInstAlt
144                          //   alternate next instruction
145 
146       int32 cap_;        // opcode == kInstCapture
147                          //   Index of capture register (holds text
148                          //   position recorded by capturing parentheses).
149                          //   For \n (the submatch for the nth parentheses),
150                          //   the left parenthesis captures into register 2*n
151                          //   and the right one captures into register 2*n+1.
152 
153       int32 match_id_;   // opcode == kInstMatch
154                          //   Match ID to identify this match (for re2::Set).
155 
156       struct {           // opcode == kInstByteRange
157         uint8 lo_;       //   byte range is lo_-hi_ inclusive
158         uint8 hi_;       //
159         uint8 foldcase_; //   convert A-Z to a-z before checking range.
160       };
161 
162       EmptyOp empty_;    // opcode == kInstEmptyWidth
163                          //   empty_ is bitwise OR of kEmpty* flags above.
164     };
165 
166     friend class Compiler;
167     friend struct PatchList;
168     friend class Prog;
169 
170     DISALLOW_EVIL_CONSTRUCTORS(Inst);
171   };
172 
173   // Whether to anchor the search.
174   enum Anchor {
175     kUnanchored,  // match anywhere
176     kAnchored,    // match only starting at beginning of text
177   };
178 
179   // Kind of match to look for (for anchor != kFullMatch)
180   //
181   // kLongestMatch mode finds the overall longest
182   // match but still makes its submatch choices the way
183   // Perl would, not in the way prescribed by POSIX.
184   // The POSIX rules are much more expensive to implement,
185   // and no one has needed them.
186   //
187   // kFullMatch is not strictly necessary -- we could use
188   // kLongestMatch and then check the length of the match -- but
189   // the matching code can run faster if it knows to consider only
190   // full matches.
191   enum MatchKind {
192     kFirstMatch,     // like Perl, PCRE
193     kLongestMatch,   // like egrep or POSIX
194     kFullMatch,      // match only entire text; implies anchor==kAnchored
195     kManyMatch       // for SearchDFA, records set of matches
196   };
197 
inst(int id)198   Inst *inst(int id) { return &inst_[id]; }
start()199   int start() { return start_; }
start_unanchored()200   int start_unanchored() { return start_unanchored_; }
set_start(int start)201   void set_start(int start) { start_ = start; }
set_start_unanchored(int start)202   void set_start_unanchored(int start) { start_unanchored_ = start; }
size()203   int64 size() { return size_; }
reversed()204   bool reversed() { return reversed_; }
set_reversed(bool reversed)205   void set_reversed(bool reversed) { reversed_ = reversed; }
byte_inst_count()206   int64 byte_inst_count() { return byte_inst_count_; }
byterange()207   const Bitmap<256>& byterange() { return byterange_; }
set_dfa_mem(int64 dfa_mem)208   void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
dfa_mem()209   int64 dfa_mem() { return dfa_mem_; }
flags()210   int flags() { return flags_; }
set_flags(int flags)211   void set_flags(int flags) { flags_ = flags; }
anchor_start()212   bool anchor_start() { return anchor_start_; }
set_anchor_start(bool b)213   void set_anchor_start(bool b) { anchor_start_ = b; }
anchor_end()214   bool anchor_end() { return anchor_end_; }
set_anchor_end(bool b)215   void set_anchor_end(bool b) { anchor_end_ = b; }
bytemap_range()216   int bytemap_range() { return bytemap_range_; }
bytemap()217   const uint8* bytemap() { return bytemap_; }
218 
219   // Returns string representation of program for debugging.
220   string Dump();
221   string DumpUnanchored();
222 
223   // Record that at some point in the prog, the bytes in the range
224   // lo-hi (inclusive) are treated as different from bytes outside the range.
225   // Tracking this lets the DFA collapse commonly-treated byte ranges
226   // when recording state pointers, greatly reducing its memory footprint.
227   void MarkByteRange(int lo, int hi);
228 
229   // Returns the set of kEmpty flags that are in effect at
230   // position p within context.
231   static uint32 EmptyFlags(const StringPiece& context, const char* p);
232 
233   // Returns whether byte c is a word character: ASCII only.
234   // Used by the implementation of \b and \B.
235   // This is not right for Unicode, but:
236   //   - it's hard to get right in a byte-at-a-time matching world
237   //     (the DFA has only one-byte lookahead).
238   //   - even if the lookahead were possible, the Progs would be huge.
239   // This crude approximation is the same one PCRE uses.
IsWordChar(uint8 c)240   static bool IsWordChar(uint8 c) {
241     return ('A' <= c && c <= 'Z') ||
242            ('a' <= c && c <= 'z') ||
243            ('0' <= c && c <= '9') ||
244            c == '_';
245   }
246 
247   // Execution engines.  They all search for the regexp (run the prog)
248   // in text, which is in the larger context (used for ^ $ \b etc).
249   // Anchor and kind control the kind of search.
250   // Returns true if match found, false if not.
251   // If match found, fills match[0..nmatch-1] with submatch info.
252   // match[0] is overall match, match[1] is first set of parens, etc.
253   // If a particular submatch is not matched during the regexp match,
254   // it is set to NULL.
255   //
256   // Matching text == StringPiece(NULL, 0) is treated as any other empty
257   // string, but note that on return, it will not be possible to distinguish
258   // submatches that matched that empty string from submatches that didn't
259   // match anything.  Either way, match[i] == NULL.
260 
261   // Search using NFA: can find submatches but kind of slow.
262   bool SearchNFA(const StringPiece& text, const StringPiece& context,
263                  Anchor anchor, MatchKind kind,
264                  StringPiece* match, int nmatch);
265 
266   // Search using DFA: much faster than NFA but only finds
267   // end of match and can use a lot more memory.
268   // Returns whether a match was found.
269   // If the DFA runs out of memory, sets *failed to true and returns false.
270   // If matches != NULL and kind == kManyMatch and there is a match,
271   // SearchDFA fills matches with the match IDs of the final matching state.
272   bool SearchDFA(const StringPiece& text, const StringPiece& context,
273                  Anchor anchor, MatchKind kind,
274                  StringPiece* match0, bool* failed,
275                  vector<int>* matches);
276 
277   // Build the entire DFA for the given match kind.  FOR TESTING ONLY.
278   // Usually the DFA is built out incrementally, as needed, which
279   // avoids lots of unnecessary work.  This function is useful only
280   // for testing purposes.  Returns number of states.
281   int BuildEntireDFA(MatchKind kind);
282   std::string PrintEntireDFA(MatchKind kind);
283 
284   // Compute byte map.
285   void ComputeByteMap();
286 
287   // Run peep-hole optimizer on program.
288   void Optimize();
289 
290   // One-pass NFA: only correct if IsOnePass() is true,
291   // but much faster than NFA (competitive with PCRE)
292   // for those expressions.
293   bool IsOnePass();
294   bool SearchOnePass(const StringPiece& text, const StringPiece& context,
295                      Anchor anchor, MatchKind kind,
296                      StringPiece* match, int nmatch);
297 
298   // Bit-state backtracking.  Fast on small cases but uses memory
299   // proportional to the product of the program size and the text size.
300   bool SearchBitState(const StringPiece& text, const StringPiece& context,
301                       Anchor anchor, MatchKind kind,
302                       StringPiece* match, int nmatch);
303 
304   static const int kMaxOnePassCapture = 5;  // $0 through $4
305 
306   // Backtracking search: the gold standard against which the other
307   // implementations are checked.  FOR TESTING ONLY.
308   // It allocates a ton of memory to avoid running forever.
309   // It is also recursive, so can't use in production (will overflow stacks).
310   // The name "Unsafe" here is supposed to be a flag that
311   // you should not be using this function.
312   bool UnsafeSearchBacktrack(const StringPiece& text,
313                              const StringPiece& context,
314                              Anchor anchor, MatchKind kind,
315                              StringPiece* match, int nmatch);
316 
317   // Computes range for any strings matching regexp. The min and max can in
318   // some cases be arbitrarily precise, so the caller gets to specify the
319   // maximum desired length of string returned.
320   //
321   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
322   // string s that is an anchored match for this regexp satisfies
323   //   min <= s && s <= max.
324   //
325   // Note that PossibleMatchRange() will only consider the first copy of an
326   // infinitely repeated element (i.e., any regexp element followed by a '*' or
327   // '+' operator). Regexps with "{N}" constructions are not affected, as those
328   // do not compile down to infinite repetitions.
329   //
330   // Returns true on success, false on error.
331   bool PossibleMatchRange(string* min, string* max, int maxlen);
332 
333   // Compiles a collection of regexps to Prog.  Each regexp will have
334   // its own Match instruction recording the index in the vector.
335   static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
336                           Regexp* re);
337 
338  private:
339   friend class Compiler;
340 
341   DFA* GetDFA(MatchKind kind);
342 
343   bool anchor_start_;       // regexp has explicit start anchor
344   bool anchor_end_;         // regexp has explicit end anchor
345   bool reversed_;           // whether program runs backward over input
346   bool did_onepass_;        // has IsOnePass been called?
347 
348   int start_;               // entry point for program
349   int start_unanchored_;    // unanchored entry point for program
350   int size_;                // number of instructions
351   int byte_inst_count_;     // number of kInstByteRange instructions
352   int bytemap_range_;       // bytemap_[x] < bytemap_range_
353   int flags_;               // regexp parse flags
354   int onepass_statesize_;   // byte size of each OneState* node
355 
356   Inst* inst_;              // pointer to instruction array
357 
358   Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
359   DFA* volatile dfa_first_;     // DFA cached for kFirstMatch
360   DFA* volatile dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
361   int64 dfa_mem_;      // Maximum memory for DFAs.
362   void (*delete_dfa_)(DFA* dfa);
363 
364   Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
365                              // commonly-treated byte range.
366   uint8 bytemap_[256];       // map from input bytes to byte classes
367   uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x
368 
369   uint8* onepass_nodes_;     // data for OnePass nodes
370   OneState* onepass_start_;  // start node for OnePass program
371 
372   DISALLOW_EVIL_CONSTRUCTORS(Prog);
373 };
374 
375 }  // namespace re2
376 
377 #endif  // RE2_PROG_H__
378