1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_PROG_H_
6 #define RE2_PROG_H_
7 
8 // Compiled representation of regular expressions.
9 // See regexp.h for the Regexp class, which represents a regular
10 // expression symbolically.
11 
12 #include <stdint.h>
13 #include <functional>
14 #include <mutex>
15 #include <string>
16 #include <vector>
17 #include <type_traits>
18 
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "re2/pod_array.h"
22 #include "re2/re2.h"
23 #include "re2/sparse_array.h"
24 #include "re2/sparse_set.h"
25 
26 namespace re2 {
27 
28 // Opcodes for Inst
29 enum InstOp {
30   kInstAlt = 0,      // choose between out_ and out1_
31   kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
32   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
33   kInstCapture,      // capturing parenthesis number cap_
34   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
35   kInstMatch,        // found a match!
36   kInstNop,          // no-op; occasionally unavoidable
37   kInstFail,         // never match; occasionally unavoidable
38   kNumInst,
39 };
40 
41 // Bit flags for empty-width specials
42 enum EmptyOp {
43   kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
44   kEmptyEndLine          = 1<<1,      // $ - end of line
45   kEmptyBeginText        = 1<<2,      // \A - beginning of text
46   kEmptyEndText          = 1<<3,      // \z - end of text
47   kEmptyWordBoundary     = 1<<4,      // \b - word boundary
48   kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
49   kEmptyAllFlags         = (1<<6)-1,
50 };
51 
52 class DFA;
53 class Regexp;
54 
55 // Compiled form of regexp program.
56 class Prog {
57  public:
58   Prog();
59   ~Prog();
60 
61   // Single instruction in regexp program.
62   class Inst {
63    public:
64     // See the assertion below for why this is so.
65     Inst() = default;
66 
67     // Copyable.
68     Inst(const Inst&) = default;
69     Inst& operator=(const Inst&) = default;
70 
71     // Constructors per opcode
72     void InitAlt(uint32_t out, uint32_t out1);
73     void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
74     void InitCapture(int cap, uint32_t out);
75     void InitEmptyWidth(EmptyOp empty, uint32_t out);
76     void InitMatch(int id);
77     void InitNop(uint32_t out);
78     void InitFail();
79 
80     // Getters
id(Prog * p)81     int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
opcode()82     InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
last()83     int last()      { return (out_opcode_>>3)&1; }
out()84     int out()       { return out_opcode_>>4; }
out1()85     int out1()      { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
cap()86     int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
lo()87     int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
hi()88     int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
foldcase()89     int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
hint()90     int hint()      { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
match_id()91     int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
empty()92     EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
93 
greedy(Prog * p)94     bool greedy(Prog* p) {
95       DCHECK_EQ(opcode(), kInstAltMatch);
96       return p->inst(out())->opcode() == kInstByteRange ||
97              (p->inst(out())->opcode() == kInstNop &&
98               p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
99     }
100 
101     // Does this inst (an kInstByteRange) match c?
Matches(int c)102     inline bool Matches(int c) {
103       DCHECK_EQ(opcode(), kInstByteRange);
104       if (foldcase() && 'A' <= c && c <= 'Z')
105         c += 'a' - 'A';
106       return lo_ <= c && c <= hi_;
107     }
108 
109     // Returns string representation for debugging.
110     std::string Dump();
111 
112     // Maximum instruction id.
113     // (Must fit in out_opcode_. PatchList/last steal another bit.)
114     static const int kMaxInst = (1<<28) - 1;
115 
116    private:
set_opcode(InstOp opcode)117     void set_opcode(InstOp opcode) {
118       out_opcode_ = (out()<<4) | (last()<<3) | opcode;
119     }
120 
set_last()121     void set_last() {
122       out_opcode_ = (out()<<4) | (1<<3) | opcode();
123     }
124 
set_out(int out)125     void set_out(int out) {
126       out_opcode_ = (out<<4) | (last()<<3) | opcode();
127     }
128 
set_out_opcode(int out,InstOp opcode)129     void set_out_opcode(int out, InstOp opcode) {
130       out_opcode_ = (out<<4) | (last()<<3) | opcode;
131     }
132 
133     uint32_t out_opcode_;  // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
134     union {                // additional instruction arguments:
135       uint32_t out1_;      // opcode == kInstAlt
136                            //   alternate next instruction
137 
138       int32_t cap_;        // opcode == kInstCapture
139                            //   Index of capture register (holds text
140                            //   position recorded by capturing parentheses).
141                            //   For \n (the submatch for the nth parentheses),
142                            //   the left parenthesis captures into register 2*n
143                            //   and the right one captures into register 2*n+1.
144 
145       int32_t match_id_;   // opcode == kInstMatch
146                            //   Match ID to identify this match (for re2::Set).
147 
148       struct {             // opcode == kInstByteRange
149         uint8_t lo_;       //   byte range is lo_-hi_ inclusive
150         uint8_t hi_;       //
151         uint16_t hint_foldcase_;  // 15 bits: hint, 1 (low) bit: foldcase
152                            //   hint to execution engines: the delta to the
153                            //   next instruction (in the current list) worth
154                            //   exploring iff this instruction matched; 0
155                            //   means there are no remaining possibilities,
156                            //   which is most likely for character classes.
157                            //   foldcase: A-Z -> a-z before checking range.
158       };
159 
160       EmptyOp empty_;       // opcode == kInstEmptyWidth
161                             //   empty_ is bitwise OR of kEmpty* flags above.
162     };
163 
164     friend class Compiler;
165     friend struct PatchList;
166     friend class Prog;
167   };
168 
169   // Inst must be trivial so that we can freely clear it with memset(3).
170   // Arrays of Inst are initialised by copying the initial elements with
171   // memmove(3) and then clearing any remaining elements with memset(3).
172   static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
173 
174   // Whether to anchor the search.
175   enum Anchor {
176     kUnanchored,  // match anywhere
177     kAnchored,    // match only starting at beginning of text
178   };
179 
180   // Kind of match to look for (for anchor != kFullMatch)
181   //
182   // kLongestMatch mode finds the overall longest
183   // match but still makes its submatch choices the way
184   // Perl would, not in the way prescribed by POSIX.
185   // The POSIX rules are much more expensive to implement,
186   // and no one has needed them.
187   //
188   // kFullMatch is not strictly necessary -- we could use
189   // kLongestMatch and then check the length of the match -- but
190   // the matching code can run faster if it knows to consider only
191   // full matches.
192   enum MatchKind {
193     kFirstMatch,     // like Perl, PCRE
194     kLongestMatch,   // like egrep or POSIX
195     kFullMatch,      // match only entire text; implies anchor==kAnchored
196     kManyMatch       // for SearchDFA, records set of matches
197   };
198 
inst(int id)199   Inst *inst(int id) { return &inst_[id]; }
start()200   int start() { return start_; }
set_start(int start)201   void set_start(int start) { start_ = start; }
start_unanchored()202   int start_unanchored() { return start_unanchored_; }
set_start_unanchored(int start)203   void set_start_unanchored(int start) { start_unanchored_ = start; }
size()204   int size() { return size_; }
reversed()205   bool reversed() { return reversed_; }
set_reversed(bool reversed)206   void set_reversed(bool reversed) { reversed_ = reversed; }
list_count()207   int list_count() { return list_count_; }
inst_count(InstOp op)208   int inst_count(InstOp op) { return inst_count_[op]; }
list_heads()209   uint16_t* list_heads() { return list_heads_.data(); }
dfa_mem()210   int64_t dfa_mem() { return dfa_mem_; }
set_dfa_mem(int64_t dfa_mem)211   void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
anchor_start()212   bool anchor_start() { return anchor_start_; }
set_anchor_start(bool b)213   void set_anchor_start(bool b) { anchor_start_ = b; }
anchor_end()214   bool anchor_end() { return anchor_end_; }
set_anchor_end(bool b)215   void set_anchor_end(bool b) { anchor_end_ = b; }
bytemap_range()216   int bytemap_range() { return bytemap_range_; }
bytemap()217   const uint8_t* bytemap() { return bytemap_; }
can_prefix_accel()218   bool can_prefix_accel() { return prefix_size_ != 0; }
219 
220   // Accelerates to the first likely occurrence of the prefix.
221   // Returns a pointer to the first byte or NULL if not found.
PrefixAccel(const void * data,size_t size)222   const void* PrefixAccel(const void* data, size_t size) {
223     DCHECK(can_prefix_accel());
224     if (prefix_foldcase_) {
225       return PrefixAccel_ShiftDFA(data, size);
226     } else if (prefix_size_ != 1) {
227       return PrefixAccel_FrontAndBack(data, size);
228     } else {
229       return memchr(data, prefix_front_, size);
230     }
231   }
232 
233   // Configures prefix accel using the analysis performed during compilation.
234   void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
235 
236   // An implementation of prefix accel that uses prefix_dfa_ to perform
237   // case-insensitive search.
238   const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
239 
240   // An implementation of prefix accel that looks for prefix_front_ and
241   // prefix_back_ to return fewer false positives than memchr(3) alone.
242   const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
243 
244   // Returns string representation of program for debugging.
245   std::string Dump();
246   std::string DumpUnanchored();
247   std::string DumpByteMap();
248 
249   // Returns the set of kEmpty flags that are in effect at
250   // position p within context.
251   static uint32_t EmptyFlags(const StringPiece& context, const char* p);
252 
253   // Returns whether byte c is a word character: ASCII only.
254   // Used by the implementation of \b and \B.
255   // This is not right for Unicode, but:
256   //   - it's hard to get right in a byte-at-a-time matching world
257   //     (the DFA has only one-byte lookahead).
258   //   - even if the lookahead were possible, the Progs would be huge.
259   // This crude approximation is the same one PCRE uses.
IsWordChar(uint8_t c)260   static bool IsWordChar(uint8_t c) {
261     return ('A' <= c && c <= 'Z') ||
262            ('a' <= c && c <= 'z') ||
263            ('0' <= c && c <= '9') ||
264            c == '_';
265   }
266 
267   // Execution engines.  They all search for the regexp (run the prog)
268   // in text, which is in the larger context (used for ^ $ \b etc).
269   // Anchor and kind control the kind of search.
270   // Returns true if match found, false if not.
271   // If match found, fills match[0..nmatch-1] with submatch info.
272   // match[0] is overall match, match[1] is first set of parens, etc.
273   // If a particular submatch is not matched during the regexp match,
274   // it is set to NULL.
275   //
276   // Matching text == StringPiece(NULL, 0) is treated as any other empty
277   // string, but note that on return, it will not be possible to distinguish
278   // submatches that matched that empty string from submatches that didn't
279   // match anything.  Either way, match[i] == NULL.
280 
281   // Search using NFA: can find submatches but kind of slow.
282   bool SearchNFA(const StringPiece& text, const StringPiece& context,
283                  Anchor anchor, MatchKind kind,
284                  StringPiece* match, int nmatch);
285 
286   // Search using DFA: much faster than NFA but only finds
287   // end of match and can use a lot more memory.
288   // Returns whether a match was found.
289   // If the DFA runs out of memory, sets *failed to true and returns false.
290   // If matches != NULL and kind == kManyMatch and there is a match,
291   // SearchDFA fills matches with the match IDs of the final matching state.
292   bool SearchDFA(const StringPiece& text, const StringPiece& context,
293                  Anchor anchor, MatchKind kind, StringPiece* match0,
294                  bool* failed, SparseSet* matches);
295 
296   // The callback issued after building each DFA state with BuildEntireDFA().
297   // If next is null, then the memory budget has been exhausted and building
298   // will halt. Otherwise, the state has been built and next points to an array
299   // of bytemap_range()+1 slots holding the next states as per the bytemap and
300   // kByteEndText. The number of the state is implied by the callback sequence:
301   // the first callback is for state 0, the second callback is for state 1, ...
302   // match indicates whether the state is a matching state.
303   using DFAStateCallback = std::function<void(const int* next, bool match)>;
304 
305   // Build the entire DFA for the given match kind.
306   // Usually the DFA is built out incrementally, as needed, which
307   // avoids lots of unnecessary work.
308   // If cb is not empty, it receives one callback per state built.
309   // Returns the number of states built.
310   // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
311   int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
312 
313   // Compute bytemap.
314   void ComputeByteMap();
315 
316   // Run peep-hole optimizer on program.
317   void Optimize();
318 
319   // One-pass NFA: only correct if IsOnePass() is true,
320   // but much faster than NFA (competitive with PCRE)
321   // for those expressions.
322   bool IsOnePass();
323   bool SearchOnePass(const StringPiece& text, const StringPiece& context,
324                      Anchor anchor, MatchKind kind,
325                      StringPiece* match, int nmatch);
326 
327   // Bit-state backtracking.  Fast on small cases but uses memory
328   // proportional to the product of the list count and the text size.
CanBitState()329   bool CanBitState() { return list_heads_.data() != NULL; }
330   bool SearchBitState(const StringPiece& text, const StringPiece& context,
331                       Anchor anchor, MatchKind kind,
332                       StringPiece* match, int nmatch);
333 
334   static const int kMaxOnePassCapture = 5;  // $0 through $4
335 
336   // Backtracking search: the gold standard against which the other
337   // implementations are checked.  FOR TESTING ONLY.
338   // It allocates a ton of memory to avoid running forever.
339   // It is also recursive, so can't use in production (will overflow stacks).
340   // The name "Unsafe" here is supposed to be a flag that
341   // you should not be using this function.
342   bool UnsafeSearchBacktrack(const StringPiece& text,
343                              const StringPiece& context,
344                              Anchor anchor, MatchKind kind,
345                              StringPiece* match, int nmatch);
346 
347   // Computes range for any strings matching regexp. The min and max can in
348   // some cases be arbitrarily precise, so the caller gets to specify the
349   // maximum desired length of string returned.
350   //
351   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
352   // string s that is an anchored match for this regexp satisfies
353   //   min <= s && s <= max.
354   //
355   // Note that PossibleMatchRange() will only consider the first copy of an
356   // infinitely repeated element (i.e., any regexp element followed by a '*' or
357   // '+' operator). Regexps with "{N}" constructions are not affected, as those
358   // do not compile down to infinite repetitions.
359   //
360   // Returns true on success, false on error.
361   bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
362 
363   // EXPERIMENTAL! SUBJECT TO CHANGE!
364   // Outputs the program fanout into the given sparse array.
365   void Fanout(SparseArray<int>* fanout);
366 
367   // Compiles a collection of regexps to Prog.  Each regexp will have
368   // its own Match instruction recording the index in the output vector.
369   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
370 
371   // Flattens the Prog from "tree" form to "list" form. This is an in-place
372   // operation in the sense that the old instructions are lost.
373   void Flatten();
374 
375   // Walks the Prog; the "successor roots" or predecessors of the reachable
376   // instructions are marked in rootmap or predmap/predvec, respectively.
377   // reachable and stk are preallocated scratch structures.
378   void MarkSuccessors(SparseArray<int>* rootmap,
379                       SparseArray<int>* predmap,
380                       std::vector<std::vector<int>>* predvec,
381                       SparseSet* reachable, std::vector<int>* stk);
382 
383   // Walks the Prog from the given "root" instruction; the "dominator root"
384   // of the reachable instructions (if such exists) is marked in rootmap.
385   // reachable and stk are preallocated scratch structures.
386   void MarkDominator(int root, SparseArray<int>* rootmap,
387                      SparseArray<int>* predmap,
388                      std::vector<std::vector<int>>* predvec,
389                      SparseSet* reachable, std::vector<int>* stk);
390 
391   // Walks the Prog from the given "root" instruction; the reachable
392   // instructions are emitted in "list" form and appended to flat.
393   // reachable and stk are preallocated scratch structures.
394   void EmitList(int root, SparseArray<int>* rootmap,
395                 std::vector<Inst>* flat,
396                 SparseSet* reachable, std::vector<int>* stk);
397 
398   // Computes hints for ByteRange instructions in [begin, end).
399   void ComputeHints(std::vector<Inst>* flat, int begin, int end);
400 
401   // Controls whether the DFA should bail out early if the NFA would be faster.
402   // FOR TESTING ONLY.
403   static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
404 
405  private:
406   friend class Compiler;
407 
408   DFA* GetDFA(MatchKind kind);
409   void DeleteDFA(DFA* dfa);
410 
411   bool anchor_start_;       // regexp has explicit start anchor
412   bool anchor_end_;         // regexp has explicit end anchor
413   bool reversed_;           // whether program runs backward over input
414   bool did_flatten_;        // has Flatten been called?
415   bool did_onepass_;        // has IsOnePass been called?
416 
417   int start_;               // entry point for program
418   int start_unanchored_;    // unanchored entry point for program
419   int size_;                // number of instructions
420   int bytemap_range_;       // bytemap_[x] < bytemap_range_
421 
422   bool prefix_foldcase_;    // whether prefix is case-insensitive
423   size_t prefix_size_;      // size of prefix (0 if no prefix)
424   union {
425     uint64_t* prefix_dfa_;  // "Shift DFA" for prefix
426     struct {
427       int prefix_front_;    // first byte of prefix
428       int prefix_back_;     // last byte of prefix
429     };
430   };
431 
432   int list_count_;                 // count of lists (see above)
433   int inst_count_[kNumInst];       // count of instructions by opcode
434   PODArray<uint16_t> list_heads_;  // sparse array enumerating list heads
435                                    // not populated if size_ is overly large
436 
437   PODArray<Inst> inst_;              // pointer to instruction array
438   PODArray<uint8_t> onepass_nodes_;  // data for OnePass nodes
439 
440   int64_t dfa_mem_;         // Maximum memory for DFAs.
441   DFA* dfa_first_;          // DFA cached for kFirstMatch/kManyMatch
442   DFA* dfa_longest_;        // DFA cached for kLongestMatch/kFullMatch
443 
444   uint8_t bytemap_[256];    // map from input bytes to byte classes
445 
446   std::once_flag dfa_first_once_;
447   std::once_flag dfa_longest_once_;
448 
449   Prog(const Prog&) = delete;
450   Prog& operator=(const Prog&) = delete;
451 };
452 
453 }  // namespace re2
454 
455 #endif  // RE2_PROG_H_
456