1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_COMPILER_H_
6 #define V8_REGEXP_REGEXP_COMPILER_H_
7 
8 #include <bitset>
9 
10 #include "irregexp/imported/regexp-nodes.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 class DynamicBitSet;
16 class Isolate;
17 
18 namespace regexp_compiler_constants {
19 
20 // The '2' variant is has inclusive from and exclusive to.
21 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
22 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
23 constexpr uc32 kRangeEndMarker = 0x110000;
24 constexpr int kSpaceRanges[] = {
25     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
26     0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
27     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
28 constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
29 
30 constexpr int kWordRanges[] = {'0',     '9' + 1, 'A',     'Z' + 1,        '_',
31                                '_' + 1, 'a',     'z' + 1, kRangeEndMarker};
32 constexpr int kWordRangeCount = arraysize(kWordRanges);
33 constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
34 constexpr int kDigitRangeCount = arraysize(kDigitRanges);
35 constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
36                                     kLeadSurrogateStart + 1, kRangeEndMarker};
37 constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
38 constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D,         0x000E,
39                                          0x2028, 0x202A, kRangeEndMarker};
40 constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
41 
42 // More makes code generation slower, less makes V8 benchmark score lower.
43 constexpr int kMaxLookaheadForBoyerMoore = 8;
44 // In a 3-character pattern you can maximally step forwards 3 characters
45 // at a time, which is not always enough to pay for the extra logic.
46 constexpr int kPatternTooShortForBoyerMoore = 2;
47 
48 }  // namespace regexp_compiler_constants
49 
IgnoreCase(JSRegExp::Flags flags)50 inline bool IgnoreCase(JSRegExp::Flags flags) {
51   return (flags & JSRegExp::kIgnoreCase) != 0;
52 }
53 
IsUnicode(JSRegExp::Flags flags)54 inline bool IsUnicode(JSRegExp::Flags flags) {
55   return (flags & JSRegExp::kUnicode) != 0;
56 }
57 
IsSticky(JSRegExp::Flags flags)58 inline bool IsSticky(JSRegExp::Flags flags) {
59   return (flags & JSRegExp::kSticky) != 0;
60 }
61 
IsGlobal(JSRegExp::Flags flags)62 inline bool IsGlobal(JSRegExp::Flags flags) {
63   return (flags & JSRegExp::kGlobal) != 0;
64 }
65 
DotAll(JSRegExp::Flags flags)66 inline bool DotAll(JSRegExp::Flags flags) {
67   return (flags & JSRegExp::kDotAll) != 0;
68 }
69 
Multiline(JSRegExp::Flags flags)70 inline bool Multiline(JSRegExp::Flags flags) {
71   return (flags & JSRegExp::kMultiline) != 0;
72 }
73 
NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags)74 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
75   // Both unicode and ignore_case flags are set. We need to use ICU to find
76   // the closure over case equivalents.
77   return IsUnicode(flags) && IgnoreCase(flags);
78 }
79 
80 // Details of a quick mask-compare check that can look ahead in the
81 // input stream.
82 class QuickCheckDetails {
83  public:
QuickCheckDetails()84   QuickCheckDetails()
85       : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
QuickCheckDetails(int characters)86   explicit QuickCheckDetails(int characters)
87       : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
88   bool Rationalize(bool one_byte);
89   // Merge in the information from another branch of an alternation.
90   void Merge(QuickCheckDetails* other, int from_index);
91   // Advance the current position by some amount.
92   void Advance(int by, bool one_byte);
93   void Clear();
cannot_match()94   bool cannot_match() { return cannot_match_; }
set_cannot_match()95   void set_cannot_match() { cannot_match_ = true; }
96   struct Position {
PositionPosition97     Position() : mask(0), value(0), determines_perfectly(false) {}
98     uc32 mask;
99     uc32 value;
100     bool determines_perfectly;
101   };
characters()102   int characters() { return characters_; }
set_characters(int characters)103   void set_characters(int characters) { characters_ = characters; }
positions(int index)104   Position* positions(int index) {
105     DCHECK_LE(0, index);
106     DCHECK_GT(characters_, index);
107     return positions_ + index;
108   }
mask()109   uint32_t mask() { return mask_; }
value()110   uint32_t value() { return value_; }
111 
112  private:
113   // How many characters do we have quick check information from.  This is
114   // the same for all branches of a choice node.
115   int characters_;
116   Position positions_[4];
117   // These values are the condensate of the above array after Rationalize().
118   uint32_t mask_;
119   uint32_t value_;
120   // If set to true, there is no way this quick check can match at all.
121   // E.g., if it requires to be at the start of the input, and isn't.
122   bool cannot_match_;
123 };
124 
125 // Improve the speed that we scan for an initial point where a non-anchored
126 // regexp can match by using a Boyer-Moore-like table. This is done by
127 // identifying non-greedy non-capturing loops in the nodes that eat any
128 // character one at a time.  For example in the middle of the regexp
129 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
130 // inserted at the start of any non-anchored regexp.
131 //
132 // When we have found such a loop we look ahead in the nodes to find the set of
133 // characters that can come at given distances. For example for the regexp
134 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
135 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
136 // the lookahead info where the set of characters is reasonably constrained. In
137 // our example this is from index 1 to 2 (0 is not constrained). We can now
138 // look 3 characters ahead and if we don't find one of [f, o] (the union of
139 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
140 //
141 // For Unicode input strings we do the same, but modulo 128.
142 //
143 // We also look at the first string fed to the regexp and use that to get a hint
144 // of the character frequencies in the inputs. This affects the assessment of
145 // whether the set of characters is 'reasonably constrained'.
146 //
147 // We also have another lookahead mechanism (called quick check in the code),
148 // which uses a wide load of multiple characters followed by a mask and compare
149 // to determine whether a match is possible at this point.
150 enum ContainedInLattice {
151   kNotYet = 0,
152   kLatticeIn = 1,
153   kLatticeOut = 2,
154   kLatticeUnknown = 3  // Can also mean both in and out.
155 };
156 
Combine(ContainedInLattice a,ContainedInLattice b)157 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
158   return static_cast<ContainedInLattice>(a | b);
159 }
160 
161 class BoyerMoorePositionInfo : public ZoneObject {
162  public:
at(int i)163   bool at(int i) const { return map_[i]; }
164 
165   static constexpr int kMapSize = 128;
166   static constexpr int kMask = kMapSize - 1;
167 
map_count()168   int map_count() const { return map_count_; }
169 
170   void Set(int character);
171   void SetInterval(const Interval& interval);
172   void SetAll();
173 
is_non_word()174   bool is_non_word() { return w_ == kLatticeOut; }
is_word()175   bool is_word() { return w_ == kLatticeIn; }
176 
177   using Bitset = std::bitset<kMapSize>;
raw_bitset()178   Bitset raw_bitset() const { return map_; }
179 
180  private:
181   Bitset map_;
182   int map_count_ = 0;               // Number of set bits in the map.
183   ContainedInLattice w_ = kNotYet;  // The \w character class.
184 };
185 
186 class BoyerMooreLookahead : public ZoneObject {
187  public:
188   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
189 
length()190   int length() { return length_; }
max_char()191   int max_char() { return max_char_; }
compiler()192   RegExpCompiler* compiler() { return compiler_; }
193 
Count(int map_number)194   int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
195 
at(int i)196   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
197 
Set(int map_number,int character)198   void Set(int map_number, int character) {
199     if (character > max_char_) return;
200     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
201     info->Set(character);
202   }
203 
SetInterval(int map_number,const Interval & interval)204   void SetInterval(int map_number, const Interval& interval) {
205     if (interval.from() > max_char_) return;
206     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
207     if (interval.to() > max_char_) {
208       info->SetInterval(Interval(interval.from(), max_char_));
209     } else {
210       info->SetInterval(interval);
211     }
212   }
213 
SetAll(int map_number)214   void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
215 
SetRest(int from_map)216   void SetRest(int from_map) {
217     for (int i = from_map; i < length_; i++) SetAll(i);
218   }
219   void EmitSkipInstructions(RegExpMacroAssembler* masm);
220 
221  private:
222   // This is the value obtained by EatsAtLeast.  If we do not have at least this
223   // many characters left in the sample string then the match is bound to fail.
224   // Therefore it is OK to read a character this far ahead of the current match
225   // point.
226   int length_;
227   RegExpCompiler* compiler_;
228   // 0xff for Latin1, 0xffff for UTF-16.
229   int max_char_;
230   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
231 
232   int GetSkipTable(int min_lookahead, int max_lookahead,
233                    Handle<ByteArray> boolean_skip_table);
234   bool FindWorthwhileInterval(int* from, int* to);
235   int FindBestInterval(int max_number_of_chars, int old_biggest_points,
236                        int* from, int* to);
237 };
238 
239 // There are many ways to generate code for a node.  This class encapsulates
240 // the current way we should be generating.  In other words it encapsulates
241 // the current state of the code generator.  The effect of this is that we
242 // generate code for paths that the matcher can take through the regular
243 // expression.  A given node in the regexp can be code-generated several times
244 // as it can be part of several traces.  For example for the regexp:
245 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
246 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
247 // to match foo is generated only once (the traces have a common prefix).  The
248 // code to store the capture is deferred and generated (twice) after the places
249 // where baz has been matched.
250 class Trace {
251  public:
252   // A value for a property that is either known to be true, know to be false,
253   // or not known.
254   enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
255 
256   class DeferredAction {
257    public:
DeferredAction(ActionNode::ActionType action_type,int reg)258     DeferredAction(ActionNode::ActionType action_type, int reg)
259         : action_type_(action_type), reg_(reg), next_(nullptr) {}
next()260     DeferredAction* next() { return next_; }
261     bool Mentions(int reg);
reg()262     int reg() { return reg_; }
action_type()263     ActionNode::ActionType action_type() { return action_type_; }
264 
265    private:
266     ActionNode::ActionType action_type_;
267     int reg_;
268     DeferredAction* next_;
269     friend class Trace;
270   };
271 
272   class DeferredCapture : public DeferredAction {
273    public:
DeferredCapture(int reg,bool is_capture,Trace * trace)274     DeferredCapture(int reg, bool is_capture, Trace* trace)
275         : DeferredAction(ActionNode::STORE_POSITION, reg),
276           cp_offset_(trace->cp_offset()),
277           is_capture_(is_capture) {}
cp_offset()278     int cp_offset() { return cp_offset_; }
is_capture()279     bool is_capture() { return is_capture_; }
280 
281    private:
282     int cp_offset_;
283     bool is_capture_;
set_cp_offset(int cp_offset)284     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
285   };
286 
287   class DeferredSetRegisterForLoop : public DeferredAction {
288    public:
DeferredSetRegisterForLoop(int reg,int value)289     DeferredSetRegisterForLoop(int reg, int value)
290         : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
291           value_(value) {}
value()292     int value() { return value_; }
293 
294    private:
295     int value_;
296   };
297 
298   class DeferredClearCaptures : public DeferredAction {
299    public:
DeferredClearCaptures(Interval range)300     explicit DeferredClearCaptures(Interval range)
301         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
range()302     Interval range() { return range_; }
303 
304    private:
305     Interval range_;
306   };
307 
308   class DeferredIncrementRegister : public DeferredAction {
309    public:
DeferredIncrementRegister(int reg)310     explicit DeferredIncrementRegister(int reg)
311         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
312   };
313 
Trace()314   Trace()
315       : cp_offset_(0),
316         actions_(nullptr),
317         backtrack_(nullptr),
318         stop_node_(nullptr),
319         loop_label_(nullptr),
320         characters_preloaded_(0),
321         bound_checked_up_to_(0),
322         flush_budget_(100),
323         at_start_(UNKNOWN) {}
324 
325   // End the trace.  This involves flushing the deferred actions in the trace
326   // and pushing a backtrack location onto the backtrack stack.  Once this is
327   // done we can start a new trace or go to one that has already been
328   // generated.
329   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()330   int cp_offset() { return cp_offset_; }
actions()331   DeferredAction* actions() { return actions_; }
332   // A trivial trace is one that has no deferred actions or other state that
333   // affects the assumptions used when generating code.  There is no recorded
334   // backtrack location in a trivial trace, so with a trivial trace we will
335   // generate code that, on a failure to match, gets the backtrack location
336   // from the backtrack stack rather than using a direct jump instruction.  We
337   // always start code generation with a trivial trace and non-trivial traces
338   // are created as we emit code for nodes or add to the list of deferred
339   // actions in the trace.  The location of the code generated for a node using
340   // a trivial trace is recorded in a label in the node so that gotos can be
341   // generated to that code.
is_trivial()342   bool is_trivial() {
343     return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
344            characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
345            quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
346   }
at_start()347   TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)348   void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()349   Label* backtrack() { return backtrack_; }
loop_label()350   Label* loop_label() { return loop_label_; }
stop_node()351   RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()352   int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()353   int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()354   int flush_budget() { return flush_budget_; }
quick_check_performed()355   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
356   bool mentions_reg(int reg);
357   // Returns true if a deferred position store exists to the specified
358   // register and stores the offset in the out-parameter.  Otherwise
359   // returns false.
360   bool GetStoredPosition(int reg, int* cp_offset);
361   // These set methods and AdvanceCurrentPositionInTrace should be used only on
362   // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction * new_action)363   void add_action(DeferredAction* new_action) {
364     DCHECK(new_action->next_ == nullptr);
365     new_action->next_ = actions_;
366     actions_ = new_action;
367   }
set_backtrack(Label * backtrack)368   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode * node)369   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label * label)370   void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)371   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)372   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)373   void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails * d)374   void set_quick_check_performed(QuickCheckDetails* d) {
375     quick_check_performed_ = *d;
376   }
377   void InvalidateCurrentCharacter();
378   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
379 
380  private:
381   int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
382   void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
383                               const DynamicBitSet& affected_registers,
384                               DynamicBitSet* registers_to_pop,
385                               DynamicBitSet* registers_to_clear, Zone* zone);
386   void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
387                                 const DynamicBitSet& registers_to_pop,
388                                 const DynamicBitSet& registers_to_clear);
389   int cp_offset_;
390   DeferredAction* actions_;
391   Label* backtrack_;
392   RegExpNode* stop_node_;
393   Label* loop_label_;
394   int characters_preloaded_;
395   int bound_checked_up_to_;
396   QuickCheckDetails quick_check_performed_;
397   int flush_budget_;
398   TriBool at_start_;
399 };
400 
401 class GreedyLoopState {
402  public:
403   explicit GreedyLoopState(bool not_at_start);
404 
label()405   Label* label() { return &label_; }
counter_backtrack_trace()406   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
407 
408  private:
409   Label label_;
410   Trace counter_backtrack_trace_;
411 };
412 
413 struct PreloadState {
414   static const int kEatsAtLeastNotYetInitialized = -1;
415   bool preload_is_current_;
416   bool preload_has_checked_bounds_;
417   int preload_characters_;
418   int eats_at_least_;
initPreloadState419   void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
420 };
421 
422 // Analysis performs assertion propagation and computes eats_at_least_ values.
423 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
424 // details.
425 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
426 
427 class FrequencyCollator {
428  public:
FrequencyCollator()429   FrequencyCollator() : total_samples_(0) {
430     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
431       frequencies_[i] = CharacterFrequency(i);
432     }
433   }
434 
CountCharacter(int character)435   void CountCharacter(int character) {
436     int index = (character & RegExpMacroAssembler::kTableMask);
437     frequencies_[index].Increment();
438     total_samples_++;
439   }
440 
441   // Does not measure in percent, but rather per-128 (the table size from the
442   // regexp macro assembler).
Frequency(int in_character)443   int Frequency(int in_character) {
444     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
445     if (total_samples_ < 1) return 1;  // Division by zero.
446     int freq_in_per128 =
447         (frequencies_[in_character].counter() * 128) / total_samples_;
448     return freq_in_per128;
449   }
450 
451  private:
452   class CharacterFrequency {
453    public:
CharacterFrequency()454     CharacterFrequency() : counter_(0), character_(-1) {}
CharacterFrequency(int character)455     explicit CharacterFrequency(int character)
456         : counter_(0), character_(character) {}
457 
Increment()458     void Increment() { counter_++; }
counter()459     int counter() { return counter_; }
character()460     int character() { return character_; }
461 
462    private:
463     int counter_;
464     int character_;
465   };
466 
467  private:
468   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
469   int total_samples_;
470 };
471 
472 class RegExpCompiler {
473  public:
474   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
475                  bool is_one_byte);
476 
AllocateRegister()477   int AllocateRegister() {
478     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
479       reg_exp_too_big_ = true;
480       return next_register_;
481     }
482     return next_register_++;
483   }
484 
485   // Lookarounds to match lone surrogates for unicode character class matches
486   // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()487   int UnicodeLookaroundStackRegister() {
488     if (unicode_lookaround_stack_register_ == kNoRegister) {
489       unicode_lookaround_stack_register_ = AllocateRegister();
490     }
491     return unicode_lookaround_stack_register_;
492   }
493 
UnicodeLookaroundPositionRegister()494   int UnicodeLookaroundPositionRegister() {
495     if (unicode_lookaround_position_register_ == kNoRegister) {
496       unicode_lookaround_position_register_ = AllocateRegister();
497     }
498     return unicode_lookaround_position_register_;
499   }
500 
501   struct CompilationResult final {
CompilationResultfinal502     explicit CompilationResult(RegExpError err) : error(err) {}
CompilationResultfinal503     CompilationResult(Handle<Object> code, int registers)
504         : code(code), num_registers(registers) {}
505 
RegExpTooBigfinal506     static CompilationResult RegExpTooBig() {
507       return CompilationResult(RegExpError::kTooLarge);
508     }
509 
Succeededfinal510     bool Succeeded() const { return error == RegExpError::kNone; }
511 
512     const RegExpError error = RegExpError::kNone;
513     Handle<Object> code;
514     int num_registers = 0;
515   };
516 
517   CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
518                              RegExpNode* start, int capture_count,
519                              Handle<String> pattern);
520 
521   // Preprocessing is the final step of node creation before analysis
522   // and assembly. It includes:
523   // - Wrapping the body of the regexp in capture 0.
524   // - Inserting the implicit .* before/after the regexp if necessary.
525   // - If the input is a one-byte string, filtering out nodes that can't match.
526   // - Fixing up regexp matches that start within a surrogate pair.
527   RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags,
528                                bool is_one_byte);
529 
530   // If the regexp matching starts within a surrogate pair, step back to the
531   // lead surrogate and start matching from there.
532   RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success,
533                                                 JSRegExp::Flags flags);
534 
AddWork(RegExpNode * node)535   inline void AddWork(RegExpNode* node) {
536     if (!node->on_work_list() && !node->label()->is_bound()) {
537       node->set_on_work_list(true);
538       work_list_->push_back(node);
539     }
540   }
541 
542   static const int kImplementationOffset = 0;
543   static const int kNumberOfRegistersOffset = 0;
544   static const int kCodeOffset = 1;
545 
macro_assembler()546   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()547   EndNode* accept() { return accept_; }
548 
549   static const int kMaxRecursion = 100;
recursion_depth()550   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()551   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()552   inline void DecrementRecursionDepth() { recursion_depth_--; }
553 
SetRegExpTooBig()554   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
555 
one_byte()556   inline bool one_byte() { return one_byte_; }
optimize()557   inline bool optimize() { return optimize_; }
set_optimize(bool value)558   inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()559   inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)560   inline void set_limiting_recursion(bool value) {
561     limiting_recursion_ = value;
562   }
read_backward()563   bool read_backward() { return read_backward_; }
set_read_backward(bool value)564   void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()565   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
566 
current_expansion_factor()567   int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)568   void set_current_expansion_factor(int value) {
569     current_expansion_factor_ = value;
570   }
571 
isolate()572   Isolate* isolate() const { return isolate_; }
zone()573   Zone* zone() const { return zone_; }
574 
575   static const int kNoRegister = -1;
576 
577  private:
578   EndNode* accept_;
579   int next_register_;
580   int unicode_lookaround_stack_register_;
581   int unicode_lookaround_position_register_;
582   ZoneVector<RegExpNode*>* work_list_;
583   int recursion_depth_;
584   RegExpMacroAssembler* macro_assembler_;
585   bool one_byte_;
586   bool reg_exp_too_big_;
587   bool limiting_recursion_;
588   bool optimize_;
589   bool read_backward_;
590   int current_expansion_factor_;
591   FrequencyCollator frequency_collator_;
592   Isolate* isolate_;
593   Zone* zone_;
594 };
595 
596 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
597 class UnicodeRangeSplitter {
598  public:
599   V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
600 
601   static constexpr int kInitialSize = 8;
602   using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
603 
bmp()604   const CharacterRangeVector* bmp() const { return &bmp_; }
lead_surrogates()605   const CharacterRangeVector* lead_surrogates() const {
606     return &lead_surrogates_;
607   }
trail_surrogates()608   const CharacterRangeVector* trail_surrogates() const {
609     return &trail_surrogates_;
610   }
non_bmp()611   const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
612 
613  private:
614   void AddRange(CharacterRange range);
615 
616   CharacterRangeVector bmp_;
617   CharacterRangeVector lead_surrogates_;
618   CharacterRangeVector trail_surrogates_;
619   CharacterRangeVector non_bmp_;
620 };
621 
622 // We need to check for the following characters: 0x39C 0x3BC 0x178.
623 // TODO(jgruber): Move to CharacterRange.
624 bool RangeContainsLatin1Equivalents(CharacterRange range);
625 
626 }  // namespace internal
627 }  // namespace v8
628 
629 #endif  // V8_REGEXP_REGEXP_COMPILER_H_
630