1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_REGEXP_REGEXP_COMPILER_H_
6 #define V8_REGEXP_REGEXP_COMPILER_H_
7
8 #include <bitset>
9
10 #include "src/base/small-vector.h"
11 #include "src/regexp/regexp-nodes.h"
12
13 namespace v8 {
14 namespace internal {
15
16 class DynamicBitSet;
17 class Isolate;
18
19 namespace regexp_compiler_constants {
20
21 // The '2' variant is has inclusive from and exclusive to.
22 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
23 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
24 constexpr uc32 kRangeEndMarker = 0x110000;
25 constexpr int kSpaceRanges[] = {
26 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
27 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
28 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
29 constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
30
31 constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_',
32 '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
33 constexpr int kWordRangeCount = arraysize(kWordRanges);
34 constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
35 constexpr int kDigitRangeCount = arraysize(kDigitRanges);
36 constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
37 kLeadSurrogateStart + 1, kRangeEndMarker};
38 constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
39 constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
40 0x2028, 0x202A, kRangeEndMarker};
41 constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
42
43 // More makes code generation slower, less makes V8 benchmark score lower.
44 constexpr int kMaxLookaheadForBoyerMoore = 8;
45 // In a 3-character pattern you can maximally step forwards 3 characters
46 // at a time, which is not always enough to pay for the extra logic.
47 constexpr int kPatternTooShortForBoyerMoore = 2;
48
49 } // namespace regexp_compiler_constants
50
IgnoreCase(JSRegExp::Flags flags)51 inline bool IgnoreCase(JSRegExp::Flags flags) {
52 return (flags & JSRegExp::kIgnoreCase) != 0;
53 }
54
IsUnicode(JSRegExp::Flags flags)55 inline bool IsUnicode(JSRegExp::Flags flags) {
56 return (flags & JSRegExp::kUnicode) != 0;
57 }
58
IsSticky(JSRegExp::Flags flags)59 inline bool IsSticky(JSRegExp::Flags flags) {
60 return (flags & JSRegExp::kSticky) != 0;
61 }
62
IsGlobal(JSRegExp::Flags flags)63 inline bool IsGlobal(JSRegExp::Flags flags) {
64 return (flags & JSRegExp::kGlobal) != 0;
65 }
66
DotAll(JSRegExp::Flags flags)67 inline bool DotAll(JSRegExp::Flags flags) {
68 return (flags & JSRegExp::kDotAll) != 0;
69 }
70
Multiline(JSRegExp::Flags flags)71 inline bool Multiline(JSRegExp::Flags flags) {
72 return (flags & JSRegExp::kMultiline) != 0;
73 }
74
NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags)75 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
76 // Both unicode and ignore_case flags are set. We need to use ICU to find
77 // the closure over case equivalents.
78 return IsUnicode(flags) && IgnoreCase(flags);
79 }
80
81 // Details of a quick mask-compare check that can look ahead in the
82 // input stream.
83 class QuickCheckDetails {
84 public:
QuickCheckDetails()85 QuickCheckDetails()
86 : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
QuickCheckDetails(int characters)87 explicit QuickCheckDetails(int characters)
88 : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
89 bool Rationalize(bool one_byte);
90 // Merge in the information from another branch of an alternation.
91 void Merge(QuickCheckDetails* other, int from_index);
92 // Advance the current position by some amount.
93 void Advance(int by, bool one_byte);
94 void Clear();
cannot_match()95 bool cannot_match() { return cannot_match_; }
set_cannot_match()96 void set_cannot_match() { cannot_match_ = true; }
97 struct Position {
PositionPosition98 Position() : mask(0), value(0), determines_perfectly(false) {}
99 uc16 mask;
100 uc16 value;
101 bool determines_perfectly;
102 };
characters()103 int characters() { return characters_; }
set_characters(int characters)104 void set_characters(int characters) { characters_ = characters; }
positions(int index)105 Position* positions(int index) {
106 DCHECK_LE(0, index);
107 DCHECK_GT(characters_, index);
108 return positions_ + index;
109 }
mask()110 uint32_t mask() { return mask_; }
value()111 uint32_t value() { return value_; }
112
113 private:
114 // How many characters do we have quick check information from. This is
115 // the same for all branches of a choice node.
116 int characters_;
117 Position positions_[4];
118 // These values are the condensate of the above array after Rationalize().
119 uint32_t mask_;
120 uint32_t value_;
121 // If set to true, there is no way this quick check can match at all.
122 // E.g., if it requires to be at the start of the input, and isn't.
123 bool cannot_match_;
124 };
125
126 // Improve the speed that we scan for an initial point where a non-anchored
127 // regexp can match by using a Boyer-Moore-like table. This is done by
128 // identifying non-greedy non-capturing loops in the nodes that eat any
129 // character one at a time. For example in the middle of the regexp
130 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
131 // inserted at the start of any non-anchored regexp.
132 //
133 // When we have found such a loop we look ahead in the nodes to find the set of
134 // characters that can come at given distances. For example for the regexp
135 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
136 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
137 // the lookahead info where the set of characters is reasonably constrained. In
138 // our example this is from index 1 to 2 (0 is not constrained). We can now
139 // look 3 characters ahead and if we don't find one of [f, o] (the union of
140 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
141 //
142 // For Unicode input strings we do the same, but modulo 128.
143 //
144 // We also look at the first string fed to the regexp and use that to get a hint
145 // of the character frequencies in the inputs. This affects the assessment of
146 // whether the set of characters is 'reasonably constrained'.
147 //
148 // We also have another lookahead mechanism (called quick check in the code),
149 // which uses a wide load of multiple characters followed by a mask and compare
150 // to determine whether a match is possible at this point.
151 enum ContainedInLattice {
152 kNotYet = 0,
153 kLatticeIn = 1,
154 kLatticeOut = 2,
155 kLatticeUnknown = 3 // Can also mean both in and out.
156 };
157
Combine(ContainedInLattice a,ContainedInLattice b)158 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
159 return static_cast<ContainedInLattice>(a | b);
160 }
161
162 class BoyerMoorePositionInfo : public ZoneObject {
163 public:
at(int i)164 bool at(int i) const { return map_[i]; }
165
166 static constexpr int kMapSize = 128;
167 static constexpr int kMask = kMapSize - 1;
168
map_count()169 int map_count() const { return map_count_; }
170
171 void Set(int character);
172 void SetInterval(const Interval& interval);
173 void SetAll();
174
is_non_word()175 bool is_non_word() { return w_ == kLatticeOut; }
is_word()176 bool is_word() { return w_ == kLatticeIn; }
177
178 using Bitset = std::bitset<kMapSize>;
raw_bitset()179 Bitset raw_bitset() const { return map_; }
180
181 private:
182 Bitset map_;
183 int map_count_ = 0; // Number of set bits in the map.
184 ContainedInLattice w_ = kNotYet; // The \w character class.
185 };
186
187 class BoyerMooreLookahead : public ZoneObject {
188 public:
189 BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
190
length()191 int length() { return length_; }
max_char()192 int max_char() { return max_char_; }
compiler()193 RegExpCompiler* compiler() { return compiler_; }
194
Count(int map_number)195 int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
196
at(int i)197 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
198
Set(int map_number,int character)199 void Set(int map_number, int character) {
200 if (character > max_char_) return;
201 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
202 info->Set(character);
203 }
204
SetInterval(int map_number,const Interval & interval)205 void SetInterval(int map_number, const Interval& interval) {
206 if (interval.from() > max_char_) return;
207 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
208 if (interval.to() > max_char_) {
209 info->SetInterval(Interval(interval.from(), max_char_));
210 } else {
211 info->SetInterval(interval);
212 }
213 }
214
SetAll(int map_number)215 void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
216
SetRest(int from_map)217 void SetRest(int from_map) {
218 for (int i = from_map; i < length_; i++) SetAll(i);
219 }
220 void EmitSkipInstructions(RegExpMacroAssembler* masm);
221
222 private:
223 // This is the value obtained by EatsAtLeast. If we do not have at least this
224 // many characters left in the sample string then the match is bound to fail.
225 // Therefore it is OK to read a character this far ahead of the current match
226 // point.
227 int length_;
228 RegExpCompiler* compiler_;
229 // 0xff for Latin1, 0xffff for UTF-16.
230 int max_char_;
231 ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
232
233 int GetSkipTable(int min_lookahead, int max_lookahead,
234 Handle<ByteArray> boolean_skip_table);
235 bool FindWorthwhileInterval(int* from, int* to);
236 int FindBestInterval(int max_number_of_chars, int old_biggest_points,
237 int* from, int* to);
238 };
239
240 // There are many ways to generate code for a node. This class encapsulates
241 // the current way we should be generating. In other words it encapsulates
242 // the current state of the code generator. The effect of this is that we
243 // generate code for paths that the matcher can take through the regular
244 // expression. A given node in the regexp can be code-generated several times
245 // as it can be part of several traces. For example for the regexp:
246 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
247 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
248 // to match foo is generated only once (the traces have a common prefix). The
249 // code to store the capture is deferred and generated (twice) after the places
250 // where baz has been matched.
251 class Trace {
252 public:
253 // A value for a property that is either known to be true, know to be false,
254 // or not known.
255 enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
256
257 class DeferredAction {
258 public:
DeferredAction(ActionNode::ActionType action_type,int reg)259 DeferredAction(ActionNode::ActionType action_type, int reg)
260 : action_type_(action_type), reg_(reg), next_(nullptr) {}
next()261 DeferredAction* next() { return next_; }
262 bool Mentions(int reg);
reg()263 int reg() { return reg_; }
action_type()264 ActionNode::ActionType action_type() { return action_type_; }
265
266 private:
267 ActionNode::ActionType action_type_;
268 int reg_;
269 DeferredAction* next_;
270 friend class Trace;
271 };
272
273 class DeferredCapture : public DeferredAction {
274 public:
DeferredCapture(int reg,bool is_capture,Trace * trace)275 DeferredCapture(int reg, bool is_capture, Trace* trace)
276 : DeferredAction(ActionNode::STORE_POSITION, reg),
277 cp_offset_(trace->cp_offset()),
278 is_capture_(is_capture) {}
cp_offset()279 int cp_offset() { return cp_offset_; }
is_capture()280 bool is_capture() { return is_capture_; }
281
282 private:
283 int cp_offset_;
284 bool is_capture_;
set_cp_offset(int cp_offset)285 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
286 };
287
288 class DeferredSetRegisterForLoop : public DeferredAction {
289 public:
DeferredSetRegisterForLoop(int reg,int value)290 DeferredSetRegisterForLoop(int reg, int value)
291 : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
292 value_(value) {}
value()293 int value() { return value_; }
294
295 private:
296 int value_;
297 };
298
299 class DeferredClearCaptures : public DeferredAction {
300 public:
DeferredClearCaptures(Interval range)301 explicit DeferredClearCaptures(Interval range)
302 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
range()303 Interval range() { return range_; }
304
305 private:
306 Interval range_;
307 };
308
309 class DeferredIncrementRegister : public DeferredAction {
310 public:
DeferredIncrementRegister(int reg)311 explicit DeferredIncrementRegister(int reg)
312 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
313 };
314
Trace()315 Trace()
316 : cp_offset_(0),
317 actions_(nullptr),
318 backtrack_(nullptr),
319 stop_node_(nullptr),
320 loop_label_(nullptr),
321 characters_preloaded_(0),
322 bound_checked_up_to_(0),
323 flush_budget_(100),
324 at_start_(UNKNOWN) {}
325
326 // End the trace. This involves flushing the deferred actions in the trace
327 // and pushing a backtrack location onto the backtrack stack. Once this is
328 // done we can start a new trace or go to one that has already been
329 // generated.
330 void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()331 int cp_offset() { return cp_offset_; }
actions()332 DeferredAction* actions() { return actions_; }
333 // A trivial trace is one that has no deferred actions or other state that
334 // affects the assumptions used when generating code. There is no recorded
335 // backtrack location in a trivial trace, so with a trivial trace we will
336 // generate code that, on a failure to match, gets the backtrack location
337 // from the backtrack stack rather than using a direct jump instruction. We
338 // always start code generation with a trivial trace and non-trivial traces
339 // are created as we emit code for nodes or add to the list of deferred
340 // actions in the trace. The location of the code generated for a node using
341 // a trivial trace is recorded in a label in the node so that gotos can be
342 // generated to that code.
is_trivial()343 bool is_trivial() {
344 return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
345 characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
346 quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
347 }
at_start()348 TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)349 void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()350 Label* backtrack() { return backtrack_; }
loop_label()351 Label* loop_label() { return loop_label_; }
stop_node()352 RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()353 int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()354 int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()355 int flush_budget() { return flush_budget_; }
quick_check_performed()356 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
357 bool mentions_reg(int reg);
358 // Returns true if a deferred position store exists to the specified
359 // register and stores the offset in the out-parameter. Otherwise
360 // returns false.
361 bool GetStoredPosition(int reg, int* cp_offset);
362 // These set methods and AdvanceCurrentPositionInTrace should be used only on
363 // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction * new_action)364 void add_action(DeferredAction* new_action) {
365 DCHECK(new_action->next_ == nullptr);
366 new_action->next_ = actions_;
367 actions_ = new_action;
368 }
set_backtrack(Label * backtrack)369 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode * node)370 void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label * label)371 void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)372 void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)373 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)374 void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails * d)375 void set_quick_check_performed(QuickCheckDetails* d) {
376 quick_check_performed_ = *d;
377 }
378 void InvalidateCurrentCharacter();
379 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
380
381 private:
382 int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
383 void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
384 const DynamicBitSet& affected_registers,
385 DynamicBitSet* registers_to_pop,
386 DynamicBitSet* registers_to_clear, Zone* zone);
387 void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
388 const DynamicBitSet& registers_to_pop,
389 const DynamicBitSet& registers_to_clear);
390 int cp_offset_;
391 DeferredAction* actions_;
392 Label* backtrack_;
393 RegExpNode* stop_node_;
394 Label* loop_label_;
395 int characters_preloaded_;
396 int bound_checked_up_to_;
397 QuickCheckDetails quick_check_performed_;
398 int flush_budget_;
399 TriBool at_start_;
400 };
401
402 class GreedyLoopState {
403 public:
404 explicit GreedyLoopState(bool not_at_start);
405
label()406 Label* label() { return &label_; }
counter_backtrack_trace()407 Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
408
409 private:
410 Label label_;
411 Trace counter_backtrack_trace_;
412 };
413
414 struct PreloadState {
415 static const int kEatsAtLeastNotYetInitialized = -1;
416 bool preload_is_current_;
417 bool preload_has_checked_bounds_;
418 int preload_characters_;
419 int eats_at_least_;
initPreloadState420 void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
421 };
422
423 // Analysis performs assertion propagation and computes eats_at_least_ values.
424 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
425 // details.
426 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
427
428 class FrequencyCollator {
429 public:
FrequencyCollator()430 FrequencyCollator() : total_samples_(0) {
431 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
432 frequencies_[i] = CharacterFrequency(i);
433 }
434 }
435
CountCharacter(int character)436 void CountCharacter(int character) {
437 int index = (character & RegExpMacroAssembler::kTableMask);
438 frequencies_[index].Increment();
439 total_samples_++;
440 }
441
442 // Does not measure in percent, but rather per-128 (the table size from the
443 // regexp macro assembler).
Frequency(int in_character)444 int Frequency(int in_character) {
445 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
446 if (total_samples_ < 1) return 1; // Division by zero.
447 int freq_in_per128 =
448 (frequencies_[in_character].counter() * 128) / total_samples_;
449 return freq_in_per128;
450 }
451
452 private:
453 class CharacterFrequency {
454 public:
CharacterFrequency()455 CharacterFrequency() : counter_(0), character_(-1) {}
CharacterFrequency(int character)456 explicit CharacterFrequency(int character)
457 : counter_(0), character_(character) {}
458
Increment()459 void Increment() { counter_++; }
counter()460 int counter() { return counter_; }
character()461 int character() { return character_; }
462
463 private:
464 int counter_;
465 int character_;
466 };
467
468 private:
469 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
470 int total_samples_;
471 };
472
473 class RegExpCompiler {
474 public:
475 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
476 bool is_one_byte);
477
AllocateRegister()478 int AllocateRegister() {
479 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
480 reg_exp_too_big_ = true;
481 return next_register_;
482 }
483 return next_register_++;
484 }
485
486 // Lookarounds to match lone surrogates for unicode character class matches
487 // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()488 int UnicodeLookaroundStackRegister() {
489 if (unicode_lookaround_stack_register_ == kNoRegister) {
490 unicode_lookaround_stack_register_ = AllocateRegister();
491 }
492 return unicode_lookaround_stack_register_;
493 }
494
UnicodeLookaroundPositionRegister()495 int UnicodeLookaroundPositionRegister() {
496 if (unicode_lookaround_position_register_ == kNoRegister) {
497 unicode_lookaround_position_register_ = AllocateRegister();
498 }
499 return unicode_lookaround_position_register_;
500 }
501
502 struct CompilationResult final {
CompilationResultfinal503 explicit CompilationResult(RegExpError err) : error(err) {}
CompilationResultfinal504 CompilationResult(Object code, int registers)
505 : code(code), num_registers(registers) {}
506
RegExpTooBigfinal507 static CompilationResult RegExpTooBig() {
508 return CompilationResult(RegExpError::kTooLarge);
509 }
510
Succeededfinal511 bool Succeeded() const { return error == RegExpError::kNone; }
512
513 const RegExpError error = RegExpError::kNone;
514 Object code;
515 int num_registers = 0;
516 };
517
518 CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
519 RegExpNode* start, int capture_count,
520 Handle<String> pattern);
521
522 // If the regexp matching starts within a surrogate pair, step back to the
523 // lead surrogate and start matching from there.
524 static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
525 RegExpNode* on_success,
526 JSRegExp::Flags flags);
527
AddWork(RegExpNode * node)528 inline void AddWork(RegExpNode* node) {
529 if (!node->on_work_list() && !node->label()->is_bound()) {
530 node->set_on_work_list(true);
531 work_list_->push_back(node);
532 }
533 }
534
535 static const int kImplementationOffset = 0;
536 static const int kNumberOfRegistersOffset = 0;
537 static const int kCodeOffset = 1;
538
macro_assembler()539 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()540 EndNode* accept() { return accept_; }
541
542 static const int kMaxRecursion = 100;
recursion_depth()543 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()544 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()545 inline void DecrementRecursionDepth() { recursion_depth_--; }
546
SetRegExpTooBig()547 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
548
one_byte()549 inline bool one_byte() { return one_byte_; }
optimize()550 inline bool optimize() { return optimize_; }
set_optimize(bool value)551 inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()552 inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)553 inline void set_limiting_recursion(bool value) {
554 limiting_recursion_ = value;
555 }
read_backward()556 bool read_backward() { return read_backward_; }
set_read_backward(bool value)557 void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()558 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
559
current_expansion_factor()560 int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)561 void set_current_expansion_factor(int value) {
562 current_expansion_factor_ = value;
563 }
564
isolate()565 Isolate* isolate() const { return isolate_; }
zone()566 Zone* zone() const { return zone_; }
567
568 static const int kNoRegister = -1;
569
570 private:
571 EndNode* accept_;
572 int next_register_;
573 int unicode_lookaround_stack_register_;
574 int unicode_lookaround_position_register_;
575 ZoneVector<RegExpNode*>* work_list_;
576 int recursion_depth_;
577 RegExpMacroAssembler* macro_assembler_;
578 bool one_byte_;
579 bool reg_exp_too_big_;
580 bool limiting_recursion_;
581 bool optimize_;
582 bool read_backward_;
583 int current_expansion_factor_;
584 FrequencyCollator frequency_collator_;
585 Isolate* isolate_;
586 Zone* zone_;
587 };
588
589 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
590 class UnicodeRangeSplitter {
591 public:
592 V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
593
594 static constexpr int kInitialSize = 8;
595 using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
596
bmp()597 const CharacterRangeVector* bmp() const { return &bmp_; }
lead_surrogates()598 const CharacterRangeVector* lead_surrogates() const {
599 return &lead_surrogates_;
600 }
trail_surrogates()601 const CharacterRangeVector* trail_surrogates() const {
602 return &trail_surrogates_;
603 }
non_bmp()604 const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
605
606 private:
607 void AddRange(CharacterRange range);
608
609 CharacterRangeVector bmp_;
610 CharacterRangeVector lead_surrogates_;
611 CharacterRangeVector trail_surrogates_;
612 CharacterRangeVector non_bmp_;
613 };
614
615 // We need to check for the following characters: 0x39C 0x3BC 0x178.
616 // TODO(jgruber): Move to CharacterRange.
617 bool RangeContainsLatin1Equivalents(CharacterRange range);
618
619 } // namespace internal
620 } // namespace v8
621
622 #endif // V8_REGEXP_REGEXP_COMPILER_H_
623