1 // Copyright 2019 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_NODES_H_ 6 #define V8_REGEXP_REGEXP_NODES_H_ 7 8 #include "src/codegen/label.h" 9 #include "src/regexp/regexp-macro-assembler.h" 10 #include "src/zone/zone.h" 11 12 namespace v8 { 13 namespace internal { 14 15 class AlternativeGenerationList; 16 class BoyerMooreLookahead; 17 class GreedyLoopState; 18 class NodeVisitor; 19 class QuickCheckDetails; 20 class RegExpCompiler; 21 class Trace; 22 struct PreloadState; 23 class ChoiceNode; 24 25 #define FOR_EACH_NODE_TYPE(VISIT) \ 26 VISIT(End) \ 27 VISIT(Action) \ 28 VISIT(Choice) \ 29 VISIT(LoopChoice) \ 30 VISIT(NegativeLookaroundChoice) \ 31 VISIT(BackReference) \ 32 VISIT(Assertion) \ 33 VISIT(Text) 34 35 struct NodeInfo final { NodeInfofinal36 NodeInfo() 37 : being_analyzed(false), 38 been_analyzed(false), 39 follows_word_interest(false), 40 follows_newline_interest(false), 41 follows_start_interest(false), 42 at_end(false), 43 visited(false), 44 replacement_calculated(false) {} 45 46 // Returns true if the interests and assumptions of this node 47 // matches the given one. Matchesfinal48 bool Matches(NodeInfo* that) { 49 return (at_end == that->at_end) && 50 (follows_word_interest == that->follows_word_interest) && 51 (follows_newline_interest == that->follows_newline_interest) && 52 (follows_start_interest == that->follows_start_interest); 53 } 54 55 // Updates the interests of this node given the interests of the 56 // node preceding it. AddFromPrecedingfinal57 void AddFromPreceding(NodeInfo* that) { 58 at_end |= that->at_end; 59 follows_word_interest |= that->follows_word_interest; 60 follows_newline_interest |= that->follows_newline_interest; 61 follows_start_interest |= that->follows_start_interest; 62 } 63 HasLookbehindfinal64 bool HasLookbehind() { 65 return follows_word_interest || follows_newline_interest || 66 follows_start_interest; 67 } 68 69 // Sets the interests of this node to include the interests of the 70 // following node. AddFromFollowingfinal71 void AddFromFollowing(NodeInfo* that) { 72 follows_word_interest |= that->follows_word_interest; 73 follows_newline_interest |= that->follows_newline_interest; 74 follows_start_interest |= that->follows_start_interest; 75 } 76 ResetCompilationStatefinal77 void ResetCompilationState() { 78 being_analyzed = false; 79 been_analyzed = false; 80 } 81 82 bool being_analyzed : 1; 83 bool been_analyzed : 1; 84 85 // These bits are set of this node has to know what the preceding 86 // character was. 87 bool follows_word_interest : 1; 88 bool follows_newline_interest : 1; 89 bool follows_start_interest : 1; 90 91 bool at_end : 1; 92 bool visited : 1; 93 bool replacement_calculated : 1; 94 }; 95 96 struct EatsAtLeastInfo final { EatsAtLeastInfofinal97 EatsAtLeastInfo() : EatsAtLeastInfo(0) {} EatsAtLeastInfofinal98 explicit EatsAtLeastInfo(uint8_t eats) 99 : eats_at_least_from_possibly_start(eats), 100 eats_at_least_from_not_start(eats) {} SetMinfinal101 void SetMin(const EatsAtLeastInfo& other) { 102 if (other.eats_at_least_from_possibly_start < 103 eats_at_least_from_possibly_start) { 104 eats_at_least_from_possibly_start = 105 other.eats_at_least_from_possibly_start; 106 } 107 if (other.eats_at_least_from_not_start < eats_at_least_from_not_start) { 108 eats_at_least_from_not_start = other.eats_at_least_from_not_start; 109 } 110 } 111 IsZerofinal112 bool IsZero() const { 113 return eats_at_least_from_possibly_start == 0 && 114 eats_at_least_from_not_start == 0; 115 } 116 117 // Any successful match starting from the current node will consume at least 118 // this many characters. This does not necessarily mean that there is a 119 // possible match with exactly this many characters, but we generally try to 120 // get this number as high as possible to allow for early exit on failure. 121 uint8_t eats_at_least_from_possibly_start; 122 123 // Like eats_at_least_from_possibly_start, but with the additional assumption 124 // that start-of-string assertions (^) can't match. This value is greater than 125 // or equal to eats_at_least_from_possibly_start. 126 uint8_t eats_at_least_from_not_start; 127 }; 128 129 class RegExpNode : public ZoneObject { 130 public: RegExpNode(Zone * zone)131 explicit RegExpNode(Zone* zone) 132 : replacement_(nullptr), 133 on_work_list_(false), 134 trace_count_(0), 135 zone_(zone) { 136 bm_info_[0] = bm_info_[1] = nullptr; 137 } 138 virtual ~RegExpNode(); 139 virtual void Accept(NodeVisitor* visitor) = 0; 140 // Generates a goto to this node or actually generates the code at this point. 141 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 142 // How many characters must this node consume at a minimum in order to 143 // succeed. The not_at_start argument is used to indicate that we know we are 144 // not at the start of the input. In this case anchored branches will always 145 // fail and can be ignored when determining how many characters are consumed 146 // on success. If this node has not been analyzed yet, EatsAtLeast returns 0. 147 int EatsAtLeast(bool not_at_start); 148 // Returns how many characters this node must consume in order to succeed, 149 // given that this is a LoopChoiceNode whose counter register is in a 150 // newly-initialized state at the current position in the generated code. For 151 // example, consider /a{6,8}/. Absent any extra information, the 152 // LoopChoiceNode for the repetition must report that it consumes at least 153 // zero characters, because it may have already looped several times. However, 154 // with a newly-initialized counter, it can report that it consumes at least 155 // six characters. 156 virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry(); 157 // Emits some quick code that checks whether the preloaded characters match. 158 // Falls through on certain failure, jumps to the label on possible success. 159 // If the node cannot make a quick check it does nothing and returns false. 160 bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace, 161 Trace* trace, bool preload_has_checked_bounds, 162 Label* on_possible_success, 163 QuickCheckDetails* details_return, 164 bool fall_through_on_failure, ChoiceNode* predecessor); 165 // For a given number of characters this returns a mask and a value. The 166 // next n characters are anded with the mask and compared with the value. 167 // A comparison failure indicates the node cannot match the next n characters. 168 // A comparison success indicates the node may match. 169 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 170 RegExpCompiler* compiler, 171 int characters_filled_in, 172 bool not_at_start) = 0; 173 // Fills in quick check details for this node, given that this is a 174 // LoopChoiceNode whose counter register is in a newly-initialized state at 175 // the current position in the generated code. For example, consider /a{6,8}/. 176 // Absent any extra information, the LoopChoiceNode for the repetition cannot 177 // generate any useful quick check because a match might be the (empty) 178 // continuation node. However, with a newly-initialized counter, it can 179 // generate a quick check for several 'a' characters at once. 180 virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 181 RegExpCompiler* compiler, 182 int characters_filled_in, 183 bool not_at_start); 184 static const int kNodeIsTooComplexForGreedyLoops = kMinInt; GreedyLoopTextLength()185 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 186 // Only returns the successor for a text node of length 1 that matches any 187 // character and that has no guards on it. GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)188 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 189 RegExpCompiler* compiler) { 190 return nullptr; 191 } 192 193 // Collects information on the possible code units (mod 128) that can match if 194 // we look forward. This is used for a Boyer-Moore-like string searching 195 // implementation. TODO(erikcorry): This should share more code with 196 // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit 197 // the number of nodes we are willing to look at in order to create this data. 198 static const int kRecursionBudget = 200; 199 bool KeepRecursing(RegExpCompiler* compiler); FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)200 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 201 BoyerMooreLookahead* bm, bool not_at_start) { 202 UNREACHABLE(); 203 } 204 205 // If we know that the input is one-byte then there are some nodes that can 206 // never match. This method returns a node that can be substituted for 207 // itself, or nullptr if the node can never match. FilterOneByte(int depth,RegExpFlags flags)208 virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) { 209 return this; 210 } 211 // Helper for FilterOneByte. replacement()212 RegExpNode* replacement() { 213 DCHECK(info()->replacement_calculated); 214 return replacement_; 215 } set_replacement(RegExpNode * replacement)216 RegExpNode* set_replacement(RegExpNode* replacement) { 217 info()->replacement_calculated = true; 218 replacement_ = replacement; 219 return replacement; // For convenience. 220 } 221 222 // We want to avoid recalculating the lookahead info, so we store it on the 223 // node. Only info that is for this node is stored. We can tell that the 224 // info is for this node when offset == 0, so the information is calculated 225 // relative to this node. SaveBMInfo(BoyerMooreLookahead * bm,bool not_at_start,int offset)226 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { 227 if (offset == 0) set_bm_info(not_at_start, bm); 228 } 229 label()230 Label* label() { return &label_; } 231 // If non-generic code is generated for a node (i.e. the node is not at the 232 // start of the trace) then it cannot be reused. This variable sets a limit 233 // on how often we allow that to happen before we insist on starting a new 234 // trace and generating generic code for a node that can be reused by flushing 235 // the deferred actions in the current trace and generating a goto. 236 static const int kMaxCopiesCodeGenerated = 10; 237 on_work_list()238 bool on_work_list() { return on_work_list_; } set_on_work_list(bool value)239 void set_on_work_list(bool value) { on_work_list_ = value; } 240 info()241 NodeInfo* info() { return &info_; } eats_at_least_info()242 const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; } set_eats_at_least_info(const EatsAtLeastInfo & eats_at_least)243 void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) { 244 eats_at_least_ = eats_at_least; 245 } 246 247 // TODO(v8:10441): This is a hacky way to avoid exponential code size growth 248 // for very large choice nodes that can be generated by unicode property 249 // escapes. In order to avoid inlining (i.e. trace recursion), we pretend to 250 // have generated the maximum count of code copies already. 251 // We should instead fix this properly, e.g. by using the code size budget 252 // (flush_budget) or by generating property escape matches as calls to a C 253 // function. SetDoNotInline()254 void SetDoNotInline() { trace_count_ = kMaxCopiesCodeGenerated; } 255 bm_info(bool not_at_start)256 BoyerMooreLookahead* bm_info(bool not_at_start) { 257 return bm_info_[not_at_start ? 1 : 0]; 258 } 259 zone()260 Zone* zone() const { return zone_; } 261 262 protected: 263 enum LimitResult { DONE, CONTINUE }; 264 RegExpNode* replacement_; 265 266 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 267 set_bm_info(bool not_at_start,BoyerMooreLookahead * bm)268 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { 269 bm_info_[not_at_start ? 1 : 0] = bm; 270 } 271 272 private: 273 static const int kFirstCharBudget = 10; 274 Label label_; 275 bool on_work_list_; 276 NodeInfo info_; 277 278 // Saved values for EatsAtLeast results, to avoid recomputation. Filled in 279 // during analysis (valid if info_.been_analyzed is true). 280 EatsAtLeastInfo eats_at_least_; 281 282 // This variable keeps track of how many times code has been generated for 283 // this node (in different traces). We don't keep track of where the 284 // generated code is located unless the code is generated at the start of 285 // a trace, in which case it is generic and can be reused by flushing the 286 // deferred operations in the current trace and generating a goto. 287 int trace_count_; 288 BoyerMooreLookahead* bm_info_[2]; 289 290 Zone* zone_; 291 }; 292 293 class SeqRegExpNode : public RegExpNode { 294 public: SeqRegExpNode(RegExpNode * on_success)295 explicit SeqRegExpNode(RegExpNode* on_success) 296 : RegExpNode(on_success->zone()), on_success_(on_success) {} on_success()297 RegExpNode* on_success() { return on_success_; } set_on_success(RegExpNode * node)298 void set_on_success(RegExpNode* node) { on_success_ = node; } 299 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)300 void FillInBMInfo(Isolate* isolate, int offset, int budget, 301 BoyerMooreLookahead* bm, bool not_at_start) override { 302 on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 303 if (offset == 0) set_bm_info(not_at_start, bm); 304 } 305 306 protected: 307 RegExpNode* FilterSuccessor(int depth, RegExpFlags flags); 308 309 private: 310 RegExpNode* on_success_; 311 }; 312 313 class ActionNode : public SeqRegExpNode { 314 public: 315 enum ActionType { 316 SET_REGISTER_FOR_LOOP, 317 INCREMENT_REGISTER, 318 STORE_POSITION, 319 BEGIN_POSITIVE_SUBMATCH, 320 BEGIN_NEGATIVE_SUBMATCH, 321 POSITIVE_SUBMATCH_SUCCESS, 322 EMPTY_MATCH_CHECK, 323 CLEAR_CAPTURES 324 }; 325 static ActionNode* SetRegisterForLoop(int reg, int val, 326 RegExpNode* on_success); 327 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); 328 static ActionNode* StorePosition(int reg, bool is_capture, 329 RegExpNode* on_success); 330 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); 331 static ActionNode* BeginPositiveSubmatch(int stack_pointer_reg, 332 int position_reg, 333 RegExpNode* on_success); 334 static ActionNode* BeginNegativeSubmatch(int stack_pointer_reg, 335 int position_reg, 336 RegExpNode* on_success); 337 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, 338 int restore_reg, 339 int clear_capture_count, 340 int clear_capture_from, 341 RegExpNode* on_success); 342 static ActionNode* EmptyMatchCheck(int start_register, 343 int repetition_register, 344 int repetition_limit, 345 RegExpNode* on_success); 346 void Accept(NodeVisitor* visitor) override; 347 void Emit(RegExpCompiler* compiler, Trace* trace) override; 348 void GetQuickCheckDetails(QuickCheckDetails* details, 349 RegExpCompiler* compiler, int filled_in, 350 bool not_at_start) override; 351 void FillInBMInfo(Isolate* isolate, int offset, int budget, 352 BoyerMooreLookahead* bm, bool not_at_start) override; action_type()353 ActionType action_type() { return action_type_; } 354 // TODO(erikcorry): We should allow some action nodes in greedy loops. GreedyLoopTextLength()355 int GreedyLoopTextLength() override { 356 return kNodeIsTooComplexForGreedyLoops; 357 } 358 359 private: 360 union { 361 struct { 362 int reg; 363 int value; 364 } u_store_register; 365 struct { 366 int reg; 367 } u_increment_register; 368 struct { 369 int reg; 370 bool is_capture; 371 } u_position_register; 372 struct { 373 int stack_pointer_register; 374 int current_position_register; 375 int clear_register_count; 376 int clear_register_from; 377 } u_submatch; 378 struct { 379 int start_register; 380 int repetition_register; 381 int repetition_limit; 382 } u_empty_match_check; 383 struct { 384 int range_from; 385 int range_to; 386 } u_clear_captures; 387 } data_; ActionNode(ActionType action_type,RegExpNode * on_success)388 ActionNode(ActionType action_type, RegExpNode* on_success) 389 : SeqRegExpNode(on_success), action_type_(action_type) {} 390 ActionType action_type_; 391 friend class DotPrinterImpl; 392 friend Zone; 393 }; 394 395 class TextNode : public SeqRegExpNode { 396 public: TextNode(ZoneList<TextElement> * elms,bool read_backward,RegExpNode * on_success)397 TextNode(ZoneList<TextElement>* elms, bool read_backward, 398 RegExpNode* on_success) 399 : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {} TextNode(RegExpCharacterClass * that,bool read_backward,RegExpNode * on_success)400 TextNode(RegExpCharacterClass* that, bool read_backward, 401 RegExpNode* on_success) 402 : SeqRegExpNode(on_success), 403 elms_(zone()->New<ZoneList<TextElement>>(1, zone())), 404 read_backward_(read_backward) { 405 elms_->Add(TextElement::CharClass(that), zone()); 406 } 407 // Create TextNode for a single character class for the given ranges. 408 static TextNode* CreateForCharacterRanges(Zone* zone, 409 ZoneList<CharacterRange>* ranges, 410 bool read_backward, 411 RegExpNode* on_success); 412 // Create TextNode for a surrogate pair with a range given for the 413 // lead and the trail surrogate each. 414 static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead, 415 CharacterRange trail, 416 bool read_backward, 417 RegExpNode* on_success); 418 void Accept(NodeVisitor* visitor) override; 419 void Emit(RegExpCompiler* compiler, Trace* trace) override; 420 void GetQuickCheckDetails(QuickCheckDetails* details, 421 RegExpCompiler* compiler, int characters_filled_in, 422 bool not_at_start) override; elements()423 ZoneList<TextElement>* elements() { return elms_; } read_backward()424 bool read_backward() { return read_backward_; } 425 void MakeCaseIndependent(Isolate* isolate, bool is_one_byte, 426 RegExpFlags flags); 427 int GreedyLoopTextLength() override; 428 RegExpNode* GetSuccessorOfOmnivorousTextNode( 429 RegExpCompiler* compiler) override; 430 void FillInBMInfo(Isolate* isolate, int offset, int budget, 431 BoyerMooreLookahead* bm, bool not_at_start) override; 432 void CalculateOffsets(); 433 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; 434 int Length(); 435 436 private: 437 enum TextEmitPassType { 438 NON_LATIN1_MATCH, // Check for characters that can't match. 439 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 440 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 441 CASE_CHARACTER_MATCH, // Case-independent single character check. 442 CHARACTER_CLASS_MATCH // Character class. 443 }; 444 static bool SkipPass(TextEmitPassType pass, bool ignore_case); 445 static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; 446 static const int kLastPass = CHARACTER_CLASS_MATCH; 447 void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, 448 bool preloaded, Trace* trace, bool first_element_checked, 449 int* checked_up_to); 450 ZoneList<TextElement>* elms_; 451 bool read_backward_; 452 }; 453 454 class AssertionNode : public SeqRegExpNode { 455 public: 456 enum AssertionType { 457 AT_END, 458 AT_START, 459 AT_BOUNDARY, 460 AT_NON_BOUNDARY, 461 AFTER_NEWLINE 462 }; AtEnd(RegExpNode * on_success)463 static AssertionNode* AtEnd(RegExpNode* on_success) { 464 return on_success->zone()->New<AssertionNode>(AT_END, on_success); 465 } AtStart(RegExpNode * on_success)466 static AssertionNode* AtStart(RegExpNode* on_success) { 467 return on_success->zone()->New<AssertionNode>(AT_START, on_success); 468 } AtBoundary(RegExpNode * on_success)469 static AssertionNode* AtBoundary(RegExpNode* on_success) { 470 return on_success->zone()->New<AssertionNode>(AT_BOUNDARY, on_success); 471 } AtNonBoundary(RegExpNode * on_success)472 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 473 return on_success->zone()->New<AssertionNode>(AT_NON_BOUNDARY, on_success); 474 } AfterNewline(RegExpNode * on_success)475 static AssertionNode* AfterNewline(RegExpNode* on_success) { 476 return on_success->zone()->New<AssertionNode>(AFTER_NEWLINE, on_success); 477 } 478 void Accept(NodeVisitor* visitor) override; 479 void Emit(RegExpCompiler* compiler, Trace* trace) override; 480 void GetQuickCheckDetails(QuickCheckDetails* details, 481 RegExpCompiler* compiler, int filled_in, 482 bool not_at_start) override; 483 void FillInBMInfo(Isolate* isolate, int offset, int budget, 484 BoyerMooreLookahead* bm, bool not_at_start) override; assertion_type()485 AssertionType assertion_type() { return assertion_type_; } 486 487 private: 488 friend Zone; 489 490 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); 491 enum IfPrevious { kIsNonWord, kIsWord }; 492 void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace, 493 IfPrevious backtrack_if_previous); AssertionNode(AssertionType t,RegExpNode * on_success)494 AssertionNode(AssertionType t, RegExpNode* on_success) 495 : SeqRegExpNode(on_success), assertion_type_(t) {} 496 AssertionType assertion_type_; 497 }; 498 499 class BackReferenceNode : public SeqRegExpNode { 500 public: BackReferenceNode(int start_reg,int end_reg,RegExpFlags flags,bool read_backward,RegExpNode * on_success)501 BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags, 502 bool read_backward, RegExpNode* on_success) 503 : SeqRegExpNode(on_success), 504 start_reg_(start_reg), 505 end_reg_(end_reg), 506 flags_(flags), 507 read_backward_(read_backward) {} 508 void Accept(NodeVisitor* visitor) override; start_register()509 int start_register() { return start_reg_; } end_register()510 int end_register() { return end_reg_; } read_backward()511 bool read_backward() { return read_backward_; } 512 void Emit(RegExpCompiler* compiler, Trace* trace) override; GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)513 void GetQuickCheckDetails(QuickCheckDetails* details, 514 RegExpCompiler* compiler, int characters_filled_in, 515 bool not_at_start) override { 516 return; 517 } 518 void FillInBMInfo(Isolate* isolate, int offset, int budget, 519 BoyerMooreLookahead* bm, bool not_at_start) override; 520 521 private: 522 int start_reg_; 523 int end_reg_; 524 RegExpFlags flags_; 525 bool read_backward_; 526 }; 527 528 class EndNode : public RegExpNode { 529 public: 530 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; EndNode(Action action,Zone * zone)531 EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {} 532 void Accept(NodeVisitor* visitor) override; 533 void Emit(RegExpCompiler* compiler, Trace* trace) override; GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)534 void GetQuickCheckDetails(QuickCheckDetails* details, 535 RegExpCompiler* compiler, int characters_filled_in, 536 bool not_at_start) override { 537 // Returning 0 from EatsAtLeast should ensure we never get here. 538 UNREACHABLE(); 539 } FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)540 void FillInBMInfo(Isolate* isolate, int offset, int budget, 541 BoyerMooreLookahead* bm, bool not_at_start) override { 542 // Returning 0 from EatsAtLeast should ensure we never get here. 543 UNREACHABLE(); 544 } 545 546 private: 547 Action action_; 548 }; 549 550 class NegativeSubmatchSuccess : public EndNode { 551 public: NegativeSubmatchSuccess(int stack_pointer_reg,int position_reg,int clear_capture_count,int clear_capture_start,Zone * zone)552 NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, 553 int clear_capture_count, int clear_capture_start, 554 Zone* zone) 555 : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), 556 stack_pointer_register_(stack_pointer_reg), 557 current_position_register_(position_reg), 558 clear_capture_count_(clear_capture_count), 559 clear_capture_start_(clear_capture_start) {} 560 void Emit(RegExpCompiler* compiler, Trace* trace) override; 561 562 private: 563 int stack_pointer_register_; 564 int current_position_register_; 565 int clear_capture_count_; 566 int clear_capture_start_; 567 }; 568 569 class Guard : public ZoneObject { 570 public: 571 enum Relation { LT, GEQ }; Guard(int reg,Relation op,int value)572 Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {} reg()573 int reg() { return reg_; } op()574 Relation op() { return op_; } value()575 int value() { return value_; } 576 577 private: 578 int reg_; 579 Relation op_; 580 int value_; 581 }; 582 583 class GuardedAlternative { 584 public: GuardedAlternative(RegExpNode * node)585 explicit GuardedAlternative(RegExpNode* node) 586 : node_(node), guards_(nullptr) {} 587 void AddGuard(Guard* guard, Zone* zone); node()588 RegExpNode* node() { return node_; } set_node(RegExpNode * node)589 void set_node(RegExpNode* node) { node_ = node; } guards()590 ZoneList<Guard*>* guards() { return guards_; } 591 592 private: 593 RegExpNode* node_; 594 ZoneList<Guard*>* guards_; 595 }; 596 597 class AlternativeGeneration; 598 599 class ChoiceNode : public RegExpNode { 600 public: ChoiceNode(int expected_size,Zone * zone)601 explicit ChoiceNode(int expected_size, Zone* zone) 602 : RegExpNode(zone), 603 alternatives_( 604 zone->New<ZoneList<GuardedAlternative>>(expected_size, zone)), 605 not_at_start_(false), 606 being_calculated_(false) {} 607 void Accept(NodeVisitor* visitor) override; AddAlternative(GuardedAlternative node)608 void AddAlternative(GuardedAlternative node) { 609 alternatives()->Add(node, zone()); 610 } alternatives()611 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } 612 void Emit(RegExpCompiler* compiler, Trace* trace) override; 613 void GetQuickCheckDetails(QuickCheckDetails* details, 614 RegExpCompiler* compiler, int characters_filled_in, 615 bool not_at_start) override; 616 void FillInBMInfo(Isolate* isolate, int offset, int budget, 617 BoyerMooreLookahead* bm, bool not_at_start) override; 618 being_calculated()619 bool being_calculated() { return being_calculated_; } not_at_start()620 bool not_at_start() { return not_at_start_; } set_not_at_start()621 void set_not_at_start() { not_at_start_ = true; } set_being_calculated(bool b)622 void set_being_calculated(bool b) { being_calculated_ = b; } try_to_emit_quick_check_for_alternative(bool is_first)623 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { 624 return true; 625 } 626 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; read_backward()627 virtual bool read_backward() { return false; } 628 629 protected: 630 int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); 631 ZoneList<GuardedAlternative>* alternatives_; 632 633 private: 634 template <typename...> 635 friend class Analysis; 636 637 void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, 638 Trace* trace); 639 int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); 640 void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, 641 GuardedAlternative alternative, 642 AlternativeGeneration* alt_gen, 643 int preload_characters, 644 bool next_expects_preload); 645 void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, 646 PreloadState* preloads); 647 void AssertGuardsMentionRegisters(Trace* trace); 648 int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace); 649 Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, 650 AlternativeGenerationList* alt_gens, 651 PreloadState* preloads, 652 GreedyLoopState* greedy_loop_state, int text_length); 653 void EmitChoices(RegExpCompiler* compiler, 654 AlternativeGenerationList* alt_gens, int first_choice, 655 Trace* trace, PreloadState* preloads); 656 657 // If true, this node is never checked at the start of the input. 658 // Allows a new trace to start with at_start() set to false. 659 bool not_at_start_; 660 bool being_calculated_; 661 }; 662 663 class NegativeLookaroundChoiceNode : public ChoiceNode { 664 public: NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,GuardedAlternative then_do_this,Zone * zone)665 explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail, 666 GuardedAlternative then_do_this, 667 Zone* zone) 668 : ChoiceNode(2, zone) { 669 AddAlternative(this_must_fail); 670 AddAlternative(then_do_this); 671 } 672 void GetQuickCheckDetails(QuickCheckDetails* details, 673 RegExpCompiler* compiler, int characters_filled_in, 674 bool not_at_start) override; FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)675 void FillInBMInfo(Isolate* isolate, int offset, int budget, 676 BoyerMooreLookahead* bm, bool not_at_start) override { 677 continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm, 678 not_at_start); 679 if (offset == 0) set_bm_info(not_at_start, bm); 680 } 681 static constexpr int kLookaroundIndex = 0; 682 static constexpr int kContinueIndex = 1; lookaround_node()683 RegExpNode* lookaround_node() { 684 return alternatives()->at(kLookaroundIndex).node(); 685 } continue_node()686 RegExpNode* continue_node() { 687 return alternatives()->at(kContinueIndex).node(); 688 } 689 // For a negative lookahead we don't emit the quick check for the 690 // alternative that is expected to fail. This is because quick check code 691 // starts by loading enough characters for the alternative that takes fewest 692 // characters, but on a negative lookahead the negative branch did not take 693 // part in that calculation (EatsAtLeast) so the assumptions don't hold. try_to_emit_quick_check_for_alternative(bool is_first)694 bool try_to_emit_quick_check_for_alternative(bool is_first) override { 695 return !is_first; 696 } 697 void Accept(NodeVisitor* visitor) override; 698 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; 699 }; 700 701 class LoopChoiceNode : public ChoiceNode { 702 public: LoopChoiceNode(bool body_can_be_zero_length,bool read_backward,int min_loop_iterations,Zone * zone)703 LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, 704 int min_loop_iterations, Zone* zone) 705 : ChoiceNode(2, zone), 706 loop_node_(nullptr), 707 continue_node_(nullptr), 708 body_can_be_zero_length_(body_can_be_zero_length), 709 read_backward_(read_backward), 710 traversed_loop_initialization_node_(false), 711 min_loop_iterations_(min_loop_iterations) {} 712 void AddLoopAlternative(GuardedAlternative alt); 713 void AddContinueAlternative(GuardedAlternative alt); 714 void Emit(RegExpCompiler* compiler, Trace* trace) override; 715 void GetQuickCheckDetails(QuickCheckDetails* details, 716 RegExpCompiler* compiler, int characters_filled_in, 717 bool not_at_start) override; 718 void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 719 RegExpCompiler* compiler, 720 int characters_filled_in, 721 bool not_at_start) override; 722 void FillInBMInfo(Isolate* isolate, int offset, int budget, 723 BoyerMooreLookahead* bm, bool not_at_start) override; 724 EatsAtLeastInfo EatsAtLeastFromLoopEntry() override; loop_node()725 RegExpNode* loop_node() { return loop_node_; } continue_node()726 RegExpNode* continue_node() { return continue_node_; } body_can_be_zero_length()727 bool body_can_be_zero_length() { return body_can_be_zero_length_; } min_loop_iterations()728 int min_loop_iterations() const { return min_loop_iterations_; } read_backward()729 bool read_backward() override { return read_backward_; } 730 void Accept(NodeVisitor* visitor) override; 731 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; 732 733 private: 734 // AddAlternative is made private for loop nodes because alternatives 735 // should not be added freely, we need to keep track of which node 736 // goes back to the node itself. AddAlternative(GuardedAlternative node)737 void AddAlternative(GuardedAlternative node) { 738 ChoiceNode::AddAlternative(node); 739 } 740 741 RegExpNode* loop_node_; 742 RegExpNode* continue_node_; 743 bool body_can_be_zero_length_; 744 bool read_backward_; 745 746 // Temporary marker set only while generating quick check details. Represents 747 // whether GetQuickCheckDetails traversed the initialization node for this 748 // loop's counter. If so, we may be able to generate stricter quick checks 749 // because we know the loop node must match at least min_loop_iterations_ 750 // times before the continuation node can match. 751 bool traversed_loop_initialization_node_; 752 753 // The minimum number of times the loop_node_ must match before the 754 // continue_node_ might be considered. This value can be temporarily decreased 755 // while generating quick check details, to represent the remaining iterations 756 // after the completed portion of the quick check details. 757 int min_loop_iterations_; 758 759 friend class IterationDecrementer; 760 friend class LoopInitializationMarker; 761 }; 762 763 class NodeVisitor { 764 public: 765 virtual ~NodeVisitor() = default; 766 #define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0; 767 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 768 #undef DECLARE_VISIT 769 }; 770 771 } // namespace internal 772 } // namespace v8 773 774 #endif // V8_REGEXP_REGEXP_NODES_H_ 775