1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "irregexp/imported/regexp-compiler.h"
6 
7 #include "irregexp/imported/regexp-macro-assembler-arch.h"
8 #ifdef V8_INTL_SUPPORT
9 #include "irregexp/imported/special-case.h"
10 #endif  // V8_INTL_SUPPORT
11 
12 #ifdef V8_INTL_SUPPORT
13 #include "unicode/locid.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utypes.h"
16 #endif  // V8_INTL_SUPPORT
17 
18 namespace v8 {
19 namespace internal {
20 
21 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
22 
23 // -------------------------------------------------------------------
24 // Implementation of the Irregexp regular expression engine.
25 //
26 // The Irregexp regular expression engine is intended to be a complete
27 // implementation of ECMAScript regular expressions.  It generates either
28 // bytecodes or native code.
29 
30 //   The Irregexp regexp engine is structured in three steps.
31 //   1) The parser generates an abstract syntax tree.  See ast.cc.
32 //   2) From the AST a node network is created.  The nodes are all
33 //      subclasses of RegExpNode.  The nodes represent states when
34 //      executing a regular expression.  Several optimizations are
35 //      performed on the node network.
36 //   3) From the nodes we generate either byte codes or native code
37 //      that can actually execute the regular expression (perform
38 //      the search).  The code generation step is described in more
39 //      detail below.
40 
41 // Code generation.
42 //
43 //   The nodes are divided into four main categories.
44 //   * Choice nodes
45 //        These represent places where the regular expression can
46 //        match in more than one way.  For example on entry to an
47 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
48 //   * Action nodes
49 //        These represent places where some action should be
50 //        performed.  Examples include recording the current position
51 //        in the input string to a register (in order to implement
52 //        captures) or other actions on register for example in order
53 //        to implement the counters needed for {} repetitions.
54 //   * Matching nodes
55 //        These attempt to match some element part of the input string.
56 //        Examples of elements include character classes, plain strings
57 //        or back references.
58 //   * End nodes
59 //        These are used to implement the actions required on finding
60 //        a successful match or failing to find a match.
61 //
62 //   The code generated (whether as byte codes or native code) maintains
63 //   some state as it runs.  This consists of the following elements:
64 //
65 //   * The capture registers.  Used for string captures.
66 //   * Other registers.  Used for counters etc.
67 //   * The current position.
68 //   * The stack of backtracking information.  Used when a matching node
69 //     fails to find a match and needs to try an alternative.
70 //
71 // Conceptual regular expression execution model:
72 //
73 //   There is a simple conceptual model of regular expression execution
74 //   which will be presented first.  The actual code generated is a more
75 //   efficient simulation of the simple conceptual model:
76 //
77 //   * Choice nodes are implemented as follows:
78 //     For each choice except the last {
79 //       push current position
80 //       push backtrack code location
81 //       <generate code to test for choice>
82 //       backtrack code location:
83 //       pop current position
84 //     }
85 //     <generate code to test for last choice>
86 //
87 //   * Actions nodes are generated as follows
88 //     <push affected registers on backtrack stack>
89 //     <generate code to perform action>
90 //     push backtrack code location
91 //     <generate code to test for following nodes>
92 //     backtrack code location:
93 //     <pop affected registers to restore their state>
94 //     <pop backtrack location from stack and go to it>
95 //
96 //   * Matching nodes are generated as follows:
97 //     if input string matches at current position
98 //       update current position
99 //       <generate code to test for following nodes>
100 //     else
101 //       <pop backtrack location from stack and go to it>
102 //
103 //   Thus it can be seen that the current position is saved and restored
104 //   by the choice nodes, whereas the registers are saved and restored by
105 //   by the action nodes that manipulate them.
106 //
107 //   The other interesting aspect of this model is that nodes are generated
108 //   at the point where they are needed by a recursive call to Emit().  If
109 //   the node has already been code generated then the Emit() call will
110 //   generate a jump to the previously generated code instead.  In order to
111 //   limit recursion it is possible for the Emit() function to put the node
112 //   on a work list for later generation and instead generate a jump.  The
113 //   destination of the jump is resolved later when the code is generated.
114 //
115 // Actual regular expression code generation.
116 //
117 //   Code generation is actually more complicated than the above.  In order
118 //   to improve the efficiency of the generated code some optimizations are
119 //   performed
120 //
121 //   * Choice nodes have 1-character lookahead.
122 //     A choice node looks at the following character and eliminates some of
123 //     the choices immediately based on that character.  This is not yet
124 //     implemented.
125 //   * Simple greedy loops store reduced backtracking information.
126 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
127 //     then need to backtrack to a point where it can match "foo".  The naive
128 //     implementation of this would push each character position onto the
129 //     backtracking stack, then pop them off one by one.  This would use space
130 //     proportional to the length of the input string.  However since the "."
131 //     can only match in one way and always has a constant length (in this case
132 //     of 1) it suffices to store the current position on the top of the stack
133 //     once.  Matching now becomes merely incrementing the current position and
134 //     backtracking becomes decrementing the current position and checking the
135 //     result against the stored current position.  This is faster and saves
136 //     space.
137 //   * The current state is virtualized.
138 //     This is used to defer expensive operations until it is clear that they
139 //     are needed and to generate code for a node more than once, allowing
140 //     specialized an efficient versions of the code to be created. This is
141 //     explained in the section below.
142 //
143 // Execution state virtualization.
144 //
145 //   Instead of emitting code, nodes that manipulate the state can record their
146 //   manipulation in an object called the Trace.  The Trace object can record a
147 //   current position offset, an optional backtrack code location on the top of
148 //   the virtualized backtrack stack and some register changes.  When a node is
149 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
150 //   will emit code to bring the actual state into line with the virtual state.
151 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
152 //   registers).  Postponing work can save time when executing the regular
153 //   expression since it may be found that the work never has to be done as a
154 //   failure to match can occur.  In addition it is much faster to jump to a
155 //   known backtrack code location than it is to pop an unknown backtrack
156 //   location from the stack and jump there.
157 //
158 //   The virtual state found in the Trace affects code generation.  For example
159 //   the virtual state contains the difference between the actual current
160 //   position and the virtual current position, and matching code needs to use
161 //   this offset to attempt a match in the correct location of the input
162 //   string.  Therefore code generated for a non-trivial trace is specialized
163 //   to that trace.  The code generator therefore has the ability to generate
164 //   code for each node several times.  In order to limit the size of the
165 //   generated code there is an arbitrary limit on how many specialized sets of
166 //   code may be generated for a given node.  If the limit is reached, the
167 //   trace is flushed and a generic version of the code for a node is emitted.
168 //   This is subsequently used for that node.  The code emitted for non-generic
169 //   trace is not recorded in the node and so it cannot currently be reused in
170 //   the event that code generation is requested for an identical trace.
171 
172 namespace {
173 
MaxCodeUnit(const bool one_byte)174 constexpr uc32 MaxCodeUnit(const bool one_byte) {
175   STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
176                 std::numeric_limits<uint16_t>::max());
177   STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
178                 std::numeric_limits<uint16_t>::max());
179   return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
180 }
181 
CharMask(const bool one_byte)182 constexpr uint32_t CharMask(const bool one_byte) {
183   STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
184   STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
185   return MaxCodeUnit(one_byte);
186 }
187 
188 }  // namespace
189 
AppendToText(RegExpText * text,Zone * zone)190 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
191 
AppendToText(RegExpText * text,Zone * zone)192 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
193   text->AddElement(TextElement::Atom(this), zone);
194 }
195 
AppendToText(RegExpText * text,Zone * zone)196 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
197   text->AddElement(TextElement::CharClass(this), zone);
198 }
199 
AppendToText(RegExpText * text,Zone * zone)200 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
201   for (int i = 0; i < elements()->length(); i++)
202     text->AddElement(elements()->at(i), zone);
203 }
204 
Atom(RegExpAtom * atom)205 TextElement TextElement::Atom(RegExpAtom* atom) {
206   return TextElement(ATOM, atom);
207 }
208 
CharClass(RegExpCharacterClass * char_class)209 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
210   return TextElement(CHAR_CLASS, char_class);
211 }
212 
length() const213 int TextElement::length() const {
214   switch (text_type()) {
215     case ATOM:
216       return atom()->length();
217 
218     case CHAR_CLASS:
219       return 1;
220   }
221   UNREACHABLE();
222 }
223 
224 class RecursionCheck {
225  public:
RecursionCheck(RegExpCompiler * compiler)226   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
227     compiler->IncrementRecursionDepth();
228   }
~RecursionCheck()229   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
230 
231  private:
232   RegExpCompiler* compiler_;
233 };
234 
235 // Attempts to compile the regexp using an Irregexp code generator.  Returns
236 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,bool one_byte)237 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
238                                bool one_byte)
239     : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
240       unicode_lookaround_stack_register_(kNoRegister),
241       unicode_lookaround_position_register_(kNoRegister),
242       work_list_(nullptr),
243       recursion_depth_(0),
244       one_byte_(one_byte),
245       reg_exp_too_big_(false),
246       limiting_recursion_(false),
247       optimize_(FLAG_regexp_optimization),
248       read_backward_(false),
249       current_expansion_factor_(1),
250       frequency_collator_(),
251       isolate_(isolate),
252       zone_(zone) {
253   accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
254   DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
255 }
256 
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)257 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
258     Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
259     int capture_count, Handle<String> pattern) {
260   macro_assembler_ = macro_assembler;
261 
262   ZoneVector<RegExpNode*> work_list(zone());
263   work_list_ = &work_list;
264   Label fail;
265   macro_assembler_->PushBacktrack(&fail);
266   Trace new_trace;
267   start->Emit(this, &new_trace);
268   macro_assembler_->BindJumpTarget(&fail);
269   macro_assembler_->Fail();
270   while (!work_list.empty()) {
271     RegExpNode* node = work_list.back();
272     work_list.pop_back();
273     node->set_on_work_list(false);
274     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
275   }
276   if (reg_exp_too_big_) {
277     macro_assembler_->AbortedCodeGeneration();
278     return CompilationResult::RegExpTooBig();
279   }
280 
281   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
282   isolate->IncreaseTotalRegexpCodeGenerated(code);
283   work_list_ = nullptr;
284 
285   return {code, next_register_};
286 }
287 
Mentions(int that)288 bool Trace::DeferredAction::Mentions(int that) {
289   if (action_type() == ActionNode::CLEAR_CAPTURES) {
290     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
291     return range.Contains(that);
292   } else {
293     return reg() == that;
294   }
295 }
296 
mentions_reg(int reg)297 bool Trace::mentions_reg(int reg) {
298   for (DeferredAction* action = actions_; action != nullptr;
299        action = action->next()) {
300     if (action->Mentions(reg)) return true;
301   }
302   return false;
303 }
304 
GetStoredPosition(int reg,int * cp_offset)305 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
306   DCHECK_EQ(0, *cp_offset);
307   for (DeferredAction* action = actions_; action != nullptr;
308        action = action->next()) {
309     if (action->Mentions(reg)) {
310       if (action->action_type() == ActionNode::STORE_POSITION) {
311         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
312         return true;
313       } else {
314         return false;
315       }
316     }
317   }
318   return false;
319 }
320 
321 // A (dynamically-sized) set of unsigned integers that behaves especially well
322 // on small integers (< kFirstLimit). May do zone-allocation.
323 class DynamicBitSet : public ZoneObject {
324  public:
Get(unsigned value) const325   V8_EXPORT_PRIVATE bool Get(unsigned value) const {
326     if (value < kFirstLimit) {
327       return (first_ & (1 << value)) != 0;
328     } else if (remaining_ == nullptr) {
329       return false;
330     } else {
331       return remaining_->Contains(value);
332     }
333   }
334 
335   // Destructively set a value in this set.
Set(unsigned value,Zone * zone)336   void Set(unsigned value, Zone* zone) {
337     if (value < kFirstLimit) {
338       first_ |= (1 << value);
339     } else {
340       if (remaining_ == nullptr)
341         remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
342       if (remaining_->is_empty() || !remaining_->Contains(value))
343         remaining_->Add(value, zone);
344     }
345   }
346 
347  private:
348   static constexpr unsigned kFirstLimit = 32;
349 
350   uint32_t first_ = 0;
351   ZoneList<unsigned>* remaining_ = nullptr;
352 };
353 
FindAffectedRegisters(DynamicBitSet * affected_registers,Zone * zone)354 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
355                                  Zone* zone) {
356   int max_register = RegExpCompiler::kNoRegister;
357   for (DeferredAction* action = actions_; action != nullptr;
358        action = action->next()) {
359     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
360       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
361       for (int i = range.from(); i <= range.to(); i++)
362         affected_registers->Set(i, zone);
363       if (range.to() > max_register) max_register = range.to();
364     } else {
365       affected_registers->Set(action->reg(), zone);
366       if (action->reg() > max_register) max_register = action->reg();
367     }
368   }
369   return max_register;
370 }
371 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & registers_to_pop,const DynamicBitSet & registers_to_clear)372 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
373                                      int max_register,
374                                      const DynamicBitSet& registers_to_pop,
375                                      const DynamicBitSet& registers_to_clear) {
376   for (int reg = max_register; reg >= 0; reg--) {
377     if (registers_to_pop.Get(reg)) {
378       assembler->PopRegister(reg);
379     } else if (registers_to_clear.Get(reg)) {
380       int clear_to = reg;
381       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
382         reg--;
383       }
384       assembler->ClearRegisters(reg, clear_to);
385     }
386   }
387 }
388 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & affected_registers,DynamicBitSet * registers_to_pop,DynamicBitSet * registers_to_clear,Zone * zone)389 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
390                                    int max_register,
391                                    const DynamicBitSet& affected_registers,
392                                    DynamicBitSet* registers_to_pop,
393                                    DynamicBitSet* registers_to_clear,
394                                    Zone* zone) {
395   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
396   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
397 
398   // Count pushes performed to force a stack limit check occasionally.
399   int pushes = 0;
400 
401   for (int reg = 0; reg <= max_register; reg++) {
402     if (!affected_registers.Get(reg)) continue;
403 
404     // The chronologically first deferred action in the trace
405     // is used to infer the action needed to restore a register
406     // to its previous state (or not, if it's safe to ignore it).
407     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
408     DeferredActionUndoType undo_action = IGNORE;
409 
410     int value = 0;
411     bool absolute = false;
412     bool clear = false;
413     static const int kNoStore = kMinInt;
414     int store_position = kNoStore;
415     // This is a little tricky because we are scanning the actions in reverse
416     // historical order (newest first).
417     for (DeferredAction* action = actions_; action != nullptr;
418          action = action->next()) {
419       if (action->Mentions(reg)) {
420         switch (action->action_type()) {
421           case ActionNode::SET_REGISTER_FOR_LOOP: {
422             Trace::DeferredSetRegisterForLoop* psr =
423                 static_cast<Trace::DeferredSetRegisterForLoop*>(action);
424             if (!absolute) {
425               value += psr->value();
426               absolute = true;
427             }
428             // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
429             // counters. They can have a significant previous value if they
430             // occur in a loop. TODO(lrn): Propagate this information, so
431             // we can set undo_action to IGNORE if we know there is no value to
432             // restore.
433             undo_action = RESTORE;
434             DCHECK_EQ(store_position, kNoStore);
435             DCHECK(!clear);
436             break;
437           }
438           case ActionNode::INCREMENT_REGISTER:
439             if (!absolute) {
440               value++;
441             }
442             DCHECK_EQ(store_position, kNoStore);
443             DCHECK(!clear);
444             undo_action = RESTORE;
445             break;
446           case ActionNode::STORE_POSITION: {
447             Trace::DeferredCapture* pc =
448                 static_cast<Trace::DeferredCapture*>(action);
449             if (!clear && store_position == kNoStore) {
450               store_position = pc->cp_offset();
451             }
452 
453             // For captures we know that stores and clears alternate.
454             // Other register, are never cleared, and if the occur
455             // inside a loop, they might be assigned more than once.
456             if (reg <= 1) {
457               // Registers zero and one, aka "capture zero", is
458               // always set correctly if we succeed. There is no
459               // need to undo a setting on backtrack, because we
460               // will set it again or fail.
461               undo_action = IGNORE;
462             } else {
463               undo_action = pc->is_capture() ? CLEAR : RESTORE;
464             }
465             DCHECK(!absolute);
466             DCHECK_EQ(value, 0);
467             break;
468           }
469           case ActionNode::CLEAR_CAPTURES: {
470             // Since we're scanning in reverse order, if we've already
471             // set the position we have to ignore historically earlier
472             // clearing operations.
473             if (store_position == kNoStore) {
474               clear = true;
475             }
476             undo_action = RESTORE;
477             DCHECK(!absolute);
478             DCHECK_EQ(value, 0);
479             break;
480           }
481           default:
482             UNREACHABLE();
483             break;
484         }
485       }
486     }
487     // Prepare for the undo-action (e.g., push if it's going to be popped).
488     if (undo_action == RESTORE) {
489       pushes++;
490       RegExpMacroAssembler::StackCheckFlag stack_check =
491           RegExpMacroAssembler::kNoStackLimitCheck;
492       if (pushes == push_limit) {
493         stack_check = RegExpMacroAssembler::kCheckStackLimit;
494         pushes = 0;
495       }
496 
497       assembler->PushRegister(reg, stack_check);
498       registers_to_pop->Set(reg, zone);
499     } else if (undo_action == CLEAR) {
500       registers_to_clear->Set(reg, zone);
501     }
502     // Perform the chronologically last action (or accumulated increment)
503     // for the register.
504     if (store_position != kNoStore) {
505       assembler->WriteCurrentPositionToRegister(reg, store_position);
506     } else if (clear) {
507       assembler->ClearRegisters(reg, reg);
508     } else if (absolute) {
509       assembler->SetRegister(reg, value);
510     } else if (value != 0) {
511       assembler->AdvanceRegister(reg, value);
512     }
513   }
514 }
515 
516 // This is called as we come into a loop choice node and some other tricky
517 // nodes.  It normalizes the state of the code generator to ensure we can
518 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)519 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
520   RegExpMacroAssembler* assembler = compiler->macro_assembler();
521 
522   DCHECK(!is_trivial());
523 
524   if (actions_ == nullptr && backtrack() == nullptr) {
525     // Here we just have some deferred cp advances to fix and we are back to
526     // a normal situation.  We may also have to forget some information gained
527     // through a quick check that was already performed.
528     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
529     // Create a new trivial state and generate the node with that.
530     Trace new_state;
531     successor->Emit(compiler, &new_state);
532     return;
533   }
534 
535   // Generate deferred actions here along with code to undo them again.
536   DynamicBitSet affected_registers;
537 
538   if (backtrack() != nullptr) {
539     // Here we have a concrete backtrack location.  These are set up by choice
540     // nodes and so they indicate that we have a deferred save of the current
541     // position which we may need to emit here.
542     assembler->PushCurrentPosition();
543   }
544 
545   int max_register =
546       FindAffectedRegisters(&affected_registers, compiler->zone());
547   DynamicBitSet registers_to_pop;
548   DynamicBitSet registers_to_clear;
549   PerformDeferredActions(assembler, max_register, affected_registers,
550                          &registers_to_pop, &registers_to_clear,
551                          compiler->zone());
552   if (cp_offset_ != 0) {
553     assembler->AdvanceCurrentPosition(cp_offset_);
554   }
555 
556   // Create a new trivial state and generate the node with that.
557   Label undo;
558   assembler->PushBacktrack(&undo);
559   if (successor->KeepRecursing(compiler)) {
560     Trace new_state;
561     successor->Emit(compiler, &new_state);
562   } else {
563     compiler->AddWork(successor);
564     assembler->GoTo(successor->label());
565   }
566 
567   // On backtrack we need to restore state.
568   assembler->BindJumpTarget(&undo);
569   RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
570                            registers_to_clear);
571   if (backtrack() == nullptr) {
572     assembler->Backtrack();
573   } else {
574     assembler->PopCurrentPosition();
575     assembler->GoTo(backtrack());
576   }
577 }
578 
Emit(RegExpCompiler * compiler,Trace * trace)579 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
580   RegExpMacroAssembler* assembler = compiler->macro_assembler();
581 
582   // Omit flushing the trace. We discard the entire stack frame anyway.
583 
584   if (!label()->is_bound()) {
585     // We are completely independent of the trace, since we ignore it,
586     // so this code can be used as the generic version.
587     assembler->Bind(label());
588   }
589 
590   // Throw away everything on the backtrack stack since the start
591   // of the negative submatch and restore the character position.
592   assembler->ReadCurrentPositionFromRegister(current_position_register_);
593   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
594   if (clear_capture_count_ > 0) {
595     // Clear any captures that might have been performed during the success
596     // of the body of the negative look-ahead.
597     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
598     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
599   }
600   // Now that we have unwound the stack we find at the top of the stack the
601   // backtrack that the BeginNegativeSubmatch node got.
602   assembler->Backtrack();
603 }
604 
Emit(RegExpCompiler * compiler,Trace * trace)605 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
606   if (!trace->is_trivial()) {
607     trace->Flush(compiler, this);
608     return;
609   }
610   RegExpMacroAssembler* assembler = compiler->macro_assembler();
611   if (!label()->is_bound()) {
612     assembler->Bind(label());
613   }
614   switch (action_) {
615     case ACCEPT:
616       assembler->Succeed();
617       return;
618     case BACKTRACK:
619       assembler->GoTo(trace->backtrack());
620       return;
621     case NEGATIVE_SUBMATCH_SUCCESS:
622       // This case is handled in a different virtual method.
623       UNREACHABLE();
624   }
625   UNIMPLEMENTED();
626 }
627 
AddGuard(Guard * guard,Zone * zone)628 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
629   if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
630   guards_->Add(guard, zone);
631 }
632 
SetRegisterForLoop(int reg,int val,RegExpNode * on_success)633 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
634                                            RegExpNode* on_success) {
635   ActionNode* result =
636       on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
637   result->data_.u_store_register.reg = reg;
638   result->data_.u_store_register.value = val;
639   return result;
640 }
641 
IncrementRegister(int reg,RegExpNode * on_success)642 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
643   ActionNode* result =
644       on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success);
645   result->data_.u_increment_register.reg = reg;
646   return result;
647 }
648 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)649 ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
650                                       RegExpNode* on_success) {
651   ActionNode* result =
652       on_success->zone()->New<ActionNode>(STORE_POSITION, on_success);
653   result->data_.u_position_register.reg = reg;
654   result->data_.u_position_register.is_capture = is_capture;
655   return result;
656 }
657 
ClearCaptures(Interval range,RegExpNode * on_success)658 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
659   ActionNode* result =
660       on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success);
661   result->data_.u_clear_captures.range_from = range.from();
662   result->data_.u_clear_captures.range_to = range.to();
663   return result;
664 }
665 
BeginPositiveSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)666 ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
667                                               RegExpNode* on_success) {
668   ActionNode* result =
669       on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success);
670   result->data_.u_submatch.stack_pointer_register = stack_reg;
671   result->data_.u_submatch.current_position_register = position_reg;
672   return result;
673 }
674 
BeginNegativeSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)675 ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
676                                               RegExpNode* on_success) {
677   ActionNode* result =
678       on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
679   result->data_.u_submatch.stack_pointer_register = stack_reg;
680   result->data_.u_submatch.current_position_register = position_reg;
681   return result;
682 }
683 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)684 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
685                                                 int clear_register_count,
686                                                 int clear_register_from,
687                                                 RegExpNode* on_success) {
688   ActionNode* result = on_success->zone()->New<ActionNode>(
689       POSITIVE_SUBMATCH_SUCCESS, on_success);
690   result->data_.u_submatch.stack_pointer_register = stack_reg;
691   result->data_.u_submatch.current_position_register = position_reg;
692   result->data_.u_submatch.clear_register_count = clear_register_count;
693   result->data_.u_submatch.clear_register_from = clear_register_from;
694   return result;
695 }
696 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)697 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
698                                         int repetition_register,
699                                         int repetition_limit,
700                                         RegExpNode* on_success) {
701   ActionNode* result =
702       on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
703   result->data_.u_empty_match_check.start_register = start_register;
704   result->data_.u_empty_match_check.repetition_register = repetition_register;
705   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
706   return result;
707 }
708 
709 #define DEFINE_ACCEPT(Type) \
710   void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)711 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
712 #undef DEFINE_ACCEPT
713 
714 // -------------------------------------------------------------------
715 // Emit code.
716 
717 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
718                                Guard* guard, Trace* trace) {
719   switch (guard->op()) {
720     case Guard::LT:
721       DCHECK(!trace->mentions_reg(guard->reg()));
722       macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
723                                     trace->backtrack());
724       break;
725     case Guard::GEQ:
726       DCHECK(!trace->mentions_reg(guard->reg()));
727       macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
728                                     trace->backtrack());
729       break;
730   }
731 }
732 
733 namespace {
734 
735 #ifdef DEBUG
ContainsOnlyUtf16CodeUnits(unibrow::uchar * chars,int length)736 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
737   STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
738   for (int i = 0; i < length; i++) {
739     if (chars[i] > String::kMaxUtf16CodeUnit) return false;
740   }
741   return true;
742 }
743 #endif  // DEBUG
744 
745 }  // namespace
746 
747 // Returns the number of characters in the equivalence class, omitting those
748 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters,int letter_length)749 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
750                                      bool one_byte_subject,
751                                      unibrow::uchar* letters,
752                                      int letter_length) {
753 #ifdef V8_INTL_SUPPORT
754   if (RegExpCaseFolding::IgnoreSet().contains(character)) {
755     letters[0] = character;
756     DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
757     return 1;
758   }
759   bool in_special_add_set =
760       RegExpCaseFolding::SpecialAddSet().contains(character);
761 
762   icu::UnicodeSet set;
763   set.add(character);
764   set = set.closeOver(USET_CASE_INSENSITIVE);
765 
766   UChar32 canon = 0;
767   if (in_special_add_set) {
768     canon = RegExpCaseFolding::Canonicalize(character);
769   }
770 
771   int32_t range_count = set.getRangeCount();
772   int items = 0;
773   for (int32_t i = 0; i < range_count; i++) {
774     UChar32 start = set.getRangeStart(i);
775     UChar32 end = set.getRangeEnd(i);
776     CHECK(end - start + items <= letter_length);
777     for (UChar32 cu = start; cu <= end; cu++) {
778       if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
779       if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
780         continue;
781       }
782       letters[items++] = static_cast<unibrow::uchar>(cu);
783     }
784   }
785   DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
786   return items;
787 #else
788   int length =
789       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
790   // Unibrow returns 0 or 1 for characters where case independence is
791   // trivial.
792   if (length == 0) {
793     letters[0] = character;
794     length = 1;
795   }
796 
797   if (one_byte_subject) {
798     int new_length = 0;
799     for (int i = 0; i < length; i++) {
800       if (letters[i] <= String::kMaxOneByteCharCode) {
801         letters[new_length++] = letters[i];
802       }
803     }
804     length = new_length;
805   }
806 
807   DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
808   return length;
809 #endif  // V8_INTL_SUPPORT
810 }
811 
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)812 static inline bool EmitSimpleCharacter(Isolate* isolate,
813                                        RegExpCompiler* compiler, uc16 c,
814                                        Label* on_failure, int cp_offset,
815                                        bool check, bool preloaded) {
816   RegExpMacroAssembler* assembler = compiler->macro_assembler();
817   bool bound_checked = false;
818   if (!preloaded) {
819     assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
820     bound_checked = true;
821   }
822   assembler->CheckNotCharacter(c, on_failure);
823   return bound_checked;
824 }
825 
826 // Only emits non-letters (things that don't have case).  Only used for case
827 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)828 static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
829                                      uc16 c, Label* on_failure, int cp_offset,
830                                      bool check, bool preloaded) {
831   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
832   bool one_byte = compiler->one_byte();
833   unibrow::uchar chars[4];
834   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
835   if (length < 1) {
836     // This can't match.  Must be an one-byte subject and a non-one-byte
837     // character.  We do not need to do anything since the one-byte pass
838     // already handled this.
839     return false;  // Bounds not checked.
840   }
841   bool checked = false;
842   // We handle the length > 1 case in a later pass.
843   if (length == 1) {
844     if (one_byte && c > String::kMaxOneByteCharCodeU) {
845       // Can't match - see above.
846       return false;  // Bounds not checked.
847     }
848     if (!preloaded) {
849       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
850       checked = check;
851     }
852     macro_assembler->CheckNotCharacter(c, on_failure);
853   }
854   return checked;
855 }
856 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)857 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
858                                       bool one_byte, uc16 c1, uc16 c2,
859                                       Label* on_failure) {
860   const uint32_t char_mask = CharMask(one_byte);
861   uc16 exor = c1 ^ c2;
862   // Check whether exor has only one bit set.
863   if (((exor - 1) & exor) == 0) {
864     // If c1 and c2 differ only by one bit.
865     // Ecma262UnCanonicalize always gives the highest number last.
866     DCHECK(c2 > c1);
867     uc16 mask = char_mask ^ exor;
868     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
869     return true;
870   }
871   DCHECK(c2 > c1);
872   uc16 diff = c2 - c1;
873   if (((diff - 1) & diff) == 0 && c1 >= diff) {
874     // If the characters differ by 2^n but don't differ by one bit then
875     // subtract the difference from the found character, then do the or
876     // trick.  We avoid the theoretical case where negative numbers are
877     // involved in order to simplify code generation.
878     uc16 mask = char_mask ^ diff;
879     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
880                                                     on_failure);
881     return true;
882   }
883   return false;
884 }
885 
886 // Only emits letters (things that have case).  Only used for case independent
887 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)888 static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
889                                   uc16 c, Label* on_failure, int cp_offset,
890                                   bool check, bool preloaded) {
891   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
892   bool one_byte = compiler->one_byte();
893   unibrow::uchar chars[4];
894   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
895   if (length <= 1) return false;
896   // We may not need to check against the end of the input string
897   // if this character lies before a character that matched.
898   if (!preloaded) {
899     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
900   }
901   Label ok;
902   switch (length) {
903     case 2: {
904       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
905                                     chars[1], on_failure)) {
906       } else {
907         macro_assembler->CheckCharacter(chars[0], &ok);
908         macro_assembler->CheckNotCharacter(chars[1], on_failure);
909         macro_assembler->Bind(&ok);
910       }
911       break;
912     }
913     case 4:
914       macro_assembler->CheckCharacter(chars[3], &ok);
915       V8_FALLTHROUGH;
916     case 3:
917       macro_assembler->CheckCharacter(chars[0], &ok);
918       macro_assembler->CheckCharacter(chars[1], &ok);
919       macro_assembler->CheckNotCharacter(chars[2], on_failure);
920       macro_assembler->Bind(&ok);
921       break;
922     default:
923       UNREACHABLE();
924   }
925   return true;
926 }
927 
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)928 static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
929                              Label* fall_through, Label* above_or_equal,
930                              Label* below) {
931   if (below != fall_through) {
932     masm->CheckCharacterLT(border, below);
933     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
934   } else {
935     masm->CheckCharacterGT(border - 1, above_or_equal);
936   }
937 }
938 
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)939 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
940                                    int last, Label* fall_through,
941                                    Label* in_range, Label* out_of_range) {
942   if (in_range == fall_through) {
943     if (first == last) {
944       masm->CheckNotCharacter(first, out_of_range);
945     } else {
946       masm->CheckCharacterNotInRange(first, last, out_of_range);
947     }
948   } else {
949     if (first == last) {
950       masm->CheckCharacter(first, in_range);
951     } else {
952       masm->CheckCharacterInRange(first, last, in_range);
953     }
954     if (out_of_range != fall_through) masm->GoTo(out_of_range);
955   }
956 }
957 
958 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
959 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uc32 min_char,Label * fall_through,Label * even_label,Label * odd_label)960 static void EmitUseLookupTable(RegExpMacroAssembler* masm,
961                                ZoneList<uc32>* ranges, uint32_t start_index,
962                                uint32_t end_index, uc32 min_char,
963                                Label* fall_through, Label* even_label,
964                                Label* odd_label) {
965   static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
966   static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
967 
968   uc32 base = (min_char & ~kMask);
969   USE(base);
970 
971   // Assert that everything is on one kTableSize page.
972   for (uint32_t i = start_index; i <= end_index; i++) {
973     DCHECK_EQ(ranges->at(i) & ~kMask, base);
974   }
975   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
976 
977   char templ[kSize];
978   Label* on_bit_set;
979   Label* on_bit_clear;
980   int bit;
981   if (even_label == fall_through) {
982     on_bit_set = odd_label;
983     on_bit_clear = even_label;
984     bit = 1;
985   } else {
986     on_bit_set = even_label;
987     on_bit_clear = odd_label;
988     bit = 0;
989   }
990   for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
991        i++) {
992     templ[i] = bit;
993   }
994   uint32_t j = 0;
995   bit ^= 1;
996   for (uint32_t i = start_index; i < end_index; i++) {
997     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
998       templ[j] = bit;
999     }
1000     bit ^= 1;
1001   }
1002   for (uint32_t i = j; i < kSize; i++) {
1003     templ[i] = bit;
1004   }
1005   Factory* factory = masm->isolate()->factory();
1006   // TODO(erikcorry): Cache these.
1007   Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1008   for (uint32_t i = 0; i < kSize; i++) {
1009     ba->set(i, templ[i]);
1010   }
1011   masm->CheckBitInTable(ba, on_bit_set);
1012   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1013 }
1014 
CutOutRange(RegExpMacroAssembler * masm,ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t cut_index,Label * even_label,Label * odd_label)1015 static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
1016                         uint32_t start_index, uint32_t end_index,
1017                         uint32_t cut_index, Label* even_label,
1018                         Label* odd_label) {
1019   bool odd = (((cut_index - start_index) & 1) == 1);
1020   Label* in_range_label = odd ? odd_label : even_label;
1021   Label dummy;
1022   EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
1023                          ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
1024                          &dummy);
1025   DCHECK(!dummy.is_linked());
1026   // Cut out the single range by rewriting the array.  This creates a new
1027   // range that is a merger of the two ranges on either side of the one we
1028   // are cutting out.  The oddity of the labels is preserved.
1029   for (uint32_t j = cut_index; j > start_index; j--) {
1030     ranges->at(j) = ranges->at(j - 1);
1031   }
1032   for (uint32_t j = cut_index + 1; j < end_index; j++) {
1033     ranges->at(j) = ranges->at(j + 1);
1034   }
1035 }
1036 
1037 // Unicode case.  Split the search space into kSize spaces that are handled
1038 // with recursion.
SplitSearchSpace(ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t * new_start_index,uint32_t * new_end_index,uc32 * border)1039 static void SplitSearchSpace(ZoneList<uc32>* ranges, uint32_t start_index,
1040                              uint32_t end_index, uint32_t* new_start_index,
1041                              uint32_t* new_end_index, uc32* border) {
1042   static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
1043   static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
1044 
1045   uc32 first = ranges->at(start_index);
1046   uc32 last = ranges->at(end_index) - 1;
1047 
1048   *new_start_index = start_index;
1049   *border = (ranges->at(start_index) & ~kMask) + kSize;
1050   while (*new_start_index < end_index) {
1051     if (ranges->at(*new_start_index) > *border) break;
1052     (*new_start_index)++;
1053   }
1054   // new_start_index is the index of the first edge that is beyond the
1055   // current kSize space.
1056 
1057   // For very large search spaces we do a binary chop search of the non-Latin1
1058   // space instead of just going to the end of the current kSize space.  The
1059   // heuristics are complicated a little by the fact that any 128-character
1060   // encoding space can be quickly tested with a table lookup, so we don't
1061   // wish to do binary chop search at a smaller granularity than that.  A
1062   // 128-character space can take up a lot of space in the ranges array if,
1063   // for example, we only want to match every second character (eg. the lower
1064   // case characters on some Unicode pages).
1065   uint32_t binary_chop_index = (end_index + start_index) / 2;
1066   // The first test ensures that we get to the code that handles the Latin1
1067   // range with a single not-taken branch, speeding up this important
1068   // character range (even non-Latin1 charset-based text has spaces and
1069   // punctuation).
1070   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1071       end_index - start_index > (*new_start_index - start_index) * 2 &&
1072       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1073       ranges->at(binary_chop_index) >= first + 2 * kSize) {
1074     uint32_t scan_forward_for_section_border = binary_chop_index;
1075     uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1076 
1077     while (scan_forward_for_section_border < end_index) {
1078       if (ranges->at(scan_forward_for_section_border) > new_border) {
1079         *new_start_index = scan_forward_for_section_border;
1080         *border = new_border;
1081         break;
1082       }
1083       scan_forward_for_section_border++;
1084     }
1085   }
1086 
1087   DCHECK(*new_start_index > start_index);
1088   *new_end_index = *new_start_index - 1;
1089   if (ranges->at(*new_end_index) == *border) {
1090     (*new_end_index)--;
1091   }
1092   if (*border >= ranges->at(end_index)) {
1093     *border = ranges->at(end_index);
1094     *new_start_index = end_index;  // Won't be used.
1095     *new_end_index = end_index - 1;
1096   }
1097 }
1098 
1099 // Gets a series of segment boundaries representing a character class.  If the
1100 // character is in the range between an even and an odd boundary (counting from
1101 // start_index) then go to even_label, otherwise go to odd_label.  We already
1102 // know that the character is in the range of min_char to max_char inclusive.
1103 // Either label can be nullptr indicating backtracking.  Either label can also
1104 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1105 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
1106                              uint32_t start_index, uint32_t end_index,
1107                              uc32 min_char, uc32 max_char, Label* fall_through,
1108                              Label* even_label, Label* odd_label) {
1109   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1110   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1111 
1112   uc32 first = ranges->at(start_index);
1113   uc32 last = ranges->at(end_index) - 1;
1114 
1115   DCHECK_LT(min_char, first);
1116 
1117   // Just need to test if the character is before or on-or-after
1118   // a particular character.
1119   if (start_index == end_index) {
1120     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1121     return;
1122   }
1123 
1124   // Another almost trivial case:  There is one interval in the middle that is
1125   // different from the end intervals.
1126   if (start_index + 1 == end_index) {
1127     EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1128                            odd_label);
1129     return;
1130   }
1131 
1132   // It's not worth using table lookup if there are very few intervals in the
1133   // character class.
1134   if (end_index - start_index <= 6) {
1135     // It is faster to test for individual characters, so we look for those
1136     // first, then try arbitrary ranges in the second round.
1137     static uint32_t kNoCutIndex = -1;
1138     uint32_t cut = kNoCutIndex;
1139     for (uint32_t i = start_index; i < end_index; i++) {
1140       if (ranges->at(i) == ranges->at(i + 1) - 1) {
1141         cut = i;
1142         break;
1143       }
1144     }
1145     if (cut == kNoCutIndex) cut = start_index;
1146     CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1147                 odd_label);
1148     DCHECK_GE(end_index - start_index, 2);
1149     GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1150                      max_char, fall_through, even_label, odd_label);
1151     return;
1152   }
1153 
1154   // If there are a lot of intervals in the regexp, then we will use tables to
1155   // determine whether the character is inside or outside the character class.
1156   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1157 
1158   if ((max_char >> kBits) == (min_char >> kBits)) {
1159     EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1160                        fall_through, even_label, odd_label);
1161     return;
1162   }
1163 
1164   if ((min_char >> kBits) != first >> kBits) {
1165     masm->CheckCharacterLT(first, odd_label);
1166     GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1167                      fall_through, odd_label, even_label);
1168     return;
1169   }
1170 
1171   uint32_t new_start_index = 0;
1172   uint32_t new_end_index = 0;
1173   uc32 border = 0;
1174 
1175   SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1176                    &new_end_index, &border);
1177 
1178   Label handle_rest;
1179   Label* above = &handle_rest;
1180   if (border == last + 1) {
1181     // We didn't find any section that started after the limit, so everything
1182     // above the border is one of the terminal labels.
1183     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1184     DCHECK(new_end_index == end_index - 1);
1185   }
1186 
1187   DCHECK_LE(start_index, new_end_index);
1188   DCHECK_LE(new_start_index, end_index);
1189   DCHECK_LT(start_index, new_start_index);
1190   DCHECK_LT(new_end_index, end_index);
1191   DCHECK(new_end_index + 1 == new_start_index ||
1192          (new_end_index + 2 == new_start_index &&
1193           border == ranges->at(new_end_index + 1)));
1194   DCHECK_LT(min_char, border - 1);
1195   DCHECK_LT(border, max_char);
1196   DCHECK_LT(ranges->at(new_end_index), border);
1197   DCHECK(border < ranges->at(new_start_index) ||
1198          (border == ranges->at(new_start_index) &&
1199           new_start_index == end_index && new_end_index == end_index - 1 &&
1200           border == last + 1));
1201   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1202 
1203   masm->CheckCharacterGT(border - 1, above);
1204   Label dummy;
1205   GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1206                    border - 1, &dummy, even_label, odd_label);
1207   if (handle_rest.is_linked()) {
1208     masm->Bind(&handle_rest);
1209     bool flip = (new_start_index & 1) != (start_index & 1);
1210     GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1211                      &dummy, flip ? odd_label : even_label,
1212                      flip ? even_label : odd_label);
1213   }
1214 }
1215 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)1216 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1217                           RegExpCharacterClass* cc, bool one_byte,
1218                           Label* on_failure, int cp_offset, bool check_offset,
1219                           bool preloaded, Zone* zone) {
1220   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
1221   CharacterRange::Canonicalize(ranges);
1222 
1223   const uc32 max_char = MaxCodeUnit(one_byte);
1224   int range_count = ranges->length();
1225 
1226   int last_valid_range = range_count - 1;
1227   while (last_valid_range >= 0) {
1228     CharacterRange& range = ranges->at(last_valid_range);
1229     if (range.from() <= max_char) break;
1230     last_valid_range--;
1231   }
1232 
1233   if (last_valid_range < 0) {
1234     if (!cc->is_negated()) {
1235       macro_assembler->GoTo(on_failure);
1236     }
1237     if (check_offset) {
1238       macro_assembler->CheckPosition(cp_offset, on_failure);
1239     }
1240     return;
1241   }
1242 
1243   if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) {
1244     if (cc->is_negated()) {
1245       macro_assembler->GoTo(on_failure);
1246     } else {
1247       // This is a common case hit by non-anchored expressions.
1248       if (check_offset) {
1249         macro_assembler->CheckPosition(cp_offset, on_failure);
1250       }
1251     }
1252     return;
1253   }
1254 
1255   if (!preloaded) {
1256     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1257   }
1258 
1259   if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass(
1260                                    cc->standard_type(), on_failure)) {
1261     return;
1262   }
1263 
1264   // A new list with ascending entries.  Each entry is a code unit
1265   // where there is a boundary between code units that are part of
1266   // the class and code units that are not.  Normally we insert an
1267   // entry at zero which goes to the failure label, but if there
1268   // was already one there we fall through for success on that entry.
1269   // Subsequent entries have alternating meaning (success/failure).
1270   ZoneList<uc32>* range_boundaries =
1271       zone->New<ZoneList<uc32>>(last_valid_range, zone);
1272 
1273   bool zeroth_entry_is_failure = !cc->is_negated();
1274 
1275   for (int i = 0; i <= last_valid_range; i++) {
1276     CharacterRange& range = ranges->at(i);
1277     if (range.from() == 0) {
1278       DCHECK_EQ(i, 0);
1279       zeroth_entry_is_failure = !zeroth_entry_is_failure;
1280     } else {
1281       range_boundaries->Add(range.from(), zone);
1282     }
1283     range_boundaries->Add(range.to() + 1, zone);
1284   }
1285   int end_index = range_boundaries->length() - 1;
1286   if (range_boundaries->at(end_index) > max_char) {
1287     end_index--;
1288   }
1289 
1290   Label fall_through;
1291   GenerateBranches(macro_assembler, range_boundaries,
1292                    0,  // start_index.
1293                    end_index,
1294                    0,  // min_char.
1295                    max_char, &fall_through,
1296                    zeroth_entry_is_failure ? &fall_through : on_failure,
1297                    zeroth_entry_is_failure ? on_failure : &fall_through);
1298   macro_assembler->Bind(&fall_through);
1299 }
1300 
1301 RegExpNode::~RegExpNode() = default;
1302 
LimitVersions(RegExpCompiler * compiler,Trace * trace)1303 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1304                                                   Trace* trace) {
1305   // If we are generating a greedy loop then don't stop and don't reuse code.
1306   if (trace->stop_node() != nullptr) {
1307     return CONTINUE;
1308   }
1309 
1310   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1311   if (trace->is_trivial()) {
1312     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
1313       // If a generic version is already scheduled to be generated or we have
1314       // recursed too deeply then just generate a jump to that code.
1315       macro_assembler->GoTo(&label_);
1316       // This will queue it up for generation of a generic version if it hasn't
1317       // already been queued.
1318       compiler->AddWork(this);
1319       return DONE;
1320     }
1321     // Generate generic version of the node and bind the label for later use.
1322     macro_assembler->Bind(&label_);
1323     return CONTINUE;
1324   }
1325 
1326   // We are being asked to make a non-generic version.  Keep track of how many
1327   // non-generic versions we generate so as not to overdo it.
1328   trace_count_++;
1329   if (KeepRecursing(compiler) && compiler->optimize() &&
1330       trace_count_ < kMaxCopiesCodeGenerated) {
1331     return CONTINUE;
1332   }
1333 
1334   // If we get here code has been generated for this node too many times or
1335   // recursion is too deep.  Time to switch to a generic version.  The code for
1336   // generic versions above can handle deep recursion properly.
1337   bool was_limiting = compiler->limiting_recursion();
1338   compiler->set_limiting_recursion(true);
1339   trace->Flush(compiler, this);
1340   compiler->set_limiting_recursion(was_limiting);
1341   return DONE;
1342 }
1343 
KeepRecursing(RegExpCompiler * compiler)1344 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
1345   return !compiler->limiting_recursion() &&
1346          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
1347 }
1348 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1349 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1350                               BoyerMooreLookahead* bm, bool not_at_start) {
1351   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) {
1352     // Anything may follow a positive submatch success, thus we need to accept
1353     // all characters from this position onwards.
1354     bm->SetRest(offset);
1355   } else {
1356     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1357   }
1358   SaveBMInfo(bm, not_at_start, offset);
1359 }
1360 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1361 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1362                                       RegExpCompiler* compiler, int filled_in,
1363                                       bool not_at_start) {
1364   if (action_type_ == SET_REGISTER_FOR_LOOP) {
1365     on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
1366                                                     filled_in, not_at_start);
1367   } else {
1368     on_success()->GetQuickCheckDetails(details, compiler, filled_in,
1369                                        not_at_start);
1370   }
1371 }
1372 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1373 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1374                                  BoyerMooreLookahead* bm, bool not_at_start) {
1375   // Match the behaviour of EatsAtLeast on this node.
1376   if (assertion_type() == AT_START && not_at_start) return;
1377   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1378   SaveBMInfo(bm, not_at_start, offset);
1379 }
1380 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1381 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1382     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
1383     bool not_at_start) {
1384   RegExpNode* node = continue_node();
1385   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1386 }
1387 
1388 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1389 static inline uint32_t SmearBitsRight(uint32_t v) {
1390   v |= v >> 1;
1391   v |= v >> 2;
1392   v |= v >> 4;
1393   v |= v >> 8;
1394   v |= v >> 16;
1395   return v;
1396 }
1397 
Rationalize(bool asc)1398 bool QuickCheckDetails::Rationalize(bool asc) {
1399   bool found_useful_op = false;
1400   const uint32_t char_mask = CharMask(asc);
1401   mask_ = 0;
1402   value_ = 0;
1403   int char_shift = 0;
1404   for (int i = 0; i < characters_; i++) {
1405     Position* pos = &positions_[i];
1406     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
1407       found_useful_op = true;
1408     }
1409     mask_ |= (pos->mask & char_mask) << char_shift;
1410     value_ |= (pos->value & char_mask) << char_shift;
1411     char_shift += asc ? 8 : 16;
1412   }
1413   return found_useful_op;
1414 }
1415 
EatsAtLeast(bool not_at_start)1416 int RegExpNode::EatsAtLeast(bool not_at_start) {
1417   return not_at_start ? eats_at_least_.eats_at_least_from_not_start
1418                       : eats_at_least_.eats_at_least_from_possibly_start;
1419 }
1420 
EatsAtLeastFromLoopEntry()1421 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
1422   // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
1423   // implies that the following node must be a LoopChoiceNode. If we need to
1424   // set registers to constant values for other reasons, we could introduce a
1425   // new action type SET_REGISTER that doesn't imply anything about its
1426   // successor.
1427   UNREACHABLE();
1428 }
1429 
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1430 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
1431                                                    RegExpCompiler* compiler,
1432                                                    int characters_filled_in,
1433                                                    bool not_at_start) {
1434   // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
1435   UNREACHABLE();
1436 }
1437 
EatsAtLeastFromLoopEntry()1438 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
1439   DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
1440 
1441   if (read_backward()) {
1442     // The eats_at_least value is not used if reading backward. The
1443     // EatsAtLeastPropagator should've zeroed it as well.
1444     DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
1445     DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
1446     return {};
1447   }
1448 
1449   // Figure out how much the loop body itself eats, not including anything in
1450   // the continuation case. In general, the nodes in the loop body should report
1451   // that they eat at least the number eaten by the continuation node, since any
1452   // successful match in the loop body must also include the continuation node.
1453   // However, in some cases involving positive lookaround, the loop body under-
1454   // reports its appetite, so use saturated math here to avoid negative numbers.
1455   uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
1456       loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
1457   uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
1458       loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
1459 
1460   // Limit the number of loop iterations to avoid overflow in subsequent steps.
1461   int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
1462 
1463   EatsAtLeastInfo result;
1464   result.eats_at_least_from_not_start =
1465       base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
1466                                     continue_node_->EatsAtLeast(true));
1467   if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
1468     // First loop iteration eats at least one, so all subsequent iterations
1469     // and the after-loop chunk are guaranteed to not be at the start.
1470     result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
1471         loop_body_from_possibly_start +
1472         (loop_iterations - 1) * loop_body_from_not_start +
1473         continue_node_->EatsAtLeast(true));
1474   } else {
1475     // Loop body might eat nothing, so only continue node contributes.
1476     result.eats_at_least_from_possibly_start =
1477         continue_node_->EatsAtLeast(false);
1478   }
1479   return result;
1480 }
1481 
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure,ChoiceNode * predecessor)1482 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1483                                 Trace* bounds_check_trace, Trace* trace,
1484                                 bool preload_has_checked_bounds,
1485                                 Label* on_possible_success,
1486                                 QuickCheckDetails* details,
1487                                 bool fall_through_on_failure,
1488                                 ChoiceNode* predecessor) {
1489   DCHECK_NOT_NULL(predecessor);
1490   if (details->characters() == 0) return false;
1491   GetQuickCheckDetails(details, compiler, 0,
1492                        trace->at_start() == Trace::FALSE_VALUE);
1493   if (details->cannot_match()) return false;
1494   if (!details->Rationalize(compiler->one_byte())) return false;
1495   DCHECK(details->characters() == 1 ||
1496          compiler->macro_assembler()->CanReadUnaligned());
1497   uint32_t mask = details->mask();
1498   uint32_t value = details->value();
1499 
1500   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1501 
1502   if (trace->characters_preloaded() != details->characters()) {
1503     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
1504     // The bounds check is performed using the minimum number of characters
1505     // any choice would eat, so if the bounds check fails, then none of the
1506     // choices can succeed, so we can just immediately backtrack, rather
1507     // than go to the next choice. The number of characters preloaded may be
1508     // less than the number used for the bounds check.
1509     int eats_at_least = predecessor->EatsAtLeast(
1510         bounds_check_trace->at_start() == Trace::FALSE_VALUE);
1511     DCHECK_GE(eats_at_least, details->characters());
1512     assembler->LoadCurrentCharacter(
1513         trace->cp_offset(), bounds_check_trace->backtrack(),
1514         !preload_has_checked_bounds, details->characters(), eats_at_least);
1515   }
1516 
1517   bool need_mask = true;
1518 
1519   if (details->characters() == 1) {
1520     // If number of characters preloaded is 1 then we used a byte or 16 bit
1521     // load so the value is already masked down.
1522     const uint32_t char_mask = CharMask(compiler->one_byte());
1523     if ((mask & char_mask) == char_mask) need_mask = false;
1524     mask &= char_mask;
1525   } else {
1526     // For 2-character preloads in one-byte mode or 1-character preloads in
1527     // two-byte mode we also use a 16 bit load with zero extend.
1528     static const uint32_t kTwoByteMask = 0xFFFF;
1529     static const uint32_t kFourByteMask = 0xFFFFFFFF;
1530     if (details->characters() == 2 && compiler->one_byte()) {
1531       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1532     } else if (details->characters() == 1 && !compiler->one_byte()) {
1533       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1534     } else {
1535       if (mask == kFourByteMask) need_mask = false;
1536     }
1537   }
1538 
1539   if (fall_through_on_failure) {
1540     if (need_mask) {
1541       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1542     } else {
1543       assembler->CheckCharacter(value, on_possible_success);
1544     }
1545   } else {
1546     if (need_mask) {
1547       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1548     } else {
1549       assembler->CheckNotCharacter(value, trace->backtrack());
1550     }
1551   }
1552   return true;
1553 }
1554 
1555 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1556 // super-class in the .h file).
1557 //
1558 // We iterate along the text object, building up for each character a
1559 // mask and value that can be used to test for a quick failure to match.
1560 // The masks and values for the positions will be combined into a single
1561 // machine word for the current character width in order to be used in
1562 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1563 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1564                                     RegExpCompiler* compiler,
1565                                     int characters_filled_in,
1566                                     bool not_at_start) {
1567   // Do not collect any quick check details if the text node reads backward,
1568   // since it reads in the opposite direction than we use for quick checks.
1569   if (read_backward()) return;
1570   Isolate* isolate = compiler->macro_assembler()->isolate();
1571   DCHECK(characters_filled_in < details->characters());
1572   int characters = details->characters();
1573   const uint32_t char_mask = CharMask(compiler->one_byte());
1574   for (int k = 0; k < elements()->length(); k++) {
1575     TextElement elm = elements()->at(k);
1576     if (elm.text_type() == TextElement::ATOM) {
1577       Vector<const uc16> quarks = elm.atom()->data();
1578       for (int i = 0; i < characters && i < quarks.length(); i++) {
1579         QuickCheckDetails::Position* pos =
1580             details->positions(characters_filled_in);
1581         uc16 c = quarks[i];
1582         if (elm.atom()->ignore_case()) {
1583           unibrow::uchar chars[4];
1584           int length = GetCaseIndependentLetters(
1585               isolate, c, compiler->one_byte(), chars, 4);
1586           if (length == 0) {
1587             // This can happen because all case variants are non-Latin1, but we
1588             // know the input is Latin1.
1589             details->set_cannot_match();
1590             pos->determines_perfectly = false;
1591             return;
1592           }
1593           if (length == 1) {
1594             // This letter has no case equivalents, so it's nice and simple
1595             // and the mask-compare will determine definitely whether we have
1596             // a match at this character position.
1597             pos->mask = char_mask;
1598             pos->value = chars[0];
1599             pos->determines_perfectly = true;
1600           } else {
1601             uint32_t common_bits = char_mask;
1602             uint32_t bits = chars[0];
1603             for (int j = 1; j < length; j++) {
1604               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1605               common_bits ^= differing_bits;
1606               bits &= common_bits;
1607             }
1608             // If length is 2 and common bits has only one zero in it then
1609             // our mask and compare instruction will determine definitely
1610             // whether we have a match at this character position.  Otherwise
1611             // it can only be an approximate check.
1612             uint32_t one_zero = (common_bits | ~char_mask);
1613             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1614               pos->determines_perfectly = true;
1615             }
1616             pos->mask = common_bits;
1617             pos->value = bits;
1618           }
1619         } else {
1620           // Don't ignore case.  Nice simple case where the mask-compare will
1621           // determine definitely whether we have a match at this character
1622           // position.
1623           if (c > char_mask) {
1624             details->set_cannot_match();
1625             pos->determines_perfectly = false;
1626             return;
1627           }
1628           pos->mask = char_mask;
1629           pos->value = c;
1630           pos->determines_perfectly = true;
1631         }
1632         characters_filled_in++;
1633         DCHECK(characters_filled_in <= details->characters());
1634         if (characters_filled_in == details->characters()) {
1635           return;
1636         }
1637       }
1638     } else {
1639       QuickCheckDetails::Position* pos =
1640           details->positions(characters_filled_in);
1641       RegExpCharacterClass* tree = elm.char_class();
1642       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
1643       DCHECK(!ranges->is_empty());
1644       if (tree->is_negated()) {
1645         // A quick check uses multi-character mask and compare.  There is no
1646         // useful way to incorporate a negative char class into this scheme
1647         // so we just conservatively create a mask and value that will always
1648         // succeed.
1649         pos->mask = 0;
1650         pos->value = 0;
1651       } else {
1652         int first_range = 0;
1653         while (ranges->at(first_range).from() > char_mask) {
1654           first_range++;
1655           if (first_range == ranges->length()) {
1656             details->set_cannot_match();
1657             pos->determines_perfectly = false;
1658             return;
1659           }
1660         }
1661         CharacterRange range = ranges->at(first_range);
1662         const uc32 first_from = range.from();
1663         const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to();
1664         const uint32_t differing_bits = (first_from ^ first_to);
1665         // A mask and compare is only perfect if the differing bits form a
1666         // number like 00011111 with one single block of trailing 1s.
1667         if ((differing_bits & (differing_bits + 1)) == 0 &&
1668             first_from + differing_bits == first_to) {
1669           pos->determines_perfectly = true;
1670         }
1671         uint32_t common_bits = ~SmearBitsRight(differing_bits);
1672         uint32_t bits = (first_from & common_bits);
1673         for (int i = first_range + 1; i < ranges->length(); i++) {
1674           CharacterRange range = ranges->at(i);
1675           const uc32 from = range.from();
1676           if (from > char_mask) continue;
1677           const uc32 to = (range.to() > char_mask) ? char_mask : range.to();
1678           // Here we are combining more ranges into the mask and compare
1679           // value.  With each new range the mask becomes more sparse and
1680           // so the chances of a false positive rise.  A character class
1681           // with multiple ranges is assumed never to be equivalent to a
1682           // mask and compare operation.
1683           pos->determines_perfectly = false;
1684           uint32_t new_common_bits = (from ^ to);
1685           new_common_bits = ~SmearBitsRight(new_common_bits);
1686           common_bits &= new_common_bits;
1687           bits &= new_common_bits;
1688           uint32_t differing_bits = (from & common_bits) ^ bits;
1689           common_bits ^= differing_bits;
1690           bits &= common_bits;
1691         }
1692         pos->mask = common_bits;
1693         pos->value = bits;
1694       }
1695       characters_filled_in++;
1696       DCHECK(characters_filled_in <= details->characters());
1697       if (characters_filled_in == details->characters()) return;
1698     }
1699   }
1700   DCHECK(characters_filled_in != details->characters());
1701   if (!details->cannot_match()) {
1702     on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1703                                        true);
1704   }
1705 }
1706 
Clear()1707 void QuickCheckDetails::Clear() {
1708   for (int i = 0; i < characters_; i++) {
1709     positions_[i].mask = 0;
1710     positions_[i].value = 0;
1711     positions_[i].determines_perfectly = false;
1712   }
1713   characters_ = 0;
1714 }
1715 
Advance(int by,bool one_byte)1716 void QuickCheckDetails::Advance(int by, bool one_byte) {
1717   if (by >= characters_ || by < 0) {
1718     DCHECK_IMPLIES(by < 0, characters_ == 0);
1719     Clear();
1720     return;
1721   }
1722   DCHECK_LE(characters_ - by, 4);
1723   DCHECK_LE(characters_, 4);
1724   for (int i = 0; i < characters_ - by; i++) {
1725     positions_[i] = positions_[by + i];
1726   }
1727   for (int i = characters_ - by; i < characters_; i++) {
1728     positions_[i].mask = 0;
1729     positions_[i].value = 0;
1730     positions_[i].determines_perfectly = false;
1731   }
1732   characters_ -= by;
1733   // We could change mask_ and value_ here but we would never advance unless
1734   // they had already been used in a check and they won't be used again because
1735   // it would gain us nothing.  So there's no point.
1736 }
1737 
Merge(QuickCheckDetails * other,int from_index)1738 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1739   DCHECK(characters_ == other->characters_);
1740   if (other->cannot_match_) {
1741     return;
1742   }
1743   if (cannot_match_) {
1744     *this = *other;
1745     return;
1746   }
1747   for (int i = from_index; i < characters_; i++) {
1748     QuickCheckDetails::Position* pos = positions(i);
1749     QuickCheckDetails::Position* other_pos = other->positions(i);
1750     if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1751         !other_pos->determines_perfectly) {
1752       // Our mask-compare operation will be approximate unless we have the
1753       // exact same operation on both sides of the alternation.
1754       pos->determines_perfectly = false;
1755     }
1756     pos->mask &= other_pos->mask;
1757     pos->value &= pos->mask;
1758     other_pos->value &= pos->mask;
1759     uint32_t differing_bits = (pos->value ^ other_pos->value);
1760     pos->mask &= ~differing_bits;
1761     pos->value &= pos->mask;
1762   }
1763 }
1764 
1765 class VisitMarker {
1766  public:
VisitMarker(NodeInfo * info)1767   explicit VisitMarker(NodeInfo* info) : info_(info) {
1768     DCHECK(!info->visited);
1769     info->visited = true;
1770   }
~VisitMarker()1771   ~VisitMarker() { info_->visited = false; }
1772 
1773  private:
1774   NodeInfo* info_;
1775 };
1776 
1777 // Temporarily sets traversed_loop_initialization_node_.
1778 class LoopInitializationMarker {
1779  public:
LoopInitializationMarker(LoopChoiceNode * node)1780   explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
1781     DCHECK(!node_->traversed_loop_initialization_node_);
1782     node_->traversed_loop_initialization_node_ = true;
1783   }
~LoopInitializationMarker()1784   ~LoopInitializationMarker() {
1785     DCHECK(node_->traversed_loop_initialization_node_);
1786     node_->traversed_loop_initialization_node_ = false;
1787   }
1788   LoopInitializationMarker(const LoopInitializationMarker&) = delete;
1789   LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
1790 
1791  private:
1792   LoopChoiceNode* node_;
1793 };
1794 
1795 // Temporarily decrements min_loop_iterations_.
1796 class IterationDecrementer {
1797  public:
IterationDecrementer(LoopChoiceNode * node)1798   explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
1799     DCHECK_GT(node_->min_loop_iterations_, 0);
1800     --node_->min_loop_iterations_;
1801   }
~IterationDecrementer()1802   ~IterationDecrementer() { ++node_->min_loop_iterations_; }
1803   IterationDecrementer(const IterationDecrementer&) = delete;
1804   IterationDecrementer& operator=(const IterationDecrementer&) = delete;
1805 
1806  private:
1807   LoopChoiceNode* node_;
1808 };
1809 
FilterOneByte(int depth)1810 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
1811   if (info()->replacement_calculated) return replacement();
1812   if (depth < 0) return this;
1813   DCHECK(!info()->visited);
1814   VisitMarker marker(info());
1815   return FilterSuccessor(depth - 1);
1816 }
1817 
FilterSuccessor(int depth)1818 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
1819   RegExpNode* next = on_success_->FilterOneByte(depth - 1);
1820   if (next == nullptr) return set_replacement(nullptr);
1821   on_success_ = next;
1822   return set_replacement(this);
1823 }
1824 
1825 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)1826 bool RangeContainsLatin1Equivalents(CharacterRange range) {
1827   // TODO(dcarney): this could be a lot more efficient.
1828   return range.Contains(0x039C) || range.Contains(0x03BC) ||
1829          range.Contains(0x0178);
1830 }
1831 
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)1832 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
1833   for (int i = 0; i < ranges->length(); i++) {
1834     // TODO(dcarney): this could be a lot more efficient.
1835     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
1836   }
1837   return false;
1838 }
1839 
FilterOneByte(int depth)1840 RegExpNode* TextNode::FilterOneByte(int depth) {
1841   if (info()->replacement_calculated) return replacement();
1842   if (depth < 0) return this;
1843   DCHECK(!info()->visited);
1844   VisitMarker marker(info());
1845   int element_count = elements()->length();
1846   for (int i = 0; i < element_count; i++) {
1847     TextElement elm = elements()->at(i);
1848     if (elm.text_type() == TextElement::ATOM) {
1849       Vector<const uc16> quarks = elm.atom()->data();
1850       for (int j = 0; j < quarks.length(); j++) {
1851         uc16 c = quarks[j];
1852         if (elm.atom()->ignore_case()) {
1853           c = unibrow::Latin1::TryConvertToLatin1(c);
1854         }
1855         if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
1856         // Replace quark in case we converted to Latin-1.
1857         uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
1858         writable_quarks[j] = c;
1859       }
1860     } else {
1861       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
1862       RegExpCharacterClass* cc = elm.char_class();
1863       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1864       CharacterRange::Canonicalize(ranges);
1865       // Now they are in order so we only need to look at the first.
1866       int range_count = ranges->length();
1867       if (cc->is_negated()) {
1868         if (range_count != 0 && ranges->at(0).from() == 0 &&
1869             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
1870           // This will be handled in a later filter.
1871           if (IgnoreCase(cc->flags()) &&
1872               RangesContainLatin1Equivalents(ranges)) {
1873             continue;
1874           }
1875           return set_replacement(nullptr);
1876         }
1877       } else {
1878         if (range_count == 0 ||
1879             ranges->at(0).from() > String::kMaxOneByteCharCode) {
1880           // This will be handled in a later filter.
1881           if (IgnoreCase(cc->flags()) &&
1882               RangesContainLatin1Equivalents(ranges)) {
1883             continue;
1884           }
1885           return set_replacement(nullptr);
1886         }
1887       }
1888     }
1889   }
1890   return FilterSuccessor(depth - 1);
1891 }
1892 
FilterOneByte(int depth)1893 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
1894   if (info()->replacement_calculated) return replacement();
1895   if (depth < 0) return this;
1896   if (info()->visited) return this;
1897   {
1898     VisitMarker marker(info());
1899 
1900     RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
1901     // If we can't continue after the loop then there is no sense in doing the
1902     // loop.
1903     if (continue_replacement == nullptr) return set_replacement(nullptr);
1904   }
1905 
1906   return ChoiceNode::FilterOneByte(depth - 1);
1907 }
1908 
FilterOneByte(int depth)1909 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
1910   if (info()->replacement_calculated) return replacement();
1911   if (depth < 0) return this;
1912   if (info()->visited) return this;
1913   VisitMarker marker(info());
1914   int choice_count = alternatives_->length();
1915 
1916   for (int i = 0; i < choice_count; i++) {
1917     GuardedAlternative alternative = alternatives_->at(i);
1918     if (alternative.guards() != nullptr &&
1919         alternative.guards()->length() != 0) {
1920       set_replacement(this);
1921       return this;
1922     }
1923   }
1924 
1925   int surviving = 0;
1926   RegExpNode* survivor = nullptr;
1927   for (int i = 0; i < choice_count; i++) {
1928     GuardedAlternative alternative = alternatives_->at(i);
1929     RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
1930     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
1931     if (replacement != nullptr) {
1932       alternatives_->at(i).set_node(replacement);
1933       surviving++;
1934       survivor = replacement;
1935     }
1936   }
1937   if (surviving < 2) return set_replacement(survivor);
1938 
1939   set_replacement(this);
1940   if (surviving == choice_count) {
1941     return this;
1942   }
1943   // Only some of the nodes survived the filtering.  We need to rebuild the
1944   // alternatives list.
1945   ZoneList<GuardedAlternative>* new_alternatives =
1946       zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
1947   for (int i = 0; i < choice_count; i++) {
1948     RegExpNode* replacement =
1949         alternatives_->at(i).node()->FilterOneByte(depth - 1);
1950     if (replacement != nullptr) {
1951       alternatives_->at(i).set_node(replacement);
1952       new_alternatives->Add(alternatives_->at(i), zone());
1953     }
1954   }
1955   alternatives_ = new_alternatives;
1956   return this;
1957 }
1958 
FilterOneByte(int depth)1959 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
1960   if (info()->replacement_calculated) return replacement();
1961   if (depth < 0) return this;
1962   if (info()->visited) return this;
1963   VisitMarker marker(info());
1964   // Alternative 0 is the negative lookahead, alternative 1 is what comes
1965   // afterwards.
1966   RegExpNode* node = continue_node();
1967   RegExpNode* replacement = node->FilterOneByte(depth - 1);
1968   if (replacement == nullptr) return set_replacement(nullptr);
1969   alternatives_->at(kContinueIndex).set_node(replacement);
1970 
1971   RegExpNode* neg_node = lookaround_node();
1972   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
1973   // If the negative lookahead is always going to fail then
1974   // we don't need to check it.
1975   if (neg_replacement == nullptr) return set_replacement(replacement);
1976   alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
1977   return set_replacement(this);
1978 }
1979 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1980 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1981                                           RegExpCompiler* compiler,
1982                                           int characters_filled_in,
1983                                           bool not_at_start) {
1984   if (body_can_be_zero_length_ || info()->visited) return;
1985   not_at_start = not_at_start || this->not_at_start();
1986   DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
1987   if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
1988       loop_node_->EatsAtLeast(not_at_start) >
1989           continue_node_->EatsAtLeast(true)) {
1990     // Loop body is guaranteed to execute at least once, and consume characters
1991     // when it does, meaning the only possible quick checks from this point
1992     // begin with the loop body. We may recursively visit this LoopChoiceNode,
1993     // but we temporarily decrease its minimum iteration counter so we know when
1994     // to check the continue case.
1995     IterationDecrementer next_iteration(this);
1996     loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
1997                                      not_at_start);
1998   } else {
1999     // Might not consume anything in the loop body, so treat it like a normal
2000     // ChoiceNode (and don't recursively visit this node again).
2001     VisitMarker marker(info());
2002     ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
2003                                      not_at_start);
2004   }
2005 }
2006 
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2007 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
2008     QuickCheckDetails* details, RegExpCompiler* compiler,
2009     int characters_filled_in, bool not_at_start) {
2010   if (traversed_loop_initialization_node_) {
2011     // We already entered this loop once, exited via its continuation node, and
2012     // followed an outer loop's back-edge to before the loop entry point. We
2013     // could try to reset the minimum iteration count to its starting value at
2014     // this point, but that seems like more trouble than it's worth. It's safe
2015     // to keep going with the current (possibly reduced) minimum iteration
2016     // count.
2017     GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2018   } else {
2019     // We are entering a loop via its counter initialization action, meaning we
2020     // are guaranteed to run the loop body at least some minimum number of times
2021     // before running the continuation node. Set a flag so that this node knows
2022     // (now and any times we visit it again recursively) that it was entered
2023     // from the top.
2024     LoopInitializationMarker marker(this);
2025     GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2026   }
2027 }
2028 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2029 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2030                                   BoyerMooreLookahead* bm, bool not_at_start) {
2031   if (body_can_be_zero_length_ || budget <= 0) {
2032     bm->SetRest(offset);
2033     SaveBMInfo(bm, not_at_start, offset);
2034     return;
2035   }
2036   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2037   SaveBMInfo(bm, not_at_start, offset);
2038 }
2039 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2040 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2041                                       RegExpCompiler* compiler,
2042                                       int characters_filled_in,
2043                                       bool not_at_start) {
2044   not_at_start = (not_at_start || not_at_start_);
2045   int choice_count = alternatives_->length();
2046   DCHECK_LT(0, choice_count);
2047   alternatives_->at(0).node()->GetQuickCheckDetails(
2048       details, compiler, characters_filled_in, not_at_start);
2049   for (int i = 1; i < choice_count; i++) {
2050     QuickCheckDetails new_details(details->characters());
2051     RegExpNode* node = alternatives_->at(i).node();
2052     node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2053                                not_at_start);
2054     // Here we merge the quick match details of the two branches.
2055     details->Merge(&new_details, characters_filled_in);
2056   }
2057 }
2058 
2059 namespace {
2060 
2061 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2062 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
2063                    Label* non_word, bool fall_through_on_word) {
2064   if (assembler->CheckSpecialCharacterClass(
2065           fall_through_on_word ? 'w' : 'W',
2066           fall_through_on_word ? non_word : word)) {
2067     // Optimized implementation available.
2068     return;
2069   }
2070   assembler->CheckCharacterGT('z', non_word);
2071   assembler->CheckCharacterLT('0', non_word);
2072   assembler->CheckCharacterGT('a' - 1, word);
2073   assembler->CheckCharacterLT('9' + 1, word);
2074   assembler->CheckCharacterLT('A', non_word);
2075   assembler->CheckCharacterLT('Z' + 1, word);
2076   if (fall_through_on_word) {
2077     assembler->CheckNotCharacter('_', non_word);
2078   } else {
2079     assembler->CheckCharacter('_', word);
2080   }
2081 }
2082 
2083 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2084 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2085 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
2086   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2087 
2088   // We will load the previous character into the current character register.
2089   Trace new_trace(*trace);
2090   new_trace.InvalidateCurrentCharacter();
2091 
2092   // A positive (> 0) cp_offset means we've already successfully matched a
2093   // non-empty-width part of the pattern, and thus cannot be at or before the
2094   // start of the subject string. We can thus skip both at-start and
2095   // bounds-checks when loading the one-character lookbehind.
2096   const bool may_be_at_or_before_subject_string_start =
2097       new_trace.cp_offset() <= 0;
2098 
2099   Label ok;
2100   if (may_be_at_or_before_subject_string_start) {
2101     // The start of input counts as a newline in this context, so skip to ok if
2102     // we are at the start.
2103     assembler->CheckAtStart(new_trace.cp_offset(), &ok);
2104   }
2105 
2106   // If we've already checked that we are not at the start of input, it's okay
2107   // to load the previous character without bounds checks.
2108   const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2109   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2110                                   new_trace.backtrack(), can_skip_bounds_check);
2111   if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2112     // Newline means \n, \r, 0x2028 or 0x2029.
2113     if (!compiler->one_byte()) {
2114       assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2115     }
2116     assembler->CheckCharacter('\n', &ok);
2117     assembler->CheckNotCharacter('\r', new_trace.backtrack());
2118   }
2119   assembler->Bind(&ok);
2120   on_success->Emit(compiler, &new_trace);
2121 }
2122 
2123 }  // namespace
2124 
2125 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2126 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2127   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2128   Isolate* isolate = assembler->isolate();
2129   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2130   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2131   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2132   if (lookahead == nullptr) {
2133     int eats_at_least =
2134         std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
2135     if (eats_at_least >= 1) {
2136       BoyerMooreLookahead* bm =
2137           zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
2138       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2139       if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2140       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2141     }
2142   } else {
2143     if (lookahead->at(0)->is_non_word())
2144       next_is_word_character = Trace::FALSE_VALUE;
2145     if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2146   }
2147   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2148   if (next_is_word_character == Trace::UNKNOWN) {
2149     Label before_non_word;
2150     Label before_word;
2151     if (trace->characters_preloaded() != 1) {
2152       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2153     }
2154     // Fall through on non-word.
2155     EmitWordCheck(assembler, &before_word, &before_non_word, false);
2156     // Next character is not a word character.
2157     assembler->Bind(&before_non_word);
2158     Label ok;
2159     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2160     assembler->GoTo(&ok);
2161 
2162     assembler->Bind(&before_word);
2163     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2164     assembler->Bind(&ok);
2165   } else if (next_is_word_character == Trace::TRUE_VALUE) {
2166     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2167   } else {
2168     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
2169     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2170   }
2171 }
2172 
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)2173 void AssertionNode::BacktrackIfPrevious(
2174     RegExpCompiler* compiler, Trace* trace,
2175     AssertionNode::IfPrevious backtrack_if_previous) {
2176   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2177   Trace new_trace(*trace);
2178   new_trace.InvalidateCurrentCharacter();
2179 
2180   Label fall_through;
2181   Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
2182                                                         : &fall_through;
2183   Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
2184                                                     : new_trace.backtrack();
2185 
2186   // A positive (> 0) cp_offset means we've already successfully matched a
2187   // non-empty-width part of the pattern, and thus cannot be at or before the
2188   // start of the subject string. We can thus skip both at-start and
2189   // bounds-checks when loading the one-character lookbehind.
2190   const bool may_be_at_or_before_subject_string_start =
2191       new_trace.cp_offset() <= 0;
2192 
2193   if (may_be_at_or_before_subject_string_start) {
2194     // The start of input counts as a non-word character, so the question is
2195     // decided if we are at the start.
2196     assembler->CheckAtStart(new_trace.cp_offset(), non_word);
2197   }
2198 
2199   // If we've already checked that we are not at the start of input, it's okay
2200   // to load the previous character without bounds checks.
2201   const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2202   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
2203                                   can_skip_bounds_check);
2204   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2205 
2206   assembler->Bind(&fall_through);
2207   on_success()->Emit(compiler, &new_trace);
2208 }
2209 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2210 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2211                                          RegExpCompiler* compiler,
2212                                          int filled_in, bool not_at_start) {
2213   if (assertion_type_ == AT_START && not_at_start) {
2214     details->set_cannot_match();
2215     return;
2216   }
2217   return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2218                                             not_at_start);
2219 }
2220 
Emit(RegExpCompiler * compiler,Trace * trace)2221 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2222   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2223   switch (assertion_type_) {
2224     case AT_END: {
2225       Label ok;
2226       assembler->CheckPosition(trace->cp_offset(), &ok);
2227       assembler->GoTo(trace->backtrack());
2228       assembler->Bind(&ok);
2229       break;
2230     }
2231     case AT_START: {
2232       if (trace->at_start() == Trace::FALSE_VALUE) {
2233         assembler->GoTo(trace->backtrack());
2234         return;
2235       }
2236       if (trace->at_start() == Trace::UNKNOWN) {
2237         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2238         Trace at_start_trace = *trace;
2239         at_start_trace.set_at_start(Trace::TRUE_VALUE);
2240         on_success()->Emit(compiler, &at_start_trace);
2241         return;
2242       }
2243     } break;
2244     case AFTER_NEWLINE:
2245       EmitHat(compiler, on_success(), trace);
2246       return;
2247     case AT_BOUNDARY:
2248     case AT_NON_BOUNDARY: {
2249       EmitBoundaryCheck(compiler, trace);
2250       return;
2251     }
2252   }
2253   on_success()->Emit(compiler, trace);
2254 }
2255 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2256 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2257   if (quick_check == nullptr) return false;
2258   if (offset >= quick_check->characters()) return false;
2259   return quick_check->positions(offset)->determines_perfectly;
2260 }
2261 
UpdateBoundsCheck(int index,int * checked_up_to)2262 static void UpdateBoundsCheck(int index, int* checked_up_to) {
2263   if (index > *checked_up_to) {
2264     *checked_up_to = index;
2265   }
2266 }
2267 
2268 // We call this repeatedly to generate code for each pass over the text node.
2269 // The passes are in increasing order of difficulty because we hope one
2270 // of the first passes will fail in which case we are saved the work of the
2271 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
2272 // we will check the '%' in the first pass, the case independent 'a' in the
2273 // second pass and the character class in the last pass.
2274 //
2275 // The passes are done from right to left, so for example to test for /bar/
2276 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2277 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
2278 // bounds check most of the time.  In the example we only need to check for
2279 // end-of-input when loading the putative 'r'.
2280 //
2281 // A slight complication involves the fact that the first character may already
2282 // be fetched into a register by the previous node.  In this case we want to
2283 // do the test for that character first.  We do this in separate passes.  The
2284 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
2285 // pass has been performed then subsequent passes will have true in
2286 // first_element_checked to indicate that that character does not need to be
2287 // checked again.
2288 //
2289 // In addition to all this we are passed a Trace, which can
2290 // contain an AlternativeGeneration object.  In this AlternativeGeneration
2291 // object we can see details of any quick check that was already passed in
2292 // order to get to the code we are now generating.  The quick check can involve
2293 // loading characters, which means we do not need to recheck the bounds
2294 // up to the limit the quick check already checked.  In addition the quick
2295 // check can have involved a mask and compare operation which may simplify
2296 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2297 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
2298                             bool preloaded, Trace* trace,
2299                             bool first_element_checked, int* checked_up_to) {
2300   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2301   Isolate* isolate = assembler->isolate();
2302   bool one_byte = compiler->one_byte();
2303   Label* backtrack = trace->backtrack();
2304   QuickCheckDetails* quick_check = trace->quick_check_performed();
2305   int element_count = elements()->length();
2306   int backward_offset = read_backward() ? -Length() : 0;
2307   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2308     TextElement elm = elements()->at(i);
2309     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2310     if (elm.text_type() == TextElement::ATOM) {
2311       if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2312       Vector<const uc16> quarks = elm.atom()->data();
2313       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2314         if (first_element_checked && i == 0 && j == 0) continue;
2315         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2316         uc16 quark = quarks[j];
2317         if (elm.atom()->ignore_case()) {
2318           // Everywhere else we assume that a non-Latin-1 character cannot match
2319           // a Latin-1 character. Avoid the cases where this is assumption is
2320           // invalid by using the Latin1 equivalent instead.
2321           quark = unibrow::Latin1::TryConvertToLatin1(quark);
2322         }
2323         bool needs_bounds_check =
2324             *checked_up_to < cp_offset + j || read_backward();
2325         bool bounds_checked = false;
2326         switch (pass) {
2327           case NON_LATIN1_MATCH:
2328             DCHECK(one_byte);
2329             if (quark > String::kMaxOneByteCharCode) {
2330               assembler->GoTo(backtrack);
2331               return;
2332             }
2333             break;
2334           case NON_LETTER_CHARACTER_MATCH:
2335             bounds_checked =
2336                 EmitAtomNonLetter(isolate, compiler, quark, backtrack,
2337                                   cp_offset + j, needs_bounds_check, preloaded);
2338             break;
2339           case SIMPLE_CHARACTER_MATCH:
2340             bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
2341                                                  backtrack, cp_offset + j,
2342                                                  needs_bounds_check, preloaded);
2343             break;
2344           case CASE_CHARACTER_MATCH:
2345             bounds_checked =
2346                 EmitAtomLetter(isolate, compiler, quark, backtrack,
2347                                cp_offset + j, needs_bounds_check, preloaded);
2348             break;
2349           default:
2350             break;
2351         }
2352         if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2353       }
2354     } else {
2355       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
2356       if (pass == CHARACTER_CLASS_MATCH) {
2357         if (first_element_checked && i == 0) continue;
2358         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2359         RegExpCharacterClass* cc = elm.char_class();
2360         bool bounds_check = *checked_up_to < cp_offset || read_backward();
2361         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2362                       bounds_check, preloaded, zone());
2363         UpdateBoundsCheck(cp_offset, checked_up_to);
2364       }
2365     }
2366   }
2367 }
2368 
Length()2369 int TextNode::Length() {
2370   TextElement elm = elements()->last();
2371   DCHECK_LE(0, elm.cp_offset());
2372   return elm.cp_offset() + elm.length();
2373 }
2374 
SkipPass(TextEmitPassType pass,bool ignore_case)2375 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
2376   if (ignore_case) {
2377     return pass == SIMPLE_CHARACTER_MATCH;
2378   } else {
2379     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2380   }
2381 }
2382 
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)2383 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
2384                                              ZoneList<CharacterRange>* ranges,
2385                                              bool read_backward,
2386                                              RegExpNode* on_success,
2387                                              JSRegExp::Flags flags) {
2388   DCHECK_NOT_NULL(ranges);
2389   ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(1, zone);
2390   elms->Add(TextElement::CharClass(
2391                 zone->New<RegExpCharacterClass>(zone, ranges, flags)),
2392             zone);
2393   return zone->New<TextNode>(elms, read_backward, on_success);
2394 }
2395 
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)2396 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
2397                                            CharacterRange trail,
2398                                            bool read_backward,
2399                                            RegExpNode* on_success,
2400                                            JSRegExp::Flags flags) {
2401   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
2402   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
2403   ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2404   elms->Add(TextElement::CharClass(
2405                 zone->New<RegExpCharacterClass>(zone, lead_ranges, flags)),
2406             zone);
2407   elms->Add(TextElement::CharClass(
2408                 zone->New<RegExpCharacterClass>(zone, trail_ranges, flags)),
2409             zone);
2410   return zone->New<TextNode>(elms, read_backward, on_success);
2411 }
2412 
2413 // This generates the code to match a text node.  A text node can contain
2414 // straight character sequences (possibly to be matched in a case-independent
2415 // way) and character classes.  For efficiency we do not do this in a single
2416 // pass from left to right.  Instead we pass over the text node several times,
2417 // emitting code for some character positions every time.  See the comment on
2418 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2419 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2420   LimitResult limit_result = LimitVersions(compiler, trace);
2421   if (limit_result == DONE) return;
2422   DCHECK(limit_result == CONTINUE);
2423 
2424   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2425     compiler->SetRegExpTooBig();
2426     return;
2427   }
2428 
2429   if (compiler->one_byte()) {
2430     int dummy = 0;
2431     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2432   }
2433 
2434   bool first_elt_done = false;
2435   int bound_checked_to = trace->cp_offset() - 1;
2436   bound_checked_to += trace->bound_checked_up_to();
2437 
2438   // If a character is preloaded into the current character register then
2439   // check that now.
2440   if (trace->characters_preloaded() == 1) {
2441     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2442       TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2443                    false, &bound_checked_to);
2444     }
2445     first_elt_done = true;
2446   }
2447 
2448   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2449     TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2450                  first_elt_done, &bound_checked_to);
2451   }
2452 
2453   Trace successor_trace(*trace);
2454   // If we advance backward, we may end up at the start.
2455   successor_trace.AdvanceCurrentPositionInTrace(
2456       read_backward() ? -Length() : Length(), compiler);
2457   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2458                                                : Trace::FALSE_VALUE);
2459   RecursionCheck rc(compiler);
2460   on_success()->Emit(compiler, &successor_trace);
2461 }
2462 
InvalidateCurrentCharacter()2463 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
2464 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2465 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2466   // We don't have an instruction for shifting the current character register
2467   // down or for using a shifted value for anything so lets just forget that
2468   // we preloaded any characters into it.
2469   characters_preloaded_ = 0;
2470   // Adjust the offsets of the quick check performed information.  This
2471   // information is used to find out what we already determined about the
2472   // characters by means of mask and compare.
2473   quick_check_performed_.Advance(by, compiler->one_byte());
2474   cp_offset_ += by;
2475   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2476     compiler->SetRegExpTooBig();
2477     cp_offset_ = 0;
2478   }
2479   bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
2480 }
2481 
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)2482 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
2483   int element_count = elements()->length();
2484   for (int i = 0; i < element_count; i++) {
2485     TextElement elm = elements()->at(i);
2486     if (elm.text_type() == TextElement::CHAR_CLASS) {
2487       RegExpCharacterClass* cc = elm.char_class();
2488 #ifdef V8_INTL_SUPPORT
2489       bool case_equivalents_already_added =
2490           NeedsUnicodeCaseEquivalents(cc->flags());
2491 #else
2492       bool case_equivalents_already_added = false;
2493 #endif
2494       if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
2495         // None of the standard character classes is different in the case
2496         // independent case and it slows us down if we don't know that.
2497         if (cc->is_standard(zone())) continue;
2498         ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2499         CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
2500                                            is_one_byte);
2501       }
2502     }
2503   }
2504 }
2505 
GreedyLoopTextLength()2506 int TextNode::GreedyLoopTextLength() { return Length(); }
2507 
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)2508 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2509     RegExpCompiler* compiler) {
2510   if (read_backward()) return nullptr;
2511   if (elements()->length() != 1) return nullptr;
2512   TextElement elm = elements()->at(0);
2513   if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2514   RegExpCharacterClass* node = elm.char_class();
2515   ZoneList<CharacterRange>* ranges = node->ranges(zone());
2516   CharacterRange::Canonicalize(ranges);
2517   if (node->is_negated()) {
2518     return ranges->length() == 0 ? on_success() : nullptr;
2519   }
2520   if (ranges->length() != 1) return nullptr;
2521   const uc32 max_char = MaxCodeUnit(compiler->one_byte());
2522   return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
2523 }
2524 
2525 // Finds the fixed match length of a sequence of nodes that goes from
2526 // this alternative and back to this choice node.  If there are variable
2527 // length nodes or other complications in the way then return a sentinel
2528 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)2529 int ChoiceNode::GreedyLoopTextLengthForAlternative(
2530     GuardedAlternative* alternative) {
2531   int length = 0;
2532   RegExpNode* node = alternative->node();
2533   // Later we will generate code for all these text nodes using recursion
2534   // so we have to limit the max number.
2535   int recursion_depth = 0;
2536   while (node != this) {
2537     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2538       return kNodeIsTooComplexForGreedyLoops;
2539     }
2540     int node_length = node->GreedyLoopTextLength();
2541     if (node_length == kNodeIsTooComplexForGreedyLoops) {
2542       return kNodeIsTooComplexForGreedyLoops;
2543     }
2544     length += node_length;
2545     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2546     node = seq_node->on_success();
2547   }
2548   if (read_backward()) {
2549     length = -length;
2550   }
2551   // Check that we can jump by the whole text length. If not, return sentinel
2552   // to indicate the we can't construct a greedy loop.
2553   if (length < RegExpMacroAssembler::kMinCPOffset ||
2554       length > RegExpMacroAssembler::kMaxCPOffset) {
2555     return kNodeIsTooComplexForGreedyLoops;
2556   }
2557   return length;
2558 }
2559 
AddLoopAlternative(GuardedAlternative alt)2560 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2561   DCHECK_NULL(loop_node_);
2562   AddAlternative(alt);
2563   loop_node_ = alt.node();
2564 }
2565 
AddContinueAlternative(GuardedAlternative alt)2566 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2567   DCHECK_NULL(continue_node_);
2568   AddAlternative(alt);
2569   continue_node_ = alt.node();
2570 }
2571 
Emit(RegExpCompiler * compiler,Trace * trace)2572 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2573   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2574   if (trace->stop_node() == this) {
2575     // Back edge of greedy optimized loop node graph.
2576     int text_length =
2577         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2578     DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
2579     // Update the counter-based backtracking info on the stack.  This is an
2580     // optimization for greedy loops (see below).
2581     DCHECK(trace->cp_offset() == text_length);
2582     macro_assembler->AdvanceCurrentPosition(text_length);
2583     macro_assembler->GoTo(trace->loop_label());
2584     return;
2585   }
2586   DCHECK_NULL(trace->stop_node());
2587   if (!trace->is_trivial()) {
2588     trace->Flush(compiler, this);
2589     return;
2590   }
2591   ChoiceNode::Emit(compiler, trace);
2592 }
2593 
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)2594 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2595                                            int eats_at_least) {
2596   int preload_characters = std::min(4, eats_at_least);
2597   DCHECK_LE(preload_characters, 4);
2598   if (compiler->macro_assembler()->CanReadUnaligned()) {
2599     bool one_byte = compiler->one_byte();
2600     if (one_byte) {
2601       // We can't preload 3 characters because there is no machine instruction
2602       // to do that.  We can't just load 4 because we could be reading
2603       // beyond the end of the string, which could cause a memory fault.
2604       if (preload_characters == 3) preload_characters = 2;
2605     } else {
2606       if (preload_characters > 2) preload_characters = 2;
2607     }
2608   } else {
2609     if (preload_characters > 1) preload_characters = 1;
2610   }
2611   return preload_characters;
2612 }
2613 
2614 // This class is used when generating the alternatives in a choice node.  It
2615 // records the way the alternative is being code generated.
2616 class AlternativeGeneration : public Malloced {
2617  public:
AlternativeGeneration()2618   AlternativeGeneration()
2619       : possible_success(),
2620         expects_preload(false),
2621         after(),
2622         quick_check_details() {}
2623   Label possible_success;
2624   bool expects_preload;
2625   Label after;
2626   QuickCheckDetails quick_check_details;
2627 };
2628 
2629 // Creates a list of AlternativeGenerations.  If the list has a reasonable
2630 // size then it is on the stack, otherwise the excess is on the heap.
2631 class AlternativeGenerationList {
2632  public:
AlternativeGenerationList(int count,Zone * zone)2633   AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
2634     for (int i = 0; i < count && i < kAFew; i++) {
2635       alt_gens_.Add(a_few_alt_gens_ + i, zone);
2636     }
2637     for (int i = kAFew; i < count; i++) {
2638       alt_gens_.Add(new AlternativeGeneration(), zone);
2639     }
2640   }
~AlternativeGenerationList()2641   ~AlternativeGenerationList() {
2642     for (int i = kAFew; i < alt_gens_.length(); i++) {
2643       delete alt_gens_[i];
2644       alt_gens_[i] = nullptr;
2645     }
2646   }
2647 
at(int i)2648   AlternativeGeneration* at(int i) { return alt_gens_[i]; }
2649 
2650  private:
2651   static const int kAFew = 10;
2652   ZoneList<AlternativeGeneration*> alt_gens_;
2653   AlternativeGeneration a_few_alt_gens_[kAFew];
2654 };
2655 
Set(int character)2656 void BoyerMoorePositionInfo::Set(int character) {
2657   SetInterval(Interval(character, character));
2658 }
2659 
2660 namespace {
2661 
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)2662 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
2663                             int ranges_length, Interval new_range) {
2664   DCHECK_EQ(1, ranges_length & 1);
2665   DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
2666   if (containment == kLatticeUnknown) return containment;
2667   bool inside = false;
2668   int last = 0;
2669   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2670     // Consider the range from last to ranges[i].
2671     // We haven't got to the new range yet.
2672     if (ranges[i] <= new_range.from()) continue;
2673     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
2674     // inclusive, but the values in ranges are not.
2675     if (last <= new_range.from() && new_range.to() < ranges[i]) {
2676       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2677     }
2678     return kLatticeUnknown;
2679   }
2680   return containment;
2681 }
2682 
BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset)2683 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
2684   STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
2685                 2 * kInt64Size * kBitsPerByte);
2686 
2687   // Slight fiddling is needed here, since the bitset is of length 128 while
2688   // CountTrailingZeros requires an integral type and std::bitset can only
2689   // convert to unsigned long long. So we handle the most- and least-significant
2690   // bits separately.
2691 
2692   {
2693     static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
2694     BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
2695     STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
2696     uint64_t lsb = masked_bitset.to_ullong();
2697     if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
2698   }
2699 
2700   {
2701     BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
2702     uint64_t msb = masked_bitset.to_ullong();
2703     if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
2704   }
2705 
2706   return -1;
2707 }
2708 
2709 }  // namespace
2710 
SetInterval(const Interval & interval)2711 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2712   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2713 
2714   if (interval.size() >= kMapSize) {
2715     map_count_ = kMapSize;
2716     map_.set();
2717     return;
2718   }
2719 
2720   for (int i = interval.from(); i <= interval.to(); i++) {
2721     int mod_character = (i & kMask);
2722     if (!map_[mod_character]) {
2723       map_count_++;
2724       map_.set(mod_character);
2725     }
2726     if (map_count_ == kMapSize) return;
2727   }
2728 }
2729 
SetAll()2730 void BoyerMoorePositionInfo::SetAll() {
2731   w_ = kLatticeUnknown;
2732   if (map_count_ != kMapSize) {
2733     map_count_ = kMapSize;
2734     map_.set();
2735   }
2736 }
2737 
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)2738 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
2739                                          Zone* zone)
2740     : length_(length),
2741       compiler_(compiler),
2742       max_char_(MaxCodeUnit(compiler->one_byte())) {
2743   bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
2744   for (int i = 0; i < length; i++) {
2745     bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
2746   }
2747 }
2748 
2749 // Find the longest range of lookahead that has the fewest number of different
2750 // characters that can occur at a given position.  Since we are optimizing two
2751 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)2752 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2753   int biggest_points = 0;
2754   // If more than 32 characters out of 128 can occur it is unlikely that we can
2755   // be lucky enough to step forwards much of the time.
2756   const int kMaxMax = 32;
2757   for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2758        max_number_of_chars *= 2) {
2759     biggest_points =
2760         FindBestInterval(max_number_of_chars, biggest_points, from, to);
2761   }
2762   if (biggest_points == 0) return false;
2763   return true;
2764 }
2765 
2766 // Find the highest-points range between 0 and length_ where the character
2767 // information is not too vague.  'Too vague' means that there are more than
2768 // max_number_of_chars that can occur at this position.  Calculates the number
2769 // of points as the product of width-of-the-range and
2770 // probability-of-finding-one-of-the-characters, where the probability is
2771 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)2772 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
2773                                           int old_biggest_points, int* from,
2774                                           int* to) {
2775   int biggest_points = old_biggest_points;
2776   static const int kSize = RegExpMacroAssembler::kTableSize;
2777   for (int i = 0; i < length_;) {
2778     while (i < length_ && Count(i) > max_number_of_chars) i++;
2779     if (i == length_) break;
2780     int remembered_from = i;
2781 
2782     BoyerMoorePositionInfo::Bitset union_bitset;
2783     for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
2784       union_bitset |= bitmaps_->at(i)->raw_bitset();
2785     }
2786 
2787     int frequency = 0;
2788 
2789     // Iterate only over set bits.
2790     int j;
2791     while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
2792       DCHECK(union_bitset[j]);  // Sanity check.
2793       // Add 1 to the frequency to give a small per-character boost for
2794       // the cases where our sampling is not good enough and many
2795       // characters have a frequency of zero.  This means the frequency
2796       // can theoretically be up to 2*kSize though we treat it mostly as
2797       // a fraction of kSize.
2798       frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2799       union_bitset.reset(j);
2800     }
2801 
2802     // We use the probability of skipping times the distance we are skipping to
2803     // judge the effectiveness of this.  Actually we have a cut-off:  By
2804     // dividing by 2 we switch off the skipping if the probability of skipping
2805     // is less than 50%.  This is because the multibyte mask-and-compare
2806     // skipping in quickcheck is more likely to do well on this case.
2807     bool in_quickcheck_range =
2808         ((i - remembered_from < 4) ||
2809          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2810     // Called 'probability' but it is only a rough estimate and can actually
2811     // be outside the 0-kSize range.
2812     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2813     int points = (i - remembered_from) * probability;
2814     if (points > biggest_points) {
2815       *from = remembered_from;
2816       *to = i - 1;
2817       biggest_points = points;
2818     }
2819   }
2820   return biggest_points;
2821 }
2822 
2823 // Take all the characters that will not prevent a successful match if they
2824 // occur in the subject string in the range between min_lookahead and
2825 // max_lookahead (inclusive) measured from the current position.  If the
2826 // character at max_lookahead offset is not one of these characters, then we
2827 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)2828 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
2829                                       Handle<ByteArray> boolean_skip_table) {
2830   const int kSkipArrayEntry = 0;
2831   const int kDontSkipArrayEntry = 1;
2832 
2833   std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
2834               boolean_skip_table->length());
2835 
2836   for (int i = max_lookahead; i >= min_lookahead; i--) {
2837     BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
2838 
2839     // Iterate only over set bits.
2840     int j;
2841     while ((j = BitsetFirstSetBit(bitset)) != -1) {
2842       DCHECK(bitset[j]);  // Sanity check.
2843       boolean_skip_table->set(j, kDontSkipArrayEntry);
2844       bitset.reset(j);
2845     }
2846   }
2847 
2848   const int skip = max_lookahead + 1 - min_lookahead;
2849   return skip;
2850 }
2851 
2852 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)2853 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2854   const int kSize = RegExpMacroAssembler::kTableSize;
2855 
2856   int min_lookahead = 0;
2857   int max_lookahead = 0;
2858 
2859   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2860 
2861   // Check if we only have a single non-empty position info, and that info
2862   // contains precisely one character.
2863   bool found_single_character = false;
2864   int single_character = 0;
2865   for (int i = max_lookahead; i >= min_lookahead; i--) {
2866     BoyerMoorePositionInfo* map = bitmaps_->at(i);
2867     if (map->map_count() == 0) continue;
2868 
2869     if (found_single_character || map->map_count() > 1) {
2870       found_single_character = false;
2871       break;
2872     }
2873 
2874     DCHECK(!found_single_character);
2875     DCHECK_EQ(map->map_count(), 1);
2876 
2877     found_single_character = true;
2878     single_character = BitsetFirstSetBit(map->raw_bitset());
2879 
2880     DCHECK_NE(single_character, -1);
2881   }
2882 
2883   int lookahead_width = max_lookahead + 1 - min_lookahead;
2884 
2885   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2886     // The mask-compare can probably handle this better.
2887     return;
2888   }
2889 
2890   if (found_single_character) {
2891     Label cont, again;
2892     masm->Bind(&again);
2893     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2894     if (max_char_ > kSize) {
2895       masm->CheckCharacterAfterAnd(single_character,
2896                                    RegExpMacroAssembler::kTableMask, &cont);
2897     } else {
2898       masm->CheckCharacter(single_character, &cont);
2899     }
2900     masm->AdvanceCurrentPosition(lookahead_width);
2901     masm->GoTo(&again);
2902     masm->Bind(&cont);
2903     return;
2904   }
2905 
2906   Factory* factory = masm->isolate()->factory();
2907   Handle<ByteArray> boolean_skip_table =
2908       factory->NewByteArray(kSize, AllocationType::kOld);
2909   int skip_distance =
2910       GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2911   DCHECK_NE(0, skip_distance);
2912 
2913   Label cont, again;
2914   masm->Bind(&again);
2915   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2916   masm->CheckBitInTable(boolean_skip_table, &cont);
2917   masm->AdvanceCurrentPosition(skip_distance);
2918   masm->GoTo(&again);
2919   masm->Bind(&cont);
2920 }
2921 
2922 /* Code generation for choice nodes.
2923  *
2924  * We generate quick checks that do a mask and compare to eliminate a
2925  * choice.  If the quick check succeeds then it jumps to the continuation to
2926  * do slow checks and check subsequent nodes.  If it fails (the common case)
2927  * it falls through to the next choice.
2928  *
2929  * Here is the desired flow graph.  Nodes directly below each other imply
2930  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
2931  * 3 doesn't have a quick check so we have to call the slow check.
2932  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
2933  * regexp continuation is generated directly after the Sn node, up to the
2934  * next GoTo if we decide to reuse some already generated code.  Some
2935  * nodes expect preload_characters to be preloaded into the current
2936  * character register.  R nodes do this preloading.  Vertices are marked
2937  * F for failures and S for success (possible success in the case of quick
2938  * nodes).  L, V, < and > are used as arrow heads.
2939  *
2940  * ----------> R
2941  *             |
2942  *             V
2943  *            Q1 -----> S1
2944  *             |   S   /
2945  *            F|      /
2946  *             |    F/
2947  *             |    /
2948  *             |   R
2949  *             |  /
2950  *             V L
2951  *            Q2 -----> S2
2952  *             |   S   /
2953  *            F|      /
2954  *             |    F/
2955  *             |    /
2956  *             |   R
2957  *             |  /
2958  *             V L
2959  *            S3
2960  *             |
2961  *            F|
2962  *             |
2963  *             R
2964  *             |
2965  * backtrack   V
2966  * <----------Q4
2967  *   \    F    |
2968  *    \        |S
2969  *     \   F   V
2970  *      \-----S4
2971  *
2972  * For greedy loops we push the current position, then generate the code that
2973  * eats the input specially in EmitGreedyLoop.  The other choice (the
2974  * continuation) is generated by the normal code in EmitChoices, and steps back
2975  * in the input to the starting position when it fails to match.  The loop code
2976  * looks like this (U is the unwind code that steps back in the greedy loop).
2977  *
2978  *              _____
2979  *             /     \
2980  *             V     |
2981  * ----------> S1    |
2982  *            /|     |
2983  *           / |S    |
2984  *         F/  \_____/
2985  *         /
2986  *        |<-----
2987  *        |      \
2988  *        V       |S
2989  *        Q2 ---> U----->backtrack
2990  *        |  F   /
2991  *       S|     /
2992  *        V  F /
2993  *        S2--/
2994  */
2995 
GreedyLoopState(bool not_at_start)2996 GreedyLoopState::GreedyLoopState(bool not_at_start) {
2997   counter_backtrack_trace_.set_backtrack(&label_);
2998   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
2999 }
3000 
AssertGuardsMentionRegisters(Trace * trace)3001 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3002 #ifdef DEBUG
3003   int choice_count = alternatives_->length();
3004   for (int i = 0; i < choice_count - 1; i++) {
3005     GuardedAlternative alternative = alternatives_->at(i);
3006     ZoneList<Guard*>* guards = alternative.guards();
3007     int guard_count = (guards == nullptr) ? 0 : guards->length();
3008     for (int j = 0; j < guard_count; j++) {
3009       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3010     }
3011   }
3012 #endif
3013 }
3014 
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3015 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
3016                               PreloadState* state) {
3017   if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3018     // Save some time by looking at most one machine word ahead.
3019     state->eats_at_least_ =
3020         EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
3021   }
3022   state->preload_characters_ =
3023       CalculatePreloadCharacters(compiler, state->eats_at_least_);
3024 
3025   state->preload_is_current_ =
3026       (current_trace->characters_preloaded() == state->preload_characters_);
3027   state->preload_has_checked_bounds_ = state->preload_is_current_;
3028 }
3029 
Emit(RegExpCompiler * compiler,Trace * trace)3030 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3031   int choice_count = alternatives_->length();
3032 
3033   if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3034     alternatives_->at(0).node()->Emit(compiler, trace);
3035     return;
3036   }
3037 
3038   AssertGuardsMentionRegisters(trace);
3039 
3040   LimitResult limit_result = LimitVersions(compiler, trace);
3041   if (limit_result == DONE) return;
3042   DCHECK(limit_result == CONTINUE);
3043 
3044   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3045   // other choice nodes we only flush if we are out of code size budget.
3046   if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3047     trace->Flush(compiler, this);
3048     return;
3049   }
3050 
3051   RecursionCheck rc(compiler);
3052 
3053   PreloadState preload;
3054   preload.init();
3055   GreedyLoopState greedy_loop_state(not_at_start());
3056 
3057   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3058   AlternativeGenerationList alt_gens(choice_count, zone());
3059 
3060   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3061     trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3062                            &greedy_loop_state, text_length);
3063   } else {
3064     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
3065     // match the traces produced pre-cleanup.
3066     Label second_choice;
3067     compiler->macro_assembler()->Bind(&second_choice);
3068 
3069     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3070 
3071     EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3072   }
3073 
3074   // At this point we need to generate slow checks for the alternatives where
3075   // the quick check was inlined.  We can recognize these because the associated
3076   // label was bound.
3077   int new_flush_budget = trace->flush_budget() / choice_count;
3078   for (int i = 0; i < choice_count; i++) {
3079     AlternativeGeneration* alt_gen = alt_gens.at(i);
3080     Trace new_trace(*trace);
3081     // If there are actions to be flushed we have to limit how many times
3082     // they are flushed.  Take the budget of the parent trace and distribute
3083     // it fairly amongst the children.
3084     if (new_trace.actions() != nullptr) {
3085       new_trace.set_flush_budget(new_flush_budget);
3086     }
3087     bool next_expects_preload =
3088         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3089     EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
3090                               alt_gen, preload.preload_characters_,
3091                               next_expects_preload);
3092   }
3093 }
3094 
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3095 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
3096                                   AlternativeGenerationList* alt_gens,
3097                                   PreloadState* preload,
3098                                   GreedyLoopState* greedy_loop_state,
3099                                   int text_length) {
3100   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3101   // Here we have special handling for greedy loops containing only text nodes
3102   // and other simple nodes.  These are handled by pushing the current
3103   // position on the stack and then incrementing the current position each
3104   // time around the switch.  On backtrack we decrement the current position
3105   // and check it against the pushed value.  This avoids pushing backtrack
3106   // information for each iteration of the loop, which could take up a lot of
3107   // space.
3108   DCHECK(trace->stop_node() == nullptr);
3109   macro_assembler->PushCurrentPosition();
3110   Label greedy_match_failed;
3111   Trace greedy_match_trace;
3112   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3113   greedy_match_trace.set_backtrack(&greedy_match_failed);
3114   Label loop_label;
3115   macro_assembler->Bind(&loop_label);
3116   greedy_match_trace.set_stop_node(this);
3117   greedy_match_trace.set_loop_label(&loop_label);
3118   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3119   macro_assembler->Bind(&greedy_match_failed);
3120 
3121   Label second_choice;  // For use in greedy matches.
3122   macro_assembler->Bind(&second_choice);
3123 
3124   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3125 
3126   EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3127 
3128   macro_assembler->Bind(greedy_loop_state->label());
3129   // If we have unwound to the bottom then backtrack.
3130   macro_assembler->CheckGreedyLoop(trace->backtrack());
3131   // Otherwise try the second priority at an earlier position.
3132   macro_assembler->AdvanceCurrentPosition(-text_length);
3133   macro_assembler->GoTo(&second_choice);
3134   return new_trace;
3135 }
3136 
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)3137 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3138                                               Trace* trace) {
3139   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3140   if (alternatives_->length() != 2) return eats_at_least;
3141 
3142   GuardedAlternative alt1 = alternatives_->at(1);
3143   if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3144     return eats_at_least;
3145   }
3146   RegExpNode* eats_anything_node = alt1.node();
3147   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3148     return eats_at_least;
3149   }
3150 
3151   // Really we should be creating a new trace when we execute this function,
3152   // but there is no need, because the code it generates cannot backtrack, and
3153   // we always arrive here with a trivial trace (since it's the entry to a
3154   // loop.  That also implies that there are no preloaded characters, which is
3155   // good, because it means we won't be violating any assumptions by
3156   // overwriting those characters with new load instructions.
3157   DCHECK(trace->is_trivial());
3158 
3159   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3160   Isolate* isolate = macro_assembler->isolate();
3161   // At this point we know that we are at a non-greedy loop that will eat
3162   // any character one at a time.  Any non-anchored regexp has such a
3163   // loop prepended to it in order to find where it starts.  We look for
3164   // a pattern of the form ...abc... where we can look 6 characters ahead
3165   // and step forwards 3 if the character is not one of abc.  Abc need
3166   // not be atoms, they can be any reasonably limited character class or
3167   // small alternation.
3168   BoyerMooreLookahead* bm = bm_info(false);
3169   if (bm == nullptr) {
3170     eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
3171     if (eats_at_least >= 1) {
3172       bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
3173       GuardedAlternative alt0 = alternatives_->at(0);
3174       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
3175     }
3176   }
3177   if (bm != nullptr) {
3178     bm->EmitSkipInstructions(macro_assembler);
3179   }
3180   return eats_at_least;
3181 }
3182 
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)3183 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3184                              AlternativeGenerationList* alt_gens,
3185                              int first_choice, Trace* trace,
3186                              PreloadState* preload) {
3187   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3188   SetUpPreLoad(compiler, trace, preload);
3189 
3190   // For now we just call all choices one after the other.  The idea ultimately
3191   // is to use the Dispatch table to try only the relevant ones.
3192   int choice_count = alternatives_->length();
3193 
3194   int new_flush_budget = trace->flush_budget() / choice_count;
3195 
3196   for (int i = first_choice; i < choice_count; i++) {
3197     bool is_last = i == choice_count - 1;
3198     bool fall_through_on_failure = !is_last;
3199     GuardedAlternative alternative = alternatives_->at(i);
3200     AlternativeGeneration* alt_gen = alt_gens->at(i);
3201     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3202     ZoneList<Guard*>* guards = alternative.guards();
3203     int guard_count = (guards == nullptr) ? 0 : guards->length();
3204     Trace new_trace(*trace);
3205     new_trace.set_characters_preloaded(
3206         preload->preload_is_current_ ? preload->preload_characters_ : 0);
3207     if (preload->preload_has_checked_bounds_) {
3208       new_trace.set_bound_checked_up_to(preload->preload_characters_);
3209     }
3210     new_trace.quick_check_performed()->Clear();
3211     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3212     if (!is_last) {
3213       new_trace.set_backtrack(&alt_gen->after);
3214     }
3215     alt_gen->expects_preload = preload->preload_is_current_;
3216     bool generate_full_check_inline = false;
3217     if (compiler->optimize() &&
3218         try_to_emit_quick_check_for_alternative(i == 0) &&
3219         alternative.node()->EmitQuickCheck(
3220             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3221             &alt_gen->possible_success, &alt_gen->quick_check_details,
3222             fall_through_on_failure, this)) {
3223       // Quick check was generated for this choice.
3224       preload->preload_is_current_ = true;
3225       preload->preload_has_checked_bounds_ = true;
3226       // If we generated the quick check to fall through on possible success,
3227       // we now need to generate the full check inline.
3228       if (!fall_through_on_failure) {
3229         macro_assembler->Bind(&alt_gen->possible_success);
3230         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3231         new_trace.set_characters_preloaded(preload->preload_characters_);
3232         new_trace.set_bound_checked_up_to(preload->preload_characters_);
3233         generate_full_check_inline = true;
3234       }
3235     } else if (alt_gen->quick_check_details.cannot_match()) {
3236       if (!fall_through_on_failure) {
3237         macro_assembler->GoTo(trace->backtrack());
3238       }
3239       continue;
3240     } else {
3241       // No quick check was generated.  Put the full code here.
3242       // If this is not the first choice then there could be slow checks from
3243       // previous cases that go here when they fail.  There's no reason to
3244       // insist that they preload characters since the slow check we are about
3245       // to generate probably can't use it.
3246       if (i != first_choice) {
3247         alt_gen->expects_preload = false;
3248         new_trace.InvalidateCurrentCharacter();
3249       }
3250       generate_full_check_inline = true;
3251     }
3252     if (generate_full_check_inline) {
3253       if (new_trace.actions() != nullptr) {
3254         new_trace.set_flush_budget(new_flush_budget);
3255       }
3256       for (int j = 0; j < guard_count; j++) {
3257         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3258       }
3259       alternative.node()->Emit(compiler, &new_trace);
3260       preload->preload_is_current_ = false;
3261     }
3262     macro_assembler->Bind(&alt_gen->after);
3263   }
3264 }
3265 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)3266 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3267                                            Trace* trace,
3268                                            GuardedAlternative alternative,
3269                                            AlternativeGeneration* alt_gen,
3270                                            int preload_characters,
3271                                            bool next_expects_preload) {
3272   if (!alt_gen->possible_success.is_linked()) return;
3273 
3274   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3275   macro_assembler->Bind(&alt_gen->possible_success);
3276   Trace out_of_line_trace(*trace);
3277   out_of_line_trace.set_characters_preloaded(preload_characters);
3278   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3279   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3280   ZoneList<Guard*>* guards = alternative.guards();
3281   int guard_count = (guards == nullptr) ? 0 : guards->length();
3282   if (next_expects_preload) {
3283     Label reload_current_char;
3284     out_of_line_trace.set_backtrack(&reload_current_char);
3285     for (int j = 0; j < guard_count; j++) {
3286       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3287     }
3288     alternative.node()->Emit(compiler, &out_of_line_trace);
3289     macro_assembler->Bind(&reload_current_char);
3290     // Reload the current character, since the next quick check expects that.
3291     // We don't need to check bounds here because we only get into this
3292     // code through a quick check which already did the checked load.
3293     macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3294                                           preload_characters);
3295     macro_assembler->GoTo(&(alt_gen->after));
3296   } else {
3297     out_of_line_trace.set_backtrack(&(alt_gen->after));
3298     for (int j = 0; j < guard_count; j++) {
3299       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3300     }
3301     alternative.node()->Emit(compiler, &out_of_line_trace);
3302   }
3303 }
3304 
Emit(RegExpCompiler * compiler,Trace * trace)3305 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3306   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3307   LimitResult limit_result = LimitVersions(compiler, trace);
3308   if (limit_result == DONE) return;
3309   DCHECK(limit_result == CONTINUE);
3310 
3311   RecursionCheck rc(compiler);
3312 
3313   switch (action_type_) {
3314     case STORE_POSITION: {
3315       Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3316                                          data_.u_position_register.is_capture,
3317                                          trace);
3318       Trace new_trace = *trace;
3319       new_trace.add_action(&new_capture);
3320       on_success()->Emit(compiler, &new_trace);
3321       break;
3322     }
3323     case INCREMENT_REGISTER: {
3324       Trace::DeferredIncrementRegister new_increment(
3325           data_.u_increment_register.reg);
3326       Trace new_trace = *trace;
3327       new_trace.add_action(&new_increment);
3328       on_success()->Emit(compiler, &new_trace);
3329       break;
3330     }
3331     case SET_REGISTER_FOR_LOOP: {
3332       Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
3333                                                 data_.u_store_register.value);
3334       Trace new_trace = *trace;
3335       new_trace.add_action(&new_set);
3336       on_success()->Emit(compiler, &new_trace);
3337       break;
3338     }
3339     case CLEAR_CAPTURES: {
3340       Trace::DeferredClearCaptures new_capture(Interval(
3341           data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3342       Trace new_trace = *trace;
3343       new_trace.add_action(&new_capture);
3344       on_success()->Emit(compiler, &new_trace);
3345       break;
3346     }
3347     case BEGIN_POSITIVE_SUBMATCH:
3348     case BEGIN_NEGATIVE_SUBMATCH:
3349       if (!trace->is_trivial()) {
3350         trace->Flush(compiler, this);
3351       } else {
3352         assembler->WriteCurrentPositionToRegister(
3353             data_.u_submatch.current_position_register, 0);
3354         assembler->WriteStackPointerToRegister(
3355             data_.u_submatch.stack_pointer_register);
3356         on_success()->Emit(compiler, trace);
3357       }
3358       break;
3359     case EMPTY_MATCH_CHECK: {
3360       int start_pos_reg = data_.u_empty_match_check.start_register;
3361       int stored_pos = 0;
3362       int rep_reg = data_.u_empty_match_check.repetition_register;
3363       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3364       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3365       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3366         // If we know we haven't advanced and there is no minimum we
3367         // can just backtrack immediately.
3368         assembler->GoTo(trace->backtrack());
3369       } else if (know_dist && stored_pos < trace->cp_offset()) {
3370         // If we know we've advanced we can generate the continuation
3371         // immediately.
3372         on_success()->Emit(compiler, trace);
3373       } else if (!trace->is_trivial()) {
3374         trace->Flush(compiler, this);
3375       } else {
3376         Label skip_empty_check;
3377         // If we have a minimum number of repetitions we check the current
3378         // number first and skip the empty check if it's not enough.
3379         if (has_minimum) {
3380           int limit = data_.u_empty_match_check.repetition_limit;
3381           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3382         }
3383         // If the match is empty we bail out, otherwise we fall through
3384         // to the on-success continuation.
3385         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3386                                    trace->backtrack());
3387         assembler->Bind(&skip_empty_check);
3388         on_success()->Emit(compiler, trace);
3389       }
3390       break;
3391     }
3392     case POSITIVE_SUBMATCH_SUCCESS: {
3393       if (!trace->is_trivial()) {
3394         trace->Flush(compiler, this);
3395         return;
3396       }
3397       assembler->ReadCurrentPositionFromRegister(
3398           data_.u_submatch.current_position_register);
3399       assembler->ReadStackPointerFromRegister(
3400           data_.u_submatch.stack_pointer_register);
3401       int clear_register_count = data_.u_submatch.clear_register_count;
3402       if (clear_register_count == 0) {
3403         on_success()->Emit(compiler, trace);
3404         return;
3405       }
3406       int clear_registers_from = data_.u_submatch.clear_register_from;
3407       Label clear_registers_backtrack;
3408       Trace new_trace = *trace;
3409       new_trace.set_backtrack(&clear_registers_backtrack);
3410       on_success()->Emit(compiler, &new_trace);
3411 
3412       assembler->Bind(&clear_registers_backtrack);
3413       int clear_registers_to = clear_registers_from + clear_register_count - 1;
3414       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3415 
3416       DCHECK(trace->backtrack() == nullptr);
3417       assembler->Backtrack();
3418       return;
3419     }
3420     default:
3421       UNREACHABLE();
3422   }
3423 }
3424 
Emit(RegExpCompiler * compiler,Trace * trace)3425 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3426   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3427   if (!trace->is_trivial()) {
3428     trace->Flush(compiler, this);
3429     return;
3430   }
3431 
3432   LimitResult limit_result = LimitVersions(compiler, trace);
3433   if (limit_result == DONE) return;
3434   DCHECK(limit_result == CONTINUE);
3435 
3436   RecursionCheck rc(compiler);
3437 
3438   DCHECK_EQ(start_reg_ + 1, end_reg_);
3439   if (IgnoreCase(flags_)) {
3440     bool unicode = IsUnicode(flags_);
3441     assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
3442                                                unicode, trace->backtrack());
3443   } else {
3444     assembler->CheckNotBackReference(start_reg_, read_backward(),
3445                                      trace->backtrack());
3446   }
3447   // We are going to advance backward, so we may end up at the start.
3448   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3449 
3450   // Check that the back reference does not end inside a surrogate pair.
3451   if (IsUnicode(flags_) && !compiler->one_byte()) {
3452     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3453   }
3454   on_success()->Emit(compiler, trace);
3455 }
3456 
CalculateOffsets()3457 void TextNode::CalculateOffsets() {
3458   int element_count = elements()->length();
3459   // Set up the offsets of the elements relative to the start.  This is a fixed
3460   // quantity since a TextNode can only contain fixed-width things.
3461   int cp_offset = 0;
3462   for (int i = 0; i < element_count; i++) {
3463     TextElement& elm = elements()->at(i);
3464     elm.set_cp_offset(cp_offset);
3465     cp_offset += elm.length();
3466   }
3467 }
3468 
3469 namespace {
3470 
3471 // Assertion propagation moves information about assertions such as
3472 // \b to the affected nodes.  For instance, in /.\b./ information must
3473 // be propagated to the first '.' that whatever follows needs to know
3474 // if it matched a word or a non-word, and to the second '.' that it
3475 // has to check if it succeeds a word or non-word.  In this case the
3476 // result will be something like:
3477 //
3478 //   +-------+        +------------+
3479 //   |   .   |        |      .     |
3480 //   +-------+  --->  +------------+
3481 //   | word? |        | check word |
3482 //   +-------+        +------------+
3483 class AssertionPropagator : public AllStatic {
3484  public:
VisitText(TextNode * that)3485   static void VisitText(TextNode* that) {}
3486 
VisitAction(ActionNode * that)3487   static void VisitAction(ActionNode* that) {
3488     // If the next node is interested in what it follows then this node
3489     // has to be interested too so it can pass the information on.
3490     that->info()->AddFromFollowing(that->on_success()->info());
3491   }
3492 
VisitChoice(ChoiceNode * that,int i)3493   static void VisitChoice(ChoiceNode* that, int i) {
3494     // Anything the following nodes need to know has to be known by
3495     // this node also, so it can pass it on.
3496     that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
3497   }
3498 
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3499   static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3500     that->info()->AddFromFollowing(that->continue_node()->info());
3501   }
3502 
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3503   static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
3504     that->info()->AddFromFollowing(that->loop_node()->info());
3505   }
3506 
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3507   static void VisitNegativeLookaroundChoiceLookaroundNode(
3508       NegativeLookaroundChoiceNode* that) {
3509     VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
3510   }
3511 
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3512   static void VisitNegativeLookaroundChoiceContinueNode(
3513       NegativeLookaroundChoiceNode* that) {
3514     VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
3515   }
3516 
VisitBackReference(BackReferenceNode * that)3517   static void VisitBackReference(BackReferenceNode* that) {}
3518 
VisitAssertion(AssertionNode * that)3519   static void VisitAssertion(AssertionNode* that) {}
3520 };
3521 
3522 // Propagates information about the minimum size of successful matches from
3523 // successor nodes to their predecessors. Note that all eats_at_least values
3524 // are initialized to zero before analysis.
3525 class EatsAtLeastPropagator : public AllStatic {
3526  public:
VisitText(TextNode * that)3527   static void VisitText(TextNode* that) {
3528     // The eats_at_least value is not used if reading backward.
3529     if (!that->read_backward()) {
3530       // We are not at the start after this node, and thus we can use the
3531       // successor's eats_at_least_from_not_start value.
3532       uint8_t eats_at_least = base::saturated_cast<uint8_t>(
3533           that->Length() + that->on_success()
3534                                ->eats_at_least_info()
3535                                ->eats_at_least_from_not_start);
3536       that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
3537     }
3538   }
3539 
VisitAction(ActionNode * that)3540   static void VisitAction(ActionNode* that) {
3541     switch (that->action_type()) {
3542       case ActionNode::BEGIN_POSITIVE_SUBMATCH:
3543       case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3544         // We do not propagate eats_at_least data through positive lookarounds,
3545         // because they rewind input.
3546         // TODO(v8:11859) Potential approaches for fixing this include:
3547         // 1. Add a dedicated choice node for positive lookaround, similar to
3548         //    NegativeLookaroundChoiceNode.
3549         // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which
3550         //    is <= eats_at_least_from_possibly_start, and use that value in
3551         //    EatsAtLeastFromLoopEntry.
3552         DCHECK(that->eats_at_least_info()->IsZero());
3553         break;
3554       case ActionNode::SET_REGISTER_FOR_LOOP:
3555         // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the
3556         // loop body will run at least the minimum number of times before the
3557         // continuation case can run.
3558         that->set_eats_at_least_info(
3559             that->on_success()->EatsAtLeastFromLoopEntry());
3560         break;
3561       case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
3562       default:
3563         // Otherwise, the current node eats at least as much as its successor.
3564         // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH
3565         // because NegativeLookaroundChoiceNode ignores its lookaround successor
3566         // when computing eats-at-least and quick check information.
3567         that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3568         break;
3569     }
3570   }
3571 
VisitChoice(ChoiceNode * that,int i)3572   static void VisitChoice(ChoiceNode* that, int i) {
3573     // The minimum possible match from a choice node is the minimum of its
3574     // successors.
3575     EatsAtLeastInfo eats_at_least =
3576         i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
3577     eats_at_least.SetMin(
3578         *that->alternatives()->at(i).node()->eats_at_least_info());
3579     that->set_eats_at_least_info(eats_at_least);
3580   }
3581 
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3582   static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3583     if (!that->read_backward()) {
3584       that->set_eats_at_least_info(
3585           *that->continue_node()->eats_at_least_info());
3586     }
3587   }
3588 
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3589   static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
3590 
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3591   static void VisitNegativeLookaroundChoiceLookaroundNode(
3592       NegativeLookaroundChoiceNode* that) {}
3593 
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3594   static void VisitNegativeLookaroundChoiceContinueNode(
3595       NegativeLookaroundChoiceNode* that) {
3596     that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3597   }
3598 
VisitBackReference(BackReferenceNode * that)3599   static void VisitBackReference(BackReferenceNode* that) {
3600     if (!that->read_backward()) {
3601       that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3602     }
3603   }
3604 
VisitAssertion(AssertionNode * that)3605   static void VisitAssertion(AssertionNode* that) {
3606     EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
3607     if (that->assertion_type() == AssertionNode::AT_START) {
3608       // If we know we are not at the start and we are asked "how many
3609       // characters will you match if you succeed?" then we can answer anything
3610       // since false implies false.  So let's just set the max answer
3611       // (UINT8_MAX) since that won't prevent us from preloading a lot of
3612       // characters for the other branches in the node graph.
3613       eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
3614     }
3615     that->set_eats_at_least_info(eats_at_least);
3616   }
3617 };
3618 
3619 }  // namespace
3620 
3621 // -------------------------------------------------------------------
3622 // Analysis
3623 
3624 // Iterates the node graph and provides the opportunity for propagators to set
3625 // values that depend on successor nodes.
3626 template <typename... Propagators>
3627 class Analysis : public NodeVisitor {
3628  public:
Analysis(Isolate * isolate,bool is_one_byte)3629   Analysis(Isolate* isolate, bool is_one_byte)
3630       : isolate_(isolate),
3631         is_one_byte_(is_one_byte),
3632         error_(RegExpError::kNone) {}
3633 
EnsureAnalyzed(RegExpNode * that)3634   void EnsureAnalyzed(RegExpNode* that) {
3635     StackLimitCheck check(isolate());
3636     if (check.HasOverflowed()) {
3637       if (FLAG_correctness_fuzzer_suppressions) {
3638         FATAL("Analysis: Aborting on stack overflow");
3639       }
3640       fail(RegExpError::kAnalysisStackOverflow);
3641       return;
3642     }
3643     if (that->info()->been_analyzed || that->info()->being_analyzed) return;
3644     that->info()->being_analyzed = true;
3645     that->Accept(this);
3646     that->info()->being_analyzed = false;
3647     that->info()->been_analyzed = true;
3648   }
3649 
has_failed()3650   bool has_failed() { return error_ != RegExpError::kNone; }
error()3651   RegExpError error() {
3652     DCHECK(error_ != RegExpError::kNone);
3653     return error_;
3654   }
fail(RegExpError error)3655   void fail(RegExpError error) { error_ = error; }
3656 
isolate() const3657   Isolate* isolate() const { return isolate_; }
3658 
VisitEnd(EndNode * that)3659   void VisitEnd(EndNode* that) override {
3660     // nothing to do
3661   }
3662 
3663 // Used to call the given static function on each propagator / variadic template
3664 // argument.
3665 #define STATIC_FOR_EACH(expr)       \
3666   do {                              \
3667     int dummy[] = {((expr), 0)...}; \
3668     USE(dummy);                     \
3669   } while (false)
3670 
VisitText(TextNode * that)3671   void VisitText(TextNode* that) override {
3672     that->MakeCaseIndependent(isolate(), is_one_byte_);
3673     EnsureAnalyzed(that->on_success());
3674     if (has_failed()) return;
3675     that->CalculateOffsets();
3676     STATIC_FOR_EACH(Propagators::VisitText(that));
3677   }
3678 
VisitAction(ActionNode * that)3679   void VisitAction(ActionNode* that) override {
3680     EnsureAnalyzed(that->on_success());
3681     if (has_failed()) return;
3682     STATIC_FOR_EACH(Propagators::VisitAction(that));
3683   }
3684 
VisitChoice(ChoiceNode * that)3685   void VisitChoice(ChoiceNode* that) override {
3686     for (int i = 0; i < that->alternatives()->length(); i++) {
3687       EnsureAnalyzed(that->alternatives()->at(i).node());
3688       if (has_failed()) return;
3689       STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
3690     }
3691   }
3692 
VisitLoopChoice(LoopChoiceNode * that)3693   void VisitLoopChoice(LoopChoiceNode* that) override {
3694     DCHECK_EQ(that->alternatives()->length(), 2);  // Just loop and continue.
3695 
3696     // First propagate all information from the continuation node.
3697     EnsureAnalyzed(that->continue_node());
3698     if (has_failed()) return;
3699     STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
3700 
3701     // Check the loop last since it may need the value of this node
3702     // to get a correct result.
3703     EnsureAnalyzed(that->loop_node());
3704     if (has_failed()) return;
3705     STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
3706   }
3707 
VisitNegativeLookaroundChoice(NegativeLookaroundChoiceNode * that)3708   void VisitNegativeLookaroundChoice(
3709       NegativeLookaroundChoiceNode* that) override {
3710     DCHECK_EQ(that->alternatives()->length(), 2);  // Lookaround and continue.
3711 
3712     EnsureAnalyzed(that->lookaround_node());
3713     if (has_failed()) return;
3714     STATIC_FOR_EACH(
3715         Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
3716 
3717     EnsureAnalyzed(that->continue_node());
3718     if (has_failed()) return;
3719     STATIC_FOR_EACH(
3720         Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
3721   }
3722 
VisitBackReference(BackReferenceNode * that)3723   void VisitBackReference(BackReferenceNode* that) override {
3724     EnsureAnalyzed(that->on_success());
3725     if (has_failed()) return;
3726     STATIC_FOR_EACH(Propagators::VisitBackReference(that));
3727   }
3728 
VisitAssertion(AssertionNode * that)3729   void VisitAssertion(AssertionNode* that) override {
3730     EnsureAnalyzed(that->on_success());
3731     if (has_failed()) return;
3732     STATIC_FOR_EACH(Propagators::VisitAssertion(that));
3733   }
3734 
3735 #undef STATIC_FOR_EACH
3736 
3737  private:
3738   Isolate* isolate_;
3739   bool is_one_byte_;
3740   RegExpError error_;
3741 
3742   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
3743 };
3744 
AnalyzeRegExp(Isolate * isolate,bool is_one_byte,RegExpNode * node)3745 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
3746                           RegExpNode* node) {
3747   Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
3748                                                                 is_one_byte);
3749   DCHECK_EQ(node->info()->been_analyzed, false);
3750   analysis.EnsureAnalyzed(node);
3751   DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
3752   return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
3753 }
3754 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3755 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3756                                      BoyerMooreLookahead* bm,
3757                                      bool not_at_start) {
3758   // Working out the set of characters that a backreference can match is too
3759   // hard, so we just say that any character can match.
3760   bm->SetRest(offset);
3761   SaveBMInfo(bm, not_at_start, offset);
3762 }
3763 
3764 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
3765               RegExpMacroAssembler::kTableSize);
3766 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3767 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3768                               BoyerMooreLookahead* bm, bool not_at_start) {
3769   ZoneList<GuardedAlternative>* alts = alternatives();
3770   budget = (budget - 1) / alts->length();
3771   for (int i = 0; i < alts->length(); i++) {
3772     GuardedAlternative& alt = alts->at(i);
3773     if (alt.guards() != nullptr && alt.guards()->length() != 0) {
3774       bm->SetRest(offset);  // Give up trying to fill in info.
3775       SaveBMInfo(bm, not_at_start, offset);
3776       return;
3777     }
3778     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
3779   }
3780   SaveBMInfo(bm, not_at_start, offset);
3781 }
3782 
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3783 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
3784                             BoyerMooreLookahead* bm, bool not_at_start) {
3785   if (initial_offset >= bm->length()) return;
3786   int offset = initial_offset;
3787   int max_char = bm->max_char();
3788   for (int i = 0; i < elements()->length(); i++) {
3789     if (offset >= bm->length()) {
3790       if (initial_offset == 0) set_bm_info(not_at_start, bm);
3791       return;
3792     }
3793     TextElement text = elements()->at(i);
3794     if (text.text_type() == TextElement::ATOM) {
3795       RegExpAtom* atom = text.atom();
3796       for (int j = 0; j < atom->length(); j++, offset++) {
3797         if (offset >= bm->length()) {
3798           if (initial_offset == 0) set_bm_info(not_at_start, bm);
3799           return;
3800         }
3801         uc16 character = atom->data()[j];
3802         if (IgnoreCase(atom->flags())) {
3803           unibrow::uchar chars[4];
3804           int length = GetCaseIndependentLetters(
3805               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
3806               chars, 4);
3807           for (int j = 0; j < length; j++) {
3808             bm->Set(offset, chars[j]);
3809           }
3810         } else {
3811           if (character <= max_char) bm->Set(offset, character);
3812         }
3813       }
3814     } else {
3815       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
3816       RegExpCharacterClass* char_class = text.char_class();
3817       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
3818       if (char_class->is_negated()) {
3819         bm->SetAll(offset);
3820       } else {
3821         for (int k = 0; k < ranges->length(); k++) {
3822           CharacterRange& range = ranges->at(k);
3823           if (static_cast<int>(range.from()) > max_char) continue;
3824           int to = std::min(max_char, static_cast<int>(range.to()));
3825           bm->SetInterval(offset, Interval(range.from(), to));
3826         }
3827       }
3828       offset++;
3829     }
3830   }
3831   if (offset >= bm->length()) {
3832     if (initial_offset == 0) set_bm_info(not_at_start, bm);
3833     return;
3834   }
3835   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
3836                              true);  // Not at start after a text node.
3837   if (initial_offset == 0) set_bm_info(not_at_start, bm);
3838 }
3839 
OptionallyStepBackToLeadSurrogate(RegExpNode * on_success,JSRegExp::Flags flags)3840 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
3841     RegExpNode* on_success, JSRegExp::Flags flags) {
3842   DCHECK(!read_backward());
3843   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3844       zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3845   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3846       zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3847 
3848   ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
3849 
3850   int stack_register = UnicodeLookaroundStackRegister();
3851   int position_register = UnicodeLookaroundPositionRegister();
3852   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
3853       zone(), lead_surrogates, true, on_success, flags);
3854   RegExpLookaround::Builder builder(true, step_back, stack_register,
3855                                     position_register);
3856   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
3857       zone(), trail_surrogates, false, builder.on_match_success(), flags);
3858 
3859   optional_step_back->AddAlternative(
3860       GuardedAlternative(builder.ForMatch(match_trail)));
3861   optional_step_back->AddAlternative(GuardedAlternative(on_success));
3862 
3863   return optional_step_back;
3864 }
3865 
PreprocessRegExp(RegExpCompileData * data,JSRegExp::Flags flags,bool is_one_byte)3866 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
3867                                              JSRegExp::Flags flags,
3868                                              bool is_one_byte) {
3869   // Wrap the body of the regexp in capture #0.
3870   RegExpNode* captured_body =
3871       RegExpCapture::ToNode(data->tree, 0, this, accept());
3872   RegExpNode* node = captured_body;
3873   if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
3874     // Add a .*? at the beginning, outside the body capture, unless
3875     // this expression is anchored at the beginning or sticky.
3876     JSRegExp::Flags default_flags = JSRegExp::Flags();
3877     RegExpNode* loop_node = RegExpQuantifier::ToNode(
3878         0, RegExpTree::kInfinity, false,
3879         zone()->New<RegExpCharacterClass>('*', default_flags), this,
3880         captured_body, data->contains_anchor);
3881 
3882     if (data->contains_anchor) {
3883       // Unroll loop once, to take care of the case that might start
3884       // at the start of input.
3885       ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
3886       first_step_node->AddAlternative(GuardedAlternative(captured_body));
3887       first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
3888           zone()->New<RegExpCharacterClass>('*', default_flags), false,
3889           loop_node)));
3890       node = first_step_node;
3891     } else {
3892       node = loop_node;
3893     }
3894   }
3895   if (is_one_byte) {
3896     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
3897     // Do it again to propagate the new nodes to places where they were not
3898     // put because they had not been calculated yet.
3899     if (node != nullptr) {
3900       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
3901     }
3902   } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
3903     node = OptionallyStepBackToLeadSurrogate(node, flags);
3904   }
3905 
3906   if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
3907   return node;
3908 }
3909 
3910 }  // namespace internal
3911 }  // namespace v8
3912