1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "irregexp/imported/regexp-compiler.h"
6
7 #include "irregexp/imported/regexp-macro-assembler-arch.h"
8 #ifdef V8_INTL_SUPPORT
9 #include "irregexp/imported/special-case.h"
10 #endif // V8_INTL_SUPPORT
11
12 #ifdef V8_INTL_SUPPORT
13 #include "unicode/locid.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utypes.h"
16 #endif // V8_INTL_SUPPORT
17
18 namespace v8 {
19 namespace internal {
20
21 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
22
23 // -------------------------------------------------------------------
24 // Implementation of the Irregexp regular expression engine.
25 //
26 // The Irregexp regular expression engine is intended to be a complete
27 // implementation of ECMAScript regular expressions. It generates either
28 // bytecodes or native code.
29
30 // The Irregexp regexp engine is structured in three steps.
31 // 1) The parser generates an abstract syntax tree. See ast.cc.
32 // 2) From the AST a node network is created. The nodes are all
33 // subclasses of RegExpNode. The nodes represent states when
34 // executing a regular expression. Several optimizations are
35 // performed on the node network.
36 // 3) From the nodes we generate either byte codes or native code
37 // that can actually execute the regular expression (perform
38 // the search). The code generation step is described in more
39 // detail below.
40
41 // Code generation.
42 //
43 // The nodes are divided into four main categories.
44 // * Choice nodes
45 // These represent places where the regular expression can
46 // match in more than one way. For example on entry to an
47 // alternation (foo|bar) or a repetition (*, +, ? or {}).
48 // * Action nodes
49 // These represent places where some action should be
50 // performed. Examples include recording the current position
51 // in the input string to a register (in order to implement
52 // captures) or other actions on register for example in order
53 // to implement the counters needed for {} repetitions.
54 // * Matching nodes
55 // These attempt to match some element part of the input string.
56 // Examples of elements include character classes, plain strings
57 // or back references.
58 // * End nodes
59 // These are used to implement the actions required on finding
60 // a successful match or failing to find a match.
61 //
62 // The code generated (whether as byte codes or native code) maintains
63 // some state as it runs. This consists of the following elements:
64 //
65 // * The capture registers. Used for string captures.
66 // * Other registers. Used for counters etc.
67 // * The current position.
68 // * The stack of backtracking information. Used when a matching node
69 // fails to find a match and needs to try an alternative.
70 //
71 // Conceptual regular expression execution model:
72 //
73 // There is a simple conceptual model of regular expression execution
74 // which will be presented first. The actual code generated is a more
75 // efficient simulation of the simple conceptual model:
76 //
77 // * Choice nodes are implemented as follows:
78 // For each choice except the last {
79 // push current position
80 // push backtrack code location
81 // <generate code to test for choice>
82 // backtrack code location:
83 // pop current position
84 // }
85 // <generate code to test for last choice>
86 //
87 // * Actions nodes are generated as follows
88 // <push affected registers on backtrack stack>
89 // <generate code to perform action>
90 // push backtrack code location
91 // <generate code to test for following nodes>
92 // backtrack code location:
93 // <pop affected registers to restore their state>
94 // <pop backtrack location from stack and go to it>
95 //
96 // * Matching nodes are generated as follows:
97 // if input string matches at current position
98 // update current position
99 // <generate code to test for following nodes>
100 // else
101 // <pop backtrack location from stack and go to it>
102 //
103 // Thus it can be seen that the current position is saved and restored
104 // by the choice nodes, whereas the registers are saved and restored by
105 // by the action nodes that manipulate them.
106 //
107 // The other interesting aspect of this model is that nodes are generated
108 // at the point where they are needed by a recursive call to Emit(). If
109 // the node has already been code generated then the Emit() call will
110 // generate a jump to the previously generated code instead. In order to
111 // limit recursion it is possible for the Emit() function to put the node
112 // on a work list for later generation and instead generate a jump. The
113 // destination of the jump is resolved later when the code is generated.
114 //
115 // Actual regular expression code generation.
116 //
117 // Code generation is actually more complicated than the above. In order
118 // to improve the efficiency of the generated code some optimizations are
119 // performed
120 //
121 // * Choice nodes have 1-character lookahead.
122 // A choice node looks at the following character and eliminates some of
123 // the choices immediately based on that character. This is not yet
124 // implemented.
125 // * Simple greedy loops store reduced backtracking information.
126 // A quantifier like /.*foo/m will greedily match the whole input. It will
127 // then need to backtrack to a point where it can match "foo". The naive
128 // implementation of this would push each character position onto the
129 // backtracking stack, then pop them off one by one. This would use space
130 // proportional to the length of the input string. However since the "."
131 // can only match in one way and always has a constant length (in this case
132 // of 1) it suffices to store the current position on the top of the stack
133 // once. Matching now becomes merely incrementing the current position and
134 // backtracking becomes decrementing the current position and checking the
135 // result against the stored current position. This is faster and saves
136 // space.
137 // * The current state is virtualized.
138 // This is used to defer expensive operations until it is clear that they
139 // are needed and to generate code for a node more than once, allowing
140 // specialized an efficient versions of the code to be created. This is
141 // explained in the section below.
142 //
143 // Execution state virtualization.
144 //
145 // Instead of emitting code, nodes that manipulate the state can record their
146 // manipulation in an object called the Trace. The Trace object can record a
147 // current position offset, an optional backtrack code location on the top of
148 // the virtualized backtrack stack and some register changes. When a node is
149 // to be emitted it can flush the Trace or update it. Flushing the Trace
150 // will emit code to bring the actual state into line with the virtual state.
151 // Avoiding flushing the state can postpone some work (e.g. updates of capture
152 // registers). Postponing work can save time when executing the regular
153 // expression since it may be found that the work never has to be done as a
154 // failure to match can occur. In addition it is much faster to jump to a
155 // known backtrack code location than it is to pop an unknown backtrack
156 // location from the stack and jump there.
157 //
158 // The virtual state found in the Trace affects code generation. For example
159 // the virtual state contains the difference between the actual current
160 // position and the virtual current position, and matching code needs to use
161 // this offset to attempt a match in the correct location of the input
162 // string. Therefore code generated for a non-trivial trace is specialized
163 // to that trace. The code generator therefore has the ability to generate
164 // code for each node several times. In order to limit the size of the
165 // generated code there is an arbitrary limit on how many specialized sets of
166 // code may be generated for a given node. If the limit is reached, the
167 // trace is flushed and a generic version of the code for a node is emitted.
168 // This is subsequently used for that node. The code emitted for non-generic
169 // trace is not recorded in the node and so it cannot currently be reused in
170 // the event that code generation is requested for an identical trace.
171
172 namespace {
173
MaxCodeUnit(const bool one_byte)174 constexpr uc32 MaxCodeUnit(const bool one_byte) {
175 STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
176 std::numeric_limits<uint16_t>::max());
177 STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
178 std::numeric_limits<uint16_t>::max());
179 return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
180 }
181
CharMask(const bool one_byte)182 constexpr uint32_t CharMask(const bool one_byte) {
183 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
184 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
185 return MaxCodeUnit(one_byte);
186 }
187
188 } // namespace
189
AppendToText(RegExpText * text,Zone * zone)190 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
191
AppendToText(RegExpText * text,Zone * zone)192 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
193 text->AddElement(TextElement::Atom(this), zone);
194 }
195
AppendToText(RegExpText * text,Zone * zone)196 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
197 text->AddElement(TextElement::CharClass(this), zone);
198 }
199
AppendToText(RegExpText * text,Zone * zone)200 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
201 for (int i = 0; i < elements()->length(); i++)
202 text->AddElement(elements()->at(i), zone);
203 }
204
Atom(RegExpAtom * atom)205 TextElement TextElement::Atom(RegExpAtom* atom) {
206 return TextElement(ATOM, atom);
207 }
208
CharClass(RegExpCharacterClass * char_class)209 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
210 return TextElement(CHAR_CLASS, char_class);
211 }
212
length() const213 int TextElement::length() const {
214 switch (text_type()) {
215 case ATOM:
216 return atom()->length();
217
218 case CHAR_CLASS:
219 return 1;
220 }
221 UNREACHABLE();
222 }
223
224 class RecursionCheck {
225 public:
RecursionCheck(RegExpCompiler * compiler)226 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
227 compiler->IncrementRecursionDepth();
228 }
~RecursionCheck()229 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
230
231 private:
232 RegExpCompiler* compiler_;
233 };
234
235 // Attempts to compile the regexp using an Irregexp code generator. Returns
236 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,bool one_byte)237 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
238 bool one_byte)
239 : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
240 unicode_lookaround_stack_register_(kNoRegister),
241 unicode_lookaround_position_register_(kNoRegister),
242 work_list_(nullptr),
243 recursion_depth_(0),
244 one_byte_(one_byte),
245 reg_exp_too_big_(false),
246 limiting_recursion_(false),
247 optimize_(FLAG_regexp_optimization),
248 read_backward_(false),
249 current_expansion_factor_(1),
250 frequency_collator_(),
251 isolate_(isolate),
252 zone_(zone) {
253 accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
254 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
255 }
256
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)257 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
258 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
259 int capture_count, Handle<String> pattern) {
260 macro_assembler_ = macro_assembler;
261
262 ZoneVector<RegExpNode*> work_list(zone());
263 work_list_ = &work_list;
264 Label fail;
265 macro_assembler_->PushBacktrack(&fail);
266 Trace new_trace;
267 start->Emit(this, &new_trace);
268 macro_assembler_->BindJumpTarget(&fail);
269 macro_assembler_->Fail();
270 while (!work_list.empty()) {
271 RegExpNode* node = work_list.back();
272 work_list.pop_back();
273 node->set_on_work_list(false);
274 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
275 }
276 if (reg_exp_too_big_) {
277 macro_assembler_->AbortedCodeGeneration();
278 return CompilationResult::RegExpTooBig();
279 }
280
281 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
282 isolate->IncreaseTotalRegexpCodeGenerated(code);
283 work_list_ = nullptr;
284
285 return {code, next_register_};
286 }
287
Mentions(int that)288 bool Trace::DeferredAction::Mentions(int that) {
289 if (action_type() == ActionNode::CLEAR_CAPTURES) {
290 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
291 return range.Contains(that);
292 } else {
293 return reg() == that;
294 }
295 }
296
mentions_reg(int reg)297 bool Trace::mentions_reg(int reg) {
298 for (DeferredAction* action = actions_; action != nullptr;
299 action = action->next()) {
300 if (action->Mentions(reg)) return true;
301 }
302 return false;
303 }
304
GetStoredPosition(int reg,int * cp_offset)305 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
306 DCHECK_EQ(0, *cp_offset);
307 for (DeferredAction* action = actions_; action != nullptr;
308 action = action->next()) {
309 if (action->Mentions(reg)) {
310 if (action->action_type() == ActionNode::STORE_POSITION) {
311 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
312 return true;
313 } else {
314 return false;
315 }
316 }
317 }
318 return false;
319 }
320
321 // A (dynamically-sized) set of unsigned integers that behaves especially well
322 // on small integers (< kFirstLimit). May do zone-allocation.
323 class DynamicBitSet : public ZoneObject {
324 public:
Get(unsigned value) const325 V8_EXPORT_PRIVATE bool Get(unsigned value) const {
326 if (value < kFirstLimit) {
327 return (first_ & (1 << value)) != 0;
328 } else if (remaining_ == nullptr) {
329 return false;
330 } else {
331 return remaining_->Contains(value);
332 }
333 }
334
335 // Destructively set a value in this set.
Set(unsigned value,Zone * zone)336 void Set(unsigned value, Zone* zone) {
337 if (value < kFirstLimit) {
338 first_ |= (1 << value);
339 } else {
340 if (remaining_ == nullptr)
341 remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
342 if (remaining_->is_empty() || !remaining_->Contains(value))
343 remaining_->Add(value, zone);
344 }
345 }
346
347 private:
348 static constexpr unsigned kFirstLimit = 32;
349
350 uint32_t first_ = 0;
351 ZoneList<unsigned>* remaining_ = nullptr;
352 };
353
FindAffectedRegisters(DynamicBitSet * affected_registers,Zone * zone)354 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
355 Zone* zone) {
356 int max_register = RegExpCompiler::kNoRegister;
357 for (DeferredAction* action = actions_; action != nullptr;
358 action = action->next()) {
359 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
360 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
361 for (int i = range.from(); i <= range.to(); i++)
362 affected_registers->Set(i, zone);
363 if (range.to() > max_register) max_register = range.to();
364 } else {
365 affected_registers->Set(action->reg(), zone);
366 if (action->reg() > max_register) max_register = action->reg();
367 }
368 }
369 return max_register;
370 }
371
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & registers_to_pop,const DynamicBitSet & registers_to_clear)372 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
373 int max_register,
374 const DynamicBitSet& registers_to_pop,
375 const DynamicBitSet& registers_to_clear) {
376 for (int reg = max_register; reg >= 0; reg--) {
377 if (registers_to_pop.Get(reg)) {
378 assembler->PopRegister(reg);
379 } else if (registers_to_clear.Get(reg)) {
380 int clear_to = reg;
381 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
382 reg--;
383 }
384 assembler->ClearRegisters(reg, clear_to);
385 }
386 }
387 }
388
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & affected_registers,DynamicBitSet * registers_to_pop,DynamicBitSet * registers_to_clear,Zone * zone)389 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
390 int max_register,
391 const DynamicBitSet& affected_registers,
392 DynamicBitSet* registers_to_pop,
393 DynamicBitSet* registers_to_clear,
394 Zone* zone) {
395 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
396 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
397
398 // Count pushes performed to force a stack limit check occasionally.
399 int pushes = 0;
400
401 for (int reg = 0; reg <= max_register; reg++) {
402 if (!affected_registers.Get(reg)) continue;
403
404 // The chronologically first deferred action in the trace
405 // is used to infer the action needed to restore a register
406 // to its previous state (or not, if it's safe to ignore it).
407 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
408 DeferredActionUndoType undo_action = IGNORE;
409
410 int value = 0;
411 bool absolute = false;
412 bool clear = false;
413 static const int kNoStore = kMinInt;
414 int store_position = kNoStore;
415 // This is a little tricky because we are scanning the actions in reverse
416 // historical order (newest first).
417 for (DeferredAction* action = actions_; action != nullptr;
418 action = action->next()) {
419 if (action->Mentions(reg)) {
420 switch (action->action_type()) {
421 case ActionNode::SET_REGISTER_FOR_LOOP: {
422 Trace::DeferredSetRegisterForLoop* psr =
423 static_cast<Trace::DeferredSetRegisterForLoop*>(action);
424 if (!absolute) {
425 value += psr->value();
426 absolute = true;
427 }
428 // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
429 // counters. They can have a significant previous value if they
430 // occur in a loop. TODO(lrn): Propagate this information, so
431 // we can set undo_action to IGNORE if we know there is no value to
432 // restore.
433 undo_action = RESTORE;
434 DCHECK_EQ(store_position, kNoStore);
435 DCHECK(!clear);
436 break;
437 }
438 case ActionNode::INCREMENT_REGISTER:
439 if (!absolute) {
440 value++;
441 }
442 DCHECK_EQ(store_position, kNoStore);
443 DCHECK(!clear);
444 undo_action = RESTORE;
445 break;
446 case ActionNode::STORE_POSITION: {
447 Trace::DeferredCapture* pc =
448 static_cast<Trace::DeferredCapture*>(action);
449 if (!clear && store_position == kNoStore) {
450 store_position = pc->cp_offset();
451 }
452
453 // For captures we know that stores and clears alternate.
454 // Other register, are never cleared, and if the occur
455 // inside a loop, they might be assigned more than once.
456 if (reg <= 1) {
457 // Registers zero and one, aka "capture zero", is
458 // always set correctly if we succeed. There is no
459 // need to undo a setting on backtrack, because we
460 // will set it again or fail.
461 undo_action = IGNORE;
462 } else {
463 undo_action = pc->is_capture() ? CLEAR : RESTORE;
464 }
465 DCHECK(!absolute);
466 DCHECK_EQ(value, 0);
467 break;
468 }
469 case ActionNode::CLEAR_CAPTURES: {
470 // Since we're scanning in reverse order, if we've already
471 // set the position we have to ignore historically earlier
472 // clearing operations.
473 if (store_position == kNoStore) {
474 clear = true;
475 }
476 undo_action = RESTORE;
477 DCHECK(!absolute);
478 DCHECK_EQ(value, 0);
479 break;
480 }
481 default:
482 UNREACHABLE();
483 break;
484 }
485 }
486 }
487 // Prepare for the undo-action (e.g., push if it's going to be popped).
488 if (undo_action == RESTORE) {
489 pushes++;
490 RegExpMacroAssembler::StackCheckFlag stack_check =
491 RegExpMacroAssembler::kNoStackLimitCheck;
492 if (pushes == push_limit) {
493 stack_check = RegExpMacroAssembler::kCheckStackLimit;
494 pushes = 0;
495 }
496
497 assembler->PushRegister(reg, stack_check);
498 registers_to_pop->Set(reg, zone);
499 } else if (undo_action == CLEAR) {
500 registers_to_clear->Set(reg, zone);
501 }
502 // Perform the chronologically last action (or accumulated increment)
503 // for the register.
504 if (store_position != kNoStore) {
505 assembler->WriteCurrentPositionToRegister(reg, store_position);
506 } else if (clear) {
507 assembler->ClearRegisters(reg, reg);
508 } else if (absolute) {
509 assembler->SetRegister(reg, value);
510 } else if (value != 0) {
511 assembler->AdvanceRegister(reg, value);
512 }
513 }
514 }
515
516 // This is called as we come into a loop choice node and some other tricky
517 // nodes. It normalizes the state of the code generator to ensure we can
518 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)519 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
520 RegExpMacroAssembler* assembler = compiler->macro_assembler();
521
522 DCHECK(!is_trivial());
523
524 if (actions_ == nullptr && backtrack() == nullptr) {
525 // Here we just have some deferred cp advances to fix and we are back to
526 // a normal situation. We may also have to forget some information gained
527 // through a quick check that was already performed.
528 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
529 // Create a new trivial state and generate the node with that.
530 Trace new_state;
531 successor->Emit(compiler, &new_state);
532 return;
533 }
534
535 // Generate deferred actions here along with code to undo them again.
536 DynamicBitSet affected_registers;
537
538 if (backtrack() != nullptr) {
539 // Here we have a concrete backtrack location. These are set up by choice
540 // nodes and so they indicate that we have a deferred save of the current
541 // position which we may need to emit here.
542 assembler->PushCurrentPosition();
543 }
544
545 int max_register =
546 FindAffectedRegisters(&affected_registers, compiler->zone());
547 DynamicBitSet registers_to_pop;
548 DynamicBitSet registers_to_clear;
549 PerformDeferredActions(assembler, max_register, affected_registers,
550 ®isters_to_pop, ®isters_to_clear,
551 compiler->zone());
552 if (cp_offset_ != 0) {
553 assembler->AdvanceCurrentPosition(cp_offset_);
554 }
555
556 // Create a new trivial state and generate the node with that.
557 Label undo;
558 assembler->PushBacktrack(&undo);
559 if (successor->KeepRecursing(compiler)) {
560 Trace new_state;
561 successor->Emit(compiler, &new_state);
562 } else {
563 compiler->AddWork(successor);
564 assembler->GoTo(successor->label());
565 }
566
567 // On backtrack we need to restore state.
568 assembler->BindJumpTarget(&undo);
569 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
570 registers_to_clear);
571 if (backtrack() == nullptr) {
572 assembler->Backtrack();
573 } else {
574 assembler->PopCurrentPosition();
575 assembler->GoTo(backtrack());
576 }
577 }
578
Emit(RegExpCompiler * compiler,Trace * trace)579 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
580 RegExpMacroAssembler* assembler = compiler->macro_assembler();
581
582 // Omit flushing the trace. We discard the entire stack frame anyway.
583
584 if (!label()->is_bound()) {
585 // We are completely independent of the trace, since we ignore it,
586 // so this code can be used as the generic version.
587 assembler->Bind(label());
588 }
589
590 // Throw away everything on the backtrack stack since the start
591 // of the negative submatch and restore the character position.
592 assembler->ReadCurrentPositionFromRegister(current_position_register_);
593 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
594 if (clear_capture_count_ > 0) {
595 // Clear any captures that might have been performed during the success
596 // of the body of the negative look-ahead.
597 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
598 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
599 }
600 // Now that we have unwound the stack we find at the top of the stack the
601 // backtrack that the BeginNegativeSubmatch node got.
602 assembler->Backtrack();
603 }
604
Emit(RegExpCompiler * compiler,Trace * trace)605 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
606 if (!trace->is_trivial()) {
607 trace->Flush(compiler, this);
608 return;
609 }
610 RegExpMacroAssembler* assembler = compiler->macro_assembler();
611 if (!label()->is_bound()) {
612 assembler->Bind(label());
613 }
614 switch (action_) {
615 case ACCEPT:
616 assembler->Succeed();
617 return;
618 case BACKTRACK:
619 assembler->GoTo(trace->backtrack());
620 return;
621 case NEGATIVE_SUBMATCH_SUCCESS:
622 // This case is handled in a different virtual method.
623 UNREACHABLE();
624 }
625 UNIMPLEMENTED();
626 }
627
AddGuard(Guard * guard,Zone * zone)628 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
629 if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
630 guards_->Add(guard, zone);
631 }
632
SetRegisterForLoop(int reg,int val,RegExpNode * on_success)633 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
634 RegExpNode* on_success) {
635 ActionNode* result =
636 on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
637 result->data_.u_store_register.reg = reg;
638 result->data_.u_store_register.value = val;
639 return result;
640 }
641
IncrementRegister(int reg,RegExpNode * on_success)642 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
643 ActionNode* result =
644 on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success);
645 result->data_.u_increment_register.reg = reg;
646 return result;
647 }
648
StorePosition(int reg,bool is_capture,RegExpNode * on_success)649 ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
650 RegExpNode* on_success) {
651 ActionNode* result =
652 on_success->zone()->New<ActionNode>(STORE_POSITION, on_success);
653 result->data_.u_position_register.reg = reg;
654 result->data_.u_position_register.is_capture = is_capture;
655 return result;
656 }
657
ClearCaptures(Interval range,RegExpNode * on_success)658 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
659 ActionNode* result =
660 on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success);
661 result->data_.u_clear_captures.range_from = range.from();
662 result->data_.u_clear_captures.range_to = range.to();
663 return result;
664 }
665
BeginPositiveSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)666 ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
667 RegExpNode* on_success) {
668 ActionNode* result =
669 on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success);
670 result->data_.u_submatch.stack_pointer_register = stack_reg;
671 result->data_.u_submatch.current_position_register = position_reg;
672 return result;
673 }
674
BeginNegativeSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)675 ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
676 RegExpNode* on_success) {
677 ActionNode* result =
678 on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
679 result->data_.u_submatch.stack_pointer_register = stack_reg;
680 result->data_.u_submatch.current_position_register = position_reg;
681 return result;
682 }
683
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)684 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
685 int clear_register_count,
686 int clear_register_from,
687 RegExpNode* on_success) {
688 ActionNode* result = on_success->zone()->New<ActionNode>(
689 POSITIVE_SUBMATCH_SUCCESS, on_success);
690 result->data_.u_submatch.stack_pointer_register = stack_reg;
691 result->data_.u_submatch.current_position_register = position_reg;
692 result->data_.u_submatch.clear_register_count = clear_register_count;
693 result->data_.u_submatch.clear_register_from = clear_register_from;
694 return result;
695 }
696
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)697 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
698 int repetition_register,
699 int repetition_limit,
700 RegExpNode* on_success) {
701 ActionNode* result =
702 on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
703 result->data_.u_empty_match_check.start_register = start_register;
704 result->data_.u_empty_match_check.repetition_register = repetition_register;
705 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
706 return result;
707 }
708
709 #define DEFINE_ACCEPT(Type) \
710 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)711 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
712 #undef DEFINE_ACCEPT
713
714 // -------------------------------------------------------------------
715 // Emit code.
716
717 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
718 Guard* guard, Trace* trace) {
719 switch (guard->op()) {
720 case Guard::LT:
721 DCHECK(!trace->mentions_reg(guard->reg()));
722 macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
723 trace->backtrack());
724 break;
725 case Guard::GEQ:
726 DCHECK(!trace->mentions_reg(guard->reg()));
727 macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
728 trace->backtrack());
729 break;
730 }
731 }
732
733 namespace {
734
735 #ifdef DEBUG
ContainsOnlyUtf16CodeUnits(unibrow::uchar * chars,int length)736 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
737 STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
738 for (int i = 0; i < length; i++) {
739 if (chars[i] > String::kMaxUtf16CodeUnit) return false;
740 }
741 return true;
742 }
743 #endif // DEBUG
744
745 } // namespace
746
747 // Returns the number of characters in the equivalence class, omitting those
748 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters,int letter_length)749 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
750 bool one_byte_subject,
751 unibrow::uchar* letters,
752 int letter_length) {
753 #ifdef V8_INTL_SUPPORT
754 if (RegExpCaseFolding::IgnoreSet().contains(character)) {
755 letters[0] = character;
756 DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
757 return 1;
758 }
759 bool in_special_add_set =
760 RegExpCaseFolding::SpecialAddSet().contains(character);
761
762 icu::UnicodeSet set;
763 set.add(character);
764 set = set.closeOver(USET_CASE_INSENSITIVE);
765
766 UChar32 canon = 0;
767 if (in_special_add_set) {
768 canon = RegExpCaseFolding::Canonicalize(character);
769 }
770
771 int32_t range_count = set.getRangeCount();
772 int items = 0;
773 for (int32_t i = 0; i < range_count; i++) {
774 UChar32 start = set.getRangeStart(i);
775 UChar32 end = set.getRangeEnd(i);
776 CHECK(end - start + items <= letter_length);
777 for (UChar32 cu = start; cu <= end; cu++) {
778 if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
779 if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
780 continue;
781 }
782 letters[items++] = static_cast<unibrow::uchar>(cu);
783 }
784 }
785 DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
786 return items;
787 #else
788 int length =
789 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
790 // Unibrow returns 0 or 1 for characters where case independence is
791 // trivial.
792 if (length == 0) {
793 letters[0] = character;
794 length = 1;
795 }
796
797 if (one_byte_subject) {
798 int new_length = 0;
799 for (int i = 0; i < length; i++) {
800 if (letters[i] <= String::kMaxOneByteCharCode) {
801 letters[new_length++] = letters[i];
802 }
803 }
804 length = new_length;
805 }
806
807 DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
808 return length;
809 #endif // V8_INTL_SUPPORT
810 }
811
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)812 static inline bool EmitSimpleCharacter(Isolate* isolate,
813 RegExpCompiler* compiler, uc16 c,
814 Label* on_failure, int cp_offset,
815 bool check, bool preloaded) {
816 RegExpMacroAssembler* assembler = compiler->macro_assembler();
817 bool bound_checked = false;
818 if (!preloaded) {
819 assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
820 bound_checked = true;
821 }
822 assembler->CheckNotCharacter(c, on_failure);
823 return bound_checked;
824 }
825
826 // Only emits non-letters (things that don't have case). Only used for case
827 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)828 static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
829 uc16 c, Label* on_failure, int cp_offset,
830 bool check, bool preloaded) {
831 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
832 bool one_byte = compiler->one_byte();
833 unibrow::uchar chars[4];
834 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
835 if (length < 1) {
836 // This can't match. Must be an one-byte subject and a non-one-byte
837 // character. We do not need to do anything since the one-byte pass
838 // already handled this.
839 return false; // Bounds not checked.
840 }
841 bool checked = false;
842 // We handle the length > 1 case in a later pass.
843 if (length == 1) {
844 if (one_byte && c > String::kMaxOneByteCharCodeU) {
845 // Can't match - see above.
846 return false; // Bounds not checked.
847 }
848 if (!preloaded) {
849 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
850 checked = check;
851 }
852 macro_assembler->CheckNotCharacter(c, on_failure);
853 }
854 return checked;
855 }
856
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)857 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
858 bool one_byte, uc16 c1, uc16 c2,
859 Label* on_failure) {
860 const uint32_t char_mask = CharMask(one_byte);
861 uc16 exor = c1 ^ c2;
862 // Check whether exor has only one bit set.
863 if (((exor - 1) & exor) == 0) {
864 // If c1 and c2 differ only by one bit.
865 // Ecma262UnCanonicalize always gives the highest number last.
866 DCHECK(c2 > c1);
867 uc16 mask = char_mask ^ exor;
868 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
869 return true;
870 }
871 DCHECK(c2 > c1);
872 uc16 diff = c2 - c1;
873 if (((diff - 1) & diff) == 0 && c1 >= diff) {
874 // If the characters differ by 2^n but don't differ by one bit then
875 // subtract the difference from the found character, then do the or
876 // trick. We avoid the theoretical case where negative numbers are
877 // involved in order to simplify code generation.
878 uc16 mask = char_mask ^ diff;
879 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
880 on_failure);
881 return true;
882 }
883 return false;
884 }
885
886 // Only emits letters (things that have case). Only used for case independent
887 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)888 static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
889 uc16 c, Label* on_failure, int cp_offset,
890 bool check, bool preloaded) {
891 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
892 bool one_byte = compiler->one_byte();
893 unibrow::uchar chars[4];
894 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
895 if (length <= 1) return false;
896 // We may not need to check against the end of the input string
897 // if this character lies before a character that matched.
898 if (!preloaded) {
899 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
900 }
901 Label ok;
902 switch (length) {
903 case 2: {
904 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
905 chars[1], on_failure)) {
906 } else {
907 macro_assembler->CheckCharacter(chars[0], &ok);
908 macro_assembler->CheckNotCharacter(chars[1], on_failure);
909 macro_assembler->Bind(&ok);
910 }
911 break;
912 }
913 case 4:
914 macro_assembler->CheckCharacter(chars[3], &ok);
915 V8_FALLTHROUGH;
916 case 3:
917 macro_assembler->CheckCharacter(chars[0], &ok);
918 macro_assembler->CheckCharacter(chars[1], &ok);
919 macro_assembler->CheckNotCharacter(chars[2], on_failure);
920 macro_assembler->Bind(&ok);
921 break;
922 default:
923 UNREACHABLE();
924 }
925 return true;
926 }
927
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)928 static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
929 Label* fall_through, Label* above_or_equal,
930 Label* below) {
931 if (below != fall_through) {
932 masm->CheckCharacterLT(border, below);
933 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
934 } else {
935 masm->CheckCharacterGT(border - 1, above_or_equal);
936 }
937 }
938
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)939 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
940 int last, Label* fall_through,
941 Label* in_range, Label* out_of_range) {
942 if (in_range == fall_through) {
943 if (first == last) {
944 masm->CheckNotCharacter(first, out_of_range);
945 } else {
946 masm->CheckCharacterNotInRange(first, last, out_of_range);
947 }
948 } else {
949 if (first == last) {
950 masm->CheckCharacter(first, in_range);
951 } else {
952 masm->CheckCharacterInRange(first, last, in_range);
953 }
954 if (out_of_range != fall_through) masm->GoTo(out_of_range);
955 }
956 }
957
958 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
959 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uc32 min_char,Label * fall_through,Label * even_label,Label * odd_label)960 static void EmitUseLookupTable(RegExpMacroAssembler* masm,
961 ZoneList<uc32>* ranges, uint32_t start_index,
962 uint32_t end_index, uc32 min_char,
963 Label* fall_through, Label* even_label,
964 Label* odd_label) {
965 static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
966 static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
967
968 uc32 base = (min_char & ~kMask);
969 USE(base);
970
971 // Assert that everything is on one kTableSize page.
972 for (uint32_t i = start_index; i <= end_index; i++) {
973 DCHECK_EQ(ranges->at(i) & ~kMask, base);
974 }
975 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
976
977 char templ[kSize];
978 Label* on_bit_set;
979 Label* on_bit_clear;
980 int bit;
981 if (even_label == fall_through) {
982 on_bit_set = odd_label;
983 on_bit_clear = even_label;
984 bit = 1;
985 } else {
986 on_bit_set = even_label;
987 on_bit_clear = odd_label;
988 bit = 0;
989 }
990 for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
991 i++) {
992 templ[i] = bit;
993 }
994 uint32_t j = 0;
995 bit ^= 1;
996 for (uint32_t i = start_index; i < end_index; i++) {
997 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
998 templ[j] = bit;
999 }
1000 bit ^= 1;
1001 }
1002 for (uint32_t i = j; i < kSize; i++) {
1003 templ[i] = bit;
1004 }
1005 Factory* factory = masm->isolate()->factory();
1006 // TODO(erikcorry): Cache these.
1007 Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1008 for (uint32_t i = 0; i < kSize; i++) {
1009 ba->set(i, templ[i]);
1010 }
1011 masm->CheckBitInTable(ba, on_bit_set);
1012 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1013 }
1014
CutOutRange(RegExpMacroAssembler * masm,ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t cut_index,Label * even_label,Label * odd_label)1015 static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
1016 uint32_t start_index, uint32_t end_index,
1017 uint32_t cut_index, Label* even_label,
1018 Label* odd_label) {
1019 bool odd = (((cut_index - start_index) & 1) == 1);
1020 Label* in_range_label = odd ? odd_label : even_label;
1021 Label dummy;
1022 EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
1023 ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
1024 &dummy);
1025 DCHECK(!dummy.is_linked());
1026 // Cut out the single range by rewriting the array. This creates a new
1027 // range that is a merger of the two ranges on either side of the one we
1028 // are cutting out. The oddity of the labels is preserved.
1029 for (uint32_t j = cut_index; j > start_index; j--) {
1030 ranges->at(j) = ranges->at(j - 1);
1031 }
1032 for (uint32_t j = cut_index + 1; j < end_index; j++) {
1033 ranges->at(j) = ranges->at(j + 1);
1034 }
1035 }
1036
1037 // Unicode case. Split the search space into kSize spaces that are handled
1038 // with recursion.
SplitSearchSpace(ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t * new_start_index,uint32_t * new_end_index,uc32 * border)1039 static void SplitSearchSpace(ZoneList<uc32>* ranges, uint32_t start_index,
1040 uint32_t end_index, uint32_t* new_start_index,
1041 uint32_t* new_end_index, uc32* border) {
1042 static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
1043 static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
1044
1045 uc32 first = ranges->at(start_index);
1046 uc32 last = ranges->at(end_index) - 1;
1047
1048 *new_start_index = start_index;
1049 *border = (ranges->at(start_index) & ~kMask) + kSize;
1050 while (*new_start_index < end_index) {
1051 if (ranges->at(*new_start_index) > *border) break;
1052 (*new_start_index)++;
1053 }
1054 // new_start_index is the index of the first edge that is beyond the
1055 // current kSize space.
1056
1057 // For very large search spaces we do a binary chop search of the non-Latin1
1058 // space instead of just going to the end of the current kSize space. The
1059 // heuristics are complicated a little by the fact that any 128-character
1060 // encoding space can be quickly tested with a table lookup, so we don't
1061 // wish to do binary chop search at a smaller granularity than that. A
1062 // 128-character space can take up a lot of space in the ranges array if,
1063 // for example, we only want to match every second character (eg. the lower
1064 // case characters on some Unicode pages).
1065 uint32_t binary_chop_index = (end_index + start_index) / 2;
1066 // The first test ensures that we get to the code that handles the Latin1
1067 // range with a single not-taken branch, speeding up this important
1068 // character range (even non-Latin1 charset-based text has spaces and
1069 // punctuation).
1070 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1071 end_index - start_index > (*new_start_index - start_index) * 2 &&
1072 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1073 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1074 uint32_t scan_forward_for_section_border = binary_chop_index;
1075 uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1076
1077 while (scan_forward_for_section_border < end_index) {
1078 if (ranges->at(scan_forward_for_section_border) > new_border) {
1079 *new_start_index = scan_forward_for_section_border;
1080 *border = new_border;
1081 break;
1082 }
1083 scan_forward_for_section_border++;
1084 }
1085 }
1086
1087 DCHECK(*new_start_index > start_index);
1088 *new_end_index = *new_start_index - 1;
1089 if (ranges->at(*new_end_index) == *border) {
1090 (*new_end_index)--;
1091 }
1092 if (*border >= ranges->at(end_index)) {
1093 *border = ranges->at(end_index);
1094 *new_start_index = end_index; // Won't be used.
1095 *new_end_index = end_index - 1;
1096 }
1097 }
1098
1099 // Gets a series of segment boundaries representing a character class. If the
1100 // character is in the range between an even and an odd boundary (counting from
1101 // start_index) then go to even_label, otherwise go to odd_label. We already
1102 // know that the character is in the range of min_char to max_char inclusive.
1103 // Either label can be nullptr indicating backtracking. Either label can also
1104 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<uc32> * ranges,uint32_t start_index,uint32_t end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1105 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
1106 uint32_t start_index, uint32_t end_index,
1107 uc32 min_char, uc32 max_char, Label* fall_through,
1108 Label* even_label, Label* odd_label) {
1109 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1110 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1111
1112 uc32 first = ranges->at(start_index);
1113 uc32 last = ranges->at(end_index) - 1;
1114
1115 DCHECK_LT(min_char, first);
1116
1117 // Just need to test if the character is before or on-or-after
1118 // a particular character.
1119 if (start_index == end_index) {
1120 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1121 return;
1122 }
1123
1124 // Another almost trivial case: There is one interval in the middle that is
1125 // different from the end intervals.
1126 if (start_index + 1 == end_index) {
1127 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1128 odd_label);
1129 return;
1130 }
1131
1132 // It's not worth using table lookup if there are very few intervals in the
1133 // character class.
1134 if (end_index - start_index <= 6) {
1135 // It is faster to test for individual characters, so we look for those
1136 // first, then try arbitrary ranges in the second round.
1137 static uint32_t kNoCutIndex = -1;
1138 uint32_t cut = kNoCutIndex;
1139 for (uint32_t i = start_index; i < end_index; i++) {
1140 if (ranges->at(i) == ranges->at(i + 1) - 1) {
1141 cut = i;
1142 break;
1143 }
1144 }
1145 if (cut == kNoCutIndex) cut = start_index;
1146 CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1147 odd_label);
1148 DCHECK_GE(end_index - start_index, 2);
1149 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1150 max_char, fall_through, even_label, odd_label);
1151 return;
1152 }
1153
1154 // If there are a lot of intervals in the regexp, then we will use tables to
1155 // determine whether the character is inside or outside the character class.
1156 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1157
1158 if ((max_char >> kBits) == (min_char >> kBits)) {
1159 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1160 fall_through, even_label, odd_label);
1161 return;
1162 }
1163
1164 if ((min_char >> kBits) != first >> kBits) {
1165 masm->CheckCharacterLT(first, odd_label);
1166 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1167 fall_through, odd_label, even_label);
1168 return;
1169 }
1170
1171 uint32_t new_start_index = 0;
1172 uint32_t new_end_index = 0;
1173 uc32 border = 0;
1174
1175 SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1176 &new_end_index, &border);
1177
1178 Label handle_rest;
1179 Label* above = &handle_rest;
1180 if (border == last + 1) {
1181 // We didn't find any section that started after the limit, so everything
1182 // above the border is one of the terminal labels.
1183 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1184 DCHECK(new_end_index == end_index - 1);
1185 }
1186
1187 DCHECK_LE(start_index, new_end_index);
1188 DCHECK_LE(new_start_index, end_index);
1189 DCHECK_LT(start_index, new_start_index);
1190 DCHECK_LT(new_end_index, end_index);
1191 DCHECK(new_end_index + 1 == new_start_index ||
1192 (new_end_index + 2 == new_start_index &&
1193 border == ranges->at(new_end_index + 1)));
1194 DCHECK_LT(min_char, border - 1);
1195 DCHECK_LT(border, max_char);
1196 DCHECK_LT(ranges->at(new_end_index), border);
1197 DCHECK(border < ranges->at(new_start_index) ||
1198 (border == ranges->at(new_start_index) &&
1199 new_start_index == end_index && new_end_index == end_index - 1 &&
1200 border == last + 1));
1201 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1202
1203 masm->CheckCharacterGT(border - 1, above);
1204 Label dummy;
1205 GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1206 border - 1, &dummy, even_label, odd_label);
1207 if (handle_rest.is_linked()) {
1208 masm->Bind(&handle_rest);
1209 bool flip = (new_start_index & 1) != (start_index & 1);
1210 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1211 &dummy, flip ? odd_label : even_label,
1212 flip ? even_label : odd_label);
1213 }
1214 }
1215
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)1216 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1217 RegExpCharacterClass* cc, bool one_byte,
1218 Label* on_failure, int cp_offset, bool check_offset,
1219 bool preloaded, Zone* zone) {
1220 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
1221 CharacterRange::Canonicalize(ranges);
1222
1223 const uc32 max_char = MaxCodeUnit(one_byte);
1224 int range_count = ranges->length();
1225
1226 int last_valid_range = range_count - 1;
1227 while (last_valid_range >= 0) {
1228 CharacterRange& range = ranges->at(last_valid_range);
1229 if (range.from() <= max_char) break;
1230 last_valid_range--;
1231 }
1232
1233 if (last_valid_range < 0) {
1234 if (!cc->is_negated()) {
1235 macro_assembler->GoTo(on_failure);
1236 }
1237 if (check_offset) {
1238 macro_assembler->CheckPosition(cp_offset, on_failure);
1239 }
1240 return;
1241 }
1242
1243 if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) {
1244 if (cc->is_negated()) {
1245 macro_assembler->GoTo(on_failure);
1246 } else {
1247 // This is a common case hit by non-anchored expressions.
1248 if (check_offset) {
1249 macro_assembler->CheckPosition(cp_offset, on_failure);
1250 }
1251 }
1252 return;
1253 }
1254
1255 if (!preloaded) {
1256 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1257 }
1258
1259 if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass(
1260 cc->standard_type(), on_failure)) {
1261 return;
1262 }
1263
1264 // A new list with ascending entries. Each entry is a code unit
1265 // where there is a boundary between code units that are part of
1266 // the class and code units that are not. Normally we insert an
1267 // entry at zero which goes to the failure label, but if there
1268 // was already one there we fall through for success on that entry.
1269 // Subsequent entries have alternating meaning (success/failure).
1270 ZoneList<uc32>* range_boundaries =
1271 zone->New<ZoneList<uc32>>(last_valid_range, zone);
1272
1273 bool zeroth_entry_is_failure = !cc->is_negated();
1274
1275 for (int i = 0; i <= last_valid_range; i++) {
1276 CharacterRange& range = ranges->at(i);
1277 if (range.from() == 0) {
1278 DCHECK_EQ(i, 0);
1279 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1280 } else {
1281 range_boundaries->Add(range.from(), zone);
1282 }
1283 range_boundaries->Add(range.to() + 1, zone);
1284 }
1285 int end_index = range_boundaries->length() - 1;
1286 if (range_boundaries->at(end_index) > max_char) {
1287 end_index--;
1288 }
1289
1290 Label fall_through;
1291 GenerateBranches(macro_assembler, range_boundaries,
1292 0, // start_index.
1293 end_index,
1294 0, // min_char.
1295 max_char, &fall_through,
1296 zeroth_entry_is_failure ? &fall_through : on_failure,
1297 zeroth_entry_is_failure ? on_failure : &fall_through);
1298 macro_assembler->Bind(&fall_through);
1299 }
1300
1301 RegExpNode::~RegExpNode() = default;
1302
LimitVersions(RegExpCompiler * compiler,Trace * trace)1303 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1304 Trace* trace) {
1305 // If we are generating a greedy loop then don't stop and don't reuse code.
1306 if (trace->stop_node() != nullptr) {
1307 return CONTINUE;
1308 }
1309
1310 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1311 if (trace->is_trivial()) {
1312 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
1313 // If a generic version is already scheduled to be generated or we have
1314 // recursed too deeply then just generate a jump to that code.
1315 macro_assembler->GoTo(&label_);
1316 // This will queue it up for generation of a generic version if it hasn't
1317 // already been queued.
1318 compiler->AddWork(this);
1319 return DONE;
1320 }
1321 // Generate generic version of the node and bind the label for later use.
1322 macro_assembler->Bind(&label_);
1323 return CONTINUE;
1324 }
1325
1326 // We are being asked to make a non-generic version. Keep track of how many
1327 // non-generic versions we generate so as not to overdo it.
1328 trace_count_++;
1329 if (KeepRecursing(compiler) && compiler->optimize() &&
1330 trace_count_ < kMaxCopiesCodeGenerated) {
1331 return CONTINUE;
1332 }
1333
1334 // If we get here code has been generated for this node too many times or
1335 // recursion is too deep. Time to switch to a generic version. The code for
1336 // generic versions above can handle deep recursion properly.
1337 bool was_limiting = compiler->limiting_recursion();
1338 compiler->set_limiting_recursion(true);
1339 trace->Flush(compiler, this);
1340 compiler->set_limiting_recursion(was_limiting);
1341 return DONE;
1342 }
1343
KeepRecursing(RegExpCompiler * compiler)1344 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
1345 return !compiler->limiting_recursion() &&
1346 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
1347 }
1348
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1349 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1350 BoyerMooreLookahead* bm, bool not_at_start) {
1351 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) {
1352 // Anything may follow a positive submatch success, thus we need to accept
1353 // all characters from this position onwards.
1354 bm->SetRest(offset);
1355 } else {
1356 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1357 }
1358 SaveBMInfo(bm, not_at_start, offset);
1359 }
1360
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1361 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1362 RegExpCompiler* compiler, int filled_in,
1363 bool not_at_start) {
1364 if (action_type_ == SET_REGISTER_FOR_LOOP) {
1365 on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
1366 filled_in, not_at_start);
1367 } else {
1368 on_success()->GetQuickCheckDetails(details, compiler, filled_in,
1369 not_at_start);
1370 }
1371 }
1372
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1373 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1374 BoyerMooreLookahead* bm, bool not_at_start) {
1375 // Match the behaviour of EatsAtLeast on this node.
1376 if (assertion_type() == AT_START && not_at_start) return;
1377 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1378 SaveBMInfo(bm, not_at_start, offset);
1379 }
1380
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1381 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1382 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
1383 bool not_at_start) {
1384 RegExpNode* node = continue_node();
1385 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1386 }
1387
1388 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1389 static inline uint32_t SmearBitsRight(uint32_t v) {
1390 v |= v >> 1;
1391 v |= v >> 2;
1392 v |= v >> 4;
1393 v |= v >> 8;
1394 v |= v >> 16;
1395 return v;
1396 }
1397
Rationalize(bool asc)1398 bool QuickCheckDetails::Rationalize(bool asc) {
1399 bool found_useful_op = false;
1400 const uint32_t char_mask = CharMask(asc);
1401 mask_ = 0;
1402 value_ = 0;
1403 int char_shift = 0;
1404 for (int i = 0; i < characters_; i++) {
1405 Position* pos = &positions_[i];
1406 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
1407 found_useful_op = true;
1408 }
1409 mask_ |= (pos->mask & char_mask) << char_shift;
1410 value_ |= (pos->value & char_mask) << char_shift;
1411 char_shift += asc ? 8 : 16;
1412 }
1413 return found_useful_op;
1414 }
1415
EatsAtLeast(bool not_at_start)1416 int RegExpNode::EatsAtLeast(bool not_at_start) {
1417 return not_at_start ? eats_at_least_.eats_at_least_from_not_start
1418 : eats_at_least_.eats_at_least_from_possibly_start;
1419 }
1420
EatsAtLeastFromLoopEntry()1421 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
1422 // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
1423 // implies that the following node must be a LoopChoiceNode. If we need to
1424 // set registers to constant values for other reasons, we could introduce a
1425 // new action type SET_REGISTER that doesn't imply anything about its
1426 // successor.
1427 UNREACHABLE();
1428 }
1429
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1430 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
1431 RegExpCompiler* compiler,
1432 int characters_filled_in,
1433 bool not_at_start) {
1434 // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
1435 UNREACHABLE();
1436 }
1437
EatsAtLeastFromLoopEntry()1438 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
1439 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
1440
1441 if (read_backward()) {
1442 // The eats_at_least value is not used if reading backward. The
1443 // EatsAtLeastPropagator should've zeroed it as well.
1444 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
1445 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
1446 return {};
1447 }
1448
1449 // Figure out how much the loop body itself eats, not including anything in
1450 // the continuation case. In general, the nodes in the loop body should report
1451 // that they eat at least the number eaten by the continuation node, since any
1452 // successful match in the loop body must also include the continuation node.
1453 // However, in some cases involving positive lookaround, the loop body under-
1454 // reports its appetite, so use saturated math here to avoid negative numbers.
1455 uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
1456 loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
1457 uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
1458 loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
1459
1460 // Limit the number of loop iterations to avoid overflow in subsequent steps.
1461 int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
1462
1463 EatsAtLeastInfo result;
1464 result.eats_at_least_from_not_start =
1465 base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
1466 continue_node_->EatsAtLeast(true));
1467 if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
1468 // First loop iteration eats at least one, so all subsequent iterations
1469 // and the after-loop chunk are guaranteed to not be at the start.
1470 result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
1471 loop_body_from_possibly_start +
1472 (loop_iterations - 1) * loop_body_from_not_start +
1473 continue_node_->EatsAtLeast(true));
1474 } else {
1475 // Loop body might eat nothing, so only continue node contributes.
1476 result.eats_at_least_from_possibly_start =
1477 continue_node_->EatsAtLeast(false);
1478 }
1479 return result;
1480 }
1481
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure,ChoiceNode * predecessor)1482 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1483 Trace* bounds_check_trace, Trace* trace,
1484 bool preload_has_checked_bounds,
1485 Label* on_possible_success,
1486 QuickCheckDetails* details,
1487 bool fall_through_on_failure,
1488 ChoiceNode* predecessor) {
1489 DCHECK_NOT_NULL(predecessor);
1490 if (details->characters() == 0) return false;
1491 GetQuickCheckDetails(details, compiler, 0,
1492 trace->at_start() == Trace::FALSE_VALUE);
1493 if (details->cannot_match()) return false;
1494 if (!details->Rationalize(compiler->one_byte())) return false;
1495 DCHECK(details->characters() == 1 ||
1496 compiler->macro_assembler()->CanReadUnaligned());
1497 uint32_t mask = details->mask();
1498 uint32_t value = details->value();
1499
1500 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1501
1502 if (trace->characters_preloaded() != details->characters()) {
1503 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
1504 // The bounds check is performed using the minimum number of characters
1505 // any choice would eat, so if the bounds check fails, then none of the
1506 // choices can succeed, so we can just immediately backtrack, rather
1507 // than go to the next choice. The number of characters preloaded may be
1508 // less than the number used for the bounds check.
1509 int eats_at_least = predecessor->EatsAtLeast(
1510 bounds_check_trace->at_start() == Trace::FALSE_VALUE);
1511 DCHECK_GE(eats_at_least, details->characters());
1512 assembler->LoadCurrentCharacter(
1513 trace->cp_offset(), bounds_check_trace->backtrack(),
1514 !preload_has_checked_bounds, details->characters(), eats_at_least);
1515 }
1516
1517 bool need_mask = true;
1518
1519 if (details->characters() == 1) {
1520 // If number of characters preloaded is 1 then we used a byte or 16 bit
1521 // load so the value is already masked down.
1522 const uint32_t char_mask = CharMask(compiler->one_byte());
1523 if ((mask & char_mask) == char_mask) need_mask = false;
1524 mask &= char_mask;
1525 } else {
1526 // For 2-character preloads in one-byte mode or 1-character preloads in
1527 // two-byte mode we also use a 16 bit load with zero extend.
1528 static const uint32_t kTwoByteMask = 0xFFFF;
1529 static const uint32_t kFourByteMask = 0xFFFFFFFF;
1530 if (details->characters() == 2 && compiler->one_byte()) {
1531 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1532 } else if (details->characters() == 1 && !compiler->one_byte()) {
1533 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1534 } else {
1535 if (mask == kFourByteMask) need_mask = false;
1536 }
1537 }
1538
1539 if (fall_through_on_failure) {
1540 if (need_mask) {
1541 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1542 } else {
1543 assembler->CheckCharacter(value, on_possible_success);
1544 }
1545 } else {
1546 if (need_mask) {
1547 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1548 } else {
1549 assembler->CheckNotCharacter(value, trace->backtrack());
1550 }
1551 }
1552 return true;
1553 }
1554
1555 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1556 // super-class in the .h file).
1557 //
1558 // We iterate along the text object, building up for each character a
1559 // mask and value that can be used to test for a quick failure to match.
1560 // The masks and values for the positions will be combined into a single
1561 // machine word for the current character width in order to be used in
1562 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1563 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1564 RegExpCompiler* compiler,
1565 int characters_filled_in,
1566 bool not_at_start) {
1567 // Do not collect any quick check details if the text node reads backward,
1568 // since it reads in the opposite direction than we use for quick checks.
1569 if (read_backward()) return;
1570 Isolate* isolate = compiler->macro_assembler()->isolate();
1571 DCHECK(characters_filled_in < details->characters());
1572 int characters = details->characters();
1573 const uint32_t char_mask = CharMask(compiler->one_byte());
1574 for (int k = 0; k < elements()->length(); k++) {
1575 TextElement elm = elements()->at(k);
1576 if (elm.text_type() == TextElement::ATOM) {
1577 Vector<const uc16> quarks = elm.atom()->data();
1578 for (int i = 0; i < characters && i < quarks.length(); i++) {
1579 QuickCheckDetails::Position* pos =
1580 details->positions(characters_filled_in);
1581 uc16 c = quarks[i];
1582 if (elm.atom()->ignore_case()) {
1583 unibrow::uchar chars[4];
1584 int length = GetCaseIndependentLetters(
1585 isolate, c, compiler->one_byte(), chars, 4);
1586 if (length == 0) {
1587 // This can happen because all case variants are non-Latin1, but we
1588 // know the input is Latin1.
1589 details->set_cannot_match();
1590 pos->determines_perfectly = false;
1591 return;
1592 }
1593 if (length == 1) {
1594 // This letter has no case equivalents, so it's nice and simple
1595 // and the mask-compare will determine definitely whether we have
1596 // a match at this character position.
1597 pos->mask = char_mask;
1598 pos->value = chars[0];
1599 pos->determines_perfectly = true;
1600 } else {
1601 uint32_t common_bits = char_mask;
1602 uint32_t bits = chars[0];
1603 for (int j = 1; j < length; j++) {
1604 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1605 common_bits ^= differing_bits;
1606 bits &= common_bits;
1607 }
1608 // If length is 2 and common bits has only one zero in it then
1609 // our mask and compare instruction will determine definitely
1610 // whether we have a match at this character position. Otherwise
1611 // it can only be an approximate check.
1612 uint32_t one_zero = (common_bits | ~char_mask);
1613 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1614 pos->determines_perfectly = true;
1615 }
1616 pos->mask = common_bits;
1617 pos->value = bits;
1618 }
1619 } else {
1620 // Don't ignore case. Nice simple case where the mask-compare will
1621 // determine definitely whether we have a match at this character
1622 // position.
1623 if (c > char_mask) {
1624 details->set_cannot_match();
1625 pos->determines_perfectly = false;
1626 return;
1627 }
1628 pos->mask = char_mask;
1629 pos->value = c;
1630 pos->determines_perfectly = true;
1631 }
1632 characters_filled_in++;
1633 DCHECK(characters_filled_in <= details->characters());
1634 if (characters_filled_in == details->characters()) {
1635 return;
1636 }
1637 }
1638 } else {
1639 QuickCheckDetails::Position* pos =
1640 details->positions(characters_filled_in);
1641 RegExpCharacterClass* tree = elm.char_class();
1642 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
1643 DCHECK(!ranges->is_empty());
1644 if (tree->is_negated()) {
1645 // A quick check uses multi-character mask and compare. There is no
1646 // useful way to incorporate a negative char class into this scheme
1647 // so we just conservatively create a mask and value that will always
1648 // succeed.
1649 pos->mask = 0;
1650 pos->value = 0;
1651 } else {
1652 int first_range = 0;
1653 while (ranges->at(first_range).from() > char_mask) {
1654 first_range++;
1655 if (first_range == ranges->length()) {
1656 details->set_cannot_match();
1657 pos->determines_perfectly = false;
1658 return;
1659 }
1660 }
1661 CharacterRange range = ranges->at(first_range);
1662 const uc32 first_from = range.from();
1663 const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to();
1664 const uint32_t differing_bits = (first_from ^ first_to);
1665 // A mask and compare is only perfect if the differing bits form a
1666 // number like 00011111 with one single block of trailing 1s.
1667 if ((differing_bits & (differing_bits + 1)) == 0 &&
1668 first_from + differing_bits == first_to) {
1669 pos->determines_perfectly = true;
1670 }
1671 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1672 uint32_t bits = (first_from & common_bits);
1673 for (int i = first_range + 1; i < ranges->length(); i++) {
1674 CharacterRange range = ranges->at(i);
1675 const uc32 from = range.from();
1676 if (from > char_mask) continue;
1677 const uc32 to = (range.to() > char_mask) ? char_mask : range.to();
1678 // Here we are combining more ranges into the mask and compare
1679 // value. With each new range the mask becomes more sparse and
1680 // so the chances of a false positive rise. A character class
1681 // with multiple ranges is assumed never to be equivalent to a
1682 // mask and compare operation.
1683 pos->determines_perfectly = false;
1684 uint32_t new_common_bits = (from ^ to);
1685 new_common_bits = ~SmearBitsRight(new_common_bits);
1686 common_bits &= new_common_bits;
1687 bits &= new_common_bits;
1688 uint32_t differing_bits = (from & common_bits) ^ bits;
1689 common_bits ^= differing_bits;
1690 bits &= common_bits;
1691 }
1692 pos->mask = common_bits;
1693 pos->value = bits;
1694 }
1695 characters_filled_in++;
1696 DCHECK(characters_filled_in <= details->characters());
1697 if (characters_filled_in == details->characters()) return;
1698 }
1699 }
1700 DCHECK(characters_filled_in != details->characters());
1701 if (!details->cannot_match()) {
1702 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1703 true);
1704 }
1705 }
1706
Clear()1707 void QuickCheckDetails::Clear() {
1708 for (int i = 0; i < characters_; i++) {
1709 positions_[i].mask = 0;
1710 positions_[i].value = 0;
1711 positions_[i].determines_perfectly = false;
1712 }
1713 characters_ = 0;
1714 }
1715
Advance(int by,bool one_byte)1716 void QuickCheckDetails::Advance(int by, bool one_byte) {
1717 if (by >= characters_ || by < 0) {
1718 DCHECK_IMPLIES(by < 0, characters_ == 0);
1719 Clear();
1720 return;
1721 }
1722 DCHECK_LE(characters_ - by, 4);
1723 DCHECK_LE(characters_, 4);
1724 for (int i = 0; i < characters_ - by; i++) {
1725 positions_[i] = positions_[by + i];
1726 }
1727 for (int i = characters_ - by; i < characters_; i++) {
1728 positions_[i].mask = 0;
1729 positions_[i].value = 0;
1730 positions_[i].determines_perfectly = false;
1731 }
1732 characters_ -= by;
1733 // We could change mask_ and value_ here but we would never advance unless
1734 // they had already been used in a check and they won't be used again because
1735 // it would gain us nothing. So there's no point.
1736 }
1737
Merge(QuickCheckDetails * other,int from_index)1738 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1739 DCHECK(characters_ == other->characters_);
1740 if (other->cannot_match_) {
1741 return;
1742 }
1743 if (cannot_match_) {
1744 *this = *other;
1745 return;
1746 }
1747 for (int i = from_index; i < characters_; i++) {
1748 QuickCheckDetails::Position* pos = positions(i);
1749 QuickCheckDetails::Position* other_pos = other->positions(i);
1750 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1751 !other_pos->determines_perfectly) {
1752 // Our mask-compare operation will be approximate unless we have the
1753 // exact same operation on both sides of the alternation.
1754 pos->determines_perfectly = false;
1755 }
1756 pos->mask &= other_pos->mask;
1757 pos->value &= pos->mask;
1758 other_pos->value &= pos->mask;
1759 uint32_t differing_bits = (pos->value ^ other_pos->value);
1760 pos->mask &= ~differing_bits;
1761 pos->value &= pos->mask;
1762 }
1763 }
1764
1765 class VisitMarker {
1766 public:
VisitMarker(NodeInfo * info)1767 explicit VisitMarker(NodeInfo* info) : info_(info) {
1768 DCHECK(!info->visited);
1769 info->visited = true;
1770 }
~VisitMarker()1771 ~VisitMarker() { info_->visited = false; }
1772
1773 private:
1774 NodeInfo* info_;
1775 };
1776
1777 // Temporarily sets traversed_loop_initialization_node_.
1778 class LoopInitializationMarker {
1779 public:
LoopInitializationMarker(LoopChoiceNode * node)1780 explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
1781 DCHECK(!node_->traversed_loop_initialization_node_);
1782 node_->traversed_loop_initialization_node_ = true;
1783 }
~LoopInitializationMarker()1784 ~LoopInitializationMarker() {
1785 DCHECK(node_->traversed_loop_initialization_node_);
1786 node_->traversed_loop_initialization_node_ = false;
1787 }
1788 LoopInitializationMarker(const LoopInitializationMarker&) = delete;
1789 LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
1790
1791 private:
1792 LoopChoiceNode* node_;
1793 };
1794
1795 // Temporarily decrements min_loop_iterations_.
1796 class IterationDecrementer {
1797 public:
IterationDecrementer(LoopChoiceNode * node)1798 explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
1799 DCHECK_GT(node_->min_loop_iterations_, 0);
1800 --node_->min_loop_iterations_;
1801 }
~IterationDecrementer()1802 ~IterationDecrementer() { ++node_->min_loop_iterations_; }
1803 IterationDecrementer(const IterationDecrementer&) = delete;
1804 IterationDecrementer& operator=(const IterationDecrementer&) = delete;
1805
1806 private:
1807 LoopChoiceNode* node_;
1808 };
1809
FilterOneByte(int depth)1810 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
1811 if (info()->replacement_calculated) return replacement();
1812 if (depth < 0) return this;
1813 DCHECK(!info()->visited);
1814 VisitMarker marker(info());
1815 return FilterSuccessor(depth - 1);
1816 }
1817
FilterSuccessor(int depth)1818 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
1819 RegExpNode* next = on_success_->FilterOneByte(depth - 1);
1820 if (next == nullptr) return set_replacement(nullptr);
1821 on_success_ = next;
1822 return set_replacement(this);
1823 }
1824
1825 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)1826 bool RangeContainsLatin1Equivalents(CharacterRange range) {
1827 // TODO(dcarney): this could be a lot more efficient.
1828 return range.Contains(0x039C) || range.Contains(0x03BC) ||
1829 range.Contains(0x0178);
1830 }
1831
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)1832 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
1833 for (int i = 0; i < ranges->length(); i++) {
1834 // TODO(dcarney): this could be a lot more efficient.
1835 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
1836 }
1837 return false;
1838 }
1839
FilterOneByte(int depth)1840 RegExpNode* TextNode::FilterOneByte(int depth) {
1841 if (info()->replacement_calculated) return replacement();
1842 if (depth < 0) return this;
1843 DCHECK(!info()->visited);
1844 VisitMarker marker(info());
1845 int element_count = elements()->length();
1846 for (int i = 0; i < element_count; i++) {
1847 TextElement elm = elements()->at(i);
1848 if (elm.text_type() == TextElement::ATOM) {
1849 Vector<const uc16> quarks = elm.atom()->data();
1850 for (int j = 0; j < quarks.length(); j++) {
1851 uc16 c = quarks[j];
1852 if (elm.atom()->ignore_case()) {
1853 c = unibrow::Latin1::TryConvertToLatin1(c);
1854 }
1855 if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
1856 // Replace quark in case we converted to Latin-1.
1857 uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
1858 writable_quarks[j] = c;
1859 }
1860 } else {
1861 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
1862 RegExpCharacterClass* cc = elm.char_class();
1863 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1864 CharacterRange::Canonicalize(ranges);
1865 // Now they are in order so we only need to look at the first.
1866 int range_count = ranges->length();
1867 if (cc->is_negated()) {
1868 if (range_count != 0 && ranges->at(0).from() == 0 &&
1869 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
1870 // This will be handled in a later filter.
1871 if (IgnoreCase(cc->flags()) &&
1872 RangesContainLatin1Equivalents(ranges)) {
1873 continue;
1874 }
1875 return set_replacement(nullptr);
1876 }
1877 } else {
1878 if (range_count == 0 ||
1879 ranges->at(0).from() > String::kMaxOneByteCharCode) {
1880 // This will be handled in a later filter.
1881 if (IgnoreCase(cc->flags()) &&
1882 RangesContainLatin1Equivalents(ranges)) {
1883 continue;
1884 }
1885 return set_replacement(nullptr);
1886 }
1887 }
1888 }
1889 }
1890 return FilterSuccessor(depth - 1);
1891 }
1892
FilterOneByte(int depth)1893 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
1894 if (info()->replacement_calculated) return replacement();
1895 if (depth < 0) return this;
1896 if (info()->visited) return this;
1897 {
1898 VisitMarker marker(info());
1899
1900 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
1901 // If we can't continue after the loop then there is no sense in doing the
1902 // loop.
1903 if (continue_replacement == nullptr) return set_replacement(nullptr);
1904 }
1905
1906 return ChoiceNode::FilterOneByte(depth - 1);
1907 }
1908
FilterOneByte(int depth)1909 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
1910 if (info()->replacement_calculated) return replacement();
1911 if (depth < 0) return this;
1912 if (info()->visited) return this;
1913 VisitMarker marker(info());
1914 int choice_count = alternatives_->length();
1915
1916 for (int i = 0; i < choice_count; i++) {
1917 GuardedAlternative alternative = alternatives_->at(i);
1918 if (alternative.guards() != nullptr &&
1919 alternative.guards()->length() != 0) {
1920 set_replacement(this);
1921 return this;
1922 }
1923 }
1924
1925 int surviving = 0;
1926 RegExpNode* survivor = nullptr;
1927 for (int i = 0; i < choice_count; i++) {
1928 GuardedAlternative alternative = alternatives_->at(i);
1929 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
1930 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
1931 if (replacement != nullptr) {
1932 alternatives_->at(i).set_node(replacement);
1933 surviving++;
1934 survivor = replacement;
1935 }
1936 }
1937 if (surviving < 2) return set_replacement(survivor);
1938
1939 set_replacement(this);
1940 if (surviving == choice_count) {
1941 return this;
1942 }
1943 // Only some of the nodes survived the filtering. We need to rebuild the
1944 // alternatives list.
1945 ZoneList<GuardedAlternative>* new_alternatives =
1946 zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
1947 for (int i = 0; i < choice_count; i++) {
1948 RegExpNode* replacement =
1949 alternatives_->at(i).node()->FilterOneByte(depth - 1);
1950 if (replacement != nullptr) {
1951 alternatives_->at(i).set_node(replacement);
1952 new_alternatives->Add(alternatives_->at(i), zone());
1953 }
1954 }
1955 alternatives_ = new_alternatives;
1956 return this;
1957 }
1958
FilterOneByte(int depth)1959 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
1960 if (info()->replacement_calculated) return replacement();
1961 if (depth < 0) return this;
1962 if (info()->visited) return this;
1963 VisitMarker marker(info());
1964 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1965 // afterwards.
1966 RegExpNode* node = continue_node();
1967 RegExpNode* replacement = node->FilterOneByte(depth - 1);
1968 if (replacement == nullptr) return set_replacement(nullptr);
1969 alternatives_->at(kContinueIndex).set_node(replacement);
1970
1971 RegExpNode* neg_node = lookaround_node();
1972 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
1973 // If the negative lookahead is always going to fail then
1974 // we don't need to check it.
1975 if (neg_replacement == nullptr) return set_replacement(replacement);
1976 alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
1977 return set_replacement(this);
1978 }
1979
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1980 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1981 RegExpCompiler* compiler,
1982 int characters_filled_in,
1983 bool not_at_start) {
1984 if (body_can_be_zero_length_ || info()->visited) return;
1985 not_at_start = not_at_start || this->not_at_start();
1986 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
1987 if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
1988 loop_node_->EatsAtLeast(not_at_start) >
1989 continue_node_->EatsAtLeast(true)) {
1990 // Loop body is guaranteed to execute at least once, and consume characters
1991 // when it does, meaning the only possible quick checks from this point
1992 // begin with the loop body. We may recursively visit this LoopChoiceNode,
1993 // but we temporarily decrease its minimum iteration counter so we know when
1994 // to check the continue case.
1995 IterationDecrementer next_iteration(this);
1996 loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
1997 not_at_start);
1998 } else {
1999 // Might not consume anything in the loop body, so treat it like a normal
2000 // ChoiceNode (and don't recursively visit this node again).
2001 VisitMarker marker(info());
2002 ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
2003 not_at_start);
2004 }
2005 }
2006
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2007 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
2008 QuickCheckDetails* details, RegExpCompiler* compiler,
2009 int characters_filled_in, bool not_at_start) {
2010 if (traversed_loop_initialization_node_) {
2011 // We already entered this loop once, exited via its continuation node, and
2012 // followed an outer loop's back-edge to before the loop entry point. We
2013 // could try to reset the minimum iteration count to its starting value at
2014 // this point, but that seems like more trouble than it's worth. It's safe
2015 // to keep going with the current (possibly reduced) minimum iteration
2016 // count.
2017 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2018 } else {
2019 // We are entering a loop via its counter initialization action, meaning we
2020 // are guaranteed to run the loop body at least some minimum number of times
2021 // before running the continuation node. Set a flag so that this node knows
2022 // (now and any times we visit it again recursively) that it was entered
2023 // from the top.
2024 LoopInitializationMarker marker(this);
2025 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2026 }
2027 }
2028
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2029 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2030 BoyerMooreLookahead* bm, bool not_at_start) {
2031 if (body_can_be_zero_length_ || budget <= 0) {
2032 bm->SetRest(offset);
2033 SaveBMInfo(bm, not_at_start, offset);
2034 return;
2035 }
2036 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2037 SaveBMInfo(bm, not_at_start, offset);
2038 }
2039
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2040 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2041 RegExpCompiler* compiler,
2042 int characters_filled_in,
2043 bool not_at_start) {
2044 not_at_start = (not_at_start || not_at_start_);
2045 int choice_count = alternatives_->length();
2046 DCHECK_LT(0, choice_count);
2047 alternatives_->at(0).node()->GetQuickCheckDetails(
2048 details, compiler, characters_filled_in, not_at_start);
2049 for (int i = 1; i < choice_count; i++) {
2050 QuickCheckDetails new_details(details->characters());
2051 RegExpNode* node = alternatives_->at(i).node();
2052 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2053 not_at_start);
2054 // Here we merge the quick match details of the two branches.
2055 details->Merge(&new_details, characters_filled_in);
2056 }
2057 }
2058
2059 namespace {
2060
2061 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2062 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
2063 Label* non_word, bool fall_through_on_word) {
2064 if (assembler->CheckSpecialCharacterClass(
2065 fall_through_on_word ? 'w' : 'W',
2066 fall_through_on_word ? non_word : word)) {
2067 // Optimized implementation available.
2068 return;
2069 }
2070 assembler->CheckCharacterGT('z', non_word);
2071 assembler->CheckCharacterLT('0', non_word);
2072 assembler->CheckCharacterGT('a' - 1, word);
2073 assembler->CheckCharacterLT('9' + 1, word);
2074 assembler->CheckCharacterLT('A', non_word);
2075 assembler->CheckCharacterLT('Z' + 1, word);
2076 if (fall_through_on_word) {
2077 assembler->CheckNotCharacter('_', non_word);
2078 } else {
2079 assembler->CheckCharacter('_', word);
2080 }
2081 }
2082
2083 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2084 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2085 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
2086 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2087
2088 // We will load the previous character into the current character register.
2089 Trace new_trace(*trace);
2090 new_trace.InvalidateCurrentCharacter();
2091
2092 // A positive (> 0) cp_offset means we've already successfully matched a
2093 // non-empty-width part of the pattern, and thus cannot be at or before the
2094 // start of the subject string. We can thus skip both at-start and
2095 // bounds-checks when loading the one-character lookbehind.
2096 const bool may_be_at_or_before_subject_string_start =
2097 new_trace.cp_offset() <= 0;
2098
2099 Label ok;
2100 if (may_be_at_or_before_subject_string_start) {
2101 // The start of input counts as a newline in this context, so skip to ok if
2102 // we are at the start.
2103 assembler->CheckAtStart(new_trace.cp_offset(), &ok);
2104 }
2105
2106 // If we've already checked that we are not at the start of input, it's okay
2107 // to load the previous character without bounds checks.
2108 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2109 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2110 new_trace.backtrack(), can_skip_bounds_check);
2111 if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2112 // Newline means \n, \r, 0x2028 or 0x2029.
2113 if (!compiler->one_byte()) {
2114 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2115 }
2116 assembler->CheckCharacter('\n', &ok);
2117 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2118 }
2119 assembler->Bind(&ok);
2120 on_success->Emit(compiler, &new_trace);
2121 }
2122
2123 } // namespace
2124
2125 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2126 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2127 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2128 Isolate* isolate = assembler->isolate();
2129 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2130 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2131 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2132 if (lookahead == nullptr) {
2133 int eats_at_least =
2134 std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
2135 if (eats_at_least >= 1) {
2136 BoyerMooreLookahead* bm =
2137 zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
2138 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2139 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2140 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2141 }
2142 } else {
2143 if (lookahead->at(0)->is_non_word())
2144 next_is_word_character = Trace::FALSE_VALUE;
2145 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2146 }
2147 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2148 if (next_is_word_character == Trace::UNKNOWN) {
2149 Label before_non_word;
2150 Label before_word;
2151 if (trace->characters_preloaded() != 1) {
2152 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2153 }
2154 // Fall through on non-word.
2155 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2156 // Next character is not a word character.
2157 assembler->Bind(&before_non_word);
2158 Label ok;
2159 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2160 assembler->GoTo(&ok);
2161
2162 assembler->Bind(&before_word);
2163 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2164 assembler->Bind(&ok);
2165 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2166 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2167 } else {
2168 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
2169 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2170 }
2171 }
2172
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)2173 void AssertionNode::BacktrackIfPrevious(
2174 RegExpCompiler* compiler, Trace* trace,
2175 AssertionNode::IfPrevious backtrack_if_previous) {
2176 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2177 Trace new_trace(*trace);
2178 new_trace.InvalidateCurrentCharacter();
2179
2180 Label fall_through;
2181 Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
2182 : &fall_through;
2183 Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
2184 : new_trace.backtrack();
2185
2186 // A positive (> 0) cp_offset means we've already successfully matched a
2187 // non-empty-width part of the pattern, and thus cannot be at or before the
2188 // start of the subject string. We can thus skip both at-start and
2189 // bounds-checks when loading the one-character lookbehind.
2190 const bool may_be_at_or_before_subject_string_start =
2191 new_trace.cp_offset() <= 0;
2192
2193 if (may_be_at_or_before_subject_string_start) {
2194 // The start of input counts as a non-word character, so the question is
2195 // decided if we are at the start.
2196 assembler->CheckAtStart(new_trace.cp_offset(), non_word);
2197 }
2198
2199 // If we've already checked that we are not at the start of input, it's okay
2200 // to load the previous character without bounds checks.
2201 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2202 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
2203 can_skip_bounds_check);
2204 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2205
2206 assembler->Bind(&fall_through);
2207 on_success()->Emit(compiler, &new_trace);
2208 }
2209
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2210 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2211 RegExpCompiler* compiler,
2212 int filled_in, bool not_at_start) {
2213 if (assertion_type_ == AT_START && not_at_start) {
2214 details->set_cannot_match();
2215 return;
2216 }
2217 return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2218 not_at_start);
2219 }
2220
Emit(RegExpCompiler * compiler,Trace * trace)2221 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2222 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2223 switch (assertion_type_) {
2224 case AT_END: {
2225 Label ok;
2226 assembler->CheckPosition(trace->cp_offset(), &ok);
2227 assembler->GoTo(trace->backtrack());
2228 assembler->Bind(&ok);
2229 break;
2230 }
2231 case AT_START: {
2232 if (trace->at_start() == Trace::FALSE_VALUE) {
2233 assembler->GoTo(trace->backtrack());
2234 return;
2235 }
2236 if (trace->at_start() == Trace::UNKNOWN) {
2237 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2238 Trace at_start_trace = *trace;
2239 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2240 on_success()->Emit(compiler, &at_start_trace);
2241 return;
2242 }
2243 } break;
2244 case AFTER_NEWLINE:
2245 EmitHat(compiler, on_success(), trace);
2246 return;
2247 case AT_BOUNDARY:
2248 case AT_NON_BOUNDARY: {
2249 EmitBoundaryCheck(compiler, trace);
2250 return;
2251 }
2252 }
2253 on_success()->Emit(compiler, trace);
2254 }
2255
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2256 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2257 if (quick_check == nullptr) return false;
2258 if (offset >= quick_check->characters()) return false;
2259 return quick_check->positions(offset)->determines_perfectly;
2260 }
2261
UpdateBoundsCheck(int index,int * checked_up_to)2262 static void UpdateBoundsCheck(int index, int* checked_up_to) {
2263 if (index > *checked_up_to) {
2264 *checked_up_to = index;
2265 }
2266 }
2267
2268 // We call this repeatedly to generate code for each pass over the text node.
2269 // The passes are in increasing order of difficulty because we hope one
2270 // of the first passes will fail in which case we are saved the work of the
2271 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2272 // we will check the '%' in the first pass, the case independent 'a' in the
2273 // second pass and the character class in the last pass.
2274 //
2275 // The passes are done from right to left, so for example to test for /bar/
2276 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2277 // and then a 'b' with offset 0. This means we can avoid the end-of-input
2278 // bounds check most of the time. In the example we only need to check for
2279 // end-of-input when loading the putative 'r'.
2280 //
2281 // A slight complication involves the fact that the first character may already
2282 // be fetched into a register by the previous node. In this case we want to
2283 // do the test for that character first. We do this in separate passes. The
2284 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2285 // pass has been performed then subsequent passes will have true in
2286 // first_element_checked to indicate that that character does not need to be
2287 // checked again.
2288 //
2289 // In addition to all this we are passed a Trace, which can
2290 // contain an AlternativeGeneration object. In this AlternativeGeneration
2291 // object we can see details of any quick check that was already passed in
2292 // order to get to the code we are now generating. The quick check can involve
2293 // loading characters, which means we do not need to recheck the bounds
2294 // up to the limit the quick check already checked. In addition the quick
2295 // check can have involved a mask and compare operation which may simplify
2296 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2297 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
2298 bool preloaded, Trace* trace,
2299 bool first_element_checked, int* checked_up_to) {
2300 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2301 Isolate* isolate = assembler->isolate();
2302 bool one_byte = compiler->one_byte();
2303 Label* backtrack = trace->backtrack();
2304 QuickCheckDetails* quick_check = trace->quick_check_performed();
2305 int element_count = elements()->length();
2306 int backward_offset = read_backward() ? -Length() : 0;
2307 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2308 TextElement elm = elements()->at(i);
2309 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2310 if (elm.text_type() == TextElement::ATOM) {
2311 if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2312 Vector<const uc16> quarks = elm.atom()->data();
2313 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2314 if (first_element_checked && i == 0 && j == 0) continue;
2315 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2316 uc16 quark = quarks[j];
2317 if (elm.atom()->ignore_case()) {
2318 // Everywhere else we assume that a non-Latin-1 character cannot match
2319 // a Latin-1 character. Avoid the cases where this is assumption is
2320 // invalid by using the Latin1 equivalent instead.
2321 quark = unibrow::Latin1::TryConvertToLatin1(quark);
2322 }
2323 bool needs_bounds_check =
2324 *checked_up_to < cp_offset + j || read_backward();
2325 bool bounds_checked = false;
2326 switch (pass) {
2327 case NON_LATIN1_MATCH:
2328 DCHECK(one_byte);
2329 if (quark > String::kMaxOneByteCharCode) {
2330 assembler->GoTo(backtrack);
2331 return;
2332 }
2333 break;
2334 case NON_LETTER_CHARACTER_MATCH:
2335 bounds_checked =
2336 EmitAtomNonLetter(isolate, compiler, quark, backtrack,
2337 cp_offset + j, needs_bounds_check, preloaded);
2338 break;
2339 case SIMPLE_CHARACTER_MATCH:
2340 bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
2341 backtrack, cp_offset + j,
2342 needs_bounds_check, preloaded);
2343 break;
2344 case CASE_CHARACTER_MATCH:
2345 bounds_checked =
2346 EmitAtomLetter(isolate, compiler, quark, backtrack,
2347 cp_offset + j, needs_bounds_check, preloaded);
2348 break;
2349 default:
2350 break;
2351 }
2352 if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2353 }
2354 } else {
2355 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
2356 if (pass == CHARACTER_CLASS_MATCH) {
2357 if (first_element_checked && i == 0) continue;
2358 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2359 RegExpCharacterClass* cc = elm.char_class();
2360 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2361 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2362 bounds_check, preloaded, zone());
2363 UpdateBoundsCheck(cp_offset, checked_up_to);
2364 }
2365 }
2366 }
2367 }
2368
Length()2369 int TextNode::Length() {
2370 TextElement elm = elements()->last();
2371 DCHECK_LE(0, elm.cp_offset());
2372 return elm.cp_offset() + elm.length();
2373 }
2374
SkipPass(TextEmitPassType pass,bool ignore_case)2375 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
2376 if (ignore_case) {
2377 return pass == SIMPLE_CHARACTER_MATCH;
2378 } else {
2379 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2380 }
2381 }
2382
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)2383 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
2384 ZoneList<CharacterRange>* ranges,
2385 bool read_backward,
2386 RegExpNode* on_success,
2387 JSRegExp::Flags flags) {
2388 DCHECK_NOT_NULL(ranges);
2389 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(1, zone);
2390 elms->Add(TextElement::CharClass(
2391 zone->New<RegExpCharacterClass>(zone, ranges, flags)),
2392 zone);
2393 return zone->New<TextNode>(elms, read_backward, on_success);
2394 }
2395
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)2396 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
2397 CharacterRange trail,
2398 bool read_backward,
2399 RegExpNode* on_success,
2400 JSRegExp::Flags flags) {
2401 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
2402 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
2403 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2404 elms->Add(TextElement::CharClass(
2405 zone->New<RegExpCharacterClass>(zone, lead_ranges, flags)),
2406 zone);
2407 elms->Add(TextElement::CharClass(
2408 zone->New<RegExpCharacterClass>(zone, trail_ranges, flags)),
2409 zone);
2410 return zone->New<TextNode>(elms, read_backward, on_success);
2411 }
2412
2413 // This generates the code to match a text node. A text node can contain
2414 // straight character sequences (possibly to be matched in a case-independent
2415 // way) and character classes. For efficiency we do not do this in a single
2416 // pass from left to right. Instead we pass over the text node several times,
2417 // emitting code for some character positions every time. See the comment on
2418 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2419 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2420 LimitResult limit_result = LimitVersions(compiler, trace);
2421 if (limit_result == DONE) return;
2422 DCHECK(limit_result == CONTINUE);
2423
2424 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2425 compiler->SetRegExpTooBig();
2426 return;
2427 }
2428
2429 if (compiler->one_byte()) {
2430 int dummy = 0;
2431 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2432 }
2433
2434 bool first_elt_done = false;
2435 int bound_checked_to = trace->cp_offset() - 1;
2436 bound_checked_to += trace->bound_checked_up_to();
2437
2438 // If a character is preloaded into the current character register then
2439 // check that now.
2440 if (trace->characters_preloaded() == 1) {
2441 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2442 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2443 false, &bound_checked_to);
2444 }
2445 first_elt_done = true;
2446 }
2447
2448 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2449 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2450 first_elt_done, &bound_checked_to);
2451 }
2452
2453 Trace successor_trace(*trace);
2454 // If we advance backward, we may end up at the start.
2455 successor_trace.AdvanceCurrentPositionInTrace(
2456 read_backward() ? -Length() : Length(), compiler);
2457 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2458 : Trace::FALSE_VALUE);
2459 RecursionCheck rc(compiler);
2460 on_success()->Emit(compiler, &successor_trace);
2461 }
2462
InvalidateCurrentCharacter()2463 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
2464
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2465 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2466 // We don't have an instruction for shifting the current character register
2467 // down or for using a shifted value for anything so lets just forget that
2468 // we preloaded any characters into it.
2469 characters_preloaded_ = 0;
2470 // Adjust the offsets of the quick check performed information. This
2471 // information is used to find out what we already determined about the
2472 // characters by means of mask and compare.
2473 quick_check_performed_.Advance(by, compiler->one_byte());
2474 cp_offset_ += by;
2475 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2476 compiler->SetRegExpTooBig();
2477 cp_offset_ = 0;
2478 }
2479 bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
2480 }
2481
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)2482 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
2483 int element_count = elements()->length();
2484 for (int i = 0; i < element_count; i++) {
2485 TextElement elm = elements()->at(i);
2486 if (elm.text_type() == TextElement::CHAR_CLASS) {
2487 RegExpCharacterClass* cc = elm.char_class();
2488 #ifdef V8_INTL_SUPPORT
2489 bool case_equivalents_already_added =
2490 NeedsUnicodeCaseEquivalents(cc->flags());
2491 #else
2492 bool case_equivalents_already_added = false;
2493 #endif
2494 if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
2495 // None of the standard character classes is different in the case
2496 // independent case and it slows us down if we don't know that.
2497 if (cc->is_standard(zone())) continue;
2498 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2499 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
2500 is_one_byte);
2501 }
2502 }
2503 }
2504 }
2505
GreedyLoopTextLength()2506 int TextNode::GreedyLoopTextLength() { return Length(); }
2507
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)2508 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2509 RegExpCompiler* compiler) {
2510 if (read_backward()) return nullptr;
2511 if (elements()->length() != 1) return nullptr;
2512 TextElement elm = elements()->at(0);
2513 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2514 RegExpCharacterClass* node = elm.char_class();
2515 ZoneList<CharacterRange>* ranges = node->ranges(zone());
2516 CharacterRange::Canonicalize(ranges);
2517 if (node->is_negated()) {
2518 return ranges->length() == 0 ? on_success() : nullptr;
2519 }
2520 if (ranges->length() != 1) return nullptr;
2521 const uc32 max_char = MaxCodeUnit(compiler->one_byte());
2522 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
2523 }
2524
2525 // Finds the fixed match length of a sequence of nodes that goes from
2526 // this alternative and back to this choice node. If there are variable
2527 // length nodes or other complications in the way then return a sentinel
2528 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)2529 int ChoiceNode::GreedyLoopTextLengthForAlternative(
2530 GuardedAlternative* alternative) {
2531 int length = 0;
2532 RegExpNode* node = alternative->node();
2533 // Later we will generate code for all these text nodes using recursion
2534 // so we have to limit the max number.
2535 int recursion_depth = 0;
2536 while (node != this) {
2537 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2538 return kNodeIsTooComplexForGreedyLoops;
2539 }
2540 int node_length = node->GreedyLoopTextLength();
2541 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2542 return kNodeIsTooComplexForGreedyLoops;
2543 }
2544 length += node_length;
2545 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2546 node = seq_node->on_success();
2547 }
2548 if (read_backward()) {
2549 length = -length;
2550 }
2551 // Check that we can jump by the whole text length. If not, return sentinel
2552 // to indicate the we can't construct a greedy loop.
2553 if (length < RegExpMacroAssembler::kMinCPOffset ||
2554 length > RegExpMacroAssembler::kMaxCPOffset) {
2555 return kNodeIsTooComplexForGreedyLoops;
2556 }
2557 return length;
2558 }
2559
AddLoopAlternative(GuardedAlternative alt)2560 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2561 DCHECK_NULL(loop_node_);
2562 AddAlternative(alt);
2563 loop_node_ = alt.node();
2564 }
2565
AddContinueAlternative(GuardedAlternative alt)2566 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2567 DCHECK_NULL(continue_node_);
2568 AddAlternative(alt);
2569 continue_node_ = alt.node();
2570 }
2571
Emit(RegExpCompiler * compiler,Trace * trace)2572 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2573 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2574 if (trace->stop_node() == this) {
2575 // Back edge of greedy optimized loop node graph.
2576 int text_length =
2577 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2578 DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
2579 // Update the counter-based backtracking info on the stack. This is an
2580 // optimization for greedy loops (see below).
2581 DCHECK(trace->cp_offset() == text_length);
2582 macro_assembler->AdvanceCurrentPosition(text_length);
2583 macro_assembler->GoTo(trace->loop_label());
2584 return;
2585 }
2586 DCHECK_NULL(trace->stop_node());
2587 if (!trace->is_trivial()) {
2588 trace->Flush(compiler, this);
2589 return;
2590 }
2591 ChoiceNode::Emit(compiler, trace);
2592 }
2593
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)2594 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2595 int eats_at_least) {
2596 int preload_characters = std::min(4, eats_at_least);
2597 DCHECK_LE(preload_characters, 4);
2598 if (compiler->macro_assembler()->CanReadUnaligned()) {
2599 bool one_byte = compiler->one_byte();
2600 if (one_byte) {
2601 // We can't preload 3 characters because there is no machine instruction
2602 // to do that. We can't just load 4 because we could be reading
2603 // beyond the end of the string, which could cause a memory fault.
2604 if (preload_characters == 3) preload_characters = 2;
2605 } else {
2606 if (preload_characters > 2) preload_characters = 2;
2607 }
2608 } else {
2609 if (preload_characters > 1) preload_characters = 1;
2610 }
2611 return preload_characters;
2612 }
2613
2614 // This class is used when generating the alternatives in a choice node. It
2615 // records the way the alternative is being code generated.
2616 class AlternativeGeneration : public Malloced {
2617 public:
AlternativeGeneration()2618 AlternativeGeneration()
2619 : possible_success(),
2620 expects_preload(false),
2621 after(),
2622 quick_check_details() {}
2623 Label possible_success;
2624 bool expects_preload;
2625 Label after;
2626 QuickCheckDetails quick_check_details;
2627 };
2628
2629 // Creates a list of AlternativeGenerations. If the list has a reasonable
2630 // size then it is on the stack, otherwise the excess is on the heap.
2631 class AlternativeGenerationList {
2632 public:
AlternativeGenerationList(int count,Zone * zone)2633 AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
2634 for (int i = 0; i < count && i < kAFew; i++) {
2635 alt_gens_.Add(a_few_alt_gens_ + i, zone);
2636 }
2637 for (int i = kAFew; i < count; i++) {
2638 alt_gens_.Add(new AlternativeGeneration(), zone);
2639 }
2640 }
~AlternativeGenerationList()2641 ~AlternativeGenerationList() {
2642 for (int i = kAFew; i < alt_gens_.length(); i++) {
2643 delete alt_gens_[i];
2644 alt_gens_[i] = nullptr;
2645 }
2646 }
2647
at(int i)2648 AlternativeGeneration* at(int i) { return alt_gens_[i]; }
2649
2650 private:
2651 static const int kAFew = 10;
2652 ZoneList<AlternativeGeneration*> alt_gens_;
2653 AlternativeGeneration a_few_alt_gens_[kAFew];
2654 };
2655
Set(int character)2656 void BoyerMoorePositionInfo::Set(int character) {
2657 SetInterval(Interval(character, character));
2658 }
2659
2660 namespace {
2661
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)2662 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
2663 int ranges_length, Interval new_range) {
2664 DCHECK_EQ(1, ranges_length & 1);
2665 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
2666 if (containment == kLatticeUnknown) return containment;
2667 bool inside = false;
2668 int last = 0;
2669 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2670 // Consider the range from last to ranges[i].
2671 // We haven't got to the new range yet.
2672 if (ranges[i] <= new_range.from()) continue;
2673 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
2674 // inclusive, but the values in ranges are not.
2675 if (last <= new_range.from() && new_range.to() < ranges[i]) {
2676 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2677 }
2678 return kLatticeUnknown;
2679 }
2680 return containment;
2681 }
2682
BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset)2683 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
2684 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
2685 2 * kInt64Size * kBitsPerByte);
2686
2687 // Slight fiddling is needed here, since the bitset is of length 128 while
2688 // CountTrailingZeros requires an integral type and std::bitset can only
2689 // convert to unsigned long long. So we handle the most- and least-significant
2690 // bits separately.
2691
2692 {
2693 static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
2694 BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
2695 STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
2696 uint64_t lsb = masked_bitset.to_ullong();
2697 if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
2698 }
2699
2700 {
2701 BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
2702 uint64_t msb = masked_bitset.to_ullong();
2703 if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
2704 }
2705
2706 return -1;
2707 }
2708
2709 } // namespace
2710
SetInterval(const Interval & interval)2711 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2712 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2713
2714 if (interval.size() >= kMapSize) {
2715 map_count_ = kMapSize;
2716 map_.set();
2717 return;
2718 }
2719
2720 for (int i = interval.from(); i <= interval.to(); i++) {
2721 int mod_character = (i & kMask);
2722 if (!map_[mod_character]) {
2723 map_count_++;
2724 map_.set(mod_character);
2725 }
2726 if (map_count_ == kMapSize) return;
2727 }
2728 }
2729
SetAll()2730 void BoyerMoorePositionInfo::SetAll() {
2731 w_ = kLatticeUnknown;
2732 if (map_count_ != kMapSize) {
2733 map_count_ = kMapSize;
2734 map_.set();
2735 }
2736 }
2737
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)2738 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
2739 Zone* zone)
2740 : length_(length),
2741 compiler_(compiler),
2742 max_char_(MaxCodeUnit(compiler->one_byte())) {
2743 bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
2744 for (int i = 0; i < length; i++) {
2745 bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
2746 }
2747 }
2748
2749 // Find the longest range of lookahead that has the fewest number of different
2750 // characters that can occur at a given position. Since we are optimizing two
2751 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)2752 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2753 int biggest_points = 0;
2754 // If more than 32 characters out of 128 can occur it is unlikely that we can
2755 // be lucky enough to step forwards much of the time.
2756 const int kMaxMax = 32;
2757 for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2758 max_number_of_chars *= 2) {
2759 biggest_points =
2760 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2761 }
2762 if (biggest_points == 0) return false;
2763 return true;
2764 }
2765
2766 // Find the highest-points range between 0 and length_ where the character
2767 // information is not too vague. 'Too vague' means that there are more than
2768 // max_number_of_chars that can occur at this position. Calculates the number
2769 // of points as the product of width-of-the-range and
2770 // probability-of-finding-one-of-the-characters, where the probability is
2771 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)2772 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
2773 int old_biggest_points, int* from,
2774 int* to) {
2775 int biggest_points = old_biggest_points;
2776 static const int kSize = RegExpMacroAssembler::kTableSize;
2777 for (int i = 0; i < length_;) {
2778 while (i < length_ && Count(i) > max_number_of_chars) i++;
2779 if (i == length_) break;
2780 int remembered_from = i;
2781
2782 BoyerMoorePositionInfo::Bitset union_bitset;
2783 for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
2784 union_bitset |= bitmaps_->at(i)->raw_bitset();
2785 }
2786
2787 int frequency = 0;
2788
2789 // Iterate only over set bits.
2790 int j;
2791 while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
2792 DCHECK(union_bitset[j]); // Sanity check.
2793 // Add 1 to the frequency to give a small per-character boost for
2794 // the cases where our sampling is not good enough and many
2795 // characters have a frequency of zero. This means the frequency
2796 // can theoretically be up to 2*kSize though we treat it mostly as
2797 // a fraction of kSize.
2798 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2799 union_bitset.reset(j);
2800 }
2801
2802 // We use the probability of skipping times the distance we are skipping to
2803 // judge the effectiveness of this. Actually we have a cut-off: By
2804 // dividing by 2 we switch off the skipping if the probability of skipping
2805 // is less than 50%. This is because the multibyte mask-and-compare
2806 // skipping in quickcheck is more likely to do well on this case.
2807 bool in_quickcheck_range =
2808 ((i - remembered_from < 4) ||
2809 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2810 // Called 'probability' but it is only a rough estimate and can actually
2811 // be outside the 0-kSize range.
2812 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2813 int points = (i - remembered_from) * probability;
2814 if (points > biggest_points) {
2815 *from = remembered_from;
2816 *to = i - 1;
2817 biggest_points = points;
2818 }
2819 }
2820 return biggest_points;
2821 }
2822
2823 // Take all the characters that will not prevent a successful match if they
2824 // occur in the subject string in the range between min_lookahead and
2825 // max_lookahead (inclusive) measured from the current position. If the
2826 // character at max_lookahead offset is not one of these characters, then we
2827 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)2828 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
2829 Handle<ByteArray> boolean_skip_table) {
2830 const int kSkipArrayEntry = 0;
2831 const int kDontSkipArrayEntry = 1;
2832
2833 std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
2834 boolean_skip_table->length());
2835
2836 for (int i = max_lookahead; i >= min_lookahead; i--) {
2837 BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
2838
2839 // Iterate only over set bits.
2840 int j;
2841 while ((j = BitsetFirstSetBit(bitset)) != -1) {
2842 DCHECK(bitset[j]); // Sanity check.
2843 boolean_skip_table->set(j, kDontSkipArrayEntry);
2844 bitset.reset(j);
2845 }
2846 }
2847
2848 const int skip = max_lookahead + 1 - min_lookahead;
2849 return skip;
2850 }
2851
2852 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)2853 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2854 const int kSize = RegExpMacroAssembler::kTableSize;
2855
2856 int min_lookahead = 0;
2857 int max_lookahead = 0;
2858
2859 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2860
2861 // Check if we only have a single non-empty position info, and that info
2862 // contains precisely one character.
2863 bool found_single_character = false;
2864 int single_character = 0;
2865 for (int i = max_lookahead; i >= min_lookahead; i--) {
2866 BoyerMoorePositionInfo* map = bitmaps_->at(i);
2867 if (map->map_count() == 0) continue;
2868
2869 if (found_single_character || map->map_count() > 1) {
2870 found_single_character = false;
2871 break;
2872 }
2873
2874 DCHECK(!found_single_character);
2875 DCHECK_EQ(map->map_count(), 1);
2876
2877 found_single_character = true;
2878 single_character = BitsetFirstSetBit(map->raw_bitset());
2879
2880 DCHECK_NE(single_character, -1);
2881 }
2882
2883 int lookahead_width = max_lookahead + 1 - min_lookahead;
2884
2885 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2886 // The mask-compare can probably handle this better.
2887 return;
2888 }
2889
2890 if (found_single_character) {
2891 Label cont, again;
2892 masm->Bind(&again);
2893 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2894 if (max_char_ > kSize) {
2895 masm->CheckCharacterAfterAnd(single_character,
2896 RegExpMacroAssembler::kTableMask, &cont);
2897 } else {
2898 masm->CheckCharacter(single_character, &cont);
2899 }
2900 masm->AdvanceCurrentPosition(lookahead_width);
2901 masm->GoTo(&again);
2902 masm->Bind(&cont);
2903 return;
2904 }
2905
2906 Factory* factory = masm->isolate()->factory();
2907 Handle<ByteArray> boolean_skip_table =
2908 factory->NewByteArray(kSize, AllocationType::kOld);
2909 int skip_distance =
2910 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2911 DCHECK_NE(0, skip_distance);
2912
2913 Label cont, again;
2914 masm->Bind(&again);
2915 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2916 masm->CheckBitInTable(boolean_skip_table, &cont);
2917 masm->AdvanceCurrentPosition(skip_distance);
2918 masm->GoTo(&again);
2919 masm->Bind(&cont);
2920 }
2921
2922 /* Code generation for choice nodes.
2923 *
2924 * We generate quick checks that do a mask and compare to eliminate a
2925 * choice. If the quick check succeeds then it jumps to the continuation to
2926 * do slow checks and check subsequent nodes. If it fails (the common case)
2927 * it falls through to the next choice.
2928 *
2929 * Here is the desired flow graph. Nodes directly below each other imply
2930 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2931 * 3 doesn't have a quick check so we have to call the slow check.
2932 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2933 * regexp continuation is generated directly after the Sn node, up to the
2934 * next GoTo if we decide to reuse some already generated code. Some
2935 * nodes expect preload_characters to be preloaded into the current
2936 * character register. R nodes do this preloading. Vertices are marked
2937 * F for failures and S for success (possible success in the case of quick
2938 * nodes). L, V, < and > are used as arrow heads.
2939 *
2940 * ----------> R
2941 * |
2942 * V
2943 * Q1 -----> S1
2944 * | S /
2945 * F| /
2946 * | F/
2947 * | /
2948 * | R
2949 * | /
2950 * V L
2951 * Q2 -----> S2
2952 * | S /
2953 * F| /
2954 * | F/
2955 * | /
2956 * | R
2957 * | /
2958 * V L
2959 * S3
2960 * |
2961 * F|
2962 * |
2963 * R
2964 * |
2965 * backtrack V
2966 * <----------Q4
2967 * \ F |
2968 * \ |S
2969 * \ F V
2970 * \-----S4
2971 *
2972 * For greedy loops we push the current position, then generate the code that
2973 * eats the input specially in EmitGreedyLoop. The other choice (the
2974 * continuation) is generated by the normal code in EmitChoices, and steps back
2975 * in the input to the starting position when it fails to match. The loop code
2976 * looks like this (U is the unwind code that steps back in the greedy loop).
2977 *
2978 * _____
2979 * / \
2980 * V |
2981 * ----------> S1 |
2982 * /| |
2983 * / |S |
2984 * F/ \_____/
2985 * /
2986 * |<-----
2987 * | \
2988 * V |S
2989 * Q2 ---> U----->backtrack
2990 * | F /
2991 * S| /
2992 * V F /
2993 * S2--/
2994 */
2995
GreedyLoopState(bool not_at_start)2996 GreedyLoopState::GreedyLoopState(bool not_at_start) {
2997 counter_backtrack_trace_.set_backtrack(&label_);
2998 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
2999 }
3000
AssertGuardsMentionRegisters(Trace * trace)3001 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3002 #ifdef DEBUG
3003 int choice_count = alternatives_->length();
3004 for (int i = 0; i < choice_count - 1; i++) {
3005 GuardedAlternative alternative = alternatives_->at(i);
3006 ZoneList<Guard*>* guards = alternative.guards();
3007 int guard_count = (guards == nullptr) ? 0 : guards->length();
3008 for (int j = 0; j < guard_count; j++) {
3009 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3010 }
3011 }
3012 #endif
3013 }
3014
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3015 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
3016 PreloadState* state) {
3017 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3018 // Save some time by looking at most one machine word ahead.
3019 state->eats_at_least_ =
3020 EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
3021 }
3022 state->preload_characters_ =
3023 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3024
3025 state->preload_is_current_ =
3026 (current_trace->characters_preloaded() == state->preload_characters_);
3027 state->preload_has_checked_bounds_ = state->preload_is_current_;
3028 }
3029
Emit(RegExpCompiler * compiler,Trace * trace)3030 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3031 int choice_count = alternatives_->length();
3032
3033 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3034 alternatives_->at(0).node()->Emit(compiler, trace);
3035 return;
3036 }
3037
3038 AssertGuardsMentionRegisters(trace);
3039
3040 LimitResult limit_result = LimitVersions(compiler, trace);
3041 if (limit_result == DONE) return;
3042 DCHECK(limit_result == CONTINUE);
3043
3044 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3045 // other choice nodes we only flush if we are out of code size budget.
3046 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3047 trace->Flush(compiler, this);
3048 return;
3049 }
3050
3051 RecursionCheck rc(compiler);
3052
3053 PreloadState preload;
3054 preload.init();
3055 GreedyLoopState greedy_loop_state(not_at_start());
3056
3057 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3058 AlternativeGenerationList alt_gens(choice_count, zone());
3059
3060 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3061 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3062 &greedy_loop_state, text_length);
3063 } else {
3064 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3065 // match the traces produced pre-cleanup.
3066 Label second_choice;
3067 compiler->macro_assembler()->Bind(&second_choice);
3068
3069 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3070
3071 EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3072 }
3073
3074 // At this point we need to generate slow checks for the alternatives where
3075 // the quick check was inlined. We can recognize these because the associated
3076 // label was bound.
3077 int new_flush_budget = trace->flush_budget() / choice_count;
3078 for (int i = 0; i < choice_count; i++) {
3079 AlternativeGeneration* alt_gen = alt_gens.at(i);
3080 Trace new_trace(*trace);
3081 // If there are actions to be flushed we have to limit how many times
3082 // they are flushed. Take the budget of the parent trace and distribute
3083 // it fairly amongst the children.
3084 if (new_trace.actions() != nullptr) {
3085 new_trace.set_flush_budget(new_flush_budget);
3086 }
3087 bool next_expects_preload =
3088 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3089 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
3090 alt_gen, preload.preload_characters_,
3091 next_expects_preload);
3092 }
3093 }
3094
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3095 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
3096 AlternativeGenerationList* alt_gens,
3097 PreloadState* preload,
3098 GreedyLoopState* greedy_loop_state,
3099 int text_length) {
3100 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3101 // Here we have special handling for greedy loops containing only text nodes
3102 // and other simple nodes. These are handled by pushing the current
3103 // position on the stack and then incrementing the current position each
3104 // time around the switch. On backtrack we decrement the current position
3105 // and check it against the pushed value. This avoids pushing backtrack
3106 // information for each iteration of the loop, which could take up a lot of
3107 // space.
3108 DCHECK(trace->stop_node() == nullptr);
3109 macro_assembler->PushCurrentPosition();
3110 Label greedy_match_failed;
3111 Trace greedy_match_trace;
3112 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3113 greedy_match_trace.set_backtrack(&greedy_match_failed);
3114 Label loop_label;
3115 macro_assembler->Bind(&loop_label);
3116 greedy_match_trace.set_stop_node(this);
3117 greedy_match_trace.set_loop_label(&loop_label);
3118 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3119 macro_assembler->Bind(&greedy_match_failed);
3120
3121 Label second_choice; // For use in greedy matches.
3122 macro_assembler->Bind(&second_choice);
3123
3124 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3125
3126 EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3127
3128 macro_assembler->Bind(greedy_loop_state->label());
3129 // If we have unwound to the bottom then backtrack.
3130 macro_assembler->CheckGreedyLoop(trace->backtrack());
3131 // Otherwise try the second priority at an earlier position.
3132 macro_assembler->AdvanceCurrentPosition(-text_length);
3133 macro_assembler->GoTo(&second_choice);
3134 return new_trace;
3135 }
3136
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)3137 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3138 Trace* trace) {
3139 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3140 if (alternatives_->length() != 2) return eats_at_least;
3141
3142 GuardedAlternative alt1 = alternatives_->at(1);
3143 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3144 return eats_at_least;
3145 }
3146 RegExpNode* eats_anything_node = alt1.node();
3147 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3148 return eats_at_least;
3149 }
3150
3151 // Really we should be creating a new trace when we execute this function,
3152 // but there is no need, because the code it generates cannot backtrack, and
3153 // we always arrive here with a trivial trace (since it's the entry to a
3154 // loop. That also implies that there are no preloaded characters, which is
3155 // good, because it means we won't be violating any assumptions by
3156 // overwriting those characters with new load instructions.
3157 DCHECK(trace->is_trivial());
3158
3159 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3160 Isolate* isolate = macro_assembler->isolate();
3161 // At this point we know that we are at a non-greedy loop that will eat
3162 // any character one at a time. Any non-anchored regexp has such a
3163 // loop prepended to it in order to find where it starts. We look for
3164 // a pattern of the form ...abc... where we can look 6 characters ahead
3165 // and step forwards 3 if the character is not one of abc. Abc need
3166 // not be atoms, they can be any reasonably limited character class or
3167 // small alternation.
3168 BoyerMooreLookahead* bm = bm_info(false);
3169 if (bm == nullptr) {
3170 eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
3171 if (eats_at_least >= 1) {
3172 bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
3173 GuardedAlternative alt0 = alternatives_->at(0);
3174 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
3175 }
3176 }
3177 if (bm != nullptr) {
3178 bm->EmitSkipInstructions(macro_assembler);
3179 }
3180 return eats_at_least;
3181 }
3182
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)3183 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3184 AlternativeGenerationList* alt_gens,
3185 int first_choice, Trace* trace,
3186 PreloadState* preload) {
3187 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3188 SetUpPreLoad(compiler, trace, preload);
3189
3190 // For now we just call all choices one after the other. The idea ultimately
3191 // is to use the Dispatch table to try only the relevant ones.
3192 int choice_count = alternatives_->length();
3193
3194 int new_flush_budget = trace->flush_budget() / choice_count;
3195
3196 for (int i = first_choice; i < choice_count; i++) {
3197 bool is_last = i == choice_count - 1;
3198 bool fall_through_on_failure = !is_last;
3199 GuardedAlternative alternative = alternatives_->at(i);
3200 AlternativeGeneration* alt_gen = alt_gens->at(i);
3201 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3202 ZoneList<Guard*>* guards = alternative.guards();
3203 int guard_count = (guards == nullptr) ? 0 : guards->length();
3204 Trace new_trace(*trace);
3205 new_trace.set_characters_preloaded(
3206 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3207 if (preload->preload_has_checked_bounds_) {
3208 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3209 }
3210 new_trace.quick_check_performed()->Clear();
3211 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3212 if (!is_last) {
3213 new_trace.set_backtrack(&alt_gen->after);
3214 }
3215 alt_gen->expects_preload = preload->preload_is_current_;
3216 bool generate_full_check_inline = false;
3217 if (compiler->optimize() &&
3218 try_to_emit_quick_check_for_alternative(i == 0) &&
3219 alternative.node()->EmitQuickCheck(
3220 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3221 &alt_gen->possible_success, &alt_gen->quick_check_details,
3222 fall_through_on_failure, this)) {
3223 // Quick check was generated for this choice.
3224 preload->preload_is_current_ = true;
3225 preload->preload_has_checked_bounds_ = true;
3226 // If we generated the quick check to fall through on possible success,
3227 // we now need to generate the full check inline.
3228 if (!fall_through_on_failure) {
3229 macro_assembler->Bind(&alt_gen->possible_success);
3230 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3231 new_trace.set_characters_preloaded(preload->preload_characters_);
3232 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3233 generate_full_check_inline = true;
3234 }
3235 } else if (alt_gen->quick_check_details.cannot_match()) {
3236 if (!fall_through_on_failure) {
3237 macro_assembler->GoTo(trace->backtrack());
3238 }
3239 continue;
3240 } else {
3241 // No quick check was generated. Put the full code here.
3242 // If this is not the first choice then there could be slow checks from
3243 // previous cases that go here when they fail. There's no reason to
3244 // insist that they preload characters since the slow check we are about
3245 // to generate probably can't use it.
3246 if (i != first_choice) {
3247 alt_gen->expects_preload = false;
3248 new_trace.InvalidateCurrentCharacter();
3249 }
3250 generate_full_check_inline = true;
3251 }
3252 if (generate_full_check_inline) {
3253 if (new_trace.actions() != nullptr) {
3254 new_trace.set_flush_budget(new_flush_budget);
3255 }
3256 for (int j = 0; j < guard_count; j++) {
3257 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3258 }
3259 alternative.node()->Emit(compiler, &new_trace);
3260 preload->preload_is_current_ = false;
3261 }
3262 macro_assembler->Bind(&alt_gen->after);
3263 }
3264 }
3265
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)3266 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3267 Trace* trace,
3268 GuardedAlternative alternative,
3269 AlternativeGeneration* alt_gen,
3270 int preload_characters,
3271 bool next_expects_preload) {
3272 if (!alt_gen->possible_success.is_linked()) return;
3273
3274 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3275 macro_assembler->Bind(&alt_gen->possible_success);
3276 Trace out_of_line_trace(*trace);
3277 out_of_line_trace.set_characters_preloaded(preload_characters);
3278 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3279 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3280 ZoneList<Guard*>* guards = alternative.guards();
3281 int guard_count = (guards == nullptr) ? 0 : guards->length();
3282 if (next_expects_preload) {
3283 Label reload_current_char;
3284 out_of_line_trace.set_backtrack(&reload_current_char);
3285 for (int j = 0; j < guard_count; j++) {
3286 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3287 }
3288 alternative.node()->Emit(compiler, &out_of_line_trace);
3289 macro_assembler->Bind(&reload_current_char);
3290 // Reload the current character, since the next quick check expects that.
3291 // We don't need to check bounds here because we only get into this
3292 // code through a quick check which already did the checked load.
3293 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3294 preload_characters);
3295 macro_assembler->GoTo(&(alt_gen->after));
3296 } else {
3297 out_of_line_trace.set_backtrack(&(alt_gen->after));
3298 for (int j = 0; j < guard_count; j++) {
3299 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3300 }
3301 alternative.node()->Emit(compiler, &out_of_line_trace);
3302 }
3303 }
3304
Emit(RegExpCompiler * compiler,Trace * trace)3305 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3306 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3307 LimitResult limit_result = LimitVersions(compiler, trace);
3308 if (limit_result == DONE) return;
3309 DCHECK(limit_result == CONTINUE);
3310
3311 RecursionCheck rc(compiler);
3312
3313 switch (action_type_) {
3314 case STORE_POSITION: {
3315 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3316 data_.u_position_register.is_capture,
3317 trace);
3318 Trace new_trace = *trace;
3319 new_trace.add_action(&new_capture);
3320 on_success()->Emit(compiler, &new_trace);
3321 break;
3322 }
3323 case INCREMENT_REGISTER: {
3324 Trace::DeferredIncrementRegister new_increment(
3325 data_.u_increment_register.reg);
3326 Trace new_trace = *trace;
3327 new_trace.add_action(&new_increment);
3328 on_success()->Emit(compiler, &new_trace);
3329 break;
3330 }
3331 case SET_REGISTER_FOR_LOOP: {
3332 Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
3333 data_.u_store_register.value);
3334 Trace new_trace = *trace;
3335 new_trace.add_action(&new_set);
3336 on_success()->Emit(compiler, &new_trace);
3337 break;
3338 }
3339 case CLEAR_CAPTURES: {
3340 Trace::DeferredClearCaptures new_capture(Interval(
3341 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3342 Trace new_trace = *trace;
3343 new_trace.add_action(&new_capture);
3344 on_success()->Emit(compiler, &new_trace);
3345 break;
3346 }
3347 case BEGIN_POSITIVE_SUBMATCH:
3348 case BEGIN_NEGATIVE_SUBMATCH:
3349 if (!trace->is_trivial()) {
3350 trace->Flush(compiler, this);
3351 } else {
3352 assembler->WriteCurrentPositionToRegister(
3353 data_.u_submatch.current_position_register, 0);
3354 assembler->WriteStackPointerToRegister(
3355 data_.u_submatch.stack_pointer_register);
3356 on_success()->Emit(compiler, trace);
3357 }
3358 break;
3359 case EMPTY_MATCH_CHECK: {
3360 int start_pos_reg = data_.u_empty_match_check.start_register;
3361 int stored_pos = 0;
3362 int rep_reg = data_.u_empty_match_check.repetition_register;
3363 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3364 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3365 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3366 // If we know we haven't advanced and there is no minimum we
3367 // can just backtrack immediately.
3368 assembler->GoTo(trace->backtrack());
3369 } else if (know_dist && stored_pos < trace->cp_offset()) {
3370 // If we know we've advanced we can generate the continuation
3371 // immediately.
3372 on_success()->Emit(compiler, trace);
3373 } else if (!trace->is_trivial()) {
3374 trace->Flush(compiler, this);
3375 } else {
3376 Label skip_empty_check;
3377 // If we have a minimum number of repetitions we check the current
3378 // number first and skip the empty check if it's not enough.
3379 if (has_minimum) {
3380 int limit = data_.u_empty_match_check.repetition_limit;
3381 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3382 }
3383 // If the match is empty we bail out, otherwise we fall through
3384 // to the on-success continuation.
3385 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3386 trace->backtrack());
3387 assembler->Bind(&skip_empty_check);
3388 on_success()->Emit(compiler, trace);
3389 }
3390 break;
3391 }
3392 case POSITIVE_SUBMATCH_SUCCESS: {
3393 if (!trace->is_trivial()) {
3394 trace->Flush(compiler, this);
3395 return;
3396 }
3397 assembler->ReadCurrentPositionFromRegister(
3398 data_.u_submatch.current_position_register);
3399 assembler->ReadStackPointerFromRegister(
3400 data_.u_submatch.stack_pointer_register);
3401 int clear_register_count = data_.u_submatch.clear_register_count;
3402 if (clear_register_count == 0) {
3403 on_success()->Emit(compiler, trace);
3404 return;
3405 }
3406 int clear_registers_from = data_.u_submatch.clear_register_from;
3407 Label clear_registers_backtrack;
3408 Trace new_trace = *trace;
3409 new_trace.set_backtrack(&clear_registers_backtrack);
3410 on_success()->Emit(compiler, &new_trace);
3411
3412 assembler->Bind(&clear_registers_backtrack);
3413 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3414 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3415
3416 DCHECK(trace->backtrack() == nullptr);
3417 assembler->Backtrack();
3418 return;
3419 }
3420 default:
3421 UNREACHABLE();
3422 }
3423 }
3424
Emit(RegExpCompiler * compiler,Trace * trace)3425 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3426 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3427 if (!trace->is_trivial()) {
3428 trace->Flush(compiler, this);
3429 return;
3430 }
3431
3432 LimitResult limit_result = LimitVersions(compiler, trace);
3433 if (limit_result == DONE) return;
3434 DCHECK(limit_result == CONTINUE);
3435
3436 RecursionCheck rc(compiler);
3437
3438 DCHECK_EQ(start_reg_ + 1, end_reg_);
3439 if (IgnoreCase(flags_)) {
3440 bool unicode = IsUnicode(flags_);
3441 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
3442 unicode, trace->backtrack());
3443 } else {
3444 assembler->CheckNotBackReference(start_reg_, read_backward(),
3445 trace->backtrack());
3446 }
3447 // We are going to advance backward, so we may end up at the start.
3448 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3449
3450 // Check that the back reference does not end inside a surrogate pair.
3451 if (IsUnicode(flags_) && !compiler->one_byte()) {
3452 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3453 }
3454 on_success()->Emit(compiler, trace);
3455 }
3456
CalculateOffsets()3457 void TextNode::CalculateOffsets() {
3458 int element_count = elements()->length();
3459 // Set up the offsets of the elements relative to the start. This is a fixed
3460 // quantity since a TextNode can only contain fixed-width things.
3461 int cp_offset = 0;
3462 for (int i = 0; i < element_count; i++) {
3463 TextElement& elm = elements()->at(i);
3464 elm.set_cp_offset(cp_offset);
3465 cp_offset += elm.length();
3466 }
3467 }
3468
3469 namespace {
3470
3471 // Assertion propagation moves information about assertions such as
3472 // \b to the affected nodes. For instance, in /.\b./ information must
3473 // be propagated to the first '.' that whatever follows needs to know
3474 // if it matched a word or a non-word, and to the second '.' that it
3475 // has to check if it succeeds a word or non-word. In this case the
3476 // result will be something like:
3477 //
3478 // +-------+ +------------+
3479 // | . | | . |
3480 // +-------+ ---> +------------+
3481 // | word? | | check word |
3482 // +-------+ +------------+
3483 class AssertionPropagator : public AllStatic {
3484 public:
VisitText(TextNode * that)3485 static void VisitText(TextNode* that) {}
3486
VisitAction(ActionNode * that)3487 static void VisitAction(ActionNode* that) {
3488 // If the next node is interested in what it follows then this node
3489 // has to be interested too so it can pass the information on.
3490 that->info()->AddFromFollowing(that->on_success()->info());
3491 }
3492
VisitChoice(ChoiceNode * that,int i)3493 static void VisitChoice(ChoiceNode* that, int i) {
3494 // Anything the following nodes need to know has to be known by
3495 // this node also, so it can pass it on.
3496 that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
3497 }
3498
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3499 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3500 that->info()->AddFromFollowing(that->continue_node()->info());
3501 }
3502
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3503 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
3504 that->info()->AddFromFollowing(that->loop_node()->info());
3505 }
3506
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3507 static void VisitNegativeLookaroundChoiceLookaroundNode(
3508 NegativeLookaroundChoiceNode* that) {
3509 VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
3510 }
3511
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3512 static void VisitNegativeLookaroundChoiceContinueNode(
3513 NegativeLookaroundChoiceNode* that) {
3514 VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
3515 }
3516
VisitBackReference(BackReferenceNode * that)3517 static void VisitBackReference(BackReferenceNode* that) {}
3518
VisitAssertion(AssertionNode * that)3519 static void VisitAssertion(AssertionNode* that) {}
3520 };
3521
3522 // Propagates information about the minimum size of successful matches from
3523 // successor nodes to their predecessors. Note that all eats_at_least values
3524 // are initialized to zero before analysis.
3525 class EatsAtLeastPropagator : public AllStatic {
3526 public:
VisitText(TextNode * that)3527 static void VisitText(TextNode* that) {
3528 // The eats_at_least value is not used if reading backward.
3529 if (!that->read_backward()) {
3530 // We are not at the start after this node, and thus we can use the
3531 // successor's eats_at_least_from_not_start value.
3532 uint8_t eats_at_least = base::saturated_cast<uint8_t>(
3533 that->Length() + that->on_success()
3534 ->eats_at_least_info()
3535 ->eats_at_least_from_not_start);
3536 that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
3537 }
3538 }
3539
VisitAction(ActionNode * that)3540 static void VisitAction(ActionNode* that) {
3541 switch (that->action_type()) {
3542 case ActionNode::BEGIN_POSITIVE_SUBMATCH:
3543 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3544 // We do not propagate eats_at_least data through positive lookarounds,
3545 // because they rewind input.
3546 // TODO(v8:11859) Potential approaches for fixing this include:
3547 // 1. Add a dedicated choice node for positive lookaround, similar to
3548 // NegativeLookaroundChoiceNode.
3549 // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which
3550 // is <= eats_at_least_from_possibly_start, and use that value in
3551 // EatsAtLeastFromLoopEntry.
3552 DCHECK(that->eats_at_least_info()->IsZero());
3553 break;
3554 case ActionNode::SET_REGISTER_FOR_LOOP:
3555 // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the
3556 // loop body will run at least the minimum number of times before the
3557 // continuation case can run.
3558 that->set_eats_at_least_info(
3559 that->on_success()->EatsAtLeastFromLoopEntry());
3560 break;
3561 case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
3562 default:
3563 // Otherwise, the current node eats at least as much as its successor.
3564 // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH
3565 // because NegativeLookaroundChoiceNode ignores its lookaround successor
3566 // when computing eats-at-least and quick check information.
3567 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3568 break;
3569 }
3570 }
3571
VisitChoice(ChoiceNode * that,int i)3572 static void VisitChoice(ChoiceNode* that, int i) {
3573 // The minimum possible match from a choice node is the minimum of its
3574 // successors.
3575 EatsAtLeastInfo eats_at_least =
3576 i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
3577 eats_at_least.SetMin(
3578 *that->alternatives()->at(i).node()->eats_at_least_info());
3579 that->set_eats_at_least_info(eats_at_least);
3580 }
3581
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3582 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3583 if (!that->read_backward()) {
3584 that->set_eats_at_least_info(
3585 *that->continue_node()->eats_at_least_info());
3586 }
3587 }
3588
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3589 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
3590
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3591 static void VisitNegativeLookaroundChoiceLookaroundNode(
3592 NegativeLookaroundChoiceNode* that) {}
3593
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3594 static void VisitNegativeLookaroundChoiceContinueNode(
3595 NegativeLookaroundChoiceNode* that) {
3596 that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3597 }
3598
VisitBackReference(BackReferenceNode * that)3599 static void VisitBackReference(BackReferenceNode* that) {
3600 if (!that->read_backward()) {
3601 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3602 }
3603 }
3604
VisitAssertion(AssertionNode * that)3605 static void VisitAssertion(AssertionNode* that) {
3606 EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
3607 if (that->assertion_type() == AssertionNode::AT_START) {
3608 // If we know we are not at the start and we are asked "how many
3609 // characters will you match if you succeed?" then we can answer anything
3610 // since false implies false. So let's just set the max answer
3611 // (UINT8_MAX) since that won't prevent us from preloading a lot of
3612 // characters for the other branches in the node graph.
3613 eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
3614 }
3615 that->set_eats_at_least_info(eats_at_least);
3616 }
3617 };
3618
3619 } // namespace
3620
3621 // -------------------------------------------------------------------
3622 // Analysis
3623
3624 // Iterates the node graph and provides the opportunity for propagators to set
3625 // values that depend on successor nodes.
3626 template <typename... Propagators>
3627 class Analysis : public NodeVisitor {
3628 public:
Analysis(Isolate * isolate,bool is_one_byte)3629 Analysis(Isolate* isolate, bool is_one_byte)
3630 : isolate_(isolate),
3631 is_one_byte_(is_one_byte),
3632 error_(RegExpError::kNone) {}
3633
EnsureAnalyzed(RegExpNode * that)3634 void EnsureAnalyzed(RegExpNode* that) {
3635 StackLimitCheck check(isolate());
3636 if (check.HasOverflowed()) {
3637 if (FLAG_correctness_fuzzer_suppressions) {
3638 FATAL("Analysis: Aborting on stack overflow");
3639 }
3640 fail(RegExpError::kAnalysisStackOverflow);
3641 return;
3642 }
3643 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
3644 that->info()->being_analyzed = true;
3645 that->Accept(this);
3646 that->info()->being_analyzed = false;
3647 that->info()->been_analyzed = true;
3648 }
3649
has_failed()3650 bool has_failed() { return error_ != RegExpError::kNone; }
error()3651 RegExpError error() {
3652 DCHECK(error_ != RegExpError::kNone);
3653 return error_;
3654 }
fail(RegExpError error)3655 void fail(RegExpError error) { error_ = error; }
3656
isolate() const3657 Isolate* isolate() const { return isolate_; }
3658
VisitEnd(EndNode * that)3659 void VisitEnd(EndNode* that) override {
3660 // nothing to do
3661 }
3662
3663 // Used to call the given static function on each propagator / variadic template
3664 // argument.
3665 #define STATIC_FOR_EACH(expr) \
3666 do { \
3667 int dummy[] = {((expr), 0)...}; \
3668 USE(dummy); \
3669 } while (false)
3670
VisitText(TextNode * that)3671 void VisitText(TextNode* that) override {
3672 that->MakeCaseIndependent(isolate(), is_one_byte_);
3673 EnsureAnalyzed(that->on_success());
3674 if (has_failed()) return;
3675 that->CalculateOffsets();
3676 STATIC_FOR_EACH(Propagators::VisitText(that));
3677 }
3678
VisitAction(ActionNode * that)3679 void VisitAction(ActionNode* that) override {
3680 EnsureAnalyzed(that->on_success());
3681 if (has_failed()) return;
3682 STATIC_FOR_EACH(Propagators::VisitAction(that));
3683 }
3684
VisitChoice(ChoiceNode * that)3685 void VisitChoice(ChoiceNode* that) override {
3686 for (int i = 0; i < that->alternatives()->length(); i++) {
3687 EnsureAnalyzed(that->alternatives()->at(i).node());
3688 if (has_failed()) return;
3689 STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
3690 }
3691 }
3692
VisitLoopChoice(LoopChoiceNode * that)3693 void VisitLoopChoice(LoopChoiceNode* that) override {
3694 DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue.
3695
3696 // First propagate all information from the continuation node.
3697 EnsureAnalyzed(that->continue_node());
3698 if (has_failed()) return;
3699 STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
3700
3701 // Check the loop last since it may need the value of this node
3702 // to get a correct result.
3703 EnsureAnalyzed(that->loop_node());
3704 if (has_failed()) return;
3705 STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
3706 }
3707
VisitNegativeLookaroundChoice(NegativeLookaroundChoiceNode * that)3708 void VisitNegativeLookaroundChoice(
3709 NegativeLookaroundChoiceNode* that) override {
3710 DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue.
3711
3712 EnsureAnalyzed(that->lookaround_node());
3713 if (has_failed()) return;
3714 STATIC_FOR_EACH(
3715 Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
3716
3717 EnsureAnalyzed(that->continue_node());
3718 if (has_failed()) return;
3719 STATIC_FOR_EACH(
3720 Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
3721 }
3722
VisitBackReference(BackReferenceNode * that)3723 void VisitBackReference(BackReferenceNode* that) override {
3724 EnsureAnalyzed(that->on_success());
3725 if (has_failed()) return;
3726 STATIC_FOR_EACH(Propagators::VisitBackReference(that));
3727 }
3728
VisitAssertion(AssertionNode * that)3729 void VisitAssertion(AssertionNode* that) override {
3730 EnsureAnalyzed(that->on_success());
3731 if (has_failed()) return;
3732 STATIC_FOR_EACH(Propagators::VisitAssertion(that));
3733 }
3734
3735 #undef STATIC_FOR_EACH
3736
3737 private:
3738 Isolate* isolate_;
3739 bool is_one_byte_;
3740 RegExpError error_;
3741
3742 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
3743 };
3744
AnalyzeRegExp(Isolate * isolate,bool is_one_byte,RegExpNode * node)3745 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
3746 RegExpNode* node) {
3747 Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
3748 is_one_byte);
3749 DCHECK_EQ(node->info()->been_analyzed, false);
3750 analysis.EnsureAnalyzed(node);
3751 DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
3752 return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
3753 }
3754
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3755 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3756 BoyerMooreLookahead* bm,
3757 bool not_at_start) {
3758 // Working out the set of characters that a backreference can match is too
3759 // hard, so we just say that any character can match.
3760 bm->SetRest(offset);
3761 SaveBMInfo(bm, not_at_start, offset);
3762 }
3763
3764 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
3765 RegExpMacroAssembler::kTableSize);
3766
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3767 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3768 BoyerMooreLookahead* bm, bool not_at_start) {
3769 ZoneList<GuardedAlternative>* alts = alternatives();
3770 budget = (budget - 1) / alts->length();
3771 for (int i = 0; i < alts->length(); i++) {
3772 GuardedAlternative& alt = alts->at(i);
3773 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
3774 bm->SetRest(offset); // Give up trying to fill in info.
3775 SaveBMInfo(bm, not_at_start, offset);
3776 return;
3777 }
3778 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
3779 }
3780 SaveBMInfo(bm, not_at_start, offset);
3781 }
3782
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3783 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
3784 BoyerMooreLookahead* bm, bool not_at_start) {
3785 if (initial_offset >= bm->length()) return;
3786 int offset = initial_offset;
3787 int max_char = bm->max_char();
3788 for (int i = 0; i < elements()->length(); i++) {
3789 if (offset >= bm->length()) {
3790 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3791 return;
3792 }
3793 TextElement text = elements()->at(i);
3794 if (text.text_type() == TextElement::ATOM) {
3795 RegExpAtom* atom = text.atom();
3796 for (int j = 0; j < atom->length(); j++, offset++) {
3797 if (offset >= bm->length()) {
3798 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3799 return;
3800 }
3801 uc16 character = atom->data()[j];
3802 if (IgnoreCase(atom->flags())) {
3803 unibrow::uchar chars[4];
3804 int length = GetCaseIndependentLetters(
3805 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
3806 chars, 4);
3807 for (int j = 0; j < length; j++) {
3808 bm->Set(offset, chars[j]);
3809 }
3810 } else {
3811 if (character <= max_char) bm->Set(offset, character);
3812 }
3813 }
3814 } else {
3815 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
3816 RegExpCharacterClass* char_class = text.char_class();
3817 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
3818 if (char_class->is_negated()) {
3819 bm->SetAll(offset);
3820 } else {
3821 for (int k = 0; k < ranges->length(); k++) {
3822 CharacterRange& range = ranges->at(k);
3823 if (static_cast<int>(range.from()) > max_char) continue;
3824 int to = std::min(max_char, static_cast<int>(range.to()));
3825 bm->SetInterval(offset, Interval(range.from(), to));
3826 }
3827 }
3828 offset++;
3829 }
3830 }
3831 if (offset >= bm->length()) {
3832 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3833 return;
3834 }
3835 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
3836 true); // Not at start after a text node.
3837 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3838 }
3839
OptionallyStepBackToLeadSurrogate(RegExpNode * on_success,JSRegExp::Flags flags)3840 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
3841 RegExpNode* on_success, JSRegExp::Flags flags) {
3842 DCHECK(!read_backward());
3843 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3844 zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3845 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3846 zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3847
3848 ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
3849
3850 int stack_register = UnicodeLookaroundStackRegister();
3851 int position_register = UnicodeLookaroundPositionRegister();
3852 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
3853 zone(), lead_surrogates, true, on_success, flags);
3854 RegExpLookaround::Builder builder(true, step_back, stack_register,
3855 position_register);
3856 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
3857 zone(), trail_surrogates, false, builder.on_match_success(), flags);
3858
3859 optional_step_back->AddAlternative(
3860 GuardedAlternative(builder.ForMatch(match_trail)));
3861 optional_step_back->AddAlternative(GuardedAlternative(on_success));
3862
3863 return optional_step_back;
3864 }
3865
PreprocessRegExp(RegExpCompileData * data,JSRegExp::Flags flags,bool is_one_byte)3866 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
3867 JSRegExp::Flags flags,
3868 bool is_one_byte) {
3869 // Wrap the body of the regexp in capture #0.
3870 RegExpNode* captured_body =
3871 RegExpCapture::ToNode(data->tree, 0, this, accept());
3872 RegExpNode* node = captured_body;
3873 if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
3874 // Add a .*? at the beginning, outside the body capture, unless
3875 // this expression is anchored at the beginning or sticky.
3876 JSRegExp::Flags default_flags = JSRegExp::Flags();
3877 RegExpNode* loop_node = RegExpQuantifier::ToNode(
3878 0, RegExpTree::kInfinity, false,
3879 zone()->New<RegExpCharacterClass>('*', default_flags), this,
3880 captured_body, data->contains_anchor);
3881
3882 if (data->contains_anchor) {
3883 // Unroll loop once, to take care of the case that might start
3884 // at the start of input.
3885 ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
3886 first_step_node->AddAlternative(GuardedAlternative(captured_body));
3887 first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
3888 zone()->New<RegExpCharacterClass>('*', default_flags), false,
3889 loop_node)));
3890 node = first_step_node;
3891 } else {
3892 node = loop_node;
3893 }
3894 }
3895 if (is_one_byte) {
3896 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
3897 // Do it again to propagate the new nodes to places where they were not
3898 // put because they had not been calculated yet.
3899 if (node != nullptr) {
3900 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
3901 }
3902 } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
3903 node = OptionallyStepBackToLeadSurrogate(node, flags);
3904 }
3905
3906 if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
3907 return node;
3908 }
3909
3910 } // namespace internal
3911 } // namespace v8
3912