1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "irregexp/imported/regexp-compiler.h"
6 
7 #include "irregexp/imported/regexp.h"
8 #ifdef V8_INTL_SUPPORT
9 #include "irregexp/imported/special-case.h"
10 #endif  // V8_INTL_SUPPORT
11 
12 #ifdef V8_INTL_SUPPORT
13 #include "unicode/locid.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utypes.h"
16 #endif  // V8_INTL_SUPPORT
17 
18 namespace v8 {
19 namespace internal {
20 
21 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
22 
23 // -------------------------------------------------------------------
24 // Tree to graph conversion
25 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)26 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
27                                RegExpNode* on_success) {
28   ZoneList<TextElement>* elms =
29       compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
30   elms->Add(TextElement::Atom(this), compiler->zone());
31   return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
32                                          on_success);
33 }
34 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)35 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
36                                RegExpNode* on_success) {
37   return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
38                                          on_success);
39 }
40 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)41 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
42                                  const int* special_class, int length) {
43   length--;  // Remove final marker.
44   DCHECK_EQ(kRangeEndMarker, special_class[length]);
45   DCHECK_NE(0, ranges->length());
46   DCHECK_NE(0, length);
47   DCHECK_NE(0, special_class[0]);
48   if (ranges->length() != (length >> 1) + 1) {
49     return false;
50   }
51   CharacterRange range = ranges->at(0);
52   if (range.from() != 0) {
53     return false;
54   }
55   for (int i = 0; i < length; i += 2) {
56     if (static_cast<uc32>(special_class[i]) != (range.to() + 1)) {
57       return false;
58     }
59     range = ranges->at((i >> 1) + 1);
60     if (static_cast<uc32>(special_class[i + 1]) != range.from()) {
61       return false;
62     }
63   }
64   if (range.to() != String::kMaxCodePoint) {
65     return false;
66   }
67   return true;
68 }
69 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)70 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
71                           const int* special_class, int length) {
72   length--;  // Remove final marker.
73   DCHECK_EQ(kRangeEndMarker, special_class[length]);
74   if (ranges->length() * 2 != length) {
75     return false;
76   }
77   for (int i = 0; i < length; i += 2) {
78     CharacterRange range = ranges->at(i >> 1);
79     if (range.from() != static_cast<uc32>(special_class[i]) ||
80         range.to() != static_cast<uc32>(special_class[i + 1] - 1)) {
81       return false;
82     }
83   }
84   return true;
85 }
86 
is_standard(Zone * zone)87 bool RegExpCharacterClass::is_standard(Zone* zone) {
88   // TODO(lrn): Remove need for this function, by not throwing away information
89   // along the way.
90   if (is_negated()) {
91     return false;
92   }
93   if (set_.is_standard()) {
94     return true;
95   }
96   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
97     set_.set_standard_set_type('s');
98     return true;
99   }
100   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
101     set_.set_standard_set_type('S');
102     return true;
103   }
104   if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
105                            kLineTerminatorRangeCount)) {
106     set_.set_standard_set_type('.');
107     return true;
108   }
109   if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
110                     kLineTerminatorRangeCount)) {
111     set_.set_standard_set_type('n');
112     return true;
113   }
114   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
115     set_.set_standard_set_type('w');
116     return true;
117   }
118   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
119     set_.set_standard_set_type('W');
120     return true;
121   }
122   return false;
123 }
124 
UnicodeRangeSplitter(ZoneList<CharacterRange> * base)125 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
126   // The unicode range splitter categorizes given character ranges into:
127   // - Code points from the BMP representable by one code unit.
128   // - Code points outside the BMP that need to be split into surrogate pairs.
129   // - Lone lead surrogates.
130   // - Lone trail surrogates.
131   // Lone surrogates are valid code points, even though no actual characters.
132   // They require special matching to make sure we do not split surrogate pairs.
133 
134   for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
135 }
136 
AddRange(CharacterRange range)137 void UnicodeRangeSplitter::AddRange(CharacterRange range) {
138   static constexpr uc32 kBmp1Start = 0;
139   static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1;
140   static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1;
141   static constexpr uc32 kBmp2End = kNonBmpStart - 1;
142 
143   // Ends are all inclusive.
144   STATIC_ASSERT(kBmp1Start == 0);
145   STATIC_ASSERT(kBmp1Start < kBmp1End);
146   STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
147   STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
148   STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
149   STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
150   STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
151   STATIC_ASSERT(kBmp2Start < kBmp2End);
152   STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
153   STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
154 
155   static constexpr uc32 kStarts[] = {
156       kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
157       kBmp2Start, kNonBmpStart,
158   };
159 
160   static constexpr uc32 kEnds[] = {
161       kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
162   };
163 
164   CharacterRangeVector* const kTargets[] = {
165       &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
166   };
167 
168   static constexpr int kCount = arraysize(kStarts);
169   STATIC_ASSERT(kCount == arraysize(kEnds));
170   STATIC_ASSERT(kCount == arraysize(kTargets));
171 
172   for (int i = 0; i < kCount; i++) {
173     if (kStarts[i] > range.to()) break;
174     const uc32 from = std::max(kStarts[i], range.from());
175     const uc32 to = std::min(kEnds[i], range.to());
176     if (from > to) continue;
177     kTargets[i]->emplace_back(CharacterRange::Range(from, to));
178   }
179 }
180 
181 namespace {
182 
183 // Translates between new and old V8-isms (SmallVector, ZoneList).
ToCanonicalZoneList(const UnicodeRangeSplitter::CharacterRangeVector * v,Zone * zone)184 ZoneList<CharacterRange>* ToCanonicalZoneList(
185     const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
186   if (v->empty()) return nullptr;
187 
188   ZoneList<CharacterRange>* result =
189       zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
190   for (size_t i = 0; i < v->size(); i++) {
191     result->Add(v->at(i), zone);
192   }
193 
194   CharacterRange::Canonicalize(result);
195   return result;
196 }
197 
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)198 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
199                       RegExpNode* on_success, UnicodeRangeSplitter* splitter,
200                       JSRegExp::Flags flags) {
201   ZoneList<CharacterRange>* bmp =
202       ToCanonicalZoneList(splitter->bmp(), compiler->zone());
203   if (bmp == nullptr) return;
204   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
205       compiler->zone(), bmp, compiler->read_backward(), on_success, flags)));
206 }
207 
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)208 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
209                              RegExpNode* on_success,
210                              UnicodeRangeSplitter* splitter,
211                              JSRegExp::Flags flags) {
212   ZoneList<CharacterRange>* non_bmp =
213       ToCanonicalZoneList(splitter->non_bmp(), compiler->zone());
214   if (non_bmp == nullptr) return;
215   DCHECK(!compiler->one_byte());
216   Zone* zone = compiler->zone();
217   CharacterRange::Canonicalize(non_bmp);
218   for (int i = 0; i < non_bmp->length(); i++) {
219     // Match surrogate pair.
220     // E.g. [\u10005-\u11005] becomes
221     //      \ud800[\udc05-\udfff]|
222     //      [\ud801-\ud803][\udc00-\udfff]|
223     //      \ud804[\udc00-\udc05]
224     uc32 from = non_bmp->at(i).from();
225     uc32 to = non_bmp->at(i).to();
226     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
227     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
228     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
229     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
230     if (from_l == to_l) {
231       // The lead surrogate is the same.
232       result->AddAlternative(
233           GuardedAlternative(TextNode::CreateForSurrogatePair(
234               zone, CharacterRange::Singleton(from_l),
235               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
236               on_success, flags)));
237     } else {
238       if (from_t != kTrailSurrogateStart) {
239         // Add [from_l][from_t-\udfff]
240         result->AddAlternative(
241             GuardedAlternative(TextNode::CreateForSurrogatePair(
242                 zone, CharacterRange::Singleton(from_l),
243                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
244                 compiler->read_backward(), on_success, flags)));
245         from_l++;
246       }
247       if (to_t != kTrailSurrogateEnd) {
248         // Add [to_l][\udc00-to_t]
249         result->AddAlternative(
250             GuardedAlternative(TextNode::CreateForSurrogatePair(
251                 zone, CharacterRange::Singleton(to_l),
252                 CharacterRange::Range(kTrailSurrogateStart, to_t),
253                 compiler->read_backward(), on_success, flags)));
254         to_l--;
255       }
256       if (from_l <= to_l) {
257         // Add [from_l-to_l][\udc00-\udfff]
258         result->AddAlternative(
259             GuardedAlternative(TextNode::CreateForSurrogatePair(
260                 zone, CharacterRange::Range(from_l, to_l),
261                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
262                 compiler->read_backward(), on_success, flags)));
263       }
264     }
265   }
266 }
267 
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)268 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
269     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
270     ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
271     JSRegExp::Flags flags) {
272   Zone* zone = compiler->zone();
273   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
274       zone, match, read_backward, on_success, flags);
275   int stack_register = compiler->UnicodeLookaroundStackRegister();
276   int position_register = compiler->UnicodeLookaroundPositionRegister();
277   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
278                                        position_register);
279   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
280       zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
281   return lookaround.ForMatch(negative_match);
282 }
283 
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)284 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
285     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
286     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
287     bool read_backward, JSRegExp::Flags flags) {
288   Zone* zone = compiler->zone();
289   int stack_register = compiler->UnicodeLookaroundStackRegister();
290   int position_register = compiler->UnicodeLookaroundPositionRegister();
291   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
292                                        position_register);
293   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
294       zone, lookahead, read_backward, lookaround.on_match_success(), flags);
295   return TextNode::CreateForCharacterRanges(
296       zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
297 }
298 
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)299 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
300                            RegExpNode* on_success,
301                            UnicodeRangeSplitter* splitter,
302                            JSRegExp::Flags flags) {
303   ZoneList<CharacterRange>* lead_surrogates =
304       ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
305   if (lead_surrogates == nullptr) return;
306   Zone* zone = compiler->zone();
307   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
308   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
309       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
310 
311   RegExpNode* match;
312   if (compiler->read_backward()) {
313     // Reading backward. Assert that reading forward, there is no trail
314     // surrogate, and then backward match the lead surrogate.
315     match = NegativeLookaroundAgainstReadDirectionAndMatch(
316         compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
317   } else {
318     // Reading forward. Forward match the lead surrogate and assert that
319     // no trail surrogate follows.
320     match = MatchAndNegativeLookaroundInReadDirection(
321         compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
322   }
323   result->AddAlternative(GuardedAlternative(match));
324 }
325 
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)326 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
327                             RegExpNode* on_success,
328                             UnicodeRangeSplitter* splitter,
329                             JSRegExp::Flags flags) {
330   ZoneList<CharacterRange>* trail_surrogates =
331       ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
332   if (trail_surrogates == nullptr) return;
333   Zone* zone = compiler->zone();
334   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
335   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
336       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
337 
338   RegExpNode* match;
339   if (compiler->read_backward()) {
340     // Reading backward. Backward match the trail surrogate and assert that no
341     // lead surrogate precedes it.
342     match = MatchAndNegativeLookaroundInReadDirection(
343         compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
344   } else {
345     // Reading forward. Assert that reading backward, there is no lead
346     // surrogate, and then forward match the trail surrogate.
347     match = NegativeLookaroundAgainstReadDirectionAndMatch(
348         compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
349   }
350   result->AddAlternative(GuardedAlternative(match));
351 }
352 
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)353 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
354                               RegExpNode* on_success) {
355   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
356   DCHECK(!compiler->read_backward());
357   Zone* zone = compiler->zone();
358   // Advance any character. If the character happens to be a lead surrogate and
359   // we advanced into the middle of a surrogate pair, it will work out, as
360   // nothing will match from there. We will have to advance again, consuming
361   // the associated trail surrogate.
362   ZoneList<CharacterRange>* range = CharacterRange::List(
363       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
364   JSRegExp::Flags default_flags = JSRegExp::Flags();
365   return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
366                                             default_flags);
367 }
368 
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)369 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
370 #ifdef V8_INTL_SUPPORT
371   DCHECK(CharacterRange::IsCanonical(ranges));
372 
373   // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
374   // See also https://crbug.com/v8/6727.
375   // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
376   // which we use frequently internally. But large ranges can also easily be
377   // created by the user. We might want to have a more general caching mechanism
378   // for such ranges.
379   if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
380 
381   // Use ICU to compute the case fold closure over the ranges.
382   icu::UnicodeSet set;
383   for (int i = 0; i < ranges->length(); i++) {
384     set.add(ranges->at(i).from(), ranges->at(i).to());
385   }
386   // Clear the ranges list without freeing the backing store.
387   ranges->Rewind(0);
388   set.closeOver(USET_CASE_INSENSITIVE);
389   // Full case mapping map single characters to multiple characters.
390   // Those are represented as strings in the set. Remove them so that
391   // we end up with only simple and common case mappings.
392   set.removeAllStrings();
393   for (int i = 0; i < set.getRangeCount(); i++) {
394     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
395                 zone);
396   }
397   // No errors and everything we collected have been ranges.
398   CharacterRange::Canonicalize(ranges);
399 #endif  // V8_INTL_SUPPORT
400 }
401 
402 }  // namespace
403 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)404 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
405                                          RegExpNode* on_success) {
406   set_.Canonicalize();
407   Zone* zone = compiler->zone();
408   ZoneList<CharacterRange>* ranges = this->ranges(zone);
409   if (NeedsUnicodeCaseEquivalents(flags_)) {
410     AddUnicodeCaseEquivalents(ranges, zone);
411   }
412   if (IsUnicode(flags_) && !compiler->one_byte() &&
413       !contains_split_surrogate()) {
414     if (is_negated()) {
415       ZoneList<CharacterRange>* negated =
416           zone->New<ZoneList<CharacterRange>>(2, zone);
417       CharacterRange::Negate(ranges, negated, zone);
418       ranges = negated;
419     }
420     if (ranges->length() == 0) {
421       JSRegExp::Flags default_flags;
422       RegExpCharacterClass* fail =
423           zone->New<RegExpCharacterClass>(zone, ranges, default_flags);
424       return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
425     }
426     if (standard_type() == '*') {
427       return UnanchoredAdvance(compiler, on_success);
428     } else {
429       ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
430       UnicodeRangeSplitter splitter(ranges);
431       AddBmpCharacters(compiler, result, on_success, &splitter, flags_);
432       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter, flags_);
433       AddLoneLeadSurrogates(compiler, result, on_success, &splitter, flags_);
434       AddLoneTrailSurrogates(compiler, result, on_success, &splitter, flags_);
435       static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
436       if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
437       return result;
438     }
439   } else {
440     return zone->New<TextNode>(this, compiler->read_backward(), on_success);
441   }
442 }
443 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)444 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
445   RegExpAtom* atom1 = (*a)->AsAtom();
446   RegExpAtom* atom2 = (*b)->AsAtom();
447   uc16 character1 = atom1->data().at(0);
448   uc16 character2 = atom2->data().at(0);
449   if (character1 < character2) return -1;
450   if (character1 > character2) return 1;
451   return 0;
452 }
453 
454 #ifdef V8_INTL_SUPPORT
455 
456 // Case Insensitve comparesion
CompareFirstCharCaseInsensitve(RegExpTree * const * a,RegExpTree * const * b)457 int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
458   RegExpAtom* atom1 = (*a)->AsAtom();
459   RegExpAtom* atom2 = (*b)->AsAtom();
460   icu::UnicodeString character1(atom1->data().at(0));
461   return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
462 }
463 
464 #else
465 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)466 static unibrow::uchar Canonical(
467     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
468     unibrow::uchar c) {
469   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
470   int length = canonicalize->get(c, '\0', chars);
471   DCHECK_LE(length, 1);
472   unibrow::uchar canonical = c;
473   if (length == 1) canonical = chars[0];
474   return canonical;
475 }
476 
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)477 int CompareFirstCharCaseIndependent(
478     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
479     RegExpTree* const* a, RegExpTree* const* b) {
480   RegExpAtom* atom1 = (*a)->AsAtom();
481   RegExpAtom* atom2 = (*b)->AsAtom();
482   unibrow::uchar character1 = atom1->data().at(0);
483   unibrow::uchar character2 = atom2->data().at(0);
484   if (character1 == character2) return 0;
485   if (character1 >= 'a' || character2 >= 'a') {
486     character1 = Canonical(canonicalize, character1);
487     character2 = Canonical(canonicalize, character2);
488   }
489   return static_cast<int>(character1) - static_cast<int>(character2);
490 }
491 #endif  // V8_INTL_SUPPORT
492 
493 // We can stable sort runs of atoms, since the order does not matter if they
494 // start with different characters.
495 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)496 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
497   ZoneList<RegExpTree*>* alternatives = this->alternatives();
498   int length = alternatives->length();
499   bool found_consecutive_atoms = false;
500   for (int i = 0; i < length; i++) {
501     while (i < length) {
502       RegExpTree* alternative = alternatives->at(i);
503       if (alternative->IsAtom()) break;
504       i++;
505     }
506     // i is length or it is the index of an atom.
507     if (i == length) break;
508     int first_atom = i;
509     JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
510     i++;
511     while (i < length) {
512       RegExpTree* alternative = alternatives->at(i);
513       if (!alternative->IsAtom()) break;
514       if (alternative->AsAtom()->flags() != flags) break;
515       i++;
516     }
517     // Sort atoms to get ones with common prefixes together.
518     // This step is more tricky if we are in a case-independent regexp,
519     // because it would change /is|I/ to /I|is/, and order matters when
520     // the regexp parts don't match only disjoint starting points. To fix
521     // this we have a version of CompareFirstChar that uses case-
522     // independent character classes for comparison.
523     DCHECK_LT(first_atom, alternatives->length());
524     DCHECK_LE(i, alternatives->length());
525     DCHECK_LE(first_atom, i);
526     if (IgnoreCase(flags)) {
527 #ifdef V8_INTL_SUPPORT
528       alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
529                                i - first_atom);
530 #else
531       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
532           compiler->isolate()->regexp_macro_assembler_canonicalize();
533       auto compare_closure = [canonicalize](RegExpTree* const* a,
534                                             RegExpTree* const* b) {
535         return CompareFirstCharCaseIndependent(canonicalize, a, b);
536       };
537       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
538 #endif  // V8_INTL_SUPPORT
539     } else {
540       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
541     }
542     if (i - first_atom > 1) found_consecutive_atoms = true;
543   }
544   return found_consecutive_atoms;
545 }
546 
547 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)548 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
549   Zone* zone = compiler->zone();
550   ZoneList<RegExpTree*>* alternatives = this->alternatives();
551   int length = alternatives->length();
552 
553   int write_posn = 0;
554   int i = 0;
555   while (i < length) {
556     RegExpTree* alternative = alternatives->at(i);
557     if (!alternative->IsAtom()) {
558       alternatives->at(write_posn++) = alternatives->at(i);
559       i++;
560       continue;
561     }
562     RegExpAtom* const atom = alternative->AsAtom();
563     JSRegExp::Flags flags = atom->flags();
564 #ifdef V8_INTL_SUPPORT
565     icu::UnicodeString common_prefix(atom->data().at(0));
566 #else
567     unibrow::uchar common_prefix = atom->data().at(0);
568 #endif  // V8_INTL_SUPPORT
569     int first_with_prefix = i;
570     int prefix_length = atom->length();
571     i++;
572     while (i < length) {
573       alternative = alternatives->at(i);
574       if (!alternative->IsAtom()) break;
575       RegExpAtom* const atom = alternative->AsAtom();
576       if (atom->flags() != flags) break;
577 #ifdef V8_INTL_SUPPORT
578       icu::UnicodeString new_prefix(atom->data().at(0));
579       if (new_prefix != common_prefix) {
580         if (!IgnoreCase(flags)) break;
581         if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
582           break;
583       }
584 #else
585       unibrow::uchar new_prefix = atom->data().at(0);
586       if (new_prefix != common_prefix) {
587         if (!IgnoreCase(flags)) break;
588         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
589             compiler->isolate()->regexp_macro_assembler_canonicalize();
590         new_prefix = Canonical(canonicalize, new_prefix);
591         common_prefix = Canonical(canonicalize, common_prefix);
592         if (new_prefix != common_prefix) break;
593       }
594 #endif  // V8_INTL_SUPPORT
595       prefix_length = std::min(prefix_length, atom->length());
596       i++;
597     }
598     if (i > first_with_prefix + 2) {
599       // Found worthwhile run of alternatives with common prefix of at least one
600       // character.  The sorting function above did not sort on more than one
601       // character for reasons of correctness, but there may still be a longer
602       // common prefix if the terms were similar or presorted in the input.
603       // Find out how long the common prefix is.
604       int run_length = i - first_with_prefix;
605       RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
606       for (int j = 1; j < run_length && prefix_length > 1; j++) {
607         RegExpAtom* old_atom =
608             alternatives->at(j + first_with_prefix)->AsAtom();
609         for (int k = 1; k < prefix_length; k++) {
610           if (atom->data().at(k) != old_atom->data().at(k)) {
611             prefix_length = k;
612             break;
613           }
614         }
615       }
616       RegExpAtom* prefix = zone->New<RegExpAtom>(
617           atom->data().SubVector(0, prefix_length), flags);
618       ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
619       pair->Add(prefix, zone);
620       ZoneList<RegExpTree*>* suffixes =
621           zone->New<ZoneList<RegExpTree*>>(run_length, zone);
622       for (int j = 0; j < run_length; j++) {
623         RegExpAtom* old_atom =
624             alternatives->at(j + first_with_prefix)->AsAtom();
625         int len = old_atom->length();
626         if (len == prefix_length) {
627           suffixes->Add(zone->New<RegExpEmpty>(), zone);
628         } else {
629           RegExpTree* suffix = zone->New<RegExpAtom>(
630               old_atom->data().SubVector(prefix_length, old_atom->length()),
631               flags);
632           suffixes->Add(suffix, zone);
633         }
634       }
635       pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
636       alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
637     } else {
638       // Just copy any non-worthwhile alternatives.
639       for (int j = first_with_prefix; j < i; j++) {
640         alternatives->at(write_posn++) = alternatives->at(j);
641       }
642     }
643   }
644   alternatives->Rewind(write_posn);  // Trim end of array.
645 }
646 
647 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)648 void RegExpDisjunction::FixSingleCharacterDisjunctions(
649     RegExpCompiler* compiler) {
650   Zone* zone = compiler->zone();
651   ZoneList<RegExpTree*>* alternatives = this->alternatives();
652   int length = alternatives->length();
653 
654   int write_posn = 0;
655   int i = 0;
656   while (i < length) {
657     RegExpTree* alternative = alternatives->at(i);
658     if (!alternative->IsAtom()) {
659       alternatives->at(write_posn++) = alternatives->at(i);
660       i++;
661       continue;
662     }
663     RegExpAtom* const atom = alternative->AsAtom();
664     if (atom->length() != 1) {
665       alternatives->at(write_posn++) = alternatives->at(i);
666       i++;
667       continue;
668     }
669     JSRegExp::Flags flags = atom->flags();
670     DCHECK_IMPLIES(IsUnicode(flags),
671                    !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
672     bool contains_trail_surrogate =
673         unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
674     int first_in_run = i;
675     i++;
676     // Find a run of single-character atom alternatives that have identical
677     // flags (case independence and unicode-ness).
678     while (i < length) {
679       alternative = alternatives->at(i);
680       if (!alternative->IsAtom()) break;
681       RegExpAtom* const atom = alternative->AsAtom();
682       if (atom->length() != 1) break;
683       if (atom->flags() != flags) break;
684       DCHECK_IMPLIES(IsUnicode(flags),
685                      !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
686       contains_trail_surrogate |=
687           unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
688       i++;
689     }
690     if (i > first_in_run + 1) {
691       // Found non-trivial run of single-character alternatives.
692       int run_length = i - first_in_run;
693       ZoneList<CharacterRange>* ranges =
694           zone->New<ZoneList<CharacterRange>>(2, zone);
695       for (int j = 0; j < run_length; j++) {
696         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
697         DCHECK_EQ(old_atom->length(), 1);
698         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
699       }
700       RegExpCharacterClass::CharacterClassFlags character_class_flags;
701       if (IsUnicode(flags) && contains_trail_surrogate) {
702         character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
703       }
704       alternatives->at(write_posn++) = zone->New<RegExpCharacterClass>(
705           zone, ranges, flags, character_class_flags);
706     } else {
707       // Just copy any trivial alternatives.
708       for (int j = first_in_run; j < i; j++) {
709         alternatives->at(write_posn++) = alternatives->at(j);
710       }
711     }
712   }
713   alternatives->Rewind(write_posn);  // Trim end of array.
714 }
715 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)716 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
717                                       RegExpNode* on_success) {
718   ZoneList<RegExpTree*>* alternatives = this->alternatives();
719 
720   if (alternatives->length() > 2) {
721     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
722     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
723     FixSingleCharacterDisjunctions(compiler);
724     if (alternatives->length() == 1) {
725       return alternatives->at(0)->ToNode(compiler, on_success);
726     }
727   }
728 
729   int length = alternatives->length();
730 
731   ChoiceNode* result =
732       compiler->zone()->New<ChoiceNode>(length, compiler->zone());
733   for (int i = 0; i < length; i++) {
734     GuardedAlternative alternative(
735         alternatives->at(i)->ToNode(compiler, on_success));
736     result->AddAlternative(alternative);
737   }
738   return result;
739 }
740 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)741 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
742                                      RegExpNode* on_success) {
743   return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
744 }
745 
746 namespace {
747 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
748 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
BoundaryAssertionAsLookaround(RegExpCompiler * compiler,RegExpNode * on_success,RegExpAssertion::AssertionType type,JSRegExp::Flags flags)749 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
750                                           RegExpNode* on_success,
751                                           RegExpAssertion::AssertionType type,
752                                           JSRegExp::Flags flags) {
753   DCHECK(NeedsUnicodeCaseEquivalents(flags));
754   Zone* zone = compiler->zone();
755   ZoneList<CharacterRange>* word_range =
756       zone->New<ZoneList<CharacterRange>>(2, zone);
757   CharacterRange::AddClassEscape('w', word_range, true, zone);
758   int stack_register = compiler->UnicodeLookaroundStackRegister();
759   int position_register = compiler->UnicodeLookaroundPositionRegister();
760   ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
761   // Add two choices. The (non-)boundary could start with a word or
762   // a non-word-character.
763   for (int i = 0; i < 2; i++) {
764     bool lookbehind_for_word = i == 0;
765     bool lookahead_for_word =
766         (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
767     // Look to the left.
768     RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
769                                          stack_register, position_register);
770     RegExpNode* backward = TextNode::CreateForCharacterRanges(
771         zone, word_range, true, lookbehind.on_match_success(), flags);
772     // Look to the right.
773     RegExpLookaround::Builder lookahead(lookahead_for_word,
774                                         lookbehind.ForMatch(backward),
775                                         stack_register, position_register);
776     RegExpNode* forward = TextNode::CreateForCharacterRanges(
777         zone, word_range, false, lookahead.on_match_success(), flags);
778     result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
779   }
780   return result;
781 }
782 }  // anonymous namespace
783 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)784 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
785                                     RegExpNode* on_success) {
786   NodeInfo info;
787   Zone* zone = compiler->zone();
788 
789   switch (assertion_type()) {
790     case START_OF_LINE:
791       return AssertionNode::AfterNewline(on_success);
792     case START_OF_INPUT:
793       return AssertionNode::AtStart(on_success);
794     case BOUNDARY:
795       return NeedsUnicodeCaseEquivalents(flags_)
796                  ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
797                                                  flags_)
798                  : AssertionNode::AtBoundary(on_success);
799     case NON_BOUNDARY:
800       return NeedsUnicodeCaseEquivalents(flags_)
801                  ? BoundaryAssertionAsLookaround(compiler, on_success,
802                                                  NON_BOUNDARY, flags_)
803                  : AssertionNode::AtNonBoundary(on_success);
804     case END_OF_INPUT:
805       return AssertionNode::AtEnd(on_success);
806     case END_OF_LINE: {
807       // Compile $ in multiline regexps as an alternation with a positive
808       // lookahead in one side and an end-of-input on the other side.
809       // We need two registers for the lookahead.
810       int stack_pointer_register = compiler->AllocateRegister();
811       int position_register = compiler->AllocateRegister();
812       // The ChoiceNode to distinguish between a newline and end-of-input.
813       ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
814       // Create a newline atom.
815       ZoneList<CharacterRange>* newline_ranges =
816           zone->New<ZoneList<CharacterRange>>(3, zone);
817       CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
818       JSRegExp::Flags default_flags = JSRegExp::Flags();
819       RegExpCharacterClass* newline_atom =
820           zone->New<RegExpCharacterClass>('n', default_flags);
821       TextNode* newline_matcher =
822           zone->New<TextNode>(newline_atom, false,
823                               ActionNode::PositiveSubmatchSuccess(
824                                   stack_pointer_register, position_register,
825                                   0,   // No captures inside.
826                                   -1,  // Ignored if no captures.
827                                   on_success));
828       // Create an end-of-input matcher.
829       RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
830           stack_pointer_register, position_register, newline_matcher);
831       // Add the two alternatives to the ChoiceNode.
832       GuardedAlternative eol_alternative(end_of_line);
833       result->AddAlternative(eol_alternative);
834       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
835       result->AddAlternative(end_alternative);
836       return result;
837     }
838     default:
839       UNREACHABLE();
840   }
841   return on_success;
842 }
843 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)844 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
845                                         RegExpNode* on_success) {
846   return compiler->zone()->New<BackReferenceNode>(
847       RegExpCapture::StartRegister(index()),
848       RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
849       on_success);
850 }
851 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)852 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
853                                 RegExpNode* on_success) {
854   return on_success;
855 }
856 
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)857 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
858                                    int stack_pointer_register,
859                                    int position_register,
860                                    int capture_register_count,
861                                    int capture_register_start)
862     : is_positive_(is_positive),
863       on_success_(on_success),
864       stack_pointer_register_(stack_pointer_register),
865       position_register_(position_register) {
866   if (is_positive_) {
867     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
868         stack_pointer_register, position_register, capture_register_count,
869         capture_register_start, on_success_);
870   } else {
871     Zone* zone = on_success_->zone();
872     on_match_success_ = zone->New<NegativeSubmatchSuccess>(
873         stack_pointer_register, position_register, capture_register_count,
874         capture_register_start, zone);
875   }
876 }
877 
ForMatch(RegExpNode * match)878 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
879   if (is_positive_) {
880     return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
881                                              position_register_, match);
882   } else {
883     Zone* zone = on_success_->zone();
884     // We use a ChoiceNode to represent the negative lookaround. The first
885     // alternative is the negative match. On success, the end node backtracks.
886     // On failure, the second alternative is tried and leads to success.
887     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
888     // first exit when calculating quick checks.
889     ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
890         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
891     return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
892                                              position_register_, choice_node);
893   }
894 }
895 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)896 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
897                                      RegExpNode* on_success) {
898   int stack_pointer_register = compiler->AllocateRegister();
899   int position_register = compiler->AllocateRegister();
900 
901   const int registers_per_capture = 2;
902   const int register_of_first_capture = 2;
903   int register_count = capture_count_ * registers_per_capture;
904   int register_start =
905       register_of_first_capture + capture_from_ * registers_per_capture;
906 
907   RegExpNode* result;
908   bool was_reading_backward = compiler->read_backward();
909   compiler->set_read_backward(type() == LOOKBEHIND);
910   Builder builder(is_positive(), on_success, stack_pointer_register,
911                   position_register, register_count, register_start);
912   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
913   result = builder.ForMatch(match);
914   compiler->set_read_backward(was_reading_backward);
915   return result;
916 }
917 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)918 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
919                                   RegExpNode* on_success) {
920   return ToNode(body(), index(), compiler, on_success);
921 }
922 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)923 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
924                                   RegExpCompiler* compiler,
925                                   RegExpNode* on_success) {
926   DCHECK_NOT_NULL(body);
927   int start_reg = RegExpCapture::StartRegister(index);
928   int end_reg = RegExpCapture::EndRegister(index);
929   if (compiler->read_backward()) std::swap(start_reg, end_reg);
930   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
931   RegExpNode* body_node = body->ToNode(compiler, store_end);
932   return ActionNode::StorePosition(start_reg, true, body_node);
933 }
934 
935 namespace {
936 
937 class AssertionSequenceRewriter final {
938  public:
939   // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
940   // instead of sprinkling rewrites into the AST->Node conversion process.
MaybeRewrite(ZoneList<RegExpTree * > * terms,Zone * zone)941   static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
942     AssertionSequenceRewriter rewriter(terms, zone);
943 
944     static constexpr int kNoIndex = -1;
945     int from = kNoIndex;
946 
947     for (int i = 0; i < terms->length(); i++) {
948       RegExpTree* t = terms->at(i);
949       if (from == kNoIndex && t->IsAssertion()) {
950         from = i;  // Start a sequence.
951       } else if (from != kNoIndex && !t->IsAssertion()) {
952         // Terminate and process the sequence.
953         if (i - from > 1) rewriter.Rewrite(from, i);
954         from = kNoIndex;
955       }
956     }
957 
958     if (from != kNoIndex && terms->length() - from > 1) {
959       rewriter.Rewrite(from, terms->length());
960     }
961   }
962 
963   // All assertions are zero width. A consecutive sequence of assertions is
964   // order-independent. There's two ways we can optimize here:
965   // 1. fold all identical assertions.
966   // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
967   //    sequence fails.
Rewrite(int from,int to)968   void Rewrite(int from, int to) {
969     DCHECK_GT(to, from + 1);
970 
971     // Bitfield of all seen assertions.
972     uint32_t seen_assertions = 0;
973     STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
974 
975     // Flags must match for folding.
976     JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
977     bool saw_mismatched_flags = false;
978 
979     for (int i = from; i < to; i++) {
980       RegExpAssertion* t = terms_->at(i)->AsAssertion();
981       if (t->flags() != flags) saw_mismatched_flags = true;
982       const uint32_t bit = 1 << t->assertion_type();
983 
984       if ((seen_assertions & bit) && !saw_mismatched_flags) {
985         // Fold duplicates.
986         terms_->Set(i, zone_->New<RegExpEmpty>());
987       }
988 
989       seen_assertions |= bit;
990     }
991 
992     // Collapse failures.
993     const uint32_t always_fails_mask =
994         1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
995     if ((seen_assertions & always_fails_mask) == always_fails_mask) {
996       ReplaceSequenceWithFailure(from, to);
997     }
998   }
999 
ReplaceSequenceWithFailure(int from,int to)1000   void ReplaceSequenceWithFailure(int from, int to) {
1001     // Replace the entire sequence with a single node that always fails.
1002     // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1003     // negated '*' (everything) range serves the purpose.
1004     ZoneList<CharacterRange>* ranges =
1005         zone_->New<ZoneList<CharacterRange>>(0, zone_);
1006     RegExpCharacterClass* cc =
1007         zone_->New<RegExpCharacterClass>(zone_, ranges, JSRegExp::Flags());
1008     terms_->Set(from, cc);
1009 
1010     // Zero out the rest.
1011     RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1012     for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1013   }
1014 
1015  private:
AssertionSequenceRewriter(ZoneList<RegExpTree * > * terms,Zone * zone)1016   AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1017       : zone_(zone), terms_(terms) {}
1018 
1019   Zone* zone_;
1020   ZoneList<RegExpTree*>* terms_;
1021 };
1022 
1023 }  // namespace
1024 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)1025 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1026                                       RegExpNode* on_success) {
1027   ZoneList<RegExpTree*>* children = nodes();
1028 
1029   AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1030 
1031   RegExpNode* current = on_success;
1032   if (compiler->read_backward()) {
1033     for (int i = 0; i < children->length(); i++) {
1034       current = children->at(i)->ToNode(compiler, current);
1035     }
1036   } else {
1037     for (int i = children->length() - 1; i >= 0; i--) {
1038       current = children->at(i)->ToNode(compiler, current);
1039     }
1040   }
1041   return current;
1042 }
1043 
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1044 static void AddClass(const int* elmv, int elmc,
1045                      ZoneList<CharacterRange>* ranges, Zone* zone) {
1046   elmc--;
1047   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1048   for (int i = 0; i < elmc; i += 2) {
1049     DCHECK(elmv[i] < elmv[i + 1]);
1050     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1051   }
1052 }
1053 
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1054 static void AddClassNegated(const int* elmv, int elmc,
1055                             ZoneList<CharacterRange>* ranges, Zone* zone) {
1056   elmc--;
1057   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1058   DCHECK_NE(0x0000, elmv[0]);
1059   DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
1060   uc16 last = 0x0000;
1061   for (int i = 0; i < elmc; i += 2) {
1062     DCHECK(last <= elmv[i] - 1);
1063     DCHECK(elmv[i] < elmv[i + 1]);
1064     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1065     last = elmv[i + 1];
1066   }
1067   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
1068 }
1069 
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,bool add_unicode_case_equivalents,Zone * zone)1070 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1071                                     bool add_unicode_case_equivalents,
1072                                     Zone* zone) {
1073   if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
1074     // See #sec-runtime-semantics-wordcharacters-abstract-operation
1075     // In case of unicode and ignore_case, we need to create the closure over
1076     // case equivalent characters before negating.
1077     ZoneList<CharacterRange>* new_ranges =
1078         zone->New<ZoneList<CharacterRange>>(2, zone);
1079     AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1080     AddUnicodeCaseEquivalents(new_ranges, zone);
1081     if (type == 'W') {
1082       ZoneList<CharacterRange>* negated =
1083           zone->New<ZoneList<CharacterRange>>(2, zone);
1084       CharacterRange::Negate(new_ranges, negated, zone);
1085       new_ranges = negated;
1086     }
1087     ranges->AddAll(*new_ranges, zone);
1088     return;
1089   }
1090   AddClassEscape(type, ranges, zone);
1091 }
1092 
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,Zone * zone)1093 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1094                                     Zone* zone) {
1095   switch (type) {
1096     case 's':
1097       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1098       break;
1099     case 'S':
1100       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1101       break;
1102     case 'w':
1103       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1104       break;
1105     case 'W':
1106       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1107       break;
1108     case 'd':
1109       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1110       break;
1111     case 'D':
1112       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1113       break;
1114     case '.':
1115       AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1116                       zone);
1117       break;
1118     // This is not a character range as defined by the spec but a
1119     // convenient shorthand for a character class that matches any
1120     // character.
1121     case '*':
1122       ranges->Add(CharacterRange::Everything(), zone);
1123       break;
1124     // This is the set of characters matched by the $ and ^ symbols
1125     // in multiline mode.
1126     case 'n':
1127       AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1128       break;
1129     default:
1130       UNREACHABLE();
1131   }
1132 }
1133 
GetWordBounds()1134 Vector<const int> CharacterRange::GetWordBounds() {
1135   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
1136 }
1137 
1138 // static
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)1139 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
1140                                         ZoneList<CharacterRange>* ranges,
1141                                         bool is_one_byte) {
1142   CharacterRange::Canonicalize(ranges);
1143   int range_count = ranges->length();
1144 #ifdef V8_INTL_SUPPORT
1145   icu::UnicodeSet others;
1146   for (int i = 0; i < range_count; i++) {
1147     CharacterRange range = ranges->at(i);
1148     uc32 from = range.from();
1149     if (from > String::kMaxUtf16CodeUnit) continue;
1150     uc32 to = std::min({range.to(), String::kMaxUtf16CodeUnitU});
1151     // Nothing to be done for surrogates.
1152     if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1153     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1154       if (from > String::kMaxOneByteCharCode) continue;
1155       if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
1156     }
1157     others.add(from, to);
1158   }
1159 
1160   // Compute the set of additional characters that should be added,
1161   // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1162   // case-folding rules than Unicode, so some characters that are
1163   // added by closeOver do not match anything other than themselves in
1164   // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
1165   // same case-insensitive character as 's' or 'S' according to
1166   // Unicode, but does not match any other character in JS. To handle
1167   // this case, we add such characters to the IgnoreSet and filter
1168   // them out. We filter twice: once before calling closeOver (to
1169   // prevent 'ſ' from adding 's'), and once after calling closeOver
1170   // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
1171   // more information.
1172   icu::UnicodeSet already_added(others);
1173   others.removeAll(RegExpCaseFolding::IgnoreSet());
1174   others.closeOver(USET_CASE_INSENSITIVE);
1175   others.removeAll(RegExpCaseFolding::IgnoreSet());
1176   others.removeAll(already_added);
1177 
1178   // Add others to the ranges
1179   for (int32_t i = 0; i < others.getRangeCount(); i++) {
1180     UChar32 from = others.getRangeStart(i);
1181     UChar32 to = others.getRangeEnd(i);
1182     if (from == to) {
1183       ranges->Add(CharacterRange::Singleton(from), zone);
1184     } else {
1185       ranges->Add(CharacterRange::Range(from, to), zone);
1186     }
1187   }
1188 #else
1189   for (int i = 0; i < range_count; i++) {
1190     CharacterRange range = ranges->at(i);
1191     uc32 bottom = range.from();
1192     if (bottom > String::kMaxUtf16CodeUnit) continue;
1193     uc32 top = std::min({range.to(), String::kMaxUtf16CodeUnitU});
1194     // Nothing to be done for surrogates.
1195     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1196     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1197       if (bottom > String::kMaxOneByteCharCode) continue;
1198       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
1199     }
1200     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1201     if (top == bottom) {
1202       // If this is a singleton we just expand the one character.
1203       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1204       for (int i = 0; i < length; i++) {
1205         uc32 chr = chars[i];
1206         if (chr != bottom) {
1207           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
1208         }
1209       }
1210     } else {
1211       // If this is a range we expand the characters block by block, expanding
1212       // contiguous subranges (blocks) one at a time.  The approach is as
1213       // follows.  For a given start character we look up the remainder of the
1214       // block that contains it (represented by the end point), for instance we
1215       // find 'z' if the character is 'c'.  A block is characterized by the
1216       // property that all characters uncanonicalize in the same way, except
1217       // that each entry in the result is incremented by the distance from the
1218       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
1219       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
1220       // we've found the end point we look up its uncanonicalization and
1221       // produce a range for each element.  For instance for [c-f] we look up
1222       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
1223       // it is not already contained in the input, so [c-f] will be skipped but
1224       // [C-F] will be added.  If this range is not completely contained in a
1225       // block we do this for all the blocks covered by the range (handling
1226       // characters that is not in a block as a "singleton block").
1227       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1228       uc32 pos = bottom;
1229       while (pos <= top) {
1230         int length =
1231             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1232         uc32 block_end;
1233         if (length == 0) {
1234           block_end = pos;
1235         } else {
1236           DCHECK_EQ(1, length);
1237           block_end = equivalents[0];
1238         }
1239         int end = (block_end > top) ? top : block_end;
1240         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1241                                                          equivalents);
1242         for (int i = 0; i < length; i++) {
1243           uc32 c = equivalents[i];
1244           uc32 range_from = c - (block_end - pos);
1245           uc32 range_to = c - (block_end - end);
1246           if (!(bottom <= range_from && range_to <= top)) {
1247             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1248           }
1249         }
1250         pos = end + 1;
1251       }
1252     }
1253   }
1254 #endif  // V8_INTL_SUPPORT
1255 }
1256 
IsCanonical(ZoneList<CharacterRange> * ranges)1257 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
1258   DCHECK_NOT_NULL(ranges);
1259   int n = ranges->length();
1260   if (n <= 1) return true;
1261   uc32 max = ranges->at(0).to();
1262   for (int i = 1; i < n; i++) {
1263     CharacterRange next_range = ranges->at(i);
1264     if (next_range.from() <= max + 1) return false;
1265     max = next_range.to();
1266   }
1267   return true;
1268 }
1269 
ranges(Zone * zone)1270 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
1271   if (ranges_ == nullptr) {
1272     ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1273     CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
1274   }
1275   return ranges_;
1276 }
1277 
1278 // Move a number of elements in a zonelist to another position
1279 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)1280 static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to,
1281                        int count) {
1282   // Ranges are potentially overlapping.
1283   if (from < to) {
1284     for (int i = count - 1; i >= 0; i--) {
1285       list->at(to + i) = list->at(from + i);
1286     }
1287   } else {
1288     for (int i = 0; i < count; i++) {
1289       list->at(to + i) = list->at(from + i);
1290     }
1291   }
1292 }
1293 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)1294 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1295                                       CharacterRange insert) {
1296   // Inserts a range into list[0..count[, which must be sorted
1297   // by from value and non-overlapping and non-adjacent, using at most
1298   // list[0..count] for the result. Returns the number of resulting
1299   // canonicalized ranges. Inserting a range may collapse existing ranges into
1300   // fewer ranges, so the return value can be anything in the range 1..count+1.
1301   uc32 from = insert.from();
1302   uc32 to = insert.to();
1303   int start_pos = 0;
1304   int end_pos = count;
1305   for (int i = count - 1; i >= 0; i--) {
1306     CharacterRange current = list->at(i);
1307     if (current.from() > to + 1) {
1308       end_pos = i;
1309     } else if (current.to() + 1 < from) {
1310       start_pos = i + 1;
1311       break;
1312     }
1313   }
1314 
1315   // Inserted range overlaps, or is adjacent to, ranges at positions
1316   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1317   // not affected by the insertion.
1318   // If start_pos == end_pos, the range must be inserted before start_pos.
1319   // if start_pos < end_pos, the entire range from start_pos to end_pos
1320   // must be merged with the insert range.
1321 
1322   if (start_pos == end_pos) {
1323     // Insert between existing ranges at position start_pos.
1324     if (start_pos < count) {
1325       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1326     }
1327     list->at(start_pos) = insert;
1328     return count + 1;
1329   }
1330   if (start_pos + 1 == end_pos) {
1331     // Replace single existing range at position start_pos.
1332     CharacterRange to_replace = list->at(start_pos);
1333     int new_from = std::min(to_replace.from(), from);
1334     int new_to = std::max(to_replace.to(), to);
1335     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1336     return count;
1337   }
1338   // Replace a number of existing ranges from start_pos to end_pos - 1.
1339   // Move the remaining ranges down.
1340 
1341   int new_from = std::min(list->at(start_pos).from(), from);
1342   int new_to = std::max(list->at(end_pos - 1).to(), to);
1343   if (end_pos < count) {
1344     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1345   }
1346   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1347   return count - (end_pos - start_pos) + 1;
1348 }
1349 
Canonicalize()1350 void CharacterSet::Canonicalize() {
1351   // Special/default classes are always considered canonical. The result
1352   // of calling ranges() will be sorted.
1353   if (ranges_ == nullptr) return;
1354   CharacterRange::Canonicalize(ranges_);
1355 }
1356 
Canonicalize(ZoneList<CharacterRange> * character_ranges)1357 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
1358   if (character_ranges->length() <= 1) return;
1359   // Check whether ranges are already canonical (increasing, non-overlapping,
1360   // non-adjacent).
1361   int n = character_ranges->length();
1362   uc32 max = character_ranges->at(0).to();
1363   int i = 1;
1364   while (i < n) {
1365     CharacterRange current = character_ranges->at(i);
1366     if (current.from() <= max + 1) {
1367       break;
1368     }
1369     max = current.to();
1370     i++;
1371   }
1372   // Canonical until the i'th range. If that's all of them, we are done.
1373   if (i == n) return;
1374 
1375   // The ranges at index i and forward are not canonicalized. Make them so by
1376   // doing the equivalent of insertion sort (inserting each into the previous
1377   // list, in order).
1378   // Notice that inserting a range can reduce the number of ranges in the
1379   // result due to combining of adjacent and overlapping ranges.
1380   int read = i;           // Range to insert.
1381   int num_canonical = i;  // Length of canonicalized part of list.
1382   do {
1383     num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1384                                                character_ranges->at(read));
1385     read++;
1386   } while (read < n);
1387   character_ranges->Rewind(num_canonical);
1388 
1389   DCHECK(CharacterRange::IsCanonical(character_ranges));
1390 }
1391 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)1392 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
1393                             ZoneList<CharacterRange>* negated_ranges,
1394                             Zone* zone) {
1395   DCHECK(CharacterRange::IsCanonical(ranges));
1396   DCHECK_EQ(0, negated_ranges->length());
1397   int range_count = ranges->length();
1398   uc32 from = 0;
1399   int i = 0;
1400   if (range_count > 0 && ranges->at(0).from() == 0) {
1401     from = ranges->at(0).to() + 1;
1402     i = 1;
1403   }
1404   while (i < range_count) {
1405     CharacterRange range = ranges->at(i);
1406     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1407     from = range.to() + 1;
1408     i++;
1409   }
1410   if (from < String::kMaxCodePoint) {
1411     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
1412                         zone);
1413   }
1414 }
1415 
1416 // Scoped object to keep track of how much we unroll quantifier loops in the
1417 // regexp graph generator.
1418 class RegExpExpansionLimiter {
1419  public:
1420   static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)1421   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1422       : compiler_(compiler),
1423         saved_expansion_factor_(compiler->current_expansion_factor()),
1424         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
1425     DCHECK_LT(0, factor);
1426     if (ok_to_expand_) {
1427       if (factor > kMaxExpansionFactor) {
1428         // Avoid integer overflow of the current expansion factor.
1429         ok_to_expand_ = false;
1430         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1431       } else {
1432         int new_factor = saved_expansion_factor_ * factor;
1433         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1434         compiler->set_current_expansion_factor(new_factor);
1435       }
1436     }
1437   }
1438 
~RegExpExpansionLimiter()1439   ~RegExpExpansionLimiter() {
1440     compiler_->set_current_expansion_factor(saved_expansion_factor_);
1441   }
1442 
ok_to_expand()1443   bool ok_to_expand() { return ok_to_expand_; }
1444 
1445  private:
1446   RegExpCompiler* compiler_;
1447   int saved_expansion_factor_;
1448   bool ok_to_expand_;
1449 
1450   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1451 };
1452 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)1453 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
1454                                      RegExpTree* body, RegExpCompiler* compiler,
1455                                      RegExpNode* on_success,
1456                                      bool not_at_start) {
1457   // x{f, t} becomes this:
1458   //
1459   //             (r++)<-.
1460   //               |     `
1461   //               |     (x)
1462   //               v     ^
1463   //      (r=0)-->(?)---/ [if r < t]
1464   //               |
1465   //   [if r >= f] \----> ...
1466   //
1467 
1468   // 15.10.2.5 RepeatMatcher algorithm.
1469   // The parser has already eliminated the case where max is 0.  In the case
1470   // where max_match is zero the parser has removed the quantifier if min was
1471   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
1472 
1473   // If we know that we cannot match zero length then things are a little
1474   // simpler since we don't need to make the special zero length match check
1475   // from step 2.1.  If the min and max are small we can unroll a little in
1476   // this case.
1477   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
1478   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
1479   if (max == 0) return on_success;  // This can happen due to recursion.
1480   bool body_can_be_empty = (body->min_match() == 0);
1481   int body_start_reg = RegExpCompiler::kNoRegister;
1482   Interval capture_registers = body->CaptureRegisters();
1483   bool needs_capture_clearing = !capture_registers.is_empty();
1484   Zone* zone = compiler->zone();
1485 
1486   if (body_can_be_empty) {
1487     body_start_reg = compiler->AllocateRegister();
1488   } else if (compiler->optimize() && !needs_capture_clearing) {
1489     // Only unroll if there are no captures and the body can't be
1490     // empty.
1491     {
1492       RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
1493       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
1494         int new_max = (max == kInfinity) ? max : max - min;
1495         // Recurse once to get the loop or optional matches after the fixed
1496         // ones.
1497         RegExpNode* answer =
1498             ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
1499         // Unroll the forced matches from 0 to min.  This can cause chains of
1500         // TextNodes (which the parser does not generate).  These should be
1501         // combined if it turns out they hinder good code generation.
1502         for (int i = 0; i < min; i++) {
1503           answer = body->ToNode(compiler, answer);
1504         }
1505         return answer;
1506       }
1507     }
1508     if (max <= kMaxUnrolledMaxMatches && min == 0) {
1509       DCHECK_LT(0, max);  // Due to the 'if' above.
1510       RegExpExpansionLimiter limiter(compiler, max);
1511       if (limiter.ok_to_expand()) {
1512         // Unroll the optional matches up to max.
1513         RegExpNode* answer = on_success;
1514         for (int i = 0; i < max; i++) {
1515           ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
1516           if (is_greedy) {
1517             alternation->AddAlternative(
1518                 GuardedAlternative(body->ToNode(compiler, answer)));
1519             alternation->AddAlternative(GuardedAlternative(on_success));
1520           } else {
1521             alternation->AddAlternative(GuardedAlternative(on_success));
1522             alternation->AddAlternative(
1523                 GuardedAlternative(body->ToNode(compiler, answer)));
1524           }
1525           answer = alternation;
1526           if (not_at_start && !compiler->read_backward()) {
1527             alternation->set_not_at_start();
1528           }
1529         }
1530         return answer;
1531       }
1532     }
1533   }
1534   bool has_min = min > 0;
1535   bool has_max = max < RegExpTree::kInfinity;
1536   bool needs_counter = has_min || has_max;
1537   int reg_ctr = needs_counter ? compiler->AllocateRegister()
1538                               : RegExpCompiler::kNoRegister;
1539   LoopChoiceNode* center = zone->New<LoopChoiceNode>(
1540       body->min_match() == 0, compiler->read_backward(), min, zone);
1541   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
1542   RegExpNode* loop_return =
1543       needs_counter ? static_cast<RegExpNode*>(
1544                           ActionNode::IncrementRegister(reg_ctr, center))
1545                     : static_cast<RegExpNode*>(center);
1546   if (body_can_be_empty) {
1547     // If the body can be empty we need to check if it was and then
1548     // backtrack.
1549     loop_return =
1550         ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
1551   }
1552   RegExpNode* body_node = body->ToNode(compiler, loop_return);
1553   if (body_can_be_empty) {
1554     // If the body can be empty we need to store the start position
1555     // so we can bail out if it was empty.
1556     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
1557   }
1558   if (needs_capture_clearing) {
1559     // Before entering the body of this loop we need to clear captures.
1560     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
1561   }
1562   GuardedAlternative body_alt(body_node);
1563   if (has_max) {
1564     Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
1565     body_alt.AddGuard(body_guard, zone);
1566   }
1567   GuardedAlternative rest_alt(on_success);
1568   if (has_min) {
1569     Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
1570     rest_alt.AddGuard(rest_guard, zone);
1571   }
1572   if (is_greedy) {
1573     center->AddLoopAlternative(body_alt);
1574     center->AddContinueAlternative(rest_alt);
1575   } else {
1576     center->AddContinueAlternative(rest_alt);
1577     center->AddLoopAlternative(body_alt);
1578   }
1579   if (needs_counter) {
1580     return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
1581   } else {
1582     return center;
1583   }
1584 }
1585 
1586 }  // namespace internal
1587 }  // namespace v8
1588