1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "irregexp/imported/regexp-compiler.h"
6
7 #include "irregexp/imported/regexp.h"
8 #ifdef V8_INTL_SUPPORT
9 #include "irregexp/imported/special-case.h"
10 #endif // V8_INTL_SUPPORT
11
12 #ifdef V8_INTL_SUPPORT
13 #include "unicode/locid.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utypes.h"
16 #endif // V8_INTL_SUPPORT
17
18 namespace v8 {
19 namespace internal {
20
21 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
22
23 // -------------------------------------------------------------------
24 // Tree to graph conversion
25
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)26 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
27 RegExpNode* on_success) {
28 ZoneList<TextElement>* elms =
29 compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
30 elms->Add(TextElement::Atom(this), compiler->zone());
31 return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
32 on_success);
33 }
34
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)35 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
36 RegExpNode* on_success) {
37 return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
38 on_success);
39 }
40
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)41 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
42 const int* special_class, int length) {
43 length--; // Remove final marker.
44 DCHECK_EQ(kRangeEndMarker, special_class[length]);
45 DCHECK_NE(0, ranges->length());
46 DCHECK_NE(0, length);
47 DCHECK_NE(0, special_class[0]);
48 if (ranges->length() != (length >> 1) + 1) {
49 return false;
50 }
51 CharacterRange range = ranges->at(0);
52 if (range.from() != 0) {
53 return false;
54 }
55 for (int i = 0; i < length; i += 2) {
56 if (static_cast<uc32>(special_class[i]) != (range.to() + 1)) {
57 return false;
58 }
59 range = ranges->at((i >> 1) + 1);
60 if (static_cast<uc32>(special_class[i + 1]) != range.from()) {
61 return false;
62 }
63 }
64 if (range.to() != String::kMaxCodePoint) {
65 return false;
66 }
67 return true;
68 }
69
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)70 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
71 const int* special_class, int length) {
72 length--; // Remove final marker.
73 DCHECK_EQ(kRangeEndMarker, special_class[length]);
74 if (ranges->length() * 2 != length) {
75 return false;
76 }
77 for (int i = 0; i < length; i += 2) {
78 CharacterRange range = ranges->at(i >> 1);
79 if (range.from() != static_cast<uc32>(special_class[i]) ||
80 range.to() != static_cast<uc32>(special_class[i + 1] - 1)) {
81 return false;
82 }
83 }
84 return true;
85 }
86
is_standard(Zone * zone)87 bool RegExpCharacterClass::is_standard(Zone* zone) {
88 // TODO(lrn): Remove need for this function, by not throwing away information
89 // along the way.
90 if (is_negated()) {
91 return false;
92 }
93 if (set_.is_standard()) {
94 return true;
95 }
96 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
97 set_.set_standard_set_type('s');
98 return true;
99 }
100 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
101 set_.set_standard_set_type('S');
102 return true;
103 }
104 if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
105 kLineTerminatorRangeCount)) {
106 set_.set_standard_set_type('.');
107 return true;
108 }
109 if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
110 kLineTerminatorRangeCount)) {
111 set_.set_standard_set_type('n');
112 return true;
113 }
114 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
115 set_.set_standard_set_type('w');
116 return true;
117 }
118 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
119 set_.set_standard_set_type('W');
120 return true;
121 }
122 return false;
123 }
124
UnicodeRangeSplitter(ZoneList<CharacterRange> * base)125 UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
126 // The unicode range splitter categorizes given character ranges into:
127 // - Code points from the BMP representable by one code unit.
128 // - Code points outside the BMP that need to be split into surrogate pairs.
129 // - Lone lead surrogates.
130 // - Lone trail surrogates.
131 // Lone surrogates are valid code points, even though no actual characters.
132 // They require special matching to make sure we do not split surrogate pairs.
133
134 for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
135 }
136
AddRange(CharacterRange range)137 void UnicodeRangeSplitter::AddRange(CharacterRange range) {
138 static constexpr uc32 kBmp1Start = 0;
139 static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1;
140 static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1;
141 static constexpr uc32 kBmp2End = kNonBmpStart - 1;
142
143 // Ends are all inclusive.
144 STATIC_ASSERT(kBmp1Start == 0);
145 STATIC_ASSERT(kBmp1Start < kBmp1End);
146 STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
147 STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
148 STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
149 STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
150 STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
151 STATIC_ASSERT(kBmp2Start < kBmp2End);
152 STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
153 STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
154
155 static constexpr uc32 kStarts[] = {
156 kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
157 kBmp2Start, kNonBmpStart,
158 };
159
160 static constexpr uc32 kEnds[] = {
161 kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
162 };
163
164 CharacterRangeVector* const kTargets[] = {
165 &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
166 };
167
168 static constexpr int kCount = arraysize(kStarts);
169 STATIC_ASSERT(kCount == arraysize(kEnds));
170 STATIC_ASSERT(kCount == arraysize(kTargets));
171
172 for (int i = 0; i < kCount; i++) {
173 if (kStarts[i] > range.to()) break;
174 const uc32 from = std::max(kStarts[i], range.from());
175 const uc32 to = std::min(kEnds[i], range.to());
176 if (from > to) continue;
177 kTargets[i]->emplace_back(CharacterRange::Range(from, to));
178 }
179 }
180
181 namespace {
182
183 // Translates between new and old V8-isms (SmallVector, ZoneList).
ToCanonicalZoneList(const UnicodeRangeSplitter::CharacterRangeVector * v,Zone * zone)184 ZoneList<CharacterRange>* ToCanonicalZoneList(
185 const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
186 if (v->empty()) return nullptr;
187
188 ZoneList<CharacterRange>* result =
189 zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
190 for (size_t i = 0; i < v->size(); i++) {
191 result->Add(v->at(i), zone);
192 }
193
194 CharacterRange::Canonicalize(result);
195 return result;
196 }
197
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)198 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
199 RegExpNode* on_success, UnicodeRangeSplitter* splitter,
200 JSRegExp::Flags flags) {
201 ZoneList<CharacterRange>* bmp =
202 ToCanonicalZoneList(splitter->bmp(), compiler->zone());
203 if (bmp == nullptr) return;
204 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
205 compiler->zone(), bmp, compiler->read_backward(), on_success, flags)));
206 }
207
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)208 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
209 RegExpNode* on_success,
210 UnicodeRangeSplitter* splitter,
211 JSRegExp::Flags flags) {
212 ZoneList<CharacterRange>* non_bmp =
213 ToCanonicalZoneList(splitter->non_bmp(), compiler->zone());
214 if (non_bmp == nullptr) return;
215 DCHECK(!compiler->one_byte());
216 Zone* zone = compiler->zone();
217 CharacterRange::Canonicalize(non_bmp);
218 for (int i = 0; i < non_bmp->length(); i++) {
219 // Match surrogate pair.
220 // E.g. [\u10005-\u11005] becomes
221 // \ud800[\udc05-\udfff]|
222 // [\ud801-\ud803][\udc00-\udfff]|
223 // \ud804[\udc00-\udc05]
224 uc32 from = non_bmp->at(i).from();
225 uc32 to = non_bmp->at(i).to();
226 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
227 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
228 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
229 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
230 if (from_l == to_l) {
231 // The lead surrogate is the same.
232 result->AddAlternative(
233 GuardedAlternative(TextNode::CreateForSurrogatePair(
234 zone, CharacterRange::Singleton(from_l),
235 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
236 on_success, flags)));
237 } else {
238 if (from_t != kTrailSurrogateStart) {
239 // Add [from_l][from_t-\udfff]
240 result->AddAlternative(
241 GuardedAlternative(TextNode::CreateForSurrogatePair(
242 zone, CharacterRange::Singleton(from_l),
243 CharacterRange::Range(from_t, kTrailSurrogateEnd),
244 compiler->read_backward(), on_success, flags)));
245 from_l++;
246 }
247 if (to_t != kTrailSurrogateEnd) {
248 // Add [to_l][\udc00-to_t]
249 result->AddAlternative(
250 GuardedAlternative(TextNode::CreateForSurrogatePair(
251 zone, CharacterRange::Singleton(to_l),
252 CharacterRange::Range(kTrailSurrogateStart, to_t),
253 compiler->read_backward(), on_success, flags)));
254 to_l--;
255 }
256 if (from_l <= to_l) {
257 // Add [from_l-to_l][\udc00-\udfff]
258 result->AddAlternative(
259 GuardedAlternative(TextNode::CreateForSurrogatePair(
260 zone, CharacterRange::Range(from_l, to_l),
261 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
262 compiler->read_backward(), on_success, flags)));
263 }
264 }
265 }
266 }
267
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)268 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
269 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
270 ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
271 JSRegExp::Flags flags) {
272 Zone* zone = compiler->zone();
273 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
274 zone, match, read_backward, on_success, flags);
275 int stack_register = compiler->UnicodeLookaroundStackRegister();
276 int position_register = compiler->UnicodeLookaroundPositionRegister();
277 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
278 position_register);
279 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
280 zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
281 return lookaround.ForMatch(negative_match);
282 }
283
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)284 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
285 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
286 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
287 bool read_backward, JSRegExp::Flags flags) {
288 Zone* zone = compiler->zone();
289 int stack_register = compiler->UnicodeLookaroundStackRegister();
290 int position_register = compiler->UnicodeLookaroundPositionRegister();
291 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
292 position_register);
293 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
294 zone, lookahead, read_backward, lookaround.on_match_success(), flags);
295 return TextNode::CreateForCharacterRanges(
296 zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
297 }
298
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)299 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
300 RegExpNode* on_success,
301 UnicodeRangeSplitter* splitter,
302 JSRegExp::Flags flags) {
303 ZoneList<CharacterRange>* lead_surrogates =
304 ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
305 if (lead_surrogates == nullptr) return;
306 Zone* zone = compiler->zone();
307 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
308 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
309 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
310
311 RegExpNode* match;
312 if (compiler->read_backward()) {
313 // Reading backward. Assert that reading forward, there is no trail
314 // surrogate, and then backward match the lead surrogate.
315 match = NegativeLookaroundAgainstReadDirectionAndMatch(
316 compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
317 } else {
318 // Reading forward. Forward match the lead surrogate and assert that
319 // no trail surrogate follows.
320 match = MatchAndNegativeLookaroundInReadDirection(
321 compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
322 }
323 result->AddAlternative(GuardedAlternative(match));
324 }
325
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter,JSRegExp::Flags flags)326 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
327 RegExpNode* on_success,
328 UnicodeRangeSplitter* splitter,
329 JSRegExp::Flags flags) {
330 ZoneList<CharacterRange>* trail_surrogates =
331 ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
332 if (trail_surrogates == nullptr) return;
333 Zone* zone = compiler->zone();
334 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
335 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
336 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
337
338 RegExpNode* match;
339 if (compiler->read_backward()) {
340 // Reading backward. Backward match the trail surrogate and assert that no
341 // lead surrogate precedes it.
342 match = MatchAndNegativeLookaroundInReadDirection(
343 compiler, trail_surrogates, lead_surrogates, on_success, true, flags);
344 } else {
345 // Reading forward. Assert that reading backward, there is no lead
346 // surrogate, and then forward match the trail surrogate.
347 match = NegativeLookaroundAgainstReadDirectionAndMatch(
348 compiler, lead_surrogates, trail_surrogates, on_success, false, flags);
349 }
350 result->AddAlternative(GuardedAlternative(match));
351 }
352
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)353 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
354 RegExpNode* on_success) {
355 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
356 DCHECK(!compiler->read_backward());
357 Zone* zone = compiler->zone();
358 // Advance any character. If the character happens to be a lead surrogate and
359 // we advanced into the middle of a surrogate pair, it will work out, as
360 // nothing will match from there. We will have to advance again, consuming
361 // the associated trail surrogate.
362 ZoneList<CharacterRange>* range = CharacterRange::List(
363 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
364 JSRegExp::Flags default_flags = JSRegExp::Flags();
365 return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
366 default_flags);
367 }
368
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)369 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
370 #ifdef V8_INTL_SUPPORT
371 DCHECK(CharacterRange::IsCanonical(ranges));
372
373 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
374 // See also https://crbug.com/v8/6727.
375 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
376 // which we use frequently internally. But large ranges can also easily be
377 // created by the user. We might want to have a more general caching mechanism
378 // for such ranges.
379 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
380
381 // Use ICU to compute the case fold closure over the ranges.
382 icu::UnicodeSet set;
383 for (int i = 0; i < ranges->length(); i++) {
384 set.add(ranges->at(i).from(), ranges->at(i).to());
385 }
386 // Clear the ranges list without freeing the backing store.
387 ranges->Rewind(0);
388 set.closeOver(USET_CASE_INSENSITIVE);
389 // Full case mapping map single characters to multiple characters.
390 // Those are represented as strings in the set. Remove them so that
391 // we end up with only simple and common case mappings.
392 set.removeAllStrings();
393 for (int i = 0; i < set.getRangeCount(); i++) {
394 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
395 zone);
396 }
397 // No errors and everything we collected have been ranges.
398 CharacterRange::Canonicalize(ranges);
399 #endif // V8_INTL_SUPPORT
400 }
401
402 } // namespace
403
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)404 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
405 RegExpNode* on_success) {
406 set_.Canonicalize();
407 Zone* zone = compiler->zone();
408 ZoneList<CharacterRange>* ranges = this->ranges(zone);
409 if (NeedsUnicodeCaseEquivalents(flags_)) {
410 AddUnicodeCaseEquivalents(ranges, zone);
411 }
412 if (IsUnicode(flags_) && !compiler->one_byte() &&
413 !contains_split_surrogate()) {
414 if (is_negated()) {
415 ZoneList<CharacterRange>* negated =
416 zone->New<ZoneList<CharacterRange>>(2, zone);
417 CharacterRange::Negate(ranges, negated, zone);
418 ranges = negated;
419 }
420 if (ranges->length() == 0) {
421 JSRegExp::Flags default_flags;
422 RegExpCharacterClass* fail =
423 zone->New<RegExpCharacterClass>(zone, ranges, default_flags);
424 return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
425 }
426 if (standard_type() == '*') {
427 return UnanchoredAdvance(compiler, on_success);
428 } else {
429 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
430 UnicodeRangeSplitter splitter(ranges);
431 AddBmpCharacters(compiler, result, on_success, &splitter, flags_);
432 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter, flags_);
433 AddLoneLeadSurrogates(compiler, result, on_success, &splitter, flags_);
434 AddLoneTrailSurrogates(compiler, result, on_success, &splitter, flags_);
435 static constexpr int kMaxRangesToInline = 32; // Arbitrary.
436 if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
437 return result;
438 }
439 } else {
440 return zone->New<TextNode>(this, compiler->read_backward(), on_success);
441 }
442 }
443
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)444 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
445 RegExpAtom* atom1 = (*a)->AsAtom();
446 RegExpAtom* atom2 = (*b)->AsAtom();
447 uc16 character1 = atom1->data().at(0);
448 uc16 character2 = atom2->data().at(0);
449 if (character1 < character2) return -1;
450 if (character1 > character2) return 1;
451 return 0;
452 }
453
454 #ifdef V8_INTL_SUPPORT
455
456 // Case Insensitve comparesion
CompareFirstCharCaseInsensitve(RegExpTree * const * a,RegExpTree * const * b)457 int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
458 RegExpAtom* atom1 = (*a)->AsAtom();
459 RegExpAtom* atom2 = (*b)->AsAtom();
460 icu::UnicodeString character1(atom1->data().at(0));
461 return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
462 }
463
464 #else
465
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)466 static unibrow::uchar Canonical(
467 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
468 unibrow::uchar c) {
469 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
470 int length = canonicalize->get(c, '\0', chars);
471 DCHECK_LE(length, 1);
472 unibrow::uchar canonical = c;
473 if (length == 1) canonical = chars[0];
474 return canonical;
475 }
476
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)477 int CompareFirstCharCaseIndependent(
478 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
479 RegExpTree* const* a, RegExpTree* const* b) {
480 RegExpAtom* atom1 = (*a)->AsAtom();
481 RegExpAtom* atom2 = (*b)->AsAtom();
482 unibrow::uchar character1 = atom1->data().at(0);
483 unibrow::uchar character2 = atom2->data().at(0);
484 if (character1 == character2) return 0;
485 if (character1 >= 'a' || character2 >= 'a') {
486 character1 = Canonical(canonicalize, character1);
487 character2 = Canonical(canonicalize, character2);
488 }
489 return static_cast<int>(character1) - static_cast<int>(character2);
490 }
491 #endif // V8_INTL_SUPPORT
492
493 // We can stable sort runs of atoms, since the order does not matter if they
494 // start with different characters.
495 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)496 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
497 ZoneList<RegExpTree*>* alternatives = this->alternatives();
498 int length = alternatives->length();
499 bool found_consecutive_atoms = false;
500 for (int i = 0; i < length; i++) {
501 while (i < length) {
502 RegExpTree* alternative = alternatives->at(i);
503 if (alternative->IsAtom()) break;
504 i++;
505 }
506 // i is length or it is the index of an atom.
507 if (i == length) break;
508 int first_atom = i;
509 JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
510 i++;
511 while (i < length) {
512 RegExpTree* alternative = alternatives->at(i);
513 if (!alternative->IsAtom()) break;
514 if (alternative->AsAtom()->flags() != flags) break;
515 i++;
516 }
517 // Sort atoms to get ones with common prefixes together.
518 // This step is more tricky if we are in a case-independent regexp,
519 // because it would change /is|I/ to /I|is/, and order matters when
520 // the regexp parts don't match only disjoint starting points. To fix
521 // this we have a version of CompareFirstChar that uses case-
522 // independent character classes for comparison.
523 DCHECK_LT(first_atom, alternatives->length());
524 DCHECK_LE(i, alternatives->length());
525 DCHECK_LE(first_atom, i);
526 if (IgnoreCase(flags)) {
527 #ifdef V8_INTL_SUPPORT
528 alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
529 i - first_atom);
530 #else
531 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
532 compiler->isolate()->regexp_macro_assembler_canonicalize();
533 auto compare_closure = [canonicalize](RegExpTree* const* a,
534 RegExpTree* const* b) {
535 return CompareFirstCharCaseIndependent(canonicalize, a, b);
536 };
537 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
538 #endif // V8_INTL_SUPPORT
539 } else {
540 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
541 }
542 if (i - first_atom > 1) found_consecutive_atoms = true;
543 }
544 return found_consecutive_atoms;
545 }
546
547 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)548 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
549 Zone* zone = compiler->zone();
550 ZoneList<RegExpTree*>* alternatives = this->alternatives();
551 int length = alternatives->length();
552
553 int write_posn = 0;
554 int i = 0;
555 while (i < length) {
556 RegExpTree* alternative = alternatives->at(i);
557 if (!alternative->IsAtom()) {
558 alternatives->at(write_posn++) = alternatives->at(i);
559 i++;
560 continue;
561 }
562 RegExpAtom* const atom = alternative->AsAtom();
563 JSRegExp::Flags flags = atom->flags();
564 #ifdef V8_INTL_SUPPORT
565 icu::UnicodeString common_prefix(atom->data().at(0));
566 #else
567 unibrow::uchar common_prefix = atom->data().at(0);
568 #endif // V8_INTL_SUPPORT
569 int first_with_prefix = i;
570 int prefix_length = atom->length();
571 i++;
572 while (i < length) {
573 alternative = alternatives->at(i);
574 if (!alternative->IsAtom()) break;
575 RegExpAtom* const atom = alternative->AsAtom();
576 if (atom->flags() != flags) break;
577 #ifdef V8_INTL_SUPPORT
578 icu::UnicodeString new_prefix(atom->data().at(0));
579 if (new_prefix != common_prefix) {
580 if (!IgnoreCase(flags)) break;
581 if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
582 break;
583 }
584 #else
585 unibrow::uchar new_prefix = atom->data().at(0);
586 if (new_prefix != common_prefix) {
587 if (!IgnoreCase(flags)) break;
588 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
589 compiler->isolate()->regexp_macro_assembler_canonicalize();
590 new_prefix = Canonical(canonicalize, new_prefix);
591 common_prefix = Canonical(canonicalize, common_prefix);
592 if (new_prefix != common_prefix) break;
593 }
594 #endif // V8_INTL_SUPPORT
595 prefix_length = std::min(prefix_length, atom->length());
596 i++;
597 }
598 if (i > first_with_prefix + 2) {
599 // Found worthwhile run of alternatives with common prefix of at least one
600 // character. The sorting function above did not sort on more than one
601 // character for reasons of correctness, but there may still be a longer
602 // common prefix if the terms were similar or presorted in the input.
603 // Find out how long the common prefix is.
604 int run_length = i - first_with_prefix;
605 RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
606 for (int j = 1; j < run_length && prefix_length > 1; j++) {
607 RegExpAtom* old_atom =
608 alternatives->at(j + first_with_prefix)->AsAtom();
609 for (int k = 1; k < prefix_length; k++) {
610 if (atom->data().at(k) != old_atom->data().at(k)) {
611 prefix_length = k;
612 break;
613 }
614 }
615 }
616 RegExpAtom* prefix = zone->New<RegExpAtom>(
617 atom->data().SubVector(0, prefix_length), flags);
618 ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
619 pair->Add(prefix, zone);
620 ZoneList<RegExpTree*>* suffixes =
621 zone->New<ZoneList<RegExpTree*>>(run_length, zone);
622 for (int j = 0; j < run_length; j++) {
623 RegExpAtom* old_atom =
624 alternatives->at(j + first_with_prefix)->AsAtom();
625 int len = old_atom->length();
626 if (len == prefix_length) {
627 suffixes->Add(zone->New<RegExpEmpty>(), zone);
628 } else {
629 RegExpTree* suffix = zone->New<RegExpAtom>(
630 old_atom->data().SubVector(prefix_length, old_atom->length()),
631 flags);
632 suffixes->Add(suffix, zone);
633 }
634 }
635 pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
636 alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
637 } else {
638 // Just copy any non-worthwhile alternatives.
639 for (int j = first_with_prefix; j < i; j++) {
640 alternatives->at(write_posn++) = alternatives->at(j);
641 }
642 }
643 }
644 alternatives->Rewind(write_posn); // Trim end of array.
645 }
646
647 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)648 void RegExpDisjunction::FixSingleCharacterDisjunctions(
649 RegExpCompiler* compiler) {
650 Zone* zone = compiler->zone();
651 ZoneList<RegExpTree*>* alternatives = this->alternatives();
652 int length = alternatives->length();
653
654 int write_posn = 0;
655 int i = 0;
656 while (i < length) {
657 RegExpTree* alternative = alternatives->at(i);
658 if (!alternative->IsAtom()) {
659 alternatives->at(write_posn++) = alternatives->at(i);
660 i++;
661 continue;
662 }
663 RegExpAtom* const atom = alternative->AsAtom();
664 if (atom->length() != 1) {
665 alternatives->at(write_posn++) = alternatives->at(i);
666 i++;
667 continue;
668 }
669 JSRegExp::Flags flags = atom->flags();
670 DCHECK_IMPLIES(IsUnicode(flags),
671 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
672 bool contains_trail_surrogate =
673 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
674 int first_in_run = i;
675 i++;
676 // Find a run of single-character atom alternatives that have identical
677 // flags (case independence and unicode-ness).
678 while (i < length) {
679 alternative = alternatives->at(i);
680 if (!alternative->IsAtom()) break;
681 RegExpAtom* const atom = alternative->AsAtom();
682 if (atom->length() != 1) break;
683 if (atom->flags() != flags) break;
684 DCHECK_IMPLIES(IsUnicode(flags),
685 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
686 contains_trail_surrogate |=
687 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
688 i++;
689 }
690 if (i > first_in_run + 1) {
691 // Found non-trivial run of single-character alternatives.
692 int run_length = i - first_in_run;
693 ZoneList<CharacterRange>* ranges =
694 zone->New<ZoneList<CharacterRange>>(2, zone);
695 for (int j = 0; j < run_length; j++) {
696 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
697 DCHECK_EQ(old_atom->length(), 1);
698 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
699 }
700 RegExpCharacterClass::CharacterClassFlags character_class_flags;
701 if (IsUnicode(flags) && contains_trail_surrogate) {
702 character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
703 }
704 alternatives->at(write_posn++) = zone->New<RegExpCharacterClass>(
705 zone, ranges, flags, character_class_flags);
706 } else {
707 // Just copy any trivial alternatives.
708 for (int j = first_in_run; j < i; j++) {
709 alternatives->at(write_posn++) = alternatives->at(j);
710 }
711 }
712 }
713 alternatives->Rewind(write_posn); // Trim end of array.
714 }
715
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)716 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
717 RegExpNode* on_success) {
718 ZoneList<RegExpTree*>* alternatives = this->alternatives();
719
720 if (alternatives->length() > 2) {
721 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
722 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
723 FixSingleCharacterDisjunctions(compiler);
724 if (alternatives->length() == 1) {
725 return alternatives->at(0)->ToNode(compiler, on_success);
726 }
727 }
728
729 int length = alternatives->length();
730
731 ChoiceNode* result =
732 compiler->zone()->New<ChoiceNode>(length, compiler->zone());
733 for (int i = 0; i < length; i++) {
734 GuardedAlternative alternative(
735 alternatives->at(i)->ToNode(compiler, on_success));
736 result->AddAlternative(alternative);
737 }
738 return result;
739 }
740
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)741 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
742 RegExpNode* on_success) {
743 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
744 }
745
746 namespace {
747 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
748 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
BoundaryAssertionAsLookaround(RegExpCompiler * compiler,RegExpNode * on_success,RegExpAssertion::AssertionType type,JSRegExp::Flags flags)749 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
750 RegExpNode* on_success,
751 RegExpAssertion::AssertionType type,
752 JSRegExp::Flags flags) {
753 DCHECK(NeedsUnicodeCaseEquivalents(flags));
754 Zone* zone = compiler->zone();
755 ZoneList<CharacterRange>* word_range =
756 zone->New<ZoneList<CharacterRange>>(2, zone);
757 CharacterRange::AddClassEscape('w', word_range, true, zone);
758 int stack_register = compiler->UnicodeLookaroundStackRegister();
759 int position_register = compiler->UnicodeLookaroundPositionRegister();
760 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
761 // Add two choices. The (non-)boundary could start with a word or
762 // a non-word-character.
763 for (int i = 0; i < 2; i++) {
764 bool lookbehind_for_word = i == 0;
765 bool lookahead_for_word =
766 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
767 // Look to the left.
768 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
769 stack_register, position_register);
770 RegExpNode* backward = TextNode::CreateForCharacterRanges(
771 zone, word_range, true, lookbehind.on_match_success(), flags);
772 // Look to the right.
773 RegExpLookaround::Builder lookahead(lookahead_for_word,
774 lookbehind.ForMatch(backward),
775 stack_register, position_register);
776 RegExpNode* forward = TextNode::CreateForCharacterRanges(
777 zone, word_range, false, lookahead.on_match_success(), flags);
778 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
779 }
780 return result;
781 }
782 } // anonymous namespace
783
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)784 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
785 RegExpNode* on_success) {
786 NodeInfo info;
787 Zone* zone = compiler->zone();
788
789 switch (assertion_type()) {
790 case START_OF_LINE:
791 return AssertionNode::AfterNewline(on_success);
792 case START_OF_INPUT:
793 return AssertionNode::AtStart(on_success);
794 case BOUNDARY:
795 return NeedsUnicodeCaseEquivalents(flags_)
796 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
797 flags_)
798 : AssertionNode::AtBoundary(on_success);
799 case NON_BOUNDARY:
800 return NeedsUnicodeCaseEquivalents(flags_)
801 ? BoundaryAssertionAsLookaround(compiler, on_success,
802 NON_BOUNDARY, flags_)
803 : AssertionNode::AtNonBoundary(on_success);
804 case END_OF_INPUT:
805 return AssertionNode::AtEnd(on_success);
806 case END_OF_LINE: {
807 // Compile $ in multiline regexps as an alternation with a positive
808 // lookahead in one side and an end-of-input on the other side.
809 // We need two registers for the lookahead.
810 int stack_pointer_register = compiler->AllocateRegister();
811 int position_register = compiler->AllocateRegister();
812 // The ChoiceNode to distinguish between a newline and end-of-input.
813 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
814 // Create a newline atom.
815 ZoneList<CharacterRange>* newline_ranges =
816 zone->New<ZoneList<CharacterRange>>(3, zone);
817 CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
818 JSRegExp::Flags default_flags = JSRegExp::Flags();
819 RegExpCharacterClass* newline_atom =
820 zone->New<RegExpCharacterClass>('n', default_flags);
821 TextNode* newline_matcher =
822 zone->New<TextNode>(newline_atom, false,
823 ActionNode::PositiveSubmatchSuccess(
824 stack_pointer_register, position_register,
825 0, // No captures inside.
826 -1, // Ignored if no captures.
827 on_success));
828 // Create an end-of-input matcher.
829 RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
830 stack_pointer_register, position_register, newline_matcher);
831 // Add the two alternatives to the ChoiceNode.
832 GuardedAlternative eol_alternative(end_of_line);
833 result->AddAlternative(eol_alternative);
834 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
835 result->AddAlternative(end_alternative);
836 return result;
837 }
838 default:
839 UNREACHABLE();
840 }
841 return on_success;
842 }
843
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)844 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
845 RegExpNode* on_success) {
846 return compiler->zone()->New<BackReferenceNode>(
847 RegExpCapture::StartRegister(index()),
848 RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
849 on_success);
850 }
851
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)852 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
853 RegExpNode* on_success) {
854 return on_success;
855 }
856
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)857 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
858 int stack_pointer_register,
859 int position_register,
860 int capture_register_count,
861 int capture_register_start)
862 : is_positive_(is_positive),
863 on_success_(on_success),
864 stack_pointer_register_(stack_pointer_register),
865 position_register_(position_register) {
866 if (is_positive_) {
867 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
868 stack_pointer_register, position_register, capture_register_count,
869 capture_register_start, on_success_);
870 } else {
871 Zone* zone = on_success_->zone();
872 on_match_success_ = zone->New<NegativeSubmatchSuccess>(
873 stack_pointer_register, position_register, capture_register_count,
874 capture_register_start, zone);
875 }
876 }
877
ForMatch(RegExpNode * match)878 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
879 if (is_positive_) {
880 return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
881 position_register_, match);
882 } else {
883 Zone* zone = on_success_->zone();
884 // We use a ChoiceNode to represent the negative lookaround. The first
885 // alternative is the negative match. On success, the end node backtracks.
886 // On failure, the second alternative is tried and leads to success.
887 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
888 // first exit when calculating quick checks.
889 ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
890 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
891 return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
892 position_register_, choice_node);
893 }
894 }
895
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)896 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
897 RegExpNode* on_success) {
898 int stack_pointer_register = compiler->AllocateRegister();
899 int position_register = compiler->AllocateRegister();
900
901 const int registers_per_capture = 2;
902 const int register_of_first_capture = 2;
903 int register_count = capture_count_ * registers_per_capture;
904 int register_start =
905 register_of_first_capture + capture_from_ * registers_per_capture;
906
907 RegExpNode* result;
908 bool was_reading_backward = compiler->read_backward();
909 compiler->set_read_backward(type() == LOOKBEHIND);
910 Builder builder(is_positive(), on_success, stack_pointer_register,
911 position_register, register_count, register_start);
912 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
913 result = builder.ForMatch(match);
914 compiler->set_read_backward(was_reading_backward);
915 return result;
916 }
917
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)918 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
919 RegExpNode* on_success) {
920 return ToNode(body(), index(), compiler, on_success);
921 }
922
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)923 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
924 RegExpCompiler* compiler,
925 RegExpNode* on_success) {
926 DCHECK_NOT_NULL(body);
927 int start_reg = RegExpCapture::StartRegister(index);
928 int end_reg = RegExpCapture::EndRegister(index);
929 if (compiler->read_backward()) std::swap(start_reg, end_reg);
930 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
931 RegExpNode* body_node = body->ToNode(compiler, store_end);
932 return ActionNode::StorePosition(start_reg, true, body_node);
933 }
934
935 namespace {
936
937 class AssertionSequenceRewriter final {
938 public:
939 // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
940 // instead of sprinkling rewrites into the AST->Node conversion process.
MaybeRewrite(ZoneList<RegExpTree * > * terms,Zone * zone)941 static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
942 AssertionSequenceRewriter rewriter(terms, zone);
943
944 static constexpr int kNoIndex = -1;
945 int from = kNoIndex;
946
947 for (int i = 0; i < terms->length(); i++) {
948 RegExpTree* t = terms->at(i);
949 if (from == kNoIndex && t->IsAssertion()) {
950 from = i; // Start a sequence.
951 } else if (from != kNoIndex && !t->IsAssertion()) {
952 // Terminate and process the sequence.
953 if (i - from > 1) rewriter.Rewrite(from, i);
954 from = kNoIndex;
955 }
956 }
957
958 if (from != kNoIndex && terms->length() - from > 1) {
959 rewriter.Rewrite(from, terms->length());
960 }
961 }
962
963 // All assertions are zero width. A consecutive sequence of assertions is
964 // order-independent. There's two ways we can optimize here:
965 // 1. fold all identical assertions.
966 // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
967 // sequence fails.
Rewrite(int from,int to)968 void Rewrite(int from, int to) {
969 DCHECK_GT(to, from + 1);
970
971 // Bitfield of all seen assertions.
972 uint32_t seen_assertions = 0;
973 STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
974
975 // Flags must match for folding.
976 JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
977 bool saw_mismatched_flags = false;
978
979 for (int i = from; i < to; i++) {
980 RegExpAssertion* t = terms_->at(i)->AsAssertion();
981 if (t->flags() != flags) saw_mismatched_flags = true;
982 const uint32_t bit = 1 << t->assertion_type();
983
984 if ((seen_assertions & bit) && !saw_mismatched_flags) {
985 // Fold duplicates.
986 terms_->Set(i, zone_->New<RegExpEmpty>());
987 }
988
989 seen_assertions |= bit;
990 }
991
992 // Collapse failures.
993 const uint32_t always_fails_mask =
994 1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
995 if ((seen_assertions & always_fails_mask) == always_fails_mask) {
996 ReplaceSequenceWithFailure(from, to);
997 }
998 }
999
ReplaceSequenceWithFailure(int from,int to)1000 void ReplaceSequenceWithFailure(int from, int to) {
1001 // Replace the entire sequence with a single node that always fails.
1002 // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1003 // negated '*' (everything) range serves the purpose.
1004 ZoneList<CharacterRange>* ranges =
1005 zone_->New<ZoneList<CharacterRange>>(0, zone_);
1006 RegExpCharacterClass* cc =
1007 zone_->New<RegExpCharacterClass>(zone_, ranges, JSRegExp::Flags());
1008 terms_->Set(from, cc);
1009
1010 // Zero out the rest.
1011 RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1012 for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1013 }
1014
1015 private:
AssertionSequenceRewriter(ZoneList<RegExpTree * > * terms,Zone * zone)1016 AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1017 : zone_(zone), terms_(terms) {}
1018
1019 Zone* zone_;
1020 ZoneList<RegExpTree*>* terms_;
1021 };
1022
1023 } // namespace
1024
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)1025 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1026 RegExpNode* on_success) {
1027 ZoneList<RegExpTree*>* children = nodes();
1028
1029 AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1030
1031 RegExpNode* current = on_success;
1032 if (compiler->read_backward()) {
1033 for (int i = 0; i < children->length(); i++) {
1034 current = children->at(i)->ToNode(compiler, current);
1035 }
1036 } else {
1037 for (int i = children->length() - 1; i >= 0; i--) {
1038 current = children->at(i)->ToNode(compiler, current);
1039 }
1040 }
1041 return current;
1042 }
1043
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1044 static void AddClass(const int* elmv, int elmc,
1045 ZoneList<CharacterRange>* ranges, Zone* zone) {
1046 elmc--;
1047 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1048 for (int i = 0; i < elmc; i += 2) {
1049 DCHECK(elmv[i] < elmv[i + 1]);
1050 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1051 }
1052 }
1053
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)1054 static void AddClassNegated(const int* elmv, int elmc,
1055 ZoneList<CharacterRange>* ranges, Zone* zone) {
1056 elmc--;
1057 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1058 DCHECK_NE(0x0000, elmv[0]);
1059 DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
1060 uc16 last = 0x0000;
1061 for (int i = 0; i < elmc; i += 2) {
1062 DCHECK(last <= elmv[i] - 1);
1063 DCHECK(elmv[i] < elmv[i + 1]);
1064 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1065 last = elmv[i + 1];
1066 }
1067 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
1068 }
1069
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,bool add_unicode_case_equivalents,Zone * zone)1070 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1071 bool add_unicode_case_equivalents,
1072 Zone* zone) {
1073 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
1074 // See #sec-runtime-semantics-wordcharacters-abstract-operation
1075 // In case of unicode and ignore_case, we need to create the closure over
1076 // case equivalent characters before negating.
1077 ZoneList<CharacterRange>* new_ranges =
1078 zone->New<ZoneList<CharacterRange>>(2, zone);
1079 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1080 AddUnicodeCaseEquivalents(new_ranges, zone);
1081 if (type == 'W') {
1082 ZoneList<CharacterRange>* negated =
1083 zone->New<ZoneList<CharacterRange>>(2, zone);
1084 CharacterRange::Negate(new_ranges, negated, zone);
1085 new_ranges = negated;
1086 }
1087 ranges->AddAll(*new_ranges, zone);
1088 return;
1089 }
1090 AddClassEscape(type, ranges, zone);
1091 }
1092
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,Zone * zone)1093 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
1094 Zone* zone) {
1095 switch (type) {
1096 case 's':
1097 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1098 break;
1099 case 'S':
1100 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1101 break;
1102 case 'w':
1103 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1104 break;
1105 case 'W':
1106 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1107 break;
1108 case 'd':
1109 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1110 break;
1111 case 'D':
1112 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1113 break;
1114 case '.':
1115 AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1116 zone);
1117 break;
1118 // This is not a character range as defined by the spec but a
1119 // convenient shorthand for a character class that matches any
1120 // character.
1121 case '*':
1122 ranges->Add(CharacterRange::Everything(), zone);
1123 break;
1124 // This is the set of characters matched by the $ and ^ symbols
1125 // in multiline mode.
1126 case 'n':
1127 AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1128 break;
1129 default:
1130 UNREACHABLE();
1131 }
1132 }
1133
GetWordBounds()1134 Vector<const int> CharacterRange::GetWordBounds() {
1135 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
1136 }
1137
1138 // static
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)1139 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
1140 ZoneList<CharacterRange>* ranges,
1141 bool is_one_byte) {
1142 CharacterRange::Canonicalize(ranges);
1143 int range_count = ranges->length();
1144 #ifdef V8_INTL_SUPPORT
1145 icu::UnicodeSet others;
1146 for (int i = 0; i < range_count; i++) {
1147 CharacterRange range = ranges->at(i);
1148 uc32 from = range.from();
1149 if (from > String::kMaxUtf16CodeUnit) continue;
1150 uc32 to = std::min({range.to(), String::kMaxUtf16CodeUnitU});
1151 // Nothing to be done for surrogates.
1152 if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1153 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1154 if (from > String::kMaxOneByteCharCode) continue;
1155 if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
1156 }
1157 others.add(from, to);
1158 }
1159
1160 // Compute the set of additional characters that should be added,
1161 // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1162 // case-folding rules than Unicode, so some characters that are
1163 // added by closeOver do not match anything other than themselves in
1164 // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
1165 // same case-insensitive character as 's' or 'S' according to
1166 // Unicode, but does not match any other character in JS. To handle
1167 // this case, we add such characters to the IgnoreSet and filter
1168 // them out. We filter twice: once before calling closeOver (to
1169 // prevent 'ſ' from adding 's'), and once after calling closeOver
1170 // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
1171 // more information.
1172 icu::UnicodeSet already_added(others);
1173 others.removeAll(RegExpCaseFolding::IgnoreSet());
1174 others.closeOver(USET_CASE_INSENSITIVE);
1175 others.removeAll(RegExpCaseFolding::IgnoreSet());
1176 others.removeAll(already_added);
1177
1178 // Add others to the ranges
1179 for (int32_t i = 0; i < others.getRangeCount(); i++) {
1180 UChar32 from = others.getRangeStart(i);
1181 UChar32 to = others.getRangeEnd(i);
1182 if (from == to) {
1183 ranges->Add(CharacterRange::Singleton(from), zone);
1184 } else {
1185 ranges->Add(CharacterRange::Range(from, to), zone);
1186 }
1187 }
1188 #else
1189 for (int i = 0; i < range_count; i++) {
1190 CharacterRange range = ranges->at(i);
1191 uc32 bottom = range.from();
1192 if (bottom > String::kMaxUtf16CodeUnit) continue;
1193 uc32 top = std::min({range.to(), String::kMaxUtf16CodeUnitU});
1194 // Nothing to be done for surrogates.
1195 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1196 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1197 if (bottom > String::kMaxOneByteCharCode) continue;
1198 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
1199 }
1200 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1201 if (top == bottom) {
1202 // If this is a singleton we just expand the one character.
1203 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1204 for (int i = 0; i < length; i++) {
1205 uc32 chr = chars[i];
1206 if (chr != bottom) {
1207 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
1208 }
1209 }
1210 } else {
1211 // If this is a range we expand the characters block by block, expanding
1212 // contiguous subranges (blocks) one at a time. The approach is as
1213 // follows. For a given start character we look up the remainder of the
1214 // block that contains it (represented by the end point), for instance we
1215 // find 'z' if the character is 'c'. A block is characterized by the
1216 // property that all characters uncanonicalize in the same way, except
1217 // that each entry in the result is incremented by the distance from the
1218 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
1219 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
1220 // we've found the end point we look up its uncanonicalization and
1221 // produce a range for each element. For instance for [c-f] we look up
1222 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
1223 // it is not already contained in the input, so [c-f] will be skipped but
1224 // [C-F] will be added. If this range is not completely contained in a
1225 // block we do this for all the blocks covered by the range (handling
1226 // characters that is not in a block as a "singleton block").
1227 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1228 uc32 pos = bottom;
1229 while (pos <= top) {
1230 int length =
1231 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1232 uc32 block_end;
1233 if (length == 0) {
1234 block_end = pos;
1235 } else {
1236 DCHECK_EQ(1, length);
1237 block_end = equivalents[0];
1238 }
1239 int end = (block_end > top) ? top : block_end;
1240 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1241 equivalents);
1242 for (int i = 0; i < length; i++) {
1243 uc32 c = equivalents[i];
1244 uc32 range_from = c - (block_end - pos);
1245 uc32 range_to = c - (block_end - end);
1246 if (!(bottom <= range_from && range_to <= top)) {
1247 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1248 }
1249 }
1250 pos = end + 1;
1251 }
1252 }
1253 }
1254 #endif // V8_INTL_SUPPORT
1255 }
1256
IsCanonical(ZoneList<CharacterRange> * ranges)1257 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
1258 DCHECK_NOT_NULL(ranges);
1259 int n = ranges->length();
1260 if (n <= 1) return true;
1261 uc32 max = ranges->at(0).to();
1262 for (int i = 1; i < n; i++) {
1263 CharacterRange next_range = ranges->at(i);
1264 if (next_range.from() <= max + 1) return false;
1265 max = next_range.to();
1266 }
1267 return true;
1268 }
1269
ranges(Zone * zone)1270 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
1271 if (ranges_ == nullptr) {
1272 ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1273 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
1274 }
1275 return ranges_;
1276 }
1277
1278 // Move a number of elements in a zonelist to another position
1279 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)1280 static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to,
1281 int count) {
1282 // Ranges are potentially overlapping.
1283 if (from < to) {
1284 for (int i = count - 1; i >= 0; i--) {
1285 list->at(to + i) = list->at(from + i);
1286 }
1287 } else {
1288 for (int i = 0; i < count; i++) {
1289 list->at(to + i) = list->at(from + i);
1290 }
1291 }
1292 }
1293
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)1294 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1295 CharacterRange insert) {
1296 // Inserts a range into list[0..count[, which must be sorted
1297 // by from value and non-overlapping and non-adjacent, using at most
1298 // list[0..count] for the result. Returns the number of resulting
1299 // canonicalized ranges. Inserting a range may collapse existing ranges into
1300 // fewer ranges, so the return value can be anything in the range 1..count+1.
1301 uc32 from = insert.from();
1302 uc32 to = insert.to();
1303 int start_pos = 0;
1304 int end_pos = count;
1305 for (int i = count - 1; i >= 0; i--) {
1306 CharacterRange current = list->at(i);
1307 if (current.from() > to + 1) {
1308 end_pos = i;
1309 } else if (current.to() + 1 < from) {
1310 start_pos = i + 1;
1311 break;
1312 }
1313 }
1314
1315 // Inserted range overlaps, or is adjacent to, ranges at positions
1316 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1317 // not affected by the insertion.
1318 // If start_pos == end_pos, the range must be inserted before start_pos.
1319 // if start_pos < end_pos, the entire range from start_pos to end_pos
1320 // must be merged with the insert range.
1321
1322 if (start_pos == end_pos) {
1323 // Insert between existing ranges at position start_pos.
1324 if (start_pos < count) {
1325 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1326 }
1327 list->at(start_pos) = insert;
1328 return count + 1;
1329 }
1330 if (start_pos + 1 == end_pos) {
1331 // Replace single existing range at position start_pos.
1332 CharacterRange to_replace = list->at(start_pos);
1333 int new_from = std::min(to_replace.from(), from);
1334 int new_to = std::max(to_replace.to(), to);
1335 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1336 return count;
1337 }
1338 // Replace a number of existing ranges from start_pos to end_pos - 1.
1339 // Move the remaining ranges down.
1340
1341 int new_from = std::min(list->at(start_pos).from(), from);
1342 int new_to = std::max(list->at(end_pos - 1).to(), to);
1343 if (end_pos < count) {
1344 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1345 }
1346 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1347 return count - (end_pos - start_pos) + 1;
1348 }
1349
Canonicalize()1350 void CharacterSet::Canonicalize() {
1351 // Special/default classes are always considered canonical. The result
1352 // of calling ranges() will be sorted.
1353 if (ranges_ == nullptr) return;
1354 CharacterRange::Canonicalize(ranges_);
1355 }
1356
Canonicalize(ZoneList<CharacterRange> * character_ranges)1357 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
1358 if (character_ranges->length() <= 1) return;
1359 // Check whether ranges are already canonical (increasing, non-overlapping,
1360 // non-adjacent).
1361 int n = character_ranges->length();
1362 uc32 max = character_ranges->at(0).to();
1363 int i = 1;
1364 while (i < n) {
1365 CharacterRange current = character_ranges->at(i);
1366 if (current.from() <= max + 1) {
1367 break;
1368 }
1369 max = current.to();
1370 i++;
1371 }
1372 // Canonical until the i'th range. If that's all of them, we are done.
1373 if (i == n) return;
1374
1375 // The ranges at index i and forward are not canonicalized. Make them so by
1376 // doing the equivalent of insertion sort (inserting each into the previous
1377 // list, in order).
1378 // Notice that inserting a range can reduce the number of ranges in the
1379 // result due to combining of adjacent and overlapping ranges.
1380 int read = i; // Range to insert.
1381 int num_canonical = i; // Length of canonicalized part of list.
1382 do {
1383 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1384 character_ranges->at(read));
1385 read++;
1386 } while (read < n);
1387 character_ranges->Rewind(num_canonical);
1388
1389 DCHECK(CharacterRange::IsCanonical(character_ranges));
1390 }
1391
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)1392 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
1393 ZoneList<CharacterRange>* negated_ranges,
1394 Zone* zone) {
1395 DCHECK(CharacterRange::IsCanonical(ranges));
1396 DCHECK_EQ(0, negated_ranges->length());
1397 int range_count = ranges->length();
1398 uc32 from = 0;
1399 int i = 0;
1400 if (range_count > 0 && ranges->at(0).from() == 0) {
1401 from = ranges->at(0).to() + 1;
1402 i = 1;
1403 }
1404 while (i < range_count) {
1405 CharacterRange range = ranges->at(i);
1406 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1407 from = range.to() + 1;
1408 i++;
1409 }
1410 if (from < String::kMaxCodePoint) {
1411 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
1412 zone);
1413 }
1414 }
1415
1416 // Scoped object to keep track of how much we unroll quantifier loops in the
1417 // regexp graph generator.
1418 class RegExpExpansionLimiter {
1419 public:
1420 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)1421 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1422 : compiler_(compiler),
1423 saved_expansion_factor_(compiler->current_expansion_factor()),
1424 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
1425 DCHECK_LT(0, factor);
1426 if (ok_to_expand_) {
1427 if (factor > kMaxExpansionFactor) {
1428 // Avoid integer overflow of the current expansion factor.
1429 ok_to_expand_ = false;
1430 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1431 } else {
1432 int new_factor = saved_expansion_factor_ * factor;
1433 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1434 compiler->set_current_expansion_factor(new_factor);
1435 }
1436 }
1437 }
1438
~RegExpExpansionLimiter()1439 ~RegExpExpansionLimiter() {
1440 compiler_->set_current_expansion_factor(saved_expansion_factor_);
1441 }
1442
ok_to_expand()1443 bool ok_to_expand() { return ok_to_expand_; }
1444
1445 private:
1446 RegExpCompiler* compiler_;
1447 int saved_expansion_factor_;
1448 bool ok_to_expand_;
1449
1450 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1451 };
1452
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)1453 RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
1454 RegExpTree* body, RegExpCompiler* compiler,
1455 RegExpNode* on_success,
1456 bool not_at_start) {
1457 // x{f, t} becomes this:
1458 //
1459 // (r++)<-.
1460 // | `
1461 // | (x)
1462 // v ^
1463 // (r=0)-->(?)---/ [if r < t]
1464 // |
1465 // [if r >= f] \----> ...
1466 //
1467
1468 // 15.10.2.5 RepeatMatcher algorithm.
1469 // The parser has already eliminated the case where max is 0. In the case
1470 // where max_match is zero the parser has removed the quantifier if min was
1471 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
1472
1473 // If we know that we cannot match zero length then things are a little
1474 // simpler since we don't need to make the special zero length match check
1475 // from step 2.1. If the min and max are small we can unroll a little in
1476 // this case.
1477 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
1478 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
1479 if (max == 0) return on_success; // This can happen due to recursion.
1480 bool body_can_be_empty = (body->min_match() == 0);
1481 int body_start_reg = RegExpCompiler::kNoRegister;
1482 Interval capture_registers = body->CaptureRegisters();
1483 bool needs_capture_clearing = !capture_registers.is_empty();
1484 Zone* zone = compiler->zone();
1485
1486 if (body_can_be_empty) {
1487 body_start_reg = compiler->AllocateRegister();
1488 } else if (compiler->optimize() && !needs_capture_clearing) {
1489 // Only unroll if there are no captures and the body can't be
1490 // empty.
1491 {
1492 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
1493 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
1494 int new_max = (max == kInfinity) ? max : max - min;
1495 // Recurse once to get the loop or optional matches after the fixed
1496 // ones.
1497 RegExpNode* answer =
1498 ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
1499 // Unroll the forced matches from 0 to min. This can cause chains of
1500 // TextNodes (which the parser does not generate). These should be
1501 // combined if it turns out they hinder good code generation.
1502 for (int i = 0; i < min; i++) {
1503 answer = body->ToNode(compiler, answer);
1504 }
1505 return answer;
1506 }
1507 }
1508 if (max <= kMaxUnrolledMaxMatches && min == 0) {
1509 DCHECK_LT(0, max); // Due to the 'if' above.
1510 RegExpExpansionLimiter limiter(compiler, max);
1511 if (limiter.ok_to_expand()) {
1512 // Unroll the optional matches up to max.
1513 RegExpNode* answer = on_success;
1514 for (int i = 0; i < max; i++) {
1515 ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
1516 if (is_greedy) {
1517 alternation->AddAlternative(
1518 GuardedAlternative(body->ToNode(compiler, answer)));
1519 alternation->AddAlternative(GuardedAlternative(on_success));
1520 } else {
1521 alternation->AddAlternative(GuardedAlternative(on_success));
1522 alternation->AddAlternative(
1523 GuardedAlternative(body->ToNode(compiler, answer)));
1524 }
1525 answer = alternation;
1526 if (not_at_start && !compiler->read_backward()) {
1527 alternation->set_not_at_start();
1528 }
1529 }
1530 return answer;
1531 }
1532 }
1533 }
1534 bool has_min = min > 0;
1535 bool has_max = max < RegExpTree::kInfinity;
1536 bool needs_counter = has_min || has_max;
1537 int reg_ctr = needs_counter ? compiler->AllocateRegister()
1538 : RegExpCompiler::kNoRegister;
1539 LoopChoiceNode* center = zone->New<LoopChoiceNode>(
1540 body->min_match() == 0, compiler->read_backward(), min, zone);
1541 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
1542 RegExpNode* loop_return =
1543 needs_counter ? static_cast<RegExpNode*>(
1544 ActionNode::IncrementRegister(reg_ctr, center))
1545 : static_cast<RegExpNode*>(center);
1546 if (body_can_be_empty) {
1547 // If the body can be empty we need to check if it was and then
1548 // backtrack.
1549 loop_return =
1550 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
1551 }
1552 RegExpNode* body_node = body->ToNode(compiler, loop_return);
1553 if (body_can_be_empty) {
1554 // If the body can be empty we need to store the start position
1555 // so we can bail out if it was empty.
1556 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
1557 }
1558 if (needs_capture_clearing) {
1559 // Before entering the body of this loop we need to clear captures.
1560 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
1561 }
1562 GuardedAlternative body_alt(body_node);
1563 if (has_max) {
1564 Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
1565 body_alt.AddGuard(body_guard, zone);
1566 }
1567 GuardedAlternative rest_alt(on_success);
1568 if (has_min) {
1569 Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
1570 rest_alt.AddGuard(rest_guard, zone);
1571 }
1572 if (is_greedy) {
1573 center->AddLoopAlternative(body_alt);
1574 center->AddContinueAlternative(rest_alt);
1575 } else {
1576 center->AddContinueAlternative(rest_alt);
1577 center->AddLoopAlternative(body_alt);
1578 }
1579 if (needs_counter) {
1580 return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
1581 } else {
1582 return center;
1583 }
1584 }
1585
1586 } // namespace internal
1587 } // namespace v8
1588