1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "new-regexp/regexp-parser.h"
6 
7 #include <vector>
8 
9 #include "new-regexp/property-sequences.h"
10 #include "new-regexp/regexp-macro-assembler.h"
11 #include "new-regexp/regexp.h"
12 
13 #ifdef V8_INTL_SUPPORT
14 #include "unicode/uniset.h"
15 #endif  // V8_INTL_SUPPORT
16 
17 namespace v8 {
18 namespace internal {
19 
RegExpParser(FlatStringReader * in,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)20 RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
21                            Isolate* isolate, Zone* zone)
22     : isolate_(isolate),
23       zone_(zone),
24       captures_(nullptr),
25       named_captures_(nullptr),
26       named_back_references_(nullptr),
27       in_(in),
28       current_(kEndMarker),
29       top_level_flags_(flags),
30       next_pos_(0),
31       captures_started_(0),
32       capture_count_(0),
33       has_more_(true),
34       simple_(false),
35       contains_anchor_(false),
36       is_scanned_for_captures_(false),
37       has_named_captures_(false),
38       failed_(false) {
39   Advance();
40 }
41 
42 template <bool update_position>
ReadNext()43 inline uc32 RegExpParser::ReadNext() {
44   int position = next_pos_;
45   uc32 c0 = in()->Get(position);
46   position++;
47   // Read the whole surrogate pair in case of unicode flag, if possible.
48   if (unicode() && position < in()->length() &&
49       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
50     uc16 c1 = in()->Get(position);
51     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
52       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
53       position++;
54     }
55   }
56   if (update_position) next_pos_ = position;
57   return c0;
58 }
59 
60 
Next()61 uc32 RegExpParser::Next() {
62   if (has_next()) {
63     return ReadNext<false>();
64   } else {
65     return kEndMarker;
66   }
67 }
68 
Advance()69 void RegExpParser::Advance() {
70   if (has_next()) {
71     StackLimitCheck check(isolate());
72     if (check.HasOverflowed()) {
73       if (FLAG_correctness_fuzzer_suppressions) {
74         FATAL("Aborting on stack overflow");
75       }
76       ReportError(RegExpError::kStackOverflow);
77     } else if (zone()->excess_allocation()) {
78       if (FLAG_correctness_fuzzer_suppressions) {
79         FATAL("Aborting on excess zone allocation");
80       }
81       ReportError(RegExpError::kTooLarge);
82     } else {
83       current_ = ReadNext<true>();
84     }
85   } else {
86     current_ = kEndMarker;
87     // Advance so that position() points to 1-after-the-last-character. This is
88     // important so that Reset() to this position works correctly.
89     next_pos_ = in()->length() + 1;
90     has_more_ = false;
91   }
92 }
93 
94 
Reset(int pos)95 void RegExpParser::Reset(int pos) {
96   next_pos_ = pos;
97   has_more_ = (pos < in()->length());
98   Advance();
99 }
100 
Advance(int dist)101 void RegExpParser::Advance(int dist) {
102   next_pos_ += dist - 1;
103   Advance();
104 }
105 
106 
simple()107 bool RegExpParser::simple() { return simple_; }
108 
IsSyntaxCharacterOrSlash(uc32 c)109 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
110   switch (c) {
111     case '^':
112     case '$':
113     case '\\':
114     case '.':
115     case '*':
116     case '+':
117     case '?':
118     case '(':
119     case ')':
120     case '[':
121     case ']':
122     case '{':
123     case '}':
124     case '|':
125     case '/':
126       return true;
127     default:
128       break;
129   }
130   return false;
131 }
132 
ReportError(RegExpError error)133 RegExpTree* RegExpParser::ReportError(RegExpError error) {
134   if (failed_) return nullptr;  // Do not overwrite any existing error.
135   failed_ = true;
136   error_ = error;
137   error_pos_ = position();
138   // Zip to the end to make sure no more input is read.
139   current_ = kEndMarker;
140   next_pos_ = in()->length();
141   return nullptr;
142 }
143 
144 #define CHECK_FAILED /**/);    \
145   if (failed_) return nullptr; \
146   ((void)0
147 
148 // Pattern ::
149 //   Disjunction
ParsePattern()150 RegExpTree* RegExpParser::ParsePattern() {
151   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
152   PatchNamedBackReferences(CHECK_FAILED);
153   DCHECK(!has_more());
154   // If the result of parsing is a literal string atom, and it has the
155   // same length as the input, then the atom is identical to the input.
156   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
157     simple_ = true;
158   }
159   return result;
160 }
161 
162 
163 // Disjunction ::
164 //   Alternative
165 //   Alternative | Disjunction
166 // Alternative ::
167 //   [empty]
168 //   Term Alternative
169 // Term ::
170 //   Assertion
171 //   Atom
172 //   Atom Quantifier
ParseDisjunction()173 RegExpTree* RegExpParser::ParseDisjunction() {
174   // Used to store current state while parsing subexpressions.
175   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
176                                   0, nullptr, top_level_flags_, zone());
177   RegExpParserState* state = &initial_state;
178   // Cache the builder in a local variable for quick access.
179   RegExpBuilder* builder = initial_state.builder();
180   while (true) {
181     switch (current()) {
182       case kEndMarker:
183         if (state->IsSubexpression()) {
184           // Inside a parenthesized group when hitting end of input.
185           return ReportError(RegExpError::kUnterminatedGroup);
186         }
187         DCHECK_EQ(INITIAL, state->group_type());
188         // Parsing completed successfully.
189         return builder->ToRegExp();
190       case ')': {
191         if (!state->IsSubexpression()) {
192           return ReportError(RegExpError::kUnmatchedParen);
193         }
194         DCHECK_NE(INITIAL, state->group_type());
195 
196         Advance();
197         // End disjunction parsing and convert builder content to new single
198         // regexp atom.
199         RegExpTree* body = builder->ToRegExp();
200 
201         int end_capture_index = captures_started();
202 
203         int capture_index = state->capture_index();
204         SubexpressionType group_type = state->group_type();
205 
206         // Build result of subexpression.
207         if (group_type == CAPTURE) {
208           if (state->IsNamedCapture()) {
209             CreateNamedCaptureAtIndex(state->capture_name(),
210                                       capture_index CHECK_FAILED);
211           }
212           RegExpCapture* capture = GetCapture(capture_index);
213           capture->set_body(body);
214           body = capture;
215         } else if (group_type == GROUPING) {
216           body = new (zone()) RegExpGroup(body);
217         } else {
218           DCHECK(group_type == POSITIVE_LOOKAROUND ||
219                  group_type == NEGATIVE_LOOKAROUND);
220           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
221           body = new (zone()) RegExpLookaround(
222               body, is_positive, end_capture_index - capture_index,
223               capture_index, state->lookaround_type());
224         }
225 
226         // Restore previous state.
227         state = state->previous_state();
228         builder = state->builder();
229 
230         builder->AddAtom(body);
231         // For compatibility with JSC and ES3, we allow quantifiers after
232         // lookaheads, and break in all cases.
233         break;
234       }
235       case '|': {
236         Advance();
237         builder->NewAlternative();
238         continue;
239       }
240       case '*':
241       case '+':
242       case '?':
243         return ReportError(RegExpError::kNothingToRepeat);
244       case '^': {
245         Advance();
246         if (builder->multiline()) {
247           builder->AddAssertion(new (zone()) RegExpAssertion(
248               RegExpAssertion::START_OF_LINE, builder->flags()));
249         } else {
250           builder->AddAssertion(new (zone()) RegExpAssertion(
251               RegExpAssertion::START_OF_INPUT, builder->flags()));
252           set_contains_anchor();
253         }
254         continue;
255       }
256       case '$': {
257         Advance();
258         RegExpAssertion::AssertionType assertion_type =
259             builder->multiline() ? RegExpAssertion::END_OF_LINE
260                                  : RegExpAssertion::END_OF_INPUT;
261         builder->AddAssertion(
262             new (zone()) RegExpAssertion(assertion_type, builder->flags()));
263         continue;
264       }
265       case '.': {
266         Advance();
267         ZoneList<CharacterRange>* ranges =
268             new (zone()) ZoneList<CharacterRange>(2, zone());
269 
270         if (builder->dotall()) {
271           // Everything.
272           CharacterRange::AddClassEscape('*', ranges, false, zone());
273         } else {
274           // Everything except \x0A, \x0D, \u2028 and \u2029
275           CharacterRange::AddClassEscape('.', ranges, false, zone());
276         }
277 
278         RegExpCharacterClass* cc =
279             new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
280         builder->AddCharacterClass(cc);
281         break;
282       }
283       case '(': {
284         state = ParseOpenParenthesis(state CHECK_FAILED);
285         builder = state->builder();
286         continue;
287       }
288       case '[': {
289         RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
290         builder->AddCharacterClass(cc->AsCharacterClass());
291         break;
292       }
293       // Atom ::
294       //   \ AtomEscape
295       case '\\':
296         switch (Next()) {
297           case kEndMarker:
298             return ReportError(RegExpError::kEscapeAtEndOfPattern);
299           case 'b':
300             Advance(2);
301             builder->AddAssertion(new (zone()) RegExpAssertion(
302                 RegExpAssertion::BOUNDARY, builder->flags()));
303             continue;
304           case 'B':
305             Advance(2);
306             builder->AddAssertion(new (zone()) RegExpAssertion(
307                 RegExpAssertion::NON_BOUNDARY, builder->flags()));
308             continue;
309           // AtomEscape ::
310           //   CharacterClassEscape
311           //
312           // CharacterClassEscape :: one of
313           //   d D s S w W
314           case 'd':
315           case 'D':
316           case 's':
317           case 'S':
318           case 'w':
319           case 'W': {
320             uc32 c = Next();
321             Advance(2);
322             ZoneList<CharacterRange>* ranges =
323                 new (zone()) ZoneList<CharacterRange>(2, zone());
324             CharacterRange::AddClassEscape(
325                 c, ranges, unicode() && builder->ignore_case(), zone());
326             RegExpCharacterClass* cc = new (zone())
327                 RegExpCharacterClass(zone(), ranges, builder->flags());
328             builder->AddCharacterClass(cc);
329             break;
330           }
331           case 'p':
332           case 'P': {
333             uc32 p = Next();
334             Advance(2);
335             if (unicode()) {
336               ZoneList<CharacterRange>* ranges =
337                   new (zone()) ZoneList<CharacterRange>(2, zone());
338               ZoneVector<char> name_1(zone());
339               ZoneVector<char> name_2(zone());
340               if (ParsePropertyClassName(&name_1, &name_2)) {
341                 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
342                   RegExpCharacterClass* cc = new (zone())
343                       RegExpCharacterClass(zone(), ranges, builder->flags());
344                   builder->AddCharacterClass(cc);
345                   break;
346                 }
347                 if (p == 'p' && name_2.empty()) {
348                   RegExpTree* sequence = GetPropertySequence(name_1);
349                   if (sequence != nullptr) {
350                     builder->AddAtom(sequence);
351                     break;
352                   }
353                 }
354               }
355               return ReportError(RegExpError::kInvalidPropertyName);
356             } else {
357               builder->AddCharacter(p);
358             }
359             break;
360           }
361           case '1':
362           case '2':
363           case '3':
364           case '4':
365           case '5':
366           case '6':
367           case '7':
368           case '8':
369           case '9': {
370             int index = 0;
371             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
372             if (is_backref) {
373               if (state->IsInsideCaptureGroup(index)) {
374                 // The back reference is inside the capture group it refers to.
375                 // Nothing can possibly have been captured yet, so we use empty
376                 // instead. This ensures that, when checking a back reference,
377                 // the capture registers of the referenced capture are either
378                 // both set or both cleared.
379                 builder->AddEmpty();
380               } else {
381                 RegExpCapture* capture = GetCapture(index);
382                 RegExpTree* atom =
383                     new (zone()) RegExpBackReference(capture, builder->flags());
384                 builder->AddAtom(atom);
385               }
386               break;
387             }
388             // With /u, no identity escapes except for syntax characters
389             // are allowed. Otherwise, all identity escapes are allowed.
390             if (unicode()) {
391               return ReportError(RegExpError::kInvalidEscape);
392             }
393             uc32 first_digit = Next();
394             if (first_digit == '8' || first_digit == '9') {
395               builder->AddCharacter(first_digit);
396               Advance(2);
397               break;
398             }
399             V8_FALLTHROUGH;
400           }
401           case '0': {
402             Advance();
403             if (unicode() && Next() >= '0' && Next() <= '9') {
404               // With /u, decimal escape with leading 0 are not parsed as octal.
405               return ReportError(RegExpError::kInvalidDecimalEscape);
406             }
407             uc32 octal = ParseOctalLiteral();
408             builder->AddCharacter(octal);
409             break;
410           }
411           // ControlEscape :: one of
412           //   f n r t v
413           case 'f':
414             Advance(2);
415             builder->AddCharacter('\f');
416             break;
417           case 'n':
418             Advance(2);
419             builder->AddCharacter('\n');
420             break;
421           case 'r':
422             Advance(2);
423             builder->AddCharacter('\r');
424             break;
425           case 't':
426             Advance(2);
427             builder->AddCharacter('\t');
428             break;
429           case 'v':
430             Advance(2);
431             builder->AddCharacter('\v');
432             break;
433           case 'c': {
434             Advance();
435             uc32 controlLetter = Next();
436             // Special case if it is an ASCII letter.
437             // Convert lower case letters to uppercase.
438             uc32 letter = controlLetter & ~('a' ^ 'A');
439             if (letter < 'A' || 'Z' < letter) {
440               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
441               // Read the backslash as a literal character instead of as
442               // starting an escape.
443               // ES#prod-annexB-ExtendedPatternCharacter
444               if (unicode()) {
445                 // With /u, invalid escapes are not treated as identity escapes.
446                 return ReportError(RegExpError::kInvalidUnicodeEscape);
447               }
448               builder->AddCharacter('\\');
449             } else {
450               Advance(2);
451               builder->AddCharacter(controlLetter & 0x1F);
452             }
453             break;
454           }
455           case 'x': {
456             Advance(2);
457             uc32 value;
458             if (ParseHexEscape(2, &value)) {
459               builder->AddCharacter(value);
460             } else if (!unicode()) {
461               builder->AddCharacter('x');
462             } else {
463               // With /u, invalid escapes are not treated as identity escapes.
464               return ReportError(RegExpError::kInvalidEscape);
465             }
466             break;
467           }
468           case 'u': {
469             Advance(2);
470             uc32 value;
471             if (ParseUnicodeEscape(&value)) {
472               builder->AddEscapedUnicodeCharacter(value);
473             } else if (!unicode()) {
474               builder->AddCharacter('u');
475             } else {
476               // With /u, invalid escapes are not treated as identity escapes.
477               return ReportError(RegExpError::kInvalidUnicodeEscape);
478             }
479             break;
480           }
481           case 'k':
482             // Either an identity escape or a named back-reference.  The two
483             // interpretations are mutually exclusive: '\k' is interpreted as
484             // an identity escape for non-Unicode patterns without named
485             // capture groups, and as the beginning of a named back-reference
486             // in all other cases.
487             if (unicode() || HasNamedCaptures()) {
488               Advance(2);
489               ParseNamedBackReference(builder, state CHECK_FAILED);
490               break;
491             }
492             V8_FALLTHROUGH;
493           default:
494             Advance();
495             // With /u, no identity escapes except for syntax characters
496             // are allowed. Otherwise, all identity escapes are allowed.
497             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
498               builder->AddCharacter(current());
499               Advance();
500             } else {
501               return ReportError(RegExpError::kInvalidEscape);
502             }
503             break;
504         }
505         break;
506       case '{': {
507         int dummy;
508         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
509         if (parsed) return ReportError(RegExpError::kNothingToRepeat);
510         V8_FALLTHROUGH;
511       }
512       case '}':
513       case ']':
514         if (unicode()) {
515           return ReportError(RegExpError::kLoneQuantifierBrackets);
516         }
517         V8_FALLTHROUGH;
518       default:
519         builder->AddUnicodeCharacter(current());
520         Advance();
521         break;
522     }  // end switch(current())
523 
524     int min;
525     int max;
526     switch (current()) {
527       // QuantifierPrefix ::
528       //   *
529       //   +
530       //   ?
531       //   {
532       case '*':
533         min = 0;
534         max = RegExpTree::kInfinity;
535         Advance();
536         break;
537       case '+':
538         min = 1;
539         max = RegExpTree::kInfinity;
540         Advance();
541         break;
542       case '?':
543         min = 0;
544         max = 1;
545         Advance();
546         break;
547       case '{':
548         if (ParseIntervalQuantifier(&min, &max)) {
549           if (max < min) {
550             return ReportError(RegExpError::kRangeOutOfOrder);
551           }
552           break;
553         } else if (unicode()) {
554           // With /u, incomplete quantifiers are not allowed.
555           return ReportError(RegExpError::kIncompleteQuantifier);
556         }
557         continue;
558       default:
559         continue;
560     }
561     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
562     if (current() == '?') {
563       quantifier_type = RegExpQuantifier::NON_GREEDY;
564       Advance();
565     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
566       // FLAG_regexp_possessive_quantifier is a debug-only flag.
567       quantifier_type = RegExpQuantifier::POSSESSIVE;
568       Advance();
569     }
570     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
571       return ReportError(RegExpError::kInvalidQuantifier);
572     }
573   }
574 }
575 
ParseOpenParenthesis(RegExpParserState * state)576 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
577     RegExpParserState* state) {
578   RegExpLookaround::Type lookaround_type = state->lookaround_type();
579   bool is_named_capture = false;
580   JSRegExp::Flags switch_on = JSRegExp::kNone;
581   JSRegExp::Flags switch_off = JSRegExp::kNone;
582   const ZoneVector<uc16>* capture_name = nullptr;
583   SubexpressionType subexpr_type = CAPTURE;
584   Advance();
585   if (current() == '?') {
586     switch (Next()) {
587       case ':':
588         Advance(2);
589         subexpr_type = GROUPING;
590         break;
591       case '=':
592         Advance(2);
593         lookaround_type = RegExpLookaround::LOOKAHEAD;
594         subexpr_type = POSITIVE_LOOKAROUND;
595         break;
596       case '!':
597         Advance(2);
598         lookaround_type = RegExpLookaround::LOOKAHEAD;
599         subexpr_type = NEGATIVE_LOOKAROUND;
600         break;
601       case '-':
602       case 'i':
603       case 's':
604       case 'm': {
605         if (!FLAG_regexp_mode_modifiers) {
606           ReportError(RegExpError::kInvalidGroup);
607           return nullptr;
608         }
609         Advance();
610         bool flags_sense = true;  // Switching on flags.
611         while (subexpr_type != GROUPING) {
612           switch (current()) {
613             case '-':
614               if (!flags_sense) {
615                 ReportError(RegExpError::kMultipleFlagDashes);
616                 return nullptr;
617               }
618               flags_sense = false;
619               Advance();
620               continue;
621             case 's':
622             case 'i':
623             case 'm': {
624               JSRegExp::Flags bit = JSRegExp::kUnicode;
625               if (current() == 'i') bit = JSRegExp::kIgnoreCase;
626               if (current() == 'm') bit = JSRegExp::kMultiline;
627               if (current() == 's') bit = JSRegExp::kDotAll;
628               if (((switch_on | switch_off) & bit) != 0) {
629                 ReportError(RegExpError::kRepeatedFlag);
630                 return nullptr;
631               }
632               if (flags_sense) {
633                 switch_on |= bit;
634               } else {
635                 switch_off |= bit;
636               }
637               Advance();
638               continue;
639             }
640             case ')': {
641               Advance();
642               state->builder()
643                   ->FlushText();  // Flush pending text using old flags.
644               // These (?i)-style flag switches don't put us in a subexpression
645               // at all, they just modify the flags in the rest of the current
646               // subexpression.
647               JSRegExp::Flags flags =
648                   (state->builder()->flags() | switch_on) & ~switch_off;
649               state->builder()->set_flags(flags);
650               return state;
651             }
652             case ':':
653               Advance();
654               subexpr_type = GROUPING;  // Will break us out of the outer loop.
655               continue;
656             default:
657               ReportError(RegExpError::kInvalidFlagGroup);
658               return nullptr;
659           }
660         }
661         break;
662       }
663       case '<':
664         Advance();
665         if (Next() == '=') {
666           Advance(2);
667           lookaround_type = RegExpLookaround::LOOKBEHIND;
668           subexpr_type = POSITIVE_LOOKAROUND;
669           break;
670         } else if (Next() == '!') {
671           Advance(2);
672           lookaround_type = RegExpLookaround::LOOKBEHIND;
673           subexpr_type = NEGATIVE_LOOKAROUND;
674           break;
675         }
676         is_named_capture = true;
677         has_named_captures_ = true;
678         Advance();
679         break;
680       default:
681         ReportError(RegExpError::kInvalidGroup);
682         return nullptr;
683     }
684   }
685   if (subexpr_type == CAPTURE) {
686     if (captures_started_ >= JSRegExp::kMaxCaptures) {
687       ReportError(RegExpError::kTooManyCaptures);
688       return nullptr;
689     }
690     captures_started_++;
691 
692     if (is_named_capture) {
693       capture_name = ParseCaptureGroupName(CHECK_FAILED);
694     }
695   }
696   JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
697   // Store current state and begin new disjunction parsing.
698   return new (zone())
699       RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
700                         capture_name, flags, zone());
701 }
702 
703 #ifdef DEBUG
704 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)705 static bool IsSpecialClassEscape(uc32 c) {
706   switch (c) {
707     case 'd':
708     case 'D':
709     case 's':
710     case 'S':
711     case 'w':
712     case 'W':
713       return true;
714     default:
715       return false;
716   }
717 }
718 #endif
719 
720 
721 // In order to know whether an escape is a backreference or not we have to scan
722 // the entire regexp and find the number of capturing parentheses.  However we
723 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
724 // is called when needed.  It can see the difference between capturing and
725 // noncapturing parentheses and can skip character classes and backslash-escaped
726 // characters.
ScanForCaptures()727 void RegExpParser::ScanForCaptures() {
728   DCHECK(!is_scanned_for_captures_);
729   const int saved_position = position();
730   // Start with captures started previous to current position
731   int capture_count = captures_started();
732   // Add count of captures after this position.
733   int n;
734   while ((n = current()) != kEndMarker) {
735     Advance();
736     switch (n) {
737       case '\\':
738         Advance();
739         break;
740       case '[': {
741         int c;
742         while ((c = current()) != kEndMarker) {
743           Advance();
744           if (c == '\\') {
745             Advance();
746           } else {
747             if (c == ']') break;
748           }
749         }
750         break;
751       }
752       case '(':
753         if (current() == '?') {
754           // At this point we could be in
755           // * a non-capturing group '(:',
756           // * a lookbehind assertion '(?<=' '(?<!'
757           // * or a named capture '(?<'.
758           //
759           // Of these, only named captures are capturing groups.
760 
761           Advance();
762           if (current() != '<') break;
763 
764           Advance();
765           if (current() == '=' || current() == '!') break;
766 
767           // Found a possible named capture. It could turn out to be a syntax
768           // error (e.g. an unterminated or invalid name), but that distinction
769           // does not matter for our purposes.
770           has_named_captures_ = true;
771         }
772         capture_count++;
773         break;
774     }
775   }
776   capture_count_ = capture_count;
777   is_scanned_for_captures_ = true;
778   Reset(saved_position);
779 }
780 
781 
ParseBackReferenceIndex(int * index_out)782 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
783   DCHECK_EQ('\\', current());
784   DCHECK('1' <= Next() && Next() <= '9');
785   // Try to parse a decimal literal that is no greater than the total number
786   // of left capturing parentheses in the input.
787   int start = position();
788   int value = Next() - '0';
789   Advance(2);
790   while (true) {
791     uc32 c = current();
792     if (IsDecimalDigit(c)) {
793       value = 10 * value + (c - '0');
794       if (value > JSRegExp::kMaxCaptures) {
795         Reset(start);
796         return false;
797       }
798       Advance();
799     } else {
800       break;
801     }
802   }
803   if (value > captures_started()) {
804     if (!is_scanned_for_captures_) ScanForCaptures();
805     if (value > capture_count_) {
806       Reset(start);
807       return false;
808     }
809   }
810   *index_out = value;
811   return true;
812 }
813 
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)814 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
815   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
816     v->push_back(code_unit);
817   } else {
818     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
819     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
820   }
821 }
822 
ParseCaptureGroupName()823 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
824   ZoneVector<uc16>* name =
825       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
826 
827   bool at_start = true;
828   while (true) {
829     uc32 c = current();
830     Advance();
831 
832     // Convert unicode escapes.
833     if (c == '\\' && current() == 'u') {
834       Advance();
835       if (!ParseUnicodeEscape(&c)) {
836         ReportError(RegExpError::kInvalidUnicodeEscape);
837         return nullptr;
838       }
839     }
840 
841     // The backslash char is misclassified as both ID_Start and ID_Continue.
842     if (c == '\\') {
843       ReportError(RegExpError::kInvalidCaptureGroupName);
844       return nullptr;
845     }
846 
847     if (at_start) {
848       if (!IsIdentifierStart(c)) {
849         ReportError(RegExpError::kInvalidCaptureGroupName);
850         return nullptr;
851       }
852       push_code_unit(name, c);
853       at_start = false;
854     } else {
855       if (c == '>') {
856         break;
857       } else if (IsIdentifierPart(c)) {
858         push_code_unit(name, c);
859       } else {
860         ReportError(RegExpError::kInvalidCaptureGroupName);
861         return nullptr;
862       }
863     }
864   }
865 
866   return name;
867 }
868 
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)869 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
870                                              int index) {
871   DCHECK(0 < index && index <= captures_started_);
872   DCHECK_NOT_NULL(name);
873 
874   RegExpCapture* capture = GetCapture(index);
875   DCHECK_NULL(capture->name());
876 
877   capture->set_name(name);
878 
879   if (named_captures_ == nullptr) {
880     named_captures_ = new (zone_->New(sizeof(*named_captures_)))
881         ZoneSet<RegExpCapture*, RegExpCaptureNameLess>(zone());
882   } else {
883     // Check for duplicates and bail if we find any.
884 
885     const auto& named_capture_it = named_captures_->find(capture);
886     if (named_capture_it != named_captures_->end()) {
887       ReportError(RegExpError::kDuplicateCaptureGroupName);
888       return false;
889     }
890   }
891 
892   named_captures_->emplace(capture);
893 
894   return true;
895 }
896 
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)897 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
898                                            RegExpParserState* state) {
899   // The parser is assumed to be on the '<' in \k<name>.
900   if (current() != '<') {
901     ReportError(RegExpError::kInvalidNamedReference);
902     return false;
903   }
904 
905   Advance();
906   const ZoneVector<uc16>* name = ParseCaptureGroupName();
907   if (name == nullptr) {
908     return false;
909   }
910 
911   if (state->IsInsideCaptureGroup(name)) {
912     builder->AddEmpty();
913   } else {
914     RegExpBackReference* atom =
915         new (zone()) RegExpBackReference(builder->flags());
916     atom->set_name(name);
917 
918     builder->AddAtom(atom);
919 
920     if (named_back_references_ == nullptr) {
921       named_back_references_ =
922           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
923     }
924     named_back_references_->Add(atom, zone());
925   }
926 
927   return true;
928 }
929 
PatchNamedBackReferences()930 void RegExpParser::PatchNamedBackReferences() {
931   if (named_back_references_ == nullptr) return;
932 
933   if (named_captures_ == nullptr) {
934     ReportError(RegExpError::kInvalidNamedCaptureReference);
935     return;
936   }
937 
938   // Look up and patch the actual capture for each named back reference.
939 
940   for (int i = 0; i < named_back_references_->length(); i++) {
941     RegExpBackReference* ref = named_back_references_->at(i);
942 
943     // Capture used to search the named_captures_ by name, index of the
944     // capture is never used.
945     static const int kInvalidIndex = 0;
946     RegExpCapture* search_capture = new (zone()) RegExpCapture(kInvalidIndex);
947     DCHECK_NULL(search_capture->name());
948     search_capture->set_name(ref->name());
949 
950     int index = -1;
951     const auto& capture_it = named_captures_->find(search_capture);
952     if (capture_it != named_captures_->end()) {
953       index = (*capture_it)->index();
954     } else {
955       ReportError(RegExpError::kInvalidNamedCaptureReference);
956       return;
957     }
958 
959     ref->set_capture(GetCapture(index));
960   }
961 }
962 
GetCapture(int index)963 RegExpCapture* RegExpParser::GetCapture(int index) {
964   // The index for the capture groups are one-based. Its index in the list is
965   // zero-based.
966   int know_captures =
967       is_scanned_for_captures_ ? capture_count_ : captures_started_;
968   DCHECK(index <= know_captures);
969   if (captures_ == nullptr) {
970     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
971   }
972   while (captures_->length() < know_captures) {
973     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
974   }
975   return captures_->at(index - 1);
976 }
977 
978 namespace {
979 
980 struct RegExpCaptureIndexLess {
operator ()v8::internal::__anon499c9d230111::RegExpCaptureIndexLess981   bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
982     DCHECK_NOT_NULL(lhs);
983     DCHECK_NOT_NULL(rhs);
984     return lhs->index() < rhs->index();
985   }
986 };
987 
988 }  // namespace
989 
CreateCaptureNameMap()990 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
991   if (named_captures_ == nullptr || named_captures_->empty()) {
992     return Handle<FixedArray>();
993   }
994 
995   // Named captures are sorted by name (because the set is used to ensure
996   // name uniqueness). But the capture name map must to be sorted by index.
997 
998   ZoneVector<RegExpCapture*> sorted_named_captures(
999       named_captures_->begin(), named_captures_->end(), zone());
1000   std::sort(sorted_named_captures.begin(), sorted_named_captures.end(),
1001             RegExpCaptureIndexLess{});
1002   DCHECK_EQ(sorted_named_captures.size(), named_captures_->size());
1003 
1004   Factory* factory = isolate()->factory();
1005 
1006   int len = static_cast<int>(sorted_named_captures.size()) * 2;
1007   Handle<FixedArray> array = factory->NewFixedArray(len);
1008 
1009   int i = 0;
1010   for (const auto& capture : sorted_named_captures) {
1011     Vector<const uc16> capture_name(capture->name()->data(),
1012                                     capture->name()->size());
1013     // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
1014     // internalized so they can be used as property names in the 'exec' results.
1015     Handle<String> name = factory->InternalizeString(capture_name);
1016     array->set(i * 2, *name);
1017     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
1018 
1019     i++;
1020   }
1021   DCHECK_EQ(i * 2, len);
1022 
1023   return array;
1024 }
1025 
HasNamedCaptures()1026 bool RegExpParser::HasNamedCaptures() {
1027   if (has_named_captures_ || is_scanned_for_captures_) {
1028     return has_named_captures_;
1029   }
1030 
1031   ScanForCaptures();
1032   DCHECK(is_scanned_for_captures_);
1033   return has_named_captures_;
1034 }
1035 
IsInsideCaptureGroup(int index)1036 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1037   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1038     if (s->group_type() != CAPTURE) continue;
1039     // Return true if we found the matching capture index.
1040     if (index == s->capture_index()) return true;
1041     // Abort if index is larger than what has been parsed up till this state.
1042     if (index > s->capture_index()) return false;
1043   }
1044   return false;
1045 }
1046 
IsInsideCaptureGroup(const ZoneVector<uc16> * name)1047 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1048     const ZoneVector<uc16>* name) {
1049   DCHECK_NOT_NULL(name);
1050   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1051     if (s->capture_name() == nullptr) continue;
1052     if (*s->capture_name() == *name) return true;
1053   }
1054   return false;
1055 }
1056 
1057 // QuantifierPrefix ::
1058 //   { DecimalDigits }
1059 //   { DecimalDigits , }
1060 //   { DecimalDigits , DecimalDigits }
1061 //
1062 // Returns true if parsing succeeds, and set the min_out and max_out
1063 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)1064 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1065   DCHECK_EQ(current(), '{');
1066   int start = position();
1067   Advance();
1068   int min = 0;
1069   if (!IsDecimalDigit(current())) {
1070     Reset(start);
1071     return false;
1072   }
1073   while (IsDecimalDigit(current())) {
1074     int next = current() - '0';
1075     if (min > (RegExpTree::kInfinity - next) / 10) {
1076       // Overflow. Skip past remaining decimal digits and return -1.
1077       do {
1078         Advance();
1079       } while (IsDecimalDigit(current()));
1080       min = RegExpTree::kInfinity;
1081       break;
1082     }
1083     min = 10 * min + next;
1084     Advance();
1085   }
1086   int max = 0;
1087   if (current() == '}') {
1088     max = min;
1089     Advance();
1090   } else if (current() == ',') {
1091     Advance();
1092     if (current() == '}') {
1093       max = RegExpTree::kInfinity;
1094       Advance();
1095     } else {
1096       while (IsDecimalDigit(current())) {
1097         int next = current() - '0';
1098         if (max > (RegExpTree::kInfinity - next) / 10) {
1099           do {
1100             Advance();
1101           } while (IsDecimalDigit(current()));
1102           max = RegExpTree::kInfinity;
1103           break;
1104         }
1105         max = 10 * max + next;
1106         Advance();
1107       }
1108       if (current() != '}') {
1109         Reset(start);
1110         return false;
1111       }
1112       Advance();
1113     }
1114   } else {
1115     Reset(start);
1116     return false;
1117   }
1118   *min_out = min;
1119   *max_out = max;
1120   return true;
1121 }
1122 
1123 
ParseOctalLiteral()1124 uc32 RegExpParser::ParseOctalLiteral() {
1125   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1126   // For compatibility with some other browsers (not all), we parse
1127   // up to three octal digits with a value below 256.
1128   // ES#prod-annexB-LegacyOctalEscapeSequence
1129   uc32 value = current() - '0';
1130   Advance();
1131   if ('0' <= current() && current() <= '7') {
1132     value = value * 8 + current() - '0';
1133     Advance();
1134     if (value < 32 && '0' <= current() && current() <= '7') {
1135       value = value * 8 + current() - '0';
1136       Advance();
1137     }
1138   }
1139   return value;
1140 }
1141 
1142 
ParseHexEscape(int length,uc32 * value)1143 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1144   int start = position();
1145   uc32 val = 0;
1146   for (int i = 0; i < length; ++i) {
1147     uc32 c = current();
1148     int d = HexValue(c);
1149     if (d < 0) {
1150       Reset(start);
1151       return false;
1152     }
1153     val = val * 16 + d;
1154     Advance();
1155   }
1156   *value = val;
1157   return true;
1158 }
1159 
1160 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1161 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1162   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1163   // allowed). In the latter case, the number of hex digits between { } is
1164   // arbitrary. \ and u have already been read.
1165   if (current() == '{' && unicode()) {
1166     int start = position();
1167     Advance();
1168     if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1169       if (current() == '}') {
1170         Advance();
1171         return true;
1172       }
1173     }
1174     Reset(start);
1175     return false;
1176   }
1177   // \u but no {, or \u{...} escapes not allowed.
1178   bool result = ParseHexEscape(4, value);
1179   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1180       current() == '\\') {
1181     // Attempt to read trail surrogate.
1182     int start = position();
1183     if (Next() == 'u') {
1184       Advance(2);
1185       uc32 trail;
1186       if (ParseHexEscape(4, &trail) &&
1187           unibrow::Utf16::IsTrailSurrogate(trail)) {
1188         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1189                                                       static_cast<uc16>(trail));
1190         return true;
1191       }
1192     }
1193     Reset(start);
1194   }
1195   return result;
1196 }
1197 
1198 #ifdef V8_INTL_SUPPORT
1199 
1200 namespace {
1201 
IsExactPropertyAlias(const char * property_name,UProperty property)1202 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1203   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1204   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1205     return true;
1206   for (int i = 0;; i++) {
1207     const char* long_name = u_getPropertyName(
1208         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1209     if (long_name == nullptr) break;
1210     if (strcmp(property_name, long_name) == 0) return true;
1211   }
1212   return false;
1213 }
1214 
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1215 bool IsExactPropertyValueAlias(const char* property_value_name,
1216                                UProperty property, int32_t property_value) {
1217   const char* short_name =
1218       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1219   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1220     return true;
1221   }
1222   for (int i = 0;; i++) {
1223     const char* long_name = u_getPropertyValueName(
1224         property, property_value,
1225         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1226     if (long_name == nullptr) break;
1227     if (strcmp(property_value_name, long_name) == 0) return true;
1228   }
1229   return false;
1230 }
1231 
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1232 bool LookupPropertyValueName(UProperty property,
1233                              const char* property_value_name, bool negate,
1234                              ZoneList<CharacterRange>* result, Zone* zone) {
1235   UProperty property_for_lookup = property;
1236   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1237     // For the property Script_Extensions, we have to do the property value
1238     // name lookup as if the property is Script.
1239     property_for_lookup = UCHAR_SCRIPT;
1240   }
1241   int32_t property_value =
1242       u_getPropertyValueEnum(property_for_lookup, property_value_name);
1243   if (property_value == UCHAR_INVALID_CODE) return false;
1244 
1245   // We require the property name to match exactly to one of the property value
1246   // aliases. However, u_getPropertyValueEnum uses loose matching.
1247   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1248                                  property_value)) {
1249     return false;
1250   }
1251 
1252   UErrorCode ec = U_ZERO_ERROR;
1253   icu::UnicodeSet set;
1254   set.applyIntPropertyValue(property, property_value, ec);
1255   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1256 
1257   if (success) {
1258     set.removeAllStrings();
1259     if (negate) set.complement();
1260     for (int i = 0; i < set.getRangeCount(); i++) {
1261       result->Add(
1262           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1263           zone);
1264     }
1265   }
1266   return success;
1267 }
1268 
1269 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1270 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1271   return strncmp(name, literal, N + 1) == 0;
1272 }
1273 
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1274 bool LookupSpecialPropertyValueName(const char* name,
1275                                     ZoneList<CharacterRange>* result,
1276                                     bool negate, Zone* zone) {
1277   if (NameEquals(name, "Any")) {
1278     if (negate) {
1279       // Leave the list of character ranges empty, since the negation of 'Any'
1280       // is the empty set.
1281     } else {
1282       result->Add(CharacterRange::Everything(), zone);
1283     }
1284   } else if (NameEquals(name, "ASCII")) {
1285     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1286                        : CharacterRange::Range(0x0, 0x7F),
1287                 zone);
1288   } else if (NameEquals(name, "Assigned")) {
1289     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1290                                    !negate, result, zone);
1291   } else {
1292     return false;
1293   }
1294   return true;
1295 }
1296 
1297 // Explicitly whitelist supported binary properties. The spec forbids supporting
1298 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1299 bool IsSupportedBinaryProperty(UProperty property) {
1300   switch (property) {
1301     case UCHAR_ALPHABETIC:
1302     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1303     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1304     case UCHAR_ASCII_HEX_DIGIT:
1305     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1306     case UCHAR_BIDI_CONTROL:
1307     case UCHAR_BIDI_MIRRORED:
1308     case UCHAR_CASE_IGNORABLE:
1309     case UCHAR_CASED:
1310     case UCHAR_CHANGES_WHEN_CASEFOLDED:
1311     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1312     case UCHAR_CHANGES_WHEN_LOWERCASED:
1313     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1314     case UCHAR_CHANGES_WHEN_TITLECASED:
1315     case UCHAR_CHANGES_WHEN_UPPERCASED:
1316     case UCHAR_DASH:
1317     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1318     case UCHAR_DEPRECATED:
1319     case UCHAR_DIACRITIC:
1320     case UCHAR_EMOJI:
1321     case UCHAR_EMOJI_COMPONENT:
1322     case UCHAR_EMOJI_MODIFIER_BASE:
1323     case UCHAR_EMOJI_MODIFIER:
1324     case UCHAR_EMOJI_PRESENTATION:
1325     case UCHAR_EXTENDED_PICTOGRAPHIC:
1326     case UCHAR_EXTENDER:
1327     case UCHAR_GRAPHEME_BASE:
1328     case UCHAR_GRAPHEME_EXTEND:
1329     case UCHAR_HEX_DIGIT:
1330     case UCHAR_ID_CONTINUE:
1331     case UCHAR_ID_START:
1332     case UCHAR_IDEOGRAPHIC:
1333     case UCHAR_IDS_BINARY_OPERATOR:
1334     case UCHAR_IDS_TRINARY_OPERATOR:
1335     case UCHAR_JOIN_CONTROL:
1336     case UCHAR_LOGICAL_ORDER_EXCEPTION:
1337     case UCHAR_LOWERCASE:
1338     case UCHAR_MATH:
1339     case UCHAR_NONCHARACTER_CODE_POINT:
1340     case UCHAR_PATTERN_SYNTAX:
1341     case UCHAR_PATTERN_WHITE_SPACE:
1342     case UCHAR_QUOTATION_MARK:
1343     case UCHAR_RADICAL:
1344     case UCHAR_REGIONAL_INDICATOR:
1345     case UCHAR_S_TERM:
1346     case UCHAR_SOFT_DOTTED:
1347     case UCHAR_TERMINAL_PUNCTUATION:
1348     case UCHAR_UNIFIED_IDEOGRAPH:
1349     case UCHAR_UPPERCASE:
1350     case UCHAR_VARIATION_SELECTOR:
1351     case UCHAR_WHITE_SPACE:
1352     case UCHAR_XID_CONTINUE:
1353     case UCHAR_XID_START:
1354       return true;
1355     default:
1356       break;
1357   }
1358   return false;
1359 }
1360 
IsUnicodePropertyValueCharacter(char c)1361 bool IsUnicodePropertyValueCharacter(char c) {
1362   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1363   //
1364   // Note that using this to validate each parsed char is quite conservative.
1365   // A possible alternative solution would be to only ensure the parsed
1366   // property name/value candidate string does not contain '\0' characters and
1367   // let ICU lookups trigger the final failure.
1368   if ('a' <= c && c <= 'z') return true;
1369   if ('A' <= c && c <= 'Z') return true;
1370   if ('0' <= c && c <= '9') return true;
1371   return (c == '_');
1372 }
1373 
1374 }  // anonymous namespace
1375 
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1376 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1377                                           ZoneVector<char>* name_2) {
1378   DCHECK(name_1->empty());
1379   DCHECK(name_2->empty());
1380   // Parse the property class as follows:
1381   // - In \p{name}, 'name' is interpreted
1382   //   - either as a general category property value name.
1383   //   - or as a binary property name.
1384   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1385   //   and 'value' is interpreted as one of the available property value names.
1386   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1387   // - Loose matching is not applied.
1388   if (current() == '{') {
1389     // Parse \p{[PropertyName=]PropertyNameValue}
1390     for (Advance(); current() != '}' && current() != '='; Advance()) {
1391       if (!IsUnicodePropertyValueCharacter(current())) return false;
1392       if (!has_next()) return false;
1393       name_1->push_back(static_cast<char>(current()));
1394     }
1395     if (current() == '=') {
1396       for (Advance(); current() != '}'; Advance()) {
1397         if (!IsUnicodePropertyValueCharacter(current())) return false;
1398         if (!has_next()) return false;
1399         name_2->push_back(static_cast<char>(current()));
1400       }
1401       name_2->push_back(0);  // null-terminate string.
1402     }
1403   } else {
1404     return false;
1405   }
1406   Advance();
1407   name_1->push_back(0);  // null-terminate string.
1408 
1409   DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1410   DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1411   return true;
1412 }
1413 
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1414 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1415                                          bool negate,
1416                                          const ZoneVector<char>& name_1,
1417                                          const ZoneVector<char>& name_2) {
1418   if (name_2.empty()) {
1419     // First attempt to interpret as general category property value name.
1420     const char* name = name_1.data();
1421     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1422                                 add_to, zone())) {
1423       return true;
1424     }
1425     // Interpret "Any", "ASCII", and "Assigned".
1426     if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1427       return true;
1428     }
1429     // Then attempt to interpret as binary property name with value name 'Y'.
1430     UProperty property = u_getPropertyEnum(name);
1431     if (!IsSupportedBinaryProperty(property)) return false;
1432     if (!IsExactPropertyAlias(name, property)) return false;
1433     return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1434                                    zone());
1435   } else {
1436     // Both property name and value name are specified. Attempt to interpret
1437     // the property name as enumerated property.
1438     const char* property_name = name_1.data();
1439     const char* value_name = name_2.data();
1440     UProperty property = u_getPropertyEnum(property_name);
1441     if (!IsExactPropertyAlias(property_name, property)) return false;
1442     if (property == UCHAR_GENERAL_CATEGORY) {
1443       // We want to allow aggregate value names such as "Letter".
1444       property = UCHAR_GENERAL_CATEGORY_MASK;
1445     } else if (property != UCHAR_SCRIPT &&
1446                property != UCHAR_SCRIPT_EXTENSIONS) {
1447       return false;
1448     }
1449     return LookupPropertyValueName(property, value_name, negate, add_to,
1450                                    zone());
1451   }
1452 }
1453 
GetPropertySequence(const ZoneVector<char> & name_1)1454 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
1455   if (!FLAG_harmony_regexp_sequence) return nullptr;
1456   const char* name = name_1.data();
1457   const uc32* sequence_list = nullptr;
1458   JSRegExp::Flags flags = JSRegExp::kUnicode;
1459   if (NameEquals(name, "Emoji_Flag_Sequence")) {
1460     sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1461   } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1462     sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1463   } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1464     sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1465   }
1466   if (sequence_list != nullptr) {
1467     // TODO(yangguo): this creates huge regexp code. Alternative to this is
1468     // to create a new operator that checks for these sequences at runtime.
1469     RegExpBuilder builder(zone(), flags);
1470     while (true) {                   // Iterate through list of sequences.
1471       while (*sequence_list != 0) {  // Iterate through sequence.
1472         builder.AddUnicodeCharacter(*sequence_list);
1473         sequence_list++;
1474       }
1475       sequence_list++;
1476       if (*sequence_list == 0) break;
1477       builder.NewAlternative();
1478     }
1479     return builder.ToRegExp();
1480   }
1481 
1482   if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1483     // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1484     // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1485     RegExpBuilder builder(zone(), flags);
1486     ZoneList<CharacterRange>* prefix_ranges =
1487         new (zone()) ZoneList<CharacterRange>(2, zone());
1488     prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1489     prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1490     prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1491     builder.AddCharacterClass(
1492         new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
1493     builder.AddCharacter(0xFE0F);
1494     builder.AddCharacter(0x20E3);
1495     return builder.ToRegExp();
1496   } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1497     // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1498     // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1499     RegExpBuilder builder(zone(), flags);
1500     ZoneList<CharacterRange>* modifier_base_ranges =
1501         new (zone()) ZoneList<CharacterRange>(2, zone());
1502     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1503                             modifier_base_ranges, zone());
1504     builder.AddCharacterClass(
1505         new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
1506     ZoneList<CharacterRange>* modifier_ranges =
1507         new (zone()) ZoneList<CharacterRange>(2, zone());
1508     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1509                             zone());
1510     builder.AddCharacterClass(
1511         new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
1512     return builder.ToRegExp();
1513   }
1514 
1515   return nullptr;
1516 }
1517 
1518 #else  // V8_INTL_SUPPORT
1519 
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1520 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1521                                           ZoneVector<char>* name_2) {
1522   return false;
1523 }
1524 
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1525 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1526                                          bool negate,
1527                                          const ZoneVector<char>& name_1,
1528                                          const ZoneVector<char>& name_2) {
1529   return false;
1530 }
1531 
GetPropertySequence(const ZoneVector<char> & name)1532 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
1533   return nullptr;
1534 }
1535 
1536 #endif  // V8_INTL_SUPPORT
1537 
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1538 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1539   uc32 x = 0;
1540   int d = HexValue(current());
1541   if (d < 0) {
1542     return false;
1543   }
1544   while (d >= 0) {
1545     x = x * 16 + d;
1546     if (x > max_value) {
1547       return false;
1548     }
1549     Advance();
1550     d = HexValue(current());
1551   }
1552   *value = x;
1553   return true;
1554 }
1555 
1556 
ParseClassCharacterEscape()1557 uc32 RegExpParser::ParseClassCharacterEscape() {
1558   DCHECK_EQ('\\', current());
1559   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1560   Advance();
1561   switch (current()) {
1562     case 'b':
1563       Advance();
1564       return '\b';
1565     // ControlEscape :: one of
1566     //   f n r t v
1567     case 'f':
1568       Advance();
1569       return '\f';
1570     case 'n':
1571       Advance();
1572       return '\n';
1573     case 'r':
1574       Advance();
1575       return '\r';
1576     case 't':
1577       Advance();
1578       return '\t';
1579     case 'v':
1580       Advance();
1581       return '\v';
1582     case 'c': {
1583       uc32 controlLetter = Next();
1584       uc32 letter = controlLetter & ~('A' ^ 'a');
1585       // Inside a character class, we also accept digits and underscore as
1586       // control characters, unless with /u. See Annex B:
1587       // ES#prod-annexB-ClassControlLetter
1588       if (letter >= 'A' && letter <= 'Z') {
1589         Advance(2);
1590         // Control letters mapped to ASCII control characters in the range
1591         // 0x00-0x1F.
1592         return controlLetter & 0x1F;
1593       }
1594       if (unicode()) {
1595         // With /u, invalid escapes are not treated as identity escapes.
1596         ReportError(RegExpError::kInvalidClassEscape);
1597         return 0;
1598       }
1599       if ((controlLetter >= '0' && controlLetter <= '9') ||
1600           controlLetter == '_') {
1601         Advance(2);
1602         return controlLetter & 0x1F;
1603       }
1604       // We match JSC in reading the backslash as a literal
1605       // character instead of as starting an escape.
1606       // TODO(v8:6201): Not yet covered by the spec.
1607       return '\\';
1608     }
1609     case '0':
1610       // With /u, \0 is interpreted as NUL if not followed by another digit.
1611       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1612         Advance();
1613         return 0;
1614       }
1615       V8_FALLTHROUGH;
1616     case '1':
1617     case '2':
1618     case '3':
1619     case '4':
1620     case '5':
1621     case '6':
1622     case '7':
1623       // For compatibility, we interpret a decimal escape that isn't
1624       // a back reference (and therefore either \0 or not valid according
1625       // to the specification) as a 1..3 digit octal character code.
1626       // ES#prod-annexB-LegacyOctalEscapeSequence
1627       if (unicode()) {
1628         // With /u, decimal escape is not interpreted as octal character code.
1629         ReportError(RegExpError::kInvalidClassEscape);
1630         return 0;
1631       }
1632       return ParseOctalLiteral();
1633     case 'x': {
1634       Advance();
1635       uc32 value;
1636       if (ParseHexEscape(2, &value)) return value;
1637       if (unicode()) {
1638         // With /u, invalid escapes are not treated as identity escapes.
1639         ReportError(RegExpError::kInvalidEscape);
1640         return 0;
1641       }
1642       // If \x is not followed by a two-digit hexadecimal, treat it
1643       // as an identity escape.
1644       return 'x';
1645     }
1646     case 'u': {
1647       Advance();
1648       uc32 value;
1649       if (ParseUnicodeEscape(&value)) return value;
1650       if (unicode()) {
1651         // With /u, invalid escapes are not treated as identity escapes.
1652         ReportError(RegExpError::kInvalidUnicodeEscape);
1653         return 0;
1654       }
1655       // If \u is not followed by a two-digit hexadecimal, treat it
1656       // as an identity escape.
1657       return 'u';
1658     }
1659     default: {
1660       uc32 result = current();
1661       // With /u, no identity escapes except for syntax characters and '-' are
1662       // allowed. Otherwise, all identity escapes are allowed.
1663       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1664         Advance();
1665         return result;
1666       }
1667       ReportError(RegExpError::kInvalidEscape);
1668       return 0;
1669     }
1670   }
1671   UNREACHABLE();
1672 }
1673 
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,uc32 * char_out,bool * is_class_escape)1674 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1675                                     Zone* zone,
1676                                     bool add_unicode_case_equivalents,
1677                                     uc32* char_out, bool* is_class_escape) {
1678   uc32 current_char = current();
1679   if (current_char == '\\') {
1680     switch (Next()) {
1681       case 'w':
1682       case 'W':
1683       case 'd':
1684       case 'D':
1685       case 's':
1686       case 'S': {
1687         CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1688                                        add_unicode_case_equivalents, zone);
1689         Advance(2);
1690         *is_class_escape = true;
1691         return;
1692       }
1693       case kEndMarker:
1694         ReportError(RegExpError::kEscapeAtEndOfPattern);
1695         return;
1696       case 'p':
1697       case 'P':
1698         if (unicode()) {
1699           bool negate = Next() == 'P';
1700           Advance(2);
1701           ZoneVector<char> name_1(zone);
1702           ZoneVector<char> name_2(zone);
1703           if (!ParsePropertyClassName(&name_1, &name_2) ||
1704               !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1705             ReportError(RegExpError::kInvalidClassPropertyName);
1706           }
1707           *is_class_escape = true;
1708           return;
1709         }
1710         break;
1711       default:
1712         break;
1713     }
1714     *char_out = ParseClassCharacterEscape();
1715     *is_class_escape = false;
1716   } else {
1717     Advance();
1718     *char_out = current_char;
1719     *is_class_escape = false;
1720   }
1721 }
1722 
ParseCharacterClass(const RegExpBuilder * builder)1723 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1724   DCHECK_EQ(current(), '[');
1725   Advance();
1726   bool is_negated = false;
1727   if (current() == '^') {
1728     is_negated = true;
1729     Advance();
1730   }
1731   ZoneList<CharacterRange>* ranges =
1732       new (zone()) ZoneList<CharacterRange>(2, zone());
1733   bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1734   while (has_more() && current() != ']') {
1735     uc32 char_1, char_2;
1736     bool is_class_1, is_class_2;
1737     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1738                      &is_class_1 CHECK_FAILED);
1739     if (current() == '-') {
1740       Advance();
1741       if (current() == kEndMarker) {
1742         // If we reach the end we break out of the loop and let the
1743         // following code report an error.
1744         break;
1745       } else if (current() == ']') {
1746         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1747         ranges->Add(CharacterRange::Singleton('-'), zone());
1748         break;
1749       }
1750       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1751                        &is_class_2 CHECK_FAILED);
1752       if (is_class_1 || is_class_2) {
1753         // Either end is an escaped character class. Treat the '-' verbatim.
1754         if (unicode()) {
1755           // ES2015 21.2.2.15.1 step 1.
1756           return ReportError(RegExpError::kInvalidCharacterClass);
1757         }
1758         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1759         ranges->Add(CharacterRange::Singleton('-'), zone());
1760         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1761         continue;
1762       }
1763       // ES2015 21.2.2.15.1 step 6.
1764       if (char_1 > char_2) {
1765         return ReportError(RegExpError::kOutOfOrderCharacterClass);
1766       }
1767       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1768     } else {
1769       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1770     }
1771   }
1772   if (!has_more()) {
1773     return ReportError(RegExpError::kUnterminatedCharacterClass);
1774   }
1775   Advance();
1776   RegExpCharacterClass::CharacterClassFlags character_class_flags;
1777   if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1778   return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1779                                            character_class_flags);
1780 }
1781 
1782 
1783 #undef CHECK_FAILED
1784 
1785 
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1786 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1787                                FlatStringReader* input, JSRegExp::Flags flags,
1788                                RegExpCompileData* result) {
1789   DCHECK(result != nullptr);
1790   RegExpParser parser(input, flags, isolate, zone);
1791   RegExpTree* tree = parser.ParsePattern();
1792   if (parser.failed()) {
1793     DCHECK(tree == nullptr);
1794     DCHECK(parser.error_ != RegExpError::kNone);
1795     result->error = parser.error_;
1796     result->error_pos = parser.error_pos_;
1797   } else {
1798     DCHECK(tree != nullptr);
1799     DCHECK(parser.error_ == RegExpError::kNone);
1800     if (FLAG_trace_regexp_parser) {
1801       StdoutStream os;
1802       tree->Print(os, zone);
1803       os << "\n";
1804     }
1805     result->tree = tree;
1806     int capture_count = parser.captures_started();
1807     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1808     result->contains_anchor = parser.contains_anchor();
1809     result->capture_name_map = parser.CreateCaptureNameMap();
1810     result->capture_count = capture_count;
1811   }
1812   return !parser.failed();
1813 }
1814 
RegExpBuilder(Zone * zone,JSRegExp::Flags flags)1815 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1816     : zone_(zone),
1817       pending_empty_(false),
1818       flags_(flags),
1819       characters_(nullptr),
1820       pending_surrogate_(kNoPendingSurrogate),
1821       terms_(),
1822       alternatives_()
1823 #ifdef DEBUG
1824       ,
1825       last_added_(ADD_NONE)
1826 #endif
1827 {
1828 }
1829 
1830 
AddLeadSurrogate(uc16 lead_surrogate)1831 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1832   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1833   FlushPendingSurrogate();
1834   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1835   pending_surrogate_ = lead_surrogate;
1836 }
1837 
1838 
AddTrailSurrogate(uc16 trail_surrogate)1839 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1840   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1841   if (pending_surrogate_ != kNoPendingSurrogate) {
1842     uc16 lead_surrogate = pending_surrogate_;
1843     pending_surrogate_ = kNoPendingSurrogate;
1844     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1845     uc32 combined =
1846         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1847     if (NeedsDesugaringForIgnoreCase(combined)) {
1848       AddCharacterClassForDesugaring(combined);
1849     } else {
1850       ZoneList<uc16> surrogate_pair(2, zone());
1851       surrogate_pair.Add(lead_surrogate, zone());
1852       surrogate_pair.Add(trail_surrogate, zone());
1853       RegExpAtom* atom =
1854           new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1855       AddAtom(atom);
1856     }
1857   } else {
1858     pending_surrogate_ = trail_surrogate;
1859     FlushPendingSurrogate();
1860   }
1861 }
1862 
1863 
FlushPendingSurrogate()1864 void RegExpBuilder::FlushPendingSurrogate() {
1865   if (pending_surrogate_ != kNoPendingSurrogate) {
1866     DCHECK(unicode());
1867     uc32 c = pending_surrogate_;
1868     pending_surrogate_ = kNoPendingSurrogate;
1869     AddCharacterClassForDesugaring(c);
1870   }
1871 }
1872 
1873 
FlushCharacters()1874 void RegExpBuilder::FlushCharacters() {
1875   FlushPendingSurrogate();
1876   pending_empty_ = false;
1877   if (characters_ != nullptr) {
1878     RegExpTree* atom =
1879         new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1880     characters_ = nullptr;
1881     text_.Add(atom, zone());
1882     LAST(ADD_ATOM);
1883   }
1884 }
1885 
1886 
FlushText()1887 void RegExpBuilder::FlushText() {
1888   FlushCharacters();
1889   int num_text = text_.length();
1890   if (num_text == 0) {
1891     return;
1892   } else if (num_text == 1) {
1893     terms_.Add(text_.last(), zone());
1894   } else {
1895     RegExpText* text = new (zone()) RegExpText(zone());
1896     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1897     terms_.Add(text, zone());
1898   }
1899   text_.Clear();
1900 }
1901 
1902 
AddCharacter(uc16 c)1903 void RegExpBuilder::AddCharacter(uc16 c) {
1904   FlushPendingSurrogate();
1905   pending_empty_ = false;
1906   if (NeedsDesugaringForIgnoreCase(c)) {
1907     AddCharacterClassForDesugaring(c);
1908   } else {
1909     if (characters_ == nullptr) {
1910       characters_ = new (zone()) ZoneList<uc16>(4, zone());
1911     }
1912     characters_->Add(c, zone());
1913     LAST(ADD_CHAR);
1914   }
1915 }
1916 
1917 
AddUnicodeCharacter(uc32 c)1918 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1919   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1920     DCHECK(unicode());
1921     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1922     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1923   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1924     AddLeadSurrogate(c);
1925   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1926     AddTrailSurrogate(c);
1927   } else {
1928     AddCharacter(static_cast<uc16>(c));
1929   }
1930 }
1931 
AddEscapedUnicodeCharacter(uc32 character)1932 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1933   // A lead or trail surrogate parsed via escape sequence will not
1934   // pair up with any preceding lead or following trail surrogate.
1935   FlushPendingSurrogate();
1936   AddUnicodeCharacter(character);
1937   FlushPendingSurrogate();
1938 }
1939 
AddEmpty()1940 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1941 
1942 
AddCharacterClass(RegExpCharacterClass * cc)1943 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1944   if (NeedsDesugaringForUnicode(cc)) {
1945     // With /u, character class needs to be desugared, so it
1946     // must be a standalone term instead of being part of a RegExpText.
1947     AddTerm(cc);
1948   } else {
1949     AddAtom(cc);
1950   }
1951 }
1952 
AddCharacterClassForDesugaring(uc32 c)1953 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1954   AddTerm(new (zone()) RegExpCharacterClass(
1955       zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1956       flags_));
1957 }
1958 
1959 
AddAtom(RegExpTree * term)1960 void RegExpBuilder::AddAtom(RegExpTree* term) {
1961   if (term->IsEmpty()) {
1962     AddEmpty();
1963     return;
1964   }
1965   if (term->IsTextElement()) {
1966     FlushCharacters();
1967     text_.Add(term, zone());
1968   } else {
1969     FlushText();
1970     terms_.Add(term, zone());
1971   }
1972   LAST(ADD_ATOM);
1973 }
1974 
1975 
AddTerm(RegExpTree * term)1976 void RegExpBuilder::AddTerm(RegExpTree* term) {
1977   FlushText();
1978   terms_.Add(term, zone());
1979   LAST(ADD_ATOM);
1980 }
1981 
1982 
AddAssertion(RegExpTree * assert)1983 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1984   FlushText();
1985   terms_.Add(assert, zone());
1986   LAST(ADD_ASSERT);
1987 }
1988 
1989 
NewAlternative()1990 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1991 
1992 
FlushTerms()1993 void RegExpBuilder::FlushTerms() {
1994   FlushText();
1995   int num_terms = terms_.length();
1996   RegExpTree* alternative;
1997   if (num_terms == 0) {
1998     alternative = new (zone()) RegExpEmpty();
1999   } else if (num_terms == 1) {
2000     alternative = terms_.last();
2001   } else {
2002     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
2003   }
2004   alternatives_.Add(alternative, zone());
2005   terms_.Clear();
2006   LAST(ADD_NONE);
2007 }
2008 
2009 
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)2010 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2011   if (!unicode()) return false;
2012   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2013   // necessarily mean that we need to desugar. It's probably nicer to have a
2014   // separate pass to figure out unicode desugarings.
2015   if (ignore_case()) return true;
2016   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2017   CharacterRange::Canonicalize(ranges);
2018   for (int i = ranges->length() - 1; i >= 0; i--) {
2019     uc32 from = ranges->at(i).from();
2020     uc32 to = ranges->at(i).to();
2021     // Check for non-BMP characters.
2022     if (to >= kNonBmpStart) return true;
2023     // Check for lone surrogates.
2024     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2025   }
2026   return false;
2027 }
2028 
2029 
NeedsDesugaringForIgnoreCase(uc32 c)2030 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2031 #ifdef V8_INTL_SUPPORT
2032   if (unicode() && ignore_case()) {
2033     icu::UnicodeSet set(c, c);
2034     set.closeOver(USET_CASE_INSENSITIVE);
2035     set.removeAllStrings();
2036     return set.size() > 1;
2037   }
2038   // In the case where ICU is not included, we act as if the unicode flag is
2039   // not set, and do not desugar.
2040 #endif  // V8_INTL_SUPPORT
2041   return false;
2042 }
2043 
2044 
ToRegExp()2045 RegExpTree* RegExpBuilder::ToRegExp() {
2046   FlushTerms();
2047   int num_alternatives = alternatives_.length();
2048   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
2049   if (num_alternatives == 1) return alternatives_.last();
2050   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
2051 }
2052 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)2053 bool RegExpBuilder::AddQuantifierToAtom(
2054     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2055   FlushPendingSurrogate();
2056   if (pending_empty_) {
2057     pending_empty_ = false;
2058     return true;
2059   }
2060   RegExpTree* atom;
2061   if (characters_ != nullptr) {
2062     DCHECK(last_added_ == ADD_CHAR);
2063     // Last atom was character.
2064     Vector<const uc16> char_vector = characters_->ToConstVector();
2065     int num_chars = char_vector.length();
2066     if (num_chars > 1) {
2067       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2068       text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
2069       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2070     }
2071     characters_ = nullptr;
2072     atom = new (zone()) RegExpAtom(char_vector, flags_);
2073     FlushText();
2074   } else if (text_.length() > 0) {
2075     DCHECK(last_added_ == ADD_ATOM);
2076     atom = text_.RemoveLast();
2077     FlushText();
2078   } else if (terms_.length() > 0) {
2079     DCHECK(last_added_ == ADD_ATOM);
2080     atom = terms_.RemoveLast();
2081     if (atom->IsLookaround()) {
2082       // With /u, lookarounds are not quantifiable.
2083       if (unicode()) return false;
2084       // Lookbehinds are not quantifiable.
2085       if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2086         return false;
2087       }
2088     }
2089     if (atom->max_match() == 0) {
2090       // Guaranteed to only match an empty string.
2091       LAST(ADD_TERM);
2092       if (min == 0) {
2093         return true;
2094       }
2095       terms_.Add(atom, zone());
2096       return true;
2097     }
2098   } else {
2099     // Only call immediately after adding an atom or character!
2100     UNREACHABLE();
2101   }
2102   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2103              zone());
2104   LAST(ADD_TERM);
2105   return true;
2106 }
2107 
2108 }  // namespace internal
2109 }  // namespace v8
2110