1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "new-regexp/regexp-parser.h"
6
7 #include <vector>
8
9 #include "new-regexp/property-sequences.h"
10 #include "new-regexp/regexp-macro-assembler.h"
11 #include "new-regexp/regexp.h"
12
13 #ifdef V8_INTL_SUPPORT
14 #include "unicode/uniset.h"
15 #endif // V8_INTL_SUPPORT
16
17 namespace v8 {
18 namespace internal {
19
RegExpParser(FlatStringReader * in,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)20 RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
21 Isolate* isolate, Zone* zone)
22 : isolate_(isolate),
23 zone_(zone),
24 captures_(nullptr),
25 named_captures_(nullptr),
26 named_back_references_(nullptr),
27 in_(in),
28 current_(kEndMarker),
29 top_level_flags_(flags),
30 next_pos_(0),
31 captures_started_(0),
32 capture_count_(0),
33 has_more_(true),
34 simple_(false),
35 contains_anchor_(false),
36 is_scanned_for_captures_(false),
37 has_named_captures_(false),
38 failed_(false) {
39 Advance();
40 }
41
42 template <bool update_position>
ReadNext()43 inline uc32 RegExpParser::ReadNext() {
44 int position = next_pos_;
45 uc32 c0 = in()->Get(position);
46 position++;
47 // Read the whole surrogate pair in case of unicode flag, if possible.
48 if (unicode() && position < in()->length() &&
49 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
50 uc16 c1 = in()->Get(position);
51 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
52 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
53 position++;
54 }
55 }
56 if (update_position) next_pos_ = position;
57 return c0;
58 }
59
60
Next()61 uc32 RegExpParser::Next() {
62 if (has_next()) {
63 return ReadNext<false>();
64 } else {
65 return kEndMarker;
66 }
67 }
68
Advance()69 void RegExpParser::Advance() {
70 if (has_next()) {
71 StackLimitCheck check(isolate());
72 if (check.HasOverflowed()) {
73 if (FLAG_correctness_fuzzer_suppressions) {
74 FATAL("Aborting on stack overflow");
75 }
76 ReportError(RegExpError::kStackOverflow);
77 } else if (zone()->excess_allocation()) {
78 if (FLAG_correctness_fuzzer_suppressions) {
79 FATAL("Aborting on excess zone allocation");
80 }
81 ReportError(RegExpError::kTooLarge);
82 } else {
83 current_ = ReadNext<true>();
84 }
85 } else {
86 current_ = kEndMarker;
87 // Advance so that position() points to 1-after-the-last-character. This is
88 // important so that Reset() to this position works correctly.
89 next_pos_ = in()->length() + 1;
90 has_more_ = false;
91 }
92 }
93
94
Reset(int pos)95 void RegExpParser::Reset(int pos) {
96 next_pos_ = pos;
97 has_more_ = (pos < in()->length());
98 Advance();
99 }
100
Advance(int dist)101 void RegExpParser::Advance(int dist) {
102 next_pos_ += dist - 1;
103 Advance();
104 }
105
106
simple()107 bool RegExpParser::simple() { return simple_; }
108
IsSyntaxCharacterOrSlash(uc32 c)109 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
110 switch (c) {
111 case '^':
112 case '$':
113 case '\\':
114 case '.':
115 case '*':
116 case '+':
117 case '?':
118 case '(':
119 case ')':
120 case '[':
121 case ']':
122 case '{':
123 case '}':
124 case '|':
125 case '/':
126 return true;
127 default:
128 break;
129 }
130 return false;
131 }
132
ReportError(RegExpError error)133 RegExpTree* RegExpParser::ReportError(RegExpError error) {
134 if (failed_) return nullptr; // Do not overwrite any existing error.
135 failed_ = true;
136 error_ = error;
137 error_pos_ = position();
138 // Zip to the end to make sure no more input is read.
139 current_ = kEndMarker;
140 next_pos_ = in()->length();
141 return nullptr;
142 }
143
144 #define CHECK_FAILED /**/); \
145 if (failed_) return nullptr; \
146 ((void)0
147
148 // Pattern ::
149 // Disjunction
ParsePattern()150 RegExpTree* RegExpParser::ParsePattern() {
151 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
152 PatchNamedBackReferences(CHECK_FAILED);
153 DCHECK(!has_more());
154 // If the result of parsing is a literal string atom, and it has the
155 // same length as the input, then the atom is identical to the input.
156 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
157 simple_ = true;
158 }
159 return result;
160 }
161
162
163 // Disjunction ::
164 // Alternative
165 // Alternative | Disjunction
166 // Alternative ::
167 // [empty]
168 // Term Alternative
169 // Term ::
170 // Assertion
171 // Atom
172 // Atom Quantifier
ParseDisjunction()173 RegExpTree* RegExpParser::ParseDisjunction() {
174 // Used to store current state while parsing subexpressions.
175 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
176 0, nullptr, top_level_flags_, zone());
177 RegExpParserState* state = &initial_state;
178 // Cache the builder in a local variable for quick access.
179 RegExpBuilder* builder = initial_state.builder();
180 while (true) {
181 switch (current()) {
182 case kEndMarker:
183 if (state->IsSubexpression()) {
184 // Inside a parenthesized group when hitting end of input.
185 return ReportError(RegExpError::kUnterminatedGroup);
186 }
187 DCHECK_EQ(INITIAL, state->group_type());
188 // Parsing completed successfully.
189 return builder->ToRegExp();
190 case ')': {
191 if (!state->IsSubexpression()) {
192 return ReportError(RegExpError::kUnmatchedParen);
193 }
194 DCHECK_NE(INITIAL, state->group_type());
195
196 Advance();
197 // End disjunction parsing and convert builder content to new single
198 // regexp atom.
199 RegExpTree* body = builder->ToRegExp();
200
201 int end_capture_index = captures_started();
202
203 int capture_index = state->capture_index();
204 SubexpressionType group_type = state->group_type();
205
206 // Build result of subexpression.
207 if (group_type == CAPTURE) {
208 if (state->IsNamedCapture()) {
209 CreateNamedCaptureAtIndex(state->capture_name(),
210 capture_index CHECK_FAILED);
211 }
212 RegExpCapture* capture = GetCapture(capture_index);
213 capture->set_body(body);
214 body = capture;
215 } else if (group_type == GROUPING) {
216 body = new (zone()) RegExpGroup(body);
217 } else {
218 DCHECK(group_type == POSITIVE_LOOKAROUND ||
219 group_type == NEGATIVE_LOOKAROUND);
220 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
221 body = new (zone()) RegExpLookaround(
222 body, is_positive, end_capture_index - capture_index,
223 capture_index, state->lookaround_type());
224 }
225
226 // Restore previous state.
227 state = state->previous_state();
228 builder = state->builder();
229
230 builder->AddAtom(body);
231 // For compatibility with JSC and ES3, we allow quantifiers after
232 // lookaheads, and break in all cases.
233 break;
234 }
235 case '|': {
236 Advance();
237 builder->NewAlternative();
238 continue;
239 }
240 case '*':
241 case '+':
242 case '?':
243 return ReportError(RegExpError::kNothingToRepeat);
244 case '^': {
245 Advance();
246 if (builder->multiline()) {
247 builder->AddAssertion(new (zone()) RegExpAssertion(
248 RegExpAssertion::START_OF_LINE, builder->flags()));
249 } else {
250 builder->AddAssertion(new (zone()) RegExpAssertion(
251 RegExpAssertion::START_OF_INPUT, builder->flags()));
252 set_contains_anchor();
253 }
254 continue;
255 }
256 case '$': {
257 Advance();
258 RegExpAssertion::AssertionType assertion_type =
259 builder->multiline() ? RegExpAssertion::END_OF_LINE
260 : RegExpAssertion::END_OF_INPUT;
261 builder->AddAssertion(
262 new (zone()) RegExpAssertion(assertion_type, builder->flags()));
263 continue;
264 }
265 case '.': {
266 Advance();
267 ZoneList<CharacterRange>* ranges =
268 new (zone()) ZoneList<CharacterRange>(2, zone());
269
270 if (builder->dotall()) {
271 // Everything.
272 CharacterRange::AddClassEscape('*', ranges, false, zone());
273 } else {
274 // Everything except \x0A, \x0D, \u2028 and \u2029
275 CharacterRange::AddClassEscape('.', ranges, false, zone());
276 }
277
278 RegExpCharacterClass* cc =
279 new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
280 builder->AddCharacterClass(cc);
281 break;
282 }
283 case '(': {
284 state = ParseOpenParenthesis(state CHECK_FAILED);
285 builder = state->builder();
286 continue;
287 }
288 case '[': {
289 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
290 builder->AddCharacterClass(cc->AsCharacterClass());
291 break;
292 }
293 // Atom ::
294 // \ AtomEscape
295 case '\\':
296 switch (Next()) {
297 case kEndMarker:
298 return ReportError(RegExpError::kEscapeAtEndOfPattern);
299 case 'b':
300 Advance(2);
301 builder->AddAssertion(new (zone()) RegExpAssertion(
302 RegExpAssertion::BOUNDARY, builder->flags()));
303 continue;
304 case 'B':
305 Advance(2);
306 builder->AddAssertion(new (zone()) RegExpAssertion(
307 RegExpAssertion::NON_BOUNDARY, builder->flags()));
308 continue;
309 // AtomEscape ::
310 // CharacterClassEscape
311 //
312 // CharacterClassEscape :: one of
313 // d D s S w W
314 case 'd':
315 case 'D':
316 case 's':
317 case 'S':
318 case 'w':
319 case 'W': {
320 uc32 c = Next();
321 Advance(2);
322 ZoneList<CharacterRange>* ranges =
323 new (zone()) ZoneList<CharacterRange>(2, zone());
324 CharacterRange::AddClassEscape(
325 c, ranges, unicode() && builder->ignore_case(), zone());
326 RegExpCharacterClass* cc = new (zone())
327 RegExpCharacterClass(zone(), ranges, builder->flags());
328 builder->AddCharacterClass(cc);
329 break;
330 }
331 case 'p':
332 case 'P': {
333 uc32 p = Next();
334 Advance(2);
335 if (unicode()) {
336 ZoneList<CharacterRange>* ranges =
337 new (zone()) ZoneList<CharacterRange>(2, zone());
338 ZoneVector<char> name_1(zone());
339 ZoneVector<char> name_2(zone());
340 if (ParsePropertyClassName(&name_1, &name_2)) {
341 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
342 RegExpCharacterClass* cc = new (zone())
343 RegExpCharacterClass(zone(), ranges, builder->flags());
344 builder->AddCharacterClass(cc);
345 break;
346 }
347 if (p == 'p' && name_2.empty()) {
348 RegExpTree* sequence = GetPropertySequence(name_1);
349 if (sequence != nullptr) {
350 builder->AddAtom(sequence);
351 break;
352 }
353 }
354 }
355 return ReportError(RegExpError::kInvalidPropertyName);
356 } else {
357 builder->AddCharacter(p);
358 }
359 break;
360 }
361 case '1':
362 case '2':
363 case '3':
364 case '4':
365 case '5':
366 case '6':
367 case '7':
368 case '8':
369 case '9': {
370 int index = 0;
371 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
372 if (is_backref) {
373 if (state->IsInsideCaptureGroup(index)) {
374 // The back reference is inside the capture group it refers to.
375 // Nothing can possibly have been captured yet, so we use empty
376 // instead. This ensures that, when checking a back reference,
377 // the capture registers of the referenced capture are either
378 // both set or both cleared.
379 builder->AddEmpty();
380 } else {
381 RegExpCapture* capture = GetCapture(index);
382 RegExpTree* atom =
383 new (zone()) RegExpBackReference(capture, builder->flags());
384 builder->AddAtom(atom);
385 }
386 break;
387 }
388 // With /u, no identity escapes except for syntax characters
389 // are allowed. Otherwise, all identity escapes are allowed.
390 if (unicode()) {
391 return ReportError(RegExpError::kInvalidEscape);
392 }
393 uc32 first_digit = Next();
394 if (first_digit == '8' || first_digit == '9') {
395 builder->AddCharacter(first_digit);
396 Advance(2);
397 break;
398 }
399 V8_FALLTHROUGH;
400 }
401 case '0': {
402 Advance();
403 if (unicode() && Next() >= '0' && Next() <= '9') {
404 // With /u, decimal escape with leading 0 are not parsed as octal.
405 return ReportError(RegExpError::kInvalidDecimalEscape);
406 }
407 uc32 octal = ParseOctalLiteral();
408 builder->AddCharacter(octal);
409 break;
410 }
411 // ControlEscape :: one of
412 // f n r t v
413 case 'f':
414 Advance(2);
415 builder->AddCharacter('\f');
416 break;
417 case 'n':
418 Advance(2);
419 builder->AddCharacter('\n');
420 break;
421 case 'r':
422 Advance(2);
423 builder->AddCharacter('\r');
424 break;
425 case 't':
426 Advance(2);
427 builder->AddCharacter('\t');
428 break;
429 case 'v':
430 Advance(2);
431 builder->AddCharacter('\v');
432 break;
433 case 'c': {
434 Advance();
435 uc32 controlLetter = Next();
436 // Special case if it is an ASCII letter.
437 // Convert lower case letters to uppercase.
438 uc32 letter = controlLetter & ~('a' ^ 'A');
439 if (letter < 'A' || 'Z' < letter) {
440 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
441 // Read the backslash as a literal character instead of as
442 // starting an escape.
443 // ES#prod-annexB-ExtendedPatternCharacter
444 if (unicode()) {
445 // With /u, invalid escapes are not treated as identity escapes.
446 return ReportError(RegExpError::kInvalidUnicodeEscape);
447 }
448 builder->AddCharacter('\\');
449 } else {
450 Advance(2);
451 builder->AddCharacter(controlLetter & 0x1F);
452 }
453 break;
454 }
455 case 'x': {
456 Advance(2);
457 uc32 value;
458 if (ParseHexEscape(2, &value)) {
459 builder->AddCharacter(value);
460 } else if (!unicode()) {
461 builder->AddCharacter('x');
462 } else {
463 // With /u, invalid escapes are not treated as identity escapes.
464 return ReportError(RegExpError::kInvalidEscape);
465 }
466 break;
467 }
468 case 'u': {
469 Advance(2);
470 uc32 value;
471 if (ParseUnicodeEscape(&value)) {
472 builder->AddEscapedUnicodeCharacter(value);
473 } else if (!unicode()) {
474 builder->AddCharacter('u');
475 } else {
476 // With /u, invalid escapes are not treated as identity escapes.
477 return ReportError(RegExpError::kInvalidUnicodeEscape);
478 }
479 break;
480 }
481 case 'k':
482 // Either an identity escape or a named back-reference. The two
483 // interpretations are mutually exclusive: '\k' is interpreted as
484 // an identity escape for non-Unicode patterns without named
485 // capture groups, and as the beginning of a named back-reference
486 // in all other cases.
487 if (unicode() || HasNamedCaptures()) {
488 Advance(2);
489 ParseNamedBackReference(builder, state CHECK_FAILED);
490 break;
491 }
492 V8_FALLTHROUGH;
493 default:
494 Advance();
495 // With /u, no identity escapes except for syntax characters
496 // are allowed. Otherwise, all identity escapes are allowed.
497 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
498 builder->AddCharacter(current());
499 Advance();
500 } else {
501 return ReportError(RegExpError::kInvalidEscape);
502 }
503 break;
504 }
505 break;
506 case '{': {
507 int dummy;
508 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
509 if (parsed) return ReportError(RegExpError::kNothingToRepeat);
510 V8_FALLTHROUGH;
511 }
512 case '}':
513 case ']':
514 if (unicode()) {
515 return ReportError(RegExpError::kLoneQuantifierBrackets);
516 }
517 V8_FALLTHROUGH;
518 default:
519 builder->AddUnicodeCharacter(current());
520 Advance();
521 break;
522 } // end switch(current())
523
524 int min;
525 int max;
526 switch (current()) {
527 // QuantifierPrefix ::
528 // *
529 // +
530 // ?
531 // {
532 case '*':
533 min = 0;
534 max = RegExpTree::kInfinity;
535 Advance();
536 break;
537 case '+':
538 min = 1;
539 max = RegExpTree::kInfinity;
540 Advance();
541 break;
542 case '?':
543 min = 0;
544 max = 1;
545 Advance();
546 break;
547 case '{':
548 if (ParseIntervalQuantifier(&min, &max)) {
549 if (max < min) {
550 return ReportError(RegExpError::kRangeOutOfOrder);
551 }
552 break;
553 } else if (unicode()) {
554 // With /u, incomplete quantifiers are not allowed.
555 return ReportError(RegExpError::kIncompleteQuantifier);
556 }
557 continue;
558 default:
559 continue;
560 }
561 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
562 if (current() == '?') {
563 quantifier_type = RegExpQuantifier::NON_GREEDY;
564 Advance();
565 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
566 // FLAG_regexp_possessive_quantifier is a debug-only flag.
567 quantifier_type = RegExpQuantifier::POSSESSIVE;
568 Advance();
569 }
570 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
571 return ReportError(RegExpError::kInvalidQuantifier);
572 }
573 }
574 }
575
ParseOpenParenthesis(RegExpParserState * state)576 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
577 RegExpParserState* state) {
578 RegExpLookaround::Type lookaround_type = state->lookaround_type();
579 bool is_named_capture = false;
580 JSRegExp::Flags switch_on = JSRegExp::kNone;
581 JSRegExp::Flags switch_off = JSRegExp::kNone;
582 const ZoneVector<uc16>* capture_name = nullptr;
583 SubexpressionType subexpr_type = CAPTURE;
584 Advance();
585 if (current() == '?') {
586 switch (Next()) {
587 case ':':
588 Advance(2);
589 subexpr_type = GROUPING;
590 break;
591 case '=':
592 Advance(2);
593 lookaround_type = RegExpLookaround::LOOKAHEAD;
594 subexpr_type = POSITIVE_LOOKAROUND;
595 break;
596 case '!':
597 Advance(2);
598 lookaround_type = RegExpLookaround::LOOKAHEAD;
599 subexpr_type = NEGATIVE_LOOKAROUND;
600 break;
601 case '-':
602 case 'i':
603 case 's':
604 case 'm': {
605 if (!FLAG_regexp_mode_modifiers) {
606 ReportError(RegExpError::kInvalidGroup);
607 return nullptr;
608 }
609 Advance();
610 bool flags_sense = true; // Switching on flags.
611 while (subexpr_type != GROUPING) {
612 switch (current()) {
613 case '-':
614 if (!flags_sense) {
615 ReportError(RegExpError::kMultipleFlagDashes);
616 return nullptr;
617 }
618 flags_sense = false;
619 Advance();
620 continue;
621 case 's':
622 case 'i':
623 case 'm': {
624 JSRegExp::Flags bit = JSRegExp::kUnicode;
625 if (current() == 'i') bit = JSRegExp::kIgnoreCase;
626 if (current() == 'm') bit = JSRegExp::kMultiline;
627 if (current() == 's') bit = JSRegExp::kDotAll;
628 if (((switch_on | switch_off) & bit) != 0) {
629 ReportError(RegExpError::kRepeatedFlag);
630 return nullptr;
631 }
632 if (flags_sense) {
633 switch_on |= bit;
634 } else {
635 switch_off |= bit;
636 }
637 Advance();
638 continue;
639 }
640 case ')': {
641 Advance();
642 state->builder()
643 ->FlushText(); // Flush pending text using old flags.
644 // These (?i)-style flag switches don't put us in a subexpression
645 // at all, they just modify the flags in the rest of the current
646 // subexpression.
647 JSRegExp::Flags flags =
648 (state->builder()->flags() | switch_on) & ~switch_off;
649 state->builder()->set_flags(flags);
650 return state;
651 }
652 case ':':
653 Advance();
654 subexpr_type = GROUPING; // Will break us out of the outer loop.
655 continue;
656 default:
657 ReportError(RegExpError::kInvalidFlagGroup);
658 return nullptr;
659 }
660 }
661 break;
662 }
663 case '<':
664 Advance();
665 if (Next() == '=') {
666 Advance(2);
667 lookaround_type = RegExpLookaround::LOOKBEHIND;
668 subexpr_type = POSITIVE_LOOKAROUND;
669 break;
670 } else if (Next() == '!') {
671 Advance(2);
672 lookaround_type = RegExpLookaround::LOOKBEHIND;
673 subexpr_type = NEGATIVE_LOOKAROUND;
674 break;
675 }
676 is_named_capture = true;
677 has_named_captures_ = true;
678 Advance();
679 break;
680 default:
681 ReportError(RegExpError::kInvalidGroup);
682 return nullptr;
683 }
684 }
685 if (subexpr_type == CAPTURE) {
686 if (captures_started_ >= JSRegExp::kMaxCaptures) {
687 ReportError(RegExpError::kTooManyCaptures);
688 return nullptr;
689 }
690 captures_started_++;
691
692 if (is_named_capture) {
693 capture_name = ParseCaptureGroupName(CHECK_FAILED);
694 }
695 }
696 JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
697 // Store current state and begin new disjunction parsing.
698 return new (zone())
699 RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
700 capture_name, flags, zone());
701 }
702
703 #ifdef DEBUG
704 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)705 static bool IsSpecialClassEscape(uc32 c) {
706 switch (c) {
707 case 'd':
708 case 'D':
709 case 's':
710 case 'S':
711 case 'w':
712 case 'W':
713 return true;
714 default:
715 return false;
716 }
717 }
718 #endif
719
720
721 // In order to know whether an escape is a backreference or not we have to scan
722 // the entire regexp and find the number of capturing parentheses. However we
723 // don't want to scan the regexp twice unless it is necessary. This mini-parser
724 // is called when needed. It can see the difference between capturing and
725 // noncapturing parentheses and can skip character classes and backslash-escaped
726 // characters.
ScanForCaptures()727 void RegExpParser::ScanForCaptures() {
728 DCHECK(!is_scanned_for_captures_);
729 const int saved_position = position();
730 // Start with captures started previous to current position
731 int capture_count = captures_started();
732 // Add count of captures after this position.
733 int n;
734 while ((n = current()) != kEndMarker) {
735 Advance();
736 switch (n) {
737 case '\\':
738 Advance();
739 break;
740 case '[': {
741 int c;
742 while ((c = current()) != kEndMarker) {
743 Advance();
744 if (c == '\\') {
745 Advance();
746 } else {
747 if (c == ']') break;
748 }
749 }
750 break;
751 }
752 case '(':
753 if (current() == '?') {
754 // At this point we could be in
755 // * a non-capturing group '(:',
756 // * a lookbehind assertion '(?<=' '(?<!'
757 // * or a named capture '(?<'.
758 //
759 // Of these, only named captures are capturing groups.
760
761 Advance();
762 if (current() != '<') break;
763
764 Advance();
765 if (current() == '=' || current() == '!') break;
766
767 // Found a possible named capture. It could turn out to be a syntax
768 // error (e.g. an unterminated or invalid name), but that distinction
769 // does not matter for our purposes.
770 has_named_captures_ = true;
771 }
772 capture_count++;
773 break;
774 }
775 }
776 capture_count_ = capture_count;
777 is_scanned_for_captures_ = true;
778 Reset(saved_position);
779 }
780
781
ParseBackReferenceIndex(int * index_out)782 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
783 DCHECK_EQ('\\', current());
784 DCHECK('1' <= Next() && Next() <= '9');
785 // Try to parse a decimal literal that is no greater than the total number
786 // of left capturing parentheses in the input.
787 int start = position();
788 int value = Next() - '0';
789 Advance(2);
790 while (true) {
791 uc32 c = current();
792 if (IsDecimalDigit(c)) {
793 value = 10 * value + (c - '0');
794 if (value > JSRegExp::kMaxCaptures) {
795 Reset(start);
796 return false;
797 }
798 Advance();
799 } else {
800 break;
801 }
802 }
803 if (value > captures_started()) {
804 if (!is_scanned_for_captures_) ScanForCaptures();
805 if (value > capture_count_) {
806 Reset(start);
807 return false;
808 }
809 }
810 *index_out = value;
811 return true;
812 }
813
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)814 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
815 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
816 v->push_back(code_unit);
817 } else {
818 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
819 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
820 }
821 }
822
ParseCaptureGroupName()823 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
824 ZoneVector<uc16>* name =
825 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
826
827 bool at_start = true;
828 while (true) {
829 uc32 c = current();
830 Advance();
831
832 // Convert unicode escapes.
833 if (c == '\\' && current() == 'u') {
834 Advance();
835 if (!ParseUnicodeEscape(&c)) {
836 ReportError(RegExpError::kInvalidUnicodeEscape);
837 return nullptr;
838 }
839 }
840
841 // The backslash char is misclassified as both ID_Start and ID_Continue.
842 if (c == '\\') {
843 ReportError(RegExpError::kInvalidCaptureGroupName);
844 return nullptr;
845 }
846
847 if (at_start) {
848 if (!IsIdentifierStart(c)) {
849 ReportError(RegExpError::kInvalidCaptureGroupName);
850 return nullptr;
851 }
852 push_code_unit(name, c);
853 at_start = false;
854 } else {
855 if (c == '>') {
856 break;
857 } else if (IsIdentifierPart(c)) {
858 push_code_unit(name, c);
859 } else {
860 ReportError(RegExpError::kInvalidCaptureGroupName);
861 return nullptr;
862 }
863 }
864 }
865
866 return name;
867 }
868
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)869 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
870 int index) {
871 DCHECK(0 < index && index <= captures_started_);
872 DCHECK_NOT_NULL(name);
873
874 RegExpCapture* capture = GetCapture(index);
875 DCHECK_NULL(capture->name());
876
877 capture->set_name(name);
878
879 if (named_captures_ == nullptr) {
880 named_captures_ = new (zone_->New(sizeof(*named_captures_)))
881 ZoneSet<RegExpCapture*, RegExpCaptureNameLess>(zone());
882 } else {
883 // Check for duplicates and bail if we find any.
884
885 const auto& named_capture_it = named_captures_->find(capture);
886 if (named_capture_it != named_captures_->end()) {
887 ReportError(RegExpError::kDuplicateCaptureGroupName);
888 return false;
889 }
890 }
891
892 named_captures_->emplace(capture);
893
894 return true;
895 }
896
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)897 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
898 RegExpParserState* state) {
899 // The parser is assumed to be on the '<' in \k<name>.
900 if (current() != '<') {
901 ReportError(RegExpError::kInvalidNamedReference);
902 return false;
903 }
904
905 Advance();
906 const ZoneVector<uc16>* name = ParseCaptureGroupName();
907 if (name == nullptr) {
908 return false;
909 }
910
911 if (state->IsInsideCaptureGroup(name)) {
912 builder->AddEmpty();
913 } else {
914 RegExpBackReference* atom =
915 new (zone()) RegExpBackReference(builder->flags());
916 atom->set_name(name);
917
918 builder->AddAtom(atom);
919
920 if (named_back_references_ == nullptr) {
921 named_back_references_ =
922 new (zone()) ZoneList<RegExpBackReference*>(1, zone());
923 }
924 named_back_references_->Add(atom, zone());
925 }
926
927 return true;
928 }
929
PatchNamedBackReferences()930 void RegExpParser::PatchNamedBackReferences() {
931 if (named_back_references_ == nullptr) return;
932
933 if (named_captures_ == nullptr) {
934 ReportError(RegExpError::kInvalidNamedCaptureReference);
935 return;
936 }
937
938 // Look up and patch the actual capture for each named back reference.
939
940 for (int i = 0; i < named_back_references_->length(); i++) {
941 RegExpBackReference* ref = named_back_references_->at(i);
942
943 // Capture used to search the named_captures_ by name, index of the
944 // capture is never used.
945 static const int kInvalidIndex = 0;
946 RegExpCapture* search_capture = new (zone()) RegExpCapture(kInvalidIndex);
947 DCHECK_NULL(search_capture->name());
948 search_capture->set_name(ref->name());
949
950 int index = -1;
951 const auto& capture_it = named_captures_->find(search_capture);
952 if (capture_it != named_captures_->end()) {
953 index = (*capture_it)->index();
954 } else {
955 ReportError(RegExpError::kInvalidNamedCaptureReference);
956 return;
957 }
958
959 ref->set_capture(GetCapture(index));
960 }
961 }
962
GetCapture(int index)963 RegExpCapture* RegExpParser::GetCapture(int index) {
964 // The index for the capture groups are one-based. Its index in the list is
965 // zero-based.
966 int know_captures =
967 is_scanned_for_captures_ ? capture_count_ : captures_started_;
968 DCHECK(index <= know_captures);
969 if (captures_ == nullptr) {
970 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
971 }
972 while (captures_->length() < know_captures) {
973 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
974 }
975 return captures_->at(index - 1);
976 }
977
978 namespace {
979
980 struct RegExpCaptureIndexLess {
operator ()v8::internal::__anon499c9d230111::RegExpCaptureIndexLess981 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
982 DCHECK_NOT_NULL(lhs);
983 DCHECK_NOT_NULL(rhs);
984 return lhs->index() < rhs->index();
985 }
986 };
987
988 } // namespace
989
CreateCaptureNameMap()990 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
991 if (named_captures_ == nullptr || named_captures_->empty()) {
992 return Handle<FixedArray>();
993 }
994
995 // Named captures are sorted by name (because the set is used to ensure
996 // name uniqueness). But the capture name map must to be sorted by index.
997
998 ZoneVector<RegExpCapture*> sorted_named_captures(
999 named_captures_->begin(), named_captures_->end(), zone());
1000 std::sort(sorted_named_captures.begin(), sorted_named_captures.end(),
1001 RegExpCaptureIndexLess{});
1002 DCHECK_EQ(sorted_named_captures.size(), named_captures_->size());
1003
1004 Factory* factory = isolate()->factory();
1005
1006 int len = static_cast<int>(sorted_named_captures.size()) * 2;
1007 Handle<FixedArray> array = factory->NewFixedArray(len);
1008
1009 int i = 0;
1010 for (const auto& capture : sorted_named_captures) {
1011 Vector<const uc16> capture_name(capture->name()->data(),
1012 capture->name()->size());
1013 // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
1014 // internalized so they can be used as property names in the 'exec' results.
1015 Handle<String> name = factory->InternalizeString(capture_name);
1016 array->set(i * 2, *name);
1017 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
1018
1019 i++;
1020 }
1021 DCHECK_EQ(i * 2, len);
1022
1023 return array;
1024 }
1025
HasNamedCaptures()1026 bool RegExpParser::HasNamedCaptures() {
1027 if (has_named_captures_ || is_scanned_for_captures_) {
1028 return has_named_captures_;
1029 }
1030
1031 ScanForCaptures();
1032 DCHECK(is_scanned_for_captures_);
1033 return has_named_captures_;
1034 }
1035
IsInsideCaptureGroup(int index)1036 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1037 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1038 if (s->group_type() != CAPTURE) continue;
1039 // Return true if we found the matching capture index.
1040 if (index == s->capture_index()) return true;
1041 // Abort if index is larger than what has been parsed up till this state.
1042 if (index > s->capture_index()) return false;
1043 }
1044 return false;
1045 }
1046
IsInsideCaptureGroup(const ZoneVector<uc16> * name)1047 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1048 const ZoneVector<uc16>* name) {
1049 DCHECK_NOT_NULL(name);
1050 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1051 if (s->capture_name() == nullptr) continue;
1052 if (*s->capture_name() == *name) return true;
1053 }
1054 return false;
1055 }
1056
1057 // QuantifierPrefix ::
1058 // { DecimalDigits }
1059 // { DecimalDigits , }
1060 // { DecimalDigits , DecimalDigits }
1061 //
1062 // Returns true if parsing succeeds, and set the min_out and max_out
1063 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)1064 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1065 DCHECK_EQ(current(), '{');
1066 int start = position();
1067 Advance();
1068 int min = 0;
1069 if (!IsDecimalDigit(current())) {
1070 Reset(start);
1071 return false;
1072 }
1073 while (IsDecimalDigit(current())) {
1074 int next = current() - '0';
1075 if (min > (RegExpTree::kInfinity - next) / 10) {
1076 // Overflow. Skip past remaining decimal digits and return -1.
1077 do {
1078 Advance();
1079 } while (IsDecimalDigit(current()));
1080 min = RegExpTree::kInfinity;
1081 break;
1082 }
1083 min = 10 * min + next;
1084 Advance();
1085 }
1086 int max = 0;
1087 if (current() == '}') {
1088 max = min;
1089 Advance();
1090 } else if (current() == ',') {
1091 Advance();
1092 if (current() == '}') {
1093 max = RegExpTree::kInfinity;
1094 Advance();
1095 } else {
1096 while (IsDecimalDigit(current())) {
1097 int next = current() - '0';
1098 if (max > (RegExpTree::kInfinity - next) / 10) {
1099 do {
1100 Advance();
1101 } while (IsDecimalDigit(current()));
1102 max = RegExpTree::kInfinity;
1103 break;
1104 }
1105 max = 10 * max + next;
1106 Advance();
1107 }
1108 if (current() != '}') {
1109 Reset(start);
1110 return false;
1111 }
1112 Advance();
1113 }
1114 } else {
1115 Reset(start);
1116 return false;
1117 }
1118 *min_out = min;
1119 *max_out = max;
1120 return true;
1121 }
1122
1123
ParseOctalLiteral()1124 uc32 RegExpParser::ParseOctalLiteral() {
1125 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1126 // For compatibility with some other browsers (not all), we parse
1127 // up to three octal digits with a value below 256.
1128 // ES#prod-annexB-LegacyOctalEscapeSequence
1129 uc32 value = current() - '0';
1130 Advance();
1131 if ('0' <= current() && current() <= '7') {
1132 value = value * 8 + current() - '0';
1133 Advance();
1134 if (value < 32 && '0' <= current() && current() <= '7') {
1135 value = value * 8 + current() - '0';
1136 Advance();
1137 }
1138 }
1139 return value;
1140 }
1141
1142
ParseHexEscape(int length,uc32 * value)1143 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1144 int start = position();
1145 uc32 val = 0;
1146 for (int i = 0; i < length; ++i) {
1147 uc32 c = current();
1148 int d = HexValue(c);
1149 if (d < 0) {
1150 Reset(start);
1151 return false;
1152 }
1153 val = val * 16 + d;
1154 Advance();
1155 }
1156 *value = val;
1157 return true;
1158 }
1159
1160 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1161 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1162 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1163 // allowed). In the latter case, the number of hex digits between { } is
1164 // arbitrary. \ and u have already been read.
1165 if (current() == '{' && unicode()) {
1166 int start = position();
1167 Advance();
1168 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1169 if (current() == '}') {
1170 Advance();
1171 return true;
1172 }
1173 }
1174 Reset(start);
1175 return false;
1176 }
1177 // \u but no {, or \u{...} escapes not allowed.
1178 bool result = ParseHexEscape(4, value);
1179 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1180 current() == '\\') {
1181 // Attempt to read trail surrogate.
1182 int start = position();
1183 if (Next() == 'u') {
1184 Advance(2);
1185 uc32 trail;
1186 if (ParseHexEscape(4, &trail) &&
1187 unibrow::Utf16::IsTrailSurrogate(trail)) {
1188 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1189 static_cast<uc16>(trail));
1190 return true;
1191 }
1192 }
1193 Reset(start);
1194 }
1195 return result;
1196 }
1197
1198 #ifdef V8_INTL_SUPPORT
1199
1200 namespace {
1201
IsExactPropertyAlias(const char * property_name,UProperty property)1202 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1203 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1204 if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1205 return true;
1206 for (int i = 0;; i++) {
1207 const char* long_name = u_getPropertyName(
1208 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1209 if (long_name == nullptr) break;
1210 if (strcmp(property_name, long_name) == 0) return true;
1211 }
1212 return false;
1213 }
1214
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1215 bool IsExactPropertyValueAlias(const char* property_value_name,
1216 UProperty property, int32_t property_value) {
1217 const char* short_name =
1218 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1219 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1220 return true;
1221 }
1222 for (int i = 0;; i++) {
1223 const char* long_name = u_getPropertyValueName(
1224 property, property_value,
1225 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1226 if (long_name == nullptr) break;
1227 if (strcmp(property_value_name, long_name) == 0) return true;
1228 }
1229 return false;
1230 }
1231
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1232 bool LookupPropertyValueName(UProperty property,
1233 const char* property_value_name, bool negate,
1234 ZoneList<CharacterRange>* result, Zone* zone) {
1235 UProperty property_for_lookup = property;
1236 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1237 // For the property Script_Extensions, we have to do the property value
1238 // name lookup as if the property is Script.
1239 property_for_lookup = UCHAR_SCRIPT;
1240 }
1241 int32_t property_value =
1242 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1243 if (property_value == UCHAR_INVALID_CODE) return false;
1244
1245 // We require the property name to match exactly to one of the property value
1246 // aliases. However, u_getPropertyValueEnum uses loose matching.
1247 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1248 property_value)) {
1249 return false;
1250 }
1251
1252 UErrorCode ec = U_ZERO_ERROR;
1253 icu::UnicodeSet set;
1254 set.applyIntPropertyValue(property, property_value, ec);
1255 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1256
1257 if (success) {
1258 set.removeAllStrings();
1259 if (negate) set.complement();
1260 for (int i = 0; i < set.getRangeCount(); i++) {
1261 result->Add(
1262 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1263 zone);
1264 }
1265 }
1266 return success;
1267 }
1268
1269 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1270 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1271 return strncmp(name, literal, N + 1) == 0;
1272 }
1273
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1274 bool LookupSpecialPropertyValueName(const char* name,
1275 ZoneList<CharacterRange>* result,
1276 bool negate, Zone* zone) {
1277 if (NameEquals(name, "Any")) {
1278 if (negate) {
1279 // Leave the list of character ranges empty, since the negation of 'Any'
1280 // is the empty set.
1281 } else {
1282 result->Add(CharacterRange::Everything(), zone);
1283 }
1284 } else if (NameEquals(name, "ASCII")) {
1285 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1286 : CharacterRange::Range(0x0, 0x7F),
1287 zone);
1288 } else if (NameEquals(name, "Assigned")) {
1289 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1290 !negate, result, zone);
1291 } else {
1292 return false;
1293 }
1294 return true;
1295 }
1296
1297 // Explicitly whitelist supported binary properties. The spec forbids supporting
1298 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1299 bool IsSupportedBinaryProperty(UProperty property) {
1300 switch (property) {
1301 case UCHAR_ALPHABETIC:
1302 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1303 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1304 case UCHAR_ASCII_HEX_DIGIT:
1305 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1306 case UCHAR_BIDI_CONTROL:
1307 case UCHAR_BIDI_MIRRORED:
1308 case UCHAR_CASE_IGNORABLE:
1309 case UCHAR_CASED:
1310 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1311 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1312 case UCHAR_CHANGES_WHEN_LOWERCASED:
1313 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1314 case UCHAR_CHANGES_WHEN_TITLECASED:
1315 case UCHAR_CHANGES_WHEN_UPPERCASED:
1316 case UCHAR_DASH:
1317 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1318 case UCHAR_DEPRECATED:
1319 case UCHAR_DIACRITIC:
1320 case UCHAR_EMOJI:
1321 case UCHAR_EMOJI_COMPONENT:
1322 case UCHAR_EMOJI_MODIFIER_BASE:
1323 case UCHAR_EMOJI_MODIFIER:
1324 case UCHAR_EMOJI_PRESENTATION:
1325 case UCHAR_EXTENDED_PICTOGRAPHIC:
1326 case UCHAR_EXTENDER:
1327 case UCHAR_GRAPHEME_BASE:
1328 case UCHAR_GRAPHEME_EXTEND:
1329 case UCHAR_HEX_DIGIT:
1330 case UCHAR_ID_CONTINUE:
1331 case UCHAR_ID_START:
1332 case UCHAR_IDEOGRAPHIC:
1333 case UCHAR_IDS_BINARY_OPERATOR:
1334 case UCHAR_IDS_TRINARY_OPERATOR:
1335 case UCHAR_JOIN_CONTROL:
1336 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1337 case UCHAR_LOWERCASE:
1338 case UCHAR_MATH:
1339 case UCHAR_NONCHARACTER_CODE_POINT:
1340 case UCHAR_PATTERN_SYNTAX:
1341 case UCHAR_PATTERN_WHITE_SPACE:
1342 case UCHAR_QUOTATION_MARK:
1343 case UCHAR_RADICAL:
1344 case UCHAR_REGIONAL_INDICATOR:
1345 case UCHAR_S_TERM:
1346 case UCHAR_SOFT_DOTTED:
1347 case UCHAR_TERMINAL_PUNCTUATION:
1348 case UCHAR_UNIFIED_IDEOGRAPH:
1349 case UCHAR_UPPERCASE:
1350 case UCHAR_VARIATION_SELECTOR:
1351 case UCHAR_WHITE_SPACE:
1352 case UCHAR_XID_CONTINUE:
1353 case UCHAR_XID_START:
1354 return true;
1355 default:
1356 break;
1357 }
1358 return false;
1359 }
1360
IsUnicodePropertyValueCharacter(char c)1361 bool IsUnicodePropertyValueCharacter(char c) {
1362 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1363 //
1364 // Note that using this to validate each parsed char is quite conservative.
1365 // A possible alternative solution would be to only ensure the parsed
1366 // property name/value candidate string does not contain '\0' characters and
1367 // let ICU lookups trigger the final failure.
1368 if ('a' <= c && c <= 'z') return true;
1369 if ('A' <= c && c <= 'Z') return true;
1370 if ('0' <= c && c <= '9') return true;
1371 return (c == '_');
1372 }
1373
1374 } // anonymous namespace
1375
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1376 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1377 ZoneVector<char>* name_2) {
1378 DCHECK(name_1->empty());
1379 DCHECK(name_2->empty());
1380 // Parse the property class as follows:
1381 // - In \p{name}, 'name' is interpreted
1382 // - either as a general category property value name.
1383 // - or as a binary property name.
1384 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1385 // and 'value' is interpreted as one of the available property value names.
1386 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1387 // - Loose matching is not applied.
1388 if (current() == '{') {
1389 // Parse \p{[PropertyName=]PropertyNameValue}
1390 for (Advance(); current() != '}' && current() != '='; Advance()) {
1391 if (!IsUnicodePropertyValueCharacter(current())) return false;
1392 if (!has_next()) return false;
1393 name_1->push_back(static_cast<char>(current()));
1394 }
1395 if (current() == '=') {
1396 for (Advance(); current() != '}'; Advance()) {
1397 if (!IsUnicodePropertyValueCharacter(current())) return false;
1398 if (!has_next()) return false;
1399 name_2->push_back(static_cast<char>(current()));
1400 }
1401 name_2->push_back(0); // null-terminate string.
1402 }
1403 } else {
1404 return false;
1405 }
1406 Advance();
1407 name_1->push_back(0); // null-terminate string.
1408
1409 DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1410 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1411 return true;
1412 }
1413
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1414 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1415 bool negate,
1416 const ZoneVector<char>& name_1,
1417 const ZoneVector<char>& name_2) {
1418 if (name_2.empty()) {
1419 // First attempt to interpret as general category property value name.
1420 const char* name = name_1.data();
1421 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1422 add_to, zone())) {
1423 return true;
1424 }
1425 // Interpret "Any", "ASCII", and "Assigned".
1426 if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1427 return true;
1428 }
1429 // Then attempt to interpret as binary property name with value name 'Y'.
1430 UProperty property = u_getPropertyEnum(name);
1431 if (!IsSupportedBinaryProperty(property)) return false;
1432 if (!IsExactPropertyAlias(name, property)) return false;
1433 return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1434 zone());
1435 } else {
1436 // Both property name and value name are specified. Attempt to interpret
1437 // the property name as enumerated property.
1438 const char* property_name = name_1.data();
1439 const char* value_name = name_2.data();
1440 UProperty property = u_getPropertyEnum(property_name);
1441 if (!IsExactPropertyAlias(property_name, property)) return false;
1442 if (property == UCHAR_GENERAL_CATEGORY) {
1443 // We want to allow aggregate value names such as "Letter".
1444 property = UCHAR_GENERAL_CATEGORY_MASK;
1445 } else if (property != UCHAR_SCRIPT &&
1446 property != UCHAR_SCRIPT_EXTENSIONS) {
1447 return false;
1448 }
1449 return LookupPropertyValueName(property, value_name, negate, add_to,
1450 zone());
1451 }
1452 }
1453
GetPropertySequence(const ZoneVector<char> & name_1)1454 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
1455 if (!FLAG_harmony_regexp_sequence) return nullptr;
1456 const char* name = name_1.data();
1457 const uc32* sequence_list = nullptr;
1458 JSRegExp::Flags flags = JSRegExp::kUnicode;
1459 if (NameEquals(name, "Emoji_Flag_Sequence")) {
1460 sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1461 } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1462 sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1463 } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1464 sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1465 }
1466 if (sequence_list != nullptr) {
1467 // TODO(yangguo): this creates huge regexp code. Alternative to this is
1468 // to create a new operator that checks for these sequences at runtime.
1469 RegExpBuilder builder(zone(), flags);
1470 while (true) { // Iterate through list of sequences.
1471 while (*sequence_list != 0) { // Iterate through sequence.
1472 builder.AddUnicodeCharacter(*sequence_list);
1473 sequence_list++;
1474 }
1475 sequence_list++;
1476 if (*sequence_list == 0) break;
1477 builder.NewAlternative();
1478 }
1479 return builder.ToRegExp();
1480 }
1481
1482 if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1483 // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1484 // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1485 RegExpBuilder builder(zone(), flags);
1486 ZoneList<CharacterRange>* prefix_ranges =
1487 new (zone()) ZoneList<CharacterRange>(2, zone());
1488 prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1489 prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1490 prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1491 builder.AddCharacterClass(
1492 new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
1493 builder.AddCharacter(0xFE0F);
1494 builder.AddCharacter(0x20E3);
1495 return builder.ToRegExp();
1496 } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1497 // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1498 // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1499 RegExpBuilder builder(zone(), flags);
1500 ZoneList<CharacterRange>* modifier_base_ranges =
1501 new (zone()) ZoneList<CharacterRange>(2, zone());
1502 LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1503 modifier_base_ranges, zone());
1504 builder.AddCharacterClass(
1505 new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
1506 ZoneList<CharacterRange>* modifier_ranges =
1507 new (zone()) ZoneList<CharacterRange>(2, zone());
1508 LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1509 zone());
1510 builder.AddCharacterClass(
1511 new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
1512 return builder.ToRegExp();
1513 }
1514
1515 return nullptr;
1516 }
1517
1518 #else // V8_INTL_SUPPORT
1519
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1520 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1521 ZoneVector<char>* name_2) {
1522 return false;
1523 }
1524
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1525 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1526 bool negate,
1527 const ZoneVector<char>& name_1,
1528 const ZoneVector<char>& name_2) {
1529 return false;
1530 }
1531
GetPropertySequence(const ZoneVector<char> & name)1532 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
1533 return nullptr;
1534 }
1535
1536 #endif // V8_INTL_SUPPORT
1537
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1538 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1539 uc32 x = 0;
1540 int d = HexValue(current());
1541 if (d < 0) {
1542 return false;
1543 }
1544 while (d >= 0) {
1545 x = x * 16 + d;
1546 if (x > max_value) {
1547 return false;
1548 }
1549 Advance();
1550 d = HexValue(current());
1551 }
1552 *value = x;
1553 return true;
1554 }
1555
1556
ParseClassCharacterEscape()1557 uc32 RegExpParser::ParseClassCharacterEscape() {
1558 DCHECK_EQ('\\', current());
1559 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1560 Advance();
1561 switch (current()) {
1562 case 'b':
1563 Advance();
1564 return '\b';
1565 // ControlEscape :: one of
1566 // f n r t v
1567 case 'f':
1568 Advance();
1569 return '\f';
1570 case 'n':
1571 Advance();
1572 return '\n';
1573 case 'r':
1574 Advance();
1575 return '\r';
1576 case 't':
1577 Advance();
1578 return '\t';
1579 case 'v':
1580 Advance();
1581 return '\v';
1582 case 'c': {
1583 uc32 controlLetter = Next();
1584 uc32 letter = controlLetter & ~('A' ^ 'a');
1585 // Inside a character class, we also accept digits and underscore as
1586 // control characters, unless with /u. See Annex B:
1587 // ES#prod-annexB-ClassControlLetter
1588 if (letter >= 'A' && letter <= 'Z') {
1589 Advance(2);
1590 // Control letters mapped to ASCII control characters in the range
1591 // 0x00-0x1F.
1592 return controlLetter & 0x1F;
1593 }
1594 if (unicode()) {
1595 // With /u, invalid escapes are not treated as identity escapes.
1596 ReportError(RegExpError::kInvalidClassEscape);
1597 return 0;
1598 }
1599 if ((controlLetter >= '0' && controlLetter <= '9') ||
1600 controlLetter == '_') {
1601 Advance(2);
1602 return controlLetter & 0x1F;
1603 }
1604 // We match JSC in reading the backslash as a literal
1605 // character instead of as starting an escape.
1606 // TODO(v8:6201): Not yet covered by the spec.
1607 return '\\';
1608 }
1609 case '0':
1610 // With /u, \0 is interpreted as NUL if not followed by another digit.
1611 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1612 Advance();
1613 return 0;
1614 }
1615 V8_FALLTHROUGH;
1616 case '1':
1617 case '2':
1618 case '3':
1619 case '4':
1620 case '5':
1621 case '6':
1622 case '7':
1623 // For compatibility, we interpret a decimal escape that isn't
1624 // a back reference (and therefore either \0 or not valid according
1625 // to the specification) as a 1..3 digit octal character code.
1626 // ES#prod-annexB-LegacyOctalEscapeSequence
1627 if (unicode()) {
1628 // With /u, decimal escape is not interpreted as octal character code.
1629 ReportError(RegExpError::kInvalidClassEscape);
1630 return 0;
1631 }
1632 return ParseOctalLiteral();
1633 case 'x': {
1634 Advance();
1635 uc32 value;
1636 if (ParseHexEscape(2, &value)) return value;
1637 if (unicode()) {
1638 // With /u, invalid escapes are not treated as identity escapes.
1639 ReportError(RegExpError::kInvalidEscape);
1640 return 0;
1641 }
1642 // If \x is not followed by a two-digit hexadecimal, treat it
1643 // as an identity escape.
1644 return 'x';
1645 }
1646 case 'u': {
1647 Advance();
1648 uc32 value;
1649 if (ParseUnicodeEscape(&value)) return value;
1650 if (unicode()) {
1651 // With /u, invalid escapes are not treated as identity escapes.
1652 ReportError(RegExpError::kInvalidUnicodeEscape);
1653 return 0;
1654 }
1655 // If \u is not followed by a two-digit hexadecimal, treat it
1656 // as an identity escape.
1657 return 'u';
1658 }
1659 default: {
1660 uc32 result = current();
1661 // With /u, no identity escapes except for syntax characters and '-' are
1662 // allowed. Otherwise, all identity escapes are allowed.
1663 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1664 Advance();
1665 return result;
1666 }
1667 ReportError(RegExpError::kInvalidEscape);
1668 return 0;
1669 }
1670 }
1671 UNREACHABLE();
1672 }
1673
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,uc32 * char_out,bool * is_class_escape)1674 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1675 Zone* zone,
1676 bool add_unicode_case_equivalents,
1677 uc32* char_out, bool* is_class_escape) {
1678 uc32 current_char = current();
1679 if (current_char == '\\') {
1680 switch (Next()) {
1681 case 'w':
1682 case 'W':
1683 case 'd':
1684 case 'D':
1685 case 's':
1686 case 'S': {
1687 CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1688 add_unicode_case_equivalents, zone);
1689 Advance(2);
1690 *is_class_escape = true;
1691 return;
1692 }
1693 case kEndMarker:
1694 ReportError(RegExpError::kEscapeAtEndOfPattern);
1695 return;
1696 case 'p':
1697 case 'P':
1698 if (unicode()) {
1699 bool negate = Next() == 'P';
1700 Advance(2);
1701 ZoneVector<char> name_1(zone);
1702 ZoneVector<char> name_2(zone);
1703 if (!ParsePropertyClassName(&name_1, &name_2) ||
1704 !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1705 ReportError(RegExpError::kInvalidClassPropertyName);
1706 }
1707 *is_class_escape = true;
1708 return;
1709 }
1710 break;
1711 default:
1712 break;
1713 }
1714 *char_out = ParseClassCharacterEscape();
1715 *is_class_escape = false;
1716 } else {
1717 Advance();
1718 *char_out = current_char;
1719 *is_class_escape = false;
1720 }
1721 }
1722
ParseCharacterClass(const RegExpBuilder * builder)1723 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1724 DCHECK_EQ(current(), '[');
1725 Advance();
1726 bool is_negated = false;
1727 if (current() == '^') {
1728 is_negated = true;
1729 Advance();
1730 }
1731 ZoneList<CharacterRange>* ranges =
1732 new (zone()) ZoneList<CharacterRange>(2, zone());
1733 bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1734 while (has_more() && current() != ']') {
1735 uc32 char_1, char_2;
1736 bool is_class_1, is_class_2;
1737 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1738 &is_class_1 CHECK_FAILED);
1739 if (current() == '-') {
1740 Advance();
1741 if (current() == kEndMarker) {
1742 // If we reach the end we break out of the loop and let the
1743 // following code report an error.
1744 break;
1745 } else if (current() == ']') {
1746 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1747 ranges->Add(CharacterRange::Singleton('-'), zone());
1748 break;
1749 }
1750 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1751 &is_class_2 CHECK_FAILED);
1752 if (is_class_1 || is_class_2) {
1753 // Either end is an escaped character class. Treat the '-' verbatim.
1754 if (unicode()) {
1755 // ES2015 21.2.2.15.1 step 1.
1756 return ReportError(RegExpError::kInvalidCharacterClass);
1757 }
1758 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1759 ranges->Add(CharacterRange::Singleton('-'), zone());
1760 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1761 continue;
1762 }
1763 // ES2015 21.2.2.15.1 step 6.
1764 if (char_1 > char_2) {
1765 return ReportError(RegExpError::kOutOfOrderCharacterClass);
1766 }
1767 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1768 } else {
1769 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1770 }
1771 }
1772 if (!has_more()) {
1773 return ReportError(RegExpError::kUnterminatedCharacterClass);
1774 }
1775 Advance();
1776 RegExpCharacterClass::CharacterClassFlags character_class_flags;
1777 if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1778 return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1779 character_class_flags);
1780 }
1781
1782
1783 #undef CHECK_FAILED
1784
1785
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1786 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1787 FlatStringReader* input, JSRegExp::Flags flags,
1788 RegExpCompileData* result) {
1789 DCHECK(result != nullptr);
1790 RegExpParser parser(input, flags, isolate, zone);
1791 RegExpTree* tree = parser.ParsePattern();
1792 if (parser.failed()) {
1793 DCHECK(tree == nullptr);
1794 DCHECK(parser.error_ != RegExpError::kNone);
1795 result->error = parser.error_;
1796 result->error_pos = parser.error_pos_;
1797 } else {
1798 DCHECK(tree != nullptr);
1799 DCHECK(parser.error_ == RegExpError::kNone);
1800 if (FLAG_trace_regexp_parser) {
1801 StdoutStream os;
1802 tree->Print(os, zone);
1803 os << "\n";
1804 }
1805 result->tree = tree;
1806 int capture_count = parser.captures_started();
1807 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1808 result->contains_anchor = parser.contains_anchor();
1809 result->capture_name_map = parser.CreateCaptureNameMap();
1810 result->capture_count = capture_count;
1811 }
1812 return !parser.failed();
1813 }
1814
RegExpBuilder(Zone * zone,JSRegExp::Flags flags)1815 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1816 : zone_(zone),
1817 pending_empty_(false),
1818 flags_(flags),
1819 characters_(nullptr),
1820 pending_surrogate_(kNoPendingSurrogate),
1821 terms_(),
1822 alternatives_()
1823 #ifdef DEBUG
1824 ,
1825 last_added_(ADD_NONE)
1826 #endif
1827 {
1828 }
1829
1830
AddLeadSurrogate(uc16 lead_surrogate)1831 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1832 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1833 FlushPendingSurrogate();
1834 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1835 pending_surrogate_ = lead_surrogate;
1836 }
1837
1838
AddTrailSurrogate(uc16 trail_surrogate)1839 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1840 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1841 if (pending_surrogate_ != kNoPendingSurrogate) {
1842 uc16 lead_surrogate = pending_surrogate_;
1843 pending_surrogate_ = kNoPendingSurrogate;
1844 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1845 uc32 combined =
1846 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1847 if (NeedsDesugaringForIgnoreCase(combined)) {
1848 AddCharacterClassForDesugaring(combined);
1849 } else {
1850 ZoneList<uc16> surrogate_pair(2, zone());
1851 surrogate_pair.Add(lead_surrogate, zone());
1852 surrogate_pair.Add(trail_surrogate, zone());
1853 RegExpAtom* atom =
1854 new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1855 AddAtom(atom);
1856 }
1857 } else {
1858 pending_surrogate_ = trail_surrogate;
1859 FlushPendingSurrogate();
1860 }
1861 }
1862
1863
FlushPendingSurrogate()1864 void RegExpBuilder::FlushPendingSurrogate() {
1865 if (pending_surrogate_ != kNoPendingSurrogate) {
1866 DCHECK(unicode());
1867 uc32 c = pending_surrogate_;
1868 pending_surrogate_ = kNoPendingSurrogate;
1869 AddCharacterClassForDesugaring(c);
1870 }
1871 }
1872
1873
FlushCharacters()1874 void RegExpBuilder::FlushCharacters() {
1875 FlushPendingSurrogate();
1876 pending_empty_ = false;
1877 if (characters_ != nullptr) {
1878 RegExpTree* atom =
1879 new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1880 characters_ = nullptr;
1881 text_.Add(atom, zone());
1882 LAST(ADD_ATOM);
1883 }
1884 }
1885
1886
FlushText()1887 void RegExpBuilder::FlushText() {
1888 FlushCharacters();
1889 int num_text = text_.length();
1890 if (num_text == 0) {
1891 return;
1892 } else if (num_text == 1) {
1893 terms_.Add(text_.last(), zone());
1894 } else {
1895 RegExpText* text = new (zone()) RegExpText(zone());
1896 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1897 terms_.Add(text, zone());
1898 }
1899 text_.Clear();
1900 }
1901
1902
AddCharacter(uc16 c)1903 void RegExpBuilder::AddCharacter(uc16 c) {
1904 FlushPendingSurrogate();
1905 pending_empty_ = false;
1906 if (NeedsDesugaringForIgnoreCase(c)) {
1907 AddCharacterClassForDesugaring(c);
1908 } else {
1909 if (characters_ == nullptr) {
1910 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1911 }
1912 characters_->Add(c, zone());
1913 LAST(ADD_CHAR);
1914 }
1915 }
1916
1917
AddUnicodeCharacter(uc32 c)1918 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1919 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1920 DCHECK(unicode());
1921 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1922 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1923 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1924 AddLeadSurrogate(c);
1925 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1926 AddTrailSurrogate(c);
1927 } else {
1928 AddCharacter(static_cast<uc16>(c));
1929 }
1930 }
1931
AddEscapedUnicodeCharacter(uc32 character)1932 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1933 // A lead or trail surrogate parsed via escape sequence will not
1934 // pair up with any preceding lead or following trail surrogate.
1935 FlushPendingSurrogate();
1936 AddUnicodeCharacter(character);
1937 FlushPendingSurrogate();
1938 }
1939
AddEmpty()1940 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1941
1942
AddCharacterClass(RegExpCharacterClass * cc)1943 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1944 if (NeedsDesugaringForUnicode(cc)) {
1945 // With /u, character class needs to be desugared, so it
1946 // must be a standalone term instead of being part of a RegExpText.
1947 AddTerm(cc);
1948 } else {
1949 AddAtom(cc);
1950 }
1951 }
1952
AddCharacterClassForDesugaring(uc32 c)1953 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1954 AddTerm(new (zone()) RegExpCharacterClass(
1955 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1956 flags_));
1957 }
1958
1959
AddAtom(RegExpTree * term)1960 void RegExpBuilder::AddAtom(RegExpTree* term) {
1961 if (term->IsEmpty()) {
1962 AddEmpty();
1963 return;
1964 }
1965 if (term->IsTextElement()) {
1966 FlushCharacters();
1967 text_.Add(term, zone());
1968 } else {
1969 FlushText();
1970 terms_.Add(term, zone());
1971 }
1972 LAST(ADD_ATOM);
1973 }
1974
1975
AddTerm(RegExpTree * term)1976 void RegExpBuilder::AddTerm(RegExpTree* term) {
1977 FlushText();
1978 terms_.Add(term, zone());
1979 LAST(ADD_ATOM);
1980 }
1981
1982
AddAssertion(RegExpTree * assert)1983 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1984 FlushText();
1985 terms_.Add(assert, zone());
1986 LAST(ADD_ASSERT);
1987 }
1988
1989
NewAlternative()1990 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1991
1992
FlushTerms()1993 void RegExpBuilder::FlushTerms() {
1994 FlushText();
1995 int num_terms = terms_.length();
1996 RegExpTree* alternative;
1997 if (num_terms == 0) {
1998 alternative = new (zone()) RegExpEmpty();
1999 } else if (num_terms == 1) {
2000 alternative = terms_.last();
2001 } else {
2002 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
2003 }
2004 alternatives_.Add(alternative, zone());
2005 terms_.Clear();
2006 LAST(ADD_NONE);
2007 }
2008
2009
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)2010 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2011 if (!unicode()) return false;
2012 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2013 // necessarily mean that we need to desugar. It's probably nicer to have a
2014 // separate pass to figure out unicode desugarings.
2015 if (ignore_case()) return true;
2016 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2017 CharacterRange::Canonicalize(ranges);
2018 for (int i = ranges->length() - 1; i >= 0; i--) {
2019 uc32 from = ranges->at(i).from();
2020 uc32 to = ranges->at(i).to();
2021 // Check for non-BMP characters.
2022 if (to >= kNonBmpStart) return true;
2023 // Check for lone surrogates.
2024 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2025 }
2026 return false;
2027 }
2028
2029
NeedsDesugaringForIgnoreCase(uc32 c)2030 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2031 #ifdef V8_INTL_SUPPORT
2032 if (unicode() && ignore_case()) {
2033 icu::UnicodeSet set(c, c);
2034 set.closeOver(USET_CASE_INSENSITIVE);
2035 set.removeAllStrings();
2036 return set.size() > 1;
2037 }
2038 // In the case where ICU is not included, we act as if the unicode flag is
2039 // not set, and do not desugar.
2040 #endif // V8_INTL_SUPPORT
2041 return false;
2042 }
2043
2044
ToRegExp()2045 RegExpTree* RegExpBuilder::ToRegExp() {
2046 FlushTerms();
2047 int num_alternatives = alternatives_.length();
2048 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
2049 if (num_alternatives == 1) return alternatives_.last();
2050 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
2051 }
2052
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)2053 bool RegExpBuilder::AddQuantifierToAtom(
2054 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2055 FlushPendingSurrogate();
2056 if (pending_empty_) {
2057 pending_empty_ = false;
2058 return true;
2059 }
2060 RegExpTree* atom;
2061 if (characters_ != nullptr) {
2062 DCHECK(last_added_ == ADD_CHAR);
2063 // Last atom was character.
2064 Vector<const uc16> char_vector = characters_->ToConstVector();
2065 int num_chars = char_vector.length();
2066 if (num_chars > 1) {
2067 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2068 text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
2069 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2070 }
2071 characters_ = nullptr;
2072 atom = new (zone()) RegExpAtom(char_vector, flags_);
2073 FlushText();
2074 } else if (text_.length() > 0) {
2075 DCHECK(last_added_ == ADD_ATOM);
2076 atom = text_.RemoveLast();
2077 FlushText();
2078 } else if (terms_.length() > 0) {
2079 DCHECK(last_added_ == ADD_ATOM);
2080 atom = terms_.RemoveLast();
2081 if (atom->IsLookaround()) {
2082 // With /u, lookarounds are not quantifiable.
2083 if (unicode()) return false;
2084 // Lookbehinds are not quantifiable.
2085 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2086 return false;
2087 }
2088 }
2089 if (atom->max_match() == 0) {
2090 // Guaranteed to only match an empty string.
2091 LAST(ADD_TERM);
2092 if (min == 0) {
2093 return true;
2094 }
2095 terms_.Add(atom, zone());
2096 return true;
2097 }
2098 } else {
2099 // Only call immediately after adding an atom or character!
2100 UNREACHABLE();
2101 }
2102 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2103 zone());
2104 LAST(ADD_TERM);
2105 return true;
2106 }
2107
2108 } // namespace internal
2109 } // namespace v8
2110