1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2  * vim: set ts=8 sts=4 et sw=4 tw=99: */
3 
4 // Copyright 2012 the V8 project authors. All rights reserved.
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 //       notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 //       copyright notice, this list of conditions and the following
13 //       disclaimer in the documentation and/or other materials provided
14 //       with the distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 //       contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include "irregexp/RegExpParser.h"
32 
33 #include "mozilla/ArrayUtils.h"
34 #include "mozilla/Move.h"
35 
36 #include "frontend/TokenStream.h"
37 #include "gc/GC.h"
38 #include "irregexp/RegExpCharacters.h"
39 #include "util/StringBuffer.h"
40 #include "vm/ErrorReporting.h"
41 
42 using namespace js;
43 using namespace js::irregexp;
44 
45 using mozilla::Move;
46 using mozilla::PointerRangeSize;
47 
48 // ----------------------------------------------------------------------------
49 // RegExpBuilder
50 
RegExpBuilder(LifoAlloc * alloc)51 RegExpBuilder::RegExpBuilder(LifoAlloc* alloc)
52   : alloc(alloc),
53     pending_empty_(false),
54     characters_(nullptr)
55 #ifdef DEBUG
56   , last_added_(ADD_NONE)
57 #endif
58 {}
59 
60 void
FlushCharacters()61 RegExpBuilder::FlushCharacters()
62 {
63     pending_empty_ = false;
64     if (characters_ != nullptr) {
65         RegExpTree* atom = alloc->newInfallible<RegExpAtom>(characters_);
66         characters_ = nullptr;
67         text_.Add(alloc, atom);
68 #ifdef DEBUG
69         last_added_ = ADD_ATOM;
70 #endif
71     }
72 }
73 
74 void
FlushText()75 RegExpBuilder::FlushText()
76 {
77     FlushCharacters();
78     int num_text = text_.length();
79     if (num_text == 0)
80         return;
81     if (num_text == 1) {
82         terms_.Add(alloc, text_.last());
83     } else {
84         RegExpText* text = alloc->newInfallible<RegExpText>(alloc);
85         for (int i = 0; i < num_text; i++)
86             text_.Get(i)->AppendToText(text);
87         terms_.Add(alloc, text);
88     }
89     text_.Clear();
90 }
91 
92 void
AddCharacter(char16_t c)93 RegExpBuilder::AddCharacter(char16_t c)
94 {
95     pending_empty_ = false;
96     if (characters_ == nullptr)
97         characters_ = alloc->newInfallible<CharacterVector>(*alloc);
98     characters_->append(c);
99 #ifdef DEBUG
100     last_added_ = ADD_CHAR;
101 #endif
102 }
103 
104 void
AddEmpty()105 RegExpBuilder::AddEmpty()
106 {
107     pending_empty_ = true;
108 }
109 
110 void
AddAtom(RegExpTree * term)111 RegExpBuilder::AddAtom(RegExpTree* term)
112 {
113     if (term->IsEmpty()) {
114         AddEmpty();
115         return;
116     }
117     if (term->IsTextElement()) {
118         FlushCharacters();
119         text_.Add(alloc, term);
120     } else {
121         FlushText();
122         terms_.Add(alloc, term);
123     }
124 #ifdef DEBUG
125     last_added_ = ADD_ATOM;
126 #endif
127 }
128 
129 void
AddAssertion(RegExpTree * assert)130 RegExpBuilder::AddAssertion(RegExpTree* assert)
131 {
132     FlushText();
133     if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
134         // Omit repeated assertions of the same type.
135         RegExpAssertion* last = terms_.last()->AsAssertion();
136         RegExpAssertion* next = assert->AsAssertion();
137         if (last->assertion_type() == next->assertion_type()) return;
138     }
139     terms_.Add(alloc, assert);
140 #ifdef DEBUG
141     last_added_ = ADD_ASSERT;
142 #endif
143 }
144 
145 void
NewAlternative()146 RegExpBuilder::NewAlternative()
147 {
148     FlushTerms();
149 }
150 
151 void
FlushTerms()152 RegExpBuilder::FlushTerms()
153 {
154     FlushText();
155     int num_terms = terms_.length();
156     RegExpTree* alternative;
157     if (num_terms == 0)
158         alternative = RegExpEmpty::GetInstance();
159     else if (num_terms == 1)
160         alternative = terms_.last();
161     else
162         alternative = alloc->newInfallible<RegExpAlternative>(terms_.GetList(alloc));
163     alternatives_.Add(alloc, alternative);
164     terms_.Clear();
165 #ifdef DEBUG
166     last_added_ = ADD_NONE;
167 #endif
168 }
169 
170 RegExpTree*
ToRegExp()171 RegExpBuilder::ToRegExp()
172 {
173     FlushTerms();
174     int num_alternatives = alternatives_.length();
175     if (num_alternatives == 0) {
176         return RegExpEmpty::GetInstance();
177     }
178     if (num_alternatives == 1) {
179         return alternatives_.last();
180     }
181     return alloc->newInfallible<RegExpDisjunction>(alternatives_.GetList(alloc));
182 }
183 
184 void
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)185 RegExpBuilder::AddQuantifierToAtom(int min, int max,
186                                    RegExpQuantifier::QuantifierType quantifier_type)
187 {
188     if (pending_empty_) {
189         pending_empty_ = false;
190         return;
191     }
192     RegExpTree* atom;
193     if (characters_ != nullptr) {
194         MOZ_ASSERT(last_added_ == ADD_CHAR);
195         // Last atom was character.
196         CharacterVector* char_vector = characters_;
197         int num_chars = char_vector->length();
198         if (num_chars > 1) {
199             CharacterVector* prefix = alloc->newInfallible<CharacterVector>(*alloc);
200             prefix->append(char_vector->begin(), num_chars - 1);
201             text_.Add(alloc, alloc->newInfallible<RegExpAtom>(prefix));
202             char_vector = alloc->newInfallible<CharacterVector>(*alloc);
203             char_vector->append((*characters_)[num_chars - 1]);
204         }
205         characters_ = nullptr;
206         atom = alloc->newInfallible<RegExpAtom>(char_vector);
207         FlushText();
208     } else if (text_.length() > 0) {
209         MOZ_ASSERT(last_added_ == ADD_ATOM);
210         atom = text_.RemoveLast();
211         FlushText();
212     } else if (terms_.length() > 0) {
213         MOZ_ASSERT(last_added_ == ADD_ATOM);
214         atom = terms_.RemoveLast();
215         if (atom->max_match() == 0) {
216             // Guaranteed to only match an empty string.
217 #ifdef DEBUG
218             last_added_ = ADD_TERM;
219 #endif
220             if (min == 0)
221                 return;
222             terms_.Add(alloc, atom);
223             return;
224         }
225     } else {
226         // Only call immediately after adding an atom or character!
227         MOZ_CRASH("Bad call");
228     }
229     terms_.Add(alloc, alloc->newInfallible<RegExpQuantifier>(min, max, quantifier_type, atom));
230 #ifdef DEBUG
231     last_added_ = ADD_TERM;
232 #endif
233 }
234 
235 // ----------------------------------------------------------------------------
236 // RegExpParser
237 
238 template <typename CharT>
RegExpParser(frontend::TokenStreamAnyChars & ts,LifoAlloc * alloc,const CharT * chars,const CharT * end,bool multiline_mode,bool unicode,bool ignore_case)239 RegExpParser<CharT>::RegExpParser(frontend::TokenStreamAnyChars& ts, LifoAlloc* alloc,
240                                   const CharT* chars, const CharT* end, bool multiline_mode,
241                                   bool unicode, bool ignore_case)
242   : ts(ts),
243     alloc(alloc),
244     captures_(nullptr),
245     start_(chars),
246     next_pos_(start_),
247     end_(end),
248     current_(kEndMarker),
249     capture_count_(0),
250     has_more_(true),
251     multiline_(multiline_mode),
252     unicode_(unicode),
253     ignore_case_(ignore_case),
254     simple_(false),
255     contains_anchor_(false),
256     is_scanned_for_captures_(false)
257 {
258     Advance();
259 }
260 
261 template <typename CharT>
262 void
SyntaxError(unsigned errorNumber,...)263 RegExpParser<CharT>::SyntaxError(unsigned errorNumber, ...)
264 {
265     ErrorMetadata err;
266 
267     ts.fillExcludingContext(&err, ts.currentToken().pos.begin);
268 
269     // For most error reporting, the line of context derives from the token
270     // stream.  So when location information doesn't come from the token
271     // stream, we can't give a line of context.  But here the "line of context"
272     // can be (and is) derived from the pattern text, so we can provide it no
273     // matter if the location is derived from the caller.
274     size_t offset = PointerRangeSize(start_, next_pos_ - 1);
275     size_t end = PointerRangeSize(start_, end_);
276 
277     const CharT* windowStart = (offset > ErrorMetadata::lineOfContextRadius)
278                                ? start_ + (offset - ErrorMetadata::lineOfContextRadius)
279                                : start_;
280 
281     const CharT* windowEnd = (end - offset > ErrorMetadata::lineOfContextRadius)
282                              ? start_ + offset + ErrorMetadata::lineOfContextRadius
283                              : end_;
284 
285     size_t windowLength = PointerRangeSize(windowStart, windowEnd);
286     MOZ_ASSERT(windowLength <= ErrorMetadata::lineOfContextRadius * 2);
287 
288     // Create the windowed string, not including the potential line
289     // terminator.
290     StringBuffer windowBuf(ts.context());
291     if (!windowBuf.append(windowStart, windowEnd))
292         return;
293 
294     // The line of context must be null-terminated, and StringBuffer doesn't
295     // make that happen unless we force it to.
296     if (!windowBuf.append('\0'))
297         return;
298 
299     err.lineOfContext.reset(windowBuf.stealChars());
300     if (!err.lineOfContext)
301         return;
302 
303     err.lineLength = windowLength;
304     err.tokenOffset = offset - (windowStart - start_);
305 
306     va_list args;
307     va_start(args, errorNumber);
308 
309     ReportCompileError(ts.context(), Move(err), nullptr, JSREPORT_ERROR, errorNumber, args);
310 
311     va_end(args);
312 }
313 
314 template <typename CharT>
315 RegExpTree*
ReportError(unsigned errorNumber,const char * param)316 RegExpParser<CharT>::ReportError(unsigned errorNumber, const char* param /* = nullptr */)
317 {
318     gc::AutoSuppressGC suppressGC(ts.context());
319     SyntaxError(errorNumber, param);
320     return nullptr;
321 }
322 
323 template <typename CharT>
324 void
Advance()325 RegExpParser<CharT>::Advance()
326 {
327     if (next_pos_ < end_) {
328         current_ = *next_pos_;
329         next_pos_++;
330     } else {
331         current_ = kEndMarker;
332         next_pos_ = end_ + 1;
333         has_more_ = false;
334     }
335 }
336 
337 // Returns the value (0 .. 15) of a hexadecimal character c.
338 // If c is not a legal hexadecimal character, returns a value < 0.
339 inline int
HexValue(uint32_t c)340 HexValue(uint32_t c)
341 {
342     c -= '0';
343     if (static_cast<unsigned>(c) <= 9) return c;
344     c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
345     if (static_cast<unsigned>(c) <= 5) return c + 10;
346     return -1;
347 }
348 
349 template <typename CharT>
350 widechar
ParseOctalLiteral()351 RegExpParser<CharT>::ParseOctalLiteral()
352 {
353     MOZ_ASSERT('0' <= current() && current() <= '7');
354     // For compatibility with some other browsers (not all), we parse
355     // up to three octal digits with a value below 256.
356     widechar value = current() - '0';
357     Advance();
358     if ('0' <= current() && current() <= '7') {
359         value = value * 8 + current() - '0';
360         Advance();
361         if (value < 32 && '0' <= current() && current() <= '7') {
362             value = value * 8 + current() - '0';
363             Advance();
364         }
365     }
366     return value;
367 }
368 
369 template <typename CharT>
370 bool
ParseHexEscape(int length,widechar * value)371 RegExpParser<CharT>::ParseHexEscape(int length, widechar* value)
372 {
373     const CharT* start = position();
374     uint32_t val = 0;
375     bool done = false;
376     for (int i = 0; !done; i++) {
377         widechar c = current();
378         int d = HexValue(c);
379         if (d < 0) {
380             Reset(start);
381             return false;
382         }
383         val = val * 16 + d;
384         Advance();
385         if (i == length - 1) {
386             done = true;
387         }
388     }
389     *value = val;
390     return true;
391 }
392 
393 template <typename CharT>
394 bool
ParseBracedHexEscape(widechar * value)395 RegExpParser<CharT>::ParseBracedHexEscape(widechar* value)
396 {
397     MOZ_ASSERT(current() == '{');
398     Advance();
399 
400     bool first = true;
401     uint32_t code = 0;
402     while (true) {
403         widechar c = current();
404         if (c == kEndMarker) {
405             ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
406             return false;
407         }
408         if (c == '}') {
409             if (first) {
410                 ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
411                 return false;
412             }
413             Advance();
414             break;
415         }
416 
417         int d = HexValue(c);
418         if (d < 0) {
419             ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
420             return false;
421         }
422         code = (code << 4) | d;
423         if (code > unicode::NonBMPMax) {
424             ReportError(JSMSG_UNICODE_OVERFLOW, "regular expression");
425             return false;
426         }
427         Advance();
428         first = false;
429     }
430 
431     *value = code;
432     return true;
433 }
434 
435 template <typename CharT>
436 bool
ParseTrailSurrogate(widechar * value)437 RegExpParser<CharT>::ParseTrailSurrogate(widechar* value)
438 {
439     if (current() != '\\')
440         return false;
441 
442     const CharT* start = position();
443     Advance();
444     if (current() != 'u') {
445         Reset(start);
446         return false;
447     }
448     Advance();
449     if (!ParseHexEscape(4, value)) {
450         Reset(start);
451         return false;
452     }
453     if (!unicode::IsTrailSurrogate(*value)) {
454         Reset(start);
455         return false;
456     }
457     return true;
458 }
459 
460 template <typename CharT>
461 bool
ParseRawSurrogatePair(char16_t * lead,char16_t * trail)462 RegExpParser<CharT>::ParseRawSurrogatePair(char16_t* lead, char16_t* trail)
463 {
464     widechar c1 = current();
465     if (!unicode::IsLeadSurrogate(c1))
466         return false;
467 
468     const CharT* start = position();
469     Advance();
470     widechar c2 = current();
471     if (!unicode::IsTrailSurrogate(c2)) {
472         Reset(start);
473         return false;
474     }
475     Advance();
476     *lead = c1;
477     *trail = c2;
478     return true;
479 }
480 
481 static inline RegExpTree*
RangeAtom(LifoAlloc * alloc,char16_t from,char16_t to)482 RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to)
483 {
484     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
485     ranges->append(CharacterRange::Range(from, to));
486     return alloc->newInfallible<RegExpCharacterClass>(ranges, false);
487 }
488 
489 static inline RegExpTree*
NegativeLookahead(LifoAlloc * alloc,char16_t from,char16_t to)490 NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
491 {
492     return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
493 }
494 
495 static bool
IsSyntaxCharacter(widechar c)496 IsSyntaxCharacter(widechar c)
497 {
498   switch (c) {
499     case '^':
500     case '$':
501     case '\\':
502     case '.':
503     case '*':
504     case '+':
505     case '?':
506     case '(':
507     case ')':
508     case '[':
509     case ']':
510     case '{':
511     case '}':
512     case '|':
513     case '/':
514       return true;
515     default:
516       return false;
517   }
518 }
519 
520 inline bool
IsInRange(int value,int lower_limit,int higher_limit)521 IsInRange(int value, int lower_limit, int higher_limit)
522 {
523     MOZ_ASSERT(lower_limit <= higher_limit);
524     return static_cast<unsigned int>(value - lower_limit) <=
525            static_cast<unsigned int>(higher_limit - lower_limit);
526 }
527 
528 inline bool
IsDecimalDigit(widechar c)529 IsDecimalDigit(widechar c)
530 {
531     // ECMA-262, 3rd, 7.8.3 (p 16)
532     return IsInRange(c, '0', '9');
533 }
534 
535 #ifdef DEBUG
536 // Currently only used in an assert.kASSERT.
537 static bool
IsSpecialClassEscape(widechar c)538 IsSpecialClassEscape(widechar c)
539 {
540   switch (c) {
541     case 'd': case 'D':
542     case 's': case 'S':
543     case 'w': case 'W':
544       return true;
545     default:
546       return false;
547   }
548 }
549 #endif
550 
551 template <typename CharT>
552 bool
ParseClassCharacterEscape(widechar * code)553 RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
554 {
555     MOZ_ASSERT(current() == '\\');
556     MOZ_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
557     Advance();
558     switch (current()) {
559       case 'b':
560         Advance();
561         *code = '\b';
562         return true;
563       // ControlEscape :: one of
564       //   f n r t v
565       case 'f':
566         Advance();
567         *code = '\f';
568         return true;
569       case 'n':
570         Advance();
571         *code = '\n';
572         return true;
573       case 'r':
574         Advance();
575         *code = '\r';
576         return true;
577       case 't':
578         Advance();
579         *code = '\t';
580         return true;
581       case 'v':
582         Advance();
583         *code = '\v';
584         return true;
585       case 'c': {
586         widechar controlLetter = Next();
587         widechar letter = controlLetter & ~('A' ^ 'a');
588         // For compatibility with JSC, inside a character class
589         // we also accept digits and underscore as control characters,
590         // but only in non-unicode mode
591         if ((!unicode_ &&
592              ((controlLetter >= '0' && controlLetter <= '9') ||
593               controlLetter == '_')) ||
594             (letter >= 'A' && letter <= 'Z'))
595         {
596             Advance(2);
597             // Control letters mapped to ASCII control characters in the range
598             // 0x00-0x1f.
599             *code = controlLetter & 0x1f;
600             return true;
601         }
602         if (unicode_) {
603             ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
604             return false;
605         }
606         // We match JSC in reading the backslash as a literal
607         // character instead of as starting an escape.
608         *code = '\\';
609         return true;
610       }
611       case '0':
612         if (unicode_) {
613             Advance();
614             if (IsDecimalDigit(current()))
615                 return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
616             *code = 0;
617             return true;
618         }
619         MOZ_FALLTHROUGH;
620       case '1': case '2': case '3': case '4': case '5': case '6': case '7':
621         if (unicode_) {
622             ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
623             return false;
624         }
625         // For compatibility, outside of unicode mode, we interpret a decimal
626         // escape that isn't a back reference (and therefore either \0 or not
627         // valid according to the specification) as a 1..3 digit octal
628         // character code.
629         *code = ParseOctalLiteral();
630         return true;
631       case 'x': {
632         Advance();
633         widechar value;
634         if (ParseHexEscape(2, &value)) {
635             *code = value;
636             return true;
637         }
638         if (unicode_) {
639             ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
640             return false;
641         }
642         // If \x is not followed by a two-digit hexadecimal, treat it
643         // as an identity escape in non-unicode mode.
644         *code = 'x';
645         return true;
646       }
647       case 'u': {
648         Advance();
649         widechar value;
650         if (unicode_) {
651             if (current() == '{') {
652                 if (!ParseBracedHexEscape(&value))
653                     return false;
654                 *code = value;
655                 return true;
656             }
657             if (ParseHexEscape(4, &value)) {
658                 if (unicode::IsLeadSurrogate(value)) {
659                     widechar trail;
660                     if (ParseTrailSurrogate(&trail)) {
661                         *code = unicode::UTF16Decode(value, trail);
662                         return true;
663                     }
664                 }
665                 *code = value;
666                 return true;
667             }
668             ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
669             return false;
670         }
671         if (ParseHexEscape(4, &value)) {
672             *code = value;
673             return true;
674         }
675         // If \u is not followed by a four-digit or braced hexadecimal, treat it
676         // as an identity escape.
677         *code = 'u';
678         return true;
679       }
680       default: {
681         // Extended identity escape (non-unicode only). We accept any character
682         // that hasn't been matched by a more specific case, not just the subset
683         // required by the ECMAScript specification.
684         widechar result = current();
685         if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) {
686             ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
687             return false;
688         }
689         Advance();
690         *code = result;
691         return true;
692       }
693     }
694     return true;
695 }
696 
697 class WideCharRange
698 {
699   public:
WideCharRange()700     WideCharRange()
701       : from_(0), to_(0)
702     {}
703 
WideCharRange(widechar from,widechar to)704     WideCharRange(widechar from, widechar to)
705       : from_(from), to_(to)
706     {}
707 
Singleton(widechar value)708     static inline WideCharRange Singleton(widechar value) {
709         return WideCharRange(value, value);
710     }
Range(widechar from,widechar to)711     static inline WideCharRange Range(widechar from, widechar to) {
712         MOZ_ASSERT(from <= to);
713         return WideCharRange(from, to);
714     }
715 
Contains(widechar i) const716     bool Contains(widechar i) const { return from_ <= i && i <= to_; }
from() const717     widechar from() const { return from_; }
to() const718     widechar to() const { return to_; }
719 
720   private:
721     widechar from_;
722     widechar to_;
723 };
724 
725 typedef InfallibleVector<WideCharRange, 1> WideCharRangeVector;
726 
727 static inline CharacterRange
LeadSurrogateRange()728 LeadSurrogateRange()
729 {
730     return CharacterRange::Range(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax);
731 }
732 
733 static inline CharacterRange
TrailSurrogateRange()734 TrailSurrogateRange()
735 {
736     return CharacterRange::Range(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax);
737 }
738 
739 static inline WideCharRange
NonBMPRange()740 NonBMPRange()
741 {
742     return WideCharRange::Range(unicode::NonBMPMin, unicode::NonBMPMax);
743 }
744 
745 static const char16_t kNoCharClass = 0;
746 
747 // Adds a character or pre-defined character class to character ranges.
748 // If char_class is not kInvalidClass, it's interpreted as a class
749 // escape (i.e., 's' means whitespace, from '\s').
750 static inline void
AddCharOrEscape(LifoAlloc * alloc,CharacterRangeVector * ranges,char16_t char_class,widechar c)751 AddCharOrEscape(LifoAlloc* alloc,
752                 CharacterRangeVector* ranges,
753                 char16_t char_class,
754                 widechar c)
755 {
756     if (char_class != kNoCharClass)
757         CharacterRange::AddClassEscape(alloc, char_class, ranges);
758     else
759         ranges->append(CharacterRange::Singleton(c));
760 }
761 
762 static inline void
AddCharOrEscapeUnicode(LifoAlloc * alloc,CharacterRangeVector * ranges,CharacterRangeVector * lead_ranges,CharacterRangeVector * trail_ranges,WideCharRangeVector * wide_ranges,char16_t char_class,widechar c,bool ignore_case)763 AddCharOrEscapeUnicode(LifoAlloc* alloc,
764                        CharacterRangeVector* ranges,
765                        CharacterRangeVector* lead_ranges,
766                        CharacterRangeVector* trail_ranges,
767                        WideCharRangeVector* wide_ranges,
768                        char16_t char_class,
769                        widechar c,
770                        bool ignore_case)
771 {
772     if (char_class != kNoCharClass) {
773         CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case);
774         switch (char_class) {
775           case 'S':
776           case 'W':
777           case 'D':
778             lead_ranges->append(LeadSurrogateRange());
779             trail_ranges->append(TrailSurrogateRange());
780             wide_ranges->append(NonBMPRange());
781             break;
782           case '.':
783             MOZ_CRASH("Bad char_class!");
784         }
785         return;
786     }
787 
788     if (unicode::IsLeadSurrogate(c))
789         lead_ranges->append(CharacterRange::Singleton(c));
790     else if (unicode::IsTrailSurrogate(c))
791         trail_ranges->append(CharacterRange::Singleton(c));
792     else if (c >= unicode::NonBMPMin)
793         wide_ranges->append(WideCharRange::Singleton(c));
794     else
795         ranges->append(CharacterRange::Singleton(c));
796 }
797 
798 static inline void
AddUnicodeRange(LifoAlloc * alloc,CharacterRangeVector * ranges,CharacterRangeVector * lead_ranges,CharacterRangeVector * trail_ranges,WideCharRangeVector * wide_ranges,widechar first,widechar next)799 AddUnicodeRange(LifoAlloc* alloc,
800                 CharacterRangeVector* ranges,
801                 CharacterRangeVector* lead_ranges,
802                 CharacterRangeVector* trail_ranges,
803                 WideCharRangeVector* wide_ranges,
804                 widechar first,
805                 widechar next)
806 {
807     MOZ_ASSERT(first <= next);
808     if (first < unicode::LeadSurrogateMin) {
809         if (next < unicode::LeadSurrogateMin) {
810             ranges->append(CharacterRange::Range(first, next));
811             return;
812         }
813         ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMin - 1));
814         first = unicode::LeadSurrogateMin;
815     }
816     if (first <= unicode::LeadSurrogateMax) {
817         if (next <= unicode::LeadSurrogateMax) {
818             lead_ranges->append(CharacterRange::Range(first, next));
819             return;
820         }
821         lead_ranges->append(CharacterRange::Range(first, unicode::LeadSurrogateMax));
822         first = unicode::LeadSurrogateMax + 1;
823     }
824     MOZ_ASSERT(unicode::LeadSurrogateMax + 1 == unicode::TrailSurrogateMin);
825     if (first <= unicode::TrailSurrogateMax) {
826         if (next <= unicode::TrailSurrogateMax) {
827             trail_ranges->append(CharacterRange::Range(first, next));
828             return;
829         }
830         trail_ranges->append(CharacterRange::Range(first, unicode::TrailSurrogateMax));
831         first = unicode::TrailSurrogateMax + 1;
832     }
833     if (first <= unicode::UTF16Max) {
834         if (next <= unicode::UTF16Max) {
835             ranges->append(CharacterRange::Range(first, next));
836             return;
837         }
838         ranges->append(CharacterRange::Range(first, unicode::UTF16Max));
839         first = unicode::NonBMPMin;
840     }
841     MOZ_ASSERT(unicode::UTF16Max + 1 == unicode::NonBMPMin);
842     wide_ranges->append(WideCharRange::Range(first, next));
843 }
844 
845 // Negate a vector of ranges by subtracting its ranges from a range
846 // encompassing the full range of possible values.
847 template <typename RangeType>
848 static inline void
NegateUnicodeRanges(LifoAlloc * alloc,InfallibleVector<RangeType,1> ** ranges,RangeType full_range)849 NegateUnicodeRanges(LifoAlloc* alloc, InfallibleVector<RangeType, 1>** ranges,
850                     RangeType full_range)
851 {
852     typedef InfallibleVector<RangeType, 1> RangeVector;
853     RangeVector* tmp_ranges = alloc->newInfallible<RangeVector>(*alloc);
854     tmp_ranges->append(full_range);
855     RangeVector* result_ranges = alloc->newInfallible<RangeVector>(*alloc);
856 
857     // Perform the following calculation:
858     //   result_ranges = tmp_ranges - ranges
859     // with the following steps:
860     //   result_ranges = tmp_ranges - ranges[0]
861     //   SWAP(result_ranges, tmp_ranges)
862     //   result_ranges = tmp_ranges - ranges[1]
863     //   SWAP(result_ranges, tmp_ranges)
864     //   ...
865     //   result_ranges = tmp_ranges - ranges[N-1]
866     //   SWAP(result_ranges, tmp_ranges)
867     // The last SWAP is just for simplicity of the loop.
868     for (size_t i = 0; i < (*ranges)->length(); i++) {
869         result_ranges->clear();
870 
871         const RangeType& range = (**ranges)[i];
872         for (size_t j = 0; j < tmp_ranges->length(); j++) {
873             const RangeType& tmpRange = (*tmp_ranges)[j];
874             auto from1 = tmpRange.from();
875             auto to1 = tmpRange.to();
876             auto from2 = range.from();
877             auto to2 = range.to();
878 
879             if (from1 < from2) {
880                 if (to1 < from2) {
881                     result_ranges->append(tmpRange);
882                 } else if (to1 <= to2) {
883                     result_ranges->append(RangeType::Range(from1, from2 - 1));
884                 } else {
885                     result_ranges->append(RangeType::Range(from1, from2 - 1));
886                     result_ranges->append(RangeType::Range(to2 + 1, to1));
887                 }
888             } else if (from1 <= to2) {
889                 if (to1 > to2)
890                     result_ranges->append(RangeType::Range(to2 + 1, to1));
891             } else {
892                 result_ranges->append(tmpRange);
893             }
894         }
895 
896         auto tmp = tmp_ranges;
897         tmp_ranges = result_ranges;
898         result_ranges = tmp;
899     }
900 
901     // After the loop, result is pointed at by tmp_ranges, instead of
902     // result_ranges.
903     *ranges = tmp_ranges;
904 }
905 
906 static bool
WideCharRangesContain(WideCharRangeVector * wide_ranges,widechar c)907 WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c)
908 {
909     for (size_t i = 0; i < wide_ranges->length(); i++) {
910         const WideCharRange& range = (*wide_ranges)[i];
911         if (range.Contains(c))
912             return true;
913     }
914     return false;
915 }
916 
917 static void
CalculateCaseInsensitiveRanges(LifoAlloc * alloc,widechar from,widechar to,int32_t diff,WideCharRangeVector * wide_ranges,WideCharRangeVector ** tmp_wide_ranges)918 CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff,
919                                WideCharRangeVector* wide_ranges,
920                                WideCharRangeVector** tmp_wide_ranges)
921 {
922     widechar contains_from = 0;
923     widechar contains_to = 0;
924     for (widechar c = from; c <= to; c++) {
925         if (WideCharRangesContain(wide_ranges, c) &&
926             !WideCharRangesContain(wide_ranges, c + diff))
927         {
928             if (contains_from == 0)
929                 contains_from = c;
930             contains_to = c;
931         } else if (contains_from != 0) {
932             if (!*tmp_wide_ranges)
933                 *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
934 
935             (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
936                                                             contains_to + diff));
937             contains_from = 0;
938         }
939     }
940 
941     if (contains_from != 0) {
942         if (!*tmp_wide_ranges)
943             *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
944 
945         (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff,
946                                                         contains_to + diff));
947     }
948 }
949 
950 static RegExpTree*
UnicodeRangesAtom(LifoAlloc * alloc,CharacterRangeVector * ranges,CharacterRangeVector * lead_ranges,CharacterRangeVector * trail_ranges,WideCharRangeVector * wide_ranges,bool is_negated,bool ignore_case)951 UnicodeRangesAtom(LifoAlloc* alloc,
952                   CharacterRangeVector* ranges,
953                   CharacterRangeVector* lead_ranges,
954                   CharacterRangeVector* trail_ranges,
955                   WideCharRangeVector* wide_ranges,
956                   bool is_negated,
957                   bool ignore_case)
958 {
959     // Calculate case folding for non-BMP first and negate the range if needed.
960     if (ignore_case) {
961         WideCharRangeVector* tmp_wide_ranges = nullptr;
962 #define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
963         CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
964         FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
965         FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_CALC)
966 #undef CALL_CALC
967 
968         if (tmp_wide_ranges) {
969             for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
970                 wide_ranges->append((*tmp_wide_ranges)[i]);
971         }
972     }
973 
974     if (is_negated) {
975         NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange());
976         NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange());
977         NegateUnicodeRanges(alloc, &wide_ranges, NonBMPRange());
978     }
979 
980     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
981 
982     bool added = false;
983 
984     if (is_negated) {
985         ranges->append(LeadSurrogateRange());
986         ranges->append(TrailSurrogateRange());
987     }
988     if (ranges->length() > 0) {
989         builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated));
990         added = true;
991     }
992 
993     if (lead_ranges->length() > 0) {
994         if (added)
995             builder->NewAlternative();
996         builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(lead_ranges, false));
997         builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
998                                            unicode::TrailSurrogateMax));
999         added = true;
1000     }
1001 
1002     if (trail_ranges->length() > 0) {
1003         if (added)
1004             builder->NewAlternative();
1005         builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1006             RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
1007         builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(trail_ranges, false));
1008         added = true;
1009     }
1010 
1011     for (size_t i = 0; i < wide_ranges->length(); i++) {
1012         if (added)
1013             builder->NewAlternative();
1014 
1015         const WideCharRange& range = (*wide_ranges)[i];
1016         widechar from = range.from();
1017         widechar to = range.to();
1018         char16_t from_lead, from_trail;
1019         char16_t to_lead, to_trail;
1020 
1021         unicode::UTF16Encode(from, &from_lead, &from_trail);
1022         if (from == to) {
1023             builder->AddCharacter(from_lead);
1024             builder->AddCharacter(from_trail);
1025         } else {
1026             unicode::UTF16Encode(to, &to_lead, &to_trail);
1027             if (from_lead == to_lead) {
1028                 MOZ_ASSERT(from_trail != to_trail);
1029                 builder->AddCharacter(from_lead);
1030                 builder->AddAtom(RangeAtom(alloc, from_trail, to_trail));
1031             } else if (from_trail == unicode::TrailSurrogateMin &&
1032                        to_trail == unicode::TrailSurrogateMax)
1033             {
1034                 builder->AddAtom(RangeAtom(alloc, from_lead, to_lead));
1035                 builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
1036                                            unicode::TrailSurrogateMax));
1037             } else if (from_lead + 1 == to_lead) {
1038                 builder->AddCharacter(from_lead);
1039                 builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
1040 
1041                 builder->NewAlternative();
1042 
1043                 builder->AddCharacter(to_lead);
1044                 builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
1045             } else if (from_lead + 2 == to_lead) {
1046                 builder->AddCharacter(from_lead);
1047                 builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
1048 
1049                 builder->NewAlternative();
1050 
1051                 builder->AddCharacter(from_lead + 1);
1052                 builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
1053                                            unicode::TrailSurrogateMax));
1054 
1055                 builder->NewAlternative();
1056 
1057                 builder->AddCharacter(to_lead);
1058                 builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
1059             } else {
1060                 builder->AddCharacter(from_lead);
1061                 builder->AddAtom(RangeAtom(alloc, from_trail, unicode::TrailSurrogateMax));
1062 
1063                 builder->NewAlternative();
1064 
1065                 builder->AddAtom(RangeAtom(alloc, from_lead + 1, to_lead - 1));
1066                 builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin,
1067                                            unicode::TrailSurrogateMax));
1068 
1069                 builder->NewAlternative();
1070 
1071                 builder->AddCharacter(to_lead);
1072                 builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, to_trail));
1073             }
1074         }
1075         added = true;
1076     }
1077 
1078     return builder->ToRegExp();
1079 }
1080 
1081 template <typename CharT>
1082 RegExpTree*
ParseCharacterClass()1083 RegExpParser<CharT>::ParseCharacterClass()
1084 {
1085     MOZ_ASSERT(current() == '[');
1086     Advance();
1087     bool is_negated = false;
1088     if (current() == '^') {
1089         is_negated = true;
1090         Advance();
1091     }
1092     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1093     CharacterRangeVector* lead_ranges = nullptr;
1094     CharacterRangeVector* trail_ranges = nullptr;
1095     WideCharRangeVector* wide_ranges = nullptr;
1096 
1097     if (unicode_) {
1098         lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1099         trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1100         wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
1101     }
1102 
1103     while (has_more() && current() != ']') {
1104         char16_t char_class = kNoCharClass;
1105         widechar first = 0;
1106         if (!ParseClassAtom(&char_class, &first))
1107             return nullptr;
1108         if (current() == '-') {
1109             Advance();
1110             if (current() == kEndMarker) {
1111                 // If we reach the end we break out of the loop and let the
1112                 // following code report an error.
1113                 break;
1114             } else if (current() == ']') {
1115                 if (unicode_) {
1116                     AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
1117                                            char_class, first, ignore_case_);
1118                 } else {
1119                     AddCharOrEscape(alloc, ranges, char_class, first);
1120                 }
1121                 ranges->append(CharacterRange::Singleton('-'));
1122                 break;
1123             }
1124             char16_t char_class_2 = kNoCharClass;
1125             widechar next = 0;
1126             if (!ParseClassAtom(&char_class_2, &next))
1127                 return nullptr;
1128             if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1129                 if (unicode_)
1130                     return ReportError(JSMSG_RANGE_WITH_CLASS_ESCAPE);
1131 
1132                 // Either end is an escaped character class. Treat the '-' verbatim.
1133                 AddCharOrEscape(alloc, ranges, char_class, first);
1134                 ranges->append(CharacterRange::Singleton('-'));
1135                 AddCharOrEscape(alloc, ranges, char_class_2, next);
1136                 continue;
1137             }
1138             if (first > next)
1139                 return ReportError(JSMSG_BAD_CLASS_RANGE);
1140             if (unicode_)
1141                 AddUnicodeRange(alloc, ranges, lead_ranges, trail_ranges,wide_ranges, first, next);
1142             else
1143                 ranges->append(CharacterRange::Range(first, next));
1144         } else {
1145             if (unicode_) {
1146                 AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
1147                                        char_class, first, ignore_case_);
1148             } else {
1149                 AddCharOrEscape(alloc, ranges, char_class, first);
1150             }
1151         }
1152     }
1153     if (!has_more())
1154         return ReportError(JSMSG_UNTERM_CLASS);
1155     Advance();
1156     if (!unicode_) {
1157         if (ranges->length() == 0) {
1158             ranges->append(CharacterRange::Everything());
1159             is_negated = !is_negated;
1160         }
1161         return alloc->newInfallible<RegExpCharacterClass>(ranges, is_negated);
1162     }
1163 
1164     if (!is_negated && ranges->length() == 0 && lead_ranges->length() == 0 &&
1165         trail_ranges->length() == 0 && wide_ranges->length() == 0)
1166     {
1167         ranges->append(CharacterRange::Everything());
1168         return alloc->newInfallible<RegExpCharacterClass>(ranges, true);
1169     }
1170 
1171     return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated,
1172                              ignore_case_);
1173 }
1174 
1175 template <typename CharT>
1176 bool
ParseClassAtom(char16_t * char_class,widechar * value)1177 RegExpParser<CharT>::ParseClassAtom(char16_t* char_class, widechar* value)
1178 {
1179     MOZ_ASSERT(*char_class == kNoCharClass);
1180     widechar first = current();
1181     if (first == '\\') {
1182         switch (Next()) {
1183           case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
1184             *char_class = Next();
1185             Advance(2);
1186             return true;
1187           }
1188           case kEndMarker:
1189             return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
1190           default:
1191             if (!ParseClassCharacterEscape(value))
1192                 return false;
1193             return true;
1194         }
1195     } else {
1196         if (unicode_) {
1197             char16_t lead, trail;
1198             if (ParseRawSurrogatePair(&lead, &trail)) {
1199                 *value = unicode::UTF16Decode(lead, trail);
1200                 return true;
1201             }
1202         }
1203         Advance();
1204         *value = first;
1205         return true;
1206     }
1207 }
1208 
1209 // In order to know whether an escape is a backreference or not we have to scan
1210 // the entire regexp and find the number of capturing parentheses.  However we
1211 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
1212 // is called when needed.  It can see the difference between capturing and
1213 // noncapturing parentheses and can skip character classes and backslash-escaped
1214 // characters.
1215 template <typename CharT>
1216 void
ScanForCaptures()1217 RegExpParser<CharT>::ScanForCaptures()
1218 {
1219     // Start with captures started previous to current position
1220     int capture_count = captures_started();
1221     // Add count of captures after this position.
1222     widechar n;
1223     while ((n = current()) != kEndMarker) {
1224         Advance();
1225         switch (n) {
1226           case '\\':
1227             Advance();
1228             break;
1229           case '[': {
1230             widechar c;
1231             while ((c = current()) != kEndMarker) {
1232                 Advance();
1233                 if (c == '\\') {
1234                     Advance();
1235                 } else {
1236                     if (c == ']') break;
1237                 }
1238             }
1239             break;
1240           }
1241           case '(':
1242             if (current() != '?') capture_count++;
1243             break;
1244         }
1245     }
1246     capture_count_ = capture_count;
1247     is_scanned_for_captures_ = true;
1248 }
1249 
1250 template <typename CharT>
1251 bool
ParseBackReferenceIndex(int * index_out)1252 RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out)
1253 {
1254     MOZ_ASSERT('\\' == current());
1255     MOZ_ASSERT('1' <= Next() && Next() <= '9');
1256 
1257     // Try to parse a decimal literal that is no greater than the total number
1258     // of left capturing parentheses in the input.
1259     const CharT* start = position();
1260     int value = Next() - '0';
1261     Advance(2);
1262     while (true) {
1263         widechar c = current();
1264         if (IsDecimalDigit(c)) {
1265             value = 10 * value + (c - '0');
1266             if (value > kMaxCaptures) {
1267                 Reset(start);
1268                 return false;
1269             }
1270             Advance();
1271         } else {
1272             break;
1273         }
1274     }
1275     if (value > captures_started()) {
1276         if (!is_scanned_for_captures_) {
1277             const CharT* saved_position = position();
1278             ScanForCaptures();
1279             Reset(saved_position);
1280         }
1281         if (value > capture_count_) {
1282             Reset(start);
1283             return false;
1284         }
1285     }
1286     *index_out = value;
1287     return true;
1288 }
1289 
1290 // QuantifierPrefix ::
1291 //   { DecimalDigits }
1292 //   { DecimalDigits , }
1293 //   { DecimalDigits , DecimalDigits }
1294 //
1295 // Returns true if parsing succeeds, and set the min_out and max_out
1296 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1297 template <typename CharT>
1298 bool
ParseIntervalQuantifier(int * min_out,int * max_out)1299 RegExpParser<CharT>::ParseIntervalQuantifier(int* min_out, int* max_out)
1300 {
1301     MOZ_ASSERT(current() == '{');
1302     const CharT* start = position();
1303     Advance();
1304     int min = 0;
1305     if (!IsDecimalDigit(current())) {
1306         Reset(start);
1307         return false;
1308     }
1309     while (IsDecimalDigit(current())) {
1310         int next = current() - '0';
1311         if (min > (RegExpTree::kInfinity - next) / 10) {
1312             // Overflow. Skip past remaining decimal digits and return -1.
1313             do {
1314                 Advance();
1315             } while (IsDecimalDigit(current()));
1316             min = RegExpTree::kInfinity;
1317             break;
1318         }
1319         min = 10 * min + next;
1320         Advance();
1321     }
1322     int max = 0;
1323     if (current() == '}') {
1324         max = min;
1325         Advance();
1326     } else if (current() == ',') {
1327         Advance();
1328         if (current() == '}') {
1329             max = RegExpTree::kInfinity;
1330             Advance();
1331         } else {
1332             while (IsDecimalDigit(current())) {
1333                 int next = current() - '0';
1334                 if (max > (RegExpTree::kInfinity - next) / 10) {
1335                     do {
1336                         Advance();
1337                     } while (IsDecimalDigit(current()));
1338                     max = RegExpTree::kInfinity;
1339                     break;
1340                 }
1341                 max = 10 * max + next;
1342                 Advance();
1343             }
1344             if (current() != '}') {
1345                 Reset(start);
1346                 return false;
1347             }
1348             Advance();
1349         }
1350     } else {
1351         Reset(start);
1352         return false;
1353     }
1354     *min_out = min;
1355     *max_out = max;
1356     return true;
1357 }
1358 
1359 // Pattern ::
1360 //   Disjunction
1361 template <typename CharT>
1362 RegExpTree*
ParsePattern()1363 RegExpParser<CharT>::ParsePattern()
1364 {
1365     RegExpTree* result = ParseDisjunction();
1366     MOZ_ASSERT_IF(result, !has_more());
1367     return result;
1368 }
1369 
1370 static inline RegExpTree*
CaseFoldingSurrogatePairAtom(LifoAlloc * alloc,char16_t lead,char16_t trail,int32_t diff)1371 CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff)
1372 {
1373     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1374 
1375     builder->AddCharacter(lead);
1376     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1377     ranges->append(CharacterRange::Range(trail, trail));
1378     ranges->append(CharacterRange::Range(trail + diff, trail + diff));
1379     builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
1380 
1381     return builder->ToRegExp();
1382 }
1383 
1384 static inline RegExpTree*
SurrogatePairAtom(LifoAlloc * alloc,char16_t lead,char16_t trail,bool ignore_case)1385 SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
1386 {
1387     if (ignore_case) {
1388 #define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
1389         if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
1390             return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
1391         FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
1392         FOR_EACH_NON_BMP_REV_CASE_FOLDING(CALL_ATOM)
1393 #undef CALL_ATOM
1394     }
1395 
1396     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1397     builder->AddCharacter(lead);
1398     builder->AddCharacter(trail);
1399     return builder->ToRegExp();
1400 }
1401 
1402 static inline RegExpTree*
LeadSurrogateAtom(LifoAlloc * alloc,char16_t value)1403 LeadSurrogateAtom(LifoAlloc* alloc, char16_t value)
1404 {
1405     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1406     builder->AddCharacter(value);
1407     builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
1408                                        unicode::TrailSurrogateMax));
1409     return builder->ToRegExp();
1410 }
1411 
1412 static inline RegExpTree*
TrailSurrogateAtom(LifoAlloc * alloc,char16_t value)1413 TrailSurrogateAtom(LifoAlloc* alloc, char16_t value)
1414 {
1415     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1416     builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1417         RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
1418     builder->AddCharacter(value);
1419     return builder->ToRegExp();
1420 }
1421 
1422 static inline RegExpTree*
UnicodeEverythingAtom(LifoAlloc * alloc)1423 UnicodeEverythingAtom(LifoAlloc* alloc)
1424 {
1425     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1426 
1427     // everything except \x0a, \x0d, \u2028 and \u2029
1428 
1429     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1430     AddClassNegated(kLineTerminatorAndSurrogateRanges,
1431                     kLineTerminatorAndSurrogateRangeCount,
1432                     ranges);
1433     builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
1434 
1435     builder->NewAlternative();
1436 
1437     builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
1438     builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
1439                                        unicode::TrailSurrogateMax));
1440 
1441     builder->NewAlternative();
1442 
1443     builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1444         RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
1445     builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
1446 
1447     builder->NewAlternative();
1448 
1449     builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
1450     builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
1451 
1452     return builder->ToRegExp();
1453 }
1454 
1455 RegExpTree*
UnicodeCharacterClassEscapeAtom(LifoAlloc * alloc,char16_t char_class,bool ignore_case)1456 UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
1457 {
1458     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1459     CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1460     CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1461     WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
1462     AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0,
1463                            ignore_case);
1464 
1465     return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
1466 }
1467 
1468 static inline RegExpTree*
UnicodeBackReferenceAtom(LifoAlloc * alloc,RegExpTree * atom)1469 UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
1470 {
1471     // If a back reference has a standalone lead surrogate as its last
1472     // character, then that lead surrogate shouldn't match lead surrogates that
1473     // are paired with a corresponding trail surrogate.
1474     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
1475 
1476     builder->AddAtom(atom);
1477     builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
1478         RegExpAssertion::NOT_IN_SURROGATE_PAIR));
1479 
1480     return builder->ToRegExp();
1481 }
1482 
1483 // Disjunction ::
1484 //   Alternative
1485 //   Alternative | Disjunction
1486 // Alternative ::
1487 //   [empty]
1488 //   Term Alternative
1489 // Term ::
1490 //   Assertion
1491 //   Atom
1492 //   Atom Quantifier
1493 template <typename CharT>
1494 RegExpTree*
ParseDisjunction()1495 RegExpParser<CharT>::ParseDisjunction()
1496 {
1497     // Used to store current state while parsing subexpressions.
1498     RegExpParserState initial_state(alloc, nullptr, INITIAL, 0);
1499     RegExpParserState* stored_state = &initial_state;
1500     // Cache the builder in a local variable for quick access.
1501     RegExpBuilder* builder = initial_state.builder();
1502     while (true) {
1503         switch (current()) {
1504           case kEndMarker:
1505             if (stored_state->IsSubexpression()) {
1506                 // Inside a parenthesized group when hitting end of input.
1507                 return ReportError(JSMSG_MISSING_PAREN);
1508             }
1509             MOZ_ASSERT(INITIAL == stored_state->group_type());
1510             // Parsing completed successfully.
1511             return builder->ToRegExp();
1512           case ')': {
1513             if (!stored_state->IsSubexpression())
1514                 return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN);
1515             MOZ_ASSERT(INITIAL != stored_state->group_type());
1516 
1517             Advance();
1518             // End disjunction parsing and convert builder content to new single
1519             // regexp atom.
1520             RegExpTree* body = builder->ToRegExp();
1521 
1522             int end_capture_index = captures_started();
1523 
1524             int capture_index = stored_state->capture_index();
1525             SubexpressionType group_type = stored_state->group_type();
1526 
1527             // Restore previous state.
1528             stored_state = stored_state->previous_state();
1529             builder = stored_state->builder();
1530 
1531             // Build result of subexpression.
1532             if (group_type == CAPTURE) {
1533                 RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index);
1534                 (*captures_)[capture_index - 1] = capture;
1535                 body = capture;
1536             } else if (group_type != GROUPING) {
1537                 MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD ||
1538                            group_type == NEGATIVE_LOOKAHEAD);
1539                 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
1540                 body = alloc->newInfallible<RegExpLookahead>(body,
1541                                                    is_positive,
1542                                                    end_capture_index - capture_index,
1543                                                    capture_index);
1544             }
1545             builder->AddAtom(body);
1546             if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
1547                 continue;
1548             // For compatability with JSC and ES3, we allow quantifiers after
1549             // lookaheads, and break in all cases.
1550             break;
1551           }
1552           case '|': {
1553             Advance();
1554             builder->NewAlternative();
1555             continue;
1556           }
1557           case '*':
1558           case '+':
1559           case '?':
1560             return ReportError(JSMSG_NOTHING_TO_REPEAT);
1561           case '^': {
1562             Advance();
1563             if (multiline_) {
1564                 builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_LINE));
1565             } else {
1566                 builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::START_OF_INPUT));
1567                 set_contains_anchor();
1568             }
1569             continue;
1570           }
1571           case '$': {
1572             Advance();
1573             RegExpAssertion::AssertionType assertion_type =
1574                 multiline_ ? RegExpAssertion::END_OF_LINE :
1575                 RegExpAssertion::END_OF_INPUT;
1576             builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(assertion_type));
1577             continue;
1578           }
1579           case '.': {
1580             Advance();
1581             // everything except \x0a, \x0d, \u2028 and \u2029
1582             if (unicode_) {
1583                 builder->AddAtom(UnicodeEverythingAtom(alloc));
1584                 break;
1585             }
1586             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
1587             CharacterRange::AddClassEscape(alloc, '.', ranges);
1588             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
1589             builder->AddAtom(atom);
1590             break;
1591           }
1592           case '(': {
1593             SubexpressionType subexpr_type = CAPTURE;
1594             Advance();
1595             if (current() == '?') {
1596                 switch (Next()) {
1597                   case ':':
1598                     subexpr_type = GROUPING;
1599                     break;
1600                   case '=':
1601                     subexpr_type = POSITIVE_LOOKAHEAD;
1602                     break;
1603                   case '!':
1604                     subexpr_type = NEGATIVE_LOOKAHEAD;
1605                     break;
1606                   default:
1607                     return ReportError(JSMSG_INVALID_GROUP);
1608                 }
1609                 Advance(2);
1610             } else {
1611                 if (captures_ == nullptr)
1612                     captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
1613                 if (captures_started() >= kMaxCaptures)
1614                     return ReportError(JSMSG_TOO_MANY_PARENS);
1615                 captures_->append((RegExpCapture*) nullptr);
1616             }
1617             // Store current state and begin new disjunction parsing.
1618             stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type,
1619                                                                    captures_started());
1620             builder = stored_state->builder();
1621             continue;
1622           }
1623           case '[': {
1624             RegExpTree* atom = ParseCharacterClass();
1625             if (!atom)
1626                 return nullptr;
1627             builder->AddAtom(atom);
1628             break;
1629           }
1630             // Atom ::
1631             //   \ AtomEscape
1632           case '\\':
1633             switch (Next()) {
1634               case kEndMarker:
1635                 return ReportError(JSMSG_ESCAPE_AT_END_OF_REGEXP);
1636               case 'b':
1637                 Advance(2);
1638                 builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::BOUNDARY));
1639                 continue;
1640               case 'B':
1641                 Advance(2);
1642                 builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NON_BOUNDARY));
1643                 continue;
1644                 // AtomEscape ::
1645                 //   CharacterClassEscape
1646                 //
1647                 // CharacterClassEscape :: one of
1648                 //   d D s S w W
1649               case 'D': case 'S': case 'W':
1650                 if (unicode_) {
1651                     Advance();
1652                     builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(),
1653                                                                      ignore_case_));
1654                     Advance();
1655                     break;
1656                 }
1657                 MOZ_FALLTHROUGH;
1658               case 'd': case 's': case 'w': {
1659                 widechar c = Next();
1660                 Advance(2);
1661                 CharacterRangeVector* ranges =
1662                     alloc->newInfallible<CharacterRangeVector>(*alloc);
1663                 if (unicode_)
1664                     CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_);
1665                 else
1666                     CharacterRange::AddClassEscape(alloc, c, ranges);
1667                 RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
1668                 builder->AddAtom(atom);
1669                 break;
1670               }
1671               case '1': case '2': case '3': case '4': case '5': case '6':
1672               case '7': case '8': case '9': {
1673                 int index = 0;
1674                 if (ParseBackReferenceIndex(&index)) {
1675                     RegExpCapture* capture = nullptr;
1676                     if (captures_ != nullptr && index <= (int) captures_->length()) {
1677                         capture = (*captures_)[index - 1];
1678                     }
1679                     if (capture == nullptr) {
1680                         builder->AddEmpty();
1681                         break;
1682                     }
1683                     RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
1684                     if (unicode_)
1685                         builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
1686                     else
1687                         builder->AddAtom(atom);
1688                     break;
1689                 }
1690                 if (unicode_)
1691                     return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE);
1692                 widechar first_digit = Next();
1693                 if (first_digit == '8' || first_digit == '9') {
1694                     // Treat as identity escape
1695                     builder->AddCharacter(first_digit);
1696                     Advance(2);
1697                     break;
1698                 }
1699                 MOZ_FALLTHROUGH;
1700               }
1701               case '0': {
1702                 if (unicode_) {
1703                     Advance(2);
1704                     if (IsDecimalDigit(current()))
1705                         return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
1706                     builder->AddCharacter(0);
1707                     break;
1708                 }
1709 
1710                 Advance();
1711                 widechar octal = ParseOctalLiteral();
1712                 builder->AddCharacter(octal);
1713                 break;
1714               }
1715                 // ControlEscape :: one of
1716                 //   f n r t v
1717               case 'f':
1718                 Advance(2);
1719                 builder->AddCharacter('\f');
1720                 break;
1721               case 'n':
1722                 Advance(2);
1723                 builder->AddCharacter('\n');
1724                 break;
1725               case 'r':
1726                 Advance(2);
1727                 builder->AddCharacter('\r');
1728                 break;
1729               case 't':
1730                 Advance(2);
1731                 builder->AddCharacter('\t');
1732                 break;
1733               case 'v':
1734                 Advance(2);
1735                 builder->AddCharacter('\v');
1736                 break;
1737               case 'c': {
1738                 Advance();
1739                 widechar controlLetter = Next();
1740                 // Special case if it is an ASCII letter.
1741                 // Convert lower case letters to uppercase.
1742                 widechar letter = controlLetter & ~('a' ^ 'A');
1743                 if (letter < 'A' || 'Z' < letter) {
1744                     if (unicode_)
1745                         return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
1746                     // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
1747                     // This is outside the specification. We match JSC in
1748                     // reading the backslash as a literal character instead
1749                     // of as starting an escape.
1750                     builder->AddCharacter('\\');
1751                 } else {
1752                     Advance(2);
1753                     builder->AddCharacter(controlLetter & 0x1f);
1754                 }
1755                 break;
1756               }
1757               case 'x': {
1758                 Advance(2);
1759                 widechar value;
1760                 if (ParseHexEscape(2, &value)) {
1761                     builder->AddCharacter(value);
1762                 } else {
1763                     if (unicode_)
1764                         return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
1765                     builder->AddCharacter('x');
1766                 }
1767                 break;
1768               }
1769               case 'u': {
1770                 Advance(2);
1771                 widechar value;
1772                 if (unicode_) {
1773                     if (current() == '{') {
1774                         if (!ParseBracedHexEscape(&value))
1775                             return nullptr;
1776                         if (unicode::IsLeadSurrogate(value)) {
1777                             builder->AddAtom(LeadSurrogateAtom(alloc, value));
1778                         } else if (unicode::IsTrailSurrogate(value)) {
1779                             builder->AddAtom(TrailSurrogateAtom(alloc, value));
1780                         } else if (value >= unicode::NonBMPMin) {
1781                             char16_t lead, trail;
1782                             unicode::UTF16Encode(value, &lead, &trail);
1783                             builder->AddAtom(SurrogatePairAtom(alloc, lead, trail,
1784                                                                ignore_case_));
1785                         } else {
1786                             builder->AddCharacter(value);
1787                         }
1788                     } else if (ParseHexEscape(4, &value)) {
1789                         if (unicode::IsLeadSurrogate(value)) {
1790                             widechar trail;
1791                             if (ParseTrailSurrogate(&trail)) {
1792                                 builder->AddAtom(SurrogatePairAtom(alloc, value, trail,
1793                                                                    ignore_case_));
1794                             } else {
1795                                 builder->AddAtom(LeadSurrogateAtom(alloc, value));
1796                             }
1797                         } else if (unicode::IsTrailSurrogate(value)) {
1798                             builder->AddAtom(TrailSurrogateAtom(alloc, value));
1799                         } else {
1800                             builder->AddCharacter(value);
1801                         }
1802                     } else {
1803                         return ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
1804                     }
1805                     break;
1806                 }
1807                 if (ParseHexEscape(4, &value)) {
1808                     builder->AddCharacter(value);
1809                 } else {
1810                     builder->AddCharacter('u');
1811                 }
1812                 break;
1813               }
1814               default:
1815                 // Identity escape.
1816                 if (unicode_ && !IsSyntaxCharacter(Next()))
1817                     return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
1818                 builder->AddCharacter(Next());
1819                 Advance(2);
1820                 break;
1821             }
1822             break;
1823           case '{': {
1824             if (unicode_)
1825                 return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
1826             int dummy;
1827             if (ParseIntervalQuantifier(&dummy, &dummy))
1828                 return ReportError(JSMSG_NOTHING_TO_REPEAT);
1829             MOZ_FALLTHROUGH;
1830           }
1831           default:
1832             if (unicode_) {
1833                 char16_t lead, trail;
1834                 if (ParseRawSurrogatePair(&lead, &trail)) {
1835                     builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_));
1836                 } else {
1837                     widechar c = current();
1838                     if (unicode::IsLeadSurrogate(c))
1839                         builder->AddAtom(LeadSurrogateAtom(alloc, c));
1840                     else if (unicode::IsTrailSurrogate(c))
1841                         builder->AddAtom(TrailSurrogateAtom(alloc, c));
1842                     else if (c == ']')
1843                         return ReportError(JSMSG_RAW_BRACKET_IN_REGEP);
1844                     else if (c == '}')
1845                         return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
1846                     else
1847                         builder->AddCharacter(c);
1848                     Advance();
1849                 }
1850                 break;
1851             }
1852             builder->AddCharacter(current());
1853             Advance();
1854             break;
1855         }  // end switch(current())
1856 
1857         int min;
1858         int max;
1859         switch (current()) {
1860             // QuantifierPrefix ::
1861             //   *
1862             //   +
1863             //   ?
1864             //   {
1865           case '*':
1866             min = 0;
1867             max = RegExpTree::kInfinity;
1868             Advance();
1869             break;
1870           case '+':
1871             min = 1;
1872             max = RegExpTree::kInfinity;
1873             Advance();
1874             break;
1875           case '?':
1876             min = 0;
1877             max = 1;
1878             Advance();
1879             break;
1880           case '{':
1881             if (ParseIntervalQuantifier(&min, &max)) {
1882                 if (max < min)
1883                     return ReportError(JSMSG_NUMBERS_OUT_OF_ORDER);
1884                 break;
1885             } else {
1886                 continue;
1887             }
1888           default:
1889             continue;
1890         }
1891         RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
1892         if (current() == '?') {
1893             quantifier_type = RegExpQuantifier::NON_GREEDY;
1894             Advance();
1895         }
1896         builder->AddQuantifierToAtom(min, max, quantifier_type);
1897     }
1898 }
1899 
1900 template class irregexp::RegExpParser<Latin1Char>;
1901 template class irregexp::RegExpParser<char16_t>;
1902 
1903 template <typename CharT>
1904 static bool
ParsePattern(frontend::TokenStreamAnyChars & ts,LifoAlloc & alloc,const CharT * chars,size_t length,bool multiline,bool match_only,bool unicode,bool ignore_case,bool global,bool sticky,RegExpCompileData * data)1905 ParsePattern(frontend::TokenStreamAnyChars& ts, LifoAlloc& alloc,
1906              const CharT* chars, size_t length,
1907              bool multiline, bool match_only, bool unicode, bool ignore_case,
1908              bool global, bool sticky, RegExpCompileData* data)
1909 {
1910     // We shouldn't strip pattern for exec, or test with global/sticky,
1911     // to reflect correct match position and lastIndex.
1912     if (match_only && !global && !sticky) {
1913         // Try to strip a leading '.*' from the RegExp, but only if it is not
1914         // followed by a '?' (which will affect how the .* is parsed). This
1915         // pattern will affect the captures produced by the RegExp, but not
1916         // whether there is a match or not.
1917         if (length >= 3 && chars[0] == '.' && chars[1] == '*' && chars[2] != '?') {
1918             chars += 2;
1919             length -= 2;
1920         }
1921 
1922         // Try to strip a trailing '.*' from the RegExp, which as above will
1923         // affect the captures but not whether there is a match. Only do this
1924         // when there are no other meta characters in the RegExp, so that we
1925         // are sure this will not affect how the RegExp is parsed.
1926         if (length >= 3 && !HasRegExpMetaChars(chars, length - 2) &&
1927             chars[length - 2] == '.' && chars[length - 1] == '*')
1928         {
1929             length -= 2;
1930         }
1931     }
1932 
1933     RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
1934     data->tree = parser.ParsePattern();
1935     if (!data->tree)
1936         return false;
1937 
1938     data->simple = parser.simple();
1939     data->contains_anchor = parser.contains_anchor();
1940     data->capture_count = parser.captures_started();
1941     return true;
1942 }
1943 
1944 bool
ParsePattern(frontend::TokenStreamAnyChars & ts,LifoAlloc & alloc,JSAtom * str,bool multiline,bool match_only,bool unicode,bool ignore_case,bool global,bool sticky,RegExpCompileData * data)1945 irregexp::ParsePattern(frontend::TokenStreamAnyChars& ts, LifoAlloc& alloc, JSAtom* str,
1946                        bool multiline, bool match_only, bool unicode, bool ignore_case,
1947                        bool global, bool sticky, RegExpCompileData* data)
1948 {
1949     JS::AutoCheckCannotGC nogc;
1950     return str->hasLatin1Chars()
1951            ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
1952                             multiline, match_only, unicode, ignore_case, global, sticky, data)
1953            : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
1954                             multiline, match_only, unicode, ignore_case, global, sticky, data);
1955 }
1956 
1957 template <typename CharT>
1958 static bool
ParsePatternSyntax(frontend::TokenStreamAnyChars & ts,LifoAlloc & alloc,const CharT * chars,size_t length,bool unicode)1959 ParsePatternSyntax(frontend::TokenStreamAnyChars& ts, LifoAlloc& alloc,
1960                    const CharT* chars, size_t length, bool unicode)
1961 {
1962     LifoAllocScope scope(&alloc);
1963 
1964     RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
1965     return parser.ParsePattern() != nullptr;
1966 }
1967 
1968 bool
ParsePatternSyntax(frontend::TokenStreamAnyChars & ts,LifoAlloc & alloc,JSAtom * str,bool unicode)1969 irregexp::ParsePatternSyntax(frontend::TokenStreamAnyChars& ts, LifoAlloc& alloc, JSAtom* str,
1970                              bool unicode)
1971 {
1972     JS::AutoCheckCannotGC nogc;
1973     return str->hasLatin1Chars()
1974            ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
1975            : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
1976 }
1977 
1978 bool
ParsePatternSyntax(frontend::TokenStreamAnyChars & ts,LifoAlloc & alloc,const mozilla::Range<const char16_t> chars,bool unicode)1979 irregexp::ParsePatternSyntax(frontend::TokenStreamAnyChars& ts, LifoAlloc& alloc,
1980                              const mozilla::Range<const char16_t> chars, bool unicode)
1981 {
1982     return ::ParsePatternSyntax(ts, alloc, chars.begin().get(), chars.length(), unicode);
1983 }
1984