1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 // JS lexical scanner.
8 
9 #include "frontend/TokenStream.h"
10 
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/Attributes.h"
13 #include "mozilla/Likely.h"
14 #include "mozilla/Maybe.h"
15 #include "mozilla/MemoryChecking.h"
16 #include "mozilla/ScopeExit.h"
17 #include "mozilla/Span.h"
18 #include "mozilla/TemplateLib.h"
19 #include "mozilla/TextUtils.h"
20 #include "mozilla/Utf8.h"
21 
22 #include <algorithm>
23 #include <iterator>
24 #include <stdarg.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <type_traits>
28 #include <utility>
29 
30 #include "jsnum.h"
31 
32 #include "frontend/BytecodeCompiler.h"
33 #include "frontend/Parser.h"
34 #include "frontend/ParserAtom.h"
35 #include "frontend/ReservedWords.h"
36 #include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
37 #include "js/Printf.h"                // JS_smprintf
38 #include "js/RegExpFlags.h"           // JS::RegExpFlags
39 #include "js/UniquePtr.h"
40 #include "util/Text.h"
41 #include "util/Unicode.h"
42 #include "vm/FrameIter.h"  // js::{,NonBuiltin}FrameIter
43 #include "vm/JSContext.h"
44 #include "vm/Realm.h"
45 #include "vm/WellKnownAtom.h"  // js_*_str
46 
47 using mozilla::AsciiAlphanumericToNumber;
48 using mozilla::AssertedCast;
49 using mozilla::DecodeOneUtf8CodePoint;
50 using mozilla::IsAscii;
51 using mozilla::IsAsciiAlpha;
52 using mozilla::IsAsciiDigit;
53 using mozilla::IsAsciiHexDigit;
54 using mozilla::IsTrailingUnit;
55 using mozilla::MakeScopeExit;
56 using mozilla::Maybe;
57 using mozilla::PointerRangeSize;
58 using mozilla::Span;
59 using mozilla::Utf8Unit;
60 
61 using JS::ReadOnlyCompileOptions;
62 using JS::RegExpFlag;
63 using JS::RegExpFlags;
64 
65 struct ReservedWordInfo {
66   const char* chars;  // C string with reserved word text
67   js::frontend::TokenKind tokentype;
68 };
69 
70 static const ReservedWordInfo reservedWords[] = {
71 #define RESERVED_WORD_INFO(word, name, type) \
72   {js_##word##_str, js::frontend::type},
73     FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
74 #undef RESERVED_WORD_INFO
75 };
76 
77 enum class ReservedWordsIndex : size_t {
78 #define ENTRY_(_1, NAME, _3) NAME,
79   FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
80 #undef ENTRY_
81 };
82 
// Returns a ReservedWordInfo for the specified characters, or nullptr if the
// string is not a reserved word.
//
// |s| points at |length| code units of type |CharT|; |length| must be nonzero
// (asserted below).  The returned pointer, when non-null, refers to an entry
// of the file-static |reservedWords| table.
template <typename CharT>
static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
  MOZ_ASSERT(length != 0);

  // Locals shared between the macro hooks below and the labelled tails at
  // the bottom of this function.
  size_t i;
  const ReservedWordInfo* rw;
  const char* chars;

  // The matching logic itself lives in the generated header included below.
  // It talks to this function only through these hooks:
  //   JSRW_LENGTH()       - the input length
  //   JSRW_AT(column)     - the input unit at index |column|
  //   JSRW_GOT_MATCH(i)   - entry |i| matched; jump to |got_match|
  //   JSRW_TEST_GUESS(i)  - entry |i| is a candidate; jump to |test_guess|
  //   JSRW_NO_MATCH()     - not a reserved word; jump to |no_match|
  // NOTE(review): the header's decision structure is not visible from this
  // file; confirm against the generator if changing any hook.
#define JSRW_LENGTH() length
#define JSRW_AT(column) s[column]
#define JSRW_GOT_MATCH(index) \
  i = (index);                \
  goto got_match;
#define JSRW_TEST_GUESS(index) \
  i = (index);                 \
  goto test_guess;
#define JSRW_NO_MATCH() goto no_match;
#include "frontend/ReservedWordsGenerated.h"
#undef JSRW_NO_MATCH
#undef JSRW_TEST_GUESS
#undef JSRW_GOT_MATCH
#undef JSRW_AT
#undef JSRW_LENGTH

got_match:
  // The generated code fully identified entry |i|.
  return &reservedWords[i];

test_guess:
  // Verify the candidate by comparing all |length| input units against its
  // spelling.  The spelling's chars are widened through |unsigned char| before
  // the comparison.
  // NOTE(review): presumably the generated code only jumps here when |length|
  // equals the candidate's length, since no terminator check is made -- TODO
  // confirm.
  rw = &reservedWords[i];
  chars = rw->chars;
  do {
    if (*s++ != static_cast<unsigned char>(*chars++)) {
      goto no_match;
    }
  } while (--length != 0);
  return rw;

no_match:
  return nullptr;
}
125 
126 template <>
FindReservedWord(const Utf8Unit * units,size_t length)127 MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
128     const Utf8Unit* units, size_t length) {
129   return FindReservedWord(Utf8AsUnsignedChars(units), length);
130 }
131 
FindReservedWord(const js::frontend::TaggedParserAtomIndex atom)132 static const ReservedWordInfo* FindReservedWord(
133     const js::frontend::TaggedParserAtomIndex atom) {
134   switch (atom.rawData()) {
135 #define CASE_(_1, NAME, _3)                                           \
136   case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
137     return &reservedWords[size_t(ReservedWordsIndex::NAME)];
138     FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
139 #undef CASE_
140   }
141 
142   return nullptr;
143 }
144 
GetSingleCodePoint(const char16_t ** p,const char16_t * end)145 static uint32_t GetSingleCodePoint(const char16_t** p, const char16_t* end) {
146   using namespace js;
147 
148   uint32_t codePoint;
149   if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(**p)) && *p + 1 < end) {
150     char16_t lead = **p;
151     char16_t maybeTrail = *(*p + 1);
152     if (unicode::IsTrailSurrogate(maybeTrail)) {
153       *p += 2;
154       return unicode::UTF16Decode(lead, maybeTrail);
155     }
156   }
157 
158   codePoint = **p;
159   (*p)++;
160   return codePoint;
161 }
162 
// True iff |c|, viewed as an unsigned value of its own width, is the ASCII
// digit '0' or '1'.
template <typename CharT>
static constexpr bool IsAsciiBinary(CharT c) {
  switch (static_cast<std::make_unsigned_t<CharT>>(c)) {
    case '0':
    case '1':
      return true;
    default:
      return false;
  }
}
169 
// True iff |c|, viewed as an unsigned value of its own width, lies in the
// ASCII range '0'..'7'.
template <typename CharT>
static constexpr bool IsAsciiOctal(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  if (unit < '0') {
    return false;
  }
  return unit <= '7';
}
176 
// Converts an ASCII digit to its numeric value by subtracting '0' from the
// unsigned view of |c| and narrowing to uint8_t.  The input is not validated;
// callers are expected to pass an octal digit.
template <typename CharT>
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return static_cast<uint8_t>(unit - '0');
}
183 
184 namespace js {
185 
186 namespace frontend {
187 
IsIdentifier(JSLinearString * str)188 bool IsIdentifier(JSLinearString* str) {
189   JS::AutoCheckCannotGC nogc;
190   MOZ_ASSERT(str);
191   if (str->hasLatin1Chars()) {
192     return IsIdentifier(str->latin1Chars(nogc), str->length());
193   }
194   return IsIdentifier(str->twoByteChars(nogc), str->length());
195 }
196 
IsIdentifierNameOrPrivateName(JSLinearString * str)197 bool IsIdentifierNameOrPrivateName(JSLinearString* str) {
198   JS::AutoCheckCannotGC nogc;
199   MOZ_ASSERT(str);
200   if (str->hasLatin1Chars()) {
201     return IsIdentifierNameOrPrivateName(str->latin1Chars(nogc), str->length());
202   }
203   return IsIdentifierNameOrPrivateName(str->twoByteChars(nogc), str->length());
204 }
205 
IsIdentifier(const Latin1Char * chars,size_t length)206 bool IsIdentifier(const Latin1Char* chars, size_t length) {
207   if (length == 0) {
208     return false;
209   }
210 
211   if (!unicode::IsIdentifierStart(char16_t(*chars))) {
212     return false;
213   }
214 
215   const Latin1Char* end = chars + length;
216   while (++chars != end) {
217     if (!unicode::IsIdentifierPart(char16_t(*chars))) {
218       return false;
219     }
220   }
221 
222   return true;
223 }
224 
IsIdentifierASCII(char c)225 bool IsIdentifierASCII(char c) { return unicode::IsIdentifierStartASCII(c); }
226 
IsIdentifierASCII(char c1,char c2)227 bool IsIdentifierASCII(char c1, char c2) {
228   return unicode::IsIdentifierStartASCII(c1) &&
229          unicode::IsIdentifierPartASCII(c2);
230 }
231 
IsIdentifierNameOrPrivateName(const Latin1Char * chars,size_t length)232 bool IsIdentifierNameOrPrivateName(const Latin1Char* chars, size_t length) {
233   if (length == 0) {
234     return false;
235   }
236 
237   // Skip over any private name marker.
238   if (*chars == '#') {
239     ++chars;
240     --length;
241   }
242 
243   return IsIdentifier(chars, length);
244 }
245 
IsIdentifier(const char16_t * chars,size_t length)246 bool IsIdentifier(const char16_t* chars, size_t length) {
247   if (length == 0) {
248     return false;
249   }
250 
251   const char16_t* p = chars;
252   const char16_t* end = chars + length;
253   uint32_t codePoint;
254 
255   codePoint = GetSingleCodePoint(&p, end);
256   if (!unicode::IsIdentifierStart(codePoint)) {
257     return false;
258   }
259 
260   while (p < end) {
261     codePoint = GetSingleCodePoint(&p, end);
262     if (!unicode::IsIdentifierPart(codePoint)) {
263       return false;
264     }
265   }
266 
267   return true;
268 }
269 
IsIdentifierNameOrPrivateName(const char16_t * chars,size_t length)270 bool IsIdentifierNameOrPrivateName(const char16_t* chars, size_t length) {
271   if (length == 0) {
272     return false;
273   }
274 
275   const char16_t* p = chars;
276   const char16_t* end = chars + length;
277   uint32_t codePoint;
278 
279   codePoint = GetSingleCodePoint(&p, end);
280 
281   // Skip over any private name marker.
282   if (codePoint == '#') {
283     // The identifier part of a private name mustn't be empty.
284     if (length == 1) {
285       return false;
286     }
287 
288     codePoint = GetSingleCodePoint(&p, end);
289   }
290 
291   if (!unicode::IsIdentifierStart(codePoint)) {
292     return false;
293   }
294 
295   while (p < end) {
296     codePoint = GetSingleCodePoint(&p, end);
297     if (!unicode::IsIdentifierPart(codePoint)) {
298       return false;
299     }
300   }
301 
302   return true;
303 }
304 
IsKeyword(TaggedParserAtomIndex atom)305 bool IsKeyword(TaggedParserAtomIndex atom) {
306   if (const ReservedWordInfo* rw = FindReservedWord(atom)) {
307     return TokenKindIsKeyword(rw->tokentype);
308   }
309 
310   return false;
311 }
312 
ReservedWordTokenKind(TaggedParserAtomIndex name)313 TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
314   if (const ReservedWordInfo* rw = FindReservedWord(name)) {
315     return rw->tokentype;
316   }
317 
318   return TokenKind::Limit;
319 }
320 
ReservedWordToCharZ(TaggedParserAtomIndex name)321 const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
322   if (const ReservedWordInfo* rw = FindReservedWord(name)) {
323     return ReservedWordToCharZ(rw->tokentype);
324   }
325 
326   return nullptr;
327 }
328 
ReservedWordToCharZ(TokenKind tt)329 const char* ReservedWordToCharZ(TokenKind tt) {
330   MOZ_ASSERT(tt != TokenKind::Name);
331   switch (tt) {
332 #define EMIT_CASE(word, name, type) \
333   case type:                        \
334     return js_##word##_str;
335     FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
336 #undef EMIT_CASE
337     default:
338       MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
339   }
340   return nullptr;
341 }
342 
reservedWordToPropertyName(TokenKind tt) const343 TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(
344     TokenKind tt) const {
345   MOZ_ASSERT(tt != TokenKind::Name);
346   switch (tt) {
347 #define EMIT_CASE(word, name, type) \
348   case type:                        \
349     return TaggedParserAtomIndex::WellKnown::name();
350     FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
351 #undef EMIT_CASE
352     default:
353       MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
354   }
355   return TaggedParserAtomIndex::null();
356 }
357 
SourceCoords(JSContext * cx,uint32_t initialLineNumber,uint32_t initialOffset)358 SourceCoords::SourceCoords(JSContext* cx, uint32_t initialLineNumber,
359                            uint32_t initialOffset)
360     : lineStartOffsets_(cx), initialLineNum_(initialLineNumber), lastIndex_(0) {
361   // This is actually necessary!  Removing it causes compile errors on
362   // GCC and clang.  You could try declaring this:
363   //
364   //   const uint32_t SourceCoords::MAX_PTR;
365   //
366   // which fixes the GCC/clang error, but causes bustage on Windows.  Sigh.
367   //
368   uint32_t maxPtr = MAX_PTR;
369 
370   // The first line begins at buffer offset |initialOffset|.  MAX_PTR is the
371   // sentinel.  The appends cannot fail because |lineStartOffsets_| has
372   // statically-allocated elements.
373   MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
374   MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
375   lineStartOffsets_.infallibleAppend(initialOffset);
376   lineStartOffsets_.infallibleAppend(maxPtr);
377 }
378 
add(uint32_t lineNum,uint32_t lineStartOffset)379 MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
380                                          uint32_t lineStartOffset) {
381   uint32_t index = indexFromLineNumber(lineNum);
382   uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
383 
384   MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
385   MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);
386 
387   if (index == sentinelIndex) {
388     // We haven't seen this newline before.  Update lineStartOffsets_
389     // only if lineStartOffsets_.append succeeds, to keep sentinel.
390     // Otherwise return false to tell TokenStream about OOM.
391     uint32_t maxPtr = MAX_PTR;
392     if (!lineStartOffsets_.append(maxPtr)) {
393       static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
394                                    TempAllocPolicy&>,
395                     "this function's caller depends on it reporting an "
396                     "error on failure, as TempAllocPolicy ensures");
397       return false;
398     }
399 
400     lineStartOffsets_[index] = lineStartOffset;
401   } else {
402     // We have seen this newline before (and ungot it).  Do nothing (other
403     // than checking it hasn't mysteriously changed).
404     // This path can be executed after hitting OOM, so check index.
405     MOZ_ASSERT_IF(index < sentinelIndex,
406                   lineStartOffsets_[index] == lineStartOffset);
407   }
408   return true;
409 }
410 
fill(const SourceCoords & other)411 MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
412   MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
413   MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
414   MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);
415 
416   if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) {
417     return true;
418   }
419 
420   uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
421   lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];
422 
423   for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
424        i++) {
425     if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
426       return false;
427     }
428   }
429   return true;
430 }
431 
432 MOZ_ALWAYS_INLINE uint32_t
indexFromOffset(uint32_t offset) const433 SourceCoords::indexFromOffset(uint32_t offset) const {
434   uint32_t iMin, iMax, iMid;
435 
436   if (lineStartOffsets_[lastIndex_] <= offset) {
437     // If we reach here, offset is on a line the same as or higher than
438     // last time.  Check first for the +0, +1, +2 cases, because they
439     // typically cover 85--98% of cases.
440     if (offset < lineStartOffsets_[lastIndex_ + 1]) {
441       return lastIndex_;  // index is same as last time
442     }
443 
444     // If we reach here, there must be at least one more entry (plus the
445     // sentinel).  Try it.
446     lastIndex_++;
447     if (offset < lineStartOffsets_[lastIndex_ + 1]) {
448       return lastIndex_;  // index is one higher than last time
449     }
450 
451     // The same logic applies here.
452     lastIndex_++;
453     if (offset < lineStartOffsets_[lastIndex_ + 1]) {
454       return lastIndex_;  // index is two higher than last time
455     }
456 
457     // No luck.  Oh well, we have a better-than-default starting point for
458     // the binary search.
459     iMin = lastIndex_ + 1;
460     MOZ_ASSERT(iMin <
461                lineStartOffsets_.length() - 1);  // -1 due to the sentinel
462 
463   } else {
464     iMin = 0;
465   }
466 
467   // This is a binary search with deferred detection of equality, which was
468   // marginally faster in this case than a standard binary search.
469   // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
470   // want one before that.
471   iMax = lineStartOffsets_.length() - 2;
472   while (iMax > iMin) {
473     iMid = iMin + (iMax - iMin) / 2;
474     if (offset >= lineStartOffsets_[iMid + 1]) {
475       iMin = iMid + 1;  // offset is above lineStartOffsets_[iMid]
476     } else {
477       iMax = iMid;  // offset is below or within lineStartOffsets_[iMid]
478     }
479   }
480 
481   MOZ_ASSERT(iMax == iMin);
482   MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
483   MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
484 
485   lastIndex_ = iMin;
486   return iMin;
487 }
488 
lineToken(uint32_t offset) const489 SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
490   return LineToken(indexFromOffset(offset), offset);
491 }
492 
TokenStreamAnyChars(JSContext * cx,const ReadOnlyCompileOptions & options,StrictModeGetter * smg)493 TokenStreamAnyChars::TokenStreamAnyChars(JSContext* cx,
494                                          const ReadOnlyCompileOptions& options,
495                                          StrictModeGetter* smg)
496     : cx(cx),
497       options_(options),
498       strictModeGetter_(smg),
499       filename_(options.filename()),
500       longLineColumnInfo_(cx),
501       srcCoords(cx, options.lineno, options.scriptSourceOffset),
502       lineno(options.lineno),
503       mutedErrors(options.mutedErrors()) {
504   // |isExprEnding| was initially zeroed: overwrite the true entries here.
505   isExprEnding[size_t(TokenKind::Comma)] = true;
506   isExprEnding[size_t(TokenKind::Semi)] = true;
507   isExprEnding[size_t(TokenKind::Colon)] = true;
508   isExprEnding[size_t(TokenKind::RightParen)] = true;
509   isExprEnding[size_t(TokenKind::RightBracket)] = true;
510   isExprEnding[size_t(TokenKind::RightCurly)] = true;
511 }
512 
513 template <typename Unit>
TokenStreamCharsBase(JSContext * cx,ParserAtomsTable * pasrerAtoms,const Unit * units,size_t length,size_t startOffset)514 TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
515                                                  ParserAtomsTable* pasrerAtoms,
516                                                  const Unit* units,
517                                                  size_t length,
518                                                  size_t startOffset)
519     : TokenStreamCharsShared(cx, pasrerAtoms),
520       sourceUnits(units, length, startOffset) {}
521 
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const char16_t * cur,const char16_t * end)522 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
523                                                         const char16_t* cur,
524                                                         const char16_t* end) {
525   MOZ_ASSERT(charBuffer.length() == 0);
526 
527   while (cur < end) {
528     char16_t ch = *cur++;
529     if (ch == '\r') {
530       ch = '\n';
531       if (cur < end && *cur == '\n') {
532         cur++;
533       }
534     }
535 
536     if (!charBuffer.append(ch)) {
537       return false;
538     }
539   }
540 
541   MOZ_ASSERT(cur == end);
542   return true;
543 }
544 
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const Utf8Unit * cur,const Utf8Unit * end)545 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
546                                                         const Utf8Unit* cur,
547                                                         const Utf8Unit* end) {
548   MOZ_ASSERT(charBuffer.length() == 0);
549 
550   while (cur < end) {
551     Utf8Unit unit = *cur++;
552     if (MOZ_LIKELY(IsAscii(unit))) {
553       char16_t ch = unit.toUint8();
554       if (ch == '\r') {
555         ch = '\n';
556         if (cur < end && *cur == Utf8Unit('\n')) {
557           cur++;
558         }
559       }
560 
561       if (!charBuffer.append(ch)) {
562         return false;
563       }
564 
565       continue;
566     }
567 
568     Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end);
569     MOZ_ASSERT(ch.isSome(),
570                "provided source text should already have been validated");
571 
572     if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
573       return false;
574     }
575   }
576 
577   MOZ_ASSERT(cur == end);
578   return true;
579 }
580 
581 template <typename Unit, class AnyCharsAccess>
TokenStreamSpecific(JSContext * cx,ParserAtomsTable * pasrerAtoms,const ReadOnlyCompileOptions & options,const Unit * units,size_t length)582 TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
583     JSContext* cx, ParserAtomsTable* pasrerAtoms,
584     const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
585     : TokenStreamChars<Unit, AnyCharsAccess>(cx, pasrerAtoms, units, length,
586                                              options.scriptSourceOffset) {}
587 
checkOptions()588 bool TokenStreamAnyChars::checkOptions() {
589   // Constrain starting columns to where they will saturate.
590   if (options().column > ColumnLimit) {
591     reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
592     return false;
593   }
594 
595   return true;
596 }
597 
reportErrorNoOffset(unsigned errorNumber,...)598 void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) {
599   va_list args;
600   va_start(args, errorNumber);
601 
602   reportErrorNoOffsetVA(errorNumber, &args);
603 
604   va_end(args);
605 }
606 
reportErrorNoOffsetVA(unsigned errorNumber,va_list * args)607 void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
608                                                 va_list* args) {
609   ErrorMetadata metadata;
610   computeErrorMetadataNoOffset(&metadata);
611 
612   ReportCompileErrorLatin1(cx, std::move(metadata), nullptr, errorNumber, args);
613 }
614 
615 [[nodiscard]] MOZ_ALWAYS_INLINE bool
internalUpdateLineInfoForEOL(uint32_t lineStartOffset)616 TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
617   prevLinebase = linebase;
618   linebase = lineStartOffset;
619   lineno++;
620 
621   // On overflow, report error.
622   if (MOZ_UNLIKELY(!lineno)) {
623     reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
624     return false;
625   }
626 
627   return srcCoords.add(lineno, linebase);
628 }
629 
630 #ifdef DEBUG
631 
632 template <>
assertNextCodePoint(const PeekedCodePoint<char16_t> & peeked)633 inline void SourceUnits<char16_t>::assertNextCodePoint(
634     const PeekedCodePoint<char16_t>& peeked) {
635   char32_t c = peeked.codePoint();
636   if (c < unicode::NonBMPMin) {
637     MOZ_ASSERT(peeked.lengthInUnits() == 1);
638     MOZ_ASSERT(ptr[0] == c);
639   } else {
640     MOZ_ASSERT(peeked.lengthInUnits() == 2);
641     char16_t lead, trail;
642     unicode::UTF16Encode(c, &lead, &trail);
643     MOZ_ASSERT(ptr[0] == lead);
644     MOZ_ASSERT(ptr[1] == trail);
645   }
646 }
647 
648 template <>
assertNextCodePoint(const PeekedCodePoint<Utf8Unit> & peeked)649 inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
650     const PeekedCodePoint<Utf8Unit>& peeked) {
651   char32_t c = peeked.codePoint();
652 
653   // This is all roughly indulgence of paranoia only for assertions, so the
654   // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
655   uint8_t expectedUnits[4] = {};
656   if (c < 0x80) {
657     expectedUnits[0] = AssertedCast<uint8_t>(c);
658   } else if (c < 0x800) {
659     expectedUnits[0] = 0b1100'0000 | (c >> 6);
660     expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
661   } else if (c < 0x10000) {
662     expectedUnits[0] = 0b1110'0000 | (c >> 12);
663     expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
664     expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
665   } else {
666     expectedUnits[0] = 0b1111'0000 | (c >> 18);
667     expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
668     expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
669     expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
670   }
671 
672   MOZ_ASSERT(peeked.lengthInUnits() <= 4);
673   for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
674     MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
675   }
676 }
677 
678 #endif  // DEBUG
679 
RetractPointerToCodePointBoundary(const Utf8Unit ** ptr,const Utf8Unit * limit)680 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
681     const Utf8Unit** ptr, const Utf8Unit* limit) {
682   MOZ_ASSERT(*ptr <= limit);
683 
684   // |limit| is a code point boundary.
685   if (MOZ_UNLIKELY(*ptr == limit)) {
686     return;
687   }
688 
689   // Otherwise rewind past trailing units to the start of the code point.
690 #ifdef DEBUG
691   size_t retracted = 0;
692 #endif
693   while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
694     --*ptr;
695 #ifdef DEBUG
696     retracted++;
697 #endif
698   }
699 
700   MOZ_ASSERT(retracted < 4,
701              "the longest UTF-8 code point is four units, so this should never "
702              "retract more than three units");
703 }
704 
RetractPointerToCodePointBoundary(const char16_t ** ptr,const char16_t * limit)705 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
706     const char16_t** ptr, const char16_t* limit) {
707   MOZ_ASSERT(*ptr <= limit);
708 
709   // |limit| is a code point boundary.
710   if (MOZ_UNLIKELY(*ptr == limit)) {
711     return;
712   }
713 
714   // Otherwise the pointer must be retracted by one iff it splits a two-unit
715   // code point.
716   if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) {
717     // Outside test suites testing garbage WTF-16, it's basically guaranteed
718     // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair.
719     if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
720       --*ptr;
721     }
722   }
723 }
724 
725 template <typename Unit>
computePartialColumn(const LineToken lineToken,const uint32_t offset,const SourceUnits<Unit> & sourceUnits) const726 uint32_t TokenStreamAnyChars::computePartialColumn(
727     const LineToken lineToken, const uint32_t offset,
728     const SourceUnits<Unit>& sourceUnits) const {
729   lineToken.assertConsistentOffset(offset);
730 
731   const uint32_t line = lineNumber(lineToken);
732   const uint32_t start = srcCoords.lineStart(lineToken);
733 
734   // Reset the previous offset/column cache for this line, if the previous
735   // lookup wasn't on this line.
736   if (line != lineOfLastColumnComputation_) {
737     lineOfLastColumnComputation_ = line;
738     lastChunkVectorForLine_ = nullptr;
739     lastOffsetOfComputedColumn_ = start;
740     lastComputedColumn_ = 0;
741   }
742 
743   // Compute and return the final column number from a partial offset/column,
744   // using the last-cached offset/column if they're more optimal.
745   auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
746                                                         uint32_t partialCols,
747                                                         UnitsType unitsType) {
748     MOZ_ASSERT(partialOffset <= offset);
749 
750     // If the last lookup on this line was closer to |offset|, use it.
751     if (partialOffset < this->lastOffsetOfComputedColumn_ &&
752         this->lastOffsetOfComputedColumn_ <= offset) {
753       partialOffset = this->lastOffsetOfComputedColumn_;
754       partialCols = this->lastComputedColumn_;
755     }
756 
757     const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
758     const Unit* end = sourceUnits.codeUnitPtrAt(offset);
759 
760     size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
761     partialOffset += offsetDelta;
762 
763     if (unitsType == UnitsType::GuaranteedSingleUnit) {
764       MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
765                  "guaranteed-single-units also guarantee pointer distance "
766                  "equals code point count");
767       partialCols += offsetDelta;
768     } else {
769       partialCols +=
770           AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
771     }
772 
773     this->lastOffsetOfComputedColumn_ = partialOffset;
774     this->lastComputedColumn_ = partialCols;
775     return partialCols;
776   };
777 
778   const uint32_t offsetInLine = offset - start;
779 
780   // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
781   // column has offset less than this value.  The most common (non-minified)
782   // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
783   // the next power of two for efficient division/multiplication below.
784   constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
785 
786   // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
787   const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
788   if (chunkIndex == 0) {
789     // We don't know from an |offset| in the zeroth chunk that this line is even
790     // long.  First-chunk info is mostly useless, anyway -- we have |start|
791     // already.  So if we have *easy* access to that zeroth chunk, use it --
792     // otherwise just count pessimally.  (This will still benefit from caching
793     // the last column/offset for computations for successive offsets, so it's
794     // not *always* worst-case.)
795     UnitsType unitsType;
796     if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
797       MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
798       unitsType = (*lastChunkVectorForLine_)[0].unitsType();
799     } else {
800       unitsType = UnitsType::PossiblyMultiUnit;
801     }
802 
803     return ColumnFromPartial(start, 0, unitsType);
804   }
805 
806   // If this line has no chunk vector yet, insert one in the hash map.  (The
807   // required index is allocated and filled further down.)
808   if (!lastChunkVectorForLine_) {
809     auto ptr = longLineColumnInfo_.lookupForAdd(line);
810     if (!ptr) {
811       // This could rehash and invalidate a cached vector pointer, but the outer
812       // condition means we don't have a cached pointer.
813       if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
814         // In case of OOM, just count columns from the start of the line.
815         cx->recoverFromOutOfMemory();
816         return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
817       }
818     }
819 
820     // Note that adding elements to this vector won't invalidate this pointer.
821     lastChunkVectorForLine_ = &ptr->value();
822   }
823 
824   const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
825 
826   auto RetractedOffsetOfChunk = [
827 #ifdef DEBUG
828                                     this,
829 #endif
830                                     start, limit,
831                                     &sourceUnits](uint32_t index) {
832     MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
833 
834     uint32_t naiveOffset = start + index * ColumnChunkLength;
835     const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
836 
837     const Unit* actualPtr = naivePtr;
838     RetractPointerToCodePointBoundary(&actualPtr, limit);
839 
840 #ifdef DEBUG
841     if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
842         UnitsType::GuaranteedSingleUnit) {
843       MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
844     }
845 #endif
846 
847     return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
848   };
849 
850   uint32_t partialOffset;
851   uint32_t partialColumn;
852   UnitsType unitsType;
853 
854   auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
855   if (chunkIndex < entriesLen) {
856     // We've computed the chunk |offset| resides in.  Compute the column number
857     // from the chunk.
858     partialOffset = RetractedOffsetOfChunk(chunkIndex);
859     partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
860 
861     // This is exact if |chunkIndex| isn't the last chunk.
862     unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
863 
864     // Otherwise the last chunk is pessimistically assumed to contain multi-unit
865     // code points because we haven't fully examined its contents yet -- they
866     // may not have been tokenized yet, they could contain encoding errors, or
867     // they might not even exist.
868     MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
869                   (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
870                       UnitsType::PossiblyMultiUnit);
871   } else {
872     // Extend the vector from its last entry or the start of the line.  (This is
873     // also a suitable partial start point if we must recover from OOM.)
874     if (entriesLen > 0) {
875       partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
876       partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
877     } else {
878       partialOffset = start;
879       partialColumn = 0;
880     }
881 
882     if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
883       // As earlier, just start from the greatest offset/column in case of OOM.
884       cx->recoverFromOutOfMemory();
885       return ColumnFromPartial(partialOffset, partialColumn,
886                                UnitsType::PossiblyMultiUnit);
887     }
888 
889     // OOM is no longer possible now.  \o/
890 
891     // The vector always begins with the column of the line start, i.e. zero,
892     // with chunk units pessimally assumed not single-unit.
893     if (entriesLen == 0) {
894       lastChunkVectorForLine_->infallibleAppend(
895           ChunkInfo(0, UnitsType::PossiblyMultiUnit));
896       entriesLen++;
897     }
898 
899     do {
900       const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
901       const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
902           start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
903 
904       MOZ_ASSERT(begin < chunkLimit);
905       MOZ_ASSERT(chunkLimit <= limit);
906 
907       static_assert(
908           ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
909           "any retraction below is assumed to never underflow to the "
910           "preceding chunk, even for the longest code point");
911 
912       // Prior tokenizing ensured that [begin, limit) is validly encoded, and
913       // |begin < chunkLimit|, so any retraction here can't underflow.
914       RetractPointerToCodePointBoundary(&chunkLimit, limit);
915 
916       MOZ_ASSERT(begin < chunkLimit);
917       MOZ_ASSERT(chunkLimit <= limit);
918 
919       size_t numUnits = PointerRangeSize(begin, chunkLimit);
920       size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
921 
922       // If this chunk (which will become non-final at the end of the loop) is
923       // all single-unit code points, annotate the chunk accordingly.
924       if (numUnits == numCodePoints) {
925         lastChunkVectorForLine_->back().guaranteeSingleUnits();
926       }
927 
928       partialOffset += numUnits;
929       partialColumn += numCodePoints;
930 
931       lastChunkVectorForLine_->infallibleEmplaceBack(
932           partialColumn, UnitsType::PossiblyMultiUnit);
933     } while (entriesLen < chunkIndex + 1);
934 
935     // We're at a spot in the current final chunk, and final chunks never have
936     // complete units information, so be pessimistic.
937     unitsType = UnitsType::PossiblyMultiUnit;
938   }
939 
940   return ColumnFromPartial(partialOffset, partialColumn, unitsType);
941 }
942 
943 template <typename Unit, class AnyCharsAccess>
computeColumn(LineToken lineToken,uint32_t offset) const944 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
945     LineToken lineToken, uint32_t offset) const {
946   lineToken.assertConsistentOffset(offset);
947 
948   const TokenStreamAnyChars& anyChars = anyCharsAccess();
949 
950   uint32_t column =
951       anyChars.computePartialColumn(lineToken, offset, this->sourceUnits);
952 
953   if (lineToken.isFirstLine()) {
954     if (column > ColumnLimit) {
955       return ColumnLimit;
956     }
957 
958     static_assert(uint32_t(ColumnLimit + ColumnLimit) > ColumnLimit,
959                   "Adding ColumnLimit should not overflow");
960 
961     uint32_t firstLineOffset = anyChars.options_.column;
962     column += firstLineOffset;
963   }
964 
965   if (column > ColumnLimit) {
966     return ColumnLimit;
967   }
968 
969   return column;
970 }
971 
972 template <typename Unit, class AnyCharsAccess>
computeLineAndColumn(uint32_t offset,uint32_t * line,uint32_t * column) const973 void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
974     uint32_t offset, uint32_t* line, uint32_t* column) const {
975   const TokenStreamAnyChars& anyChars = anyCharsAccess();
976 
977   auto lineToken = anyChars.lineToken(offset);
978   *line = anyChars.lineNumber(lineToken);
979   *column = computeColumn(lineToken, offset);
980 }
981 
982 template <class AnyCharsAccess>
internalEncodingError(uint8_t relevantUnits,unsigned errorNumber,...)983 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
984     uint8_t relevantUnits, unsigned errorNumber, ...) {
985   va_list args;
986   va_start(args, errorNumber);
987 
988   do {
989     size_t offset = this->sourceUnits.offset();
990 
991     ErrorMetadata err;
992 
993     TokenStreamAnyChars& anyChars = anyCharsAccess();
994 
995     bool canAddLineOfContext = fillExceptingContext(&err, offset);
996     if (canAddLineOfContext) {
997       if (!internalComputeLineOfContext(&err, offset)) {
998         break;
999       }
1000 
1001       // As this is an encoding error, the computed window-end must be
1002       // identical to the location of the error -- any further on and the
1003       // window would contain invalid Unicode.
1004       MOZ_ASSERT_IF(err.lineOfContext != nullptr,
1005                     err.lineLength == err.tokenOffset);
1006     }
1007 
1008     auto notes = MakeUnique<JSErrorNotes>();
1009     if (!notes) {
1010       ReportOutOfMemory(anyChars.cx);
1011       break;
1012     }
1013 
1014     // The largest encoding of a UTF-8 code point is 4 units.  (Encoding an
1015     // obsolete 5- or 6-byte code point will complain only about a bad lead
1016     // code unit.)
1017     constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
1018 
1019     MOZ_ASSERT(relevantUnits > 0);
1020 
1021     char badUnitsStr[MaxWidth];
1022     char* ptr = badUnitsStr;
1023     while (relevantUnits > 0) {
1024       byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
1025       ptr[4] = ' ';
1026 
1027       ptr += 5;
1028       relevantUnits--;
1029     }
1030 
1031     ptr[-1] = '\0';
1032 
1033     uint32_t line, column;
1034     computeLineAndColumn(offset, &line, &column);
1035 
1036     if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), 0, line,
1037                              column, GetErrorMessage, nullptr,
1038                              JSMSG_BAD_CODE_UNITS, badUnitsStr)) {
1039       break;
1040     }
1041 
1042     ReportCompileErrorLatin1(anyChars.cx, std::move(err), std::move(notes),
1043                              errorNumber, &args);
1044   } while (false);
1045 
1046   va_end(args);
1047 }
1048 
1049 template <class AnyCharsAccess>
badLeadUnit(Utf8Unit lead)1050 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
1051     Utf8Unit lead) {
1052   uint8_t leadValue = lead.toUint8();
1053 
1054   char leadByteStr[5];
1055   byteToTerminatedString(leadValue, leadByteStr);
1056 
1057   internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
1058 }
1059 
1060 template <class AnyCharsAccess>
notEnoughUnits(Utf8Unit lead,uint8_t remaining,uint8_t required)1061 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
1062     Utf8Unit lead, uint8_t remaining, uint8_t required) {
1063   uint8_t leadValue = lead.toUint8();
1064 
1065   MOZ_ASSERT(required == 2 || required == 3 || required == 4);
1066   MOZ_ASSERT(remaining < 4);
1067   MOZ_ASSERT(remaining < required);
1068 
1069   char leadByteStr[5];
1070   byteToTerminatedString(leadValue, leadByteStr);
1071 
1072   // |toHexChar| produces the desired decimal numbers for values < 4.
1073   const char expectedStr[] = {toHexChar(required - 1), '\0'};
1074   const char actualStr[] = {toHexChar(remaining - 1), '\0'};
1075 
1076   internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
1077                         expectedStr, required == 2 ? "" : "s", actualStr,
1078                         remaining == 2 ? " was" : "s were");
1079 }
1080 
1081 template <class AnyCharsAccess>
badTrailingUnit(uint8_t unitsObserved)1082 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
1083     uint8_t unitsObserved) {
1084   Utf8Unit badUnit =
1085       this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];
1086 
1087   char badByteStr[5];
1088   byteToTerminatedString(badUnit.toUint8(), badByteStr);
1089 
1090   internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
1091                         badByteStr);
1092 }
1093 
1094 template <class AnyCharsAccess>
1095 MOZ_COLD void
badStructurallyValidCodePoint(uint32_t codePoint,uint8_t codePointLength,const char * reason)1096 TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
1097     uint32_t codePoint, uint8_t codePointLength, const char* reason) {
1098   // Construct a string like "0x203D" (including null terminator) to include
1099   // in the error message.  Write the string end-to-start from end to start
1100   // of an adequately sized |char| array, shifting least significant nibbles
1101   // off the number and writing the corresponding hex digits until done, then
1102   // prefixing with "0x".  |codePointStr| points at the incrementally
1103   // computed string, within |codePointCharsArray|'s bounds.
1104 
1105   // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
1106   // bits in a four-byte UTF-8 code unit sequence.
1107   constexpr size_t MaxHexSize = sizeof(
1108       "0x1F"
1109       "FFFF");  // including '\0'
1110   char codePointCharsArray[MaxHexSize];
1111 
1112   char* codePointStr = std::end(codePointCharsArray);
1113   *--codePointStr = '\0';
1114 
1115   // Note that by do-while looping here rather than while-looping, this
1116   // writes a '0' when |codePoint == 0|.
1117   do {
1118     MOZ_ASSERT(codePointCharsArray < codePointStr);
1119     *--codePointStr = toHexChar(codePoint & 0xF);
1120     codePoint >>= 4;
1121   } while (codePoint);
1122 
1123   MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
1124   *--codePointStr = 'x';
1125   *--codePointStr = '0';
1126 
1127   internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
1128                         codePointStr, reason);
1129 }
1130 
1131 template <class AnyCharsAccess>
1132 [[nodiscard]] bool
getNonAsciiCodePointDontNormalize(Utf8Unit lead,char32_t * codePoint)1133 TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
1134     Utf8Unit lead, char32_t* codePoint) {
1135   auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1136 
1137   auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
1138     this->notEnoughUnits(lead, remaining, required);
1139   };
1140 
1141   auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
1142     this->badTrailingUnit(unitsObserved);
1143   };
1144 
1145   auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
1146     this->badCodePoint(badCodePoint, unitsObserved);
1147   };
1148 
1149   auto onNotShortestForm = [this](char32_t badCodePoint,
1150                                   uint8_t unitsObserved) {
1151     this->notShortestForm(badCodePoint, unitsObserved);
1152   };
1153 
1154   // If a valid code point is decoded, this function call consumes its code
1155   // units.  If not, it ungets the lead code unit and invokes the right error
1156   // handler, so on failure we must immediately return false.
1157   SourceUnitsIterator iter(this->sourceUnits);
1158   Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
1159       lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1160       onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1161   if (maybeCodePoint.isNothing()) {
1162     return false;
1163   }
1164 
1165   *codePoint = maybeCodePoint.value();
1166   return true;
1167 }
1168 
1169 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t lead,int32_t * codePoint)1170 bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
1171     int32_t lead, int32_t* codePoint) {
1172   MOZ_ASSERT(lead != EOF);
1173   MOZ_ASSERT(!isAsciiCodePoint(lead),
1174              "ASCII code unit/point must be handled separately");
1175   MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1176              "getNonAsciiCodePoint called incorrectly");
1177 
1178   // The code point is usually |lead|: overwrite later if needed.
1179   *codePoint = lead;
1180 
1181   // ECMAScript specifically requires that unpaired UTF-16 surrogates be
1182   // treated as the corresponding code point and not as an error.  See
1183   // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
1184   // Thus this function does not consider any sequence of 16-bit numbers to
1185   // be intrinsically in error.
1186 
1187   // Dispense with single-unit code points and lone trailing surrogates.
1188   if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
1189     if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
1190                      lead == unicode::PARA_SEPARATOR)) {
1191       if (!updateLineInfoForEOL()) {
1192 #ifdef DEBUG
1193         *codePoint = EOF;  // sentinel value to hopefully cause errors
1194 #endif
1195         MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1196         return false;
1197       }
1198 
1199       *codePoint = '\n';
1200     } else {
1201       MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1202     }
1203 
1204     return true;
1205   }
1206 
1207   // Also handle a lead surrogate not paired with a trailing surrogate.
1208   if (MOZ_UNLIKELY(
1209           this->sourceUnits.atEnd() ||
1210           !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1211     MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1212     return true;
1213   }
1214 
1215   // Otherwise we have a multi-unit code point.
1216   *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1217   MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1218   return true;
1219 }
1220 
1221 template <typename Unit, class AnyCharsAccess>
getCodePoint(int32_t * cp)1222 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getCodePoint(int32_t* cp) {
1223   int32_t unit = getCodeUnit();
1224   if (unit == EOF) {
1225     MOZ_ASSERT(anyCharsAccess().flags.isEOF,
1226                "flags.isEOF should have been set by getCodeUnit()");
1227     *cp = EOF;
1228     return true;
1229   }
1230 
1231   if (isAsciiCodePoint(unit)) {
1232     return getFullAsciiCodePoint(unit, cp);
1233   }
1234 
1235   return getNonAsciiCodePoint(unit, cp);
1236 }
1237 
1238 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t unit,int32_t * codePoint)1239 bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
1240     int32_t unit, int32_t* codePoint) {
1241   MOZ_ASSERT(unit != EOF);
1242   MOZ_ASSERT(!isAsciiCodePoint(unit),
1243              "ASCII code unit/point must be handled separately");
1244 
1245   Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
1246   MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1247              "getNonAsciiCodePoint called incorrectly");
1248 
1249   auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1250 
1251   auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
1252                                         uint_fast8_t required) {
1253     this->notEnoughUnits(lead, remaining, required);
1254   };
1255 
1256   auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
1257     this->badTrailingUnit(unitsObserved);
1258   };
1259 
1260   auto onBadCodePoint = [this](char32_t badCodePoint,
1261                                uint_fast8_t unitsObserved) {
1262     this->badCodePoint(badCodePoint, unitsObserved);
1263   };
1264 
1265   auto onNotShortestForm = [this](char32_t badCodePoint,
1266                                   uint_fast8_t unitsObserved) {
1267     this->notShortestForm(badCodePoint, unitsObserved);
1268   };
1269 
1270   // This consumes the full, valid code point or ungets |lead| and calls the
1271   // appropriate error functor on failure.
1272   SourceUnitsIterator iter(this->sourceUnits);
1273   Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
1274       lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1275       onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1276   if (maybeCodePoint.isNothing()) {
1277     return false;
1278   }
1279 
1280   char32_t cp = maybeCodePoint.value();
1281   if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
1282                    cp == unicode::PARA_SEPARATOR)) {
1283     if (!updateLineInfoForEOL()) {
1284 #ifdef DEBUG
1285       *codePoint = EOF;  // sentinel value to hopefully cause errors
1286 #endif
1287       MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1288       return false;
1289     }
1290 
1291     *codePoint = '\n';
1292   } else {
1293     MOZ_ASSERT(!IsLineTerminator(cp));
1294     *codePoint = AssertedCast<int32_t>(cp);
1295   }
1296 
1297   return true;
1298 }
1299 
1300 template <>
findWindowStart(size_t offset) const1301 size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
1302   // This is JS's understanding of UTF-16 that allows lone surrogates, so
1303   // we have to exclude lone surrogates from [windowStart, offset) ourselves.
1304 
1305   const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1306 
1307   const char16_t* const initial = codeUnitPtrAt(offset);
1308   const char16_t* p = initial;
1309 
1310   auto HalfWindowSize = [&p, &initial]() {
1311     return PointerRangeSize(p, initial);
1312   };
1313 
1314   while (true) {
1315     MOZ_ASSERT(earliestPossibleStart <= p);
1316     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1317     if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1318       break;
1319     }
1320 
1321     char16_t c = p[-1];
1322 
1323     // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1324     // string and template literals.  These code points do affect line and
1325     // column coordinates, even as they encode their literal values.
1326     if (IsLineTerminator(c)) {
1327       break;
1328     }
1329 
1330     // Don't allow invalid UTF-16 in pre-context.  (Current users don't
1331     // require this, and this behavior isn't currently imposed on
1332     // pre-context, but these facts might change someday.)
1333 
1334     if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
1335       break;
1336     }
1337 
1338     // Optimistically include the code unit, reverting below if needed.
1339     p--;
1340 
1341     // If it's not a surrogate at all, keep going.
1342     if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
1343       continue;
1344     }
1345 
1346     // Stop if we don't have a usable surrogate pair.
1347     if (HalfWindowSize() >= WindowRadius ||
1348         p <= earliestPossibleStart ||      // trail surrogate at low end
1349         !unicode::IsLeadSurrogate(p[-1]))  // no paired lead surrogate
1350     {
1351       p++;
1352       break;
1353     }
1354 
1355     p--;
1356   }
1357 
1358   MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1359   return offset - HalfWindowSize();
1360 }
1361 
1362 template <>
findWindowStart(size_t offset) const1363 size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
1364   // |offset| must be the location of the error or somewhere before it, so we
1365   // know preceding data is valid UTF-8.
1366 
1367   const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1368 
1369   const Utf8Unit* const initial = codeUnitPtrAt(offset);
1370   const Utf8Unit* p = initial;
1371 
1372   auto HalfWindowSize = [&p, &initial]() {
1373     return PointerRangeSize(p, initial);
1374   };
1375 
1376   while (true) {
1377     MOZ_ASSERT(earliestPossibleStart <= p);
1378     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1379     if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1380       break;
1381     }
1382 
1383     // Peek backward for a line break, and only decrement if there is none.
1384     uint8_t prev = p[-1].toUint8();
1385 
1386     // First check for the ASCII LineTerminators.
1387     if (prev == '\r' || prev == '\n') {
1388       break;
1389     }
1390 
1391     // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
1392     // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9).  If there
1393     // aren't three code units available, some comparison here will fail
1394     // before we'd underflow.
1395     if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
1396                      p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
1397       break;
1398     }
1399 
1400     // Rewind over the non-LineTerminator.  This can't underflow
1401     // |earliestPossibleStart| because it begins a code point.
1402     while (IsTrailingUnit(*--p)) {
1403       continue;
1404     }
1405 
1406     MOZ_ASSERT(earliestPossibleStart <= p);
1407 
1408     // But if we underflowed |WindowRadius|, adjust forward and stop.
1409     if (HalfWindowSize() > WindowRadius) {
1410       static_assert(WindowRadius > 3,
1411                     "skipping over non-lead code units below must not "
1412                     "advance past |offset|");
1413 
1414       while (IsTrailingUnit(*++p)) {
1415         continue;
1416       }
1417 
1418       MOZ_ASSERT(HalfWindowSize() < WindowRadius);
1419       break;
1420     }
1421   }
1422 
1423   MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1424   return offset - HalfWindowSize();
1425 }
1426 
1427 template <>
findWindowEnd(size_t offset) const1428 size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
1429   const char16_t* const initial = codeUnitPtrAt(offset);
1430   const char16_t* p = initial;
1431 
1432   auto HalfWindowSize = [&initial, &p]() {
1433     return PointerRangeSize(initial, p);
1434   };
1435 
1436   while (true) {
1437     MOZ_ASSERT(p <= limit_);
1438     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1439     if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1440       break;
1441     }
1442 
1443     char16_t c = *p;
1444 
1445     // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1446     // string and template literals.  These code points do affect line and
1447     // column coordinates, even as they encode their literal values.
1448     if (IsLineTerminator(c)) {
1449       break;
1450     }
1451 
1452     // Don't allow invalid UTF-16 in post-context.  (Current users don't
1453     // require this, and this behavior isn't currently imposed on
1454     // pre-context, but these facts might change someday.)
1455 
1456     if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) {
1457       break;
1458     }
1459 
1460     // Optimistically consume the code unit, ungetting it below if needed.
1461     p++;
1462 
1463     // If it's not a surrogate at all, keep going.
1464     if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) {
1465       continue;
1466     }
1467 
1468     // Retract if the lead surrogate would stand alone at the end of the
1469     // window.
1470     if (HalfWindowSize() >= WindowRadius ||  // split pair
1471         p >= limit_ ||                       // half-pair at end of source
1472         !unicode::IsTrailSurrogate(*p))      // no paired trail surrogate
1473     {
1474       p--;
1475       break;
1476     }
1477 
1478     p++;
1479   }
1480 
1481   return offset + HalfWindowSize();
1482 }
1483 
1484 template <>
findWindowEnd(size_t offset) const1485 size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
1486   const Utf8Unit* const initial = codeUnitPtrAt(offset);
1487   const Utf8Unit* p = initial;
1488 
1489   auto HalfWindowSize = [&initial, &p]() {
1490     return PointerRangeSize(initial, p);
1491   };
1492 
1493   while (true) {
1494     MOZ_ASSERT(p <= limit_);
1495     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1496     if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1497       break;
1498     }
1499 
1500     // A non-encoding error might be followed by an encoding error within
1501     // |maxEnd|, so we must validate as we go to not include invalid UTF-8
1502     // in the computed window.  What joy!
1503 
1504     Utf8Unit lead = *p;
1505     if (mozilla::IsAscii(lead)) {
1506       if (IsSingleUnitLineTerminator(lead)) {
1507         break;
1508       }
1509 
1510       p++;
1511       continue;
1512     }
1513 
1514     PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_);
1515     if (peeked.isNone()) {
1516       break;  // encoding error
1517     }
1518 
1519     char32_t c = peeked.codePoint();
1520     if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
1521                      c == unicode::PARA_SEPARATOR)) {
1522       break;
1523     }
1524 
1525     MOZ_ASSERT(!IsLineTerminator(c));
1526 
1527     uint8_t len = peeked.lengthInUnits();
1528     if (HalfWindowSize() + len > WindowRadius) {
1529       break;
1530     }
1531 
1532     p += len;
1533   }
1534 
1535   MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1536   return offset + HalfWindowSize();
1537 }
1538 
1539 template <typename Unit, class AnyCharsAccess>
advance(size_t position)1540 bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
1541   const Unit* end = this->sourceUnits.codeUnitPtrAt(position);
1542   while (this->sourceUnits.addressOfNextCodeUnit() < end) {
1543     int32_t c;
1544     if (!getCodePoint(&c)) {
1545       return false;
1546     }
1547   }
1548 
1549   TokenStreamAnyChars& anyChars = anyCharsAccess();
1550   Token* cur = const_cast<Token*>(&anyChars.currentToken());
1551   cur->pos.begin = this->sourceUnits.offset();
1552   cur->pos.end = cur->pos.begin;
1553 #ifdef DEBUG
1554   cur->type = TokenKind::Limit;
1555 #endif
1556   MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type));
1557   anyChars.lookahead = 0;
1558   return true;
1559 }
1560 
1561 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos)1562 void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
1563   TokenStreamAnyChars& anyChars = anyCharsAccess();
1564 
1565   this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
1566                                              /* allowPoisoned = */ true);
1567   anyChars.flags = pos.flags;
1568   anyChars.lineno = pos.lineno;
1569   anyChars.linebase = pos.linebase;
1570   anyChars.prevLinebase = pos.prevLinebase;
1571   anyChars.lookahead = pos.lookahead;
1572 
1573   anyChars.tokens[anyChars.cursor()] = pos.currentToken;
1574   for (unsigned i = 0; i < anyChars.lookahead; i++) {
1575     anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i];
1576   }
1577 }
1578 
1579 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos,const TokenStreamAnyChars & other)1580 bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
1581     const Position& pos, const TokenStreamAnyChars& other) {
1582   if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) {
1583     return false;
1584   }
1585 
1586   seekTo(pos);
1587   return true;
1588 }
1589 
computeErrorMetadataNoOffset(ErrorMetadata * err)1590 void TokenStreamAnyChars::computeErrorMetadataNoOffset(ErrorMetadata* err) {
1591   err->isMuted = mutedErrors;
1592   err->filename = filename_;
1593   err->lineNumber = 0;
1594   err->columnNumber = 0;
1595 
1596   MOZ_ASSERT(err->lineOfContext == nullptr);
1597 }
1598 
fillExceptingContext(ErrorMetadata * err,uint32_t offset)1599 bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
1600                                                uint32_t offset) {
1601   err->isMuted = mutedErrors;
1602 
1603   // If this TokenStreamAnyChars doesn't have location information, try to
1604   // get it from the caller.
1605   if (!filename_ && !cx->isHelperThreadContext()) {
1606     NonBuiltinFrameIter iter(cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
1607                              cx->realm()->principals());
1608     if (!iter.done() && iter.filename()) {
1609       err->filename = iter.filename();
1610       err->lineNumber = iter.computeLine(&err->columnNumber);
1611       return false;
1612     }
1613   }
1614 
1615   // Otherwise use this TokenStreamAnyChars's location information.
1616   err->filename = filename_;
1617   return true;
1618 }
1619 
1620 template <typename Unit, class AnyCharsAccess>
hasTokenizationStarted() const1621 bool TokenStreamSpecific<Unit, AnyCharsAccess>::hasTokenizationStarted() const {
1622   const TokenStreamAnyChars& anyChars = anyCharsAccess();
1623   return anyChars.isCurrentTokenType(TokenKind::Eof) && !anyChars.isEOF();
1624 }
1625 
1626 template <>
computeWindowOffsetAndLength(const char16_t * encodedWindow,size_t encodedTokenOffset,size_t * utf16TokenOffset,size_t encodedWindowLength,size_t * utf16WindowLength)1627 inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
1628     const char16_t* encodedWindow, size_t encodedTokenOffset,
1629     size_t* utf16TokenOffset, size_t encodedWindowLength,
1630     size_t* utf16WindowLength) {
1631   MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
1632 }
1633 
1634 template <>
computeWindowOffsetAndLength(const Utf8Unit * encodedWindow,size_t encodedTokenOffset,size_t * utf16TokenOffset,size_t encodedWindowLength,size_t * utf16WindowLength)1635 inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
1636     const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
1637     size_t* utf16TokenOffset, size_t encodedWindowLength,
1638     size_t* utf16WindowLength) {
1639   MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1640              "token offset must be within the window, and the two lambda "
1641              "calls below presume this ordering of values");
1642 
1643   const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength;
1644 
1645   size_t i = 0;
1646   auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) {
1647     while (encodedWindow < limit) {
1648       Utf8Unit lead = *encodedWindow++;
1649       if (MOZ_LIKELY(IsAscii(lead))) {
1650         // ASCII contributes a single UTF-16 code unit.
1651         i++;
1652         continue;
1653       }
1654 
1655       Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
1656       MOZ_ASSERT(cp.isSome(),
1657                  "computed window should only contain valid UTF-8");
1658 
1659       i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
1660     }
1661 
1662     return i;
1663   };
1664 
1665   // Compute the token offset from |i == 0| and the initial |encodedWindow|.
1666   const Utf8Unit* token = encodedWindow + encodedTokenOffset;
1667   MOZ_ASSERT(token <= encodedWindowEnd);
1668   *utf16TokenOffset = ComputeUtf16Count(token);
1669 
1670   // Compute the window length, picking up from |i| and |encodedWindow| that,
1671   // in general, were modified just above.
1672   *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
1673 }
1674 
1675 template <typename Unit>
addLineOfContext(ErrorMetadata * err,uint32_t offset)1676 bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
1677                                                   uint32_t offset) {
1678   // Rename the variable to make meaning clearer: an offset into source units
1679   // in Unit encoding.
1680   size_t encodedOffset = offset;
1681 
1682   // These are also offsets into source units in Unit encoding.
1683   size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
1684   size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);
1685 
1686   size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart;
1687   MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);
1688 
1689   // Don't add a useless "line" of context when the window ends up empty
1690   // because of an invalid encoding at the start of a line.
1691   if (encodedWindowLength == 0) {
1692     MOZ_ASSERT(err->lineOfContext == nullptr,
1693                "ErrorMetadata::lineOfContext must be null so we don't "
1694                "have to set the lineLength/tokenOffset fields");
1695     return true;
1696   }
1697 
1698   CharBuffer lineOfContext(cx);
1699 
1700   const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
1701   if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
1702           lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
1703     return false;
1704   }
1705 
1706   size_t utf16WindowLength = lineOfContext.length();
1707 
1708   // The windowed string is null-terminated.
1709   if (!lineOfContext.append('\0')) {
1710     return false;
1711   }
1712 
1713   err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
1714   if (!err->lineOfContext) {
1715     return false;
1716   }
1717 
1718   size_t encodedTokenOffset = encodedOffset - encodedWindowStart;
1719 
1720   MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1721              "token offset must be inside the window");
1722 
1723   // The length in UTF-8 code units of a code point is always greater than or
1724   // equal to the same code point's length in UTF-16 code points.  ASCII code
1725   // points are 1 unit in either encoding.  Code points in [U+0080, U+10000)
1726   // are 2-3 UTF-8 code units to 1 UTF-16 code unit.  And code points in
1727   // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units.
1728   //
1729   // Therefore, if encoded window length equals the length in UTF-16 (this is
1730   // always the case for Unit=char16_t), the UTF-16 offsets are exactly the
1731   // encoded offsets.  Otherwise we must convert offset/length from UTF-8 to
1732   // UTF-16.
1733   if constexpr (std::is_same_v<Unit, char16_t>) {
1734     MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
1735                "UTF-16 to UTF-16 shouldn't change window length");
1736     err->tokenOffset = encodedTokenOffset;
1737     err->lineLength = encodedWindowLength;
1738   } else {
1739     static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");
1740 
1741     bool simple = utf16WindowLength == encodedWindowLength;
1742 #ifdef DEBUG
1743     auto isAscii = [](Unit u) { return IsAscii(u); };
1744     MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
1745                            isAscii) == simple,
1746                "equal window lengths in UTF-8 should correspond only to "
1747                "wholly-ASCII text");
1748 #endif
1749     if (simple) {
1750       err->tokenOffset = encodedTokenOffset;
1751       err->lineLength = encodedWindowLength;
1752     } else {
1753       sourceUnits.computeWindowOffsetAndLength(
1754           encodedWindow, encodedTokenOffset, &err->tokenOffset,
1755           encodedWindowLength, &err->lineLength);
1756     }
1757   }
1758 
1759   return true;
1760 }
1761 
1762 template <typename Unit, class AnyCharsAccess>
computeErrorMetadata(ErrorMetadata * err,const ErrorOffset & errorOffset)1763 bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
1764     ErrorMetadata* err, const ErrorOffset& errorOffset) {
1765   if (errorOffset.is<NoOffset>()) {
1766     anyCharsAccess().computeErrorMetadataNoOffset(err);
1767     return true;
1768   }
1769 
1770   uint32_t offset;
1771   if (errorOffset.is<uint32_t>()) {
1772     offset = errorOffset.as<uint32_t>();
1773   } else {
1774     offset = this->sourceUnits.offset();
1775   }
1776 
1777   // This function's return value isn't a success/failure indication: it
1778   // returns true if this TokenStream can be used to provide a line of
1779   // context.
1780   if (fillExceptingContext(err, offset)) {
1781     // Add a line of context from this TokenStream to help with debugging.
1782     return internalComputeLineOfContext(err, offset);
1783   }
1784 
1785   // We can't fill in any more here.
1786   return true;
1787 }
1788 
1789 template <typename Unit, class AnyCharsAccess>
reportIllegalCharacter(int32_t cp)1790 void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
1791     int32_t cp) {
1792   UniqueChars display = JS_smprintf("U+%04X", cp);
1793   if (!display) {
1794     ReportOutOfMemory(anyCharsAccess().cx);
1795     return;
1796   }
1797   error(JSMSG_ILLEGAL_CHARACTER, display.get());
1798 }
1799 
1800 // We have encountered a '\': check for a Unicode escape sequence after it.
1801 // Return the length of the escape sequence and the encoded code point (by
1802 // value) if we found a Unicode escape sequence, and skip all code units
1803 // involed.  Otherwise, return 0 and don't advance along the buffer.
1804 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscape(uint32_t * codePoint)1805 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
1806     uint32_t* codePoint) {
1807   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1808 
1809   int32_t unit = getCodeUnit();
1810   if (unit != 'u') {
1811     // NOTE: |unit| may be EOF here.
1812     ungetCodeUnit(unit);
1813     MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1814     return 0;
1815   }
1816 
1817   char16_t v;
1818   unit = getCodeUnit();
1819   if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) {
1820     *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v;
1821     return 5;
1822   }
1823 
1824   if (unit == '{') {
1825     return matchExtendedUnicodeEscape(codePoint);
1826   }
1827 
1828   // NOTE: |unit| may be EOF here, so this ungets either one or two units.
1829   ungetCodeUnit(unit);
1830   ungetCodeUnit('u');
1831   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1832   return 0;
1833 }
1834 
1835 template <typename Unit, class AnyCharsAccess>
1836 uint32_t
matchExtendedUnicodeEscape(uint32_t * codePoint)1837 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
1838     uint32_t* codePoint) {
1839   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
1840 
1841   int32_t unit = getCodeUnit();
1842 
1843   // Skip leading zeroes.
1844   uint32_t leadingZeroes = 0;
1845   while (unit == '0') {
1846     leadingZeroes++;
1847     unit = getCodeUnit();
1848   }
1849 
1850   size_t i = 0;
1851   uint32_t code = 0;
1852   while (IsAsciiHexDigit(unit) && i < 6) {
1853     code = (code << 4) | AsciiAlphanumericToNumber(unit);
1854     unit = getCodeUnit();
1855     i++;
1856   }
1857 
1858   uint32_t gotten =
1859       2 +                  // 'u{'
1860       leadingZeroes + i +  // significant hexdigits
1861       (unit != EOF);       // subtract a get if it didn't contribute to length
1862 
1863   if (unit == '}' && (leadingZeroes > 0 || i > 0) &&
1864       code <= unicode::NonBMPMax) {
1865     *codePoint = code;
1866     return gotten;
1867   }
1868 
1869   this->sourceUnits.unskipCodeUnits(gotten);
1870   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1871   return 0;
1872 }
1873 
1874 template <typename Unit, class AnyCharsAccess>
1875 uint32_t
matchUnicodeEscapeIdStart(uint32_t * codePoint)1876 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
1877     uint32_t* codePoint) {
1878   uint32_t length = matchUnicodeEscape(codePoint);
1879   if (MOZ_LIKELY(length > 0)) {
1880     if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) {
1881       return length;
1882     }
1883 
1884     this->sourceUnits.unskipCodeUnits(length);
1885   }
1886 
1887   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1888   return 0;
1889 }
1890 
1891 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscapeIdent(uint32_t * codePoint)1892 bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
1893     uint32_t* codePoint) {
1894   uint32_t length = matchUnicodeEscape(codePoint);
1895   if (MOZ_LIKELY(length > 0)) {
1896     if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) {
1897       return true;
1898     }
1899 
1900     this->sourceUnits.unskipCodeUnits(length);
1901   }
1902 
1903   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1904   return false;
1905 }
1906 
1907 template <typename Unit, class AnyCharsAccess>
1908 [[nodiscard]] bool
matchIdentifierStart(IdentifierEscapes * sawEscape)1909 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart(
1910     IdentifierEscapes* sawEscape) {
1911   int32_t unit = getCodeUnit();
1912   if (unicode::IsIdentifierStart(char16_t(unit))) {
1913     ungetCodeUnit(unit);
1914     *sawEscape = IdentifierEscapes::None;
1915     return true;
1916   }
1917 
1918   if (unit == '\\') {
1919     *sawEscape = IdentifierEscapes::SawUnicodeEscape;
1920 
1921     uint32_t codePoint;
1922     uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint);
1923     if (escapeLength != 0) {
1924       return true;
1925     }
1926 
1927     // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
1928     // could point at the 'H'.  But we don't do that now, so the code
1929     // unit after the '\' isn't necessarily bad, so just point at the
1930     // start of the actually-invalid escape.
1931     ungetCodeUnit('\\');
1932     error(JSMSG_BAD_ESCAPE);
1933     return false;
1934   }
1935 
1936   *sawEscape = IdentifierEscapes::None;
1937 
1938   // NOTE: |unit| may be EOF here.
1939   ungetCodeUnit(unit);
1940   error(JSMSG_MISSING_PRIVATE_NAME);
1941   return false;
1942 }
1943 
1944 template <typename Unit, class AnyCharsAccess>
getDirectives(bool isMultiline,bool shouldWarnDeprecated)1945 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives(
1946     bool isMultiline, bool shouldWarnDeprecated) {
1947   // Match directive comments used in debugging, such as "//# sourceURL" and
1948   // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated.
1949   //
1950   // To avoid a crashing bug in IE, several JavaScript transpilers wrap single
1951   // line comments containing a source mapping URL inside a multiline
1952   // comment. To avoid potentially expensive lookahead and backtracking, we
1953   // only check for this case if we encounter a '#' code unit.
1954 
1955   bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
1956              getSourceMappingURL(isMultiline, shouldWarnDeprecated);
1957   if (!res) {
1958     badToken();
1959   }
1960 
1961   return res;
1962 }
1963 
copyCharBufferTo(JSContext * cx,UniquePtr<char16_t[],JS::FreePolicy> * destination)1964 [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo(
1965     JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1966   size_t length = charBuffer.length();
1967 
1968   *destination = cx->make_pod_array<char16_t>(length + 1);
1969   if (!*destination) {
1970     return false;
1971   }
1972 
1973   std::copy(charBuffer.begin(), charBuffer.end(), destination->get());
1974   (*destination)[length] = '\0';
1975   return true;
1976 }
1977 
1978 template <typename Unit, class AnyCharsAccess>
getDirective(bool isMultiline,bool shouldWarnDeprecated,const char * directive,uint8_t directiveLength,const char * errorMsgPragma,UniquePtr<char16_t[],JS::FreePolicy> * destination)1979 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
1980     bool isMultiline, bool shouldWarnDeprecated, const char* directive,
1981     uint8_t directiveLength, const char* errorMsgPragma,
1982     UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1983   // Stop if we don't find |directive|.  (Note that |directive| must be
1984   // ASCII, so there are no tricky encoding issues to consider in matching
1985   // UTF-8/16-agnostically.)
1986   if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
1987     return true;
1988   }
1989 
1990   if (shouldWarnDeprecated) {
1991     if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
1992       return false;
1993     }
1994   }
1995 
1996   this->charBuffer.clear();
1997 
1998   do {
1999     int32_t unit = peekCodeUnit();
2000     if (unit == EOF) {
2001       break;
2002     }
2003 
2004     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2005       if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
2006         break;
2007       }
2008 
2009       consumeKnownCodeUnit(unit);
2010 
2011       // Debugging directives can occur in both single- and multi-line
2012       // comments. If we're currently inside a multi-line comment, we
2013       // also must recognize multi-line comment terminators.
2014       if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
2015         ungetCodeUnit('*');
2016         break;
2017       }
2018 
2019       if (!this->charBuffer.append(unit)) {
2020         return false;
2021       }
2022 
2023       continue;
2024     }
2025 
2026     // This ignores encoding errors: subsequent caller-side code to
2027     // handle the remaining source text in the comment will do so.
2028     PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2029     if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
2030       break;
2031     }
2032 
2033     MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
2034                "!IsSpace must imply !IsLineTerminator or else we'll fail to "
2035                "maintain line-info/flags for EOL");
2036     this->sourceUnits.consumeKnownCodePoint(peeked);
2037 
2038     if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
2039       return false;
2040     }
2041   } while (true);
2042 
2043   if (this->charBuffer.empty()) {
2044     // The directive's URL was missing, but comments can contain anything,
2045     // so it isn't an error.
2046     return true;
2047   }
2048 
2049   return copyCharBufferTo(anyCharsAccess().cx, destination);
2050 }
2051 
2052 template <typename Unit, class AnyCharsAccess>
getDisplayURL(bool isMultiline,bool shouldWarnDeprecated)2053 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL(
2054     bool isMultiline, bool shouldWarnDeprecated) {
2055   // Match comments of the form "//# sourceURL=<url>" or
2056   // "/\* //# sourceURL=<url> *\/"
2057   //
2058   // Note that while these are labeled "sourceURL" in the source text,
2059   // internally we refer to it as a "displayURL" to distinguish what the
2060   // developer would like to refer to the source as from the source's actual
2061   // URL.
2062 
2063   static constexpr char sourceURLDirective[] = " sourceURL=";
2064   constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective);
2065   return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective,
2066                       sourceURLDirectiveLength, "sourceURL",
2067                       &anyCharsAccess().displayURL_);
2068 }
2069 
2070 template <typename Unit, class AnyCharsAccess>
getSourceMappingURL(bool isMultiline,bool shouldWarnDeprecated)2071 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL(
2072     bool isMultiline, bool shouldWarnDeprecated) {
2073   // Match comments of the form "//# sourceMappingURL=<url>" or
2074   // "/\* //# sourceMappingURL=<url> *\/"
2075 
2076   static constexpr char sourceMappingURLDirective[] = " sourceMappingURL=";
2077   constexpr uint8_t sourceMappingURLDirectiveLength =
2078       js_strlen(sourceMappingURLDirective);
2079   return getDirective(isMultiline, shouldWarnDeprecated,
2080                       sourceMappingURLDirective,
2081                       sourceMappingURLDirectiveLength, "sourceMappingURL",
2082                       &anyCharsAccess().sourceMapURL_);
2083 }
2084 
2085 template <typename Unit, class AnyCharsAccess>
2086 MOZ_ALWAYS_INLINE Token*
newTokenInternal(TokenKind kind,TokenStart start,TokenKind * out)2087 GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal(
2088     TokenKind kind, TokenStart start, TokenKind* out) {
2089   MOZ_ASSERT(kind < TokenKind::Limit);
2090   MOZ_ASSERT(kind != TokenKind::Eol,
2091              "TokenKind::Eol should never be used in an actual Token, only "
2092              "returned by peekTokenSameLine()");
2093 
2094   TokenStreamAnyChars& anyChars = anyCharsAccess();
2095   anyChars.flags.isDirtyLine = true;
2096 
2097   Token* token = anyChars.allocateToken();
2098 
2099   *out = token->type = kind;
2100   token->pos = TokenPos(start.offset(), this->sourceUnits.offset());
2101   MOZ_ASSERT(token->pos.begin <= token->pos.end);
2102 
2103   // NOTE: |token->modifier| is set in |newToken()| so that optimized,
2104   // non-debug code won't do any work to pass a modifier-argument that will
2105   // never be used.
2106 
2107   return token;
2108 }
2109 
2110 template <typename Unit, class AnyCharsAccess>
badToken()2111 MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
2112   // We didn't get a token, so don't set |flags.isDirtyLine|.
2113   anyCharsAccess().flags.hadError = true;
2114 
2115   // Poisoning sourceUnits on error establishes an invariant: once an
2116   // erroneous token has been seen, sourceUnits will not be consulted again.
2117   // This is true because the parser will deal with the illegal token by
2118   // aborting parsing immediately.
2119   this->sourceUnits.poisonInDebug();
2120 
2121   return false;
2122 };
2123 
AppendCodePointToCharBuffer(CharBuffer & charBuffer,uint32_t codePoint)2124 bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) {
2125   MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
2126              "should only be processing code points validly decoded from UTF-8 "
2127              "or WTF-16 source text (surrogate code points permitted)");
2128 
2129   char16_t units[2];
2130   unsigned numUnits = 0;
2131   unicode::UTF16Encode(codePoint, units, &numUnits);
2132 
2133   MOZ_ASSERT(numUnits == 1 || numUnits == 2,
2134              "UTF-16 code points are only encoded in one or two units");
2135 
2136   if (!charBuffer.append(units[0])) {
2137     return false;
2138   }
2139 
2140   if (numUnits == 1) {
2141     return true;
2142   }
2143 
2144   return charBuffer.append(units[1]);
2145 }
2146 
2147 template <typename Unit, class AnyCharsAccess>
putIdentInCharBuffer(const Unit * identStart)2148 bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
2149     const Unit* identStart) {
2150   const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit();
2151   this->sourceUnits.setAddressOfNextCodeUnit(identStart);
2152 
2153   auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() {
2154     this->sourceUnits.setAddressOfNextCodeUnit(originalAddress);
2155   });
2156 
2157   this->charBuffer.clear();
2158   do {
2159     int32_t unit = getCodeUnit();
2160     if (unit == EOF) {
2161       break;
2162     }
2163 
2164     uint32_t codePoint;
2165     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2166       if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') {
2167         if (!this->charBuffer.append(unit)) {
2168           return false;
2169         }
2170 
2171         continue;
2172       }
2173 
2174       if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2175         break;
2176       }
2177     } else {
2178       // |restoreNextRawCharAddress| undoes all gets, and this function
2179       // doesn't update line/column info.
2180       char32_t cp;
2181       if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
2182         return false;
2183       }
2184 
2185       codePoint = cp;
2186       if (!unicode::IsIdentifierPart(codePoint)) {
2187         break;
2188       }
2189     }
2190 
2191     if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
2192       return false;
2193     }
2194   } while (true);
2195 
2196   return true;
2197 }
2198 
2199 template <typename Unit, class AnyCharsAccess>
identifierName(TokenStart start,const Unit * identStart,IdentifierEscapes escaping,Modifier modifier,NameVisibility visibility,TokenKind * out)2200 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName(
2201     TokenStart start, const Unit* identStart, IdentifierEscapes escaping,
2202     Modifier modifier, NameVisibility visibility, TokenKind* out) {
2203   // Run the bad-token code for every path out of this function except the
2204   // two success-cases.
2205   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
2206 
2207   // We've already consumed an initial code point in the identifer, to *know*
2208   // that this is an identifier.  So no need to worry about not consuming any
2209   // code points in the loop below.
2210   int32_t unit;
2211   while (true) {
2212     unit = peekCodeUnit();
2213     if (unit == EOF) {
2214       break;
2215     }
2216 
2217     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2218       consumeKnownCodeUnit(unit);
2219 
2220       if (MOZ_UNLIKELY(
2221               !unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) {
2222         // Handle a Unicode escape -- otherwise it's not part of the
2223         // identifier.
2224         uint32_t codePoint;
2225         if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2226           ungetCodeUnit(unit);
2227           break;
2228         }
2229 
2230         escaping = IdentifierEscapes::SawUnicodeEscape;
2231       }
2232     } else {
2233       // This ignores encoding errors: subsequent caller-side code to
2234       // handle source text after the IdentifierName will do so.
2235       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2236       if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) {
2237         break;
2238       }
2239 
2240       MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
2241                  "IdentifierPart must guarantee !IsLineTerminator or "
2242                  "else we'll fail to maintain line-info/flags for EOL");
2243 
2244       this->sourceUnits.consumeKnownCodePoint(peeked);
2245     }
2246   }
2247 
2248   TaggedParserAtomIndex atom;
2249   if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) {
2250     // Identifiers containing Unicode escapes have to be converted into
2251     // tokenbuf before atomizing.
2252     if (!putIdentInCharBuffer(identStart)) {
2253       return false;
2254     }
2255 
2256     atom = drainCharBufferIntoAtom();
2257   } else {
2258     // Escape-free identifiers can be created directly from sourceUnits.
2259     const Unit* chars = identStart;
2260     size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart;
2261 
2262     // Private identifiers start with a '#', and so cannot be reserved words.
2263     if (visibility == NameVisibility::Public) {
2264       // Represent reserved words lacking escapes as reserved word tokens.
2265       if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) {
2266         noteBadToken.release();
2267         newSimpleToken(rw->tokentype, start, modifier, out);
2268         return true;
2269       }
2270     }
2271 
2272     atom = atomizeSourceChars(Span(chars, length));
2273   }
2274   if (!atom) {
2275     return false;
2276   }
2277 
2278   noteBadToken.release();
2279   if (visibility == NameVisibility::Private) {
2280     newPrivateNameToken(atom, start, modifier, out);
2281     return true;
2282   }
2283   newNameToken(atom, start, modifier, out);
2284   return true;
2285 }
2286 
// Classification of a token's first ASCII character, used as the element
// values of the |firstCharKinds| lookup table.
enum FirstCharKind {
  // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid
  // token that cannot also be a prefix of a longer token.  E.g. ';' has the
  // OneChar kind, but '+' does not, because '++' and '+=' are valid longer
  // tokens that begin with '+'.
  //
  // The few token kinds satisfying these properties cover roughly 35--45%
  // of the tokens seen in practice.
  //
  // We represent the 'OneChar' kind with any positive value less than
  // TokenKind::Limit.  This representation lets us associate
  // each one-char token char16_t with a TokenKind and thus avoid
  // a subsequent char16_t-to-TokenKind conversion.
  OneChar_Min = 0,
  OneChar_Max = size_t(TokenKind::Limit) - 1,

  // The remaining kinds are numbered from TokenKind::Limit upward, so they
  // never overlap the OneChar range above.
  Space = size_t(TokenKind::Limit),
  Ident,
  Dec,
  String,
  EOL,
  ZeroDigit,
  Other,

  LastCharKind = Other
};
2314 
// Lookup table from an ASCII code point (0..127) to its FirstCharKind.
//
// OneChar:   40,  41,  44,  58,  59,  91,  93,  123, 125, 126:
//            '(', ')', ',', ':', ';', '[', ']', '{', '}', '~'
//            (stored as the corresponding TokenKind value, via the T_*
//            macros below)
// Ident:     36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
// String:    34, 39, 96: '"', '\'', '`'
// Dec:       49..57: '1'..'9'
// ZeroDigit: 48: '0'
// Space:     9, 11, 12, 32: '\t', '\v', '\f', ' '
// EOL:       10, 13: '\n', '\r'
// Other:     everything else, written as _______ in the table.  This
//            includes '+' (43), '.' (46) and '=' (61), which have no kind of
//            their own.
//
#define T_COMMA size_t(TokenKind::Comma)
#define T_COLON size_t(TokenKind::Colon)
#define T_BITNOT size_t(TokenKind::BitNot)
#define T_LP size_t(TokenKind::LeftParen)
#define T_RP size_t(TokenKind::RightParen)
#define T_SEMI size_t(TokenKind::Semi)
#define T_LB size_t(TokenKind::LeftBracket)
#define T_RB size_t(TokenKind::RightBracket)
#define T_LC size_t(TokenKind::LeftCurly)
#define T_RC size_t(TokenKind::RightCurly)
#define _______ Other
static const uint8_t firstCharKinds[] = {
    // clang-format off
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______,   Space,
/*  10+ */     EOL,   Space,   Space,     EOL, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______,   Space, _______,  String, _______,   Ident, _______, _______,  String,
/*  40+ */    T_LP,    T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit,    Dec,
/*  50+ */     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec, T_COLON,  T_SEMI,
/*  60+ */ _______, _______, _______, _______, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  70+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  80+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  90+ */   Ident,    T_LB, _______,    T_RB, _______,   Ident,  String,   Ident,   Ident,   Ident,
/* 100+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 110+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 120+ */   Ident,   Ident,   Ident,    T_LC, _______,    T_RC,T_BITNOT, _______
    // clang-format on
};
#undef T_COMMA
#undef T_COLON
#undef T_BITNOT
#undef T_LP
#undef T_RP
#undef T_SEMI
#undef T_LB
#undef T_RB
#undef T_LC
#undef T_RC
#undef _______

// Every FirstCharKind value must fit in one element of the table.
static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
              "Elements of firstCharKinds[] are too small");
2370 
2371 template <>
consumeRestOfSingleLineComment()2372 void SourceUnits<char16_t>::consumeRestOfSingleLineComment() {
2373   while (MOZ_LIKELY(!atEnd())) {
2374     char16_t unit = peekCodeUnit();
2375     if (IsLineTerminator(unit)) {
2376       return;
2377     }
2378 
2379     consumeKnownCodeUnit(unit);
2380   }
2381 }
2382 
2383 template <>
consumeRestOfSingleLineComment()2384 void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() {
2385   while (MOZ_LIKELY(!atEnd())) {
2386     const Utf8Unit unit = peekCodeUnit();
2387     if (IsSingleUnitLineTerminator(unit)) {
2388       return;
2389     }
2390 
2391     if (MOZ_LIKELY(IsAscii(unit))) {
2392       consumeKnownCodeUnit(unit);
2393       continue;
2394     }
2395 
2396     PeekedCodePoint<Utf8Unit> peeked = peekCodePoint();
2397     if (peeked.isNone()) {
2398       return;
2399     }
2400 
2401     char32_t c = peeked.codePoint();
2402     if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
2403                      c == unicode::PARA_SEPARATOR)) {
2404       return;
2405     }
2406 
2407     consumeKnownCodePoint(peeked);
2408   }
2409 }
2410 
2411 template <typename Unit, class AnyCharsAccess>
2412 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchInteger(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2413 TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger(
2414     IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2415   int32_t unit = getCodeUnit();
2416   if (!isIntegerUnit(unit)) {
2417     *nextUnit = unit;
2418     return true;
2419   }
2420   return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit);
2421 }
2422 
2423 template <typename Unit, class AnyCharsAccess>
2424 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchIntegerAfterFirstDigit(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2425 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit(
2426     IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2427   int32_t unit;
2428   while (true) {
2429     unit = getCodeUnit();
2430     if (isIntegerUnit(unit)) {
2431       continue;
2432     }
2433     if (unit != '_') {
2434       break;
2435     }
2436     unit = getCodeUnit();
2437     if (!isIntegerUnit(unit)) {
2438       if (unit == '_') {
2439         error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES);
2440       } else {
2441         error(JSMSG_NUMBER_END_WITH_UNDERSCORE);
2442       }
2443       return false;
2444     }
2445   }
2446 
2447   *nextUnit = unit;
2448   return true;
2449 }
2450 
2451 template <typename Unit, class AnyCharsAccess>
decimalNumber(int32_t unit,TokenStart start,const Unit * numStart,Modifier modifier,TokenKind * out)2452 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber(
2453     int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier,
2454     TokenKind* out) {
2455   // Run the bad-token code for every path out of this function except the
2456   // one success-case.
2457   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
2458 
2459   // Consume integral component digits.
2460   if (IsAsciiDigit(unit)) {
2461     if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
2462       return false;
2463     }
2464   }
2465 
2466   // Numbers contain no escapes, so we can read directly from |sourceUnits|.
2467   double dval;
2468   bool isBigInt = false;
2469   DecimalPoint decimalPoint = NoDecimal;
2470   if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') {
2471     // NOTE: |unit| may be EOF here.
2472     ungetCodeUnit(unit);
2473 
2474     // Most numbers are pure decimal integers without fractional component
2475     // or exponential notation.  Handle that with optimized code.
2476     if (!GetDecimalInteger(anyCharsAccess().cx, numStart,
2477                            this->sourceUnits.addressOfNextCodeUnit(), &dval)) {
2478       return false;
2479     }
2480   } else if (unit == 'n') {
2481     isBigInt = true;
2482     unit = peekCodeUnit();
2483   } else {
2484     // Consume any decimal dot and fractional component.
2485     if (unit == '.') {
2486       decimalPoint = HasDecimal;
2487       if (!matchInteger(IsAsciiDigit, &unit)) {
2488         return false;
2489       }
2490     }
2491 
2492     // Consume any exponential notation.
2493     if (unit == 'e' || unit == 'E') {
2494       unit = getCodeUnit();
2495       if (unit == '+' || unit == '-') {
2496         unit = getCodeUnit();
2497       }
2498 
2499       // Exponential notation must contain at least one digit.
2500       if (!IsAsciiDigit(unit)) {
2501         ungetCodeUnit(unit);
2502         error(JSMSG_MISSING_EXPONENT);
2503         return false;
2504       }
2505 
2506       // Consume exponential digits.
2507       if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
2508         return false;
2509       }
2510     }
2511 
2512     ungetCodeUnit(unit);
2513 
2514     // "0." and "0e..." numbers parse "." or "e..." here.  Neither range
2515     // contains a number, so we can't use |FullStringToDouble|.  (Parse
2516     // failures return 0.0, so we'll still get the right result.)
2517     if (!GetDecimalNonInteger(anyCharsAccess().cx, numStart,
2518                               this->sourceUnits.addressOfNextCodeUnit(),
2519                               &dval)) {
2520       return false;
2521     }
2522   }
2523 
2524   // Number followed by IdentifierStart is an error.  (This is the only place
2525   // in ECMAScript where token boundary is inadequate to properly separate
2526   // two tokens, necessitating this unaesthetic lookahead.)
2527   if (unit != EOF) {
2528     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2529       if (unicode::IsIdentifierStart(char16_t(unit))) {
2530         error(JSMSG_IDSTART_AFTER_NUMBER);
2531         return false;
2532       }
2533     } else {
2534       // This ignores encoding errors: subsequent caller-side code to
2535       // handle source text after the number will do so.
2536       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2537       if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
2538         error(JSMSG_IDSTART_AFTER_NUMBER);
2539         return false;
2540       }
2541     }
2542   }
2543 
2544   noteBadToken.release();
2545 
2546   if (isBigInt) {
2547     return bigIntLiteral(start, modifier, out);
2548   }
2549 
2550   newNumberToken(dval, decimalPoint, start, modifier, out);
2551   return true;
2552 }
2553 
2554 template <typename Unit, class AnyCharsAccess>
regexpLiteral(TokenStart start,TokenKind * out)2555 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
2556     TokenStart start, TokenKind* out) {
2557   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/'));
2558   this->charBuffer.clear();
2559 
2560   auto ProcessNonAsciiCodePoint = [this](int32_t lead) {
2561     MOZ_ASSERT(lead != EOF);
2562     MOZ_ASSERT(!this->isAsciiCodePoint(lead));
2563 
2564     char32_t codePoint;
2565     if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead),
2566                                                  &codePoint)) {
2567       return false;
2568     }
2569 
2570     if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
2571                      codePoint == unicode::PARA_SEPARATOR)) {
2572       this->sourceUnits.ungetLineOrParagraphSeparator();
2573       this->error(JSMSG_UNTERMINATED_REGEXP);
2574       return false;
2575     }
2576 
2577     return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
2578   };
2579 
2580   auto ReportUnterminatedRegExp = [this](int32_t unit) {
2581     this->ungetCodeUnit(unit);
2582     this->error(JSMSG_UNTERMINATED_REGEXP);
2583   };
2584 
2585   bool inCharClass = false;
2586   do {
2587     int32_t unit = getCodeUnit();
2588     if (unit == EOF) {
2589       ReportUnterminatedRegExp(unit);
2590       return badToken();
2591     }
2592 
2593     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2594       if (!ProcessNonAsciiCodePoint(unit)) {
2595         return badToken();
2596       }
2597 
2598       continue;
2599     }
2600 
2601     if (unit == '\\') {
2602       if (!this->charBuffer.append(unit)) {
2603         return badToken();
2604       }
2605 
2606       unit = getCodeUnit();
2607       if (unit == EOF) {
2608         ReportUnterminatedRegExp(unit);
2609         return badToken();
2610       }
2611 
2612       // Fallthrough only handles ASCII code points, so
2613       // deal with non-ASCII and skip everything else.
2614       if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2615         if (!ProcessNonAsciiCodePoint(unit)) {
2616           return badToken();
2617         }
2618 
2619         continue;
2620       }
2621     } else if (unit == '[') {
2622       inCharClass = true;
2623     } else if (unit == ']') {
2624       inCharClass = false;
2625     } else if (unit == '/' && !inCharClass) {
2626       // For IE compat, allow unescaped / in char classes.
2627       break;
2628     }
2629 
2630     // NOTE: Non-ASCII LineTerminators were handled by
2631     //       ProcessNonAsciiCodePoint calls above.
2632     if (unit == '\r' || unit == '\n') {
2633       ReportUnterminatedRegExp(unit);
2634       return badToken();
2635     }
2636 
2637     MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit)));
2638     if (!this->charBuffer.append(unit)) {
2639       return badToken();
2640     }
2641   } while (true);
2642 
2643   int32_t unit;
2644   RegExpFlags reflags = RegExpFlag::NoFlags;
2645   while (true) {
2646     uint8_t flag;
2647     unit = getCodeUnit();
2648     if (unit == 'd') {
2649       flag = RegExpFlag::HasIndices;
2650     } else if (unit == 'g') {
2651       flag = RegExpFlag::Global;
2652     } else if (unit == 'i') {
2653       flag = RegExpFlag::IgnoreCase;
2654     } else if (unit == 'm') {
2655       flag = RegExpFlag::Multiline;
2656     } else if (unit == 's') {
2657       flag = RegExpFlag::DotAll;
2658     } else if (unit == 'u') {
2659       flag = RegExpFlag::Unicode;
2660     } else if (unit == 'y') {
2661       flag = RegExpFlag::Sticky;
2662     } else if (IsAsciiAlpha(unit)) {
2663       flag = RegExpFlag::NoFlags;
2664     } else {
2665       break;
2666     }
2667 
2668     if ((reflags & flag) || flag == RegExpFlag::NoFlags) {
2669       ungetCodeUnit(unit);
2670       char buf[2] = {char(unit), '\0'};
2671       error(JSMSG_BAD_REGEXP_FLAG, buf);
2672       return badToken();
2673     }
2674 
2675     reflags |= flag;
2676   }
2677   ungetCodeUnit(unit);
2678 
2679   newRegExpToken(reflags, start, out);
2680   return true;
2681 }
2682 
2683 template <typename Unit, class AnyCharsAccess>
bigIntLiteral(TokenStart start,Modifier modifier,TokenKind * out)2684 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
2685     TokenStart start, Modifier modifier, TokenKind* out) {
2686   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n'));
2687   MOZ_ASSERT(this->sourceUnits.offset() > start.offset());
2688   uint32_t length = this->sourceUnits.offset() - start.offset();
2689   MOZ_ASSERT(length >= 2);
2690   this->charBuffer.clear();
2691   mozilla::Range<const Unit> chars(
2692       this->sourceUnits.codeUnitPtrAt(start.offset()), length);
2693   for (uint32_t idx = 0; idx < length - 1; idx++) {
2694     int32_t unit = CodeUnitValue(chars[idx]);
2695     // Char buffer may start with a 0[bBoOxX] prefix, then follows with
2696     // binary, octal, decimal, or hex digits.  Already checked by caller, as
2697     // the "n" indicating bigint comes at the end.
2698     MOZ_ASSERT(isAsciiCodePoint(unit));
2699     // Skip over any separators.
2700     if (unit == '_') {
2701       continue;
2702     }
2703     if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
2704       return false;
2705     }
2706   }
2707   newBigIntToken(start, modifier, out);
2708   return true;
2709 }
2710 
2711 template <typename Unit, class AnyCharsAccess>
2712 void GeneralTokenStreamChars<Unit,
consumeOptionalHashbangComment()2713                              AnyCharsAccess>::consumeOptionalHashbangComment() {
2714   MOZ_ASSERT(this->sourceUnits.atStart(),
2715              "HashBangComment can only appear immediately at the start of a "
2716              "Script or Module");
2717 
2718   // HashbangComment ::
2719   //   #!  SingleLineCommentChars_opt
2720 
2721   if (!matchCodeUnit('#')) {
2722     // HashbangComment is optional at start of Script or Module.
2723     return;
2724   }
2725 
2726   if (!matchCodeUnit('!')) {
2727     // # not followed by ! at start of Script or Module is an error, but normal
2728     // parsing code will handle that error just fine if we let it.
2729     ungetCodeUnit('#');
2730     return;
2731   }
2732 
2733   // This doesn't consume a concluding LineTerminator, and it stops consuming
2734   // just before any encoding error.  The subsequent |getToken| call will call
2735   // |getTokenInternal| below which will handle these possibilities.
2736   this->sourceUnits.consumeRestOfSingleLineComment();
2737 }
2738 
2739 template <typename Unit, class AnyCharsAccess>
getTokenInternal(TokenKind * const ttp,const Modifier modifier)2740 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal(
2741     TokenKind* const ttp, const Modifier modifier) {
2742   // Assume we'll fail: success cases will overwrite this.
2743 #ifdef DEBUG
2744   *ttp = TokenKind::Limit;
2745 #endif
2746   MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
2747 
2748   // This loop runs more than once only when whitespace or comments are
2749   // encountered.
2750   do {
2751     int32_t unit = peekCodeUnit();
2752     if (MOZ_UNLIKELY(unit == EOF)) {
2753       MOZ_ASSERT(this->sourceUnits.atEnd());
2754       anyCharsAccess().flags.isEOF = true;
2755       TokenStart start(this->sourceUnits, 0);
2756       newSimpleToken(TokenKind::Eof, start, modifier, ttp);
2757       return true;
2758     }
2759 
2760     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2761       // Non-ASCII code points can only be identifiers or whitespace.  It would
2762       // be nice to compute these *after* discarding whitespace, but IN A WORLD
2763       // where |unicode::IsSpace| requires consuming a variable number of code
2764       // units, it's easier to assume it's an identifier and maybe do a little
2765       // wasted work, than to unget and compute and reget if whitespace.
2766       TokenStart start(this->sourceUnits, 0);
2767       const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit();
2768 
2769       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2770       if (peeked.isNone()) {
2771         int32_t bad;
2772         MOZ_ALWAYS_FALSE(getCodePoint(&bad));
2773         return badToken();
2774       }
2775 
2776       char32_t cp = peeked.codePoint();
2777       if (unicode::IsSpace(cp)) {
2778         this->sourceUnits.consumeKnownCodePoint(peeked);
2779         if (IsLineTerminator(cp)) {
2780           if (!updateLineInfoForEOL()) {
2781             return badToken();
2782           }
2783 
2784           anyCharsAccess().updateFlagsForEOL();
2785         }
2786 
2787         continue;
2788       }
2789 
2790       static_assert(isAsciiCodePoint('$'),
2791                     "IdentifierStart contains '$', but as "
2792                     "!IsUnicodeIDStart('$'), ensure that '$' is never "
2793                     "handled here");
2794       static_assert(isAsciiCodePoint('_'),
2795                     "IdentifierStart contains '_', but as "
2796                     "!IsUnicodeIDStart('_'), ensure that '_' is never "
2797                     "handled here");
2798 
2799       if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
2800         this->sourceUnits.consumeKnownCodePoint(peeked);
2801         MOZ_ASSERT(!IsLineTerminator(cp),
2802                    "IdentifierStart must guarantee !IsLineTerminator "
2803                    "or else we'll fail to maintain line-info/flags "
2804                    "for EOL here");
2805 
2806         return identifierName(start, identStart, IdentifierEscapes::None,
2807                               modifier, NameVisibility::Public, ttp);
2808       }
2809 
2810       reportIllegalCharacter(cp);
2811       return badToken();
2812     }  // !isAsciiCodePoint(unit)
2813 
2814     consumeKnownCodeUnit(unit);
2815 
2816     // Get the token kind, based on the first char.  The ordering of c1kind
2817     // comparison is based on the frequency of tokens in real code:
2818     // Parsemark (which represents typical JS code on the web) and the
2819     // Unreal demo (which represents asm.js code).
2820     //
2821     //                  Parsemark   Unreal
2822     //  OneChar         32.9%       39.7%
2823     //  Space           25.0%        0.6%
2824     //  Ident           19.2%       36.4%
2825     //  Dec              7.2%        5.1%
2826     //  String           7.9%        0.0%
2827     //  EOL              1.7%        0.0%
2828     //  ZeroDigit        0.4%        4.9%
2829     //  Other            5.7%       13.3%
2830     //
2831     // The ordering is based mostly only Parsemark frequencies, with Unreal
2832     // frequencies used to break close categories (e.g. |Dec| and
2833     // |String|).  |Other| is biggish, but no other token kind is common
2834     // enough for it to be worth adding extra values to FirstCharKind.
2835     FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]);
2836 
2837     // Look for an unambiguous single-char token.
2838     //
2839     if (c1kind <= OneChar_Max) {
2840       TokenStart start(this->sourceUnits, -1);
2841       newSimpleToken(TokenKind(c1kind), start, modifier, ttp);
2842       return true;
2843     }
2844 
2845     // Skip over non-EOL whitespace chars.
2846     //
2847     if (c1kind == Space) {
2848       continue;
2849     }
2850 
2851     // Look for an identifier.
2852     //
2853     if (c1kind == Ident) {
2854       TokenStart start(this->sourceUnits, -1);
2855       return identifierName(
2856           start, this->sourceUnits.addressOfNextCodeUnit() - 1,
2857           IdentifierEscapes::None, modifier, NameVisibility::Public, ttp);
2858     }
2859 
2860     // Look for a decimal number.
2861     //
2862     if (c1kind == Dec) {
2863       TokenStart start(this->sourceUnits, -1);
2864       const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2865       return decimalNumber(unit, start, numStart, modifier, ttp);
2866     }
2867 
2868     // Look for a string or a template string.
2869     //
2870     if (c1kind == String) {
2871       return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp);
2872     }
2873 
2874     // Skip over EOL chars, updating line state along the way.
2875     //
2876     if (c1kind == EOL) {
2877       if (unit == '\r') {
2878         matchLineTerminator('\n');
2879       }
2880 
2881       if (!updateLineInfoForEOL()) {
2882         return badToken();
2883       }
2884 
2885       anyCharsAccess().updateFlagsForEOL();
2886       continue;
2887     }
2888 
2889     // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a
2890     // number starting with '0' that contains '8' or '9' and is treated as
2891     // decimal) number.
2892     //
2893     if (c1kind == ZeroDigit) {
2894       TokenStart start(this->sourceUnits, -1);
2895       int radix;
2896       bool isBigInt = false;
2897       const Unit* numStart;
2898       unit = getCodeUnit();
2899       if (unit == 'x' || unit == 'X') {
2900         radix = 16;
2901         unit = getCodeUnit();
2902         if (!IsAsciiHexDigit(unit)) {
2903           // NOTE: |unit| may be EOF here.
2904           ungetCodeUnit(unit);
2905           error(JSMSG_MISSING_HEXDIGITS);
2906           return badToken();
2907         }
2908 
2909         // one past the '0x'
2910         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2911 
2912         if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) {
2913           return badToken();
2914         }
2915       } else if (unit == 'b' || unit == 'B') {
2916         radix = 2;
2917         unit = getCodeUnit();
2918         if (!IsAsciiBinary(unit)) {
2919           // NOTE: |unit| may be EOF here.
2920           ungetCodeUnit(unit);
2921           error(JSMSG_MISSING_BINARY_DIGITS);
2922           return badToken();
2923         }
2924 
2925         // one past the '0b'
2926         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2927 
2928         if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) {
2929           return badToken();
2930         }
2931       } else if (unit == 'o' || unit == 'O') {
2932         radix = 8;
2933         unit = getCodeUnit();
2934         if (!IsAsciiOctal(unit)) {
2935           // NOTE: |unit| may be EOF here.
2936           ungetCodeUnit(unit);
2937           error(JSMSG_MISSING_OCTAL_DIGITS);
2938           return badToken();
2939         }
2940 
2941         // one past the '0o'
2942         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2943 
2944         if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) {
2945           return badToken();
2946         }
2947       } else if (IsAsciiDigit(unit)) {
2948         // Reject octal literals that appear in strict mode code.
2949         if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) {
2950           return badToken();
2951         }
2952 
2953         // The above test doesn't catch a few edge cases; see
2954         // |GeneralParser::maybeParseDirective|.  Record the violation so that
2955         // that function can handle them.
2956         anyCharsAccess().setSawDeprecatedOctalLiteral();
2957 
2958         radix = 8;
2959         // one past the '0'
2960         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2961 
2962         bool nonOctalDecimalIntegerLiteral = false;
2963         do {
2964           if (unit >= '8') {
2965             nonOctalDecimalIntegerLiteral = true;
2966           }
2967           unit = getCodeUnit();
2968         } while (IsAsciiDigit(unit));
2969 
2970         if (unit == '_') {
2971           error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2972           return badToken();
2973         }
2974 
2975         if (unit == 'n') {
2976           error(JSMSG_BIGINT_INVALID_SYNTAX);
2977           return badToken();
2978         }
2979 
2980         if (nonOctalDecimalIntegerLiteral) {
2981           // Use the decimal scanner for the rest of the number.
2982           return decimalNumber(unit, start, numStart, modifier, ttp);
2983         }
2984       } else if (unit == '_') {
2985         // Give a more explicit error message when '_' is used after '0'.
2986         error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2987         return badToken();
2988       } else {
2989         // '0' not followed by [XxBbOo0-9_];  scan as a decimal number.
2990         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2991 
2992         // NOTE: |unit| may be EOF here.  (This is permitted by case #3
2993         //       in TokenStream.h docs for this function.)
2994         return decimalNumber(unit, start, numStart, modifier, ttp);
2995       }
2996 
2997       if (unit == 'n') {
2998         isBigInt = true;
2999         unit = peekCodeUnit();
3000       } else {
3001         ungetCodeUnit(unit);
3002       }
3003 
3004       // Error if an identifier-start code point appears immediately
3005       // after the number.  Somewhat surprisingly, if we don't check
3006       // here, we'll never check at all.
3007       if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3008         if (unicode::IsIdentifierStart(char16_t(unit))) {
3009           error(JSMSG_IDSTART_AFTER_NUMBER);
3010           return badToken();
3011         }
3012       } else if (MOZ_LIKELY(unit != EOF)) {
3013         // This ignores encoding errors: subsequent caller-side code to
3014         // handle source text after the number will do so.
3015         PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
3016         if (!peeked.isNone() &&
3017             unicode::IsIdentifierStart(peeked.codePoint())) {
3018           error(JSMSG_IDSTART_AFTER_NUMBER);
3019           return badToken();
3020         }
3021       }
3022 
3023       if (isBigInt) {
3024         return bigIntLiteral(start, modifier, ttp);
3025       }
3026 
3027       double dval;
3028       if (!GetFullInteger(anyCharsAccess().cx, numStart,
3029                           this->sourceUnits.addressOfNextCodeUnit(), radix,
3030                           IntegerSeparatorHandling::SkipUnderscore, &dval)) {
3031         return badToken();
3032       }
3033       newNumberToken(dval, NoDecimal, start, modifier, ttp);
3034       return true;
3035     }
3036 
3037     MOZ_ASSERT(c1kind == Other);
3038 
3039     // This handles everything else.  Simple tokens distinguished solely by
3040     // TokenKind should set |simpleKind| and break, to share simple-token
3041     // creation code for all such tokens.  All other tokens must be handled
3042     // by returning (or by continuing from the loop enclosing this).
3043     //
3044     TokenStart start(this->sourceUnits, -1);
3045     TokenKind simpleKind;
3046 #ifdef DEBUG
3047     simpleKind = TokenKind::Limit;  // sentinel value for code after switch
3048 #endif
3049 
3050     // The block a ways above eliminated all non-ASCII, so cast to the
3051     // smallest type possible to assist the C++ compiler.
3052     switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3053       case '.':
3054         if (IsAsciiDigit(peekCodeUnit())) {
3055           return decimalNumber('.', start,
3056                                this->sourceUnits.addressOfNextCodeUnit() - 1,
3057                                modifier, ttp);
3058         }
3059 
3060         unit = getCodeUnit();
3061         if (unit == '.') {
3062           if (matchCodeUnit('.')) {
3063             simpleKind = TokenKind::TripleDot;
3064             break;
3065           }
3066         }
3067 
3068         // NOTE: |unit| may be EOF here.  A stray '.' at EOF would be an
3069         //       error, but subsequent code will handle it.
3070         ungetCodeUnit(unit);
3071 
3072         simpleKind = TokenKind::Dot;
3073         break;
3074 
3075       case '#': {
3076 #ifdef ENABLE_RECORD_TUPLE
3077         if (matchCodeUnit('{')) {
3078           simpleKind = TokenKind::HashCurly;
3079           break;
3080         }
3081         if (matchCodeUnit('[')) {
3082           simpleKind = TokenKind::HashBracket;
3083           break;
3084         }
3085 #endif
3086 
3087         TokenStart start(this->sourceUnits, -1);
3088         const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
3089         IdentifierEscapes sawEscape;
3090         if (!matchIdentifierStart(&sawEscape)) {
3091           return badToken();
3092         }
3093         return identifierName(start, identStart, sawEscape, modifier,
3094                               NameVisibility::Private, ttp);
3095       }
3096 
3097       case '=':
3098         if (matchCodeUnit('=')) {
3099           simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq;
3100         } else if (matchCodeUnit('>')) {
3101           simpleKind = TokenKind::Arrow;
3102         } else {
3103           simpleKind = TokenKind::Assign;
3104         }
3105         break;
3106 
3107       case '+':
3108         if (matchCodeUnit('+')) {
3109           simpleKind = TokenKind::Inc;
3110         } else {
3111           simpleKind =
3112               matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add;
3113         }
3114         break;
3115 
3116       case '\\': {
3117         uint32_t codePoint;
3118         if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) {
3119           return identifierName(
3120               start,
3121               this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1,
3122               IdentifierEscapes::SawUnicodeEscape, modifier,
3123               NameVisibility::Public, ttp);
3124         }
3125 
3126         // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
3127         // could point at the 'H'.  But we don't do that now, so the code
3128         // unit after the '\' isn't necessarily bad, so just point at the
3129         // start of the actually-invalid escape.
3130         ungetCodeUnit('\\');
3131         error(JSMSG_BAD_ESCAPE);
3132         return badToken();
3133       }
3134 
3135       case '|':
3136         if (matchCodeUnit('|')) {
3137           simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or;
3138         } else {
3139           simpleKind =
3140               matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr;
3141         }
3142         break;
3143 
3144       case '^':
3145         simpleKind =
3146             matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor;
3147         break;
3148 
3149       case '&':
3150         if (matchCodeUnit('&')) {
3151           simpleKind =
3152               matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And;
3153         } else {
3154           simpleKind =
3155               matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd;
3156         }
3157         break;
3158 
3159       case '?':
3160         if (matchCodeUnit('.')) {
3161           unit = getCodeUnit();
3162           if (IsAsciiDigit(unit)) {
3163             // if the code unit is followed by a number, for example it has the
3164             // following form `<...> ?.5 <..> then it should be treated as a
3165             // ternary rather than as an optional chain
3166             simpleKind = TokenKind::Hook;
3167             ungetCodeUnit(unit);
3168             ungetCodeUnit('.');
3169           } else {
3170             ungetCodeUnit(unit);
3171             simpleKind = TokenKind::OptionalChain;
3172           }
3173         } else if (matchCodeUnit('?')) {
3174           simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign
3175                                           : TokenKind::Coalesce;
3176         } else {
3177           simpleKind = TokenKind::Hook;
3178         }
3179         break;
3180 
3181       case '!':
3182         if (matchCodeUnit('=')) {
3183           simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne;
3184         } else {
3185           simpleKind = TokenKind::Not;
3186         }
3187         break;
3188 
3189       case '<':
3190         if (anyCharsAccess().options().allowHTMLComments) {
3191           // Treat HTML begin-comment as comment-till-end-of-line.
3192           if (matchCodeUnit('!')) {
3193             if (matchCodeUnit('-')) {
3194               if (matchCodeUnit('-')) {
3195                 this->sourceUnits.consumeRestOfSingleLineComment();
3196                 continue;
3197               }
3198               ungetCodeUnit('-');
3199             }
3200             ungetCodeUnit('!');
3201           }
3202         }
3203         if (matchCodeUnit('<')) {
3204           simpleKind =
3205               matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh;
3206         } else {
3207           simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt;
3208         }
3209         break;
3210 
3211       case '>':
3212         if (matchCodeUnit('>')) {
3213           if (matchCodeUnit('>')) {
3214             simpleKind =
3215                 matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh;
3216           } else {
3217             simpleKind =
3218                 matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh;
3219           }
3220         } else {
3221           simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt;
3222         }
3223         break;
3224 
3225       case '*':
3226         if (matchCodeUnit('*')) {
3227           simpleKind =
3228               matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow;
3229         } else {
3230           simpleKind =
3231               matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul;
3232         }
3233         break;
3234 
3235       case '/':
3236         // Look for a single-line comment.
3237         if (matchCodeUnit('/')) {
3238           unit = getCodeUnit();
3239           if (unit == '@' || unit == '#') {
3240             bool shouldWarn = unit == '@';
3241             if (!getDirectives(false, shouldWarn)) {
3242               return false;
3243             }
3244           } else {
3245             // NOTE: |unit| may be EOF here.
3246             ungetCodeUnit(unit);
3247           }
3248 
3249           this->sourceUnits.consumeRestOfSingleLineComment();
3250           continue;
3251         }
3252 
3253         // Look for a multi-line comment.
3254         if (matchCodeUnit('*')) {
3255           TokenStreamAnyChars& anyChars = anyCharsAccess();
3256           unsigned linenoBefore = anyChars.lineno;
3257 
3258           do {
3259             int32_t unit = getCodeUnit();
3260             if (unit == EOF) {
3261               error(JSMSG_UNTERMINATED_COMMENT);
3262               return badToken();
3263             }
3264 
3265             if (unit == '*' && matchCodeUnit('/')) {
3266               break;
3267             }
3268 
3269             if (unit == '@' || unit == '#') {
3270               bool shouldWarn = unit == '@';
3271               if (!getDirectives(true, shouldWarn)) {
3272                 return badToken();
3273               }
3274             } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3275               int32_t codePoint;
3276               if (!getFullAsciiCodePoint(unit, &codePoint)) {
3277                 return badToken();
3278               }
3279             } else {
3280               int32_t codePoint;
3281               if (!getNonAsciiCodePoint(unit, &codePoint)) {
3282                 return badToken();
3283               }
3284             }
3285           } while (true);
3286 
3287           if (linenoBefore != anyChars.lineno) {
3288             anyChars.updateFlagsForEOL();
3289           }
3290 
3291           continue;
3292         }
3293 
3294         // Look for a regexp.
3295         if (modifier == SlashIsRegExp) {
3296           return regexpLiteral(start, ttp);
3297         }
3298 
3299         simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div;
3300         break;
3301 
3302       case '%':
3303         simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod;
3304         break;
3305 
3306       case '-':
3307         if (matchCodeUnit('-')) {
3308           if (anyCharsAccess().options().allowHTMLComments &&
3309               !anyCharsAccess().flags.isDirtyLine) {
3310             if (matchCodeUnit('>')) {
3311               this->sourceUnits.consumeRestOfSingleLineComment();
3312               continue;
3313             }
3314           }
3315 
3316           simpleKind = TokenKind::Dec;
3317         } else {
3318           simpleKind =
3319               matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
3320         }
3321         break;
3322 
3323       default:
3324         // We consumed a bad ASCII code point/unit.  Put it back so the
3325         // error location is the bad code point.
3326         ungetCodeUnit(unit);
3327         reportIllegalCharacter(unit);
3328         return badToken();
3329     }  // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3330 
3331     MOZ_ASSERT(simpleKind != TokenKind::Limit,
3332                "switch-statement should have set |simpleKind| before "
3333                "breaking");
3334 
3335     newSimpleToken(simpleKind, start, modifier, ttp);
3336     return true;
3337   } while (true);
3338 }
3339 
3340 template <typename Unit, class AnyCharsAccess>
getStringOrTemplateToken(char untilChar,Modifier modifier,TokenKind * out)3341 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
3342     char untilChar, Modifier modifier, TokenKind* out) {
3343   MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`',
3344              "unexpected string/template literal delimiter");
3345 
3346   bool parsingTemplate = (untilChar == '`');
3347   bool templateHead = false;
3348 
3349   TokenStart start(this->sourceUnits, -1);
3350   this->charBuffer.clear();
3351 
3352   // Run the bad-token code for every path out of this function except the
3353   // one success-case.
3354   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
3355 
3356   auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) {
3357     // Unicode separators aren't end-of-line in template or (as of
3358     // recently) string literals, so this assertion doesn't allow them.
3359     MOZ_ASSERT(this->sourceUnits.atEnd() ||
3360                    this->sourceUnits.peekCodeUnit() == Unit('\r') ||
3361                    this->sourceUnits.peekCodeUnit() == Unit('\n'),
3362                "must be parked at EOF or EOL to call this function");
3363 
3364     // The various errors reported here include language like "in a ''
3365     // literal" or similar, with '' being '', "", or `` as appropriate.
3366     const char delimiters[] = {untilChar, untilChar, '\0'};
3367 
3368     this->error(errnum, delimiters);
3369     return;
3370   };
3371 
3372   // We need to detect any of these chars:  " or ', \n (or its
3373   // equivalents), \\, EOF.  Because we detect EOL sequences here and
3374   // put them back immediately, we can use getCodeUnit().
3375   int32_t unit;
3376   while ((unit = getCodeUnit()) != untilChar) {
3377     if (unit == EOF) {
3378       ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL);
3379       return false;
3380     }
3381 
3382     // Non-ASCII code points are always directly appended -- even
3383     // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are
3384     // ordinarily LineTerminatorSequences.  (They contribute their literal
3385     // values to template and [as of recently] string literals, but they're
3386     // line terminators when computing line/column coordinates.)  Handle
3387     // the non-ASCII case early for readability.
3388     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
3389       char32_t cp;
3390       if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
3391         return false;
3392       }
3393 
3394       if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
3395                        cp == unicode::PARA_SEPARATOR)) {
3396         if (!updateLineInfoForEOL()) {
3397           return false;
3398         }
3399 
3400         anyCharsAccess().updateFlagsForEOL();
3401       } else {
3402         MOZ_ASSERT(!IsLineTerminator(cp));
3403       }
3404 
3405       if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
3406         return false;
3407       }
3408 
3409       continue;
3410     }
3411 
3412     if (unit == '\\') {
3413       // When parsing templates, we don't immediately report errors for
3414       // invalid escapes; these are handled by the parser.  We don't
3415       // append to charBuffer in those cases because it won't be read.
3416       unit = getCodeUnit();
3417       if (unit == EOF) {
3418         ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3419         return false;
3420       }
3421 
3422       // Non-ASCII |unit| isn't handled by code after this, so dedicate
3423       // an unlikely special-case to it and then continue.
3424       if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
3425         int32_t codePoint;
3426         if (!getNonAsciiCodePoint(unit, &codePoint)) {
3427           return false;
3428         }
3429 
3430         // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH
3431         // SEPARATOR, they'll be normalized to '\n'.  '\' followed by
3432         // LineContinuation represents no code points, so don't append
3433         // in this case.
3434         if (codePoint != '\n') {
3435           if (!AppendCodePointToCharBuffer(this->charBuffer,
3436                                            AssertedCast<char32_t>(codePoint))) {
3437             return false;
3438           }
3439         }
3440 
3441         continue;
3442       }
3443 
3444       // The block above eliminated all non-ASCII, so cast to the
3445       // smallest type possible to assist the C++ compiler.
3446       switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3447         case 'b':
3448           unit = '\b';
3449           break;
3450         case 'f':
3451           unit = '\f';
3452           break;
3453         case 'n':
3454           unit = '\n';
3455           break;
3456         case 'r':
3457           unit = '\r';
3458           break;
3459         case 't':
3460           unit = '\t';
3461           break;
3462         case 'v':
3463           unit = '\v';
3464           break;
3465 
3466         case '\r':
3467           matchLineTerminator('\n');
3468           [[fallthrough]];
3469         case '\n': {
3470           // LineContinuation represents no code points.  We're manually
3471           // consuming a LineTerminatorSequence, so we must manually
3472           // update line/column info.
3473           if (!updateLineInfoForEOL()) {
3474             return false;
3475           }
3476 
3477           continue;
3478         }
3479 
3480         // Unicode character specification.
3481         case 'u': {
3482           int32_t c2 = getCodeUnit();
3483           if (c2 == EOF) {
3484             ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3485             return false;
3486           }
3487 
3488           // First handle a delimited Unicode escape, e.g. \u{1F4A9}.
3489           if (c2 == '{') {
3490             uint32_t start = this->sourceUnits.offset() - 3;
3491             uint32_t code = 0;
3492             bool first = true;
3493             bool valid = true;
3494             do {
3495               int32_t u3 = getCodeUnit();
3496               if (u3 == EOF) {
3497                 if (parsingTemplate) {
3498                   TokenStreamAnyChars& anyChars = anyCharsAccess();
3499                   anyChars.setInvalidTemplateEscape(start,
3500                                                     InvalidEscapeType::Unicode);
3501                   valid = false;
3502                   break;
3503                 }
3504                 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3505                 return false;
3506               }
3507               if (u3 == '}') {
3508                 if (first) {
3509                   if (parsingTemplate) {
3510                     TokenStreamAnyChars& anyChars = anyCharsAccess();
3511                     anyChars.setInvalidTemplateEscape(
3512                         start, InvalidEscapeType::Unicode);
3513                     valid = false;
3514                     break;
3515                   }
3516                   reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3517                   return false;
3518                 }
3519                 break;
3520               }
3521 
3522               // Beware: |u3| may be a non-ASCII code point here; if
3523               // so it'll pass into this |if|-block.
3524               if (!IsAsciiHexDigit(u3)) {
3525                 if (parsingTemplate) {
3526                   // We put the code unit back so that we read it
3527                   // on the next pass, which matters if it was
3528                   // '`' or '\'.
3529                   ungetCodeUnit(u3);
3530 
3531                   TokenStreamAnyChars& anyChars = anyCharsAccess();
3532                   anyChars.setInvalidTemplateEscape(start,
3533                                                     InvalidEscapeType::Unicode);
3534                   valid = false;
3535                   break;
3536                 }
3537                 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3538                 return false;
3539               }
3540 
3541               code = (code << 4) | AsciiAlphanumericToNumber(u3);
3542               if (code > unicode::NonBMPMax) {
3543                 if (parsingTemplate) {
3544                   TokenStreamAnyChars& anyChars = anyCharsAccess();
3545                   anyChars.setInvalidTemplateEscape(
3546                       start + 3, InvalidEscapeType::UnicodeOverflow);
3547                   valid = false;
3548                   break;
3549                 }
3550                 reportInvalidEscapeError(start + 3,
3551                                          InvalidEscapeType::UnicodeOverflow);
3552                 return false;
3553               }
3554 
3555               first = false;
3556             } while (true);
3557 
3558             if (!valid) {
3559               continue;
3560             }
3561 
3562             MOZ_ASSERT(code <= unicode::NonBMPMax);
3563             if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
3564               return false;
3565             }
3566 
3567             continue;
3568           }  // end of delimited Unicode escape handling
3569 
3570           // Otherwise it must be a fixed-length \uXXXX Unicode escape.
3571           // If it isn't, this is usually an error -- but if this is a
3572           // template literal, we must defer error reporting because
3573           // malformed escapes are okay in *tagged* template literals.
3574           char16_t v;
3575           if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) {
3576             unit = (AsciiAlphanumericToNumber(c2) << 12) | v;
3577           } else {
3578             // Beware: |c2| may not be an ASCII code point here!
3579             ungetCodeUnit(c2);
3580             uint32_t start = this->sourceUnits.offset() - 2;
3581             if (parsingTemplate) {
3582               TokenStreamAnyChars& anyChars = anyCharsAccess();
3583               anyChars.setInvalidTemplateEscape(start,
3584                                                 InvalidEscapeType::Unicode);
3585               continue;
3586             }
3587             reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3588             return false;
3589           }
3590           break;
3591         }  // case 'u'
3592 
3593         // Hexadecimal character specification.
3594         case 'x': {
3595           char16_t v;
3596           if (this->sourceUnits.matchHexDigits(2, &v)) {
3597             unit = v;
3598           } else {
3599             uint32_t start = this->sourceUnits.offset() - 2;
3600             if (parsingTemplate) {
3601               TokenStreamAnyChars& anyChars = anyCharsAccess();
3602               anyChars.setInvalidTemplateEscape(start,
3603                                                 InvalidEscapeType::Hexadecimal);
3604               continue;
3605             }
3606             reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal);
3607             return false;
3608           }
3609           break;
3610         }
3611 
3612         default: {
3613           if (!IsAsciiOctal(unit)) {
3614             // \8 or \9 in an untagged template literal is a syntax error,
3615             // reported in GeneralParser::noSubstitutionUntaggedTemplate.
3616             //
3617             // Tagged template literals, however, may contain \8 and \9.  The
3618             // "cooked" representation of such a part will be |undefined|, and
3619             // the "raw" representation will contain the literal characters.
3620             //
3621             //   function f(parts) {
3622             //     assertEq(parts[0], undefined);
3623             //     assertEq(parts.raw[0], "\\8");
3624             //     return "composed";
3625             //   }
3626             //   assertEq(f`\8`, "composed");
3627             if (unit == '8' || unit == '9') {
3628               TokenStreamAnyChars& anyChars = anyCharsAccess();
3629               if (parsingTemplate) {
3630                 anyChars.setInvalidTemplateEscape(
3631                     this->sourceUnits.offset() - 2,
3632                     InvalidEscapeType::EightOrNine);
3633                 continue;
3634               }
3635 
3636               // \8 and \9 are forbidden in string literals in strict mode code.
3637               if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) {
3638                 return false;
3639               }
3640 
3641               // The above test doesn't catch a few edge cases; see
3642               // |GeneralParser::maybeParseDirective|.  Record the violation so
3643               // that that function can handle them.
3644               anyChars.setSawDeprecatedEightOrNineEscape();
3645             }
3646             break;
3647           }
3648 
3649           // Octal character specification.
3650           int32_t val = AsciiOctalToNumber(unit);
3651 
3652           unit = peekCodeUnit();
3653           if (MOZ_UNLIKELY(unit == EOF)) {
3654             ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3655             return false;
3656           }
3657 
3658           // Strict mode code allows only \0 followed by a non-digit.
3659           if (val != 0 || IsAsciiDigit(unit)) {
3660             TokenStreamAnyChars& anyChars = anyCharsAccess();
3661             if (parsingTemplate) {
3662               anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2,
3663                                                 InvalidEscapeType::Octal);
3664               continue;
3665             }
3666 
3667             if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) {
3668               return false;
3669             }
3670 
3671             // The above test doesn't catch a few edge cases; see
3672             // |GeneralParser::maybeParseDirective|.  Record the violation so
3673             // that that function can handle them.
3674             anyChars.setSawDeprecatedOctalEscape();
3675           }
3676 
3677           if (IsAsciiOctal(unit)) {
3678             val = 8 * val + AsciiOctalToNumber(unit);
3679             consumeKnownCodeUnit(unit);
3680 
3681             unit = peekCodeUnit();
3682             if (MOZ_UNLIKELY(unit == EOF)) {
3683               ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3684               return false;
3685             }
3686 
3687             if (IsAsciiOctal(unit)) {
3688               int32_t save = val;
3689               val = 8 * val + AsciiOctalToNumber(unit);
3690               if (val <= 0xFF) {
3691                 consumeKnownCodeUnit(unit);
3692               } else {
3693                 val = save;
3694               }
3695             }
3696           }
3697 
3698           unit = char16_t(val);
3699           break;
3700         }  // default
3701       }    // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3702 
3703       if (!this->charBuffer.append(unit)) {
3704         return false;
3705       }
3706 
3707       continue;
3708     }  // (unit == '\\')
3709 
3710     if (unit == '\r' || unit == '\n') {
3711       if (!parsingTemplate) {
3712         // String literals don't allow ASCII line breaks.
3713         ungetCodeUnit(unit);
3714         ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING);
3715         return false;
3716       }
3717 
3718       if (unit == '\r') {
3719         unit = '\n';
3720         matchLineTerminator('\n');
3721       }
3722 
3723       if (!updateLineInfoForEOL()) {
3724         return false;
3725       }
3726 
3727       anyCharsAccess().updateFlagsForEOL();
3728     } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) {
3729       templateHead = true;
3730       break;
3731     }
3732 
3733     if (!this->charBuffer.append(unit)) {
3734       return false;
3735     }
3736   }
3737 
3738   TaggedParserAtomIndex atom = drainCharBufferIntoAtom();
3739   if (!atom) {
3740     return false;
3741   }
3742 
3743   noteBadToken.release();
3744 
3745   MOZ_ASSERT_IF(!parsingTemplate, !templateHead);
3746 
3747   TokenKind kind = !parsingTemplate ? TokenKind::String
3748                    : templateHead   ? TokenKind::TemplateHead
3749                                     : TokenKind::NoSubsTemplate;
3750   newAtomToken(kind, atom, start, modifier, out);
3751   return true;
3752 }
3753 
TokenKindToDesc(TokenKind tt)3754 const char* TokenKindToDesc(TokenKind tt) {
3755   switch (tt) {
3756 #define EMIT_CASE(name, desc) \
3757   case TokenKind::name:       \
3758     return desc;
3759     FOR_EACH_TOKEN_KIND(EMIT_CASE)
3760 #undef EMIT_CASE
3761     case TokenKind::Limit:
3762       MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
3763       break;
3764   }
3765 
3766   return "<bad TokenKind>";
3767 }
3768 
#ifdef DEBUG
// Debug-only helper: return the enumerator spelling of |tt| as a string of
// the form "TokenKind::<name>".
//
// FOR_EACH_TOKEN_KIND expands EMIT_CASE once per (name, desc) pair; |desc| is
// unused here and |name| is stringified.  TokenKind::Limit, and any value
// not matched by the switch, yields "<bad TokenKind>".
const char* TokenKindToString(TokenKind tt) {
  switch (tt) {
#  define EMIT_CASE(name, desc) \
    case TokenKind::name:       \
      return "TokenKind::" #name;
    FOR_EACH_TOKEN_KIND(EMIT_CASE)
#  undef EMIT_CASE
    case TokenKind::Limit:
      break;
  }

  return "<bad TokenKind>";
}
#endif
3784 
3785 template class TokenStreamCharsBase<Utf8Unit>;
3786 template class TokenStreamCharsBase<char16_t>;
3787 
3788 template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
3789 template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
3790 template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>;
3791 
3792 template class GeneralTokenStreamChars<
3793     Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
3794 template class GeneralTokenStreamChars<
3795     Utf8Unit,
3796     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
3797 template class GeneralTokenStreamChars<
3798     char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
3799 template class GeneralTokenStreamChars<
3800     char16_t,
3801     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3802 
3803 template class TokenStreamChars<
3804     Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
3805 template class TokenStreamChars<
3806     Utf8Unit,
3807     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
3808 template class TokenStreamChars<
3809     char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
3810 template class TokenStreamChars<
3811     char16_t,
3812     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3813 
3814 template class TokenStreamSpecific<
3815     Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
3816 template class TokenStreamSpecific<
3817     Utf8Unit,
3818     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
3819 template class TokenStreamSpecific<
3820     char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
3821 template class TokenStreamSpecific<
3822     char16_t,
3823     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3824 
3825 }  // namespace frontend
3826 
3827 }  // namespace js
3828