1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 // JS lexical scanner.
8 
9 #include "frontend/TokenStream.h"
10 
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/Attributes.h"
13 #include "mozilla/IntegerTypeTraits.h"
14 #include "mozilla/Likely.h"
15 #include "mozilla/Maybe.h"
16 #include "mozilla/MemoryChecking.h"
17 #include "mozilla/ScopeExit.h"
18 #include "mozilla/Span.h"
19 #include "mozilla/TemplateLib.h"
20 #include "mozilla/TextUtils.h"
21 #include "mozilla/Utf8.h"
22 
23 #include <algorithm>
24 #include <iterator>
25 #include <stdarg.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <type_traits>
30 #include <utility>
31 
32 #include "jsexn.h"
33 #include "jsnum.h"
34 
35 #include "frontend/BytecodeCompiler.h"
36 #include "frontend/Parser.h"
37 #include "frontend/ParserAtom.h"
38 #include "frontend/ReservedWords.h"
39 #include "js/CharacterEncoding.h"
40 #include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
41 #include "js/Printf.h"                // JS_smprintf
42 #include "js/RegExpFlags.h"           // JS::RegExpFlags
43 #include "js/UniquePtr.h"
44 #include "util/StringBuffer.h"
45 #include "util/Text.h"
46 #include "util/Unicode.h"
47 #include "vm/FrameIter.h"  // js::{,NonBuiltin}FrameIter
48 #include "vm/HelperThreads.h"
49 #include "vm/JSAtom.h"
50 #include "vm/JSContext.h"
51 #include "vm/Realm.h"
52 #include "vm/WellKnownAtom.h"  // js_*_str
53 
54 using mozilla::AsciiAlphanumericToNumber;
55 using mozilla::AssertedCast;
56 using mozilla::DecodeOneUtf8CodePoint;
57 using mozilla::IsAscii;
58 using mozilla::IsAsciiAlpha;
59 using mozilla::IsAsciiDigit;
60 using mozilla::IsAsciiHexDigit;
61 using mozilla::IsTrailingUnit;
62 using mozilla::MakeScopeExit;
63 using mozilla::Maybe;
64 using mozilla::PointerRangeSize;
65 using mozilla::Span;
66 using mozilla::Utf8Unit;
67 
68 using JS::ReadOnlyCompileOptions;
69 using JS::RegExpFlag;
70 using JS::RegExpFlags;
71 
// Associates the text of one reserved word with a TokenKind.  Instances are
// built by aggregate initialization, so the order of the fields matters.
struct ReservedWordInfo {
  const char* chars;  // C string with reserved word text
  js::frontend::TokenKind tokentype;  // token kind paired with |chars|
};
76 
77 static const ReservedWordInfo reservedWords[] = {
78 #define RESERVED_WORD_INFO(word, name, type) \
79   {js_##word##_str, js::frontend::type},
80     FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
81 #undef RESERVED_WORD_INFO
82 };
83 
84 enum class ReservedWordsIndex : size_t {
85 #define ENTRY_(_1, NAME, _3) NAME,
86   FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
87 #undef ENTRY_
88 };
89 
90 // Returns a ReservedWordInfo for the specified characters, or nullptr if the
91 // string is not a reserved word.
92 template <typename CharT>
FindReservedWord(const CharT * s,size_t length)93 static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
94   MOZ_ASSERT(length != 0);
95 
96   size_t i;
97   const ReservedWordInfo* rw;
98   const char* chars;
99 
100 #define JSRW_LENGTH() length
101 #define JSRW_AT(column) s[column]
102 #define JSRW_GOT_MATCH(index) \
103   i = (index);                \
104   goto got_match;
105 #define JSRW_TEST_GUESS(index) \
106   i = (index);                 \
107   goto test_guess;
108 #define JSRW_NO_MATCH() goto no_match;
109 #include "frontend/ReservedWordsGenerated.h"
110 #undef JSRW_NO_MATCH
111 #undef JSRW_TEST_GUESS
112 #undef JSRW_GOT_MATCH
113 #undef JSRW_AT
114 #undef JSRW_LENGTH
115 
116 got_match:
117   return &reservedWords[i];
118 
119 test_guess:
120   rw = &reservedWords[i];
121   chars = rw->chars;
122   do {
123     if (*s++ != static_cast<unsigned char>(*chars++)) {
124       goto no_match;
125     }
126   } while (--length != 0);
127   return rw;
128 
129 no_match:
130   return nullptr;
131 }
132 
133 template <>
FindReservedWord(const Utf8Unit * units,size_t length)134 MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
135     const Utf8Unit* units, size_t length) {
136   return FindReservedWord(Utf8AsUnsignedChars(units), length);
137 }
138 
FindReservedWord(const js::frontend::TaggedParserAtomIndex atom)139 static const ReservedWordInfo* FindReservedWord(
140     const js::frontend::TaggedParserAtomIndex atom) {
141   switch (atom.rawData()) {
142 #define CASE_(_1, NAME, _3)                                           \
143   case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
144     return &reservedWords[size_t(ReservedWordsIndex::NAME)];
145     FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
146 #undef CASE_
147   }
148 
149   return nullptr;
150 }
151 
GetSingleCodePoint(const char16_t ** p,const char16_t * end)152 static uint32_t GetSingleCodePoint(const char16_t** p, const char16_t* end) {
153   using namespace js;
154 
155   uint32_t codePoint;
156   if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(**p)) && *p + 1 < end) {
157     char16_t lead = **p;
158     char16_t maybeTrail = *(*p + 1);
159     if (unicode::IsTrailSurrogate(maybeTrail)) {
160       *p += 2;
161       return unicode::UTF16Decode(lead, maybeTrail);
162     }
163   }
164 
165   codePoint = **p;
166   (*p)++;
167   return codePoint;
168 }
169 
// True iff |c| is the ASCII digit '0' or '1'.  The unit is compared as an
// unsigned value so that negative |char| inputs can never match.
template <typename CharT>
static constexpr bool IsAsciiBinary(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  if (unit == '0') {
    return true;
  }
  return unit == '1';
}
176 
// True iff |c| is one of the ASCII digits '0' through '7'.  The unit is
// compared as an unsigned value.
template <typename CharT>
static constexpr bool IsAsciiOctal(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return !(unit < '0' || unit > '7');
}
183 
// Converts the digit character |c| to its numeric value by subtracting '0'.
// No validation is performed here.
template <typename CharT>
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return static_cast<uint8_t>(unit - '0');
}
190 
191 namespace js {
192 
193 namespace frontend {
194 
IsIdentifier(JSLinearString * str)195 bool IsIdentifier(JSLinearString* str) {
196   JS::AutoCheckCannotGC nogc;
197   MOZ_ASSERT(str);
198   if (str->hasLatin1Chars()) {
199     return IsIdentifier(str->latin1Chars(nogc), str->length());
200   }
201   return IsIdentifier(str->twoByteChars(nogc), str->length());
202 }
203 
IsIdentifierNameOrPrivateName(JSLinearString * str)204 bool IsIdentifierNameOrPrivateName(JSLinearString* str) {
205   JS::AutoCheckCannotGC nogc;
206   MOZ_ASSERT(str);
207   if (str->hasLatin1Chars()) {
208     return IsIdentifierNameOrPrivateName(str->latin1Chars(nogc), str->length());
209   }
210   return IsIdentifierNameOrPrivateName(str->twoByteChars(nogc), str->length());
211 }
212 
IsIdentifier(const Latin1Char * chars,size_t length)213 bool IsIdentifier(const Latin1Char* chars, size_t length) {
214   if (length == 0) {
215     return false;
216   }
217 
218   if (!unicode::IsIdentifierStart(char16_t(*chars))) {
219     return false;
220   }
221 
222   const Latin1Char* end = chars + length;
223   while (++chars != end) {
224     if (!unicode::IsIdentifierPart(char16_t(*chars))) {
225       return false;
226     }
227   }
228 
229   return true;
230 }
231 
IsIdentifierASCII(char c)232 bool IsIdentifierASCII(char c) { return unicode::IsIdentifierStartASCII(c); }
233 
IsIdentifierASCII(char c1,char c2)234 bool IsIdentifierASCII(char c1, char c2) {
235   return unicode::IsIdentifierStartASCII(c1) &&
236          unicode::IsIdentifierPartASCII(c2);
237 }
238 
IsIdentifierNameOrPrivateName(const Latin1Char * chars,size_t length)239 bool IsIdentifierNameOrPrivateName(const Latin1Char* chars, size_t length) {
240   if (length == 0) {
241     return false;
242   }
243 
244   // Skip over any private name marker.
245   if (*chars == '#') {
246     ++chars;
247     --length;
248   }
249 
250   return IsIdentifier(chars, length);
251 }
252 
IsIdentifier(const char16_t * chars,size_t length)253 bool IsIdentifier(const char16_t* chars, size_t length) {
254   if (length == 0) {
255     return false;
256   }
257 
258   const char16_t* p = chars;
259   const char16_t* end = chars + length;
260   uint32_t codePoint;
261 
262   codePoint = GetSingleCodePoint(&p, end);
263   if (!unicode::IsIdentifierStart(codePoint)) {
264     return false;
265   }
266 
267   while (p < end) {
268     codePoint = GetSingleCodePoint(&p, end);
269     if (!unicode::IsIdentifierPart(codePoint)) {
270       return false;
271     }
272   }
273 
274   return true;
275 }
276 
IsIdentifierNameOrPrivateName(const char16_t * chars,size_t length)277 bool IsIdentifierNameOrPrivateName(const char16_t* chars, size_t length) {
278   if (length == 0) {
279     return false;
280   }
281 
282   const char16_t* p = chars;
283   const char16_t* end = chars + length;
284   uint32_t codePoint;
285 
286   codePoint = GetSingleCodePoint(&p, end);
287 
288   // Skip over any private name marker.
289   if (codePoint == '#') {
290     // The identifier part of a private name mustn't be empty.
291     if (length == 1) {
292       return false;
293     }
294 
295     codePoint = GetSingleCodePoint(&p, end);
296   }
297 
298   if (!unicode::IsIdentifierStart(codePoint)) {
299     return false;
300   }
301 
302   while (p < end) {
303     codePoint = GetSingleCodePoint(&p, end);
304     if (!unicode::IsIdentifierPart(codePoint)) {
305       return false;
306     }
307   }
308 
309   return true;
310 }
311 
IsKeyword(TaggedParserAtomIndex atom)312 bool IsKeyword(TaggedParserAtomIndex atom) {
313   if (const ReservedWordInfo* rw = FindReservedWord(atom)) {
314     return TokenKindIsKeyword(rw->tokentype);
315   }
316 
317   return false;
318 }
319 
ReservedWordTokenKind(TaggedParserAtomIndex name)320 TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
321   if (const ReservedWordInfo* rw = FindReservedWord(name)) {
322     return rw->tokentype;
323   }
324 
325   return TokenKind::Limit;
326 }
327 
ReservedWordToCharZ(TaggedParserAtomIndex name)328 const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
329   if (const ReservedWordInfo* rw = FindReservedWord(name)) {
330     return ReservedWordToCharZ(rw->tokentype);
331   }
332 
333   return nullptr;
334 }
335 
ReservedWordToCharZ(TokenKind tt)336 const char* ReservedWordToCharZ(TokenKind tt) {
337   MOZ_ASSERT(tt != TokenKind::Name);
338   switch (tt) {
339 #define EMIT_CASE(word, name, type) \
340   case type:                        \
341     return js_##word##_str;
342     FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
343 #undef EMIT_CASE
344     default:
345       MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
346   }
347   return nullptr;
348 }
349 
reservedWordToPropertyName(TokenKind tt) const350 TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(
351     TokenKind tt) const {
352   MOZ_ASSERT(tt != TokenKind::Name);
353   switch (tt) {
354 #define EMIT_CASE(word, name, type) \
355   case type:                        \
356     return TaggedParserAtomIndex::WellKnown::name();
357     FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
358 #undef EMIT_CASE
359     default:
360       MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
361   }
362   return TaggedParserAtomIndex::null();
363 }
364 
SourceCoords(JSContext * cx,uint32_t initialLineNumber,uint32_t initialOffset)365 SourceCoords::SourceCoords(JSContext* cx, uint32_t initialLineNumber,
366                            uint32_t initialOffset)
367     : lineStartOffsets_(cx), initialLineNum_(initialLineNumber), lastIndex_(0) {
368   // This is actually necessary!  Removing it causes compile errors on
369   // GCC and clang.  You could try declaring this:
370   //
371   //   const uint32_t SourceCoords::MAX_PTR;
372   //
373   // which fixes the GCC/clang error, but causes bustage on Windows.  Sigh.
374   //
375   uint32_t maxPtr = MAX_PTR;
376 
377   // The first line begins at buffer offset |initialOffset|.  MAX_PTR is the
378   // sentinel.  The appends cannot fail because |lineStartOffsets_| has
379   // statically-allocated elements.
380   MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
381   MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
382   lineStartOffsets_.infallibleAppend(initialOffset);
383   lineStartOffsets_.infallibleAppend(maxPtr);
384 }
385 
add(uint32_t lineNum,uint32_t lineStartOffset)386 MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
387                                          uint32_t lineStartOffset) {
388   uint32_t index = indexFromLineNumber(lineNum);
389   uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
390 
391   MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
392   MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);
393 
394   if (index == sentinelIndex) {
395     // We haven't seen this newline before.  Update lineStartOffsets_
396     // only if lineStartOffsets_.append succeeds, to keep sentinel.
397     // Otherwise return false to tell TokenStream about OOM.
398     uint32_t maxPtr = MAX_PTR;
399     if (!lineStartOffsets_.append(maxPtr)) {
400       static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
401                                    TempAllocPolicy&>,
402                     "this function's caller depends on it reporting an "
403                     "error on failure, as TempAllocPolicy ensures");
404       return false;
405     }
406 
407     lineStartOffsets_[index] = lineStartOffset;
408   } else {
409     // We have seen this newline before (and ungot it).  Do nothing (other
410     // than checking it hasn't mysteriously changed).
411     // This path can be executed after hitting OOM, so check index.
412     MOZ_ASSERT_IF(index < sentinelIndex,
413                   lineStartOffsets_[index] == lineStartOffset);
414   }
415   return true;
416 }
417 
fill(const SourceCoords & other)418 MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
419   MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
420   MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
421   MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);
422 
423   if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) {
424     return true;
425   }
426 
427   uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
428   lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];
429 
430   for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
431        i++) {
432     if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
433       return false;
434     }
435   }
436   return true;
437 }
438 
439 MOZ_ALWAYS_INLINE uint32_t
indexFromOffset(uint32_t offset) const440 SourceCoords::indexFromOffset(uint32_t offset) const {
441   uint32_t iMin, iMax, iMid;
442 
443   if (lineStartOffsets_[lastIndex_] <= offset) {
444     // If we reach here, offset is on a line the same as or higher than
445     // last time.  Check first for the +0, +1, +2 cases, because they
446     // typically cover 85--98% of cases.
447     if (offset < lineStartOffsets_[lastIndex_ + 1]) {
448       return lastIndex_;  // index is same as last time
449     }
450 
451     // If we reach here, there must be at least one more entry (plus the
452     // sentinel).  Try it.
453     lastIndex_++;
454     if (offset < lineStartOffsets_[lastIndex_ + 1]) {
455       return lastIndex_;  // index is one higher than last time
456     }
457 
458     // The same logic applies here.
459     lastIndex_++;
460     if (offset < lineStartOffsets_[lastIndex_ + 1]) {
461       return lastIndex_;  // index is two higher than last time
462     }
463 
464     // No luck.  Oh well, we have a better-than-default starting point for
465     // the binary search.
466     iMin = lastIndex_ + 1;
467     MOZ_ASSERT(iMin <
468                lineStartOffsets_.length() - 1);  // -1 due to the sentinel
469 
470   } else {
471     iMin = 0;
472   }
473 
474   // This is a binary search with deferred detection of equality, which was
475   // marginally faster in this case than a standard binary search.
476   // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
477   // want one before that.
478   iMax = lineStartOffsets_.length() - 2;
479   while (iMax > iMin) {
480     iMid = iMin + (iMax - iMin) / 2;
481     if (offset >= lineStartOffsets_[iMid + 1]) {
482       iMin = iMid + 1;  // offset is above lineStartOffsets_[iMid]
483     } else {
484       iMax = iMid;  // offset is below or within lineStartOffsets_[iMid]
485     }
486   }
487 
488   MOZ_ASSERT(iMax == iMin);
489   MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
490   MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
491 
492   lastIndex_ = iMin;
493   return iMin;
494 }
495 
lineToken(uint32_t offset) const496 SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
497   return LineToken(indexFromOffset(offset), offset);
498 }
499 
TokenStreamAnyChars(JSContext * cx,const ReadOnlyCompileOptions & options,StrictModeGetter * smg)500 TokenStreamAnyChars::TokenStreamAnyChars(JSContext* cx,
501                                          const ReadOnlyCompileOptions& options,
502                                          StrictModeGetter* smg)
503     : cx(cx),
504       options_(options),
505       strictModeGetter_(smg),
506       filename_(options.filename()),
507       longLineColumnInfo_(cx),
508       srcCoords(cx, options.lineno, options.scriptSourceOffset),
509       lineno(options.lineno),
510       mutedErrors(options.mutedErrors()) {
511   // |isExprEnding| was initially zeroed: overwrite the true entries here.
512   isExprEnding[size_t(TokenKind::Comma)] = true;
513   isExprEnding[size_t(TokenKind::Semi)] = true;
514   isExprEnding[size_t(TokenKind::Colon)] = true;
515   isExprEnding[size_t(TokenKind::RightParen)] = true;
516   isExprEnding[size_t(TokenKind::RightBracket)] = true;
517   isExprEnding[size_t(TokenKind::RightCurly)] = true;
518 }
519 
// Forwards |cx| and the parser atoms table to TokenStreamCharsShared, and
// initializes |sourceUnits| from the |length| units at |units| together with
// |startOffset|.
template <typename Unit>
TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
                                                 ParserAtomsTable* pasrerAtoms,
                                                 const Unit* units,
                                                 size_t length,
                                                 size_t startOffset)
    : TokenStreamCharsShared(cx, pasrerAtoms),
      sourceUnits(units, length, startOffset) {}
528 
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const char16_t * cur,const char16_t * end)529 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
530                                                         const char16_t* cur,
531                                                         const char16_t* end) {
532   MOZ_ASSERT(charBuffer.length() == 0);
533 
534   while (cur < end) {
535     char16_t ch = *cur++;
536     if (ch == '\r') {
537       ch = '\n';
538       if (cur < end && *cur == '\n') {
539         cur++;
540       }
541     }
542 
543     if (!charBuffer.append(ch)) {
544       return false;
545     }
546   }
547 
548   MOZ_ASSERT(cur == end);
549   return true;
550 }
551 
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const Utf8Unit * cur,const Utf8Unit * end)552 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
553                                                         const Utf8Unit* cur,
554                                                         const Utf8Unit* end) {
555   MOZ_ASSERT(charBuffer.length() == 0);
556 
557   while (cur < end) {
558     Utf8Unit unit = *cur++;
559     if (MOZ_LIKELY(IsAscii(unit))) {
560       char16_t ch = unit.toUint8();
561       if (ch == '\r') {
562         ch = '\n';
563         if (cur < end && *cur == Utf8Unit('\n')) {
564           cur++;
565         }
566       }
567 
568       if (!charBuffer.append(ch)) {
569         return false;
570       }
571 
572       continue;
573     }
574 
575     Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end);
576     MOZ_ASSERT(ch.isSome(),
577                "provided source text should already have been validated");
578 
579     if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
580       return false;
581     }
582   }
583 
584   MOZ_ASSERT(cur == end);
585   return true;
586 }
587 
// Forwards everything to TokenStreamChars, passing
// |options.scriptSourceOffset| as the final base-class argument.
template <typename Unit, class AnyCharsAccess>
TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
    JSContext* cx, ParserAtomsTable* pasrerAtoms,
    const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
    : TokenStreamChars<Unit, AnyCharsAccess>(cx, pasrerAtoms, units, length,
                                             options.scriptSourceOffset) {}
594 
checkOptions()595 bool TokenStreamAnyChars::checkOptions() {
596   // Constrain starting columns to where they will saturate.
597   if (options().column > ColumnLimit) {
598     reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
599     return false;
600   }
601 
602   return true;
603 }
604 
reportErrorNoOffset(unsigned errorNumber,...)605 void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) {
606   va_list args;
607   va_start(args, errorNumber);
608 
609   reportErrorNoOffsetVA(errorNumber, &args);
610 
611   va_end(args);
612 }
613 
reportErrorNoOffsetVA(unsigned errorNumber,va_list * args)614 void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
615                                                 va_list* args) {
616   ErrorMetadata metadata;
617   computeErrorMetadataNoOffset(&metadata);
618 
619   ReportCompileErrorLatin1(cx, std::move(metadata), nullptr, errorNumber, args);
620 }
621 
622 [[nodiscard]] MOZ_ALWAYS_INLINE bool
internalUpdateLineInfoForEOL(uint32_t lineStartOffset)623 TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
624   prevLinebase = linebase;
625   linebase = lineStartOffset;
626   lineno++;
627 
628   // On overflow, report error.
629   if (MOZ_UNLIKELY(!lineno)) {
630     reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
631     return false;
632   }
633 
634   return srcCoords.add(lineno, linebase);
635 }
636 
637 #ifdef DEBUG
638 
639 template <>
assertNextCodePoint(const PeekedCodePoint<char16_t> & peeked)640 inline void SourceUnits<char16_t>::assertNextCodePoint(
641     const PeekedCodePoint<char16_t>& peeked) {
642   char32_t c = peeked.codePoint();
643   if (c < unicode::NonBMPMin) {
644     MOZ_ASSERT(peeked.lengthInUnits() == 1);
645     MOZ_ASSERT(ptr[0] == c);
646   } else {
647     MOZ_ASSERT(peeked.lengthInUnits() == 2);
648     char16_t lead, trail;
649     unicode::UTF16Encode(c, &lead, &trail);
650     MOZ_ASSERT(ptr[0] == lead);
651     MOZ_ASSERT(ptr[1] == trail);
652   }
653 }
654 
655 template <>
assertNextCodePoint(const PeekedCodePoint<Utf8Unit> & peeked)656 inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
657     const PeekedCodePoint<Utf8Unit>& peeked) {
658   char32_t c = peeked.codePoint();
659 
660   // This is all roughly indulgence of paranoia only for assertions, so the
661   // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
662   uint8_t expectedUnits[4] = {};
663   if (c < 0x80) {
664     expectedUnits[0] = AssertedCast<uint8_t>(c);
665   } else if (c < 0x800) {
666     expectedUnits[0] = 0b1100'0000 | (c >> 6);
667     expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
668   } else if (c < 0x10000) {
669     expectedUnits[0] = 0b1110'0000 | (c >> 12);
670     expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
671     expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
672   } else {
673     expectedUnits[0] = 0b1111'0000 | (c >> 18);
674     expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
675     expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
676     expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
677   }
678 
679   MOZ_ASSERT(peeked.lengthInUnits() <= 4);
680   for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
681     MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
682   }
683 }
684 
685 #endif  // DEBUG
686 
RetractPointerToCodePointBoundary(const Utf8Unit ** ptr,const Utf8Unit * limit)687 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
688     const Utf8Unit** ptr, const Utf8Unit* limit) {
689   MOZ_ASSERT(*ptr <= limit);
690 
691   // |limit| is a code point boundary.
692   if (MOZ_UNLIKELY(*ptr == limit)) {
693     return;
694   }
695 
696   // Otherwise rewind past trailing units to the start of the code point.
697 #ifdef DEBUG
698   size_t retracted = 0;
699 #endif
700   while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
701     --*ptr;
702 #ifdef DEBUG
703     retracted++;
704 #endif
705   }
706 
707   MOZ_ASSERT(retracted < 4,
708              "the longest UTF-8 code point is four units, so this should never "
709              "retract more than three units");
710 }
711 
RetractPointerToCodePointBoundary(const char16_t ** ptr,const char16_t * limit)712 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
713     const char16_t** ptr, const char16_t* limit) {
714   MOZ_ASSERT(*ptr <= limit);
715 
716   // |limit| is a code point boundary.
717   if (MOZ_UNLIKELY(*ptr == limit)) {
718     return;
719   }
720 
721   // Otherwise the pointer must be retracted by one iff it splits a two-unit
722   // code point.
723   if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) {
724     // Outside test suites testing garbage WTF-16, it's basically guaranteed
725     // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair.
726     if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
727       --*ptr;
728     }
729   }
730 }
731 
732 template <typename Unit>
computePartialColumn(const LineToken lineToken,const uint32_t offset,const SourceUnits<Unit> & sourceUnits) const733 uint32_t TokenStreamAnyChars::computePartialColumn(
734     const LineToken lineToken, const uint32_t offset,
735     const SourceUnits<Unit>& sourceUnits) const {
736   lineToken.assertConsistentOffset(offset);
737 
738   const uint32_t line = lineNumber(lineToken);
739   const uint32_t start = srcCoords.lineStart(lineToken);
740 
741   // Reset the previous offset/column cache for this line, if the previous
742   // lookup wasn't on this line.
743   if (line != lineOfLastColumnComputation_) {
744     lineOfLastColumnComputation_ = line;
745     lastChunkVectorForLine_ = nullptr;
746     lastOffsetOfComputedColumn_ = start;
747     lastComputedColumn_ = 0;
748   }
749 
750   // Compute and return the final column number from a partial offset/column,
751   // using the last-cached offset/column if they're more optimal.
752   auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
753                                                         uint32_t partialCols,
754                                                         UnitsType unitsType) {
755     MOZ_ASSERT(partialOffset <= offset);
756 
757     // If the last lookup on this line was closer to |offset|, use it.
758     if (partialOffset < this->lastOffsetOfComputedColumn_ &&
759         this->lastOffsetOfComputedColumn_ <= offset) {
760       partialOffset = this->lastOffsetOfComputedColumn_;
761       partialCols = this->lastComputedColumn_;
762     }
763 
764     const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
765     const Unit* end = sourceUnits.codeUnitPtrAt(offset);
766 
767     size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
768     partialOffset += offsetDelta;
769 
770     if (unitsType == UnitsType::GuaranteedSingleUnit) {
771       MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
772                  "guaranteed-single-units also guarantee pointer distance "
773                  "equals code point count");
774       partialCols += offsetDelta;
775     } else {
776       partialCols +=
777           AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
778     }
779 
780     this->lastOffsetOfComputedColumn_ = partialOffset;
781     this->lastComputedColumn_ = partialCols;
782     return partialCols;
783   };
784 
785   const uint32_t offsetInLine = offset - start;
786 
787   // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
788   // column has offset less than this value.  The most common (non-minified)
789   // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
790   // the next power of two for efficient division/multiplication below.
791   constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
792 
793   // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
794   const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
795   if (chunkIndex == 0) {
796     // We don't know from an |offset| in the zeroth chunk that this line is even
797     // long.  First-chunk info is mostly useless, anyway -- we have |start|
798     // already.  So if we have *easy* access to that zeroth chunk, use it --
799     // otherwise just count pessimally.  (This will still benefit from caching
800     // the last column/offset for computations for successive offsets, so it's
801     // not *always* worst-case.)
802     UnitsType unitsType;
803     if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
804       MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
805       unitsType = (*lastChunkVectorForLine_)[0].unitsType();
806     } else {
807       unitsType = UnitsType::PossiblyMultiUnit;
808     }
809 
810     return ColumnFromPartial(start, 0, unitsType);
811   }
812 
813   // If this line has no chunk vector yet, insert one in the hash map.  (The
814   // required index is allocated and filled further down.)
815   if (!lastChunkVectorForLine_) {
816     auto ptr = longLineColumnInfo_.lookupForAdd(line);
817     if (!ptr) {
818       // This could rehash and invalidate a cached vector pointer, but the outer
819       // condition means we don't have a cached pointer.
820       if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
821         // In case of OOM, just count columns from the start of the line.
822         cx->recoverFromOutOfMemory();
823         return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
824       }
825     }
826 
827     // Note that adding elements to this vector won't invalidate this pointer.
828     lastChunkVectorForLine_ = &ptr->value();
829   }
830 
831   const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
832 
833   auto RetractedOffsetOfChunk = [
834 #ifdef DEBUG
835                                     this,
836 #endif
837                                     start, limit,
838                                     &sourceUnits](uint32_t index) {
839     MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
840 
841     uint32_t naiveOffset = start + index * ColumnChunkLength;
842     const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
843 
844     const Unit* actualPtr = naivePtr;
845     RetractPointerToCodePointBoundary(&actualPtr, limit);
846 
847 #ifdef DEBUG
848     if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
849         UnitsType::GuaranteedSingleUnit) {
850       MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
851     }
852 #endif
853 
854     return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
855   };
856 
857   uint32_t partialOffset;
858   uint32_t partialColumn;
859   UnitsType unitsType;
860 
861   auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
862   if (chunkIndex < entriesLen) {
863     // We've computed the chunk |offset| resides in.  Compute the column number
864     // from the chunk.
865     partialOffset = RetractedOffsetOfChunk(chunkIndex);
866     partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
867 
868     // This is exact if |chunkIndex| isn't the last chunk.
869     unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
870 
871     // Otherwise the last chunk is pessimistically assumed to contain multi-unit
872     // code points because we haven't fully examined its contents yet -- they
873     // may not have been tokenized yet, they could contain encoding errors, or
874     // they might not even exist.
875     MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
876                   (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
877                       UnitsType::PossiblyMultiUnit);
878   } else {
879     // Extend the vector from its last entry or the start of the line.  (This is
880     // also a suitable partial start point if we must recover from OOM.)
881     if (entriesLen > 0) {
882       partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
883       partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
884     } else {
885       partialOffset = start;
886       partialColumn = 0;
887     }
888 
889     if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
890       // As earlier, just start from the greatest offset/column in case of OOM.
891       cx->recoverFromOutOfMemory();
892       return ColumnFromPartial(partialOffset, partialColumn,
893                                UnitsType::PossiblyMultiUnit);
894     }
895 
896     // OOM is no longer possible now.  \o/
897 
898     // The vector always begins with the column of the line start, i.e. zero,
899     // with chunk units pessimally assumed not single-unit.
900     if (entriesLen == 0) {
901       lastChunkVectorForLine_->infallibleAppend(
902           ChunkInfo(0, UnitsType::PossiblyMultiUnit));
903       entriesLen++;
904     }
905 
906     do {
907       const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
908       const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
909           start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
910 
911       MOZ_ASSERT(begin < chunkLimit);
912       MOZ_ASSERT(chunkLimit <= limit);
913 
914       static_assert(
915           ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
916           "any retraction below is assumed to never underflow to the "
917           "preceding chunk, even for the longest code point");
918 
919       // Prior tokenizing ensured that [begin, limit) is validly encoded, and
920       // |begin < chunkLimit|, so any retraction here can't underflow.
921       RetractPointerToCodePointBoundary(&chunkLimit, limit);
922 
923       MOZ_ASSERT(begin < chunkLimit);
924       MOZ_ASSERT(chunkLimit <= limit);
925 
926       size_t numUnits = PointerRangeSize(begin, chunkLimit);
927       size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
928 
929       // If this chunk (which will become non-final at the end of the loop) is
930       // all single-unit code points, annotate the chunk accordingly.
931       if (numUnits == numCodePoints) {
932         lastChunkVectorForLine_->back().guaranteeSingleUnits();
933       }
934 
935       partialOffset += numUnits;
936       partialColumn += numCodePoints;
937 
938       lastChunkVectorForLine_->infallibleEmplaceBack(
939           partialColumn, UnitsType::PossiblyMultiUnit);
940     } while (entriesLen < chunkIndex + 1);
941 
942     // We're at a spot in the current final chunk, and final chunks never have
943     // complete units information, so be pessimistic.
944     unitsType = UnitsType::PossiblyMultiUnit;
945   }
946 
947   return ColumnFromPartial(partialOffset, partialColumn, unitsType);
948 }
949 
950 template <typename Unit, class AnyCharsAccess>
computeColumn(LineToken lineToken,uint32_t offset) const951 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
952     LineToken lineToken, uint32_t offset) const {
953   lineToken.assertConsistentOffset(offset);
954 
955   const TokenStreamAnyChars& anyChars = anyCharsAccess();
956 
957   uint32_t column =
958       anyChars.computePartialColumn(lineToken, offset, this->sourceUnits);
959 
960   if (lineToken.isFirstLine()) {
961     if (column > ColumnLimit) {
962       return ColumnLimit;
963     }
964 
965     static_assert(uint32_t(ColumnLimit + ColumnLimit) > ColumnLimit,
966                   "Adding ColumnLimit should not overflow");
967 
968     uint32_t firstLineOffset = anyChars.options_.column;
969     column += firstLineOffset;
970   }
971 
972   if (column > ColumnLimit) {
973     return ColumnLimit;
974   }
975 
976   return column;
977 }
978 
979 template <typename Unit, class AnyCharsAccess>
computeLineAndColumn(uint32_t offset,uint32_t * line,uint32_t * column) const980 void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
981     uint32_t offset, uint32_t* line, uint32_t* column) const {
982   const TokenStreamAnyChars& anyChars = anyCharsAccess();
983 
984   auto lineToken = anyChars.lineToken(offset);
985   *line = anyChars.lineNumber(lineToken);
986   *column = computeColumn(lineToken, offset);
987 }
988 
989 template <class AnyCharsAccess>
internalEncodingError(uint8_t relevantUnits,unsigned errorNumber,...)990 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
991     uint8_t relevantUnits, unsigned errorNumber, ...) {
992   va_list args;
993   va_start(args, errorNumber);
994 
995   do {
996     size_t offset = this->sourceUnits.offset();
997 
998     ErrorMetadata err;
999 
1000     TokenStreamAnyChars& anyChars = anyCharsAccess();
1001 
1002     bool canAddLineOfContext = fillExceptingContext(&err, offset);
1003     if (canAddLineOfContext) {
1004       if (!internalComputeLineOfContext(&err, offset)) {
1005         break;
1006       }
1007 
1008       // As this is an encoding error, the computed window-end must be
1009       // identical to the location of the error -- any further on and the
1010       // window would contain invalid Unicode.
1011       MOZ_ASSERT_IF(err.lineOfContext != nullptr,
1012                     err.lineLength == err.tokenOffset);
1013     }
1014 
1015     auto notes = MakeUnique<JSErrorNotes>();
1016     if (!notes) {
1017       ReportOutOfMemory(anyChars.cx);
1018       break;
1019     }
1020 
1021     // The largest encoding of a UTF-8 code point is 4 units.  (Encoding an
1022     // obsolete 5- or 6-byte code point will complain only about a bad lead
1023     // code unit.)
1024     constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
1025 
1026     MOZ_ASSERT(relevantUnits > 0);
1027 
1028     char badUnitsStr[MaxWidth];
1029     char* ptr = badUnitsStr;
1030     while (relevantUnits > 0) {
1031       byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
1032       ptr[4] = ' ';
1033 
1034       ptr += 5;
1035       relevantUnits--;
1036     }
1037 
1038     ptr[-1] = '\0';
1039 
1040     uint32_t line, column;
1041     computeLineAndColumn(offset, &line, &column);
1042 
1043     if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), 0, line,
1044                              column, GetErrorMessage, nullptr,
1045                              JSMSG_BAD_CODE_UNITS, badUnitsStr)) {
1046       break;
1047     }
1048 
1049     ReportCompileErrorLatin1(anyChars.cx, std::move(err), std::move(notes),
1050                              errorNumber, &args);
1051   } while (false);
1052 
1053   va_end(args);
1054 }
1055 
1056 template <class AnyCharsAccess>
badLeadUnit(Utf8Unit lead)1057 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
1058     Utf8Unit lead) {
1059   uint8_t leadValue = lead.toUint8();
1060 
1061   char leadByteStr[5];
1062   byteToTerminatedString(leadValue, leadByteStr);
1063 
1064   internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
1065 }
1066 
1067 template <class AnyCharsAccess>
notEnoughUnits(Utf8Unit lead,uint8_t remaining,uint8_t required)1068 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
1069     Utf8Unit lead, uint8_t remaining, uint8_t required) {
1070   uint8_t leadValue = lead.toUint8();
1071 
1072   MOZ_ASSERT(required == 2 || required == 3 || required == 4);
1073   MOZ_ASSERT(remaining < 4);
1074   MOZ_ASSERT(remaining < required);
1075 
1076   char leadByteStr[5];
1077   byteToTerminatedString(leadValue, leadByteStr);
1078 
1079   // |toHexChar| produces the desired decimal numbers for values < 4.
1080   const char expectedStr[] = {toHexChar(required - 1), '\0'};
1081   const char actualStr[] = {toHexChar(remaining - 1), '\0'};
1082 
1083   internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
1084                         expectedStr, required == 2 ? "" : "s", actualStr,
1085                         remaining == 2 ? " was" : "s were");
1086 }
1087 
1088 template <class AnyCharsAccess>
badTrailingUnit(uint8_t unitsObserved)1089 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
1090     uint8_t unitsObserved) {
1091   Utf8Unit badUnit =
1092       this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];
1093 
1094   char badByteStr[5];
1095   byteToTerminatedString(badUnit.toUint8(), badByteStr);
1096 
1097   internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
1098                         badByteStr);
1099 }
1100 
1101 template <class AnyCharsAccess>
1102 MOZ_COLD void
badStructurallyValidCodePoint(uint32_t codePoint,uint8_t codePointLength,const char * reason)1103 TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
1104     uint32_t codePoint, uint8_t codePointLength, const char* reason) {
1105   // Construct a string like "0x203D" (including null terminator) to include
1106   // in the error message.  Write the string end-to-start from end to start
1107   // of an adequately sized |char| array, shifting least significant nibbles
1108   // off the number and writing the corresponding hex digits until done, then
1109   // prefixing with "0x".  |codePointStr| points at the incrementally
1110   // computed string, within |codePointCharsArray|'s bounds.
1111 
1112   // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
1113   // bits in a four-byte UTF-8 code unit sequence.
1114   constexpr size_t MaxHexSize = sizeof(
1115       "0x1F"
1116       "FFFF");  // including '\0'
1117   char codePointCharsArray[MaxHexSize];
1118 
1119   char* codePointStr = std::end(codePointCharsArray);
1120   *--codePointStr = '\0';
1121 
1122   // Note that by do-while looping here rather than while-looping, this
1123   // writes a '0' when |codePoint == 0|.
1124   do {
1125     MOZ_ASSERT(codePointCharsArray < codePointStr);
1126     *--codePointStr = toHexChar(codePoint & 0xF);
1127     codePoint >>= 4;
1128   } while (codePoint);
1129 
1130   MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
1131   *--codePointStr = 'x';
1132   *--codePointStr = '0';
1133 
1134   internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
1135                         codePointStr, reason);
1136 }
1137 
1138 template <class AnyCharsAccess>
1139 [[nodiscard]] bool
getNonAsciiCodePointDontNormalize(Utf8Unit lead,char32_t * codePoint)1140 TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
1141     Utf8Unit lead, char32_t* codePoint) {
1142   auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1143 
1144   auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
1145     this->notEnoughUnits(lead, remaining, required);
1146   };
1147 
1148   auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
1149     this->badTrailingUnit(unitsObserved);
1150   };
1151 
1152   auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
1153     this->badCodePoint(badCodePoint, unitsObserved);
1154   };
1155 
1156   auto onNotShortestForm = [this](char32_t badCodePoint,
1157                                   uint8_t unitsObserved) {
1158     this->notShortestForm(badCodePoint, unitsObserved);
1159   };
1160 
1161   // If a valid code point is decoded, this function call consumes its code
1162   // units.  If not, it ungets the lead code unit and invokes the right error
1163   // handler, so on failure we must immediately return false.
1164   SourceUnitsIterator iter(this->sourceUnits);
1165   Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
1166       lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1167       onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1168   if (maybeCodePoint.isNothing()) {
1169     return false;
1170   }
1171 
1172   *codePoint = maybeCodePoint.value();
1173   return true;
1174 }
1175 
1176 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t lead,int32_t * codePoint)1177 bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
1178     int32_t lead, int32_t* codePoint) {
1179   MOZ_ASSERT(lead != EOF);
1180   MOZ_ASSERT(!isAsciiCodePoint(lead),
1181              "ASCII code unit/point must be handled separately");
1182   MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1183              "getNonAsciiCodePoint called incorrectly");
1184 
1185   // The code point is usually |lead|: overwrite later if needed.
1186   *codePoint = lead;
1187 
1188   // ECMAScript specifically requires that unpaired UTF-16 surrogates be
1189   // treated as the corresponding code point and not as an error.  See
1190   // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
1191   // Thus this function does not consider any sequence of 16-bit numbers to
1192   // be intrinsically in error.
1193 
1194   // Dispense with single-unit code points and lone trailing surrogates.
1195   if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
1196     if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
1197                      lead == unicode::PARA_SEPARATOR)) {
1198       if (!updateLineInfoForEOL()) {
1199 #ifdef DEBUG
1200         *codePoint = EOF;  // sentinel value to hopefully cause errors
1201 #endif
1202         MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1203         return false;
1204       }
1205 
1206       *codePoint = '\n';
1207     } else {
1208       MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1209     }
1210 
1211     return true;
1212   }
1213 
1214   // Also handle a lead surrogate not paired with a trailing surrogate.
1215   if (MOZ_UNLIKELY(
1216           this->sourceUnits.atEnd() ||
1217           !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1218     MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1219     return true;
1220   }
1221 
1222   // Otherwise we have a multi-unit code point.
1223   *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1224   MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1225   return true;
1226 }
1227 
1228 template <typename Unit, class AnyCharsAccess>
getCodePoint(int32_t * cp)1229 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getCodePoint(int32_t* cp) {
1230   int32_t unit = getCodeUnit();
1231   if (unit == EOF) {
1232     MOZ_ASSERT(anyCharsAccess().flags.isEOF,
1233                "flags.isEOF should have been set by getCodeUnit()");
1234     *cp = EOF;
1235     return true;
1236   }
1237 
1238   if (isAsciiCodePoint(unit)) {
1239     return getFullAsciiCodePoint(unit, cp);
1240   }
1241 
1242   return getNonAsciiCodePoint(unit, cp);
1243 }
1244 
1245 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t unit,int32_t * codePoint)1246 bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
1247     int32_t unit, int32_t* codePoint) {
1248   MOZ_ASSERT(unit != EOF);
1249   MOZ_ASSERT(!isAsciiCodePoint(unit),
1250              "ASCII code unit/point must be handled separately");
1251 
1252   Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
1253   MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1254              "getNonAsciiCodePoint called incorrectly");
1255 
1256   auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1257 
1258   auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
1259                                         uint_fast8_t required) {
1260     this->notEnoughUnits(lead, remaining, required);
1261   };
1262 
1263   auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
1264     this->badTrailingUnit(unitsObserved);
1265   };
1266 
1267   auto onBadCodePoint = [this](char32_t badCodePoint,
1268                                uint_fast8_t unitsObserved) {
1269     this->badCodePoint(badCodePoint, unitsObserved);
1270   };
1271 
1272   auto onNotShortestForm = [this](char32_t badCodePoint,
1273                                   uint_fast8_t unitsObserved) {
1274     this->notShortestForm(badCodePoint, unitsObserved);
1275   };
1276 
1277   // This consumes the full, valid code point or ungets |lead| and calls the
1278   // appropriate error functor on failure.
1279   SourceUnitsIterator iter(this->sourceUnits);
1280   Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
1281       lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1282       onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1283   if (maybeCodePoint.isNothing()) {
1284     return false;
1285   }
1286 
1287   char32_t cp = maybeCodePoint.value();
1288   if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
1289                    cp == unicode::PARA_SEPARATOR)) {
1290     if (!updateLineInfoForEOL()) {
1291 #ifdef DEBUG
1292       *codePoint = EOF;  // sentinel value to hopefully cause errors
1293 #endif
1294       MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1295       return false;
1296     }
1297 
1298     *codePoint = '\n';
1299   } else {
1300     MOZ_ASSERT(!IsLineTerminator(cp));
1301     *codePoint = AssertedCast<int32_t>(cp);
1302   }
1303 
1304   return true;
1305 }
1306 
1307 template <>
findWindowStart(size_t offset) const1308 size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
1309   // This is JS's understanding of UTF-16 that allows lone surrogates, so
1310   // we have to exclude lone surrogates from [windowStart, offset) ourselves.
1311 
1312   const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1313 
1314   const char16_t* const initial = codeUnitPtrAt(offset);
1315   const char16_t* p = initial;
1316 
1317   auto HalfWindowSize = [&p, &initial]() {
1318     return PointerRangeSize(p, initial);
1319   };
1320 
1321   while (true) {
1322     MOZ_ASSERT(earliestPossibleStart <= p);
1323     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1324     if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1325       break;
1326     }
1327 
1328     char16_t c = p[-1];
1329 
1330     // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1331     // string and template literals.  These code points do affect line and
1332     // column coordinates, even as they encode their literal values.
1333     if (IsLineTerminator(c)) {
1334       break;
1335     }
1336 
1337     // Don't allow invalid UTF-16 in pre-context.  (Current users don't
1338     // require this, and this behavior isn't currently imposed on
1339     // pre-context, but these facts might change someday.)
1340 
1341     if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
1342       break;
1343     }
1344 
1345     // Optimistically include the code unit, reverting below if needed.
1346     p--;
1347 
1348     // If it's not a surrogate at all, keep going.
1349     if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
1350       continue;
1351     }
1352 
1353     // Stop if we don't have a usable surrogate pair.
1354     if (HalfWindowSize() >= WindowRadius ||
1355         p <= earliestPossibleStart ||      // trail surrogate at low end
1356         !unicode::IsLeadSurrogate(p[-1]))  // no paired lead surrogate
1357     {
1358       p++;
1359       break;
1360     }
1361 
1362     p--;
1363   }
1364 
1365   MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1366   return offset - HalfWindowSize();
1367 }
1368 
1369 template <>
findWindowStart(size_t offset) const1370 size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
1371   // |offset| must be the location of the error or somewhere before it, so we
1372   // know preceding data is valid UTF-8.
1373 
1374   const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1375 
1376   const Utf8Unit* const initial = codeUnitPtrAt(offset);
1377   const Utf8Unit* p = initial;
1378 
1379   auto HalfWindowSize = [&p, &initial]() {
1380     return PointerRangeSize(p, initial);
1381   };
1382 
1383   while (true) {
1384     MOZ_ASSERT(earliestPossibleStart <= p);
1385     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1386     if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1387       break;
1388     }
1389 
1390     // Peek backward for a line break, and only decrement if there is none.
1391     uint8_t prev = p[-1].toUint8();
1392 
1393     // First check for the ASCII LineTerminators.
1394     if (prev == '\r' || prev == '\n') {
1395       break;
1396     }
1397 
1398     // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
1399     // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9).  If there
1400     // aren't three code units available, some comparison here will fail
1401     // before we'd underflow.
1402     if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
1403                      p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
1404       break;
1405     }
1406 
1407     // Rewind over the non-LineTerminator.  This can't underflow
1408     // |earliestPossibleStart| because it begins a code point.
1409     while (IsTrailingUnit(*--p)) {
1410       continue;
1411     }
1412 
1413     MOZ_ASSERT(earliestPossibleStart <= p);
1414 
1415     // But if we underflowed |WindowRadius|, adjust forward and stop.
1416     if (HalfWindowSize() > WindowRadius) {
1417       static_assert(WindowRadius > 3,
1418                     "skipping over non-lead code units below must not "
1419                     "advance past |offset|");
1420 
1421       while (IsTrailingUnit(*++p)) {
1422         continue;
1423       }
1424 
1425       MOZ_ASSERT(HalfWindowSize() < WindowRadius);
1426       break;
1427     }
1428   }
1429 
1430   MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1431   return offset - HalfWindowSize();
1432 }
1433 
1434 template <>
findWindowEnd(size_t offset) const1435 size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
1436   const char16_t* const initial = codeUnitPtrAt(offset);
1437   const char16_t* p = initial;
1438 
1439   auto HalfWindowSize = [&initial, &p]() {
1440     return PointerRangeSize(initial, p);
1441   };
1442 
1443   while (true) {
1444     MOZ_ASSERT(p <= limit_);
1445     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1446     if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1447       break;
1448     }
1449 
1450     char16_t c = *p;
1451 
1452     // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1453     // string and template literals.  These code points do affect line and
1454     // column coordinates, even as they encode their literal values.
1455     if (IsLineTerminator(c)) {
1456       break;
1457     }
1458 
1459     // Don't allow invalid UTF-16 in post-context.  (Current users don't
1460     // require this, and this behavior isn't currently imposed on
1461     // pre-context, but these facts might change someday.)
1462 
1463     if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) {
1464       break;
1465     }
1466 
1467     // Optimistically consume the code unit, ungetting it below if needed.
1468     p++;
1469 
1470     // If it's not a surrogate at all, keep going.
1471     if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) {
1472       continue;
1473     }
1474 
1475     // Retract if the lead surrogate would stand alone at the end of the
1476     // window.
1477     if (HalfWindowSize() >= WindowRadius ||  // split pair
1478         p >= limit_ ||                       // half-pair at end of source
1479         !unicode::IsTrailSurrogate(*p))      // no paired trail surrogate
1480     {
1481       p--;
1482       break;
1483     }
1484 
1485     p++;
1486   }
1487 
1488   return offset + HalfWindowSize();
1489 }
1490 
1491 template <>
findWindowEnd(size_t offset) const1492 size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
1493   const Utf8Unit* const initial = codeUnitPtrAt(offset);
1494   const Utf8Unit* p = initial;
1495 
1496   auto HalfWindowSize = [&initial, &p]() {
1497     return PointerRangeSize(initial, p);
1498   };
1499 
1500   while (true) {
1501     MOZ_ASSERT(p <= limit_);
1502     MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1503     if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1504       break;
1505     }
1506 
1507     // A non-encoding error might be followed by an encoding error within
1508     // |maxEnd|, so we must validate as we go to not include invalid UTF-8
1509     // in the computed window.  What joy!
1510 
1511     Utf8Unit lead = *p;
1512     if (mozilla::IsAscii(lead)) {
1513       if (IsSingleUnitLineTerminator(lead)) {
1514         break;
1515       }
1516 
1517       p++;
1518       continue;
1519     }
1520 
1521     PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_);
1522     if (peeked.isNone()) {
1523       break;  // encoding error
1524     }
1525 
1526     char32_t c = peeked.codePoint();
1527     if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
1528                      c == unicode::PARA_SEPARATOR)) {
1529       break;
1530     }
1531 
1532     MOZ_ASSERT(!IsLineTerminator(c));
1533 
1534     uint8_t len = peeked.lengthInUnits();
1535     if (HalfWindowSize() + len > WindowRadius) {
1536       break;
1537     }
1538 
1539     p += len;
1540   }
1541 
1542   MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1543   return offset + HalfWindowSize();
1544 }
1545 
1546 template <typename Unit, class AnyCharsAccess>
advance(size_t position)1547 bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
1548   const Unit* end = this->sourceUnits.codeUnitPtrAt(position);
1549   while (this->sourceUnits.addressOfNextCodeUnit() < end) {
1550     int32_t c;
1551     if (!getCodePoint(&c)) {
1552       return false;
1553     }
1554   }
1555 
1556   TokenStreamAnyChars& anyChars = anyCharsAccess();
1557   Token* cur = const_cast<Token*>(&anyChars.currentToken());
1558   cur->pos.begin = this->sourceUnits.offset();
1559   cur->pos.end = cur->pos.begin;
1560   MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type));
1561   anyChars.lookahead = 0;
1562   return true;
1563 }
1564 
1565 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos)1566 void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
1567   TokenStreamAnyChars& anyChars = anyCharsAccess();
1568 
1569   this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
1570                                              /* allowPoisoned = */ true);
1571   anyChars.flags = pos.flags;
1572   anyChars.lineno = pos.lineno;
1573   anyChars.linebase = pos.linebase;
1574   anyChars.prevLinebase = pos.prevLinebase;
1575   anyChars.lookahead = pos.lookahead;
1576 
1577   anyChars.tokens[anyChars.cursor()] = pos.currentToken;
1578   for (unsigned i = 0; i < anyChars.lookahead; i++) {
1579     anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i];
1580   }
1581 }
1582 
1583 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos,const TokenStreamAnyChars & other)1584 bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
1585     const Position& pos, const TokenStreamAnyChars& other) {
1586   if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) {
1587     return false;
1588   }
1589 
1590   seekTo(pos);
1591   return true;
1592 }
1593 
computeErrorMetadataNoOffset(ErrorMetadata * err)1594 void TokenStreamAnyChars::computeErrorMetadataNoOffset(ErrorMetadata* err) {
1595   err->isMuted = mutedErrors;
1596   err->filename = filename_;
1597   err->lineNumber = 0;
1598   err->columnNumber = 0;
1599 
1600   MOZ_ASSERT(err->lineOfContext == nullptr);
1601 }
1602 
fillExceptingContext(ErrorMetadata * err,uint32_t offset)1603 bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
1604                                                uint32_t offset) {
1605   err->isMuted = mutedErrors;
1606 
1607   // If this TokenStreamAnyChars doesn't have location information, try to
1608   // get it from the caller.
1609   if (!filename_ && !cx->isHelperThreadContext()) {
1610     NonBuiltinFrameIter iter(cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
1611                              cx->realm()->principals());
1612     if (!iter.done() && iter.filename()) {
1613       err->filename = iter.filename();
1614       err->lineNumber = iter.computeLine(&err->columnNumber);
1615       return false;
1616     }
1617   }
1618 
1619   // Otherwise use this TokenStreamAnyChars's location information.
1620   err->filename = filename_;
1621   return true;
1622 }
1623 
1624 template <typename Unit, class AnyCharsAccess>
hasTokenizationStarted() const1625 bool TokenStreamSpecific<Unit, AnyCharsAccess>::hasTokenizationStarted() const {
1626   const TokenStreamAnyChars& anyChars = anyCharsAccess();
1627   return anyChars.isCurrentTokenType(TokenKind::Eof) && !anyChars.isEOF();
1628 }
1629 
// UTF-16 specialization.  This must never be called: it only asserts that
// it is unreachable and leaves both output parameters untouched.
template <>
inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
    const char16_t* encodedWindow, size_t encodedTokenOffset,
    size_t* utf16TokenOffset, size_t encodedWindowLength,
    size_t* utf16WindowLength) {
  MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
}
1637 
1638 template <>
computeWindowOffsetAndLength(const Utf8Unit * encodedWindow,size_t encodedTokenOffset,size_t * utf16TokenOffset,size_t encodedWindowLength,size_t * utf16WindowLength)1639 inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
1640     const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
1641     size_t* utf16TokenOffset, size_t encodedWindowLength,
1642     size_t* utf16WindowLength) {
1643   MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1644              "token offset must be within the window, and the two lambda "
1645              "calls below presume this ordering of values");
1646 
1647   const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength;
1648 
1649   size_t i = 0;
1650   auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) {
1651     while (encodedWindow < limit) {
1652       Utf8Unit lead = *encodedWindow++;
1653       if (MOZ_LIKELY(IsAscii(lead))) {
1654         // ASCII contributes a single UTF-16 code unit.
1655         i++;
1656         continue;
1657       }
1658 
1659       Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
1660       MOZ_ASSERT(cp.isSome(),
1661                  "computed window should only contain valid UTF-8");
1662 
1663       i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
1664     }
1665 
1666     return i;
1667   };
1668 
1669   // Compute the token offset from |i == 0| and the initial |encodedWindow|.
1670   const Utf8Unit* token = encodedWindow + encodedTokenOffset;
1671   MOZ_ASSERT(token <= encodedWindowEnd);
1672   *utf16TokenOffset = ComputeUtf16Count(token);
1673 
1674   // Compute the window length, picking up from |i| and |encodedWindow| that,
1675   // in general, were modified just above.
1676   *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
1677 }
1678 
1679 template <typename Unit>
addLineOfContext(ErrorMetadata * err,uint32_t offset)1680 bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
1681                                                   uint32_t offset) {
1682   // Rename the variable to make meaning clearer: an offset into source units
1683   // in Unit encoding.
1684   size_t encodedOffset = offset;
1685 
1686   // These are also offsets into source units in Unit encoding.
1687   size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
1688   size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);
1689 
1690   size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart;
1691   MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);
1692 
1693   // Don't add a useless "line" of context when the window ends up empty
1694   // because of an invalid encoding at the start of a line.
1695   if (encodedWindowLength == 0) {
1696     MOZ_ASSERT(err->lineOfContext == nullptr,
1697                "ErrorMetadata::lineOfContext must be null so we don't "
1698                "have to set the lineLength/tokenOffset fields");
1699     return true;
1700   }
1701 
1702   CharBuffer lineOfContext(cx);
1703 
1704   const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
1705   if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
1706           lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
1707     return false;
1708   }
1709 
1710   size_t utf16WindowLength = lineOfContext.length();
1711 
1712   // The windowed string is null-terminated.
1713   if (!lineOfContext.append('\0')) {
1714     return false;
1715   }
1716 
1717   err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
1718   if (!err->lineOfContext) {
1719     return false;
1720   }
1721 
1722   size_t encodedTokenOffset = encodedOffset - encodedWindowStart;
1723 
1724   MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1725              "token offset must be inside the window");
1726 
1727   // The length in UTF-8 code units of a code point is always greater than or
1728   // equal to the same code point's length in UTF-16 code points.  ASCII code
1729   // points are 1 unit in either encoding.  Code points in [U+0080, U+10000)
1730   // are 2-3 UTF-8 code units to 1 UTF-16 code unit.  And code points in
1731   // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units.
1732   //
1733   // Therefore, if encoded window length equals the length in UTF-16 (this is
1734   // always the case for Unit=char16_t), the UTF-16 offsets are exactly the
1735   // encoded offsets.  Otherwise we must convert offset/length from UTF-8 to
1736   // UTF-16.
1737   if constexpr (std::is_same_v<Unit, char16_t>) {
1738     MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
1739                "UTF-16 to UTF-16 shouldn't change window length");
1740     err->tokenOffset = encodedTokenOffset;
1741     err->lineLength = encodedWindowLength;
1742   } else {
1743     static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");
1744 
1745     bool simple = utf16WindowLength == encodedWindowLength;
1746 #ifdef DEBUG
1747     auto isAscii = [](Unit u) { return IsAscii(u); };
1748     MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
1749                            isAscii) == simple,
1750                "equal window lengths in UTF-8 should correspond only to "
1751                "wholly-ASCII text");
1752 #endif
1753     if (simple) {
1754       err->tokenOffset = encodedTokenOffset;
1755       err->lineLength = encodedWindowLength;
1756     } else {
1757       sourceUnits.computeWindowOffsetAndLength(
1758           encodedWindow, encodedTokenOffset, &err->tokenOffset,
1759           encodedWindowLength, &err->lineLength);
1760     }
1761   }
1762 
1763   return true;
1764 }
1765 
1766 template <typename Unit, class AnyCharsAccess>
computeErrorMetadata(ErrorMetadata * err,const ErrorOffset & errorOffset)1767 bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
1768     ErrorMetadata* err, const ErrorOffset& errorOffset) {
1769   if (errorOffset.is<NoOffset>()) {
1770     anyCharsAccess().computeErrorMetadataNoOffset(err);
1771     return true;
1772   }
1773 
1774   uint32_t offset;
1775   if (errorOffset.is<uint32_t>()) {
1776     offset = errorOffset.as<uint32_t>();
1777   } else {
1778     offset = this->sourceUnits.offset();
1779   }
1780 
1781   // This function's return value isn't a success/failure indication: it
1782   // returns true if this TokenStream can be used to provide a line of
1783   // context.
1784   if (fillExceptingContext(err, offset)) {
1785     // Add a line of context from this TokenStream to help with debugging.
1786     return internalComputeLineOfContext(err, offset);
1787   }
1788 
1789   // We can't fill in any more here.
1790   return true;
1791 }
1792 
1793 template <typename Unit, class AnyCharsAccess>
reportIllegalCharacter(int32_t cp)1794 void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
1795     int32_t cp) {
1796   UniqueChars display = JS_smprintf("U+%04X", cp);
1797   if (!display) {
1798     ReportOutOfMemory(anyCharsAccess().cx);
1799     return;
1800   }
1801   error(JSMSG_ILLEGAL_CHARACTER, display.get());
1802 }
1803 
1804 // We have encountered a '\': check for a Unicode escape sequence after it.
1805 // Return the length of the escape sequence and the encoded code point (by
1806 // value) if we found a Unicode escape sequence, and skip all code units
1807 // involed.  Otherwise, return 0 and don't advance along the buffer.
1808 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscape(uint32_t * codePoint)1809 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
1810     uint32_t* codePoint) {
1811   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1812 
1813   int32_t unit = getCodeUnit();
1814   if (unit != 'u') {
1815     // NOTE: |unit| may be EOF here.
1816     ungetCodeUnit(unit);
1817     MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1818     return 0;
1819   }
1820 
1821   char16_t v;
1822   unit = getCodeUnit();
1823   if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) {
1824     *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v;
1825     return 5;
1826   }
1827 
1828   if (unit == '{') {
1829     return matchExtendedUnicodeEscape(codePoint);
1830   }
1831 
1832   // NOTE: |unit| may be EOF here, so this ungets either one or two units.
1833   ungetCodeUnit(unit);
1834   ungetCodeUnit('u');
1835   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1836   return 0;
1837 }
1838 
1839 template <typename Unit, class AnyCharsAccess>
1840 uint32_t
matchExtendedUnicodeEscape(uint32_t * codePoint)1841 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
1842     uint32_t* codePoint) {
1843   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
1844 
1845   int32_t unit = getCodeUnit();
1846 
1847   // Skip leading zeroes.
1848   uint32_t leadingZeroes = 0;
1849   while (unit == '0') {
1850     leadingZeroes++;
1851     unit = getCodeUnit();
1852   }
1853 
1854   size_t i = 0;
1855   uint32_t code = 0;
1856   while (IsAsciiHexDigit(unit) && i < 6) {
1857     code = (code << 4) | AsciiAlphanumericToNumber(unit);
1858     unit = getCodeUnit();
1859     i++;
1860   }
1861 
1862   uint32_t gotten =
1863       2 +                  // 'u{'
1864       leadingZeroes + i +  // significant hexdigits
1865       (unit != EOF);       // subtract a get if it didn't contribute to length
1866 
1867   if (unit == '}' && (leadingZeroes > 0 || i > 0) &&
1868       code <= unicode::NonBMPMax) {
1869     *codePoint = code;
1870     return gotten;
1871   }
1872 
1873   this->sourceUnits.unskipCodeUnits(gotten);
1874   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1875   return 0;
1876 }
1877 
1878 template <typename Unit, class AnyCharsAccess>
1879 uint32_t
matchUnicodeEscapeIdStart(uint32_t * codePoint)1880 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
1881     uint32_t* codePoint) {
1882   uint32_t length = matchUnicodeEscape(codePoint);
1883   if (MOZ_LIKELY(length > 0)) {
1884     if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) {
1885       return length;
1886     }
1887 
1888     this->sourceUnits.unskipCodeUnits(length);
1889   }
1890 
1891   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1892   return 0;
1893 }
1894 
1895 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscapeIdent(uint32_t * codePoint)1896 bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
1897     uint32_t* codePoint) {
1898   uint32_t length = matchUnicodeEscape(codePoint);
1899   if (MOZ_LIKELY(length > 0)) {
1900     if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) {
1901       return true;
1902     }
1903 
1904     this->sourceUnits.unskipCodeUnits(length);
1905   }
1906 
1907   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1908   return false;
1909 }
1910 
1911 template <typename Unit, class AnyCharsAccess>
1912 [[nodiscard]] bool
matchIdentifierStart(IdentifierEscapes * sawEscape)1913 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart(
1914     IdentifierEscapes* sawEscape) {
1915   int32_t unit = getCodeUnit();
1916   if (unicode::IsIdentifierStart(char16_t(unit))) {
1917     ungetCodeUnit(unit);
1918     *sawEscape = IdentifierEscapes::None;
1919     return true;
1920   }
1921 
1922   if (unit == '\\') {
1923     *sawEscape = IdentifierEscapes::SawUnicodeEscape;
1924 
1925     uint32_t codePoint;
1926     uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint);
1927     if (escapeLength != 0) {
1928       return true;
1929     }
1930 
1931     // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
1932     // could point at the 'H'.  But we don't do that now, so the code
1933     // unit after the '\' isn't necessarily bad, so just point at the
1934     // start of the actually-invalid escape.
1935     ungetCodeUnit('\\');
1936     error(JSMSG_BAD_ESCAPE);
1937     return false;
1938   }
1939 
1940   *sawEscape = IdentifierEscapes::None;
1941 
1942   // NOTE: |unit| may be EOF here.
1943   ungetCodeUnit(unit);
1944   error(JSMSG_MISSING_PRIVATE_NAME);
1945   return false;
1946 }
1947 
1948 template <typename Unit, class AnyCharsAccess>
getDirectives(bool isMultiline,bool shouldWarnDeprecated)1949 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives(
1950     bool isMultiline, bool shouldWarnDeprecated) {
1951   // Match directive comments used in debugging, such as "//# sourceURL" and
1952   // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated.
1953   //
1954   // To avoid a crashing bug in IE, several JavaScript transpilers wrap single
1955   // line comments containing a source mapping URL inside a multiline
1956   // comment. To avoid potentially expensive lookahead and backtracking, we
1957   // only check for this case if we encounter a '#' code unit.
1958 
1959   bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
1960              getSourceMappingURL(isMultiline, shouldWarnDeprecated);
1961   if (!res) {
1962     badToken();
1963   }
1964 
1965   return res;
1966 }
1967 
copyCharBufferTo(JSContext * cx,UniquePtr<char16_t[],JS::FreePolicy> * destination)1968 [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo(
1969     JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1970   size_t length = charBuffer.length();
1971 
1972   *destination = cx->make_pod_array<char16_t>(length + 1);
1973   if (!*destination) {
1974     return false;
1975   }
1976 
1977   std::copy(charBuffer.begin(), charBuffer.end(), destination->get());
1978   (*destination)[length] = '\0';
1979   return true;
1980 }
1981 
1982 template <typename Unit, class AnyCharsAccess>
getDirective(bool isMultiline,bool shouldWarnDeprecated,const char * directive,uint8_t directiveLength,const char * errorMsgPragma,UniquePtr<char16_t[],JS::FreePolicy> * destination)1983 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
1984     bool isMultiline, bool shouldWarnDeprecated, const char* directive,
1985     uint8_t directiveLength, const char* errorMsgPragma,
1986     UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1987   // Stop if we don't find |directive|.  (Note that |directive| must be
1988   // ASCII, so there are no tricky encoding issues to consider in matching
1989   // UTF-8/16-agnostically.)
1990   if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
1991     return true;
1992   }
1993 
1994   if (shouldWarnDeprecated) {
1995     if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
1996       return false;
1997     }
1998   }
1999 
2000   this->charBuffer.clear();
2001 
2002   do {
2003     int32_t unit = peekCodeUnit();
2004     if (unit == EOF) {
2005       break;
2006     }
2007 
2008     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2009       if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
2010         break;
2011       }
2012 
2013       consumeKnownCodeUnit(unit);
2014 
2015       // Debugging directives can occur in both single- and multi-line
2016       // comments. If we're currently inside a multi-line comment, we
2017       // also must recognize multi-line comment terminators.
2018       if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
2019         ungetCodeUnit('*');
2020         break;
2021       }
2022 
2023       if (!this->charBuffer.append(unit)) {
2024         return false;
2025       }
2026 
2027       continue;
2028     }
2029 
2030     // This ignores encoding errors: subsequent caller-side code to
2031     // handle the remaining source text in the comment will do so.
2032     PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2033     if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
2034       break;
2035     }
2036 
2037     MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
2038                "!IsSpace must imply !IsLineTerminator or else we'll fail to "
2039                "maintain line-info/flags for EOL");
2040     this->sourceUnits.consumeKnownCodePoint(peeked);
2041 
2042     if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
2043       return false;
2044     }
2045   } while (true);
2046 
2047   if (this->charBuffer.empty()) {
2048     // The directive's URL was missing, but comments can contain anything,
2049     // so it isn't an error.
2050     return true;
2051   }
2052 
2053   return copyCharBufferTo(anyCharsAccess().cx, destination);
2054 }
2055 
2056 template <typename Unit, class AnyCharsAccess>
getDisplayURL(bool isMultiline,bool shouldWarnDeprecated)2057 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL(
2058     bool isMultiline, bool shouldWarnDeprecated) {
2059   // Match comments of the form "//# sourceURL=<url>" or
2060   // "/\* //# sourceURL=<url> *\/"
2061   //
2062   // Note that while these are labeled "sourceURL" in the source text,
2063   // internally we refer to it as a "displayURL" to distinguish what the
2064   // developer would like to refer to the source as from the source's actual
2065   // URL.
2066 
2067   static constexpr char sourceURLDirective[] = " sourceURL=";
2068   constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective);
2069   return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective,
2070                       sourceURLDirectiveLength, "sourceURL",
2071                       &anyCharsAccess().displayURL_);
2072 }
2073 
2074 template <typename Unit, class AnyCharsAccess>
getSourceMappingURL(bool isMultiline,bool shouldWarnDeprecated)2075 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL(
2076     bool isMultiline, bool shouldWarnDeprecated) {
2077   // Match comments of the form "//# sourceMappingURL=<url>" or
2078   // "/\* //# sourceMappingURL=<url> *\/"
2079 
2080   static constexpr char sourceMappingURLDirective[] = " sourceMappingURL=";
2081   constexpr uint8_t sourceMappingURLDirectiveLength =
2082       js_strlen(sourceMappingURLDirective);
2083   return getDirective(isMultiline, shouldWarnDeprecated,
2084                       sourceMappingURLDirective,
2085                       sourceMappingURLDirectiveLength, "sourceMappingURL",
2086                       &anyCharsAccess().sourceMapURL_);
2087 }
2088 
2089 template <typename Unit, class AnyCharsAccess>
2090 MOZ_ALWAYS_INLINE Token*
newTokenInternal(TokenKind kind,TokenStart start,TokenKind * out)2091 GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal(
2092     TokenKind kind, TokenStart start, TokenKind* out) {
2093   MOZ_ASSERT(kind < TokenKind::Limit);
2094   MOZ_ASSERT(kind != TokenKind::Eol,
2095              "TokenKind::Eol should never be used in an actual Token, only "
2096              "returned by peekTokenSameLine()");
2097 
2098   TokenStreamAnyChars& anyChars = anyCharsAccess();
2099   anyChars.flags.isDirtyLine = true;
2100 
2101   Token* token = anyChars.allocateToken();
2102 
2103   *out = token->type = kind;
2104   token->pos = TokenPos(start.offset(), this->sourceUnits.offset());
2105   MOZ_ASSERT(token->pos.begin <= token->pos.end);
2106 
2107   // NOTE: |token->modifier| is set in |newToken()| so that optimized,
2108   // non-debug code won't do any work to pass a modifier-argument that will
2109   // never be used.
2110 
2111   return token;
2112 }
2113 
2114 template <typename Unit, class AnyCharsAccess>
badToken()2115 MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
2116   // We didn't get a token, so don't set |flags.isDirtyLine|.
2117   anyCharsAccess().flags.hadError = true;
2118 
2119   // Poisoning sourceUnits on error establishes an invariant: once an
2120   // erroneous token has been seen, sourceUnits will not be consulted again.
2121   // This is true because the parser will deal with the illegal token by
2122   // aborting parsing immediately.
2123   this->sourceUnits.poisonInDebug();
2124 
2125   return false;
2126 };
2127 
AppendCodePointToCharBuffer(CharBuffer & charBuffer,uint32_t codePoint)2128 bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) {
2129   MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
2130              "should only be processing code points validly decoded from UTF-8 "
2131              "or WTF-16 source text (surrogate code points permitted)");
2132 
2133   char16_t units[2];
2134   unsigned numUnits = 0;
2135   unicode::UTF16Encode(codePoint, units, &numUnits);
2136 
2137   MOZ_ASSERT(numUnits == 1 || numUnits == 2,
2138              "UTF-16 code points are only encoded in one or two units");
2139 
2140   if (!charBuffer.append(units[0])) {
2141     return false;
2142   }
2143 
2144   if (numUnits == 1) {
2145     return true;
2146   }
2147 
2148   return charBuffer.append(units[1]);
2149 }
2150 
2151 template <typename Unit, class AnyCharsAccess>
putIdentInCharBuffer(const Unit * identStart)2152 bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
2153     const Unit* identStart) {
2154   const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit();
2155   this->sourceUnits.setAddressOfNextCodeUnit(identStart);
2156 
2157   auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() {
2158     this->sourceUnits.setAddressOfNextCodeUnit(originalAddress);
2159   });
2160 
2161   this->charBuffer.clear();
2162   do {
2163     int32_t unit = getCodeUnit();
2164     if (unit == EOF) {
2165       break;
2166     }
2167 
2168     uint32_t codePoint;
2169     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2170       if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') {
2171         if (!this->charBuffer.append(unit)) {
2172           return false;
2173         }
2174 
2175         continue;
2176       }
2177 
2178       if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2179         break;
2180       }
2181     } else {
2182       // |restoreNextRawCharAddress| undoes all gets, and this function
2183       // doesn't update line/column info.
2184       char32_t cp;
2185       if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
2186         return false;
2187       }
2188 
2189       codePoint = cp;
2190       if (!unicode::IsIdentifierPart(codePoint)) {
2191         break;
2192       }
2193     }
2194 
2195     if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
2196       return false;
2197     }
2198   } while (true);
2199 
2200   return true;
2201 }
2202 
2203 template <typename Unit, class AnyCharsAccess>
identifierName(TokenStart start,const Unit * identStart,IdentifierEscapes escaping,Modifier modifier,NameVisibility visibility,TokenKind * out)2204 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName(
2205     TokenStart start, const Unit* identStart, IdentifierEscapes escaping,
2206     Modifier modifier, NameVisibility visibility, TokenKind* out) {
2207   // Run the bad-token code for every path out of this function except the
2208   // two success-cases.
2209   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
2210 
2211   // We've already consumed an initial code point in the identifer, to *know*
2212   // that this is an identifier.  So no need to worry about not consuming any
2213   // code points in the loop below.
2214   int32_t unit;
2215   while (true) {
2216     unit = peekCodeUnit();
2217     if (unit == EOF) {
2218       break;
2219     }
2220 
2221     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2222       consumeKnownCodeUnit(unit);
2223 
2224       if (MOZ_UNLIKELY(
2225               !unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) {
2226         // Handle a Unicode escape -- otherwise it's not part of the
2227         // identifier.
2228         uint32_t codePoint;
2229         if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2230           ungetCodeUnit(unit);
2231           break;
2232         }
2233 
2234         escaping = IdentifierEscapes::SawUnicodeEscape;
2235       }
2236     } else {
2237       // This ignores encoding errors: subsequent caller-side code to
2238       // handle source text after the IdentifierName will do so.
2239       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2240       if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) {
2241         break;
2242       }
2243 
2244       MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
2245                  "IdentifierPart must guarantee !IsLineTerminator or "
2246                  "else we'll fail to maintain line-info/flags for EOL");
2247 
2248       this->sourceUnits.consumeKnownCodePoint(peeked);
2249     }
2250   }
2251 
2252   TaggedParserAtomIndex atom;
2253   if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) {
2254     // Identifiers containing Unicode escapes have to be converted into
2255     // tokenbuf before atomizing.
2256     if (!putIdentInCharBuffer(identStart)) {
2257       return false;
2258     }
2259 
2260     atom = drainCharBufferIntoAtom();
2261   } else {
2262     // Escape-free identifiers can be created directly from sourceUnits.
2263     const Unit* chars = identStart;
2264     size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart;
2265 
2266     // Private identifiers start with a '#', and so cannot be reserved words.
2267     if (visibility == NameVisibility::Public) {
2268       // Represent reserved words lacking escapes as reserved word tokens.
2269       if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) {
2270         noteBadToken.release();
2271         newSimpleToken(rw->tokentype, start, modifier, out);
2272         return true;
2273       }
2274     }
2275 
2276     atom = atomizeSourceChars(Span(chars, length));
2277   }
2278   if (!atom) {
2279     return false;
2280   }
2281 
2282   noteBadToken.release();
2283   if (visibility == NameVisibility::Private) {
2284     newPrivateNameToken(atom, start, modifier, out);
2285     return true;
2286   }
2287   newNameToken(atom, start, modifier, out);
2288   return true;
2289 }
2290 
// Classification of a character by how the token it begins should be scanned.
enum FirstCharKind {
  // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid
  // token that cannot also be a prefix of a longer token.  E.g. ';' has the
  // OneChar kind, but '+' does not, because '++' and '+=' are valid longer
  // tokens that begin with '+'.
  //
  // The few token kinds satisfying these properties cover roughly 35--45%
  // of the tokens seen in practice.
  //
  // We represent the 'OneChar' kind with any value less than
  // TokenKind::Limit (the range OneChar_Min through OneChar_Max).  This
  // representation lets us associate each one-char token char16_t with a
  // TokenKind and thus avoid a subsequent char16_t-to-TokenKind conversion.
  OneChar_Min = 0,
  OneChar_Max = size_t(TokenKind::Limit) - 1,

  // The remaining kinds are numbered from TokenKind::Limit upward, so they
  // never collide with a OneChar value.
  Space = size_t(TokenKind::Limit),  // '\t', '\v', '\f', ' '
  Ident,                             // '$', '_', 'A'..'Z', 'a'..'z'
  Dec,                               // '1'..'9'
  String,                            // '"', '\'', '`'
  EOL,                               // '\n', '\r'
  ZeroDigit,                         // '0'
  Other,                             // anything not covered above

  LastCharKind = Other
};
2318 
// Table mapping each ASCII code unit (0 through 127) to its FirstCharKind.
//
// OneChar: 40,  41,  44,  58,  59,  91,  93,  123, 125, 126:
//          '(', ')', ',', ':', ';', '[', ']', '{', '}', '~'
// Ident:   36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
// String:  34, 39, 96: '"', '\'', '`'
// Dec:     49..57: '1'..'9'
// ZeroDigit:  48: '0'
// Space:   9, 11, 12, 32: '\t', '\v', '\f', ' '
// EOL:     10, 13: '\n', '\r'
// Other:   every remaining entry, including 43 '+', 46 '.', and 61 '='
//          (FirstCharKind has no dedicated kinds for them).
//
// The T_* macros are shorthand for the TokenKind values stored for OneChar
// entries, and |_______| stands for Other so the table columns stay aligned.
// All of these macros are undefined again right after the table.
#define T_COMMA size_t(TokenKind::Comma)
#define T_COLON size_t(TokenKind::Colon)
#define T_BITNOT size_t(TokenKind::BitNot)
#define T_LP size_t(TokenKind::LeftParen)
#define T_RP size_t(TokenKind::RightParen)
#define T_SEMI size_t(TokenKind::Semi)
#define T_LB size_t(TokenKind::LeftBracket)
#define T_RB size_t(TokenKind::RightBracket)
#define T_LC size_t(TokenKind::LeftCurly)
#define T_RC size_t(TokenKind::RightCurly)
#define _______ Other
static const uint8_t firstCharKinds[] = {
    // clang-format off
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______,   Space,
/*  10+ */     EOL,   Space,   Space,     EOL, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______,   Space, _______,  String, _______,   Ident, _______, _______,  String,
/*  40+ */    T_LP,    T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit,    Dec,
/*  50+ */     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec, T_COLON,  T_SEMI,
/*  60+ */ _______, _______, _______, _______, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  70+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  80+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  90+ */   Ident,    T_LB, _______,    T_RB, _______,   Ident,  String,   Ident,   Ident,   Ident,
/* 100+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 110+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 120+ */   Ident,   Ident,   Ident,    T_LC, _______,    T_RC,T_BITNOT, _______
    // clang-format on
};
#undef T_COMMA
#undef T_COLON
#undef T_BITNOT
#undef T_LP
#undef T_RP
#undef T_SEMI
#undef T_LB
#undef T_RB
#undef T_LC
#undef T_RC
#undef _______

// Every FirstCharKind value must be representable in one table element.
static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
              "Elements of firstCharKinds[] are too small");
2374 
2375 template <>
consumeRestOfSingleLineComment()2376 void SourceUnits<char16_t>::consumeRestOfSingleLineComment() {
2377   while (MOZ_LIKELY(!atEnd())) {
2378     char16_t unit = peekCodeUnit();
2379     if (IsLineTerminator(unit)) {
2380       return;
2381     }
2382 
2383     consumeKnownCodeUnit(unit);
2384   }
2385 }
2386 
2387 template <>
consumeRestOfSingleLineComment()2388 void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() {
2389   while (MOZ_LIKELY(!atEnd())) {
2390     const Utf8Unit unit = peekCodeUnit();
2391     if (IsSingleUnitLineTerminator(unit)) {
2392       return;
2393     }
2394 
2395     if (MOZ_LIKELY(IsAscii(unit))) {
2396       consumeKnownCodeUnit(unit);
2397       continue;
2398     }
2399 
2400     PeekedCodePoint<Utf8Unit> peeked = peekCodePoint();
2401     if (peeked.isNone()) {
2402       return;
2403     }
2404 
2405     char32_t c = peeked.codePoint();
2406     if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
2407                      c == unicode::PARA_SEPARATOR)) {
2408       return;
2409     }
2410 
2411     consumeKnownCodePoint(peeked);
2412   }
2413 }
2414 
2415 template <typename Unit, class AnyCharsAccess>
2416 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchInteger(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2417 TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger(
2418     IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2419   int32_t unit = getCodeUnit();
2420   if (!isIntegerUnit(unit)) {
2421     *nextUnit = unit;
2422     return true;
2423   }
2424   return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit);
2425 }
2426 
2427 template <typename Unit, class AnyCharsAccess>
2428 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchIntegerAfterFirstDigit(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2429 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit(
2430     IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2431   int32_t unit;
2432   while (true) {
2433     unit = getCodeUnit();
2434     if (isIntegerUnit(unit)) {
2435       continue;
2436     }
2437     if (unit != '_') {
2438       break;
2439     }
2440     unit = getCodeUnit();
2441     if (!isIntegerUnit(unit)) {
2442       if (unit == '_') {
2443         error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES);
2444       } else {
2445         error(JSMSG_NUMBER_END_WITH_UNDERSCORE);
2446       }
2447       return false;
2448     }
2449   }
2450 
2451   *nextUnit = unit;
2452   return true;
2453 }
2454 
2455 template <typename Unit, class AnyCharsAccess>
decimalNumber(int32_t unit,TokenStart start,const Unit * numStart,Modifier modifier,TokenKind * out)2456 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber(
2457     int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier,
2458     TokenKind* out) {
2459   // Run the bad-token code for every path out of this function except the
2460   // one success-case.
2461   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
2462 
2463   // Consume integral component digits.
2464   if (IsAsciiDigit(unit)) {
2465     if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
2466       return false;
2467     }
2468   }
2469 
2470   // Numbers contain no escapes, so we can read directly from |sourceUnits|.
2471   double dval;
2472   bool isBigInt = false;
2473   DecimalPoint decimalPoint = NoDecimal;
2474   if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') {
2475     // NOTE: |unit| may be EOF here.
2476     ungetCodeUnit(unit);
2477 
2478     // Most numbers are pure decimal integers without fractional component
2479     // or exponential notation.  Handle that with optimized code.
2480     if (!GetDecimalInteger(anyCharsAccess().cx, numStart,
2481                            this->sourceUnits.addressOfNextCodeUnit(), &dval)) {
2482       return false;
2483     }
2484   } else if (unit == 'n') {
2485     isBigInt = true;
2486     unit = peekCodeUnit();
2487   } else {
2488     // Consume any decimal dot and fractional component.
2489     if (unit == '.') {
2490       decimalPoint = HasDecimal;
2491       if (!matchInteger(IsAsciiDigit, &unit)) {
2492         return false;
2493       }
2494     }
2495 
2496     // Consume any exponential notation.
2497     if (unit == 'e' || unit == 'E') {
2498       unit = getCodeUnit();
2499       if (unit == '+' || unit == '-') {
2500         unit = getCodeUnit();
2501       }
2502 
2503       // Exponential notation must contain at least one digit.
2504       if (!IsAsciiDigit(unit)) {
2505         ungetCodeUnit(unit);
2506         error(JSMSG_MISSING_EXPONENT);
2507         return false;
2508       }
2509 
2510       // Consume exponential digits.
2511       if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
2512         return false;
2513       }
2514     }
2515 
2516     ungetCodeUnit(unit);
2517 
2518     // "0." and "0e..." numbers parse "." or "e..." here.  Neither range
2519     // contains a number, so we can't use |FullStringToDouble|.  (Parse
2520     // failures return 0.0, so we'll still get the right result.)
2521     if (!GetDecimalNonInteger(anyCharsAccess().cx, numStart,
2522                               this->sourceUnits.addressOfNextCodeUnit(),
2523                               &dval)) {
2524       return false;
2525     }
2526   }
2527 
2528   // Number followed by IdentifierStart is an error.  (This is the only place
2529   // in ECMAScript where token boundary is inadequate to properly separate
2530   // two tokens, necessitating this unaesthetic lookahead.)
2531   if (unit != EOF) {
2532     if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2533       if (unicode::IsIdentifierStart(char16_t(unit))) {
2534         error(JSMSG_IDSTART_AFTER_NUMBER);
2535         return false;
2536       }
2537     } else {
2538       // This ignores encoding errors: subsequent caller-side code to
2539       // handle source text after the number will do so.
2540       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2541       if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
2542         error(JSMSG_IDSTART_AFTER_NUMBER);
2543         return false;
2544       }
2545     }
2546   }
2547 
2548   noteBadToken.release();
2549 
2550   if (isBigInt) {
2551     return bigIntLiteral(start, modifier, out);
2552   }
2553 
2554   newNumberToken(dval, decimalPoint, start, modifier, out);
2555   return true;
2556 }
2557 
2558 template <typename Unit, class AnyCharsAccess>
regexpLiteral(TokenStart start,TokenKind * out)2559 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
2560     TokenStart start, TokenKind* out) {
2561   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/'));
2562   this->charBuffer.clear();
2563 
2564   auto ProcessNonAsciiCodePoint = [this](int32_t lead) {
2565     MOZ_ASSERT(lead != EOF);
2566     MOZ_ASSERT(!this->isAsciiCodePoint(lead));
2567 
2568     char32_t codePoint;
2569     if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead),
2570                                                  &codePoint)) {
2571       return false;
2572     }
2573 
2574     if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
2575                      codePoint == unicode::PARA_SEPARATOR)) {
2576       this->sourceUnits.ungetLineOrParagraphSeparator();
2577       this->error(JSMSG_UNTERMINATED_REGEXP);
2578       return false;
2579     }
2580 
2581     return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
2582   };
2583 
2584   auto ReportUnterminatedRegExp = [this](int32_t unit) {
2585     this->ungetCodeUnit(unit);
2586     this->error(JSMSG_UNTERMINATED_REGEXP);
2587   };
2588 
2589   bool inCharClass = false;
2590   do {
2591     int32_t unit = getCodeUnit();
2592     if (unit == EOF) {
2593       ReportUnterminatedRegExp(unit);
2594       return badToken();
2595     }
2596 
2597     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2598       if (!ProcessNonAsciiCodePoint(unit)) {
2599         return badToken();
2600       }
2601 
2602       continue;
2603     }
2604 
2605     if (unit == '\\') {
2606       if (!this->charBuffer.append(unit)) {
2607         return badToken();
2608       }
2609 
2610       unit = getCodeUnit();
2611       if (unit == EOF) {
2612         ReportUnterminatedRegExp(unit);
2613         return badToken();
2614       }
2615 
2616       // Fallthrough only handles ASCII code points, so
2617       // deal with non-ASCII and skip everything else.
2618       if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2619         if (!ProcessNonAsciiCodePoint(unit)) {
2620           return badToken();
2621         }
2622 
2623         continue;
2624       }
2625     } else if (unit == '[') {
2626       inCharClass = true;
2627     } else if (unit == ']') {
2628       inCharClass = false;
2629     } else if (unit == '/' && !inCharClass) {
2630       // For IE compat, allow unescaped / in char classes.
2631       break;
2632     }
2633 
2634     // NOTE: Non-ASCII LineTerminators were handled by
2635     //       ProcessNonAsciiCodePoint calls above.
2636     if (unit == '\r' || unit == '\n') {
2637       ReportUnterminatedRegExp(unit);
2638       return badToken();
2639     }
2640 
2641     MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit)));
2642     if (!this->charBuffer.append(unit)) {
2643       return badToken();
2644     }
2645   } while (true);
2646 
2647   int32_t unit;
2648   RegExpFlags reflags = RegExpFlag::NoFlags;
2649   while (true) {
2650     uint8_t flag;
2651     unit = getCodeUnit();
2652     if (unit == 'd') {
2653       flag = RegExpFlag::HasIndices;
2654     } else if (unit == 'g') {
2655       flag = RegExpFlag::Global;
2656     } else if (unit == 'i') {
2657       flag = RegExpFlag::IgnoreCase;
2658     } else if (unit == 'm') {
2659       flag = RegExpFlag::Multiline;
2660     } else if (unit == 's') {
2661       flag = RegExpFlag::DotAll;
2662     } else if (unit == 'u') {
2663       flag = RegExpFlag::Unicode;
2664     } else if (unit == 'y') {
2665       flag = RegExpFlag::Sticky;
2666     } else if (IsAsciiAlpha(unit)) {
2667       flag = RegExpFlag::NoFlags;
2668     } else {
2669       break;
2670     }
2671 
2672     if ((reflags & flag) || flag == RegExpFlag::NoFlags) {
2673       ungetCodeUnit(unit);
2674       char buf[2] = {char(unit), '\0'};
2675       error(JSMSG_BAD_REGEXP_FLAG, buf);
2676       return badToken();
2677     }
2678 
2679     reflags |= flag;
2680   }
2681   ungetCodeUnit(unit);
2682 
2683   newRegExpToken(reflags, start, out);
2684   return true;
2685 }
2686 
2687 template <typename Unit, class AnyCharsAccess>
bigIntLiteral(TokenStart start,Modifier modifier,TokenKind * out)2688 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
2689     TokenStart start, Modifier modifier, TokenKind* out) {
2690   MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n'));
2691   MOZ_ASSERT(this->sourceUnits.offset() > start.offset());
2692   uint32_t length = this->sourceUnits.offset() - start.offset();
2693   MOZ_ASSERT(length >= 2);
2694   this->charBuffer.clear();
2695   mozilla::Range<const Unit> chars(
2696       this->sourceUnits.codeUnitPtrAt(start.offset()), length);
2697   for (uint32_t idx = 0; idx < length - 1; idx++) {
2698     int32_t unit = CodeUnitValue(chars[idx]);
2699     // Char buffer may start with a 0[bBoOxX] prefix, then follows with
2700     // binary, octal, decimal, or hex digits.  Already checked by caller, as
2701     // the "n" indicating bigint comes at the end.
2702     MOZ_ASSERT(isAsciiCodePoint(unit));
2703     // Skip over any separators.
2704     if (unit == '_') {
2705       continue;
2706     }
2707     if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
2708       return false;
2709     }
2710   }
2711   newBigIntToken(start, modifier, out);
2712   return true;
2713 }
2714 
2715 template <typename Unit, class AnyCharsAccess>
2716 void GeneralTokenStreamChars<Unit,
consumeOptionalHashbangComment()2717                              AnyCharsAccess>::consumeOptionalHashbangComment() {
2718   MOZ_ASSERT(this->sourceUnits.atStart(),
2719              "HashBangComment can only appear immediately at the start of a "
2720              "Script or Module");
2721 
2722   // HashbangComment ::
2723   //   #!  SingleLineCommentChars_opt
2724 
2725   if (!matchCodeUnit('#')) {
2726     // HashbangComment is optional at start of Script or Module.
2727     return;
2728   }
2729 
2730   if (!matchCodeUnit('!')) {
2731     // # not followed by ! at start of Script or Module is an error, but normal
2732     // parsing code will handle that error just fine if we let it.
2733     ungetCodeUnit('#');
2734     return;
2735   }
2736 
2737   // This doesn't consume a concluding LineTerminator, and it stops consuming
2738   // just before any encoding error.  The subsequent |getToken| call will call
2739   // |getTokenInternal| below which will handle these possibilities.
2740   this->sourceUnits.consumeRestOfSingleLineComment();
2741 }
2742 
// Scan the next token from the source text and store its kind in |*ttp|.
//
// Whitespace, line terminators, and comments yield no token: each is consumed
// and the enclosing loop restarts.  Non-ASCII code points are handled up
// front (whitespace or identifier start, otherwise an error).  An ASCII first
// code unit is classified through |firstCharKinds| and dispatched on that
// class: single-char punctuator, identifier, decimal number, string or
// template, zero-prefixed number, or everything else via the |switch|.
//
// |modifier| is forwarded to the token-creation helpers; in addition, a '/'
// that does not start a comment is scanned as a regular expression literal
// only when |modifier == SlashIsRegExp|.
//
// Returns true once a token (including |TokenKind::Eof|) has been created.
// Every failure path visible here returns through |badToken()| or hands back
// a helper's own failure result.
2743 template <typename Unit, class AnyCharsAccess>
getTokenInternal(TokenKind * const ttp,const Modifier modifier)2744 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal(
2745     TokenKind* const ttp, const Modifier modifier) {
2746   // Assume we'll fail: success cases will overwrite this.
2747 #ifdef DEBUG
2748   *ttp = TokenKind::Limit;
2749 #endif
2750   MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
2751 
2752   // This loop runs more than once only when whitespace or comments are
2753   // encountered.
2754   do {
2755     int32_t unit = peekCodeUnit();
2756     if (MOZ_UNLIKELY(unit == EOF)) {
2757       MOZ_ASSERT(this->sourceUnits.atEnd());
2758       anyCharsAccess().flags.isEOF = true;
2759       TokenStart start(this->sourceUnits, 0);
2760       newSimpleToken(TokenKind::Eof, start, modifier, ttp);
2761       return true;
2762     }
2763 
2764     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2765       // Non-ASCII code points can only be identifiers or whitespace.  It would
2766       // be nice to compute these *after* discarding whitespace, but IN A WORLD
2767       // where |unicode::IsSpace| requires consuming a variable number of code
2768       // units, it's easier to assume it's an identifier and maybe do a little
2769       // wasted work, than to unget and compute and reget if whitespace.
2770       TokenStart start(this->sourceUnits, 0);
2771       const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit();
2772 
2773       PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2774       if (peeked.isNone()) {
      // Peeking failed, so actually reading the code point is asserted to
      // fail too (presumably reporting the encoding error -- confirm in
      // |getCodePoint|).
2775         int32_t bad;
2776         MOZ_ALWAYS_FALSE(getCodePoint(&bad));
2777         return badToken();
2778       }
2779 
2780       char32_t cp = peeked.codePoint();
2781       if (unicode::IsSpace(cp)) {
2782         this->sourceUnits.consumeKnownCodePoint(peeked);
2783         if (IsLineTerminator(cp)) {
2784           if (!updateLineInfoForEOL()) {
2785             return badToken();
2786           }
2787 
2788           anyCharsAccess().updateFlagsForEOL();
2789         }
2790 
2791         continue;
2792       }
2793 
2794       static_assert(isAsciiCodePoint('$'),
2795                     "IdentifierStart contains '$', but as "
2796                     "!IsUnicodeIDStart('$'), ensure that '$' is never "
2797                     "handled here");
2798       static_assert(isAsciiCodePoint('_'),
2799                     "IdentifierStart contains '_', but as "
2800                     "!IsUnicodeIDStart('_'), ensure that '_' is never "
2801                     "handled here");
2802 
2803       if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
2804         this->sourceUnits.consumeKnownCodePoint(peeked);
2805         MOZ_ASSERT(!IsLineTerminator(cp),
2806                    "IdentifierStart must guarantee !IsLineTerminator "
2807                    "or else we'll fail to maintain line-info/flags "
2808                    "for EOL here");
2809 
2810         return identifierName(start, identStart, IdentifierEscapes::None,
2811                               modifier, NameVisibility::Public, ttp);
2812       }
2813 
2814       reportIllegalCharacter(cp);
2815       return badToken();
2816     }  // !isAsciiCodePoint(unit)
2817 
    // Every path through the non-ASCII block above returned or continued, so
    // |unit| is ASCII from here on.  Consume it: the code below runs with the
    // token's first code unit already consumed.
2818     consumeKnownCodeUnit(unit);
2819 
2820     // Get the token kind, based on the first char.  The ordering of c1kind
2821     // comparison is based on the frequency of tokens in real code:
2822     // Parsemark (which represents typical JS code on the web) and the
2823     // Unreal demo (which represents asm.js code).
2824     //
2825     //                  Parsemark   Unreal
2826     //  OneChar         32.9%       39.7%
2827     //  Space           25.0%        0.6%
2828     //  Ident           19.2%       36.4%
2829     //  Dec              7.2%        5.1%
2830     //  String           7.9%        0.0%
2831     //  EOL              1.7%        0.0%
2832     //  ZeroDigit        0.4%        4.9%
2833     //  Other            5.7%       13.3%
2834     //
2835     // The ordering is based mostly on Parsemark frequencies, with Unreal
2836     // frequencies used to break close categories (e.g. |Dec| and
2837     // |String|).  |Other| is biggish, but no other token kind is common
2838     // enough for it to be worth adding extra values to FirstCharKind.
2839     FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]);
2840 
2841     // Look for an unambiguous single-char token.
2842     //
    // For these kinds the FirstCharKind value is cast directly to the
    // TokenKind of the token to create.
2843     if (c1kind <= OneChar_Max) {
2844       TokenStart start(this->sourceUnits, -1);
2845       newSimpleToken(TokenKind(c1kind), start, modifier, ttp);
2846       return true;
2847     }
2848 
2849     // Skip over non-EOL whitespace chars.
2850     //
2851     if (c1kind == Space) {
2852       continue;
2853     }
2854 
2855     // Look for an identifier.
2856     //
2857     if (c1kind == Ident) {
2858       TokenStart start(this->sourceUnits, -1);
2859       return identifierName(
2860           start, this->sourceUnits.addressOfNextCodeUnit() - 1,
2861           IdentifierEscapes::None, modifier, NameVisibility::Public, ttp);
2862     }
2863 
2864     // Look for a decimal number.
2865     //
2866     if (c1kind == Dec) {
2867       TokenStart start(this->sourceUnits, -1);
2868       const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2869       return decimalNumber(unit, start, numStart, modifier, ttp);
2870     }
2871 
2872     // Look for a string or a template string.
2873     //
2874     if (c1kind == String) {
2875       return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp);
2876     }
2877 
2878     // Skip over EOL chars, updating line state along the way.
2879     //
2880     if (c1kind == EOL) {
      // The result of |matchLineTerminator| is deliberately unused;
      // presumably it consumes a following '\n' so that "\r\n" counts as a
      // single line break -- confirm against its definition.
2881       if (unit == '\r') {
2882         matchLineTerminator('\n');
2883       }
2884 
2885       if (!updateLineInfoForEOL()) {
2886         return badToken();
2887       }
2888 
2889       anyCharsAccess().updateFlagsForEOL();
2890       continue;
2891     }
2892 
2893     // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a
2894     // number starting with '0' that contains '8' or '9' and is treated as
2895     // decimal) number.
2896     //
2897     if (c1kind == ZeroDigit) {
2898       TokenStart start(this->sourceUnits, -1);
2899       int radix;
2900       bool isBigInt = false;
2901       const Unit* numStart;
2902       unit = getCodeUnit();
2903       if (unit == 'x' || unit == 'X') {
2904         radix = 16;
2905         unit = getCodeUnit();
2906         if (!IsAsciiHexDigit(unit)) {
2907           // NOTE: |unit| may be EOF here.
2908           ungetCodeUnit(unit);
2909           error(JSMSG_MISSING_HEXDIGITS);
2910           return badToken();
2911         }
2912 
2913         // one past the '0x'
2914         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2915 
2916         if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) {
2917           return badToken();
2918         }
2919       } else if (unit == 'b' || unit == 'B') {
2920         radix = 2;
2921         unit = getCodeUnit();
2922         if (!IsAsciiBinary(unit)) {
2923           // NOTE: |unit| may be EOF here.
2924           ungetCodeUnit(unit);
2925           error(JSMSG_MISSING_BINARY_DIGITS);
2926           return badToken();
2927         }
2928 
2929         // one past the '0b'
2930         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2931 
2932         if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) {
2933           return badToken();
2934         }
2935       } else if (unit == 'o' || unit == 'O') {
2936         radix = 8;
2937         unit = getCodeUnit();
2938         if (!IsAsciiOctal(unit)) {
2939           // NOTE: |unit| may be EOF here.
2940           ungetCodeUnit(unit);
2941           error(JSMSG_MISSING_OCTAL_DIGITS);
2942           return badToken();
2943         }
2944 
2945         // one past the '0o'
2946         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2947 
2948         if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) {
2949           return badToken();
2950         }
2951       } else if (IsAsciiDigit(unit)) {
2952         // Reject octal literals that appear in strict mode code.
2953         if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) {
2954           return badToken();
2955         }
2956 
2957         // The above test doesn't catch a few edge cases; see
2958         // |GeneralParser::maybeParseDirective|.  Record the violation so that
2959         // that function can handle them.
2960         anyCharsAccess().setSawDeprecatedOctalLiteral();
2961 
2962         radix = 8;
2963         // one past the '0'
2964         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2965 
        // Consume the remaining digits.  |unit| is an ASCII digit on every
        // iteration, so |unit >= '8'| means an '8' or '9' was seen.
2966         bool nonOctalDecimalIntegerLiteral = false;
2967         do {
2968           if (unit >= '8') {
2969             nonOctalDecimalIntegerLiteral = true;
2970           }
2971           unit = getCodeUnit();
2972         } while (IsAsciiDigit(unit));
2973 
2974         if (unit == '_') {
2975           error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2976           return badToken();
2977         }
2978 
2979         if (unit == 'n') {
2980           error(JSMSG_BIGINT_INVALID_SYNTAX);
2981           return badToken();
2982         }
2983 
2984         if (nonOctalDecimalIntegerLiteral) {
2985           // Use the decimal scanner for the rest of the number.
2986           return decimalNumber(unit, start, numStart, modifier, ttp);
2987         }
2988       } else if (unit == '_') {
2989         // Give a more explicit error message when '_' is used after '0'.
2990         error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2991         return badToken();
2992       } else {
2993         // '0' not followed by [XxBbOo0-9_];  scan as a decimal number.
2994         numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2995 
2996         // NOTE: |unit| may be EOF here.  (This is permitted by case #3
2997         //       in TokenStream.h docs for this function.)
2998         return decimalNumber(unit, start, numStart, modifier, ttp);
2999       }
3000 
      // Only 0x/0b/0o literals and legacy octal literals without an '8' or
      // '9' reach this point; every other branch above returned.  |unit| is
      // the consumed code unit just past the digits.
3001       if (unit == 'n') {
3002         isBigInt = true;
3003         unit = peekCodeUnit();
3004       } else {
3005         ungetCodeUnit(unit);
3006       }
3007 
3008       // Error if an identifier-start code point appears immediately
3009       // after the number.  Somewhat surprisingly, if we don't check
3010       // here, we'll never check at all.
3011       if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3012         if (unicode::IsIdentifierStart(char16_t(unit))) {
3013           error(JSMSG_IDSTART_AFTER_NUMBER);
3014           return badToken();
3015         }
3016       } else if (MOZ_LIKELY(unit != EOF)) {
3017         // This ignores encoding errors: subsequent caller-side code to
3018         // handle source text after the number will do so.
3019         PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
3020         if (!peeked.isNone() &&
3021             unicode::IsIdentifierStart(peeked.codePoint())) {
3022           error(JSMSG_IDSTART_AFTER_NUMBER);
3023           return badToken();
3024         }
3025       }
3026 
3027       if (isBigInt) {
3028         return bigIntLiteral(start, modifier, ttp);
3029       }
3030 
3031       double dval;
3032       if (!GetFullInteger(anyCharsAccess().cx, numStart,
3033                           this->sourceUnits.addressOfNextCodeUnit(), radix,
3034                           IntegerSeparatorHandling::SkipUnderscore, &dval)) {
3035         return badToken();
3036       }
3037       newNumberToken(dval, NoDecimal, start, modifier, ttp);
3038       return true;
3039     }
3040 
3041     MOZ_ASSERT(c1kind == Other);
3042 
3043     // This handles everything else.  Simple tokens distinguished solely by
3044     // TokenKind should set |simpleKind| and break, to share simple-token
3045     // creation code for all such tokens.  All other tokens must be handled
3046     // by returning (or by continuing from the loop enclosing this).
3047     //
3048     TokenStart start(this->sourceUnits, -1);
3049     TokenKind simpleKind;
3050 #ifdef DEBUG
3051     simpleKind = TokenKind::Limit;  // sentinel value for code after switch
3052 #endif
3053 
3054     // The block a ways above eliminated all non-ASCII, so cast to the
3055     // smallest type possible to assist the C++ compiler.
3056     switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3057       case '.':
        // ".<digit>" starts a decimal number with no integral part.
3058         if (IsAsciiDigit(peekCodeUnit())) {
3059           return decimalNumber('.', start,
3060                                this->sourceUnits.addressOfNextCodeUnit() - 1,
3061                                modifier, ttp);
3062         }
3063 
3064         unit = getCodeUnit();
3065         if (unit == '.') {
3066           if (matchCodeUnit('.')) {
3067             simpleKind = TokenKind::TripleDot;
3068             break;
3069           }
3070         }
3071 
3072         // NOTE: |unit| may be EOF here.  A stray '.' at EOF would be an
3073         //       error, but subsequent code will handle it.
3074         ungetCodeUnit(unit);
3075 
3076         simpleKind = TokenKind::Dot;
3077         break;
3078 
3079       case '#': {
3080         if (options().privateClassFields) {
          // NOTE(review): this |start| shadows the one declared just before
          // the |switch|, which is constructed with the same arguments.
3081           TokenStart start(this->sourceUnits, -1);
3082           const Unit* identStart =
3083               this->sourceUnits.addressOfNextCodeUnit() - 1;
3084           IdentifierEscapes sawEscape;
3085           if (!matchIdentifierStart(&sawEscape)) {
3086             return badToken();
3087           }
3088           return identifierName(start, identStart, sawEscape, modifier,
3089                                 NameVisibility::Private, ttp);
3090         }
3091         ungetCodeUnit(unit);
3092         error(JSMSG_PRIVATE_FIELDS_NOT_SUPPORTED);
3093         return badToken();
3094       }
3095 
3096       case '=':
3097         if (matchCodeUnit('=')) {
3098           simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq;
3099         } else if (matchCodeUnit('>')) {
3100           simpleKind = TokenKind::Arrow;
3101         } else {
3102           simpleKind = TokenKind::Assign;
3103         }
3104         break;
3105 
3106       case '+':
3107         if (matchCodeUnit('+')) {
3108           simpleKind = TokenKind::Inc;
3109         } else {
3110           simpleKind =
3111               matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add;
3112         }
3113         break;
3114 
3115       case '\\': {
        // A backslash here is only valid as the start of a Unicode escape
        // that begins an identifier.
3116         uint32_t codePoint;
3117         if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) {
3118           return identifierName(
3119               start,
3120               this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1,
3121               IdentifierEscapes::SawUnicodeEscape, modifier,
3122               NameVisibility::Public, ttp);
3123         }
3124 
3125         // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
3126         // could point at the 'H'.  But we don't do that now, so the code
3127         // unit after the '\' isn't necessarily bad, so just point at the
3128         // start of the actually-invalid escape.
3129         ungetCodeUnit('\\');
3130         error(JSMSG_BAD_ESCAPE);
3131         return badToken();
3132       }
3133 
3134       case '|':
3135         if (matchCodeUnit('|')) {
3136           simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or;
3137         } else {
3138           simpleKind =
3139               matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr;
3140         }
3141         break;
3142 
3143       case '^':
3144         simpleKind =
3145             matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor;
3146         break;
3147 
3148       case '&':
3149         if (matchCodeUnit('&')) {
3150           simpleKind =
3151               matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And;
3152         } else {
3153           simpleKind =
3154               matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd;
3155         }
3156         break;
3157 
3158       case '?':
3159         if (matchCodeUnit('.')) {
3160           unit = getCodeUnit();
3161           if (IsAsciiDigit(unit)) {
3162             // if the code unit is followed by a number, for example it has the
3163             // following form `<...> ?.5 <..> then it should be treated as a
3164             // ternary rather than as an optional chain
3165             simpleKind = TokenKind::Hook;
3166             ungetCodeUnit(unit);
3167             ungetCodeUnit('.');
3168           } else {
3169             ungetCodeUnit(unit);
3170             simpleKind = TokenKind::OptionalChain;
3171           }
3172         } else if (matchCodeUnit('?')) {
3173           simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign
3174                                           : TokenKind::Coalesce;
3175         } else {
3176           simpleKind = TokenKind::Hook;
3177         }
3178         break;
3179 
3180       case '!':
3181         if (matchCodeUnit('=')) {
3182           simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne;
3183         } else {
3184           simpleKind = TokenKind::Not;
3185         }
3186         break;
3187 
3188       case '<':
3189         if (anyCharsAccess().options().allowHTMLComments) {
3190           // Treat HTML begin-comment as comment-till-end-of-line.
3191           if (matchCodeUnit('!')) {
3192             if (matchCodeUnit('-')) {
3193               if (matchCodeUnit('-')) {
3194                 this->sourceUnits.consumeRestOfSingleLineComment();
3195                 continue;
3196               }
              // "<!-" without a second '-': restore the '-'.
3197               ungetCodeUnit('-');
3198             }
            // Not an HTML comment opener: restore the '!'.
3199             ungetCodeUnit('!');
3200           }
3201         }
3202         if (matchCodeUnit('<')) {
3203           simpleKind =
3204               matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh;
3205         } else {
3206           simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt;
3207         }
3208         break;
3209 
3210       case '>':
3211         if (matchCodeUnit('>')) {
3212           if (matchCodeUnit('>')) {
3213             simpleKind =
3214                 matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh;
3215           } else {
3216             simpleKind =
3217                 matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh;
3218           }
3219         } else {
3220           simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt;
3221         }
3222         break;
3223 
3224       case '*':
3225         if (matchCodeUnit('*')) {
3226           simpleKind =
3227               matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow;
3228         } else {
3229           simpleKind =
3230               matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul;
3231         }
3232         break;
3233 
3234       case '/':
3235         // Look for a single-line comment.
3236         if (matchCodeUnit('/')) {
3237           unit = getCodeUnit();
3238           if (unit == '@' || unit == '#') {
3239             bool shouldWarn = unit == '@';
            // NOTE(review): unlike the multi-line case below, this failure
            // path returns |false| directly rather than |badToken()| --
            // confirm that |getDirectives| performs the bad-token
            // bookkeeping itself.
3240             if (!getDirectives(false, shouldWarn)) {
3241               return false;
3242             }
3243           } else {
3244             // NOTE: |unit| may be EOF here.
3245             ungetCodeUnit(unit);
3246           }
3247 
3248           this->sourceUnits.consumeRestOfSingleLineComment();
3249           continue;
3250         }
3251 
3252         // Look for a multi-line comment.
3253         if (matchCodeUnit('*')) {
3254           TokenStreamAnyChars& anyChars = anyCharsAccess();
3255           unsigned linenoBefore = anyChars.lineno;
3256 
3257           do {
            // This |unit| shadows the outer |unit| inside the comment loop.
3258             int32_t unit = getCodeUnit();
3259             if (unit == EOF) {
3260               error(JSMSG_UNTERMINATED_COMMENT);
3261               return badToken();
3262             }
3263 
3264             if (unit == '*' && matchCodeUnit('/')) {
3265               break;
3266             }
3267 
3268             if (unit == '@' || unit == '#') {
3269               bool shouldWarn = unit == '@';
3270               if (!getDirectives(true, shouldWarn)) {
3271                 return badToken();
3272               }
3273             } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3274               int32_t codePoint;
3275               if (!getFullAsciiCodePoint(unit, &codePoint)) {
3276                 return badToken();
3277               }
3278             } else {
3279               int32_t codePoint;
3280               if (!getNonAsciiCodePoint(unit, &codePoint)) {
3281                 return badToken();
3282               }
3283             }
3284           } while (true);
3285 
          // The line number changed while scanning the comment, so apply the
          // end-of-line flag update once for the whole comment.
3286           if (linenoBefore != anyChars.lineno) {
3287             anyChars.updateFlagsForEOL();
3288           }
3289 
3290           continue;
3291         }
3292 
3293         // Look for a regexp.
3294         if (modifier == SlashIsRegExp) {
3295           return regexpLiteral(start, ttp);
3296         }
3297 
3298         simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div;
3299         break;
3300 
3301       case '%':
3302         simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod;
3303         break;
3304 
3305       case '-':
3306         if (matchCodeUnit('-')) {
          // "-->" is skipped like a single-line comment, but only when HTML
          // comments are allowed and |flags.isDirtyLine| is not set.
3307           if (anyCharsAccess().options().allowHTMLComments &&
3308               !anyCharsAccess().flags.isDirtyLine) {
3309             if (matchCodeUnit('>')) {
3310               this->sourceUnits.consumeRestOfSingleLineComment();
3311               continue;
3312             }
3313           }
3314 
3315           simpleKind = TokenKind::Dec;
3316         } else {
3317           simpleKind =
3318               matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
3319         }
3320         break;
3321 
3322       default:
3323         // We consumed a bad ASCII code point/unit.  Put it back so the
3324         // error location is the bad code point.
3325         ungetCodeUnit(unit);
3326         reportIllegalCharacter(unit);
3327         return badToken();
3328     }  // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3329 
3330     MOZ_ASSERT(simpleKind != TokenKind::Limit,
3331                "switch-statement should have set |simpleKind| before "
3332                "breaking");
3333 
3334     newSimpleToken(simpleKind, start, modifier, ttp);
3335     return true;
3336   } while (true);
3337 }
3338 
3339 template <typename Unit, class AnyCharsAccess>
getStringOrTemplateToken(char untilChar,Modifier modifier,TokenKind * out)3340 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
3341     char untilChar, Modifier modifier, TokenKind* out) {
3342   MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`',
3343              "unexpected string/template literal delimiter");
3344 
3345   bool parsingTemplate = (untilChar == '`');
3346   bool templateHead = false;
3347 
3348   TokenStart start(this->sourceUnits, -1);
3349   this->charBuffer.clear();
3350 
3351   // Run the bad-token code for every path out of this function except the
3352   // one success-case.
3353   auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
3354 
3355   auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) {
3356     // Unicode separators aren't end-of-line in template or (as of
3357     // recently) string literals, so this assertion doesn't allow them.
3358     MOZ_ASSERT(this->sourceUnits.atEnd() ||
3359                    this->sourceUnits.peekCodeUnit() == Unit('\r') ||
3360                    this->sourceUnits.peekCodeUnit() == Unit('\n'),
3361                "must be parked at EOF or EOL to call this function");
3362 
3363     // The various errors reported here include language like "in a ''
3364     // literal" or similar, with '' being '', "", or `` as appropriate.
3365     const char delimiters[] = {untilChar, untilChar, '\0'};
3366 
3367     this->error(errnum, delimiters);
3368     return;
3369   };
3370 
3371   // We need to detect any of these chars:  " or ', \n (or its
3372   // equivalents), \\, EOF.  Because we detect EOL sequences here and
3373   // put them back immediately, we can use getCodeUnit().
3374   int32_t unit;
3375   while ((unit = getCodeUnit()) != untilChar) {
3376     if (unit == EOF) {
3377       ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL);
3378       return false;
3379     }
3380 
3381     // Non-ASCII code points are always directly appended -- even
3382     // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are
3383     // ordinarily LineTerminatorSequences.  (They contribute their literal
3384     // values to template and [as of recently] string literals, but they're
3385     // line terminators when computing line/column coordinates.)  Handle
3386     // the non-ASCII case early for readability.
3387     if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
3388       char32_t cp;
3389       if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
3390         return false;
3391       }
3392 
3393       if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
3394                        cp == unicode::PARA_SEPARATOR)) {
3395         if (!updateLineInfoForEOL()) {
3396           return false;
3397         }
3398 
3399         anyCharsAccess().updateFlagsForEOL();
3400       } else {
3401         MOZ_ASSERT(!IsLineTerminator(cp));
3402       }
3403 
3404       if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
3405         return false;
3406       }
3407 
3408       continue;
3409     }
3410 
3411     if (unit == '\\') {
3412       // When parsing templates, we don't immediately report errors for
3413       // invalid escapes; these are handled by the parser.  We don't
3414       // append to charBuffer in those cases because it won't be read.
3415       unit = getCodeUnit();
3416       if (unit == EOF) {
3417         ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3418         return false;
3419       }
3420 
3421       // Non-ASCII |unit| isn't handled by code after this, so dedicate
3422       // an unlikely special-case to it and then continue.
3423       if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
3424         int32_t codePoint;
3425         if (!getNonAsciiCodePoint(unit, &codePoint)) {
3426           return false;
3427         }
3428 
3429         // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH
3430         // SEPARATOR, they'll be normalized to '\n'.  '\' followed by
3431         // LineContinuation represents no code points, so don't append
3432         // in this case.
3433         if (codePoint != '\n') {
3434           if (!AppendCodePointToCharBuffer(this->charBuffer,
3435                                            AssertedCast<char32_t>(codePoint))) {
3436             return false;
3437           }
3438         }
3439 
3440         continue;
3441       }
3442 
3443       // The block above eliminated all non-ASCII, so cast to the
3444       // smallest type possible to assist the C++ compiler.
3445       switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3446         case 'b':
3447           unit = '\b';
3448           break;
3449         case 'f':
3450           unit = '\f';
3451           break;
3452         case 'n':
3453           unit = '\n';
3454           break;
3455         case 'r':
3456           unit = '\r';
3457           break;
3458         case 't':
3459           unit = '\t';
3460           break;
3461         case 'v':
3462           unit = '\v';
3463           break;
3464 
3465         case '\r':
3466           matchLineTerminator('\n');
3467           [[fallthrough]];
3468         case '\n': {
3469           // LineContinuation represents no code points.  We're manually
3470           // consuming a LineTerminatorSequence, so we must manually
3471           // update line/column info.
3472           if (!updateLineInfoForEOL()) {
3473             return false;
3474           }
3475 
3476           continue;
3477         }
3478 
3479         // Unicode character specification.
3480         case 'u': {
3481           int32_t c2 = getCodeUnit();
3482           if (c2 == EOF) {
3483             ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3484             return false;
3485           }
3486 
3487           // First handle a delimited Unicode escape, e.g. \u{1F4A9}.
3488           if (c2 == '{') {
3489             uint32_t start = this->sourceUnits.offset() - 3;
3490             uint32_t code = 0;
3491             bool first = true;
3492             bool valid = true;
3493             do {
3494               int32_t u3 = getCodeUnit();
3495               if (u3 == EOF) {
3496                 if (parsingTemplate) {
3497                   TokenStreamAnyChars& anyChars = anyCharsAccess();
3498                   anyChars.setInvalidTemplateEscape(start,
3499                                                     InvalidEscapeType::Unicode);
3500                   valid = false;
3501                   break;
3502                 }
3503                 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3504                 return false;
3505               }
3506               if (u3 == '}') {
3507                 if (first) {
3508                   if (parsingTemplate) {
3509                     TokenStreamAnyChars& anyChars = anyCharsAccess();
3510                     anyChars.setInvalidTemplateEscape(
3511                         start, InvalidEscapeType::Unicode);
3512                     valid = false;
3513                     break;
3514                   }
3515                   reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3516                   return false;
3517                 }
3518                 break;
3519               }
3520 
3521               // Beware: |u3| may be a non-ASCII code point here; if
3522               // so it'll pass into this |if|-block.
3523               if (!IsAsciiHexDigit(u3)) {
3524                 if (parsingTemplate) {
3525                   // We put the code unit back so that we read it
3526                   // on the next pass, which matters if it was
3527                   // '`' or '\'.
3528                   ungetCodeUnit(u3);
3529 
3530                   TokenStreamAnyChars& anyChars = anyCharsAccess();
3531                   anyChars.setInvalidTemplateEscape(start,
3532                                                     InvalidEscapeType::Unicode);
3533                   valid = false;
3534                   break;
3535                 }
3536                 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3537                 return false;
3538               }
3539 
3540               code = (code << 4) | AsciiAlphanumericToNumber(u3);
3541               if (code > unicode::NonBMPMax) {
3542                 if (parsingTemplate) {
3543                   TokenStreamAnyChars& anyChars = anyCharsAccess();
3544                   anyChars.setInvalidTemplateEscape(
3545                       start + 3, InvalidEscapeType::UnicodeOverflow);
3546                   valid = false;
3547                   break;
3548                 }
3549                 reportInvalidEscapeError(start + 3,
3550                                          InvalidEscapeType::UnicodeOverflow);
3551                 return false;
3552               }
3553 
3554               first = false;
3555             } while (true);
3556 
3557             if (!valid) {
3558               continue;
3559             }
3560 
3561             MOZ_ASSERT(code <= unicode::NonBMPMax);
3562             if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
3563               return false;
3564             }
3565 
3566             continue;
3567           }  // end of delimited Unicode escape handling
3568 
3569           // Otherwise it must be a fixed-length \uXXXX Unicode escape.
3570           // If it isn't, this is usually an error -- but if this is a
3571           // template literal, we must defer error reporting because
3572           // malformed escapes are okay in *tagged* template literals.
3573           char16_t v;
3574           if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) {
3575             unit = (AsciiAlphanumericToNumber(c2) << 12) | v;
3576           } else {
3577             // Beware: |c2| may not be an ASCII code point here!
3578             ungetCodeUnit(c2);
3579             uint32_t start = this->sourceUnits.offset() - 2;
3580             if (parsingTemplate) {
3581               TokenStreamAnyChars& anyChars = anyCharsAccess();
3582               anyChars.setInvalidTemplateEscape(start,
3583                                                 InvalidEscapeType::Unicode);
3584               continue;
3585             }
3586             reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3587             return false;
3588           }
3589           break;
3590         }  // case 'u'
3591 
3592         // Hexadecimal character specification.
3593         case 'x': {
3594           char16_t v;
3595           if (this->sourceUnits.matchHexDigits(2, &v)) {
3596             unit = v;
3597           } else {
3598             uint32_t start = this->sourceUnits.offset() - 2;
3599             if (parsingTemplate) {
3600               TokenStreamAnyChars& anyChars = anyCharsAccess();
3601               anyChars.setInvalidTemplateEscape(start,
3602                                                 InvalidEscapeType::Hexadecimal);
3603               continue;
3604             }
3605             reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal);
3606             return false;
3607           }
3608           break;
3609         }
3610 
3611         default: {
3612           if (!IsAsciiOctal(unit)) {
3613             // \8 or \9 in an untagged template literal is a syntax error,
3614             // reported in GeneralParser::noSubstitutionUntaggedTemplate.
3615             //
3616             // Tagged template literals, however, may contain \8 and \9.  The
3617             // "cooked" representation of such a part will be |undefined|, and
3618             // the "raw" representation will contain the literal characters.
3619             //
3620             //   function f(parts) {
3621             //     assertEq(parts[0], undefined);
3622             //     assertEq(parts.raw[0], "\\8");
3623             //     return "composed";
3624             //   }
3625             //   assertEq(f`\8`, "composed");
3626             if (unit == '8' || unit == '9') {
3627               TokenStreamAnyChars& anyChars = anyCharsAccess();
3628               if (parsingTemplate) {
3629                 anyChars.setInvalidTemplateEscape(
3630                     this->sourceUnits.offset() - 2,
3631                     InvalidEscapeType::EightOrNine);
3632                 continue;
3633               }
3634 
3635               // \8 and \9 are forbidden in string literals in strict mode code.
3636               if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) {
3637                 return false;
3638               }
3639 
3640               // The above test doesn't catch a few edge cases; see
3641               // |GeneralParser::maybeParseDirective|.  Record the violation so
3642               // that that function can handle them.
3643               anyChars.setSawDeprecatedEightOrNineEscape();
3644             }
3645             break;
3646           }
3647 
3648           // Octal character specification.
3649           int32_t val = AsciiOctalToNumber(unit);
3650 
3651           unit = peekCodeUnit();
3652           if (MOZ_UNLIKELY(unit == EOF)) {
3653             ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3654             return false;
3655           }
3656 
3657           // Strict mode code allows only \0 followed by a non-digit.
3658           if (val != 0 || IsAsciiDigit(unit)) {
3659             TokenStreamAnyChars& anyChars = anyCharsAccess();
3660             if (parsingTemplate) {
3661               anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2,
3662                                                 InvalidEscapeType::Octal);
3663               continue;
3664             }
3665 
3666             if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) {
3667               return false;
3668             }
3669 
3670             // The above test doesn't catch a few edge cases; see
3671             // |GeneralParser::maybeParseDirective|.  Record the violation so
3672             // that that function can handle them.
3673             anyChars.setSawDeprecatedOctalEscape();
3674           }
3675 
3676           if (IsAsciiOctal(unit)) {
3677             val = 8 * val + AsciiOctalToNumber(unit);
3678             consumeKnownCodeUnit(unit);
3679 
3680             unit = peekCodeUnit();
3681             if (MOZ_UNLIKELY(unit == EOF)) {
3682               ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3683               return false;
3684             }
3685 
3686             if (IsAsciiOctal(unit)) {
3687               int32_t save = val;
3688               val = 8 * val + AsciiOctalToNumber(unit);
3689               if (val <= 0xFF) {
3690                 consumeKnownCodeUnit(unit);
3691               } else {
3692                 val = save;
3693               }
3694             }
3695           }
3696 
3697           unit = char16_t(val);
3698           break;
3699         }  // default
3700       }    // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3701 
3702       if (!this->charBuffer.append(unit)) {
3703         return false;
3704       }
3705 
3706       continue;
3707     }  // (unit == '\\')
3708 
3709     if (unit == '\r' || unit == '\n') {
3710       if (!parsingTemplate) {
3711         // String literals don't allow ASCII line breaks.
3712         ungetCodeUnit(unit);
3713         ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING);
3714         return false;
3715       }
3716 
3717       if (unit == '\r') {
3718         unit = '\n';
3719         matchLineTerminator('\n');
3720       }
3721 
3722       if (!updateLineInfoForEOL()) {
3723         return false;
3724       }
3725 
3726       anyCharsAccess().updateFlagsForEOL();
3727     } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) {
3728       templateHead = true;
3729       break;
3730     }
3731 
3732     if (!this->charBuffer.append(unit)) {
3733       return false;
3734     }
3735   }
3736 
3737   TaggedParserAtomIndex atom = drainCharBufferIntoAtom();
3738   if (!atom) {
3739     return false;
3740   }
3741 
3742   noteBadToken.release();
3743 
3744   MOZ_ASSERT_IF(!parsingTemplate, !templateHead);
3745 
3746   TokenKind kind = !parsingTemplate ? TokenKind::String
3747                    : templateHead   ? TokenKind::TemplateHead
3748                                     : TokenKind::NoSubsTemplate;
3749   newAtomToken(kind, atom, start, modifier, out);
3750   return true;
3751 }
3752 
TokenKindToDesc(TokenKind tt)3753 const char* TokenKindToDesc(TokenKind tt) {
3754   switch (tt) {
3755 #define EMIT_CASE(name, desc) \
3756   case TokenKind::name:       \
3757     return desc;
3758     FOR_EACH_TOKEN_KIND(EMIT_CASE)
3759 #undef EMIT_CASE
3760     case TokenKind::Limit:
3761       MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
3762       break;
3763   }
3764 
3765   return "<bad TokenKind>";
3766 }
3767 
#ifdef DEBUG
// Debug-only helper: returns the enumerator spelling of |tt| prefixed with
// "TokenKind::" (built by stringizing each name in FOR_EACH_TOKEN_KIND).
// TokenKind::Limit and any value not covered by the switch yield
// "<bad TokenKind>".
const char* TokenKindToString(TokenKind tt) {
  const char* spelling = "<bad TokenKind>";

  switch (tt) {
#  define TOKEN_KIND_NAME_CASE(name, desc) \
    case TokenKind::name:                  \
      spelling = "TokenKind::" #name;      \
      break;
    FOR_EACH_TOKEN_KIND(TOKEN_KIND_NAME_CASE)
#  undef TOKEN_KIND_NAME_CASE
    case TokenKind::Limit:
      break;
  }

  return spelling;
}
#endif
3783 
3784 template class TokenStreamCharsBase<Utf8Unit>;
3785 template class TokenStreamCharsBase<char16_t>;
3786 
3787 template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
3788 template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
3789 template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>;
3790 
3791 template class GeneralTokenStreamChars<
3792     Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
3793 template class GeneralTokenStreamChars<
3794     Utf8Unit,
3795     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
3796 template class GeneralTokenStreamChars<
3797     char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
3798 template class GeneralTokenStreamChars<
3799     char16_t,
3800     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3801 
3802 template class TokenStreamChars<
3803     Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
3804 template class TokenStreamChars<
3805     Utf8Unit,
3806     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
3807 template class TokenStreamChars<
3808     char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
3809 template class TokenStreamChars<
3810     char16_t,
3811     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3812 
3813 template class TokenStreamSpecific<
3814     Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
3815 template class TokenStreamSpecific<
3816     Utf8Unit,
3817     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
3818 template class TokenStreamSpecific<
3819     char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
3820 template class TokenStreamSpecific<
3821     char16_t,
3822     ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3823 
3824 }  // namespace frontend
3825 
3826 }  // namespace js
3827