1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 // JS lexical scanner.
8
9 #include "frontend/TokenStream.h"
10
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/Attributes.h"
13 #include "mozilla/IntegerTypeTraits.h"
14 #include "mozilla/Likely.h"
15 #include "mozilla/Maybe.h"
16 #include "mozilla/MemoryChecking.h"
17 #include "mozilla/ScopeExit.h"
18 #include "mozilla/Span.h"
19 #include "mozilla/TemplateLib.h"
20 #include "mozilla/TextUtils.h"
21 #include "mozilla/Utf8.h"
22
23 #include <algorithm>
24 #include <iterator>
25 #include <stdarg.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <type_traits>
30 #include <utility>
31
32 #include "jsexn.h"
33 #include "jsnum.h"
34
35 #include "frontend/BytecodeCompiler.h"
36 #include "frontend/Parser.h"
37 #include "frontend/ParserAtom.h"
38 #include "frontend/ReservedWords.h"
39 #include "js/CharacterEncoding.h"
40 #include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_*
41 #include "js/Printf.h" // JS_smprintf
42 #include "js/RegExpFlags.h" // JS::RegExpFlags
43 #include "js/UniquePtr.h"
44 #include "util/StringBuffer.h"
45 #include "util/Text.h"
46 #include "util/Unicode.h"
47 #include "vm/FrameIter.h" // js::{,NonBuiltin}FrameIter
48 #include "vm/HelperThreads.h"
49 #include "vm/JSAtom.h"
50 #include "vm/JSContext.h"
51 #include "vm/Realm.h"
52 #include "vm/WellKnownAtom.h" // js_*_str
53
54 using mozilla::AsciiAlphanumericToNumber;
55 using mozilla::AssertedCast;
56 using mozilla::DecodeOneUtf8CodePoint;
57 using mozilla::IsAscii;
58 using mozilla::IsAsciiAlpha;
59 using mozilla::IsAsciiDigit;
60 using mozilla::IsAsciiHexDigit;
61 using mozilla::IsTrailingUnit;
62 using mozilla::MakeScopeExit;
63 using mozilla::Maybe;
64 using mozilla::PointerRangeSize;
65 using mozilla::Span;
66 using mozilla::Utf8Unit;
67
68 using JS::ReadOnlyCompileOptions;
69 using JS::RegExpFlag;
70 using JS::RegExpFlags;
71
72 struct ReservedWordInfo {
73 const char* chars; // C string with reserved word text
74 js::frontend::TokenKind tokentype;
75 };
76
77 static const ReservedWordInfo reservedWords[] = {
78 #define RESERVED_WORD_INFO(word, name, type) \
79 {js_##word##_str, js::frontend::type},
80 FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
81 #undef RESERVED_WORD_INFO
82 };
83
84 enum class ReservedWordsIndex : size_t {
85 #define ENTRY_(_1, NAME, _3) NAME,
86 FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
87 #undef ENTRY_
88 };
89
90 // Returns a ReservedWordInfo for the specified characters, or nullptr if the
91 // string is not a reserved word.
92 template <typename CharT>
FindReservedWord(const CharT * s,size_t length)93 static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
94 MOZ_ASSERT(length != 0);
95
96 size_t i;
97 const ReservedWordInfo* rw;
98 const char* chars;
99
100 #define JSRW_LENGTH() length
101 #define JSRW_AT(column) s[column]
102 #define JSRW_GOT_MATCH(index) \
103 i = (index); \
104 goto got_match;
105 #define JSRW_TEST_GUESS(index) \
106 i = (index); \
107 goto test_guess;
108 #define JSRW_NO_MATCH() goto no_match;
109 #include "frontend/ReservedWordsGenerated.h"
110 #undef JSRW_NO_MATCH
111 #undef JSRW_TEST_GUESS
112 #undef JSRW_GOT_MATCH
113 #undef JSRW_AT
114 #undef JSRW_LENGTH
115
116 got_match:
117 return &reservedWords[i];
118
119 test_guess:
120 rw = &reservedWords[i];
121 chars = rw->chars;
122 do {
123 if (*s++ != static_cast<unsigned char>(*chars++)) {
124 goto no_match;
125 }
126 } while (--length != 0);
127 return rw;
128
129 no_match:
130 return nullptr;
131 }
132
133 template <>
FindReservedWord(const Utf8Unit * units,size_t length)134 MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
135 const Utf8Unit* units, size_t length) {
136 return FindReservedWord(Utf8AsUnsignedChars(units), length);
137 }
138
FindReservedWord(const js::frontend::TaggedParserAtomIndex atom)139 static const ReservedWordInfo* FindReservedWord(
140 const js::frontend::TaggedParserAtomIndex atom) {
141 switch (atom.rawData()) {
142 #define CASE_(_1, NAME, _3) \
143 case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
144 return &reservedWords[size_t(ReservedWordsIndex::NAME)];
145 FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
146 #undef CASE_
147 }
148
149 return nullptr;
150 }
151
GetSingleCodePoint(const char16_t ** p,const char16_t * end)152 static uint32_t GetSingleCodePoint(const char16_t** p, const char16_t* end) {
153 using namespace js;
154
155 uint32_t codePoint;
156 if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(**p)) && *p + 1 < end) {
157 char16_t lead = **p;
158 char16_t maybeTrail = *(*p + 1);
159 if (unicode::IsTrailSurrogate(maybeTrail)) {
160 *p += 2;
161 return unicode::UTF16Decode(lead, maybeTrail);
162 }
163 }
164
165 codePoint = **p;
166 (*p)++;
167 return codePoint;
168 }
169
// True iff |c| is the ASCII digit '0' or '1'.  The unit is compared through
// its unsigned counterpart so negative values of signed character types can't
// match.
template <typename CharT>
static constexpr bool IsAsciiBinary(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  if (unit == '0') {
    return true;
  }
  return unit == '1';
}
176
// True iff |c| is one of the ASCII digits '0' through '7'.  The range check is
// done on the unsigned counterpart of the unit's type.
template <typename CharT>
static constexpr bool IsAsciiOctal(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return !(unit < '0' || unit > '7');
}
183
// Converts a unit to its distance from ASCII '0', truncated to uint8_t.  This
// function does no validation itself; for an octal digit the result is 0..7.
template <typename CharT>
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  const auto unit = static_cast<std::make_unsigned_t<CharT>>(c);
  return static_cast<uint8_t>(unit - '0');
}
190
191 namespace js {
192
193 namespace frontend {
194
IsIdentifier(JSLinearString * str)195 bool IsIdentifier(JSLinearString* str) {
196 JS::AutoCheckCannotGC nogc;
197 MOZ_ASSERT(str);
198 if (str->hasLatin1Chars()) {
199 return IsIdentifier(str->latin1Chars(nogc), str->length());
200 }
201 return IsIdentifier(str->twoByteChars(nogc), str->length());
202 }
203
IsIdentifierNameOrPrivateName(JSLinearString * str)204 bool IsIdentifierNameOrPrivateName(JSLinearString* str) {
205 JS::AutoCheckCannotGC nogc;
206 MOZ_ASSERT(str);
207 if (str->hasLatin1Chars()) {
208 return IsIdentifierNameOrPrivateName(str->latin1Chars(nogc), str->length());
209 }
210 return IsIdentifierNameOrPrivateName(str->twoByteChars(nogc), str->length());
211 }
212
IsIdentifier(const Latin1Char * chars,size_t length)213 bool IsIdentifier(const Latin1Char* chars, size_t length) {
214 if (length == 0) {
215 return false;
216 }
217
218 if (!unicode::IsIdentifierStart(char16_t(*chars))) {
219 return false;
220 }
221
222 const Latin1Char* end = chars + length;
223 while (++chars != end) {
224 if (!unicode::IsIdentifierPart(char16_t(*chars))) {
225 return false;
226 }
227 }
228
229 return true;
230 }
231
IsIdentifierASCII(char c)232 bool IsIdentifierASCII(char c) { return unicode::IsIdentifierStartASCII(c); }
233
IsIdentifierASCII(char c1,char c2)234 bool IsIdentifierASCII(char c1, char c2) {
235 return unicode::IsIdentifierStartASCII(c1) &&
236 unicode::IsIdentifierPartASCII(c2);
237 }
238
IsIdentifierNameOrPrivateName(const Latin1Char * chars,size_t length)239 bool IsIdentifierNameOrPrivateName(const Latin1Char* chars, size_t length) {
240 if (length == 0) {
241 return false;
242 }
243
244 // Skip over any private name marker.
245 if (*chars == '#') {
246 ++chars;
247 --length;
248 }
249
250 return IsIdentifier(chars, length);
251 }
252
IsIdentifier(const char16_t * chars,size_t length)253 bool IsIdentifier(const char16_t* chars, size_t length) {
254 if (length == 0) {
255 return false;
256 }
257
258 const char16_t* p = chars;
259 const char16_t* end = chars + length;
260 uint32_t codePoint;
261
262 codePoint = GetSingleCodePoint(&p, end);
263 if (!unicode::IsIdentifierStart(codePoint)) {
264 return false;
265 }
266
267 while (p < end) {
268 codePoint = GetSingleCodePoint(&p, end);
269 if (!unicode::IsIdentifierPart(codePoint)) {
270 return false;
271 }
272 }
273
274 return true;
275 }
276
IsIdentifierNameOrPrivateName(const char16_t * chars,size_t length)277 bool IsIdentifierNameOrPrivateName(const char16_t* chars, size_t length) {
278 if (length == 0) {
279 return false;
280 }
281
282 const char16_t* p = chars;
283 const char16_t* end = chars + length;
284 uint32_t codePoint;
285
286 codePoint = GetSingleCodePoint(&p, end);
287
288 // Skip over any private name marker.
289 if (codePoint == '#') {
290 // The identifier part of a private name mustn't be empty.
291 if (length == 1) {
292 return false;
293 }
294
295 codePoint = GetSingleCodePoint(&p, end);
296 }
297
298 if (!unicode::IsIdentifierStart(codePoint)) {
299 return false;
300 }
301
302 while (p < end) {
303 codePoint = GetSingleCodePoint(&p, end);
304 if (!unicode::IsIdentifierPart(codePoint)) {
305 return false;
306 }
307 }
308
309 return true;
310 }
311
IsKeyword(TaggedParserAtomIndex atom)312 bool IsKeyword(TaggedParserAtomIndex atom) {
313 if (const ReservedWordInfo* rw = FindReservedWord(atom)) {
314 return TokenKindIsKeyword(rw->tokentype);
315 }
316
317 return false;
318 }
319
ReservedWordTokenKind(TaggedParserAtomIndex name)320 TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
321 if (const ReservedWordInfo* rw = FindReservedWord(name)) {
322 return rw->tokentype;
323 }
324
325 return TokenKind::Limit;
326 }
327
ReservedWordToCharZ(TaggedParserAtomIndex name)328 const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
329 if (const ReservedWordInfo* rw = FindReservedWord(name)) {
330 return ReservedWordToCharZ(rw->tokentype);
331 }
332
333 return nullptr;
334 }
335
ReservedWordToCharZ(TokenKind tt)336 const char* ReservedWordToCharZ(TokenKind tt) {
337 MOZ_ASSERT(tt != TokenKind::Name);
338 switch (tt) {
339 #define EMIT_CASE(word, name, type) \
340 case type: \
341 return js_##word##_str;
342 FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
343 #undef EMIT_CASE
344 default:
345 MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
346 }
347 return nullptr;
348 }
349
reservedWordToPropertyName(TokenKind tt) const350 TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(
351 TokenKind tt) const {
352 MOZ_ASSERT(tt != TokenKind::Name);
353 switch (tt) {
354 #define EMIT_CASE(word, name, type) \
355 case type: \
356 return TaggedParserAtomIndex::WellKnown::name();
357 FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
358 #undef EMIT_CASE
359 default:
360 MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
361 }
362 return TaggedParserAtomIndex::null();
363 }
364
SourceCoords(JSContext * cx,uint32_t initialLineNumber,uint32_t initialOffset)365 SourceCoords::SourceCoords(JSContext* cx, uint32_t initialLineNumber,
366 uint32_t initialOffset)
367 : lineStartOffsets_(cx), initialLineNum_(initialLineNumber), lastIndex_(0) {
368 // This is actually necessary! Removing it causes compile errors on
369 // GCC and clang. You could try declaring this:
370 //
371 // const uint32_t SourceCoords::MAX_PTR;
372 //
373 // which fixes the GCC/clang error, but causes bustage on Windows. Sigh.
374 //
375 uint32_t maxPtr = MAX_PTR;
376
377 // The first line begins at buffer offset |initialOffset|. MAX_PTR is the
378 // sentinel. The appends cannot fail because |lineStartOffsets_| has
379 // statically-allocated elements.
380 MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
381 MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
382 lineStartOffsets_.infallibleAppend(initialOffset);
383 lineStartOffsets_.infallibleAppend(maxPtr);
384 }
385
add(uint32_t lineNum,uint32_t lineStartOffset)386 MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
387 uint32_t lineStartOffset) {
388 uint32_t index = indexFromLineNumber(lineNum);
389 uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
390
391 MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
392 MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);
393
394 if (index == sentinelIndex) {
395 // We haven't seen this newline before. Update lineStartOffsets_
396 // only if lineStartOffsets_.append succeeds, to keep sentinel.
397 // Otherwise return false to tell TokenStream about OOM.
398 uint32_t maxPtr = MAX_PTR;
399 if (!lineStartOffsets_.append(maxPtr)) {
400 static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
401 TempAllocPolicy&>,
402 "this function's caller depends on it reporting an "
403 "error on failure, as TempAllocPolicy ensures");
404 return false;
405 }
406
407 lineStartOffsets_[index] = lineStartOffset;
408 } else {
409 // We have seen this newline before (and ungot it). Do nothing (other
410 // than checking it hasn't mysteriously changed).
411 // This path can be executed after hitting OOM, so check index.
412 MOZ_ASSERT_IF(index < sentinelIndex,
413 lineStartOffsets_[index] == lineStartOffset);
414 }
415 return true;
416 }
417
fill(const SourceCoords & other)418 MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
419 MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
420 MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
421 MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);
422
423 if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) {
424 return true;
425 }
426
427 uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
428 lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];
429
430 for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
431 i++) {
432 if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
433 return false;
434 }
435 }
436 return true;
437 }
438
439 MOZ_ALWAYS_INLINE uint32_t
indexFromOffset(uint32_t offset) const440 SourceCoords::indexFromOffset(uint32_t offset) const {
441 uint32_t iMin, iMax, iMid;
442
443 if (lineStartOffsets_[lastIndex_] <= offset) {
444 // If we reach here, offset is on a line the same as or higher than
445 // last time. Check first for the +0, +1, +2 cases, because they
446 // typically cover 85--98% of cases.
447 if (offset < lineStartOffsets_[lastIndex_ + 1]) {
448 return lastIndex_; // index is same as last time
449 }
450
451 // If we reach here, there must be at least one more entry (plus the
452 // sentinel). Try it.
453 lastIndex_++;
454 if (offset < lineStartOffsets_[lastIndex_ + 1]) {
455 return lastIndex_; // index is one higher than last time
456 }
457
458 // The same logic applies here.
459 lastIndex_++;
460 if (offset < lineStartOffsets_[lastIndex_ + 1]) {
461 return lastIndex_; // index is two higher than last time
462 }
463
464 // No luck. Oh well, we have a better-than-default starting point for
465 // the binary search.
466 iMin = lastIndex_ + 1;
467 MOZ_ASSERT(iMin <
468 lineStartOffsets_.length() - 1); // -1 due to the sentinel
469
470 } else {
471 iMin = 0;
472 }
473
474 // This is a binary search with deferred detection of equality, which was
475 // marginally faster in this case than a standard binary search.
476 // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
477 // want one before that.
478 iMax = lineStartOffsets_.length() - 2;
479 while (iMax > iMin) {
480 iMid = iMin + (iMax - iMin) / 2;
481 if (offset >= lineStartOffsets_[iMid + 1]) {
482 iMin = iMid + 1; // offset is above lineStartOffsets_[iMid]
483 } else {
484 iMax = iMid; // offset is below or within lineStartOffsets_[iMid]
485 }
486 }
487
488 MOZ_ASSERT(iMax == iMin);
489 MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
490 MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
491
492 lastIndex_ = iMin;
493 return iMin;
494 }
495
lineToken(uint32_t offset) const496 SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
497 return LineToken(indexFromOffset(offset), offset);
498 }
499
TokenStreamAnyChars(JSContext * cx,const ReadOnlyCompileOptions & options,StrictModeGetter * smg)500 TokenStreamAnyChars::TokenStreamAnyChars(JSContext* cx,
501 const ReadOnlyCompileOptions& options,
502 StrictModeGetter* smg)
503 : cx(cx),
504 options_(options),
505 strictModeGetter_(smg),
506 filename_(options.filename()),
507 longLineColumnInfo_(cx),
508 srcCoords(cx, options.lineno, options.scriptSourceOffset),
509 lineno(options.lineno),
510 mutedErrors(options.mutedErrors()) {
511 // |isExprEnding| was initially zeroed: overwrite the true entries here.
512 isExprEnding[size_t(TokenKind::Comma)] = true;
513 isExprEnding[size_t(TokenKind::Semi)] = true;
514 isExprEnding[size_t(TokenKind::Colon)] = true;
515 isExprEnding[size_t(TokenKind::RightParen)] = true;
516 isExprEnding[size_t(TokenKind::RightBracket)] = true;
517 isExprEnding[size_t(TokenKind::RightCurly)] = true;
518 }
519
520 template <typename Unit>
TokenStreamCharsBase(JSContext * cx,ParserAtomsTable * pasrerAtoms,const Unit * units,size_t length,size_t startOffset)521 TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
522 ParserAtomsTable* pasrerAtoms,
523 const Unit* units,
524 size_t length,
525 size_t startOffset)
526 : TokenStreamCharsShared(cx, pasrerAtoms),
527 sourceUnits(units, length, startOffset) {}
528
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const char16_t * cur,const char16_t * end)529 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
530 const char16_t* cur,
531 const char16_t* end) {
532 MOZ_ASSERT(charBuffer.length() == 0);
533
534 while (cur < end) {
535 char16_t ch = *cur++;
536 if (ch == '\r') {
537 ch = '\n';
538 if (cur < end && *cur == '\n') {
539 cur++;
540 }
541 }
542
543 if (!charBuffer.append(ch)) {
544 return false;
545 }
546 }
547
548 MOZ_ASSERT(cur == end);
549 return true;
550 }
551
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const Utf8Unit * cur,const Utf8Unit * end)552 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
553 const Utf8Unit* cur,
554 const Utf8Unit* end) {
555 MOZ_ASSERT(charBuffer.length() == 0);
556
557 while (cur < end) {
558 Utf8Unit unit = *cur++;
559 if (MOZ_LIKELY(IsAscii(unit))) {
560 char16_t ch = unit.toUint8();
561 if (ch == '\r') {
562 ch = '\n';
563 if (cur < end && *cur == Utf8Unit('\n')) {
564 cur++;
565 }
566 }
567
568 if (!charBuffer.append(ch)) {
569 return false;
570 }
571
572 continue;
573 }
574
575 Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end);
576 MOZ_ASSERT(ch.isSome(),
577 "provided source text should already have been validated");
578
579 if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
580 return false;
581 }
582 }
583
584 MOZ_ASSERT(cur == end);
585 return true;
586 }
587
588 template <typename Unit, class AnyCharsAccess>
TokenStreamSpecific(JSContext * cx,ParserAtomsTable * pasrerAtoms,const ReadOnlyCompileOptions & options,const Unit * units,size_t length)589 TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
590 JSContext* cx, ParserAtomsTable* pasrerAtoms,
591 const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
592 : TokenStreamChars<Unit, AnyCharsAccess>(cx, pasrerAtoms, units, length,
593 options.scriptSourceOffset) {}
594
checkOptions()595 bool TokenStreamAnyChars::checkOptions() {
596 // Constrain starting columns to where they will saturate.
597 if (options().column > ColumnLimit) {
598 reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
599 return false;
600 }
601
602 return true;
603 }
604
reportErrorNoOffset(unsigned errorNumber,...)605 void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) {
606 va_list args;
607 va_start(args, errorNumber);
608
609 reportErrorNoOffsetVA(errorNumber, &args);
610
611 va_end(args);
612 }
613
reportErrorNoOffsetVA(unsigned errorNumber,va_list * args)614 void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
615 va_list* args) {
616 ErrorMetadata metadata;
617 computeErrorMetadataNoOffset(&metadata);
618
619 ReportCompileErrorLatin1(cx, std::move(metadata), nullptr, errorNumber, args);
620 }
621
622 [[nodiscard]] MOZ_ALWAYS_INLINE bool
internalUpdateLineInfoForEOL(uint32_t lineStartOffset)623 TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
624 prevLinebase = linebase;
625 linebase = lineStartOffset;
626 lineno++;
627
628 // On overflow, report error.
629 if (MOZ_UNLIKELY(!lineno)) {
630 reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
631 return false;
632 }
633
634 return srcCoords.add(lineno, linebase);
635 }
636
637 #ifdef DEBUG
638
639 template <>
assertNextCodePoint(const PeekedCodePoint<char16_t> & peeked)640 inline void SourceUnits<char16_t>::assertNextCodePoint(
641 const PeekedCodePoint<char16_t>& peeked) {
642 char32_t c = peeked.codePoint();
643 if (c < unicode::NonBMPMin) {
644 MOZ_ASSERT(peeked.lengthInUnits() == 1);
645 MOZ_ASSERT(ptr[0] == c);
646 } else {
647 MOZ_ASSERT(peeked.lengthInUnits() == 2);
648 char16_t lead, trail;
649 unicode::UTF16Encode(c, &lead, &trail);
650 MOZ_ASSERT(ptr[0] == lead);
651 MOZ_ASSERT(ptr[1] == trail);
652 }
653 }
654
655 template <>
assertNextCodePoint(const PeekedCodePoint<Utf8Unit> & peeked)656 inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
657 const PeekedCodePoint<Utf8Unit>& peeked) {
658 char32_t c = peeked.codePoint();
659
660 // This is all roughly indulgence of paranoia only for assertions, so the
661 // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
662 uint8_t expectedUnits[4] = {};
663 if (c < 0x80) {
664 expectedUnits[0] = AssertedCast<uint8_t>(c);
665 } else if (c < 0x800) {
666 expectedUnits[0] = 0b1100'0000 | (c >> 6);
667 expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
668 } else if (c < 0x10000) {
669 expectedUnits[0] = 0b1110'0000 | (c >> 12);
670 expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
671 expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
672 } else {
673 expectedUnits[0] = 0b1111'0000 | (c >> 18);
674 expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
675 expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
676 expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
677 }
678
679 MOZ_ASSERT(peeked.lengthInUnits() <= 4);
680 for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
681 MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
682 }
683 }
684
685 #endif // DEBUG
686
RetractPointerToCodePointBoundary(const Utf8Unit ** ptr,const Utf8Unit * limit)687 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
688 const Utf8Unit** ptr, const Utf8Unit* limit) {
689 MOZ_ASSERT(*ptr <= limit);
690
691 // |limit| is a code point boundary.
692 if (MOZ_UNLIKELY(*ptr == limit)) {
693 return;
694 }
695
696 // Otherwise rewind past trailing units to the start of the code point.
697 #ifdef DEBUG
698 size_t retracted = 0;
699 #endif
700 while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
701 --*ptr;
702 #ifdef DEBUG
703 retracted++;
704 #endif
705 }
706
707 MOZ_ASSERT(retracted < 4,
708 "the longest UTF-8 code point is four units, so this should never "
709 "retract more than three units");
710 }
711
RetractPointerToCodePointBoundary(const char16_t ** ptr,const char16_t * limit)712 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
713 const char16_t** ptr, const char16_t* limit) {
714 MOZ_ASSERT(*ptr <= limit);
715
716 // |limit| is a code point boundary.
717 if (MOZ_UNLIKELY(*ptr == limit)) {
718 return;
719 }
720
721 // Otherwise the pointer must be retracted by one iff it splits a two-unit
722 // code point.
723 if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) {
724 // Outside test suites testing garbage WTF-16, it's basically guaranteed
725 // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair.
726 if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
727 --*ptr;
728 }
729 }
730 }
731
732 template <typename Unit>
computePartialColumn(const LineToken lineToken,const uint32_t offset,const SourceUnits<Unit> & sourceUnits) const733 uint32_t TokenStreamAnyChars::computePartialColumn(
734 const LineToken lineToken, const uint32_t offset,
735 const SourceUnits<Unit>& sourceUnits) const {
736 lineToken.assertConsistentOffset(offset);
737
738 const uint32_t line = lineNumber(lineToken);
739 const uint32_t start = srcCoords.lineStart(lineToken);
740
741 // Reset the previous offset/column cache for this line, if the previous
742 // lookup wasn't on this line.
743 if (line != lineOfLastColumnComputation_) {
744 lineOfLastColumnComputation_ = line;
745 lastChunkVectorForLine_ = nullptr;
746 lastOffsetOfComputedColumn_ = start;
747 lastComputedColumn_ = 0;
748 }
749
750 // Compute and return the final column number from a partial offset/column,
751 // using the last-cached offset/column if they're more optimal.
752 auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
753 uint32_t partialCols,
754 UnitsType unitsType) {
755 MOZ_ASSERT(partialOffset <= offset);
756
757 // If the last lookup on this line was closer to |offset|, use it.
758 if (partialOffset < this->lastOffsetOfComputedColumn_ &&
759 this->lastOffsetOfComputedColumn_ <= offset) {
760 partialOffset = this->lastOffsetOfComputedColumn_;
761 partialCols = this->lastComputedColumn_;
762 }
763
764 const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
765 const Unit* end = sourceUnits.codeUnitPtrAt(offset);
766
767 size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
768 partialOffset += offsetDelta;
769
770 if (unitsType == UnitsType::GuaranteedSingleUnit) {
771 MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
772 "guaranteed-single-units also guarantee pointer distance "
773 "equals code point count");
774 partialCols += offsetDelta;
775 } else {
776 partialCols +=
777 AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
778 }
779
780 this->lastOffsetOfComputedColumn_ = partialOffset;
781 this->lastComputedColumn_ = partialCols;
782 return partialCols;
783 };
784
785 const uint32_t offsetInLine = offset - start;
786
787 // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
788 // column has offset less than this value. The most common (non-minified)
789 // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
790 // the next power of two for efficient division/multiplication below.
791 constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
792
793 // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
794 const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
795 if (chunkIndex == 0) {
796 // We don't know from an |offset| in the zeroth chunk that this line is even
797 // long. First-chunk info is mostly useless, anyway -- we have |start|
798 // already. So if we have *easy* access to that zeroth chunk, use it --
799 // otherwise just count pessimally. (This will still benefit from caching
800 // the last column/offset for computations for successive offsets, so it's
801 // not *always* worst-case.)
802 UnitsType unitsType;
803 if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
804 MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
805 unitsType = (*lastChunkVectorForLine_)[0].unitsType();
806 } else {
807 unitsType = UnitsType::PossiblyMultiUnit;
808 }
809
810 return ColumnFromPartial(start, 0, unitsType);
811 }
812
813 // If this line has no chunk vector yet, insert one in the hash map. (The
814 // required index is allocated and filled further down.)
815 if (!lastChunkVectorForLine_) {
816 auto ptr = longLineColumnInfo_.lookupForAdd(line);
817 if (!ptr) {
818 // This could rehash and invalidate a cached vector pointer, but the outer
819 // condition means we don't have a cached pointer.
820 if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
821 // In case of OOM, just count columns from the start of the line.
822 cx->recoverFromOutOfMemory();
823 return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
824 }
825 }
826
827 // Note that adding elements to this vector won't invalidate this pointer.
828 lastChunkVectorForLine_ = &ptr->value();
829 }
830
831 const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
832
833 auto RetractedOffsetOfChunk = [
834 #ifdef DEBUG
835 this,
836 #endif
837 start, limit,
838 &sourceUnits](uint32_t index) {
839 MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
840
841 uint32_t naiveOffset = start + index * ColumnChunkLength;
842 const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
843
844 const Unit* actualPtr = naivePtr;
845 RetractPointerToCodePointBoundary(&actualPtr, limit);
846
847 #ifdef DEBUG
848 if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
849 UnitsType::GuaranteedSingleUnit) {
850 MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
851 }
852 #endif
853
854 return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
855 };
856
857 uint32_t partialOffset;
858 uint32_t partialColumn;
859 UnitsType unitsType;
860
861 auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
862 if (chunkIndex < entriesLen) {
863 // We've computed the chunk |offset| resides in. Compute the column number
864 // from the chunk.
865 partialOffset = RetractedOffsetOfChunk(chunkIndex);
866 partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
867
868 // This is exact if |chunkIndex| isn't the last chunk.
869 unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
870
871 // Otherwise the last chunk is pessimistically assumed to contain multi-unit
872 // code points because we haven't fully examined its contents yet -- they
873 // may not have been tokenized yet, they could contain encoding errors, or
874 // they might not even exist.
875 MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
876 (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
877 UnitsType::PossiblyMultiUnit);
878 } else {
879 // Extend the vector from its last entry or the start of the line. (This is
880 // also a suitable partial start point if we must recover from OOM.)
881 if (entriesLen > 0) {
882 partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
883 partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
884 } else {
885 partialOffset = start;
886 partialColumn = 0;
887 }
888
889 if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
890 // As earlier, just start from the greatest offset/column in case of OOM.
891 cx->recoverFromOutOfMemory();
892 return ColumnFromPartial(partialOffset, partialColumn,
893 UnitsType::PossiblyMultiUnit);
894 }
895
896 // OOM is no longer possible now. \o/
897
898 // The vector always begins with the column of the line start, i.e. zero,
899 // with chunk units pessimally assumed not single-unit.
900 if (entriesLen == 0) {
901 lastChunkVectorForLine_->infallibleAppend(
902 ChunkInfo(0, UnitsType::PossiblyMultiUnit));
903 entriesLen++;
904 }
905
906 do {
907 const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
908 const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
909 start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
910
911 MOZ_ASSERT(begin < chunkLimit);
912 MOZ_ASSERT(chunkLimit <= limit);
913
914 static_assert(
915 ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
916 "any retraction below is assumed to never underflow to the "
917 "preceding chunk, even for the longest code point");
918
919 // Prior tokenizing ensured that [begin, limit) is validly encoded, and
920 // |begin < chunkLimit|, so any retraction here can't underflow.
921 RetractPointerToCodePointBoundary(&chunkLimit, limit);
922
923 MOZ_ASSERT(begin < chunkLimit);
924 MOZ_ASSERT(chunkLimit <= limit);
925
926 size_t numUnits = PointerRangeSize(begin, chunkLimit);
927 size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
928
929 // If this chunk (which will become non-final at the end of the loop) is
930 // all single-unit code points, annotate the chunk accordingly.
931 if (numUnits == numCodePoints) {
932 lastChunkVectorForLine_->back().guaranteeSingleUnits();
933 }
934
935 partialOffset += numUnits;
936 partialColumn += numCodePoints;
937
938 lastChunkVectorForLine_->infallibleEmplaceBack(
939 partialColumn, UnitsType::PossiblyMultiUnit);
940 } while (entriesLen < chunkIndex + 1);
941
942 // We're at a spot in the current final chunk, and final chunks never have
943 // complete units information, so be pessimistic.
944 unitsType = UnitsType::PossiblyMultiUnit;
945 }
946
947 return ColumnFromPartial(partialOffset, partialColumn, unitsType);
948 }
949
950 template <typename Unit, class AnyCharsAccess>
computeColumn(LineToken lineToken,uint32_t offset) const951 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
952 LineToken lineToken, uint32_t offset) const {
953 lineToken.assertConsistentOffset(offset);
954
955 const TokenStreamAnyChars& anyChars = anyCharsAccess();
956
957 uint32_t column =
958 anyChars.computePartialColumn(lineToken, offset, this->sourceUnits);
959
960 if (lineToken.isFirstLine()) {
961 if (column > ColumnLimit) {
962 return ColumnLimit;
963 }
964
965 static_assert(uint32_t(ColumnLimit + ColumnLimit) > ColumnLimit,
966 "Adding ColumnLimit should not overflow");
967
968 uint32_t firstLineOffset = anyChars.options_.column;
969 column += firstLineOffset;
970 }
971
972 if (column > ColumnLimit) {
973 return ColumnLimit;
974 }
975
976 return column;
977 }
978
979 template <typename Unit, class AnyCharsAccess>
computeLineAndColumn(uint32_t offset,uint32_t * line,uint32_t * column) const980 void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
981 uint32_t offset, uint32_t* line, uint32_t* column) const {
982 const TokenStreamAnyChars& anyChars = anyCharsAccess();
983
984 auto lineToken = anyChars.lineToken(offset);
985 *line = anyChars.lineNumber(lineToken);
986 *column = computeColumn(lineToken, offset);
987 }
988
989 template <class AnyCharsAccess>
internalEncodingError(uint8_t relevantUnits,unsigned errorNumber,...)990 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
991 uint8_t relevantUnits, unsigned errorNumber, ...) {
992 va_list args;
993 va_start(args, errorNumber);
994
995 do {
996 size_t offset = this->sourceUnits.offset();
997
998 ErrorMetadata err;
999
1000 TokenStreamAnyChars& anyChars = anyCharsAccess();
1001
1002 bool canAddLineOfContext = fillExceptingContext(&err, offset);
1003 if (canAddLineOfContext) {
1004 if (!internalComputeLineOfContext(&err, offset)) {
1005 break;
1006 }
1007
1008 // As this is an encoding error, the computed window-end must be
1009 // identical to the location of the error -- any further on and the
1010 // window would contain invalid Unicode.
1011 MOZ_ASSERT_IF(err.lineOfContext != nullptr,
1012 err.lineLength == err.tokenOffset);
1013 }
1014
1015 auto notes = MakeUnique<JSErrorNotes>();
1016 if (!notes) {
1017 ReportOutOfMemory(anyChars.cx);
1018 break;
1019 }
1020
1021 // The largest encoding of a UTF-8 code point is 4 units. (Encoding an
1022 // obsolete 5- or 6-byte code point will complain only about a bad lead
1023 // code unit.)
1024 constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
1025
1026 MOZ_ASSERT(relevantUnits > 0);
1027
1028 char badUnitsStr[MaxWidth];
1029 char* ptr = badUnitsStr;
1030 while (relevantUnits > 0) {
1031 byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
1032 ptr[4] = ' ';
1033
1034 ptr += 5;
1035 relevantUnits--;
1036 }
1037
1038 ptr[-1] = '\0';
1039
1040 uint32_t line, column;
1041 computeLineAndColumn(offset, &line, &column);
1042
1043 if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), 0, line,
1044 column, GetErrorMessage, nullptr,
1045 JSMSG_BAD_CODE_UNITS, badUnitsStr)) {
1046 break;
1047 }
1048
1049 ReportCompileErrorLatin1(anyChars.cx, std::move(err), std::move(notes),
1050 errorNumber, &args);
1051 } while (false);
1052
1053 va_end(args);
1054 }
1055
1056 template <class AnyCharsAccess>
badLeadUnit(Utf8Unit lead)1057 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
1058 Utf8Unit lead) {
1059 uint8_t leadValue = lead.toUint8();
1060
1061 char leadByteStr[5];
1062 byteToTerminatedString(leadValue, leadByteStr);
1063
1064 internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
1065 }
1066
1067 template <class AnyCharsAccess>
notEnoughUnits(Utf8Unit lead,uint8_t remaining,uint8_t required)1068 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
1069 Utf8Unit lead, uint8_t remaining, uint8_t required) {
1070 uint8_t leadValue = lead.toUint8();
1071
1072 MOZ_ASSERT(required == 2 || required == 3 || required == 4);
1073 MOZ_ASSERT(remaining < 4);
1074 MOZ_ASSERT(remaining < required);
1075
1076 char leadByteStr[5];
1077 byteToTerminatedString(leadValue, leadByteStr);
1078
1079 // |toHexChar| produces the desired decimal numbers for values < 4.
1080 const char expectedStr[] = {toHexChar(required - 1), '\0'};
1081 const char actualStr[] = {toHexChar(remaining - 1), '\0'};
1082
1083 internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
1084 expectedStr, required == 2 ? "" : "s", actualStr,
1085 remaining == 2 ? " was" : "s were");
1086 }
1087
1088 template <class AnyCharsAccess>
badTrailingUnit(uint8_t unitsObserved)1089 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
1090 uint8_t unitsObserved) {
1091 Utf8Unit badUnit =
1092 this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];
1093
1094 char badByteStr[5];
1095 byteToTerminatedString(badUnit.toUint8(), badByteStr);
1096
1097 internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
1098 badByteStr);
1099 }
1100
1101 template <class AnyCharsAccess>
1102 MOZ_COLD void
badStructurallyValidCodePoint(uint32_t codePoint,uint8_t codePointLength,const char * reason)1103 TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
1104 uint32_t codePoint, uint8_t codePointLength, const char* reason) {
1105 // Construct a string like "0x203D" (including null terminator) to include
1106 // in the error message. Write the string end-to-start from end to start
1107 // of an adequately sized |char| array, shifting least significant nibbles
1108 // off the number and writing the corresponding hex digits until done, then
1109 // prefixing with "0x". |codePointStr| points at the incrementally
1110 // computed string, within |codePointCharsArray|'s bounds.
1111
1112 // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
1113 // bits in a four-byte UTF-8 code unit sequence.
1114 constexpr size_t MaxHexSize = sizeof(
1115 "0x1F"
1116 "FFFF"); // including '\0'
1117 char codePointCharsArray[MaxHexSize];
1118
1119 char* codePointStr = std::end(codePointCharsArray);
1120 *--codePointStr = '\0';
1121
1122 // Note that by do-while looping here rather than while-looping, this
1123 // writes a '0' when |codePoint == 0|.
1124 do {
1125 MOZ_ASSERT(codePointCharsArray < codePointStr);
1126 *--codePointStr = toHexChar(codePoint & 0xF);
1127 codePoint >>= 4;
1128 } while (codePoint);
1129
1130 MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
1131 *--codePointStr = 'x';
1132 *--codePointStr = '0';
1133
1134 internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
1135 codePointStr, reason);
1136 }
1137
1138 template <class AnyCharsAccess>
1139 [[nodiscard]] bool
getNonAsciiCodePointDontNormalize(Utf8Unit lead,char32_t * codePoint)1140 TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
1141 Utf8Unit lead, char32_t* codePoint) {
1142 auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1143
1144 auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
1145 this->notEnoughUnits(lead, remaining, required);
1146 };
1147
1148 auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
1149 this->badTrailingUnit(unitsObserved);
1150 };
1151
1152 auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
1153 this->badCodePoint(badCodePoint, unitsObserved);
1154 };
1155
1156 auto onNotShortestForm = [this](char32_t badCodePoint,
1157 uint8_t unitsObserved) {
1158 this->notShortestForm(badCodePoint, unitsObserved);
1159 };
1160
1161 // If a valid code point is decoded, this function call consumes its code
1162 // units. If not, it ungets the lead code unit and invokes the right error
1163 // handler, so on failure we must immediately return false.
1164 SourceUnitsIterator iter(this->sourceUnits);
1165 Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
1166 lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1167 onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1168 if (maybeCodePoint.isNothing()) {
1169 return false;
1170 }
1171
1172 *codePoint = maybeCodePoint.value();
1173 return true;
1174 }
1175
1176 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t lead,int32_t * codePoint)1177 bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
1178 int32_t lead, int32_t* codePoint) {
1179 MOZ_ASSERT(lead != EOF);
1180 MOZ_ASSERT(!isAsciiCodePoint(lead),
1181 "ASCII code unit/point must be handled separately");
1182 MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1183 "getNonAsciiCodePoint called incorrectly");
1184
1185 // The code point is usually |lead|: overwrite later if needed.
1186 *codePoint = lead;
1187
1188 // ECMAScript specifically requires that unpaired UTF-16 surrogates be
1189 // treated as the corresponding code point and not as an error. See
1190 // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
1191 // Thus this function does not consider any sequence of 16-bit numbers to
1192 // be intrinsically in error.
1193
1194 // Dispense with single-unit code points and lone trailing surrogates.
1195 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
1196 if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
1197 lead == unicode::PARA_SEPARATOR)) {
1198 if (!updateLineInfoForEOL()) {
1199 #ifdef DEBUG
1200 *codePoint = EOF; // sentinel value to hopefully cause errors
1201 #endif
1202 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1203 return false;
1204 }
1205
1206 *codePoint = '\n';
1207 } else {
1208 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1209 }
1210
1211 return true;
1212 }
1213
1214 // Also handle a lead surrogate not paired with a trailing surrogate.
1215 if (MOZ_UNLIKELY(
1216 this->sourceUnits.atEnd() ||
1217 !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1218 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1219 return true;
1220 }
1221
1222 // Otherwise we have a multi-unit code point.
1223 *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1224 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1225 return true;
1226 }
1227
1228 template <typename Unit, class AnyCharsAccess>
getCodePoint(int32_t * cp)1229 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getCodePoint(int32_t* cp) {
1230 int32_t unit = getCodeUnit();
1231 if (unit == EOF) {
1232 MOZ_ASSERT(anyCharsAccess().flags.isEOF,
1233 "flags.isEOF should have been set by getCodeUnit()");
1234 *cp = EOF;
1235 return true;
1236 }
1237
1238 if (isAsciiCodePoint(unit)) {
1239 return getFullAsciiCodePoint(unit, cp);
1240 }
1241
1242 return getNonAsciiCodePoint(unit, cp);
1243 }
1244
1245 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t unit,int32_t * codePoint)1246 bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
1247 int32_t unit, int32_t* codePoint) {
1248 MOZ_ASSERT(unit != EOF);
1249 MOZ_ASSERT(!isAsciiCodePoint(unit),
1250 "ASCII code unit/point must be handled separately");
1251
1252 Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
1253 MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1254 "getNonAsciiCodePoint called incorrectly");
1255
1256 auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1257
1258 auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
1259 uint_fast8_t required) {
1260 this->notEnoughUnits(lead, remaining, required);
1261 };
1262
1263 auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
1264 this->badTrailingUnit(unitsObserved);
1265 };
1266
1267 auto onBadCodePoint = [this](char32_t badCodePoint,
1268 uint_fast8_t unitsObserved) {
1269 this->badCodePoint(badCodePoint, unitsObserved);
1270 };
1271
1272 auto onNotShortestForm = [this](char32_t badCodePoint,
1273 uint_fast8_t unitsObserved) {
1274 this->notShortestForm(badCodePoint, unitsObserved);
1275 };
1276
1277 // This consumes the full, valid code point or ungets |lead| and calls the
1278 // appropriate error functor on failure.
1279 SourceUnitsIterator iter(this->sourceUnits);
1280 Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
1281 lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1282 onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1283 if (maybeCodePoint.isNothing()) {
1284 return false;
1285 }
1286
1287 char32_t cp = maybeCodePoint.value();
1288 if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
1289 cp == unicode::PARA_SEPARATOR)) {
1290 if (!updateLineInfoForEOL()) {
1291 #ifdef DEBUG
1292 *codePoint = EOF; // sentinel value to hopefully cause errors
1293 #endif
1294 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1295 return false;
1296 }
1297
1298 *codePoint = '\n';
1299 } else {
1300 MOZ_ASSERT(!IsLineTerminator(cp));
1301 *codePoint = AssertedCast<int32_t>(cp);
1302 }
1303
1304 return true;
1305 }
1306
1307 template <>
findWindowStart(size_t offset) const1308 size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
1309 // This is JS's understanding of UTF-16 that allows lone surrogates, so
1310 // we have to exclude lone surrogates from [windowStart, offset) ourselves.
1311
1312 const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1313
1314 const char16_t* const initial = codeUnitPtrAt(offset);
1315 const char16_t* p = initial;
1316
1317 auto HalfWindowSize = [&p, &initial]() {
1318 return PointerRangeSize(p, initial);
1319 };
1320
1321 while (true) {
1322 MOZ_ASSERT(earliestPossibleStart <= p);
1323 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1324 if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1325 break;
1326 }
1327
1328 char16_t c = p[-1];
1329
1330 // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1331 // string and template literals. These code points do affect line and
1332 // column coordinates, even as they encode their literal values.
1333 if (IsLineTerminator(c)) {
1334 break;
1335 }
1336
1337 // Don't allow invalid UTF-16 in pre-context. (Current users don't
1338 // require this, and this behavior isn't currently imposed on
1339 // pre-context, but these facts might change someday.)
1340
1341 if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
1342 break;
1343 }
1344
1345 // Optimistically include the code unit, reverting below if needed.
1346 p--;
1347
1348 // If it's not a surrogate at all, keep going.
1349 if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
1350 continue;
1351 }
1352
1353 // Stop if we don't have a usable surrogate pair.
1354 if (HalfWindowSize() >= WindowRadius ||
1355 p <= earliestPossibleStart || // trail surrogate at low end
1356 !unicode::IsLeadSurrogate(p[-1])) // no paired lead surrogate
1357 {
1358 p++;
1359 break;
1360 }
1361
1362 p--;
1363 }
1364
1365 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1366 return offset - HalfWindowSize();
1367 }
1368
1369 template <>
findWindowStart(size_t offset) const1370 size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
1371 // |offset| must be the location of the error or somewhere before it, so we
1372 // know preceding data is valid UTF-8.
1373
1374 const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);
1375
1376 const Utf8Unit* const initial = codeUnitPtrAt(offset);
1377 const Utf8Unit* p = initial;
1378
1379 auto HalfWindowSize = [&p, &initial]() {
1380 return PointerRangeSize(p, initial);
1381 };
1382
1383 while (true) {
1384 MOZ_ASSERT(earliestPossibleStart <= p);
1385 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1386 if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
1387 break;
1388 }
1389
1390 // Peek backward for a line break, and only decrement if there is none.
1391 uint8_t prev = p[-1].toUint8();
1392
1393 // First check for the ASCII LineTerminators.
1394 if (prev == '\r' || prev == '\n') {
1395 break;
1396 }
1397
1398 // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
1399 // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9). If there
1400 // aren't three code units available, some comparison here will fail
1401 // before we'd underflow.
1402 if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
1403 p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
1404 break;
1405 }
1406
1407 // Rewind over the non-LineTerminator. This can't underflow
1408 // |earliestPossibleStart| because it begins a code point.
1409 while (IsTrailingUnit(*--p)) {
1410 continue;
1411 }
1412
1413 MOZ_ASSERT(earliestPossibleStart <= p);
1414
1415 // But if we underflowed |WindowRadius|, adjust forward and stop.
1416 if (HalfWindowSize() > WindowRadius) {
1417 static_assert(WindowRadius > 3,
1418 "skipping over non-lead code units below must not "
1419 "advance past |offset|");
1420
1421 while (IsTrailingUnit(*++p)) {
1422 continue;
1423 }
1424
1425 MOZ_ASSERT(HalfWindowSize() < WindowRadius);
1426 break;
1427 }
1428 }
1429
1430 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1431 return offset - HalfWindowSize();
1432 }
1433
1434 template <>
findWindowEnd(size_t offset) const1435 size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
1436 const char16_t* const initial = codeUnitPtrAt(offset);
1437 const char16_t* p = initial;
1438
1439 auto HalfWindowSize = [&initial, &p]() {
1440 return PointerRangeSize(initial, p);
1441 };
1442
1443 while (true) {
1444 MOZ_ASSERT(p <= limit_);
1445 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1446 if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1447 break;
1448 }
1449
1450 char16_t c = *p;
1451
1452 // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
1453 // string and template literals. These code points do affect line and
1454 // column coordinates, even as they encode their literal values.
1455 if (IsLineTerminator(c)) {
1456 break;
1457 }
1458
1459 // Don't allow invalid UTF-16 in post-context. (Current users don't
1460 // require this, and this behavior isn't currently imposed on
1461 // pre-context, but these facts might change someday.)
1462
1463 if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) {
1464 break;
1465 }
1466
1467 // Optimistically consume the code unit, ungetting it below if needed.
1468 p++;
1469
1470 // If it's not a surrogate at all, keep going.
1471 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) {
1472 continue;
1473 }
1474
1475 // Retract if the lead surrogate would stand alone at the end of the
1476 // window.
1477 if (HalfWindowSize() >= WindowRadius || // split pair
1478 p >= limit_ || // half-pair at end of source
1479 !unicode::IsTrailSurrogate(*p)) // no paired trail surrogate
1480 {
1481 p--;
1482 break;
1483 }
1484
1485 p++;
1486 }
1487
1488 return offset + HalfWindowSize();
1489 }
1490
1491 template <>
findWindowEnd(size_t offset) const1492 size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
1493 const Utf8Unit* const initial = codeUnitPtrAt(offset);
1494 const Utf8Unit* p = initial;
1495
1496 auto HalfWindowSize = [&initial, &p]() {
1497 return PointerRangeSize(initial, p);
1498 };
1499
1500 while (true) {
1501 MOZ_ASSERT(p <= limit_);
1502 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1503 if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
1504 break;
1505 }
1506
1507 // A non-encoding error might be followed by an encoding error within
1508 // |maxEnd|, so we must validate as we go to not include invalid UTF-8
1509 // in the computed window. What joy!
1510
1511 Utf8Unit lead = *p;
1512 if (mozilla::IsAscii(lead)) {
1513 if (IsSingleUnitLineTerminator(lead)) {
1514 break;
1515 }
1516
1517 p++;
1518 continue;
1519 }
1520
1521 PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_);
1522 if (peeked.isNone()) {
1523 break; // encoding error
1524 }
1525
1526 char32_t c = peeked.codePoint();
1527 if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
1528 c == unicode::PARA_SEPARATOR)) {
1529 break;
1530 }
1531
1532 MOZ_ASSERT(!IsLineTerminator(c));
1533
1534 uint8_t len = peeked.lengthInUnits();
1535 if (HalfWindowSize() + len > WindowRadius) {
1536 break;
1537 }
1538
1539 p += len;
1540 }
1541
1542 MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
1543 return offset + HalfWindowSize();
1544 }
1545
1546 template <typename Unit, class AnyCharsAccess>
advance(size_t position)1547 bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
1548 const Unit* end = this->sourceUnits.codeUnitPtrAt(position);
1549 while (this->sourceUnits.addressOfNextCodeUnit() < end) {
1550 int32_t c;
1551 if (!getCodePoint(&c)) {
1552 return false;
1553 }
1554 }
1555
1556 TokenStreamAnyChars& anyChars = anyCharsAccess();
1557 Token* cur = const_cast<Token*>(&anyChars.currentToken());
1558 cur->pos.begin = this->sourceUnits.offset();
1559 cur->pos.end = cur->pos.begin;
1560 MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type));
1561 anyChars.lookahead = 0;
1562 return true;
1563 }
1564
1565 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos)1566 void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
1567 TokenStreamAnyChars& anyChars = anyCharsAccess();
1568
1569 this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
1570 /* allowPoisoned = */ true);
1571 anyChars.flags = pos.flags;
1572 anyChars.lineno = pos.lineno;
1573 anyChars.linebase = pos.linebase;
1574 anyChars.prevLinebase = pos.prevLinebase;
1575 anyChars.lookahead = pos.lookahead;
1576
1577 anyChars.tokens[anyChars.cursor()] = pos.currentToken;
1578 for (unsigned i = 0; i < anyChars.lookahead; i++) {
1579 anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i];
1580 }
1581 }
1582
1583 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos,const TokenStreamAnyChars & other)1584 bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
1585 const Position& pos, const TokenStreamAnyChars& other) {
1586 if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) {
1587 return false;
1588 }
1589
1590 seekTo(pos);
1591 return true;
1592 }
1593
computeErrorMetadataNoOffset(ErrorMetadata * err)1594 void TokenStreamAnyChars::computeErrorMetadataNoOffset(ErrorMetadata* err) {
1595 err->isMuted = mutedErrors;
1596 err->filename = filename_;
1597 err->lineNumber = 0;
1598 err->columnNumber = 0;
1599
1600 MOZ_ASSERT(err->lineOfContext == nullptr);
1601 }
1602
fillExceptingContext(ErrorMetadata * err,uint32_t offset)1603 bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
1604 uint32_t offset) {
1605 err->isMuted = mutedErrors;
1606
1607 // If this TokenStreamAnyChars doesn't have location information, try to
1608 // get it from the caller.
1609 if (!filename_ && !cx->isHelperThreadContext()) {
1610 NonBuiltinFrameIter iter(cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
1611 cx->realm()->principals());
1612 if (!iter.done() && iter.filename()) {
1613 err->filename = iter.filename();
1614 err->lineNumber = iter.computeLine(&err->columnNumber);
1615 return false;
1616 }
1617 }
1618
1619 // Otherwise use this TokenStreamAnyChars's location information.
1620 err->filename = filename_;
1621 return true;
1622 }
1623
1624 template <typename Unit, class AnyCharsAccess>
hasTokenizationStarted() const1625 bool TokenStreamSpecific<Unit, AnyCharsAccess>::hasTokenizationStarted() const {
1626 const TokenStreamAnyChars& anyChars = anyCharsAccess();
1627 return anyChars.isCurrentTokenType(TokenKind::Eof) && !anyChars.isEOF();
1628 }
1629
1630 template <>
computeWindowOffsetAndLength(const char16_t * encodedWindow,size_t encodedTokenOffset,size_t * utf16TokenOffset,size_t encodedWindowLength,size_t * utf16WindowLength)1631 inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
1632 const char16_t* encodedWindow, size_t encodedTokenOffset,
1633 size_t* utf16TokenOffset, size_t encodedWindowLength,
1634 size_t* utf16WindowLength) {
1635 MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
1636 }
1637
1638 template <>
computeWindowOffsetAndLength(const Utf8Unit * encodedWindow,size_t encodedTokenOffset,size_t * utf16TokenOffset,size_t encodedWindowLength,size_t * utf16WindowLength)1639 inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
1640 const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
1641 size_t* utf16TokenOffset, size_t encodedWindowLength,
1642 size_t* utf16WindowLength) {
1643 MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1644 "token offset must be within the window, and the two lambda "
1645 "calls below presume this ordering of values");
1646
1647 const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength;
1648
1649 size_t i = 0;
1650 auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) {
1651 while (encodedWindow < limit) {
1652 Utf8Unit lead = *encodedWindow++;
1653 if (MOZ_LIKELY(IsAscii(lead))) {
1654 // ASCII contributes a single UTF-16 code unit.
1655 i++;
1656 continue;
1657 }
1658
1659 Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
1660 MOZ_ASSERT(cp.isSome(),
1661 "computed window should only contain valid UTF-8");
1662
1663 i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
1664 }
1665
1666 return i;
1667 };
1668
1669 // Compute the token offset from |i == 0| and the initial |encodedWindow|.
1670 const Utf8Unit* token = encodedWindow + encodedTokenOffset;
1671 MOZ_ASSERT(token <= encodedWindowEnd);
1672 *utf16TokenOffset = ComputeUtf16Count(token);
1673
1674 // Compute the window length, picking up from |i| and |encodedWindow| that,
1675 // in general, were modified just above.
1676 *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
1677 }
1678
1679 template <typename Unit>
addLineOfContext(ErrorMetadata * err,uint32_t offset)1680 bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
1681 uint32_t offset) {
1682 // Rename the variable to make meaning clearer: an offset into source units
1683 // in Unit encoding.
1684 size_t encodedOffset = offset;
1685
1686 // These are also offsets into source units in Unit encoding.
1687 size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
1688 size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);
1689
1690 size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart;
1691 MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);
1692
1693 // Don't add a useless "line" of context when the window ends up empty
1694 // because of an invalid encoding at the start of a line.
1695 if (encodedWindowLength == 0) {
1696 MOZ_ASSERT(err->lineOfContext == nullptr,
1697 "ErrorMetadata::lineOfContext must be null so we don't "
1698 "have to set the lineLength/tokenOffset fields");
1699 return true;
1700 }
1701
1702 CharBuffer lineOfContext(cx);
1703
1704 const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
1705 if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
1706 lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
1707 return false;
1708 }
1709
1710 size_t utf16WindowLength = lineOfContext.length();
1711
1712 // The windowed string is null-terminated.
1713 if (!lineOfContext.append('\0')) {
1714 return false;
1715 }
1716
1717 err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
1718 if (!err->lineOfContext) {
1719 return false;
1720 }
1721
1722 size_t encodedTokenOffset = encodedOffset - encodedWindowStart;
1723
1724 MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1725 "token offset must be inside the window");
1726
1727 // The length in UTF-8 code units of a code point is always greater than or
1728 // equal to the same code point's length in UTF-16 code points. ASCII code
1729 // points are 1 unit in either encoding. Code points in [U+0080, U+10000)
1730 // are 2-3 UTF-8 code units to 1 UTF-16 code unit. And code points in
1731 // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units.
1732 //
1733 // Therefore, if encoded window length equals the length in UTF-16 (this is
1734 // always the case for Unit=char16_t), the UTF-16 offsets are exactly the
1735 // encoded offsets. Otherwise we must convert offset/length from UTF-8 to
1736 // UTF-16.
1737 if constexpr (std::is_same_v<Unit, char16_t>) {
1738 MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
1739 "UTF-16 to UTF-16 shouldn't change window length");
1740 err->tokenOffset = encodedTokenOffset;
1741 err->lineLength = encodedWindowLength;
1742 } else {
1743 static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");
1744
1745 bool simple = utf16WindowLength == encodedWindowLength;
1746 #ifdef DEBUG
1747 auto isAscii = [](Unit u) { return IsAscii(u); };
1748 MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
1749 isAscii) == simple,
1750 "equal window lengths in UTF-8 should correspond only to "
1751 "wholly-ASCII text");
1752 #endif
1753 if (simple) {
1754 err->tokenOffset = encodedTokenOffset;
1755 err->lineLength = encodedWindowLength;
1756 } else {
1757 sourceUnits.computeWindowOffsetAndLength(
1758 encodedWindow, encodedTokenOffset, &err->tokenOffset,
1759 encodedWindowLength, &err->lineLength);
1760 }
1761 }
1762
1763 return true;
1764 }
1765
1766 template <typename Unit, class AnyCharsAccess>
computeErrorMetadata(ErrorMetadata * err,const ErrorOffset & errorOffset)1767 bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
1768 ErrorMetadata* err, const ErrorOffset& errorOffset) {
1769 if (errorOffset.is<NoOffset>()) {
1770 anyCharsAccess().computeErrorMetadataNoOffset(err);
1771 return true;
1772 }
1773
1774 uint32_t offset;
1775 if (errorOffset.is<uint32_t>()) {
1776 offset = errorOffset.as<uint32_t>();
1777 } else {
1778 offset = this->sourceUnits.offset();
1779 }
1780
1781 // This function's return value isn't a success/failure indication: it
1782 // returns true if this TokenStream can be used to provide a line of
1783 // context.
1784 if (fillExceptingContext(err, offset)) {
1785 // Add a line of context from this TokenStream to help with debugging.
1786 return internalComputeLineOfContext(err, offset);
1787 }
1788
1789 // We can't fill in any more here.
1790 return true;
1791 }
1792
1793 template <typename Unit, class AnyCharsAccess>
reportIllegalCharacter(int32_t cp)1794 void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
1795 int32_t cp) {
1796 UniqueChars display = JS_smprintf("U+%04X", cp);
1797 if (!display) {
1798 ReportOutOfMemory(anyCharsAccess().cx);
1799 return;
1800 }
1801 error(JSMSG_ILLEGAL_CHARACTER, display.get());
1802 }
1803
1804 // We have encountered a '\': check for a Unicode escape sequence after it.
1805 // Return the length of the escape sequence and the encoded code point (by
1806 // value) if we found a Unicode escape sequence, and skip all code units
1807 // involed. Otherwise, return 0 and don't advance along the buffer.
1808 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscape(uint32_t * codePoint)1809 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
1810 uint32_t* codePoint) {
1811 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1812
1813 int32_t unit = getCodeUnit();
1814 if (unit != 'u') {
1815 // NOTE: |unit| may be EOF here.
1816 ungetCodeUnit(unit);
1817 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1818 return 0;
1819 }
1820
1821 char16_t v;
1822 unit = getCodeUnit();
1823 if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) {
1824 *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v;
1825 return 5;
1826 }
1827
1828 if (unit == '{') {
1829 return matchExtendedUnicodeEscape(codePoint);
1830 }
1831
1832 // NOTE: |unit| may be EOF here, so this ungets either one or two units.
1833 ungetCodeUnit(unit);
1834 ungetCodeUnit('u');
1835 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1836 return 0;
1837 }
1838
1839 template <typename Unit, class AnyCharsAccess>
1840 uint32_t
matchExtendedUnicodeEscape(uint32_t * codePoint)1841 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
1842 uint32_t* codePoint) {
1843 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
1844
1845 int32_t unit = getCodeUnit();
1846
1847 // Skip leading zeroes.
1848 uint32_t leadingZeroes = 0;
1849 while (unit == '0') {
1850 leadingZeroes++;
1851 unit = getCodeUnit();
1852 }
1853
1854 size_t i = 0;
1855 uint32_t code = 0;
1856 while (IsAsciiHexDigit(unit) && i < 6) {
1857 code = (code << 4) | AsciiAlphanumericToNumber(unit);
1858 unit = getCodeUnit();
1859 i++;
1860 }
1861
1862 uint32_t gotten =
1863 2 + // 'u{'
1864 leadingZeroes + i + // significant hexdigits
1865 (unit != EOF); // subtract a get if it didn't contribute to length
1866
1867 if (unit == '}' && (leadingZeroes > 0 || i > 0) &&
1868 code <= unicode::NonBMPMax) {
1869 *codePoint = code;
1870 return gotten;
1871 }
1872
1873 this->sourceUnits.unskipCodeUnits(gotten);
1874 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1875 return 0;
1876 }
1877
1878 template <typename Unit, class AnyCharsAccess>
1879 uint32_t
matchUnicodeEscapeIdStart(uint32_t * codePoint)1880 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
1881 uint32_t* codePoint) {
1882 uint32_t length = matchUnicodeEscape(codePoint);
1883 if (MOZ_LIKELY(length > 0)) {
1884 if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) {
1885 return length;
1886 }
1887
1888 this->sourceUnits.unskipCodeUnits(length);
1889 }
1890
1891 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1892 return 0;
1893 }
1894
1895 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscapeIdent(uint32_t * codePoint)1896 bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
1897 uint32_t* codePoint) {
1898 uint32_t length = matchUnicodeEscape(codePoint);
1899 if (MOZ_LIKELY(length > 0)) {
1900 if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) {
1901 return true;
1902 }
1903
1904 this->sourceUnits.unskipCodeUnits(length);
1905 }
1906
1907 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1908 return false;
1909 }
1910
1911 template <typename Unit, class AnyCharsAccess>
1912 [[nodiscard]] bool
matchIdentifierStart(IdentifierEscapes * sawEscape)1913 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart(
1914 IdentifierEscapes* sawEscape) {
1915 int32_t unit = getCodeUnit();
1916 if (unicode::IsIdentifierStart(char16_t(unit))) {
1917 ungetCodeUnit(unit);
1918 *sawEscape = IdentifierEscapes::None;
1919 return true;
1920 }
1921
1922 if (unit == '\\') {
1923 *sawEscape = IdentifierEscapes::SawUnicodeEscape;
1924
1925 uint32_t codePoint;
1926 uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint);
1927 if (escapeLength != 0) {
1928 return true;
1929 }
1930
1931 // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
1932 // could point at the 'H'. But we don't do that now, so the code
1933 // unit after the '\' isn't necessarily bad, so just point at the
1934 // start of the actually-invalid escape.
1935 ungetCodeUnit('\\');
1936 error(JSMSG_BAD_ESCAPE);
1937 return false;
1938 }
1939
1940 *sawEscape = IdentifierEscapes::None;
1941
1942 // NOTE: |unit| may be EOF here.
1943 ungetCodeUnit(unit);
1944 error(JSMSG_MISSING_PRIVATE_NAME);
1945 return false;
1946 }
1947
1948 template <typename Unit, class AnyCharsAccess>
getDirectives(bool isMultiline,bool shouldWarnDeprecated)1949 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives(
1950 bool isMultiline, bool shouldWarnDeprecated) {
1951 // Match directive comments used in debugging, such as "//# sourceURL" and
1952 // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated.
1953 //
1954 // To avoid a crashing bug in IE, several JavaScript transpilers wrap single
1955 // line comments containing a source mapping URL inside a multiline
1956 // comment. To avoid potentially expensive lookahead and backtracking, we
1957 // only check for this case if we encounter a '#' code unit.
1958
1959 bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
1960 getSourceMappingURL(isMultiline, shouldWarnDeprecated);
1961 if (!res) {
1962 badToken();
1963 }
1964
1965 return res;
1966 }
1967
copyCharBufferTo(JSContext * cx,UniquePtr<char16_t[],JS::FreePolicy> * destination)1968 [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo(
1969 JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1970 size_t length = charBuffer.length();
1971
1972 *destination = cx->make_pod_array<char16_t>(length + 1);
1973 if (!*destination) {
1974 return false;
1975 }
1976
1977 std::copy(charBuffer.begin(), charBuffer.end(), destination->get());
1978 (*destination)[length] = '\0';
1979 return true;
1980 }
1981
1982 template <typename Unit, class AnyCharsAccess>
getDirective(bool isMultiline,bool shouldWarnDeprecated,const char * directive,uint8_t directiveLength,const char * errorMsgPragma,UniquePtr<char16_t[],JS::FreePolicy> * destination)1983 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
1984 bool isMultiline, bool shouldWarnDeprecated, const char* directive,
1985 uint8_t directiveLength, const char* errorMsgPragma,
1986 UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1987 // Stop if we don't find |directive|. (Note that |directive| must be
1988 // ASCII, so there are no tricky encoding issues to consider in matching
1989 // UTF-8/16-agnostically.)
1990 if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
1991 return true;
1992 }
1993
1994 if (shouldWarnDeprecated) {
1995 if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
1996 return false;
1997 }
1998 }
1999
2000 this->charBuffer.clear();
2001
2002 do {
2003 int32_t unit = peekCodeUnit();
2004 if (unit == EOF) {
2005 break;
2006 }
2007
2008 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2009 if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
2010 break;
2011 }
2012
2013 consumeKnownCodeUnit(unit);
2014
2015 // Debugging directives can occur in both single- and multi-line
2016 // comments. If we're currently inside a multi-line comment, we
2017 // also must recognize multi-line comment terminators.
2018 if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
2019 ungetCodeUnit('*');
2020 break;
2021 }
2022
2023 if (!this->charBuffer.append(unit)) {
2024 return false;
2025 }
2026
2027 continue;
2028 }
2029
2030 // This ignores encoding errors: subsequent caller-side code to
2031 // handle the remaining source text in the comment will do so.
2032 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2033 if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
2034 break;
2035 }
2036
2037 MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
2038 "!IsSpace must imply !IsLineTerminator or else we'll fail to "
2039 "maintain line-info/flags for EOL");
2040 this->sourceUnits.consumeKnownCodePoint(peeked);
2041
2042 if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
2043 return false;
2044 }
2045 } while (true);
2046
2047 if (this->charBuffer.empty()) {
2048 // The directive's URL was missing, but comments can contain anything,
2049 // so it isn't an error.
2050 return true;
2051 }
2052
2053 return copyCharBufferTo(anyCharsAccess().cx, destination);
2054 }
2055
2056 template <typename Unit, class AnyCharsAccess>
getDisplayURL(bool isMultiline,bool shouldWarnDeprecated)2057 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL(
2058 bool isMultiline, bool shouldWarnDeprecated) {
2059 // Match comments of the form "//# sourceURL=<url>" or
2060 // "/\* //# sourceURL=<url> *\/"
2061 //
2062 // Note that while these are labeled "sourceURL" in the source text,
2063 // internally we refer to it as a "displayURL" to distinguish what the
2064 // developer would like to refer to the source as from the source's actual
2065 // URL.
2066
2067 static constexpr char sourceURLDirective[] = " sourceURL=";
2068 constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective);
2069 return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective,
2070 sourceURLDirectiveLength, "sourceURL",
2071 &anyCharsAccess().displayURL_);
2072 }
2073
2074 template <typename Unit, class AnyCharsAccess>
getSourceMappingURL(bool isMultiline,bool shouldWarnDeprecated)2075 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL(
2076 bool isMultiline, bool shouldWarnDeprecated) {
2077 // Match comments of the form "//# sourceMappingURL=<url>" or
2078 // "/\* //# sourceMappingURL=<url> *\/"
2079
2080 static constexpr char sourceMappingURLDirective[] = " sourceMappingURL=";
2081 constexpr uint8_t sourceMappingURLDirectiveLength =
2082 js_strlen(sourceMappingURLDirective);
2083 return getDirective(isMultiline, shouldWarnDeprecated,
2084 sourceMappingURLDirective,
2085 sourceMappingURLDirectiveLength, "sourceMappingURL",
2086 &anyCharsAccess().sourceMapURL_);
2087 }
2088
2089 template <typename Unit, class AnyCharsAccess>
2090 MOZ_ALWAYS_INLINE Token*
newTokenInternal(TokenKind kind,TokenStart start,TokenKind * out)2091 GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal(
2092 TokenKind kind, TokenStart start, TokenKind* out) {
2093 MOZ_ASSERT(kind < TokenKind::Limit);
2094 MOZ_ASSERT(kind != TokenKind::Eol,
2095 "TokenKind::Eol should never be used in an actual Token, only "
2096 "returned by peekTokenSameLine()");
2097
2098 TokenStreamAnyChars& anyChars = anyCharsAccess();
2099 anyChars.flags.isDirtyLine = true;
2100
2101 Token* token = anyChars.allocateToken();
2102
2103 *out = token->type = kind;
2104 token->pos = TokenPos(start.offset(), this->sourceUnits.offset());
2105 MOZ_ASSERT(token->pos.begin <= token->pos.end);
2106
2107 // NOTE: |token->modifier| is set in |newToken()| so that optimized,
2108 // non-debug code won't do any work to pass a modifier-argument that will
2109 // never be used.
2110
2111 return token;
2112 }
2113
2114 template <typename Unit, class AnyCharsAccess>
badToken()2115 MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
2116 // We didn't get a token, so don't set |flags.isDirtyLine|.
2117 anyCharsAccess().flags.hadError = true;
2118
2119 // Poisoning sourceUnits on error establishes an invariant: once an
2120 // erroneous token has been seen, sourceUnits will not be consulted again.
2121 // This is true because the parser will deal with the illegal token by
2122 // aborting parsing immediately.
2123 this->sourceUnits.poisonInDebug();
2124
2125 return false;
2126 };
2127
AppendCodePointToCharBuffer(CharBuffer & charBuffer,uint32_t codePoint)2128 bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) {
2129 MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
2130 "should only be processing code points validly decoded from UTF-8 "
2131 "or WTF-16 source text (surrogate code points permitted)");
2132
2133 char16_t units[2];
2134 unsigned numUnits = 0;
2135 unicode::UTF16Encode(codePoint, units, &numUnits);
2136
2137 MOZ_ASSERT(numUnits == 1 || numUnits == 2,
2138 "UTF-16 code points are only encoded in one or two units");
2139
2140 if (!charBuffer.append(units[0])) {
2141 return false;
2142 }
2143
2144 if (numUnits == 1) {
2145 return true;
2146 }
2147
2148 return charBuffer.append(units[1]);
2149 }
2150
2151 template <typename Unit, class AnyCharsAccess>
putIdentInCharBuffer(const Unit * identStart)2152 bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
2153 const Unit* identStart) {
2154 const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit();
2155 this->sourceUnits.setAddressOfNextCodeUnit(identStart);
2156
2157 auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() {
2158 this->sourceUnits.setAddressOfNextCodeUnit(originalAddress);
2159 });
2160
2161 this->charBuffer.clear();
2162 do {
2163 int32_t unit = getCodeUnit();
2164 if (unit == EOF) {
2165 break;
2166 }
2167
2168 uint32_t codePoint;
2169 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2170 if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') {
2171 if (!this->charBuffer.append(unit)) {
2172 return false;
2173 }
2174
2175 continue;
2176 }
2177
2178 if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2179 break;
2180 }
2181 } else {
2182 // |restoreNextRawCharAddress| undoes all gets, and this function
2183 // doesn't update line/column info.
2184 char32_t cp;
2185 if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
2186 return false;
2187 }
2188
2189 codePoint = cp;
2190 if (!unicode::IsIdentifierPart(codePoint)) {
2191 break;
2192 }
2193 }
2194
2195 if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
2196 return false;
2197 }
2198 } while (true);
2199
2200 return true;
2201 }
2202
2203 template <typename Unit, class AnyCharsAccess>
identifierName(TokenStart start,const Unit * identStart,IdentifierEscapes escaping,Modifier modifier,NameVisibility visibility,TokenKind * out)2204 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName(
2205 TokenStart start, const Unit* identStart, IdentifierEscapes escaping,
2206 Modifier modifier, NameVisibility visibility, TokenKind* out) {
2207 // Run the bad-token code for every path out of this function except the
2208 // two success-cases.
2209 auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
2210
2211 // We've already consumed an initial code point in the identifer, to *know*
2212 // that this is an identifier. So no need to worry about not consuming any
2213 // code points in the loop below.
2214 int32_t unit;
2215 while (true) {
2216 unit = peekCodeUnit();
2217 if (unit == EOF) {
2218 break;
2219 }
2220
2221 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2222 consumeKnownCodeUnit(unit);
2223
2224 if (MOZ_UNLIKELY(
2225 !unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) {
2226 // Handle a Unicode escape -- otherwise it's not part of the
2227 // identifier.
2228 uint32_t codePoint;
2229 if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2230 ungetCodeUnit(unit);
2231 break;
2232 }
2233
2234 escaping = IdentifierEscapes::SawUnicodeEscape;
2235 }
2236 } else {
2237 // This ignores encoding errors: subsequent caller-side code to
2238 // handle source text after the IdentifierName will do so.
2239 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2240 if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) {
2241 break;
2242 }
2243
2244 MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
2245 "IdentifierPart must guarantee !IsLineTerminator or "
2246 "else we'll fail to maintain line-info/flags for EOL");
2247
2248 this->sourceUnits.consumeKnownCodePoint(peeked);
2249 }
2250 }
2251
2252 TaggedParserAtomIndex atom;
2253 if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) {
2254 // Identifiers containing Unicode escapes have to be converted into
2255 // tokenbuf before atomizing.
2256 if (!putIdentInCharBuffer(identStart)) {
2257 return false;
2258 }
2259
2260 atom = drainCharBufferIntoAtom();
2261 } else {
2262 // Escape-free identifiers can be created directly from sourceUnits.
2263 const Unit* chars = identStart;
2264 size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart;
2265
2266 // Private identifiers start with a '#', and so cannot be reserved words.
2267 if (visibility == NameVisibility::Public) {
2268 // Represent reserved words lacking escapes as reserved word tokens.
2269 if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) {
2270 noteBadToken.release();
2271 newSimpleToken(rw->tokentype, start, modifier, out);
2272 return true;
2273 }
2274 }
2275
2276 atom = atomizeSourceChars(Span(chars, length));
2277 }
2278 if (!atom) {
2279 return false;
2280 }
2281
2282 noteBadToken.release();
2283 if (visibility == NameVisibility::Private) {
2284 newPrivateNameToken(atom, start, modifier, out);
2285 return true;
2286 }
2287 newNameToken(atom, start, modifier, out);
2288 return true;
2289 }
2290
2291 enum FirstCharKind {
2292 // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid
2293 // token that cannot also be a prefix of a longer token. E.g. ';' has the
2294 // OneChar kind, but '+' does not, because '++' and '+=' are valid longer
2295 // tokens
2296 // that begin with '+'.
2297 //
2298 // The few token kinds satisfying these properties cover roughly 35--45%
2299 // of the tokens seen in practice.
2300 //
2301 // We represent the 'OneChar' kind with any positive value less than
2302 // TokenKind::Limit. This representation lets us associate
2303 // each one-char token char16_t with a TokenKind and thus avoid
2304 // a subsequent char16_t-to-TokenKind conversion.
2305 OneChar_Min = 0,
2306 OneChar_Max = size_t(TokenKind::Limit) - 1,
2307
2308 Space = size_t(TokenKind::Limit),
2309 Ident,
2310 Dec,
2311 String,
2312 EOL,
2313 ZeroDigit,
2314 Other,
2315
2316 LastCharKind = Other
2317 };
2318
2319 // OneChar: 40, 41, 44, 58, 59, 91, 93, 123, 125, 126:
2320 // '(', ')', ',', ':', ';', '[', ']', '{', '}', '~'
2321 // Ident: 36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
2322 // Dot: 46: '.'
2323 // Equals: 61: '='
2324 // String: 34, 39, 96: '"', '\'', '`'
2325 // Dec: 49..57: '1'..'9'
2326 // Plus: 43: '+'
2327 // ZeroDigit: 48: '0'
2328 // Space: 9, 11, 12, 32: '\t', '\v', '\f', ' '
2329 // EOL: 10, 13: '\n', '\r'
2330 //
2331 #define T_COMMA size_t(TokenKind::Comma)
2332 #define T_COLON size_t(TokenKind::Colon)
2333 #define T_BITNOT size_t(TokenKind::BitNot)
2334 #define T_LP size_t(TokenKind::LeftParen)
2335 #define T_RP size_t(TokenKind::RightParen)
2336 #define T_SEMI size_t(TokenKind::Semi)
2337 #define T_LB size_t(TokenKind::LeftBracket)
2338 #define T_RB size_t(TokenKind::RightBracket)
2339 #define T_LC size_t(TokenKind::LeftCurly)
2340 #define T_RC size_t(TokenKind::RightCurly)
2341 #define _______ Other
2342 static const uint8_t firstCharKinds[] = {
2343 // clang-format off
2344 /* 0 1 2 3 4 5 6 7 8 9 */
2345 /* 0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, Space,
2346 /* 10+ */ EOL, Space, Space, EOL, _______, _______, _______, _______, _______, _______,
2347 /* 20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
2348 /* 30+ */ _______, _______, Space, _______, String, _______, Ident, _______, _______, String,
2349 /* 40+ */ T_LP, T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit, Dec,
2350 /* 50+ */ Dec, Dec, Dec, Dec, Dec, Dec, Dec, Dec, T_COLON, T_SEMI,
2351 /* 60+ */ _______, _______, _______, _______, _______, Ident, Ident, Ident, Ident, Ident,
2352 /* 70+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
2353 /* 80+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
2354 /* 90+ */ Ident, T_LB, _______, T_RB, _______, Ident, String, Ident, Ident, Ident,
2355 /* 100+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
2356 /* 110+ */ Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident, Ident,
2357 /* 120+ */ Ident, Ident, Ident, T_LC, _______, T_RC,T_BITNOT, _______
2358 // clang-format on
2359 };
2360 #undef T_COMMA
2361 #undef T_COLON
2362 #undef T_BITNOT
2363 #undef T_LP
2364 #undef T_RP
2365 #undef T_SEMI
2366 #undef T_LB
2367 #undef T_RB
2368 #undef T_LC
2369 #undef T_RC
2370 #undef _______
2371
2372 static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
2373 "Elements of firstCharKinds[] are too small");
2374
2375 template <>
consumeRestOfSingleLineComment()2376 void SourceUnits<char16_t>::consumeRestOfSingleLineComment() {
2377 while (MOZ_LIKELY(!atEnd())) {
2378 char16_t unit = peekCodeUnit();
2379 if (IsLineTerminator(unit)) {
2380 return;
2381 }
2382
2383 consumeKnownCodeUnit(unit);
2384 }
2385 }
2386
2387 template <>
consumeRestOfSingleLineComment()2388 void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() {
2389 while (MOZ_LIKELY(!atEnd())) {
2390 const Utf8Unit unit = peekCodeUnit();
2391 if (IsSingleUnitLineTerminator(unit)) {
2392 return;
2393 }
2394
2395 if (MOZ_LIKELY(IsAscii(unit))) {
2396 consumeKnownCodeUnit(unit);
2397 continue;
2398 }
2399
2400 PeekedCodePoint<Utf8Unit> peeked = peekCodePoint();
2401 if (peeked.isNone()) {
2402 return;
2403 }
2404
2405 char32_t c = peeked.codePoint();
2406 if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
2407 c == unicode::PARA_SEPARATOR)) {
2408 return;
2409 }
2410
2411 consumeKnownCodePoint(peeked);
2412 }
2413 }
2414
2415 template <typename Unit, class AnyCharsAccess>
2416 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchInteger(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2417 TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger(
2418 IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2419 int32_t unit = getCodeUnit();
2420 if (!isIntegerUnit(unit)) {
2421 *nextUnit = unit;
2422 return true;
2423 }
2424 return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit);
2425 }
2426
2427 template <typename Unit, class AnyCharsAccess>
2428 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchIntegerAfterFirstDigit(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2429 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit(
2430 IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2431 int32_t unit;
2432 while (true) {
2433 unit = getCodeUnit();
2434 if (isIntegerUnit(unit)) {
2435 continue;
2436 }
2437 if (unit != '_') {
2438 break;
2439 }
2440 unit = getCodeUnit();
2441 if (!isIntegerUnit(unit)) {
2442 if (unit == '_') {
2443 error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES);
2444 } else {
2445 error(JSMSG_NUMBER_END_WITH_UNDERSCORE);
2446 }
2447 return false;
2448 }
2449 }
2450
2451 *nextUnit = unit;
2452 return true;
2453 }
2454
2455 template <typename Unit, class AnyCharsAccess>
decimalNumber(int32_t unit,TokenStart start,const Unit * numStart,Modifier modifier,TokenKind * out)2456 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber(
2457 int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier,
2458 TokenKind* out) {
2459 // Run the bad-token code for every path out of this function except the
2460 // one success-case.
2461 auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
2462
2463 // Consume integral component digits.
2464 if (IsAsciiDigit(unit)) {
2465 if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
2466 return false;
2467 }
2468 }
2469
2470 // Numbers contain no escapes, so we can read directly from |sourceUnits|.
2471 double dval;
2472 bool isBigInt = false;
2473 DecimalPoint decimalPoint = NoDecimal;
2474 if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') {
2475 // NOTE: |unit| may be EOF here.
2476 ungetCodeUnit(unit);
2477
2478 // Most numbers are pure decimal integers without fractional component
2479 // or exponential notation. Handle that with optimized code.
2480 if (!GetDecimalInteger(anyCharsAccess().cx, numStart,
2481 this->sourceUnits.addressOfNextCodeUnit(), &dval)) {
2482 return false;
2483 }
2484 } else if (unit == 'n') {
2485 isBigInt = true;
2486 unit = peekCodeUnit();
2487 } else {
2488 // Consume any decimal dot and fractional component.
2489 if (unit == '.') {
2490 decimalPoint = HasDecimal;
2491 if (!matchInteger(IsAsciiDigit, &unit)) {
2492 return false;
2493 }
2494 }
2495
2496 // Consume any exponential notation.
2497 if (unit == 'e' || unit == 'E') {
2498 unit = getCodeUnit();
2499 if (unit == '+' || unit == '-') {
2500 unit = getCodeUnit();
2501 }
2502
2503 // Exponential notation must contain at least one digit.
2504 if (!IsAsciiDigit(unit)) {
2505 ungetCodeUnit(unit);
2506 error(JSMSG_MISSING_EXPONENT);
2507 return false;
2508 }
2509
2510 // Consume exponential digits.
2511 if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
2512 return false;
2513 }
2514 }
2515
2516 ungetCodeUnit(unit);
2517
2518 // "0." and "0e..." numbers parse "." or "e..." here. Neither range
2519 // contains a number, so we can't use |FullStringToDouble|. (Parse
2520 // failures return 0.0, so we'll still get the right result.)
2521 if (!GetDecimalNonInteger(anyCharsAccess().cx, numStart,
2522 this->sourceUnits.addressOfNextCodeUnit(),
2523 &dval)) {
2524 return false;
2525 }
2526 }
2527
2528 // Number followed by IdentifierStart is an error. (This is the only place
2529 // in ECMAScript where token boundary is inadequate to properly separate
2530 // two tokens, necessitating this unaesthetic lookahead.)
2531 if (unit != EOF) {
2532 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2533 if (unicode::IsIdentifierStart(char16_t(unit))) {
2534 error(JSMSG_IDSTART_AFTER_NUMBER);
2535 return false;
2536 }
2537 } else {
2538 // This ignores encoding errors: subsequent caller-side code to
2539 // handle source text after the number will do so.
2540 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2541 if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
2542 error(JSMSG_IDSTART_AFTER_NUMBER);
2543 return false;
2544 }
2545 }
2546 }
2547
2548 noteBadToken.release();
2549
2550 if (isBigInt) {
2551 return bigIntLiteral(start, modifier, out);
2552 }
2553
2554 newNumberToken(dval, decimalPoint, start, modifier, out);
2555 return true;
2556 }
2557
2558 template <typename Unit, class AnyCharsAccess>
regexpLiteral(TokenStart start,TokenKind * out)2559 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
2560 TokenStart start, TokenKind* out) {
2561 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/'));
2562 this->charBuffer.clear();
2563
2564 auto ProcessNonAsciiCodePoint = [this](int32_t lead) {
2565 MOZ_ASSERT(lead != EOF);
2566 MOZ_ASSERT(!this->isAsciiCodePoint(lead));
2567
2568 char32_t codePoint;
2569 if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead),
2570 &codePoint)) {
2571 return false;
2572 }
2573
2574 if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
2575 codePoint == unicode::PARA_SEPARATOR)) {
2576 this->sourceUnits.ungetLineOrParagraphSeparator();
2577 this->error(JSMSG_UNTERMINATED_REGEXP);
2578 return false;
2579 }
2580
2581 return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
2582 };
2583
2584 auto ReportUnterminatedRegExp = [this](int32_t unit) {
2585 this->ungetCodeUnit(unit);
2586 this->error(JSMSG_UNTERMINATED_REGEXP);
2587 };
2588
2589 bool inCharClass = false;
2590 do {
2591 int32_t unit = getCodeUnit();
2592 if (unit == EOF) {
2593 ReportUnterminatedRegExp(unit);
2594 return badToken();
2595 }
2596
2597 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2598 if (!ProcessNonAsciiCodePoint(unit)) {
2599 return badToken();
2600 }
2601
2602 continue;
2603 }
2604
2605 if (unit == '\\') {
2606 if (!this->charBuffer.append(unit)) {
2607 return badToken();
2608 }
2609
2610 unit = getCodeUnit();
2611 if (unit == EOF) {
2612 ReportUnterminatedRegExp(unit);
2613 return badToken();
2614 }
2615
2616 // Fallthrough only handles ASCII code points, so
2617 // deal with non-ASCII and skip everything else.
2618 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2619 if (!ProcessNonAsciiCodePoint(unit)) {
2620 return badToken();
2621 }
2622
2623 continue;
2624 }
2625 } else if (unit == '[') {
2626 inCharClass = true;
2627 } else if (unit == ']') {
2628 inCharClass = false;
2629 } else if (unit == '/' && !inCharClass) {
2630 // For IE compat, allow unescaped / in char classes.
2631 break;
2632 }
2633
2634 // NOTE: Non-ASCII LineTerminators were handled by
2635 // ProcessNonAsciiCodePoint calls above.
2636 if (unit == '\r' || unit == '\n') {
2637 ReportUnterminatedRegExp(unit);
2638 return badToken();
2639 }
2640
2641 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit)));
2642 if (!this->charBuffer.append(unit)) {
2643 return badToken();
2644 }
2645 } while (true);
2646
2647 int32_t unit;
2648 RegExpFlags reflags = RegExpFlag::NoFlags;
2649 while (true) {
2650 uint8_t flag;
2651 unit = getCodeUnit();
2652 if (unit == 'd') {
2653 flag = RegExpFlag::HasIndices;
2654 } else if (unit == 'g') {
2655 flag = RegExpFlag::Global;
2656 } else if (unit == 'i') {
2657 flag = RegExpFlag::IgnoreCase;
2658 } else if (unit == 'm') {
2659 flag = RegExpFlag::Multiline;
2660 } else if (unit == 's') {
2661 flag = RegExpFlag::DotAll;
2662 } else if (unit == 'u') {
2663 flag = RegExpFlag::Unicode;
2664 } else if (unit == 'y') {
2665 flag = RegExpFlag::Sticky;
2666 } else if (IsAsciiAlpha(unit)) {
2667 flag = RegExpFlag::NoFlags;
2668 } else {
2669 break;
2670 }
2671
2672 if ((reflags & flag) || flag == RegExpFlag::NoFlags) {
2673 ungetCodeUnit(unit);
2674 char buf[2] = {char(unit), '\0'};
2675 error(JSMSG_BAD_REGEXP_FLAG, buf);
2676 return badToken();
2677 }
2678
2679 reflags |= flag;
2680 }
2681 ungetCodeUnit(unit);
2682
2683 newRegExpToken(reflags, start, out);
2684 return true;
2685 }
2686
2687 template <typename Unit, class AnyCharsAccess>
bigIntLiteral(TokenStart start,Modifier modifier,TokenKind * out)2688 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
2689 TokenStart start, Modifier modifier, TokenKind* out) {
2690 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n'));
2691 MOZ_ASSERT(this->sourceUnits.offset() > start.offset());
2692 uint32_t length = this->sourceUnits.offset() - start.offset();
2693 MOZ_ASSERT(length >= 2);
2694 this->charBuffer.clear();
2695 mozilla::Range<const Unit> chars(
2696 this->sourceUnits.codeUnitPtrAt(start.offset()), length);
2697 for (uint32_t idx = 0; idx < length - 1; idx++) {
2698 int32_t unit = CodeUnitValue(chars[idx]);
2699 // Char buffer may start with a 0[bBoOxX] prefix, then follows with
2700 // binary, octal, decimal, or hex digits. Already checked by caller, as
2701 // the "n" indicating bigint comes at the end.
2702 MOZ_ASSERT(isAsciiCodePoint(unit));
2703 // Skip over any separators.
2704 if (unit == '_') {
2705 continue;
2706 }
2707 if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
2708 return false;
2709 }
2710 }
2711 newBigIntToken(start, modifier, out);
2712 return true;
2713 }
2714
2715 template <typename Unit, class AnyCharsAccess>
2716 void GeneralTokenStreamChars<Unit,
consumeOptionalHashbangComment()2717 AnyCharsAccess>::consumeOptionalHashbangComment() {
2718 MOZ_ASSERT(this->sourceUnits.atStart(),
2719 "HashBangComment can only appear immediately at the start of a "
2720 "Script or Module");
2721
2722 // HashbangComment ::
2723 // #! SingleLineCommentChars_opt
2724
2725 if (!matchCodeUnit('#')) {
2726 // HashbangComment is optional at start of Script or Module.
2727 return;
2728 }
2729
2730 if (!matchCodeUnit('!')) {
2731 // # not followed by ! at start of Script or Module is an error, but normal
2732 // parsing code will handle that error just fine if we let it.
2733 ungetCodeUnit('#');
2734 return;
2735 }
2736
2737 // This doesn't consume a concluding LineTerminator, and it stops consuming
2738 // just before any encoding error. The subsequent |getToken| call will call
2739 // |getTokenInternal| below which will handle these possibilities.
2740 this->sourceUnits.consumeRestOfSingleLineComment();
2741 }
2742
2743 template <typename Unit, class AnyCharsAccess>
getTokenInternal(TokenKind * const ttp,const Modifier modifier)2744 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal(
2745 TokenKind* const ttp, const Modifier modifier) {
2746 // Assume we'll fail: success cases will overwrite this.
2747 #ifdef DEBUG
2748 *ttp = TokenKind::Limit;
2749 #endif
2750 MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
2751
2752 // This loop runs more than once only when whitespace or comments are
2753 // encountered.
2754 do {
2755 int32_t unit = peekCodeUnit();
2756 if (MOZ_UNLIKELY(unit == EOF)) {
2757 MOZ_ASSERT(this->sourceUnits.atEnd());
2758 anyCharsAccess().flags.isEOF = true;
2759 TokenStart start(this->sourceUnits, 0);
2760 newSimpleToken(TokenKind::Eof, start, modifier, ttp);
2761 return true;
2762 }
2763
2764 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2765 // Non-ASCII code points can only be identifiers or whitespace. It would
2766 // be nice to compute these *after* discarding whitespace, but IN A WORLD
2767 // where |unicode::IsSpace| requires consuming a variable number of code
2768 // units, it's easier to assume it's an identifier and maybe do a little
2769 // wasted work, than to unget and compute and reget if whitespace.
2770 TokenStart start(this->sourceUnits, 0);
2771 const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit();
2772
2773 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2774 if (peeked.isNone()) {
2775 int32_t bad;
2776 MOZ_ALWAYS_FALSE(getCodePoint(&bad));
2777 return badToken();
2778 }
2779
2780 char32_t cp = peeked.codePoint();
2781 if (unicode::IsSpace(cp)) {
2782 this->sourceUnits.consumeKnownCodePoint(peeked);
2783 if (IsLineTerminator(cp)) {
2784 if (!updateLineInfoForEOL()) {
2785 return badToken();
2786 }
2787
2788 anyCharsAccess().updateFlagsForEOL();
2789 }
2790
2791 continue;
2792 }
2793
2794 static_assert(isAsciiCodePoint('$'),
2795 "IdentifierStart contains '$', but as "
2796 "!IsUnicodeIDStart('$'), ensure that '$' is never "
2797 "handled here");
2798 static_assert(isAsciiCodePoint('_'),
2799 "IdentifierStart contains '_', but as "
2800 "!IsUnicodeIDStart('_'), ensure that '_' is never "
2801 "handled here");
2802
2803 if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
2804 this->sourceUnits.consumeKnownCodePoint(peeked);
2805 MOZ_ASSERT(!IsLineTerminator(cp),
2806 "IdentifierStart must guarantee !IsLineTerminator "
2807 "or else we'll fail to maintain line-info/flags "
2808 "for EOL here");
2809
2810 return identifierName(start, identStart, IdentifierEscapes::None,
2811 modifier, NameVisibility::Public, ttp);
2812 }
2813
2814 reportIllegalCharacter(cp);
2815 return badToken();
2816 } // !isAsciiCodePoint(unit)
2817
2818 consumeKnownCodeUnit(unit);
2819
2820 // Get the token kind, based on the first char. The ordering of c1kind
2821 // comparison is based on the frequency of tokens in real code:
2822 // Parsemark (which represents typical JS code on the web) and the
2823 // Unreal demo (which represents asm.js code).
2824 //
2825 // Parsemark Unreal
2826 // OneChar 32.9% 39.7%
2827 // Space 25.0% 0.6%
2828 // Ident 19.2% 36.4%
2829 // Dec 7.2% 5.1%
2830 // String 7.9% 0.0%
2831 // EOL 1.7% 0.0%
2832 // ZeroDigit 0.4% 4.9%
2833 // Other 5.7% 13.3%
2834 //
2835 // The ordering is based mostly only Parsemark frequencies, with Unreal
2836 // frequencies used to break close categories (e.g. |Dec| and
2837 // |String|). |Other| is biggish, but no other token kind is common
2838 // enough for it to be worth adding extra values to FirstCharKind.
2839 FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]);
2840
2841 // Look for an unambiguous single-char token.
2842 //
2843 if (c1kind <= OneChar_Max) {
2844 TokenStart start(this->sourceUnits, -1);
2845 newSimpleToken(TokenKind(c1kind), start, modifier, ttp);
2846 return true;
2847 }
2848
2849 // Skip over non-EOL whitespace chars.
2850 //
2851 if (c1kind == Space) {
2852 continue;
2853 }
2854
2855 // Look for an identifier.
2856 //
2857 if (c1kind == Ident) {
2858 TokenStart start(this->sourceUnits, -1);
2859 return identifierName(
2860 start, this->sourceUnits.addressOfNextCodeUnit() - 1,
2861 IdentifierEscapes::None, modifier, NameVisibility::Public, ttp);
2862 }
2863
2864 // Look for a decimal number.
2865 //
2866 if (c1kind == Dec) {
2867 TokenStart start(this->sourceUnits, -1);
2868 const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2869 return decimalNumber(unit, start, numStart, modifier, ttp);
2870 }
2871
2872 // Look for a string or a template string.
2873 //
2874 if (c1kind == String) {
2875 return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp);
2876 }
2877
2878 // Skip over EOL chars, updating line state along the way.
2879 //
2880 if (c1kind == EOL) {
2881 if (unit == '\r') {
2882 matchLineTerminator('\n');
2883 }
2884
2885 if (!updateLineInfoForEOL()) {
2886 return badToken();
2887 }
2888
2889 anyCharsAccess().updateFlagsForEOL();
2890 continue;
2891 }
2892
2893 // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a
2894 // number starting with '0' that contains '8' or '9' and is treated as
2895 // decimal) number.
2896 //
2897 if (c1kind == ZeroDigit) {
2898 TokenStart start(this->sourceUnits, -1);
2899 int radix;
2900 bool isBigInt = false;
2901 const Unit* numStart;
2902 unit = getCodeUnit();
2903 if (unit == 'x' || unit == 'X') {
2904 radix = 16;
2905 unit = getCodeUnit();
2906 if (!IsAsciiHexDigit(unit)) {
2907 // NOTE: |unit| may be EOF here.
2908 ungetCodeUnit(unit);
2909 error(JSMSG_MISSING_HEXDIGITS);
2910 return badToken();
2911 }
2912
2913 // one past the '0x'
2914 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2915
2916 if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) {
2917 return badToken();
2918 }
2919 } else if (unit == 'b' || unit == 'B') {
2920 radix = 2;
2921 unit = getCodeUnit();
2922 if (!IsAsciiBinary(unit)) {
2923 // NOTE: |unit| may be EOF here.
2924 ungetCodeUnit(unit);
2925 error(JSMSG_MISSING_BINARY_DIGITS);
2926 return badToken();
2927 }
2928
2929 // one past the '0b'
2930 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2931
2932 if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) {
2933 return badToken();
2934 }
2935 } else if (unit == 'o' || unit == 'O') {
2936 radix = 8;
2937 unit = getCodeUnit();
2938 if (!IsAsciiOctal(unit)) {
2939 // NOTE: |unit| may be EOF here.
2940 ungetCodeUnit(unit);
2941 error(JSMSG_MISSING_OCTAL_DIGITS);
2942 return badToken();
2943 }
2944
2945 // one past the '0o'
2946 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2947
2948 if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) {
2949 return badToken();
2950 }
2951 } else if (IsAsciiDigit(unit)) {
2952 // Reject octal literals that appear in strict mode code.
2953 if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) {
2954 return badToken();
2955 }
2956
2957 // The above test doesn't catch a few edge cases; see
2958 // |GeneralParser::maybeParseDirective|. Record the violation so that
2959 // that function can handle them.
2960 anyCharsAccess().setSawDeprecatedOctalLiteral();
2961
2962 radix = 8;
2963 // one past the '0'
2964 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2965
2966 bool nonOctalDecimalIntegerLiteral = false;
2967 do {
2968 if (unit >= '8') {
2969 nonOctalDecimalIntegerLiteral = true;
2970 }
2971 unit = getCodeUnit();
2972 } while (IsAsciiDigit(unit));
2973
2974 if (unit == '_') {
2975 error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2976 return badToken();
2977 }
2978
2979 if (unit == 'n') {
2980 error(JSMSG_BIGINT_INVALID_SYNTAX);
2981 return badToken();
2982 }
2983
2984 if (nonOctalDecimalIntegerLiteral) {
2985 // Use the decimal scanner for the rest of the number.
2986 return decimalNumber(unit, start, numStart, modifier, ttp);
2987 }
2988 } else if (unit == '_') {
2989 // Give a more explicit error message when '_' is used after '0'.
2990 error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2991 return badToken();
2992 } else {
2993 // '0' not followed by [XxBbOo0-9_]; scan as a decimal number.
2994 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2995
2996 // NOTE: |unit| may be EOF here. (This is permitted by case #3
2997 // in TokenStream.h docs for this function.)
2998 return decimalNumber(unit, start, numStart, modifier, ttp);
2999 }
3000
3001 if (unit == 'n') {
3002 isBigInt = true;
3003 unit = peekCodeUnit();
3004 } else {
3005 ungetCodeUnit(unit);
3006 }
3007
3008 // Error if an identifier-start code point appears immediately
3009 // after the number. Somewhat surprisingly, if we don't check
3010 // here, we'll never check at all.
3011 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3012 if (unicode::IsIdentifierStart(char16_t(unit))) {
3013 error(JSMSG_IDSTART_AFTER_NUMBER);
3014 return badToken();
3015 }
3016 } else if (MOZ_LIKELY(unit != EOF)) {
3017 // This ignores encoding errors: subsequent caller-side code to
3018 // handle source text after the number will do so.
3019 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
3020 if (!peeked.isNone() &&
3021 unicode::IsIdentifierStart(peeked.codePoint())) {
3022 error(JSMSG_IDSTART_AFTER_NUMBER);
3023 return badToken();
3024 }
3025 }
3026
3027 if (isBigInt) {
3028 return bigIntLiteral(start, modifier, ttp);
3029 }
3030
3031 double dval;
3032 if (!GetFullInteger(anyCharsAccess().cx, numStart,
3033 this->sourceUnits.addressOfNextCodeUnit(), radix,
3034 IntegerSeparatorHandling::SkipUnderscore, &dval)) {
3035 return badToken();
3036 }
3037 newNumberToken(dval, NoDecimal, start, modifier, ttp);
3038 return true;
3039 }
3040
3041 MOZ_ASSERT(c1kind == Other);
3042
3043 // This handles everything else. Simple tokens distinguished solely by
3044 // TokenKind should set |simpleKind| and break, to share simple-token
3045 // creation code for all such tokens. All other tokens must be handled
3046 // by returning (or by continuing from the loop enclosing this).
3047 //
3048 TokenStart start(this->sourceUnits, -1);
3049 TokenKind simpleKind;
3050 #ifdef DEBUG
3051 simpleKind = TokenKind::Limit; // sentinel value for code after switch
3052 #endif
3053
3054 // The block a ways above eliminated all non-ASCII, so cast to the
3055 // smallest type possible to assist the C++ compiler.
3056 switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3057 case '.':
3058 if (IsAsciiDigit(peekCodeUnit())) {
3059 return decimalNumber('.', start,
3060 this->sourceUnits.addressOfNextCodeUnit() - 1,
3061 modifier, ttp);
3062 }
3063
3064 unit = getCodeUnit();
3065 if (unit == '.') {
3066 if (matchCodeUnit('.')) {
3067 simpleKind = TokenKind::TripleDot;
3068 break;
3069 }
3070 }
3071
3072 // NOTE: |unit| may be EOF here. A stray '.' at EOF would be an
3073 // error, but subsequent code will handle it.
3074 ungetCodeUnit(unit);
3075
3076 simpleKind = TokenKind::Dot;
3077 break;
3078
3079 case '#': {
3080 if (options().privateClassFields) {
3081 TokenStart start(this->sourceUnits, -1);
3082 const Unit* identStart =
3083 this->sourceUnits.addressOfNextCodeUnit() - 1;
3084 IdentifierEscapes sawEscape;
3085 if (!matchIdentifierStart(&sawEscape)) {
3086 return badToken();
3087 }
3088 return identifierName(start, identStart, sawEscape, modifier,
3089 NameVisibility::Private, ttp);
3090 }
3091 ungetCodeUnit(unit);
3092 error(JSMSG_PRIVATE_FIELDS_NOT_SUPPORTED);
3093 return badToken();
3094 }
3095
3096 case '=':
3097 if (matchCodeUnit('=')) {
3098 simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq;
3099 } else if (matchCodeUnit('>')) {
3100 simpleKind = TokenKind::Arrow;
3101 } else {
3102 simpleKind = TokenKind::Assign;
3103 }
3104 break;
3105
3106 case '+':
3107 if (matchCodeUnit('+')) {
3108 simpleKind = TokenKind::Inc;
3109 } else {
3110 simpleKind =
3111 matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add;
3112 }
3113 break;
3114
3115 case '\\': {
3116 uint32_t codePoint;
3117 if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) {
3118 return identifierName(
3119 start,
3120 this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1,
3121 IdentifierEscapes::SawUnicodeEscape, modifier,
3122 NameVisibility::Public, ttp);
3123 }
3124
3125 // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
3126 // could point at the 'H'. But we don't do that now, so the code
3127 // unit after the '\' isn't necessarily bad, so just point at the
3128 // start of the actually-invalid escape.
3129 ungetCodeUnit('\\');
3130 error(JSMSG_BAD_ESCAPE);
3131 return badToken();
3132 }
3133
3134 case '|':
3135 if (matchCodeUnit('|')) {
3136 simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or;
3137 } else {
3138 simpleKind =
3139 matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr;
3140 }
3141 break;
3142
3143 case '^':
3144 simpleKind =
3145 matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor;
3146 break;
3147
3148 case '&':
3149 if (matchCodeUnit('&')) {
3150 simpleKind =
3151 matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And;
3152 } else {
3153 simpleKind =
3154 matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd;
3155 }
3156 break;
3157
3158 case '?':
3159 if (matchCodeUnit('.')) {
3160 unit = getCodeUnit();
3161 if (IsAsciiDigit(unit)) {
3162 // if the code unit is followed by a number, for example it has the
3163 // following form `<...> ?.5 <..> then it should be treated as a
3164 // ternary rather than as an optional chain
3165 simpleKind = TokenKind::Hook;
3166 ungetCodeUnit(unit);
3167 ungetCodeUnit('.');
3168 } else {
3169 ungetCodeUnit(unit);
3170 simpleKind = TokenKind::OptionalChain;
3171 }
3172 } else if (matchCodeUnit('?')) {
3173 simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign
3174 : TokenKind::Coalesce;
3175 } else {
3176 simpleKind = TokenKind::Hook;
3177 }
3178 break;
3179
3180 case '!':
3181 if (matchCodeUnit('=')) {
3182 simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne;
3183 } else {
3184 simpleKind = TokenKind::Not;
3185 }
3186 break;
3187
3188 case '<':
3189 if (anyCharsAccess().options().allowHTMLComments) {
3190 // Treat HTML begin-comment as comment-till-end-of-line.
3191 if (matchCodeUnit('!')) {
3192 if (matchCodeUnit('-')) {
3193 if (matchCodeUnit('-')) {
3194 this->sourceUnits.consumeRestOfSingleLineComment();
3195 continue;
3196 }
3197 ungetCodeUnit('-');
3198 }
3199 ungetCodeUnit('!');
3200 }
3201 }
3202 if (matchCodeUnit('<')) {
3203 simpleKind =
3204 matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh;
3205 } else {
3206 simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt;
3207 }
3208 break;
3209
3210 case '>':
3211 if (matchCodeUnit('>')) {
3212 if (matchCodeUnit('>')) {
3213 simpleKind =
3214 matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh;
3215 } else {
3216 simpleKind =
3217 matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh;
3218 }
3219 } else {
3220 simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt;
3221 }
3222 break;
3223
3224 case '*':
3225 if (matchCodeUnit('*')) {
3226 simpleKind =
3227 matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow;
3228 } else {
3229 simpleKind =
3230 matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul;
3231 }
3232 break;
3233
3234 case '/':
3235 // Look for a single-line comment.
3236 if (matchCodeUnit('/')) {
3237 unit = getCodeUnit();
3238 if (unit == '@' || unit == '#') {
3239 bool shouldWarn = unit == '@';
3240 if (!getDirectives(false, shouldWarn)) {
3241 return false;
3242 }
3243 } else {
3244 // NOTE: |unit| may be EOF here.
3245 ungetCodeUnit(unit);
3246 }
3247
3248 this->sourceUnits.consumeRestOfSingleLineComment();
3249 continue;
3250 }
3251
3252 // Look for a multi-line comment.
3253 if (matchCodeUnit('*')) {
3254 TokenStreamAnyChars& anyChars = anyCharsAccess();
3255 unsigned linenoBefore = anyChars.lineno;
3256
3257 do {
3258 int32_t unit = getCodeUnit();
3259 if (unit == EOF) {
3260 error(JSMSG_UNTERMINATED_COMMENT);
3261 return badToken();
3262 }
3263
3264 if (unit == '*' && matchCodeUnit('/')) {
3265 break;
3266 }
3267
3268 if (unit == '@' || unit == '#') {
3269 bool shouldWarn = unit == '@';
3270 if (!getDirectives(true, shouldWarn)) {
3271 return badToken();
3272 }
3273 } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3274 int32_t codePoint;
3275 if (!getFullAsciiCodePoint(unit, &codePoint)) {
3276 return badToken();
3277 }
3278 } else {
3279 int32_t codePoint;
3280 if (!getNonAsciiCodePoint(unit, &codePoint)) {
3281 return badToken();
3282 }
3283 }
3284 } while (true);
3285
3286 if (linenoBefore != anyChars.lineno) {
3287 anyChars.updateFlagsForEOL();
3288 }
3289
3290 continue;
3291 }
3292
3293 // Look for a regexp.
3294 if (modifier == SlashIsRegExp) {
3295 return regexpLiteral(start, ttp);
3296 }
3297
3298 simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div;
3299 break;
3300
3301 case '%':
3302 simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod;
3303 break;
3304
3305 case '-':
3306 if (matchCodeUnit('-')) {
3307 if (anyCharsAccess().options().allowHTMLComments &&
3308 !anyCharsAccess().flags.isDirtyLine) {
3309 if (matchCodeUnit('>')) {
3310 this->sourceUnits.consumeRestOfSingleLineComment();
3311 continue;
3312 }
3313 }
3314
3315 simpleKind = TokenKind::Dec;
3316 } else {
3317 simpleKind =
3318 matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
3319 }
3320 break;
3321
3322 default:
3323 // We consumed a bad ASCII code point/unit. Put it back so the
3324 // error location is the bad code point.
3325 ungetCodeUnit(unit);
3326 reportIllegalCharacter(unit);
3327 return badToken();
3328 } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3329
3330 MOZ_ASSERT(simpleKind != TokenKind::Limit,
3331 "switch-statement should have set |simpleKind| before "
3332 "breaking");
3333
3334 newSimpleToken(simpleKind, start, modifier, ttp);
3335 return true;
3336 } while (true);
3337 }
3338
// Scan the rest of a string literal or template-literal chunk and create the
// corresponding token.
//
// |untilChar| is the delimiter that ends the literal: '\'' or '"' for a
// string literal, '`' for a template.  The token start is taken one code unit
// before the current position.  The literal's contents, with escape sequences
// decoded, are accumulated in |charBuffer| and turned into an atom.
//
// On success this stores a String, TemplateHead (the chunk ended at "${") or
// NoSubsTemplate token kind through |out| and returns true.  Every other exit
// returns false after the scope-exit guard has called badToken().
//
// For templates, malformed escapes are not reported here: they are recorded
// with setInvalidTemplateEscape and scanning continues.
template <typename Unit, class AnyCharsAccess>
bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
    char untilChar, Modifier modifier, TokenKind* out) {
  MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`',
             "unexpected string/template literal delimiter");

  bool parsingTemplate = (untilChar == '`');
  bool templateHead = false;

  TokenStart start(this->sourceUnits, -1);
  this->charBuffer.clear();

  // Run the bad-token code for every path out of this function except the
  // one success-case.
  auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });

  // Reports |errnum| with the doubled delimiter (e.g. "''") as its argument.
  auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) {
    // Unicode separators aren't end-of-line in template or (as of
    // recently) string literals, so this assertion doesn't allow them.
    MOZ_ASSERT(this->sourceUnits.atEnd() ||
                   this->sourceUnits.peekCodeUnit() == Unit('\r') ||
                   this->sourceUnits.peekCodeUnit() == Unit('\n'),
               "must be parked at EOF or EOL to call this function");

    // The various errors reported here include language like "in a ''
    // literal" or similar, with '' being '', "", or `` as appropriate.
    const char delimiters[] = {untilChar, untilChar, '\0'};

    this->error(errnum, delimiters);
    return;
  };

  // We need to detect any of these chars:  " or ', \n (or its
  // equivalents), \\, EOF.  Because we detect EOL sequences here and
  // put them back immediately, we can use getCodeUnit().
  int32_t unit;
  while ((unit = getCodeUnit()) != untilChar) {
    if (unit == EOF) {
      ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL);
      return false;
    }

    // Non-ASCII code points are always directly appended -- even
    // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are
    // ordinarily LineTerminatorSequences.  (They contribute their literal
    // values to template and [as of recently] string literals, but they're
    // line terminators when computing line/column coordinates.)  Handle
    // the non-ASCII case early for readability.
    if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
      char32_t cp;
      if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
        return false;
      }

      if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
                       cp == unicode::PARA_SEPARATOR)) {
        if (!updateLineInfoForEOL()) {
          return false;
        }

        anyCharsAccess().updateFlagsForEOL();
      } else {
        MOZ_ASSERT(!IsLineTerminator(cp));
      }

      if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
        return false;
      }

      continue;
    }

    if (unit == '\\') {
      // When parsing templates, we don't immediately report errors for
      // invalid escapes; these are handled by the parser.  We don't
      // append to charBuffer in those cases because it won't be read.
      unit = getCodeUnit();
      if (unit == EOF) {
        ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
        return false;
      }

      // Non-ASCII |unit| isn't handled by code after this, so dedicate
      // an unlikely special-case to it and then continue.
      if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
        int32_t codePoint;
        if (!getNonAsciiCodePoint(unit, &codePoint)) {
          return false;
        }

        // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH
        // SEPARATOR, they'll be normalized to '\n'.  '\' followed by
        // LineContinuation represents no code points, so don't append
        // in this case.
        if (codePoint != '\n') {
          if (!AppendCodePointToCharBuffer(this->charBuffer,
                                           AssertedCast<char32_t>(codePoint))) {
            return false;
          }
        }

        continue;
      }

      // The block above eliminated all non-ASCII, so cast to the
      // smallest type possible to assist the C++ compiler.
      //
      // Cases that |break| out of this switch leave the decoded value in
      // |unit|, which is appended to charBuffer right after the switch.
      // Cases that |continue| the enclosing loop append nothing here.
      switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
        // Single-character escapes.
        case 'b':
          unit = '\b';
          break;
        case 'f':
          unit = '\f';
          break;
        case 'n':
          unit = '\n';
          break;
        case 'r':
          unit = '\r';
          break;
        case 't':
          unit = '\t';
          break;
        case 'v':
          unit = '\v';
          break;

        case '\r':
          // Consume the '\n' of a "\r\n" pair, if present.
          matchLineTerminator('\n');
          [[fallthrough]];
        case '\n': {
          // LineContinuation represents no code points.  We're manually
          // consuming a LineTerminatorSequence, so we must manually
          // update line/column info.
          if (!updateLineInfoForEOL()) {
            return false;
          }

          continue;
        }

        // Unicode character specification.
        case 'u': {
          int32_t c2 = getCodeUnit();
          if (c2 == EOF) {
            ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
            return false;
          }

          // First handle a delimited Unicode escape, e.g. \u{1F4A9}.
          if (c2 == '{') {
            // Offset of the escape's leading '\': three code units ("\u{")
            // have been consumed since it.  Note this local shadows the
            // function-level |start| token position.
            uint32_t start = this->sourceUnits.offset() - 3;
            uint32_t code = 0;
            bool first = true;
            bool valid = true;
            do {
              int32_t u3 = getCodeUnit();
              if (u3 == EOF) {
                if (parsingTemplate) {
                  TokenStreamAnyChars& anyChars = anyCharsAccess();
                  anyChars.setInvalidTemplateEscape(start,
                                                    InvalidEscapeType::Unicode);
                  valid = false;
                  break;
                }
                reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
                return false;
              }
              if (u3 == '}') {
                // "\u{}" with no digits at all is invalid.
                if (first) {
                  if (parsingTemplate) {
                    TokenStreamAnyChars& anyChars = anyCharsAccess();
                    anyChars.setInvalidTemplateEscape(
                        start, InvalidEscapeType::Unicode);
                    valid = false;
                    break;
                  }
                  reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
                  return false;
                }
                break;
              }

              // Beware: |u3| may be a non-ASCII code point here; if
              // so it'll pass into this |if|-block.
              if (!IsAsciiHexDigit(u3)) {
                if (parsingTemplate) {
                  // We put the code unit back so that we read it
                  // on the next pass, which matters if it was
                  // '`' or '\'.
                  ungetCodeUnit(u3);

                  TokenStreamAnyChars& anyChars = anyCharsAccess();
                  anyChars.setInvalidTemplateEscape(start,
                                                    InvalidEscapeType::Unicode);
                  valid = false;
                  break;
                }
                reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
                return false;
              }

              // Accumulate the hex digit, rejecting values beyond the
              // largest code point as soon as they occur.
              code = (code << 4) | AsciiAlphanumericToNumber(u3);
              if (code > unicode::NonBMPMax) {
                if (parsingTemplate) {
                  TokenStreamAnyChars& anyChars = anyCharsAccess();
                  anyChars.setInvalidTemplateEscape(
                      start + 3, InvalidEscapeType::UnicodeOverflow);
                  valid = false;
                  break;
                }
                reportInvalidEscapeError(start + 3,
                                         InvalidEscapeType::UnicodeOverflow);
                return false;
              }

              first = false;
            } while (true);

            // An invalid escape in a template was recorded above; resume
            // scanning the literal without appending anything.
            if (!valid) {
              continue;
            }

            MOZ_ASSERT(code <= unicode::NonBMPMax);
            if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
              return false;
            }

            continue;
          }  // end of delimited Unicode escape handling

          // Otherwise it must be a fixed-length \uXXXX Unicode escape.
          // If it isn't, this is usually an error -- but if this is a
          // template literal, we must defer error reporting because
          // malformed escapes are okay in *tagged* template literals.
          char16_t v;
          if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) {
            // |c2| is the most significant of the four hex digits; |v| holds
            // the remaining three.
            unit = (AsciiAlphanumericToNumber(c2) << 12) | v;
          } else {
            // Beware: |c2| may not be an ASCII code point here!
            ungetCodeUnit(c2);
            uint32_t start = this->sourceUnits.offset() - 2;
            if (parsingTemplate) {
              TokenStreamAnyChars& anyChars = anyCharsAccess();
              anyChars.setInvalidTemplateEscape(start,
                                                InvalidEscapeType::Unicode);
              continue;
            }
            reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
            return false;
          }
          break;
        }  // case 'u'

        // Hexadecimal character specification.
        case 'x': {
          char16_t v;
          if (this->sourceUnits.matchHexDigits(2, &v)) {
            unit = v;
          } else {
            uint32_t start = this->sourceUnits.offset() - 2;
            if (parsingTemplate) {
              TokenStreamAnyChars& anyChars = anyCharsAccess();
              anyChars.setInvalidTemplateEscape(start,
                                                InvalidEscapeType::Hexadecimal);
              continue;
            }
            reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal);
            return false;
          }
          break;
        }

        default: {
          if (!IsAsciiOctal(unit)) {
            // \8 or \9 in an untagged template literal is a syntax error,
            // reported in GeneralParser::noSubstitutionUntaggedTemplate.
            //
            // Tagged template literals, however, may contain \8 and \9.  The
            // "cooked" representation of such a part will be |undefined|, and
            // the "raw" representation will contain the literal characters.
            //
            //   function f(parts) {
            //     assertEq(parts[0], undefined);
            //     assertEq(parts.raw[0], "\\8");
            //     return "composed";
            //   }
            //   assertEq(f`\8`, "composed");
            if (unit == '8' || unit == '9') {
              TokenStreamAnyChars& anyChars = anyCharsAccess();
              if (parsingTemplate) {
                anyChars.setInvalidTemplateEscape(
                    this->sourceUnits.offset() - 2,
                    InvalidEscapeType::EightOrNine);
                continue;
              }

              // \8 and \9 are forbidden in string literals in strict mode code.
              if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) {
                return false;
              }

              // The above test doesn't catch a few edge cases; see
              // |GeneralParser::maybeParseDirective|.  Record the violation so
              // that that function can handle them.
              anyChars.setSawDeprecatedEightOrNineEscape();
            }
            // Any other escaped ASCII character (and, outside templates,
            // '8' or '9') stands for itself: |unit| is appended unchanged.
            break;
          }

          // Octal character specification.
          int32_t val = AsciiOctalToNumber(unit);

          unit = peekCodeUnit();
          if (MOZ_UNLIKELY(unit == EOF)) {
            ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
            return false;
          }

          // Strict mode code allows only \0 followed by a non-digit.
          if (val != 0 || IsAsciiDigit(unit)) {
            TokenStreamAnyChars& anyChars = anyCharsAccess();
            if (parsingTemplate) {
              anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2,
                                                InvalidEscapeType::Octal);
              continue;
            }

            if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) {
              return false;
            }

            // The above test doesn't catch a few edge cases; see
            // |GeneralParser::maybeParseDirective|.  Record the violation so
            // that that function can handle them.
            anyChars.setSawDeprecatedOctalEscape();
          }

          // Consume up to two further octal digits; the third digit is only
          // consumed when the resulting value still fits in 0xFF.
          if (IsAsciiOctal(unit)) {
            val = 8 * val + AsciiOctalToNumber(unit);
            consumeKnownCodeUnit(unit);

            unit = peekCodeUnit();
            if (MOZ_UNLIKELY(unit == EOF)) {
              ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
              return false;
            }

            if (IsAsciiOctal(unit)) {
              int32_t save = val;
              val = 8 * val + AsciiOctalToNumber(unit);
              if (val <= 0xFF) {
                consumeKnownCodeUnit(unit);
              } else {
                val = save;
              }
            }
          }

          unit = char16_t(val);
          break;
        }  // default
      }    // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))

      // Append the value decoded from the escape sequence.
      if (!this->charBuffer.append(unit)) {
        return false;
      }

      continue;
    }  // (unit == '\\')

    if (unit == '\r' || unit == '\n') {
      if (!parsingTemplate) {
        // String literals don't allow ASCII line breaks.
        ungetCodeUnit(unit);
        ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING);
        return false;
      }

      // In a template, "\r" and "\r\n" both contribute a single '\n'.
      if (unit == '\r') {
        unit = '\n';
        matchLineTerminator('\n');
      }

      if (!updateLineInfoForEOL()) {
        return false;
      }

      anyCharsAccess().updateFlagsForEOL();
    } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) {
      // "${" ends this chunk of the template; the token created below will
      // be a TemplateHead.
      templateHead = true;
      break;
    }

    // An ordinary ASCII code unit (or the normalized '\n' from above).
    if (!this->charBuffer.append(unit)) {
      return false;
    }
  }

  TaggedParserAtomIndex atom = drainCharBufferIntoAtom();
  if (!atom) {
    return false;
  }

  // Success: disarm the guard so that badToken() is not called.
  noteBadToken.release();

  MOZ_ASSERT_IF(!parsingTemplate, !templateHead);

  TokenKind kind = !parsingTemplate ? TokenKind::String
                   : templateHead   ? TokenKind::TemplateHead
                                    : TokenKind::NoSubsTemplate;
  newAtomToken(kind, atom, start, modifier, out);
  return true;
}
3752
TokenKindToDesc(TokenKind tt)3753 const char* TokenKindToDesc(TokenKind tt) {
3754 switch (tt) {
3755 #define EMIT_CASE(name, desc) \
3756 case TokenKind::name: \
3757 return desc;
3758 FOR_EACH_TOKEN_KIND(EMIT_CASE)
3759 #undef EMIT_CASE
3760 case TokenKind::Limit:
3761 MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
3762 break;
3763 }
3764
3765 return "<bad TokenKind>";
3766 }
3767
#ifdef DEBUG
// Debug-only: map a TokenKind to its qualified enumerator spelling, e.g.
// "TokenKind::Eof".  TokenKind::Limit and any value not covered by the switch
// yield "<bad TokenKind>".
const char* TokenKindToString(TokenKind tt) {
  const char* spelling = "<bad TokenKind>";

  switch (tt) {
#  define TOKEN_NAME_CASE(kindName, kindDesc) \
    case TokenKind::kindName:                 \
      spelling = "TokenKind::" #kindName;     \
      break;
    FOR_EACH_TOKEN_KIND(TOKEN_NAME_CASE)
#  undef TOKEN_NAME_CASE
    case TokenKind::Limit:
      break;
  }

  return spelling;
}
#endif
3783
// Explicit instantiation definitions of the token stream class templates.
// NOTE(review): presumably required because member functions of these
// templates are defined in this source file rather than in the header --
// confirm against TokenStream.h.

// The base class, for both supported code unit types.
template class TokenStreamCharsBase<Utf8Unit>;
template class TokenStreamCharsBase<char16_t>;

// The char16_t classes using TokenStreamAnyCharsAccess.
template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>;

// GeneralTokenStreamChars using ParserAnyCharsAccess, for every combination
// of parse handler (Full/Syntax) and code unit type (Utf8Unit/char16_t).
template class GeneralTokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
template class GeneralTokenStreamChars<
    Utf8Unit,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
template class GeneralTokenStreamChars<
    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
template class GeneralTokenStreamChars<
    char16_t,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;

// TokenStreamChars, same four combinations.
template class TokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
template class TokenStreamChars<
    Utf8Unit,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
template class TokenStreamChars<
    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
template class TokenStreamChars<
    char16_t,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;

// TokenStreamSpecific, same four combinations.
template class TokenStreamSpecific<
    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
template class TokenStreamSpecific<
    Utf8Unit,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
template class TokenStreamSpecific<
    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
template class TokenStreamSpecific<
    char16_t,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3823
3824 } // namespace frontend
3825
3826 } // namespace js
3827