1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 // JS lexical scanner.
8
9 #include "frontend/TokenStream.h"
10
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/Attributes.h"
13 #include "mozilla/Likely.h"
14 #include "mozilla/Maybe.h"
15 #include "mozilla/MemoryChecking.h"
16 #include "mozilla/ScopeExit.h"
17 #include "mozilla/Span.h"
18 #include "mozilla/TemplateLib.h"
19 #include "mozilla/TextUtils.h"
20 #include "mozilla/Utf8.h"
21
22 #include <algorithm>
23 #include <iterator>
24 #include <stdarg.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <type_traits>
28 #include <utility>
29
30 #include "jsnum.h"
31
32 #include "frontend/BytecodeCompiler.h"
33 #include "frontend/Parser.h"
34 #include "frontend/ParserAtom.h"
35 #include "frontend/ReservedWords.h"
36 #include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_*
37 #include "js/Printf.h" // JS_smprintf
38 #include "js/RegExpFlags.h" // JS::RegExpFlags
39 #include "js/UniquePtr.h"
40 #include "util/Text.h"
41 #include "util/Unicode.h"
42 #include "vm/FrameIter.h" // js::{,NonBuiltin}FrameIter
43 #include "vm/JSContext.h"
44 #include "vm/Realm.h"
45 #include "vm/WellKnownAtom.h" // js_*_str
46
47 using mozilla::AsciiAlphanumericToNumber;
48 using mozilla::AssertedCast;
49 using mozilla::DecodeOneUtf8CodePoint;
50 using mozilla::IsAscii;
51 using mozilla::IsAsciiAlpha;
52 using mozilla::IsAsciiDigit;
53 using mozilla::IsAsciiHexDigit;
54 using mozilla::IsTrailingUnit;
55 using mozilla::MakeScopeExit;
56 using mozilla::Maybe;
57 using mozilla::PointerRangeSize;
58 using mozilla::Span;
59 using mozilla::Utf8Unit;
60
61 using JS::ReadOnlyCompileOptions;
62 using JS::RegExpFlag;
63 using JS::RegExpFlags;
64
65 struct ReservedWordInfo {
66 const char* chars; // C string with reserved word text
67 js::frontend::TokenKind tokentype;
68 };
69
70 static const ReservedWordInfo reservedWords[] = {
71 #define RESERVED_WORD_INFO(word, name, type) \
72 {js_##word##_str, js::frontend::type},
73 FOR_EACH_JAVASCRIPT_RESERVED_WORD(RESERVED_WORD_INFO)
74 #undef RESERVED_WORD_INFO
75 };
76
77 enum class ReservedWordsIndex : size_t {
78 #define ENTRY_(_1, NAME, _3) NAME,
79 FOR_EACH_JAVASCRIPT_RESERVED_WORD(ENTRY_)
80 #undef ENTRY_
81 };
82
83 // Returns a ReservedWordInfo for the specified characters, or nullptr if the
84 // string is not a reserved word.
85 template <typename CharT>
FindReservedWord(const CharT * s,size_t length)86 static const ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
87 MOZ_ASSERT(length != 0);
88
89 size_t i;
90 const ReservedWordInfo* rw;
91 const char* chars;
92
93 #define JSRW_LENGTH() length
94 #define JSRW_AT(column) s[column]
95 #define JSRW_GOT_MATCH(index) \
96 i = (index); \
97 goto got_match;
98 #define JSRW_TEST_GUESS(index) \
99 i = (index); \
100 goto test_guess;
101 #define JSRW_NO_MATCH() goto no_match;
102 #include "frontend/ReservedWordsGenerated.h"
103 #undef JSRW_NO_MATCH
104 #undef JSRW_TEST_GUESS
105 #undef JSRW_GOT_MATCH
106 #undef JSRW_AT
107 #undef JSRW_LENGTH
108
109 got_match:
110 return &reservedWords[i];
111
112 test_guess:
113 rw = &reservedWords[i];
114 chars = rw->chars;
115 do {
116 if (*s++ != static_cast<unsigned char>(*chars++)) {
117 goto no_match;
118 }
119 } while (--length != 0);
120 return rw;
121
122 no_match:
123 return nullptr;
124 }
125
126 template <>
FindReservedWord(const Utf8Unit * units,size_t length)127 MOZ_ALWAYS_INLINE const ReservedWordInfo* FindReservedWord<Utf8Unit>(
128 const Utf8Unit* units, size_t length) {
129 return FindReservedWord(Utf8AsUnsignedChars(units), length);
130 }
131
FindReservedWord(const js::frontend::TaggedParserAtomIndex atom)132 static const ReservedWordInfo* FindReservedWord(
133 const js::frontend::TaggedParserAtomIndex atom) {
134 switch (atom.rawData()) {
135 #define CASE_(_1, NAME, _3) \
136 case js::frontend::TaggedParserAtomIndex::WellKnownRawData::NAME(): \
137 return &reservedWords[size_t(ReservedWordsIndex::NAME)];
138 FOR_EACH_JAVASCRIPT_RESERVED_WORD(CASE_)
139 #undef CASE_
140 }
141
142 return nullptr;
143 }
144
GetSingleCodePoint(const char16_t ** p,const char16_t * end)145 static uint32_t GetSingleCodePoint(const char16_t** p, const char16_t* end) {
146 using namespace js;
147
148 uint32_t codePoint;
149 if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(**p)) && *p + 1 < end) {
150 char16_t lead = **p;
151 char16_t maybeTrail = *(*p + 1);
152 if (unicode::IsTrailSurrogate(maybeTrail)) {
153 *p += 2;
154 return unicode::UTF16Decode(lead, maybeTrail);
155 }
156 }
157
158 codePoint = **p;
159 (*p)++;
160 return codePoint;
161 }
162
// True iff |c| is the ASCII digit '0' or '1'.  The comparison is performed on
// the unsigned counterpart of |CharT|.
template <typename CharT>
static constexpr bool IsAsciiBinary(CharT c) {
  using Unsigned = std::make_unsigned_t<CharT>;
  const auto value = static_cast<Unsigned>(c);
  return value == '0' || value == '1';
}
169
// True iff |c| is one of the ASCII digits '0' through '7'.  The range check is
// performed on the unsigned counterpart of |CharT|.
template <typename CharT>
static constexpr bool IsAsciiOctal(CharT c) {
  using Unsigned = std::make_unsigned_t<CharT>;
  const auto value = static_cast<Unsigned>(c);
  return value >= '0' && value <= '7';
}
176
// Converts an ASCII digit character to its numeric value by subtracting '0'
// from the unsigned counterpart of |c|.  No range check is performed here.
template <typename CharT>
static constexpr uint8_t AsciiOctalToNumber(CharT c) {
  using Unsigned = std::make_unsigned_t<CharT>;
  const auto value = static_cast<Unsigned>(c);
  return static_cast<uint8_t>(value - '0');
}
183
184 namespace js {
185
186 namespace frontend {
187
IsIdentifier(JSLinearString * str)188 bool IsIdentifier(JSLinearString* str) {
189 JS::AutoCheckCannotGC nogc;
190 MOZ_ASSERT(str);
191 if (str->hasLatin1Chars()) {
192 return IsIdentifier(str->latin1Chars(nogc), str->length());
193 }
194 return IsIdentifier(str->twoByteChars(nogc), str->length());
195 }
196
IsIdentifierNameOrPrivateName(JSLinearString * str)197 bool IsIdentifierNameOrPrivateName(JSLinearString* str) {
198 JS::AutoCheckCannotGC nogc;
199 MOZ_ASSERT(str);
200 if (str->hasLatin1Chars()) {
201 return IsIdentifierNameOrPrivateName(str->latin1Chars(nogc), str->length());
202 }
203 return IsIdentifierNameOrPrivateName(str->twoByteChars(nogc), str->length());
204 }
205
IsIdentifier(const Latin1Char * chars,size_t length)206 bool IsIdentifier(const Latin1Char* chars, size_t length) {
207 if (length == 0) {
208 return false;
209 }
210
211 if (!unicode::IsIdentifierStart(char16_t(*chars))) {
212 return false;
213 }
214
215 const Latin1Char* end = chars + length;
216 while (++chars != end) {
217 if (!unicode::IsIdentifierPart(char16_t(*chars))) {
218 return false;
219 }
220 }
221
222 return true;
223 }
224
IsIdentifierASCII(char c)225 bool IsIdentifierASCII(char c) { return unicode::IsIdentifierStartASCII(c); }
226
IsIdentifierASCII(char c1,char c2)227 bool IsIdentifierASCII(char c1, char c2) {
228 return unicode::IsIdentifierStartASCII(c1) &&
229 unicode::IsIdentifierPartASCII(c2);
230 }
231
IsIdentifierNameOrPrivateName(const Latin1Char * chars,size_t length)232 bool IsIdentifierNameOrPrivateName(const Latin1Char* chars, size_t length) {
233 if (length == 0) {
234 return false;
235 }
236
237 // Skip over any private name marker.
238 if (*chars == '#') {
239 ++chars;
240 --length;
241 }
242
243 return IsIdentifier(chars, length);
244 }
245
IsIdentifier(const char16_t * chars,size_t length)246 bool IsIdentifier(const char16_t* chars, size_t length) {
247 if (length == 0) {
248 return false;
249 }
250
251 const char16_t* p = chars;
252 const char16_t* end = chars + length;
253 uint32_t codePoint;
254
255 codePoint = GetSingleCodePoint(&p, end);
256 if (!unicode::IsIdentifierStart(codePoint)) {
257 return false;
258 }
259
260 while (p < end) {
261 codePoint = GetSingleCodePoint(&p, end);
262 if (!unicode::IsIdentifierPart(codePoint)) {
263 return false;
264 }
265 }
266
267 return true;
268 }
269
IsIdentifierNameOrPrivateName(const char16_t * chars,size_t length)270 bool IsIdentifierNameOrPrivateName(const char16_t* chars, size_t length) {
271 if (length == 0) {
272 return false;
273 }
274
275 const char16_t* p = chars;
276 const char16_t* end = chars + length;
277 uint32_t codePoint;
278
279 codePoint = GetSingleCodePoint(&p, end);
280
281 // Skip over any private name marker.
282 if (codePoint == '#') {
283 // The identifier part of a private name mustn't be empty.
284 if (length == 1) {
285 return false;
286 }
287
288 codePoint = GetSingleCodePoint(&p, end);
289 }
290
291 if (!unicode::IsIdentifierStart(codePoint)) {
292 return false;
293 }
294
295 while (p < end) {
296 codePoint = GetSingleCodePoint(&p, end);
297 if (!unicode::IsIdentifierPart(codePoint)) {
298 return false;
299 }
300 }
301
302 return true;
303 }
304
IsKeyword(TaggedParserAtomIndex atom)305 bool IsKeyword(TaggedParserAtomIndex atom) {
306 if (const ReservedWordInfo* rw = FindReservedWord(atom)) {
307 return TokenKindIsKeyword(rw->tokentype);
308 }
309
310 return false;
311 }
312
ReservedWordTokenKind(TaggedParserAtomIndex name)313 TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name) {
314 if (const ReservedWordInfo* rw = FindReservedWord(name)) {
315 return rw->tokentype;
316 }
317
318 return TokenKind::Limit;
319 }
320
ReservedWordToCharZ(TaggedParserAtomIndex name)321 const char* ReservedWordToCharZ(TaggedParserAtomIndex name) {
322 if (const ReservedWordInfo* rw = FindReservedWord(name)) {
323 return ReservedWordToCharZ(rw->tokentype);
324 }
325
326 return nullptr;
327 }
328
ReservedWordToCharZ(TokenKind tt)329 const char* ReservedWordToCharZ(TokenKind tt) {
330 MOZ_ASSERT(tt != TokenKind::Name);
331 switch (tt) {
332 #define EMIT_CASE(word, name, type) \
333 case type: \
334 return js_##word##_str;
335 FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
336 #undef EMIT_CASE
337 default:
338 MOZ_ASSERT_UNREACHABLE("Not a reserved word PropertyName.");
339 }
340 return nullptr;
341 }
342
reservedWordToPropertyName(TokenKind tt) const343 TaggedParserAtomIndex TokenStreamAnyChars::reservedWordToPropertyName(
344 TokenKind tt) const {
345 MOZ_ASSERT(tt != TokenKind::Name);
346 switch (tt) {
347 #define EMIT_CASE(word, name, type) \
348 case type: \
349 return TaggedParserAtomIndex::WellKnown::name();
350 FOR_EACH_JAVASCRIPT_RESERVED_WORD(EMIT_CASE)
351 #undef EMIT_CASE
352 default:
353 MOZ_ASSERT_UNREACHABLE("Not a reserved word TokenKind.");
354 }
355 return TaggedParserAtomIndex::null();
356 }
357
SourceCoords(JSContext * cx,uint32_t initialLineNumber,uint32_t initialOffset)358 SourceCoords::SourceCoords(JSContext* cx, uint32_t initialLineNumber,
359 uint32_t initialOffset)
360 : lineStartOffsets_(cx), initialLineNum_(initialLineNumber), lastIndex_(0) {
361 // This is actually necessary! Removing it causes compile errors on
362 // GCC and clang. You could try declaring this:
363 //
364 // const uint32_t SourceCoords::MAX_PTR;
365 //
366 // which fixes the GCC/clang error, but causes bustage on Windows. Sigh.
367 //
368 uint32_t maxPtr = MAX_PTR;
369
370 // The first line begins at buffer offset |initialOffset|. MAX_PTR is the
371 // sentinel. The appends cannot fail because |lineStartOffsets_| has
372 // statically-allocated elements.
373 MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
374 MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
375 lineStartOffsets_.infallibleAppend(initialOffset);
376 lineStartOffsets_.infallibleAppend(maxPtr);
377 }
378
add(uint32_t lineNum,uint32_t lineStartOffset)379 MOZ_ALWAYS_INLINE bool SourceCoords::add(uint32_t lineNum,
380 uint32_t lineStartOffset) {
381 uint32_t index = indexFromLineNumber(lineNum);
382 uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
383
384 MOZ_ASSERT(lineStartOffsets_[0] <= lineStartOffset);
385 MOZ_ASSERT(lineStartOffsets_[sentinelIndex] == MAX_PTR);
386
387 if (index == sentinelIndex) {
388 // We haven't seen this newline before. Update lineStartOffsets_
389 // only if lineStartOffsets_.append succeeds, to keep sentinel.
390 // Otherwise return false to tell TokenStream about OOM.
391 uint32_t maxPtr = MAX_PTR;
392 if (!lineStartOffsets_.append(maxPtr)) {
393 static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
394 TempAllocPolicy&>,
395 "this function's caller depends on it reporting an "
396 "error on failure, as TempAllocPolicy ensures");
397 return false;
398 }
399
400 lineStartOffsets_[index] = lineStartOffset;
401 } else {
402 // We have seen this newline before (and ungot it). Do nothing (other
403 // than checking it hasn't mysteriously changed).
404 // This path can be executed after hitting OOM, so check index.
405 MOZ_ASSERT_IF(index < sentinelIndex,
406 lineStartOffsets_[index] == lineStartOffset);
407 }
408 return true;
409 }
410
fill(const SourceCoords & other)411 MOZ_ALWAYS_INLINE bool SourceCoords::fill(const SourceCoords& other) {
412 MOZ_ASSERT(lineStartOffsets_[0] == other.lineStartOffsets_[0]);
413 MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR);
414 MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR);
415
416 if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) {
417 return true;
418 }
419
420 uint32_t sentinelIndex = lineStartOffsets_.length() - 1;
421 lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex];
422
423 for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
424 i++) {
425 if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) {
426 return false;
427 }
428 }
429 return true;
430 }
431
432 MOZ_ALWAYS_INLINE uint32_t
indexFromOffset(uint32_t offset) const433 SourceCoords::indexFromOffset(uint32_t offset) const {
434 uint32_t iMin, iMax, iMid;
435
436 if (lineStartOffsets_[lastIndex_] <= offset) {
437 // If we reach here, offset is on a line the same as or higher than
438 // last time. Check first for the +0, +1, +2 cases, because they
439 // typically cover 85--98% of cases.
440 if (offset < lineStartOffsets_[lastIndex_ + 1]) {
441 return lastIndex_; // index is same as last time
442 }
443
444 // If we reach here, there must be at least one more entry (plus the
445 // sentinel). Try it.
446 lastIndex_++;
447 if (offset < lineStartOffsets_[lastIndex_ + 1]) {
448 return lastIndex_; // index is one higher than last time
449 }
450
451 // The same logic applies here.
452 lastIndex_++;
453 if (offset < lineStartOffsets_[lastIndex_ + 1]) {
454 return lastIndex_; // index is two higher than last time
455 }
456
457 // No luck. Oh well, we have a better-than-default starting point for
458 // the binary search.
459 iMin = lastIndex_ + 1;
460 MOZ_ASSERT(iMin <
461 lineStartOffsets_.length() - 1); // -1 due to the sentinel
462
463 } else {
464 iMin = 0;
465 }
466
467 // This is a binary search with deferred detection of equality, which was
468 // marginally faster in this case than a standard binary search.
469 // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we
470 // want one before that.
471 iMax = lineStartOffsets_.length() - 2;
472 while (iMax > iMin) {
473 iMid = iMin + (iMax - iMin) / 2;
474 if (offset >= lineStartOffsets_[iMid + 1]) {
475 iMin = iMid + 1; // offset is above lineStartOffsets_[iMid]
476 } else {
477 iMax = iMid; // offset is below or within lineStartOffsets_[iMid]
478 }
479 }
480
481 MOZ_ASSERT(iMax == iMin);
482 MOZ_ASSERT(lineStartOffsets_[iMin] <= offset);
483 MOZ_ASSERT(offset < lineStartOffsets_[iMin + 1]);
484
485 lastIndex_ = iMin;
486 return iMin;
487 }
488
lineToken(uint32_t offset) const489 SourceCoords::LineToken SourceCoords::lineToken(uint32_t offset) const {
490 return LineToken(indexFromOffset(offset), offset);
491 }
492
TokenStreamAnyChars(JSContext * cx,const ReadOnlyCompileOptions & options,StrictModeGetter * smg)493 TokenStreamAnyChars::TokenStreamAnyChars(JSContext* cx,
494 const ReadOnlyCompileOptions& options,
495 StrictModeGetter* smg)
496 : cx(cx),
497 options_(options),
498 strictModeGetter_(smg),
499 filename_(options.filename()),
500 longLineColumnInfo_(cx),
501 srcCoords(cx, options.lineno, options.scriptSourceOffset),
502 lineno(options.lineno),
503 mutedErrors(options.mutedErrors()) {
504 // |isExprEnding| was initially zeroed: overwrite the true entries here.
505 isExprEnding[size_t(TokenKind::Comma)] = true;
506 isExprEnding[size_t(TokenKind::Semi)] = true;
507 isExprEnding[size_t(TokenKind::Colon)] = true;
508 isExprEnding[size_t(TokenKind::RightParen)] = true;
509 isExprEnding[size_t(TokenKind::RightBracket)] = true;
510 isExprEnding[size_t(TokenKind::RightCurly)] = true;
511 }
512
513 template <typename Unit>
TokenStreamCharsBase(JSContext * cx,ParserAtomsTable * pasrerAtoms,const Unit * units,size_t length,size_t startOffset)514 TokenStreamCharsBase<Unit>::TokenStreamCharsBase(JSContext* cx,
515 ParserAtomsTable* pasrerAtoms,
516 const Unit* units,
517 size_t length,
518 size_t startOffset)
519 : TokenStreamCharsShared(cx, pasrerAtoms),
520 sourceUnits(units, length, startOffset) {}
521
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const char16_t * cur,const char16_t * end)522 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
523 const char16_t* cur,
524 const char16_t* end) {
525 MOZ_ASSERT(charBuffer.length() == 0);
526
527 while (cur < end) {
528 char16_t ch = *cur++;
529 if (ch == '\r') {
530 ch = '\n';
531 if (cur < end && *cur == '\n') {
532 cur++;
533 }
534 }
535
536 if (!charBuffer.append(ch)) {
537 return false;
538 }
539 }
540
541 MOZ_ASSERT(cur == end);
542 return true;
543 }
544
FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer & charBuffer,const Utf8Unit * cur,const Utf8Unit * end)545 bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(CharBuffer& charBuffer,
546 const Utf8Unit* cur,
547 const Utf8Unit* end) {
548 MOZ_ASSERT(charBuffer.length() == 0);
549
550 while (cur < end) {
551 Utf8Unit unit = *cur++;
552 if (MOZ_LIKELY(IsAscii(unit))) {
553 char16_t ch = unit.toUint8();
554 if (ch == '\r') {
555 ch = '\n';
556 if (cur < end && *cur == Utf8Unit('\n')) {
557 cur++;
558 }
559 }
560
561 if (!charBuffer.append(ch)) {
562 return false;
563 }
564
565 continue;
566 }
567
568 Maybe<char32_t> ch = DecodeOneUtf8CodePoint(unit, &cur, end);
569 MOZ_ASSERT(ch.isSome(),
570 "provided source text should already have been validated");
571
572 if (!AppendCodePointToCharBuffer(charBuffer, ch.value())) {
573 return false;
574 }
575 }
576
577 MOZ_ASSERT(cur == end);
578 return true;
579 }
580
581 template <typename Unit, class AnyCharsAccess>
TokenStreamSpecific(JSContext * cx,ParserAtomsTable * pasrerAtoms,const ReadOnlyCompileOptions & options,const Unit * units,size_t length)582 TokenStreamSpecific<Unit, AnyCharsAccess>::TokenStreamSpecific(
583 JSContext* cx, ParserAtomsTable* pasrerAtoms,
584 const ReadOnlyCompileOptions& options, const Unit* units, size_t length)
585 : TokenStreamChars<Unit, AnyCharsAccess>(cx, pasrerAtoms, units, length,
586 options.scriptSourceOffset) {}
587
checkOptions()588 bool TokenStreamAnyChars::checkOptions() {
589 // Constrain starting columns to where they will saturate.
590 if (options().column > ColumnLimit) {
591 reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER);
592 return false;
593 }
594
595 return true;
596 }
597
reportErrorNoOffset(unsigned errorNumber,...)598 void TokenStreamAnyChars::reportErrorNoOffset(unsigned errorNumber, ...) {
599 va_list args;
600 va_start(args, errorNumber);
601
602 reportErrorNoOffsetVA(errorNumber, &args);
603
604 va_end(args);
605 }
606
reportErrorNoOffsetVA(unsigned errorNumber,va_list * args)607 void TokenStreamAnyChars::reportErrorNoOffsetVA(unsigned errorNumber,
608 va_list* args) {
609 ErrorMetadata metadata;
610 computeErrorMetadataNoOffset(&metadata);
611
612 ReportCompileErrorLatin1(cx, std::move(metadata), nullptr, errorNumber, args);
613 }
614
615 [[nodiscard]] MOZ_ALWAYS_INLINE bool
internalUpdateLineInfoForEOL(uint32_t lineStartOffset)616 TokenStreamAnyChars::internalUpdateLineInfoForEOL(uint32_t lineStartOffset) {
617 prevLinebase = linebase;
618 linebase = lineStartOffset;
619 lineno++;
620
621 // On overflow, report error.
622 if (MOZ_UNLIKELY(!lineno)) {
623 reportErrorNoOffset(JSMSG_BAD_LINE_NUMBER);
624 return false;
625 }
626
627 return srcCoords.add(lineno, linebase);
628 }
629
630 #ifdef DEBUG
631
632 template <>
assertNextCodePoint(const PeekedCodePoint<char16_t> & peeked)633 inline void SourceUnits<char16_t>::assertNextCodePoint(
634 const PeekedCodePoint<char16_t>& peeked) {
635 char32_t c = peeked.codePoint();
636 if (c < unicode::NonBMPMin) {
637 MOZ_ASSERT(peeked.lengthInUnits() == 1);
638 MOZ_ASSERT(ptr[0] == c);
639 } else {
640 MOZ_ASSERT(peeked.lengthInUnits() == 2);
641 char16_t lead, trail;
642 unicode::UTF16Encode(c, &lead, &trail);
643 MOZ_ASSERT(ptr[0] == lead);
644 MOZ_ASSERT(ptr[1] == trail);
645 }
646 }
647
648 template <>
assertNextCodePoint(const PeekedCodePoint<Utf8Unit> & peeked)649 inline void SourceUnits<Utf8Unit>::assertNextCodePoint(
650 const PeekedCodePoint<Utf8Unit>& peeked) {
651 char32_t c = peeked.codePoint();
652
653 // This is all roughly indulgence of paranoia only for assertions, so the
654 // reimplementation of UTF-8 encoding a code point is (we think) a virtue.
655 uint8_t expectedUnits[4] = {};
656 if (c < 0x80) {
657 expectedUnits[0] = AssertedCast<uint8_t>(c);
658 } else if (c < 0x800) {
659 expectedUnits[0] = 0b1100'0000 | (c >> 6);
660 expectedUnits[1] = 0b1000'0000 | (c & 0b11'1111);
661 } else if (c < 0x10000) {
662 expectedUnits[0] = 0b1110'0000 | (c >> 12);
663 expectedUnits[1] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
664 expectedUnits[2] = 0b1000'0000 | (c & 0b11'1111);
665 } else {
666 expectedUnits[0] = 0b1111'0000 | (c >> 18);
667 expectedUnits[1] = 0b1000'0000 | ((c >> 12) & 0b11'1111);
668 expectedUnits[2] = 0b1000'0000 | ((c >> 6) & 0b11'1111);
669 expectedUnits[3] = 0b1000'0000 | (c & 0b11'1111);
670 }
671
672 MOZ_ASSERT(peeked.lengthInUnits() <= 4);
673 for (uint8_t i = 0; i < peeked.lengthInUnits(); i++) {
674 MOZ_ASSERT(expectedUnits[i] == ptr[i].toUint8());
675 }
676 }
677
678 #endif // DEBUG
679
RetractPointerToCodePointBoundary(const Utf8Unit ** ptr,const Utf8Unit * limit)680 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
681 const Utf8Unit** ptr, const Utf8Unit* limit) {
682 MOZ_ASSERT(*ptr <= limit);
683
684 // |limit| is a code point boundary.
685 if (MOZ_UNLIKELY(*ptr == limit)) {
686 return;
687 }
688
689 // Otherwise rewind past trailing units to the start of the code point.
690 #ifdef DEBUG
691 size_t retracted = 0;
692 #endif
693 while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
694 --*ptr;
695 #ifdef DEBUG
696 retracted++;
697 #endif
698 }
699
700 MOZ_ASSERT(retracted < 4,
701 "the longest UTF-8 code point is four units, so this should never "
702 "retract more than three units");
703 }
704
RetractPointerToCodePointBoundary(const char16_t ** ptr,const char16_t * limit)705 static MOZ_ALWAYS_INLINE void RetractPointerToCodePointBoundary(
706 const char16_t** ptr, const char16_t* limit) {
707 MOZ_ASSERT(*ptr <= limit);
708
709 // |limit| is a code point boundary.
710 if (MOZ_UNLIKELY(*ptr == limit)) {
711 return;
712 }
713
714 // Otherwise the pointer must be retracted by one iff it splits a two-unit
715 // code point.
716 if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) {
717 // Outside test suites testing garbage WTF-16, it's basically guaranteed
718 // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair.
719 if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
720 --*ptr;
721 }
722 }
723 }
724
725 template <typename Unit>
computePartialColumn(const LineToken lineToken,const uint32_t offset,const SourceUnits<Unit> & sourceUnits) const726 uint32_t TokenStreamAnyChars::computePartialColumn(
727 const LineToken lineToken, const uint32_t offset,
728 const SourceUnits<Unit>& sourceUnits) const {
729 lineToken.assertConsistentOffset(offset);
730
731 const uint32_t line = lineNumber(lineToken);
732 const uint32_t start = srcCoords.lineStart(lineToken);
733
734 // Reset the previous offset/column cache for this line, if the previous
735 // lookup wasn't on this line.
736 if (line != lineOfLastColumnComputation_) {
737 lineOfLastColumnComputation_ = line;
738 lastChunkVectorForLine_ = nullptr;
739 lastOffsetOfComputedColumn_ = start;
740 lastComputedColumn_ = 0;
741 }
742
743 // Compute and return the final column number from a partial offset/column,
744 // using the last-cached offset/column if they're more optimal.
745 auto ColumnFromPartial = [this, offset, &sourceUnits](uint32_t partialOffset,
746 uint32_t partialCols,
747 UnitsType unitsType) {
748 MOZ_ASSERT(partialOffset <= offset);
749
750 // If the last lookup on this line was closer to |offset|, use it.
751 if (partialOffset < this->lastOffsetOfComputedColumn_ &&
752 this->lastOffsetOfComputedColumn_ <= offset) {
753 partialOffset = this->lastOffsetOfComputedColumn_;
754 partialCols = this->lastComputedColumn_;
755 }
756
757 const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset);
758 const Unit* end = sourceUnits.codeUnitPtrAt(offset);
759
760 size_t offsetDelta = AssertedCast<uint32_t>(PointerRangeSize(begin, end));
761 partialOffset += offsetDelta;
762
763 if (unitsType == UnitsType::GuaranteedSingleUnit) {
764 MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
765 "guaranteed-single-units also guarantee pointer distance "
766 "equals code point count");
767 partialCols += offsetDelta;
768 } else {
769 partialCols +=
770 AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
771 }
772
773 this->lastOffsetOfComputedColumn_ = partialOffset;
774 this->lastComputedColumn_ = partialCols;
775 return partialCols;
776 };
777
778 const uint32_t offsetInLine = offset - start;
779
780 // We won't add an entry to |longLineColumnInfo_| for lines where the maximum
781 // column has offset less than this value. The most common (non-minified)
782 // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
783 // the next power of two for efficient division/multiplication below.
784 constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
785
786 // The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk.
787 const uint32_t chunkIndex = offsetInLine / ColumnChunkLength;
788 if (chunkIndex == 0) {
789 // We don't know from an |offset| in the zeroth chunk that this line is even
790 // long. First-chunk info is mostly useless, anyway -- we have |start|
791 // already. So if we have *easy* access to that zeroth chunk, use it --
792 // otherwise just count pessimally. (This will still benefit from caching
793 // the last column/offset for computations for successive offsets, so it's
794 // not *always* worst-case.)
795 UnitsType unitsType;
796 if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
797 MOZ_ASSERT((*lastChunkVectorForLine_)[0].column() == 0);
798 unitsType = (*lastChunkVectorForLine_)[0].unitsType();
799 } else {
800 unitsType = UnitsType::PossiblyMultiUnit;
801 }
802
803 return ColumnFromPartial(start, 0, unitsType);
804 }
805
806 // If this line has no chunk vector yet, insert one in the hash map. (The
807 // required index is allocated and filled further down.)
808 if (!lastChunkVectorForLine_) {
809 auto ptr = longLineColumnInfo_.lookupForAdd(line);
810 if (!ptr) {
811 // This could rehash and invalidate a cached vector pointer, but the outer
812 // condition means we don't have a cached pointer.
813 if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(cx))) {
814 // In case of OOM, just count columns from the start of the line.
815 cx->recoverFromOutOfMemory();
816 return ColumnFromPartial(start, 0, UnitsType::PossiblyMultiUnit);
817 }
818 }
819
820 // Note that adding elements to this vector won't invalidate this pointer.
821 lastChunkVectorForLine_ = &ptr->value();
822 }
823
824 const Unit* const limit = sourceUnits.codeUnitPtrAt(offset);
825
826 auto RetractedOffsetOfChunk = [
827 #ifdef DEBUG
828 this,
829 #endif
830 start, limit,
831 &sourceUnits](uint32_t index) {
832 MOZ_ASSERT(index < this->lastChunkVectorForLine_->length());
833
834 uint32_t naiveOffset = start + index * ColumnChunkLength;
835 const Unit* naivePtr = sourceUnits.codeUnitPtrAt(naiveOffset);
836
837 const Unit* actualPtr = naivePtr;
838 RetractPointerToCodePointBoundary(&actualPtr, limit);
839
840 #ifdef DEBUG
841 if ((*this->lastChunkVectorForLine_)[index].unitsType() ==
842 UnitsType::GuaranteedSingleUnit) {
843 MOZ_ASSERT(naivePtr == actualPtr, "miscomputed unitsType value");
844 }
845 #endif
846
847 return naiveOffset - PointerRangeSize(actualPtr, naivePtr);
848 };
849
850 uint32_t partialOffset;
851 uint32_t partialColumn;
852 UnitsType unitsType;
853
854 auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length());
855 if (chunkIndex < entriesLen) {
856 // We've computed the chunk |offset| resides in. Compute the column number
857 // from the chunk.
858 partialOffset = RetractedOffsetOfChunk(chunkIndex);
859 partialColumn = (*lastChunkVectorForLine_)[chunkIndex].column();
860
861 // This is exact if |chunkIndex| isn't the last chunk.
862 unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
863
864 // Otherwise the last chunk is pessimistically assumed to contain multi-unit
865 // code points because we haven't fully examined its contents yet -- they
866 // may not have been tokenized yet, they could contain encoding errors, or
867 // they might not even exist.
868 MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
869 (*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
870 UnitsType::PossiblyMultiUnit);
871 } else {
872 // Extend the vector from its last entry or the start of the line. (This is
873 // also a suitable partial start point if we must recover from OOM.)
874 if (entriesLen > 0) {
875 partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
876 partialColumn = (*lastChunkVectorForLine_)[entriesLen - 1].column();
877 } else {
878 partialOffset = start;
879 partialColumn = 0;
880 }
881
882 if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) {
883 // As earlier, just start from the greatest offset/column in case of OOM.
884 cx->recoverFromOutOfMemory();
885 return ColumnFromPartial(partialOffset, partialColumn,
886 UnitsType::PossiblyMultiUnit);
887 }
888
889 // OOM is no longer possible now. \o/
890
891 // The vector always begins with the column of the line start, i.e. zero,
892 // with chunk units pessimally assumed not single-unit.
893 if (entriesLen == 0) {
894 lastChunkVectorForLine_->infallibleAppend(
895 ChunkInfo(0, UnitsType::PossiblyMultiUnit));
896 entriesLen++;
897 }
898
899 do {
900 const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset);
901 const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
902 start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
903
904 MOZ_ASSERT(begin < chunkLimit);
905 MOZ_ASSERT(chunkLimit <= limit);
906
907 static_assert(
908 ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1,
909 "any retraction below is assumed to never underflow to the "
910 "preceding chunk, even for the longest code point");
911
912 // Prior tokenizing ensured that [begin, limit) is validly encoded, and
913 // |begin < chunkLimit|, so any retraction here can't underflow.
914 RetractPointerToCodePointBoundary(&chunkLimit, limit);
915
916 MOZ_ASSERT(begin < chunkLimit);
917 MOZ_ASSERT(chunkLimit <= limit);
918
919 size_t numUnits = PointerRangeSize(begin, chunkLimit);
920 size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
921
922 // If this chunk (which will become non-final at the end of the loop) is
923 // all single-unit code points, annotate the chunk accordingly.
924 if (numUnits == numCodePoints) {
925 lastChunkVectorForLine_->back().guaranteeSingleUnits();
926 }
927
928 partialOffset += numUnits;
929 partialColumn += numCodePoints;
930
931 lastChunkVectorForLine_->infallibleEmplaceBack(
932 partialColumn, UnitsType::PossiblyMultiUnit);
933 } while (entriesLen < chunkIndex + 1);
934
935 // We're at a spot in the current final chunk, and final chunks never have
936 // complete units information, so be pessimistic.
937 unitsType = UnitsType::PossiblyMultiUnit;
938 }
939
940 return ColumnFromPartial(partialOffset, partialColumn, unitsType);
941 }
942
943 template <typename Unit, class AnyCharsAccess>
computeColumn(LineToken lineToken,uint32_t offset) const944 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeColumn(
945 LineToken lineToken, uint32_t offset) const {
946 lineToken.assertConsistentOffset(offset);
947
948 const TokenStreamAnyChars& anyChars = anyCharsAccess();
949
950 uint32_t column =
951 anyChars.computePartialColumn(lineToken, offset, this->sourceUnits);
952
953 if (lineToken.isFirstLine()) {
954 if (column > ColumnLimit) {
955 return ColumnLimit;
956 }
957
958 static_assert(uint32_t(ColumnLimit + ColumnLimit) > ColumnLimit,
959 "Adding ColumnLimit should not overflow");
960
961 uint32_t firstLineOffset = anyChars.options_.column;
962 column += firstLineOffset;
963 }
964
965 if (column > ColumnLimit) {
966 return ColumnLimit;
967 }
968
969 return column;
970 }
971
972 template <typename Unit, class AnyCharsAccess>
computeLineAndColumn(uint32_t offset,uint32_t * line,uint32_t * column) const973 void GeneralTokenStreamChars<Unit, AnyCharsAccess>::computeLineAndColumn(
974 uint32_t offset, uint32_t* line, uint32_t* column) const {
975 const TokenStreamAnyChars& anyChars = anyCharsAccess();
976
977 auto lineToken = anyChars.lineToken(offset);
978 *line = anyChars.lineNumber(lineToken);
979 *column = computeColumn(lineToken, offset);
980 }
981
982 template <class AnyCharsAccess>
internalEncodingError(uint8_t relevantUnits,unsigned errorNumber,...)983 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::internalEncodingError(
984 uint8_t relevantUnits, unsigned errorNumber, ...) {
985 va_list args;
986 va_start(args, errorNumber);
987
988 do {
989 size_t offset = this->sourceUnits.offset();
990
991 ErrorMetadata err;
992
993 TokenStreamAnyChars& anyChars = anyCharsAccess();
994
995 bool canAddLineOfContext = fillExceptingContext(&err, offset);
996 if (canAddLineOfContext) {
997 if (!internalComputeLineOfContext(&err, offset)) {
998 break;
999 }
1000
1001 // As this is an encoding error, the computed window-end must be
1002 // identical to the location of the error -- any further on and the
1003 // window would contain invalid Unicode.
1004 MOZ_ASSERT_IF(err.lineOfContext != nullptr,
1005 err.lineLength == err.tokenOffset);
1006 }
1007
1008 auto notes = MakeUnique<JSErrorNotes>();
1009 if (!notes) {
1010 ReportOutOfMemory(anyChars.cx);
1011 break;
1012 }
1013
1014 // The largest encoding of a UTF-8 code point is 4 units. (Encoding an
1015 // obsolete 5- or 6-byte code point will complain only about a bad lead
1016 // code unit.)
1017 constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
1018
1019 MOZ_ASSERT(relevantUnits > 0);
1020
1021 char badUnitsStr[MaxWidth];
1022 char* ptr = badUnitsStr;
1023 while (relevantUnits > 0) {
1024 byteToString(this->sourceUnits.getCodeUnit().toUint8(), ptr);
1025 ptr[4] = ' ';
1026
1027 ptr += 5;
1028 relevantUnits--;
1029 }
1030
1031 ptr[-1] = '\0';
1032
1033 uint32_t line, column;
1034 computeLineAndColumn(offset, &line, &column);
1035
1036 if (!notes->addNoteASCII(anyChars.cx, anyChars.getFilename(), 0, line,
1037 column, GetErrorMessage, nullptr,
1038 JSMSG_BAD_CODE_UNITS, badUnitsStr)) {
1039 break;
1040 }
1041
1042 ReportCompileErrorLatin1(anyChars.cx, std::move(err), std::move(notes),
1043 errorNumber, &args);
1044 } while (false);
1045
1046 va_end(args);
1047 }
1048
1049 template <class AnyCharsAccess>
badLeadUnit(Utf8Unit lead)1050 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badLeadUnit(
1051 Utf8Unit lead) {
1052 uint8_t leadValue = lead.toUint8();
1053
1054 char leadByteStr[5];
1055 byteToTerminatedString(leadValue, leadByteStr);
1056
1057 internalEncodingError(1, JSMSG_BAD_LEADING_UTF8_UNIT, leadByteStr);
1058 }
1059
1060 template <class AnyCharsAccess>
notEnoughUnits(Utf8Unit lead,uint8_t remaining,uint8_t required)1061 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::notEnoughUnits(
1062 Utf8Unit lead, uint8_t remaining, uint8_t required) {
1063 uint8_t leadValue = lead.toUint8();
1064
1065 MOZ_ASSERT(required == 2 || required == 3 || required == 4);
1066 MOZ_ASSERT(remaining < 4);
1067 MOZ_ASSERT(remaining < required);
1068
1069 char leadByteStr[5];
1070 byteToTerminatedString(leadValue, leadByteStr);
1071
1072 // |toHexChar| produces the desired decimal numbers for values < 4.
1073 const char expectedStr[] = {toHexChar(required - 1), '\0'};
1074 const char actualStr[] = {toHexChar(remaining - 1), '\0'};
1075
1076 internalEncodingError(remaining, JSMSG_NOT_ENOUGH_CODE_UNITS, leadByteStr,
1077 expectedStr, required == 2 ? "" : "s", actualStr,
1078 remaining == 2 ? " was" : "s were");
1079 }
1080
1081 template <class AnyCharsAccess>
badTrailingUnit(uint8_t unitsObserved)1082 MOZ_COLD void TokenStreamChars<Utf8Unit, AnyCharsAccess>::badTrailingUnit(
1083 uint8_t unitsObserved) {
1084 Utf8Unit badUnit =
1085 this->sourceUnits.addressOfNextCodeUnit()[unitsObserved - 1];
1086
1087 char badByteStr[5];
1088 byteToTerminatedString(badUnit.toUint8(), badByteStr);
1089
1090 internalEncodingError(unitsObserved, JSMSG_BAD_TRAILING_UTF8_UNIT,
1091 badByteStr);
1092 }
1093
1094 template <class AnyCharsAccess>
1095 MOZ_COLD void
badStructurallyValidCodePoint(uint32_t codePoint,uint8_t codePointLength,const char * reason)1096 TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
1097 uint32_t codePoint, uint8_t codePointLength, const char* reason) {
1098 // Construct a string like "0x203D" (including null terminator) to include
1099 // in the error message. Write the string end-to-start from end to start
1100 // of an adequately sized |char| array, shifting least significant nibbles
1101 // off the number and writing the corresponding hex digits until done, then
1102 // prefixing with "0x". |codePointStr| points at the incrementally
1103 // computed string, within |codePointCharsArray|'s bounds.
1104
1105 // 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained
1106 // bits in a four-byte UTF-8 code unit sequence.
1107 constexpr size_t MaxHexSize = sizeof(
1108 "0x1F"
1109 "FFFF"); // including '\0'
1110 char codePointCharsArray[MaxHexSize];
1111
1112 char* codePointStr = std::end(codePointCharsArray);
1113 *--codePointStr = '\0';
1114
1115 // Note that by do-while looping here rather than while-looping, this
1116 // writes a '0' when |codePoint == 0|.
1117 do {
1118 MOZ_ASSERT(codePointCharsArray < codePointStr);
1119 *--codePointStr = toHexChar(codePoint & 0xF);
1120 codePoint >>= 4;
1121 } while (codePoint);
1122
1123 MOZ_ASSERT(codePointCharsArray + 2 <= codePointStr);
1124 *--codePointStr = 'x';
1125 *--codePointStr = '0';
1126
1127 internalEncodingError(codePointLength, JSMSG_FORBIDDEN_UTF8_CODE_POINT,
1128 codePointStr, reason);
1129 }
1130
1131 template <class AnyCharsAccess>
1132 [[nodiscard]] bool
getNonAsciiCodePointDontNormalize(Utf8Unit lead,char32_t * codePoint)1133 TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePointDontNormalize(
1134 Utf8Unit lead, char32_t* codePoint) {
1135 auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1136
1137 auto onNotEnoughUnits = [this, &lead](uint8_t remaining, uint8_t required) {
1138 this->notEnoughUnits(lead, remaining, required);
1139 };
1140
1141 auto onBadTrailingUnit = [this](uint8_t unitsObserved) {
1142 this->badTrailingUnit(unitsObserved);
1143 };
1144
1145 auto onBadCodePoint = [this](char32_t badCodePoint, uint8_t unitsObserved) {
1146 this->badCodePoint(badCodePoint, unitsObserved);
1147 };
1148
1149 auto onNotShortestForm = [this](char32_t badCodePoint,
1150 uint8_t unitsObserved) {
1151 this->notShortestForm(badCodePoint, unitsObserved);
1152 };
1153
1154 // If a valid code point is decoded, this function call consumes its code
1155 // units. If not, it ungets the lead code unit and invokes the right error
1156 // handler, so on failure we must immediately return false.
1157 SourceUnitsIterator iter(this->sourceUnits);
1158 Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
1159 lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1160 onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1161 if (maybeCodePoint.isNothing()) {
1162 return false;
1163 }
1164
1165 *codePoint = maybeCodePoint.value();
1166 return true;
1167 }
1168
1169 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t lead,int32_t * codePoint)1170 bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
1171 int32_t lead, int32_t* codePoint) {
1172 MOZ_ASSERT(lead != EOF);
1173 MOZ_ASSERT(!isAsciiCodePoint(lead),
1174 "ASCII code unit/point must be handled separately");
1175 MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1176 "getNonAsciiCodePoint called incorrectly");
1177
1178 // The code point is usually |lead|: overwrite later if needed.
1179 *codePoint = lead;
1180
1181 // ECMAScript specifically requires that unpaired UTF-16 surrogates be
1182 // treated as the corresponding code point and not as an error. See
1183 // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>.
1184 // Thus this function does not consider any sequence of 16-bit numbers to
1185 // be intrinsically in error.
1186
1187 // Dispense with single-unit code points and lone trailing surrogates.
1188 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) {
1189 if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
1190 lead == unicode::PARA_SEPARATOR)) {
1191 if (!updateLineInfoForEOL()) {
1192 #ifdef DEBUG
1193 *codePoint = EOF; // sentinel value to hopefully cause errors
1194 #endif
1195 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1196 return false;
1197 }
1198
1199 *codePoint = '\n';
1200 } else {
1201 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1202 }
1203
1204 return true;
1205 }
1206
1207 // Also handle a lead surrogate not paired with a trailing surrogate.
1208 if (MOZ_UNLIKELY(
1209 this->sourceUnits.atEnd() ||
1210 !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1211 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1212 return true;
1213 }
1214
1215 // Otherwise we have a multi-unit code point.
1216 *codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1217 MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(*codePoint)));
1218 return true;
1219 }
1220
1221 template <typename Unit, class AnyCharsAccess>
getCodePoint(int32_t * cp)1222 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getCodePoint(int32_t* cp) {
1223 int32_t unit = getCodeUnit();
1224 if (unit == EOF) {
1225 MOZ_ASSERT(anyCharsAccess().flags.isEOF,
1226 "flags.isEOF should have been set by getCodeUnit()");
1227 *cp = EOF;
1228 return true;
1229 }
1230
1231 if (isAsciiCodePoint(unit)) {
1232 return getFullAsciiCodePoint(unit, cp);
1233 }
1234
1235 return getNonAsciiCodePoint(unit, cp);
1236 }
1237
1238 template <class AnyCharsAccess>
getNonAsciiCodePoint(int32_t unit,int32_t * codePoint)1239 bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
1240 int32_t unit, int32_t* codePoint) {
1241 MOZ_ASSERT(unit != EOF);
1242 MOZ_ASSERT(!isAsciiCodePoint(unit),
1243 "ASCII code unit/point must be handled separately");
1244
1245 Utf8Unit lead = Utf8Unit(static_cast<unsigned char>(unit));
1246 MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(),
1247 "getNonAsciiCodePoint called incorrectly");
1248
1249 auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
1250
1251 auto onNotEnoughUnits = [this, &lead](uint_fast8_t remaining,
1252 uint_fast8_t required) {
1253 this->notEnoughUnits(lead, remaining, required);
1254 };
1255
1256 auto onBadTrailingUnit = [this](uint_fast8_t unitsObserved) {
1257 this->badTrailingUnit(unitsObserved);
1258 };
1259
1260 auto onBadCodePoint = [this](char32_t badCodePoint,
1261 uint_fast8_t unitsObserved) {
1262 this->badCodePoint(badCodePoint, unitsObserved);
1263 };
1264
1265 auto onNotShortestForm = [this](char32_t badCodePoint,
1266 uint_fast8_t unitsObserved) {
1267 this->notShortestForm(badCodePoint, unitsObserved);
1268 };
1269
1270 // This consumes the full, valid code point or ungets |lead| and calls the
1271 // appropriate error functor on failure.
1272 SourceUnitsIterator iter(this->sourceUnits);
1273 Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
1274 lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
1275 onBadTrailingUnit, onBadCodePoint, onNotShortestForm);
1276 if (maybeCodePoint.isNothing()) {
1277 return false;
1278 }
1279
1280 char32_t cp = maybeCodePoint.value();
1281 if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
1282 cp == unicode::PARA_SEPARATOR)) {
1283 if (!updateLineInfoForEOL()) {
1284 #ifdef DEBUG
1285 *codePoint = EOF; // sentinel value to hopefully cause errors
1286 #endif
1287 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
1288 return false;
1289 }
1290
1291 *codePoint = '\n';
1292 } else {
1293 MOZ_ASSERT(!IsLineTerminator(cp));
1294 *codePoint = AssertedCast<int32_t>(cp);
1295 }
1296
1297 return true;
1298 }
1299
// Compute where a window of context ending at |offset| should begin.
//
// Scans backward from |offset| and returns the offset at which the scan
// stopped.  The scan stops at |startOffset_|, after |WindowRadius| code
// units, just after a line terminator, or just after a surrogate that isn't
// part of a usable lead/trail pair.
template <>
size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const {
  // This is JS's understanding of UTF-16 that allows lone surrogates, so
  // we have to exclude lone surrogates from [windowStart, offset) ourselves.

  const char16_t* const earliestPossibleStart = codeUnitPtrAt(startOffset_);

  const char16_t* const initial = codeUnitPtrAt(offset);
  const char16_t* p = initial;

  // Number of code units currently in [p, initial).
  auto HalfWindowSize = [&p, &initial]() {
    return PointerRangeSize(p, initial);
  };

  while (true) {
    MOZ_ASSERT(earliestPossibleStart <= p);
    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
    if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
      break;
    }

    // Peek at the preceding code unit without including it yet.
    char16_t c = p[-1];

    // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
    // string and template literals.  These code points do affect line and
    // column coordinates, even as they encode their literal values.
    if (IsLineTerminator(c)) {
      break;
    }

    // Don't allow invalid UTF-16 in pre-context.  (Current users don't
    // require this, and this behavior isn't currently imposed on
    // pre-context, but these facts might change someday.)

    // Scanning backward, a lead surrogate met first has no trail surrogate
    // after it inside the window so far, so it is excluded.
    if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) {
      break;
    }

    // Optimistically include the code unit, reverting below if needed.
    p--;

    // If it's not a surrogate at all, keep going.
    if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) {
      continue;
    }

    // Stop if we don't have a usable surrogate pair.
    if (HalfWindowSize() >= WindowRadius ||
        p <= earliestPossibleStart ||      // trail surrogate at low end
        !unicode::IsLeadSurrogate(p[-1]))  // no paired lead surrogate
    {
      // Undo the optimistic inclusion of the trail surrogate.
      p++;
      break;
    }

    // Include the lead surrogate that completes the pair.
    p--;
  }

  MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
  return offset - HalfWindowSize();
}
1361
// Compute where a window of context ending at |offset| should begin.
//
// Scans backward from |offset| one code point at a time and returns the
// offset at which the scan stopped.  The scan stops at |startOffset_|, when
// the window would exceed |WindowRadius| code units, or just after a line
// terminator ('\r', '\n', U+2028, or U+2029).
template <>
size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const {
  // |offset| must be the location of the error or somewhere before it, so we
  // know preceding data is valid UTF-8.

  const Utf8Unit* const earliestPossibleStart = codeUnitPtrAt(startOffset_);

  const Utf8Unit* const initial = codeUnitPtrAt(offset);
  const Utf8Unit* p = initial;

  // Number of code units currently in [p, initial).
  auto HalfWindowSize = [&p, &initial]() {
    return PointerRangeSize(p, initial);
  };

  while (true) {
    MOZ_ASSERT(earliestPossibleStart <= p);
    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
    if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) {
      break;
    }

    // Peek backward for a line break, and only decrement if there is none.
    uint8_t prev = p[-1].toUint8();

    // First check for the ASCII LineTerminators.
    if (prev == '\r' || prev == '\n') {
      break;
    }

    // Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR
    // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9).  If there
    // aren't three code units available, some comparison here will fail
    // before we'd underflow.
    if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
                     p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) {
      break;
    }

    // Rewind over the non-LineTerminator.  This can't underflow
    // |earliestPossibleStart| because it begins a code point.
    while (IsTrailingUnit(*--p)) {
      continue;
    }

    MOZ_ASSERT(earliestPossibleStart <= p);

    // But if we underflowed |WindowRadius|, adjust forward and stop.
    if (HalfWindowSize() > WindowRadius) {
      static_assert(WindowRadius > 3,
                    "skipping over non-lead code units below must not "
                    "advance past |offset|");

      // Step forward past the code point just rewound over, so that the
      // window again starts on a code point boundary.
      while (IsTrailingUnit(*++p)) {
        continue;
      }

      MOZ_ASSERT(HalfWindowSize() < WindowRadius);
      break;
    }
  }

  MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
  return offset - HalfWindowSize();
}
1426
// Compute where a window of context beginning at |offset| should end.
//
// Scans forward from |offset| and returns the offset at which the scan
// stopped.  The scan stops at |limit_|, after |WindowRadius| code units, at
// a line terminator, or at a surrogate that isn't part of a usable
// lead/trail pair.
template <>
size_t SourceUnits<char16_t>::findWindowEnd(size_t offset) const {
  const char16_t* const initial = codeUnitPtrAt(offset);
  const char16_t* p = initial;

  // Number of code units currently in [initial, p).
  auto HalfWindowSize = [&initial, &p]() {
    return PointerRangeSize(initial, p);
  };

  while (true) {
    MOZ_ASSERT(p <= limit_);
    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
    if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
      break;
    }

    // Peek at the next code unit without consuming it yet.
    char16_t c = *p;

    // This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in
    // string and template literals.  These code points do affect line and
    // column coordinates, even as they encode their literal values.
    if (IsLineTerminator(c)) {
      break;
    }

    // Don't allow invalid UTF-16 in post-context.  (Current users don't
    // require this, and this behavior isn't currently imposed on
    // pre-context, but these facts might change someday.)

    // Scanning forward, a trail surrogate met first has no lead surrogate
    // before it inside the window so far, so it is excluded.
    if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) {
      break;
    }

    // Optimistically consume the code unit, ungetting it below if needed.
    p++;

    // If it's not a surrogate at all, keep going.
    if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) {
      continue;
    }

    // Retract if the lead surrogate would stand alone at the end of the
    // window.
    if (HalfWindowSize() >= WindowRadius ||  // split pair
        p >= limit_ ||                       // half-pair at end of source
        !unicode::IsTrailSurrogate(*p))      // no paired trail surrogate
    {
      p--;
      break;
    }

    // Consume the trail surrogate that completes the pair.
    p++;
  }

  return offset + HalfWindowSize();
}
1483
// Compute where a window of context beginning at |offset| should end.
//
// Scans forward from |offset| one code point at a time and returns the
// offset at which the scan stopped.  The scan stops at |limit_|, when the
// next code point would push the window past |WindowRadius| code units, at
// a line terminator, or at an encoding error.
template <>
size_t SourceUnits<Utf8Unit>::findWindowEnd(size_t offset) const {
  const Utf8Unit* const initial = codeUnitPtrAt(offset);
  const Utf8Unit* p = initial;

  // Number of code units currently in [initial, p).
  auto HalfWindowSize = [&initial, &p]() {
    return PointerRangeSize(initial, p);
  };

  while (true) {
    MOZ_ASSERT(p <= limit_);
    MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
    if (p >= limit_ || HalfWindowSize() >= WindowRadius) {
      break;
    }

    // A non-encoding error might be followed by an encoding error within
    // |maxEnd|, so we must validate as we go to not include invalid UTF-8
    // in the computed window.  What joy!

    // ASCII is handled directly, without decoding.
    Utf8Unit lead = *p;
    if (mozilla::IsAscii(lead)) {
      if (IsSingleUnitLineTerminator(lead)) {
        break;
      }

      p++;
      continue;
    }

    // Non-ASCII: peek at the whole code point before deciding to include it.
    PeekedCodePoint<Utf8Unit> peeked = PeekCodePoint(p, limit_);
    if (peeked.isNone()) {
      break;  // encoding error
    }

    char32_t c = peeked.codePoint();
    if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
                     c == unicode::PARA_SEPARATOR)) {
      break;
    }

    MOZ_ASSERT(!IsLineTerminator(c));

    // Include the code point only if all of its units fit in the window.
    uint8_t len = peeked.lengthInUnits();
    if (HalfWindowSize() + len > WindowRadius) {
      break;
    }

    p += len;
  }

  MOZ_ASSERT(HalfWindowSize() <= WindowRadius);
  return offset + HalfWindowSize();
}
1538
1539 template <typename Unit, class AnyCharsAccess>
advance(size_t position)1540 bool TokenStreamSpecific<Unit, AnyCharsAccess>::advance(size_t position) {
1541 const Unit* end = this->sourceUnits.codeUnitPtrAt(position);
1542 while (this->sourceUnits.addressOfNextCodeUnit() < end) {
1543 int32_t c;
1544 if (!getCodePoint(&c)) {
1545 return false;
1546 }
1547 }
1548
1549 TokenStreamAnyChars& anyChars = anyCharsAccess();
1550 Token* cur = const_cast<Token*>(&anyChars.currentToken());
1551 cur->pos.begin = this->sourceUnits.offset();
1552 cur->pos.end = cur->pos.begin;
1553 #ifdef DEBUG
1554 cur->type = TokenKind::Limit;
1555 #endif
1556 MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type));
1557 anyChars.lookahead = 0;
1558 return true;
1559 }
1560
1561 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos)1562 void TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(const Position& pos) {
1563 TokenStreamAnyChars& anyChars = anyCharsAccess();
1564
1565 this->sourceUnits.setAddressOfNextCodeUnit(pos.buf,
1566 /* allowPoisoned = */ true);
1567 anyChars.flags = pos.flags;
1568 anyChars.lineno = pos.lineno;
1569 anyChars.linebase = pos.linebase;
1570 anyChars.prevLinebase = pos.prevLinebase;
1571 anyChars.lookahead = pos.lookahead;
1572
1573 anyChars.tokens[anyChars.cursor()] = pos.currentToken;
1574 for (unsigned i = 0; i < anyChars.lookahead; i++) {
1575 anyChars.tokens[anyChars.aheadCursor(1 + i)] = pos.lookaheadTokens[i];
1576 }
1577 }
1578
1579 template <typename Unit, class AnyCharsAccess>
seekTo(const Position & pos,const TokenStreamAnyChars & other)1580 bool TokenStreamSpecific<Unit, AnyCharsAccess>::seekTo(
1581 const Position& pos, const TokenStreamAnyChars& other) {
1582 if (!anyCharsAccess().srcCoords.fill(other.srcCoords)) {
1583 return false;
1584 }
1585
1586 seekTo(pos);
1587 return true;
1588 }
1589
computeErrorMetadataNoOffset(ErrorMetadata * err)1590 void TokenStreamAnyChars::computeErrorMetadataNoOffset(ErrorMetadata* err) {
1591 err->isMuted = mutedErrors;
1592 err->filename = filename_;
1593 err->lineNumber = 0;
1594 err->columnNumber = 0;
1595
1596 MOZ_ASSERT(err->lineOfContext == nullptr);
1597 }
1598
fillExceptingContext(ErrorMetadata * err,uint32_t offset)1599 bool TokenStreamAnyChars::fillExceptingContext(ErrorMetadata* err,
1600 uint32_t offset) {
1601 err->isMuted = mutedErrors;
1602
1603 // If this TokenStreamAnyChars doesn't have location information, try to
1604 // get it from the caller.
1605 if (!filename_ && !cx->isHelperThreadContext()) {
1606 NonBuiltinFrameIter iter(cx, FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
1607 cx->realm()->principals());
1608 if (!iter.done() && iter.filename()) {
1609 err->filename = iter.filename();
1610 err->lineNumber = iter.computeLine(&err->columnNumber);
1611 return false;
1612 }
1613 }
1614
1615 // Otherwise use this TokenStreamAnyChars's location information.
1616 err->filename = filename_;
1617 return true;
1618 }
1619
1620 template <typename Unit, class AnyCharsAccess>
hasTokenizationStarted() const1621 bool TokenStreamSpecific<Unit, AnyCharsAccess>::hasTokenizationStarted() const {
1622 const TokenStreamAnyChars& anyChars = anyCharsAccess();
1623 return anyChars.isCurrentTokenType(TokenKind::Eof) && !anyChars.isEOF();
1624 }
1625
// UTF-16 specialization.  This is never expected to be called: it asserts
// unreachability and leaves both out-parameters untouched.
template <>
inline void SourceUnits<char16_t>::computeWindowOffsetAndLength(
    const char16_t* encodedWindow, size_t encodedTokenOffset,
    size_t* utf16TokenOffset, size_t encodedWindowLength,
    size_t* utf16WindowLength) {
  MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
}
1633
1634 template <>
computeWindowOffsetAndLength(const Utf8Unit * encodedWindow,size_t encodedTokenOffset,size_t * utf16TokenOffset,size_t encodedWindowLength,size_t * utf16WindowLength)1635 inline void SourceUnits<Utf8Unit>::computeWindowOffsetAndLength(
1636 const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
1637 size_t* utf16TokenOffset, size_t encodedWindowLength,
1638 size_t* utf16WindowLength) {
1639 MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
1640 "token offset must be within the window, and the two lambda "
1641 "calls below presume this ordering of values");
1642
1643 const Utf8Unit* const encodedWindowEnd = encodedWindow + encodedWindowLength;
1644
1645 size_t i = 0;
1646 auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) {
1647 while (encodedWindow < limit) {
1648 Utf8Unit lead = *encodedWindow++;
1649 if (MOZ_LIKELY(IsAscii(lead))) {
1650 // ASCII contributes a single UTF-16 code unit.
1651 i++;
1652 continue;
1653 }
1654
1655 Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
1656 MOZ_ASSERT(cp.isSome(),
1657 "computed window should only contain valid UTF-8");
1658
1659 i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
1660 }
1661
1662 return i;
1663 };
1664
1665 // Compute the token offset from |i == 0| and the initial |encodedWindow|.
1666 const Utf8Unit* token = encodedWindow + encodedTokenOffset;
1667 MOZ_ASSERT(token <= encodedWindowEnd);
1668 *utf16TokenOffset = ComputeUtf16Count(token);
1669
1670 // Compute the window length, picking up from |i| and |encodedWindow| that,
1671 // in general, were modified just above.
1672 *utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
1673 }
1674
// Attach a window of source text around |offset| to |err| as its line of
// context, together with the window's length and the offset of |offset|
// within it, both measured in UTF-16 code units.
//
// Returns false if filling, terminating, or extracting the context buffer
// fails.  Returns true otherwise, including when the window is empty and no
// context is attached.
template <typename Unit>
bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
                                                  uint32_t offset) {
  // Rename the variable to make meaning clearer: an offset into source units
  // in Unit encoding.
  size_t encodedOffset = offset;

  // These are also offsets into source units in Unit encoding.
  size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
  size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);

  size_t encodedWindowLength = encodedWindowEnd - encodedWindowStart;
  MOZ_ASSERT(encodedWindowLength <= SourceUnits::WindowRadius * 2);

  // Don't add a useless "line" of context when the window ends up empty
  // because of an invalid encoding at the start of a line.
  if (encodedWindowLength == 0) {
    MOZ_ASSERT(err->lineOfContext == nullptr,
               "ErrorMetadata::lineOfContext must be null so we don't "
               "have to set the lineLength/tokenOffset fields");
    return true;
  }

  CharBuffer lineOfContext(cx);

  // Copy the window into |lineOfContext|.
  const Unit* encodedWindow = sourceUnits.codeUnitPtrAt(encodedWindowStart);
  if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(
          lineOfContext, encodedWindow, encodedWindow + encodedWindowLength)) {
    return false;
  }

  // Record the length before the terminator is appended below.
  size_t utf16WindowLength = lineOfContext.length();

  // The windowed string is null-terminated.
  if (!lineOfContext.append('\0')) {
    return false;
  }

  err->lineOfContext.reset(lineOfContext.extractOrCopyRawBuffer());
  if (!err->lineOfContext) {
    return false;
  }

  size_t encodedTokenOffset = encodedOffset - encodedWindowStart;

  MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength,
             "token offset must be inside the window");

  // The length in UTF-8 code units of a code point is always greater than or
  // equal to the same code point's length in UTF-16 code units.  ASCII code
  // points are 1 unit in either encoding.  Code points in [U+0080, U+10000)
  // are 2-3 UTF-8 code units to 1 UTF-16 code unit.  And code points in
  // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units.
  //
  // Therefore, if encoded window length equals the length in UTF-16 (this is
  // always the case for Unit=char16_t), the UTF-16 offsets are exactly the
  // encoded offsets.  Otherwise we must convert offset/length from UTF-8 to
  // UTF-16.
  if constexpr (std::is_same_v<Unit, char16_t>) {
    MOZ_ASSERT(utf16WindowLength == encodedWindowLength,
               "UTF-16 to UTF-16 shouldn't change window length");
    err->tokenOffset = encodedTokenOffset;
    err->lineLength = encodedWindowLength;
  } else {
    static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");

    // Equal lengths mean no offset conversion is needed.
    bool simple = utf16WindowLength == encodedWindowLength;
#ifdef DEBUG
    auto isAscii = [](Unit u) { return IsAscii(u); };
    MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
                           isAscii) == simple,
               "equal window lengths in UTF-8 should correspond only to "
               "wholly-ASCII text");
#endif
    if (simple) {
      err->tokenOffset = encodedTokenOffset;
      err->lineLength = encodedWindowLength;
    } else {
      // Walk the UTF-8 window to derive the UTF-16 offset and length.
      sourceUnits.computeWindowOffsetAndLength(
          encodedWindow, encodedTokenOffset, &err->tokenOffset,
          encodedWindowLength, &err->lineLength);
    }
  }

  return true;
}
1761
1762 template <typename Unit, class AnyCharsAccess>
computeErrorMetadata(ErrorMetadata * err,const ErrorOffset & errorOffset)1763 bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
1764 ErrorMetadata* err, const ErrorOffset& errorOffset) {
1765 if (errorOffset.is<NoOffset>()) {
1766 anyCharsAccess().computeErrorMetadataNoOffset(err);
1767 return true;
1768 }
1769
1770 uint32_t offset;
1771 if (errorOffset.is<uint32_t>()) {
1772 offset = errorOffset.as<uint32_t>();
1773 } else {
1774 offset = this->sourceUnits.offset();
1775 }
1776
1777 // This function's return value isn't a success/failure indication: it
1778 // returns true if this TokenStream can be used to provide a line of
1779 // context.
1780 if (fillExceptingContext(err, offset)) {
1781 // Add a line of context from this TokenStream to help with debugging.
1782 return internalComputeLineOfContext(err, offset);
1783 }
1784
1785 // We can't fill in any more here.
1786 return true;
1787 }
1788
1789 template <typename Unit, class AnyCharsAccess>
reportIllegalCharacter(int32_t cp)1790 void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
1791 int32_t cp) {
1792 UniqueChars display = JS_smprintf("U+%04X", cp);
1793 if (!display) {
1794 ReportOutOfMemory(anyCharsAccess().cx);
1795 return;
1796 }
1797 error(JSMSG_ILLEGAL_CHARACTER, display.get());
1798 }
1799
1800 // We have encountered a '\': check for a Unicode escape sequence after it.
1801 // Return the length of the escape sequence and the encoded code point (by
1802 // value) if we found a Unicode escape sequence, and skip all code units
1803 // involed. Otherwise, return 0 and don't advance along the buffer.
1804 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscape(uint32_t * codePoint)1805 uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
1806 uint32_t* codePoint) {
1807 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1808
1809 int32_t unit = getCodeUnit();
1810 if (unit != 'u') {
1811 // NOTE: |unit| may be EOF here.
1812 ungetCodeUnit(unit);
1813 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1814 return 0;
1815 }
1816
1817 char16_t v;
1818 unit = getCodeUnit();
1819 if (IsAsciiHexDigit(unit) && this->sourceUnits.matchHexDigits(3, &v)) {
1820 *codePoint = (AsciiAlphanumericToNumber(unit) << 12) | v;
1821 return 5;
1822 }
1823
1824 if (unit == '{') {
1825 return matchExtendedUnicodeEscape(codePoint);
1826 }
1827
1828 // NOTE: |unit| may be EOF here, so this ungets either one or two units.
1829 ungetCodeUnit(unit);
1830 ungetCodeUnit('u');
1831 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1832 return 0;
1833 }
1834
1835 template <typename Unit, class AnyCharsAccess>
1836 uint32_t
matchExtendedUnicodeEscape(uint32_t * codePoint)1837 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
1838 uint32_t* codePoint) {
1839 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
1840
1841 int32_t unit = getCodeUnit();
1842
1843 // Skip leading zeroes.
1844 uint32_t leadingZeroes = 0;
1845 while (unit == '0') {
1846 leadingZeroes++;
1847 unit = getCodeUnit();
1848 }
1849
1850 size_t i = 0;
1851 uint32_t code = 0;
1852 while (IsAsciiHexDigit(unit) && i < 6) {
1853 code = (code << 4) | AsciiAlphanumericToNumber(unit);
1854 unit = getCodeUnit();
1855 i++;
1856 }
1857
1858 uint32_t gotten =
1859 2 + // 'u{'
1860 leadingZeroes + i + // significant hexdigits
1861 (unit != EOF); // subtract a get if it didn't contribute to length
1862
1863 if (unit == '}' && (leadingZeroes > 0 || i > 0) &&
1864 code <= unicode::NonBMPMax) {
1865 *codePoint = code;
1866 return gotten;
1867 }
1868
1869 this->sourceUnits.unskipCodeUnits(gotten);
1870 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1871 return 0;
1872 }
1873
1874 template <typename Unit, class AnyCharsAccess>
1875 uint32_t
matchUnicodeEscapeIdStart(uint32_t * codePoint)1876 GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdStart(
1877 uint32_t* codePoint) {
1878 uint32_t length = matchUnicodeEscape(codePoint);
1879 if (MOZ_LIKELY(length > 0)) {
1880 if (MOZ_LIKELY(unicode::IsIdentifierStart(*codePoint))) {
1881 return length;
1882 }
1883
1884 this->sourceUnits.unskipCodeUnits(length);
1885 }
1886
1887 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1888 return 0;
1889 }
1890
1891 template <typename Unit, class AnyCharsAccess>
matchUnicodeEscapeIdent(uint32_t * codePoint)1892 bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscapeIdent(
1893 uint32_t* codePoint) {
1894 uint32_t length = matchUnicodeEscape(codePoint);
1895 if (MOZ_LIKELY(length > 0)) {
1896 if (MOZ_LIKELY(unicode::IsIdentifierPart(*codePoint))) {
1897 return true;
1898 }
1899
1900 this->sourceUnits.unskipCodeUnits(length);
1901 }
1902
1903 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
1904 return false;
1905 }
1906
1907 template <typename Unit, class AnyCharsAccess>
1908 [[nodiscard]] bool
matchIdentifierStart(IdentifierEscapes * sawEscape)1909 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIdentifierStart(
1910 IdentifierEscapes* sawEscape) {
1911 int32_t unit = getCodeUnit();
1912 if (unicode::IsIdentifierStart(char16_t(unit))) {
1913 ungetCodeUnit(unit);
1914 *sawEscape = IdentifierEscapes::None;
1915 return true;
1916 }
1917
1918 if (unit == '\\') {
1919 *sawEscape = IdentifierEscapes::SawUnicodeEscape;
1920
1921 uint32_t codePoint;
1922 uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint);
1923 if (escapeLength != 0) {
1924 return true;
1925 }
1926
1927 // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
1928 // could point at the 'H'. But we don't do that now, so the code
1929 // unit after the '\' isn't necessarily bad, so just point at the
1930 // start of the actually-invalid escape.
1931 ungetCodeUnit('\\');
1932 error(JSMSG_BAD_ESCAPE);
1933 return false;
1934 }
1935
1936 *sawEscape = IdentifierEscapes::None;
1937
1938 // NOTE: |unit| may be EOF here.
1939 ungetCodeUnit(unit);
1940 error(JSMSG_MISSING_PRIVATE_NAME);
1941 return false;
1942 }
1943
1944 template <typename Unit, class AnyCharsAccess>
getDirectives(bool isMultiline,bool shouldWarnDeprecated)1945 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives(
1946 bool isMultiline, bool shouldWarnDeprecated) {
1947 // Match directive comments used in debugging, such as "//# sourceURL" and
1948 // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated.
1949 //
1950 // To avoid a crashing bug in IE, several JavaScript transpilers wrap single
1951 // line comments containing a source mapping URL inside a multiline
1952 // comment. To avoid potentially expensive lookahead and backtracking, we
1953 // only check for this case if we encounter a '#' code unit.
1954
1955 bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
1956 getSourceMappingURL(isMultiline, shouldWarnDeprecated);
1957 if (!res) {
1958 badToken();
1959 }
1960
1961 return res;
1962 }
1963
copyCharBufferTo(JSContext * cx,UniquePtr<char16_t[],JS::FreePolicy> * destination)1964 [[nodiscard]] bool TokenStreamCharsShared::copyCharBufferTo(
1965 JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination) {
1966 size_t length = charBuffer.length();
1967
1968 *destination = cx->make_pod_array<char16_t>(length + 1);
1969 if (!*destination) {
1970 return false;
1971 }
1972
1973 std::copy(charBuffer.begin(), charBuffer.end(), destination->get());
1974 (*destination)[length] = '\0';
1975 return true;
1976 }
1977
// Try to match the comment directive |directive| (of |directiveLength| code
// units) at the current position and, if it matches, capture the value that
// follows it into |*destination|.
//
// The value is every code point up to (not including) the first whitespace
// code point, EOF, an encoding error, or -- when |isMultiline| -- a "*/"
// comment terminator.  If |shouldWarnDeprecated| is set, a
// JSMSG_DEPRECATED_PRAGMA warning naming |errorMsgPragma| is issued once the
// directive has matched.
//
// Returns false only on failure (the warning call returned false, or
// appending to / copying the char buffer failed).  Returns true otherwise,
// including when the directive wasn't present or its value was empty; in
// those two cases |*destination| is left untouched.
template <typename Unit, class AnyCharsAccess>
[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
    bool isMultiline, bool shouldWarnDeprecated, const char* directive,
    uint8_t directiveLength, const char* errorMsgPragma,
    UniquePtr<char16_t[], JS::FreePolicy>* destination) {
  // Stop if we don't find |directive|.  (Note that |directive| must be
  // ASCII, so there are no tricky encoding issues to consider in matching
  // UTF-8/16-agnostically.)
  if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
    return true;
  }

  if (shouldWarnDeprecated) {
    if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
      return false;
    }
  }

  // The directive's value is accumulated in |charBuffer|.
  this->charBuffer.clear();

  do {
    int32_t unit = peekCodeUnit();
    if (unit == EOF) {
      break;
    }

    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
      // ASCII whitespace ends the value and is left unconsumed.
      if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
        break;
      }

      consumeKnownCodeUnit(unit);

      // Debugging directives can occur in both single- and multi-line
      // comments.  If we're currently inside a multi-line comment, we
      // also must recognize multi-line comment terminators.
      if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
        // Put the '*' back so the "*/" terminator stays unconsumed.
        ungetCodeUnit('*');
        break;
      }

      if (!this->charBuffer.append(unit)) {
        return false;
      }

      continue;
    }

    // This ignores encoding errors: subsequent caller-side code to
    // handle the remaining source text in the comment will do so.
    PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
    if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
      break;
    }

    MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
               "!IsSpace must imply !IsLineTerminator or else we'll fail to "
               "maintain line-info/flags for EOL");
    this->sourceUnits.consumeKnownCodePoint(peeked);

    if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
      return false;
    }
  } while (true);

  if (this->charBuffer.empty()) {
    // The directive's URL was missing, but comments can contain anything,
    // so it isn't an error.
    return true;
  }

  return copyCharBufferTo(anyCharsAccess().cx, destination);
}
2051
2052 template <typename Unit, class AnyCharsAccess>
getDisplayURL(bool isMultiline,bool shouldWarnDeprecated)2053 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL(
2054 bool isMultiline, bool shouldWarnDeprecated) {
2055 // Match comments of the form "//# sourceURL=<url>" or
2056 // "/\* //# sourceURL=<url> *\/"
2057 //
2058 // Note that while these are labeled "sourceURL" in the source text,
2059 // internally we refer to it as a "displayURL" to distinguish what the
2060 // developer would like to refer to the source as from the source's actual
2061 // URL.
2062
2063 static constexpr char sourceURLDirective[] = " sourceURL=";
2064 constexpr uint8_t sourceURLDirectiveLength = js_strlen(sourceURLDirective);
2065 return getDirective(isMultiline, shouldWarnDeprecated, sourceURLDirective,
2066 sourceURLDirectiveLength, "sourceURL",
2067 &anyCharsAccess().displayURL_);
2068 }
2069
2070 template <typename Unit, class AnyCharsAccess>
getSourceMappingURL(bool isMultiline,bool shouldWarnDeprecated)2071 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL(
2072 bool isMultiline, bool shouldWarnDeprecated) {
2073 // Match comments of the form "//# sourceMappingURL=<url>" or
2074 // "/\* //# sourceMappingURL=<url> *\/"
2075
2076 static constexpr char sourceMappingURLDirective[] = " sourceMappingURL=";
2077 constexpr uint8_t sourceMappingURLDirectiveLength =
2078 js_strlen(sourceMappingURLDirective);
2079 return getDirective(isMultiline, shouldWarnDeprecated,
2080 sourceMappingURLDirective,
2081 sourceMappingURLDirectiveLength, "sourceMappingURL",
2082 &anyCharsAccess().sourceMapURL_);
2083 }
2084
2085 template <typename Unit, class AnyCharsAccess>
2086 MOZ_ALWAYS_INLINE Token*
newTokenInternal(TokenKind kind,TokenStart start,TokenKind * out)2087 GeneralTokenStreamChars<Unit, AnyCharsAccess>::newTokenInternal(
2088 TokenKind kind, TokenStart start, TokenKind* out) {
2089 MOZ_ASSERT(kind < TokenKind::Limit);
2090 MOZ_ASSERT(kind != TokenKind::Eol,
2091 "TokenKind::Eol should never be used in an actual Token, only "
2092 "returned by peekTokenSameLine()");
2093
2094 TokenStreamAnyChars& anyChars = anyCharsAccess();
2095 anyChars.flags.isDirtyLine = true;
2096
2097 Token* token = anyChars.allocateToken();
2098
2099 *out = token->type = kind;
2100 token->pos = TokenPos(start.offset(), this->sourceUnits.offset());
2101 MOZ_ASSERT(token->pos.begin <= token->pos.end);
2102
2103 // NOTE: |token->modifier| is set in |newToken()| so that optimized,
2104 // non-debug code won't do any work to pass a modifier-argument that will
2105 // never be used.
2106
2107 return token;
2108 }
2109
2110 template <typename Unit, class AnyCharsAccess>
badToken()2111 MOZ_COLD bool GeneralTokenStreamChars<Unit, AnyCharsAccess>::badToken() {
2112 // We didn't get a token, so don't set |flags.isDirtyLine|.
2113 anyCharsAccess().flags.hadError = true;
2114
2115 // Poisoning sourceUnits on error establishes an invariant: once an
2116 // erroneous token has been seen, sourceUnits will not be consulted again.
2117 // This is true because the parser will deal with the illegal token by
2118 // aborting parsing immediately.
2119 this->sourceUnits.poisonInDebug();
2120
2121 return false;
2122 };
2123
AppendCodePointToCharBuffer(CharBuffer & charBuffer,uint32_t codePoint)2124 bool AppendCodePointToCharBuffer(CharBuffer& charBuffer, uint32_t codePoint) {
2125 MOZ_ASSERT(codePoint <= unicode::NonBMPMax,
2126 "should only be processing code points validly decoded from UTF-8 "
2127 "or WTF-16 source text (surrogate code points permitted)");
2128
2129 char16_t units[2];
2130 unsigned numUnits = 0;
2131 unicode::UTF16Encode(codePoint, units, &numUnits);
2132
2133 MOZ_ASSERT(numUnits == 1 || numUnits == 2,
2134 "UTF-16 code points are only encoded in one or two units");
2135
2136 if (!charBuffer.append(units[0])) {
2137 return false;
2138 }
2139
2140 if (numUnits == 1) {
2141 return true;
2142 }
2143
2144 return charBuffer.append(units[1]);
2145 }
2146
2147 template <typename Unit, class AnyCharsAccess>
putIdentInCharBuffer(const Unit * identStart)2148 bool TokenStreamSpecific<Unit, AnyCharsAccess>::putIdentInCharBuffer(
2149 const Unit* identStart) {
2150 const Unit* const originalAddress = this->sourceUnits.addressOfNextCodeUnit();
2151 this->sourceUnits.setAddressOfNextCodeUnit(identStart);
2152
2153 auto restoreNextRawCharAddress = MakeScopeExit([this, originalAddress]() {
2154 this->sourceUnits.setAddressOfNextCodeUnit(originalAddress);
2155 });
2156
2157 this->charBuffer.clear();
2158 do {
2159 int32_t unit = getCodeUnit();
2160 if (unit == EOF) {
2161 break;
2162 }
2163
2164 uint32_t codePoint;
2165 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
2166 if (unicode::IsIdentifierPart(char16_t(unit)) || unit == '#') {
2167 if (!this->charBuffer.append(unit)) {
2168 return false;
2169 }
2170
2171 continue;
2172 }
2173
2174 if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
2175 break;
2176 }
2177 } else {
2178 // |restoreNextRawCharAddress| undoes all gets, and this function
2179 // doesn't update line/column info.
2180 char32_t cp;
2181 if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
2182 return false;
2183 }
2184
2185 codePoint = cp;
2186 if (!unicode::IsIdentifierPart(codePoint)) {
2187 break;
2188 }
2189 }
2190
2191 if (!AppendCodePointToCharBuffer(this->charBuffer, codePoint)) {
2192 return false;
2193 }
2194 } while (true);
2195
2196 return true;
2197 }
2198
// Finish tokenizing an identifier whose first code point has already been
// consumed, and create the corresponding token.
//
// |start| is the token's start position and |identStart| the address of the
// identifier's first code unit.  |escaping| says whether a Unicode escape has
// been seen so far; it is updated if one is found in the rest of the name.
// |visibility| selects between a private-name token and an ordinary name (or
// reserved-word) token.  The resulting token kind is stored in |*out|.
//
// Returns true on success.  On every failure path |badToken()| is run (via
// the scope-exit below) and false is returned.
template <typename Unit, class AnyCharsAccess>
[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::identifierName(
    TokenStart start, const Unit* identStart, IdentifierEscapes escaping,
    Modifier modifier, NameVisibility visibility, TokenKind* out) {
  // Run the bad-token code for every path out of this function except the
  // two success-cases.
  auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });

  // We've already consumed an initial code point in the identifer, to *know*
  // that this is an identifier.  So no need to worry about not consuming any
  // code points in the loop below.
  int32_t unit;
  while (true) {
    unit = peekCodeUnit();
    if (unit == EOF) {
      break;
    }

    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
      consumeKnownCodeUnit(unit);

      if (MOZ_UNLIKELY(
              !unicode::IsIdentifierPart(static_cast<char16_t>(unit)))) {
        // Handle a Unicode escape -- otherwise it's not part of the
        // identifier.
        uint32_t codePoint;
        if (unit != '\\' || !matchUnicodeEscapeIdent(&codePoint)) {
          // Not part of the identifier: put the unit back and stop.
          ungetCodeUnit(unit);
          break;
        }

        escaping = IdentifierEscapes::SawUnicodeEscape;
      }
    } else {
      // This ignores encoding errors: subsequent caller-side code to
      // handle source text after the IdentifierName will do so.
      PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
      if (peeked.isNone() || !unicode::IsIdentifierPart(peeked.codePoint())) {
        break;
      }

      MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
                 "IdentifierPart must guarantee !IsLineTerminator or "
                 "else we'll fail to maintain line-info/flags for EOL");

      this->sourceUnits.consumeKnownCodePoint(peeked);
    }
  }

  TaggedParserAtomIndex atom;
  if (MOZ_UNLIKELY(escaping == IdentifierEscapes::SawUnicodeEscape)) {
    // Identifiers containing Unicode escapes have to be converted into
    // tokenbuf before atomizing.
    if (!putIdentInCharBuffer(identStart)) {
      return false;
    }

    atom = drainCharBufferIntoAtom();
  } else {
    // Escape-free identifiers can be created directly from sourceUnits.
    const Unit* chars = identStart;
    size_t length = this->sourceUnits.addressOfNextCodeUnit() - identStart;

    // Private identifiers start with a '#', and so cannot be reserved words.
    if (visibility == NameVisibility::Public) {
      // Represent reserved words lacking escapes as reserved word tokens.
      if (const ReservedWordInfo* rw = FindReservedWord(chars, length)) {
        // First success case: disarm the bad-token handler.
        noteBadToken.release();
        newSimpleToken(rw->tokentype, start, modifier, out);
        return true;
      }
    }

    atom = atomizeSourceChars(Span(chars, length));
  }
  // A null atom means atomization failed; |noteBadToken| still fires here.
  if (!atom) {
    return false;
  }

  // Second success case: disarm the bad-token handler.
  noteBadToken.release();
  if (visibility == NameVisibility::Private) {
    newPrivateNameToken(atom, start, modifier, out);
    return true;
  }
  newNameToken(atom, start, modifier, out);
  return true;
}
2286
// Classification of an ASCII code unit by the kind of token it can begin.
// Values of this enum are the entries of |firstCharKinds| below.
enum FirstCharKind {
  // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid
  // token that cannot also be a prefix of a longer token.  E.g. ';' has the
  // OneChar kind, but '+' does not, because '++' and '+=' are valid longer
  // tokens that begin with '+'.
  //
  // The few token kinds satisfying these properties cover roughly 35--45%
  // of the tokens seen in practice.
  //
  // We represent the 'OneChar' kind with any non-negative value less than
  // TokenKind::Limit.  This representation lets us associate
  // each one-char token char16_t with a TokenKind and thus avoid
  // a subsequent char16_t-to-TokenKind conversion.
  OneChar_Min = 0,
  OneChar_Max = size_t(TokenKind::Limit) - 1,

  // The remaining kinds are numbered from TokenKind::Limit upward so they
  // never collide with a OneChar value.
  Space = size_t(TokenKind::Limit),
  Ident,
  Dec,
  String,
  EOL,
  ZeroDigit,
  Other,

  LastCharKind = Other
};
2314
// Lookup table mapping each ASCII code unit (0..127) to its FirstCharKind.
//
// OneChar:   40, 41, 44, 58, 59, 91, 93, 123, 125, 126:
//            '(', ')', ',', ':', ';', '[', ']', '{', '}', '~'
// Ident:     36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z'
// String:    34, 39, 96: '"', '\'', '`'
// Dec:       49..57: '1'..'9'
// ZeroDigit: 48: '0'
// Space:     9, 11, 12, 32: '\t', '\v', '\f', ' '
// EOL:       10, 13: '\n', '\r'
// Other:     everything else.  Note that '.' (46), '=' (61) and '+' (43) have
//            no dedicated kind in FirstCharKind and are |Other| in this table.
//
// The T_* macros abbreviate the TokenKind value stored for each OneChar
// entry, and _______ abbreviates Other; all are #undef'd after the table.
#define T_COMMA size_t(TokenKind::Comma)
#define T_COLON size_t(TokenKind::Colon)
#define T_BITNOT size_t(TokenKind::BitNot)
#define T_LP size_t(TokenKind::LeftParen)
#define T_RP size_t(TokenKind::RightParen)
#define T_SEMI size_t(TokenKind::Semi)
#define T_LB size_t(TokenKind::LeftBracket)
#define T_RB size_t(TokenKind::RightBracket)
#define T_LC size_t(TokenKind::LeftCurly)
#define T_RC size_t(TokenKind::RightCurly)
#define _______ Other
static const uint8_t firstCharKinds[] = {
    // clang-format off
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______,   Space,
/*  10+ */     EOL,   Space,   Space,     EOL, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______,   Space, _______,  String, _______,   Ident, _______, _______,  String,
/*  40+ */    T_LP,    T_RP, _______, _______, T_COMMA, _______, _______, _______,ZeroDigit,    Dec,
/*  50+ */     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec, T_COLON,  T_SEMI,
/*  60+ */ _______, _______, _______, _______, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  70+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  80+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  90+ */   Ident,    T_LB, _______,    T_RB, _______,   Ident,  String,   Ident,   Ident,   Ident,
/* 100+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 110+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 120+ */   Ident,   Ident,   Ident,    T_LC, _______,    T_RC,T_BITNOT, _______
    // clang-format on
};
#undef T_COMMA
#undef T_COLON
#undef T_BITNOT
#undef T_LP
#undef T_RP
#undef T_SEMI
#undef T_LB
#undef T_RB
#undef T_LC
#undef T_RC
#undef _______

// Every FirstCharKind value must be representable in one table element.
static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
              "Elements of firstCharKinds[] are too small");
2370
2371 template <>
consumeRestOfSingleLineComment()2372 void SourceUnits<char16_t>::consumeRestOfSingleLineComment() {
2373 while (MOZ_LIKELY(!atEnd())) {
2374 char16_t unit = peekCodeUnit();
2375 if (IsLineTerminator(unit)) {
2376 return;
2377 }
2378
2379 consumeKnownCodeUnit(unit);
2380 }
2381 }
2382
2383 template <>
consumeRestOfSingleLineComment()2384 void SourceUnits<Utf8Unit>::consumeRestOfSingleLineComment() {
2385 while (MOZ_LIKELY(!atEnd())) {
2386 const Utf8Unit unit = peekCodeUnit();
2387 if (IsSingleUnitLineTerminator(unit)) {
2388 return;
2389 }
2390
2391 if (MOZ_LIKELY(IsAscii(unit))) {
2392 consumeKnownCodeUnit(unit);
2393 continue;
2394 }
2395
2396 PeekedCodePoint<Utf8Unit> peeked = peekCodePoint();
2397 if (peeked.isNone()) {
2398 return;
2399 }
2400
2401 char32_t c = peeked.codePoint();
2402 if (MOZ_UNLIKELY(c == unicode::LINE_SEPARATOR ||
2403 c == unicode::PARA_SEPARATOR)) {
2404 return;
2405 }
2406
2407 consumeKnownCodePoint(peeked);
2408 }
2409 }
2410
2411 template <typename Unit, class AnyCharsAccess>
2412 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchInteger(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2413 TokenStreamSpecific<Unit, AnyCharsAccess>::matchInteger(
2414 IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2415 int32_t unit = getCodeUnit();
2416 if (!isIntegerUnit(unit)) {
2417 *nextUnit = unit;
2418 return true;
2419 }
2420 return matchIntegerAfterFirstDigit(isIntegerUnit, nextUnit);
2421 }
2422
2423 template <typename Unit, class AnyCharsAccess>
2424 [[nodiscard]] MOZ_ALWAYS_INLINE bool
matchIntegerAfterFirstDigit(IsIntegerUnit isIntegerUnit,int32_t * nextUnit)2425 TokenStreamSpecific<Unit, AnyCharsAccess>::matchIntegerAfterFirstDigit(
2426 IsIntegerUnit isIntegerUnit, int32_t* nextUnit) {
2427 int32_t unit;
2428 while (true) {
2429 unit = getCodeUnit();
2430 if (isIntegerUnit(unit)) {
2431 continue;
2432 }
2433 if (unit != '_') {
2434 break;
2435 }
2436 unit = getCodeUnit();
2437 if (!isIntegerUnit(unit)) {
2438 if (unit == '_') {
2439 error(JSMSG_NUMBER_MULTIPLE_ADJACENT_UNDERSCORES);
2440 } else {
2441 error(JSMSG_NUMBER_END_WITH_UNDERSCORE);
2442 }
2443 return false;
2444 }
2445 }
2446
2447 *nextUnit = unit;
2448 return true;
2449 }
2450
// Tokenize a decimal numeric literal: an integer, a number with a fraction
// and/or exponent, or a BigInt literal (trailing 'n').
//
// |unit| is examined first: if it is a digit, the remaining integral digits
// are consumed; otherwise it is treated as the '.', 'e'/'E' or 'n' (or the
// unit following the number).  |numStart| is the address from which the
// number's text is converted to a double.  The resulting token kind is stored
// in |*out|.
//
// Returns true on success.  On every failure path |badToken()| is run (via
// the scope-exit below) and false is returned.
template <typename Unit, class AnyCharsAccess>
[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::decimalNumber(
    int32_t unit, TokenStart start, const Unit* numStart, Modifier modifier,
    TokenKind* out) {
  // Run the bad-token code for every path out of this function except the
  // one success-case.
  auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });

  // Consume integral component digits.
  if (IsAsciiDigit(unit)) {
    if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
      return false;
    }
  }

  // Numbers contain no escapes, so we can read directly from |sourceUnits|.
  double dval;
  bool isBigInt = false;
  DecimalPoint decimalPoint = NoDecimal;
  if (unit != '.' && unit != 'e' && unit != 'E' && unit != 'n') {
    // NOTE: |unit| may be EOF here.
    ungetCodeUnit(unit);

    // Most numbers are pure decimal integers without fractional component
    // or exponential notation.  Handle that with optimized code.
    if (!GetDecimalInteger(anyCharsAccess().cx, numStart,
                           this->sourceUnits.addressOfNextCodeUnit(), &dval)) {
      return false;
    }
  } else if (unit == 'n') {
    // BigInt literal: |dval| is left unset and is not used below.  Peek (but
    // don't consume) the following unit for the IdentifierStart check.
    isBigInt = true;
    unit = peekCodeUnit();
  } else {
    // Consume any decimal dot and fractional component.
    if (unit == '.') {
      decimalPoint = HasDecimal;
      if (!matchInteger(IsAsciiDigit, &unit)) {
        return false;
      }
    }

    // Consume any exponential notation.
    if (unit == 'e' || unit == 'E') {
      unit = getCodeUnit();
      if (unit == '+' || unit == '-') {
        unit = getCodeUnit();
      }

      // Exponential notation must contain at least one digit.
      if (!IsAsciiDigit(unit)) {
        ungetCodeUnit(unit);
        error(JSMSG_MISSING_EXPONENT);
        return false;
      }

      // Consume exponential digits.
      if (!matchIntegerAfterFirstDigit(IsAsciiDigit, &unit)) {
        return false;
      }
    }

    // |unit| is the first unit past the number: put it back so the number's
    // text ends at the current position.
    ungetCodeUnit(unit);

    // "0." and "0e..." numbers parse "." or "e..." here.  Neither range
    // contains a number, so we can't use |FullStringToDouble|.  (Parse
    // failures return 0.0, so we'll still get the right result.)
    if (!GetDecimalNonInteger(anyCharsAccess().cx, numStart,
                              this->sourceUnits.addressOfNextCodeUnit(),
                              &dval)) {
      return false;
    }
  }

  // Number followed by IdentifierStart is an error.  (This is the only place
  // in ECMAScript where token boundary is inadequate to properly separate
  // two tokens, necessitating this unaesthetic lookahead.)
  if (unit != EOF) {
    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
      if (unicode::IsIdentifierStart(char16_t(unit))) {
        error(JSMSG_IDSTART_AFTER_NUMBER);
        return false;
      }
    } else {
      // This ignores encoding errors: subsequent caller-side code to
      // handle source text after the number will do so.
      PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
      if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
        error(JSMSG_IDSTART_AFTER_NUMBER);
        return false;
      }
    }
  }

  // Success from here on: disarm the bad-token handler.
  noteBadToken.release();

  if (isBigInt) {
    return bigIntLiteral(start, modifier, out);
  }

  newNumberToken(dval, decimalPoint, start, modifier, out);
  return true;
}
2553
// Tokenize a regular expression literal whose opening '/' has already been
// consumed.  The literal's body (without the delimiting slashes) is
// accumulated in |charBuffer|, the trailing flags are parsed, and a RegExp
// token starting at |start| is created, with its kind stored in |*out|.
//
// Returns true on success.  On failure (unterminated literal, a line
// terminator inside the body, a bad or repeated flag, an undecodable code
// point, or an append failure) |badToken()| is called and false is returned.
template <typename Unit, class AnyCharsAccess>
[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::regexpLiteral(
    TokenStart start, TokenKind* out) {
  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('/'));
  this->charBuffer.clear();

  // Decode the non-ASCII code point starting with |lead| and append it to
  // |charBuffer|.  Returns false (after reporting where applicable) if it
  // can't be decoded, is a line/paragraph separator, or can't be appended.
  auto ProcessNonAsciiCodePoint = [this](int32_t lead) {
    MOZ_ASSERT(lead != EOF);
    MOZ_ASSERT(!this->isAsciiCodePoint(lead));

    char32_t codePoint;
    if (!this->getNonAsciiCodePointDontNormalize(this->toUnit(lead),
                                                 &codePoint)) {
      return false;
    }

    if (MOZ_UNLIKELY(codePoint == unicode::LINE_SEPARATOR ||
                     codePoint == unicode::PARA_SEPARATOR)) {
      this->sourceUnits.ungetLineOrParagraphSeparator();
      this->error(JSMSG_UNTERMINATED_REGEXP);
      return false;
    }

    return AppendCodePointToCharBuffer(this->charBuffer, codePoint);
  };

  // Put |unit| back and report an unterminated-regexp error.
  auto ReportUnterminatedRegExp = [this](int32_t unit) {
    this->ungetCodeUnit(unit);
    this->error(JSMSG_UNTERMINATED_REGEXP);
  };

  // Scan the body up to the closing '/'.
  bool inCharClass = false;
  do {
    int32_t unit = getCodeUnit();
    if (unit == EOF) {
      ReportUnterminatedRegExp(unit);
      return badToken();
    }

    if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
      if (!ProcessNonAsciiCodePoint(unit)) {
        return badToken();
      }

      continue;
    }

    if (unit == '\\') {
      // Keep the backslash, then fetch the escaped unit so that it is never
      // interpreted as '[', ']' or '/' below.
      if (!this->charBuffer.append(unit)) {
        return badToken();
      }

      unit = getCodeUnit();
      if (unit == EOF) {
        ReportUnterminatedRegExp(unit);
        return badToken();
      }

      // Fallthrough only handles ASCII code points, so
      // deal with non-ASCII and skip everything else.
      if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
        if (!ProcessNonAsciiCodePoint(unit)) {
          return badToken();
        }

        continue;
      }
    } else if (unit == '[') {
      inCharClass = true;
    } else if (unit == ']') {
      inCharClass = false;
    } else if (unit == '/' && !inCharClass) {
      // For IE compat, allow unescaped / in char classes.
      break;
    }

    // NOTE: Non-ASCII LineTerminators were handled by
    //       ProcessNonAsciiCodePoint calls above.
    if (unit == '\r' || unit == '\n') {
      ReportUnterminatedRegExp(unit);
      return badToken();
    }

    MOZ_ASSERT(!IsLineTerminator(AssertedCast<char32_t>(unit)));
    if (!this->charBuffer.append(unit)) {
      return badToken();
    }
  } while (true);

  // Scan the flags.  Any ASCII letter is consumed as a candidate flag; an
  // unknown letter or a repeated flag is an error.
  int32_t unit;
  RegExpFlags reflags = RegExpFlag::NoFlags;
  while (true) {
    uint8_t flag;
    unit = getCodeUnit();
    if (unit == 'd') {
      flag = RegExpFlag::HasIndices;
    } else if (unit == 'g') {
      flag = RegExpFlag::Global;
    } else if (unit == 'i') {
      flag = RegExpFlag::IgnoreCase;
    } else if (unit == 'm') {
      flag = RegExpFlag::Multiline;
    } else if (unit == 's') {
      flag = RegExpFlag::DotAll;
    } else if (unit == 'u') {
      flag = RegExpFlag::Unicode;
    } else if (unit == 'y') {
      flag = RegExpFlag::Sticky;
    } else if (IsAsciiAlpha(unit)) {
      // An ASCII letter that isn't a known flag: rejected just below.
      flag = RegExpFlag::NoFlags;
    } else {
      break;
    }

    if ((reflags & flag) || flag == RegExpFlag::NoFlags) {
      ungetCodeUnit(unit);
      char buf[2] = {char(unit), '\0'};
      error(JSMSG_BAD_REGEXP_FLAG, buf);
      return badToken();
    }

    reflags |= flag;
  }
  // The unit that ended the flags isn't part of the literal.
  ungetCodeUnit(unit);

  newRegExpToken(reflags, start, out);
  return true;
}
2682
2683 template <typename Unit, class AnyCharsAccess>
bigIntLiteral(TokenStart start,Modifier modifier,TokenKind * out)2684 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::bigIntLiteral(
2685 TokenStart start, Modifier modifier, TokenKind* out) {
2686 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == toUnit('n'));
2687 MOZ_ASSERT(this->sourceUnits.offset() > start.offset());
2688 uint32_t length = this->sourceUnits.offset() - start.offset();
2689 MOZ_ASSERT(length >= 2);
2690 this->charBuffer.clear();
2691 mozilla::Range<const Unit> chars(
2692 this->sourceUnits.codeUnitPtrAt(start.offset()), length);
2693 for (uint32_t idx = 0; idx < length - 1; idx++) {
2694 int32_t unit = CodeUnitValue(chars[idx]);
2695 // Char buffer may start with a 0[bBoOxX] prefix, then follows with
2696 // binary, octal, decimal, or hex digits. Already checked by caller, as
2697 // the "n" indicating bigint comes at the end.
2698 MOZ_ASSERT(isAsciiCodePoint(unit));
2699 // Skip over any separators.
2700 if (unit == '_') {
2701 continue;
2702 }
2703 if (!AppendCodePointToCharBuffer(this->charBuffer, unit)) {
2704 return false;
2705 }
2706 }
2707 newBigIntToken(start, modifier, out);
2708 return true;
2709 }
2710
2711 template <typename Unit, class AnyCharsAccess>
2712 void GeneralTokenStreamChars<Unit,
consumeOptionalHashbangComment()2713 AnyCharsAccess>::consumeOptionalHashbangComment() {
2714 MOZ_ASSERT(this->sourceUnits.atStart(),
2715 "HashBangComment can only appear immediately at the start of a "
2716 "Script or Module");
2717
2718 // HashbangComment ::
2719 // #! SingleLineCommentChars_opt
2720
2721 if (!matchCodeUnit('#')) {
2722 // HashbangComment is optional at start of Script or Module.
2723 return;
2724 }
2725
2726 if (!matchCodeUnit('!')) {
2727 // # not followed by ! at start of Script or Module is an error, but normal
2728 // parsing code will handle that error just fine if we let it.
2729 ungetCodeUnit('#');
2730 return;
2731 }
2732
2733 // This doesn't consume a concluding LineTerminator, and it stops consuming
2734 // just before any encoding error. The subsequent |getToken| call will call
2735 // |getTokenInternal| below which will handle these possibilities.
2736 this->sourceUnits.consumeRestOfSingleLineComment();
2737 }
2738
2739 template <typename Unit, class AnyCharsAccess>
getTokenInternal(TokenKind * const ttp,const Modifier modifier)2740 [[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getTokenInternal(
2741 TokenKind* const ttp, const Modifier modifier) {
2742 // Assume we'll fail: success cases will overwrite this.
2743 #ifdef DEBUG
2744 *ttp = TokenKind::Limit;
2745 #endif
2746 MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
2747
2748 // This loop runs more than once only when whitespace or comments are
2749 // encountered.
2750 do {
2751 int32_t unit = peekCodeUnit();
2752 if (MOZ_UNLIKELY(unit == EOF)) {
2753 MOZ_ASSERT(this->sourceUnits.atEnd());
2754 anyCharsAccess().flags.isEOF = true;
2755 TokenStart start(this->sourceUnits, 0);
2756 newSimpleToken(TokenKind::Eof, start, modifier, ttp);
2757 return true;
2758 }
2759
2760 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
2761 // Non-ASCII code points can only be identifiers or whitespace. It would
2762 // be nice to compute these *after* discarding whitespace, but IN A WORLD
2763 // where |unicode::IsSpace| requires consuming a variable number of code
2764 // units, it's easier to assume it's an identifier and maybe do a little
2765 // wasted work, than to unget and compute and reget if whitespace.
2766 TokenStart start(this->sourceUnits, 0);
2767 const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit();
2768
2769 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
2770 if (peeked.isNone()) {
2771 int32_t bad;
2772 MOZ_ALWAYS_FALSE(getCodePoint(&bad));
2773 return badToken();
2774 }
2775
2776 char32_t cp = peeked.codePoint();
2777 if (unicode::IsSpace(cp)) {
2778 this->sourceUnits.consumeKnownCodePoint(peeked);
2779 if (IsLineTerminator(cp)) {
2780 if (!updateLineInfoForEOL()) {
2781 return badToken();
2782 }
2783
2784 anyCharsAccess().updateFlagsForEOL();
2785 }
2786
2787 continue;
2788 }
2789
2790 static_assert(isAsciiCodePoint('$'),
2791 "IdentifierStart contains '$', but as "
2792 "!IsUnicodeIDStart('$'), ensure that '$' is never "
2793 "handled here");
2794 static_assert(isAsciiCodePoint('_'),
2795 "IdentifierStart contains '_', but as "
2796 "!IsUnicodeIDStart('_'), ensure that '_' is never "
2797 "handled here");
2798
2799 if (MOZ_LIKELY(unicode::IsUnicodeIDStart(cp))) {
2800 this->sourceUnits.consumeKnownCodePoint(peeked);
2801 MOZ_ASSERT(!IsLineTerminator(cp),
2802 "IdentifierStart must guarantee !IsLineTerminator "
2803 "or else we'll fail to maintain line-info/flags "
2804 "for EOL here");
2805
2806 return identifierName(start, identStart, IdentifierEscapes::None,
2807 modifier, NameVisibility::Public, ttp);
2808 }
2809
2810 reportIllegalCharacter(cp);
2811 return badToken();
2812 } // !isAsciiCodePoint(unit)
2813
2814 consumeKnownCodeUnit(unit);
2815
2816 // Get the token kind, based on the first char. The ordering of c1kind
2817 // comparison is based on the frequency of tokens in real code:
2818 // Parsemark (which represents typical JS code on the web) and the
2819 // Unreal demo (which represents asm.js code).
2820 //
2821 // Parsemark Unreal
2822 // OneChar 32.9% 39.7%
2823 // Space 25.0% 0.6%
2824 // Ident 19.2% 36.4%
2825 // Dec 7.2% 5.1%
2826 // String 7.9% 0.0%
2827 // EOL 1.7% 0.0%
2828 // ZeroDigit 0.4% 4.9%
2829 // Other 5.7% 13.3%
2830 //
2831 // The ordering is based mostly only Parsemark frequencies, with Unreal
2832 // frequencies used to break close categories (e.g. |Dec| and
2833 // |String|). |Other| is biggish, but no other token kind is common
2834 // enough for it to be worth adding extra values to FirstCharKind.
2835 FirstCharKind c1kind = FirstCharKind(firstCharKinds[unit]);
2836
2837 // Look for an unambiguous single-char token.
2838 //
2839 if (c1kind <= OneChar_Max) {
2840 TokenStart start(this->sourceUnits, -1);
2841 newSimpleToken(TokenKind(c1kind), start, modifier, ttp);
2842 return true;
2843 }
2844
2845 // Skip over non-EOL whitespace chars.
2846 //
2847 if (c1kind == Space) {
2848 continue;
2849 }
2850
2851 // Look for an identifier.
2852 //
2853 if (c1kind == Ident) {
2854 TokenStart start(this->sourceUnits, -1);
2855 return identifierName(
2856 start, this->sourceUnits.addressOfNextCodeUnit() - 1,
2857 IdentifierEscapes::None, modifier, NameVisibility::Public, ttp);
2858 }
2859
2860 // Look for a decimal number.
2861 //
2862 if (c1kind == Dec) {
2863 TokenStart start(this->sourceUnits, -1);
2864 const Unit* numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2865 return decimalNumber(unit, start, numStart, modifier, ttp);
2866 }
2867
2868 // Look for a string or a template string.
2869 //
2870 if (c1kind == String) {
2871 return getStringOrTemplateToken(static_cast<char>(unit), modifier, ttp);
2872 }
2873
2874 // Skip over EOL chars, updating line state along the way.
2875 //
2876 if (c1kind == EOL) {
2877 if (unit == '\r') {
2878 matchLineTerminator('\n');
2879 }
2880
2881 if (!updateLineInfoForEOL()) {
2882 return badToken();
2883 }
2884
2885 anyCharsAccess().updateFlagsForEOL();
2886 continue;
2887 }
2888
2889 // From a '0', look for a hexadecimal, binary, octal, or "noctal" (a
2890 // number starting with '0' that contains '8' or '9' and is treated as
2891 // decimal) number.
2892 //
2893 if (c1kind == ZeroDigit) {
2894 TokenStart start(this->sourceUnits, -1);
2895 int radix;
2896 bool isBigInt = false;
2897 const Unit* numStart;
2898 unit = getCodeUnit();
2899 if (unit == 'x' || unit == 'X') {
2900 radix = 16;
2901 unit = getCodeUnit();
2902 if (!IsAsciiHexDigit(unit)) {
2903 // NOTE: |unit| may be EOF here.
2904 ungetCodeUnit(unit);
2905 error(JSMSG_MISSING_HEXDIGITS);
2906 return badToken();
2907 }
2908
2909 // one past the '0x'
2910 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2911
2912 if (!matchIntegerAfterFirstDigit(IsAsciiHexDigit, &unit)) {
2913 return badToken();
2914 }
2915 } else if (unit == 'b' || unit == 'B') {
2916 radix = 2;
2917 unit = getCodeUnit();
2918 if (!IsAsciiBinary(unit)) {
2919 // NOTE: |unit| may be EOF here.
2920 ungetCodeUnit(unit);
2921 error(JSMSG_MISSING_BINARY_DIGITS);
2922 return badToken();
2923 }
2924
2925 // one past the '0b'
2926 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2927
2928 if (!matchIntegerAfterFirstDigit(IsAsciiBinary, &unit)) {
2929 return badToken();
2930 }
2931 } else if (unit == 'o' || unit == 'O') {
2932 radix = 8;
2933 unit = getCodeUnit();
2934 if (!IsAsciiOctal(unit)) {
2935 // NOTE: |unit| may be EOF here.
2936 ungetCodeUnit(unit);
2937 error(JSMSG_MISSING_OCTAL_DIGITS);
2938 return badToken();
2939 }
2940
2941 // one past the '0o'
2942 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2943
2944 if (!matchIntegerAfterFirstDigit(IsAsciiOctal, &unit)) {
2945 return badToken();
2946 }
2947 } else if (IsAsciiDigit(unit)) {
2948 // Reject octal literals that appear in strict mode code.
2949 if (!strictModeError(JSMSG_DEPRECATED_OCTAL_LITERAL)) {
2950 return badToken();
2951 }
2952
2953 // The above test doesn't catch a few edge cases; see
2954 // |GeneralParser::maybeParseDirective|. Record the violation so that
2955 // that function can handle them.
2956 anyCharsAccess().setSawDeprecatedOctalLiteral();
2957
2958 radix = 8;
2959 // one past the '0'
2960 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2961
2962 bool nonOctalDecimalIntegerLiteral = false;
2963 do {
2964 if (unit >= '8') {
2965 nonOctalDecimalIntegerLiteral = true;
2966 }
2967 unit = getCodeUnit();
2968 } while (IsAsciiDigit(unit));
2969
2970 if (unit == '_') {
2971 error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2972 return badToken();
2973 }
2974
2975 if (unit == 'n') {
2976 error(JSMSG_BIGINT_INVALID_SYNTAX);
2977 return badToken();
2978 }
2979
2980 if (nonOctalDecimalIntegerLiteral) {
2981 // Use the decimal scanner for the rest of the number.
2982 return decimalNumber(unit, start, numStart, modifier, ttp);
2983 }
2984 } else if (unit == '_') {
2985 // Give a more explicit error message when '_' is used after '0'.
2986 error(JSMSG_SEPARATOR_IN_ZERO_PREFIXED_NUMBER);
2987 return badToken();
2988 } else {
2989 // '0' not followed by [XxBbOo0-9_]; scan as a decimal number.
2990 numStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
2991
2992 // NOTE: |unit| may be EOF here. (This is permitted by case #3
2993 // in TokenStream.h docs for this function.)
2994 return decimalNumber(unit, start, numStart, modifier, ttp);
2995 }
2996
2997 if (unit == 'n') {
2998 isBigInt = true;
2999 unit = peekCodeUnit();
3000 } else {
3001 ungetCodeUnit(unit);
3002 }
3003
3004 // Error if an identifier-start code point appears immediately
3005 // after the number. Somewhat surprisingly, if we don't check
3006 // here, we'll never check at all.
3007 if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3008 if (unicode::IsIdentifierStart(char16_t(unit))) {
3009 error(JSMSG_IDSTART_AFTER_NUMBER);
3010 return badToken();
3011 }
3012 } else if (MOZ_LIKELY(unit != EOF)) {
3013 // This ignores encoding errors: subsequent caller-side code to
3014 // handle source text after the number will do so.
3015 PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
3016 if (!peeked.isNone() &&
3017 unicode::IsIdentifierStart(peeked.codePoint())) {
3018 error(JSMSG_IDSTART_AFTER_NUMBER);
3019 return badToken();
3020 }
3021 }
3022
3023 if (isBigInt) {
3024 return bigIntLiteral(start, modifier, ttp);
3025 }
3026
3027 double dval;
3028 if (!GetFullInteger(anyCharsAccess().cx, numStart,
3029 this->sourceUnits.addressOfNextCodeUnit(), radix,
3030 IntegerSeparatorHandling::SkipUnderscore, &dval)) {
3031 return badToken();
3032 }
3033 newNumberToken(dval, NoDecimal, start, modifier, ttp);
3034 return true;
3035 }
3036
3037 MOZ_ASSERT(c1kind == Other);
3038
3039 // This handles everything else. Simple tokens distinguished solely by
3040 // TokenKind should set |simpleKind| and break, to share simple-token
3041 // creation code for all such tokens. All other tokens must be handled
3042 // by returning (or by continuing from the loop enclosing this).
3043 //
3044 TokenStart start(this->sourceUnits, -1);
3045 TokenKind simpleKind;
3046 #ifdef DEBUG
3047 simpleKind = TokenKind::Limit; // sentinel value for code after switch
3048 #endif
3049
3050 // The block a ways above eliminated all non-ASCII, so cast to the
3051 // smallest type possible to assist the C++ compiler.
3052 switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3053 case '.':
3054 if (IsAsciiDigit(peekCodeUnit())) {
3055 return decimalNumber('.', start,
3056 this->sourceUnits.addressOfNextCodeUnit() - 1,
3057 modifier, ttp);
3058 }
3059
3060 unit = getCodeUnit();
3061 if (unit == '.') {
3062 if (matchCodeUnit('.')) {
3063 simpleKind = TokenKind::TripleDot;
3064 break;
3065 }
3066 }
3067
3068 // NOTE: |unit| may be EOF here. A stray '.' at EOF would be an
3069 // error, but subsequent code will handle it.
3070 ungetCodeUnit(unit);
3071
3072 simpleKind = TokenKind::Dot;
3073 break;
3074
3075 case '#': {
3076 #ifdef ENABLE_RECORD_TUPLE
3077 if (matchCodeUnit('{')) {
3078 simpleKind = TokenKind::HashCurly;
3079 break;
3080 }
3081 if (matchCodeUnit('[')) {
3082 simpleKind = TokenKind::HashBracket;
3083 break;
3084 }
3085 #endif
3086
3087 TokenStart start(this->sourceUnits, -1);
3088 const Unit* identStart = this->sourceUnits.addressOfNextCodeUnit() - 1;
3089 IdentifierEscapes sawEscape;
3090 if (!matchIdentifierStart(&sawEscape)) {
3091 return badToken();
3092 }
3093 return identifierName(start, identStart, sawEscape, modifier,
3094 NameVisibility::Private, ttp);
3095 }
3096
3097 case '=':
3098 if (matchCodeUnit('=')) {
3099 simpleKind = matchCodeUnit('=') ? TokenKind::StrictEq : TokenKind::Eq;
3100 } else if (matchCodeUnit('>')) {
3101 simpleKind = TokenKind::Arrow;
3102 } else {
3103 simpleKind = TokenKind::Assign;
3104 }
3105 break;
3106
3107 case '+':
3108 if (matchCodeUnit('+')) {
3109 simpleKind = TokenKind::Inc;
3110 } else {
3111 simpleKind =
3112 matchCodeUnit('=') ? TokenKind::AddAssign : TokenKind::Add;
3113 }
3114 break;
3115
3116 case '\\': {
3117 uint32_t codePoint;
3118 if (uint32_t escapeLength = matchUnicodeEscapeIdStart(&codePoint)) {
3119 return identifierName(
3120 start,
3121 this->sourceUnits.addressOfNextCodeUnit() - escapeLength - 1,
3122 IdentifierEscapes::SawUnicodeEscape, modifier,
3123 NameVisibility::Public, ttp);
3124 }
3125
3126 // We could point "into" a mistyped escape, e.g. for "\u{41H}" we
3127 // could point at the 'H'. But we don't do that now, so the code
3128 // unit after the '\' isn't necessarily bad, so just point at the
3129 // start of the actually-invalid escape.
3130 ungetCodeUnit('\\');
3131 error(JSMSG_BAD_ESCAPE);
3132 return badToken();
3133 }
3134
3135 case '|':
3136 if (matchCodeUnit('|')) {
3137 simpleKind = matchCodeUnit('=') ? TokenKind::OrAssign : TokenKind::Or;
3138 } else {
3139 simpleKind =
3140 matchCodeUnit('=') ? TokenKind::BitOrAssign : TokenKind::BitOr;
3141 }
3142 break;
3143
3144 case '^':
3145 simpleKind =
3146 matchCodeUnit('=') ? TokenKind::BitXorAssign : TokenKind::BitXor;
3147 break;
3148
3149 case '&':
3150 if (matchCodeUnit('&')) {
3151 simpleKind =
3152 matchCodeUnit('=') ? TokenKind::AndAssign : TokenKind::And;
3153 } else {
3154 simpleKind =
3155 matchCodeUnit('=') ? TokenKind::BitAndAssign : TokenKind::BitAnd;
3156 }
3157 break;
3158
3159 case '?':
3160 if (matchCodeUnit('.')) {
3161 unit = getCodeUnit();
3162 if (IsAsciiDigit(unit)) {
3163 // if the code unit is followed by a number, for example it has the
3164 // following form `<...> ?.5 <..> then it should be treated as a
3165 // ternary rather than as an optional chain
3166 simpleKind = TokenKind::Hook;
3167 ungetCodeUnit(unit);
3168 ungetCodeUnit('.');
3169 } else {
3170 ungetCodeUnit(unit);
3171 simpleKind = TokenKind::OptionalChain;
3172 }
3173 } else if (matchCodeUnit('?')) {
3174 simpleKind = matchCodeUnit('=') ? TokenKind::CoalesceAssign
3175 : TokenKind::Coalesce;
3176 } else {
3177 simpleKind = TokenKind::Hook;
3178 }
3179 break;
3180
3181 case '!':
3182 if (matchCodeUnit('=')) {
3183 simpleKind = matchCodeUnit('=') ? TokenKind::StrictNe : TokenKind::Ne;
3184 } else {
3185 simpleKind = TokenKind::Not;
3186 }
3187 break;
3188
3189 case '<':
3190 if (anyCharsAccess().options().allowHTMLComments) {
3191 // Treat HTML begin-comment as comment-till-end-of-line.
3192 if (matchCodeUnit('!')) {
3193 if (matchCodeUnit('-')) {
3194 if (matchCodeUnit('-')) {
3195 this->sourceUnits.consumeRestOfSingleLineComment();
3196 continue;
3197 }
3198 ungetCodeUnit('-');
3199 }
3200 ungetCodeUnit('!');
3201 }
3202 }
3203 if (matchCodeUnit('<')) {
3204 simpleKind =
3205 matchCodeUnit('=') ? TokenKind::LshAssign : TokenKind::Lsh;
3206 } else {
3207 simpleKind = matchCodeUnit('=') ? TokenKind::Le : TokenKind::Lt;
3208 }
3209 break;
3210
3211 case '>':
3212 if (matchCodeUnit('>')) {
3213 if (matchCodeUnit('>')) {
3214 simpleKind =
3215 matchCodeUnit('=') ? TokenKind::UrshAssign : TokenKind::Ursh;
3216 } else {
3217 simpleKind =
3218 matchCodeUnit('=') ? TokenKind::RshAssign : TokenKind::Rsh;
3219 }
3220 } else {
3221 simpleKind = matchCodeUnit('=') ? TokenKind::Ge : TokenKind::Gt;
3222 }
3223 break;
3224
3225 case '*':
3226 if (matchCodeUnit('*')) {
3227 simpleKind =
3228 matchCodeUnit('=') ? TokenKind::PowAssign : TokenKind::Pow;
3229 } else {
3230 simpleKind =
3231 matchCodeUnit('=') ? TokenKind::MulAssign : TokenKind::Mul;
3232 }
3233 break;
3234
3235 case '/':
3236 // Look for a single-line comment.
3237 if (matchCodeUnit('/')) {
3238 unit = getCodeUnit();
3239 if (unit == '@' || unit == '#') {
3240 bool shouldWarn = unit == '@';
3241 if (!getDirectives(false, shouldWarn)) {
3242 return false;
3243 }
3244 } else {
3245 // NOTE: |unit| may be EOF here.
3246 ungetCodeUnit(unit);
3247 }
3248
3249 this->sourceUnits.consumeRestOfSingleLineComment();
3250 continue;
3251 }
3252
3253 // Look for a multi-line comment.
3254 if (matchCodeUnit('*')) {
3255 TokenStreamAnyChars& anyChars = anyCharsAccess();
3256 unsigned linenoBefore = anyChars.lineno;
3257
3258 do {
3259 int32_t unit = getCodeUnit();
3260 if (unit == EOF) {
3261 error(JSMSG_UNTERMINATED_COMMENT);
3262 return badToken();
3263 }
3264
3265 if (unit == '*' && matchCodeUnit('/')) {
3266 break;
3267 }
3268
3269 if (unit == '@' || unit == '#') {
3270 bool shouldWarn = unit == '@';
3271 if (!getDirectives(true, shouldWarn)) {
3272 return badToken();
3273 }
3274 } else if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
3275 int32_t codePoint;
3276 if (!getFullAsciiCodePoint(unit, &codePoint)) {
3277 return badToken();
3278 }
3279 } else {
3280 int32_t codePoint;
3281 if (!getNonAsciiCodePoint(unit, &codePoint)) {
3282 return badToken();
3283 }
3284 }
3285 } while (true);
3286
3287 if (linenoBefore != anyChars.lineno) {
3288 anyChars.updateFlagsForEOL();
3289 }
3290
3291 continue;
3292 }
3293
3294 // Look for a regexp.
3295 if (modifier == SlashIsRegExp) {
3296 return regexpLiteral(start, ttp);
3297 }
3298
3299 simpleKind = matchCodeUnit('=') ? TokenKind::DivAssign : TokenKind::Div;
3300 break;
3301
3302 case '%':
3303 simpleKind = matchCodeUnit('=') ? TokenKind::ModAssign : TokenKind::Mod;
3304 break;
3305
3306 case '-':
3307 if (matchCodeUnit('-')) {
3308 if (anyCharsAccess().options().allowHTMLComments &&
3309 !anyCharsAccess().flags.isDirtyLine) {
3310 if (matchCodeUnit('>')) {
3311 this->sourceUnits.consumeRestOfSingleLineComment();
3312 continue;
3313 }
3314 }
3315
3316 simpleKind = TokenKind::Dec;
3317 } else {
3318 simpleKind =
3319 matchCodeUnit('=') ? TokenKind::SubAssign : TokenKind::Sub;
3320 }
3321 break;
3322
3323 default:
3324 // We consumed a bad ASCII code point/unit. Put it back so the
3325 // error location is the bad code point.
3326 ungetCodeUnit(unit);
3327 reportIllegalCharacter(unit);
3328 return badToken();
3329 } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3330
3331 MOZ_ASSERT(simpleKind != TokenKind::Limit,
3332 "switch-statement should have set |simpleKind| before "
3333 "breaking");
3334
3335 newSimpleToken(simpleKind, start, modifier, ttp);
3336 return true;
3337 } while (true);
3338 }
3339
3340 template <typename Unit, class AnyCharsAccess>
getStringOrTemplateToken(char untilChar,Modifier modifier,TokenKind * out)3341 bool TokenStreamSpecific<Unit, AnyCharsAccess>::getStringOrTemplateToken(
3342 char untilChar, Modifier modifier, TokenKind* out) {
3343 MOZ_ASSERT(untilChar == '\'' || untilChar == '"' || untilChar == '`',
3344 "unexpected string/template literal delimiter");
3345
3346 bool parsingTemplate = (untilChar == '`');
3347 bool templateHead = false;
3348
3349 TokenStart start(this->sourceUnits, -1);
3350 this->charBuffer.clear();
3351
3352 // Run the bad-token code for every path out of this function except the
3353 // one success-case.
3354 auto noteBadToken = MakeScopeExit([this]() { this->badToken(); });
3355
3356 auto ReportPrematureEndOfLiteral = [this, untilChar](unsigned errnum) {
3357 // Unicode separators aren't end-of-line in template or (as of
3358 // recently) string literals, so this assertion doesn't allow them.
3359 MOZ_ASSERT(this->sourceUnits.atEnd() ||
3360 this->sourceUnits.peekCodeUnit() == Unit('\r') ||
3361 this->sourceUnits.peekCodeUnit() == Unit('\n'),
3362 "must be parked at EOF or EOL to call this function");
3363
3364 // The various errors reported here include language like "in a ''
3365 // literal" or similar, with '' being '', "", or `` as appropriate.
3366 const char delimiters[] = {untilChar, untilChar, '\0'};
3367
3368 this->error(errnum, delimiters);
3369 return;
3370 };
3371
3372 // We need to detect any of these chars: " or ', \n (or its
3373 // equivalents), \\, EOF. Because we detect EOL sequences here and
3374 // put them back immediately, we can use getCodeUnit().
3375 int32_t unit;
3376 while ((unit = getCodeUnit()) != untilChar) {
3377 if (unit == EOF) {
3378 ReportPrematureEndOfLiteral(JSMSG_EOF_BEFORE_END_OF_LITERAL);
3379 return false;
3380 }
3381
3382 // Non-ASCII code points are always directly appended -- even
3383 // U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR that are
3384 // ordinarily LineTerminatorSequences. (They contribute their literal
3385 // values to template and [as of recently] string literals, but they're
3386 // line terminators when computing line/column coordinates.) Handle
3387 // the non-ASCII case early for readability.
3388 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
3389 char32_t cp;
3390 if (!getNonAsciiCodePointDontNormalize(toUnit(unit), &cp)) {
3391 return false;
3392 }
3393
3394 if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
3395 cp == unicode::PARA_SEPARATOR)) {
3396 if (!updateLineInfoForEOL()) {
3397 return false;
3398 }
3399
3400 anyCharsAccess().updateFlagsForEOL();
3401 } else {
3402 MOZ_ASSERT(!IsLineTerminator(cp));
3403 }
3404
3405 if (!AppendCodePointToCharBuffer(this->charBuffer, cp)) {
3406 return false;
3407 }
3408
3409 continue;
3410 }
3411
3412 if (unit == '\\') {
3413 // When parsing templates, we don't immediately report errors for
3414 // invalid escapes; these are handled by the parser. We don't
3415 // append to charBuffer in those cases because it won't be read.
3416 unit = getCodeUnit();
3417 if (unit == EOF) {
3418 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3419 return false;
3420 }
3421
3422 // Non-ASCII |unit| isn't handled by code after this, so dedicate
3423 // an unlikely special-case to it and then continue.
3424 if (MOZ_UNLIKELY(!isAsciiCodePoint(unit))) {
3425 int32_t codePoint;
3426 if (!getNonAsciiCodePoint(unit, &codePoint)) {
3427 return false;
3428 }
3429
3430 // If we consumed U+2028 LINE SEPARATOR or U+2029 PARAGRAPH
3431 // SEPARATOR, they'll be normalized to '\n'. '\' followed by
3432 // LineContinuation represents no code points, so don't append
3433 // in this case.
3434 if (codePoint != '\n') {
3435 if (!AppendCodePointToCharBuffer(this->charBuffer,
3436 AssertedCast<char32_t>(codePoint))) {
3437 return false;
3438 }
3439 }
3440
3441 continue;
3442 }
3443
3444 // The block above eliminated all non-ASCII, so cast to the
3445 // smallest type possible to assist the C++ compiler.
3446 switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit)))) {
3447 case 'b':
3448 unit = '\b';
3449 break;
3450 case 'f':
3451 unit = '\f';
3452 break;
3453 case 'n':
3454 unit = '\n';
3455 break;
3456 case 'r':
3457 unit = '\r';
3458 break;
3459 case 't':
3460 unit = '\t';
3461 break;
3462 case 'v':
3463 unit = '\v';
3464 break;
3465
3466 case '\r':
3467 matchLineTerminator('\n');
3468 [[fallthrough]];
3469 case '\n': {
3470 // LineContinuation represents no code points. We're manually
3471 // consuming a LineTerminatorSequence, so we must manually
3472 // update line/column info.
3473 if (!updateLineInfoForEOL()) {
3474 return false;
3475 }
3476
3477 continue;
3478 }
3479
3480 // Unicode character specification.
3481 case 'u': {
3482 int32_t c2 = getCodeUnit();
3483 if (c2 == EOF) {
3484 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3485 return false;
3486 }
3487
3488 // First handle a delimited Unicode escape, e.g. \u{1F4A9}.
3489 if (c2 == '{') {
3490 uint32_t start = this->sourceUnits.offset() - 3;
3491 uint32_t code = 0;
3492 bool first = true;
3493 bool valid = true;
3494 do {
3495 int32_t u3 = getCodeUnit();
3496 if (u3 == EOF) {
3497 if (parsingTemplate) {
3498 TokenStreamAnyChars& anyChars = anyCharsAccess();
3499 anyChars.setInvalidTemplateEscape(start,
3500 InvalidEscapeType::Unicode);
3501 valid = false;
3502 break;
3503 }
3504 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3505 return false;
3506 }
3507 if (u3 == '}') {
3508 if (first) {
3509 if (parsingTemplate) {
3510 TokenStreamAnyChars& anyChars = anyCharsAccess();
3511 anyChars.setInvalidTemplateEscape(
3512 start, InvalidEscapeType::Unicode);
3513 valid = false;
3514 break;
3515 }
3516 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3517 return false;
3518 }
3519 break;
3520 }
3521
3522 // Beware: |u3| may be a non-ASCII code point here; if
3523 // so it'll pass into this |if|-block.
3524 if (!IsAsciiHexDigit(u3)) {
3525 if (parsingTemplate) {
3526 // We put the code unit back so that we read it
3527 // on the next pass, which matters if it was
3528 // '`' or '\'.
3529 ungetCodeUnit(u3);
3530
3531 TokenStreamAnyChars& anyChars = anyCharsAccess();
3532 anyChars.setInvalidTemplateEscape(start,
3533 InvalidEscapeType::Unicode);
3534 valid = false;
3535 break;
3536 }
3537 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3538 return false;
3539 }
3540
3541 code = (code << 4) | AsciiAlphanumericToNumber(u3);
3542 if (code > unicode::NonBMPMax) {
3543 if (parsingTemplate) {
3544 TokenStreamAnyChars& anyChars = anyCharsAccess();
3545 anyChars.setInvalidTemplateEscape(
3546 start + 3, InvalidEscapeType::UnicodeOverflow);
3547 valid = false;
3548 break;
3549 }
3550 reportInvalidEscapeError(start + 3,
3551 InvalidEscapeType::UnicodeOverflow);
3552 return false;
3553 }
3554
3555 first = false;
3556 } while (true);
3557
3558 if (!valid) {
3559 continue;
3560 }
3561
3562 MOZ_ASSERT(code <= unicode::NonBMPMax);
3563 if (!AppendCodePointToCharBuffer(this->charBuffer, code)) {
3564 return false;
3565 }
3566
3567 continue;
3568 } // end of delimited Unicode escape handling
3569
3570 // Otherwise it must be a fixed-length \uXXXX Unicode escape.
3571 // If it isn't, this is usually an error -- but if this is a
3572 // template literal, we must defer error reporting because
3573 // malformed escapes are okay in *tagged* template literals.
3574 char16_t v;
3575 if (IsAsciiHexDigit(c2) && this->sourceUnits.matchHexDigits(3, &v)) {
3576 unit = (AsciiAlphanumericToNumber(c2) << 12) | v;
3577 } else {
3578 // Beware: |c2| may not be an ASCII code point here!
3579 ungetCodeUnit(c2);
3580 uint32_t start = this->sourceUnits.offset() - 2;
3581 if (parsingTemplate) {
3582 TokenStreamAnyChars& anyChars = anyCharsAccess();
3583 anyChars.setInvalidTemplateEscape(start,
3584 InvalidEscapeType::Unicode);
3585 continue;
3586 }
3587 reportInvalidEscapeError(start, InvalidEscapeType::Unicode);
3588 return false;
3589 }
3590 break;
3591 } // case 'u'
3592
3593 // Hexadecimal character specification.
3594 case 'x': {
3595 char16_t v;
3596 if (this->sourceUnits.matchHexDigits(2, &v)) {
3597 unit = v;
3598 } else {
3599 uint32_t start = this->sourceUnits.offset() - 2;
3600 if (parsingTemplate) {
3601 TokenStreamAnyChars& anyChars = anyCharsAccess();
3602 anyChars.setInvalidTemplateEscape(start,
3603 InvalidEscapeType::Hexadecimal);
3604 continue;
3605 }
3606 reportInvalidEscapeError(start, InvalidEscapeType::Hexadecimal);
3607 return false;
3608 }
3609 break;
3610 }
3611
3612 default: {
3613 if (!IsAsciiOctal(unit)) {
3614 // \8 or \9 in an untagged template literal is a syntax error,
3615 // reported in GeneralParser::noSubstitutionUntaggedTemplate.
3616 //
3617 // Tagged template literals, however, may contain \8 and \9. The
3618 // "cooked" representation of such a part will be |undefined|, and
3619 // the "raw" representation will contain the literal characters.
3620 //
3621 // function f(parts) {
3622 // assertEq(parts[0], undefined);
3623 // assertEq(parts.raw[0], "\\8");
3624 // return "composed";
3625 // }
3626 // assertEq(f`\8`, "composed");
3627 if (unit == '8' || unit == '9') {
3628 TokenStreamAnyChars& anyChars = anyCharsAccess();
3629 if (parsingTemplate) {
3630 anyChars.setInvalidTemplateEscape(
3631 this->sourceUnits.offset() - 2,
3632 InvalidEscapeType::EightOrNine);
3633 continue;
3634 }
3635
3636 // \8 and \9 are forbidden in string literals in strict mode code.
3637 if (!strictModeError(JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE)) {
3638 return false;
3639 }
3640
3641 // The above test doesn't catch a few edge cases; see
3642 // |GeneralParser::maybeParseDirective|. Record the violation so
3643 // that that function can handle them.
3644 anyChars.setSawDeprecatedEightOrNineEscape();
3645 }
3646 break;
3647 }
3648
3649 // Octal character specification.
3650 int32_t val = AsciiOctalToNumber(unit);
3651
3652 unit = peekCodeUnit();
3653 if (MOZ_UNLIKELY(unit == EOF)) {
3654 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3655 return false;
3656 }
3657
3658 // Strict mode code allows only \0 followed by a non-digit.
3659 if (val != 0 || IsAsciiDigit(unit)) {
3660 TokenStreamAnyChars& anyChars = anyCharsAccess();
3661 if (parsingTemplate) {
3662 anyChars.setInvalidTemplateEscape(this->sourceUnits.offset() - 2,
3663 InvalidEscapeType::Octal);
3664 continue;
3665 }
3666
3667 if (!strictModeError(JSMSG_DEPRECATED_OCTAL_ESCAPE)) {
3668 return false;
3669 }
3670
3671 // The above test doesn't catch a few edge cases; see
3672 // |GeneralParser::maybeParseDirective|. Record the violation so
3673 // that that function can handle them.
3674 anyChars.setSawDeprecatedOctalEscape();
3675 }
3676
3677 if (IsAsciiOctal(unit)) {
3678 val = 8 * val + AsciiOctalToNumber(unit);
3679 consumeKnownCodeUnit(unit);
3680
3681 unit = peekCodeUnit();
3682 if (MOZ_UNLIKELY(unit == EOF)) {
3683 ReportPrematureEndOfLiteral(JSMSG_EOF_IN_ESCAPE_IN_LITERAL);
3684 return false;
3685 }
3686
3687 if (IsAsciiOctal(unit)) {
3688 int32_t save = val;
3689 val = 8 * val + AsciiOctalToNumber(unit);
3690 if (val <= 0xFF) {
3691 consumeKnownCodeUnit(unit);
3692 } else {
3693 val = save;
3694 }
3695 }
3696 }
3697
3698 unit = char16_t(val);
3699 break;
3700 } // default
3701 } // switch (AssertedCast<uint8_t>(CodeUnitValue(toUnit(unit))))
3702
3703 if (!this->charBuffer.append(unit)) {
3704 return false;
3705 }
3706
3707 continue;
3708 } // (unit == '\\')
3709
3710 if (unit == '\r' || unit == '\n') {
3711 if (!parsingTemplate) {
3712 // String literals don't allow ASCII line breaks.
3713 ungetCodeUnit(unit);
3714 ReportPrematureEndOfLiteral(JSMSG_EOL_BEFORE_END_OF_STRING);
3715 return false;
3716 }
3717
3718 if (unit == '\r') {
3719 unit = '\n';
3720 matchLineTerminator('\n');
3721 }
3722
3723 if (!updateLineInfoForEOL()) {
3724 return false;
3725 }
3726
3727 anyCharsAccess().updateFlagsForEOL();
3728 } else if (parsingTemplate && unit == '$' && matchCodeUnit('{')) {
3729 templateHead = true;
3730 break;
3731 }
3732
3733 if (!this->charBuffer.append(unit)) {
3734 return false;
3735 }
3736 }
3737
3738 TaggedParserAtomIndex atom = drainCharBufferIntoAtom();
3739 if (!atom) {
3740 return false;
3741 }
3742
3743 noteBadToken.release();
3744
3745 MOZ_ASSERT_IF(!parsingTemplate, !templateHead);
3746
3747 TokenKind kind = !parsingTemplate ? TokenKind::String
3748 : templateHead ? TokenKind::TemplateHead
3749 : TokenKind::NoSubsTemplate;
3750 newAtomToken(kind, atom, start, modifier, out);
3751 return true;
3752 }
3753
TokenKindToDesc(TokenKind tt)3754 const char* TokenKindToDesc(TokenKind tt) {
3755 switch (tt) {
3756 #define EMIT_CASE(name, desc) \
3757 case TokenKind::name: \
3758 return desc;
3759 FOR_EACH_TOKEN_KIND(EMIT_CASE)
3760 #undef EMIT_CASE
3761 case TokenKind::Limit:
3762 MOZ_ASSERT_UNREACHABLE("TokenKind::Limit should not be passed.");
3763 break;
3764 }
3765
3766 return "<bad TokenKind>";
3767 }
3768
#ifdef DEBUG
// Debug-only: map a TokenKind to the spelling of its enumerator, e.g.
// "TokenKind::Eof".  |TokenKind::Limit|, and any value matching no case,
// yields "<bad TokenKind>".
const char* TokenKindToString(TokenKind tt) {
  const char* spelling = "<bad TokenKind>";

  switch (tt) {
#  define SPELL_TOKEN_KIND(name, desc) \
    case TokenKind::name:              \
      spelling = "TokenKind::" #name;  \
      break;
    FOR_EACH_TOKEN_KIND(SPELL_TOKEN_KIND)
#  undef SPELL_TOKEN_KIND
    case TokenKind::Limit:
      break;
  }

  return spelling;
}
#endif
3784
// Explicit instantiation definitions for the token stream class templates.
//
// TokenStreamCharsBase is instantiated per code unit type (Utf8Unit and
// char16_t).  GeneralTokenStreamChars, TokenStreamChars, and
// TokenStreamSpecific are instantiated for char16_t with
// TokenStreamAnyCharsAccess, and for each combination of
// {Utf8Unit, char16_t} x {FullParseHandler, SyntaxParseHandler} with
// ParserAnyCharsAccess<GeneralParser<...>>.
//
// NOTE(review): presumably these are the only instantiations used elsewhere,
// since the member templates are defined in this file -- confirm before
// adding a new Unit/AnyCharsAccess combination.
template class TokenStreamCharsBase<Utf8Unit>;
template class TokenStreamCharsBase<char16_t>;

// Instantiations using TokenStreamAnyCharsAccess.
template class GeneralTokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
template class TokenStreamChars<char16_t, TokenStreamAnyCharsAccess>;
template class TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>;

// GeneralTokenStreamChars, for every parser flavor.
template class GeneralTokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
template class GeneralTokenStreamChars<
    Utf8Unit,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
template class GeneralTokenStreamChars<
    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
template class GeneralTokenStreamChars<
    char16_t,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;

// TokenStreamChars, for every parser flavor.
template class TokenStreamChars<
    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
template class TokenStreamChars<
    Utf8Unit,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
template class TokenStreamChars<
    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
template class TokenStreamChars<
    char16_t,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;

// TokenStreamSpecific, for every parser flavor.
template class TokenStreamSpecific<
    Utf8Unit, ParserAnyCharsAccess<GeneralParser<FullParseHandler, Utf8Unit>>>;
template class TokenStreamSpecific<
    Utf8Unit,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, Utf8Unit>>>;
template class TokenStreamSpecific<
    char16_t, ParserAnyCharsAccess<GeneralParser<FullParseHandler, char16_t>>>;
template class TokenStreamSpecific<
    char16_t,
    ParserAnyCharsAccess<GeneralParser<SyntaxParseHandler, char16_t>>>;
3824
3825 } // namespace frontend
3826
3827 } // namespace js
3828