1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /*
8  * Streaming access to the raw tokens of JavaScript source.
9  *
10  * Because JS tokenization is context-sensitive -- a '/' could be either a
11  * regular expression *or* a division operator depending on context -- the
12  * various token stream classes are mostly not useful outside of the Parser
13  * where they reside.  We should probably eventually merge the two concepts.
14  */
15 #ifndef frontend_TokenStream_h
16 #define frontend_TokenStream_h
17 
18 /*
19  * [SMDOC] Parser Token Stream
20  *
21  * A token stream exposes the raw tokens -- operators, names, numbers,
22  * keywords, and so on -- of JavaScript source code.
23  *
24  * These are the components of the overall token stream concept:
25  * TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsBase<Unit>,
26  * TokenStreamChars<Unit>, and TokenStreamSpecific<Unit, AnyCharsAccess>.
27  *
28  * == TokenStreamShared → ∅ ==
29  *
30  * Certain aspects of tokenizing are used everywhere:
31  *
32  *   * modifiers (used to select which context-sensitive interpretation of a
33  *     character should be used to decide what token it is) and modifier
34  *     assertion handling;
35  *   * flags on the overall stream (have we encountered any characters on this
36  *     line?  have we hit a syntax error?  and so on);
37  *   * and certain token-count constants.
38  *
39  * These are all defined in TokenStreamShared.  (They could be namespace-
40  * scoped, but it seems tentatively better not to clutter the namespace.)
41  *
42  * == TokenStreamAnyChars → TokenStreamShared ==
43  *
44  * Certain aspects of tokenizing have meaning independent of the character type
45  * of the source text being tokenized: line/column number information, tokens
46  * in lookahead from determining the meaning of a prior token, compilation
47  * options, the filename, flags, source map URL, access to details of the
48  * current and next tokens (is the token of the given type?  what name or
49  * number is contained in the token?  and other queries), and others.
50  *
51  * All this data/functionality *could* be duplicated for both single-byte and
52  * double-byte tokenizing, but there are two problems.  First, it's potentially
 * wasteful if the compiler doesn't recognize it can unify the concepts.  (And
54  * if any-character concepts are intermixed with character-specific concepts,
55  * potentially the compiler *can't* unify them because offsets into the
56  * hypothetical TokenStream<Unit>s would differ.)  Second, some of this stuff
57  * needs to be accessible in ParserBase, the aspects of JS language parsing
58  * that have meaning independent of the character type of the source text being
59  * parsed.  So we need a separate data structure that ParserBase can hold on to
60  * for it.  (ParserBase isn't the only instance of this, but it's certainly the
61  * biggest case of it.)  Ergo, TokenStreamAnyChars.
62  *
63  * == TokenStreamCharsShared → ∅ ==
64  *
65  * Some functionality has meaning independent of character type, yet has no use
66  * *unless* you know the character type in actual use.  It *could* live in
67  * TokenStreamAnyChars, but it makes more sense to live in a separate class
68  * that character-aware token information can simply inherit.
69  *
70  * This class currently exists only to contain a char16_t buffer, transiently
71  * used to accumulate strings in tricky cases that can't just be read directly
72  * from source text.  It's not used outside character-aware tokenizing, so it
73  * doesn't make sense in TokenStreamAnyChars.
74  *
75  * == TokenStreamCharsBase<Unit> → TokenStreamCharsShared ==
76  *
77  * Certain data structures in tokenizing are character-type-specific: namely,
78  * the various pointers identifying the source text (including current offset
79  * and end).
80  *
81  * Additionally, some functions operating on this data are defined the same way
82  * no matter what character type you have (e.g. current offset in code units
83  * into the source text) or share a common interface regardless of character
84  * type (e.g. consume the next code unit if it has a given value).
85  *
86  * All such functionality lives in TokenStreamCharsBase<Unit>.
87  *
88  * == SpecializedTokenStreamCharsBase<Unit> → TokenStreamCharsBase<Unit> ==
89  *
90  * Certain tokenizing functionality is specific to a single character type.
91  * For example, JS's UTF-16 encoding recognizes no coding errors, because lone
92  * surrogates are not an error; but a UTF-8 encoding must recognize a variety
93  * of validation errors.  Such functionality is defined only in the appropriate
94  * SpecializedTokenStreamCharsBase specialization.
95  *
96  * == GeneralTokenStreamChars<Unit, AnyCharsAccess> →
97  *    SpecializedTokenStreamCharsBase<Unit> ==
98  *
99  * Some functionality operates differently on different character types, just
100  * as for TokenStreamCharsBase, but additionally requires access to character-
101  * type-agnostic information in TokenStreamAnyChars.  For example, getting the
102  * next character performs different steps for different character types and
103  * must access TokenStreamAnyChars to update line break information.
104  *
105  * Such functionality, if it can be defined using the same algorithm for all
106  * character types, lives in GeneralTokenStreamChars<Unit, AnyCharsAccess>.
107  * The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
108  * instance to access its corresponding TokenStreamAnyChars, without inheriting
109  * from it.
110  *
111  * GeneralTokenStreamChars<Unit, AnyCharsAccess> is just functionality, no
112  * actual member data.
113  *
114  * Such functionality all lives in TokenStreamChars<Unit, AnyCharsAccess>, a
115  * declared-but-not-defined template class whose specializations have a common
116  * public interface (plus whatever private helper functions are desirable).
117  *
118  * == TokenStreamChars<Unit, AnyCharsAccess> →
119  *    GeneralTokenStreamChars<Unit, AnyCharsAccess> ==
120  *
121  * Some functionality is like that in GeneralTokenStreamChars, *but* it's
122  * defined entirely differently for different character types.
123  *
124  * For example, consider "match a multi-code unit code point" (hypothetically:
125  * we've only implemented two-byte tokenizing right now):
126  *
127  *   * For two-byte text, there must be two code units to get, the leading code
128  *     unit must be a UTF-16 lead surrogate, and the trailing code unit must be
 *     a UTF-16 trailing surrogate.  (If any of these fail to hold, the next
 *     code unit by itself encodes a code point and is not multi-code unit.)
131  *   * For single-byte Latin-1 text, there are no multi-code unit code points.
132  *   * For single-byte UTF-8 text, the first code unit must have N > 1 of its
133  *     highest bits set (and the next unset), and |N - 1| successive code units
134  *     must have their high bit set and next-highest bit unset, *and*
135  *     concatenating all unconstrained bits together must not produce a code
136  *     point value that could have been encoded in fewer code units.
137  *
138  * This functionality can't be implemented as member functions in
139  * GeneralTokenStreamChars because we'd need to *partially specialize* those
140  * functions -- hold Unit constant while letting AnyCharsAccess vary.  But
141  * C++ forbids function template partial specialization like this: either you
142  * fix *all* parameters or you fix none of them.
143  *
144  * Fortunately, C++ *does* allow *class* template partial specialization.  So
145  * TokenStreamChars is a template class with one specialization per Unit.
146  * Functions can be defined differently in the different specializations,
147  * because AnyCharsAccess as the only template parameter on member functions
148  * *can* vary.
149  *
150  * All TokenStreamChars<Unit, AnyCharsAccess> specializations, one per Unit,
151  * are just functionality, no actual member data.
152  *
153  * == TokenStreamSpecific<Unit, AnyCharsAccess> →
154  *    TokenStreamChars<Unit, AnyCharsAccess>, TokenStreamShared,
155  *    ErrorReporter ==
156  *
157  * TokenStreamSpecific is operations that are parametrized on character type
158  * but implement the *general* idea of tokenizing, without being intrinsically
159  * tied to character type.  Notably, this includes all operations that can
160  * report warnings or errors at particular offsets, because we include a line
161  * of context with such errors -- and that necessarily accesses the raw
162  * characters of their specific type.
163  *
164  * Much TokenStreamSpecific operation depends on functionality in
165  * TokenStreamAnyChars.  The obvious solution is to inherit it -- but this
166  * doesn't work in Parser: its ParserBase base class needs some
167  * TokenStreamAnyChars functionality without knowing character type.
168  *
169  * The AnyCharsAccess type parameter is a class that statically converts from a
170  * TokenStreamSpecific* to its corresponding TokenStreamAnyChars.  The
171  * TokenStreamSpecific in Parser<ParseHandler, Unit> can then specify a class
172  * that properly converts from TokenStreamSpecific Parser::tokenStream to
173  * TokenStreamAnyChars ParserBase::anyChars.
174  *
175  * Could we hardcode one set of offset calculations for this and eliminate
176  * AnyCharsAccess?  No.  Offset calculations possibly could be hardcoded if
177  * TokenStreamSpecific were present in Parser before Parser::handler, assuring
178  * the same offsets in all Parser-related cases.  But there's still a separate
179  * TokenStream class, that requires different offset calculations.  So even if
180  * we wanted to hardcode this (it's not clear we would, because forcing the
181  * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
182  */
183 
184 #include "mozilla/ArrayUtils.h"
185 #include "mozilla/Assertions.h"
186 #include "mozilla/Attributes.h"
187 #include "mozilla/Casting.h"
188 #include "mozilla/DebugOnly.h"
189 #include "mozilla/Maybe.h"
190 #include "mozilla/MemoryChecking.h"
191 #include "mozilla/PodOperations.h"
192 #include "mozilla/Span.h"
193 #include "mozilla/TextUtils.h"
194 #include "mozilla/Utf8.h"
195 
#include <algorithm>
#include <limits>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <type_traits>
202 
203 #include "jspubtd.h"
204 
205 #include "frontend/ErrorReporter.h"
206 #include "frontend/ParserAtom.h"  // ParserAtom, ParserAtomsTable, TaggedParserAtomIndex
207 #include "frontend/Token.h"
208 #include "frontend/TokenKind.h"
209 #include "js/CompileOptions.h"
210 #include "js/friend/ErrorMessages.h"  // JSMSG_*
211 #include "js/HashTable.h"             // js::HashMap
212 #include "js/RegExpFlags.h"           // JS::RegExpFlags
213 #include "js/UniquePtr.h"
214 #include "js/Vector.h"
215 #include "util/Text.h"
216 #include "util/Unicode.h"
217 #include "vm/ErrorReporting.h"
218 #include "vm/JSAtom.h"
219 #include "vm/StringType.h"
220 
221 struct JS_PUBLIC_API JSContext;
222 struct KeywordInfo;
223 
224 namespace js {
225 
226 namespace frontend {
227 
// Saturate column numbers at a limit that can be represented in various parts
// of the engine.  Source locations beyond this point will report at the limit
// column instead.
//
// See:
//  - TokenStreamAnyChars::checkOptions
//  - ColSpan::isRepresentable
//  - WasmFrameIter::computeLine
//
// This is an |inline| variable rather than a |static| one so that every
// translation unit including this header names the same entity.  A |static|
// constant has internal linkage, which makes odr-uses of it from inline
// functions and templates defined in headers ill-formed.
inline constexpr uint32_t ColumnLimit =
    std::numeric_limits<int32_t>::max() / 2;
237 
238 // If `name` is reserved word, returns the TokenKind of it.
239 // TokenKind::Limit otherwise.
240 extern TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name);
241 
242 // If `name` is reserved word, returns string representation of it.
243 // nullptr otherwise.
244 extern const char* ReservedWordToCharZ(TaggedParserAtomIndex name);
245 
246 // If `tt` is reserved word, returns string representation of it.
247 // nullptr otherwise.
248 extern const char* ReservedWordToCharZ(TokenKind tt);
249 
/**
 * The kinds of deprecated content that can be recorded as seen.
 *
 * The enumerator values are stored in the two-bit
 * |TokenStreamFlags::sawDeprecatedContent| field, so they must stay within
 * the range [0, 3].
 */
enum class DeprecatedContent : uint8_t {
  /** No deprecated content was present. */
  None = 0,

  /** Octal literal prefixed by just "0" rather than by "0o", e.g. 0755. */
  OctalLiteral = 1,

  /** Octal character escape, e.g. "hell\157 world". */
  OctalEscape = 2,

  /** NonOctalDecimalEscape, i.e. "\8" or "\9". */
  EightOrNineEscape = 3,
};
260 
261 struct TokenStreamFlags {
262   // Hit end of file.
263   bool isEOF : 1;
264   // Non-whitespace since start of line.
265   bool isDirtyLine : 1;
266   // Hit a syntax error, at start or during a token.
267   bool hadError : 1;
268 
269   // The nature of any deprecated content seen since last reset.
270   // We have to uint8_t instead DeprecatedContent to work around a GCC 7 bug.
271   // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414
272   uint8_t sawDeprecatedContent : 2;
273 
TokenStreamFlagsTokenStreamFlags274   TokenStreamFlags()
275       : isEOF(false),
276         isDirtyLine(false),
277         hadError(false),
278         sawDeprecatedContent(uint8_t(DeprecatedContent::None)) {}
279 };
280 
281 template <typename Unit>
282 class TokenStreamPosition;
283 
284 /**
285  * TokenStream types and constants that are used in both TokenStreamAnyChars
286  * and TokenStreamSpecific.  Do not add any non-static data members to this
287  * class!
288  */
289 class TokenStreamShared {
290  protected:
291   static constexpr size_t ntokens = 4;  // 1 current + 2 lookahead, rounded
292                                         // to power of 2 to avoid divmod by 3
293 
294   static constexpr unsigned ntokensMask = ntokens - 1;
295 
296   template <typename Unit>
297   friend class TokenStreamPosition;
298 
299  public:
300   static constexpr unsigned maxLookahead = 2;
301 
302   using Modifier = Token::Modifier;
303   static constexpr Modifier SlashIsDiv = Token::SlashIsDiv;
304   static constexpr Modifier SlashIsRegExp = Token::SlashIsRegExp;
305   static constexpr Modifier SlashIsInvalid = Token::SlashIsInvalid;
306 
verifyConsistentModifier(Modifier modifier,const Token & nextToken)307   static void verifyConsistentModifier(Modifier modifier,
308                                        const Token& nextToken) {
309     MOZ_ASSERT(
310         modifier == nextToken.modifier || modifier == SlashIsInvalid,
311         "This token was scanned with both SlashIsRegExp and SlashIsDiv, "
312         "indicating the parser is confused about how to handle a slash here. "
313         "See comment at Token::Modifier.");
314   }
315 };
316 
317 static_assert(std::is_empty_v<TokenStreamShared>,
318               "TokenStreamShared shouldn't bloat classes that inherit from it");
319 
320 template <typename Unit, class AnyCharsAccess>
321 class TokenStreamSpecific;
322 
323 template <typename Unit>
324 class MOZ_STACK_CLASS TokenStreamPosition final {
325  public:
326   template <class AnyCharsAccess>
327   inline explicit TokenStreamPosition(
328       TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream);
329 
330  private:
331   TokenStreamPosition(const TokenStreamPosition&) = delete;
332 
333   // Technically only TokenStreamSpecific<Unit, AnyCharsAccess>::seek with
334   // Unit constant and AnyCharsAccess varying must be friended, but 1) it's
335   // hard to friend one function in template classes, and 2) C++ doesn't
336   // allow partial friend specialization to target just that single class.
337   template <typename Char, class AnyCharsAccess>
338   friend class TokenStreamSpecific;
339 
340   const Unit* buf;
341   TokenStreamFlags flags;
342   unsigned lineno;
343   size_t linebase;
344   size_t prevLinebase;
345   Token currentToken;
346   unsigned lookahead;
347   Token lookaheadTokens[TokenStreamShared::maxLookahead];
348 };
349 
350 template <typename Unit>
351 class SourceUnits;
352 
353 /**
354  * This class maps:
355  *
356  *   * a sourceUnits offset (a 0-indexed count of code units)
357  *
358  * to
359  *
360  *   * a (1-indexed) line number and
361  *   * a (0-indexed) offset in code *units* (not code points, not bytes) into
362  *     that line,
363  *
364  * for either |Unit = Utf8Unit| or |Unit = char16_t|.
365  *
366  * Note that the latter quantity is *not* the same as a column number, which is
367  * a count of code *points*.  Computing a column number requires the offset
368  * within the line and the source units of that line (including what type |Unit|
369  * is, to know how to decode them).  If you need a column number, functions in
370  * |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
371  * it.
372  */
373 class SourceCoords {
374   // For a given buffer holding source code, |lineStartOffsets_| has one
375   // element per line of source code, plus one sentinel element.  Each
376   // non-sentinel element holds the buffer offset for the start of the
377   // corresponding line of source code.  For this example script,
378   // assuming an initialLineOffset of 0:
379   //
380   // 1  // xyz            [line starts at offset 0]
381   // 2  var x;            [line starts at offset 7]
382   // 3                    [line starts at offset 14]
383   // 4  var y;            [line starts at offset 15]
384   //
385   // |lineStartOffsets_| is:
386   //
387   //   [0, 7, 14, 15, MAX_PTR]
388   //
389   // To convert a "line number" to an "index" into |lineStartOffsets_|,
390   // subtract |initialLineNum_|.  E.g. line 3's index is
391   // (3 - initialLineNum_), which is 2.  Therefore lineStartOffsets_[2]
392   // holds the buffer offset for the start of line 3, which is 14.  (Note
393   // that |initialLineNum_| is often 1, but not always.
394   //
395   // The first element is always initialLineOffset, passed to the
396   // constructor, and the last element is always the MAX_PTR sentinel.
397   //
398   // Offset-to-{line,offset-into-line} lookups are O(log n) in the worst
399   // case (binary search), but in practice they're heavily clustered and
400   // we do better than that by using the previous lookup's result
401   // (lastIndex_) as a starting point.
402   //
403   // Checking if an offset lies within a particular line number
404   // (isOnThisLine()) is O(1).
405   //
406   Vector<uint32_t, 128> lineStartOffsets_;
407 
408   /** The line number on which the source text begins. */
409   uint32_t initialLineNum_;
410 
411   /**
412    * The index corresponding to the last offset lookup -- used so that if
413    * offset lookups proceed in increasing order, and and the offset appears
414    * in the next couple lines from the last offset, we can avoid a full
415    * binary-search.
416    *
417    * This is mutable because it's modified on every search, but that fact
418    * isn't visible outside this class.
419    */
420   mutable uint32_t lastIndex_;
421 
422   uint32_t indexFromOffset(uint32_t offset) const;
423 
424   static const uint32_t MAX_PTR = UINT32_MAX;
425 
lineNumberFromIndex(uint32_t index)426   uint32_t lineNumberFromIndex(uint32_t index) const {
427     return index + initialLineNum_;
428   }
429 
indexFromLineNumber(uint32_t lineNum)430   uint32_t indexFromLineNumber(uint32_t lineNum) const {
431     return lineNum - initialLineNum_;
432   }
433 
434  public:
435   SourceCoords(JSContext* cx, uint32_t initialLineNumber,
436                uint32_t initialOffset);
437 
438   [[nodiscard]] bool add(uint32_t lineNum, uint32_t lineStartOffset);
439   [[nodiscard]] bool fill(const SourceCoords& other);
440 
isOnThisLine(uint32_t offset,uint32_t lineNum,bool * onThisLine)441   bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
442     uint32_t index = indexFromLineNumber(lineNum);
443     if (index + 1 >= lineStartOffsets_.length()) {  // +1 due to sentinel
444       return false;
445     }
446     *onThisLine = lineStartOffsets_[index] <= offset &&
447                   offset < lineStartOffsets_[index + 1];
448     return true;
449   }
450 
451   /**
452    * A token, computed for an offset in source text, that can be used to
453    * access line number and line-offset information for that offset.
454    *
455    * LineToken *alone* exposes whether the corresponding offset is in the
456    * the first line of source (which may not be 1, depending on
457    * |initialLineNumber|), and whether it's in the same line as
458    * another LineToken.
459    */
460   class LineToken {
461     uint32_t index;
462 #ifdef DEBUG
463     uint32_t offset_;  // stored for consistency-of-use assertions
464 #endif
465 
466     friend class SourceCoords;
467 
468    public:
LineToken(uint32_t index,uint32_t offset)469     LineToken(uint32_t index, uint32_t offset)
470         : index(index)
471 #ifdef DEBUG
472           ,
473           offset_(offset)
474 #endif
475     {
476     }
477 
isFirstLine()478     bool isFirstLine() const { return index == 0; }
479 
isSameLine(LineToken other)480     bool isSameLine(LineToken other) const { return index == other.index; }
481 
assertConsistentOffset(uint32_t offset)482     void assertConsistentOffset(uint32_t offset) const {
483       MOZ_ASSERT(offset_ == offset);
484     }
485   };
486 
487   /**
488    * Compute a token usable to access information about the line at the
489    * given offset.
490    *
491    * The only information directly accessible in a token is whether it
492    * corresponds to the first line of source text (which may not be line
493    * 1, depending on the |initialLineNumber| value used to construct
494    * this).  Use |lineNumber(LineToken)| to compute the actual line
495    * number (incorporating the contribution of |initialLineNumber|).
496    */
497   LineToken lineToken(uint32_t offset) const;
498 
499   /** Compute the line number for the given token. */
lineNumber(LineToken lineToken)500   uint32_t lineNumber(LineToken lineToken) const {
501     return lineNumberFromIndex(lineToken.index);
502   }
503 
504   /** Return the offset of the start of the line for |lineToken|. */
lineStart(LineToken lineToken)505   uint32_t lineStart(LineToken lineToken) const {
506     MOZ_ASSERT(lineToken.index + 1 < lineStartOffsets_.length(),
507                "recorded line-start information must be available");
508     return lineStartOffsets_[lineToken.index];
509   }
510 };
511 
/**
 * Classification stored in each |ChunkInfo|.  The values must remain 0 and 1:
 * |ChunkInfo::unitsType| asserts that its stored byte is at most 1.
 */
enum class UnitsType : unsigned char {
  // The pessimistic case that |ChunkInfo::guaranteeSingleUnits| upgrades from.
  PossiblyMultiUnit = 0,

  // The state |ChunkInfo::guaranteeSingleUnits| switches to.
  GuaranteedSingleUnit,
};
516 
517 class ChunkInfo {
518  private:
519   // Store everything in |unsigned char|s so everything packs.
520   unsigned char column_[sizeof(uint32_t)];
521   unsigned char unitsType_;
522 
523  public:
ChunkInfo(uint32_t col,UnitsType type)524   ChunkInfo(uint32_t col, UnitsType type)
525       : unitsType_(static_cast<unsigned char>(type)) {
526     memcpy(column_, &col, sizeof(col));
527   }
528 
column()529   uint32_t column() const {
530     uint32_t col;
531     memcpy(&col, column_, sizeof(uint32_t));
532     return col;
533   }
534 
unitsType()535   UnitsType unitsType() const {
536     MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
537     return static_cast<UnitsType>(unitsType_);
538   }
539 
guaranteeSingleUnits()540   void guaranteeSingleUnits() {
541     MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
542                "should only be setting to possibly optimize from the "
543                "pessimistic case");
544     unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
545   }
546 };
547 
/** The kinds of invalid character escape that can be recorded. */
enum class InvalidEscapeType {
  /** No invalid character escapes. */
  None = 0,

  /** A malformed \x escape. */
  Hexadecimal = 1,

  /** A malformed \u escape. */
  Unicode = 2,

  /**
   * An otherwise well-formed \u escape which represents a
   * codepoint > 10FFFF.
   */
  UnicodeOverflow = 3,

  /** An octal escape in a template token. */
  Octal = 4,

  /** NonOctalDecimalEscape - \8 or \9. */
  EightOrNine = 5,
};
563 
564 class TokenStreamAnyChars : public TokenStreamShared {
565  private:
566   // Constant-at-construction fields.
567 
568   JSContext* const cx;
569 
570   /** Options used for parsing/tokenizing. */
571   const JS::ReadOnlyCompileOptions& options_;
572 
573   /**
574    * Pointer used internally to test whether in strict mode.  Use |strictMode()|
575    * instead of this field.
576    */
577   StrictModeGetter* const strictModeGetter_;
578 
579   /** Input filename or null. */
580   const char* const filename_;
581 
582   // Column number computation fields.
583 
584   /**
585    * A map of (line number => sequence of the column numbers at
586    * |ColumnChunkLength|-unit boundaries rewound [if needed] to the nearest code
587    * point boundary).  (|TokenStreamAnyChars::computePartialColumn| is the sole
588    * user of |ColumnChunkLength| and therefore contains its definition.)
589    *
590    * Entries appear in this map only when a column computation of sufficient
591    * distance is performed on a line -- and only when the column is beyond the
592    * first |ColumnChunkLength| units.  Each line's vector is lazily filled as
593    * greater offsets require column computations.
594    */
595   mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
596 
597   // Computing accurate column numbers requires at *some* point linearly
598   // iterating through prior source units in the line, to properly account for
599   // multi-unit code points.  This is quadratic if counting happens repeatedly.
600   //
601   // But usually we need columns for advancing offsets through scripts.  By
602   // caching the last ((line number, offset) => relative column) mapping (in
603   // similar manner to how |SourceCoords::lastIndex_| is used to cache
604   // (offset => line number) mappings) we can usually avoid re-iterating through
605   // the common line prefix.
606   //
607   // Additionally, we avoid hash table lookup costs by caching the
608   // |Vector<ChunkInfo>*| for the line of the last lookup.  (|nullptr| means we
609   // must look it up -- or it hasn't been created yet.)  This pointer is nulled
610   // when a lookup on a new line occurs, but as it's not a pointer at literal,
611   // reallocatable element data, it's *not* invalidated when new entries are
612   // added to such a vector.
613 
614   /**
615    * The line in which the last column computation occurred, or UINT32_MAX if
616    * no prior computation has yet happened.
617    */
618   mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
619 
620   /**
621    * The chunk vector of the line for that last column computation.  This is
622    * null if the chunk vector needs to be recalculated or initially created.
623    */
624   mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
625 
626   /**
627    * The offset (in code units) of the last column computation performed,
628    * relative to source start.
629    */
630   mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
631 
632   /**
633    * The column number for the offset (in code units) of the last column
634    * computation performed, relative to source start.
635    */
636   mutable uint32_t lastComputedColumn_ = 0;
637 
638   // Intra-token fields.
639 
640   /**
641    * The offset of the first invalid escape in a template literal.  (If there is
642    * one -- if not, the value of this field is meaningless.)
643    *
644    * See also |invalidTemplateEscapeType|.
645    */
646   uint32_t invalidTemplateEscapeOffset = 0;
647 
648   /**
649    * The type of the first invalid escape in a template literal.  (If there
650    * isn't one, this will be |None|.)
651    *
652    * See also |invalidTemplateEscapeOffset|.
653    */
654   InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
655 
656   // Fields with values relevant across tokens (and therefore potentially across
657   // function boundaries, such that lazy function parsing and stream-seeking
658   // must take care in saving and restoring them).
659 
660   /** Line number and offset-to-line mapping information. */
661   SourceCoords srcCoords;
662 
663   /** Circular token buffer of gotten tokens that have been ungotten. */
664   Token tokens[ntokens] = {};
665 
666   /** The index in |tokens| of the last parsed token. */
667   unsigned cursor_ = 0;
668 
669   /** The number of tokens in |tokens| available to be gotten. */
670   unsigned lookahead = 0;
671 
672   /** The current line number. */
673   unsigned lineno;
674 
675   /** Various flag bits (see above). */
676   TokenStreamFlags flags = {};
677 
678   /** The offset of the start of the current line. */
679   size_t linebase = 0;
680 
681   /** The start of the previous line, or |size_t(-1)| on the first line. */
682   size_t prevLinebase = size_t(-1);
683 
684   /** The user's requested source URL.  Null if none has been set. */
685   UniqueTwoByteChars displayURL_ = nullptr;
686 
687   /** The URL of the source map for this script.  Null if none has been set. */
688   UniqueTwoByteChars sourceMapURL_ = nullptr;
689 
690   // Assorted boolean fields, none of which require maintenance across tokens,
691   // stored at class end to minimize padding.
692 
693   /**
694    * Whether syntax errors should or should not contain details about the
695    * precise nature of the error.  (This is intended for use in suppressing
696    * content-revealing details about syntax errors in cross-origin scripts on
697    * the web.)
698    */
699   const bool mutedErrors;
700 
701   /**
702    * An array storing whether a TokenKind observed while attempting to extend
703    * a valid AssignmentExpression into an even longer AssignmentExpression
704    * (e.g., extending '3' to '3 + 5') will terminate it without error.
705    *
706    * For example, ';' always ends an AssignmentExpression because it ends a
707    * Statement or declaration.  '}' always ends an AssignmentExpression
708    * because it terminates BlockStatement, FunctionBody, and embedded
709    * expressions in TemplateLiterals.  Therefore both entries are set to true
710    * in TokenStreamAnyChars construction.
711    *
712    * But e.g. '+' *could* extend an AssignmentExpression, so its entry here
713    * is false.  Meanwhile 'this' can't extend an AssignmentExpression, but
714    * it's only valid after a line break, so its entry here must be false.
715    *
716    * NOTE: This array could be static, but without C99's designated
717    *       initializers it's easier zeroing here and setting the true entries
718    *       in the constructor body.  (Having this per-instance might also aid
719    *       locality.)  Don't worry!  Initialization time for each TokenStream
720    *       is trivial.  See bug 639420.
721    */
722   bool isExprEnding[size_t(TokenKind::Limit)] = {};  // all-false initially
723 
724   // End of fields.
725 
726  public:
727   TokenStreamAnyChars(JSContext* cx, const JS::ReadOnlyCompileOptions& options,
728                       StrictModeGetter* smg);
729 
730   template <typename Unit, class AnyCharsAccess>
731   friend class GeneralTokenStreamChars;
732   template <typename Unit, class AnyCharsAccess>
733   friend class TokenStreamChars;
734   template <typename Unit, class AnyCharsAccess>
735   friend class TokenStreamSpecific;
736 
737   template <typename Unit>
738   friend class TokenStreamPosition;
739 
740   // Accessors.
cursor()741   unsigned cursor() const { return cursor_; }
nextCursor()742   unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
aheadCursor(unsigned steps)743   unsigned aheadCursor(unsigned steps) const {
744     return (cursor_ + steps) & ntokensMask;
745   }
746 
currentToken()747   const Token& currentToken() const { return tokens[cursor()]; }
isCurrentTokenType(TokenKind type)748   bool isCurrentTokenType(TokenKind type) const {
749     return currentToken().type == type;
750   }
751 
752   [[nodiscard]] bool checkOptions();
753 
754  private:
755   TaggedParserAtomIndex reservedWordToPropertyName(TokenKind tt) const;
756 
757  public:
currentName()758   TaggedParserAtomIndex currentName() const {
759     if (isCurrentTokenType(TokenKind::Name) ||
760         isCurrentTokenType(TokenKind::PrivateName)) {
761       return currentToken().name();
762     }
763 
764     MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
765     return reservedWordToPropertyName(currentToken().type);
766   }
767 
currentNameHasEscapes(ParserAtomsTable & parserAtoms)768   bool currentNameHasEscapes(ParserAtomsTable& parserAtoms) const {
769     if (isCurrentTokenType(TokenKind::Name) ||
770         isCurrentTokenType(TokenKind::PrivateName)) {
771       TokenPos pos = currentToken().pos;
772       return (pos.end - pos.begin) != parserAtoms.length(currentToken().name());
773     }
774 
775     MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
776     return false;
777   }
778 
isCurrentTokenAssignment()779   bool isCurrentTokenAssignment() const {
780     return TokenKindIsAssignment(currentToken().type);
781   }
782 
783   // Flag methods.
isEOF()784   bool isEOF() const { return flags.isEOF; }
hadError()785   bool hadError() const { return flags.hadError; }
786 
sawDeprecatedContent()787   DeprecatedContent sawDeprecatedContent() const {
788     return static_cast<DeprecatedContent>(flags.sawDeprecatedContent);
789   }
790 
791  private:
792   // Workaround GCC 7 sadness.
setSawDeprecatedContent(DeprecatedContent content)793   void setSawDeprecatedContent(DeprecatedContent content) {
794     flags.sawDeprecatedContent = static_cast<uint8_t>(content);
795   }
796 
797  public:
clearSawDeprecatedContent()798   void clearSawDeprecatedContent() {
799     setSawDeprecatedContent(DeprecatedContent::None);
800   }
setSawDeprecatedOctalLiteral()801   void setSawDeprecatedOctalLiteral() {
802     setSawDeprecatedContent(DeprecatedContent::OctalLiteral);
803   }
setSawDeprecatedOctalEscape()804   void setSawDeprecatedOctalEscape() {
805     setSawDeprecatedContent(DeprecatedContent::OctalEscape);
806   }
setSawDeprecatedEightOrNineEscape()807   void setSawDeprecatedEightOrNineEscape() {
808     setSawDeprecatedContent(DeprecatedContent::EightOrNineEscape);
809   }
810 
hasInvalidTemplateEscape()811   bool hasInvalidTemplateEscape() const {
812     return invalidTemplateEscapeType != InvalidEscapeType::None;
813   }
clearInvalidTemplateEscape()814   void clearInvalidTemplateEscape() {
815     invalidTemplateEscapeType = InvalidEscapeType::None;
816   }
817 
818  private:
819   // This is private because it should only be called by the tokenizer while
820   // tokenizing not by, for example, BytecodeEmitter.
strictMode()821   bool strictMode() const {
822     return strictModeGetter_ && strictModeGetter_->strictMode();
823   }
824 
setInvalidTemplateEscape(uint32_t offset,InvalidEscapeType type)825   void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
826     MOZ_ASSERT(type != InvalidEscapeType::None);
827     if (invalidTemplateEscapeType != InvalidEscapeType::None) {
828       return;
829     }
830     invalidTemplateEscapeOffset = offset;
831     invalidTemplateEscapeType = type;
832   }
833 
834  public:
835   // Call this immediately after parsing an OrExpression to allow scanning the
836   // next token with SlashIsRegExp without asserting (even though we just
837   // peeked at it in SlashIsDiv mode).
838   //
839   // It's OK to disable the assertion because the places where this is called
840   // have peeked at the next token in SlashIsDiv mode, and checked that it is
841   // *not* a Div token.
842   //
843   // To see why it is necessary to disable the assertion, consider these two
844   // programs:
845   //
846   //     x = arg => q       // per spec, this is all one statement, and the
847   //     /a/g;              // slashes are division operators
848   //
849   //     x = arg => {}      // per spec, ASI at the end of this line
850   //     /a/g;              // and that's a regexp literal
851   //
852   // The first program shows why orExpr() has use SlashIsDiv mode when peeking
853   // ahead for the next operator after parsing `q`. The second program shows
854   // why matchOrInsertSemicolon() must use SlashIsRegExp mode when scanning
855   // ahead for a semicolon.
allowGettingNextTokenWithSlashIsRegExp()856   void allowGettingNextTokenWithSlashIsRegExp() {
857 #ifdef DEBUG
858     // Check the precondition: Caller already peeked ahead at the next token,
859     // in SlashIsDiv mode, and it is *not* a Div token.
860     MOZ_ASSERT(hasLookahead());
861     const Token& next = nextToken();
862     MOZ_ASSERT(next.modifier == SlashIsDiv);
863     MOZ_ASSERT(next.type != TokenKind::Div);
864     tokens[nextCursor()].modifier = SlashIsRegExp;
865 #endif
866   }
867 
#ifdef DEBUG
  /** DEBUG-only: whether the lookahead count is zero. */
  inline bool debugHasNoLookahead() const {
    const bool empty = (lookahead == 0);
    return empty;
  }
#endif
871 
hasDisplayURL()872   bool hasDisplayURL() const { return displayURL_ != nullptr; }
873 
displayURL()874   char16_t* displayURL() { return displayURL_.get(); }
875 
hasSourceMapURL()876   bool hasSourceMapURL() const { return sourceMapURL_ != nullptr; }
877 
sourceMapURL()878   char16_t* sourceMapURL() { return sourceMapURL_.get(); }
879 
context()880   JSContext* context() const { return cx; }
881 
882   using LineToken = SourceCoords::LineToken;
883 
lineToken(uint32_t offset)884   LineToken lineToken(uint32_t offset) const {
885     return srcCoords.lineToken(offset);
886   }
887 
lineNumber(LineToken lineToken)888   uint32_t lineNumber(LineToken lineToken) const {
889     return srcCoords.lineNumber(lineToken);
890   }
891 
lineStart(LineToken lineToken)892   uint32_t lineStart(LineToken lineToken) const {
893     return srcCoords.lineStart(lineToken);
894   }
895 
896   /**
897    * Fill in |err|.
898    *
899    * If the token stream doesn't have location info for this error, use the
900    * caller's location (including line/column number) and return false.  (No
901    * line of context is set.)
902    *
903    * Otherwise fill in everything in |err| except 1) line/column numbers and
904    * 2) line-of-context-related fields and return true.  The caller *must*
905    * fill in the line/column number; filling the line of context is optional.
906    */
907   bool fillExceptingContext(ErrorMetadata* err, uint32_t offset);
908 
updateFlagsForEOL()909   MOZ_ALWAYS_INLINE void updateFlagsForEOL() { flags.isDirtyLine = false; }
910 
911  private:
912   /**
913    * Compute the "partial" column number in Unicode code points of the absolute
914    * |offset| within source text on the line of |lineToken| (which must have
915    * been computed from |offset|).
916    *
917    * A partial column number on a line that isn't the first line is just the
918    * actual column number.  But a partial column number on the first line is the
919    * column number *ignoring the initial line/column of the script*.  For
920    * example, consider this HTML with line/column number keys:
921    *
922    *                 1         2            3
923    *       0123456789012345678901234   567890
924    *     ------------------------------------
925    *   1 | <html>
926    *   2 | <head>
927    *   3 |   <script>var x = 3;  x &lt; 4;
928    *   4 | const y = 7;</script>
929    *   5 | </head>
930    *   6 | <body></body>
931    *   7 | </html>
932    *
933    * The script would be compiled specifying initial (line, column) of (3, 10)
934    * using |JS::ReadOnlyCompileOptions::{lineno,column}|.  And the column
935    * reported by |computeColumn| for the "v" of |var| would be 10.  But the
936    * partial column number of the "v" in |var|, that this function returns,
937    * would be 0.  On the other hand, the column reported by |computeColumn| and
938    * the partial column number returned by this function for the "c" in |const|
939    * would both be 0, because it's not in the first line of source text.
940    *
941    * The partial column is with respect *only* to the JavaScript source text as
942    * SpiderMonkey sees it.  In the example, the "&lt;" is converted to "<" by
943    * the browser before SpiderMonkey would see it.  So the partial column of the
944    * "4" in the inequality would be 16, not 19.
945    *
946    * Code points are not all equal length, so counting requires *some* kind of
947    * linear-time counting from the start of the line.  This function attempts
948    * various tricks to reduce this cost.  If these optimizations succeed,
949    * repeated calls to this function on a line will pay a one-time cost linear
950    * in the length of the line, then each call pays a separate constant-time
951    * cost.  If the optimizations do not succeed, this function works in time
952    * linear in the length of the line.
953    *
954    * It's unusual for a function in *this* class to be |Unit|-templated, but
955    * while this operation manages |Unit|-agnostic fields in this class and in
956    * |srcCoords|, it must *perform* |Unit|-sensitive computations to fill them.
957    * And this is the best place to do that.
958    */
959   template <typename Unit>
960   uint32_t computePartialColumn(const LineToken lineToken,
961                                 const uint32_t offset,
962                                 const SourceUnits<Unit>& sourceUnits) const;
963 
964   /**
965    * Update line/column information for the start of a new line at
966    * |lineStartOffset|.
967    */
968   [[nodiscard]] MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(
969       uint32_t lineStartOffset);
970 
971  public:
nextToken()972   const Token& nextToken() const {
973     MOZ_ASSERT(hasLookahead());
974     return tokens[nextCursor()];
975   }
976 
hasLookahead()977   bool hasLookahead() const { return lookahead > 0; }
978 
advanceCursor()979   void advanceCursor() { cursor_ = (cursor_ + 1) & ntokensMask; }
980 
retractCursor()981   void retractCursor() { cursor_ = (cursor_ - 1) & ntokensMask; }
982 
allocateToken()983   Token* allocateToken() {
984     advanceCursor();
985 
986     Token* tp = &tokens[cursor()];
987     MOZ_MAKE_MEM_UNDEFINED(tp, sizeof(*tp));
988 
989     return tp;
990   }
991 
992   // Push the last scanned token back into the stream.
ungetToken()993   void ungetToken() {
994     MOZ_ASSERT(lookahead < maxLookahead);
995     lookahead++;
996     retractCursor();
997   }
998 
999  public:
adoptState(TokenStreamAnyChars & other)1000   void adoptState(TokenStreamAnyChars& other) {
1001     // If |other| has fresh information from directives, overwrite any
1002     // previously recorded directives.  (There is no specification directing
1003     // that last-in-source-order directive controls, sadly.  We behave this way
1004     // in the ordinary case, so we ought do so here too.)
1005     if (auto& url = other.displayURL_) {
1006       displayURL_ = std::move(url);
1007     }
1008     if (auto& url = other.sourceMapURL_) {
1009       sourceMapURL_ = std::move(url);
1010     }
1011   }
1012 
1013   // Compute error metadata for an error at no offset.
1014   void computeErrorMetadataNoOffset(ErrorMetadata* err);
1015 
1016   // ErrorReporter API Helpers
1017 
1018   // Provide minimal set of error reporting API given we cannot use
1019   // ErrorReportMixin here. "report" prefix is added to avoid conflict with
1020   // ErrorReportMixin methods in TokenStream class.
1021   void reportErrorNoOffset(unsigned errorNumber, ...);
1022   void reportErrorNoOffsetVA(unsigned errorNumber, va_list* args);
1023 
options()1024   const JS::ReadOnlyCompileOptions& options() const { return options_; }
1025 
getFilename()1026   const char* getFilename() const { return filename_; }
1027 };
1028 
/** Numeric value of a UTF-16 code unit: the unit itself. */
constexpr char16_t CodeUnitValue(char16_t unit) {
  return unit;
}
1030 
CodeUnitValue(mozilla::Utf8Unit unit)1031 constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {
1032   return unit.toUint8();
1033 }
1034 
1035 template <typename Unit>
1036 class TokenStreamCharsBase;
1037 
// Deleted catch-all: calling IsLineTerminator with an argument whose type is
// not exactly that of one of the non-template overloads below selects this
// template and fails to compile, rather than silently converting the
// argument.
template <typename T>
inline bool IsLineTerminator(T) = delete;
1040 
IsLineTerminator(char32_t codePoint)1041 inline bool IsLineTerminator(char32_t codePoint) {
1042   return codePoint == '\n' || codePoint == '\r' ||
1043          codePoint == unicode::LINE_SEPARATOR ||
1044          codePoint == unicode::PARA_SEPARATOR;
1045 }
1046 
/**
 * Whether the UTF-16 code unit |unit| is a LineTerminator.  Every
 * LineTerminator fits in a single char16_t, so widening and deferring to the
 * code point overload gives an exact answer.
 */
inline bool IsLineTerminator(char16_t unit) {
  const char32_t widened = static_cast<char32_t>(unit);
  return IsLineTerminator(widened);
}
1051 
1052 template <typename Unit>
1053 struct SourceUnitTraits;
1054 
1055 template <>
1056 struct SourceUnitTraits<char16_t> {
1057  public:
1058   static constexpr uint8_t maxUnitsLength = 2;
1059 
1060   static constexpr size_t lengthInUnits(char32_t codePoint) {
1061     return codePoint < unicode::NonBMPMin ? 1 : 2;
1062   }
1063 };
1064 
1065 template <>
1066 struct SourceUnitTraits<mozilla::Utf8Unit> {
1067  public:
1068   static constexpr uint8_t maxUnitsLength = 4;
1069 
1070   static constexpr size_t lengthInUnits(char32_t codePoint) {
1071     return codePoint < 0x80      ? 1
1072            : codePoint < 0x800   ? 2
1073            : codePoint < 0x10000 ? 3
1074                                  : 4;
1075   }
1076 };
1077 
1078 /**
1079  * PeekedCodePoint represents the result of peeking ahead in some source text
1080  * to determine the next validly-encoded code point.
1081  *
1082  * If there isn't a valid code point, then |isNone()|.
1083  *
1084  * But if there *is* a valid code point, then |!isNone()|, the code point has
1085  * value |codePoint()| and its length in code units is |lengthInUnits()|.
1086  *
1087  * Conceptually, this class is |Maybe<struct { char32_t v; uint8_t len; }>|.
1088  */
1089 template <typename Unit>
1090 class PeekedCodePoint final {
1091   char32_t codePoint_ = 0;
1092   uint8_t lengthInUnits_ = 0;
1093 
1094  private:
1095   using SourceUnitTraits = frontend::SourceUnitTraits<Unit>;
1096 
1097   PeekedCodePoint() = default;
1098 
1099  public:
1100   /**
1101    * Create a peeked code point with the given value and length in code
1102    * units.
1103    *
1104    * While the latter value is computable from the former for both UTF-8 and
1105    * JS's version of UTF-16, the caller likely computed a length in units in
1106    * the course of determining the peeked value.  Passing both here avoids
1107    * recomputation and lets us do a consistency-checking assertion.
1108    */
1109   PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
1110       : codePoint_(codePoint), lengthInUnits_(lengthInUnits) {
1111     MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
1112     MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
1113     MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
1114   }
1115 
1116   /** Create a PeekedCodeUnit that represents no valid code point. */
1117   static PeekedCodePoint none() { return PeekedCodePoint(); }
1118 
1119   /** True if no code point was found, false otherwise. */
1120   bool isNone() const { return lengthInUnits_ == 0; }
1121 
1122   /** If a code point was found, its value. */
1123   char32_t codePoint() const {
1124     MOZ_ASSERT(!isNone());
1125     return codePoint_;
1126   }
1127 
1128   /** If a code point was found, its length in code units. */
1129   uint8_t lengthInUnits() const {
1130     MOZ_ASSERT(!isNone());
1131     return lengthInUnits_;
1132   }
1133 };
1134 
1135 inline PeekedCodePoint<char16_t> PeekCodePoint(const char16_t* const ptr,
1136                                                const char16_t* const end) {
1137   if (MOZ_UNLIKELY(ptr >= end)) {
1138     return PeekedCodePoint<char16_t>::none();
1139   }
1140 
1141   char16_t lead = ptr[0];
1142 
1143   char32_t c;
1144   uint8_t len;
1145   if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1146       MOZ_UNLIKELY(ptr + 1 >= end || !unicode::IsTrailSurrogate(ptr[1]))) {
1147     c = lead;
1148     len = 1;
1149   } else {
1150     c = unicode::UTF16Decode(lead, ptr[1]);
1151     len = 2;
1152   }
1153 
1154   return PeekedCodePoint<char16_t>(c, len);
1155 }
1156 
1157 inline PeekedCodePoint<mozilla::Utf8Unit> PeekCodePoint(
1158     const mozilla::Utf8Unit* const ptr, const mozilla::Utf8Unit* const end) {
1159   if (MOZ_UNLIKELY(ptr >= end)) {
1160     return PeekedCodePoint<mozilla::Utf8Unit>::none();
1161   }
1162 
1163   const mozilla::Utf8Unit lead = ptr[0];
1164   if (mozilla::IsAscii(lead)) {
1165     return PeekedCodePoint<mozilla::Utf8Unit>(lead.toUint8(), 1);
1166   }
1167 
1168   const mozilla::Utf8Unit* afterLead = ptr + 1;
1169   mozilla::Maybe<char32_t> codePoint =
1170       mozilla::DecodeOneUtf8CodePoint(lead, &afterLead, end);
1171   if (codePoint.isNothing()) {
1172     return PeekedCodePoint<mozilla::Utf8Unit>::none();
1173   }
1174 
1175   auto len =
1176       mozilla::AssertedCast<uint8_t>(mozilla::PointerRangeSize(ptr, afterLead));
1177   MOZ_ASSERT(len <= 4);
1178 
1179   return PeekedCodePoint<mozilla::Utf8Unit>(codePoint.value(), len);
1180 }
1181 
1182 inline bool IsSingleUnitLineTerminator(mozilla::Utf8Unit unit) {
1183   // BEWARE: The Unicode line/paragraph separators don't fit in a single
1184   //         UTF-8 code unit, so this test is exact for Utf8Unit but inexact
1185   //         for UTF-8 as a whole.  Users must handle |unit| as start of a
1186   //         Unicode LineTerminator themselves!
1187   return unit == mozilla::Utf8Unit('\n') || unit == mozilla::Utf8Unit('\r');
1188 }
1189 
1190 // This is the low-level interface to the JS source code buffer.  It just gets
1191 // raw Unicode code units -- 16-bit char16_t units of source text that are not
1192 // (always) full code points, and 8-bit units of UTF-8 source text soon.
1193 // TokenStreams functions are layered on top and do some extra stuff like
1194 // converting all EOL sequences to '\n', tracking the line number, and setting
1195 // |flags.isEOF|.  (The "raw" in "raw Unicode code units" refers to the lack of
1196 // EOL sequence normalization.)
1197 //
1198 // buf[0..length-1] often represents a substring of some larger source,
1199 // where we have only the substring in memory. The |startOffset| argument
1200 // indicates the offset within this larger string at which our string
1201 // begins, the offset of |buf[0]|.
template <typename Unit>
class SourceUnits {
 private:
  /** Base of buffer. */
  const Unit* base_;

  /** Offset of base_[0]. */
  uint32_t startOffset_;

  /** Limit for quick bounds check. */
  const Unit* limit_;

  /** Next char to get. */
  const Unit* ptr;

 public:
  /**
   * Create a reader over the |length| code units starting at |units|, where
   * |units[0]| is at absolute offset |startOffset|.  Reading starts at
   * |units|.
   */
  SourceUnits(const Unit* units, size_t length, size_t startOffset)
      : base_(units),
        startOffset_(startOffset),
        limit_(units + length),
        ptr(units) {}

  /** True if no code unit has been consumed (the read position is at base). */
  bool atStart() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    return ptr == base_;
  }

  /** True if every code unit has been consumed. */
  bool atEnd() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(ptr <= limit_, "shouldn't have overrun");
    return ptr >= limit_;
  }

  /** Count of code units between the read position and the limit. */
  size_t remaining() const {
    MOZ_ASSERT(!isPoisoned(),
               "can't get a count of remaining code units if poisoned");
    return mozilla::PointerRangeSize(ptr, limit_);
  }

  /** Absolute offset of the first code unit of the buffer. */
  size_t startOffset() const { return startOffset_; }

  /** Absolute offset of the read position. */
  size_t offset() const {
    return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
  }

  /**
   * Pointer to the code unit at absolute offset |offset|, which must
   * (asserted) lie between the buffer's start offset and its limit,
   * inclusive.
   */
  const Unit* codeUnitPtrAt(size_t offset) const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(startOffset_ <= offset);
    MOZ_ASSERT(offset - startOffset_ <=
               mozilla::PointerRangeSize(base_, limit_));
    return base_ + (offset - startOffset_);
  }

  /** The read position. */
  const Unit* current() const { return ptr; }

  /** One past the last code unit of the buffer. */
  const Unit* limit() const { return limit_; }

  /** The code unit just before the read position; must not be at start. */
  Unit previousCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't get previous code unit if poisoned");
    MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
    return *(ptr - 1);
  }

  /**
   * Return the code unit at the read position and advance past it.  No
   * bounds check is performed here.
   */
  Unit getCodeUnit() {
    return *ptr++;  // this will nullptr-crash if poisoned
  }

  /**
   * Return the code unit at the read position without advancing.  No bounds
   * check is performed here.
   */
  Unit peekCodeUnit() const {
    return *ptr;  // this will nullptr-crash if poisoned
  }

  /**
   * Determine the next code point in source text.  The code point is not
   * normalized: '\r', '\n', '\u2028', and '\u2029' are returned literally.
   * If there is no next code point because |atEnd()|, or if an encoding
   * error is encountered, return a |PeekedCodePoint| that |isNone()|.
   *
   * This function does not report errors: code that attempts to get the next
   * code point must report any error.
   *
   * If a next code point is found, it may be consumed by passing it to
   * |consumeKnownCodePoint|.
   */
  PeekedCodePoint<Unit> peekCodePoint() const {
    return PeekCodePoint(ptr, limit_);
  }

 private:
#ifdef DEBUG
  // DEBUG-only helper used by |consumeKnownCodePoint| to check |peeked|
  // before consuming it; defined out of line.
  void assertNextCodePoint(const PeekedCodePoint<Unit>& peeked);
#endif

 public:
  /**
   * Consume a peeked code point that |!isNone()|.
   *
   * This call DOES NOT UPDATE LINE-STATUS.  You may need to call
   * |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
   * LineTerminator.  Note that if this consumes '\r', you also must consume
   * an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
   */
  void consumeKnownCodePoint(const PeekedCodePoint<Unit>& peeked) {
    MOZ_ASSERT(!peeked.isNone());
    MOZ_ASSERT(peeked.lengthInUnits() <= remaining());

#ifdef DEBUG
    assertNextCodePoint(peeked);
#endif

    ptr += peeked.lengthInUnits();
  }

  /**
   * Match |n| hexadecimal digits and store their value in |*out|.
   *
   * On success the digits are consumed and true is returned.  If fewer than
   * |n| units remain or any of the next |n| units is not an ASCII hex digit,
   * return false, leaving the read position and |*out| untouched.
   */
  bool matchHexDigits(uint8_t n, char16_t* out) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't peek into poisoned SourceUnits");
    MOZ_ASSERT(n <= 4, "hexdigit value can't overflow char16_t");
    if (n > remaining()) {
      return false;
    }

    char16_t v = 0;
    for (uint8_t i = 0; i < n; i++) {
      auto unit = CodeUnitValue(ptr[i]);
      if (!mozilla::IsAsciiHexDigit(unit)) {
        return false;
      }

      // Shift in one more hex digit (four bits) at the low end.
      v = (v << 4) | mozilla::AsciiAlphanumericToNumber(unit);
    }

    *out = v;
    ptr += n;
    return true;
  }

  /**
   * Match the next |length| code units against |chars[0..length)|.
   *
   * On a full match the units are consumed and true is returned.  Otherwise
   * the read position is restored to where it was and false is returned.
   */
  bool matchCodeUnits(const char* chars, uint8_t length) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't match into poisoned SourceUnits");
    if (length > remaining()) {
      return false;
    }

    const Unit* start = ptr;
    const Unit* end = ptr + length;
    while (ptr < end) {
      if (*ptr++ != Unit(*chars++)) {
        // Mismatch: rewind so that nothing appears consumed.
        ptr = start;
        return false;
      }
    }

    return true;
  }

  /** Advance the read position by |n| code units, staying within the limit. */
  void skipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= remaining(), "shouldn't skip beyond end of SourceUnits");
    ptr += n;
  }

  /** Move the read position back by |n| code units, staying within base. */
  void unskipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= mozilla::PointerRangeSize(base_, ptr),
               "shouldn't unskip beyond start of SourceUnits");
    ptr -= n;
  }

 private:
  friend class TokenStreamCharsBase<Unit>;

  // If not at end and the next code unit equals |c|, consume it and return
  // true; otherwise consume nothing and return false.
  bool internalMatchCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    if (MOZ_LIKELY(!atEnd()) && *ptr == c) {
      ptr++;
      return true;
    }
    return false;
  }

 public:
  /**
   * Consume the next code unit, which the caller knows (and this asserts) to
   * be |c|.
   */
  void consumeKnownCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(*ptr == c, "consuming the wrong code unit");
    ptr++;
  }

  /** Unget U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR. */
  inline void ungetLineOrParagraphSeparator();

  /** Move the read position back by one code unit; must not be at start. */
  void ungetCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't unget from poisoned units");
    MOZ_ASSERT(!atStart(), "can't unget if currently at start");
    ptr--;
  }

  /**
   * The read position.  Unless |allowPoisoned| is set, this asserts that the
   * units are not poisoned.
   */
  const Unit* addressOfNextCodeUnit(bool allowPoisoned = false) const {
    MOZ_ASSERT_IF(!allowPoisoned, !isPoisoned());
    return ptr;
  }

  // Use this with caution!
  //
  // Set the read position to |a| with no range checking.  Unless
  // |allowPoisoned| is set, |a| must (asserted) be non-null.
  void setAddressOfNextCodeUnit(const Unit* a, bool allowPoisoned = false) {
    MOZ_ASSERT_IF(!allowPoisoned, a);
    ptr = a;
  }

  // Poison the SourceUnits so they can't be accessed again.  This nulls the
  // read position in DEBUG builds and does nothing otherwise.
  void poisonInDebug() {
#ifdef DEBUG
    ptr = nullptr;
#endif
  }

 private:
  // Whether |poisonInDebug| has taken effect.  Always false outside DEBUG
  // builds.
  bool isPoisoned() const {
#ifdef DEBUG
    // |ptr| can be null for unpoisoned SourceUnits if this was initialized with
    // |units == nullptr| and |length == 0|.  In that case, for lack of any
    // better options, consider this to not be poisoned.
    return ptr == nullptr && ptr != limit_;
#else
    return false;
#endif
  }

 public:
  /**
   * Consume the rest of a single-line comment (but not the EOL/EOF that
   * terminates it).
   *
   * If an encoding error is encountered -- possible only for UTF-8 because
   * JavaScript's conception of UTF-16 encompasses any sequence of 16-bit
   * code units -- valid code points prior to the encoding error are consumed
   * and subsequent invalid code units are not consumed.  For example, given
   * these UTF-8 code units:
   *
   *   'B'   'A'  'D'  ':'   <bad code unit sequence>
   *   0x42  0x41 0x44 0x3A  0xD0 0x00 ...
   *
   * the first four code units are consumed, but 0xD0 and 0x00 are not
   * consumed because 0xD0 encodes a two-byte lead unit but 0x00 is not a
   * valid trailing code unit.
   *
   * It is expected that the caller will report such an encoding error when
   * it attempts to consume the next code point.
   */
  void consumeRestOfSingleLineComment();

  /**
   * The maximum radius of code around the location of an error that should
   * be included in a syntax error message -- this many code units to either
   * side.  The resulting window of data is then accordingly trimmed so that
   * the window contains only validly-encoded data.
   *
   * Because this number is the same for both UTF-8 and UTF-16, windows in
   * UTF-8 may contain fewer code points than windows in UTF-16.  As we only
   * use this for error messages, we don't particularly care.
   */
  static constexpr size_t WindowRadius = ErrorMetadata::lineOfContextRadius;

  /**
   * From absolute offset |offset|, search backward to find an absolute
   * offset within source text, no further than |WindowRadius| code units
   * away from |offset|, such that all code points from that offset to
   * |offset| are valid, non-LineTerminator code points.
   */
  size_t findWindowStart(size_t offset) const;

  /**
   * From absolute offset |offset|, find an absolute offset within source
   * text, no further than |WindowRadius| code units away from |offset|, such
   * that all code units from |offset| to that offset are valid,
   * non-LineTerminator code points.
   */
  size_t findWindowEnd(size_t offset) const;

  /**
   * Given a window |encodeWindow| of |encodingSpecificWindowLength| units
   * encoding valid Unicode text, with index |encodingSpecificTokenOffset|
   * indicating a particular code point boundary in the window, compute the
   * corresponding token offset and length if the window were encoded in
   * UTF-16.  For example:
   *
   *   // U+03C0 GREEK SMALL LETTER PI is encoded as 0xCF 0x80.
   *   const Utf8Unit* encodedWindow =
   *     reinterpret_cast<const Utf8Unit*>(u8"ππππ = @ FAIL");
   *   size_t encodedTokenOffset = 11; // 2 * 4 + ' = '.length
   *   size_t encodedWindowLength = 17; // 2 * 4 + ' = @ FAIL'.length
   *   size_t utf16Offset, utf16Length;
   *   computeWindowOffsetAndLength(encodedWindow,
   *                                encodedTokenOffset, &utf16Offset,
   *                                encodedWindowLength, &utf16Length);
   *   MOZ_ASSERT(utf16Offset == 7);
   *   MOZ_ASSERT(utf16Length == 13);
   *
   * This function asserts if called for UTF-16: the sole caller can avoid
   * computing UTF-16 offsets when they're definitely the same as the encoded
   * offsets.
   */
  inline void computeWindowOffsetAndLength(const Unit* encodeWindow,
                                           size_t encodingSpecificTokenOffset,
                                           size_t* utf16TokenOffset,
                                           size_t encodingSpecificWindowLength,
                                           size_t* utf16WindowLength);
};
1506 
1507 template <>
1508 inline void SourceUnits<char16_t>::ungetLineOrParagraphSeparator() {
1509 #ifdef DEBUG
1510   char16_t prev = previousCodeUnit();
1511 #endif
1512   MOZ_ASSERT(prev == unicode::LINE_SEPARATOR ||
1513              prev == unicode::PARA_SEPARATOR);
1514 
1515   ungetCodeUnit();
1516 }
1517 
1518 template <>
1519 inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
1520   unskipCodeUnits(3);
1521 
1522   MOZ_ASSERT(ptr[0].toUint8() == 0xE2);
1523   MOZ_ASSERT(ptr[1].toUint8() == 0x80);
1524 
1525 #ifdef DEBUG
1526   uint8_t last = ptr[2].toUint8();
1527 #endif
1528   MOZ_ASSERT(last == 0xA8 || last == 0xA9);
1529 }
1530 
1531 /**
1532  * An all-purpose buffer type for accumulating text during tokenizing.
1533  *
1534  * In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or
1535  * |Unit|.  We use |char16_t| because:
1536  *
1537  *   * we don't have a UTF-8 regular expression parser, so in general regular
1538  *     expression text must be copied to a separate UTF-16 buffer to parse it,
1539  *     and
1540  *   * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared
1541  *     |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert.
1542  */
1543 using CharBuffer = Vector<char16_t, 32>;
1544 
1545 /**
1546  * Append the provided code point (in the range [U+0000, U+10FFFF], surrogate
1547  * code points included) to the buffer.
1548  */
1549 [[nodiscard]] extern bool AppendCodePointToCharBuffer(CharBuffer& charBuffer,
1550                                                       uint32_t codePoint);
1551 
/**
 * Accumulate the range of UTF-16 text (lone surrogates permitted, because JS
 * allows them in source text) into |charBuffer|.  Normalize '\r', '\n', and
 * "\r\n" into '\n'.
 *
 * The range is delimited by |cur| and |end|.
 * NOTE(review): presumably the half-open range [cur, end), with |false|
 * returned on failure -- confirm against the definition.
 */
[[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
    CharBuffer& charBuffer, const char16_t* cur, const char16_t* end);
1559 
/**
 * Accumulate the range of previously-validated UTF-8 text into |charBuffer|.
 * Normalize '\r', '\n', and "\r\n" into '\n'.
 *
 * The range is delimited by |cur| and |end|.  |charBuffer| holds |char16_t|,
 * so the accumulated text is stored as UTF-16 code units.
 * NOTE(review): presumably the half-open range [cur, end), with |false|
 * returned on failure -- confirm against the definition.
 */
[[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
    CharBuffer& charBuffer, const mozilla::Utf8Unit* cur,
    const mozilla::Utf8Unit* end);
1567 
1568 class TokenStreamCharsShared {
1569  protected:
1570   JSContext* cx;
1571 
1572   /**
1573    * Buffer transiently used to store sequences of identifier or string code
1574    * points when such can't be directly processed from the original source
1575    * text (e.g. because it contains escapes).
1576    */
1577   CharBuffer charBuffer;
1578 
1579   /** Information for parsing with a lifetime longer than the parser itself. */
1580   ParserAtomsTable* parserAtoms;
1581 
1582  protected:
1583   explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms)
1584       : cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {}
1585 
1586   [[nodiscard]] bool copyCharBufferTo(
1587       JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);
1588 
1589   /**
1590    * Determine whether a code unit constitutes a complete ASCII code point.
1591    * (The code point's exact value might not be used, however, if subsequent
1592    * code observes that |unit| is part of a LineTerminatorSequence.)
1593    */
1594   [[nodiscard]] static constexpr MOZ_ALWAYS_INLINE bool isAsciiCodePoint(
1595       int32_t unit) {
1596     return mozilla::IsAscii(static_cast<char32_t>(unit));
1597   }
1598 
1599   TaggedParserAtomIndex drainCharBufferIntoAtom() {
1600     // Add to parser atoms table.
1601     auto atom = this->parserAtoms->internChar16(cx, charBuffer.begin(),
1602                                                 charBuffer.length());
1603     charBuffer.clear();
1604     return atom;
1605   }
1606 
1607  protected:
1608   void adoptState(TokenStreamCharsShared& other) {
1609     // The other stream's buffer may contain information for a
1610     // gotten-then-ungotten token, that we must transfer into this stream so
1611     // that token's final get behaves as desired.
1612     charBuffer = std::move(other.charBuffer);
1613   }
1614 
1615  public:
1616   CharBuffer& getCharBuffer() { return charBuffer; }
1617 };
1618 
1619 template <typename Unit>
1620 class TokenStreamCharsBase : public TokenStreamCharsShared {
1621  protected:
1622   using SourceUnits = frontend::SourceUnits<Unit>;
1623 
1624   /** Code units in the source code being tokenized. */
1625   SourceUnits sourceUnits;
1626 
1627   // End of fields.
1628 
1629  protected:
1630   TokenStreamCharsBase(JSContext* cx, ParserAtomsTable* parserAtoms,
1631                        const Unit* units, size_t length, size_t startOffset);
1632 
1633   /**
1634    * Convert a non-EOF code unit returned by |getCodeUnit()| or
1635    * |peekCodeUnit()| to a Unit code unit.
1636    */
1637   inline Unit toUnit(int32_t codeUnitValue);
1638 
1639   void ungetCodeUnit(int32_t c) {
1640     if (c == EOF) {
1641       return;
1642     }
1643 
1644     sourceUnits.ungetCodeUnit();
1645   }
1646 
1647   MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1648   atomizeSourceChars(mozilla::Span<const Unit> units);
1649 
1650   /**
1651    * Try to match a non-LineTerminator ASCII code point.  Return true iff it
1652    * was matched.
1653    */
1654   bool matchCodeUnit(char expect) {
1655     MOZ_ASSERT(mozilla::IsAscii(expect));
1656     MOZ_ASSERT(expect != '\r');
1657     MOZ_ASSERT(expect != '\n');
1658     return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
1659   }
1660 
1661   /**
1662    * Try to match an ASCII LineTerminator code point.  Return true iff it was
1663    * matched.
1664    */
1665   bool matchLineTerminator(char expect) {
1666     MOZ_ASSERT(expect == '\r' || expect == '\n');
1667     return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
1668   }
1669 
1670   template <typename T>
1671   bool matchCodeUnit(T) = delete;
1672   template <typename T>
1673   bool matchLineTerminator(T) = delete;
1674 
1675   int32_t peekCodeUnit() {
1676     return MOZ_LIKELY(!sourceUnits.atEnd())
1677                ? CodeUnitValue(sourceUnits.peekCodeUnit())
1678                : EOF;
1679   }
1680 
1681   /** Consume a known, non-EOF code unit. */
1682   inline void consumeKnownCodeUnit(int32_t unit);
1683 
1684   // Forbid accidental calls to consumeKnownCodeUnit *not* with the single
1685   // unit-or-EOF type.  Unit should use SourceUnits::consumeKnownCodeUnit;
1686   // CodeUnitValue() results should go through toUnit(), or better yet just
1687   // use the original Unit.
1688   template <typename T>
1689   inline void consumeKnownCodeUnit(T) = delete;
1690 
1691   /**
1692    * Add a null-terminated line of context to error information, for the line
1693    * in |sourceUnits| that contains |offset|.  Also record the window's
1694    * length and the offset of the error in the window.  (Don't bother adding
1695    * a line of context if it would be empty.)
1696    *
1697    * The window will contain no LineTerminators of any kind, and it will not
1698    * extend more than |SourceUnits::WindowRadius| to either side of |offset|,
1699    * nor into the previous or next lines.
1700    *
1701    * This function is quite internal, and you probably should be calling one
1702    * of its existing callers instead.
1703    */
1704   [[nodiscard]] bool addLineOfContext(ErrorMetadata* err, uint32_t offset);
1705 };
1706 
1707 template <>
1708 inline char16_t TokenStreamCharsBase<char16_t>::toUnit(int32_t codeUnitValue) {
1709   MOZ_ASSERT(codeUnitValue != EOF, "EOF is not a Unit");
1710   return mozilla::AssertedCast<char16_t>(codeUnitValue);
1711 }
1712 
1713 template <>
1714 inline mozilla::Utf8Unit TokenStreamCharsBase<mozilla::Utf8Unit>::toUnit(
1715     int32_t value) {
1716   MOZ_ASSERT(value != EOF, "EOF is not a Unit");
1717   return mozilla::Utf8Unit(mozilla::AssertedCast<unsigned char>(value));
1718 }
1719 
1720 template <typename Unit>
1721 inline void TokenStreamCharsBase<Unit>::consumeKnownCodeUnit(int32_t unit) {
1722   sourceUnits.consumeKnownCodeUnit(toUnit(unit));
1723 }
1724 
1725 template <>
1726 MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1727 TokenStreamCharsBase<char16_t>::atomizeSourceChars(
1728     mozilla::Span<const char16_t> units) {
1729   return this->parserAtoms->internChar16(cx, units.data(), units.size());
1730 }
1731 
1732 template <>
1733 /* static */ MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1734 TokenStreamCharsBase<mozilla::Utf8Unit>::atomizeSourceChars(
1735     mozilla::Span<const mozilla::Utf8Unit> units) {
1736   return this->parserAtoms->internUtf8(cx, units.data(), units.size());
1737 }
1738 
1739 template <typename Unit>
1740 class SpecializedTokenStreamCharsBase;
1741 
1742 template <>
1743 class SpecializedTokenStreamCharsBase<char16_t>
1744     : public TokenStreamCharsBase<char16_t> {
1745   using CharsBase = TokenStreamCharsBase<char16_t>;
1746 
1747  protected:
1748   using TokenStreamCharsShared::isAsciiCodePoint;
1749   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
1750 
1751   using typename CharsBase::SourceUnits;
1752 
1753  protected:
1754   // These APIs are only usable by UTF-16-specific code.
1755 
1756   /**
1757    * Given |lead| already consumed, consume and return the code point encoded
1758    * starting from it.  Infallible because lone surrogates in JS encode a
1759    * "code point" of the same value.
1760    */
1761   char32_t infallibleGetNonAsciiCodePointDontNormalize(char16_t lead) {
1762     MOZ_ASSERT(!isAsciiCodePoint(lead));
1763     MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == lead);
1764 
1765     // Handle single-unit code points and lone trailing surrogates.
1766     if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1767         // Or handle lead surrogates not paired with trailing surrogates.
1768         MOZ_UNLIKELY(
1769             this->sourceUnits.atEnd() ||
1770             !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1771       return lead;
1772     }
1773 
1774     // Otherwise it's a multi-unit code point.
1775     return unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1776   }
1777 
1778  protected:
1779   // These APIs are in both SpecializedTokenStreamCharsBase specializations
1780   // and so are usable in subclasses no matter what Unit is.
1781 
1782   using CharsBase::CharsBase;
1783 };
1784 
1785 template <>
1786 class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
1787     : public TokenStreamCharsBase<mozilla::Utf8Unit> {
1788   using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
1789 
1790  protected:
1791   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
1792 
1793  protected:
1794   // These APIs are only usable by UTF-8-specific code.
1795 
1796   using typename CharsBase::SourceUnits;
1797 
1798   /**
1799    * A mutable iterator-wrapper around |SourceUnits| that translates
1800    * operators to calls to |SourceUnits::getCodeUnit()| and similar.
1801    *
1802    * This class is expected to be used in concert with |SourceUnitsEnd|.
1803    */
1804   class SourceUnitsIterator {
1805     SourceUnits& sourceUnits_;
1806 #ifdef DEBUG
1807     // In iterator copies created by the post-increment operator, a pointer
1808     // at the next source text code unit when the post-increment operator
1809     // was called, cleared when the iterator is dereferenced.
1810     mutable mozilla::Maybe<const mozilla::Utf8Unit*>
1811         currentBeforePostIncrement_;
1812 #endif
1813 
1814    public:
1815     explicit SourceUnitsIterator(SourceUnits& sourceUnits)
1816         : sourceUnits_(sourceUnits) {}
1817 
1818     mozilla::Utf8Unit operator*() const {
1819       // operator* is expected to get the *next* value from an iterator
1820       // not pointing at the end of the underlying range.  However, the
1821       // sole use of this is in the context of an expression of the form
1822       // |*iter++|, that performed the |sourceUnits_.getCodeUnit()| in
1823       // the |operator++(int)| below -- so dereferencing acts on a
1824       // |sourceUnits_| already advanced.  Therefore the correct unit to
1825       // return is the previous one.
1826       MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 ==
1827                  sourceUnits_.current());
1828 #ifdef DEBUG
1829       currentBeforePostIncrement_.reset();
1830 #endif
1831       return sourceUnits_.previousCodeUnit();
1832     }
1833 
1834     SourceUnitsIterator operator++(int) {
1835       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1836                  "the only valid operation on a post-incremented "
1837                  "iterator is dereferencing a single time");
1838 
1839       SourceUnitsIterator copy = *this;
1840 #ifdef DEBUG
1841       copy.currentBeforePostIncrement_.emplace(sourceUnits_.current());
1842 #endif
1843 
1844       sourceUnits_.getCodeUnit();
1845       return copy;
1846     }
1847 
1848     void operator-=(size_t n) {
1849       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1850                  "the only valid operation on a post-incremented "
1851                  "iterator is dereferencing a single time");
1852       sourceUnits_.unskipCodeUnits(n);
1853     }
1854 
1855     mozilla::Utf8Unit operator[](ptrdiff_t index) {
1856       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1857                  "the only valid operation on a post-incremented "
1858                  "iterator is dereferencing a single time");
1859       MOZ_ASSERT(index == -1,
1860                  "must only be called to verify the value of the "
1861                  "previous code unit");
1862       return sourceUnits_.previousCodeUnit();
1863     }
1864 
1865     size_t remaining() const {
1866       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1867                  "the only valid operation on a post-incremented "
1868                  "iterator is dereferencing a single time");
1869       return sourceUnits_.remaining();
1870     }
1871   };
1872 
1873   /** A sentinel representing the end of |SourceUnits| data. */
1874   class SourceUnitsEnd {};
1875 
1876   friend inline size_t operator-(const SourceUnitsEnd& aEnd,
1877                                  const SourceUnitsIterator& aIter);
1878 
1879  protected:
1880   // These APIs are in both SpecializedTokenStreamCharsBase specializations
1881   // and so are usable in subclasses no matter what Unit is.
1882 
1883   using CharsBase::CharsBase;
1884 };
1885 
1886 inline size_t operator-(const SpecializedTokenStreamCharsBase<
1887                             mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
1888                         const SpecializedTokenStreamCharsBase<
1889                             mozilla::Utf8Unit>::SourceUnitsIterator& aIter) {
1890   return aIter.remaining();
1891 }
1892 
/** A small value class that computes and stores the start-offset of a Token. */
class TokenStart {
  uint32_t startOffset_;

 public:
  /**
   * The starting offset is |sourceUnits|'s current offset shifted by
   * |adjust|.  (An |adjust| of -1, for instance, designates the code unit
   * immediately before |sourceUnits|'s current offset.)
   */
  template <class SourceUnits>
  TokenStart(const SourceUnits& sourceUnits, ptrdiff_t adjust)
      : startOffset_(static_cast<uint32_t>(sourceUnits.offset() + adjust)) {}

  TokenStart(const TokenStart&) = default;

  /** The computed starting offset. */
  uint32_t offset() const { return startOffset_; }
};
1911 
1912 template <typename Unit, class AnyCharsAccess>
1913 class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
1914   using CharsBase = TokenStreamCharsBase<Unit>;
1915   using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
1916 
1917   using LineToken = TokenStreamAnyChars::LineToken;
1918 
1919  private:
1920   Token* newTokenInternal(TokenKind kind, TokenStart start, TokenKind* out);
1921 
1922   /**
1923    * Allocates a new Token from the given offset to the current offset,
1924    * ascribes it the given kind, and sets |*out| to that kind.
1925    */
1926   Token* newToken(TokenKind kind, TokenStart start,
1927                   TokenStreamShared::Modifier modifier, TokenKind* out) {
1928     Token* token = newTokenInternal(kind, start, out);
1929 
1930 #ifdef DEBUG
1931     // Save the modifier used to get this token, so that if an ungetToken()
1932     // occurs and then the token is re-gotten (or peeked, etc.), we can
1933     // assert both gets used compatible modifiers.
1934     token->modifier = modifier;
1935 #endif
1936 
1937     return token;
1938   }
1939 
1940   uint32_t matchUnicodeEscape(uint32_t* codePoint);
1941   uint32_t matchExtendedUnicodeEscape(uint32_t* codePoint);
1942 
1943  protected:
1944   using CharsBase::addLineOfContext;
1945   using CharsBase::matchCodeUnit;
1946   using CharsBase::matchLineTerminator;
1947   using TokenStreamCharsShared::drainCharBufferIntoAtom;
1948   using TokenStreamCharsShared::isAsciiCodePoint;
1949   // Deliberately don't |using CharsBase::sourceUnits| because of bug 1472569.
1950   // :-(
1951   using CharsBase::toUnit;
1952 
1953   using typename CharsBase::SourceUnits;
1954 
1955  protected:
1956   using SpecializedCharsBase::SpecializedCharsBase;
1957 
1958   TokenStreamAnyChars& anyCharsAccess() {
1959     return AnyCharsAccess::anyChars(this);
1960   }
1961 
1962   const TokenStreamAnyChars& anyCharsAccess() const {
1963     return AnyCharsAccess::anyChars(this);
1964   }
1965 
1966   using TokenStreamSpecific =
1967       frontend::TokenStreamSpecific<Unit, AnyCharsAccess>;
1968 
1969   TokenStreamSpecific* asSpecific() {
1970     static_assert(
1971         std::is_base_of_v<GeneralTokenStreamChars, TokenStreamSpecific>,
1972         "static_cast below presumes an inheritance relationship");
1973 
1974     return static_cast<TokenStreamSpecific*>(this);
1975   }
1976 
1977  protected:
1978   /**
1979    * Compute the column number in Unicode code points of the absolute |offset|
1980    * within source text on the line corresponding to |lineToken|.
1981    *
1982    * |offset| must be a code point boundary, preceded only by validly-encoded
1983    * source units.  (It doesn't have to be *followed* by valid source units.)
1984    */
1985   uint32_t computeColumn(LineToken lineToken, uint32_t offset) const;
1986   void computeLineAndColumn(uint32_t offset, uint32_t* line,
1987                             uint32_t* column) const;
1988 
1989   /**
1990    * Fill in |err| completely, except for line-of-context information.
1991    *
1992    * Return true if the caller can compute a line of context from the token
1993    * stream.  Otherwise return false.
1994    */
1995   [[nodiscard]] bool fillExceptingContext(ErrorMetadata* err, uint32_t offset) {
1996     if (anyCharsAccess().fillExceptingContext(err, offset)) {
1997       computeLineAndColumn(offset, &err->lineNumber, &err->columnNumber);
1998       return true;
1999     }
2000     return false;
2001   }
2002 
2003   void newSimpleToken(TokenKind kind, TokenStart start,
2004                       TokenStreamShared::Modifier modifier, TokenKind* out) {
2005     newToken(kind, start, modifier, out);
2006   }
2007 
2008   void newNumberToken(double dval, DecimalPoint decimalPoint, TokenStart start,
2009                       TokenStreamShared::Modifier modifier, TokenKind* out) {
2010     Token* token = newToken(TokenKind::Number, start, modifier, out);
2011     token->setNumber(dval, decimalPoint);
2012   }
2013 
2014   void newBigIntToken(TokenStart start, TokenStreamShared::Modifier modifier,
2015                       TokenKind* out) {
2016     newToken(TokenKind::BigInt, start, modifier, out);
2017   }
2018 
2019   void newAtomToken(TokenKind kind, TaggedParserAtomIndex atom,
2020                     TokenStart start, TokenStreamShared::Modifier modifier,
2021                     TokenKind* out) {
2022     MOZ_ASSERT(kind == TokenKind::String || kind == TokenKind::TemplateHead ||
2023                kind == TokenKind::NoSubsTemplate);
2024 
2025     Token* token = newToken(kind, start, modifier, out);
2026     token->setAtom(atom);
2027   }
2028 
2029   void newNameToken(TaggedParserAtomIndex name, TokenStart start,
2030                     TokenStreamShared::Modifier modifier, TokenKind* out) {
2031     Token* token = newToken(TokenKind::Name, start, modifier, out);
2032     token->setName(name);
2033   }
2034 
2035   void newPrivateNameToken(TaggedParserAtomIndex name, TokenStart start,
2036                            TokenStreamShared::Modifier modifier,
2037                            TokenKind* out) {
2038     Token* token = newToken(TokenKind::PrivateName, start, modifier, out);
2039     token->setName(name);
2040   }
2041 
2042   void newRegExpToken(JS::RegExpFlags reflags, TokenStart start,
2043                       TokenKind* out) {
2044     Token* token = newToken(TokenKind::RegExp, start,
2045                             TokenStreamShared::SlashIsRegExp, out);
2046     token->setRegExpFlags(reflags);
2047   }
2048 
2049   MOZ_COLD bool badToken();
2050 
2051   /**
2052    * Get the next code unit -- the next numeric sub-unit of source text,
2053    * possibly smaller than a full code point -- without updating line/column
2054    * counters or consuming LineTerminatorSequences.
2055    *
2056    * Because of these limitations, only use this if (a) the resulting code
2057    * unit is guaranteed to be ungotten (by ungetCodeUnit()) if it's an EOL,
2058    * and (b) the line-related state (lineno, linebase) is not used before
2059    * it's ungotten.
2060    */
2061   int32_t getCodeUnit() {
2062     if (MOZ_LIKELY(!this->sourceUnits.atEnd())) {
2063       return CodeUnitValue(this->sourceUnits.getCodeUnit());
2064     }
2065 
2066     anyCharsAccess().flags.isEOF = true;
2067     return EOF;
2068   }
2069 
2070   void ungetCodeUnit(int32_t c) {
2071     MOZ_ASSERT_IF(c == EOF, anyCharsAccess().flags.isEOF);
2072 
2073     CharsBase::ungetCodeUnit(c);
2074   }
2075 
2076   /**
2077    * Given a just-consumed ASCII code unit/point |lead|, consume a full code
2078    * point or LineTerminatorSequence (normalizing it to '\n') and store it in
2079    * |*codePoint|.  Return true on success, otherwise return false and leave
2080    * |*codePoint| undefined on failure.
2081    *
2082    * If a LineTerminatorSequence was consumed, also update line/column info.
2083    *
2084    * This may change the current |sourceUnits| offset.
2085    */
2086   [[nodiscard]] bool getFullAsciiCodePoint(int32_t lead, int32_t* codePoint) {
2087     MOZ_ASSERT(isAsciiCodePoint(lead),
2088                "non-ASCII code units must be handled separately");
2089     MOZ_ASSERT(toUnit(lead) == this->sourceUnits.previousCodeUnit(),
2090                "getFullAsciiCodePoint called incorrectly");
2091 
2092     if (MOZ_UNLIKELY(lead == '\r')) {
2093       matchLineTerminator('\n');
2094     } else if (MOZ_LIKELY(lead != '\n')) {
2095       *codePoint = lead;
2096       return true;
2097     }
2098 
2099     *codePoint = '\n';
2100     bool ok = updateLineInfoForEOL();
2101     if (!ok) {
2102 #ifdef DEBUG
2103       *codePoint = EOF;  // sentinel value to hopefully cause errors
2104 #endif
2105       MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
2106     }
2107     return ok;
2108   }
2109 
2110   [[nodiscard]] MOZ_ALWAYS_INLINE bool updateLineInfoForEOL() {
2111     return anyCharsAccess().internalUpdateLineInfoForEOL(
2112         this->sourceUnits.offset());
2113   }
2114 
2115   uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
2116   bool matchUnicodeEscapeIdent(uint32_t* codePoint);
2117   bool matchIdentifierStart();
2118 
2119   /**
2120    * If possible, compute a line of context for an otherwise-filled-in |err|
2121    * at the given offset in this token stream.
2122    *
2123    * This function is very-internal: almost certainly you should use one of
2124    * its callers instead.  It basically exists only to make those callers
2125    * more readable.
2126    */
2127   [[nodiscard]] bool internalComputeLineOfContext(ErrorMetadata* err,
2128                                                   uint32_t offset) {
2129     // We only have line-start information for the current line.  If the error
2130     // is on a different line, we can't easily provide context.  (This means
2131     // any error in a multi-line token, e.g. an unterminated multiline string
2132     // literal, won't have context.)
2133     if (err->lineNumber != anyCharsAccess().lineno) {
2134       return true;
2135     }
2136 
2137     return addLineOfContext(err, offset);
2138   }
2139 
2140  public:
2141   /**
2142    * Consume any hashbang comment at the start of a Script or Module, if one is
2143    * present.  Stops consuming just before any terminating LineTerminator or
2144    * before an encoding error is encountered.
2145    */
2146   void consumeOptionalHashbangComment();
2147 
2148   TaggedParserAtomIndex getRawTemplateStringAtom() {
2149     TokenStreamAnyChars& anyChars = anyCharsAccess();
2150 
2151     MOZ_ASSERT(anyChars.currentToken().type == TokenKind::TemplateHead ||
2152                anyChars.currentToken().type == TokenKind::NoSubsTemplate);
2153     const Unit* cur =
2154         this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.begin + 1);
2155     const Unit* end;
2156     if (anyChars.currentToken().type == TokenKind::TemplateHead) {
2157       // Of the form    |`...${|   or   |}...${|
2158       end =
2159           this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 2);
2160     } else {
2161       // NO_SUBS_TEMPLATE is of the form   |`...`|   or   |}...`|
2162       end =
2163           this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 1);
2164     }
2165 
2166     // |charBuffer| should be empty here, but we may as well code defensively.
2167     MOZ_ASSERT(this->charBuffer.length() == 0);
2168     this->charBuffer.clear();
2169 
2170     // Template literals normalize only '\r' and "\r\n" to '\n'; Unicode
2171     // separators don't need special handling.
2172     // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
2173     if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer,
2174                                                             cur, end)) {
2175       return TaggedParserAtomIndex::null();
2176     }
2177 
2178     return drainCharBufferIntoAtom();
2179   }
2180 };
2181 
2182 template <typename Unit, class AnyCharsAccess>
2183 class TokenStreamChars;
2184 
2185 template <class AnyCharsAccess>
2186 class TokenStreamChars<char16_t, AnyCharsAccess>
2187     : public GeneralTokenStreamChars<char16_t, AnyCharsAccess> {
2188   using CharsBase = TokenStreamCharsBase<char16_t>;
2189   using SpecializedCharsBase = SpecializedTokenStreamCharsBase<char16_t>;
2190   using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
2191   using Self = TokenStreamChars<char16_t, AnyCharsAccess>;
2192 
2193   using GeneralCharsBase::asSpecific;
2194 
2195   using typename GeneralCharsBase::TokenStreamSpecific;
2196 
2197  protected:
2198   using CharsBase::matchLineTerminator;
2199   using GeneralCharsBase::anyCharsAccess;
2200   using GeneralCharsBase::getCodeUnit;
2201   using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
2202   using TokenStreamCharsShared::isAsciiCodePoint;
2203   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
2204   using GeneralCharsBase::ungetCodeUnit;
2205   using GeneralCharsBase::updateLineInfoForEOL;
2206 
2207  protected:
2208   using GeneralCharsBase::GeneralCharsBase;
2209 
2210   /**
2211    * Given the non-ASCII |lead| code unit just consumed, consume and return a
2212    * complete non-ASCII code point.  Line/column updates are not performed,
2213    * and line breaks are returned as-is without normalization.
2214    */
2215   [[nodiscard]] bool getNonAsciiCodePointDontNormalize(char16_t lead,
2216                                                        char32_t* codePoint) {
2217     // There are no encoding errors in 16-bit JS, so implement this so that
2218     // the compiler knows it, too.
2219     *codePoint = infallibleGetNonAsciiCodePointDontNormalize(lead);
2220     return true;
2221   }
2222 
2223   /**
2224    * Given a just-consumed non-ASCII code unit |lead| (which may also be a
2225    * full code point, for UTF-16), consume a full code point or
2226    * LineTerminatorSequence (normalizing it to '\n') and store it in
2227    * |*codePoint|.  Return true on success, otherwise return false and leave
2228    * |*codePoint| undefined on failure.
2229    *
2230    * If a LineTerminatorSequence was consumed, also update line/column info.
2231    *
2232    * This may change the current |sourceUnits| offset.
2233    */
2234   [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
2235 };
2236 
2237 template <class AnyCharsAccess>
2238 class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
2239     : public GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess> {
2240   using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
2241   using SpecializedCharsBase =
2242       SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>;
2243   using GeneralCharsBase =
2244       GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
2245   using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
2246 
2247   using typename SpecializedCharsBase::SourceUnitsEnd;
2248   using typename SpecializedCharsBase::SourceUnitsIterator;
2249 
2250  protected:
2251   using GeneralCharsBase::anyCharsAccess;
2252   using GeneralCharsBase::computeLineAndColumn;
2253   using GeneralCharsBase::fillExceptingContext;
2254   using GeneralCharsBase::internalComputeLineOfContext;
2255   using TokenStreamCharsShared::isAsciiCodePoint;
2256   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
2257   using GeneralCharsBase::updateLineInfoForEOL;
2258 
2259  private:
2260   static char toHexChar(uint8_t nibble) {
2261     MOZ_ASSERT(nibble < 16);
2262     return "0123456789ABCDEF"[nibble];
2263   }
2264 
2265   static void byteToString(uint8_t n, char* str) {
2266     str[0] = '0';
2267     str[1] = 'x';
2268     str[2] = toHexChar(n >> 4);
2269     str[3] = toHexChar(n & 0xF);
2270   }
2271 
2272   static void byteToTerminatedString(uint8_t n, char* str) {
2273     byteToString(n, str);
2274     str[4] = '\0';
2275   }
2276 
2277   /**
2278    * Report a UTF-8 encoding-related error for a code point starting AT THE
2279    * CURRENT OFFSET.
2280    *
2281    * |relevantUnits| indicates how many code units from the current offset
2282    * are potentially relevant to the reported error, such that they may be
2283    * included in the error message.  For example, if at the current offset we
2284    * have
2285    *
2286    *   0b1111'1111 ...
2287    *
2288    * a code unit never allowed in UTF-8, then |relevantUnits| might be 1
2289    * because only that unit is relevant.  Or if we have
2290    *
2291    *   0b1111'0111 0b1011'0101 0b0000'0000 ...
2292    *
2293    * where the first two code units are a valid prefix to a four-unit code
2294    * point but the third unit *isn't* a valid trailing code unit, then
2295    * |relevantUnits| might be 3.
2296    */
2297   MOZ_COLD void internalEncodingError(uint8_t relevantUnits,
2298                                       unsigned errorNumber, ...);
2299 
2300   // Don't use |internalEncodingError|!  Use one of the elaborated functions
2301   // that calls it, below -- all of which should be used to indicate an error
2302   // in a code point starting AT THE CURRENT OFFSET as with
2303   // |internalEncodingError|.
2304 
2305   /** Report an error for an invalid lead code unit |lead|. */
2306   MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);
2307 
2308   /**
2309    * Report an error when there aren't enough code units remaining to
2310    * constitute a full code point after |lead|: only |remaining| code units
2311    * were available for a code point starting with |lead|, when at least
2312    * |required| code units were required.
2313    */
2314   MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining,
2315                                uint8_t required);
2316 
2317   /**
2318    * Report an error for a bad trailing UTF-8 code unit, where the bad
2319    * trailing unit was the last of |unitsObserved| units examined from the
2320    * current offset.
2321    */
2322   MOZ_COLD void badTrailingUnit(uint8_t unitsObserved);
2323 
2324   // Helper used for both |badCodePoint| and |notShortestForm| for code units
2325   // that have all the requisite high bits set/unset in a manner that *could*
2326   // encode a valid code point, but the remaining bits encoding its actual
2327   // value do not define a permitted value.
2328   MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint,
2329                                               uint8_t codePointLength,
2330                                               const char* reason);
2331 
2332   /**
2333    * Report an error for UTF-8 that encodes a UTF-16 surrogate or a number
2334    * outside the Unicode range.
2335    */
2336   MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
2337     MOZ_ASSERT(unicode::IsSurrogate(codePoint) ||
2338                codePoint > unicode::NonBMPMax);
2339 
2340     badStructurallyValidCodePoint(codePoint, codePointLength,
2341                                   unicode::IsSurrogate(codePoint)
2342                                       ? "it's a UTF-16 surrogate"
2343                                       : "the maximum code point is U+10FFFF");
2344   }
2345 
2346   /**
2347    * Report an error for UTF-8 that encodes a code point not in its shortest
2348    * form.
2349    */
2350   MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
2351     MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
2352     MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
2353 
2354     badStructurallyValidCodePoint(
2355         codePoint, codePointLength,
2356         "it wasn't encoded in shortest possible form");
2357   }
2358 
2359  protected:
2360   using GeneralCharsBase::GeneralCharsBase;
2361 
2362   /**
2363    * Given the non-ASCII |lead| code unit just consumed, consume the rest of
2364    * a non-ASCII code point.  The code point is not normalized: on success
2365    * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
2366    *
2367    * Report an error if an invalid code point is encountered.
2368    */
2369   [[nodiscard]] bool getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead,
2370                                                        char32_t* codePoint);
2371 
2372   /**
2373    * Given a just-consumed non-ASCII code unit |lead|, consume a full code
2374    * point or LineTerminatorSequence (normalizing it to '\n') and store it in
2375    * |*codePoint|.  Return true on success, otherwise return false and leave
2376    * |*codePoint| undefined on failure.
2377    *
2378    * If a LineTerminatorSequence was consumed, also update line/column info.
2379    *
2380    * This function will change the current |sourceUnits| offset.
2381    */
2382   [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
2383 };
2384 
2385 // TokenStream is the lexical scanner for JavaScript source text.
2386 //
// It takes a buffer of Unit code units (char16_t for UTF-16 source text, or
// mozilla::Utf8Unit for UTF-8 source text) and linearly scans it into
// |Token|s.
2390 //
2391 // Internally the class uses a four element circular buffer |tokens| of
2392 // |Token|s. As an index for |tokens|, the member |cursor_| points to the
2393 // current token. Calls to getToken() increase |cursor_| by one and return the
2394 // new current token. If a TokenStream was just created, the current token is
2395 // uninitialized. It's therefore important that one of the first four member
2396 // functions listed below is called first. The circular buffer lets us go back
2397 // up to two tokens from the last scanned token. Internally, the relative
2398 // number of backward steps that were taken (via ungetToken()) after the last
2399 // token was scanned is stored in |lookahead|.
2400 //
2401 // The following table lists in which situations it is safe to call each listed
2402 // function. No checks are made by the functions in non-debug builds.
2403 //
2404 // Function Name     | Precondition; changes to |lookahead|
2405 // ------------------+---------------------------------------------------------
2406 // getToken          | none; if |lookahead > 0| then |lookahead--|
2407 // peekToken         | none; if |lookahead == 0| then |lookahead == 1|
2408 // peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
2409 // matchToken        | none; if |lookahead > 0| and the match succeeds then
2410 //                   |       |lookahead--|
// consumeKnownToken | |lookahead > 0| (debug-asserted); |lookahead--|
2412 // ungetToken        | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
2413 //
2414 // The behavior of the token scanning process (see getTokenInternal()) can be
2415 // modified by calling one of the first four above listed member functions with
2416 // an optional argument of type Modifier.  However, the modifier will be
2417 // ignored unless |lookahead == 0| holds.  Due to constraints of the grammar,
2418 // this turns out not to be a problem in practice. See the
2419 // mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
2420 // for more details:
// https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
2422 //
2423 // The method seek() allows rescanning from a previously visited location of
2424 // the buffer, initially computed by constructing a Position local variable.
2425 //
2426 template <typename Unit, class AnyCharsAccess>
2427 class MOZ_STACK_CLASS TokenStreamSpecific
2428     : public TokenStreamChars<Unit, AnyCharsAccess>,
2429       public TokenStreamShared,
2430       public ErrorReporter {
2431  public:
2432   using CharsBase = TokenStreamCharsBase<Unit>;
2433   using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
2434   using GeneralCharsBase = GeneralTokenStreamChars<Unit, AnyCharsAccess>;
2435   using SpecializedChars = TokenStreamChars<Unit, AnyCharsAccess>;
2436 
2437   using Position = TokenStreamPosition<Unit>;
2438 
2439   // Anything inherited through a base class whose type depends upon this
2440   // class's template parameters can only be accessed through a dependent
2441   // name: prefixed with |this|, by explicit qualification, and so on.  (This
2442   // is so that references to inherited fields are statically distinguishable
2443   // from references to names outside of the class.)  This is tedious and
2444   // onerous.
2445   //
2446   // As an alternative, we directly add every one of these functions to this
2447   // class, using explicit qualification to address the dependent-name
2448   // problem.  |this| or other qualification is no longer necessary -- at
2449   // cost of this ever-changing laundry list of |using|s.  So it goes.
2450  public:
2451   using GeneralCharsBase::anyCharsAccess;
2452   using GeneralCharsBase::computeLineAndColumn;
2453   using TokenStreamCharsShared::adoptState;
2454 
2455  private:
2456   using typename CharsBase::SourceUnits;
2457 
2458  private:
2459   using CharsBase::atomizeSourceChars;
2460   using GeneralCharsBase::badToken;
2461   // Deliberately don't |using| |charBuffer| because of bug 1472569.  :-(
2462   using CharsBase::consumeKnownCodeUnit;
2463   using CharsBase::matchCodeUnit;
2464   using CharsBase::matchLineTerminator;
2465   using CharsBase::peekCodeUnit;
2466   using GeneralCharsBase::computeColumn;
2467   using GeneralCharsBase::fillExceptingContext;
2468   using GeneralCharsBase::getCodeUnit;
2469   using GeneralCharsBase::getFullAsciiCodePoint;
2470   using GeneralCharsBase::internalComputeLineOfContext;
2471   using GeneralCharsBase::matchUnicodeEscapeIdent;
2472   using GeneralCharsBase::matchUnicodeEscapeIdStart;
2473   using GeneralCharsBase::newAtomToken;
2474   using GeneralCharsBase::newBigIntToken;
2475   using GeneralCharsBase::newNameToken;
2476   using GeneralCharsBase::newNumberToken;
2477   using GeneralCharsBase::newPrivateNameToken;
2478   using GeneralCharsBase::newRegExpToken;
2479   using GeneralCharsBase::newSimpleToken;
2480   using SpecializedChars::getNonAsciiCodePoint;
2481   using SpecializedChars::getNonAsciiCodePointDontNormalize;
2482   using TokenStreamCharsShared::copyCharBufferTo;
2483   using TokenStreamCharsShared::drainCharBufferIntoAtom;
2484   using TokenStreamCharsShared::isAsciiCodePoint;
2485   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
2486   using CharsBase::toUnit;
2487   using GeneralCharsBase::ungetCodeUnit;
2488   using GeneralCharsBase::updateLineInfoForEOL;
2489 
2490   template <typename CharU>
2491   friend class TokenStreamPosition;
2492 
2493  public:
  // Construct a token stream over the buffer of |length| code units starting
  // at |units|.  Defined out of line.
  // NOTE(review): how |cx|, |parserAtoms|, and |options| are used is not
  // visible from this declaration -- see the definition.
  TokenStreamSpecific(JSContext* cx, ParserAtomsTable* parserAtoms,
                      const JS::ReadOnlyCompileOptions& options,
                      const Unit* units, size_t length);
2497 
2498   /**
2499    * Get the next code point, converting LineTerminatorSequences to '\n' and
2500    * updating internal line-counter state if needed.  Return true on success
2501    * and store the code point in |*cp|.  Return false and leave |*cp|
2502    * undefined on failure.
2503    */
2504   [[nodiscard]] bool getCodePoint(int32_t* cp);
2505 
2506   // If there is an invalid escape in a template, report it and return false,
2507   // otherwise return true.
2508   bool checkForInvalidTemplateEscapeError() {
2509     if (anyCharsAccess().invalidTemplateEscapeType == InvalidEscapeType::None) {
2510       return true;
2511     }
2512 
2513     reportInvalidEscapeError(anyCharsAccess().invalidTemplateEscapeOffset,
2514                              anyCharsAccess().invalidTemplateEscapeType);
2515     return false;
2516   }
2517 
2518  public:
2519   // Implement ErrorReporter.
2520 
  // Compute the line and column of |offset| into |*line| and |*column| by
  // forwarding to computeLineAndColumn().
  void lineAndColumnAt(size_t offset, uint32_t* line,
                       uint32_t* column) const final {
    computeLineAndColumn(offset, line, column);
  }
2525 
2526   void currentLineAndColumn(uint32_t* line, uint32_t* column) const final {
2527     computeLineAndColumn(anyCharsAccess().currentToken().pos.begin, line,
2528                          column);
2529   }
2530 
  // Forward to |srcCoords.isOnThisLine|, which stores its answer in
  // |*onThisLine| and returns a success flag.
  // NOTE(review): peekTokenSameLine() reports out-of-memory when the
  // underlying call returns false; presumably a false return here means the
  // same -- confirm.
  bool isOnThisLine(size_t offset, uint32_t lineNum,
                    bool* onThisLine) const final {
    return anyCharsAccess().srcCoords.isOnThisLine(offset, lineNum, onThisLine);
  }
2535 
2536   uint32_t lineAt(size_t offset) const final {
2537     const auto& anyChars = anyCharsAccess();
2538     auto lineToken = anyChars.lineToken(offset);
2539     return anyChars.lineNumber(lineToken);
2540   }
2541 
2542   uint32_t columnAt(size_t offset) const final {
2543     return computeColumn(anyCharsAccess().lineToken(offset), offset);
2544   }
2545 
  // Part of the ErrorReporter implementation; defined out of line.
  bool hasTokenizationStarted() const final;
2547 
  // Return the filename as reported by the TokenStreamAnyChars state.
  const char* getFilename() const final {
    return anyCharsAccess().getFilename();
  }
2551 
2552  private:
2553   // Implement ErrorReportMixin.
2554 
  // Return the JSContext stored in the TokenStreamAnyChars state.
  JSContext* getContext() const override { return anyCharsAccess().cx; }
2556 
  // Return the strict-mode answer given by the TokenStreamAnyChars state.
  [[nodiscard]] bool strictMode() const override {
    return anyCharsAccess().strictMode();
  }
2560 
2561  public:
2562   // Implement ErrorReportMixin.
2563 
  // Return the compile options held by the TokenStreamAnyChars state.
  const JS::ReadOnlyCompileOptions& options() const final {
    return anyCharsAccess().options();
  }
2567 
  // Defined out of line.
  // NOTE(review): presumably fills |*err| with metadata describing the
  // location identified by |errorOffset| and returns false on failure --
  // confirm against the definition.
  [[nodiscard]] bool computeErrorMetadata(
      ErrorMetadata* err, const ErrorOffset& errorOffset) override;
2570 
2571  private:
2572   void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
2573     switch (type) {
2574       case InvalidEscapeType::None:
2575         MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
2576         return;
2577       case InvalidEscapeType::Hexadecimal:
2578         errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
2579         return;
2580       case InvalidEscapeType::Unicode:
2581         errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
2582         return;
2583       case InvalidEscapeType::UnicodeOverflow:
2584         errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
2585         return;
2586       case InvalidEscapeType::Octal:
2587         errorAt(offset, JSMSG_DEPRECATED_OCTAL_ESCAPE);
2588         return;
2589       case InvalidEscapeType::EightOrNine:
2590         errorAt(offset, JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE);
2591         return;
2592     }
2593   }
2594 
  // Defined out of line.
  // NOTE(review): presumably reports an error naming the illegal code point
  // |cp| -- confirm against the definition.
  void reportIllegalCharacter(int32_t cp);
2596 
  // Defined out of line.
  // NOTE(review): presumably copies the identifier starting at |identStart|
  // into the char buffer, returning false on failure -- confirm.
  [[nodiscard]] bool putIdentInCharBuffer(const Unit* identStart);
2598 
  // Predicate over a code unit value, passed to the integer-matching helpers
  // below.
  using IsIntegerUnit = bool (*)(int32_t);

  // Both helpers are defined out of line.
  // NOTE(review): presumably they consume units accepted by |isIntegerUnit|
  // and store the first unit after them in |*nextUnit| -- confirm against the
  // definitions.
  [[nodiscard]] MOZ_ALWAYS_INLINE bool matchInteger(IsIntegerUnit isIntegerUnit,
                                                    int32_t* nextUnit);
  [[nodiscard]] MOZ_ALWAYS_INLINE bool matchIntegerAfterFirstDigit(
      IsIntegerUnit isIntegerUnit, int32_t* nextUnit);
2604 
2605   /**
2606    * Tokenize a decimal number that begins at |numStart| into the provided
2607    * token.
2608    *
2609    * |unit| must be one of these values:
2610    *
2611    *   1. The first decimal digit in the integral part of a decimal number
2612    *      not starting with '0' or '.', e.g. '1' for "17", '3' for "3.14", or
2613    *      '8' for "8.675309e6".
2614    *
2615    *   In this case, the next |getCodeUnit()| must return the code unit after
2616    *   |unit| in the overall number.
2617    *
2618    *   2. The '.' in a "."/"0."-prefixed decimal number or the 'e'/'E' in a
2619    *      "0e"/"0E"-prefixed decimal number, e.g. ".17", "0.42", or "0.1e3".
2620    *
2621    *   In this case, the next |getCodeUnit()| must return the code unit
2622    *   *after* the first decimal digit *after* the '.'.  So the next code
2623    *   unit would be '7' in ".17", '2' in "0.42", 'e' in "0.4e+8", or '/' in
2624    *   "0.5/2" (three separate tokens).
2625    *
2626    *   3. The code unit after the '0' where "0" is the entire number token.
2627    *
2628    *   In this case, the next |getCodeUnit()| would return the code unit
2629    *   after |unit|, but this function will never perform such call.
2630    *
2631    *   4. (Non-strict mode code only)  The first '8' or '9' in a "noctal"
2632    *      number that begins with a '0' but contains a non-octal digit in its
2633    *      integer part so is interpreted as decimal, e.g. '9' in "09.28" or
2634    *      '8' in "0386" or '9' in "09+7" (three separate tokens").
2635    *
2636    *   In this case, the next |getCodeUnit()| returns the code unit after
2637    *   |unit|: '.', '6', or '+' in the examples above.
2638    *
2639    * This interface is super-hairy and horribly stateful.  Unfortunately, its
2640    * hair merely reflects the intricacy of ECMAScript numeric literal syntax.
2641    * And incredibly, it *improves* on the goto-based horror that predated it.
2642    */
2643   [[nodiscard]] bool decimalNumber(int32_t unit, TokenStart start,
2644                                    const Unit* numStart, Modifier modifier,
2645                                    TokenKind* out);
2646 
  /**
   * Tokenize a regular expression literal beginning at |start|.
   *
   * NOTE(review): presumably stores the resulting token kind in |*out| and
   * returns false on error -- confirm against the definition.
   */
  [[nodiscard]] bool regexpLiteral(TokenStart start, TokenKind* out);
2649 
2650   /**
2651    * Slurp characters between |start| and sourceUnits.current() into
2652    * charBuffer, to later parse into a bigint.
2653    */
2654   [[nodiscard]] bool bigIntLiteral(TokenStart start, Modifier modifier,
2655                                    TokenKind* out);
2656 
2657  public:
2658   // Advance to the next token.  If the token stream encountered an error,
2659   // return false.  Otherwise return true and store the token kind in |*ttp|.
2660   [[nodiscard]] bool getToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2661     // Check for a pushed-back token resulting from mismatching lookahead.
2662     TokenStreamAnyChars& anyChars = anyCharsAccess();
2663     if (anyChars.lookahead != 0) {
2664       MOZ_ASSERT(!anyChars.flags.hadError);
2665       anyChars.lookahead--;
2666       anyChars.advanceCursor();
2667       TokenKind tt = anyChars.currentToken().type;
2668       MOZ_ASSERT(tt != TokenKind::Eol);
2669       verifyConsistentModifier(modifier, anyChars.currentToken());
2670       *ttp = tt;
2671       return true;
2672     }
2673 
2674     return getTokenInternal(ttp, modifier);
2675   }
2676 
2677   [[nodiscard]] bool peekToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2678     TokenStreamAnyChars& anyChars = anyCharsAccess();
2679     if (anyChars.lookahead > 0) {
2680       MOZ_ASSERT(!anyChars.flags.hadError);
2681       verifyConsistentModifier(modifier, anyChars.nextToken());
2682       *ttp = anyChars.nextToken().type;
2683       return true;
2684     }
2685     if (!getTokenInternal(ttp, modifier)) {
2686       return false;
2687     }
2688     anyChars.ungetToken();
2689     return true;
2690   }
2691 
2692   [[nodiscard]] bool peekTokenPos(TokenPos* posp,
2693                                   Modifier modifier = SlashIsDiv) {
2694     TokenStreamAnyChars& anyChars = anyCharsAccess();
2695     if (anyChars.lookahead == 0) {
2696       TokenKind tt;
2697       if (!getTokenInternal(&tt, modifier)) {
2698         return false;
2699       }
2700       anyChars.ungetToken();
2701       MOZ_ASSERT(anyChars.hasLookahead());
2702     } else {
2703       MOZ_ASSERT(!anyChars.flags.hadError);
2704       verifyConsistentModifier(modifier, anyChars.nextToken());
2705     }
2706     *posp = anyChars.nextToken().pos;
2707     return true;
2708   }
2709 
2710   [[nodiscard]] bool peekOffset(uint32_t* offset,
2711                                 Modifier modifier = SlashIsDiv) {
2712     TokenPos pos;
2713     if (!peekTokenPos(&pos, modifier)) {
2714       return false;
2715     }
2716     *offset = pos.begin;
2717     return true;
2718   }
2719 
  // This is like peekToken(), with one exception:  if there is an EOL
  // between the end of the current token and the start of the next token, it
  // returns true and stores Eol in |*ttp|.  In that case, no token with
  // Eol is actually created, just an Eol TokenKind is returned, and
  // currentToken() shouldn't be consulted.  (This is the only place Eol
  // is produced.)
  //
  // Returns false if the line lookup fails (reported as out of memory) or if
  // getting the next token fails.
  [[nodiscard]] MOZ_ALWAYS_INLINE bool peekTokenSameLine(
      TokenKind* ttp, Modifier modifier = SlashIsDiv) {
    TokenStreamAnyChars& anyChars = anyCharsAccess();
    const Token& curr = anyChars.currentToken();

    // If lookahead != 0, we have scanned ahead at least one token, and
    // |lineno| is the line that the furthest-scanned token ends on.  If
    // it's the same as the line that the current token ends on, that's a
    // stronger condition than what we are looking for, and we don't need
    // to return Eol.
    if (anyChars.lookahead != 0) {
      bool onThisLine;
      if (!anyChars.srcCoords.isOnThisLine(curr.pos.end, anyChars.lineno,
                                           &onThisLine)) {
        error(JSMSG_OUT_OF_MEMORY);
        return false;
      }

      if (onThisLine) {
        MOZ_ASSERT(!anyChars.flags.hadError);
        verifyConsistentModifier(modifier, anyChars.nextToken());
        *ttp = anyChars.nextToken().type;
        return true;
      }
    }

    // The above check misses two cases where we don't have to return
    // Eol.
    // - The next token starts on the same line, but is a multi-line token.
    // - The next token starts on the same line, but lookahead==2 and there
    //   is a newline between the next token and the one after that.
    // The following test is somewhat expensive but gets these cases (and
    // all others) right.
    TokenKind tmp;
    if (!getToken(&tmp, modifier)) {
      return false;
    }

    // Take the reference while the token is still current, then push the
    // token back.
    const Token& next = anyChars.currentToken();
    anyChars.ungetToken();

    // Careful, |next| points to an initialized-but-not-allocated Token!
    // This is safe because we don't modify token data below.

    auto currentEndToken = anyChars.lineToken(curr.pos.end);
    auto nextBeginToken = anyChars.lineToken(next.pos.begin);

    *ttp =
        currentEndToken.isSameLine(nextBeginToken) ? next.type : TokenKind::Eol;
    return true;
  }
2777 
2778   // Get the next token from the stream if its kind is |tt|.
2779   [[nodiscard]] bool matchToken(bool* matchedp, TokenKind tt,
2780                                 Modifier modifier = SlashIsDiv) {
2781     TokenKind token;
2782     if (!getToken(&token, modifier)) {
2783       return false;
2784     }
2785     if (token == tt) {
2786       *matchedp = true;
2787     } else {
2788       anyCharsAccess().ungetToken();
2789       *matchedp = false;
2790     }
2791     return true;
2792   }
2793 
  // Consume the next token, which the caller knows to be of kind |tt|.
  //
  // A lookahead token must already exist (asserted).  The matchToken() call
  // and its |matched| result are both wrapped in MOZ_ALWAYS_TRUE, so neither
  // a failure nor a mismatch is reported back to the caller.
  void consumeKnownToken(TokenKind tt, Modifier modifier = SlashIsDiv) {
    bool matched;
    MOZ_ASSERT(anyCharsAccess().hasLookahead());
    MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier));
    MOZ_ALWAYS_TRUE(matched);
  }
2800 
2801   [[nodiscard]] bool nextTokenEndsExpr(bool* endsExpr) {
2802     TokenKind tt;
2803     if (!peekToken(&tt)) {
2804       return false;
2805     }
2806 
2807     *endsExpr = anyCharsAccess().isExprEnding[size_t(tt)];
2808     if (*endsExpr) {
2809       // If the next token ends an overall Expression, we'll parse this
2810       // Expression without ever invoking Parser::orExpr().  But we need that
2811       // function's DEBUG-only side effect of marking this token as safe to get
2812       // with SlashIsRegExp, so we have to do it manually here.
2813       anyCharsAccess().allowGettingNextTokenWithSlashIsRegExp();
2814     }
2815     return true;
2816   }
2817 
  // Defined out of line.
  // NOTE(review): presumably moves the stream forward to source offset
  // |position| -- confirm against the definition.
  [[nodiscard]] bool advance(size_t position);

  // Reposition this stream at |pos|.  rewind() and fastForward() are thin
  // wrappers that add a direction assertion before calling these.
  // NOTE(review): the role of |other| in the second overload is not visible
  // from this declaration -- see the definition.
  void seekTo(const Position& pos);
  [[nodiscard]] bool seekTo(const Position& pos,
                            const TokenStreamAnyChars& other);
2823 
  // Seek to |pos|, asserting that |pos.buf| is not past the address of the
  // next code unit, i.e. that this really is a move backward (or no move).
  void rewind(const Position& pos) {
    MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
               "should be rewinding here");
    seekTo(pos);
  }

  // Same direction assertion as the one-argument rewind(), but forwards
  // |other| to the two-argument seekTo() and returns its result.
  [[nodiscard]] bool rewind(const Position& pos,
                            const TokenStreamAnyChars& other) {
    MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
               "should be rewinding here");
    return seekTo(pos, other);
  }
2836 
  // Seek to |pos|, asserting that |pos.buf| is not before the address of the
  // next code unit, i.e. that this really is a move forward (or no move).
  void fastForward(const Position& pos) {
    MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
               "should be moving forward here");
    seekTo(pos);
  }

  // Same direction assertion as the one-argument fastForward(), but forwards
  // |other| to the two-argument seekTo() and returns its result.
  [[nodiscard]] bool fastForward(const Position& pos,
                                 const TokenStreamAnyChars& other) {
    MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
               "should be moving forward here");
    return seekTo(pos, other);
  }
2849 
  // Return the pointer that |sourceUnits| associates with |offset|.
  const Unit* codeUnitPtrAt(size_t offset) const {
    return this->sourceUnits.codeUnitPtrAt(offset);
  }
2853 
  // Return |sourceUnits|'s limit pointer.
  const Unit* rawLimit() const { return this->sourceUnits.limit(); }
2855 
  // Defined out of line.
  // NOTE(review): presumably tokenizes the identifier name whose code units
  // begin at |identStart|, storing the resulting kind in |*out| -- confirm
  // against the definition.
  [[nodiscard]] bool identifierName(TokenStart start, const Unit* identStart,
                                    IdentifierEscapes escaping,
                                    Modifier modifier,
                                    NameVisibility visibility, TokenKind* out);
2860 
  // Defined out of line.
  // NOTE(review): presumably consumes an identifier-start code point and
  // records in |*sawEscape| whether it was written as an escape -- confirm.
  [[nodiscard]] bool matchIdentifierStart(IdentifierEscapes* sawEscape);
2862 
  // The token scanning process proper; defined out of line.  getToken(),
  // peekToken(), and peekTokenPos() call this when there is no lookahead,
  // passing |modifier| through and receiving the token kind in |*ttp|; a
  // false return is propagated by them as failure.
  [[nodiscard]] bool getTokenInternal(TokenKind* const ttp,
                                      const Modifier modifier);
2865 
  // Defined out of line.  getTemplateToken() calls this with '`' as
  // |untilChar|.
  // NOTE(review): presumably scans string-like content up to |untilChar| and
  // stores the resulting token kind in |*out| -- confirm against the
  // definition.
  [[nodiscard]] bool getStringOrTemplateToken(char untilChar, Modifier modifier,
                                              TokenKind* out);
2868 
2869   // Parse a TemplateMiddle or TemplateTail token (one of the string-like parts
2870   // of a template string) after already consuming the leading `RightCurly`.
2871   // (The spec says the `}` is the first character of the TemplateMiddle/
2872   // TemplateTail, but we treat it as a separate token because that's much
2873   // easier to implement in both TokenStream and the parser.)
2874   //
2875   // This consumes a token and sets the current token, like `getToken()`.  It
2876   // doesn't take a Modifier because there's no risk of encountering a division
2877   // operator or RegExp literal.
2878   //
2879   // On success, `*ttp` is either `TokenKind::TemplateHead` (if we got a
2880   // TemplateMiddle token) or `TokenKind::NoSubsTemplate` (if we got a
2881   // TemplateTail). That may seem strange; there are four different template
2882   // token types in the spec, but we only use two. We use `TemplateHead` for
2883   // TemplateMiddle because both end with `...${`, and `NoSubsTemplate` for
2884   // TemplateTail because both contain the end of the template, including the
2885   // closing quote mark. They're not treated differently, either in the parser
2886   // or in the tokenizer.
2887   [[nodiscard]] bool getTemplateToken(TokenKind* ttp) {
2888     MOZ_ASSERT(anyCharsAccess().currentToken().type == TokenKind::RightCurly);
2889     return getStringOrTemplateToken('`', SlashIsInvalid, ttp);
2890   }
2891 
  // Directive handling; all four functions are defined out of line.
  // NOTE(review): presumably these recognize directive comments such as the
  // display-URL and source-mapping-URL pragmas, with |isMultiline| telling
  // which comment form is being scanned and |shouldWarnDeprecated| whether a
  // deprecated spelling should warn -- confirm against the definitions.
  [[nodiscard]] bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
  [[nodiscard]] bool getDirective(
      bool isMultiline, bool shouldWarnDeprecated, const char* directive,
      uint8_t directiveLength, const char* errorMsgPragma,
      UniquePtr<char16_t[], JS::FreePolicy>* destination);
  [[nodiscard]] bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
  [[nodiscard]] bool getSourceMappingURL(bool isMultiline,
                                         bool shouldWarnDeprecated);
2900 };
2901 
2902 // It's preferable to define this in TokenStream.cpp, but its template-ness
2903 // means we'd then have to *instantiate* this constructor for all possible
2904 // (Unit, AnyCharsAccess) pairs -- and that gets super-messy as AnyCharsAccess
2905 // *itself* is templated.  This symbol really isn't that huge compared to some
2906 // defined inline in TokenStreamSpecific, so just rely on the linker commoning
2907 // stuff up.
2908 template <typename Unit>
2909 template <class AnyCharsAccess>
2910 inline TokenStreamPosition<Unit>::TokenStreamPosition(
2911     TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream)
2912     : currentToken(tokenStream.anyCharsAccess().currentToken()) {
2913   TokenStreamAnyChars& anyChars = tokenStream.anyCharsAccess();
2914 
2915   buf =
2916       tokenStream.sourceUnits.addressOfNextCodeUnit(/* allowPoisoned = */ true);
2917   flags = anyChars.flags;
2918   lineno = anyChars.lineno;
2919   linebase = anyChars.linebase;
2920   prevLinebase = anyChars.prevLinebase;
2921   lookahead = anyChars.lookahead;
2922   currentToken = anyChars.currentToken();
2923   for (unsigned i = 0; i < anyChars.lookahead; i++) {
2924     lookaheadTokens[i] = anyChars.tokens[anyChars.aheadCursor(1 + i)];
2925   }
2926 }
2927 
// The |AnyCharsAccess| policy that TokenStream passes to TokenStreamSpecific.
// Given a pointer to the TokenStreamSpecific part of a TokenStream, it
// returns the TokenStreamAnyChars part of that same object.
//
// Both functions are only declared here; they are defined after TokenStream
// itself because they cast through that class.
class TokenStreamAnyCharsAccess {
 public:
  // Mutable access.
  template <class TokenStreamSpecific>
  static inline TokenStreamAnyChars& anyChars(TokenStreamSpecific* tss);

  // Read-only access.
  template <class TokenStreamSpecific>
  static inline const TokenStreamAnyChars& anyChars(
      const TokenStreamSpecific* tss);
};
2937 
// The concrete token stream for char16_t source text.  It combines, in one
// object, the TokenStreamAnyChars state with the TokenStreamSpecific
// tokenizer; TokenStreamAnyCharsAccess relies on this inheritance to get
// from the latter to the former by static_cast.
class MOZ_STACK_CLASS TokenStream
    : public TokenStreamAnyChars,
      public TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess> {
  using Unit = char16_t;

 public:
  // Forward |cx| and |options| to both bases, |smg| to TokenStreamAnyChars,
  // and the atoms table plus the source buffer (|units|, |length|) to
  // TokenStreamSpecific.  TokenStreamAnyChars is listed first among the
  // bases, so it is constructed first.
  TokenStream(JSContext* cx, ParserAtomsTable* parserAtoms,
              const JS::ReadOnlyCompileOptions& options, const Unit* units,
              size_t length, StrictModeGetter* smg)
      : TokenStreamAnyChars(cx, options, smg),
        TokenStreamSpecific<Unit, TokenStreamAnyCharsAccess>(
            cx, parserAtoms, options, units, length) {}
};
2951 
// A TokenStream constructed over no source at all: it passes a null atoms
// table, a null source buffer of length 0, and a null StrictModeGetter.
// NOTE(review): presumably for callers that need a TokenStream object (e.g.
// for error reporting) without any text to tokenize -- confirm with users.
class MOZ_STACK_CLASS DummyTokenStream final : public TokenStream {
 public:
  DummyTokenStream(JSContext* cx, const JS::ReadOnlyCompileOptions& options)
      : TokenStream(cx, nullptr, options, nullptr, 0, nullptr) {}
};
2957 
2958 template <class TokenStreamSpecific>
2959 /* static */ inline TokenStreamAnyChars& TokenStreamAnyCharsAccess::anyChars(
2960     TokenStreamSpecific* tss) {
2961   auto* ts = static_cast<TokenStream*>(tss);
2962   return *static_cast<TokenStreamAnyChars*>(ts);
2963 }
2964 
2965 template <class TokenStreamSpecific>
2966 /* static */ inline const TokenStreamAnyChars&
2967 TokenStreamAnyCharsAccess::anyChars(const TokenStreamSpecific* tss) {
2968   const auto* ts = static_cast<const TokenStream*>(tss);
2969   return *static_cast<const TokenStreamAnyChars*>(ts);
2970 }
2971 
// Defined elsewhere.
// NOTE(review): presumably returns a human-readable description of |tt| for
// use in diagnostics -- confirm against the definition.
extern const char* TokenKindToDesc(TokenKind tt);
2973 
2974 }  // namespace frontend
2975 }  // namespace js
2976 
// Only declared in DEBUG builds, and at global scope rather than inside
// js::frontend.
// NOTE(review): presumably returns the name of |tt| for debugging output --
// confirm against the definition.
#ifdef DEBUG
extern const char* TokenKindToString(js::frontend::TokenKind tt);
#endif
2980 
2981 #endif /* frontend_TokenStream_h */
2982