1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2  * vim: set ts=8 sts=2 et sw=2 tw=80:
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /*
8  * Streaming access to the raw tokens of JavaScript source.
9  *
10  * Because JS tokenization is context-sensitive -- a '/' could be either a
11  * regular expression *or* a division operator depending on context -- the
12  * various token stream classes are mostly not useful outside of the Parser
13  * where they reside.  We should probably eventually merge the two concepts.
14  */
15 #ifndef frontend_TokenStream_h
16 #define frontend_TokenStream_h
17 
18 /*
19  * [SMDOC] Parser Token Stream
20  *
21  * A token stream exposes the raw tokens -- operators, names, numbers,
22  * keywords, and so on -- of JavaScript source code.
23  *
24  * These are the components of the overall token stream concept:
25  * TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsBase<Unit>,
26  * TokenStreamChars<Unit>, and TokenStreamSpecific<Unit, AnyCharsAccess>.
27  *
28  * == TokenStreamShared → ∅ ==
29  *
30  * Certain aspects of tokenizing are used everywhere:
31  *
32  *   * modifiers (used to select which context-sensitive interpretation of a
33  *     character should be used to decide what token it is) and modifier
34  *     assertion handling;
35  *   * flags on the overall stream (have we encountered any characters on this
36  *     line?  have we hit a syntax error?  and so on);
37  *   * and certain token-count constants.
38  *
39  * These are all defined in TokenStreamShared.  (They could be namespace-
40  * scoped, but it seems tentatively better not to clutter the namespace.)
41  *
42  * == TokenStreamAnyChars → TokenStreamShared ==
43  *
44  * Certain aspects of tokenizing have meaning independent of the character type
45  * of the source text being tokenized: line/column number information, tokens
46  * in lookahead from determining the meaning of a prior token, compilation
47  * options, the filename, flags, source map URL, access to details of the
48  * current and next tokens (is the token of the given type?  what name or
49  * number is contained in the token?  and other queries), and others.
50  *
51  * All this data/functionality *could* be duplicated for both single-byte and
52  * double-byte tokenizing, but there are two problems.  First, it's potentially
 * wasteful if the compiler doesn't recognize it can unify the concepts.  (And
54  * if any-character concepts are intermixed with character-specific concepts,
55  * potentially the compiler *can't* unify them because offsets into the
56  * hypothetical TokenStream<Unit>s would differ.)  Second, some of this stuff
57  * needs to be accessible in ParserBase, the aspects of JS language parsing
58  * that have meaning independent of the character type of the source text being
59  * parsed.  So we need a separate data structure that ParserBase can hold on to
60  * for it.  (ParserBase isn't the only instance of this, but it's certainly the
61  * biggest case of it.)  Ergo, TokenStreamAnyChars.
62  *
63  * == TokenStreamCharsShared → ∅ ==
64  *
65  * Some functionality has meaning independent of character type, yet has no use
66  * *unless* you know the character type in actual use.  It *could* live in
67  * TokenStreamAnyChars, but it makes more sense to live in a separate class
68  * that character-aware token information can simply inherit.
69  *
70  * This class currently exists only to contain a char16_t buffer, transiently
71  * used to accumulate strings in tricky cases that can't just be read directly
72  * from source text.  It's not used outside character-aware tokenizing, so it
73  * doesn't make sense in TokenStreamAnyChars.
74  *
75  * == TokenStreamCharsBase<Unit> → TokenStreamCharsShared ==
76  *
77  * Certain data structures in tokenizing are character-type-specific: namely,
78  * the various pointers identifying the source text (including current offset
79  * and end).
80  *
81  * Additionally, some functions operating on this data are defined the same way
82  * no matter what character type you have (e.g. current offset in code units
83  * into the source text) or share a common interface regardless of character
84  * type (e.g. consume the next code unit if it has a given value).
85  *
86  * All such functionality lives in TokenStreamCharsBase<Unit>.
87  *
88  * == SpecializedTokenStreamCharsBase<Unit> → TokenStreamCharsBase<Unit> ==
89  *
90  * Certain tokenizing functionality is specific to a single character type.
91  * For example, JS's UTF-16 encoding recognizes no coding errors, because lone
92  * surrogates are not an error; but a UTF-8 encoding must recognize a variety
93  * of validation errors.  Such functionality is defined only in the appropriate
94  * SpecializedTokenStreamCharsBase specialization.
95  *
96  * == GeneralTokenStreamChars<Unit, AnyCharsAccess> →
97  *    SpecializedTokenStreamCharsBase<Unit> ==
98  *
99  * Some functionality operates differently on different character types, just
100  * as for TokenStreamCharsBase, but additionally requires access to character-
101  * type-agnostic information in TokenStreamAnyChars.  For example, getting the
102  * next character performs different steps for different character types and
103  * must access TokenStreamAnyChars to update line break information.
104  *
105  * Such functionality, if it can be defined using the same algorithm for all
106  * character types, lives in GeneralTokenStreamChars<Unit, AnyCharsAccess>.
107  * The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
108  * instance to access its corresponding TokenStreamAnyChars, without inheriting
109  * from it.
110  *
111  * GeneralTokenStreamChars<Unit, AnyCharsAccess> is just functionality, no
112  * actual member data.
113  *
114  * Such functionality all lives in TokenStreamChars<Unit, AnyCharsAccess>, a
115  * declared-but-not-defined template class whose specializations have a common
116  * public interface (plus whatever private helper functions are desirable).
117  *
118  * == TokenStreamChars<Unit, AnyCharsAccess> →
119  *    GeneralTokenStreamChars<Unit, AnyCharsAccess> ==
120  *
121  * Some functionality is like that in GeneralTokenStreamChars, *but* it's
122  * defined entirely differently for different character types.
123  *
124  * For example, consider "match a multi-code unit code point" (hypothetically:
125  * we've only implemented two-byte tokenizing right now):
126  *
127  *   * For two-byte text, there must be two code units to get, the leading code
128  *     unit must be a UTF-16 lead surrogate, and the trailing code unit must be
 *     a UTF-16 trailing surrogate.  (If any of these fail to hold, the next
 *     code unit by itself encodes a code point that is not multi-code unit.)
131  *   * For single-byte Latin-1 text, there are no multi-code unit code points.
132  *   * For single-byte UTF-8 text, the first code unit must have N > 1 of its
133  *     highest bits set (and the next unset), and |N - 1| successive code units
134  *     must have their high bit set and next-highest bit unset, *and*
135  *     concatenating all unconstrained bits together must not produce a code
136  *     point value that could have been encoded in fewer code units.
137  *
138  * This functionality can't be implemented as member functions in
139  * GeneralTokenStreamChars because we'd need to *partially specialize* those
140  * functions -- hold Unit constant while letting AnyCharsAccess vary.  But
141  * C++ forbids function template partial specialization like this: either you
142  * fix *all* parameters or you fix none of them.
143  *
144  * Fortunately, C++ *does* allow *class* template partial specialization.  So
145  * TokenStreamChars is a template class with one specialization per Unit.
146  * Functions can be defined differently in the different specializations,
147  * because AnyCharsAccess as the only template parameter on member functions
148  * *can* vary.
149  *
150  * All TokenStreamChars<Unit, AnyCharsAccess> specializations, one per Unit,
151  * are just functionality, no actual member data.
152  *
153  * == TokenStreamSpecific<Unit, AnyCharsAccess> →
154  *    TokenStreamChars<Unit, AnyCharsAccess>, TokenStreamShared,
155  *    ErrorReporter ==
156  *
157  * TokenStreamSpecific is operations that are parametrized on character type
158  * but implement the *general* idea of tokenizing, without being intrinsically
159  * tied to character type.  Notably, this includes all operations that can
160  * report warnings or errors at particular offsets, because we include a line
161  * of context with such errors -- and that necessarily accesses the raw
162  * characters of their specific type.
163  *
164  * Much TokenStreamSpecific operation depends on functionality in
165  * TokenStreamAnyChars.  The obvious solution is to inherit it -- but this
166  * doesn't work in Parser: its ParserBase base class needs some
167  * TokenStreamAnyChars functionality without knowing character type.
168  *
169  * The AnyCharsAccess type parameter is a class that statically converts from a
170  * TokenStreamSpecific* to its corresponding TokenStreamAnyChars.  The
171  * TokenStreamSpecific in Parser<ParseHandler, Unit> can then specify a class
172  * that properly converts from TokenStreamSpecific Parser::tokenStream to
173  * TokenStreamAnyChars ParserBase::anyChars.
174  *
175  * Could we hardcode one set of offset calculations for this and eliminate
176  * AnyCharsAccess?  No.  Offset calculations possibly could be hardcoded if
177  * TokenStreamSpecific were present in Parser before Parser::handler, assuring
178  * the same offsets in all Parser-related cases.  But there's still a separate
179  * TokenStream class, that requires different offset calculations.  So even if
180  * we wanted to hardcode this (it's not clear we would, because forcing the
181  * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
182  */
183 
184 #include "mozilla/ArrayUtils.h"
185 #include "mozilla/Assertions.h"
186 #include "mozilla/Attributes.h"
187 #include "mozilla/Casting.h"
188 #include "mozilla/Maybe.h"
189 #include "mozilla/MemoryChecking.h"
190 #include "mozilla/Span.h"
191 #include "mozilla/TextUtils.h"
192 #include "mozilla/Utf8.h"
193 
#include <algorithm>
#include <limits>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <type_traits>
200 
201 #include "jspubtd.h"
202 
203 #include "frontend/ErrorReporter.h"
204 #include "frontend/ParserAtom.h"  // ParserAtom, ParserAtomsTable, TaggedParserAtomIndex
205 #include "frontend/Token.h"
206 #include "frontend/TokenKind.h"
207 #include "js/CompileOptions.h"
208 #include "js/friend/ErrorMessages.h"  // JSMSG_*
209 #include "js/HashTable.h"             // js::HashMap
210 #include "js/RegExpFlags.h"           // JS::RegExpFlags
211 #include "js/UniquePtr.h"
212 #include "js/Vector.h"
213 #include "util/Unicode.h"
214 #include "vm/ErrorReporting.h"
215 
216 struct JS_PUBLIC_API JSContext;
217 struct KeywordInfo;
218 
219 namespace js {
220 
221 namespace frontend {
222 
// Saturate column number at a limit that can be represented in various parts of
// the engine. Source locations beyond this point will report at the limit
// column instead.
//
// The limit is half of the largest value representable in an int32_t.
//
// See:
//  - TokenStreamAnyChars::checkOptions
//  - ColSpan::isRepresentable
//  - WasmFrameIter::computeLine
static constexpr uint32_t ColumnLimit = std::numeric_limits<int32_t>::max() / 2;
232 
// If `name` is a reserved word, returns its TokenKind.
// Returns TokenKind::Limit otherwise.
235 extern TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name);
236 
// If `name` is a reserved word, returns its string representation.
// Returns nullptr otherwise.
239 extern const char* ReservedWordToCharZ(TaggedParserAtomIndex name);
240 
// If `tt` is a reserved word, returns its string representation.
// Returns nullptr otherwise.
243 extern const char* ReservedWordToCharZ(TokenKind tt);
244 
// The kinds of deprecated syntax the tokenizer can record having seen.  The
// values are spelled out because they are stored in a two-bit bitfield
// (|TokenStreamFlags::sawDeprecatedContent|) and so must stay within 0-3.
enum class DeprecatedContent : uint8_t {
  // Nothing deprecated has been seen.
  None = 0,
  // A legacy octal literal: a leading "0" with no "0o" prefix, e.g. 0755.
  OctalLiteral = 1,
  // An octal character escape, e.g. "hell\157 world".
  OctalEscape = 2,
  // A NonOctalDecimalEscape, i.e. "\8" or "\9".
  EightOrNineEscape = 3,
};
255 
256 struct TokenStreamFlags {
257   // Hit end of file.
258   bool isEOF : 1;
259   // Non-whitespace since start of line.
260   bool isDirtyLine : 1;
261   // Hit a syntax error, at start or during a token.
262   bool hadError : 1;
263 
264   // The nature of any deprecated content seen since last reset.
265   // We have to uint8_t instead DeprecatedContent to work around a GCC 7 bug.
266   // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414
267   uint8_t sawDeprecatedContent : 2;
268 
TokenStreamFlagsTokenStreamFlags269   TokenStreamFlags()
270       : isEOF(false),
271         isDirtyLine(false),
272         hadError(false),
273         sawDeprecatedContent(uint8_t(DeprecatedContent::None)) {}
274 };
275 
276 template <typename Unit>
277 class TokenStreamPosition;
278 
279 /**
280  * TokenStream types and constants that are used in both TokenStreamAnyChars
281  * and TokenStreamSpecific.  Do not add any non-static data members to this
282  * class!
283  */
284 class TokenStreamShared {
285  protected:
286   static constexpr size_t ntokens = 4;  // 1 current + 2 lookahead, rounded
287                                         // to power of 2 to avoid divmod by 3
288 
289   static constexpr unsigned ntokensMask = ntokens - 1;
290 
291   template <typename Unit>
292   friend class TokenStreamPosition;
293 
294  public:
295   static constexpr unsigned maxLookahead = 2;
296 
297   using Modifier = Token::Modifier;
298   static constexpr Modifier SlashIsDiv = Token::SlashIsDiv;
299   static constexpr Modifier SlashIsRegExp = Token::SlashIsRegExp;
300   static constexpr Modifier SlashIsInvalid = Token::SlashIsInvalid;
301 
verifyConsistentModifier(Modifier modifier,const Token & nextToken)302   static void verifyConsistentModifier(Modifier modifier,
303                                        const Token& nextToken) {
304     MOZ_ASSERT(
305         modifier == nextToken.modifier || modifier == SlashIsInvalid,
306         "This token was scanned with both SlashIsRegExp and SlashIsDiv, "
307         "indicating the parser is confused about how to handle a slash here. "
308         "See comment at Token::Modifier.");
309   }
310 };
311 
312 static_assert(std::is_empty_v<TokenStreamShared>,
313               "TokenStreamShared shouldn't bloat classes that inherit from it");
314 
315 template <typename Unit, class AnyCharsAccess>
316 class TokenStreamSpecific;
317 
/**
 * A stack-only, non-copyable record of a position within a token stream,
 * constructed from a |TokenStreamSpecific<Unit, AnyCharsAccess>|.
 *
 * NOTE(review): the fields below appear to be a snapshot of the like-named
 * tokenizing state in TokenStreamAnyChars (plus a pointer into the source
 * units) that |TokenStreamSpecific::seek| later restores -- confirm against
 * the constructor and |seek| definitions, which are not in this part of the
 * file.
 */
template <typename Unit>
class MOZ_STACK_CLASS TokenStreamPosition final {
 public:
  // Defined out of line; takes the token stream whose position is recorded.
  template <class AnyCharsAccess>
  inline explicit TokenStreamPosition(
      TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream);

 private:
  // Positions are not copyable.
  TokenStreamPosition(const TokenStreamPosition&) = delete;

  // Technically only TokenStreamSpecific<Unit, AnyCharsAccess>::seek with
  // Unit constant and AnyCharsAccess varying must be friended, but 1) it's
  // hard to friend one function in template classes, and 2) C++ doesn't
  // allow partial friend specialization to target just that single class.
  template <typename Char, class AnyCharsAccess>
  friend class TokenStreamSpecific;

  // Pointer into the source code units (presumably the current read position
  // at the time of construction -- TODO confirm).
  const Unit* buf;
  // Stream flags (EOF, dirty line, error, deprecated content).
  TokenStreamFlags flags;
  // Line number.
  unsigned lineno;
  // Offset of the start of the line, and of the previous line.
  size_t linebase;
  size_t prevLinebase;
  // The current token, followed by the count and contents of any lookahead
  // tokens (at most |TokenStreamShared::maxLookahead| of them).
  Token currentToken;
  unsigned lookahead;
  Token lookaheadTokens[TokenStreamShared::maxLookahead];
};
344 
345 template <typename Unit>
346 class SourceUnits;
347 
348 /**
349  * This class maps:
350  *
351  *   * a sourceUnits offset (a 0-indexed count of code units)
352  *
353  * to
354  *
355  *   * a (1-indexed) line number and
356  *   * a (0-indexed) offset in code *units* (not code points, not bytes) into
357  *     that line,
358  *
359  * for either |Unit = Utf8Unit| or |Unit = char16_t|.
360  *
361  * Note that the latter quantity is *not* the same as a column number, which is
362  * a count of code *points*.  Computing a column number requires the offset
363  * within the line and the source units of that line (including what type |Unit|
364  * is, to know how to decode them).  If you need a column number, functions in
365  * |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
366  * it.
367  */
368 class SourceCoords {
369   // For a given buffer holding source code, |lineStartOffsets_| has one
370   // element per line of source code, plus one sentinel element.  Each
371   // non-sentinel element holds the buffer offset for the start of the
372   // corresponding line of source code.  For this example script,
373   // assuming an initialLineOffset of 0:
374   //
375   // 1  // xyz            [line starts at offset 0]
376   // 2  var x;            [line starts at offset 7]
377   // 3                    [line starts at offset 14]
378   // 4  var y;            [line starts at offset 15]
379   //
380   // |lineStartOffsets_| is:
381   //
382   //   [0, 7, 14, 15, MAX_PTR]
383   //
384   // To convert a "line number" to an "index" into |lineStartOffsets_|,
385   // subtract |initialLineNum_|.  E.g. line 3's index is
386   // (3 - initialLineNum_), which is 2.  Therefore lineStartOffsets_[2]
387   // holds the buffer offset for the start of line 3, which is 14.  (Note
388   // that |initialLineNum_| is often 1, but not always.
389   //
390   // The first element is always initialLineOffset, passed to the
391   // constructor, and the last element is always the MAX_PTR sentinel.
392   //
393   // Offset-to-{line,offset-into-line} lookups are O(log n) in the worst
394   // case (binary search), but in practice they're heavily clustered and
395   // we do better than that by using the previous lookup's result
396   // (lastIndex_) as a starting point.
397   //
398   // Checking if an offset lies within a particular line number
399   // (isOnThisLine()) is O(1).
400   //
401   Vector<uint32_t, 128> lineStartOffsets_;
402 
403   /** The line number on which the source text begins. */
404   uint32_t initialLineNum_;
405 
406   /**
407    * The index corresponding to the last offset lookup -- used so that if
408    * offset lookups proceed in increasing order, and and the offset appears
409    * in the next couple lines from the last offset, we can avoid a full
410    * binary-search.
411    *
412    * This is mutable because it's modified on every search, but that fact
413    * isn't visible outside this class.
414    */
415   mutable uint32_t lastIndex_;
416 
417   uint32_t indexFromOffset(uint32_t offset) const;
418 
419   static const uint32_t MAX_PTR = UINT32_MAX;
420 
lineNumberFromIndex(uint32_t index)421   uint32_t lineNumberFromIndex(uint32_t index) const {
422     return index + initialLineNum_;
423   }
424 
indexFromLineNumber(uint32_t lineNum)425   uint32_t indexFromLineNumber(uint32_t lineNum) const {
426     return lineNum - initialLineNum_;
427   }
428 
429  public:
430   SourceCoords(JSContext* cx, uint32_t initialLineNumber,
431                uint32_t initialOffset);
432 
433   [[nodiscard]] bool add(uint32_t lineNum, uint32_t lineStartOffset);
434   [[nodiscard]] bool fill(const SourceCoords& other);
435 
isOnThisLine(uint32_t offset,uint32_t lineNum,bool * onThisLine)436   bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
437     uint32_t index = indexFromLineNumber(lineNum);
438     if (index + 1 >= lineStartOffsets_.length()) {  // +1 due to sentinel
439       return false;
440     }
441     *onThisLine = lineStartOffsets_[index] <= offset &&
442                   offset < lineStartOffsets_[index + 1];
443     return true;
444   }
445 
446   /**
447    * A token, computed for an offset in source text, that can be used to
448    * access line number and line-offset information for that offset.
449    *
450    * LineToken *alone* exposes whether the corresponding offset is in the
451    * the first line of source (which may not be 1, depending on
452    * |initialLineNumber|), and whether it's in the same line as
453    * another LineToken.
454    */
455   class LineToken {
456     uint32_t index;
457 #ifdef DEBUG
458     uint32_t offset_;  // stored for consistency-of-use assertions
459 #endif
460 
461     friend class SourceCoords;
462 
463    public:
LineToken(uint32_t index,uint32_t offset)464     LineToken(uint32_t index, uint32_t offset)
465         : index(index)
466 #ifdef DEBUG
467           ,
468           offset_(offset)
469 #endif
470     {
471     }
472 
isFirstLine()473     bool isFirstLine() const { return index == 0; }
474 
isSameLine(LineToken other)475     bool isSameLine(LineToken other) const { return index == other.index; }
476 
assertConsistentOffset(uint32_t offset)477     void assertConsistentOffset(uint32_t offset) const {
478       MOZ_ASSERT(offset_ == offset);
479     }
480   };
481 
482   /**
483    * Compute a token usable to access information about the line at the
484    * given offset.
485    *
486    * The only information directly accessible in a token is whether it
487    * corresponds to the first line of source text (which may not be line
488    * 1, depending on the |initialLineNumber| value used to construct
489    * this).  Use |lineNumber(LineToken)| to compute the actual line
490    * number (incorporating the contribution of |initialLineNumber|).
491    */
492   LineToken lineToken(uint32_t offset) const;
493 
494   /** Compute the line number for the given token. */
lineNumber(LineToken lineToken)495   uint32_t lineNumber(LineToken lineToken) const {
496     return lineNumberFromIndex(lineToken.index);
497   }
498 
499   /** Return the offset of the start of the line for |lineToken|. */
lineStart(LineToken lineToken)500   uint32_t lineStart(LineToken lineToken) const {
501     MOZ_ASSERT(lineToken.index + 1 < lineStartOffsets_.length(),
502                "recorded line-start information must be available");
503     return lineStartOffsets_[lineToken.index];
504   }
505 };
506 
// Classification stored in each |ChunkInfo|.  |ChunkInfo| keeps this in an
// |unsigned char| and asserts that the stored value is 0 or 1, so the
// enumerator values here are significant.
enum class UnitsType : unsigned char {
  // The pessimistic case: presumably the units may include multi-unit code
  // points -- TODO confirm against the column computation code.
  PossiblyMultiUnit = 0,
  // The optimized case that |ChunkInfo::guaranteeSingleUnits| switches to.
  GuaranteedSingleUnit = 1,
};
511 
512 class ChunkInfo {
513  private:
514   // Store everything in |unsigned char|s so everything packs.
515   unsigned char column_[sizeof(uint32_t)];
516   unsigned char unitsType_;
517 
518  public:
ChunkInfo(uint32_t col,UnitsType type)519   ChunkInfo(uint32_t col, UnitsType type)
520       : unitsType_(static_cast<unsigned char>(type)) {
521     memcpy(column_, &col, sizeof(col));
522   }
523 
column()524   uint32_t column() const {
525     uint32_t col;
526     memcpy(&col, column_, sizeof(uint32_t));
527     return col;
528   }
529 
unitsType()530   UnitsType unitsType() const {
531     MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
532     return static_cast<UnitsType>(unitsType_);
533   }
534 
guaranteeSingleUnits()535   void guaranteeSingleUnits() {
536     MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
537                "should only be setting to possibly optimize from the "
538                "pessimistic case");
539     unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
540   }
541 };
542 
// The kinds of invalid character escape the tokenizer distinguishes.
enum class InvalidEscapeType {
  // No invalid character escape was found.
  None = 0,
  // A \x escape that is malformed.
  Hexadecimal = 1,
  // A \u escape that is malformed.
  Unicode = 2,
  // A \u escape that is otherwise well-formed but denotes a
  // codepoint > 10FFFF.
  UnicodeOverflow = 3,
  // An octal escape inside a template token.
  Octal = 4,
  // A NonOctalDecimalEscape: \8 or \9.
  EightOrNine = 5,
};
558 
559 class TokenStreamAnyChars : public TokenStreamShared {
560  private:
561   // Constant-at-construction fields.
562 
563   JSContext* const cx;
564 
565   /** Options used for parsing/tokenizing. */
566   const JS::ReadOnlyCompileOptions& options_;
567 
568   /**
569    * Pointer used internally to test whether in strict mode.  Use |strictMode()|
570    * instead of this field.
571    */
572   StrictModeGetter* const strictModeGetter_;
573 
574   /** Input filename or null. */
575   const char* const filename_;
576 
577   // Column number computation fields.
578 
579   /**
580    * A map of (line number => sequence of the column numbers at
581    * |ColumnChunkLength|-unit boundaries rewound [if needed] to the nearest code
582    * point boundary).  (|TokenStreamAnyChars::computePartialColumn| is the sole
583    * user of |ColumnChunkLength| and therefore contains its definition.)
584    *
585    * Entries appear in this map only when a column computation of sufficient
586    * distance is performed on a line -- and only when the column is beyond the
587    * first |ColumnChunkLength| units.  Each line's vector is lazily filled as
588    * greater offsets require column computations.
589    */
590   mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
591 
592   // Computing accurate column numbers requires at *some* point linearly
593   // iterating through prior source units in the line, to properly account for
594   // multi-unit code points.  This is quadratic if counting happens repeatedly.
595   //
596   // But usually we need columns for advancing offsets through scripts.  By
597   // caching the last ((line number, offset) => relative column) mapping (in
598   // similar manner to how |SourceCoords::lastIndex_| is used to cache
599   // (offset => line number) mappings) we can usually avoid re-iterating through
600   // the common line prefix.
601   //
602   // Additionally, we avoid hash table lookup costs by caching the
603   // |Vector<ChunkInfo>*| for the line of the last lookup.  (|nullptr| means we
604   // must look it up -- or it hasn't been created yet.)  This pointer is nulled
605   // when a lookup on a new line occurs, but as it's not a pointer at literal,
606   // reallocatable element data, it's *not* invalidated when new entries are
607   // added to such a vector.
608 
609   /**
610    * The line in which the last column computation occurred, or UINT32_MAX if
611    * no prior computation has yet happened.
612    */
613   mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
614 
615   /**
616    * The chunk vector of the line for that last column computation.  This is
617    * null if the chunk vector needs to be recalculated or initially created.
618    */
619   mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
620 
621   /**
622    * The offset (in code units) of the last column computation performed,
623    * relative to source start.
624    */
625   mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
626 
627   /**
628    * The column number for the offset (in code units) of the last column
629    * computation performed, relative to source start.
630    */
631   mutable uint32_t lastComputedColumn_ = 0;
632 
633   // Intra-token fields.
634 
635   /**
636    * The offset of the first invalid escape in a template literal.  (If there is
637    * one -- if not, the value of this field is meaningless.)
638    *
639    * See also |invalidTemplateEscapeType|.
640    */
641   uint32_t invalidTemplateEscapeOffset = 0;
642 
643   /**
644    * The type of the first invalid escape in a template literal.  (If there
645    * isn't one, this will be |None|.)
646    *
647    * See also |invalidTemplateEscapeOffset|.
648    */
649   InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
650 
651   // Fields with values relevant across tokens (and therefore potentially across
652   // function boundaries, such that lazy function parsing and stream-seeking
653   // must take care in saving and restoring them).
654 
655   /** Line number and offset-to-line mapping information. */
656   SourceCoords srcCoords;
657 
658   /** Circular token buffer of gotten tokens that have been ungotten. */
659   Token tokens[ntokens] = {};
660 
661   /** The index in |tokens| of the last parsed token. */
662   unsigned cursor_ = 0;
663 
664   /** The number of tokens in |tokens| available to be gotten. */
665   unsigned lookahead = 0;
666 
667   /** The current line number. */
668   unsigned lineno;
669 
670   /** Various flag bits (see above). */
671   TokenStreamFlags flags = {};
672 
673   /** The offset of the start of the current line. */
674   size_t linebase = 0;
675 
676   /** The start of the previous line, or |size_t(-1)| on the first line. */
677   size_t prevLinebase = size_t(-1);
678 
679   /** The user's requested source URL.  Null if none has been set. */
680   UniqueTwoByteChars displayURL_ = nullptr;
681 
682   /** The URL of the source map for this script.  Null if none has been set. */
683   UniqueTwoByteChars sourceMapURL_ = nullptr;
684 
685   // Assorted boolean fields, none of which require maintenance across tokens,
686   // stored at class end to minimize padding.
687 
688   /**
689    * Whether syntax errors should or should not contain details about the
690    * precise nature of the error.  (This is intended for use in suppressing
691    * content-revealing details about syntax errors in cross-origin scripts on
692    * the web.)
693    */
694   const bool mutedErrors;
695 
696   /**
697    * An array storing whether a TokenKind observed while attempting to extend
698    * a valid AssignmentExpression into an even longer AssignmentExpression
699    * (e.g., extending '3' to '3 + 5') will terminate it without error.
700    *
701    * For example, ';' always ends an AssignmentExpression because it ends a
702    * Statement or declaration.  '}' always ends an AssignmentExpression
703    * because it terminates BlockStatement, FunctionBody, and embedded
704    * expressions in TemplateLiterals.  Therefore both entries are set to true
705    * in TokenStreamAnyChars construction.
706    *
707    * But e.g. '+' *could* extend an AssignmentExpression, so its entry here
708    * is false.  Meanwhile 'this' can't extend an AssignmentExpression, but
709    * it's only valid after a line break, so its entry here must be false.
710    *
711    * NOTE: This array could be static, but without C99's designated
712    *       initializers it's easier zeroing here and setting the true entries
713    *       in the constructor body.  (Having this per-instance might also aid
714    *       locality.)  Don't worry!  Initialization time for each TokenStream
715    *       is trivial.  See bug 639420.
716    */
717   bool isExprEnding[size_t(TokenKind::Limit)] = {};  // all-false initially
718 
719   // End of fields.
720 
721  public:
722   TokenStreamAnyChars(JSContext* cx, const JS::ReadOnlyCompileOptions& options,
723                       StrictModeGetter* smg);
724 
725   template <typename Unit, class AnyCharsAccess>
726   friend class GeneralTokenStreamChars;
727   template <typename Unit, class AnyCharsAccess>
728   friend class TokenStreamChars;
729   template <typename Unit, class AnyCharsAccess>
730   friend class TokenStreamSpecific;
731 
732   template <typename Unit>
733   friend class TokenStreamPosition;
734 
735   // Accessors.
  /** The index in |tokens| of the current token (the value of |cursor_|). */
  unsigned cursor() const { return cursor_; }
  /**
   * The index in |tokens| one slot after |cursor_|, wrapped with |ntokensMask|
   * because |tokens| is a circular buffer.
   */
  unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
aheadCursor(unsigned steps)738   unsigned aheadCursor(unsigned steps) const {
739     return (cursor_ + steps) & ntokensMask;
740   }
741 
  /** The token stored at |cursor()| in |tokens|. */
  const Token& currentToken() const { return tokens[cursor()]; }
isCurrentTokenType(TokenKind type)743   bool isCurrentTokenType(TokenKind type) const {
744     return currentToken().type == type;
745   }
746 
747   [[nodiscard]] bool checkOptions();
748 
749  private:
750   TaggedParserAtomIndex reservedWordToPropertyName(TokenKind tt) const;
751 
752  public:
currentName()753   TaggedParserAtomIndex currentName() const {
754     if (isCurrentTokenType(TokenKind::Name) ||
755         isCurrentTokenType(TokenKind::PrivateName)) {
756       return currentToken().name();
757     }
758 
759     MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
760     return reservedWordToPropertyName(currentToken().type);
761   }
762 
currentNameHasEscapes(ParserAtomsTable & parserAtoms)763   bool currentNameHasEscapes(ParserAtomsTable& parserAtoms) const {
764     if (isCurrentTokenType(TokenKind::Name) ||
765         isCurrentTokenType(TokenKind::PrivateName)) {
766       TokenPos pos = currentToken().pos;
767       return (pos.end - pos.begin) != parserAtoms.length(currentToken().name());
768     }
769 
770     MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
771     return false;
772   }
773 
isCurrentTokenAssignment()774   bool isCurrentTokenAssignment() const {
775     return TokenKindIsAssignment(currentToken().type);
776   }
777 
778   // Flag methods.
isEOF()779   bool isEOF() const { return flags.isEOF; }
hadError()780   bool hadError() const { return flags.hadError; }
781 
sawDeprecatedContent()782   DeprecatedContent sawDeprecatedContent() const {
783     return static_cast<DeprecatedContent>(flags.sawDeprecatedContent);
784   }
785 
786  private:
787   // Workaround GCC 7 sadness.
setSawDeprecatedContent(DeprecatedContent content)788   void setSawDeprecatedContent(DeprecatedContent content) {
789     flags.sawDeprecatedContent = static_cast<uint8_t>(content);
790   }
791 
792  public:
clearSawDeprecatedContent()793   void clearSawDeprecatedContent() {
794     setSawDeprecatedContent(DeprecatedContent::None);
795   }
setSawDeprecatedOctalLiteral()796   void setSawDeprecatedOctalLiteral() {
797     setSawDeprecatedContent(DeprecatedContent::OctalLiteral);
798   }
setSawDeprecatedOctalEscape()799   void setSawDeprecatedOctalEscape() {
800     setSawDeprecatedContent(DeprecatedContent::OctalEscape);
801   }
setSawDeprecatedEightOrNineEscape()802   void setSawDeprecatedEightOrNineEscape() {
803     setSawDeprecatedContent(DeprecatedContent::EightOrNineEscape);
804   }
805 
hasInvalidTemplateEscape()806   bool hasInvalidTemplateEscape() const {
807     return invalidTemplateEscapeType != InvalidEscapeType::None;
808   }
clearInvalidTemplateEscape()809   void clearInvalidTemplateEscape() {
810     invalidTemplateEscapeType = InvalidEscapeType::None;
811   }
812 
813  private:
814   // This is private because it should only be called by the tokenizer while
815   // tokenizing not by, for example, BytecodeEmitter.
strictMode()816   bool strictMode() const {
817     return strictModeGetter_ && strictModeGetter_->strictMode();
818   }
819 
setInvalidTemplateEscape(uint32_t offset,InvalidEscapeType type)820   void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
821     MOZ_ASSERT(type != InvalidEscapeType::None);
822     if (invalidTemplateEscapeType != InvalidEscapeType::None) {
823       return;
824     }
825     invalidTemplateEscapeOffset = offset;
826     invalidTemplateEscapeType = type;
827   }
828 
829  public:
// Call this immediately after parsing an OrExpression, so that the next token
// may be scanned with SlashIsRegExp without tripping the modifier assertion
// (even though it was just peeked at in SlashIsDiv mode).
//
// Disabling the assertion is safe because every caller has already peeked at
// the next token in SlashIsDiv mode and verified that it is *not* a Div
// token.
//
// These two programs show why disabling the assertion is necessary:
//
//     x = arg => q       // per spec, this is all one statement, and the
//     /a/g;              // slashes are division operators
//
//     x = arg => {}      // per spec, ASI at the end of this line
//     /a/g;              // and that's a regexp literal
//
// The first program shows why orExpr() has to use SlashIsDiv mode when
// peeking ahead for the next operator after parsing `q`.  The second program
// shows why matchOrInsertSemicolon() must use SlashIsRegExp mode when
// scanning ahead for a semicolon.
//
// Outside of DEBUG builds this function does nothing.
void allowGettingNextTokenWithSlashIsRegExp() {
#ifdef DEBUG
  // Precondition: the caller already peeked ahead at the next token, in
  // SlashIsDiv mode, and it is *not* a Div token.
  MOZ_ASSERT(hasLookahead());
  Token& peeked = tokens[nextCursor()];
  MOZ_ASSERT(peeked.modifier == SlashIsDiv);
  MOZ_ASSERT(peeked.type != TokenKind::Div);

  // Re-tag the peeked token so a later SlashIsRegExp request is accepted.
  peeked.modifier = SlashIsRegExp;
#endif
}
862 
863 #ifdef DEBUG
debugHasNoLookahead()864   inline bool debugHasNoLookahead() const { return lookahead == 0; }
865 #endif
866 
hasDisplayURL()867   bool hasDisplayURL() const { return displayURL_ != nullptr; }
868 
displayURL()869   char16_t* displayURL() { return displayURL_.get(); }
870 
hasSourceMapURL()871   bool hasSourceMapURL() const { return sourceMapURL_ != nullptr; }
872 
sourceMapURL()873   char16_t* sourceMapURL() { return sourceMapURL_.get(); }
874 
context()875   JSContext* context() const { return cx; }
876 
877   using LineToken = SourceCoords::LineToken;
878 
lineToken(uint32_t offset)879   LineToken lineToken(uint32_t offset) const {
880     return srcCoords.lineToken(offset);
881   }
882 
lineNumber(LineToken lineToken)883   uint32_t lineNumber(LineToken lineToken) const {
884     return srcCoords.lineNumber(lineToken);
885   }
886 
lineStart(LineToken lineToken)887   uint32_t lineStart(LineToken lineToken) const {
888     return srcCoords.lineStart(lineToken);
889   }
890 
891   /**
892    * Fill in |err|.
893    *
894    * If the token stream doesn't have location info for this error, use the
895    * caller's location (including line/column number) and return false.  (No
896    * line of context is set.)
897    *
898    * Otherwise fill in everything in |err| except 1) line/column numbers and
899    * 2) line-of-context-related fields and return true.  The caller *must*
900    * fill in the line/column number; filling the line of context is optional.
901    */
902   bool fillExceptingContext(ErrorMetadata* err, uint32_t offset);
903 
updateFlagsForEOL()904   MOZ_ALWAYS_INLINE void updateFlagsForEOL() { flags.isDirtyLine = false; }
905 
906  private:
907   /**
908    * Compute the "partial" column number in Unicode code points of the absolute
909    * |offset| within source text on the line of |lineToken| (which must have
910    * been computed from |offset|).
911    *
912    * A partial column number on a line that isn't the first line is just the
913    * actual column number.  But a partial column number on the first line is the
914    * column number *ignoring the initial line/column of the script*.  For
915    * example, consider this HTML with line/column number keys:
916    *
917    *                 1         2            3
918    *       0123456789012345678901234   567890
919    *     ------------------------------------
920    *   1 | <html>
921    *   2 | <head>
922    *   3 |   <script>var x = 3;  x &lt; 4;
923    *   4 | const y = 7;</script>
924    *   5 | </head>
925    *   6 | <body></body>
926    *   7 | </html>
927    *
928    * The script would be compiled specifying initial (line, column) of (3, 10)
929    * using |JS::ReadOnlyCompileOptions::{lineno,column}|.  And the column
930    * reported by |computeColumn| for the "v" of |var| would be 10.  But the
931    * partial column number of the "v" in |var|, that this function returns,
932    * would be 0.  On the other hand, the column reported by |computeColumn| and
933    * the partial column number returned by this function for the "c" in |const|
934    * would both be 0, because it's not in the first line of source text.
935    *
936    * The partial column is with respect *only* to the JavaScript source text as
937    * SpiderMonkey sees it.  In the example, the "&lt;" is converted to "<" by
938    * the browser before SpiderMonkey would see it.  So the partial column of the
939    * "4" in the inequality would be 16, not 19.
940    *
941    * Code points are not all equal length, so counting requires *some* kind of
942    * linear-time counting from the start of the line.  This function attempts
943    * various tricks to reduce this cost.  If these optimizations succeed,
944    * repeated calls to this function on a line will pay a one-time cost linear
945    * in the length of the line, then each call pays a separate constant-time
946    * cost.  If the optimizations do not succeed, this function works in time
947    * linear in the length of the line.
948    *
949    * It's unusual for a function in *this* class to be |Unit|-templated, but
950    * while this operation manages |Unit|-agnostic fields in this class and in
951    * |srcCoords|, it must *perform* |Unit|-sensitive computations to fill them.
952    * And this is the best place to do that.
953    */
954   template <typename Unit>
955   uint32_t computePartialColumn(const LineToken lineToken,
956                                 const uint32_t offset,
957                                 const SourceUnits<Unit>& sourceUnits) const;
958 
959   /**
960    * Update line/column information for the start of a new line at
961    * |lineStartOffset|.
962    */
963   [[nodiscard]] MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(
964       uint32_t lineStartOffset);
965 
966  public:
nextToken()967   const Token& nextToken() const {
968     MOZ_ASSERT(hasLookahead());
969     return tokens[nextCursor()];
970   }
971 
hasLookahead()972   bool hasLookahead() const { return lookahead > 0; }
973 
advanceCursor()974   void advanceCursor() { cursor_ = (cursor_ + 1) & ntokensMask; }
975 
retractCursor()976   void retractCursor() { cursor_ = (cursor_ - 1) & ntokensMask; }
977 
allocateToken()978   Token* allocateToken() {
979     advanceCursor();
980 
981     Token* tp = &tokens[cursor()];
982     MOZ_MAKE_MEM_UNDEFINED(tp, sizeof(*tp));
983 
984     return tp;
985   }
986 
987   // Push the last scanned token back into the stream.
ungetToken()988   void ungetToken() {
989     MOZ_ASSERT(lookahead < maxLookahead);
990     lookahead++;
991     retractCursor();
992   }
993 
994  public:
adoptState(TokenStreamAnyChars & other)995   void adoptState(TokenStreamAnyChars& other) {
996     // If |other| has fresh information from directives, overwrite any
997     // previously recorded directives.  (There is no specification directing
998     // that last-in-source-order directive controls, sadly.  We behave this way
999     // in the ordinary case, so we ought do so here too.)
1000     if (auto& url = other.displayURL_) {
1001       displayURL_ = std::move(url);
1002     }
1003     if (auto& url = other.sourceMapURL_) {
1004       sourceMapURL_ = std::move(url);
1005     }
1006   }
1007 
1008   // Compute error metadata for an error at no offset.
1009   void computeErrorMetadataNoOffset(ErrorMetadata* err);
1010 
1011   // ErrorReporter API Helpers
1012 
1013   // Provide minimal set of error reporting API given we cannot use
1014   // ErrorReportMixin here. "report" prefix is added to avoid conflict with
1015   // ErrorReportMixin methods in TokenStream class.
1016   void reportErrorNoOffset(unsigned errorNumber, ...);
1017   void reportErrorNoOffsetVA(unsigned errorNumber, va_list* args);
1018 
options()1019   const JS::ReadOnlyCompileOptions& options() const { return options_; }
1020 
getFilename()1021   const char* getFilename() const { return filename_; }
1022 };
1023 
// Numeric value of a UTF-16 code unit: the unit itself.
constexpr char16_t CodeUnitValue(char16_t unit) {
  return unit;
}
1025 
CodeUnitValue(mozilla::Utf8Unit unit)1026 constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {
1027   return unit.toUint8();
1028 }
1029 
1030 template <typename Unit>
1031 class TokenStreamCharsBase;
1032 
// Deleted catch-all overload.  For any argument type without a dedicated
// non-template overload, this template is an exact match and is therefore
// selected over overloads that would need a conversion, so such calls fail to
// compile rather than silently converting the argument.
template <typename T>
inline bool IsLineTerminator(T) = delete;
1035 
IsLineTerminator(char32_t codePoint)1036 inline bool IsLineTerminator(char32_t codePoint) {
1037   return codePoint == '\n' || codePoint == '\r' ||
1038          codePoint == unicode::LINE_SEPARATOR ||
1039          codePoint == unicode::PARA_SEPARATOR;
1040 }
1041 
// Whether the UTF-16 code unit |unit| is a LineTerminator.  Every
// LineTerminator fits in a single char16_t, so widening the unit and
// deferring to the code point overload is exact.
inline bool IsLineTerminator(char16_t unit) {
  const char32_t widened = unit;
  return IsLineTerminator(widened);
}
1046 
1047 template <typename Unit>
1048 struct SourceUnitTraits;
1049 
1050 template <>
1051 struct SourceUnitTraits<char16_t> {
1052  public:
1053   static constexpr uint8_t maxUnitsLength = 2;
1054 
1055   static constexpr size_t lengthInUnits(char32_t codePoint) {
1056     return codePoint < unicode::NonBMPMin ? 1 : 2;
1057   }
1058 };
1059 
1060 template <>
1061 struct SourceUnitTraits<mozilla::Utf8Unit> {
1062  public:
1063   static constexpr uint8_t maxUnitsLength = 4;
1064 
1065   static constexpr size_t lengthInUnits(char32_t codePoint) {
1066     return codePoint < 0x80      ? 1
1067            : codePoint < 0x800   ? 2
1068            : codePoint < 0x10000 ? 3
1069                                  : 4;
1070   }
1071 };
1072 
1073 /**
1074  * PeekedCodePoint represents the result of peeking ahead in some source text
1075  * to determine the next validly-encoded code point.
1076  *
1077  * If there isn't a valid code point, then |isNone()|.
1078  *
1079  * But if there *is* a valid code point, then |!isNone()|, the code point has
1080  * value |codePoint()| and its length in code units is |lengthInUnits()|.
1081  *
1082  * Conceptually, this class is |Maybe<struct { char32_t v; uint8_t len; }>|.
1083  */
1084 template <typename Unit>
1085 class PeekedCodePoint final {
1086   char32_t codePoint_ = 0;
1087   uint8_t lengthInUnits_ = 0;
1088 
1089  private:
1090   using SourceUnitTraits = frontend::SourceUnitTraits<Unit>;
1091 
1092   PeekedCodePoint() = default;
1093 
1094  public:
1095   /**
1096    * Create a peeked code point with the given value and length in code
1097    * units.
1098    *
1099    * While the latter value is computable from the former for both UTF-8 and
1100    * JS's version of UTF-16, the caller likely computed a length in units in
1101    * the course of determining the peeked value.  Passing both here avoids
1102    * recomputation and lets us do a consistency-checking assertion.
1103    */
1104   PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
1105       : codePoint_(codePoint), lengthInUnits_(lengthInUnits) {
1106     MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
1107     MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
1108     MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
1109   }
1110 
1111   /** Create a PeekedCodeUnit that represents no valid code point. */
1112   static PeekedCodePoint none() { return PeekedCodePoint(); }
1113 
1114   /** True if no code point was found, false otherwise. */
1115   bool isNone() const { return lengthInUnits_ == 0; }
1116 
1117   /** If a code point was found, its value. */
1118   char32_t codePoint() const {
1119     MOZ_ASSERT(!isNone());
1120     return codePoint_;
1121   }
1122 
1123   /** If a code point was found, its length in code units. */
1124   uint8_t lengthInUnits() const {
1125     MOZ_ASSERT(!isNone());
1126     return lengthInUnits_;
1127   }
1128 };
1129 
1130 inline PeekedCodePoint<char16_t> PeekCodePoint(const char16_t* const ptr,
1131                                                const char16_t* const end) {
1132   if (MOZ_UNLIKELY(ptr >= end)) {
1133     return PeekedCodePoint<char16_t>::none();
1134   }
1135 
1136   char16_t lead = ptr[0];
1137 
1138   char32_t c;
1139   uint8_t len;
1140   if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1141       MOZ_UNLIKELY(ptr + 1 >= end || !unicode::IsTrailSurrogate(ptr[1]))) {
1142     c = lead;
1143     len = 1;
1144   } else {
1145     c = unicode::UTF16Decode(lead, ptr[1]);
1146     len = 2;
1147   }
1148 
1149   return PeekedCodePoint<char16_t>(c, len);
1150 }
1151 
1152 inline PeekedCodePoint<mozilla::Utf8Unit> PeekCodePoint(
1153     const mozilla::Utf8Unit* const ptr, const mozilla::Utf8Unit* const end) {
1154   if (MOZ_UNLIKELY(ptr >= end)) {
1155     return PeekedCodePoint<mozilla::Utf8Unit>::none();
1156   }
1157 
1158   const mozilla::Utf8Unit lead = ptr[0];
1159   if (mozilla::IsAscii(lead)) {
1160     return PeekedCodePoint<mozilla::Utf8Unit>(lead.toUint8(), 1);
1161   }
1162 
1163   const mozilla::Utf8Unit* afterLead = ptr + 1;
1164   mozilla::Maybe<char32_t> codePoint =
1165       mozilla::DecodeOneUtf8CodePoint(lead, &afterLead, end);
1166   if (codePoint.isNothing()) {
1167     return PeekedCodePoint<mozilla::Utf8Unit>::none();
1168   }
1169 
1170   auto len =
1171       mozilla::AssertedCast<uint8_t>(mozilla::PointerRangeSize(ptr, afterLead));
1172   MOZ_ASSERT(len <= 4);
1173 
1174   return PeekedCodePoint<mozilla::Utf8Unit>(codePoint.value(), len);
1175 }
1176 
1177 inline bool IsSingleUnitLineTerminator(mozilla::Utf8Unit unit) {
1178   // BEWARE: The Unicode line/paragraph separators don't fit in a single
1179   //         UTF-8 code unit, so this test is exact for Utf8Unit but inexact
1180   //         for UTF-8 as a whole.  Users must handle |unit| as start of a
1181   //         Unicode LineTerminator themselves!
1182   return unit == mozilla::Utf8Unit('\n') || unit == mozilla::Utf8Unit('\r');
1183 }
1184 
1185 // This is the low-level interface to the JS source code buffer.  It just gets
1186 // raw Unicode code units -- 16-bit char16_t units of source text that are not
1187 // (always) full code points, and 8-bit units of UTF-8 source text soon.
1188 // TokenStreams functions are layered on top and do some extra stuff like
1189 // converting all EOL sequences to '\n', tracking the line number, and setting
1190 // |flags.isEOF|.  (The "raw" in "raw Unicode code units" refers to the lack of
1191 // EOL sequence normalization.)
1192 //
1193 // buf[0..length-1] often represents a substring of some larger source,
1194 // where we have only the substring in memory. The |startOffset| argument
1195 // indicates the offset within this larger string at which our string
1196 // begins, the offset of |buf[0]|.
1197 template <typename Unit>
1198 class SourceUnits {
1199  private:
1200   /** Base of buffer. */
1201   const Unit* base_;
1202 
1203   /** Offset of base_[0]. */
1204   uint32_t startOffset_;
1205 
1206   /** Limit for quick bounds check. */
1207   const Unit* limit_;
1208 
1209   /** Next char to get. */
1210   const Unit* ptr;
1211 
1212  public:
1213   SourceUnits(const Unit* units, size_t length, size_t startOffset)
1214       : base_(units),
1215         startOffset_(startOffset),
1216         limit_(units + length),
1217         ptr(units) {}
1218 
1219   bool atStart() const {
1220     MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
1221     return ptr == base_;
1222   }
1223 
1224   bool atEnd() const {
1225     MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
1226     MOZ_ASSERT(ptr <= limit_, "shouldn't have overrun");
1227     return ptr >= limit_;
1228   }
1229 
1230   size_t remaining() const {
1231     MOZ_ASSERT(!isPoisoned(),
1232                "can't get a count of remaining code units if poisoned");
1233     return mozilla::PointerRangeSize(ptr, limit_);
1234   }
1235 
1236   size_t startOffset() const { return startOffset_; }
1237 
1238   size_t offset() const {
1239     return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
1240   }
1241 
1242   const Unit* codeUnitPtrAt(size_t offset) const {
1243     MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
1244     MOZ_ASSERT(startOffset_ <= offset);
1245     MOZ_ASSERT(offset - startOffset_ <=
1246                mozilla::PointerRangeSize(base_, limit_));
1247     return base_ + (offset - startOffset_);
1248   }
1249 
1250   const Unit* current() const { return ptr; }
1251 
1252   const Unit* limit() const { return limit_; }
1253 
1254   Unit previousCodeUnit() {
1255     MOZ_ASSERT(!isPoisoned(), "can't get previous code unit if poisoned");
1256     MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
1257     return *(ptr - 1);
1258   }
1259 
1260   Unit getCodeUnit() {
1261     return *ptr++;  // this will nullptr-crash if poisoned
1262   }
1263 
1264   Unit peekCodeUnit() const {
1265     return *ptr;  // this will nullptr-crash if poisoned
1266   }
1267 
1268   /**
1269    * Determine the next code point in source text.  The code point is not
1270    * normalized: '\r', '\n', '\u2028', and '\u2029' are returned literally.
1271    * If there is no next code point because |atEnd()|, or if an encoding
1272    * error is encountered, return a |PeekedCodePoint| that |isNone()|.
1273    *
1274    * This function does not report errors: code that attempts to get the next
1275    * code point must report any error.
1276    *
1277    * If a next code point is found, it may be consumed by passing it to
1278    * |consumeKnownCodePoint|.
1279    */
1280   PeekedCodePoint<Unit> peekCodePoint() const {
1281     return PeekCodePoint(ptr, limit_);
1282   }
1283 
1284  private:
1285 #ifdef DEBUG
1286   void assertNextCodePoint(const PeekedCodePoint<Unit>& peeked);
1287 #endif
1288 
1289  public:
1290   /**
1291    * Consume a peeked code point that |!isNone()|.
1292    *
1293    * This call DOES NOT UPDATE LINE-STATUS.  You may need to call
1294    * |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
1295    * LineTerminator.  Note that if this consumes '\r', you also must consume
1296    * an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
1297    */
1298   void consumeKnownCodePoint(const PeekedCodePoint<Unit>& peeked) {
1299     MOZ_ASSERT(!peeked.isNone());
1300     MOZ_ASSERT(peeked.lengthInUnits() <= remaining());
1301 
1302 #ifdef DEBUG
1303     assertNextCodePoint(peeked);
1304 #endif
1305 
1306     ptr += peeked.lengthInUnits();
1307   }
1308 
1309   /** Match |n| hexadecimal digits and store their value in |*out|. */
1310   bool matchHexDigits(uint8_t n, char16_t* out) {
1311     MOZ_ASSERT(!isPoisoned(), "shouldn't peek into poisoned SourceUnits");
1312     MOZ_ASSERT(n <= 4, "hexdigit value can't overflow char16_t");
1313     if (n > remaining()) {
1314       return false;
1315     }
1316 
1317     char16_t v = 0;
1318     for (uint8_t i = 0; i < n; i++) {
1319       auto unit = CodeUnitValue(ptr[i]);
1320       if (!mozilla::IsAsciiHexDigit(unit)) {
1321         return false;
1322       }
1323 
1324       v = (v << 4) | mozilla::AsciiAlphanumericToNumber(unit);
1325     }
1326 
1327     *out = v;
1328     ptr += n;
1329     return true;
1330   }
1331 
1332   bool matchCodeUnits(const char* chars, uint8_t length) {
1333     MOZ_ASSERT(!isPoisoned(), "shouldn't match into poisoned SourceUnits");
1334     if (length > remaining()) {
1335       return false;
1336     }
1337 
1338     const Unit* start = ptr;
1339     const Unit* end = ptr + length;
1340     while (ptr < end) {
1341       if (*ptr++ != Unit(*chars++)) {
1342         ptr = start;
1343         return false;
1344       }
1345     }
1346 
1347     return true;
1348   }
1349 
1350   void skipCodeUnits(uint32_t n) {
1351     MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
1352     MOZ_ASSERT(n <= remaining(), "shouldn't skip beyond end of SourceUnits");
1353     ptr += n;
1354   }
1355 
1356   void unskipCodeUnits(uint32_t n) {
1357     MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
1358     MOZ_ASSERT(n <= mozilla::PointerRangeSize(base_, ptr),
1359                "shouldn't unskip beyond start of SourceUnits");
1360     ptr -= n;
1361   }
1362 
1363  private:
1364   friend class TokenStreamCharsBase<Unit>;
1365 
1366   bool internalMatchCodeUnit(Unit c) {
1367     MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
1368     if (MOZ_LIKELY(!atEnd()) && *ptr == c) {
1369       ptr++;
1370       return true;
1371     }
1372     return false;
1373   }
1374 
1375  public:
1376   void consumeKnownCodeUnit(Unit c) {
1377     MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
1378     MOZ_ASSERT(*ptr == c, "consuming the wrong code unit");
1379     ptr++;
1380   }
1381 
1382   /** Unget U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR. */
1383   inline void ungetLineOrParagraphSeparator();
1384 
1385   void ungetCodeUnit() {
1386     MOZ_ASSERT(!isPoisoned(), "can't unget from poisoned units");
1387     MOZ_ASSERT(!atStart(), "can't unget if currently at start");
1388     ptr--;
1389   }
1390 
1391   const Unit* addressOfNextCodeUnit(bool allowPoisoned = false) const {
1392     MOZ_ASSERT_IF(!allowPoisoned, !isPoisoned());
1393     return ptr;
1394   }
1395 
1396   // Use this with caution!
1397   void setAddressOfNextCodeUnit(const Unit* a, bool allowPoisoned = false) {
1398     MOZ_ASSERT_IF(!allowPoisoned, a);
1399     ptr = a;
1400   }
1401 
1402   // Poison the SourceUnits so they can't be accessed again.
1403   void poisonInDebug() {
1404 #ifdef DEBUG
1405     ptr = nullptr;
1406 #endif
1407   }
1408 
1409  private:
1410   bool isPoisoned() const {
1411 #ifdef DEBUG
1412     // |ptr| can be null for unpoisoned SourceUnits if this was initialized with
1413     // |units == nullptr| and |length == 0|.  In that case, for lack of any
1414     // better options, consider this to not be poisoned.
1415     return ptr == nullptr && ptr != limit_;
1416 #else
1417     return false;
1418 #endif
1419   }
1420 
1421  public:
1422   /**
1423    * Consume the rest of a single-line comment (but not the EOL/EOF that
1424    * terminates it).
1425    *
1426    * If an encoding error is encountered -- possible only for UTF-8 because
1427    * JavaScript's conception of UTF-16 encompasses any sequence of 16-bit
1428    * code units -- valid code points prior to the encoding error are consumed
1429    * and subsequent invalid code units are not consumed.  For example, given
1430    * these UTF-8 code units:
1431    *
1432    *   'B'   'A'  'D'  ':'   <bad code unit sequence>
1433    *   0x42  0x41 0x44 0x3A  0xD0 0x00 ...
1434    *
1435    * the first four code units are consumed, but 0xD0 and 0x00 are not
1436    * consumed because 0xD0 encodes a two-byte lead unit but 0x00 is not a
1437    * valid trailing code unit.
1438    *
1439    * It is expected that the caller will report such an encoding error when
1440    * it attempts to consume the next code point.
1441    */
1442   void consumeRestOfSingleLineComment();
1443 
1444   /**
1445    * The maximum radius of code around the location of an error that should
1446    * be included in a syntax error message -- this many code units to either
1447    * side.  The resulting window of data is then accordinngly trimmed so that
1448    * the window contains only validly-encoded data.
1449    *
1450    * Because this number is the same for both UTF-8 and UTF-16, windows in
1451    * UTF-8 may contain fewer code points than windows in UTF-16.  As we only
1452    * use this for error messages, we don't particularly care.
1453    */
1454   static constexpr size_t WindowRadius = ErrorMetadata::lineOfContextRadius;
1455 
1456   /**
1457    * From absolute offset |offset|, search backward to find an absolute
1458    * offset within source text, no further than |WindowRadius| code units
1459    * away from |offset|, such that all code points from that offset to
1460    * |offset| are valid, non-LineTerminator code points.
1461    */
1462   size_t findWindowStart(size_t offset) const;
1463 
1464   /**
1465    * From absolute offset |offset|, find an absolute offset within source
1466    * text, no further than |WindowRadius| code units away from |offset|, such
1467    * that all code units from |offset| to that offset are valid,
1468    * non-LineTerminator code points.
1469    */
1470   size_t findWindowEnd(size_t offset) const;
1471 
1472   /**
1473    * Given a |window| of |encodingSpecificWindowLength| units encoding valid
1474    * Unicode text, with index |encodingSpecificTokenOffset| indicating a
1475    * particular code point boundary in |window|, compute the corresponding
1476    * token offset and length if |window| were encoded in UTF-16.  For
1477    * example:
1478    *
1479    *   // U+03C0 GREEK SMALL LETTER PI is encoded as 0xCF 0x80.
1480    *   const Utf8Unit* encodedWindow =
1481    *     reinterpret_cast<const Utf8Unit*>(u8"ππππ = @ FAIL");
1482    *   size_t encodedTokenOffset = 11; // 2 * 4 + ' = '.length
1483    *   size_t encodedWindowLength = 17; // 2 * 4 + ' = @ FAIL'.length
1484    *   size_t utf16Offset, utf16Length;
1485    *   computeWindowOffsetAndLength(encodedWindow,
1486    *                                encodedTokenOffset, &utf16Offset,
1487    *                                encodedWindowLength, &utf16Length);
1488    *   MOZ_ASSERT(utf16Offset == 7);
1489    *   MOZ_ASSERT(utf16Length = 13);
1490    *
1491    * This function asserts if called for UTF-16: the sole caller can avoid
1492    * computing UTF-16 offsets when they're definitely the same as the encoded
1493    * offsets.
1494    */
1495   inline void computeWindowOffsetAndLength(const Unit* encodeWindow,
1496                                            size_t encodingSpecificTokenOffset,
1497                                            size_t* utf16TokenOffset,
1498                                            size_t encodingSpecificWindowLength,
1499                                            size_t* utf16WindowLength);
1500 };
1501 
1502 template <>
1503 inline void SourceUnits<char16_t>::ungetLineOrParagraphSeparator() {
1504 #ifdef DEBUG
1505   char16_t prev = previousCodeUnit();
1506 #endif
1507   MOZ_ASSERT(prev == unicode::LINE_SEPARATOR ||
1508              prev == unicode::PARA_SEPARATOR);
1509 
1510   ungetCodeUnit();
1511 }
1512 
1513 template <>
1514 inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
1515   unskipCodeUnits(3);
1516 
1517   MOZ_ASSERT(ptr[0].toUint8() == 0xE2);
1518   MOZ_ASSERT(ptr[1].toUint8() == 0x80);
1519 
1520 #ifdef DEBUG
1521   uint8_t last = ptr[2].toUint8();
1522 #endif
1523   MOZ_ASSERT(last == 0xA8 || last == 0xA9);
1524 }
1525 
/**
 * An all-purpose buffer type for accumulating text during tokenizing.
 *
 * In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or
 * |Unit|.  We use |char16_t| because:
 *
 *   * we don't have a UTF-8 regular expression parser, so in general regular
 *     expression text must be copied to a separate UTF-16 buffer to parse it,
 *     and
 *   * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared
 *     |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert.
 *
 * NOTE(review): the second |Vector| argument (32) is presumably the inline
 * capacity in elements -- confirm against the |Vector| definition.
 */
using CharBuffer = Vector<char16_t, 32>;

/**
 * Append the provided code point (in the range [U+0000, U+10FFFF], surrogate
 * code points included) to the buffer.
 *
 * The result is |[[nodiscard]]|, so callers must check it.  The function is
 * only declared here; NOTE(review): a false return presumably signals a failed
 * append (e.g. allocation failure) -- confirm at the definition.
 */
[[nodiscard]] extern bool AppendCodePointToCharBuffer(CharBuffer& charBuffer,
                                                      uint32_t codePoint);

/**
 * Accumulate the range of UTF-16 text (lone surrogates permitted, because JS
 * allows them in source text) into |charBuffer|.  Normalize '\r', '\n', and
 * "\r\n" into '\n'.
 *
 * The range is delimited by the pointers |cur| and |end|.  NOTE(review): |end|
 * is presumably exclusive -- confirm at the definition.  The |[[nodiscard]]|
 * result must be checked by the caller.
 */
[[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
    CharBuffer& charBuffer, const char16_t* cur, const char16_t* end);

/**
 * Accumulate the range of previously-validated UTF-8 text into |charBuffer|.
 * Normalize '\r', '\n', and "\r\n" into '\n'.
 *
 * This is the UTF-8 overload of the function above; because |CharBuffer|
 * stores |char16_t|, the text has to be converted as it is accumulated.  The
 * |[[nodiscard]]| result must be checked by the caller.
 */
[[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
    CharBuffer& charBuffer, const mozilla::Utf8Unit* cur,
    const mozilla::Utf8Unit* end);
1562 
1563 class TokenStreamCharsShared {
1564  protected:
1565   JSContext* cx;
1566 
1567   /**
1568    * Buffer transiently used to store sequences of identifier or string code
1569    * points when such can't be directly processed from the original source
1570    * text (e.g. because it contains escapes).
1571    */
1572   CharBuffer charBuffer;
1573 
1574   /** Information for parsing with a lifetime longer than the parser itself. */
1575   ParserAtomsTable* parserAtoms;
1576 
1577  protected:
1578   explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms)
1579       : cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {}
1580 
1581   [[nodiscard]] bool copyCharBufferTo(
1582       JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);
1583 
1584   /**
1585    * Determine whether a code unit constitutes a complete ASCII code point.
1586    * (The code point's exact value might not be used, however, if subsequent
1587    * code observes that |unit| is part of a LineTerminatorSequence.)
1588    */
1589   [[nodiscard]] static constexpr MOZ_ALWAYS_INLINE bool isAsciiCodePoint(
1590       int32_t unit) {
1591     return mozilla::IsAscii(static_cast<char32_t>(unit));
1592   }
1593 
1594   TaggedParserAtomIndex drainCharBufferIntoAtom() {
1595     // Add to parser atoms table.
1596     auto atom = this->parserAtoms->internChar16(cx, charBuffer.begin(),
1597                                                 charBuffer.length());
1598     charBuffer.clear();
1599     return atom;
1600   }
1601 
1602  protected:
1603   void adoptState(TokenStreamCharsShared& other) {
1604     // The other stream's buffer may contain information for a
1605     // gotten-then-ungotten token, that we must transfer into this stream so
1606     // that token's final get behaves as desired.
1607     charBuffer = std::move(other.charBuffer);
1608   }
1609 
1610  public:
1611   CharBuffer& getCharBuffer() { return charBuffer; }
1612 };
1613 
1614 template <typename Unit>
1615 class TokenStreamCharsBase : public TokenStreamCharsShared {
1616  protected:
1617   using SourceUnits = frontend::SourceUnits<Unit>;
1618 
1619   /** Code units in the source code being tokenized. */
1620   SourceUnits sourceUnits;
1621 
1622   // End of fields.
1623 
1624  protected:
1625   TokenStreamCharsBase(JSContext* cx, ParserAtomsTable* parserAtoms,
1626                        const Unit* units, size_t length, size_t startOffset);
1627 
1628   /**
1629    * Convert a non-EOF code unit returned by |getCodeUnit()| or
1630    * |peekCodeUnit()| to a Unit code unit.
1631    */
1632   inline Unit toUnit(int32_t codeUnitValue);
1633 
1634   void ungetCodeUnit(int32_t c) {
1635     if (c == EOF) {
1636       return;
1637     }
1638 
1639     sourceUnits.ungetCodeUnit();
1640   }
1641 
1642   MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1643   atomizeSourceChars(mozilla::Span<const Unit> units);
1644 
1645   /**
1646    * Try to match a non-LineTerminator ASCII code point.  Return true iff it
1647    * was matched.
1648    */
1649   bool matchCodeUnit(char expect) {
1650     MOZ_ASSERT(mozilla::IsAscii(expect));
1651     MOZ_ASSERT(expect != '\r');
1652     MOZ_ASSERT(expect != '\n');
1653     return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
1654   }
1655 
1656   /**
1657    * Try to match an ASCII LineTerminator code point.  Return true iff it was
1658    * matched.
1659    */
1660   bool matchLineTerminator(char expect) {
1661     MOZ_ASSERT(expect == '\r' || expect == '\n');
1662     return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
1663   }
1664 
1665   template <typename T>
1666   bool matchCodeUnit(T) = delete;
1667   template <typename T>
1668   bool matchLineTerminator(T) = delete;
1669 
1670   int32_t peekCodeUnit() {
1671     return MOZ_LIKELY(!sourceUnits.atEnd())
1672                ? CodeUnitValue(sourceUnits.peekCodeUnit())
1673                : EOF;
1674   }
1675 
1676   /** Consume a known, non-EOF code unit. */
1677   inline void consumeKnownCodeUnit(int32_t unit);
1678 
1679   // Forbid accidental calls to consumeKnownCodeUnit *not* with the single
1680   // unit-or-EOF type.  Unit should use SourceUnits::consumeKnownCodeUnit;
1681   // CodeUnitValue() results should go through toUnit(), or better yet just
1682   // use the original Unit.
1683   template <typename T>
1684   inline void consumeKnownCodeUnit(T) = delete;
1685 
1686   /**
1687    * Add a null-terminated line of context to error information, for the line
1688    * in |sourceUnits| that contains |offset|.  Also record the window's
1689    * length and the offset of the error in the window.  (Don't bother adding
1690    * a line of context if it would be empty.)
1691    *
1692    * The window will contain no LineTerminators of any kind, and it will not
1693    * extend more than |SourceUnits::WindowRadius| to either side of |offset|,
1694    * nor into the previous or next lines.
1695    *
1696    * This function is quite internal, and you probably should be calling one
1697    * of its existing callers instead.
1698    */
1699   [[nodiscard]] bool addLineOfContext(ErrorMetadata* err, uint32_t offset);
1700 };
1701 
1702 template <>
1703 inline char16_t TokenStreamCharsBase<char16_t>::toUnit(int32_t codeUnitValue) {
1704   MOZ_ASSERT(codeUnitValue != EOF, "EOF is not a Unit");
1705   return mozilla::AssertedCast<char16_t>(codeUnitValue);
1706 }
1707 
1708 template <>
1709 inline mozilla::Utf8Unit TokenStreamCharsBase<mozilla::Utf8Unit>::toUnit(
1710     int32_t value) {
1711   MOZ_ASSERT(value != EOF, "EOF is not a Unit");
1712   return mozilla::Utf8Unit(mozilla::AssertedCast<unsigned char>(value));
1713 }
1714 
1715 template <typename Unit>
1716 inline void TokenStreamCharsBase<Unit>::consumeKnownCodeUnit(int32_t unit) {
1717   sourceUnits.consumeKnownCodeUnit(toUnit(unit));
1718 }
1719 
1720 template <>
1721 MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1722 TokenStreamCharsBase<char16_t>::atomizeSourceChars(
1723     mozilla::Span<const char16_t> units) {
1724   return this->parserAtoms->internChar16(cx, units.data(), units.size());
1725 }
1726 
1727 template <>
1728 /* static */ MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1729 TokenStreamCharsBase<mozilla::Utf8Unit>::atomizeSourceChars(
1730     mozilla::Span<const mozilla::Utf8Unit> units) {
1731   return this->parserAtoms->internUtf8(cx, units.data(), units.size());
1732 }
1733 
1734 template <typename Unit>
1735 class SpecializedTokenStreamCharsBase;
1736 
1737 template <>
1738 class SpecializedTokenStreamCharsBase<char16_t>
1739     : public TokenStreamCharsBase<char16_t> {
1740   using CharsBase = TokenStreamCharsBase<char16_t>;
1741 
1742  protected:
1743   using TokenStreamCharsShared::isAsciiCodePoint;
1744   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
1745 
1746   using typename CharsBase::SourceUnits;
1747 
1748  protected:
1749   // These APIs are only usable by UTF-16-specific code.
1750 
1751   /**
1752    * Given |lead| already consumed, consume and return the code point encoded
1753    * starting from it.  Infallible because lone surrogates in JS encode a
1754    * "code point" of the same value.
1755    */
1756   char32_t infallibleGetNonAsciiCodePointDontNormalize(char16_t lead) {
1757     MOZ_ASSERT(!isAsciiCodePoint(lead));
1758     MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == lead);
1759 
1760     // Handle single-unit code points and lone trailing surrogates.
1761     if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1762         // Or handle lead surrogates not paired with trailing surrogates.
1763         MOZ_UNLIKELY(
1764             this->sourceUnits.atEnd() ||
1765             !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1766       return lead;
1767     }
1768 
1769     // Otherwise it's a multi-unit code point.
1770     return unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1771   }
1772 
1773  protected:
1774   // These APIs are in both SpecializedTokenStreamCharsBase specializations
1775   // and so are usable in subclasses no matter what Unit is.
1776 
1777   using CharsBase::CharsBase;
1778 };
1779 
1780 template <>
1781 class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
1782     : public TokenStreamCharsBase<mozilla::Utf8Unit> {
1783   using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
1784 
1785  protected:
1786   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
1787 
1788  protected:
1789   // These APIs are only usable by UTF-8-specific code.
1790 
1791   using typename CharsBase::SourceUnits;
1792 
1793   /**
1794    * A mutable iterator-wrapper around |SourceUnits| that translates
1795    * operators to calls to |SourceUnits::getCodeUnit()| and similar.
1796    *
1797    * This class is expected to be used in concert with |SourceUnitsEnd|.
1798    */
1799   class SourceUnitsIterator {
1800     SourceUnits& sourceUnits_;
1801 #ifdef DEBUG
1802     // In iterator copies created by the post-increment operator, a pointer
1803     // at the next source text code unit when the post-increment operator
1804     // was called, cleared when the iterator is dereferenced.
1805     mutable mozilla::Maybe<const mozilla::Utf8Unit*>
1806         currentBeforePostIncrement_;
1807 #endif
1808 
1809    public:
1810     explicit SourceUnitsIterator(SourceUnits& sourceUnits)
1811         : sourceUnits_(sourceUnits) {}
1812 
1813     mozilla::Utf8Unit operator*() const {
1814       // operator* is expected to get the *next* value from an iterator
1815       // not pointing at the end of the underlying range.  However, the
1816       // sole use of this is in the context of an expression of the form
1817       // |*iter++|, that performed the |sourceUnits_.getCodeUnit()| in
1818       // the |operator++(int)| below -- so dereferencing acts on a
1819       // |sourceUnits_| already advanced.  Therefore the correct unit to
1820       // return is the previous one.
1821       MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 ==
1822                  sourceUnits_.current());
1823 #ifdef DEBUG
1824       currentBeforePostIncrement_.reset();
1825 #endif
1826       return sourceUnits_.previousCodeUnit();
1827     }
1828 
1829     SourceUnitsIterator operator++(int) {
1830       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1831                  "the only valid operation on a post-incremented "
1832                  "iterator is dereferencing a single time");
1833 
1834       SourceUnitsIterator copy = *this;
1835 #ifdef DEBUG
1836       copy.currentBeforePostIncrement_.emplace(sourceUnits_.current());
1837 #endif
1838 
1839       sourceUnits_.getCodeUnit();
1840       return copy;
1841     }
1842 
1843     void operator-=(size_t n) {
1844       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1845                  "the only valid operation on a post-incremented "
1846                  "iterator is dereferencing a single time");
1847       sourceUnits_.unskipCodeUnits(n);
1848     }
1849 
1850     mozilla::Utf8Unit operator[](ptrdiff_t index) {
1851       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1852                  "the only valid operation on a post-incremented "
1853                  "iterator is dereferencing a single time");
1854       MOZ_ASSERT(index == -1,
1855                  "must only be called to verify the value of the "
1856                  "previous code unit");
1857       return sourceUnits_.previousCodeUnit();
1858     }
1859 
1860     size_t remaining() const {
1861       MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
1862                  "the only valid operation on a post-incremented "
1863                  "iterator is dereferencing a single time");
1864       return sourceUnits_.remaining();
1865     }
1866   };
1867 
1868   /** A sentinel representing the end of |SourceUnits| data. */
1869   class SourceUnitsEnd {};
1870 
1871   friend inline size_t operator-(const SourceUnitsEnd& aEnd,
1872                                  const SourceUnitsIterator& aIter);
1873 
1874  protected:
1875   // These APIs are in both SpecializedTokenStreamCharsBase specializations
1876   // and so are usable in subclasses no matter what Unit is.
1877 
1878   using CharsBase::CharsBase;
1879 };
1880 
1881 inline size_t operator-(const SpecializedTokenStreamCharsBase<
1882                             mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
1883                         const SpecializedTokenStreamCharsBase<
1884                             mozilla::Utf8Unit>::SourceUnitsIterator& aIter) {
1885   return aIter.remaining();
1886 }
1887 
/** A tiny value type that captures the offset at which a Token begins. */
class TokenStart {
  uint32_t startOffset_;

 public:
  /**
   * Record |sourceUnits|'s current offset shifted by |adjust| as the start
   * offset.  (An |adjust| of -1, for instance, designates the code unit
   * immediately before the current offset.)
   */
  template <class SourceUnits>
  TokenStart(const SourceUnits& sourceUnits, ptrdiff_t adjust)
      : startOffset_(static_cast<uint32_t>(sourceUnits.offset() + adjust)) {}

  TokenStart(const TokenStart&) = default;

  /** The recorded start offset. */
  uint32_t offset() const { return startOffset_; }
};
1906 
1907 template <typename Unit, class AnyCharsAccess>
1908 class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
1909   using CharsBase = TokenStreamCharsBase<Unit>;
1910   using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
1911 
1912   using LineToken = TokenStreamAnyChars::LineToken;
1913 
1914  private:
1915   Token* newTokenInternal(TokenKind kind, TokenStart start, TokenKind* out);
1916 
1917   /**
1918    * Allocates a new Token from the given offset to the current offset,
1919    * ascribes it the given kind, and sets |*out| to that kind.
1920    */
1921   Token* newToken(TokenKind kind, TokenStart start,
1922                   TokenStreamShared::Modifier modifier, TokenKind* out) {
1923     Token* token = newTokenInternal(kind, start, out);
1924 
1925 #ifdef DEBUG
1926     // Save the modifier used to get this token, so that if an ungetToken()
1927     // occurs and then the token is re-gotten (or peeked, etc.), we can
1928     // assert both gets used compatible modifiers.
1929     token->modifier = modifier;
1930 #endif
1931 
1932     return token;
1933   }
1934 
1935   uint32_t matchUnicodeEscape(uint32_t* codePoint);
1936   uint32_t matchExtendedUnicodeEscape(uint32_t* codePoint);
1937 
1938  protected:
1939   using CharsBase::addLineOfContext;
1940   using CharsBase::matchCodeUnit;
1941   using CharsBase::matchLineTerminator;
1942   using TokenStreamCharsShared::drainCharBufferIntoAtom;
1943   using TokenStreamCharsShared::isAsciiCodePoint;
1944   // Deliberately don't |using CharsBase::sourceUnits| because of bug 1472569.
1945   // :-(
1946   using CharsBase::toUnit;
1947 
1948   using typename CharsBase::SourceUnits;
1949 
1950  protected:
1951   using SpecializedCharsBase::SpecializedCharsBase;
1952 
1953   TokenStreamAnyChars& anyCharsAccess() {
1954     return AnyCharsAccess::anyChars(this);
1955   }
1956 
1957   const TokenStreamAnyChars& anyCharsAccess() const {
1958     return AnyCharsAccess::anyChars(this);
1959   }
1960 
1961   using TokenStreamSpecific =
1962       frontend::TokenStreamSpecific<Unit, AnyCharsAccess>;
1963 
1964   TokenStreamSpecific* asSpecific() {
1965     static_assert(
1966         std::is_base_of_v<GeneralTokenStreamChars, TokenStreamSpecific>,
1967         "static_cast below presumes an inheritance relationship");
1968 
1969     return static_cast<TokenStreamSpecific*>(this);
1970   }
1971 
1972  protected:
1973   /**
1974    * Compute the column number in Unicode code points of the absolute |offset|
1975    * within source text on the line corresponding to |lineToken|.
1976    *
1977    * |offset| must be a code point boundary, preceded only by validly-encoded
1978    * source units.  (It doesn't have to be *followed* by valid source units.)
1979    */
1980   uint32_t computeColumn(LineToken lineToken, uint32_t offset) const;
1981   void computeLineAndColumn(uint32_t offset, uint32_t* line,
1982                             uint32_t* column) const;
1983 
1984   /**
1985    * Fill in |err| completely, except for line-of-context information.
1986    *
1987    * Return true if the caller can compute a line of context from the token
1988    * stream.  Otherwise return false.
1989    */
1990   [[nodiscard]] bool fillExceptingContext(ErrorMetadata* err, uint32_t offset) {
1991     if (anyCharsAccess().fillExceptingContext(err, offset)) {
1992       computeLineAndColumn(offset, &err->lineNumber, &err->columnNumber);
1993       return true;
1994     }
1995     return false;
1996   }
1997 
1998   void newSimpleToken(TokenKind kind, TokenStart start,
1999                       TokenStreamShared::Modifier modifier, TokenKind* out) {
2000     newToken(kind, start, modifier, out);
2001   }
2002 
2003   void newNumberToken(double dval, DecimalPoint decimalPoint, TokenStart start,
2004                       TokenStreamShared::Modifier modifier, TokenKind* out) {
2005     Token* token = newToken(TokenKind::Number, start, modifier, out);
2006     token->setNumber(dval, decimalPoint);
2007   }
2008 
2009   void newBigIntToken(TokenStart start, TokenStreamShared::Modifier modifier,
2010                       TokenKind* out) {
2011     newToken(TokenKind::BigInt, start, modifier, out);
2012   }
2013 
2014   void newAtomToken(TokenKind kind, TaggedParserAtomIndex atom,
2015                     TokenStart start, TokenStreamShared::Modifier modifier,
2016                     TokenKind* out) {
2017     MOZ_ASSERT(kind == TokenKind::String || kind == TokenKind::TemplateHead ||
2018                kind == TokenKind::NoSubsTemplate);
2019 
2020     Token* token = newToken(kind, start, modifier, out);
2021     token->setAtom(atom);
2022   }
2023 
2024   void newNameToken(TaggedParserAtomIndex name, TokenStart start,
2025                     TokenStreamShared::Modifier modifier, TokenKind* out) {
2026     Token* token = newToken(TokenKind::Name, start, modifier, out);
2027     token->setName(name);
2028   }
2029 
2030   void newPrivateNameToken(TaggedParserAtomIndex name, TokenStart start,
2031                            TokenStreamShared::Modifier modifier,
2032                            TokenKind* out) {
2033     Token* token = newToken(TokenKind::PrivateName, start, modifier, out);
2034     token->setName(name);
2035   }
2036 
2037   void newRegExpToken(JS::RegExpFlags reflags, TokenStart start,
2038                       TokenKind* out) {
2039     Token* token = newToken(TokenKind::RegExp, start,
2040                             TokenStreamShared::SlashIsRegExp, out);
2041     token->setRegExpFlags(reflags);
2042   }
2043 
2044   MOZ_COLD bool badToken();
2045 
2046   /**
2047    * Get the next code unit -- the next numeric sub-unit of source text,
2048    * possibly smaller than a full code point -- without updating line/column
2049    * counters or consuming LineTerminatorSequences.
2050    *
2051    * Because of these limitations, only use this if (a) the resulting code
2052    * unit is guaranteed to be ungotten (by ungetCodeUnit()) if it's an EOL,
2053    * and (b) the line-related state (lineno, linebase) is not used before
2054    * it's ungotten.
2055    */
2056   int32_t getCodeUnit() {
2057     if (MOZ_LIKELY(!this->sourceUnits.atEnd())) {
2058       return CodeUnitValue(this->sourceUnits.getCodeUnit());
2059     }
2060 
2061     anyCharsAccess().flags.isEOF = true;
2062     return EOF;
2063   }
2064 
2065   void ungetCodeUnit(int32_t c) {
2066     MOZ_ASSERT_IF(c == EOF, anyCharsAccess().flags.isEOF);
2067 
2068     CharsBase::ungetCodeUnit(c);
2069   }
2070 
2071   /**
2072    * Given a just-consumed ASCII code unit/point |lead|, consume a full code
2073    * point or LineTerminatorSequence (normalizing it to '\n') and store it in
2074    * |*codePoint|.  Return true on success, otherwise return false and leave
2075    * |*codePoint| undefined on failure.
2076    *
2077    * If a LineTerminatorSequence was consumed, also update line/column info.
2078    *
2079    * This may change the current |sourceUnits| offset.
2080    */
2081   [[nodiscard]] bool getFullAsciiCodePoint(int32_t lead, int32_t* codePoint) {
2082     MOZ_ASSERT(isAsciiCodePoint(lead),
2083                "non-ASCII code units must be handled separately");
2084     MOZ_ASSERT(toUnit(lead) == this->sourceUnits.previousCodeUnit(),
2085                "getFullAsciiCodePoint called incorrectly");
2086 
2087     if (MOZ_UNLIKELY(lead == '\r')) {
2088       matchLineTerminator('\n');
2089     } else if (MOZ_LIKELY(lead != '\n')) {
2090       *codePoint = lead;
2091       return true;
2092     }
2093 
2094     *codePoint = '\n';
2095     bool ok = updateLineInfoForEOL();
2096     if (!ok) {
2097 #ifdef DEBUG
2098       *codePoint = EOF;  // sentinel value to hopefully cause errors
2099 #endif
2100       MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
2101     }
2102     return ok;
2103   }
2104 
2105   [[nodiscard]] MOZ_ALWAYS_INLINE bool updateLineInfoForEOL() {
2106     return anyCharsAccess().internalUpdateLineInfoForEOL(
2107         this->sourceUnits.offset());
2108   }
2109 
2110   uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
2111   bool matchUnicodeEscapeIdent(uint32_t* codePoint);
2112   bool matchIdentifierStart();
2113 
2114   /**
2115    * If possible, compute a line of context for an otherwise-filled-in |err|
2116    * at the given offset in this token stream.
2117    *
2118    * This function is very-internal: almost certainly you should use one of
2119    * its callers instead.  It basically exists only to make those callers
2120    * more readable.
2121    */
2122   [[nodiscard]] bool internalComputeLineOfContext(ErrorMetadata* err,
2123                                                   uint32_t offset) {
2124     // We only have line-start information for the current line.  If the error
2125     // is on a different line, we can't easily provide context.  (This means
2126     // any error in a multi-line token, e.g. an unterminated multiline string
2127     // literal, won't have context.)
2128     if (err->lineNumber != anyCharsAccess().lineno) {
2129       return true;
2130     }
2131 
2132     return addLineOfContext(err, offset);
2133   }
2134 
2135  public:
2136   /**
2137    * Consume any hashbang comment at the start of a Script or Module, if one is
2138    * present.  Stops consuming just before any terminating LineTerminator or
2139    * before an encoding error is encountered.
2140    */
2141   void consumeOptionalHashbangComment();
2142 
2143   TaggedParserAtomIndex getRawTemplateStringAtom() {
2144     TokenStreamAnyChars& anyChars = anyCharsAccess();
2145 
2146     MOZ_ASSERT(anyChars.currentToken().type == TokenKind::TemplateHead ||
2147                anyChars.currentToken().type == TokenKind::NoSubsTemplate);
2148     const Unit* cur =
2149         this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.begin + 1);
2150     const Unit* end;
2151     if (anyChars.currentToken().type == TokenKind::TemplateHead) {
2152       // Of the form    |`...${|   or   |}...${|
2153       end =
2154           this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 2);
2155     } else {
2156       // NO_SUBS_TEMPLATE is of the form   |`...`|   or   |}...`|
2157       end =
2158           this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 1);
2159     }
2160 
2161     // |charBuffer| should be empty here, but we may as well code defensively.
2162     MOZ_ASSERT(this->charBuffer.length() == 0);
2163     this->charBuffer.clear();
2164 
2165     // Template literals normalize only '\r' and "\r\n" to '\n'; Unicode
2166     // separators don't need special handling.
2167     // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
2168     if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer,
2169                                                             cur, end)) {
2170       return TaggedParserAtomIndex::null();
2171     }
2172 
2173     return drainCharBufferIntoAtom();
2174   }
2175 };
2176 
2177 template <typename Unit, class AnyCharsAccess>
2178 class TokenStreamChars;
2179 
2180 template <class AnyCharsAccess>
2181 class TokenStreamChars<char16_t, AnyCharsAccess>
2182     : public GeneralTokenStreamChars<char16_t, AnyCharsAccess> {
2183   using CharsBase = TokenStreamCharsBase<char16_t>;
2184   using SpecializedCharsBase = SpecializedTokenStreamCharsBase<char16_t>;
2185   using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
2186   using Self = TokenStreamChars<char16_t, AnyCharsAccess>;
2187 
2188   using GeneralCharsBase::asSpecific;
2189 
2190   using typename GeneralCharsBase::TokenStreamSpecific;
2191 
2192  protected:
2193   using CharsBase::matchLineTerminator;
2194   using GeneralCharsBase::anyCharsAccess;
2195   using GeneralCharsBase::getCodeUnit;
2196   using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
2197   using TokenStreamCharsShared::isAsciiCodePoint;
2198   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
2199   using GeneralCharsBase::ungetCodeUnit;
2200   using GeneralCharsBase::updateLineInfoForEOL;
2201 
2202  protected:
2203   using GeneralCharsBase::GeneralCharsBase;
2204 
2205   /**
2206    * Given the non-ASCII |lead| code unit just consumed, consume and return a
2207    * complete non-ASCII code point.  Line/column updates are not performed,
2208    * and line breaks are returned as-is without normalization.
2209    */
2210   [[nodiscard]] bool getNonAsciiCodePointDontNormalize(char16_t lead,
2211                                                        char32_t* codePoint) {
2212     // There are no encoding errors in 16-bit JS, so implement this so that
2213     // the compiler knows it, too.
2214     *codePoint = infallibleGetNonAsciiCodePointDontNormalize(lead);
2215     return true;
2216   }
2217 
2218   /**
2219    * Given a just-consumed non-ASCII code unit |lead| (which may also be a
2220    * full code point, for UTF-16), consume a full code point or
2221    * LineTerminatorSequence (normalizing it to '\n') and store it in
2222    * |*codePoint|.  Return true on success, otherwise return false and leave
2223    * |*codePoint| undefined on failure.
2224    *
2225    * If a LineTerminatorSequence was consumed, also update line/column info.
2226    *
2227    * This may change the current |sourceUnits| offset.
2228    */
2229   [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
2230 };
2231 
2232 template <class AnyCharsAccess>
2233 class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
2234     : public GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess> {
2235   using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
2236   using SpecializedCharsBase =
2237       SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>;
2238   using GeneralCharsBase =
2239       GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
2240   using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
2241 
2242   using typename SpecializedCharsBase::SourceUnitsEnd;
2243   using typename SpecializedCharsBase::SourceUnitsIterator;
2244 
2245  protected:
2246   using GeneralCharsBase::anyCharsAccess;
2247   using GeneralCharsBase::computeLineAndColumn;
2248   using GeneralCharsBase::fillExceptingContext;
2249   using GeneralCharsBase::internalComputeLineOfContext;
2250   using TokenStreamCharsShared::isAsciiCodePoint;
2251   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
2252   using GeneralCharsBase::updateLineInfoForEOL;
2253 
2254  private:
2255   static char toHexChar(uint8_t nibble) {
2256     MOZ_ASSERT(nibble < 16);
2257     return "0123456789ABCDEF"[nibble];
2258   }
2259 
2260   static void byteToString(uint8_t n, char* str) {
2261     str[0] = '0';
2262     str[1] = 'x';
2263     str[2] = toHexChar(n >> 4);
2264     str[3] = toHexChar(n & 0xF);
2265   }
2266 
2267   static void byteToTerminatedString(uint8_t n, char* str) {
2268     byteToString(n, str);
2269     str[4] = '\0';
2270   }
2271 
2272   /**
2273    * Report a UTF-8 encoding-related error for a code point starting AT THE
2274    * CURRENT OFFSET.
2275    *
2276    * |relevantUnits| indicates how many code units from the current offset
2277    * are potentially relevant to the reported error, such that they may be
2278    * included in the error message.  For example, if at the current offset we
2279    * have
2280    *
2281    *   0b1111'1111 ...
2282    *
2283    * a code unit never allowed in UTF-8, then |relevantUnits| might be 1
2284    * because only that unit is relevant.  Or if we have
2285    *
2286    *   0b1111'0111 0b1011'0101 0b0000'0000 ...
2287    *
2288    * where the first two code units are a valid prefix to a four-unit code
2289    * point but the third unit *isn't* a valid trailing code unit, then
2290    * |relevantUnits| might be 3.
2291    */
2292   MOZ_COLD void internalEncodingError(uint8_t relevantUnits,
2293                                       unsigned errorNumber, ...);
2294 
2295   // Don't use |internalEncodingError|!  Use one of the elaborated functions
2296   // that calls it, below -- all of which should be used to indicate an error
2297   // in a code point starting AT THE CURRENT OFFSET as with
2298   // |internalEncodingError|.
2299 
2300   /** Report an error for an invalid lead code unit |lead|. */
2301   MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);
2302 
2303   /**
2304    * Report an error when there aren't enough code units remaining to
2305    * constitute a full code point after |lead|: only |remaining| code units
2306    * were available for a code point starting with |lead|, when at least
2307    * |required| code units were required.
2308    */
2309   MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining,
2310                                uint8_t required);
2311 
2312   /**
2313    * Report an error for a bad trailing UTF-8 code unit, where the bad
2314    * trailing unit was the last of |unitsObserved| units examined from the
2315    * current offset.
2316    */
2317   MOZ_COLD void badTrailingUnit(uint8_t unitsObserved);
2318 
2319   // Helper used for both |badCodePoint| and |notShortestForm| for code units
2320   // that have all the requisite high bits set/unset in a manner that *could*
2321   // encode a valid code point, but the remaining bits encoding its actual
2322   // value do not define a permitted value.
2323   MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint,
2324                                               uint8_t codePointLength,
2325                                               const char* reason);
2326 
2327   /**
2328    * Report an error for UTF-8 that encodes a UTF-16 surrogate or a number
2329    * outside the Unicode range.
2330    */
2331   MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
2332     MOZ_ASSERT(unicode::IsSurrogate(codePoint) ||
2333                codePoint > unicode::NonBMPMax);
2334 
2335     badStructurallyValidCodePoint(codePoint, codePointLength,
2336                                   unicode::IsSurrogate(codePoint)
2337                                       ? "it's a UTF-16 surrogate"
2338                                       : "the maximum code point is U+10FFFF");
2339   }
2340 
2341   /**
2342    * Report an error for UTF-8 that encodes a code point not in its shortest
2343    * form.
2344    */
2345   MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
2346     MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
2347     MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
2348 
2349     badStructurallyValidCodePoint(
2350         codePoint, codePointLength,
2351         "it wasn't encoded in shortest possible form");
2352   }
2353 
2354  protected:
  // Inherit GeneralCharsBase's constructors instead of redeclaring them.
  using GeneralCharsBase::GeneralCharsBase;
2356 
  /**
   * Given the non-ASCII |lead| code unit just consumed, consume the rest of
   * a non-ASCII code point.  The code point is not normalized: on success
   * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
   *
   * Report an error if an invalid code point is encountered.
   *
   * The result is [[nodiscard]], so callers must check it.
   * NOTE(review): presumably false is returned exactly when an error was
   * reported -- confirm against the definition.
   */
  [[nodiscard]] bool getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead,
                                                       char32_t* codePoint);
2366 
  /**
   * Given a just-consumed non-ASCII code unit |lead|, consume a full code
   * point or LineTerminatorSequence (normalizing it to '\n') and store it in
   * |*codePoint|.  Return true on success, otherwise return false and leave
   * |*codePoint| undefined on failure.
   *
   * If a LineTerminatorSequence was consumed, also update line/column info.
   *
   * This function will change the current |sourceUnits| offset.
   *
   * Note that, unlike |getNonAsciiCodePointDontNormalize|, both |lead| and
   * the result are passed as |int32_t| here rather than as
   * |mozilla::Utf8Unit| and |char32_t|.
   */
  [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
2378 };
2379 
2380 // TokenStream is the lexical scanner for JavaScript source text.
2381 //
// It takes a buffer of Unit code units (char16_t encoding UTF-16 for the
// concrete |TokenStream| class below; the class template itself is
// parameterized on Unit, and this file also contains UTF-8 handling based on
// mozilla::Utf8Unit) and linearly scans it into |Token|s.
2385 //
2386 // Internally the class uses a four element circular buffer |tokens| of
2387 // |Token|s. As an index for |tokens|, the member |cursor_| points to the
2388 // current token. Calls to getToken() increase |cursor_| by one and return the
2389 // new current token. If a TokenStream was just created, the current token is
2390 // uninitialized. It's therefore important that one of the first four member
2391 // functions listed below is called first. The circular buffer lets us go back
2392 // up to two tokens from the last scanned token. Internally, the relative
2393 // number of backward steps that were taken (via ungetToken()) after the last
2394 // token was scanned is stored in |lookahead|.
2395 //
2396 // The following table lists in which situations it is safe to call each listed
2397 // function. No checks are made by the functions in non-debug builds.
2398 //
2399 // Function Name     | Precondition; changes to |lookahead|
2400 // ------------------+---------------------------------------------------------
2401 // getToken          | none; if |lookahead > 0| then |lookahead--|
2402 // peekToken         | none; if |lookahead == 0| then |lookahead == 1|
2403 // peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
2404 // matchToken        | none; if |lookahead > 0| and the match succeeds then
2405 //                   |       |lookahead--|
2406 // consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
2407 // ungetToken        | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
2408 //
2409 // The behavior of the token scanning process (see getTokenInternal()) can be
2410 // modified by calling one of the first four above listed member functions with
2411 // an optional argument of type Modifier.  However, the modifier will be
2412 // ignored unless |lookahead == 0| holds.  Due to constraints of the grammar,
// this turns out not to be a problem in practice.  For more details, see the
// mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?':
// https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
2417 //
2418 // The method seek() allows rescanning from a previously visited location of
2419 // the buffer, initially computed by constructing a Position local variable.
2420 //
2421 template <typename Unit, class AnyCharsAccess>
2422 class MOZ_STACK_CLASS TokenStreamSpecific
2423     : public TokenStreamChars<Unit, AnyCharsAccess>,
2424       public TokenStreamShared,
2425       public ErrorReporter {
2426  public:
2427   using CharsBase = TokenStreamCharsBase<Unit>;
2428   using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
2429   using GeneralCharsBase = GeneralTokenStreamChars<Unit, AnyCharsAccess>;
2430   using SpecializedChars = TokenStreamChars<Unit, AnyCharsAccess>;
2431 
2432   using Position = TokenStreamPosition<Unit>;
2433 
2434   // Anything inherited through a base class whose type depends upon this
2435   // class's template parameters can only be accessed through a dependent
2436   // name: prefixed with |this|, by explicit qualification, and so on.  (This
2437   // is so that references to inherited fields are statically distinguishable
2438   // from references to names outside of the class.)  This is tedious and
2439   // onerous.
2440   //
2441   // As an alternative, we directly add every one of these functions to this
2442   // class, using explicit qualification to address the dependent-name
2443   // problem.  |this| or other qualification is no longer necessary -- at
2444   // cost of this ever-changing laundry list of |using|s.  So it goes.
2445  public:
2446   using GeneralCharsBase::anyCharsAccess;
2447   using GeneralCharsBase::computeLineAndColumn;
2448   using TokenStreamCharsShared::adoptState;
2449 
2450  private:
2451   using typename CharsBase::SourceUnits;
2452 
2453  private:
2454   using CharsBase::atomizeSourceChars;
2455   using GeneralCharsBase::badToken;
2456   // Deliberately don't |using| |charBuffer| because of bug 1472569.  :-(
2457   using CharsBase::consumeKnownCodeUnit;
2458   using CharsBase::matchCodeUnit;
2459   using CharsBase::matchLineTerminator;
2460   using CharsBase::peekCodeUnit;
2461   using GeneralCharsBase::computeColumn;
2462   using GeneralCharsBase::fillExceptingContext;
2463   using GeneralCharsBase::getCodeUnit;
2464   using GeneralCharsBase::getFullAsciiCodePoint;
2465   using GeneralCharsBase::internalComputeLineOfContext;
2466   using GeneralCharsBase::matchUnicodeEscapeIdent;
2467   using GeneralCharsBase::matchUnicodeEscapeIdStart;
2468   using GeneralCharsBase::newAtomToken;
2469   using GeneralCharsBase::newBigIntToken;
2470   using GeneralCharsBase::newNameToken;
2471   using GeneralCharsBase::newNumberToken;
2472   using GeneralCharsBase::newPrivateNameToken;
2473   using GeneralCharsBase::newRegExpToken;
2474   using GeneralCharsBase::newSimpleToken;
2475   using SpecializedChars::getNonAsciiCodePoint;
2476   using SpecializedChars::getNonAsciiCodePointDontNormalize;
2477   using TokenStreamCharsShared::copyCharBufferTo;
2478   using TokenStreamCharsShared::drainCharBufferIntoAtom;
2479   using TokenStreamCharsShared::isAsciiCodePoint;
2480   // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(
2481   using CharsBase::toUnit;
2482   using GeneralCharsBase::ungetCodeUnit;
2483   using GeneralCharsBase::updateLineInfoForEOL;
2484 
2485   template <typename CharU>
2486   friend class TokenStreamPosition;
2487 
2488  public:
2489   TokenStreamSpecific(JSContext* cx, ParserAtomsTable* parserAtoms,
2490                       const JS::ReadOnlyCompileOptions& options,
2491                       const Unit* units, size_t length);
2492 
2493   /**
2494    * Get the next code point, converting LineTerminatorSequences to '\n' and
2495    * updating internal line-counter state if needed.  Return true on success
2496    * and store the code point in |*cp|.  Return false and leave |*cp|
2497    * undefined on failure.
2498    */
2499   [[nodiscard]] bool getCodePoint(int32_t* cp);
2500 
2501   // If there is an invalid escape in a template, report it and return false,
2502   // otherwise return true.
2503   bool checkForInvalidTemplateEscapeError() {
2504     if (anyCharsAccess().invalidTemplateEscapeType == InvalidEscapeType::None) {
2505       return true;
2506     }
2507 
2508     reportInvalidEscapeError(anyCharsAccess().invalidTemplateEscapeOffset,
2509                              anyCharsAccess().invalidTemplateEscapeType);
2510     return false;
2511   }
2512 
2513  public:
2514   // Implement ErrorReporter.
2515 
2516   void lineAndColumnAt(size_t offset, uint32_t* line,
2517                        uint32_t* column) const final {
2518     computeLineAndColumn(offset, line, column);
2519   }
2520 
2521   void currentLineAndColumn(uint32_t* line, uint32_t* column) const final {
2522     computeLineAndColumn(anyCharsAccess().currentToken().pos.begin, line,
2523                          column);
2524   }
2525 
2526   bool isOnThisLine(size_t offset, uint32_t lineNum,
2527                     bool* onThisLine) const final {
2528     return anyCharsAccess().srcCoords.isOnThisLine(offset, lineNum, onThisLine);
2529   }
2530 
2531   uint32_t lineAt(size_t offset) const final {
2532     const auto& anyChars = anyCharsAccess();
2533     auto lineToken = anyChars.lineToken(offset);
2534     return anyChars.lineNumber(lineToken);
2535   }
2536 
2537   uint32_t columnAt(size_t offset) const final {
2538     return computeColumn(anyCharsAccess().lineToken(offset), offset);
2539   }
2540 
2541   bool hasTokenizationStarted() const final;
2542 
2543   const char* getFilename() const final {
2544     return anyCharsAccess().getFilename();
2545   }
2546 
2547  private:
2548   // Implement ErrorReportMixin.
2549 
2550   JSContext* getContext() const override { return anyCharsAccess().cx; }
2551 
2552   [[nodiscard]] bool strictMode() const override {
2553     return anyCharsAccess().strictMode();
2554   }
2555 
2556  public:
2557   // Implement ErrorReportMixin.
2558 
2559   const JS::ReadOnlyCompileOptions& options() const final {
2560     return anyCharsAccess().options();
2561   }
2562 
2563   [[nodiscard]] bool computeErrorMetadata(
2564       ErrorMetadata* err, const ErrorOffset& errorOffset) override;
2565 
2566  private:
2567   void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
2568     switch (type) {
2569       case InvalidEscapeType::None:
2570         MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
2571         return;
2572       case InvalidEscapeType::Hexadecimal:
2573         errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
2574         return;
2575       case InvalidEscapeType::Unicode:
2576         errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
2577         return;
2578       case InvalidEscapeType::UnicodeOverflow:
2579         errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
2580         return;
2581       case InvalidEscapeType::Octal:
2582         errorAt(offset, JSMSG_DEPRECATED_OCTAL_ESCAPE);
2583         return;
2584       case InvalidEscapeType::EightOrNine:
2585         errorAt(offset, JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE);
2586         return;
2587     }
2588   }
2589 
2590   void reportIllegalCharacter(int32_t cp);
2591 
2592   [[nodiscard]] bool putIdentInCharBuffer(const Unit* identStart);
2593 
2594   using IsIntegerUnit = bool (*)(int32_t);
2595   [[nodiscard]] MOZ_ALWAYS_INLINE bool matchInteger(IsIntegerUnit isIntegerUnit,
2596                                                     int32_t* nextUnit);
2597   [[nodiscard]] MOZ_ALWAYS_INLINE bool matchIntegerAfterFirstDigit(
2598       IsIntegerUnit isIntegerUnit, int32_t* nextUnit);
2599 
2600   /**
2601    * Tokenize a decimal number that begins at |numStart| into the provided
2602    * token.
2603    *
2604    * |unit| must be one of these values:
2605    *
2606    *   1. The first decimal digit in the integral part of a decimal number
2607    *      not starting with '0' or '.', e.g. '1' for "17", '3' for "3.14", or
2608    *      '8' for "8.675309e6".
2609    *
2610    *   In this case, the next |getCodeUnit()| must return the code unit after
2611    *   |unit| in the overall number.
2612    *
2613    *   2. The '.' in a "."/"0."-prefixed decimal number or the 'e'/'E' in a
2614    *      "0e"/"0E"-prefixed decimal number, e.g. ".17", "0.42", or "0.1e3".
2615    *
2616    *   In this case, the next |getCodeUnit()| must return the code unit
2617    *   *after* the first decimal digit *after* the '.'.  So the next code
2618    *   unit would be '7' in ".17", '2' in "0.42", 'e' in "0.4e+8", or '/' in
2619    *   "0.5/2" (three separate tokens).
2620    *
2621    *   3. The code unit after the '0' where "0" is the entire number token.
2622    *
2623    *   In this case, the next |getCodeUnit()| would return the code unit
2624    *   after |unit|, but this function will never perform such call.
2625    *
2626    *   4. (Non-strict mode code only)  The first '8' or '9' in a "noctal"
2627    *      number that begins with a '0' but contains a non-octal digit in its
2628    *      integer part so is interpreted as decimal, e.g. '9' in "09.28" or
2629    *      '8' in "0386" or '9' in "09+7" (three separate tokens").
2630    *
2631    *   In this case, the next |getCodeUnit()| returns the code unit after
2632    *   |unit|: '.', '6', or '+' in the examples above.
2633    *
2634    * This interface is super-hairy and horribly stateful.  Unfortunately, its
2635    * hair merely reflects the intricacy of ECMAScript numeric literal syntax.
2636    * And incredibly, it *improves* on the goto-based horror that predated it.
2637    */
2638   [[nodiscard]] bool decimalNumber(int32_t unit, TokenStart start,
2639                                    const Unit* numStart, Modifier modifier,
2640                                    TokenKind* out);
2641 
2642   /** Tokenize a regular expression literal beginning at |start|. */
2643   [[nodiscard]] bool regexpLiteral(TokenStart start, TokenKind* out);
2644 
2645   /**
2646    * Slurp characters between |start| and sourceUnits.current() into
2647    * charBuffer, to later parse into a bigint.
2648    */
2649   [[nodiscard]] bool bigIntLiteral(TokenStart start, Modifier modifier,
2650                                    TokenKind* out);
2651 
2652  public:
2653   // Advance to the next token.  If the token stream encountered an error,
2654   // return false.  Otherwise return true and store the token kind in |*ttp|.
2655   [[nodiscard]] bool getToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2656     // Check for a pushed-back token resulting from mismatching lookahead.
2657     TokenStreamAnyChars& anyChars = anyCharsAccess();
2658     if (anyChars.lookahead != 0) {
2659       MOZ_ASSERT(!anyChars.flags.hadError);
2660       anyChars.lookahead--;
2661       anyChars.advanceCursor();
2662       TokenKind tt = anyChars.currentToken().type;
2663       MOZ_ASSERT(tt != TokenKind::Eol);
2664       verifyConsistentModifier(modifier, anyChars.currentToken());
2665       *ttp = tt;
2666       return true;
2667     }
2668 
2669     return getTokenInternal(ttp, modifier);
2670   }
2671 
2672   [[nodiscard]] bool peekToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2673     TokenStreamAnyChars& anyChars = anyCharsAccess();
2674     if (anyChars.lookahead > 0) {
2675       MOZ_ASSERT(!anyChars.flags.hadError);
2676       verifyConsistentModifier(modifier, anyChars.nextToken());
2677       *ttp = anyChars.nextToken().type;
2678       return true;
2679     }
2680     if (!getTokenInternal(ttp, modifier)) {
2681       return false;
2682     }
2683     anyChars.ungetToken();
2684     return true;
2685   }
2686 
2687   [[nodiscard]] bool peekTokenPos(TokenPos* posp,
2688                                   Modifier modifier = SlashIsDiv) {
2689     TokenStreamAnyChars& anyChars = anyCharsAccess();
2690     if (anyChars.lookahead == 0) {
2691       TokenKind tt;
2692       if (!getTokenInternal(&tt, modifier)) {
2693         return false;
2694       }
2695       anyChars.ungetToken();
2696       MOZ_ASSERT(anyChars.hasLookahead());
2697     } else {
2698       MOZ_ASSERT(!anyChars.flags.hadError);
2699       verifyConsistentModifier(modifier, anyChars.nextToken());
2700     }
2701     *posp = anyChars.nextToken().pos;
2702     return true;
2703   }
2704 
2705   [[nodiscard]] bool peekOffset(uint32_t* offset,
2706                                 Modifier modifier = SlashIsDiv) {
2707     TokenPos pos;
2708     if (!peekTokenPos(&pos, modifier)) {
2709       return false;
2710     }
2711     *offset = pos.begin;
2712     return true;
2713   }
2714 
2715   // This is like peekToken(), with one exception:  if there is an EOL
2716   // between the end of the current token and the start of the next token, it
2717   // return true and store Eol in |*ttp|.  In that case, no token with
2718   // Eol is actually created, just a Eol TokenKind is returned, and
2719   // currentToken() shouldn't be consulted.  (This is the only place Eol
2720   // is produced.)
2721   [[nodiscard]] MOZ_ALWAYS_INLINE bool peekTokenSameLine(
2722       TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2723     TokenStreamAnyChars& anyChars = anyCharsAccess();
2724     const Token& curr = anyChars.currentToken();
2725 
2726     // If lookahead != 0, we have scanned ahead at least one token, and
2727     // |lineno| is the line that the furthest-scanned token ends on.  If
2728     // it's the same as the line that the current token ends on, that's a
2729     // stronger condition than what we are looking for, and we don't need
2730     // to return Eol.
2731     if (anyChars.lookahead != 0) {
2732       bool onThisLine;
2733       if (!anyChars.srcCoords.isOnThisLine(curr.pos.end, anyChars.lineno,
2734                                            &onThisLine)) {
2735         error(JSMSG_OUT_OF_MEMORY);
2736         return false;
2737       }
2738 
2739       if (onThisLine) {
2740         MOZ_ASSERT(!anyChars.flags.hadError);
2741         verifyConsistentModifier(modifier, anyChars.nextToken());
2742         *ttp = anyChars.nextToken().type;
2743         return true;
2744       }
2745     }
2746 
2747     // The above check misses two cases where we don't have to return
2748     // Eol.
2749     // - The next token starts on the same line, but is a multi-line token.
2750     // - The next token starts on the same line, but lookahead==2 and there
2751     //   is a newline between the next token and the one after that.
2752     // The following test is somewhat expensive but gets these cases (and
2753     // all others) right.
2754     TokenKind tmp;
2755     if (!getToken(&tmp, modifier)) {
2756       return false;
2757     }
2758 
2759     const Token& next = anyChars.currentToken();
2760     anyChars.ungetToken();
2761 
2762     // Careful, |next| points to an initialized-but-not-allocated Token!
2763     // This is safe because we don't modify token data below.
2764 
2765     auto currentEndToken = anyChars.lineToken(curr.pos.end);
2766     auto nextBeginToken = anyChars.lineToken(next.pos.begin);
2767 
2768     *ttp =
2769         currentEndToken.isSameLine(nextBeginToken) ? next.type : TokenKind::Eol;
2770     return true;
2771   }
2772 
2773   // Get the next token from the stream if its kind is |tt|.
2774   [[nodiscard]] bool matchToken(bool* matchedp, TokenKind tt,
2775                                 Modifier modifier = SlashIsDiv) {
2776     TokenKind token;
2777     if (!getToken(&token, modifier)) {
2778       return false;
2779     }
2780     if (token == tt) {
2781       *matchedp = true;
2782     } else {
2783       anyCharsAccess().ungetToken();
2784       *matchedp = false;
2785     }
2786     return true;
2787   }
2788 
2789   void consumeKnownToken(TokenKind tt, Modifier modifier = SlashIsDiv) {
2790     bool matched;
2791     MOZ_ASSERT(anyCharsAccess().hasLookahead());
2792     MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier));
2793     MOZ_ALWAYS_TRUE(matched);
2794   }
2795 
2796   [[nodiscard]] bool nextTokenEndsExpr(bool* endsExpr) {
2797     TokenKind tt;
2798     if (!peekToken(&tt)) {
2799       return false;
2800     }
2801 
2802     *endsExpr = anyCharsAccess().isExprEnding[size_t(tt)];
2803     if (*endsExpr) {
2804       // If the next token ends an overall Expression, we'll parse this
2805       // Expression without ever invoking Parser::orExpr().  But we need that
2806       // function's DEBUG-only side effect of marking this token as safe to get
2807       // with SlashIsRegExp, so we have to do it manually here.
2808       anyCharsAccess().allowGettingNextTokenWithSlashIsRegExp();
2809     }
2810     return true;
2811   }
2812 
2813   [[nodiscard]] bool advance(size_t position);
2814 
2815   void seekTo(const Position& pos);
2816   [[nodiscard]] bool seekTo(const Position& pos,
2817                             const TokenStreamAnyChars& other);
2818 
2819   void rewind(const Position& pos) {
2820     MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
2821                "should be rewinding here");
2822     seekTo(pos);
2823   }
2824 
2825   [[nodiscard]] bool rewind(const Position& pos,
2826                             const TokenStreamAnyChars& other) {
2827     MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
2828                "should be rewinding here");
2829     return seekTo(pos, other);
2830   }
2831 
2832   void fastForward(const Position& pos) {
2833     MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
2834                "should be moving forward here");
2835     seekTo(pos);
2836   }
2837 
2838   [[nodiscard]] bool fastForward(const Position& pos,
2839                                  const TokenStreamAnyChars& other) {
2840     MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
2841                "should be moving forward here");
2842     return seekTo(pos, other);
2843   }
2844 
2845   const Unit* codeUnitPtrAt(size_t offset) const {
2846     return this->sourceUnits.codeUnitPtrAt(offset);
2847   }
2848 
2849   [[nodiscard]] bool identifierName(TokenStart start, const Unit* identStart,
2850                                     IdentifierEscapes escaping,
2851                                     Modifier modifier,
2852                                     NameVisibility visibility, TokenKind* out);
2853 
2854   [[nodiscard]] bool matchIdentifierStart(IdentifierEscapes* sawEscape);
2855 
2856   [[nodiscard]] bool getTokenInternal(TokenKind* const ttp,
2857                                       const Modifier modifier);
2858 
2859   [[nodiscard]] bool getStringOrTemplateToken(char untilChar, Modifier modifier,
2860                                               TokenKind* out);
2861 
2862   // Parse a TemplateMiddle or TemplateTail token (one of the string-like parts
2863   // of a template string) after already consuming the leading `RightCurly`.
2864   // (The spec says the `}` is the first character of the TemplateMiddle/
2865   // TemplateTail, but we treat it as a separate token because that's much
2866   // easier to implement in both TokenStream and the parser.)
2867   //
2868   // This consumes a token and sets the current token, like `getToken()`.  It
2869   // doesn't take a Modifier because there's no risk of encountering a division
2870   // operator or RegExp literal.
2871   //
2872   // On success, `*ttp` is either `TokenKind::TemplateHead` (if we got a
2873   // TemplateMiddle token) or `TokenKind::NoSubsTemplate` (if we got a
2874   // TemplateTail). That may seem strange; there are four different template
2875   // token types in the spec, but we only use two. We use `TemplateHead` for
2876   // TemplateMiddle because both end with `...${`, and `NoSubsTemplate` for
2877   // TemplateTail because both contain the end of the template, including the
2878   // closing quote mark. They're not treated differently, either in the parser
2879   // or in the tokenizer.
2880   [[nodiscard]] bool getTemplateToken(TokenKind* ttp) {
2881     MOZ_ASSERT(anyCharsAccess().currentToken().type == TokenKind::RightCurly);
2882     return getStringOrTemplateToken('`', SlashIsInvalid, ttp);
2883   }
2884 
2885   [[nodiscard]] bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
2886   [[nodiscard]] bool getDirective(
2887       bool isMultiline, bool shouldWarnDeprecated, const char* directive,
2888       uint8_t directiveLength, const char* errorMsgPragma,
2889       UniquePtr<char16_t[], JS::FreePolicy>* destination);
2890   [[nodiscard]] bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
2891   [[nodiscard]] bool getSourceMappingURL(bool isMultiline,
2892                                          bool shouldWarnDeprecated);
2893 };
2894 
2895 // It's preferable to define this in TokenStream.cpp, but its template-ness
2896 // means we'd then have to *instantiate* this constructor for all possible
2897 // (Unit, AnyCharsAccess) pairs -- and that gets super-messy as AnyCharsAccess
2898 // *itself* is templated.  This symbol really isn't that huge compared to some
2899 // defined inline in TokenStreamSpecific, so just rely on the linker commoning
2900 // stuff up.
2901 template <typename Unit>
2902 template <class AnyCharsAccess>
2903 inline TokenStreamPosition<Unit>::TokenStreamPosition(
2904     TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream)
2905     : currentToken(tokenStream.anyCharsAccess().currentToken()) {
2906   TokenStreamAnyChars& anyChars = tokenStream.anyCharsAccess();
2907 
2908   buf =
2909       tokenStream.sourceUnits.addressOfNextCodeUnit(/* allowPoisoned = */ true);
2910   flags = anyChars.flags;
2911   lineno = anyChars.lineno;
2912   linebase = anyChars.linebase;
2913   prevLinebase = anyChars.prevLinebase;
2914   lookahead = anyChars.lookahead;
2915   currentToken = anyChars.currentToken();
2916   for (unsigned i = 0; i < anyChars.lookahead; i++) {
2917     lookaheadTokens[i] = anyChars.tokens[anyChars.aheadCursor(1 + i)];
2918   }
2919 }
2920 
// Access policy passed as the |AnyCharsAccess| template argument of
// |TokenStreamSpecific| by |TokenStream|: given a pointer to a
// TokenStreamSpecific object, return the TokenStreamAnyChars that belongs to
// the same token stream.
//
// Both overloads are only declared here.  They are defined after
// |TokenStream| because they cast their argument to that class.
class TokenStreamAnyCharsAccess {
 public:
  // Mutable access.
  template <class TokenStreamSpecific>
  static inline TokenStreamAnyChars& anyChars(TokenStreamSpecific* tss);

  // Read-only access.
  template <class TokenStreamSpecific>
  static inline const TokenStreamAnyChars& anyChars(
      const TokenStreamSpecific* tss);
};
2930 
// The concrete token stream over char16_t source text.
//
// It combines the character-type-independent state (TokenStreamAnyChars)
// with the char16_t-specific tokenizing logic
// (TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess>) in one object.
class MOZ_STACK_CLASS TokenStream
    : public TokenStreamAnyChars,
      public TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess> {
  using Unit = char16_t;

 public:
  // |cx|, |options| and |smg| are forwarded to TokenStreamAnyChars; |cx|,
  // |parserAtoms|, |options| and the |units|/|length| source buffer are
  // forwarded to TokenStreamSpecific.
  TokenStream(JSContext* cx, ParserAtomsTable* parserAtoms,
              const JS::ReadOnlyCompileOptions& options, const Unit* units,
              size_t length, StrictModeGetter* smg)
      : TokenStreamAnyChars(cx, options, smg),
        TokenStreamSpecific<Unit, TokenStreamAnyCharsAccess>(
            cx, parserAtoms, options, units, length) {}
};
2944 
// A TokenStream constructed over no source text at all: it passes a null
// |parserAtoms|, a null |units| pointer with a |length| of zero, and a null
// StrictModeGetter to the TokenStream constructor.
class MOZ_STACK_CLASS DummyTokenStream final : public TokenStream {
 public:
  DummyTokenStream(JSContext* cx, const JS::ReadOnlyCompileOptions& options)
      : TokenStream(cx, nullptr, options, nullptr, 0, nullptr) {}
};
2950 
2951 template <class TokenStreamSpecific>
2952 /* static */ inline TokenStreamAnyChars& TokenStreamAnyCharsAccess::anyChars(
2953     TokenStreamSpecific* tss) {
2954   auto* ts = static_cast<TokenStream*>(tss);
2955   return *static_cast<TokenStreamAnyChars*>(ts);
2956 }
2957 
2958 template <class TokenStreamSpecific>
2959 /* static */ inline const TokenStreamAnyChars&
2960 TokenStreamAnyCharsAccess::anyChars(const TokenStreamSpecific* tss) {
2961   const auto* ts = static_cast<const TokenStream*>(tss);
2962   return *static_cast<const TokenStreamAnyChars*>(ts);
2963 }
2964 
// Declared here and defined elsewhere: returns a C string for |tt|.
// NOTE(review): going by the name, presumably a human-readable description
// of the token kind -- confirm against the definition.
extern const char* TokenKindToDesc(TokenKind tt);
2966 
2967 }  // namespace frontend
2968 }  // namespace js
2969 
#ifdef DEBUG
// DEBUG-only helper, declared at global scope (after the js::frontend
// namespace has been closed), hence the fully qualified parameter type.
// NOTE(review): going by the name, presumably returns the name of the
// TokenKind enumerator -- confirm against the definition.
extern const char* TokenKindToString(js::frontend::TokenKind tt);
#endif
2973 
2974 #endif /* frontend_TokenStream_h */
2975