1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 /*
8 * Streaming access to the raw tokens of JavaScript source.
9 *
10 * Because JS tokenization is context-sensitive -- a '/' could be either a
11 * regular expression *or* a division operator depending on context -- the
12 * various token stream classes are mostly not useful outside of the Parser
13 * where they reside. We should probably eventually merge the two concepts.
14 */
15 #ifndef frontend_TokenStream_h
16 #define frontend_TokenStream_h
17
18 /*
19 * [SMDOC] Parser Token Stream
20 *
21 * A token stream exposes the raw tokens -- operators, names, numbers,
22 * keywords, and so on -- of JavaScript source code.
23 *
24 * These are the components of the overall token stream concept:
25 * TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsBase<Unit>,
26 * TokenStreamChars<Unit>, and TokenStreamSpecific<Unit, AnyCharsAccess>.
27 *
28 * == TokenStreamShared → ∅ ==
29 *
30 * Certain aspects of tokenizing are used everywhere:
31 *
32 * * modifiers (used to select which context-sensitive interpretation of a
33 * character should be used to decide what token it is) and modifier
34 * assertion handling;
35 * * flags on the overall stream (have we encountered any characters on this
36 * line? have we hit a syntax error? and so on);
37 * * and certain token-count constants.
38 *
39 * These are all defined in TokenStreamShared. (They could be namespace-
40 * scoped, but it seems tentatively better not to clutter the namespace.)
41 *
42 * == TokenStreamAnyChars → TokenStreamShared ==
43 *
44 * Certain aspects of tokenizing have meaning independent of the character type
45 * of the source text being tokenized: line/column number information, tokens
46 * in lookahead from determining the meaning of a prior token, compilation
47 * options, the filename, flags, source map URL, access to details of the
48 * current and next tokens (is the token of the given type? what name or
49 * number is contained in the token? and other queries), and others.
50 *
51 * All this data/functionality *could* be duplicated for both single-byte and
52 * double-byte tokenizing, but there are two problems. First, it's potentially
 * wasteful if the compiler doesn't recognize it can unify the concepts.  (And
54 * if any-character concepts are intermixed with character-specific concepts,
55 * potentially the compiler *can't* unify them because offsets into the
56 * hypothetical TokenStream<Unit>s would differ.) Second, some of this stuff
57 * needs to be accessible in ParserBase, the aspects of JS language parsing
58 * that have meaning independent of the character type of the source text being
59 * parsed. So we need a separate data structure that ParserBase can hold on to
60 * for it. (ParserBase isn't the only instance of this, but it's certainly the
61 * biggest case of it.) Ergo, TokenStreamAnyChars.
62 *
63 * == TokenStreamCharsShared → ∅ ==
64 *
65 * Some functionality has meaning independent of character type, yet has no use
66 * *unless* you know the character type in actual use. It *could* live in
67 * TokenStreamAnyChars, but it makes more sense to live in a separate class
68 * that character-aware token information can simply inherit.
69 *
70 * This class currently exists only to contain a char16_t buffer, transiently
71 * used to accumulate strings in tricky cases that can't just be read directly
72 * from source text. It's not used outside character-aware tokenizing, so it
73 * doesn't make sense in TokenStreamAnyChars.
74 *
75 * == TokenStreamCharsBase<Unit> → TokenStreamCharsShared ==
76 *
77 * Certain data structures in tokenizing are character-type-specific: namely,
78 * the various pointers identifying the source text (including current offset
79 * and end).
80 *
81 * Additionally, some functions operating on this data are defined the same way
82 * no matter what character type you have (e.g. current offset in code units
83 * into the source text) or share a common interface regardless of character
84 * type (e.g. consume the next code unit if it has a given value).
85 *
86 * All such functionality lives in TokenStreamCharsBase<Unit>.
87 *
88 * == SpecializedTokenStreamCharsBase<Unit> → TokenStreamCharsBase<Unit> ==
89 *
90 * Certain tokenizing functionality is specific to a single character type.
91 * For example, JS's UTF-16 encoding recognizes no coding errors, because lone
92 * surrogates are not an error; but a UTF-8 encoding must recognize a variety
93 * of validation errors. Such functionality is defined only in the appropriate
94 * SpecializedTokenStreamCharsBase specialization.
95 *
96 * == GeneralTokenStreamChars<Unit, AnyCharsAccess> →
97 * SpecializedTokenStreamCharsBase<Unit> ==
98 *
99 * Some functionality operates differently on different character types, just
100 * as for TokenStreamCharsBase, but additionally requires access to character-
101 * type-agnostic information in TokenStreamAnyChars. For example, getting the
102 * next character performs different steps for different character types and
103 * must access TokenStreamAnyChars to update line break information.
104 *
105 * Such functionality, if it can be defined using the same algorithm for all
106 * character types, lives in GeneralTokenStreamChars<Unit, AnyCharsAccess>.
107 * The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
108 * instance to access its corresponding TokenStreamAnyChars, without inheriting
109 * from it.
110 *
111 * GeneralTokenStreamChars<Unit, AnyCharsAccess> is just functionality, no
112 * actual member data.
113 *
114 * Such functionality all lives in TokenStreamChars<Unit, AnyCharsAccess>, a
115 * declared-but-not-defined template class whose specializations have a common
116 * public interface (plus whatever private helper functions are desirable).
117 *
118 * == TokenStreamChars<Unit, AnyCharsAccess> →
119 * GeneralTokenStreamChars<Unit, AnyCharsAccess> ==
120 *
121 * Some functionality is like that in GeneralTokenStreamChars, *but* it's
122 * defined entirely differently for different character types.
123 *
124 * For example, consider "match a multi-code unit code point" (hypothetically:
125 * we've only implemented two-byte tokenizing right now):
126 *
127 * * For two-byte text, there must be two code units to get, the leading code
128 * unit must be a UTF-16 lead surrogate, and the trailing code unit must be
129 * a UTF-16 trailing surrogate. (If any of these fail to hold, a next code
130 * unit encodes that code point and is not multi-code unit.)
131 * * For single-byte Latin-1 text, there are no multi-code unit code points.
132 * * For single-byte UTF-8 text, the first code unit must have N > 1 of its
133 * highest bits set (and the next unset), and |N - 1| successive code units
134 * must have their high bit set and next-highest bit unset, *and*
135 * concatenating all unconstrained bits together must not produce a code
136 * point value that could have been encoded in fewer code units.
137 *
138 * This functionality can't be implemented as member functions in
139 * GeneralTokenStreamChars because we'd need to *partially specialize* those
140 * functions -- hold Unit constant while letting AnyCharsAccess vary. But
141 * C++ forbids function template partial specialization like this: either you
142 * fix *all* parameters or you fix none of them.
143 *
144 * Fortunately, C++ *does* allow *class* template partial specialization. So
145 * TokenStreamChars is a template class with one specialization per Unit.
146 * Functions can be defined differently in the different specializations,
147 * because AnyCharsAccess as the only template parameter on member functions
148 * *can* vary.
149 *
150 * All TokenStreamChars<Unit, AnyCharsAccess> specializations, one per Unit,
151 * are just functionality, no actual member data.
152 *
153 * == TokenStreamSpecific<Unit, AnyCharsAccess> →
154 * TokenStreamChars<Unit, AnyCharsAccess>, TokenStreamShared,
155 * ErrorReporter ==
156 *
157 * TokenStreamSpecific is operations that are parametrized on character type
158 * but implement the *general* idea of tokenizing, without being intrinsically
159 * tied to character type. Notably, this includes all operations that can
160 * report warnings or errors at particular offsets, because we include a line
161 * of context with such errors -- and that necessarily accesses the raw
162 * characters of their specific type.
163 *
164 * Much TokenStreamSpecific operation depends on functionality in
165 * TokenStreamAnyChars. The obvious solution is to inherit it -- but this
166 * doesn't work in Parser: its ParserBase base class needs some
167 * TokenStreamAnyChars functionality without knowing character type.
168 *
169 * The AnyCharsAccess type parameter is a class that statically converts from a
170 * TokenStreamSpecific* to its corresponding TokenStreamAnyChars. The
171 * TokenStreamSpecific in Parser<ParseHandler, Unit> can then specify a class
172 * that properly converts from TokenStreamSpecific Parser::tokenStream to
173 * TokenStreamAnyChars ParserBase::anyChars.
174 *
175 * Could we hardcode one set of offset calculations for this and eliminate
176 * AnyCharsAccess? No. Offset calculations possibly could be hardcoded if
177 * TokenStreamSpecific were present in Parser before Parser::handler, assuring
178 * the same offsets in all Parser-related cases. But there's still a separate
179 * TokenStream class, that requires different offset calculations. So even if
180 * we wanted to hardcode this (it's not clear we would, because forcing the
181 * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
182 */
183
#include "mozilla/ArrayUtils.h"
#include "mozilla/Assertions.h"
#include "mozilla/Attributes.h"
#include "mozilla/Casting.h"
#include "mozilla/Maybe.h"
#include "mozilla/MemoryChecking.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"

#include <algorithm>
#include <limits>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <type_traits>

#include "jspubtd.h"

#include "frontend/ErrorReporter.h"
#include "frontend/ParserAtom.h"  // ParserAtom, ParserAtomsTable, TaggedParserAtomIndex
#include "frontend/Token.h"
#include "frontend/TokenKind.h"
#include "js/CompileOptions.h"
#include "js/friend/ErrorMessages.h"  // JSMSG_*
#include "js/HashTable.h"             // js::HashMap
#include "js/RegExpFlags.h"           // JS::RegExpFlags
#include "js/UniquePtr.h"
#include "js/Vector.h"
#include "util/Unicode.h"
#include "vm/ErrorReporting.h"
215
216 struct JS_PUBLIC_API JSContext;
217 struct KeywordInfo;
218
219 namespace js {
220
221 namespace frontend {
222
223 // Saturate column number at a limit that can be represented in various parts of
224 // the engine. Source locations beyond this point will report at the limit
225 // column instead.
226 //
227 // See:
228 // - TokenStreamAnyChars::checkOptions
229 // - ColSpan::isRepresentable
230 // - WasmFrameIter::computeLine
231 static constexpr uint32_t ColumnLimit = std::numeric_limits<int32_t>::max() / 2;
232
233 // If `name` is reserved word, returns the TokenKind of it.
234 // TokenKind::Limit otherwise.
235 extern TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name);
236
237 // If `name` is reserved word, returns string representation of it.
238 // nullptr otherwise.
239 extern const char* ReservedWordToCharZ(TaggedParserAtomIndex name);
240
241 // If `tt` is reserved word, returns string representation of it.
242 // nullptr otherwise.
243 extern const char* ReservedWordToCharZ(TokenKind tt);
244
/**
 * The kinds of deprecated source content that the tokenizer can observe, so
 * the parser can later report them.  Only the most recent kind seen since the
 * last reset is tracked (see TokenStreamFlags::sawDeprecatedContent).
 */
enum class DeprecatedContent : uint8_t {
  // No deprecated content was present.
  None = 0,
  // Octal literal not prefixed by "0o" but rather by just "0", e.g. 0755.
  OctalLiteral,
  // Octal character escape, e.g. "hell\157 world".
  OctalEscape,
  // NonOctalDecimalEscape, i.e. "\8" or "\9".
  EightOrNineEscape,
};
255
256 struct TokenStreamFlags {
257 // Hit end of file.
258 bool isEOF : 1;
259 // Non-whitespace since start of line.
260 bool isDirtyLine : 1;
261 // Hit a syntax error, at start or during a token.
262 bool hadError : 1;
263
264 // The nature of any deprecated content seen since last reset.
265 // We have to uint8_t instead DeprecatedContent to work around a GCC 7 bug.
266 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414
267 uint8_t sawDeprecatedContent : 2;
268
TokenStreamFlagsTokenStreamFlags269 TokenStreamFlags()
270 : isEOF(false),
271 isDirtyLine(false),
272 hadError(false),
273 sawDeprecatedContent(uint8_t(DeprecatedContent::None)) {}
274 };
275
276 template <typename Unit>
277 class TokenStreamPosition;
278
279 /**
280 * TokenStream types and constants that are used in both TokenStreamAnyChars
281 * and TokenStreamSpecific. Do not add any non-static data members to this
282 * class!
283 */
284 class TokenStreamShared {
285 protected:
286 static constexpr size_t ntokens = 4; // 1 current + 2 lookahead, rounded
287 // to power of 2 to avoid divmod by 3
288
289 static constexpr unsigned ntokensMask = ntokens - 1;
290
291 template <typename Unit>
292 friend class TokenStreamPosition;
293
294 public:
295 static constexpr unsigned maxLookahead = 2;
296
297 using Modifier = Token::Modifier;
298 static constexpr Modifier SlashIsDiv = Token::SlashIsDiv;
299 static constexpr Modifier SlashIsRegExp = Token::SlashIsRegExp;
300 static constexpr Modifier SlashIsInvalid = Token::SlashIsInvalid;
301
verifyConsistentModifier(Modifier modifier,const Token & nextToken)302 static void verifyConsistentModifier(Modifier modifier,
303 const Token& nextToken) {
304 MOZ_ASSERT(
305 modifier == nextToken.modifier || modifier == SlashIsInvalid,
306 "This token was scanned with both SlashIsRegExp and SlashIsDiv, "
307 "indicating the parser is confused about how to handle a slash here. "
308 "See comment at Token::Modifier.");
309 }
310 };
311
312 static_assert(std::is_empty_v<TokenStreamShared>,
313 "TokenStreamShared shouldn't bloat classes that inherit from it");
314
315 template <typename Unit, class AnyCharsAccess>
316 class TokenStreamSpecific;
317
318 template <typename Unit>
319 class MOZ_STACK_CLASS TokenStreamPosition final {
320 public:
321 template <class AnyCharsAccess>
322 inline explicit TokenStreamPosition(
323 TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream);
324
325 private:
326 TokenStreamPosition(const TokenStreamPosition&) = delete;
327
328 // Technically only TokenStreamSpecific<Unit, AnyCharsAccess>::seek with
329 // Unit constant and AnyCharsAccess varying must be friended, but 1) it's
330 // hard to friend one function in template classes, and 2) C++ doesn't
331 // allow partial friend specialization to target just that single class.
332 template <typename Char, class AnyCharsAccess>
333 friend class TokenStreamSpecific;
334
335 const Unit* buf;
336 TokenStreamFlags flags;
337 unsigned lineno;
338 size_t linebase;
339 size_t prevLinebase;
340 Token currentToken;
341 unsigned lookahead;
342 Token lookaheadTokens[TokenStreamShared::maxLookahead];
343 };
344
345 template <typename Unit>
346 class SourceUnits;
347
348 /**
349 * This class maps:
350 *
351 * * a sourceUnits offset (a 0-indexed count of code units)
352 *
353 * to
354 *
355 * * a (1-indexed) line number and
356 * * a (0-indexed) offset in code *units* (not code points, not bytes) into
357 * that line,
358 *
359 * for either |Unit = Utf8Unit| or |Unit = char16_t|.
360 *
361 * Note that the latter quantity is *not* the same as a column number, which is
362 * a count of code *points*. Computing a column number requires the offset
363 * within the line and the source units of that line (including what type |Unit|
364 * is, to know how to decode them). If you need a column number, functions in
365 * |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
366 * it.
367 */
368 class SourceCoords {
369 // For a given buffer holding source code, |lineStartOffsets_| has one
370 // element per line of source code, plus one sentinel element. Each
371 // non-sentinel element holds the buffer offset for the start of the
372 // corresponding line of source code. For this example script,
373 // assuming an initialLineOffset of 0:
374 //
375 // 1 // xyz [line starts at offset 0]
376 // 2 var x; [line starts at offset 7]
377 // 3 [line starts at offset 14]
378 // 4 var y; [line starts at offset 15]
379 //
380 // |lineStartOffsets_| is:
381 //
382 // [0, 7, 14, 15, MAX_PTR]
383 //
384 // To convert a "line number" to an "index" into |lineStartOffsets_|,
385 // subtract |initialLineNum_|. E.g. line 3's index is
386 // (3 - initialLineNum_), which is 2. Therefore lineStartOffsets_[2]
387 // holds the buffer offset for the start of line 3, which is 14. (Note
388 // that |initialLineNum_| is often 1, but not always.
389 //
390 // The first element is always initialLineOffset, passed to the
391 // constructor, and the last element is always the MAX_PTR sentinel.
392 //
393 // Offset-to-{line,offset-into-line} lookups are O(log n) in the worst
394 // case (binary search), but in practice they're heavily clustered and
395 // we do better than that by using the previous lookup's result
396 // (lastIndex_) as a starting point.
397 //
398 // Checking if an offset lies within a particular line number
399 // (isOnThisLine()) is O(1).
400 //
401 Vector<uint32_t, 128> lineStartOffsets_;
402
403 /** The line number on which the source text begins. */
404 uint32_t initialLineNum_;
405
406 /**
407 * The index corresponding to the last offset lookup -- used so that if
408 * offset lookups proceed in increasing order, and and the offset appears
409 * in the next couple lines from the last offset, we can avoid a full
410 * binary-search.
411 *
412 * This is mutable because it's modified on every search, but that fact
413 * isn't visible outside this class.
414 */
415 mutable uint32_t lastIndex_;
416
417 uint32_t indexFromOffset(uint32_t offset) const;
418
419 static const uint32_t MAX_PTR = UINT32_MAX;
420
lineNumberFromIndex(uint32_t index)421 uint32_t lineNumberFromIndex(uint32_t index) const {
422 return index + initialLineNum_;
423 }
424
indexFromLineNumber(uint32_t lineNum)425 uint32_t indexFromLineNumber(uint32_t lineNum) const {
426 return lineNum - initialLineNum_;
427 }
428
429 public:
430 SourceCoords(JSContext* cx, uint32_t initialLineNumber,
431 uint32_t initialOffset);
432
433 [[nodiscard]] bool add(uint32_t lineNum, uint32_t lineStartOffset);
434 [[nodiscard]] bool fill(const SourceCoords& other);
435
isOnThisLine(uint32_t offset,uint32_t lineNum,bool * onThisLine)436 bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
437 uint32_t index = indexFromLineNumber(lineNum);
438 if (index + 1 >= lineStartOffsets_.length()) { // +1 due to sentinel
439 return false;
440 }
441 *onThisLine = lineStartOffsets_[index] <= offset &&
442 offset < lineStartOffsets_[index + 1];
443 return true;
444 }
445
446 /**
447 * A token, computed for an offset in source text, that can be used to
448 * access line number and line-offset information for that offset.
449 *
450 * LineToken *alone* exposes whether the corresponding offset is in the
451 * the first line of source (which may not be 1, depending on
452 * |initialLineNumber|), and whether it's in the same line as
453 * another LineToken.
454 */
455 class LineToken {
456 uint32_t index;
457 #ifdef DEBUG
458 uint32_t offset_; // stored for consistency-of-use assertions
459 #endif
460
461 friend class SourceCoords;
462
463 public:
LineToken(uint32_t index,uint32_t offset)464 LineToken(uint32_t index, uint32_t offset)
465 : index(index)
466 #ifdef DEBUG
467 ,
468 offset_(offset)
469 #endif
470 {
471 }
472
isFirstLine()473 bool isFirstLine() const { return index == 0; }
474
isSameLine(LineToken other)475 bool isSameLine(LineToken other) const { return index == other.index; }
476
assertConsistentOffset(uint32_t offset)477 void assertConsistentOffset(uint32_t offset) const {
478 MOZ_ASSERT(offset_ == offset);
479 }
480 };
481
482 /**
483 * Compute a token usable to access information about the line at the
484 * given offset.
485 *
486 * The only information directly accessible in a token is whether it
487 * corresponds to the first line of source text (which may not be line
488 * 1, depending on the |initialLineNumber| value used to construct
489 * this). Use |lineNumber(LineToken)| to compute the actual line
490 * number (incorporating the contribution of |initialLineNumber|).
491 */
492 LineToken lineToken(uint32_t offset) const;
493
494 /** Compute the line number for the given token. */
lineNumber(LineToken lineToken)495 uint32_t lineNumber(LineToken lineToken) const {
496 return lineNumberFromIndex(lineToken.index);
497 }
498
499 /** Return the offset of the start of the line for |lineToken|. */
lineStart(LineToken lineToken)500 uint32_t lineStart(LineToken lineToken) const {
501 MOZ_ASSERT(lineToken.index + 1 < lineStartOffsets_.length(),
502 "recorded line-start information must be available");
503 return lineStartOffsets_[lineToken.index];
504 }
505 };
506
/**
 * Whether a chunk of source text may contain multi-code-unit code points, or
 * is known to contain only single-unit ones (see ChunkInfo below).
 */
enum class UnitsType : unsigned char {
  PossiblyMultiUnit = 0,
  GuaranteedSingleUnit = 1,
};
511
512 class ChunkInfo {
513 private:
514 // Store everything in |unsigned char|s so everything packs.
515 unsigned char column_[sizeof(uint32_t)];
516 unsigned char unitsType_;
517
518 public:
ChunkInfo(uint32_t col,UnitsType type)519 ChunkInfo(uint32_t col, UnitsType type)
520 : unitsType_(static_cast<unsigned char>(type)) {
521 memcpy(column_, &col, sizeof(col));
522 }
523
column()524 uint32_t column() const {
525 uint32_t col;
526 memcpy(&col, column_, sizeof(uint32_t));
527 return col;
528 }
529
unitsType()530 UnitsType unitsType() const {
531 MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
532 return static_cast<UnitsType>(unitsType_);
533 }
534
guaranteeSingleUnits()535 void guaranteeSingleUnits() {
536 MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
537 "should only be setting to possibly optimize from the "
538 "pessimistic case");
539 unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
540 }
541 };
542
/** The kinds of invalid character escape that can appear in a token. */
enum class InvalidEscapeType {
  // No invalid character escapes.
  None,
  // A malformed \x escape.
  Hexadecimal,
  // A malformed \u escape.
  Unicode,
  // An otherwise well-formed \u escape which represents a
  // codepoint > 10FFFF.
  UnicodeOverflow,
  // An octal escape in a template token.
  Octal,
  // NonOctalDecimalEscape - \8 or \9.
  EightOrNine
};
558
559 class TokenStreamAnyChars : public TokenStreamShared {
560 private:
561 // Constant-at-construction fields.
562
563 JSContext* const cx;
564
565 /** Options used for parsing/tokenizing. */
566 const JS::ReadOnlyCompileOptions& options_;
567
568 /**
569 * Pointer used internally to test whether in strict mode. Use |strictMode()|
570 * instead of this field.
571 */
572 StrictModeGetter* const strictModeGetter_;
573
574 /** Input filename or null. */
575 const char* const filename_;
576
577 // Column number computation fields.
578
579 /**
580 * A map of (line number => sequence of the column numbers at
581 * |ColumnChunkLength|-unit boundaries rewound [if needed] to the nearest code
582 * point boundary). (|TokenStreamAnyChars::computePartialColumn| is the sole
583 * user of |ColumnChunkLength| and therefore contains its definition.)
584 *
585 * Entries appear in this map only when a column computation of sufficient
586 * distance is performed on a line -- and only when the column is beyond the
587 * first |ColumnChunkLength| units. Each line's vector is lazily filled as
588 * greater offsets require column computations.
589 */
590 mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
591
592 // Computing accurate column numbers requires at *some* point linearly
593 // iterating through prior source units in the line, to properly account for
594 // multi-unit code points. This is quadratic if counting happens repeatedly.
595 //
596 // But usually we need columns for advancing offsets through scripts. By
597 // caching the last ((line number, offset) => relative column) mapping (in
598 // similar manner to how |SourceCoords::lastIndex_| is used to cache
599 // (offset => line number) mappings) we can usually avoid re-iterating through
600 // the common line prefix.
601 //
602 // Additionally, we avoid hash table lookup costs by caching the
603 // |Vector<ChunkInfo>*| for the line of the last lookup. (|nullptr| means we
604 // must look it up -- or it hasn't been created yet.) This pointer is nulled
605 // when a lookup on a new line occurs, but as it's not a pointer at literal,
606 // reallocatable element data, it's *not* invalidated when new entries are
607 // added to such a vector.
608
609 /**
610 * The line in which the last column computation occurred, or UINT32_MAX if
611 * no prior computation has yet happened.
612 */
613 mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
614
615 /**
616 * The chunk vector of the line for that last column computation. This is
617 * null if the chunk vector needs to be recalculated or initially created.
618 */
619 mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
620
621 /**
622 * The offset (in code units) of the last column computation performed,
623 * relative to source start.
624 */
625 mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
626
627 /**
628 * The column number for the offset (in code units) of the last column
629 * computation performed, relative to source start.
630 */
631 mutable uint32_t lastComputedColumn_ = 0;
632
633 // Intra-token fields.
634
635 /**
636 * The offset of the first invalid escape in a template literal. (If there is
637 * one -- if not, the value of this field is meaningless.)
638 *
639 * See also |invalidTemplateEscapeType|.
640 */
641 uint32_t invalidTemplateEscapeOffset = 0;
642
643 /**
644 * The type of the first invalid escape in a template literal. (If there
645 * isn't one, this will be |None|.)
646 *
647 * See also |invalidTemplateEscapeOffset|.
648 */
649 InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
650
651 // Fields with values relevant across tokens (and therefore potentially across
652 // function boundaries, such that lazy function parsing and stream-seeking
653 // must take care in saving and restoring them).
654
655 /** Line number and offset-to-line mapping information. */
656 SourceCoords srcCoords;
657
658 /** Circular token buffer of gotten tokens that have been ungotten. */
659 Token tokens[ntokens] = {};
660
661 /** The index in |tokens| of the last parsed token. */
662 unsigned cursor_ = 0;
663
664 /** The number of tokens in |tokens| available to be gotten. */
665 unsigned lookahead = 0;
666
667 /** The current line number. */
668 unsigned lineno;
669
670 /** Various flag bits (see above). */
671 TokenStreamFlags flags = {};
672
673 /** The offset of the start of the current line. */
674 size_t linebase = 0;
675
676 /** The start of the previous line, or |size_t(-1)| on the first line. */
677 size_t prevLinebase = size_t(-1);
678
679 /** The user's requested source URL. Null if none has been set. */
680 UniqueTwoByteChars displayURL_ = nullptr;
681
682 /** The URL of the source map for this script. Null if none has been set. */
683 UniqueTwoByteChars sourceMapURL_ = nullptr;
684
685 // Assorted boolean fields, none of which require maintenance across tokens,
686 // stored at class end to minimize padding.
687
688 /**
689 * Whether syntax errors should or should not contain details about the
690 * precise nature of the error. (This is intended for use in suppressing
691 * content-revealing details about syntax errors in cross-origin scripts on
692 * the web.)
693 */
694 const bool mutedErrors;
695
696 /**
697 * An array storing whether a TokenKind observed while attempting to extend
698 * a valid AssignmentExpression into an even longer AssignmentExpression
699 * (e.g., extending '3' to '3 + 5') will terminate it without error.
700 *
701 * For example, ';' always ends an AssignmentExpression because it ends a
702 * Statement or declaration. '}' always ends an AssignmentExpression
703 * because it terminates BlockStatement, FunctionBody, and embedded
704 * expressions in TemplateLiterals. Therefore both entries are set to true
705 * in TokenStreamAnyChars construction.
706 *
707 * But e.g. '+' *could* extend an AssignmentExpression, so its entry here
708 * is false. Meanwhile 'this' can't extend an AssignmentExpression, but
709 * it's only valid after a line break, so its entry here must be false.
710 *
711 * NOTE: This array could be static, but without C99's designated
712 * initializers it's easier zeroing here and setting the true entries
713 * in the constructor body. (Having this per-instance might also aid
714 * locality.) Don't worry! Initialization time for each TokenStream
715 * is trivial. See bug 639420.
716 */
717 bool isExprEnding[size_t(TokenKind::Limit)] = {}; // all-false initially
718
719 // End of fields.
720
721 public:
722 TokenStreamAnyChars(JSContext* cx, const JS::ReadOnlyCompileOptions& options,
723 StrictModeGetter* smg);
724
725 template <typename Unit, class AnyCharsAccess>
726 friend class GeneralTokenStreamChars;
727 template <typename Unit, class AnyCharsAccess>
728 friend class TokenStreamChars;
729 template <typename Unit, class AnyCharsAccess>
730 friend class TokenStreamSpecific;
731
732 template <typename Unit>
733 friend class TokenStreamPosition;
734
735 // Accessors.
cursor()736 unsigned cursor() const { return cursor_; }
nextCursor()737 unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
aheadCursor(unsigned steps)738 unsigned aheadCursor(unsigned steps) const {
739 return (cursor_ + steps) & ntokensMask;
740 }
741
currentToken()742 const Token& currentToken() const { return tokens[cursor()]; }
isCurrentTokenType(TokenKind type)743 bool isCurrentTokenType(TokenKind type) const {
744 return currentToken().type == type;
745 }
746
747 [[nodiscard]] bool checkOptions();
748
749 private:
750 TaggedParserAtomIndex reservedWordToPropertyName(TokenKind tt) const;
751
752 public:
currentName()753 TaggedParserAtomIndex currentName() const {
754 if (isCurrentTokenType(TokenKind::Name) ||
755 isCurrentTokenType(TokenKind::PrivateName)) {
756 return currentToken().name();
757 }
758
759 MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
760 return reservedWordToPropertyName(currentToken().type);
761 }
762
currentNameHasEscapes(ParserAtomsTable & parserAtoms)763 bool currentNameHasEscapes(ParserAtomsTable& parserAtoms) const {
764 if (isCurrentTokenType(TokenKind::Name) ||
765 isCurrentTokenType(TokenKind::PrivateName)) {
766 TokenPos pos = currentToken().pos;
767 return (pos.end - pos.begin) != parserAtoms.length(currentToken().name());
768 }
769
770 MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
771 return false;
772 }
773
isCurrentTokenAssignment()774 bool isCurrentTokenAssignment() const {
775 return TokenKindIsAssignment(currentToken().type);
776 }
777
778 // Flag methods.
isEOF()779 bool isEOF() const { return flags.isEOF; }
hadError()780 bool hadError() const { return flags.hadError; }
781
sawDeprecatedContent()782 DeprecatedContent sawDeprecatedContent() const {
783 return static_cast<DeprecatedContent>(flags.sawDeprecatedContent);
784 }
785
786 private:
787 // Workaround GCC 7 sadness.
setSawDeprecatedContent(DeprecatedContent content)788 void setSawDeprecatedContent(DeprecatedContent content) {
789 flags.sawDeprecatedContent = static_cast<uint8_t>(content);
790 }
791
792 public:
clearSawDeprecatedContent()793 void clearSawDeprecatedContent() {
794 setSawDeprecatedContent(DeprecatedContent::None);
795 }
setSawDeprecatedOctalLiteral()796 void setSawDeprecatedOctalLiteral() {
797 setSawDeprecatedContent(DeprecatedContent::OctalLiteral);
798 }
setSawDeprecatedOctalEscape()799 void setSawDeprecatedOctalEscape() {
800 setSawDeprecatedContent(DeprecatedContent::OctalEscape);
801 }
setSawDeprecatedEightOrNineEscape()802 void setSawDeprecatedEightOrNineEscape() {
803 setSawDeprecatedContent(DeprecatedContent::EightOrNineEscape);
804 }
805
hasInvalidTemplateEscape()806 bool hasInvalidTemplateEscape() const {
807 return invalidTemplateEscapeType != InvalidEscapeType::None;
808 }
clearInvalidTemplateEscape()809 void clearInvalidTemplateEscape() {
810 invalidTemplateEscapeType = InvalidEscapeType::None;
811 }
812
 private:
  // This is private because it should only be called by the tokenizer while
  // tokenizing, not by, for example, BytecodeEmitter.
  bool strictMode() const {
    return strictModeGetter_ && strictModeGetter_->strictMode();
  }

  // Record the offset and kind of an invalid template-literal escape.  Only
  // the first invalid escape is recorded; later ones are ignored.
  void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
    MOZ_ASSERT(type != InvalidEscapeType::None);
    if (invalidTemplateEscapeType != InvalidEscapeType::None) {
      return;
    }
    invalidTemplateEscapeOffset = offset;
    invalidTemplateEscapeType = type;
  }
828
829 public:
830 // Call this immediately after parsing an OrExpression to allow scanning the
831 // next token with SlashIsRegExp without asserting (even though we just
832 // peeked at it in SlashIsDiv mode).
833 //
834 // It's OK to disable the assertion because the places where this is called
835 // have peeked at the next token in SlashIsDiv mode, and checked that it is
836 // *not* a Div token.
837 //
838 // To see why it is necessary to disable the assertion, consider these two
839 // programs:
840 //
841 // x = arg => q // per spec, this is all one statement, and the
842 // /a/g; // slashes are division operators
843 //
844 // x = arg => {} // per spec, ASI at the end of this line
845 // /a/g; // and that's a regexp literal
846 //
  // The first program shows why orExpr() has to use SlashIsDiv mode when
848 // ahead for the next operator after parsing `q`. The second program shows
849 // why matchOrInsertSemicolon() must use SlashIsRegExp mode when scanning
850 // ahead for a semicolon.
allowGettingNextTokenWithSlashIsRegExp()851 void allowGettingNextTokenWithSlashIsRegExp() {
852 #ifdef DEBUG
853 // Check the precondition: Caller already peeked ahead at the next token,
854 // in SlashIsDiv mode, and it is *not* a Div token.
855 MOZ_ASSERT(hasLookahead());
856 const Token& next = nextToken();
857 MOZ_ASSERT(next.modifier == SlashIsDiv);
858 MOZ_ASSERT(next.type != TokenKind::Div);
859 tokens[nextCursor()].modifier = SlashIsRegExp;
860 #endif
861 }
862
#ifdef DEBUG
  // True iff no lookahead tokens are currently buffered.
  inline bool debugHasNoLookahead() const { return lookahead == 0; }
#endif

  // Directive-supplied display URL, if one was recorded (see adoptState).
  bool hasDisplayURL() const { return displayURL_ != nullptr; }

  char16_t* displayURL() { return displayURL_.get(); }

  // Directive-supplied source-map URL, if one was recorded.
  bool hasSourceMapURL() const { return sourceMapURL_ != nullptr; }

  char16_t* sourceMapURL() { return sourceMapURL_.get(); }

  JSContext* context() const { return cx; }

  using LineToken = SourceCoords::LineToken;

  // Line/column queries, delegated to |srcCoords|.
  LineToken lineToken(uint32_t offset) const {
    return srcCoords.lineToken(offset);
  }

  uint32_t lineNumber(LineToken lineToken) const {
    return srcCoords.lineNumber(lineToken);
  }

  uint32_t lineStart(LineToken lineToken) const {
    return srcCoords.lineStart(lineToken);
  }

  /**
   * Fill in |err|.
   *
   * If the token stream doesn't have location info for this error, use the
   * caller's location (including line/column number) and return false. (No
   * line of context is set.)
   *
   * Otherwise fill in everything in |err| except 1) line/column numbers and
   * 2) line-of-context-related fields and return true. The caller *must*
   * fill in the line/column number; filling the line of context is optional.
   */
  bool fillExceptingContext(ErrorMetadata* err, uint32_t offset);

  // A freshly begun line is never dirty: nothing has been scanned on it yet.
  MOZ_ALWAYS_INLINE void updateFlagsForEOL() { flags.isDirtyLine = false; }
905
906 private:
907 /**
908 * Compute the "partial" column number in Unicode code points of the absolute
909 * |offset| within source text on the line of |lineToken| (which must have
910 * been computed from |offset|).
911 *
912 * A partial column number on a line that isn't the first line is just the
913 * actual column number. But a partial column number on the first line is the
914 * column number *ignoring the initial line/column of the script*. For
915 * example, consider this HTML with line/column number keys:
916 *
917 * 1 2 3
918 * 0123456789012345678901234 567890
919 * ------------------------------------
920 * 1 | <html>
921 * 2 | <head>
922 * 3 | <script>var x = 3; x < 4;
923 * 4 | const y = 7;</script>
924 * 5 | </head>
925 * 6 | <body></body>
926 * 7 | </html>
927 *
928 * The script would be compiled specifying initial (line, column) of (3, 10)
929 * using |JS::ReadOnlyCompileOptions::{lineno,column}|. And the column
930 * reported by |computeColumn| for the "v" of |var| would be 10. But the
931 * partial column number of the "v" in |var|, that this function returns,
932 * would be 0. On the other hand, the column reported by |computeColumn| and
933 * the partial column number returned by this function for the "c" in |const|
934 * would both be 0, because it's not in the first line of source text.
935 *
936 * The partial column is with respect *only* to the JavaScript source text as
   * SpiderMonkey sees it. In the example, the "&lt;" is converted to "<" by
   * the browser before SpiderMonkey would see it. So the partial column of the
   * "4" in the inequality would be 16, not 19.
940 *
941 * Code points are not all equal length, so counting requires *some* kind of
942 * linear-time counting from the start of the line. This function attempts
943 * various tricks to reduce this cost. If these optimizations succeed,
944 * repeated calls to this function on a line will pay a one-time cost linear
945 * in the length of the line, then each call pays a separate constant-time
946 * cost. If the optimizations do not succeed, this function works in time
947 * linear in the length of the line.
948 *
949 * It's unusual for a function in *this* class to be |Unit|-templated, but
950 * while this operation manages |Unit|-agnostic fields in this class and in
951 * |srcCoords|, it must *perform* |Unit|-sensitive computations to fill them.
952 * And this is the best place to do that.
953 */
954 template <typename Unit>
955 uint32_t computePartialColumn(const LineToken lineToken,
956 const uint32_t offset,
957 const SourceUnits<Unit>& sourceUnits) const;
958
959 /**
960 * Update line/column information for the start of a new line at
961 * |lineStartOffset|.
962 */
963 [[nodiscard]] MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(
964 uint32_t lineStartOffset);
965
966 public:
nextToken()967 const Token& nextToken() const {
968 MOZ_ASSERT(hasLookahead());
969 return tokens[nextCursor()];
970 }
971
hasLookahead()972 bool hasLookahead() const { return lookahead > 0; }
973
advanceCursor()974 void advanceCursor() { cursor_ = (cursor_ + 1) & ntokensMask; }
975
retractCursor()976 void retractCursor() { cursor_ = (cursor_ - 1) & ntokensMask; }
977
allocateToken()978 Token* allocateToken() {
979 advanceCursor();
980
981 Token* tp = &tokens[cursor()];
982 MOZ_MAKE_MEM_UNDEFINED(tp, sizeof(*tp));
983
984 return tp;
985 }
986
987 // Push the last scanned token back into the stream.
ungetToken()988 void ungetToken() {
989 MOZ_ASSERT(lookahead < maxLookahead);
990 lookahead++;
991 retractCursor();
992 }
993
994 public:
adoptState(TokenStreamAnyChars & other)995 void adoptState(TokenStreamAnyChars& other) {
996 // If |other| has fresh information from directives, overwrite any
997 // previously recorded directives. (There is no specification directing
998 // that last-in-source-order directive controls, sadly. We behave this way
999 // in the ordinary case, so we ought do so here too.)
1000 if (auto& url = other.displayURL_) {
1001 displayURL_ = std::move(url);
1002 }
1003 if (auto& url = other.sourceMapURL_) {
1004 sourceMapURL_ = std::move(url);
1005 }
1006 }
1007
1008 // Compute error metadata for an error at no offset.
1009 void computeErrorMetadataNoOffset(ErrorMetadata* err);
1010
1011 // ErrorReporter API Helpers
1012
1013 // Provide minimal set of error reporting API given we cannot use
1014 // ErrorReportMixin here. "report" prefix is added to avoid conflict with
1015 // ErrorReportMixin methods in TokenStream class.
1016 void reportErrorNoOffset(unsigned errorNumber, ...);
1017 void reportErrorNoOffsetVA(unsigned errorNumber, va_list* args);
1018
options()1019 const JS::ReadOnlyCompileOptions& options() const { return options_; }
1020
getFilename()1021 const char* getFilename() const { return filename_; }
1022 };
1023
CodeUnitValue(char16_t unit)1024 constexpr char16_t CodeUnitValue(char16_t unit) { return unit; }
1025
CodeUnitValue(mozilla::Utf8Unit unit)1026 constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {
1027 return unit.toUint8();
1028 }
1029
1030 template <typename Unit>
1031 class TokenStreamCharsBase;
1032
1033 template <typename T>
1034 inline bool IsLineTerminator(T) = delete;
1035
IsLineTerminator(char32_t codePoint)1036 inline bool IsLineTerminator(char32_t codePoint) {
1037 return codePoint == '\n' || codePoint == '\r' ||
1038 codePoint == unicode::LINE_SEPARATOR ||
1039 codePoint == unicode::PARA_SEPARATOR;
1040 }
1041
IsLineTerminator(char16_t unit)1042 inline bool IsLineTerminator(char16_t unit) {
1043 // Every LineTerminator fits in char16_t, so this is exact.
1044 return IsLineTerminator(static_cast<char32_t>(unit));
1045 }
1046
1047 template <typename Unit>
1048 struct SourceUnitTraits;
1049
1050 template <>
1051 struct SourceUnitTraits<char16_t> {
1052 public:
1053 static constexpr uint8_t maxUnitsLength = 2;
1054
1055 static constexpr size_t lengthInUnits(char32_t codePoint) {
1056 return codePoint < unicode::NonBMPMin ? 1 : 2;
1057 }
1058 };
1059
1060 template <>
1061 struct SourceUnitTraits<mozilla::Utf8Unit> {
1062 public:
1063 static constexpr uint8_t maxUnitsLength = 4;
1064
1065 static constexpr size_t lengthInUnits(char32_t codePoint) {
1066 return codePoint < 0x80 ? 1
1067 : codePoint < 0x800 ? 2
1068 : codePoint < 0x10000 ? 3
1069 : 4;
1070 }
1071 };
1072
1073 /**
1074 * PeekedCodePoint represents the result of peeking ahead in some source text
1075 * to determine the next validly-encoded code point.
1076 *
1077 * If there isn't a valid code point, then |isNone()|.
1078 *
1079 * But if there *is* a valid code point, then |!isNone()|, the code point has
1080 * value |codePoint()| and its length in code units is |lengthInUnits()|.
1081 *
1082 * Conceptually, this class is |Maybe<struct { char32_t v; uint8_t len; }>|.
1083 */
template <typename Unit>
class PeekedCodePoint final {
  // |lengthInUnits_ == 0| encodes the "none" state (see |isNone()|); valid
  // code points always have nonzero length.
  char32_t codePoint_ = 0;
  uint8_t lengthInUnits_ = 0;

 private:
  using SourceUnitTraits = frontend::SourceUnitTraits<Unit>;

  // Default construction yields the "none" value.
  PeekedCodePoint() = default;

 public:
  /**
   * Create a peeked code point with the given value and length in code
   * units.
   *
   * While the latter value is computable from the former for both UTF-8 and
   * JS's version of UTF-16, the caller likely computed a length in units in
   * the course of determining the peeked value. Passing both here avoids
   * recomputation and lets us do a consistency-checking assertion.
   */
  PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
      : codePoint_(codePoint), lengthInUnits_(lengthInUnits) {
    MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
    MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
    MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
  }

  /** Create a PeekedCodeUnit that represents no valid code point. */
  static PeekedCodePoint none() { return PeekedCodePoint(); }

  /** True if no code point was found, false otherwise. */
  bool isNone() const { return lengthInUnits_ == 0; }

  /** If a code point was found, its value. */
  char32_t codePoint() const {
    MOZ_ASSERT(!isNone());
    return codePoint_;
  }

  /** If a code point was found, its length in code units. */
  uint8_t lengthInUnits() const {
    MOZ_ASSERT(!isNone());
    return lengthInUnits_;
  }
};
1129
1130 inline PeekedCodePoint<char16_t> PeekCodePoint(const char16_t* const ptr,
1131 const char16_t* const end) {
1132 if (MOZ_UNLIKELY(ptr >= end)) {
1133 return PeekedCodePoint<char16_t>::none();
1134 }
1135
1136 char16_t lead = ptr[0];
1137
1138 char32_t c;
1139 uint8_t len;
1140 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1141 MOZ_UNLIKELY(ptr + 1 >= end || !unicode::IsTrailSurrogate(ptr[1]))) {
1142 c = lead;
1143 len = 1;
1144 } else {
1145 c = unicode::UTF16Decode(lead, ptr[1]);
1146 len = 2;
1147 }
1148
1149 return PeekedCodePoint<char16_t>(c, len);
1150 }
1151
// Peek the next UTF-8 code point in [ptr, end), returning |none()| at end of
// text or on an invalid byte sequence.
inline PeekedCodePoint<mozilla::Utf8Unit> PeekCodePoint(
    const mozilla::Utf8Unit* const ptr, const mozilla::Utf8Unit* const end) {
  if (MOZ_UNLIKELY(ptr >= end)) {
    return PeekedCodePoint<mozilla::Utf8Unit>::none();
  }

  // ASCII is always a single code unit.
  const mozilla::Utf8Unit lead = ptr[0];
  if (mozilla::IsAscii(lead)) {
    return PeekedCodePoint<mozilla::Utf8Unit>(lead.toUint8(), 1);
  }

  // On success |afterLead| points just past the code point's trailing units,
  // so the distance from |ptr| gives its length below.
  const mozilla::Utf8Unit* afterLead = ptr + 1;
  mozilla::Maybe<char32_t> codePoint =
      mozilla::DecodeOneUtf8CodePoint(lead, &afterLead, end);
  if (codePoint.isNothing()) {
    return PeekedCodePoint<mozilla::Utf8Unit>::none();
  }

  auto len =
      mozilla::AssertedCast<uint8_t>(mozilla::PointerRangeSize(ptr, afterLead));
  MOZ_ASSERT(len <= 4);

  return PeekedCodePoint<mozilla::Utf8Unit>(codePoint.value(), len);
}
1176
1177 inline bool IsSingleUnitLineTerminator(mozilla::Utf8Unit unit) {
1178 // BEWARE: The Unicode line/paragraph separators don't fit in a single
1179 // UTF-8 code unit, so this test is exact for Utf8Unit but inexact
1180 // for UTF-8 as a whole. Users must handle |unit| as start of a
1181 // Unicode LineTerminator themselves!
1182 return unit == mozilla::Utf8Unit('\n') || unit == mozilla::Utf8Unit('\r');
1183 }
1184
1185 // This is the low-level interface to the JS source code buffer. It just gets
1186 // raw Unicode code units -- 16-bit char16_t units of source text that are not
1187 // (always) full code points, and 8-bit units of UTF-8 source text soon.
// TokenStream's functions are layered on top and do some extra stuff like
1189 // converting all EOL sequences to '\n', tracking the line number, and setting
1190 // |flags.isEOF|. (The "raw" in "raw Unicode code units" refers to the lack of
1191 // EOL sequence normalization.)
1192 //
1193 // buf[0..length-1] often represents a substring of some larger source,
1194 // where we have only the substring in memory. The |startOffset| argument
1195 // indicates the offset within this larger string at which our string
1196 // begins, the offset of |buf[0]|.
template <typename Unit>
class SourceUnits {
 private:
  /** Base of buffer. */
  const Unit* base_;

  /** Offset of base_[0]. */
  uint32_t startOffset_;

  /** Limit for quick bounds check. */
  const Unit* limit_;

  /** Next char to get. */
  const Unit* ptr;

 public:
  SourceUnits(const Unit* units, size_t length, size_t startOffset)
      : base_(units),
        startOffset_(startOffset),
        limit_(units + length),
        ptr(units) {}

  bool atStart() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    return ptr == base_;
  }

  bool atEnd() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(ptr <= limit_, "shouldn't have overrun");
    return ptr >= limit_;
  }

  // Number of code units not yet consumed.
  size_t remaining() const {
    MOZ_ASSERT(!isPoisoned(),
               "can't get a count of remaining code units if poisoned");
    return mozilla::PointerRangeSize(ptr, limit_);
  }

  size_t startOffset() const { return startOffset_; }

  // Absolute offset (within the full source) of the next unit to get.
  size_t offset() const {
    return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
  }

  // Pointer into the buffer for an absolute source |offset|.
  const Unit* codeUnitPtrAt(size_t offset) const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(startOffset_ <= offset);
    MOZ_ASSERT(offset - startOffset_ <=
               mozilla::PointerRangeSize(base_, limit_));
    return base_ + (offset - startOffset_);
  }

  const Unit* current() const { return ptr; }

  const Unit* limit() const { return limit_; }

  Unit previousCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't get previous code unit if poisoned");
    MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
    return *(ptr - 1);
  }

  // Consume and return the next code unit.
  Unit getCodeUnit() {
    return *ptr++;  // this will nullptr-crash if poisoned
  }

  Unit peekCodeUnit() const {
    return *ptr;  // this will nullptr-crash if poisoned
  }

  /**
   * Determine the next code point in source text. The code point is not
   * normalized: '\r', '\n', '\u2028', and '\u2029' are returned literally.
   * If there is no next code point because |atEnd()|, or if an encoding
   * error is encountered, return a |PeekedCodePoint| that |isNone()|.
   *
   * This function does not report errors: code that attempts to get the next
   * code point must report any error.
   *
   * If a next code point is found, it may be consumed by passing it to
   * |consumeKnownCodePoint|.
   */
  PeekedCodePoint<Unit> peekCodePoint() const {
    return PeekCodePoint(ptr, limit_);
  }

 private:
#ifdef DEBUG
  void assertNextCodePoint(const PeekedCodePoint<Unit>& peeked);
#endif

 public:
  /**
   * Consume a peeked code point that |!isNone()|.
   *
   * This call DOES NOT UPDATE LINE-STATUS. You may need to call
   * |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
   * LineTerminator. Note that if this consumes '\r', you also must consume
   * an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
   */
  void consumeKnownCodePoint(const PeekedCodePoint<Unit>& peeked) {
    MOZ_ASSERT(!peeked.isNone());
    MOZ_ASSERT(peeked.lengthInUnits() <= remaining());

#ifdef DEBUG
    assertNextCodePoint(peeked);
#endif

    ptr += peeked.lengthInUnits();
  }

  /** Match |n| hexadecimal digits and store their value in |*out|. */
  bool matchHexDigits(uint8_t n, char16_t* out) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't peek into poisoned SourceUnits");
    MOZ_ASSERT(n <= 4, "hexdigit value can't overflow char16_t");
    if (n > remaining()) {
      return false;
    }

    char16_t v = 0;
    for (uint8_t i = 0; i < n; i++) {
      auto unit = CodeUnitValue(ptr[i]);
      if (!mozilla::IsAsciiHexDigit(unit)) {
        return false;
      }

      v = (v << 4) | mozilla::AsciiAlphanumericToNumber(unit);
    }

    // Only consume the digits once all |n| of them have matched.
    *out = v;
    ptr += n;
    return true;
  }

  // Match |length| ASCII characters exactly; on mismatch, rewind to where the
  // attempt began and consume nothing.
  bool matchCodeUnits(const char* chars, uint8_t length) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't match into poisoned SourceUnits");
    if (length > remaining()) {
      return false;
    }

    const Unit* start = ptr;
    const Unit* end = ptr + length;
    while (ptr < end) {
      if (*ptr++ != Unit(*chars++)) {
        ptr = start;
        return false;
      }
    }

    return true;
  }

  void skipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= remaining(), "shouldn't skip beyond end of SourceUnits");
    ptr += n;
  }

  void unskipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= mozilla::PointerRangeSize(base_, ptr),
               "shouldn't unskip beyond start of SourceUnits");
    ptr -= n;
  }

 private:
  friend class TokenStreamCharsBase<Unit>;

  // Consume |c| iff it is the next code unit.  Exposed to the friend class
  // through the type-checked matchCodeUnit/matchLineTerminator wrappers.
  bool internalMatchCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    if (MOZ_LIKELY(!atEnd()) && *ptr == c) {
      ptr++;
      return true;
    }
    return false;
  }

 public:
  void consumeKnownCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(*ptr == c, "consuming the wrong code unit");
    ptr++;
  }

  /** Unget U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR. */
  inline void ungetLineOrParagraphSeparator();

  void ungetCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't unget from poisoned units");
    MOZ_ASSERT(!atStart(), "can't unget if currently at start");
    ptr--;
  }

  const Unit* addressOfNextCodeUnit(bool allowPoisoned = false) const {
    MOZ_ASSERT_IF(!allowPoisoned, !isPoisoned());
    return ptr;
  }

  // Use this with caution!
  void setAddressOfNextCodeUnit(const Unit* a, bool allowPoisoned = false) {
    MOZ_ASSERT_IF(!allowPoisoned, a);
    ptr = a;
  }

  // Poison the SourceUnits so they can't be accessed again.
  void poisonInDebug() {
#ifdef DEBUG
    ptr = nullptr;
#endif
  }

 private:
  bool isPoisoned() const {
#ifdef DEBUG
    // |ptr| can be null for unpoisoned SourceUnits if this was initialized with
    // |units == nullptr| and |length == 0|. In that case, for lack of any
    // better options, consider this to not be poisoned.
    return ptr == nullptr && ptr != limit_;
#else
    return false;
#endif
  }

 public:
  /**
   * Consume the rest of a single-line comment (but not the EOL/EOF that
   * terminates it).
   *
   * If an encoding error is encountered -- possible only for UTF-8 because
   * JavaScript's conception of UTF-16 encompasses any sequence of 16-bit
   * code units -- valid code points prior to the encoding error are consumed
   * and subsequent invalid code units are not consumed. For example, given
   * these UTF-8 code units:
   *
   *   'B'   'A'  'D'  ':'   <bad code unit sequence>
   *   0x42  0x41 0x44 0x3A  0xD0 0x00 ...
   *
   * the first four code units are consumed, but 0xD0 and 0x00 are not
   * consumed because 0xD0 encodes a two-byte lead unit but 0x00 is not a
   * valid trailing code unit.
   *
   * It is expected that the caller will report such an encoding error when
   * it attempts to consume the next code point.
   */
  void consumeRestOfSingleLineComment();

  /**
   * The maximum radius of code around the location of an error that should
   * be included in a syntax error message -- this many code units to either
   * side. The resulting window of data is then accordingly trimmed so that
   * the window contains only validly-encoded data.
   *
   * Because this number is the same for both UTF-8 and UTF-16, windows in
   * UTF-8 may contain fewer code points than windows in UTF-16. As we only
   * use this for error messages, we don't particularly care.
   */
  static constexpr size_t WindowRadius = ErrorMetadata::lineOfContextRadius;

  /**
   * From absolute offset |offset|, search backward to find an absolute
   * offset within source text, no further than |WindowRadius| code units
   * away from |offset|, such that all code points from that offset to
   * |offset| are valid, non-LineTerminator code points.
   */
  size_t findWindowStart(size_t offset) const;

  /**
   * From absolute offset |offset|, find an absolute offset within source
   * text, no further than |WindowRadius| code units away from |offset|, such
   * that all code units from |offset| to that offset are valid,
   * non-LineTerminator code points.
   */
  size_t findWindowEnd(size_t offset) const;

  /**
   * Given a |window| of |encodingSpecificWindowLength| units encoding valid
   * Unicode text, with index |encodingSpecificTokenOffset| indicating a
   * particular code point boundary in |window|, compute the corresponding
   * token offset and length if |window| were encoded in UTF-16. For
   * example:
   *
   *   // U+03C0 GREEK SMALL LETTER PI is encoded as 0xCF 0x80.
   *   const Utf8Unit* encodedWindow =
   *     reinterpret_cast<const Utf8Unit*>(u8"ππππ = @ FAIL");
   *   size_t encodedTokenOffset = 11; // 2 * 4 + ' = '.length
   *   size_t encodedWindowLength = 17; // 2 * 4 + ' = @ FAIL'.length
   *   size_t utf16Offset, utf16Length;
   *   computeWindowOffsetAndLength(encodedWindow,
   *                                encodedTokenOffset, &utf16Offset,
   *                                encodedWindowLength, &utf16Length);
   *   MOZ_ASSERT(utf16Offset == 7);
   *   MOZ_ASSERT(utf16Length == 13);
   *
   * This function asserts if called for UTF-16: the sole caller can avoid
   * computing UTF-16 offsets when they're definitely the same as the encoded
   * offsets.
   */
  inline void computeWindowOffsetAndLength(const Unit* encodeWindow,
                                           size_t encodingSpecificTokenOffset,
                                           size_t* utf16TokenOffset,
                                           size_t encodingSpecificWindowLength,
                                           size_t* utf16WindowLength);
};
1501
template <>
inline void SourceUnits<char16_t>::ungetLineOrParagraphSeparator() {
  // Both separators fit in a single UTF-16 code unit, so one unget suffices.
#ifdef DEBUG
  char16_t prev = previousCodeUnit();
#endif
  MOZ_ASSERT(prev == unicode::LINE_SEPARATOR ||
             prev == unicode::PARA_SEPARATOR);

  ungetCodeUnit();
}
1512
template <>
inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
  // U+2028/U+2029 each encode as three UTF-8 units: 0xE2 0x80 0xA8/0xA9.
  unskipCodeUnits(3);

  MOZ_ASSERT(ptr[0].toUint8() == 0xE2);
  MOZ_ASSERT(ptr[1].toUint8() == 0x80);

#ifdef DEBUG
  uint8_t last = ptr[2].toUint8();
#endif
  MOZ_ASSERT(last == 0xA8 || last == 0xA9);
}
1525
1526 /**
1527 * An all-purpose buffer type for accumulating text during tokenizing.
1528 *
1529 * In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or
1530 * |Unit|. We use |char16_t| because:
1531 *
1532 * * we don't have a UTF-8 regular expression parser, so in general regular
1533 * expression text must be copied to a separate UTF-16 buffer to parse it,
1534 * and
1535 * * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared
1536 * |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert.
1537 */
1538 using CharBuffer = Vector<char16_t, 32>;
1539
1540 /**
1541 * Append the provided code point (in the range [U+0000, U+10FFFF], surrogate
1542 * code points included) to the buffer.
1543 */
1544 [[nodiscard]] extern bool AppendCodePointToCharBuffer(CharBuffer& charBuffer,
1545 uint32_t codePoint);
1546
1547 /**
1548 * Accumulate the range of UTF-16 text (lone surrogates permitted, because JS
1549 * allows them in source text) into |charBuffer|. Normalize '\r', '\n', and
1550 * "\r\n" into '\n'.
1551 */
1552 [[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
1553 CharBuffer& charBuffer, const char16_t* cur, const char16_t* end);
1554
1555 /**
1556 * Accumulate the range of previously-validated UTF-8 text into |charBuffer|.
1557 * Normalize '\r', '\n', and "\r\n" into '\n'.
1558 */
1559 [[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
1560 CharBuffer& charBuffer, const mozilla::Utf8Unit* cur,
1561 const mozilla::Utf8Unit* end);
1562
// State and helpers shared by all token streams regardless of source-text
// character type.
class TokenStreamCharsShared {
 protected:
  JSContext* cx;

  /**
   * Buffer transiently used to store sequences of identifier or string code
   * points when such can't be directly processed from the original source
   * text (e.g. because it contains escapes).
   */
  CharBuffer charBuffer;

  /** Information for parsing with a lifetime longer than the parser itself. */
  ParserAtomsTable* parserAtoms;

 protected:
  explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms)
      : cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {}

  [[nodiscard]] bool copyCharBufferTo(
      JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);

  /**
   * Determine whether a code unit constitutes a complete ASCII code point.
   * (The code point's exact value might not be used, however, if subsequent
   * code observes that |unit| is part of a LineTerminatorSequence.)
   */
  [[nodiscard]] static constexpr MOZ_ALWAYS_INLINE bool isAsciiCodePoint(
      int32_t unit) {
    return mozilla::IsAscii(static_cast<char32_t>(unit));
  }

  // Intern the buffered code points as a parser atom, then empty the buffer
  // for reuse.
  TaggedParserAtomIndex drainCharBufferIntoAtom() {
    // Add to parser atoms table.
    auto atom = this->parserAtoms->internChar16(cx, charBuffer.begin(),
                                                charBuffer.length());
    charBuffer.clear();
    return atom;
  }

 protected:
  void adoptState(TokenStreamCharsShared& other) {
    // The other stream's buffer may contain information for a
    // gotten-then-ungotten token, that we must transfer into this stream so
    // that token's final get behaves as desired.
    charBuffer = std::move(other.charBuffer);
  }

 public:
  CharBuffer& getCharBuffer() { return charBuffer; }
};
1613
// |Unit|-parametrized layer over the raw source buffer, providing typed code
// unit access on top of the shared state in TokenStreamCharsShared.
template <typename Unit>
class TokenStreamCharsBase : public TokenStreamCharsShared {
 protected:
  using SourceUnits = frontend::SourceUnits<Unit>;

  /** Code units in the source code being tokenized. */
  SourceUnits sourceUnits;

  // End of fields.

 protected:
  TokenStreamCharsBase(JSContext* cx, ParserAtomsTable* parserAtoms,
                       const Unit* units, size_t length, size_t startOffset);

  /**
   * Convert a non-EOF code unit returned by |getCodeUnit()| or
   * |peekCodeUnit()| to a Unit code unit.
   */
  inline Unit toUnit(int32_t codeUnitValue);

  // Unget the unit underlying |c|.  Ungetting EOF is a no-op, because EOF
  // never consumed a code unit.
  void ungetCodeUnit(int32_t c) {
    if (c == EOF) {
      return;
    }

    sourceUnits.ungetCodeUnit();
  }

  MOZ_ALWAYS_INLINE TaggedParserAtomIndex
  atomizeSourceChars(mozilla::Span<const Unit> units);

  /**
   * Try to match a non-LineTerminator ASCII code point. Return true iff it
   * was matched.
   */
  bool matchCodeUnit(char expect) {
    MOZ_ASSERT(mozilla::IsAscii(expect));
    MOZ_ASSERT(expect != '\r');
    MOZ_ASSERT(expect != '\n');
    return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
  }

  /**
   * Try to match an ASCII LineTerminator code point. Return true iff it was
   * matched.
   */
  bool matchLineTerminator(char expect) {
    MOZ_ASSERT(expect == '\r' || expect == '\n');
    return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
  }

  // Forbid non-|char| arguments to the matchers above.
  template <typename T>
  bool matchCodeUnit(T) = delete;
  template <typename T>
  bool matchLineTerminator(T) = delete;

  // The next code unit's numeric value, or EOF at the end of source text.
  int32_t peekCodeUnit() {
    return MOZ_LIKELY(!sourceUnits.atEnd())
               ? CodeUnitValue(sourceUnits.peekCodeUnit())
               : EOF;
  }

  /** Consume a known, non-EOF code unit. */
  inline void consumeKnownCodeUnit(int32_t unit);

  // Forbid accidental calls to consumeKnownCodeUnit *not* with the single
  // unit-or-EOF type. Unit should use SourceUnits::consumeKnownCodeUnit;
  // CodeUnitValue() results should go through toUnit(), or better yet just
  // use the original Unit.
  template <typename T>
  inline void consumeKnownCodeUnit(T) = delete;

  /**
   * Add a null-terminated line of context to error information, for the line
   * in |sourceUnits| that contains |offset|. Also record the window's
   * length and the offset of the error in the window. (Don't bother adding
   * a line of context if it would be empty.)
   *
   * The window will contain no LineTerminators of any kind, and it will not
   * extend more than |SourceUnits::WindowRadius| to either side of |offset|,
   * nor into the previous or next lines.
   *
   * This function is quite internal, and you probably should be calling one
   * of its existing callers instead.
   */
  [[nodiscard]] bool addLineOfContext(ErrorMetadata* err, uint32_t offset);
};
1701
1702 template <>
1703 inline char16_t TokenStreamCharsBase<char16_t>::toUnit(int32_t codeUnitValue) {
1704 MOZ_ASSERT(codeUnitValue != EOF, "EOF is not a Unit");
1705 return mozilla::AssertedCast<char16_t>(codeUnitValue);
1706 }
1707
1708 template <>
1709 inline mozilla::Utf8Unit TokenStreamCharsBase<mozilla::Utf8Unit>::toUnit(
1710 int32_t value) {
1711 MOZ_ASSERT(value != EOF, "EOF is not a Unit");
1712 return mozilla::Utf8Unit(mozilla::AssertedCast<unsigned char>(value));
1713 }
1714
1715 template <typename Unit>
1716 inline void TokenStreamCharsBase<Unit>::consumeKnownCodeUnit(int32_t unit) {
1717 sourceUnits.consumeKnownCodeUnit(toUnit(unit));
1718 }
1719
1720 template <>
1721 MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1722 TokenStreamCharsBase<char16_t>::atomizeSourceChars(
1723 mozilla::Span<const char16_t> units) {
1724 return this->parserAtoms->internChar16(cx, units.data(), units.size());
1725 }
1726
1727 template <>
1728 /* static */ MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1729 TokenStreamCharsBase<mozilla::Utf8Unit>::atomizeSourceChars(
1730 mozilla::Span<const mozilla::Utf8Unit> units) {
1731 return this->parserAtoms->internUtf8(cx, units.data(), units.size());
1732 }
1733
/**
 * Per-code-unit-type layer atop |TokenStreamCharsBase|; only the |char16_t|
 * and |mozilla::Utf8Unit| specializations below are defined.
 */
template <typename Unit>
class SpecializedTokenStreamCharsBase;
1736
1737 template <>
1738 class SpecializedTokenStreamCharsBase<char16_t>
1739 : public TokenStreamCharsBase<char16_t> {
1740 using CharsBase = TokenStreamCharsBase<char16_t>;
1741
1742 protected:
1743 using TokenStreamCharsShared::isAsciiCodePoint;
1744 // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
1745
1746 using typename CharsBase::SourceUnits;
1747
1748 protected:
1749 // These APIs are only usable by UTF-16-specific code.
1750
1751 /**
1752 * Given |lead| already consumed, consume and return the code point encoded
1753 * starting from it. Infallible because lone surrogates in JS encode a
1754 * "code point" of the same value.
1755 */
1756 char32_t infallibleGetNonAsciiCodePointDontNormalize(char16_t lead) {
1757 MOZ_ASSERT(!isAsciiCodePoint(lead));
1758 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == lead);
1759
1760 // Handle single-unit code points and lone trailing surrogates.
1761 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1762 // Or handle lead surrogates not paired with trailing surrogates.
1763 MOZ_UNLIKELY(
1764 this->sourceUnits.atEnd() ||
1765 !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1766 return lead;
1767 }
1768
1769 // Otherwise it's a multi-unit code point.
1770 return unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1771 }
1772
1773 protected:
1774 // These APIs are in both SpecializedTokenStreamCharsBase specializations
1775 // and so are usable in subclasses no matter what Unit is.
1776
1777 using CharsBase::CharsBase;
1778 };
1779
template <>
class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
    : public TokenStreamCharsBase<mozilla::Utf8Unit> {
  using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;

 protected:
  // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(

 protected:
  // These APIs are only usable by UTF-8-specific code.

  using typename CharsBase::SourceUnits;

  /**
   * A mutable iterator-wrapper around |SourceUnits| that translates
   * operators to calls to |SourceUnits::getCodeUnit()| and similar.
   *
   * This class is expected to be used in concert with |SourceUnitsEnd|.
   */
  class SourceUnitsIterator {
    SourceUnits& sourceUnits_;
#ifdef DEBUG
    // In iterator copies created by the post-increment operator, a pointer
    // at the next source text code unit when the post-increment operator
    // was called, cleared when the iterator is dereferenced.
    mutable mozilla::Maybe<const mozilla::Utf8Unit*>
        currentBeforePostIncrement_;
#endif

   public:
    explicit SourceUnitsIterator(SourceUnits& sourceUnits)
        : sourceUnits_(sourceUnits) {}

    /** Return the code unit consumed by the preceding |operator++(int)|. */
    mozilla::Utf8Unit operator*() const {
      // operator* is expected to get the *next* value from an iterator
      // not pointing at the end of the underlying range. However, the
      // sole use of this is in the context of an expression of the form
      // |*iter++|, that performed the |sourceUnits_.getCodeUnit()| in
      // the |operator++(int)| below -- so dereferencing acts on a
      // |sourceUnits_| already advanced. Therefore the correct unit to
      // return is the previous one.
      MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 ==
                 sourceUnits_.current());
#ifdef DEBUG
      currentBeforePostIncrement_.reset();
#endif
      return sourceUnits_.previousCodeUnit();
    }

    /** Consume the next code unit; the returned copy supports one |*|. */
    SourceUnitsIterator operator++(int) {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");

      SourceUnitsIterator copy = *this;
#ifdef DEBUG
      copy.currentBeforePostIncrement_.emplace(sourceUnits_.current());
#endif

      sourceUnits_.getCodeUnit();
      return copy;
    }

    /** Back the iterator up by |n| code units. */
    void operator-=(size_t n) {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
      sourceUnits_.unskipCodeUnits(n);
    }

    /** Peek backward one unit; only |index == -1| is supported. */
    mozilla::Utf8Unit operator[](ptrdiff_t index) {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
      MOZ_ASSERT(index == -1,
                 "must only be called to verify the value of the "
                 "previous code unit");
      return sourceUnits_.previousCodeUnit();
    }

    /** Count the code units remaining to the end of the source. */
    size_t remaining() const {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
      return sourceUnits_.remaining();
    }
  };

  /** A sentinel representing the end of |SourceUnits| data. */
  class SourceUnitsEnd {};

  // |end - iter| yields the remaining unit count; defined below the class.
  friend inline size_t operator-(const SourceUnitsEnd& aEnd,
                                 const SourceUnitsIterator& aIter);

 protected:
  // These APIs are in both SpecializedTokenStreamCharsBase specializations
  // and so are usable in subclasses no matter what Unit is.

  using CharsBase::CharsBase;
};
1880
1881 inline size_t operator-(const SpecializedTokenStreamCharsBase<
1882 mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
1883 const SpecializedTokenStreamCharsBase<
1884 mozilla::Utf8Unit>::SourceUnitsIterator& aIter) {
1885 return aIter.remaining();
1886 }
1887
/** A small class encapsulating computation of the start-offset of a Token. */
class TokenStart {
  uint32_t offset_;

 public:
  /**
   * Compute a starting offset equal to |sourceUnits|'s current offset plus
   * |adjust|. (For example, |adjust| of -1 denotes the code unit just
   * before |sourceUnits|'s current offset.)
   */
  template <class SourceUnits>
  TokenStart(const SourceUnits& sourceUnits, ptrdiff_t adjust)
      : offset_(sourceUnits.offset() + adjust) {}

  TokenStart(const TokenStart&) = default;

  uint32_t offset() const { return offset_; }
};
1906
template <typename Unit, class AnyCharsAccess>
class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
  using CharsBase = TokenStreamCharsBase<Unit>;
  using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;

  using LineToken = TokenStreamAnyChars::LineToken;

 private:
  // Out-of-line helper for |newToken| below.
  Token* newTokenInternal(TokenKind kind, TokenStart start, TokenKind* out);

  /**
   * Allocates a new Token from the given offset to the current offset,
   * ascribes it the given kind, and sets |*out| to that kind.
   */
  Token* newToken(TokenKind kind, TokenStart start,
                  TokenStreamShared::Modifier modifier, TokenKind* out) {
    Token* token = newTokenInternal(kind, start, out);

#ifdef DEBUG
    // Save the modifier used to get this token, so that if an ungetToken()
    // occurs and then the token is re-gotten (or peeked, etc.), we can
    // assert both gets used compatible modifiers.
    token->modifier = modifier;
#endif

    return token;
  }

  // Unicode-escape matching helpers, defined out of line.  (NOTE(review):
  // the meaning of the uint32_t return value isn't visible here -- consult
  // the definitions before relying on it.)
  uint32_t matchUnicodeEscape(uint32_t* codePoint);
  uint32_t matchExtendedUnicodeEscape(uint32_t* codePoint);

 protected:
  using CharsBase::addLineOfContext;
  using CharsBase::matchCodeUnit;
  using CharsBase::matchLineTerminator;
  using TokenStreamCharsShared::drainCharBufferIntoAtom;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using CharsBase::sourceUnits| because of bug 1472569.
  // :-(
  using CharsBase::toUnit;

  using typename CharsBase::SourceUnits;

 protected:
  using SpecializedCharsBase::SpecializedCharsBase;

  /** Access the character-type-independent tokenizing state. */
  TokenStreamAnyChars& anyCharsAccess() {
    return AnyCharsAccess::anyChars(this);
  }

  const TokenStreamAnyChars& anyCharsAccess() const {
    return AnyCharsAccess::anyChars(this);
  }

  using TokenStreamSpecific =
      frontend::TokenStreamSpecific<Unit, AnyCharsAccess>;

  /** Downcast to the full token stream type that derives from this class. */
  TokenStreamSpecific* asSpecific() {
    static_assert(
        std::is_base_of_v<GeneralTokenStreamChars, TokenStreamSpecific>,
        "static_cast below presumes an inheritance relationship");

    return static_cast<TokenStreamSpecific*>(this);
  }

 protected:
  /**
   * Compute the column number in Unicode code points of the absolute |offset|
   * within source text on the line corresponding to |lineToken|.
   *
   * |offset| must be a code point boundary, preceded only by validly-encoded
   * source units. (It doesn't have to be *followed* by valid source units.)
   */
  uint32_t computeColumn(LineToken lineToken, uint32_t offset) const;
  void computeLineAndColumn(uint32_t offset, uint32_t* line,
                            uint32_t* column) const;

  /**
   * Fill in |err| completely, except for line-of-context information.
   *
   * Return true if the caller can compute a line of context from the token
   * stream. Otherwise return false.
   */
  [[nodiscard]] bool fillExceptingContext(ErrorMetadata* err, uint32_t offset) {
    if (anyCharsAccess().fillExceptingContext(err, offset)) {
      computeLineAndColumn(offset, &err->lineNumber, &err->columnNumber);
      return true;
    }
    return false;
  }

  /** Create a token that carries no payload beyond its kind. */
  void newSimpleToken(TokenKind kind, TokenStart start,
                      TokenStreamShared::Modifier modifier, TokenKind* out) {
    newToken(kind, start, modifier, out);
  }

  /** Create a |TokenKind::Number| token with value |dval|. */
  void newNumberToken(double dval, DecimalPoint decimalPoint, TokenStart start,
                      TokenStreamShared::Modifier modifier, TokenKind* out) {
    Token* token = newToken(TokenKind::Number, start, modifier, out);
    token->setNumber(dval, decimalPoint);
  }

  /** Create a |TokenKind::BigInt| token. */
  void newBigIntToken(TokenStart start, TokenStreamShared::Modifier modifier,
                      TokenKind* out) {
    newToken(TokenKind::BigInt, start, modifier, out);
  }

  /** Create a string or template token whose contents are |atom|. */
  void newAtomToken(TokenKind kind, TaggedParserAtomIndex atom,
                    TokenStart start, TokenStreamShared::Modifier modifier,
                    TokenKind* out) {
    MOZ_ASSERT(kind == TokenKind::String || kind == TokenKind::TemplateHead ||
               kind == TokenKind::NoSubsTemplate);

    Token* token = newToken(kind, start, modifier, out);
    token->setAtom(atom);
  }

  /** Create a |TokenKind::Name| token for the identifier |name|. */
  void newNameToken(TaggedParserAtomIndex name, TokenStart start,
                    TokenStreamShared::Modifier modifier, TokenKind* out) {
    Token* token = newToken(TokenKind::Name, start, modifier, out);
    token->setName(name);
  }

  /** Create a |TokenKind::PrivateName| token for |name|. */
  void newPrivateNameToken(TaggedParserAtomIndex name, TokenStart start,
                           TokenStreamShared::Modifier modifier,
                           TokenKind* out) {
    Token* token = newToken(TokenKind::PrivateName, start, modifier, out);
    token->setName(name);
  }

  /**
   * Create a |TokenKind::RegExp| token with flags |reflags|; always created
   * with the fixed SlashIsRegExp modifier.
   */
  void newRegExpToken(JS::RegExpFlags reflags, TokenStart start,
                      TokenKind* out) {
    Token* token = newToken(TokenKind::RegExp, start,
                            TokenStreamShared::SlashIsRegExp, out);
    token->setRegExpFlags(reflags);
  }

  // Cold failure path shared by token scanning; defined out of line.
  MOZ_COLD bool badToken();

  /**
   * Get the next code unit -- the next numeric sub-unit of source text,
   * possibly smaller than a full code point -- without updating line/column
   * counters or consuming LineTerminatorSequences.
   *
   * Because of these limitations, only use this if (a) the resulting code
   * unit is guaranteed to be ungotten (by ungetCodeUnit()) if it's an EOL,
   * and (b) the line-related state (lineno, linebase) is not used before
   * it's ungotten.
   */
  int32_t getCodeUnit() {
    if (MOZ_LIKELY(!this->sourceUnits.atEnd())) {
      return CodeUnitValue(this->sourceUnits.getCodeUnit());
    }

    anyCharsAccess().flags.isEOF = true;
    return EOF;
  }

  /** Unget |c|; if |c| is EOF the stream's EOF flag must already be set. */
  void ungetCodeUnit(int32_t c) {
    MOZ_ASSERT_IF(c == EOF, anyCharsAccess().flags.isEOF);

    CharsBase::ungetCodeUnit(c);
  }

  /**
   * Given a just-consumed ASCII code unit/point |lead|, consume a full code
   * point or LineTerminatorSequence (normalizing it to '\n') and store it in
   * |*codePoint|. Return true on success, otherwise return false and leave
   * |*codePoint| undefined on failure.
   *
   * If a LineTerminatorSequence was consumed, also update line/column info.
   *
   * This may change the current |sourceUnits| offset.
   */
  [[nodiscard]] bool getFullAsciiCodePoint(int32_t lead, int32_t* codePoint) {
    MOZ_ASSERT(isAsciiCodePoint(lead),
               "non-ASCII code units must be handled separately");
    MOZ_ASSERT(toUnit(lead) == this->sourceUnits.previousCodeUnit(),
               "getFullAsciiCodePoint called incorrectly");

    if (MOZ_UNLIKELY(lead == '\r')) {
      matchLineTerminator('\n');
    } else if (MOZ_LIKELY(lead != '\n')) {
      *codePoint = lead;
      return true;
    }

    *codePoint = '\n';
    bool ok = updateLineInfoForEOL();
    if (!ok) {
#ifdef DEBUG
      *codePoint = EOF;  // sentinel value to hopefully cause errors
#endif
      MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
    }
    return ok;
  }

  /** Update line/column bookkeeping for an EOL ending at the current offset. */
  [[nodiscard]] MOZ_ALWAYS_INLINE bool updateLineInfoForEOL() {
    return anyCharsAccess().internalUpdateLineInfoForEOL(
        this->sourceUnits.offset());
  }

  // Identifier/escape scanning helpers, defined out of line.
  uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
  bool matchUnicodeEscapeIdent(uint32_t* codePoint);
  bool matchIdentifierStart();

  /**
   * If possible, compute a line of context for an otherwise-filled-in |err|
   * at the given offset in this token stream.
   *
   * This function is very-internal: almost certainly you should use one of
   * its callers instead. It basically exists only to make those callers
   * more readable.
   */
  [[nodiscard]] bool internalComputeLineOfContext(ErrorMetadata* err,
                                                  uint32_t offset) {
    // We only have line-start information for the current line. If the error
    // is on a different line, we can't easily provide context. (This means
    // any error in a multi-line token, e.g. an unterminated multiline string
    // literal, won't have context.)
    if (err->lineNumber != anyCharsAccess().lineno) {
      return true;
    }

    return addLineOfContext(err, offset);
  }

 public:
  /**
   * Consume any hashbang comment at the start of a Script or Module, if one is
   * present. Stops consuming just before any terminating LineTerminator or
   * before an encoding error is encountered.
   */
  void consumeOptionalHashbangComment();

  /**
   * Compute an atom for the contents of the current TemplateHead or
   * NoSubsTemplate token, normalizing '\r' and "\r\n" to '\n'; returns a
   * null index on failure.
   */
  TaggedParserAtomIndex getRawTemplateStringAtom() {
    TokenStreamAnyChars& anyChars = anyCharsAccess();

    MOZ_ASSERT(anyChars.currentToken().type == TokenKind::TemplateHead ||
               anyChars.currentToken().type == TokenKind::NoSubsTemplate);
    const Unit* cur =
        this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.begin + 1);
    const Unit* end;
    if (anyChars.currentToken().type == TokenKind::TemplateHead) {
      // Of the form    |`...${|   or   |}...${|
      end =
          this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 2);
    } else {
      // NO_SUBS_TEMPLATE is of the form   |`...`|   or   |}...`|
      end =
          this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 1);
    }

    // |charBuffer| should be empty here, but we may as well code defensively.
    MOZ_ASSERT(this->charBuffer.length() == 0);
    this->charBuffer.clear();

    // Template literals normalize only '\r' and "\r\n" to '\n'; Unicode
    // separators don't need special handling.
    // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
    if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer,
                                                            cur, end)) {
      return TaggedParserAtomIndex::null();
    }

    return drainCharBufferIntoAtom();
  }
};
2176
/**
 * Tokenizing layer specialized per code-unit type; see the |char16_t| and
 * |mozilla::Utf8Unit| specializations below.
 */
template <typename Unit, class AnyCharsAccess>
class TokenStreamChars;
2179
/**
 * UTF-16 specialization of the code-unit-dependent tokenizing layer.  There
 * are no encoding errors in 16-bit JS source, so code point access here is
 * infallible.
 */
template <class AnyCharsAccess>
class TokenStreamChars<char16_t, AnyCharsAccess>
    : public GeneralTokenStreamChars<char16_t, AnyCharsAccess> {
  using CharsBase = TokenStreamCharsBase<char16_t>;
  using SpecializedCharsBase = SpecializedTokenStreamCharsBase<char16_t>;
  using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
  using Self = TokenStreamChars<char16_t, AnyCharsAccess>;

  using GeneralCharsBase::asSpecific;

  using typename GeneralCharsBase::TokenStreamSpecific;

 protected:
  using CharsBase::matchLineTerminator;
  using GeneralCharsBase::anyCharsAccess;
  using GeneralCharsBase::getCodeUnit;
  using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
  using GeneralCharsBase::ungetCodeUnit;
  using GeneralCharsBase::updateLineInfoForEOL;

 protected:
  using GeneralCharsBase::GeneralCharsBase;

  /**
   * Given the non-ASCII |lead| code unit just consumed, consume and return a
   * complete non-ASCII code point. Line/column updates are not performed,
   * and line breaks are returned as-is without normalization.
   */
  [[nodiscard]] bool getNonAsciiCodePointDontNormalize(char16_t lead,
                                                       char32_t* codePoint) {
    // There are no encoding errors in 16-bit JS, so implement this so that
    // the compiler knows it, too.
    *codePoint = infallibleGetNonAsciiCodePointDontNormalize(lead);
    return true;
  }

  /**
   * Given a just-consumed non-ASCII code unit |lead| (which may also be a
   * full code point, for UTF-16), consume a full code point or
   * LineTerminatorSequence (normalizing it to '\n') and store it in
   * |*codePoint|. Return true on success, otherwise return false and leave
   * |*codePoint| undefined on failure.
   *
   * If a LineTerminatorSequence was consumed, also update line/column info.
   *
   * This may change the current |sourceUnits| offset.
   */
  [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
};
2231
/**
 * UTF-8 specialization of the code-unit-dependent tokenizing layer,
 * including detailed error reporting for invalidly encoded source text.
 */
template <class AnyCharsAccess>
class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
    : public GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess> {
  using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
  using SpecializedCharsBase =
      SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>;
  using GeneralCharsBase =
      GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
  using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;

  using typename SpecializedCharsBase::SourceUnitsEnd;
  using typename SpecializedCharsBase::SourceUnitsIterator;

 protected:
  using GeneralCharsBase::anyCharsAccess;
  using GeneralCharsBase::computeLineAndColumn;
  using GeneralCharsBase::fillExceptingContext;
  using GeneralCharsBase::internalComputeLineOfContext;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
  using GeneralCharsBase::updateLineInfoForEOL;

 private:
  /** Map |nibble| (0-15) to its uppercase hexadecimal digit. */
  static char toHexChar(uint8_t nibble) {
    MOZ_ASSERT(nibble < 16);
    return "0123456789ABCDEF"[nibble];
  }

  /** Write the four characters "0xNN" (no terminator) for |n| into |str|. */
  static void byteToString(uint8_t n, char* str) {
    str[0] = '0';
    str[1] = 'x';
    str[2] = toHexChar(n >> 4);
    str[3] = toHexChar(n & 0xF);
  }

  /** Write "0xNN" plus a trailing NUL (five characters) for |n| into |str|. */
  static void byteToTerminatedString(uint8_t n, char* str) {
    byteToString(n, str);
    str[4] = '\0';
  }

  /**
   * Report a UTF-8 encoding-related error for a code point starting AT THE
   * CURRENT OFFSET.
   *
   * |relevantUnits| indicates how many code units from the current offset
   * are potentially relevant to the reported error, such that they may be
   * included in the error message. For example, if at the current offset we
   * have
   *
   *   0b1111'1111 ...
   *
   * a code unit never allowed in UTF-8, then |relevantUnits| might be 1
   * because only that unit is relevant. Or if we have
   *
   *   0b1111'0111 0b1011'0101 0b0000'0000 ...
   *
   * where the first two code units are a valid prefix to a four-unit code
   * point but the third unit *isn't* a valid trailing code unit, then
   * |relevantUnits| might be 3.
   */
  MOZ_COLD void internalEncodingError(uint8_t relevantUnits,
                                      unsigned errorNumber, ...);

  // Don't use |internalEncodingError|! Use one of the elaborated functions
  // that calls it, below -- all of which should be used to indicate an error
  // in a code point starting AT THE CURRENT OFFSET as with
  // |internalEncodingError|.

  /** Report an error for an invalid lead code unit |lead|. */
  MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);

  /**
   * Report an error when there aren't enough code units remaining to
   * constitute a full code point after |lead|: only |remaining| code units
   * were available for a code point starting with |lead|, when at least
   * |required| code units were required.
   */
  MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining,
                               uint8_t required);

  /**
   * Report an error for a bad trailing UTF-8 code unit, where the bad
   * trailing unit was the last of |unitsObserved| units examined from the
   * current offset.
   */
  MOZ_COLD void badTrailingUnit(uint8_t unitsObserved);

  // Helper used for both |badCodePoint| and |notShortestForm| for code units
  // that have all the requisite high bits set/unset in a manner that *could*
  // encode a valid code point, but the remaining bits encoding its actual
  // value do not define a permitted value.
  MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint,
                                              uint8_t codePointLength,
                                              const char* reason);

  /**
   * Report an error for UTF-8 that encodes a UTF-16 surrogate or a number
   * outside the Unicode range.
   */
  MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
    MOZ_ASSERT(unicode::IsSurrogate(codePoint) ||
               codePoint > unicode::NonBMPMax);

    badStructurallyValidCodePoint(codePoint, codePointLength,
                                  unicode::IsSurrogate(codePoint)
                                      ? "it's a UTF-16 surrogate"
                                      : "the maximum code point is U+10FFFF");
  }

  /**
   * Report an error for UTF-8 that encodes a code point not in its shortest
   * form.
   */
  MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
    MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
    MOZ_ASSERT(codePoint <= unicode::NonBMPMax);

    badStructurallyValidCodePoint(
        codePoint, codePointLength,
        "it wasn't encoded in shortest possible form");
  }

 protected:
  using GeneralCharsBase::GeneralCharsBase;

  /**
   * Given the non-ASCII |lead| code unit just consumed, consume the rest of
   * a non-ASCII code point. The code point is not normalized: on success
   * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
   *
   * Report an error if an invalid code point is encountered.
   */
  [[nodiscard]] bool getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead,
                                                       char32_t* codePoint);

  /**
   * Given a just-consumed non-ASCII code unit |lead|, consume a full code
   * point or LineTerminatorSequence (normalizing it to '\n') and store it in
   * |*codePoint|. Return true on success, otherwise return false and leave
   * |*codePoint| undefined on failure.
   *
   * If a LineTerminatorSequence was consumed, also update line/column info.
   *
   * This function will change the current |sourceUnits| offset.
   */
  [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
};
2379
2380 // TokenStream is the lexical scanner for JavaScript source text.
2381 //
// It takes a buffer of Unit code units -- either char16_t encoding UTF-16 or
// mozilla::Utf8Unit encoding UTF-8, per the specializations above -- and
// linearly scans it into |Token|s.
2385 //
2386 // Internally the class uses a four element circular buffer |tokens| of
2387 // |Token|s. As an index for |tokens|, the member |cursor_| points to the
2388 // current token. Calls to getToken() increase |cursor_| by one and return the
2389 // new current token. If a TokenStream was just created, the current token is
2390 // uninitialized. It's therefore important that one of the first four member
2391 // functions listed below is called first. The circular buffer lets us go back
2392 // up to two tokens from the last scanned token. Internally, the relative
2393 // number of backward steps that were taken (via ungetToken()) after the last
2394 // token was scanned is stored in |lookahead|.
2395 //
2396 // The following table lists in which situations it is safe to call each listed
2397 // function. No checks are made by the functions in non-debug builds.
2398 //
2399 // Function Name | Precondition; changes to |lookahead|
2400 // ------------------+---------------------------------------------------------
2401 // getToken | none; if |lookahead > 0| then |lookahead--|
2402 // peekToken | none; if |lookahead == 0| then |lookahead == 1|
2403 // peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
2404 // matchToken | none; if |lookahead > 0| and the match succeeds then
2405 // | |lookahead--|
2406 // consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
2407 // ungetToken | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
2408 //
2409 // The behavior of the token scanning process (see getTokenInternal()) can be
2410 // modified by calling one of the first four above listed member functions with
2411 // an optional argument of type Modifier. However, the modifier will be
2412 // ignored unless |lookahead == 0| holds. Due to constraints of the grammar,
2413 // this turns out not to be a problem in practice. See the
2414 // mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
// for more details:
// https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
2417 //
2418 // The method seek() allows rescanning from a previously visited location of
2419 // the buffer, initially computed by constructing a Position local variable.
2420 //
2421 template <typename Unit, class AnyCharsAccess>
2422 class MOZ_STACK_CLASS TokenStreamSpecific
2423 : public TokenStreamChars<Unit, AnyCharsAccess>,
2424 public TokenStreamShared,
2425 public ErrorReporter {
2426 public:
2427 using CharsBase = TokenStreamCharsBase<Unit>;
2428 using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
2429 using GeneralCharsBase = GeneralTokenStreamChars<Unit, AnyCharsAccess>;
2430 using SpecializedChars = TokenStreamChars<Unit, AnyCharsAccess>;
2431
2432 using Position = TokenStreamPosition<Unit>;
2433
2434 // Anything inherited through a base class whose type depends upon this
2435 // class's template parameters can only be accessed through a dependent
2436 // name: prefixed with |this|, by explicit qualification, and so on. (This
2437 // is so that references to inherited fields are statically distinguishable
2438 // from references to names outside of the class.) This is tedious and
2439 // onerous.
2440 //
2441 // As an alternative, we directly add every one of these functions to this
2442 // class, using explicit qualification to address the dependent-name
2443 // problem. |this| or other qualification is no longer necessary -- at
2444 // cost of this ever-changing laundry list of |using|s. So it goes.
2445 public:
  using GeneralCharsBase::anyCharsAccess;
  using GeneralCharsBase::computeLineAndColumn;
  using TokenStreamCharsShared::adoptState;

 private:
  using typename CharsBase::SourceUnits;

 private:
  // Bring base-class tokenizing helpers into scope so they can be called
  // unqualified throughout this class.
  using CharsBase::atomizeSourceChars;
  using GeneralCharsBase::badToken;
  // Deliberately don't |using| |charBuffer| because of bug 1472569. :-(
  using CharsBase::consumeKnownCodeUnit;
  using CharsBase::matchCodeUnit;
  using CharsBase::matchLineTerminator;
  using CharsBase::peekCodeUnit;
  using GeneralCharsBase::computeColumn;
  using GeneralCharsBase::fillExceptingContext;
  using GeneralCharsBase::getCodeUnit;
  using GeneralCharsBase::getFullAsciiCodePoint;
  using GeneralCharsBase::internalComputeLineOfContext;
  using GeneralCharsBase::matchUnicodeEscapeIdent;
  using GeneralCharsBase::matchUnicodeEscapeIdStart;
  using GeneralCharsBase::newAtomToken;
  using GeneralCharsBase::newBigIntToken;
  using GeneralCharsBase::newNameToken;
  using GeneralCharsBase::newNumberToken;
  using GeneralCharsBase::newPrivateNameToken;
  using GeneralCharsBase::newRegExpToken;
  using GeneralCharsBase::newSimpleToken;
  using SpecializedChars::getNonAsciiCodePoint;
  using SpecializedChars::getNonAsciiCodePointDontNormalize;
  using TokenStreamCharsShared::copyCharBufferTo;
  using TokenStreamCharsShared::drainCharBufferIntoAtom;
  using TokenStreamCharsShared::isAsciiCodePoint;
  // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
  using CharsBase::toUnit;
  using GeneralCharsBase::ungetCodeUnit;
  using GeneralCharsBase::updateLineInfoForEOL;

  // TokenStreamPosition snapshots |sourceUnits| and lookahead token state
  // directly (see its constructor below), so it needs private access.
  template <typename CharU>
  friend class TokenStreamPosition;

 public:
  TokenStreamSpecific(JSContext* cx, ParserAtomsTable* parserAtoms,
                      const JS::ReadOnlyCompileOptions& options,
                      const Unit* units, size_t length);

  /**
   * Get the next code point, converting LineTerminatorSequences to '\n' and
   * updating internal line-counter state if needed.  Return true on success
   * and store the code point in |*cp|.  Return false and leave |*cp|
   * undefined on failure.
   */
  [[nodiscard]] bool getCodePoint(int32_t* cp);
2500
2501 // If there is an invalid escape in a template, report it and return false,
2502 // otherwise return true.
2503 bool checkForInvalidTemplateEscapeError() {
2504 if (anyCharsAccess().invalidTemplateEscapeType == InvalidEscapeType::None) {
2505 return true;
2506 }
2507
2508 reportInvalidEscapeError(anyCharsAccess().invalidTemplateEscapeOffset,
2509 anyCharsAccess().invalidTemplateEscapeType);
2510 return false;
2511 }
2512
 public:
  // Implement ErrorReporter.

  // Compute the (line, column) coordinates of absolute offset |offset|.
  void lineAndColumnAt(size_t offset, uint32_t* line,
                       uint32_t* column) const final {
    computeLineAndColumn(offset, line, column);
  }

  // Compute the (line, column) coordinates of the start of the current token.
  void currentLineAndColumn(uint32_t* line, uint32_t* column) const final {
    computeLineAndColumn(anyCharsAccess().currentToken().pos.begin, line,
                         column);
  }

  // Store in |*onThisLine| whether |offset| lies on line |lineNum|; the
  // return value reports whether the underlying query itself succeeded.
  bool isOnThisLine(size_t offset, uint32_t lineNum,
                    bool* onThisLine) const final {
    return anyCharsAccess().srcCoords.isOnThisLine(offset, lineNum, onThisLine);
  }
2530
2531 uint32_t lineAt(size_t offset) const final {
2532 const auto& anyChars = anyCharsAccess();
2533 auto lineToken = anyChars.lineToken(offset);
2534 return anyChars.lineNumber(lineToken);
2535 }
2536
2537 uint32_t columnAt(size_t offset) const final {
2538 return computeColumn(anyCharsAccess().lineToken(offset), offset);
2539 }
2540
  // Whether any tokenizing has occurred yet; defined in TokenStream.cpp.
  bool hasTokenizationStarted() const final;

  // Return the name of the source being tokenized (may be null per the
  // underlying TokenStreamAnyChars accessor's contract -- not visible here).
  const char* getFilename() const final {
    return anyCharsAccess().getFilename();
  }
2546
 private:
  // Implement ErrorReportMixin.

  JSContext* getContext() const override { return anyCharsAccess().cx; }

  [[nodiscard]] bool strictMode() const override {
    return anyCharsAccess().strictMode();
  }

 public:
  // Implement ErrorReportMixin.

  const JS::ReadOnlyCompileOptions& options() const final {
    return anyCharsAccess().options();
  }

  // Fill |*err| with line/column/context info for |errorOffset|; defined
  // out-of-line.
  [[nodiscard]] bool computeErrorMetadata(
      ErrorMetadata* err, const ErrorOffset& errorOffset) override;
2565
2566 private:
2567 void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
2568 switch (type) {
2569 case InvalidEscapeType::None:
2570 MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
2571 return;
2572 case InvalidEscapeType::Hexadecimal:
2573 errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
2574 return;
2575 case InvalidEscapeType::Unicode:
2576 errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
2577 return;
2578 case InvalidEscapeType::UnicodeOverflow:
2579 errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
2580 return;
2581 case InvalidEscapeType::Octal:
2582 errorAt(offset, JSMSG_DEPRECATED_OCTAL_ESCAPE);
2583 return;
2584 case InvalidEscapeType::EightOrNine:
2585 errorAt(offset, JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE);
2586 return;
2587 }
2588 }
2589
  // Report an error for a code point that cannot begin any token.
  void reportIllegalCharacter(int32_t cp);

  // Copy the identifier beginning at |identStart| into the char buffer;
  // defined out-of-line.
  [[nodiscard]] bool putIdentInCharBuffer(const Unit* identStart);

  // Predicate type used to classify digits while scanning integer parts.
  using IsIntegerUnit = bool (*)(int32_t);
  [[nodiscard]] MOZ_ALWAYS_INLINE bool matchInteger(IsIntegerUnit isIntegerUnit,
                                                    int32_t* nextUnit);
  [[nodiscard]] MOZ_ALWAYS_INLINE bool matchIntegerAfterFirstDigit(
      IsIntegerUnit isIntegerUnit, int32_t* nextUnit);

  /**
   * Tokenize a decimal number that begins at |numStart| into the provided
   * token.
   *
   * |unit| must be one of these values:
   *
   *   1. The first decimal digit in the integral part of a decimal number
   *      not starting with '0' or '.', e.g. '1' for "17", '3' for "3.14", or
   *      '8' for "8.675309e6".
   *
   *   In this case, the next |getCodeUnit()| must return the code unit after
   *   |unit| in the overall number.
   *
   *   2. The '.' in a "."/"0."-prefixed decimal number or the 'e'/'E' in a
   *      "0e"/"0E"-prefixed decimal number, e.g. ".17", "0.42", or "0.1e3".
   *
   *   In this case, the next |getCodeUnit()| must return the code unit
   *   *after* the first decimal digit *after* the '.'.  So the next code
   *   unit would be '7' in ".17", '2' in "0.42", 'e' in "0.4e+8", or '/' in
   *   "0.5/2" (three separate tokens).
   *
   *   3. The code unit after the '0' where "0" is the entire number token.
   *
   *   In this case, the next |getCodeUnit()| would return the code unit
   *   after |unit|, but this function will never perform such call.
   *
   *   4. (Non-strict mode code only)  The first '8' or '9' in a "noctal"
   *      number that begins with a '0' but contains a non-octal digit in its
   *      integer part so is interpreted as decimal, e.g. '9' in "09.28" or
   *      '8' in "0386" or '9' in "09+7" (three separate tokens).
   *
   *   In this case, the next |getCodeUnit()| returns the code unit after
   *   |unit|: '.', '6', or '+' in the examples above.
   *
   * This interface is super-hairy and horribly stateful.  Unfortunately, its
   * hair merely reflects the intricacy of ECMAScript numeric literal syntax.
   * And incredibly, it *improves* on the goto-based horror that predated it.
   */
  [[nodiscard]] bool decimalNumber(int32_t unit, TokenStart start,
                                   const Unit* numStart, Modifier modifier,
                                   TokenKind* out);

  /** Tokenize a regular expression literal beginning at |start|. */
  [[nodiscard]] bool regexpLiteral(TokenStart start, TokenKind* out);

  /**
   * Slurp characters between |start| and sourceUnits.current() into
   * charBuffer, to later parse into a bigint.
   */
  [[nodiscard]] bool bigIntLiteral(TokenStart start, Modifier modifier,
                                   TokenKind* out);
2651
2652 public:
2653 // Advance to the next token. If the token stream encountered an error,
2654 // return false. Otherwise return true and store the token kind in |*ttp|.
2655 [[nodiscard]] bool getToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2656 // Check for a pushed-back token resulting from mismatching lookahead.
2657 TokenStreamAnyChars& anyChars = anyCharsAccess();
2658 if (anyChars.lookahead != 0) {
2659 MOZ_ASSERT(!anyChars.flags.hadError);
2660 anyChars.lookahead--;
2661 anyChars.advanceCursor();
2662 TokenKind tt = anyChars.currentToken().type;
2663 MOZ_ASSERT(tt != TokenKind::Eol);
2664 verifyConsistentModifier(modifier, anyChars.currentToken());
2665 *ttp = tt;
2666 return true;
2667 }
2668
2669 return getTokenInternal(ttp, modifier);
2670 }
2671
2672 [[nodiscard]] bool peekToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2673 TokenStreamAnyChars& anyChars = anyCharsAccess();
2674 if (anyChars.lookahead > 0) {
2675 MOZ_ASSERT(!anyChars.flags.hadError);
2676 verifyConsistentModifier(modifier, anyChars.nextToken());
2677 *ttp = anyChars.nextToken().type;
2678 return true;
2679 }
2680 if (!getTokenInternal(ttp, modifier)) {
2681 return false;
2682 }
2683 anyChars.ungetToken();
2684 return true;
2685 }
2686
2687 [[nodiscard]] bool peekTokenPos(TokenPos* posp,
2688 Modifier modifier = SlashIsDiv) {
2689 TokenStreamAnyChars& anyChars = anyCharsAccess();
2690 if (anyChars.lookahead == 0) {
2691 TokenKind tt;
2692 if (!getTokenInternal(&tt, modifier)) {
2693 return false;
2694 }
2695 anyChars.ungetToken();
2696 MOZ_ASSERT(anyChars.hasLookahead());
2697 } else {
2698 MOZ_ASSERT(!anyChars.flags.hadError);
2699 verifyConsistentModifier(modifier, anyChars.nextToken());
2700 }
2701 *posp = anyChars.nextToken().pos;
2702 return true;
2703 }
2704
2705 [[nodiscard]] bool peekOffset(uint32_t* offset,
2706 Modifier modifier = SlashIsDiv) {
2707 TokenPos pos;
2708 if (!peekTokenPos(&pos, modifier)) {
2709 return false;
2710 }
2711 *offset = pos.begin;
2712 return true;
2713 }
2714
  // This is like peekToken(), with one exception: if there is an EOL
  // between the end of the current token and the start of the next token, it
  // returns true and stores Eol in |*ttp|.  In that case, no token with
  // Eol is actually created, just an Eol TokenKind is returned, and
  // currentToken() shouldn't be consulted.  (This is the only place Eol
  // is produced.)
  [[nodiscard]] MOZ_ALWAYS_INLINE bool peekTokenSameLine(
      TokenKind* ttp, Modifier modifier = SlashIsDiv) {
    TokenStreamAnyChars& anyChars = anyCharsAccess();
    const Token& curr = anyChars.currentToken();

    // If lookahead != 0, we have scanned ahead at least one token, and
    // |lineno| is the line that the furthest-scanned token ends on.  If
    // it's the same as the line that the current token ends on, that's a
    // stronger condition than what we are looking for, and we don't need
    // to return Eol.
    if (anyChars.lookahead != 0) {
      bool onThisLine;
      if (!anyChars.srcCoords.isOnThisLine(curr.pos.end, anyChars.lineno,
                                           &onThisLine)) {
        // The line-coordinate query can fail only for lack of memory.
        error(JSMSG_OUT_OF_MEMORY);
        return false;
      }

      if (onThisLine) {
        MOZ_ASSERT(!anyChars.flags.hadError);
        verifyConsistentModifier(modifier, anyChars.nextToken());
        *ttp = anyChars.nextToken().type;
        return true;
      }
    }

    // The above check misses two cases where we don't have to return
    // Eol.
    // - The next token starts on the same line, but is a multi-line token.
    // - The next token starts on the same line, but lookahead==2 and there
    //   is a newline between the next token and the one after that.
    // The following test is somewhat expensive but gets these cases (and
    // all others) right.
    TokenKind tmp;
    if (!getToken(&tmp, modifier)) {
      return false;
    }

    const Token& next = anyChars.currentToken();
    anyChars.ungetToken();

    // Careful, |next| points to an initialized-but-not-allocated Token!
    // This is safe because we don't modify token data below.

    auto currentEndToken = anyChars.lineToken(curr.pos.end);
    auto nextBeginToken = anyChars.lineToken(next.pos.begin);

    *ttp =
        currentEndToken.isSameLine(nextBeginToken) ? next.type : TokenKind::Eol;
    return true;
  }
2772
2773 // Get the next token from the stream if its kind is |tt|.
2774 [[nodiscard]] bool matchToken(bool* matchedp, TokenKind tt,
2775 Modifier modifier = SlashIsDiv) {
2776 TokenKind token;
2777 if (!getToken(&token, modifier)) {
2778 return false;
2779 }
2780 if (token == tt) {
2781 *matchedp = true;
2782 } else {
2783 anyCharsAccess().ungetToken();
2784 *matchedp = false;
2785 }
2786 return true;
2787 }
2788
2789 void consumeKnownToken(TokenKind tt, Modifier modifier = SlashIsDiv) {
2790 bool matched;
2791 MOZ_ASSERT(anyCharsAccess().hasLookahead());
2792 MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier));
2793 MOZ_ALWAYS_TRUE(matched);
2794 }
2795
2796 [[nodiscard]] bool nextTokenEndsExpr(bool* endsExpr) {
2797 TokenKind tt;
2798 if (!peekToken(&tt)) {
2799 return false;
2800 }
2801
2802 *endsExpr = anyCharsAccess().isExprEnding[size_t(tt)];
2803 if (*endsExpr) {
2804 // If the next token ends an overall Expression, we'll parse this
2805 // Expression without ever invoking Parser::orExpr(). But we need that
2806 // function's DEBUG-only side effect of marking this token as safe to get
2807 // with SlashIsRegExp, so we have to do it manually here.
2808 anyCharsAccess().allowGettingNextTokenWithSlashIsRegExp();
2809 }
2810 return true;
2811 }
2812
  // Advance the stream to absolute source offset |position|.
  [[nodiscard]] bool advance(size_t position);

  // Restore the stream to the state captured in |pos|.  The fallible
  // overload additionally consults |other| and can fail -- see the
  // out-of-line definitions for the exact contract.
  void seekTo(const Position& pos);
  [[nodiscard]] bool seekTo(const Position& pos,
                            const TokenStreamAnyChars& other);

  // seekTo() restricted (by DEBUG assertion) to moving backward.
  void rewind(const Position& pos) {
    MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
               "should be rewinding here");
    seekTo(pos);
  }

  [[nodiscard]] bool rewind(const Position& pos,
                            const TokenStreamAnyChars& other) {
    MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
               "should be rewinding here");
    return seekTo(pos, other);
  }

  // seekTo() restricted (by DEBUG assertion) to moving forward.
  void fastForward(const Position& pos) {
    MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
               "should be moving forward here");
    seekTo(pos);
  }

  [[nodiscard]] bool fastForward(const Position& pos,
                                 const TokenStreamAnyChars& other) {
    MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
               "should be moving forward here");
    return seekTo(pos, other);
  }

  // Return a pointer to the source unit at absolute offset |offset|.
  const Unit* codeUnitPtrAt(size_t offset) const {
    return this->sourceUnits.codeUnitPtrAt(offset);
  }
2848
  // Tokenize an identifier (or, per |visibility|, a private name) whose
  // first code unit is at |identStart|; defined out-of-line.
  [[nodiscard]] bool identifierName(TokenStart start, const Unit* identStart,
                                    IdentifierEscapes escaping,
                                    Modifier modifier,
                                    NameVisibility visibility, TokenKind* out);

  [[nodiscard]] bool matchIdentifierStart(IdentifierEscapes* sawEscape);

  // The workhorse behind getToken()/peekToken(): scan a fresh token from the
  // source; defined out-of-line.
  [[nodiscard]] bool getTokenInternal(TokenKind* const ttp,
                                      const Modifier modifier);

  // Scan a string or template chunk terminated by |untilChar|; defined
  // out-of-line.
  [[nodiscard]] bool getStringOrTemplateToken(char untilChar, Modifier modifier,
                                              TokenKind* out);

  // Parse a TemplateMiddle or TemplateTail token (one of the string-like parts
  // of a template string) after already consuming the leading `RightCurly`.
  // (The spec says the `}` is the first character of the TemplateMiddle/
  // TemplateTail, but we treat it as a separate token because that's much
  // easier to implement in both TokenStream and the parser.)
  //
  // This consumes a token and sets the current token, like `getToken()`.  It
  // doesn't take a Modifier because there's no risk of encountering a division
  // operator or RegExp literal.
  //
  // On success, `*ttp` is either `TokenKind::TemplateHead` (if we got a
  // TemplateMiddle token) or `TokenKind::NoSubsTemplate` (if we got a
  // TemplateTail).  That may seem strange; there are four different template
  // token types in the spec, but we only use two.  We use `TemplateHead` for
  // TemplateMiddle because both end with `...${`, and `NoSubsTemplate` for
  // TemplateTail because both contain the end of the template, including the
  // closing quote mark.  They're not treated differently, either in the parser
  // or in the tokenizer.
  [[nodiscard]] bool getTemplateToken(TokenKind* ttp) {
    MOZ_ASSERT(anyCharsAccess().currentToken().type == TokenKind::RightCurly);
    return getStringOrTemplateToken('`', SlashIsInvalid, ttp);
  }

  // Comment-directive scanning (e.g. displayURL/sourceMappingURL pragmas);
  // all defined out-of-line.
  [[nodiscard]] bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
  [[nodiscard]] bool getDirective(
      bool isMultiline, bool shouldWarnDeprecated, const char* directive,
      uint8_t directiveLength, const char* errorMsgPragma,
      UniquePtr<char16_t[], JS::FreePolicy>* destination);
  [[nodiscard]] bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
  [[nodiscard]] bool getSourceMappingURL(bool isMultiline,
                                         bool shouldWarnDeprecated);
};
2894
2895 // It's preferable to define this in TokenStream.cpp, but its template-ness
2896 // means we'd then have to *instantiate* this constructor for all possible
2897 // (Unit, AnyCharsAccess) pairs -- and that gets super-messy as AnyCharsAccess
2898 // *itself* is templated. This symbol really isn't that huge compared to some
2899 // defined inline in TokenStreamSpecific, so just rely on the linker commoning
2900 // stuff up.
2901 template <typename Unit>
2902 template <class AnyCharsAccess>
2903 inline TokenStreamPosition<Unit>::TokenStreamPosition(
2904 TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream)
2905 : currentToken(tokenStream.anyCharsAccess().currentToken()) {
2906 TokenStreamAnyChars& anyChars = tokenStream.anyCharsAccess();
2907
2908 buf =
2909 tokenStream.sourceUnits.addressOfNextCodeUnit(/* allowPoisoned = */ true);
2910 flags = anyChars.flags;
2911 lineno = anyChars.lineno;
2912 linebase = anyChars.linebase;
2913 prevLinebase = anyChars.prevLinebase;
2914 lookahead = anyChars.lookahead;
2915 currentToken = anyChars.currentToken();
2916 for (unsigned i = 0; i < anyChars.lookahead; i++) {
2917 lookaheadTokens[i] = anyChars.tokens[anyChars.aheadCursor(1 + i)];
2918 }
2919 }
2920
// AnyCharsAccess policy used by the concrete TokenStream class below: given
// the TokenStreamSpecific sub-object of a TokenStream, recover its sibling
// TokenStreamAnyChars base.  (Definitions follow TokenStream, whose complete
// type they require.)
class TokenStreamAnyCharsAccess {
 public:
  template <class TokenStreamSpecific>
  static inline TokenStreamAnyChars& anyChars(TokenStreamSpecific* tss);

  template <class TokenStreamSpecific>
  static inline const TokenStreamAnyChars& anyChars(
      const TokenStreamSpecific* tss);
};
2930
// The concrete token stream over char16_t source text: combines the
// character-type-independent state (TokenStreamAnyChars) with the char16_t
// tokenizer (TokenStreamSpecific), the two connected through
// TokenStreamAnyCharsAccess.
class MOZ_STACK_CLASS TokenStream
    : public TokenStreamAnyChars,
      public TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess> {
  using Unit = char16_t;

 public:
  TokenStream(JSContext* cx, ParserAtomsTable* parserAtoms,
              const JS::ReadOnlyCompileOptions& options, const Unit* units,
              size_t length, StrictModeGetter* smg)
      : TokenStreamAnyChars(cx, options, smg),
        TokenStreamSpecific<Unit, TokenStreamAnyCharsAccess>(
            cx, parserAtoms, options, units, length) {}
};
2944
// A TokenStream over empty source (null units, zero length, no parser atoms
// or strict-mode getter), for callers that need a TokenStream-shaped object
// but will never tokenize through it.
class MOZ_STACK_CLASS DummyTokenStream final : public TokenStream {
 public:
  DummyTokenStream(JSContext* cx, const JS::ReadOnlyCompileOptions& options)
      : TokenStream(cx, nullptr, options, nullptr, 0, nullptr) {}
};
2950
2951 template <class TokenStreamSpecific>
2952 /* static */ inline TokenStreamAnyChars& TokenStreamAnyCharsAccess::anyChars(
2953 TokenStreamSpecific* tss) {
2954 auto* ts = static_cast<TokenStream*>(tss);
2955 return *static_cast<TokenStreamAnyChars*>(ts);
2956 }
2957
2958 template <class TokenStreamSpecific>
2959 /* static */ inline const TokenStreamAnyChars&
2960 TokenStreamAnyCharsAccess::anyChars(const TokenStreamSpecific* tss) {
2961 const auto* ts = static_cast<const TokenStream*>(tss);
2962 return *static_cast<const TokenStreamAnyChars*>(ts);
2963 }
2964
2965 extern const char* TokenKindToDesc(TokenKind tt);
2966
2967 } // namespace frontend
2968 } // namespace js
2969
2970 #ifdef DEBUG
2971 extern const char* TokenKindToString(js::frontend::TokenKind tt);
2972 #endif
2973
2974 #endif /* frontend_TokenStream_h */
2975