1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * vim: set ts=8 sts=2 et sw=2 tw=80:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 /*
8 * Streaming access to the raw tokens of JavaScript source.
9 *
10 * Because JS tokenization is context-sensitive -- a '/' could be either a
11 * regular expression *or* a division operator depending on context -- the
12 * various token stream classes are mostly not useful outside of the Parser
13 * where they reside. We should probably eventually merge the two concepts.
14 */
15 #ifndef frontend_TokenStream_h
16 #define frontend_TokenStream_h
17
18 /*
19 * [SMDOC] Parser Token Stream
20 *
21 * A token stream exposes the raw tokens -- operators, names, numbers,
22 * keywords, and so on -- of JavaScript source code.
23 *
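 * These are the components of the overall token stream concept:
 * TokenStreamShared, TokenStreamAnyChars, TokenStreamCharsShared,
 * TokenStreamCharsBase<Unit>, SpecializedTokenStreamCharsBase<Unit>,
 * GeneralTokenStreamChars<Unit, AnyCharsAccess>,
 * TokenStreamChars<Unit, AnyCharsAccess>, and
 * TokenStreamSpecific<Unit, AnyCharsAccess>.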
27 *
28 * == TokenStreamShared → ∅ ==
29 *
30 * Certain aspects of tokenizing are used everywhere:
31 *
32 * * modifiers (used to select which context-sensitive interpretation of a
33 * character should be used to decide what token it is) and modifier
34 * assertion handling;
35 * * flags on the overall stream (have we encountered any characters on this
36 * line? have we hit a syntax error? and so on);
37 * * and certain token-count constants.
38 *
39 * These are all defined in TokenStreamShared. (They could be namespace-
40 * scoped, but it seems tentatively better not to clutter the namespace.)
41 *
42 * == TokenStreamAnyChars → TokenStreamShared ==
43 *
44 * Certain aspects of tokenizing have meaning independent of the character type
45 * of the source text being tokenized: line/column number information, tokens
46 * in lookahead from determining the meaning of a prior token, compilation
47 * options, the filename, flags, source map URL, access to details of the
48 * current and next tokens (is the token of the given type? what name or
49 * number is contained in the token? and other queries), and others.
50 *
51 * All this data/functionality *could* be duplicated for both single-byte and
52 * double-byte tokenizing, but there are two problems. First, it's potentially
 * wasteful if the compiler doesn't recognize it can unify the concepts.  (And
54 * if any-character concepts are intermixed with character-specific concepts,
55 * potentially the compiler *can't* unify them because offsets into the
56 * hypothetical TokenStream<Unit>s would differ.) Second, some of this stuff
57 * needs to be accessible in ParserBase, the aspects of JS language parsing
58 * that have meaning independent of the character type of the source text being
59 * parsed. So we need a separate data structure that ParserBase can hold on to
60 * for it. (ParserBase isn't the only instance of this, but it's certainly the
61 * biggest case of it.) Ergo, TokenStreamAnyChars.
62 *
63 * == TokenStreamCharsShared → ∅ ==
64 *
65 * Some functionality has meaning independent of character type, yet has no use
66 * *unless* you know the character type in actual use. It *could* live in
67 * TokenStreamAnyChars, but it makes more sense to live in a separate class
68 * that character-aware token information can simply inherit.
69 *
70 * This class currently exists only to contain a char16_t buffer, transiently
71 * used to accumulate strings in tricky cases that can't just be read directly
72 * from source text. It's not used outside character-aware tokenizing, so it
73 * doesn't make sense in TokenStreamAnyChars.
74 *
75 * == TokenStreamCharsBase<Unit> → TokenStreamCharsShared ==
76 *
77 * Certain data structures in tokenizing are character-type-specific: namely,
78 * the various pointers identifying the source text (including current offset
79 * and end).
80 *
81 * Additionally, some functions operating on this data are defined the same way
82 * no matter what character type you have (e.g. current offset in code units
83 * into the source text) or share a common interface regardless of character
84 * type (e.g. consume the next code unit if it has a given value).
85 *
86 * All such functionality lives in TokenStreamCharsBase<Unit>.
87 *
88 * == SpecializedTokenStreamCharsBase<Unit> → TokenStreamCharsBase<Unit> ==
89 *
90 * Certain tokenizing functionality is specific to a single character type.
91 * For example, JS's UTF-16 encoding recognizes no coding errors, because lone
92 * surrogates are not an error; but a UTF-8 encoding must recognize a variety
93 * of validation errors. Such functionality is defined only in the appropriate
94 * SpecializedTokenStreamCharsBase specialization.
95 *
96 * == GeneralTokenStreamChars<Unit, AnyCharsAccess> →
97 * SpecializedTokenStreamCharsBase<Unit> ==
98 *
99 * Some functionality operates differently on different character types, just
100 * as for TokenStreamCharsBase, but additionally requires access to character-
101 * type-agnostic information in TokenStreamAnyChars. For example, getting the
102 * next character performs different steps for different character types and
103 * must access TokenStreamAnyChars to update line break information.
104 *
105 * Such functionality, if it can be defined using the same algorithm for all
106 * character types, lives in GeneralTokenStreamChars<Unit, AnyCharsAccess>.
107 * The AnyCharsAccess parameter provides a way for a GeneralTokenStreamChars
108 * instance to access its corresponding TokenStreamAnyChars, without inheriting
109 * from it.
110 *
111 * GeneralTokenStreamChars<Unit, AnyCharsAccess> is just functionality, no
112 * actual member data.
113 *
114 * Such functionality all lives in TokenStreamChars<Unit, AnyCharsAccess>, a
115 * declared-but-not-defined template class whose specializations have a common
116 * public interface (plus whatever private helper functions are desirable).
117 *
118 * == TokenStreamChars<Unit, AnyCharsAccess> →
119 * GeneralTokenStreamChars<Unit, AnyCharsAccess> ==
120 *
121 * Some functionality is like that in GeneralTokenStreamChars, *but* it's
122 * defined entirely differently for different character types.
123 *
124 * For example, consider "match a multi-code unit code point" (hypothetically:
125 * we've only implemented two-byte tokenizing right now):
126 *
127 * * For two-byte text, there must be two code units to get, the leading code
128 * unit must be a UTF-16 lead surrogate, and the trailing code unit must be
 *     a UTF-16 trailing surrogate.  (If any of these fail to hold, the next
 *     code unit encodes a code point by itself, which is not multi-code unit.)
131 * * For single-byte Latin-1 text, there are no multi-code unit code points.
132 * * For single-byte UTF-8 text, the first code unit must have N > 1 of its
133 * highest bits set (and the next unset), and |N - 1| successive code units
134 * must have their high bit set and next-highest bit unset, *and*
135 * concatenating all unconstrained bits together must not produce a code
136 * point value that could have been encoded in fewer code units.
137 *
138 * This functionality can't be implemented as member functions in
139 * GeneralTokenStreamChars because we'd need to *partially specialize* those
140 * functions -- hold Unit constant while letting AnyCharsAccess vary. But
141 * C++ forbids function template partial specialization like this: either you
142 * fix *all* parameters or you fix none of them.
143 *
144 * Fortunately, C++ *does* allow *class* template partial specialization. So
145 * TokenStreamChars is a template class with one specialization per Unit.
146 * Functions can be defined differently in the different specializations,
147 * because AnyCharsAccess as the only template parameter on member functions
148 * *can* vary.
149 *
150 * All TokenStreamChars<Unit, AnyCharsAccess> specializations, one per Unit,
151 * are just functionality, no actual member data.
152 *
153 * == TokenStreamSpecific<Unit, AnyCharsAccess> →
154 * TokenStreamChars<Unit, AnyCharsAccess>, TokenStreamShared,
155 * ErrorReporter ==
156 *
157 * TokenStreamSpecific is operations that are parametrized on character type
158 * but implement the *general* idea of tokenizing, without being intrinsically
159 * tied to character type. Notably, this includes all operations that can
160 * report warnings or errors at particular offsets, because we include a line
161 * of context with such errors -- and that necessarily accesses the raw
162 * characters of their specific type.
163 *
164 * Much TokenStreamSpecific operation depends on functionality in
165 * TokenStreamAnyChars. The obvious solution is to inherit it -- but this
166 * doesn't work in Parser: its ParserBase base class needs some
167 * TokenStreamAnyChars functionality without knowing character type.
168 *
169 * The AnyCharsAccess type parameter is a class that statically converts from a
170 * TokenStreamSpecific* to its corresponding TokenStreamAnyChars. The
171 * TokenStreamSpecific in Parser<ParseHandler, Unit> can then specify a class
172 * that properly converts from TokenStreamSpecific Parser::tokenStream to
173 * TokenStreamAnyChars ParserBase::anyChars.
174 *
175 * Could we hardcode one set of offset calculations for this and eliminate
176 * AnyCharsAccess? No. Offset calculations possibly could be hardcoded if
177 * TokenStreamSpecific were present in Parser before Parser::handler, assuring
178 * the same offsets in all Parser-related cases. But there's still a separate
179 * TokenStream class, that requires different offset calculations. So even if
180 * we wanted to hardcode this (it's not clear we would, because forcing the
181 * TokenStreamSpecific declarer to specify this is more explicit), we couldn't.
182 */
183
184 #include "mozilla/ArrayUtils.h"
185 #include "mozilla/Assertions.h"
186 #include "mozilla/Attributes.h"
187 #include "mozilla/Casting.h"
188 #include "mozilla/DebugOnly.h"
189 #include "mozilla/Maybe.h"
190 #include "mozilla/MemoryChecking.h"
191 #include "mozilla/PodOperations.h"
192 #include "mozilla/Span.h"
193 #include "mozilla/TextUtils.h"
194 #include "mozilla/Utf8.h"
195
196 #include <algorithm>
197 #include <stdarg.h>
198 #include <stddef.h>
199 #include <stdint.h>
200 #include <stdio.h>
201 #include <type_traits>
202
203 #include "jspubtd.h"
204
205 #include "frontend/ErrorReporter.h"
206 #include "frontend/ParserAtom.h" // ParserAtom, ParserAtomsTable, TaggedParserAtomIndex
207 #include "frontend/Token.h"
208 #include "frontend/TokenKind.h"
209 #include "js/CompileOptions.h"
210 #include "js/friend/ErrorMessages.h" // JSMSG_*
211 #include "js/HashTable.h" // js::HashMap
212 #include "js/RegExpFlags.h" // JS::RegExpFlags
213 #include "js/UniquePtr.h"
214 #include "js/Vector.h"
215 #include "util/Text.h"
216 #include "util/Unicode.h"
217 #include "vm/ErrorReporting.h"
218 #include "vm/JSAtom.h"
219 #include "vm/StringType.h"
220
221 struct JS_PUBLIC_API JSContext;
222 struct KeywordInfo;
223
224 namespace js {
225
226 namespace frontend {
227
// Saturate column numbers at a limit that can be represented in various parts
// of the engine.  Source locations beyond this point will report at the limit
// column instead.
//
// The limit is half of the largest int32_t value (1073741823).  It is spelled
// with INT32_MAX from <stdint.h>, which this header includes directly, rather
// than with std::numeric_limits, whose header <limits> is not included here.
//
// See:
//  - TokenStreamAnyChars::checkOptions
//  - ColSpan::isRepresentable
//  - WasmFrameIter::computeLine
static constexpr uint32_t ColumnLimit = INT32_MAX / 2;
237
// If `name` is a reserved word, returns its TokenKind.
// Returns TokenKind::Limit otherwise.
240 extern TokenKind ReservedWordTokenKind(TaggedParserAtomIndex name);
241
// If `name` is a reserved word, returns its string representation.
// Returns nullptr otherwise.
244 extern const char* ReservedWordToCharZ(TaggedParserAtomIndex name);
245
// If `tt` is a reserved word, returns its string representation.
// Returns nullptr otherwise.
248 extern const char* ReservedWordToCharZ(TokenKind tt);
249
// The kinds of deprecated syntax the tokenizer can record having seen.  The
// numeric values are what gets stored in the two-bit
// |TokenStreamFlags::sawDeprecatedContent| field, so they are spelled out.
enum class DeprecatedContent : uint8_t {
  // No deprecated content was present.
  None = 0,

  // Octal literal not prefixed by "0o" but rather by just "0", e.g. 0755.
  OctalLiteral = 1,

  // Octal character escape, e.g. "hell\157 world".
  OctalEscape = 2,

  // NonOctalDecimalEscape, i.e. "\8" or "\9".
  EightOrNineEscape = 3,
};
260
261 struct TokenStreamFlags {
262 // Hit end of file.
263 bool isEOF : 1;
264 // Non-whitespace since start of line.
265 bool isDirtyLine : 1;
266 // Hit a syntax error, at start or during a token.
267 bool hadError : 1;
268
269 // The nature of any deprecated content seen since last reset.
270 // We have to uint8_t instead DeprecatedContent to work around a GCC 7 bug.
271 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414
272 uint8_t sawDeprecatedContent : 2;
273
TokenStreamFlagsTokenStreamFlags274 TokenStreamFlags()
275 : isEOF(false),
276 isDirtyLine(false),
277 hadError(false),
278 sawDeprecatedContent(uint8_t(DeprecatedContent::None)) {}
279 };
280
281 template <typename Unit>
282 class TokenStreamPosition;
283
284 /**
285 * TokenStream types and constants that are used in both TokenStreamAnyChars
286 * and TokenStreamSpecific. Do not add any non-static data members to this
287 * class!
288 */
289 class TokenStreamShared {
290 protected:
291 static constexpr size_t ntokens = 4; // 1 current + 2 lookahead, rounded
292 // to power of 2 to avoid divmod by 3
293
294 static constexpr unsigned ntokensMask = ntokens - 1;
295
296 template <typename Unit>
297 friend class TokenStreamPosition;
298
299 public:
300 static constexpr unsigned maxLookahead = 2;
301
302 using Modifier = Token::Modifier;
303 static constexpr Modifier SlashIsDiv = Token::SlashIsDiv;
304 static constexpr Modifier SlashIsRegExp = Token::SlashIsRegExp;
305 static constexpr Modifier SlashIsInvalid = Token::SlashIsInvalid;
306
verifyConsistentModifier(Modifier modifier,const Token & nextToken)307 static void verifyConsistentModifier(Modifier modifier,
308 const Token& nextToken) {
309 MOZ_ASSERT(
310 modifier == nextToken.modifier || modifier == SlashIsInvalid,
311 "This token was scanned with both SlashIsRegExp and SlashIsDiv, "
312 "indicating the parser is confused about how to handle a slash here. "
313 "See comment at Token::Modifier.");
314 }
315 };
316
317 static_assert(std::is_empty_v<TokenStreamShared>,
318 "TokenStreamShared shouldn't bloat classes that inherit from it");
319
320 template <typename Unit, class AnyCharsAccess>
321 class TokenStreamSpecific;
322
323 template <typename Unit>
324 class MOZ_STACK_CLASS TokenStreamPosition final {
325 public:
326 template <class AnyCharsAccess>
327 inline explicit TokenStreamPosition(
328 TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream);
329
330 private:
331 TokenStreamPosition(const TokenStreamPosition&) = delete;
332
333 // Technically only TokenStreamSpecific<Unit, AnyCharsAccess>::seek with
334 // Unit constant and AnyCharsAccess varying must be friended, but 1) it's
335 // hard to friend one function in template classes, and 2) C++ doesn't
336 // allow partial friend specialization to target just that single class.
337 template <typename Char, class AnyCharsAccess>
338 friend class TokenStreamSpecific;
339
340 const Unit* buf;
341 TokenStreamFlags flags;
342 unsigned lineno;
343 size_t linebase;
344 size_t prevLinebase;
345 Token currentToken;
346 unsigned lookahead;
347 Token lookaheadTokens[TokenStreamShared::maxLookahead];
348 };
349
350 template <typename Unit>
351 class SourceUnits;
352
353 /**
354 * This class maps:
355 *
356 * * a sourceUnits offset (a 0-indexed count of code units)
357 *
358 * to
359 *
360 * * a (1-indexed) line number and
361 * * a (0-indexed) offset in code *units* (not code points, not bytes) into
362 * that line,
363 *
364 * for either |Unit = Utf8Unit| or |Unit = char16_t|.
365 *
366 * Note that the latter quantity is *not* the same as a column number, which is
367 * a count of code *points*. Computing a column number requires the offset
368 * within the line and the source units of that line (including what type |Unit|
369 * is, to know how to decode them). If you need a column number, functions in
370 * |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
371 * it.
372 */
373 class SourceCoords {
374 // For a given buffer holding source code, |lineStartOffsets_| has one
375 // element per line of source code, plus one sentinel element. Each
376 // non-sentinel element holds the buffer offset for the start of the
377 // corresponding line of source code. For this example script,
378 // assuming an initialLineOffset of 0:
379 //
380 // 1 // xyz [line starts at offset 0]
381 // 2 var x; [line starts at offset 7]
382 // 3 [line starts at offset 14]
383 // 4 var y; [line starts at offset 15]
384 //
385 // |lineStartOffsets_| is:
386 //
387 // [0, 7, 14, 15, MAX_PTR]
388 //
389 // To convert a "line number" to an "index" into |lineStartOffsets_|,
390 // subtract |initialLineNum_|. E.g. line 3's index is
391 // (3 - initialLineNum_), which is 2. Therefore lineStartOffsets_[2]
392 // holds the buffer offset for the start of line 3, which is 14. (Note
393 // that |initialLineNum_| is often 1, but not always.
394 //
395 // The first element is always initialLineOffset, passed to the
396 // constructor, and the last element is always the MAX_PTR sentinel.
397 //
398 // Offset-to-{line,offset-into-line} lookups are O(log n) in the worst
399 // case (binary search), but in practice they're heavily clustered and
400 // we do better than that by using the previous lookup's result
401 // (lastIndex_) as a starting point.
402 //
403 // Checking if an offset lies within a particular line number
404 // (isOnThisLine()) is O(1).
405 //
406 Vector<uint32_t, 128> lineStartOffsets_;
407
408 /** The line number on which the source text begins. */
409 uint32_t initialLineNum_;
410
411 /**
412 * The index corresponding to the last offset lookup -- used so that if
413 * offset lookups proceed in increasing order, and and the offset appears
414 * in the next couple lines from the last offset, we can avoid a full
415 * binary-search.
416 *
417 * This is mutable because it's modified on every search, but that fact
418 * isn't visible outside this class.
419 */
420 mutable uint32_t lastIndex_;
421
422 uint32_t indexFromOffset(uint32_t offset) const;
423
424 static const uint32_t MAX_PTR = UINT32_MAX;
425
lineNumberFromIndex(uint32_t index)426 uint32_t lineNumberFromIndex(uint32_t index) const {
427 return index + initialLineNum_;
428 }
429
indexFromLineNumber(uint32_t lineNum)430 uint32_t indexFromLineNumber(uint32_t lineNum) const {
431 return lineNum - initialLineNum_;
432 }
433
434 public:
435 SourceCoords(JSContext* cx, uint32_t initialLineNumber,
436 uint32_t initialOffset);
437
438 [[nodiscard]] bool add(uint32_t lineNum, uint32_t lineStartOffset);
439 [[nodiscard]] bool fill(const SourceCoords& other);
440
isOnThisLine(uint32_t offset,uint32_t lineNum,bool * onThisLine)441 bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const {
442 uint32_t index = indexFromLineNumber(lineNum);
443 if (index + 1 >= lineStartOffsets_.length()) { // +1 due to sentinel
444 return false;
445 }
446 *onThisLine = lineStartOffsets_[index] <= offset &&
447 offset < lineStartOffsets_[index + 1];
448 return true;
449 }
450
451 /**
452 * A token, computed for an offset in source text, that can be used to
453 * access line number and line-offset information for that offset.
454 *
455 * LineToken *alone* exposes whether the corresponding offset is in the
456 * the first line of source (which may not be 1, depending on
457 * |initialLineNumber|), and whether it's in the same line as
458 * another LineToken.
459 */
460 class LineToken {
461 uint32_t index;
462 #ifdef DEBUG
463 uint32_t offset_; // stored for consistency-of-use assertions
464 #endif
465
466 friend class SourceCoords;
467
468 public:
LineToken(uint32_t index,uint32_t offset)469 LineToken(uint32_t index, uint32_t offset)
470 : index(index)
471 #ifdef DEBUG
472 ,
473 offset_(offset)
474 #endif
475 {
476 }
477
isFirstLine()478 bool isFirstLine() const { return index == 0; }
479
isSameLine(LineToken other)480 bool isSameLine(LineToken other) const { return index == other.index; }
481
assertConsistentOffset(uint32_t offset)482 void assertConsistentOffset(uint32_t offset) const {
483 MOZ_ASSERT(offset_ == offset);
484 }
485 };
486
487 /**
488 * Compute a token usable to access information about the line at the
489 * given offset.
490 *
491 * The only information directly accessible in a token is whether it
492 * corresponds to the first line of source text (which may not be line
493 * 1, depending on the |initialLineNumber| value used to construct
494 * this). Use |lineNumber(LineToken)| to compute the actual line
495 * number (incorporating the contribution of |initialLineNumber|).
496 */
497 LineToken lineToken(uint32_t offset) const;
498
499 /** Compute the line number for the given token. */
lineNumber(LineToken lineToken)500 uint32_t lineNumber(LineToken lineToken) const {
501 return lineNumberFromIndex(lineToken.index);
502 }
503
504 /** Return the offset of the start of the line for |lineToken|. */
lineStart(LineToken lineToken)505 uint32_t lineStart(LineToken lineToken) const {
506 MOZ_ASSERT(lineToken.index + 1 < lineStartOffsets_.length(),
507 "recorded line-start information must be available");
508 return lineStartOffsets_[lineToken.index];
509 }
510 };
511
512 enum class UnitsType : unsigned char {
513 PossiblyMultiUnit = 0,
514 GuaranteedSingleUnit = 1,
515 };
516
517 class ChunkInfo {
518 private:
519 // Store everything in |unsigned char|s so everything packs.
520 unsigned char column_[sizeof(uint32_t)];
521 unsigned char unitsType_;
522
523 public:
ChunkInfo(uint32_t col,UnitsType type)524 ChunkInfo(uint32_t col, UnitsType type)
525 : unitsType_(static_cast<unsigned char>(type)) {
526 memcpy(column_, &col, sizeof(col));
527 }
528
column()529 uint32_t column() const {
530 uint32_t col;
531 memcpy(&col, column_, sizeof(uint32_t));
532 return col;
533 }
534
unitsType()535 UnitsType unitsType() const {
536 MOZ_ASSERT(unitsType_ <= 1, "unitsType_ must be 0 or 1");
537 return static_cast<UnitsType>(unitsType_);
538 }
539
guaranteeSingleUnits()540 void guaranteeSingleUnits() {
541 MOZ_ASSERT(unitsType() == UnitsType::PossiblyMultiUnit,
542 "should only be setting to possibly optimize from the "
543 "pessimistic case");
544 unitsType_ = static_cast<unsigned char>(UnitsType::GuaranteedSingleUnit);
545 }
546 };
547
// The kinds of invalid escape sequence the tokenizer distinguishes.
enum class InvalidEscapeType {
  // No invalid character escapes.
  None = 0,

  // A malformed \x escape.
  Hexadecimal = 1,

  // A malformed \u escape.
  Unicode = 2,

  // An otherwise well-formed \u escape which represents a
  // codepoint > 10FFFF.
  UnicodeOverflow = 3,

  // An octal escape in a template token.
  Octal = 4,

  // NonOctalDecimalEscape - \8 or \9.
  EightOrNine = 5,
};
563
564 class TokenStreamAnyChars : public TokenStreamShared {
565 private:
566 // Constant-at-construction fields.
567
568 JSContext* const cx;
569
570 /** Options used for parsing/tokenizing. */
571 const JS::ReadOnlyCompileOptions& options_;
572
573 /**
574 * Pointer used internally to test whether in strict mode. Use |strictMode()|
575 * instead of this field.
576 */
577 StrictModeGetter* const strictModeGetter_;
578
579 /** Input filename or null. */
580 const char* const filename_;
581
582 // Column number computation fields.
583
584 /**
585 * A map of (line number => sequence of the column numbers at
586 * |ColumnChunkLength|-unit boundaries rewound [if needed] to the nearest code
587 * point boundary). (|TokenStreamAnyChars::computePartialColumn| is the sole
588 * user of |ColumnChunkLength| and therefore contains its definition.)
589 *
590 * Entries appear in this map only when a column computation of sufficient
591 * distance is performed on a line -- and only when the column is beyond the
592 * first |ColumnChunkLength| units. Each line's vector is lazily filled as
593 * greater offsets require column computations.
594 */
595 mutable HashMap<uint32_t, Vector<ChunkInfo>> longLineColumnInfo_;
596
597 // Computing accurate column numbers requires at *some* point linearly
598 // iterating through prior source units in the line, to properly account for
599 // multi-unit code points. This is quadratic if counting happens repeatedly.
600 //
601 // But usually we need columns for advancing offsets through scripts. By
602 // caching the last ((line number, offset) => relative column) mapping (in
603 // similar manner to how |SourceCoords::lastIndex_| is used to cache
604 // (offset => line number) mappings) we can usually avoid re-iterating through
605 // the common line prefix.
606 //
607 // Additionally, we avoid hash table lookup costs by caching the
608 // |Vector<ChunkInfo>*| for the line of the last lookup. (|nullptr| means we
609 // must look it up -- or it hasn't been created yet.) This pointer is nulled
610 // when a lookup on a new line occurs, but as it's not a pointer at literal,
611 // reallocatable element data, it's *not* invalidated when new entries are
612 // added to such a vector.
613
614 /**
615 * The line in which the last column computation occurred, or UINT32_MAX if
616 * no prior computation has yet happened.
617 */
618 mutable uint32_t lineOfLastColumnComputation_ = UINT32_MAX;
619
620 /**
621 * The chunk vector of the line for that last column computation. This is
622 * null if the chunk vector needs to be recalculated or initially created.
623 */
624 mutable Vector<ChunkInfo>* lastChunkVectorForLine_ = nullptr;
625
626 /**
627 * The offset (in code units) of the last column computation performed,
628 * relative to source start.
629 */
630 mutable uint32_t lastOffsetOfComputedColumn_ = UINT32_MAX;
631
632 /**
633 * The column number for the offset (in code units) of the last column
634 * computation performed, relative to source start.
635 */
636 mutable uint32_t lastComputedColumn_ = 0;
637
638 // Intra-token fields.
639
640 /**
641 * The offset of the first invalid escape in a template literal. (If there is
642 * one -- if not, the value of this field is meaningless.)
643 *
644 * See also |invalidTemplateEscapeType|.
645 */
646 uint32_t invalidTemplateEscapeOffset = 0;
647
648 /**
649 * The type of the first invalid escape in a template literal. (If there
650 * isn't one, this will be |None|.)
651 *
652 * See also |invalidTemplateEscapeOffset|.
653 */
654 InvalidEscapeType invalidTemplateEscapeType = InvalidEscapeType::None;
655
656 // Fields with values relevant across tokens (and therefore potentially across
657 // function boundaries, such that lazy function parsing and stream-seeking
658 // must take care in saving and restoring them).
659
660 /** Line number and offset-to-line mapping information. */
661 SourceCoords srcCoords;
662
663 /** Circular token buffer of gotten tokens that have been ungotten. */
664 Token tokens[ntokens] = {};
665
666 /** The index in |tokens| of the last parsed token. */
667 unsigned cursor_ = 0;
668
669 /** The number of tokens in |tokens| available to be gotten. */
670 unsigned lookahead = 0;
671
672 /** The current line number. */
673 unsigned lineno;
674
675 /** Various flag bits (see above). */
676 TokenStreamFlags flags = {};
677
678 /** The offset of the start of the current line. */
679 size_t linebase = 0;
680
681 /** The start of the previous line, or |size_t(-1)| on the first line. */
682 size_t prevLinebase = size_t(-1);
683
684 /** The user's requested source URL. Null if none has been set. */
685 UniqueTwoByteChars displayURL_ = nullptr;
686
687 /** The URL of the source map for this script. Null if none has been set. */
688 UniqueTwoByteChars sourceMapURL_ = nullptr;
689
690 // Assorted boolean fields, none of which require maintenance across tokens,
691 // stored at class end to minimize padding.
692
693 /**
694 * Whether syntax errors should or should not contain details about the
695 * precise nature of the error. (This is intended for use in suppressing
696 * content-revealing details about syntax errors in cross-origin scripts on
697 * the web.)
698 */
699 const bool mutedErrors;
700
701 /**
702 * An array storing whether a TokenKind observed while attempting to extend
703 * a valid AssignmentExpression into an even longer AssignmentExpression
704 * (e.g., extending '3' to '3 + 5') will terminate it without error.
705 *
706 * For example, ';' always ends an AssignmentExpression because it ends a
707 * Statement or declaration. '}' always ends an AssignmentExpression
708 * because it terminates BlockStatement, FunctionBody, and embedded
709 * expressions in TemplateLiterals. Therefore both entries are set to true
710 * in TokenStreamAnyChars construction.
711 *
712 * But e.g. '+' *could* extend an AssignmentExpression, so its entry here
713 * is false. Meanwhile 'this' can't extend an AssignmentExpression, but
714 * it's only valid after a line break, so its entry here must be false.
715 *
716 * NOTE: This array could be static, but without C99's designated
717 * initializers it's easier zeroing here and setting the true entries
718 * in the constructor body. (Having this per-instance might also aid
719 * locality.) Don't worry! Initialization time for each TokenStream
720 * is trivial. See bug 639420.
721 */
722 bool isExprEnding[size_t(TokenKind::Limit)] = {}; // all-false initially
723
724 // End of fields.
725
726 public:
727 TokenStreamAnyChars(JSContext* cx, const JS::ReadOnlyCompileOptions& options,
728 StrictModeGetter* smg);
729
730 template <typename Unit, class AnyCharsAccess>
731 friend class GeneralTokenStreamChars;
732 template <typename Unit, class AnyCharsAccess>
733 friend class TokenStreamChars;
734 template <typename Unit, class AnyCharsAccess>
735 friend class TokenStreamSpecific;
736
737 template <typename Unit>
738 friend class TokenStreamPosition;
739
740 // Accessors.
cursor()741 unsigned cursor() const { return cursor_; }
nextCursor()742 unsigned nextCursor() const { return (cursor_ + 1) & ntokensMask; }
aheadCursor(unsigned steps)743 unsigned aheadCursor(unsigned steps) const {
744 return (cursor_ + steps) & ntokensMask;
745 }
746
currentToken()747 const Token& currentToken() const { return tokens[cursor()]; }
isCurrentTokenType(TokenKind type)748 bool isCurrentTokenType(TokenKind type) const {
749 return currentToken().type == type;
750 }
751
752 [[nodiscard]] bool checkOptions();
753
754 private:
755 TaggedParserAtomIndex reservedWordToPropertyName(TokenKind tt) const;
756
757 public:
currentName()758 TaggedParserAtomIndex currentName() const {
759 if (isCurrentTokenType(TokenKind::Name) ||
760 isCurrentTokenType(TokenKind::PrivateName)) {
761 return currentToken().name();
762 }
763
764 MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
765 return reservedWordToPropertyName(currentToken().type);
766 }
767
currentNameHasEscapes(ParserAtomsTable & parserAtoms)768 bool currentNameHasEscapes(ParserAtomsTable& parserAtoms) const {
769 if (isCurrentTokenType(TokenKind::Name) ||
770 isCurrentTokenType(TokenKind::PrivateName)) {
771 TokenPos pos = currentToken().pos;
772 return (pos.end - pos.begin) != parserAtoms.length(currentToken().name());
773 }
774
775 MOZ_ASSERT(TokenKindIsPossibleIdentifierName(currentToken().type));
776 return false;
777 }
778
isCurrentTokenAssignment()779 bool isCurrentTokenAssignment() const {
780 return TokenKindIsAssignment(currentToken().type);
781 }
782
783 // Flag methods.
isEOF()784 bool isEOF() const { return flags.isEOF; }
hadError()785 bool hadError() const { return flags.hadError; }
786
sawDeprecatedContent()787 DeprecatedContent sawDeprecatedContent() const {
788 return static_cast<DeprecatedContent>(flags.sawDeprecatedContent);
789 }
790
791 private:
792 // Workaround GCC 7 sadness.
setSawDeprecatedContent(DeprecatedContent content)793 void setSawDeprecatedContent(DeprecatedContent content) {
794 flags.sawDeprecatedContent = static_cast<uint8_t>(content);
795 }
796
797 public:
clearSawDeprecatedContent()798 void clearSawDeprecatedContent() {
799 setSawDeprecatedContent(DeprecatedContent::None);
800 }
setSawDeprecatedOctalLiteral()801 void setSawDeprecatedOctalLiteral() {
802 setSawDeprecatedContent(DeprecatedContent::OctalLiteral);
803 }
setSawDeprecatedOctalEscape()804 void setSawDeprecatedOctalEscape() {
805 setSawDeprecatedContent(DeprecatedContent::OctalEscape);
806 }
setSawDeprecatedEightOrNineEscape()807 void setSawDeprecatedEightOrNineEscape() {
808 setSawDeprecatedContent(DeprecatedContent::EightOrNineEscape);
809 }
810
hasInvalidTemplateEscape()811 bool hasInvalidTemplateEscape() const {
812 return invalidTemplateEscapeType != InvalidEscapeType::None;
813 }
clearInvalidTemplateEscape()814 void clearInvalidTemplateEscape() {
815 invalidTemplateEscapeType = InvalidEscapeType::None;
816 }
817
818 private:
819 // This is private because it should only be called by the tokenizer while
820 // tokenizing not by, for example, BytecodeEmitter.
strictMode()821 bool strictMode() const {
822 return strictModeGetter_ && strictModeGetter_->strictMode();
823 }
824
setInvalidTemplateEscape(uint32_t offset,InvalidEscapeType type)825 void setInvalidTemplateEscape(uint32_t offset, InvalidEscapeType type) {
826 MOZ_ASSERT(type != InvalidEscapeType::None);
827 if (invalidTemplateEscapeType != InvalidEscapeType::None) {
828 return;
829 }
830 invalidTemplateEscapeOffset = offset;
831 invalidTemplateEscapeType = type;
832 }
833
834 public:
835 // Call this immediately after parsing an OrExpression to allow scanning the
836 // next token with SlashIsRegExp without asserting (even though we just
837 // peeked at it in SlashIsDiv mode).
838 //
839 // It's OK to disable the assertion because the places where this is called
840 // have peeked at the next token in SlashIsDiv mode, and checked that it is
841 // *not* a Div token.
842 //
843 // To see why it is necessary to disable the assertion, consider these two
844 // programs:
845 //
846 // x = arg => q // per spec, this is all one statement, and the
847 // /a/g; // slashes are division operators
848 //
849 // x = arg => {} // per spec, ASI at the end of this line
850 // /a/g; // and that's a regexp literal
851 //
852 // The first program shows why orExpr() has use SlashIsDiv mode when peeking
853 // ahead for the next operator after parsing `q`. The second program shows
854 // why matchOrInsertSemicolon() must use SlashIsRegExp mode when scanning
855 // ahead for a semicolon.
allowGettingNextTokenWithSlashIsRegExp()856 void allowGettingNextTokenWithSlashIsRegExp() {
857 #ifdef DEBUG
858 // Check the precondition: Caller already peeked ahead at the next token,
859 // in SlashIsDiv mode, and it is *not* a Div token.
860 MOZ_ASSERT(hasLookahead());
861 const Token& next = nextToken();
862 MOZ_ASSERT(next.modifier == SlashIsDiv);
863 MOZ_ASSERT(next.type != TokenKind::Div);
864 tokens[nextCursor()].modifier = SlashIsRegExp;
865 #endif
866 }
867
#ifdef DEBUG
  /** DEBUG-only: true iff no token is currently buffered as lookahead. */
  inline bool debugHasNoLookahead() const {
    return !(lookahead != 0);
  }
#endif
871
hasDisplayURL()872 bool hasDisplayURL() const { return displayURL_ != nullptr; }
873
displayURL()874 char16_t* displayURL() { return displayURL_.get(); }
875
hasSourceMapURL()876 bool hasSourceMapURL() const { return sourceMapURL_ != nullptr; }
877
sourceMapURL()878 char16_t* sourceMapURL() { return sourceMapURL_.get(); }
879
context()880 JSContext* context() const { return cx; }
881
882 using LineToken = SourceCoords::LineToken;
883
lineToken(uint32_t offset)884 LineToken lineToken(uint32_t offset) const {
885 return srcCoords.lineToken(offset);
886 }
887
lineNumber(LineToken lineToken)888 uint32_t lineNumber(LineToken lineToken) const {
889 return srcCoords.lineNumber(lineToken);
890 }
891
lineStart(LineToken lineToken)892 uint32_t lineStart(LineToken lineToken) const {
893 return srcCoords.lineStart(lineToken);
894 }
895
  /**
   * Fill in |err| for an error at |offset|.
   *
   * If the token stream doesn't have location info for this error, use the
   * caller's location (including line/column number) and return false.  (No
   * line of context is set.)
   *
   * Otherwise fill in everything in |err| except 1) line/column numbers and
   * 2) line-of-context-related fields, and return true.  In that case the
   * caller *must* fill in the line/column number; filling the line of context
   * is optional.
   *
   * Note that the return value distinguishes these two cases; it is not a
   * success/failure indicator.
   */
  bool fillExceptingContext(ErrorMetadata* err, uint32_t offset);
908
updateFlagsForEOL()909 MOZ_ALWAYS_INLINE void updateFlagsForEOL() { flags.isDirtyLine = false; }
910
911 private:
  /**
   * Compute the "partial" column number in Unicode code points of the absolute
   * |offset| within source text on the line of |lineToken| (which must have
   * been computed from |offset|).
   *
   * A partial column number on a line that isn't the first line is just the
   * actual column number.  But a partial column number on the first line is the
   * column number *ignoring the initial line/column of the script*.  For
   * example, consider this HTML with line/column number keys:
   *
   *                 1         2            3
   *       0123456789012345678901234   567890
   *     ------------------------------------
   *   1 | <html>
   *   2 | <head>
   *   3 |   <script>var x = 3;  x &lt; 4;
   *   4 | const y = 7;</script>
   *   5 | </head>
   *   6 | <body></body>
   *   7 | </html>
   *
   * The script would be compiled specifying initial (line, column) of (3, 10)
   * using |JS::ReadOnlyCompileOptions::{lineno,column}|.  And the column
   * reported by |computeColumn| for the "v" of |var| would be 10.  But the
   * partial column number of the "v" in |var|, that this function returns,
   * would be 0.  On the other hand, the column reported by |computeColumn| and
   * the partial column number returned by this function for the "c" in |const|
   * would both be 0, because it's not in the first line of source text.
   *
   * The partial column is with respect *only* to the JavaScript source text as
   * SpiderMonkey sees it.  In the example, the "&lt;" is converted to "<" by
   * the browser before SpiderMonkey would see it.  So the partial column of the
   * "4" in the inequality would be 16, not 19.  (The column key above leaves a
   * three-character gap after column 24 for exactly this reason.)
   *
   * Code points are not all equal length, so counting requires *some* kind of
   * linear-time counting from the start of the line.  This function attempts
   * various tricks to reduce this cost.  If these optimizations succeed,
   * repeated calls to this function on a line will pay a one-time cost linear
   * in the length of the line, then each call pays a separate constant-time
   * cost.  If the optimizations do not succeed, this function works in time
   * linear in the length of the line.
   *
   * It's unusual for a function in *this* class to be |Unit|-templated, but
   * while this operation manages |Unit|-agnostic fields in this class and in
   * |srcCoords|, it must *perform* |Unit|-sensitive computations to fill them.
   * And this is the best place to do that.
   */
  template <typename Unit>
  uint32_t computePartialColumn(const LineToken lineToken,
                                const uint32_t offset,
                                const SourceUnits<Unit>& sourceUnits) const;
963
  /**
   * Update line/column information for the start of a new line at
   * |lineStartOffset|.
   *
   * The boolean result must be checked by the caller (hence [[nodiscard]]).
   * Defined out of line.
   */
  [[nodiscard]] MOZ_ALWAYS_INLINE bool internalUpdateLineInfoForEOL(
      uint32_t lineStartOffset);
970
971 public:
nextToken()972 const Token& nextToken() const {
973 MOZ_ASSERT(hasLookahead());
974 return tokens[nextCursor()];
975 }
976
hasLookahead()977 bool hasLookahead() const { return lookahead > 0; }
978
advanceCursor()979 void advanceCursor() { cursor_ = (cursor_ + 1) & ntokensMask; }
980
retractCursor()981 void retractCursor() { cursor_ = (cursor_ - 1) & ntokensMask; }
982
allocateToken()983 Token* allocateToken() {
984 advanceCursor();
985
986 Token* tp = &tokens[cursor()];
987 MOZ_MAKE_MEM_UNDEFINED(tp, sizeof(*tp));
988
989 return tp;
990 }
991
992 // Push the last scanned token back into the stream.
ungetToken()993 void ungetToken() {
994 MOZ_ASSERT(lookahead < maxLookahead);
995 lookahead++;
996 retractCursor();
997 }
998
999 public:
adoptState(TokenStreamAnyChars & other)1000 void adoptState(TokenStreamAnyChars& other) {
1001 // If |other| has fresh information from directives, overwrite any
1002 // previously recorded directives. (There is no specification directing
1003 // that last-in-source-order directive controls, sadly. We behave this way
1004 // in the ordinary case, so we ought do so here too.)
1005 if (auto& url = other.displayURL_) {
1006 displayURL_ = std::move(url);
1007 }
1008 if (auto& url = other.sourceMapURL_) {
1009 sourceMapURL_ = std::move(url);
1010 }
1011 }
1012
  // Compute error metadata, into |err|, for an error that is not associated
  // with any offset in the source text. Defined out of line.
  void computeErrorMetadataNoOffset(ErrorMetadata* err);
1015
  // ErrorReporter API Helpers

  // Provide a minimal set of error reporting APIs, given that we cannot use
  // ErrorReportMixin here.  The "report" prefix is added to avoid conflicts
  // with ErrorReportMixin methods in the TokenStream class.
  //
  // Both report the error identified by |errorNumber| with no source offset;
  // they differ only in how the message arguments are supplied (C varargs
  // versus a |va_list*|).
  void reportErrorNoOffset(unsigned errorNumber, ...);
  void reportErrorNoOffsetVA(unsigned errorNumber, va_list* args);
1023
options()1024 const JS::ReadOnlyCompileOptions& options() const { return options_; }
1025
getFilename()1026 const char* getFilename() const { return filename_; }
1027 };
1028
/** Numeric value of a UTF-16 code unit: the unit itself. */
constexpr char16_t CodeUnitValue(char16_t unit) {
  return unit;
}
1030
CodeUnitValue(mozilla::Utf8Unit unit)1031 constexpr uint8_t CodeUnitValue(mozilla::Utf8Unit unit) {
1032 return unit.toUint8();
1033 }
1034
1035 template <typename Unit>
1036 class TokenStreamCharsBase;
1037
// Deleted catch-all: an argument whose type is not exactly one of the
// non-template overloads' parameter types selects this template rather than
// being implicitly converted, and so fails to compile.
template <typename T>
inline bool IsLineTerminator(T) = delete;
1040
IsLineTerminator(char32_t codePoint)1041 inline bool IsLineTerminator(char32_t codePoint) {
1042 return codePoint == '\n' || codePoint == '\r' ||
1043 codePoint == unicode::LINE_SEPARATOR ||
1044 codePoint == unicode::PARA_SEPARATOR;
1045 }
1046
/**
 * UTF-16 code unit flavor. Every LineTerminator fits in a single char16_t,
 * so widening the unit and asking the char32_t overload gives an exact
 * answer.
 */
inline bool IsLineTerminator(char16_t unit) {
  const auto widened = static_cast<char32_t>(unit);
  return IsLineTerminator(widened);
}
1051
1052 template <typename Unit>
1053 struct SourceUnitTraits;
1054
1055 template <>
1056 struct SourceUnitTraits<char16_t> {
1057 public:
1058 static constexpr uint8_t maxUnitsLength = 2;
1059
1060 static constexpr size_t lengthInUnits(char32_t codePoint) {
1061 return codePoint < unicode::NonBMPMin ? 1 : 2;
1062 }
1063 };
1064
1065 template <>
1066 struct SourceUnitTraits<mozilla::Utf8Unit> {
1067 public:
1068 static constexpr uint8_t maxUnitsLength = 4;
1069
1070 static constexpr size_t lengthInUnits(char32_t codePoint) {
1071 return codePoint < 0x80 ? 1
1072 : codePoint < 0x800 ? 2
1073 : codePoint < 0x10000 ? 3
1074 : 4;
1075 }
1076 };
1077
1078 /**
1079 * PeekedCodePoint represents the result of peeking ahead in some source text
1080 * to determine the next validly-encoded code point.
1081 *
1082 * If there isn't a valid code point, then |isNone()|.
1083 *
1084 * But if there *is* a valid code point, then |!isNone()|, the code point has
1085 * value |codePoint()| and its length in code units is |lengthInUnits()|.
1086 *
1087 * Conceptually, this class is |Maybe<struct { char32_t v; uint8_t len; }>|.
1088 */
1089 template <typename Unit>
1090 class PeekedCodePoint final {
1091 char32_t codePoint_ = 0;
1092 uint8_t lengthInUnits_ = 0;
1093
1094 private:
1095 using SourceUnitTraits = frontend::SourceUnitTraits<Unit>;
1096
1097 PeekedCodePoint() = default;
1098
1099 public:
1100 /**
1101 * Create a peeked code point with the given value and length in code
1102 * units.
1103 *
1104 * While the latter value is computable from the former for both UTF-8 and
1105 * JS's version of UTF-16, the caller likely computed a length in units in
1106 * the course of determining the peeked value. Passing both here avoids
1107 * recomputation and lets us do a consistency-checking assertion.
1108 */
1109 PeekedCodePoint(char32_t codePoint, uint8_t lengthInUnits)
1110 : codePoint_(codePoint), lengthInUnits_(lengthInUnits) {
1111 MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
1112 MOZ_ASSERT(lengthInUnits != 0, "bad code point length");
1113 MOZ_ASSERT(lengthInUnits == SourceUnitTraits::lengthInUnits(codePoint));
1114 }
1115
1116 /** Create a PeekedCodeUnit that represents no valid code point. */
1117 static PeekedCodePoint none() { return PeekedCodePoint(); }
1118
1119 /** True if no code point was found, false otherwise. */
1120 bool isNone() const { return lengthInUnits_ == 0; }
1121
1122 /** If a code point was found, its value. */
1123 char32_t codePoint() const {
1124 MOZ_ASSERT(!isNone());
1125 return codePoint_;
1126 }
1127
1128 /** If a code point was found, its length in code units. */
1129 uint8_t lengthInUnits() const {
1130 MOZ_ASSERT(!isNone());
1131 return lengthInUnits_;
1132 }
1133 };
1134
1135 inline PeekedCodePoint<char16_t> PeekCodePoint(const char16_t* const ptr,
1136 const char16_t* const end) {
1137 if (MOZ_UNLIKELY(ptr >= end)) {
1138 return PeekedCodePoint<char16_t>::none();
1139 }
1140
1141 char16_t lead = ptr[0];
1142
1143 char32_t c;
1144 uint8_t len;
1145 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1146 MOZ_UNLIKELY(ptr + 1 >= end || !unicode::IsTrailSurrogate(ptr[1]))) {
1147 c = lead;
1148 len = 1;
1149 } else {
1150 c = unicode::UTF16Decode(lead, ptr[1]);
1151 len = 2;
1152 }
1153
1154 return PeekedCodePoint<char16_t>(c, len);
1155 }
1156
1157 inline PeekedCodePoint<mozilla::Utf8Unit> PeekCodePoint(
1158 const mozilla::Utf8Unit* const ptr, const mozilla::Utf8Unit* const end) {
1159 if (MOZ_UNLIKELY(ptr >= end)) {
1160 return PeekedCodePoint<mozilla::Utf8Unit>::none();
1161 }
1162
1163 const mozilla::Utf8Unit lead = ptr[0];
1164 if (mozilla::IsAscii(lead)) {
1165 return PeekedCodePoint<mozilla::Utf8Unit>(lead.toUint8(), 1);
1166 }
1167
1168 const mozilla::Utf8Unit* afterLead = ptr + 1;
1169 mozilla::Maybe<char32_t> codePoint =
1170 mozilla::DecodeOneUtf8CodePoint(lead, &afterLead, end);
1171 if (codePoint.isNothing()) {
1172 return PeekedCodePoint<mozilla::Utf8Unit>::none();
1173 }
1174
1175 auto len =
1176 mozilla::AssertedCast<uint8_t>(mozilla::PointerRangeSize(ptr, afterLead));
1177 MOZ_ASSERT(len <= 4);
1178
1179 return PeekedCodePoint<mozilla::Utf8Unit>(codePoint.value(), len);
1180 }
1181
1182 inline bool IsSingleUnitLineTerminator(mozilla::Utf8Unit unit) {
1183 // BEWARE: The Unicode line/paragraph separators don't fit in a single
1184 // UTF-8 code unit, so this test is exact for Utf8Unit but inexact
1185 // for UTF-8 as a whole. Users must handle |unit| as start of a
1186 // Unicode LineTerminator themselves!
1187 return unit == mozilla::Utf8Unit('\n') || unit == mozilla::Utf8Unit('\r');
1188 }
1189
// This is the low-level interface to the JS source code buffer.  It just gets
// raw Unicode code units -- 16-bit char16_t units of source text that are not
// (always) full code points, and 8-bit units of UTF-8 source text.
// TokenStreams functions are layered on top and do some extra stuff like
// converting all EOL sequences to '\n', tracking the line number, and setting
// |flags.isEOF|.  (The "raw" in "raw Unicode code units" refers to the lack of
// EOL sequence normalization.)
//
// buf[0..length-1] often represents a substring of some larger source,
// where we have only the substring in memory.  The |startOffset| constructor
// argument indicates the offset within this larger string at which our string
// begins, the offset of |buf[0]|.
template <typename Unit>
class SourceUnits {
 private:
  /** Base of buffer. */
  const Unit* base_;

  /** Offset of base_[0]. */
  uint32_t startOffset_;

  /** Limit for quick bounds check. */
  const Unit* limit_;

  /** Next char to get. */
  const Unit* ptr;

 public:
  /**
   * Wrap the |length| code units starting at |units|, whose first unit is at
   * absolute offset |startOffset|.  Reading starts at |units|.
   */
  SourceUnits(const Unit* units, size_t length, size_t startOffset)
      : base_(units),
        startOffset_(startOffset),
        limit_(units + length),
        ptr(units) {}

  /** True iff the next code unit to get is the first one in the buffer. */
  bool atStart() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    return ptr == base_;
  }

  /** True iff no code units remain to be gotten. */
  bool atEnd() const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(ptr <= limit_, "shouldn't have overrun");
    return ptr >= limit_;
  }

  /** Number of code units between the next one to get and the limit. */
  size_t remaining() const {
    MOZ_ASSERT(!isPoisoned(),
               "can't get a count of remaining code units if poisoned");
    return mozilla::PointerRangeSize(ptr, limit_);
  }

  /** Absolute offset of the first code unit in the buffer. */
  size_t startOffset() const { return startOffset_; }

  /** Absolute offset of the next code unit to get. */
  size_t offset() const {
    return startOffset_ + mozilla::PointerRangeSize(base_, ptr);
  }

  /**
   * Pointer to the code unit at absolute offset |offset|, which must lie
   * between the start offset and the end of the buffer (inclusive).
   */
  const Unit* codeUnitPtrAt(size_t offset) const {
    MOZ_ASSERT(!isPoisoned(), "shouldn't be using if poisoned");
    MOZ_ASSERT(startOffset_ <= offset);
    MOZ_ASSERT(offset - startOffset_ <=
               mozilla::PointerRangeSize(base_, limit_));
    return base_ + (offset - startOffset_);
  }

  /** Pointer to the next code unit to get. */
  const Unit* current() const { return ptr; }

  /** Pointer just past the last code unit in the buffer. */
  const Unit* limit() const { return limit_; }

  /** The code unit just before the next one to get; must not be at start. */
  Unit previousCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't get previous code unit if poisoned");
    MOZ_ASSERT(!atStart(), "must have a previous code unit to get");
    return *(ptr - 1);
  }

  /**
   * Get the next code unit and advance past it.  No bounds check is done
   * here: the caller must know a code unit remains.
   */
  Unit getCodeUnit() {
    return *ptr++;  // this will nullptr-crash if poisoned
  }

  /**
   * Return the next code unit without advancing.  No bounds check is done
   * here: the caller must know a code unit remains.
   */
  Unit peekCodeUnit() const {
    return *ptr;  // this will nullptr-crash if poisoned
  }

  /**
   * Determine the next code point in source text.  The code point is not
   * normalized: '\r', '\n', '\u2028', and '\u2029' are returned literally.
   * If there is no next code point because |atEnd()|, or if an encoding
   * error is encountered, return a |PeekedCodePoint| that |isNone()|.
   *
   * This function does not report errors: code that attempts to get the next
   * code point must report any error.
   *
   * If a next code point is found, it may be consumed by passing it to
   * |consumeKnownCodePoint|.
   */
  PeekedCodePoint<Unit> peekCodePoint() const {
    return PeekCodePoint(ptr, limit_);
  }

 private:
#ifdef DEBUG
  /** DEBUG-only consistency check used by |consumeKnownCodePoint|. */
  void assertNextCodePoint(const PeekedCodePoint<Unit>& peeked);
#endif

 public:
  /**
   * Consume a peeked code point that |!isNone()|.
   *
   * This call DOES NOT UPDATE LINE-STATUS.  You may need to call
   * |updateLineInfoForEOL()| and |updateFlagsForEOL()| if this consumes a
   * LineTerminator.  Note that if this consumes '\r', you also must consume
   * an optional '\n' (i.e. a full LineTerminatorSequence) before doing so.
   */
  void consumeKnownCodePoint(const PeekedCodePoint<Unit>& peeked) {
    MOZ_ASSERT(!peeked.isNone());
    MOZ_ASSERT(peeked.lengthInUnits() <= remaining());

#ifdef DEBUG
    assertNextCodePoint(peeked);
#endif

    ptr += peeked.lengthInUnits();
  }

  /**
   * Match |n| hexadecimal digits and store their value in |*out|.
   *
   * Returns true and consumes the digits on success.  If fewer than |n| code
   * units remain, or any of the next |n| is not an ASCII hex digit, returns
   * false, consumes nothing, and leaves |*out| untouched.
   */
  bool matchHexDigits(uint8_t n, char16_t* out) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't peek into poisoned SourceUnits");
    MOZ_ASSERT(n <= 4, "hexdigit value can't overflow char16_t");
    if (n > remaining()) {
      return false;
    }

    char16_t v = 0;
    for (uint8_t i = 0; i < n; i++) {
      auto unit = CodeUnitValue(ptr[i]);
      if (!mozilla::IsAsciiHexDigit(unit)) {
        return false;
      }

      v = (v << 4) | mozilla::AsciiAlphanumericToNumber(unit);
    }

    *out = v;
    ptr += n;
    return true;
  }

  /**
   * Match the next |length| code units against |chars|, unit by unit.
   *
   * Returns true and consumes them if all match.  Otherwise returns false
   * with the read position restored to where it was on entry.
   */
  bool matchCodeUnits(const char* chars, uint8_t length) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't match into poisoned SourceUnits");
    if (length > remaining()) {
      return false;
    }

    const Unit* start = ptr;
    const Unit* end = ptr + length;
    while (ptr < end) {
      if (*ptr++ != Unit(*chars++)) {
        // Mismatch: rewind so that nothing appears consumed.
        ptr = start;
        return false;
      }
    }

    return true;
  }

  /** Advance past |n| code units, which must all be within the buffer. */
  void skipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= remaining(), "shouldn't skip beyond end of SourceUnits");
    ptr += n;
  }

  /** Move back |n| code units, which must not pass the buffer's start. */
  void unskipCodeUnits(uint32_t n) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(n <= mozilla::PointerRangeSize(base_, ptr),
               "shouldn't unskip beyond start of SourceUnits");
    ptr -= n;
  }

 private:
  friend class TokenStreamCharsBase<Unit>;

  /**
   * If a code unit remains and it equals |c|, consume it and return true;
   * otherwise consume nothing and return false.
   */
  bool internalMatchCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    if (MOZ_LIKELY(!atEnd()) && *ptr == c) {
      ptr++;
      return true;
    }
    return false;
  }

 public:
  /** Consume the next code unit, which the caller knows to be |c|. */
  void consumeKnownCodeUnit(Unit c) {
    MOZ_ASSERT(!isPoisoned(), "shouldn't use poisoned SourceUnits");
    MOZ_ASSERT(*ptr == c, "consuming the wrong code unit");
    ptr++;
  }

  /** Unget U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR. */
  inline void ungetLineOrParagraphSeparator();

  /** Move back one code unit; must not be at the start of the buffer. */
  void ungetCodeUnit() {
    MOZ_ASSERT(!isPoisoned(), "can't unget from poisoned units");
    MOZ_ASSERT(!atStart(), "can't unget if currently at start");
    ptr--;
  }

  /**
   * Pointer to the next code unit to get.  Unless |allowPoisoned| is passed
   * as true, this asserts that the units are not poisoned.
   */
  const Unit* addressOfNextCodeUnit(bool allowPoisoned = false) const {
    MOZ_ASSERT_IF(!allowPoisoned, !isPoisoned());
    return ptr;
  }

  // Use this with caution!
  //
  // Directly set the pointer to the next code unit to get.  No check is made
  // that |a| lies within the buffer; unless |allowPoisoned| is true, |a| must
  // be non-null.
  void setAddressOfNextCodeUnit(const Unit* a, bool allowPoisoned = false) {
    MOZ_ASSERT_IF(!allowPoisoned, a);
    ptr = a;
  }

  // Poison the SourceUnits so they can't be accessed again.  (This only has
  // an effect in DEBUG builds.)
  void poisonInDebug() {
#ifdef DEBUG
    ptr = nullptr;
#endif
  }

 private:
  /** Whether |poisonInDebug| has taken effect.  Always false outside DEBUG. */
  bool isPoisoned() const {
#ifdef DEBUG
    // |ptr| can be null for unpoisoned SourceUnits if this was initialized with
    // |units == nullptr| and |length == 0|.  In that case, for lack of any
    // better options, consider this to not be poisoned.
    return ptr == nullptr && ptr != limit_;
#else
    return false;
#endif
  }

 public:
  /**
   * Consume the rest of a single-line comment (but not the EOL/EOF that
   * terminates it).
   *
   * If an encoding error is encountered -- possible only for UTF-8 because
   * JavaScript's conception of UTF-16 encompasses any sequence of 16-bit
   * code units -- valid code points prior to the encoding error are consumed
   * and subsequent invalid code units are not consumed.  For example, given
   * these UTF-8 code units:
   *
   *   'B'   'A'  'D'  ':'   <bad code unit sequence>
   *   0x42  0x41 0x44 0x3A  0xD0 0x00 ...
   *
   * the first four code units are consumed, but 0xD0 and 0x00 are not
   * consumed because 0xD0 encodes a two-byte lead unit but 0x00 is not a
   * valid trailing code unit.
   *
   * It is expected that the caller will report such an encoding error when
   * it attempts to consume the next code point.
   */
  void consumeRestOfSingleLineComment();

  /**
   * The maximum radius of code around the location of an error that should
   * be included in a syntax error message -- this many code units to either
   * side.  The resulting window of data is then accordingly trimmed so that
   * the window contains only validly-encoded data.
   *
   * Because this number is the same for both UTF-8 and UTF-16, windows in
   * UTF-8 may contain fewer code points than windows in UTF-16.  As we only
   * use this for error messages, we don't particularly care.
   */
  static constexpr size_t WindowRadius = ErrorMetadata::lineOfContextRadius;

  /**
   * From absolute offset |offset|, search backward to find an absolute
   * offset within source text, no further than |WindowRadius| code units
   * away from |offset|, such that all code points from that offset to
   * |offset| are valid, non-LineTerminator code points.
   */
  size_t findWindowStart(size_t offset) const;

  /**
   * From absolute offset |offset|, find an absolute offset within source
   * text, no further than |WindowRadius| code units away from |offset|, such
   * that all code units from |offset| to that offset are valid,
   * non-LineTerminator code points.
   */
  size_t findWindowEnd(size_t offset) const;

  /**
   * Given a |window| of |encodingSpecificWindowLength| units encoding valid
   * Unicode text, with index |encodingSpecificTokenOffset| indicating a
   * particular code point boundary in |window|, compute the corresponding
   * token offset and length if |window| were encoded in UTF-16.  For
   * example:
   *
   *   // U+03C0 GREEK SMALL LETTER PI is encoded as 0xCF 0x80.
   *   const Utf8Unit* encodedWindow =
   *     reinterpret_cast<const Utf8Unit*>(u8"ππππ = @ FAIL");
   *   size_t encodedTokenOffset = 11; // 2 * 4 + ' = '.length
   *   size_t encodedWindowLength = 17; // 2 * 4 + ' = @ FAIL'.length
   *   size_t utf16Offset, utf16Length;
   *   computeWindowOffsetAndLength(encodedWindow,
   *                                encodedTokenOffset, &utf16Offset,
   *                                encodedWindowLength, &utf16Length);
   *   MOZ_ASSERT(utf16Offset == 7);
   *   MOZ_ASSERT(utf16Length == 13);
   *
   * This function asserts if called for UTF-16: the sole caller can avoid
   * computing UTF-16 offsets when they're definitely the same as the encoded
   * offsets.
   */
  inline void computeWindowOffsetAndLength(const Unit* encodeWindow,
                                           size_t encodingSpecificTokenOffset,
                                           size_t* utf16TokenOffset,
                                           size_t encodingSpecificWindowLength,
                                           size_t* utf16WindowLength);
};
1506
1507 template <>
1508 inline void SourceUnits<char16_t>::ungetLineOrParagraphSeparator() {
1509 #ifdef DEBUG
1510 char16_t prev = previousCodeUnit();
1511 #endif
1512 MOZ_ASSERT(prev == unicode::LINE_SEPARATOR ||
1513 prev == unicode::PARA_SEPARATOR);
1514
1515 ungetCodeUnit();
1516 }
1517
1518 template <>
1519 inline void SourceUnits<mozilla::Utf8Unit>::ungetLineOrParagraphSeparator() {
1520 unskipCodeUnits(3);
1521
1522 MOZ_ASSERT(ptr[0].toUint8() == 0xE2);
1523 MOZ_ASSERT(ptr[1].toUint8() == 0x80);
1524
1525 #ifdef DEBUG
1526 uint8_t last = ptr[2].toUint8();
1527 #endif
1528 MOZ_ASSERT(last == 0xA8 || last == 0xA9);
1529 }
1530
/**
 * An all-purpose buffer type for accumulating text during tokenizing.
 *
 * In principle we could make this buffer contain |char16_t|, |Utf8Unit|, or
 * |Unit|.  We use |char16_t| because:
 *
 *   * we don't have a UTF-8 regular expression parser, so in general regular
 *     expression text must be copied to a separate UTF-16 buffer to parse it,
 *     and
 *   * |TokenStreamCharsShared::copyCharBufferTo|, which copies a shared
 *     |CharBuffer| to a |char16_t*|, is simpler if it doesn't have to convert.
 *
 * NOTE(review): the |32| is presumably the vector's inline capacity in
 * elements -- confirm against the |Vector| template's documentation.
 */
using CharBuffer = Vector<char16_t, 32>;
1544
/**
 * Append the provided code point (in the range [U+0000, U+10FFFF], surrogate
 * code points included) to the buffer.
 *
 * The boolean result must be checked by the caller (hence [[nodiscard]]);
 * presumably false means the append failed -- confirm against the definition.
 */
[[nodiscard]] extern bool AppendCodePointToCharBuffer(CharBuffer& charBuffer,
                                                      uint32_t codePoint);
1551
/**
 * Accumulate the range of UTF-16 text [cur, end) (lone surrogates permitted,
 * because JS allows them in source text) into |charBuffer|.  Normalize '\r',
 * '\n', and "\r\n" into '\n'.
 *
 * The boolean result must be checked by the caller (hence [[nodiscard]]).
 */
[[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
    CharBuffer& charBuffer, const char16_t* cur, const char16_t* end);
1559
/**
 * Accumulate the range of previously-validated UTF-8 text [cur, end) into
 * |charBuffer|.  Normalize '\r', '\n', and "\r\n" into '\n'.
 *
 * The boolean result must be checked by the caller (hence [[nodiscard]]).
 */
[[nodiscard]] extern bool FillCharBufferFromSourceNormalizingAsciiLineBreaks(
    CharBuffer& charBuffer, const mozilla::Utf8Unit* cur,
    const mozilla::Utf8Unit* end);
1567
1568 class TokenStreamCharsShared {
1569 protected:
1570 JSContext* cx;
1571
1572 /**
1573 * Buffer transiently used to store sequences of identifier or string code
1574 * points when such can't be directly processed from the original source
1575 * text (e.g. because it contains escapes).
1576 */
1577 CharBuffer charBuffer;
1578
1579 /** Information for parsing with a lifetime longer than the parser itself. */
1580 ParserAtomsTable* parserAtoms;
1581
1582 protected:
1583 explicit TokenStreamCharsShared(JSContext* cx, ParserAtomsTable* parserAtoms)
1584 : cx(cx), charBuffer(cx), parserAtoms(parserAtoms) {}
1585
1586 [[nodiscard]] bool copyCharBufferTo(
1587 JSContext* cx, UniquePtr<char16_t[], JS::FreePolicy>* destination);
1588
1589 /**
1590 * Determine whether a code unit constitutes a complete ASCII code point.
1591 * (The code point's exact value might not be used, however, if subsequent
1592 * code observes that |unit| is part of a LineTerminatorSequence.)
1593 */
1594 [[nodiscard]] static constexpr MOZ_ALWAYS_INLINE bool isAsciiCodePoint(
1595 int32_t unit) {
1596 return mozilla::IsAscii(static_cast<char32_t>(unit));
1597 }
1598
1599 TaggedParserAtomIndex drainCharBufferIntoAtom() {
1600 // Add to parser atoms table.
1601 auto atom = this->parserAtoms->internChar16(cx, charBuffer.begin(),
1602 charBuffer.length());
1603 charBuffer.clear();
1604 return atom;
1605 }
1606
1607 protected:
1608 void adoptState(TokenStreamCharsShared& other) {
1609 // The other stream's buffer may contain information for a
1610 // gotten-then-ungotten token, that we must transfer into this stream so
1611 // that token's final get behaves as desired.
1612 charBuffer = std::move(other.charBuffer);
1613 }
1614
1615 public:
1616 CharBuffer& getCharBuffer() { return charBuffer; }
1617 };
1618
1619 template <typename Unit>
1620 class TokenStreamCharsBase : public TokenStreamCharsShared {
1621 protected:
1622 using SourceUnits = frontend::SourceUnits<Unit>;
1623
1624 /** Code units in the source code being tokenized. */
1625 SourceUnits sourceUnits;
1626
1627 // End of fields.
1628
1629 protected:
1630 TokenStreamCharsBase(JSContext* cx, ParserAtomsTable* parserAtoms,
1631 const Unit* units, size_t length, size_t startOffset);
1632
1633 /**
1634 * Convert a non-EOF code unit returned by |getCodeUnit()| or
1635 * |peekCodeUnit()| to a Unit code unit.
1636 */
1637 inline Unit toUnit(int32_t codeUnitValue);
1638
1639 void ungetCodeUnit(int32_t c) {
1640 if (c == EOF) {
1641 return;
1642 }
1643
1644 sourceUnits.ungetCodeUnit();
1645 }
1646
1647 MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1648 atomizeSourceChars(mozilla::Span<const Unit> units);
1649
1650 /**
1651 * Try to match a non-LineTerminator ASCII code point. Return true iff it
1652 * was matched.
1653 */
1654 bool matchCodeUnit(char expect) {
1655 MOZ_ASSERT(mozilla::IsAscii(expect));
1656 MOZ_ASSERT(expect != '\r');
1657 MOZ_ASSERT(expect != '\n');
1658 return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
1659 }
1660
1661 /**
1662 * Try to match an ASCII LineTerminator code point. Return true iff it was
1663 * matched.
1664 */
1665 bool matchLineTerminator(char expect) {
1666 MOZ_ASSERT(expect == '\r' || expect == '\n');
1667 return this->sourceUnits.internalMatchCodeUnit(Unit(expect));
1668 }
1669
1670 template <typename T>
1671 bool matchCodeUnit(T) = delete;
1672 template <typename T>
1673 bool matchLineTerminator(T) = delete;
1674
1675 int32_t peekCodeUnit() {
1676 return MOZ_LIKELY(!sourceUnits.atEnd())
1677 ? CodeUnitValue(sourceUnits.peekCodeUnit())
1678 : EOF;
1679 }
1680
1681 /** Consume a known, non-EOF code unit. */
1682 inline void consumeKnownCodeUnit(int32_t unit);
1683
1684 // Forbid accidental calls to consumeKnownCodeUnit *not* with the single
1685 // unit-or-EOF type. Unit should use SourceUnits::consumeKnownCodeUnit;
1686 // CodeUnitValue() results should go through toUnit(), or better yet just
1687 // use the original Unit.
1688 template <typename T>
1689 inline void consumeKnownCodeUnit(T) = delete;
1690
1691 /**
1692 * Add a null-terminated line of context to error information, for the line
1693 * in |sourceUnits| that contains |offset|. Also record the window's
1694 * length and the offset of the error in the window. (Don't bother adding
1695 * a line of context if it would be empty.)
1696 *
1697 * The window will contain no LineTerminators of any kind, and it will not
1698 * extend more than |SourceUnits::WindowRadius| to either side of |offset|,
1699 * nor into the previous or next lines.
1700 *
1701 * This function is quite internal, and you probably should be calling one
1702 * of its existing callers instead.
1703 */
1704 [[nodiscard]] bool addLineOfContext(ErrorMetadata* err, uint32_t offset);
1705 };
1706
1707 template <>
1708 inline char16_t TokenStreamCharsBase<char16_t>::toUnit(int32_t codeUnitValue) {
1709 MOZ_ASSERT(codeUnitValue != EOF, "EOF is not a Unit");
1710 return mozilla::AssertedCast<char16_t>(codeUnitValue);
1711 }
1712
1713 template <>
1714 inline mozilla::Utf8Unit TokenStreamCharsBase<mozilla::Utf8Unit>::toUnit(
1715 int32_t value) {
1716 MOZ_ASSERT(value != EOF, "EOF is not a Unit");
1717 return mozilla::Utf8Unit(mozilla::AssertedCast<unsigned char>(value));
1718 }
1719
1720 template <typename Unit>
1721 inline void TokenStreamCharsBase<Unit>::consumeKnownCodeUnit(int32_t unit) {
1722 sourceUnits.consumeKnownCodeUnit(toUnit(unit));
1723 }
1724
1725 template <>
1726 MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1727 TokenStreamCharsBase<char16_t>::atomizeSourceChars(
1728 mozilla::Span<const char16_t> units) {
1729 return this->parserAtoms->internChar16(cx, units.data(), units.size());
1730 }
1731
1732 template <>
1733 /* static */ MOZ_ALWAYS_INLINE TaggedParserAtomIndex
1734 TokenStreamCharsBase<mozilla::Utf8Unit>::atomizeSourceChars(
1735 mozilla::Span<const mozilla::Utf8Unit> units) {
1736 return this->parserAtoms->internUtf8(cx, units.data(), units.size());
1737 }
1738
1739 template <typename Unit>
1740 class SpecializedTokenStreamCharsBase;
1741
1742 template <>
1743 class SpecializedTokenStreamCharsBase<char16_t>
1744 : public TokenStreamCharsBase<char16_t> {
1745 using CharsBase = TokenStreamCharsBase<char16_t>;
1746
1747 protected:
1748 using TokenStreamCharsShared::isAsciiCodePoint;
1749 // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
1750
1751 using typename CharsBase::SourceUnits;
1752
1753 protected:
1754 // These APIs are only usable by UTF-16-specific code.
1755
1756 /**
1757 * Given |lead| already consumed, consume and return the code point encoded
1758 * starting from it. Infallible because lone surrogates in JS encode a
1759 * "code point" of the same value.
1760 */
1761 char32_t infallibleGetNonAsciiCodePointDontNormalize(char16_t lead) {
1762 MOZ_ASSERT(!isAsciiCodePoint(lead));
1763 MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == lead);
1764
1765 // Handle single-unit code points and lone trailing surrogates.
1766 if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead)) ||
1767 // Or handle lead surrogates not paired with trailing surrogates.
1768 MOZ_UNLIKELY(
1769 this->sourceUnits.atEnd() ||
1770 !unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
1771 return lead;
1772 }
1773
1774 // Otherwise it's a multi-unit code point.
1775 return unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
1776 }
1777
1778 protected:
1779 // These APIs are in both SpecializedTokenStreamCharsBase specializations
1780 // and so are usable in subclasses no matter what Unit is.
1781
1782 using CharsBase::CharsBase;
1783 };
1784
/**
 * UTF-8 specialization.  Beyond what TokenStreamCharsBase provides, it defines
 * an iterator/sentinel pair (|SourceUnitsIterator| and |SourceUnitsEnd|) that
 * presents |SourceUnits| through iterator-style operators.
 */
template <>
class SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>
    : public TokenStreamCharsBase<mozilla::Utf8Unit> {
  using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;

 protected:
  // Deliberately don't |using| |sourceUnits| because of bug 1472569.  :-(

 protected:
  // These APIs are only usable by UTF-8-specific code.

  using typename CharsBase::SourceUnits;

  /**
   * A mutable iterator-wrapper around |SourceUnits| that translates
   * operators to calls to |SourceUnits::getCodeUnit()| and similar.
   *
   * This class is expected to be used in concert with |SourceUnitsEnd|.
   */
  class SourceUnitsIterator {
    // The wrapped source units.  Held by reference, so every copy of this
    // iterator acts on the same underlying |SourceUnits|.
    SourceUnits& sourceUnits_;
#ifdef DEBUG
    // In iterator copies created by the post-increment operator, a pointer
    // at the next source text code unit when the post-increment operator
    // was called, cleared when the iterator is dereferenced.
    mutable mozilla::Maybe<const mozilla::Utf8Unit*>
        currentBeforePostIncrement_;
#endif

   public:
    explicit SourceUnitsIterator(SourceUnits& sourceUnits)
        : sourceUnits_(sourceUnits) {}

    /**
     * Return the code unit just consumed by the |operator++(int)| call that
     * created this iterator copy, i.e. |sourceUnits_.previousCodeUnit()|.
     */
    mozilla::Utf8Unit operator*() const {
      // operator* is expected to get the *next* value from an iterator
      // not pointing at the end of the underlying range.  However, the
      // sole use of this is in the context of an expression of the form
      // |*iter++|, that performed the |sourceUnits_.getCodeUnit()| in
      // the |operator++(int)| below -- so dereferencing acts on a
      // |sourceUnits_| already advanced.  Therefore the correct unit to
      // return is the previous one.
      MOZ_ASSERT(currentBeforePostIncrement_.value() + 1 ==
                 sourceUnits_.current());
#ifdef DEBUG
      // Mark this copy as dereferenced (debug-only bookkeeping).
      currentBeforePostIncrement_.reset();
#endif
      return sourceUnits_.previousCodeUnit();
    }

    /**
     * Consume one code unit from the underlying |SourceUnits| and return a
     * copy of this iterator made beforehand.  In DEBUG builds the copy
     * records the pre-increment position so |operator*| can verify it.
     */
    SourceUnitsIterator operator++(int) {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");

      SourceUnitsIterator copy = *this;
#ifdef DEBUG
      // Record the position *before* advancing; only the copy carries it.
      copy.currentBeforePostIncrement_.emplace(sourceUnits_.current());
#endif

      sourceUnits_.getCodeUnit();
      return copy;
    }

    /** Move the underlying |SourceUnits| back by |n| code units. */
    void operator-=(size_t n) {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
      sourceUnits_.unskipCodeUnits(n);
    }

    /**
     * Return the previous code unit.  The only supported |index| is -1, as
     * asserted below.
     */
    mozilla::Utf8Unit operator[](ptrdiff_t index) {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
      MOZ_ASSERT(index == -1,
                 "must only be called to verify the value of the "
                 "previous code unit");
      return sourceUnits_.previousCodeUnit();
    }

    /** Return |sourceUnits_.remaining()|. */
    size_t remaining() const {
      MOZ_ASSERT(currentBeforePostIncrement_.isNothing(),
                 "the only valid operation on a post-incremented "
                 "iterator is dereferencing a single time");
      return sourceUnits_.remaining();
    }
  };

  /** A sentinel representing the end of |SourceUnits| data. */
  class SourceUnitsEnd {};

  // Subtracting an iterator from the end sentinel; a friend so that the
  // namespace-scope operator can name the protected nested types above.
  friend inline size_t operator-(const SourceUnitsEnd& aEnd,
                                 const SourceUnitsIterator& aIter);

 protected:
  // These APIs are in both SpecializedTokenStreamCharsBase specializations
  // and so are usable in subclasses no matter what Unit is.

  using CharsBase::CharsBase;
};
1885
1886 inline size_t operator-(const SpecializedTokenStreamCharsBase<
1887 mozilla::Utf8Unit>::SourceUnitsEnd& aEnd,
1888 const SpecializedTokenStreamCharsBase<
1889 mozilla::Utf8Unit>::SourceUnitsIterator& aIter) {
1890 return aIter.remaining();
1891 }
1892
/** Small value type that captures the offset at which a Token starts. */
class TokenStart {
 public:
  /**
   * Record |sourceUnits.offset() + adjust| as the start offset.  For
   * instance, an |adjust| of -1 designates the code unit one before the
   * current offset of |sourceUnits|.
   */
  template <class SourceUnits>
  TokenStart(const SourceUnits& sourceUnits, ptrdiff_t adjust)
      : startOffset_(sourceUnits.offset() + adjust) {}

  TokenStart(const TokenStart&) = default;

  /** The recorded start offset. */
  uint32_t offset() const { return startOffset_; }

 private:
  uint32_t startOffset_;
};
1911
1912 template <typename Unit, class AnyCharsAccess>
1913 class GeneralTokenStreamChars : public SpecializedTokenStreamCharsBase<Unit> {
1914 using CharsBase = TokenStreamCharsBase<Unit>;
1915 using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
1916
1917 using LineToken = TokenStreamAnyChars::LineToken;
1918
1919 private:
1920 Token* newTokenInternal(TokenKind kind, TokenStart start, TokenKind* out);
1921
1922 /**
1923 * Allocates a new Token from the given offset to the current offset,
1924 * ascribes it the given kind, and sets |*out| to that kind.
1925 */
1926 Token* newToken(TokenKind kind, TokenStart start,
1927 TokenStreamShared::Modifier modifier, TokenKind* out) {
1928 Token* token = newTokenInternal(kind, start, out);
1929
1930 #ifdef DEBUG
1931 // Save the modifier used to get this token, so that if an ungetToken()
1932 // occurs and then the token is re-gotten (or peeked, etc.), we can
1933 // assert both gets used compatible modifiers.
1934 token->modifier = modifier;
1935 #endif
1936
1937 return token;
1938 }
1939
1940 uint32_t matchUnicodeEscape(uint32_t* codePoint);
1941 uint32_t matchExtendedUnicodeEscape(uint32_t* codePoint);
1942
1943 protected:
1944 using CharsBase::addLineOfContext;
1945 using CharsBase::matchCodeUnit;
1946 using CharsBase::matchLineTerminator;
1947 using TokenStreamCharsShared::drainCharBufferIntoAtom;
1948 using TokenStreamCharsShared::isAsciiCodePoint;
1949 // Deliberately don't |using CharsBase::sourceUnits| because of bug 1472569.
1950 // :-(
1951 using CharsBase::toUnit;
1952
1953 using typename CharsBase::SourceUnits;
1954
1955 protected:
1956 using SpecializedCharsBase::SpecializedCharsBase;
1957
1958 TokenStreamAnyChars& anyCharsAccess() {
1959 return AnyCharsAccess::anyChars(this);
1960 }
1961
1962 const TokenStreamAnyChars& anyCharsAccess() const {
1963 return AnyCharsAccess::anyChars(this);
1964 }
1965
1966 using TokenStreamSpecific =
1967 frontend::TokenStreamSpecific<Unit, AnyCharsAccess>;
1968
1969 TokenStreamSpecific* asSpecific() {
1970 static_assert(
1971 std::is_base_of_v<GeneralTokenStreamChars, TokenStreamSpecific>,
1972 "static_cast below presumes an inheritance relationship");
1973
1974 return static_cast<TokenStreamSpecific*>(this);
1975 }
1976
1977 protected:
1978 /**
1979 * Compute the column number in Unicode code points of the absolute |offset|
1980 * within source text on the line corresponding to |lineToken|.
1981 *
1982 * |offset| must be a code point boundary, preceded only by validly-encoded
1983 * source units. (It doesn't have to be *followed* by valid source units.)
1984 */
1985 uint32_t computeColumn(LineToken lineToken, uint32_t offset) const;
1986 void computeLineAndColumn(uint32_t offset, uint32_t* line,
1987 uint32_t* column) const;
1988
1989 /**
1990 * Fill in |err| completely, except for line-of-context information.
1991 *
1992 * Return true if the caller can compute a line of context from the token
1993 * stream. Otherwise return false.
1994 */
1995 [[nodiscard]] bool fillExceptingContext(ErrorMetadata* err, uint32_t offset) {
1996 if (anyCharsAccess().fillExceptingContext(err, offset)) {
1997 computeLineAndColumn(offset, &err->lineNumber, &err->columnNumber);
1998 return true;
1999 }
2000 return false;
2001 }
2002
2003 void newSimpleToken(TokenKind kind, TokenStart start,
2004 TokenStreamShared::Modifier modifier, TokenKind* out) {
2005 newToken(kind, start, modifier, out);
2006 }
2007
2008 void newNumberToken(double dval, DecimalPoint decimalPoint, TokenStart start,
2009 TokenStreamShared::Modifier modifier, TokenKind* out) {
2010 Token* token = newToken(TokenKind::Number, start, modifier, out);
2011 token->setNumber(dval, decimalPoint);
2012 }
2013
2014 void newBigIntToken(TokenStart start, TokenStreamShared::Modifier modifier,
2015 TokenKind* out) {
2016 newToken(TokenKind::BigInt, start, modifier, out);
2017 }
2018
2019 void newAtomToken(TokenKind kind, TaggedParserAtomIndex atom,
2020 TokenStart start, TokenStreamShared::Modifier modifier,
2021 TokenKind* out) {
2022 MOZ_ASSERT(kind == TokenKind::String || kind == TokenKind::TemplateHead ||
2023 kind == TokenKind::NoSubsTemplate);
2024
2025 Token* token = newToken(kind, start, modifier, out);
2026 token->setAtom(atom);
2027 }
2028
2029 void newNameToken(TaggedParserAtomIndex name, TokenStart start,
2030 TokenStreamShared::Modifier modifier, TokenKind* out) {
2031 Token* token = newToken(TokenKind::Name, start, modifier, out);
2032 token->setName(name);
2033 }
2034
2035 void newPrivateNameToken(TaggedParserAtomIndex name, TokenStart start,
2036 TokenStreamShared::Modifier modifier,
2037 TokenKind* out) {
2038 Token* token = newToken(TokenKind::PrivateName, start, modifier, out);
2039 token->setName(name);
2040 }
2041
2042 void newRegExpToken(JS::RegExpFlags reflags, TokenStart start,
2043 TokenKind* out) {
2044 Token* token = newToken(TokenKind::RegExp, start,
2045 TokenStreamShared::SlashIsRegExp, out);
2046 token->setRegExpFlags(reflags);
2047 }
2048
2049 MOZ_COLD bool badToken();
2050
2051 /**
2052 * Get the next code unit -- the next numeric sub-unit of source text,
2053 * possibly smaller than a full code point -- without updating line/column
2054 * counters or consuming LineTerminatorSequences.
2055 *
2056 * Because of these limitations, only use this if (a) the resulting code
2057 * unit is guaranteed to be ungotten (by ungetCodeUnit()) if it's an EOL,
2058 * and (b) the line-related state (lineno, linebase) is not used before
2059 * it's ungotten.
2060 */
2061 int32_t getCodeUnit() {
2062 if (MOZ_LIKELY(!this->sourceUnits.atEnd())) {
2063 return CodeUnitValue(this->sourceUnits.getCodeUnit());
2064 }
2065
2066 anyCharsAccess().flags.isEOF = true;
2067 return EOF;
2068 }
2069
2070 void ungetCodeUnit(int32_t c) {
2071 MOZ_ASSERT_IF(c == EOF, anyCharsAccess().flags.isEOF);
2072
2073 CharsBase::ungetCodeUnit(c);
2074 }
2075
2076 /**
2077 * Given a just-consumed ASCII code unit/point |lead|, consume a full code
2078 * point or LineTerminatorSequence (normalizing it to '\n') and store it in
2079 * |*codePoint|. Return true on success, otherwise return false and leave
2080 * |*codePoint| undefined on failure.
2081 *
2082 * If a LineTerminatorSequence was consumed, also update line/column info.
2083 *
2084 * This may change the current |sourceUnits| offset.
2085 */
2086 [[nodiscard]] bool getFullAsciiCodePoint(int32_t lead, int32_t* codePoint) {
2087 MOZ_ASSERT(isAsciiCodePoint(lead),
2088 "non-ASCII code units must be handled separately");
2089 MOZ_ASSERT(toUnit(lead) == this->sourceUnits.previousCodeUnit(),
2090 "getFullAsciiCodePoint called incorrectly");
2091
2092 if (MOZ_UNLIKELY(lead == '\r')) {
2093 matchLineTerminator('\n');
2094 } else if (MOZ_LIKELY(lead != '\n')) {
2095 *codePoint = lead;
2096 return true;
2097 }
2098
2099 *codePoint = '\n';
2100 bool ok = updateLineInfoForEOL();
2101 if (!ok) {
2102 #ifdef DEBUG
2103 *codePoint = EOF; // sentinel value to hopefully cause errors
2104 #endif
2105 MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint));
2106 }
2107 return ok;
2108 }
2109
2110 [[nodiscard]] MOZ_ALWAYS_INLINE bool updateLineInfoForEOL() {
2111 return anyCharsAccess().internalUpdateLineInfoForEOL(
2112 this->sourceUnits.offset());
2113 }
2114
2115 uint32_t matchUnicodeEscapeIdStart(uint32_t* codePoint);
2116 bool matchUnicodeEscapeIdent(uint32_t* codePoint);
2117 bool matchIdentifierStart();
2118
2119 /**
2120 * If possible, compute a line of context for an otherwise-filled-in |err|
2121 * at the given offset in this token stream.
2122 *
2123 * This function is very-internal: almost certainly you should use one of
2124 * its callers instead. It basically exists only to make those callers
2125 * more readable.
2126 */
2127 [[nodiscard]] bool internalComputeLineOfContext(ErrorMetadata* err,
2128 uint32_t offset) {
2129 // We only have line-start information for the current line. If the error
2130 // is on a different line, we can't easily provide context. (This means
2131 // any error in a multi-line token, e.g. an unterminated multiline string
2132 // literal, won't have context.)
2133 if (err->lineNumber != anyCharsAccess().lineno) {
2134 return true;
2135 }
2136
2137 return addLineOfContext(err, offset);
2138 }
2139
2140 public:
2141 /**
2142 * Consume any hashbang comment at the start of a Script or Module, if one is
2143 * present. Stops consuming just before any terminating LineTerminator or
2144 * before an encoding error is encountered.
2145 */
2146 void consumeOptionalHashbangComment();
2147
2148 TaggedParserAtomIndex getRawTemplateStringAtom() {
2149 TokenStreamAnyChars& anyChars = anyCharsAccess();
2150
2151 MOZ_ASSERT(anyChars.currentToken().type == TokenKind::TemplateHead ||
2152 anyChars.currentToken().type == TokenKind::NoSubsTemplate);
2153 const Unit* cur =
2154 this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.begin + 1);
2155 const Unit* end;
2156 if (anyChars.currentToken().type == TokenKind::TemplateHead) {
2157 // Of the form |`...${| or |}...${|
2158 end =
2159 this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 2);
2160 } else {
2161 // NO_SUBS_TEMPLATE is of the form |`...`| or |}...`|
2162 end =
2163 this->sourceUnits.codeUnitPtrAt(anyChars.currentToken().pos.end - 1);
2164 }
2165
2166 // |charBuffer| should be empty here, but we may as well code defensively.
2167 MOZ_ASSERT(this->charBuffer.length() == 0);
2168 this->charBuffer.clear();
2169
2170 // Template literals normalize only '\r' and "\r\n" to '\n'; Unicode
2171 // separators don't need special handling.
2172 // https://tc39.github.io/ecma262/#sec-static-semantics-tv-and-trv
2173 if (!FillCharBufferFromSourceNormalizingAsciiLineBreaks(this->charBuffer,
2174 cur, end)) {
2175 return TaggedParserAtomIndex::null();
2176 }
2177
2178 return drainCharBufferIntoAtom();
2179 }
2180 };
2181
2182 template <typename Unit, class AnyCharsAccess>
2183 class TokenStreamChars;
2184
2185 template <class AnyCharsAccess>
2186 class TokenStreamChars<char16_t, AnyCharsAccess>
2187 : public GeneralTokenStreamChars<char16_t, AnyCharsAccess> {
2188 using CharsBase = TokenStreamCharsBase<char16_t>;
2189 using SpecializedCharsBase = SpecializedTokenStreamCharsBase<char16_t>;
2190 using GeneralCharsBase = GeneralTokenStreamChars<char16_t, AnyCharsAccess>;
2191 using Self = TokenStreamChars<char16_t, AnyCharsAccess>;
2192
2193 using GeneralCharsBase::asSpecific;
2194
2195 using typename GeneralCharsBase::TokenStreamSpecific;
2196
2197 protected:
2198 using CharsBase::matchLineTerminator;
2199 using GeneralCharsBase::anyCharsAccess;
2200 using GeneralCharsBase::getCodeUnit;
2201 using SpecializedCharsBase::infallibleGetNonAsciiCodePointDontNormalize;
2202 using TokenStreamCharsShared::isAsciiCodePoint;
2203 // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
2204 using GeneralCharsBase::ungetCodeUnit;
2205 using GeneralCharsBase::updateLineInfoForEOL;
2206
2207 protected:
2208 using GeneralCharsBase::GeneralCharsBase;
2209
2210 /**
2211 * Given the non-ASCII |lead| code unit just consumed, consume and return a
2212 * complete non-ASCII code point. Line/column updates are not performed,
2213 * and line breaks are returned as-is without normalization.
2214 */
2215 [[nodiscard]] bool getNonAsciiCodePointDontNormalize(char16_t lead,
2216 char32_t* codePoint) {
2217 // There are no encoding errors in 16-bit JS, so implement this so that
2218 // the compiler knows it, too.
2219 *codePoint = infallibleGetNonAsciiCodePointDontNormalize(lead);
2220 return true;
2221 }
2222
2223 /**
2224 * Given a just-consumed non-ASCII code unit |lead| (which may also be a
2225 * full code point, for UTF-16), consume a full code point or
2226 * LineTerminatorSequence (normalizing it to '\n') and store it in
2227 * |*codePoint|. Return true on success, otherwise return false and leave
2228 * |*codePoint| undefined on failure.
2229 *
2230 * If a LineTerminatorSequence was consumed, also update line/column info.
2231 *
2232 * This may change the current |sourceUnits| offset.
2233 */
2234 [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
2235 };
2236
2237 template <class AnyCharsAccess>
2238 class TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>
2239 : public GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess> {
2240 using CharsBase = TokenStreamCharsBase<mozilla::Utf8Unit>;
2241 using SpecializedCharsBase =
2242 SpecializedTokenStreamCharsBase<mozilla::Utf8Unit>;
2243 using GeneralCharsBase =
2244 GeneralTokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
2245 using Self = TokenStreamChars<mozilla::Utf8Unit, AnyCharsAccess>;
2246
2247 using typename SpecializedCharsBase::SourceUnitsEnd;
2248 using typename SpecializedCharsBase::SourceUnitsIterator;
2249
2250 protected:
2251 using GeneralCharsBase::anyCharsAccess;
2252 using GeneralCharsBase::computeLineAndColumn;
2253 using GeneralCharsBase::fillExceptingContext;
2254 using GeneralCharsBase::internalComputeLineOfContext;
2255 using TokenStreamCharsShared::isAsciiCodePoint;
2256 // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
2257 using GeneralCharsBase::updateLineInfoForEOL;
2258
2259 private:
2260 static char toHexChar(uint8_t nibble) {
2261 MOZ_ASSERT(nibble < 16);
2262 return "0123456789ABCDEF"[nibble];
2263 }
2264
2265 static void byteToString(uint8_t n, char* str) {
2266 str[0] = '0';
2267 str[1] = 'x';
2268 str[2] = toHexChar(n >> 4);
2269 str[3] = toHexChar(n & 0xF);
2270 }
2271
2272 static void byteToTerminatedString(uint8_t n, char* str) {
2273 byteToString(n, str);
2274 str[4] = '\0';
2275 }
2276
2277 /**
2278 * Report a UTF-8 encoding-related error for a code point starting AT THE
2279 * CURRENT OFFSET.
2280 *
2281 * |relevantUnits| indicates how many code units from the current offset
2282 * are potentially relevant to the reported error, such that they may be
2283 * included in the error message. For example, if at the current offset we
2284 * have
2285 *
2286 * 0b1111'1111 ...
2287 *
2288 * a code unit never allowed in UTF-8, then |relevantUnits| might be 1
2289 * because only that unit is relevant. Or if we have
2290 *
2291 * 0b1111'0111 0b1011'0101 0b0000'0000 ...
2292 *
2293 * where the first two code units are a valid prefix to a four-unit code
2294 * point but the third unit *isn't* a valid trailing code unit, then
2295 * |relevantUnits| might be 3.
2296 */
2297 MOZ_COLD void internalEncodingError(uint8_t relevantUnits,
2298 unsigned errorNumber, ...);
2299
2300 // Don't use |internalEncodingError|! Use one of the elaborated functions
2301 // that calls it, below -- all of which should be used to indicate an error
2302 // in a code point starting AT THE CURRENT OFFSET as with
2303 // |internalEncodingError|.
2304
2305 /** Report an error for an invalid lead code unit |lead|. */
2306 MOZ_COLD void badLeadUnit(mozilla::Utf8Unit lead);
2307
2308 /**
2309 * Report an error when there aren't enough code units remaining to
2310 * constitute a full code point after |lead|: only |remaining| code units
2311 * were available for a code point starting with |lead|, when at least
2312 * |required| code units were required.
2313 */
2314 MOZ_COLD void notEnoughUnits(mozilla::Utf8Unit lead, uint8_t remaining,
2315 uint8_t required);
2316
2317 /**
2318 * Report an error for a bad trailing UTF-8 code unit, where the bad
2319 * trailing unit was the last of |unitsObserved| units examined from the
2320 * current offset.
2321 */
2322 MOZ_COLD void badTrailingUnit(uint8_t unitsObserved);
2323
2324 // Helper used for both |badCodePoint| and |notShortestForm| for code units
2325 // that have all the requisite high bits set/unset in a manner that *could*
2326 // encode a valid code point, but the remaining bits encoding its actual
2327 // value do not define a permitted value.
2328 MOZ_COLD void badStructurallyValidCodePoint(uint32_t codePoint,
2329 uint8_t codePointLength,
2330 const char* reason);
2331
2332 /**
2333 * Report an error for UTF-8 that encodes a UTF-16 surrogate or a number
2334 * outside the Unicode range.
2335 */
2336 MOZ_COLD void badCodePoint(uint32_t codePoint, uint8_t codePointLength) {
2337 MOZ_ASSERT(unicode::IsSurrogate(codePoint) ||
2338 codePoint > unicode::NonBMPMax);
2339
2340 badStructurallyValidCodePoint(codePoint, codePointLength,
2341 unicode::IsSurrogate(codePoint)
2342 ? "it's a UTF-16 surrogate"
2343 : "the maximum code point is U+10FFFF");
2344 }
2345
2346 /**
2347 * Report an error for UTF-8 that encodes a code point not in its shortest
2348 * form.
2349 */
2350 MOZ_COLD void notShortestForm(uint32_t codePoint, uint8_t codePointLength) {
2351 MOZ_ASSERT(!unicode::IsSurrogate(codePoint));
2352 MOZ_ASSERT(codePoint <= unicode::NonBMPMax);
2353
2354 badStructurallyValidCodePoint(
2355 codePoint, codePointLength,
2356 "it wasn't encoded in shortest possible form");
2357 }
2358
2359 protected:
2360 using GeneralCharsBase::GeneralCharsBase;
2361
2362 /**
2363 * Given the non-ASCII |lead| code unit just consumed, consume the rest of
2364 * a non-ASCII code point. The code point is not normalized: on success
2365 * |*codePoint| may be U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
2366 *
2367 * Report an error if an invalid code point is encountered.
2368 */
2369 [[nodiscard]] bool getNonAsciiCodePointDontNormalize(mozilla::Utf8Unit lead,
2370 char32_t* codePoint);
2371
2372 /**
2373 * Given a just-consumed non-ASCII code unit |lead|, consume a full code
2374 * point or LineTerminatorSequence (normalizing it to '\n') and store it in
2375 * |*codePoint|. Return true on success, otherwise return false and leave
2376 * |*codePoint| undefined on failure.
2377 *
2378 * If a LineTerminatorSequence was consumed, also update line/column info.
2379 *
2380 * This function will change the current |sourceUnits| offset.
2381 */
2382 [[nodiscard]] bool getNonAsciiCodePoint(int32_t lead, int32_t* codePoint);
2383 };
2384
2385 // TokenStream is the lexical scanner for JavaScript source text.
2386 //
// It takes a buffer of Unit code units (either char16_t encoding UTF-16 or
// mozilla::Utf8Unit encoding UTF-8) and linearly scans it into |Token|s.
2390 //
2391 // Internally the class uses a four element circular buffer |tokens| of
2392 // |Token|s. As an index for |tokens|, the member |cursor_| points to the
2393 // current token. Calls to getToken() increase |cursor_| by one and return the
2394 // new current token. If a TokenStream was just created, the current token is
2395 // uninitialized. It's therefore important that one of the first four member
2396 // functions listed below is called first. The circular buffer lets us go back
2397 // up to two tokens from the last scanned token. Internally, the relative
2398 // number of backward steps that were taken (via ungetToken()) after the last
2399 // token was scanned is stored in |lookahead|.
2400 //
2401 // The following table lists in which situations it is safe to call each listed
2402 // function. No checks are made by the functions in non-debug builds.
2403 //
2404 // Function Name | Precondition; changes to |lookahead|
2405 // ------------------+---------------------------------------------------------
2406 // getToken | none; if |lookahead > 0| then |lookahead--|
2407 // peekToken | none; if |lookahead == 0| then |lookahead == 1|
2408 // peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
2409 // matchToken | none; if |lookahead > 0| and the match succeeds then
2410 // | |lookahead--|
2411 // consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
2412 // ungetToken | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
2413 //
2414 // The behavior of the token scanning process (see getTokenInternal()) can be
2415 // modified by calling one of the first four above listed member functions with
2416 // an optional argument of type Modifier. However, the modifier will be
2417 // ignored unless |lookahead == 0| holds. Due to constraints of the grammar,
// this turns out not to be a problem in practice.  See the
// mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
// for more details:
// https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
2422 //
2423 // The method seek() allows rescanning from a previously visited location of
2424 // the buffer, initially computed by constructing a Position local variable.
2425 //
2426 template <typename Unit, class AnyCharsAccess>
2427 class MOZ_STACK_CLASS TokenStreamSpecific
2428 : public TokenStreamChars<Unit, AnyCharsAccess>,
2429 public TokenStreamShared,
2430 public ErrorReporter {
2431 public:
2432 using CharsBase = TokenStreamCharsBase<Unit>;
2433 using SpecializedCharsBase = SpecializedTokenStreamCharsBase<Unit>;
2434 using GeneralCharsBase = GeneralTokenStreamChars<Unit, AnyCharsAccess>;
2435 using SpecializedChars = TokenStreamChars<Unit, AnyCharsAccess>;
2436
2437 using Position = TokenStreamPosition<Unit>;
2438
2439 // Anything inherited through a base class whose type depends upon this
2440 // class's template parameters can only be accessed through a dependent
2441 // name: prefixed with |this|, by explicit qualification, and so on. (This
2442 // is so that references to inherited fields are statically distinguishable
2443 // from references to names outside of the class.) This is tedious and
2444 // onerous.
2445 //
2446 // As an alternative, we directly add every one of these functions to this
2447 // class, using explicit qualification to address the dependent-name
2448 // problem. |this| or other qualification is no longer necessary -- at
2449 // cost of this ever-changing laundry list of |using|s. So it goes.
2450 public:
2451 using GeneralCharsBase::anyCharsAccess;
2452 using GeneralCharsBase::computeLineAndColumn;
2453 using TokenStreamCharsShared::adoptState;
2454
2455 private:
2456 using typename CharsBase::SourceUnits;
2457
2458 private:
2459 using CharsBase::atomizeSourceChars;
2460 using GeneralCharsBase::badToken;
2461 // Deliberately don't |using| |charBuffer| because of bug 1472569. :-(
2462 using CharsBase::consumeKnownCodeUnit;
2463 using CharsBase::matchCodeUnit;
2464 using CharsBase::matchLineTerminator;
2465 using CharsBase::peekCodeUnit;
2466 using GeneralCharsBase::computeColumn;
2467 using GeneralCharsBase::fillExceptingContext;
2468 using GeneralCharsBase::getCodeUnit;
2469 using GeneralCharsBase::getFullAsciiCodePoint;
2470 using GeneralCharsBase::internalComputeLineOfContext;
2471 using GeneralCharsBase::matchUnicodeEscapeIdent;
2472 using GeneralCharsBase::matchUnicodeEscapeIdStart;
2473 using GeneralCharsBase::newAtomToken;
2474 using GeneralCharsBase::newBigIntToken;
2475 using GeneralCharsBase::newNameToken;
2476 using GeneralCharsBase::newNumberToken;
2477 using GeneralCharsBase::newPrivateNameToken;
2478 using GeneralCharsBase::newRegExpToken;
2479 using GeneralCharsBase::newSimpleToken;
2480 using SpecializedChars::getNonAsciiCodePoint;
2481 using SpecializedChars::getNonAsciiCodePointDontNormalize;
2482 using TokenStreamCharsShared::copyCharBufferTo;
2483 using TokenStreamCharsShared::drainCharBufferIntoAtom;
2484 using TokenStreamCharsShared::isAsciiCodePoint;
2485 // Deliberately don't |using| |sourceUnits| because of bug 1472569. :-(
2486 using CharsBase::toUnit;
2487 using GeneralCharsBase::ungetCodeUnit;
2488 using GeneralCharsBase::updateLineInfoForEOL;
2489
2490 template <typename CharU>
2491 friend class TokenStreamPosition;
2492
2493 public:
// Constructor; declaration only, the definition is not in this class body.
// TokenStream's constructor forwards its own |cx|, |parserAtoms|, |options|,
// |units| and |length| arguments here unchanged.
// NOTE(review): |units|/|length| presumably describe the source buffer to
// tokenize -- confirm against the definition.
TokenStreamSpecific(JSContext* cx, ParserAtomsTable* parserAtoms,
                    const JS::ReadOnlyCompileOptions& options,
                    const Unit* units, size_t length);
2497
/**
 * Get the next code point, converting LineTerminatorSequences to '\n' and
 * updating internal line-counter state if needed.  Return true on success
 * and store the code point in |*cp|.  Return false and leave |*cp|
 * undefined on failure.
 *
 * Declaration only; the definition is not in this class body.
 */
[[nodiscard]] bool getCodePoint(int32_t* cp);
2505
2506 // If there is an invalid escape in a template, report it and return false,
2507 // otherwise return true.
2508 bool checkForInvalidTemplateEscapeError() {
2509 if (anyCharsAccess().invalidTemplateEscapeType == InvalidEscapeType::None) {
2510 return true;
2511 }
2512
2513 reportInvalidEscapeError(anyCharsAccess().invalidTemplateEscapeOffset,
2514 anyCharsAccess().invalidTemplateEscapeType);
2515 return false;
2516 }
2517
2518 public:
2519 // Implement ErrorReporter.
2520
// ErrorReporter hook: forwards |offset| and the |line|/|column| out-pointers
// unchanged to computeLineAndColumn().
void lineAndColumnAt(size_t offset, uint32_t* line,
                     uint32_t* column) const final {
  computeLineAndColumn(offset, line, column);
}
2525
2526 void currentLineAndColumn(uint32_t* line, uint32_t* column) const final {
2527 computeLineAndColumn(anyCharsAccess().currentToken().pos.begin, line,
2528 column);
2529 }
2530
2531 bool isOnThisLine(size_t offset, uint32_t lineNum,
2532 bool* onThisLine) const final {
2533 return anyCharsAccess().srcCoords.isOnThisLine(offset, lineNum, onThisLine);
2534 }
2535
2536 uint32_t lineAt(size_t offset) const final {
2537 const auto& anyChars = anyCharsAccess();
2538 auto lineToken = anyChars.lineToken(offset);
2539 return anyChars.lineNumber(lineToken);
2540 }
2541
2542 uint32_t columnAt(size_t offset) const final {
2543 return computeColumn(anyCharsAccess().lineToken(offset), offset);
2544 }
2545
// Another |final| override in the ErrorReporter group.  Declaration only; the
// definition is not in this class body.
bool hasTokenizationStarted() const final;
2547
2548 const char* getFilename() const final {
2549 return anyCharsAccess().getFilename();
2550 }
2551
2552 private:
2553 // Implement ErrorReportMixin.
2554
2555 JSContext* getContext() const override { return anyCharsAccess().cx; }
2556
2557 [[nodiscard]] bool strictMode() const override {
2558 return anyCharsAccess().strictMode();
2559 }
2560
2561 public:
2562 // Implement ErrorReportMixin.
2563
2564 const JS::ReadOnlyCompileOptions& options() const final {
2565 return anyCharsAccess().options();
2566 }
2567
// ErrorReportMixin hook.  Declaration only; the definition is not in this
// class body.
// NOTE(review): presumably fills |*err| for an error at |errorOffset| and
// returns false on failure -- confirm against the definition.
[[nodiscard]] bool computeErrorMetadata(
    ErrorMetadata* err, const ErrorOffset& errorOffset) override;
2570
2571 private:
2572 void reportInvalidEscapeError(uint32_t offset, InvalidEscapeType type) {
2573 switch (type) {
2574 case InvalidEscapeType::None:
2575 MOZ_ASSERT_UNREACHABLE("unexpected InvalidEscapeType");
2576 return;
2577 case InvalidEscapeType::Hexadecimal:
2578 errorAt(offset, JSMSG_MALFORMED_ESCAPE, "hexadecimal");
2579 return;
2580 case InvalidEscapeType::Unicode:
2581 errorAt(offset, JSMSG_MALFORMED_ESCAPE, "Unicode");
2582 return;
2583 case InvalidEscapeType::UnicodeOverflow:
2584 errorAt(offset, JSMSG_UNICODE_OVERFLOW, "escape sequence");
2585 return;
2586 case InvalidEscapeType::Octal:
2587 errorAt(offset, JSMSG_DEPRECATED_OCTAL_ESCAPE);
2588 return;
2589 case InvalidEscapeType::EightOrNine:
2590 errorAt(offset, JSMSG_DEPRECATED_EIGHT_OR_NINE_ESCAPE);
2591 return;
2592 }
2593 }
2594
// Declaration only; the definition is not in this class body.
// NOTE(review): presumably reports an error about the code point |cp| --
// confirm against the definition.
void reportIllegalCharacter(int32_t cp);

// Declaration only; the definition is not in this class body.
// NOTE(review): presumably copies the identifier beginning at |identStart|
// into the char buffer, returning false on failure -- confirm.
[[nodiscard]] bool putIdentInCharBuffer(const Unit* identStart);
2598
// Predicate type taken by the integer matchers below: a plain function
// pointer accepting an int32_t and returning bool.
using IsIntegerUnit = bool (*)(int32_t);

// Declarations only; the definitions are not in this class body.
// NOTE(review): presumably these consume a run of units accepted by
// |isIntegerUnit| and hand back the following unit through |*nextUnit| --
// confirm against the definitions.
[[nodiscard]] MOZ_ALWAYS_INLINE bool matchInteger(IsIntegerUnit isIntegerUnit,
                                                  int32_t* nextUnit);
[[nodiscard]] MOZ_ALWAYS_INLINE bool matchIntegerAfterFirstDigit(
    IsIntegerUnit isIntegerUnit, int32_t* nextUnit);
2604
/**
 * Tokenize a decimal number that begins at |numStart| into the provided
 * token.
 *
 * |unit| must be one of these values:
 *
 *   1. The first decimal digit in the integral part of a decimal number
 *      not starting with '0' or '.', e.g. '1' for "17", '3' for "3.14", or
 *      '8' for "8.675309e6".
 *
 *      In this case, the next |getCodeUnit()| must return the code unit after
 *      |unit| in the overall number.
 *
 *   2. The '.' in a "."/"0."-prefixed decimal number or the 'e'/'E' in a
 *      "0e"/"0E"-prefixed decimal number, e.g. ".17", "0.42", or "0.1e3".
 *
 *      In this case, the next |getCodeUnit()| must return the code unit
 *      *after* the first decimal digit *after* the '.'.  So the next code
 *      unit would be '7' in ".17", '2' in "0.42", 'e' in "0.4e+8", or '/' in
 *      "0.5/2" (three separate tokens).
 *
 *   3. The code unit after the '0' where "0" is the entire number token.
 *
 *      In this case, the next |getCodeUnit()| would return the code unit
 *      after |unit|, but this function will never perform such a call.
 *
 *   4. (Non-strict mode code only)  The first '8' or '9' in a "noctal"
 *      number that begins with a '0' but contains a non-octal digit in its
 *      integer part so is interpreted as decimal, e.g. '9' in "09.28" or
 *      '8' in "0386" or '9' in "09+7" (three separate tokens).
 *
 *      In this case, the next |getCodeUnit()| returns the code unit after
 *      |unit|: '.', '6', or '+' in the examples above.
 *
 * This interface is super-hairy and horribly stateful.  Unfortunately, its
 * hair merely reflects the intricacy of ECMAScript numeric literal syntax.
 * And incredibly, it *improves* on the goto-based horror that predated it.
 */
[[nodiscard]] bool decimalNumber(int32_t unit, TokenStart start,
                                 const Unit* numStart, Modifier modifier,
                                 TokenKind* out);
2646
/**
 * Tokenize a regular expression literal beginning at |start|.
 *
 * Declaration only; the definition is not in this class body.
 */
[[nodiscard]] bool regexpLiteral(TokenStart start, TokenKind* out);

/**
 * Slurp characters between |start| and sourceUnits.current() into
 * charBuffer, to later parse into a bigint.
 *
 * Declaration only; the definition is not in this class body.
 */
[[nodiscard]] bool bigIntLiteral(TokenStart start, Modifier modifier,
                                 TokenKind* out);
2656
2657 public:
2658 // Advance to the next token. If the token stream encountered an error,
2659 // return false. Otherwise return true and store the token kind in |*ttp|.
2660 [[nodiscard]] bool getToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2661 // Check for a pushed-back token resulting from mismatching lookahead.
2662 TokenStreamAnyChars& anyChars = anyCharsAccess();
2663 if (anyChars.lookahead != 0) {
2664 MOZ_ASSERT(!anyChars.flags.hadError);
2665 anyChars.lookahead--;
2666 anyChars.advanceCursor();
2667 TokenKind tt = anyChars.currentToken().type;
2668 MOZ_ASSERT(tt != TokenKind::Eol);
2669 verifyConsistentModifier(modifier, anyChars.currentToken());
2670 *ttp = tt;
2671 return true;
2672 }
2673
2674 return getTokenInternal(ttp, modifier);
2675 }
2676
2677 [[nodiscard]] bool peekToken(TokenKind* ttp, Modifier modifier = SlashIsDiv) {
2678 TokenStreamAnyChars& anyChars = anyCharsAccess();
2679 if (anyChars.lookahead > 0) {
2680 MOZ_ASSERT(!anyChars.flags.hadError);
2681 verifyConsistentModifier(modifier, anyChars.nextToken());
2682 *ttp = anyChars.nextToken().type;
2683 return true;
2684 }
2685 if (!getTokenInternal(ttp, modifier)) {
2686 return false;
2687 }
2688 anyChars.ungetToken();
2689 return true;
2690 }
2691
2692 [[nodiscard]] bool peekTokenPos(TokenPos* posp,
2693 Modifier modifier = SlashIsDiv) {
2694 TokenStreamAnyChars& anyChars = anyCharsAccess();
2695 if (anyChars.lookahead == 0) {
2696 TokenKind tt;
2697 if (!getTokenInternal(&tt, modifier)) {
2698 return false;
2699 }
2700 anyChars.ungetToken();
2701 MOZ_ASSERT(anyChars.hasLookahead());
2702 } else {
2703 MOZ_ASSERT(!anyChars.flags.hadError);
2704 verifyConsistentModifier(modifier, anyChars.nextToken());
2705 }
2706 *posp = anyChars.nextToken().pos;
2707 return true;
2708 }
2709
2710 [[nodiscard]] bool peekOffset(uint32_t* offset,
2711 Modifier modifier = SlashIsDiv) {
2712 TokenPos pos;
2713 if (!peekTokenPos(&pos, modifier)) {
2714 return false;
2715 }
2716 *offset = pos.begin;
2717 return true;
2718 }
2719
// This is like peekToken(), with one exception:  if there is an EOL
// between the end of the current token and the start of the next token, it
// returns true and stores Eol in |*ttp|.  In that case, no token with
// Eol is actually created, just an Eol TokenKind is returned, and
// currentToken() shouldn't be consulted.  (This is the only place Eol
// is produced.)
//
// Returns false if the line lookup fails (reported as out of memory) or if
// getting the next token fails.
[[nodiscard]] MOZ_ALWAYS_INLINE bool peekTokenSameLine(
    TokenKind* ttp, Modifier modifier = SlashIsDiv) {
  TokenStreamAnyChars& anyChars = anyCharsAccess();
  // Bound before any get/unget below; still consulted afterwards.
  const Token& curr = anyChars.currentToken();

  // If lookahead != 0, we have scanned ahead at least one token, and
  // |lineno| is the line that the furthest-scanned token ends on.  If
  // it's the same as the line that the current token ends on, that's a
  // stronger condition than what we are looking for, and we don't need
  // to return Eol.
  if (anyChars.lookahead != 0) {
    bool onThisLine;
    if (!anyChars.srcCoords.isOnThisLine(curr.pos.end, anyChars.lineno,
                                         &onThisLine)) {
      error(JSMSG_OUT_OF_MEMORY);
      return false;
    }

    if (onThisLine) {
      MOZ_ASSERT(!anyChars.flags.hadError);
      verifyConsistentModifier(modifier, anyChars.nextToken());
      *ttp = anyChars.nextToken().type;
      return true;
    }
  }

  // The above check misses two cases where we don't have to return
  // Eol.
  // - The next token starts on the same line, but is a multi-line token.
  // - The next token starts on the same line, but lookahead==2 and there
  //   is a newline between the next token and the one after that.
  // The following test is somewhat expensive but gets these cases (and
  // all others) right.
  TokenKind tmp;
  if (!getToken(&tmp, modifier)) {
    return false;
  }

  // Grab a reference to the token just fetched, then push it back so this
  // call consumes nothing overall.
  const Token& next = anyChars.currentToken();
  anyChars.ungetToken();

  // Careful, |next| points to an initialized-but-not-allocated Token!
  // This is safe because we don't modify token data below.

  auto currentEndToken = anyChars.lineToken(curr.pos.end);
  auto nextBeginToken = anyChars.lineToken(next.pos.begin);

  // Same line: report the real kind.  Different lines: report Eol.
  *ttp =
      currentEndToken.isSameLine(nextBeginToken) ? next.type : TokenKind::Eol;
  return true;
}
2777
2778 // Get the next token from the stream if its kind is |tt|.
2779 [[nodiscard]] bool matchToken(bool* matchedp, TokenKind tt,
2780 Modifier modifier = SlashIsDiv) {
2781 TokenKind token;
2782 if (!getToken(&token, modifier)) {
2783 return false;
2784 }
2785 if (token == tt) {
2786 *matchedp = true;
2787 } else {
2788 anyCharsAccess().ungetToken();
2789 *matchedp = false;
2790 }
2791 return true;
2792 }
2793
2794 void consumeKnownToken(TokenKind tt, Modifier modifier = SlashIsDiv) {
2795 bool matched;
2796 MOZ_ASSERT(anyCharsAccess().hasLookahead());
2797 MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier));
2798 MOZ_ALWAYS_TRUE(matched);
2799 }
2800
2801 [[nodiscard]] bool nextTokenEndsExpr(bool* endsExpr) {
2802 TokenKind tt;
2803 if (!peekToken(&tt)) {
2804 return false;
2805 }
2806
2807 *endsExpr = anyCharsAccess().isExprEnding[size_t(tt)];
2808 if (*endsExpr) {
2809 // If the next token ends an overall Expression, we'll parse this
2810 // Expression without ever invoking Parser::orExpr(). But we need that
2811 // function's DEBUG-only side effect of marking this token as safe to get
2812 // with SlashIsRegExp, so we have to do it manually here.
2813 anyCharsAccess().allowGettingNextTokenWithSlashIsRegExp();
2814 }
2815 return true;
2816 }
2817
// Declaration only; the definition is not in this class body.
// NOTE(review): presumably moves the stream forward to source offset
// |position| -- confirm against the definition.
[[nodiscard]] bool advance(size_t position);

// Declarations only.  rewind() and fastForward() below assert the direction
// of travel and then call the matching seekTo() overload.
// NOTE(review): presumably these restore the stream state saved in |pos|; the
// role of |other| in the second overload is not visible here -- confirm.
void seekTo(const Position& pos);
[[nodiscard]] bool seekTo(const Position& pos,
                          const TokenStreamAnyChars& other);
2823
2824 void rewind(const Position& pos) {
2825 MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
2826 "should be rewinding here");
2827 seekTo(pos);
2828 }
2829
2830 [[nodiscard]] bool rewind(const Position& pos,
2831 const TokenStreamAnyChars& other) {
2832 MOZ_ASSERT(pos.buf <= this->sourceUnits.addressOfNextCodeUnit(),
2833 "should be rewinding here");
2834 return seekTo(pos, other);
2835 }
2836
2837 void fastForward(const Position& pos) {
2838 MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
2839 "should be moving forward here");
2840 seekTo(pos);
2841 }
2842
2843 [[nodiscard]] bool fastForward(const Position& pos,
2844 const TokenStreamAnyChars& other) {
2845 MOZ_ASSERT(this->sourceUnits.addressOfNextCodeUnit() <= pos.buf,
2846 "should be moving forward here");
2847 return seekTo(pos, other);
2848 }
2849
2850 const Unit* codeUnitPtrAt(size_t offset) const {
2851 return this->sourceUnits.codeUnitPtrAt(offset);
2852 }
2853
2854 const Unit* rawLimit() const { return this->sourceUnits.limit(); }
2855
// Declaration only; the definition is not in this class body.
// NOTE(review): presumably tokenizes the identifier beginning at
// |identStart| and stores the resulting kind in |*out| -- confirm.
[[nodiscard]] bool identifierName(TokenStart start, const Unit* identStart,
                                  IdentifierEscapes escaping,
                                  Modifier modifier,
                                  NameVisibility visibility, TokenKind* out);

// Declaration only; the definition is not in this class body.
// NOTE(review): presumably matches an identifier start and records in
// |*sawEscape| whether an escape was involved -- confirm.
[[nodiscard]] bool matchIdentifierStart(IdentifierEscapes* sawEscape);

// Declaration only.  getToken(), peekToken() and peekTokenPos() call this
// when no lookahead token is buffered.
[[nodiscard]] bool getTokenInternal(TokenKind* const ttp,
                                    const Modifier modifier);

// Declaration only.  getTemplateToken() calls this with |untilChar| '`' and
// SlashIsInvalid.
[[nodiscard]] bool getStringOrTemplateToken(char untilChar, Modifier modifier,
                                            TokenKind* out);
2868
// Parse a TemplateMiddle or TemplateTail token (one of the string-like parts
// of a template string) after already consuming the leading `RightCurly`.
// (The spec says the `}` is the first character of the TemplateMiddle/
// TemplateTail, but we treat it as a separate token because that's much
// easier to implement in both TokenStream and the parser.)
//
// This consumes a token and sets the current token, like `getToken()`. It
// doesn't take a Modifier because there's no risk of encountering a division
// operator or RegExp literal.
//
// On success, `*ttp` is either `TokenKind::TemplateHead` (if we got a
// TemplateMiddle token) or `TokenKind::NoSubsTemplate` (if we got a
// TemplateTail). That may seem strange; there are four different template
// token types in the spec, but we only use two. We use `TemplateHead` for
// TemplateMiddle because both end with `...${`, and `NoSubsTemplate` for
// TemplateTail because both contain the end of the template, including the
// closing quote mark. They're not treated differently, either in the parser
// or in the tokenizer.
[[nodiscard]] bool getTemplateToken(TokenKind* ttp) {
  // The caller must have just consumed the `}` closing the substitution.
  MOZ_ASSERT(anyCharsAccess().currentToken().type == TokenKind::RightCurly);
  // Scan up to the closing backtick; SlashIsInvalid because no modifier
  // applies here (see above).
  return getStringOrTemplateToken('`', SlashIsInvalid, ttp);
}
2891
// Declarations only; the definitions are not in this class body.
// NOTE(review): judging by the names, these handle comment directives such as
// display URL and source mapping URL, with getDirective() as the shared
// worker storing its result in |*destination| -- confirm against the
// definitions.
[[nodiscard]] bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
[[nodiscard]] bool getDirective(
    bool isMultiline, bool shouldWarnDeprecated, const char* directive,
    uint8_t directiveLength, const char* errorMsgPragma,
    UniquePtr<char16_t[], JS::FreePolicy>* destination);
[[nodiscard]] bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
[[nodiscard]] bool getSourceMappingURL(bool isMultiline,
                                       bool shouldWarnDeprecated);
2900 };
2901
2902 // It's preferable to define this in TokenStream.cpp, but its template-ness
2903 // means we'd then have to *instantiate* this constructor for all possible
2904 // (Unit, AnyCharsAccess) pairs -- and that gets super-messy as AnyCharsAccess
2905 // *itself* is templated. This symbol really isn't that huge compared to some
2906 // defined inline in TokenStreamSpecific, so just rely on the linker commoning
2907 // stuff up.
2908 template <typename Unit>
2909 template <class AnyCharsAccess>
2910 inline TokenStreamPosition<Unit>::TokenStreamPosition(
2911 TokenStreamSpecific<Unit, AnyCharsAccess>& tokenStream)
2912 : currentToken(tokenStream.anyCharsAccess().currentToken()) {
2913 TokenStreamAnyChars& anyChars = tokenStream.anyCharsAccess();
2914
2915 buf =
2916 tokenStream.sourceUnits.addressOfNextCodeUnit(/* allowPoisoned = */ true);
2917 flags = anyChars.flags;
2918 lineno = anyChars.lineno;
2919 linebase = anyChars.linebase;
2920 prevLinebase = anyChars.prevLinebase;
2921 lookahead = anyChars.lookahead;
2922 currentToken = anyChars.currentToken();
2923 for (unsigned i = 0; i < anyChars.lookahead; i++) {
2924 lookaheadTokens[i] = anyChars.tokens[anyChars.aheadCursor(1 + i)];
2925 }
2926 }
2927
// The AnyCharsAccess policy that TokenStream passes to TokenStreamSpecific.
// Given a pointer to a TokenStreamSpecific, each overload returns a
// TokenStreamAnyChars reference with matching constness.  Both are defined
// after TokenStream, whose complete type they cast through.
class TokenStreamAnyCharsAccess {
 public:
  // Mutable access.
  template <class TokenStreamSpecific>
  static inline TokenStreamAnyChars& anyChars(TokenStreamSpecific* tss);

  // Read-only access.
  template <class TokenStreamSpecific>
  static inline const TokenStreamAnyChars& anyChars(
      const TokenStreamSpecific* tss);
};
2937
// The token stream for char16_t source text.  It inherits from both
// TokenStreamAnyChars and TokenStreamSpecific<char16_t,
// TokenStreamAnyCharsAccess>; TokenStreamAnyCharsAccess::anyChars() relies on
// this to cast from the TokenStreamSpecific base, through TokenStream, to the
// TokenStreamAnyChars base of the same object.
class MOZ_STACK_CLASS TokenStream
    : public TokenStreamAnyChars,
      public TokenStreamSpecific<char16_t, TokenStreamAnyCharsAccess> {
  using Unit = char16_t;

 public:
  // Hands |cx|, |options| and |smg| to TokenStreamAnyChars, and |cx|,
  // |parserAtoms|, |options|, |units| and |length| to TokenStreamSpecific.
  TokenStream(JSContext* cx, ParserAtomsTable* parserAtoms,
              const JS::ReadOnlyCompileOptions& options, const Unit* units,
              size_t length, StrictModeGetter* smg)
      : TokenStreamAnyChars(cx, options, smg),
        TokenStreamSpecific<Unit, TokenStreamAnyCharsAccess>(
            cx, parserAtoms, options, units, length) {}
};
2951
// A TokenStream constructed with no source text: it passes a null
// ParserAtomsTable, null |units|, a |length| of 0 and a null
// StrictModeGetter to the TokenStream constructor.
class MOZ_STACK_CLASS DummyTokenStream final : public TokenStream {
 public:
  DummyTokenStream(JSContext* cx, const JS::ReadOnlyCompileOptions& options)
      : TokenStream(cx, nullptr, options, nullptr, 0, nullptr) {}
};
2957
2958 template <class TokenStreamSpecific>
2959 /* static */ inline TokenStreamAnyChars& TokenStreamAnyCharsAccess::anyChars(
2960 TokenStreamSpecific* tss) {
2961 auto* ts = static_cast<TokenStream*>(tss);
2962 return *static_cast<TokenStreamAnyChars*>(ts);
2963 }
2964
2965 template <class TokenStreamSpecific>
2966 /* static */ inline const TokenStreamAnyChars&
2967 TokenStreamAnyCharsAccess::anyChars(const TokenStreamSpecific* tss) {
2968 const auto* ts = static_cast<const TokenStream*>(tss);
2969 return *static_cast<const TokenStreamAnyChars*>(ts);
2970 }
2971
// Declaration only; defined elsewhere.
// NOTE(review): presumably returns a human-readable description of |tt| --
// confirm against the definition.
extern const char* TokenKindToDesc(TokenKind tt);
2973
2974 } // namespace frontend
2975 } // namespace js
2976
// Only declared when DEBUG is defined, and at global scope (outside
// js::frontend).  Declaration only; defined elsewhere.
// NOTE(review): presumably returns the name of |tt| as a string -- confirm
// against the definition.
#ifdef DEBUG
extern const char* TokenKindToString(js::frontend::TokenKind tt);
#endif
2980
2981 #endif /* frontend_TokenStream_h */
2982