1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_PARSING_SCANNER_INL_H_
6 #define V8_PARSING_SCANNER_INL_H_
7 
8 #include "src/parsing/keywords-gen.h"
9 #include "src/parsing/scanner.h"
10 #include "src/strings/char-predicates-inl.h"
11 #include "src/utils/utils.h"
12 
13 namespace v8 {
14 namespace internal {
15 
// ----------------------------------------------------------------------------
// Keyword Matcher

// X-macro table of every keyword the scanner recognizes, listed
// alphabetically and grouped by first character.
//
//   GROUP_FN(ch)               is expanded once per distinct first character,
//                              immediately before the keywords starting with it.
//   KEYWORD_FN(spelling, tok)  is expanded once per keyword with its spelling
//                              (a string literal) and the token it maps to.
//
// Several spellings share the token Token::FUTURE_STRICT_RESERVED_WORD.
#define KEYWORDS(GROUP_FN, KEYWORD_FN) \
  GROUP_FN('a') \
  KEYWORD_FN("async", Token::ASYNC) \
  KEYWORD_FN("await", Token::AWAIT) \
  GROUP_FN('b') \
  KEYWORD_FN("break", Token::BREAK) \
  GROUP_FN('c') \
  KEYWORD_FN("case", Token::CASE) \
  KEYWORD_FN("catch", Token::CATCH) \
  KEYWORD_FN("class", Token::CLASS) \
  KEYWORD_FN("const", Token::CONST) \
  KEYWORD_FN("continue", Token::CONTINUE) \
  GROUP_FN('d') \
  KEYWORD_FN("debugger", Token::DEBUGGER) \
  KEYWORD_FN("default", Token::DEFAULT) \
  KEYWORD_FN("delete", Token::DELETE) \
  KEYWORD_FN("do", Token::DO) \
  GROUP_FN('e') \
  KEYWORD_FN("else", Token::ELSE) \
  KEYWORD_FN("enum", Token::ENUM) \
  KEYWORD_FN("export", Token::EXPORT) \
  KEYWORD_FN("extends", Token::EXTENDS) \
  GROUP_FN('f') \
  KEYWORD_FN("false", Token::FALSE_LITERAL) \
  KEYWORD_FN("finally", Token::FINALLY) \
  KEYWORD_FN("for", Token::FOR) \
  KEYWORD_FN("function", Token::FUNCTION) \
  GROUP_FN('g') \
  KEYWORD_FN("get", Token::GET) \
  GROUP_FN('i') \
  KEYWORD_FN("if", Token::IF) \
  KEYWORD_FN("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_FN("import", Token::IMPORT) \
  KEYWORD_FN("in", Token::IN) \
  KEYWORD_FN("instanceof", Token::INSTANCEOF) \
  KEYWORD_FN("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
  GROUP_FN('l') \
  KEYWORD_FN("let", Token::LET) \
  GROUP_FN('n') \
  KEYWORD_FN("new", Token::NEW) \
  KEYWORD_FN("null", Token::NULL_LITERAL) \
  GROUP_FN('p') \
  KEYWORD_FN("package", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_FN("private", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_FN("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_FN("public", Token::FUTURE_STRICT_RESERVED_WORD) \
  GROUP_FN('r') \
  KEYWORD_FN("return", Token::RETURN) \
  GROUP_FN('s') \
  KEYWORD_FN("set", Token::SET) \
  KEYWORD_FN("static", Token::STATIC) \
  KEYWORD_FN("super", Token::SUPER) \
  KEYWORD_FN("switch", Token::SWITCH) \
  GROUP_FN('t') \
  KEYWORD_FN("this", Token::THIS) \
  KEYWORD_FN("throw", Token::THROW) \
  KEYWORD_FN("true", Token::TRUE_LITERAL) \
  KEYWORD_FN("try", Token::TRY) \
  KEYWORD_FN("typeof", Token::TYPEOF) \
  GROUP_FN('v') \
  KEYWORD_FN("var", Token::VAR) \
  KEYWORD_FN("void", Token::VOID) \
  GROUP_FN('w') \
  KEYWORD_FN("while", Token::WHILE) \
  KEYWORD_FN("with", Token::WITH) \
  GROUP_FN('y') \
  KEYWORD_FN("yield", Token::YIELD)
86 
IsKeywordStart(char c)87 constexpr bool IsKeywordStart(char c) {
88 #define KEYWORD_GROUP_CHECK(ch) c == ch ||
89 #define KEYWORD_CHECK(keyword, token)
90   return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false;
91 #undef KEYWORD_GROUP_CHECK
92 #undef KEYWORD_CHECK
93 }
94 
KeywordOrIdentifierToken(const uint8_t * input,int input_length)95 V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
96                                                 int input_length) {
97   DCHECK_GE(input_length, 1);
98   return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
99                                       input_length);
100 }
101 
// Returns true iff the character |c| occurs in the array |s| at index |i| or
// later. All N elements are examined, so when |s| is a string literal its
// terminating NUL counts as a member too.
//
// This is a loop rather than one recursive call per character: compilers cap
// the recursion depth of constant evaluation (e.g. -fconstexpr-depth), and a
// recursive search needs a depth proportional to the length of |s|, which
// stops compiling once the searched string grows past that cap.
template <int N>
constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0) {
  // N is an array bound and therefore positive; convert it once so that the
  // loop condition is not a mixed signed/unsigned comparison.
  const size_t length = static_cast<size_t>(N);
  for (size_t index = i; index < length; ++index) {
    if (s[index] == c) return true;
  }
  return false;
}
108 
CanBeKeywordCharacter(char c)109 inline constexpr bool CanBeKeywordCharacter(char c) {
110   return IsInString(
111 #define KEYWORD_GROUP_CASE(ch)  // Nothing
112 #define KEYWORD(keyword, token) keyword
113       // Use C string literal concatenation ("a" "b" becomes "ab") to build one
114       // giant string containing all the keywords.
115       KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
116 #undef KEYWORD
117 #undef KEYWORD_GROUP_CASE
118           ,
119       c);
120 }
121 
122 // Make sure tokens are stored as a single byte.
123 STATIC_ASSERT(sizeof(Token::Value) == 1);
124 
125 // Get the shortest token that this character starts, the token may change
126 // depending on subsequent characters.
GetOneCharToken(char c)127 constexpr Token::Value GetOneCharToken(char c) {
128   // clang-format off
129   return
130       c == '(' ? Token::LPAREN :
131       c == ')' ? Token::RPAREN :
132       c == '{' ? Token::LBRACE :
133       c == '}' ? Token::RBRACE :
134       c == '[' ? Token::LBRACK :
135       c == ']' ? Token::RBRACK :
136       c == '?' ? Token::CONDITIONAL :
137       c == ':' ? Token::COLON :
138       c == ';' ? Token::SEMICOLON :
139       c == ',' ? Token::COMMA :
140       c == '.' ? Token::PERIOD :
141       c == '|' ? Token::BIT_OR :
142       c == '&' ? Token::BIT_AND :
143       c == '^' ? Token::BIT_XOR :
144       c == '~' ? Token::BIT_NOT :
145       c == '!' ? Token::NOT :
146       c == '<' ? Token::LT :
147       c == '>' ? Token::GT :
148       c == '%' ? Token::MOD :
149       c == '=' ? Token::ASSIGN :
150       c == '+' ? Token::ADD :
151       c == '-' ? Token::SUB :
152       c == '*' ? Token::MUL :
153       c == '/' ? Token::DIV :
154       c == '#' ? Token::PRIVATE_NAME :
155       c == '"' ? Token::STRING :
156       c == '\'' ? Token::STRING :
157       c == '`' ? Token::TEMPLATE_SPAN :
158       c == '\\' ? Token::IDENTIFIER :
159       // Whitespace or line terminator
160       c == ' ' ? Token::WHITESPACE :
161       c == '\t' ? Token::WHITESPACE :
162       c == '\v' ? Token::WHITESPACE :
163       c == '\f' ? Token::WHITESPACE :
164       c == '\r' ? Token::WHITESPACE :
165       c == '\n' ? Token::WHITESPACE :
166       // IsDecimalDigit must be tested before IsAsciiIdentifier
167       IsDecimalDigit(c) ? Token::NUMBER :
168       IsAsciiIdentifier(c) ? Token::IDENTIFIER :
169       Token::ILLEGAL;
170   // clang-format on
171 }
172 
173 // Table of one-character tokens, by character (0x00..0x7F only).
174 static const constexpr Token::Value one_char_tokens[128] = {
175 #define CALL_GET_SCAN_FLAGS(N) GetOneCharToken(N),
176     INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
177 #undef CALL_GET_SCAN_FLAGS
178 };
179 
180 #undef KEYWORDS
181 
ScanIdentifierOrKeyword()182 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() {
183   next().literal_chars.Start();
184   return ScanIdentifierOrKeywordInner();
185 }
186 
// Per-character bit flags driving the fast path of scanning a keyword or
// identifier token. Each flag occupies its own bit so that the flags of
// several characters can be ORed together.
enum class ScanFlags : uint8_t {
  // The character is not an ASCII identifier character.
  kTerminatesLiteral = 0x01,
  // The character is an identifier character that appears in no keyword.
  // Phrased as "cannot" rather than "can" so that the flag can be ORed
  // together across multiple characters.
  kCannotBeKeyword = 0x02,
  // No keyword starts with the character. Has to stay exactly one bit above
  // kCannotBeKeyword: the identifier scanner shifts the flags of the first
  // character right by one to turn this flag into kCannotBeKeyword.
  kCannotBeKeywordStart = 0x04,
  // The character may end a string literal (quotes, newlines, backslash).
  kStringTerminator = 0x08,
  // The character (a backslash) forces identifier scanning onto the slow path.
  kIdentifierNeedsSlowPath = 0x10,
  // The character (a newline or '*') is interesting for multiline comment
  // scanning.
  kMultilineCommentCharacterNeedsSlowPath = 0x20,
};
GetScanFlags(char c)198 constexpr uint8_t GetScanFlags(char c) {
199   return
200       // Keywords are all lowercase and only contain letters.
201       // Note that non-identifier characters do not set this flag, so
202       // that it plays well with kTerminatesLiteral.
203       (IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c)
204            ? static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)
205            : 0) |
206       (IsKeywordStart(c)
207            ? 0
208            : static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart)) |
209       // Anything that isn't an identifier character will terminate the
210       // literal, or at least terminates the literal fast path processing
211       // (like an escape).
212       (!IsAsciiIdentifier(c)
213            ? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)
214            : 0) |
215       // Possible string termination characters.
216       ((c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\')
217            ? static_cast<uint8_t>(ScanFlags::kStringTerminator)
218            : 0) |
219       // Escapes are processed on the slow path.
220       (c == '\\' ? static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath)
221                  : 0) |
222       // Newlines and * are interesting characters for multiline comment
223       // scanning.
224       (c == '\n' || c == '\r' || c == '*'
225            ? static_cast<uint8_t>(
226                  ScanFlags::kMultilineCommentCharacterNeedsSlowPath)
227            : 0);
228 }
TerminatesLiteral(uint8_t scan_flags)229 inline bool TerminatesLiteral(uint8_t scan_flags) {
230   return (scan_flags & static_cast<uint8_t>(ScanFlags::kTerminatesLiteral));
231 }
CanBeKeyword(uint8_t scan_flags)232 inline bool CanBeKeyword(uint8_t scan_flags) {
233   return !(scan_flags & static_cast<uint8_t>(ScanFlags::kCannotBeKeyword));
234 }
IdentifierNeedsSlowPath(uint8_t scan_flags)235 inline bool IdentifierNeedsSlowPath(uint8_t scan_flags) {
236   return (scan_flags &
237           static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath));
238 }
MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags)239 inline bool MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags) {
240   return (scan_flags & static_cast<uint8_t>(
241                            ScanFlags::kMultilineCommentCharacterNeedsSlowPath));
242 }
MayTerminateString(uint8_t scan_flags)243 inline bool MayTerminateString(uint8_t scan_flags) {
244   return (scan_flags & static_cast<uint8_t>(ScanFlags::kStringTerminator));
245 }
246 // Table of precomputed scan flags for the 128 ASCII characters, for branchless
247 // flag calculation during the scan.
248 static constexpr const uint8_t character_scan_flags[128] = {
249 #define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
250     INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
251 #undef CALL_GET_SCAN_FLAGS
252 };
253 
CharCanBeKeyword(base::uc32 c)254 inline bool CharCanBeKeyword(base::uc32 c) {
255   return static_cast<uint32_t>(c) < arraysize(character_scan_flags) &&
256          CanBeKeyword(character_scan_flags[c]);
257 }
258 
ScanIdentifierOrKeywordInner()259 V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
260   DCHECK(IsIdentifierStart(c0_));
261   bool escaped = false;
262   bool can_be_keyword = true;
263 
264   STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
265   if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
266     if (V8_LIKELY(c0_ != '\\')) {
267       uint8_t scan_flags = character_scan_flags[c0_];
268       DCHECK(!TerminatesLiteral(scan_flags));
269       STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) ==
270                     static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1);
271       scan_flags >>= 1;
272       // Make sure the shifting above doesn't set IdentifierNeedsSlowPath.
273       // Otherwise we'll fall into the slow path after scanning the identifier.
274       DCHECK(!IdentifierNeedsSlowPath(scan_flags));
275       AddLiteralChar(static_cast<char>(c0_));
276       AdvanceUntil([this, &scan_flags](base::uc32 c0) {
277         if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
278           // A non-ascii character means we need to drop through to the slow
279           // path.
280           // TODO(leszeks): This would be most efficient as a goto to the slow
281           // path, check codegen and maybe use a bool instead.
282           scan_flags |=
283               static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath);
284           return true;
285         }
286         uint8_t char_flags = character_scan_flags[c0];
287         scan_flags |= char_flags;
288         if (TerminatesLiteral(char_flags)) {
289           return true;
290         } else {
291           AddLiteralChar(static_cast<char>(c0));
292           return false;
293         }
294       });
295 
296       if (V8_LIKELY(!IdentifierNeedsSlowPath(scan_flags))) {
297         if (!CanBeKeyword(scan_flags)) return Token::IDENTIFIER;
298         // Could be a keyword or identifier.
299         base::Vector<const uint8_t> chars =
300             next().literal_chars.one_byte_literal();
301         return KeywordOrIdentifierToken(chars.begin(), chars.length());
302       }
303 
304       can_be_keyword = CanBeKeyword(scan_flags);
305     } else {
306       // Special case for escapes at the start of an identifier.
307       escaped = true;
308       base::uc32 c = ScanIdentifierUnicodeEscape();
309       DCHECK(!IsIdentifierStart(Invalid()));
310       if (c == '\\' || !IsIdentifierStart(c)) {
311         return Token::ILLEGAL;
312       }
313       AddLiteralChar(c);
314       can_be_keyword = CharCanBeKeyword(c);
315     }
316   }
317 
318   return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
319 }
320 
SkipWhiteSpace()321 V8_INLINE Token::Value Scanner::SkipWhiteSpace() {
322   int start_position = source_pos();
323 
324   // We won't skip behind the end of input.
325   DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
326 
327   // Advance as long as character is a WhiteSpace or LineTerminator.
328   while (IsWhiteSpaceOrLineTerminator(c0_)) {
329     if (!next().after_line_terminator && unibrow::IsLineTerminator(c0_)) {
330       next().after_line_terminator = true;
331     }
332     Advance();
333   }
334 
335   // Return whether or not we skipped any characters.
336   if (source_pos() == start_position) {
337     DCHECK_NE('0', c0_);
338     return Token::ILLEGAL;
339   }
340 
341   return Token::WHITESPACE;
342 }
343 
ScanSingleToken()344 V8_INLINE Token::Value Scanner::ScanSingleToken() {
345   Token::Value token;
346   do {
347     next().location.beg_pos = source_pos();
348 
349     if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) {
350       token = one_char_tokens[c0_];
351 
352       switch (token) {
353         case Token::LPAREN:
354         case Token::RPAREN:
355         case Token::LBRACE:
356         case Token::RBRACE:
357         case Token::LBRACK:
358         case Token::RBRACK:
359         case Token::COLON:
360         case Token::SEMICOLON:
361         case Token::COMMA:
362         case Token::BIT_NOT:
363         case Token::ILLEGAL:
364           // One character tokens.
365           return Select(token);
366 
367         case Token::CONDITIONAL:
368           // ? ?. ?? ??=
369           Advance();
370           if (c0_ == '.') {
371             Advance();
372             if (!IsDecimalDigit(c0_)) return Token::QUESTION_PERIOD;
373             PushBack('.');
374           } else if (c0_ == '?') {
375             return Select('=', Token::ASSIGN_NULLISH, Token::NULLISH);
376           }
377           return Token::CONDITIONAL;
378 
379         case Token::STRING:
380           return ScanString();
381 
382         case Token::LT:
383           // < <= << <<= <!--
384           Advance();
385           if (c0_ == '=') return Select(Token::LTE);
386           if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
387           if (c0_ == '!') {
388             token = ScanHtmlComment();
389             continue;
390           }
391           return Token::LT;
392 
393         case Token::GT:
394           // > >= >> >>= >>> >>>=
395           Advance();
396           if (c0_ == '=') return Select(Token::GTE);
397           if (c0_ == '>') {
398             // >> >>= >>> >>>=
399             Advance();
400             if (c0_ == '=') return Select(Token::ASSIGN_SAR);
401             if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
402             return Token::SAR;
403           }
404           return Token::GT;
405 
406         case Token::ASSIGN:
407           // = == === =>
408           Advance();
409           if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
410           if (c0_ == '>') return Select(Token::ARROW);
411           return Token::ASSIGN;
412 
413         case Token::NOT:
414           // ! != !==
415           Advance();
416           if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
417           return Token::NOT;
418 
419         case Token::ADD:
420           // + ++ +=
421           Advance();
422           if (c0_ == '+') return Select(Token::INC);
423           if (c0_ == '=') return Select(Token::ASSIGN_ADD);
424           return Token::ADD;
425 
426         case Token::SUB:
427           // - -- --> -=
428           Advance();
429           if (c0_ == '-') {
430             Advance();
431             if (c0_ == '>' && next().after_line_terminator) {
432               // For compatibility with SpiderMonkey, we skip lines that
433               // start with an HTML comment end '-->'.
434               token = SkipSingleHTMLComment();
435               continue;
436             }
437             return Token::DEC;
438           }
439           if (c0_ == '=') return Select(Token::ASSIGN_SUB);
440           return Token::SUB;
441 
442         case Token::MUL:
443           // * *=
444           Advance();
445           if (c0_ == '*') return Select('=', Token::ASSIGN_EXP, Token::EXP);
446           if (c0_ == '=') return Select(Token::ASSIGN_MUL);
447           return Token::MUL;
448 
449         case Token::MOD:
450           // % %=
451           return Select('=', Token::ASSIGN_MOD, Token::MOD);
452 
453         case Token::DIV:
454           // /  // /* /=
455           Advance();
456           if (c0_ == '/') {
457             base::uc32 c = Peek();
458             if (c == '#' || c == '@') {
459               Advance();
460               Advance();
461               token = SkipSourceURLComment();
462               continue;
463             }
464             token = SkipSingleLineComment();
465             continue;
466           }
467           if (c0_ == '*') {
468             token = SkipMultiLineComment();
469             continue;
470           }
471           if (c0_ == '=') return Select(Token::ASSIGN_DIV);
472           return Token::DIV;
473 
474         case Token::BIT_AND:
475           // & && &= &&=
476           Advance();
477           if (c0_ == '&') return Select('=', Token::ASSIGN_AND, Token::AND);
478           if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
479           return Token::BIT_AND;
480 
481         case Token::BIT_OR:
482           // | || |= ||=
483           Advance();
484           if (c0_ == '|') return Select('=', Token::ASSIGN_OR, Token::OR);
485           if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
486           return Token::BIT_OR;
487 
488         case Token::BIT_XOR:
489           // ^ ^=
490           return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
491 
492         case Token::PERIOD:
493           // . Number
494           Advance();
495           if (IsDecimalDigit(c0_)) return ScanNumber(true);
496           if (c0_ == '.') {
497             if (Peek() == '.') {
498               Advance();
499               Advance();
500               return Token::ELLIPSIS;
501             }
502           }
503           return Token::PERIOD;
504 
505         case Token::TEMPLATE_SPAN:
506           Advance();
507           return ScanTemplateSpan();
508 
509         case Token::PRIVATE_NAME:
510           if (source_pos() == 0 && Peek() == '!') {
511             token = SkipSingleLineComment();
512             continue;
513           }
514           return ScanPrivateName();
515 
516         case Token::WHITESPACE:
517           token = SkipWhiteSpace();
518           continue;
519 
520         case Token::NUMBER:
521           return ScanNumber(false);
522 
523         case Token::IDENTIFIER:
524           return ScanIdentifierOrKeyword();
525 
526         default:
527           UNREACHABLE();
528       }
529     }
530 
531     if (IsIdentifierStart(c0_) ||
532         (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
533       return ScanIdentifierOrKeyword();
534     }
535     if (c0_ == kEndOfInput) {
536       return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS;
537     }
538     token = SkipWhiteSpace();
539 
540     // Continue scanning for tokens as long as we're just skipping whitespace.
541   } while (token == Token::WHITESPACE);
542 
543   return token;
544 }
545 
Scan(TokenDesc * next_desc)546 void Scanner::Scan(TokenDesc* next_desc) {
547   DCHECK_EQ(next_desc, &next());
548 
549   next_desc->token = ScanSingleToken();
550   DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL);
551   next_desc->location.end_pos = source_pos();
552 
553 #ifdef DEBUG
554   SanityCheckTokenDesc(current());
555   SanityCheckTokenDesc(next());
556   SanityCheckTokenDesc(next_next());
557 #endif
558 }
559 
Scan()560 void Scanner::Scan() { Scan(next_); }
561 
562 }  // namespace internal
563 }  // namespace v8
564 
565 #endif  // V8_PARSING_SCANNER_INL_H_
566