1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #ifndef V8_PARSING_SCANNER_H_ 8 #define V8_PARSING_SCANNER_H_ 9 10 #include "src/allocation.h" 11 #include "src/base/logging.h" 12 #include "src/char-predicates.h" 13 #include "src/globals.h" 14 #include "src/messages.h" 15 #include "src/parsing/token.h" 16 #include "src/unicode-decoder.h" 17 #include "src/unicode.h" 18 19 namespace v8 { 20 namespace internal { 21 22 23 class AstRawString; 24 class AstValueFactory; 25 class DuplicateFinder; 26 class ExternalOneByteString; 27 class ExternalTwoByteString; 28 class ParserRecorder; 29 class UnicodeCache; 30 31 // --------------------------------------------------------------------- 32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. 33 // A code unit is a 16 bit value representing either a 16 bit code point 34 // or one part of a surrogate pair that make a single 21 bit code point. 35 class Utf16CharacterStream { 36 public: 37 static const uc32 kEndOfInput = -1; 38 ~Utf16CharacterStream()39 virtual ~Utf16CharacterStream() { } 40 41 // Returns and advances past the next UTF-16 code unit in the input 42 // stream. If there are no more code units it returns kEndOfInput. Advance()43 inline uc32 Advance() { 44 if (V8_LIKELY(buffer_cursor_ < buffer_end_)) { 45 return static_cast<uc32>(*(buffer_cursor_++)); 46 } else if (ReadBlockChecked()) { 47 return static_cast<uc32>(*(buffer_cursor_++)); 48 } else { 49 // Note: currently the following increment is necessary to avoid a 50 // parser problem! The scanner treats the final kEndOfInput as 51 // a code unit with a position, and does math relative to that 52 // position. 53 buffer_cursor_++; 54 return kEndOfInput; 55 } 56 } 57 58 // Go back one by one character in the input stream. 59 // This undoes the most recent Advance(). Back()60 inline void Back() { 61 // The common case - if the previous character is within 62 // buffer_start_ .. buffer_end_ will be handles locally. 63 // Otherwise, a new block is requested. 64 if (V8_LIKELY(buffer_cursor_ > buffer_start_)) { 65 buffer_cursor_--; 66 } else { 67 ReadBlockAt(pos() - 1); 68 } 69 } 70 71 // Go back one by two characters in the input stream. (This is the same as 72 // calling Back() twice. But Back() may - in some instances - do substantial 73 // work. Back2() guarantees this work will be done only once.) Back2()74 inline void Back2() { 75 if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) { 76 buffer_cursor_ -= 2; 77 } else { 78 ReadBlockAt(pos() - 2); 79 } 80 } 81 pos()82 inline size_t pos() const { 83 return buffer_pos_ + (buffer_cursor_ - buffer_start_); 84 } 85 Seek(size_t pos)86 inline void Seek(size_t pos) { 87 if (V8_LIKELY(pos >= buffer_pos_ && 88 pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) { 89 buffer_cursor_ = buffer_start_ + (pos - buffer_pos_); 90 } else { 91 ReadBlockAt(pos); 92 } 93 } 94 95 // Returns true if the stream could access the V8 heap after construction. 96 virtual bool can_access_heap() = 0; 97 98 protected: Utf16CharacterStream(const uint16_t * buffer_start,const uint16_t * buffer_cursor,const uint16_t * buffer_end,size_t buffer_pos)99 Utf16CharacterStream(const uint16_t* buffer_start, 100 const uint16_t* buffer_cursor, 101 const uint16_t* buffer_end, size_t buffer_pos) 102 : buffer_start_(buffer_start), 103 buffer_cursor_(buffer_cursor), 104 buffer_end_(buffer_end), 105 buffer_pos_(buffer_pos) {} Utf16CharacterStream()106 Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {} 107 ReadBlockChecked()108 bool ReadBlockChecked() { 109 size_t position = pos(); 110 USE(position); 111 bool success = ReadBlock(); 112 113 // Post-conditions: 1, We should always be at the right position. 114 // 2, Cursor should be inside the buffer. 115 // 3, We should have more characters available iff success. 116 DCHECK_EQ(pos(), position); 117 DCHECK_LE(buffer_cursor_, buffer_end_); 118 DCHECK_LE(buffer_start_, buffer_cursor_); 119 DCHECK_EQ(success, buffer_cursor_ < buffer_end_); 120 return success; 121 } 122 ReadBlockAt(size_t new_pos)123 void ReadBlockAt(size_t new_pos) { 124 // The callers of this method (Back/Back2/Seek) should handle the easy 125 // case (seeking within the current buffer), and we should only get here 126 // if we actually require new data. 127 // (This is really an efficiency check, not a correctness invariant.) 128 DCHECK(new_pos < buffer_pos_ || 129 new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_)); 130 131 // Change pos() to point to new_pos. 132 buffer_pos_ = new_pos; 133 buffer_cursor_ = buffer_start_; 134 DCHECK_EQ(pos(), new_pos); 135 ReadBlockChecked(); 136 } 137 138 // Read more data, and update buffer_*_ to point to it. 139 // Returns true if more data was available. 140 // 141 // ReadBlock() may modify any of the buffer_*_ members, but must sure that 142 // the result of pos() remains unaffected. 143 // 144 // Examples: 145 // - a stream could either fill a separate buffer. Then buffer_start_ and 146 // buffer_cursor_ would point to the beginning of the buffer, and 147 // buffer_pos would be the old pos(). 148 // - a stream with existing buffer chunks would set buffer_start_ and 149 // buffer_end_ to cover the full chunk, and then buffer_cursor_ would 150 // point into the middle of the buffer, while buffer_pos_ would describe 151 // the start of the buffer. 152 virtual bool ReadBlock() = 0; 153 154 const uint16_t* buffer_start_; 155 const uint16_t* buffer_cursor_; 156 const uint16_t* buffer_end_; 157 size_t buffer_pos_; 158 }; 159 160 161 // ---------------------------------------------------------------------------- 162 // JavaScript Scanner. 163 164 class Scanner { 165 public: 166 // Scoped helper for a re-settable bookmark. 167 class BookmarkScope { 168 public: BookmarkScope(Scanner * scanner)169 explicit BookmarkScope(Scanner* scanner) 170 : scanner_(scanner), bookmark_(kNoBookmark) { 171 DCHECK_NOT_NULL(scanner_); 172 } ~BookmarkScope()173 ~BookmarkScope() {} 174 175 void Set(); 176 void Apply(); 177 bool HasBeenSet(); 178 bool HasBeenApplied(); 179 180 private: 181 static const size_t kNoBookmark; 182 static const size_t kBookmarkWasApplied; 183 static const size_t kBookmarkAtFirstPos; 184 185 Scanner* scanner_; 186 size_t bookmark_; 187 188 DISALLOW_COPY_AND_ASSIGN(BookmarkScope); 189 }; 190 191 // Representation of an interval of source positions. 192 struct Location { LocationLocation193 Location(int b, int e) : beg_pos(b), end_pos(e) { } LocationLocation194 Location() : beg_pos(0), end_pos(0) { } 195 IsValidLocation196 bool IsValid() const { 197 return beg_pos >= 0 && end_pos >= beg_pos; 198 } 199 invalidLocation200 static Location invalid() { return Location(-1, -1); } 201 202 int beg_pos; 203 int end_pos; 204 }; 205 206 // -1 is outside of the range of any real source code. 207 static const int kNoOctalLocation = -1; 208 static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput; 209 210 explicit Scanner(UnicodeCache* scanner_contants); 211 212 void Initialize(Utf16CharacterStream* source, bool is_module); 213 214 // Returns the next token and advances input. 215 Token::Value Next(); 216 // Returns the token following peek() 217 Token::Value PeekAhead(); 218 // Returns the current token again. current_token()219 Token::Value current_token() { return current_.token; } 220 current_contextual_token()221 Token::Value current_contextual_token() { return current_.contextual_token; } next_contextual_token()222 Token::Value next_contextual_token() { return next_.contextual_token; } 223 224 // Returns the location information for the current token 225 // (the token last returned by Next()). location()226 Location location() const { return current_.location; } 227 228 // This error is specifically an invalid hex or unicode escape sequence. has_error()229 bool has_error() const { return scanner_error_ != MessageTemplate::kNone; } error()230 MessageTemplate::Template error() const { return scanner_error_; } error_location()231 Location error_location() const { return scanner_error_location_; } 232 has_invalid_template_escape()233 bool has_invalid_template_escape() const { 234 return current_.invalid_template_escape_message != MessageTemplate::kNone; 235 } invalid_template_escape_message()236 MessageTemplate::Template invalid_template_escape_message() const { 237 DCHECK(has_invalid_template_escape()); 238 return current_.invalid_template_escape_message; 239 } invalid_template_escape_location()240 Location invalid_template_escape_location() const { 241 DCHECK(has_invalid_template_escape()); 242 return current_.invalid_template_escape_location; 243 } 244 245 // Similar functions for the upcoming token. 246 247 // One token look-ahead (past the token returned by Next()). peek()248 Token::Value peek() const { return next_.token; } 249 peek_location()250 Location peek_location() const { return next_.location; } 251 literal_contains_escapes()252 bool literal_contains_escapes() const { 253 return LiteralContainsEscapes(current_); 254 } 255 256 const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory) const; 257 const AstRawString* NextSymbol(AstValueFactory* ast_value_factory) const; 258 const AstRawString* CurrentRawSymbol( 259 AstValueFactory* ast_value_factory) const; 260 261 double DoubleValue(); 262 263 const char* CurrentLiteralAsCString(Zone* zone) const; 264 CurrentMatches(Token::Value token)265 inline bool CurrentMatches(Token::Value token) const { 266 DCHECK(Token::IsKeyword(token)); 267 return current_.token == token; 268 } 269 CurrentMatchesContextual(Token::Value token)270 inline bool CurrentMatchesContextual(Token::Value token) const { 271 DCHECK(Token::IsContextualKeyword(token)); 272 return current_.contextual_token == token; 273 } 274 275 // Match the token against the contextual keyword or literal buffer. CurrentMatchesContextualEscaped(Token::Value token)276 inline bool CurrentMatchesContextualEscaped(Token::Value token) const { 277 DCHECK(Token::IsContextualKeyword(token) || token == Token::LET); 278 // Escaped keywords are not matched as tokens. So if we require escape 279 // and/or string processing we need to look at the literal content 280 // (which was escape-processed already). 281 // Conveniently, current_.literal_chars == nullptr for all proper keywords, 282 // so this second condition should exit early in common cases. 283 return (current_.contextual_token == token) || 284 (current_.literal_chars && 285 current_.literal_chars->Equals(Vector<const char>( 286 Token::String(token), Token::StringLength(token)))); 287 } 288 IsUseStrict()289 bool IsUseStrict() const { 290 return current_.token == Token::STRING && 291 current_.literal_chars->Equals( 292 Vector<const char>("use strict", strlen("use strict"))); 293 } IsGetOrSet(bool * is_get,bool * is_set)294 bool IsGetOrSet(bool* is_get, bool* is_set) const { 295 *is_get = CurrentMatchesContextual(Token::GET); 296 *is_set = CurrentMatchesContextual(Token::SET); 297 return *is_get || *is_set; 298 } IsLet()299 bool IsLet() const { 300 return CurrentMatches(Token::LET) || 301 CurrentMatchesContextualEscaped(Token::LET); 302 } 303 304 // Check whether the CurrentSymbol() has already been seen. 305 // The DuplicateFinder holds the data, so different instances can be used 306 // for different sets of duplicates to check for. 307 bool IsDuplicateSymbol(DuplicateFinder* duplicate_finder, 308 AstValueFactory* ast_value_factory) const; 309 unicode_cache()310 UnicodeCache* unicode_cache() { return unicode_cache_; } 311 312 // Returns the location of the last seen octal literal. octal_position()313 Location octal_position() const { return octal_pos_; } clear_octal_position()314 void clear_octal_position() { 315 octal_pos_ = Location::invalid(); 316 octal_message_ = MessageTemplate::kNone; 317 } octal_message()318 MessageTemplate::Template octal_message() const { return octal_message_; } 319 320 // Returns the value of the last smi that was scanned. smi_value()321 uint32_t smi_value() const { return current_.smi_value_; } 322 323 // Seek forward to the given position. This operation does not 324 // work in general, for instance when there are pushed back 325 // characters, but works for seeking forward until simple delimiter 326 // tokens, which is what it is used for. 327 void SeekForward(int pos); 328 329 // Returns true if there was a line terminator before the peek'ed token, 330 // possibly inside a multi-line comment. HasAnyLineTerminatorBeforeNext()331 bool HasAnyLineTerminatorBeforeNext() const { 332 return has_line_terminator_before_next_ || 333 has_multiline_comment_before_next_; 334 } 335 HasAnyLineTerminatorAfterNext()336 bool HasAnyLineTerminatorAfterNext() { 337 Token::Value ensure_next_next = PeekAhead(); 338 USE(ensure_next_next); 339 return has_line_terminator_after_next_; 340 } 341 342 // Scans the input as a regular expression pattern, next token must be /(=). 343 // Returns true if a pattern is scanned. 344 bool ScanRegExpPattern(); 345 // Scans the input as regular expression flags. Returns the flags on success. 346 Maybe<RegExp::Flags> ScanRegExpFlags(); 347 348 // Scans the input as a template literal 349 Token::Value ScanTemplateStart(); ScanTemplateContinuation()350 Token::Value ScanTemplateContinuation() { 351 DCHECK_EQ(next_.token, Token::RBRACE); 352 next_.location.beg_pos = source_pos() - 1; // We already consumed } 353 return ScanTemplateSpan(); 354 } 355 356 Handle<String> SourceUrl(Isolate* isolate) const; 357 Handle<String> SourceMappingUrl(Isolate* isolate) const; 358 FoundHtmlComment()359 bool FoundHtmlComment() const { return found_html_comment_; } 360 allow_harmony_bigint()361 bool allow_harmony_bigint() const { return allow_harmony_bigint_; } set_allow_harmony_bigint(bool allow)362 void set_allow_harmony_bigint(bool allow) { allow_harmony_bigint_ = allow; } allow_harmony_private_fields()363 bool allow_harmony_private_fields() const { 364 return allow_harmony_private_fields_; 365 } set_allow_harmony_private_fields(bool allow)366 void set_allow_harmony_private_fields(bool allow) { 367 allow_harmony_private_fields_ = allow; 368 } allow_harmony_numeric_separator()369 bool allow_harmony_numeric_separator() const { 370 return allow_harmony_numeric_separator_; 371 } set_allow_harmony_numeric_separator(bool allow)372 void set_allow_harmony_numeric_separator(bool allow) { 373 allow_harmony_numeric_separator_ = allow; 374 } 375 376 private: 377 // Scoped helper for saving & restoring scanner error state. 378 // This is used for tagged template literals, in which normally forbidden 379 // escape sequences are allowed. 380 class ErrorState; 381 382 // Scoped helper for literal recording. Automatically drops the literal 383 // if aborting the scanning before it's complete. 384 class LiteralScope { 385 public: LiteralScope(Scanner * self)386 explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) { 387 scanner_->StartLiteral(); 388 } ~LiteralScope()389 ~LiteralScope() { 390 if (!complete_) scanner_->DropLiteral(); 391 } Complete()392 void Complete() { complete_ = true; } 393 394 private: 395 Scanner* scanner_; 396 bool complete_; 397 }; 398 399 // LiteralBuffer - Collector of chars of literals. 400 class LiteralBuffer { 401 public: LiteralBuffer()402 LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() {} 403 ~LiteralBuffer()404 ~LiteralBuffer() { backing_store_.Dispose(); } 405 INLINE(void AddChar (char code_unit))406 INLINE(void AddChar(char code_unit)) { 407 DCHECK(IsValidAscii(code_unit)); 408 AddOneByteChar(static_cast<byte>(code_unit)); 409 } 410 INLINE(void AddChar (uc32 code_unit))411 INLINE(void AddChar(uc32 code_unit)) { 412 if (is_one_byte_ && 413 code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) { 414 AddOneByteChar(static_cast<byte>(code_unit)); 415 } else { 416 AddCharSlow(code_unit); 417 } 418 } 419 is_one_byte()420 bool is_one_byte() const { return is_one_byte_; } 421 Equals(Vector<const char> keyword)422 bool Equals(Vector<const char> keyword) const { 423 return is_one_byte() && keyword.length() == position_ && 424 (memcmp(keyword.start(), backing_store_.start(), position_) == 0); 425 } 426 two_byte_literal()427 Vector<const uint16_t> two_byte_literal() const { 428 DCHECK(!is_one_byte_); 429 DCHECK_EQ(position_ & 0x1, 0); 430 return Vector<const uint16_t>( 431 reinterpret_cast<const uint16_t*>(backing_store_.start()), 432 position_ >> 1); 433 } 434 one_byte_literal()435 Vector<const uint8_t> one_byte_literal() const { 436 DCHECK(is_one_byte_); 437 return Vector<const uint8_t>( 438 reinterpret_cast<const uint8_t*>(backing_store_.start()), position_); 439 } 440 length()441 int length() const { return is_one_byte_ ? position_ : (position_ >> 1); } 442 ReduceLength(int delta)443 void ReduceLength(int delta) { 444 position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size); 445 } 446 Reset()447 void Reset() { 448 position_ = 0; 449 is_one_byte_ = true; 450 } 451 452 Handle<String> Internalize(Isolate* isolate) const; 453 454 private: 455 static const int kInitialCapacity = 16; 456 static const int kGrowthFactory = 4; 457 static const int kMinConversionSlack = 256; 458 static const int kMaxGrowth = 1 * MB; 459 IsValidAscii(char code_unit)460 inline bool IsValidAscii(char code_unit) { 461 // Control characters and printable characters span the range of 462 // valid ASCII characters (0-127). Chars are unsigned on some 463 // platforms which causes compiler warnings if the validity check 464 // tests the lower bound >= 0 as it's always true. 465 return iscntrl(code_unit) || isprint(code_unit); 466 } 467 INLINE(void AddOneByteChar (byte one_byte_char))468 INLINE(void AddOneByteChar(byte one_byte_char)) { 469 DCHECK(is_one_byte_); 470 if (position_ >= backing_store_.length()) ExpandBuffer(); 471 backing_store_[position_] = one_byte_char; 472 position_ += kOneByteSize; 473 } 474 475 void AddCharSlow(uc32 code_unit); 476 int NewCapacity(int min_capacity); 477 void ExpandBuffer(); 478 void ConvertToTwoByte(); 479 480 bool is_one_byte_; 481 int position_; 482 Vector<byte> backing_store_; 483 484 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); 485 }; 486 487 // The current and look-ahead token. 488 struct TokenDesc { 489 Location location; 490 LiteralBuffer* literal_chars; 491 LiteralBuffer* raw_literal_chars; 492 uint32_t smi_value_; 493 Token::Value token; 494 MessageTemplate::Template invalid_template_escape_message; 495 Location invalid_template_escape_location; 496 Token::Value contextual_token; 497 }; 498 499 enum NumberKind { 500 BINARY, 501 OCTAL, 502 IMPLICIT_OCTAL, 503 HEX, 504 DECIMAL, 505 DECIMAL_WITH_LEADING_ZERO 506 }; 507 508 static const int kCharacterLookaheadBufferSize = 1; 509 const int kMaxAscii = 127; 510 511 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. 512 template <bool capture_raw> 513 uc32 ScanOctalEscape(uc32 c, int length, bool in_template_literal); 514 515 // Call this after setting source_ to the input. Init()516 void Init() { 517 // Set c0_ (one character ahead) 518 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1); 519 Advance(); 520 // Initialize current_ to not refer to a literal. 521 current_.token = Token::UNINITIALIZED; 522 current_.contextual_token = Token::UNINITIALIZED; 523 current_.literal_chars = nullptr; 524 current_.raw_literal_chars = nullptr; 525 current_.invalid_template_escape_message = MessageTemplate::kNone; 526 next_.token = Token::UNINITIALIZED; 527 next_.contextual_token = Token::UNINITIALIZED; 528 next_.literal_chars = nullptr; 529 next_.raw_literal_chars = nullptr; 530 next_.invalid_template_escape_message = MessageTemplate::kNone; 531 next_next_.token = Token::UNINITIALIZED; 532 next_next_.contextual_token = Token::UNINITIALIZED; 533 next_next_.literal_chars = nullptr; 534 next_next_.raw_literal_chars = nullptr; 535 next_next_.invalid_template_escape_message = MessageTemplate::kNone; 536 found_html_comment_ = false; 537 scanner_error_ = MessageTemplate::kNone; 538 } 539 ReportScannerError(const Location & location,MessageTemplate::Template error)540 void ReportScannerError(const Location& location, 541 MessageTemplate::Template error) { 542 if (has_error()) return; 543 scanner_error_ = error; 544 scanner_error_location_ = location; 545 } 546 ReportScannerError(int pos,MessageTemplate::Template error)547 void ReportScannerError(int pos, MessageTemplate::Template error) { 548 if (has_error()) return; 549 scanner_error_ = error; 550 scanner_error_location_ = Location(pos, pos + 1); 551 } 552 553 // Seek to the next_ token at the given position. 554 void SeekNext(size_t position); 555 556 // Literal buffer support StartLiteral()557 inline void StartLiteral() { 558 LiteralBuffer* free_buffer = 559 (current_.literal_chars == &literal_buffer0_) 560 ? &literal_buffer1_ 561 : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_ 562 : &literal_buffer0_; 563 free_buffer->Reset(); 564 next_.literal_chars = free_buffer; 565 } 566 StartRawLiteral()567 inline void StartRawLiteral() { 568 LiteralBuffer* free_buffer = 569 (current_.raw_literal_chars == &raw_literal_buffer0_) 570 ? &raw_literal_buffer1_ 571 : (current_.raw_literal_chars == &raw_literal_buffer1_) 572 ? &raw_literal_buffer2_ 573 : &raw_literal_buffer0_; 574 free_buffer->Reset(); 575 next_.raw_literal_chars = free_buffer; 576 } 577 INLINE(void AddLiteralChar (uc32 c))578 INLINE(void AddLiteralChar(uc32 c)) { 579 DCHECK_NOT_NULL(next_.literal_chars); 580 next_.literal_chars->AddChar(c); 581 } 582 INLINE(void AddLiteralChar (char c))583 INLINE(void AddLiteralChar(char c)) { 584 DCHECK_NOT_NULL(next_.literal_chars); 585 next_.literal_chars->AddChar(c); 586 } 587 INLINE(void AddRawLiteralChar (uc32 c))588 INLINE(void AddRawLiteralChar(uc32 c)) { 589 DCHECK_NOT_NULL(next_.raw_literal_chars); 590 next_.raw_literal_chars->AddChar(c); 591 } 592 INLINE(void ReduceRawLiteralLength (int delta))593 INLINE(void ReduceRawLiteralLength(int delta)) { 594 DCHECK_NOT_NULL(next_.raw_literal_chars); 595 next_.raw_literal_chars->ReduceLength(delta); 596 } 597 598 // Stops scanning of a literal and drop the collected characters, 599 // e.g., due to an encountered error. DropLiteral()600 inline void DropLiteral() { 601 next_.literal_chars = nullptr; 602 next_.raw_literal_chars = nullptr; 603 } 604 AddLiteralCharAdvance()605 inline void AddLiteralCharAdvance() { 606 AddLiteralChar(c0_); 607 Advance(); 608 } 609 610 // Low-level scanning support. 611 template <bool capture_raw = false, bool check_surrogate = true> Advance()612 void Advance() { 613 if (capture_raw) { 614 AddRawLiteralChar(c0_); 615 } 616 c0_ = source_->Advance(); 617 if (check_surrogate) HandleLeadSurrogate(); 618 } 619 HandleLeadSurrogate()620 void HandleLeadSurrogate() { 621 if (unibrow::Utf16::IsLeadSurrogate(c0_)) { 622 uc32 c1 = source_->Advance(); 623 if (!unibrow::Utf16::IsTrailSurrogate(c1)) { 624 source_->Back(); 625 } else { 626 c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1); 627 } 628 } 629 } 630 PushBack(uc32 ch)631 void PushBack(uc32 ch) { 632 if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 633 source_->Back2(); 634 } else { 635 source_->Back(); 636 } 637 c0_ = ch; 638 } 639 640 // Same as PushBack(ch1); PushBack(ch2). 641 // - Potentially more efficient as it uses Back2() on the stream. 642 // - Uses char as parameters, since we're only calling it with ASCII chars in 643 // practice. This way, we can avoid a few edge cases. PushBack2(char ch1,char ch2)644 void PushBack2(char ch1, char ch2) { 645 source_->Back2(); 646 c0_ = ch2; 647 } 648 Select(Token::Value tok)649 inline Token::Value Select(Token::Value tok) { 650 Advance(); 651 return tok; 652 } 653 Select(uc32 next,Token::Value then,Token::Value else_)654 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { 655 Advance(); 656 if (c0_ == next) { 657 Advance(); 658 return then; 659 } else { 660 return else_; 661 } 662 } 663 // Returns the literal string, if any, for the current token (the 664 // token last returned by Next()). The string is 0-terminated. 665 // Literal strings are collected for identifiers, strings, numbers as well 666 // as for template literals. For template literals we also collect the raw 667 // form. 668 // These functions only give the correct result if the literal was scanned 669 // when a LiteralScope object is alive. 670 // 671 // Current usage of these functions is unfortunately a little undisciplined, 672 // and is_literal_one_byte() + is_literal_one_byte_string() is also 673 // requested for tokens that do not have a literal. Hence, we treat any 674 // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a 675 // literal "function". literal_one_byte_string()676 Vector<const uint8_t> literal_one_byte_string() const { 677 if (current_.literal_chars) 678 return current_.literal_chars->one_byte_literal(); 679 const char* str = Token::String(current_.token); 680 const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str); 681 return Vector<const uint8_t>(str_as_uint8, 682 Token::StringLength(current_.token)); 683 } literal_two_byte_string()684 Vector<const uint16_t> literal_two_byte_string() const { 685 DCHECK_NOT_NULL(current_.literal_chars); 686 return current_.literal_chars->two_byte_literal(); 687 } is_literal_one_byte()688 bool is_literal_one_byte() const { 689 return !current_.literal_chars || current_.literal_chars->is_one_byte(); 690 } 691 // Returns the literal string for the next token (the token that 692 // would be returned if Next() were called). next_literal_one_byte_string()693 Vector<const uint8_t> next_literal_one_byte_string() const { 694 DCHECK_NOT_NULL(next_.literal_chars); 695 return next_.literal_chars->one_byte_literal(); 696 } next_literal_two_byte_string()697 Vector<const uint16_t> next_literal_two_byte_string() const { 698 DCHECK_NOT_NULL(next_.literal_chars); 699 return next_.literal_chars->two_byte_literal(); 700 } is_next_literal_one_byte()701 bool is_next_literal_one_byte() const { 702 DCHECK_NOT_NULL(next_.literal_chars); 703 return next_.literal_chars->is_one_byte(); 704 } raw_literal_one_byte_string()705 Vector<const uint8_t> raw_literal_one_byte_string() const { 706 DCHECK_NOT_NULL(current_.raw_literal_chars); 707 return current_.raw_literal_chars->one_byte_literal(); 708 } raw_literal_two_byte_string()709 Vector<const uint16_t> raw_literal_two_byte_string() const { 710 DCHECK_NOT_NULL(current_.raw_literal_chars); 711 return current_.raw_literal_chars->two_byte_literal(); 712 } is_raw_literal_one_byte()713 bool is_raw_literal_one_byte() const { 714 DCHECK_NOT_NULL(current_.raw_literal_chars); 715 return current_.raw_literal_chars->is_one_byte(); 716 } 717 718 template <bool capture_raw, bool unicode = false> 719 uc32 ScanHexNumber(int expected_length); 720 // Scan a number of any length but not bigger than max_value. For example, the 721 // number can be 000000001, so it's very long in characters but its value is 722 // small. 723 template <bool capture_raw> 724 uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos); 725 726 // Scans a single JavaScript token. 727 void Scan(); 728 729 Token::Value SkipWhiteSpace(); 730 Token::Value SkipSingleHTMLComment(); 731 Token::Value SkipSingleLineComment(); 732 Token::Value SkipSourceURLComment(); 733 void TryToParseSourceURLComment(); 734 Token::Value SkipMultiLineComment(); 735 // Scans a possible HTML comment -- begins with '<!'. 736 Token::Value ScanHtmlComment(); 737 738 bool ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch), 739 bool is_check_first_digit); 740 bool ScanDecimalDigits(); 741 // Optimized function to scan decimal number as Smi. 742 bool ScanDecimalAsSmi(uint64_t* value); 743 bool ScanDecimalAsSmiWithNumericSeparators(uint64_t* value); 744 bool ScanHexDigits(); 745 bool ScanBinaryDigits(); 746 bool ScanSignedInteger(); 747 bool ScanOctalDigits(); 748 bool ScanImplicitOctalDigits(int start_pos, NumberKind* kind); 749 750 Token::Value ScanNumber(bool seen_period); 751 Token::Value ScanIdentifierOrKeyword(); 752 Token::Value ScanIdentifierOrKeywordInner(LiteralScope* literal); 753 Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped); 754 755 Token::Value ScanString(); 756 Token::Value ScanPrivateName(); 757 758 // Scans an escape-sequence which is part of a string and adds the 759 // decoded character to the current literal. Returns true if a pattern 760 // is scanned. 761 template <bool capture_raw, bool in_template_literal> 762 bool ScanEscape(); 763 764 // Decodes a Unicode escape-sequence which is part of an identifier. 765 // If the escape sequence cannot be decoded the result is kBadChar. 766 uc32 ScanIdentifierUnicodeEscape(); 767 // Helper for the above functions. 768 template <bool capture_raw> 769 uc32 ScanUnicodeEscape(); 770 771 bool is_module_; 772 773 Token::Value ScanTemplateSpan(); 774 775 // Return the current source position. source_pos()776 int source_pos() { 777 return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize; 778 } 779 LiteralContainsEscapes(const TokenDesc & token)780 static bool LiteralContainsEscapes(const TokenDesc& token) { 781 Location location = token.location; 782 int source_length = (location.end_pos - location.beg_pos); 783 if (token.token == Token::STRING) { 784 // Subtract delimiters. 785 source_length -= 2; 786 } 787 return token.literal_chars && 788 (token.literal_chars->length() != source_length); 789 } 790 791 #ifdef DEBUG 792 void SanityCheckTokenDesc(const TokenDesc&) const; 793 #endif 794 795 UnicodeCache* unicode_cache_; 796 797 // Buffers collecting literal strings, numbers, etc. 798 LiteralBuffer literal_buffer0_; 799 LiteralBuffer literal_buffer1_; 800 LiteralBuffer literal_buffer2_; 801 802 // Values parsed from magic comments. 803 LiteralBuffer source_url_; 804 LiteralBuffer source_mapping_url_; 805 806 // Buffer to store raw string values 807 LiteralBuffer raw_literal_buffer0_; 808 LiteralBuffer raw_literal_buffer1_; 809 LiteralBuffer raw_literal_buffer2_; 810 811 TokenDesc current_; // desc for current token (as returned by Next()) 812 TokenDesc next_; // desc for next token (one token look-ahead) 813 TokenDesc next_next_; // desc for the token after next (after PeakAhead()) 814 815 // Input stream. Must be initialized to an Utf16CharacterStream. 816 Utf16CharacterStream* source_; 817 818 // Last-seen positions of potentially problematic tokens. 819 Location octal_pos_; 820 MessageTemplate::Template octal_message_; 821 822 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 823 uc32 c0_; 824 825 // Whether there is a line terminator whitespace character after 826 // the current token, and before the next. Does not count newlines 827 // inside multiline comments. 828 bool has_line_terminator_before_next_; 829 // Whether there is a multi-line comment that contains a 830 // line-terminator after the current token, and before the next. 831 bool has_multiline_comment_before_next_; 832 bool has_line_terminator_after_next_; 833 834 // Whether this scanner encountered an HTML comment. 835 bool found_html_comment_; 836 837 // Harmony flags to allow ESNext features. 838 bool allow_harmony_bigint_; 839 bool allow_harmony_private_fields_; 840 bool allow_harmony_numeric_separator_; 841 842 MessageTemplate::Template scanner_error_; 843 Location scanner_error_location_; 844 }; 845 846 } // namespace internal 847 } // namespace v8 848 849 #endif // V8_PARSING_SCANNER_H_ 850