1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 2 * vim: set ts=8 sts=4 et sw=4 tw=99: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 // JS lexical scanner. 8 9 #include "frontend/TokenStream.h" 10 11 #include "mozilla/IntegerTypeTraits.h" 12 #include "mozilla/PodOperations.h" 13 14 #include <ctype.h> 15 #include <stdarg.h> 16 #include <stdio.h> 17 #include <string.h> 18 19 #include "jsatom.h" 20 #include "jscntxt.h" 21 #include "jscompartment.h" 22 #include "jsexn.h" 23 #include "jsnum.h" 24 25 #include "frontend/BytecodeCompiler.h" 26 #include "js/CharacterEncoding.h" 27 #include "js/UniquePtr.h" 28 #include "vm/HelperThreads.h" 29 #include "vm/Keywords.h" 30 #include "vm/StringBuffer.h" 31 #include "vm/Unicode.h" 32 33 using namespace js; 34 using namespace js::frontend; 35 36 using mozilla::Maybe; 37 using mozilla::PodAssign; 38 using mozilla::PodCopy; 39 using mozilla::PodZero; 40 41 struct KeywordInfo { 42 const char* chars; // C string with keyword text 43 TokenKind tokentype; 44 }; 45 46 static const KeywordInfo keywords[] = { 47 #define KEYWORD_INFO(keyword, name, type) \ 48 {js_##keyword##_str, type}, 49 FOR_EACH_JAVASCRIPT_KEYWORD(KEYWORD_INFO) 50 #undef KEYWORD_INFO 51 }; 52 53 // Returns a KeywordInfo for the specified characters, or nullptr if the string 54 // is not a keyword. 
55 template <typename CharT> 56 static const KeywordInfo* 57 FindKeyword(const CharT* s, size_t length) 58 { 59 MOZ_ASSERT(length != 0); 60 61 size_t i; 62 const KeywordInfo* kw; 63 const char* chars; 64 65 #define JSKW_LENGTH() length 66 #define JSKW_AT(column) s[column] 67 #define JSKW_GOT_MATCH(index) i = (index); goto got_match; 68 #define JSKW_TEST_GUESS(index) i = (index); goto test_guess; 69 #define JSKW_NO_MATCH() goto no_match; 70 #include "jsautokw.h" 71 #undef JSKW_NO_MATCH 72 #undef JSKW_TEST_GUESS 73 #undef JSKW_GOT_MATCH 74 #undef JSKW_AT 75 #undef JSKW_LENGTH 76 77 got_match: 78 return &keywords[i]; 79 80 test_guess: 81 kw = &keywords[i]; 82 chars = kw->chars; 83 do { 84 if (*s++ != (unsigned char)(*chars++)) 85 goto no_match; 86 } while (--length != 0); 87 return kw; 88 89 no_match: 90 return nullptr; 91 } 92 93 static const KeywordInfo* 94 FindKeyword(JSLinearString* str) 95 { 96 JS::AutoCheckCannotGC nogc; 97 return str->hasLatin1Chars() 98 ? FindKeyword(str->latin1Chars(nogc), str->length()) 99 : FindKeyword(str->twoByteChars(nogc), str->length()); 100 } 101 102 template <typename CharT> 103 static bool 104 IsIdentifier(const CharT* chars, size_t length) 105 { 106 if (length == 0) 107 return false; 108 109 if (!unicode::IsIdentifierStart(char16_t(*chars))) 110 return false; 111 112 const CharT* end = chars + length; 113 while (++chars != end) { 114 if (!unicode::IsIdentifierPart(char16_t(*chars))) 115 return false; 116 } 117 118 return true; 119 } 120 121 bool 122 frontend::IsIdentifier(JSLinearString* str) 123 { 124 JS::AutoCheckCannotGC nogc; 125 return str->hasLatin1Chars() 126 ? 
::IsIdentifier(str->latin1Chars(nogc), str->length()) 127 : ::IsIdentifier(str->twoByteChars(nogc), str->length()); 128 } 129 130 bool 131 frontend::IsIdentifier(const char16_t* chars, size_t length) 132 { 133 return ::IsIdentifier(chars, length); 134 } 135 136 bool 137 frontend::IsKeyword(JSLinearString* str) 138 { 139 return FindKeyword(str) != nullptr; 140 } 141 142 TokenStream::SourceCoords::SourceCoords(ExclusiveContext* cx, uint32_t ln) 143 : lineStartOffsets_(cx), initialLineNum_(ln), lastLineIndex_(0) 144 { 145 // This is actually necessary! Removing it causes compile errors on 146 // GCC and clang. You could try declaring this: 147 // 148 // const uint32_t TokenStream::SourceCoords::MAX_PTR; 149 // 150 // which fixes the GCC/clang error, but causes bustage on Windows. Sigh. 151 // 152 uint32_t maxPtr = MAX_PTR; 153 154 // The first line begins at buffer offset 0. MAX_PTR is the sentinel. The 155 // appends cannot fail because |lineStartOffsets_| has statically-allocated 156 // elements. 157 MOZ_ASSERT(lineStartOffsets_.capacity() >= 2); 158 MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2)); 159 lineStartOffsets_.infallibleAppend(0); 160 lineStartOffsets_.infallibleAppend(maxPtr); 161 } 162 163 MOZ_ALWAYS_INLINE bool 164 TokenStream::SourceCoords::add(uint32_t lineNum, uint32_t lineStartOffset) 165 { 166 uint32_t lineIndex = lineNumToIndex(lineNum); 167 uint32_t sentinelIndex = lineStartOffsets_.length() - 1; 168 169 MOZ_ASSERT(lineStartOffsets_[0] == 0 && lineStartOffsets_[sentinelIndex] == MAX_PTR); 170 171 if (lineIndex == sentinelIndex) { 172 // We haven't seen this newline before. Update lineStartOffsets_ 173 // only if lineStartOffsets_.append succeeds, to keep sentinel. 174 // Otherwise return false to tell TokenStream about OOM. 175 uint32_t maxPtr = MAX_PTR; 176 if (!lineStartOffsets_.append(maxPtr)) 177 return false; 178 179 lineStartOffsets_[lineIndex] = lineStartOffset; 180 } else { 181 // We have seen this newline before (and ungot it). 
Do nothing (other 182 // than checking it hasn't mysteriously changed). 183 // This path can be executed after hitting OOM, so check lineIndex. 184 MOZ_ASSERT_IF(lineIndex < sentinelIndex, lineStartOffsets_[lineIndex] == lineStartOffset); 185 } 186 return true; 187 } 188 189 MOZ_ALWAYS_INLINE bool 190 TokenStream::SourceCoords::fill(const TokenStream::SourceCoords& other) 191 { 192 MOZ_ASSERT(lineStartOffsets_.back() == MAX_PTR); 193 MOZ_ASSERT(other.lineStartOffsets_.back() == MAX_PTR); 194 195 if (lineStartOffsets_.length() >= other.lineStartOffsets_.length()) 196 return true; 197 198 uint32_t sentinelIndex = lineStartOffsets_.length() - 1; 199 lineStartOffsets_[sentinelIndex] = other.lineStartOffsets_[sentinelIndex]; 200 201 for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length(); i++) { 202 if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) 203 return false; 204 } 205 return true; 206 } 207 208 MOZ_ALWAYS_INLINE uint32_t 209 TokenStream::SourceCoords::lineIndexOf(uint32_t offset) const 210 { 211 uint32_t iMin, iMax, iMid; 212 213 if (lineStartOffsets_[lastLineIndex_] <= offset) { 214 // If we reach here, offset is on a line the same as or higher than 215 // last time. Check first for the +0, +1, +2 cases, because they 216 // typically cover 85--98% of cases. 217 if (offset < lineStartOffsets_[lastLineIndex_ + 1]) 218 return lastLineIndex_; // lineIndex is same as last time 219 220 // If we reach here, there must be at least one more entry (plus the 221 // sentinel). Try it. 222 lastLineIndex_++; 223 if (offset < lineStartOffsets_[lastLineIndex_ + 1]) 224 return lastLineIndex_; // lineIndex is one higher than last time 225 226 // The same logic applies here. 227 lastLineIndex_++; 228 if (offset < lineStartOffsets_[lastLineIndex_ + 1]) { 229 return lastLineIndex_; // lineIndex is two higher than last time 230 } 231 232 // No luck. Oh well, we have a better-than-default starting point for 233 // the binary search. 
234 iMin = lastLineIndex_ + 1; 235 MOZ_ASSERT(iMin < lineStartOffsets_.length() - 1); // -1 due to the sentinel 236 237 } else { 238 iMin = 0; 239 } 240 241 // This is a binary search with deferred detection of equality, which was 242 // marginally faster in this case than a standard binary search. 243 // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we 244 // want one before that. 245 iMax = lineStartOffsets_.length() - 2; 246 while (iMax > iMin) { 247 iMid = iMin + (iMax - iMin) / 2; 248 if (offset >= lineStartOffsets_[iMid + 1]) 249 iMin = iMid + 1; // offset is above lineStartOffsets_[iMid] 250 else 251 iMax = iMid; // offset is below or within lineStartOffsets_[iMid] 252 } 253 MOZ_ASSERT(iMax == iMin); 254 MOZ_ASSERT(lineStartOffsets_[iMin] <= offset && offset < lineStartOffsets_[iMin + 1]); 255 lastLineIndex_ = iMin; 256 return iMin; 257 } 258 259 uint32_t 260 TokenStream::SourceCoords::lineNum(uint32_t offset) const 261 { 262 uint32_t lineIndex = lineIndexOf(offset); 263 return lineIndexToNum(lineIndex); 264 } 265 266 uint32_t 267 TokenStream::SourceCoords::columnIndex(uint32_t offset) const 268 { 269 uint32_t lineIndex = lineIndexOf(offset); 270 uint32_t lineStartOffset = lineStartOffsets_[lineIndex]; 271 MOZ_ASSERT(offset >= lineStartOffset); 272 return offset - lineStartOffset; 273 } 274 275 void 276 TokenStream::SourceCoords::lineNumAndColumnIndex(uint32_t offset, uint32_t* lineNum, 277 uint32_t* columnIndex) const 278 { 279 uint32_t lineIndex = lineIndexOf(offset); 280 *lineNum = lineIndexToNum(lineIndex); 281 uint32_t lineStartOffset = lineStartOffsets_[lineIndex]; 282 MOZ_ASSERT(offset >= lineStartOffset); 283 *columnIndex = offset - lineStartOffset; 284 } 285 286 #ifdef _MSC_VER 287 #pragma warning(push) 288 #pragma warning(disable:4351) 289 #endif 290 291 TokenStream::TokenStream(ExclusiveContext* cx, const ReadOnlyCompileOptions& options, 292 const char16_t* base, size_t length, StrictModeGetter* smg) 293 : srcCoords(cx, 
options.lineno), 294 options_(options), 295 tokens(), 296 cursor(), 297 lookahead(), 298 lineno(options.lineno), 299 flags(), 300 linebase(0), 301 prevLinebase(size_t(-1)), 302 userbuf(cx, base, length, options.column), 303 filename(options.filename()), 304 displayURL_(nullptr), 305 sourceMapURL_(nullptr), 306 tokenbuf(cx), 307 cx(cx), 308 mutedErrors(options.mutedErrors()), 309 strictModeGetter(smg) 310 { 311 // Nb: the following tables could be static, but initializing them here is 312 // much easier. Don't worry, the time to initialize them for each 313 // TokenStream is trivial. See bug 639420. 314 315 // See Parser::assignExpr() for an explanation of isExprEnding[]. 316 memset(isExprEnding, 0, sizeof(isExprEnding)); 317 isExprEnding[TOK_COMMA] = 1; 318 isExprEnding[TOK_SEMI] = 1; 319 isExprEnding[TOK_COLON] = 1; 320 isExprEnding[TOK_RP] = 1; 321 isExprEnding[TOK_RB] = 1; 322 isExprEnding[TOK_RC] = 1; 323 } 324 325 #ifdef _MSC_VER 326 #pragma warning(pop) 327 #endif 328 329 bool 330 TokenStream::checkOptions() 331 { 332 // Constrain starting columns to half of the range of a signed 32-bit value, 333 // to avoid overflow. 334 if (options().column >= mozilla::MaxValue<int32_t>::value / 2 + 1) { 335 reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER); 336 return false; 337 } 338 339 return true; 340 } 341 342 TokenStream::~TokenStream() 343 { 344 } 345 346 // Use the fastest available getc. 
347 #if defined(HAVE_GETC_UNLOCKED) 348 # define fast_getc getc_unlocked 349 #elif defined(HAVE__GETC_NOLOCK) 350 # define fast_getc _getc_nolock 351 #else 352 # define fast_getc getc 353 #endif 354 355 MOZ_ALWAYS_INLINE void 356 TokenStream::updateLineInfoForEOL() 357 { 358 prevLinebase = linebase; 359 linebase = userbuf.offset(); 360 lineno++; 361 if (!srcCoords.add(lineno, linebase)) 362 flags.hitOOM = true; 363 } 364 365 MOZ_ALWAYS_INLINE void 366 TokenStream::updateFlagsForEOL() 367 { 368 flags.isDirtyLine = false; 369 } 370 371 // This gets the next char, normalizing all EOL sequences to '\n' as it goes. 372 int32_t 373 TokenStream::getChar() 374 { 375 int32_t c; 376 if (MOZ_LIKELY(userbuf.hasRawChars())) { 377 c = userbuf.getRawChar(); 378 379 // Normalize the char16_t if it was a newline. 380 if (MOZ_UNLIKELY(c == '\n')) 381 goto eol; 382 if (MOZ_UNLIKELY(c == '\r')) { 383 // If it's a \r\n sequence: treat as a single EOL, skip over the \n. 384 if (MOZ_LIKELY(userbuf.hasRawChars())) 385 userbuf.matchRawChar('\n'); 386 goto eol; 387 } 388 if (MOZ_UNLIKELY(c == LINE_SEPARATOR || c == PARA_SEPARATOR)) 389 goto eol; 390 391 return c; 392 } 393 394 flags.isEOF = true; 395 return EOF; 396 397 eol: 398 updateLineInfoForEOL(); 399 return '\n'; 400 } 401 402 // This gets the next char. It does nothing special with EOL sequences, not 403 // even updating the line counters. It can be used safely if (a) the 404 // resulting char is guaranteed to be ungotten (by ungetCharIgnoreEOL()) if 405 // it's an EOL, and (b) the line-related state (lineno, linebase) is not used 406 // before it's ungotten. 
407 int32_t 408 TokenStream::getCharIgnoreEOL() 409 { 410 if (MOZ_LIKELY(userbuf.hasRawChars())) 411 return userbuf.getRawChar(); 412 413 flags.isEOF = true; 414 return EOF; 415 } 416 417 void 418 TokenStream::ungetChar(int32_t c) 419 { 420 if (c == EOF) 421 return; 422 MOZ_ASSERT(!userbuf.atStart()); 423 userbuf.ungetRawChar(); 424 if (c == '\n') { 425 #ifdef DEBUG 426 int32_t c2 = userbuf.peekRawChar(); 427 MOZ_ASSERT(TokenBuf::isRawEOLChar(c2)); 428 #endif 429 430 // If it's a \r\n sequence, also unget the \r. 431 if (!userbuf.atStart()) 432 userbuf.matchRawCharBackwards('\r'); 433 434 MOZ_ASSERT(prevLinebase != size_t(-1)); // we should never get more than one EOL char 435 linebase = prevLinebase; 436 prevLinebase = size_t(-1); 437 lineno--; 438 } else { 439 MOZ_ASSERT(userbuf.peekRawChar() == c); 440 } 441 } 442 443 void 444 TokenStream::ungetCharIgnoreEOL(int32_t c) 445 { 446 if (c == EOF) 447 return; 448 MOZ_ASSERT(!userbuf.atStart()); 449 userbuf.ungetRawChar(); 450 } 451 452 // Return true iff |n| raw characters can be read from this without reading past 453 // EOF or a newline, and copy those characters into |cp| if so. The characters 454 // are not consumed: use skipChars(n) to do so after checking that the consumed 455 // characters had appropriate values. 
456 bool 457 TokenStream::peekChars(int n, char16_t* cp) 458 { 459 int i, j; 460 int32_t c; 461 462 for (i = 0; i < n; i++) { 463 c = getCharIgnoreEOL(); 464 if (c == EOF) 465 break; 466 if (c == '\n') { 467 ungetCharIgnoreEOL(c); 468 break; 469 } 470 cp[i] = char16_t(c); 471 } 472 for (j = i - 1; j >= 0; j--) 473 ungetCharIgnoreEOL(cp[j]); 474 return i == n; 475 } 476 477 size_t 478 TokenStream::TokenBuf::findEOLMax(size_t start, size_t max) 479 { 480 const char16_t* p = rawCharPtrAt(start); 481 482 size_t n = 0; 483 while (true) { 484 if (p >= limit_) 485 break; 486 if (n >= max) 487 break; 488 n++; 489 if (TokenBuf::isRawEOLChar(*p++)) 490 break; 491 } 492 return start + n; 493 } 494 495 bool 496 TokenStream::advance(size_t position) 497 { 498 const char16_t* end = userbuf.rawCharPtrAt(position); 499 while (userbuf.addressOfNextRawChar() < end) 500 getChar(); 501 502 Token* cur = &tokens[cursor]; 503 cur->pos.begin = userbuf.offset(); 504 MOZ_MAKE_MEM_UNDEFINED(&cur->type, sizeof(cur->type)); 505 lookahead = 0; 506 507 if (flags.hitOOM) 508 return reportError(JSMSG_OUT_OF_MEMORY); 509 510 return true; 511 } 512 513 void 514 TokenStream::tell(Position* pos) 515 { 516 pos->buf = userbuf.addressOfNextRawChar(/* allowPoisoned = */ true); 517 pos->flags = flags; 518 pos->lineno = lineno; 519 pos->linebase = linebase; 520 pos->prevLinebase = prevLinebase; 521 pos->lookahead = lookahead; 522 pos->currentToken = currentToken(); 523 for (unsigned i = 0; i < lookahead; i++) 524 pos->lookaheadTokens[i] = tokens[(cursor + 1 + i) & ntokensMask]; 525 } 526 527 void 528 TokenStream::seek(const Position& pos) 529 { 530 userbuf.setAddressOfNextRawChar(pos.buf, /* allowPoisoned = */ true); 531 flags = pos.flags; 532 lineno = pos.lineno; 533 linebase = pos.linebase; 534 prevLinebase = pos.prevLinebase; 535 lookahead = pos.lookahead; 536 537 tokens[cursor] = pos.currentToken; 538 for (unsigned i = 0; i < lookahead; i++) 539 tokens[(cursor + 1 + i) & ntokensMask] = 
pos.lookaheadTokens[i]; 540 } 541 542 bool 543 TokenStream::seek(const Position& pos, const TokenStream& other) 544 { 545 if (!srcCoords.fill(other.srcCoords)) 546 return false; 547 seek(pos); 548 return true; 549 } 550 551 bool 552 TokenStream::reportStrictModeErrorNumberVA(uint32_t offset, bool strictMode, unsigned errorNumber, 553 va_list args) 554 { 555 // In strict mode code, this is an error, not merely a warning. 556 unsigned flags; 557 if (strictMode) 558 flags = JSREPORT_ERROR; 559 else if (options().extraWarningsOption) 560 flags = JSREPORT_WARNING | JSREPORT_STRICT; 561 else 562 return true; 563 564 return reportCompileErrorNumberVA(offset, flags, errorNumber, args); 565 } 566 567 void 568 CompileError::throwError(JSContext* cx) 569 { 570 if (JSREPORT_IS_WARNING(flags)) { 571 CallWarningReporter(cx, this); 572 return; 573 } 574 575 // If there's a runtime exception type associated with this error 576 // number, set that as the pending exception. For errors occuring at 577 // compile time, this is very likely to be a JSEXN_SYNTAXERR. 578 // 579 // If an exception is thrown but not caught, the JSREPORT_EXCEPTION 580 // flag will be set in report.flags. Proper behavior for an error 581 // reporter is to ignore a report with this flag for all but top-level 582 // compilation errors. The exception will remain pending, and so long 583 // as the non-top-level "load", "eval", or "compile" native function 584 // returns false, the top-level reporter will eventually receive the 585 // uncaught exception report. 586 ErrorToException(cx, this, nullptr, nullptr); 587 } 588 589 bool 590 TokenStream::reportCompileErrorNumberVA(uint32_t offset, unsigned flags, unsigned errorNumber, 591 va_list args) 592 { 593 bool warning = JSREPORT_IS_WARNING(flags); 594 595 if (warning && options().werrorOption) { 596 flags &= ~JSREPORT_WARNING; 597 warning = false; 598 } 599 600 // On the main thread, report the error immediately. 
When compiling off 601 // thread, save the error so that the main thread can report it later. 602 CompileError tempErr; 603 CompileError* tempErrPtr = &tempErr; 604 if (!cx->isJSContext() && !cx->addPendingCompileError(&tempErrPtr)) 605 return false; 606 CompileError& err = *tempErrPtr; 607 608 err.flags = flags; 609 err.errorNumber = errorNumber; 610 err.filename = filename; 611 err.isMuted = mutedErrors; 612 if (offset == NoOffset) { 613 err.lineno = 0; 614 err.column = 0; 615 } else { 616 err.lineno = srcCoords.lineNum(offset); 617 err.column = srcCoords.columnIndex(offset); 618 } 619 620 // If we have no location information, try to get one from the caller. 621 bool callerFilename = false; 622 if (offset != NoOffset && !err.filename && cx->isJSContext()) { 623 NonBuiltinFrameIter iter(cx->asJSContext(), 624 FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK, 625 cx->compartment()->principals()); 626 if (!iter.done() && iter.filename()) { 627 callerFilename = true; 628 err.filename = iter.filename(); 629 err.lineno = iter.computeLine(&err.column); 630 } 631 } 632 633 if (!ExpandErrorArgumentsVA(cx, GetErrorMessage, nullptr, errorNumber, 634 nullptr, ArgumentsAreLatin1, &err, args)) 635 { 636 return false; 637 } 638 639 // Given a token, T, that we want to complain about: if T's (starting) 640 // lineno doesn't match TokenStream's lineno, that means we've scanned past 641 // the line that T starts on, which makes it hard to print some or all of 642 // T's (starting) line for context. 643 // 644 // So we don't even try, leaving report.linebuf and friends zeroed. This 645 // means that any error involving a multi-line token (e.g. an unterminated 646 // multi-line string literal) won't have a context printed. 
647 if (offset != NoOffset && err.lineno == lineno && !callerFilename) { 648 // We show only a portion (a "window") of the line around the erroneous 649 // token -- the first char in the token, plus |windowRadius| chars 650 // before it and |windowRadius - 1| chars after it. This is because 651 // lines can be very long and printing the whole line is (a) not that 652 // helpful, and (b) can waste a lot of memory. See bug 634444. 653 static const size_t windowRadius = 60; 654 655 // The window must start within the current line, no earlier than 656 // windowRadius characters before offset. 657 size_t windowStart = (offset - linebase > windowRadius) ? 658 offset - windowRadius : 659 linebase; 660 661 // The window must start within the portion of the current line 662 // that we actually have in our buffer. 663 if (windowStart < userbuf.startOffset()) 664 windowStart = userbuf.startOffset(); 665 666 // The window must end within the current line, no later than 667 // windowRadius after offset. 668 size_t windowEnd = userbuf.findEOLMax(offset, windowRadius); 669 size_t windowLength = windowEnd - windowStart; 670 MOZ_ASSERT(windowLength <= windowRadius * 2); 671 672 // Create the windowed strings. 673 StringBuffer windowBuf(cx); 674 if (!windowBuf.append(userbuf.rawCharPtrAt(windowStart), windowLength) || 675 !windowBuf.append('\0')) 676 { 677 return false; 678 } 679 680 // The window into the offending source line, without final \n. 681 UniqueTwoByteChars linebuf(windowBuf.stealChars()); 682 if (!linebuf) 683 return false; 684 685 err.initOwnedLinebuf(linebuf.release(), windowLength, offset - windowStart); 686 } 687 688 if (cx->isJSContext()) 689 err.throwError(cx->asJSContext()); 690 691 return warning; 692 } 693 694 bool 695 TokenStream::reportStrictModeError(unsigned errorNumber, ...) 
696 { 697 va_list args; 698 va_start(args, errorNumber); 699 bool result = reportStrictModeErrorNumberVA(currentToken().pos.begin, strictMode(), 700 errorNumber, args); 701 va_end(args); 702 return result; 703 } 704 705 bool 706 TokenStream::reportError(unsigned errorNumber, ...) 707 { 708 va_list args; 709 va_start(args, errorNumber); 710 bool result = reportCompileErrorNumberVA(currentToken().pos.begin, JSREPORT_ERROR, errorNumber, 711 args); 712 va_end(args); 713 return result; 714 } 715 716 bool 717 TokenStream::reportErrorNoOffset(unsigned errorNumber, ...) 718 { 719 va_list args; 720 va_start(args, errorNumber); 721 bool result = reportCompileErrorNumberVA(NoOffset, JSREPORT_ERROR, errorNumber, 722 args); 723 va_end(args); 724 return result; 725 } 726 727 bool 728 TokenStream::reportWarning(unsigned errorNumber, ...) 729 { 730 va_list args; 731 va_start(args, errorNumber); 732 bool result = reportCompileErrorNumberVA(currentToken().pos.begin, JSREPORT_WARNING, 733 errorNumber, args); 734 va_end(args); 735 return result; 736 } 737 738 bool 739 TokenStream::reportStrictWarningErrorNumberVA(uint32_t offset, unsigned errorNumber, va_list args) 740 { 741 if (!options().extraWarningsOption) 742 return true; 743 744 return reportCompileErrorNumberVA(offset, JSREPORT_STRICT|JSREPORT_WARNING, errorNumber, args); 745 } 746 747 void 748 TokenStream::reportAsmJSError(uint32_t offset, unsigned errorNumber, ...) 749 { 750 va_list args; 751 va_start(args, errorNumber); 752 unsigned flags = options().throwOnAsmJSValidationFailureOption 753 ? JSREPORT_ERROR 754 : JSREPORT_WARNING; 755 reportCompileErrorNumberVA(offset, flags, errorNumber, args); 756 va_end(args); 757 } 758 759 // We have encountered a '\': check for a Unicode escape sequence after it. 760 // Return the length of the escape sequence and the character code point (by 761 // value) if we found a Unicode escape sequence. Otherwise, return 0. In both 762 // cases, do not advance along the buffer. 
763 uint32_t 764 TokenStream::peekUnicodeEscape(uint32_t* codePoint) 765 { 766 int32_t c = getCharIgnoreEOL(); 767 if (c != 'u') { 768 ungetCharIgnoreEOL(c); 769 return 0; 770 } 771 772 char16_t cp[3]; 773 uint32_t length; 774 c = getCharIgnoreEOL(); 775 if (JS7_ISHEX(c) && peekChars(3, cp) && 776 JS7_ISHEX(cp[0]) && JS7_ISHEX(cp[1]) && JS7_ISHEX(cp[2])) 777 { 778 *codePoint = (JS7_UNHEX(c) << 12) | 779 (JS7_UNHEX(cp[0]) << 8) | 780 (JS7_UNHEX(cp[1]) << 4) | 781 JS7_UNHEX(cp[2]); 782 length = 5; 783 } else if (c == '{') { 784 length = peekExtendedUnicodeEscape(codePoint); 785 } else { 786 length = 0; 787 } 788 789 ungetCharIgnoreEOL(c); 790 ungetCharIgnoreEOL('u'); 791 return length; 792 } 793 794 uint32_t 795 TokenStream::peekExtendedUnicodeEscape(uint32_t* codePoint) 796 { 797 // The opening brace character was already read. 798 int32_t c = getCharIgnoreEOL(); 799 800 // Skip leading zeros. 801 uint32_t leadingZeros = 0; 802 while (c == '0') { 803 leadingZeros++; 804 c = getCharIgnoreEOL(); 805 } 806 807 char16_t cp[6]; 808 size_t i = 0; 809 uint32_t code = 0; 810 while (JS7_ISHEX(c) && i < 6) { 811 cp[i++] = c; 812 code = code << 4 | JS7_UNHEX(c); 813 c = getCharIgnoreEOL(); 814 } 815 816 uint32_t length; 817 if (c == '}' && (leadingZeros > 0 || i > 0) && code <= unicode::NonBMPMax) { 818 *codePoint = code; 819 length = leadingZeros + i + 3; 820 } else { 821 length = 0; 822 } 823 824 ungetCharIgnoreEOL(c); 825 while (i--) 826 ungetCharIgnoreEOL(cp[i]); 827 while (leadingZeros--) 828 ungetCharIgnoreEOL('0'); 829 830 return length; 831 } 832 833 uint32_t 834 TokenStream::matchUnicodeEscapeIdStart(uint32_t* codePoint) 835 { 836 uint32_t length = peekUnicodeEscape(codePoint); 837 if (length > 0 && unicode::IsIdentifierStart(*codePoint)) { 838 skipChars(length); 839 return length; 840 } 841 return 0; 842 } 843 844 bool 845 TokenStream::matchUnicodeEscapeIdent(uint32_t* codePoint) 846 { 847 uint32_t length = peekUnicodeEscape(codePoint); 848 if (length > 0 && 
unicode::IsIdentifierPart(*codePoint)) { 849 skipChars(length); 850 return true; 851 } 852 return false; 853 } 854 855 // Helper function which returns true if the first length(q) characters in p are 856 // the same as the characters in q. 857 static bool 858 CharsMatch(const char16_t* p, const char* q) { 859 while (*q) { 860 if (*p++ != *q++) 861 return false; 862 } 863 return true; 864 } 865 866 bool 867 TokenStream::getDirectives(bool isMultiline, bool shouldWarnDeprecated) 868 { 869 // Match directive comments used in debugging, such as "//# sourceURL" and 870 // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated. 871 // 872 // To avoid a crashing bug in IE, several JavaScript transpilers wrap single 873 // line comments containing a source mapping URL inside a multiline 874 // comment. To avoid potentially expensive lookahead and backtracking, we 875 // only check for this case if we encounter a '#' character. 876 877 if (!getDisplayURL(isMultiline, shouldWarnDeprecated)) 878 return false; 879 if (!getSourceMappingURL(isMultiline, shouldWarnDeprecated)) 880 return false; 881 882 return true; 883 } 884 885 bool 886 TokenStream::getDirective(bool isMultiline, bool shouldWarnDeprecated, 887 const char* directive, int directiveLength, 888 const char* errorMsgPragma, 889 UniqueTwoByteChars* destination) 890 { 891 MOZ_ASSERT(directiveLength <= 18); 892 char16_t peeked[18]; 893 int32_t c; 894 895 if (peekChars(directiveLength, peeked) && CharsMatch(peeked, directive)) { 896 if (shouldWarnDeprecated && 897 !reportWarning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) 898 return false; 899 900 skipChars(directiveLength); 901 tokenbuf.clear(); 902 903 while ((c = peekChar()) && c != EOF && !unicode::IsSpaceOrBOM2(c)) { 904 getChar(); 905 // Debugging directives can occur in both single- and multi-line 906 // comments. If we're currently inside a multi-line comment, we also 907 // need to recognize multi-line comment terminators. 
908 if (isMultiline && c == '*' && peekChar() == '/') { 909 ungetChar('*'); 910 break; 911 } 912 if (!tokenbuf.append(c)) 913 return false; 914 } 915 916 if (tokenbuf.empty()) { 917 // The directive's URL was missing, but this is not quite an 918 // exception that we should stop and drop everything for. 919 return true; 920 } 921 922 size_t length = tokenbuf.length(); 923 924 *destination = cx->make_pod_array<char16_t>(length + 1); 925 if (!*destination) 926 return false; 927 928 PodCopy(destination->get(), tokenbuf.begin(), length); 929 (*destination)[length] = '\0'; 930 } 931 932 return true; 933 } 934 935 bool 936 TokenStream::getDisplayURL(bool isMultiline, bool shouldWarnDeprecated) 937 { 938 // Match comments of the form "//# sourceURL=<url>" or 939 // "/\* //# sourceURL=<url> *\/" 940 // 941 // Note that while these are labeled "sourceURL" in the source text, 942 // internally we refer to it as a "displayURL" to distinguish what the 943 // developer would like to refer to the source as from the source's actual 944 // URL. 945 946 return getDirective(isMultiline, shouldWarnDeprecated, " sourceURL=", 11, 947 "sourceURL", &displayURL_); 948 } 949 950 bool 951 TokenStream::getSourceMappingURL(bool isMultiline, bool shouldWarnDeprecated) 952 { 953 // Match comments of the form "//# sourceMappingURL=<url>" or 954 // "/\* //# sourceMappingURL=<url> *\/" 955 956 return getDirective(isMultiline, shouldWarnDeprecated, " sourceMappingURL=", 18, 957 "sourceMappingURL", &sourceMapURL_); 958 } 959 960 MOZ_ALWAYS_INLINE Token* 961 TokenStream::newToken(ptrdiff_t adjust) 962 { 963 cursor = (cursor + 1) & ntokensMask; 964 Token* tp = &tokens[cursor]; 965 tp->pos.begin = userbuf.offset() + adjust; 966 967 // NOTE: tp->pos.end is not set until the very end of getTokenInternal(). 
968 MOZ_MAKE_MEM_UNDEFINED(&tp->pos.end, sizeof(tp->pos.end)); 969 970 return tp; 971 } 972 973 MOZ_ALWAYS_INLINE JSAtom* 974 TokenStream::atomize(ExclusiveContext* cx, CharBuffer& cb) 975 { 976 return AtomizeChars(cx, cb.begin(), cb.length()); 977 } 978 979 #ifdef DEBUG 980 static bool 981 IsTokenSane(Token* tp) 982 { 983 // Nb: TOK_EOL should never be used in an actual Token; it should only be 984 // returned as a TokenKind from peekTokenSameLine(). 985 if (tp->type < 0 || tp->type >= TOK_LIMIT || tp->type == TOK_EOL) 986 return false; 987 988 if (tp->pos.end < tp->pos.begin) 989 return false; 990 991 return true; 992 } 993 #endif 994 995 bool 996 TokenStream::putIdentInTokenbuf(const char16_t* identStart) 997 { 998 int32_t c; 999 uint32_t qc; 1000 const char16_t* tmp = userbuf.addressOfNextRawChar(); 1001 userbuf.setAddressOfNextRawChar(identStart); 1002 1003 tokenbuf.clear(); 1004 for (;;) { 1005 c = getCharIgnoreEOL(); 1006 if (!unicode::IsIdentifierPart(char16_t(c))) { 1007 if (c != '\\' || !matchUnicodeEscapeIdent(&qc)) 1008 break; 1009 c = qc; 1010 } 1011 if (!tokenbuf.append(c)) { 1012 userbuf.setAddressOfNextRawChar(tmp); 1013 return false; 1014 } 1015 } 1016 userbuf.setAddressOfNextRawChar(tmp); 1017 return true; 1018 } 1019 1020 bool 1021 TokenStream::checkForKeyword(const KeywordInfo* kw, TokenKind* ttp) 1022 { 1023 if (!awaitIsKeyword && kw->tokentype == TOK_AWAIT) { 1024 if (ttp) 1025 *ttp = TOK_NAME; 1026 return true; 1027 } 1028 1029 if (kw->tokentype == TOK_RESERVED) 1030 return reportError(JSMSG_RESERVED_ID, kw->chars); 1031 1032 if (kw->tokentype == TOK_STRICT_RESERVED) 1033 return reportStrictModeError(JSMSG_RESERVED_ID, kw->chars); 1034 1035 // Working keyword. 
1036 *ttp = kw->tokentype; 1037 return true; 1038 } 1039 1040 bool 1041 TokenStream::checkForKeyword(JSAtom* atom, TokenKind* ttp) 1042 { 1043 const KeywordInfo* kw = FindKeyword(atom); 1044 if (!kw) 1045 return true; 1046 1047 return checkForKeyword(kw, ttp); 1048 } 1049 1050 enum FirstCharKind { 1051 // A char16_t has the 'OneChar' kind if it, by itself, constitutes a valid 1052 // token that cannot also be a prefix of a longer token. E.g. ';' has the 1053 // OneChar kind, but '+' does not, because '++' and '+=' are valid longer tokens 1054 // that begin with '+'. 1055 // 1056 // The few token kinds satisfying these properties cover roughly 35--45% 1057 // of the tokens seen in practice. 1058 // 1059 // We represent the 'OneChar' kind with any positive value less than 1060 // TOK_LIMIT. This representation lets us associate each one-char token 1061 // char16_t with a TokenKind and thus avoid a subsequent char16_t-to-TokenKind 1062 // conversion. 1063 OneChar_Min = 0, 1064 OneChar_Max = TOK_LIMIT - 1, 1065 1066 Space = TOK_LIMIT, 1067 Ident, 1068 Dec, 1069 String, 1070 EOL, 1071 BasePrefix, 1072 Other, 1073 1074 LastCharKind = Other 1075 }; 1076 1077 // OneChar: 40, 41, 44, 58, 59, 63, 91, 93, 123, 125, 126: 1078 // '(', ')', ',', ':', ';', '?', '[', ']', '{', '}', '~' 1079 // Ident: 36, 65..90, 95, 97..122: '$', 'A'..'Z', '_', 'a'..'z' 1080 // Dot: 46: '.' 
// Equals:  61: '='
// String:  34, 39: '"', '\''
// Dec:     49..57: '1'..'9'
// Plus:    43: '+'
// BasePrefix:  48: '0'
// Space:   9, 11, 12, 32: '\t', '\v', '\f', ' '
// EOL:     10, 13: '\n', '\r'
//
// NOTE: FirstCharKind has no Dot, Equals or Plus members; in the table below
// '.' (46), '=' (61) and '+' (43) are classified as Other. The backtick (96)
// is classified as String through the Templat alias.
//
// The table holds one entry per char code 0..127; the short macro names exist
// only to keep the columns aligned and are #undef'd right after the table.
#define T_COMMA  TOK_COMMA
#define T_COLON  TOK_COLON
#define T_BITNOT TOK_BITNOT
#define Templat  String
#define _______  Other
static const uint8_t firstCharKinds[] = {
/*         0        1        2        3        4        5        6        7        8        9    */
/*   0+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______,   Space,
/*  10+ */     EOL,   Space,   Space,     EOL, _______, _______, _______, _______, _______, _______,
/*  20+ */ _______, _______, _______, _______, _______, _______, _______, _______, _______, _______,
/*  30+ */ _______, _______,   Space, _______,  String, _______,   Ident, _______, _______,  String,
/*  40+ */  TOK_LP,  TOK_RP, _______, _______, T_COMMA,_______,  _______, _______,BasePrefix,  Dec,
/*  50+ */     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec,     Dec, T_COLON,TOK_SEMI,
/*  60+ */ _______, _______, _______,TOK_HOOK, _______,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  70+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  80+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/*  90+ */   Ident,  TOK_LB, _______,  TOK_RB, _______,   Ident, Templat,   Ident,   Ident,   Ident,
/* 100+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 110+ */   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,   Ident,
/* 120+ */   Ident,   Ident,   Ident,  TOK_LC, _______,  TOK_RC,T_BITNOT, _______
};
#undef T_COMMA
#undef T_COLON
#undef T_BITNOT
#undef Templat
#undef _______

// Every FirstCharKind value must be representable in one table element.
static_assert(LastCharKind < (1 << (sizeof(firstCharKinds[0]) * 8)),
              "Elements of firstCharKinds[] are too small");

// Scans the next token from userbuf.
//
// |modifier| changes how the input is interpreted:
//   - TemplateTail:  resume scanning a template string (handled before
//                    anything else);
//   - KeywordIsName: identifiers are never turned into keyword tokens;
//   - Operand:       a '/' that does not start a comment starts a regular
//                    expression literal rather than a division operator.
//
// On the normal success path (label |out|) the token's kind is stored in
// |*ttp|, its end position is recorded, and true is returned. On failure
// (label |error|) flags.hadError is set, |*ttp| and the token's type are
// marked undefined, and false is returned. If flags.hitOOM is set when either
// label is reached, the result of reportError(JSMSG_OUT_OF_MEMORY) is
// returned instead.
//
// Control flow is goto-based: |retry| restarts scanning after skipped input
// (whitespace, line terminators, comments); |identifier|, |decimal|,
// |decimal_dot|, |skipline| and |badchar| are shared entry points into the
// corresponding scanners from more than one place.
bool
TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier)
{
    int c;
    uint32_t qc;
    Token* tp;
    FirstCharKind c1kind;
    const char16_t* numStart;
    bool hasExp;
    DecimalPoint decimalPoint;
    const char16_t* identStart;
    bool hadUnicodeEscape;

    // Check if in the middle of a template string. Have to get this out of
    // the way first.
    if (MOZ_UNLIKELY(modifier == TemplateTail)) {
        if (!getStringOrTemplateToken('`', &tp))
            goto error;
        goto out;
    }

  retry:
    if (MOZ_UNLIKELY(!userbuf.hasRawChars())) {
        tp = newToken(0);
        tp->type = TOK_EOF;
        flags.isEOF = true;
        goto out;
    }

    c = userbuf.getRawChar();
    MOZ_ASSERT(c != EOF);

    // Chars not in the range 0..127 are rare.  Getting them out of the way
    // early allows subsequent checking to be faster.
    if (MOZ_UNLIKELY(c >= 128)) {
        if (unicode::IsSpaceOrBOM2(c)) {
            // Only the two Unicode line terminators affect line bookkeeping;
            // any other space char is simply skipped.
            if (c == LINE_SEPARATOR || c == PARA_SEPARATOR) {
                updateLineInfoForEOL();
                updateFlagsForEOL();
            }

            goto retry;
        }

        tp = newToken(-1);

        static_assert('$' < 128,
                      "IdentifierStart contains '$', but as !IsUnicodeIDStart('$'), "
                      "ensure that '$' is never handled here");
        static_assert('_' < 128,
                      "IdentifierStart contains '_', but as !IsUnicodeIDStart('_'), "
                      "ensure that '_' is never handled here");
        if (unicode::IsUnicodeIDStart(c)) {
            identStart = userbuf.addressOfNextRawChar() - 1;
            hadUnicodeEscape = false;
            goto identifier;
        }

        goto badchar;
    }

    // Get the token kind, based on the first char.  The ordering of c1kind
    // comparison is based on the frequency of tokens in real code -- Parsemark
    // (which represents typical JS code on the web) and the Unreal demo (which
    // represents asm.js code).
    //
    //                  Parsemark   Unreal
    //  OneChar         32.9%       39.7%
    //  Space           25.0%        0.6%
    //  Ident           19.2%       36.4%
    //  Dec              7.2%        5.1%
    //  String           7.9%        0.0%
    //  EOL              1.7%        0.0%
    //  BasePrefix       0.4%        4.9%
    //  Other            5.7%       13.3%
    //
    // The ordering is based mostly on Parsemark frequencies, with Unreal
    // frequencies used to break close categories (e.g. |Dec| and |String|).
    // |Other| is biggish, but no other token kind is common enough for it to
    // be worth adding extra values to FirstCharKind.
    //
    c1kind = FirstCharKind(firstCharKinds[c]);

    // Look for an unambiguous single-char token.
    //
    if (c1kind <= OneChar_Max) {
        tp = newToken(-1);
        // For OneChar entries the table value is itself the TokenKind.
        tp->type = TokenKind(c1kind);
        goto out;
    }

    // Skip over non-EOL whitespace chars.
    //
    if (c1kind == Space)
        goto retry;

    // Look for an identifier.
    //
    if (c1kind == Ident) {
        tp = newToken(-1);
        identStart = userbuf.addressOfNextRawChar() - 1;
        hadUnicodeEscape = false;

        // Also entered from the non-ASCII path above and from the '\\' case
        // of the switch below; both set |tp|, |identStart| and
        // |hadUnicodeEscape| before jumping here.
      identifier:
        for (;;) {
            c = getCharIgnoreEOL();
            if (c == EOF)
                break;
            if (!unicode::IsIdentifierPart(char16_t(c))) {
                if (c != '\\' || !matchUnicodeEscapeIdent(&qc))
                    break;
                hadUnicodeEscape = true;
            }
        }
        ungetCharIgnoreEOL(c);

        // Identifiers containing no Unicode escapes can be processed directly
        // from userbuf.  The rest must use the escapes converted via tokenbuf
        // before atomizing.
        const char16_t* chars;
        size_t length;
        if (hadUnicodeEscape) {
            if (!putIdentInTokenbuf(identStart))
                goto error;

            chars = tokenbuf.begin();
            length = tokenbuf.length();
        } else {
            chars = identStart;
            length = userbuf.addressOfNextRawChar() - identStart;
        }

        // Represent keywords as keyword tokens unless told otherwise.
        if (modifier != KeywordIsName) {
            if (const KeywordInfo* kw = FindKeyword(chars, length)) {
                // That said, keywords can't contain escapes.  (Contexts where
                // keywords are treated as names, that also sometimes treat
                // keywords as keywords, must manually check this requirement.)
                // There are two exceptions
                // 1) StrictReservedWords: These keywords need to be treated as
                //    names in non-strict mode.
                // 2) yield is also treated as a name if it contains an escape
                //    sequence. The parser must handle this case separately.
                if (hadUnicodeEscape && !(
                        (kw->tokentype == TOK_STRICT_RESERVED && !strictMode()) ||
                         kw->tokentype == TOK_YIELD))
                {
                    reportError(JSMSG_ESCAPED_KEYWORD);
                    goto error;
                }

                tp->type = TOK_NAME;
                if (!checkForKeyword(kw, &tp->type))
                    goto error;
                // An escaped keyword that got this far falls through and is
                // atomized as a name below.
                if (tp->type != TOK_NAME && !hadUnicodeEscape)
                    goto out;
            }
        }

        JSAtom* atom = AtomizeChars(cx, chars, length);
        if (!atom)
            goto error;
        tp->type = TOK_NAME;
        tp->setName(atom->asPropertyName());
        goto out;
    }

    // Look for a decimal number.
    //
    if (c1kind == Dec) {
        tp = newToken(-1);
        numStart = userbuf.addressOfNextRawChar() - 1;

        // Also entered from the BasePrefix scanner below (a '0' that turns
        // out to start a decimal number), with |tp| and |numStart| already set.
      decimal:
        decimalPoint = NoDecimal;
        hasExp = false;
        while (JS7_ISDEC(c))
            c = getCharIgnoreEOL();

        if (c == '.') {
            decimalPoint = HasDecimal;
            // Also entered from the '.' case of the switch below for numbers
            // written with a leading dot, e.g. |.5|.
          decimal_dot:
            do {
                c = getCharIgnoreEOL();
            } while (JS7_ISDEC(c));
        }
        if (c == 'e' || c == 'E') {
            hasExp = true;
            c = getCharIgnoreEOL();
            if (c == '+' || c == '-')
                c = getCharIgnoreEOL();
            if (!JS7_ISDEC(c)) {
                ungetCharIgnoreEOL(c);
                reportError(JSMSG_MISSING_EXPONENT);
                goto error;
            }
            do {
                c = getCharIgnoreEOL();
            } while (JS7_ISDEC(c));
        }
        ungetCharIgnoreEOL(c);

        // A number may not be immediately followed by an identifier start.
        if (c != EOF && unicode::IsIdentifierStart(char16_t(c))) {
            reportError(JSMSG_IDSTART_AFTER_NUMBER);
            goto error;
        }

        // Unlike identifiers and strings, numbers cannot contain escaped
        // chars, so we don't need to use tokenbuf.  Instead we can just
        // convert the char16_t characters in userbuf to the numeric value.
        double dval;
        if (!((decimalPoint == HasDecimal) || hasExp)) {
            if (!GetDecimalInteger(cx, numStart, userbuf.addressOfNextRawChar(), &dval))
                goto error;
        } else {
            const char16_t* dummy;
            if (!js_strtod(cx, numStart, userbuf.addressOfNextRawChar(), &dummy, &dval))
                goto error;
        }
        tp->type = TOK_NUMBER;
        tp->setNumber(dval, decimalPoint);
        goto out;
    }

    // Look for a string or a template string.
    //
    if (c1kind == String) {
        if (!getStringOrTemplateToken(c, &tp))
            goto error;
        goto out;
    }

    // Skip over EOL chars, updating line state along the way.
    //
    if (c1kind == EOL) {
        // If it's a \r\n sequence: treat as a single EOL, skip over the \n.
        if (c == '\r' && userbuf.hasRawChars())
            userbuf.matchRawChar('\n');
        updateLineInfoForEOL();
        updateFlagsForEOL();
        goto retry;
    }

    // Look for a hexadecimal, octal, or binary number.
    //
    if (c1kind == BasePrefix) {
        tp = newToken(-1);
        int radix;
        c = getCharIgnoreEOL();
        if (c == 'x' || c == 'X') {
            radix = 16;
            c = getCharIgnoreEOL();
            if (!JS7_ISHEX(c)) {
                ungetCharIgnoreEOL(c);
                reportError(JSMSG_MISSING_HEXDIGITS);
                goto error;
            }
            numStart = userbuf.addressOfNextRawChar() - 1;  // one past the '0x'
            while (JS7_ISHEX(c))
                c = getCharIgnoreEOL();
        } else if (c == 'b' || c == 'B') {
            radix = 2;
            c = getCharIgnoreEOL();
            if (c != '0' && c != '1') {
                ungetCharIgnoreEOL(c);
                reportError(JSMSG_MISSING_BINARY_DIGITS);
                goto error;
            }
            numStart = userbuf.addressOfNextRawChar() - 1;  // one past the '0b'
            while (c == '0' || c == '1')
                c = getCharIgnoreEOL();
        } else if (c == 'o' || c == 'O') {
            radix = 8;
            c = getCharIgnoreEOL();
            if (c < '0' || c > '7') {
                ungetCharIgnoreEOL(c);
                reportError(JSMSG_MISSING_OCTAL_DIGITS);
                goto error;
            }
            numStart = userbuf.addressOfNextRawChar() - 1;  // one past the '0o'
            while ('0' <= c && c <= '7')
                c = getCharIgnoreEOL();
        } else if (JS7_ISDEC(c)) {
            // Legacy octal literal: a '0' followed directly by a digit.
            radix = 8;
            numStart = userbuf.addressOfNextRawChar() - 1;  // one past the '0'
            while (JS7_ISDEC(c)) {
                // Octal integer literals are not permitted in strict mode code.
                if (!reportStrictModeError(JSMSG_DEPRECATED_OCTAL))
                    goto error;

                // Outside strict mode, we permit 08 and 09 as decimal numbers,
                // which makes our behaviour a superset of the ECMA numeric
                // grammar. We might not always be so permissive, so we warn
                // about it.
                if (c >= '8') {
                    if (!reportWarning(JSMSG_BAD_OCTAL, c == '8' ? "08" : "09")) {
                        goto error;
                    }
                    goto decimal;   // use the decimal scanner for the rest of the number
                }
                c = getCharIgnoreEOL();
            }
        } else {
            // '0' not followed by a radix prefix ('x', 'b' or 'o', in either
            // case) or a digit; scan as a decimal number.
            numStart = userbuf.addressOfNextRawChar() - 1;
            goto decimal;
        }
        ungetCharIgnoreEOL(c);

        if (c != EOF && unicode::IsIdentifierStart(char16_t(c))) {
            reportError(JSMSG_IDSTART_AFTER_NUMBER);
            goto error;
        }

        double dval;
        const char16_t* dummy;
        if (!GetPrefixInteger(cx, numStart, userbuf.addressOfNextRawChar(), radix, &dummy, &dval))
            goto error;
        tp->type = TOK_NUMBER;
        tp->setNumber(dval, NoDecimal);
        goto out;
    }

    // This handles everything else.
    //
    MOZ_ASSERT(c1kind == Other);
    tp = newToken(-1);
    switch (c) {
      case '.':
        c = getCharIgnoreEOL();
        if (JS7_ISDEC(c)) {
            // Two chars have been consumed (the '.' and the first digit), so
            // this points back at the '.'.
            numStart = userbuf.addressOfNextRawChar() - 2;
            decimalPoint = HasDecimal;
            hasExp = false;
            goto decimal_dot;
        }
        if (c == '.') {
            if (matchChar('.')) {
                tp->type = TOK_TRIPLEDOT;
                goto out;
            }
        }
        ungetCharIgnoreEOL(c);
        tp->type = TOK_DOT;
        goto out;

      case '=':
        if (matchChar('='))
            tp->type = matchChar('=') ? TOK_STRICTEQ : TOK_EQ;
        else if (matchChar('>'))
            tp->type = TOK_ARROW;
        else
            tp->type = TOK_ASSIGN;
        goto out;

      case '+':
        if (matchChar('+'))
            tp->type = TOK_INC;
        else
            tp->type = matchChar('=') ? TOK_ADDASSIGN : TOK_ADD;
        goto out;

      case '\\': {
        // A backslash is only legal here as the start of a Unicode escape
        // that begins an identifier.
        uint32_t escapeLength = matchUnicodeEscapeIdStart(&qc);
        if (escapeLength > 0) {
            // NOTE(review): presumably |escapeLength| counts the chars
            // consumed after the backslash, making this the address of the
            // backslash itself -- confirm against matchUnicodeEscapeIdStart().
            identStart = userbuf.addressOfNextRawChar() - escapeLength - 1;
            hadUnicodeEscape = true;
            goto identifier;
        }
        goto badchar;
      }

      case '|':
        if (matchChar('|'))
            tp->type = TOK_OR;
        else
            tp->type = matchChar('=') ? TOK_BITORASSIGN : TOK_BITOR;
        goto out;

      case '^':
        tp->type = matchChar('=') ? TOK_BITXORASSIGN : TOK_BITXOR;
        goto out;

      case '&':
        if (matchChar('&'))
            tp->type = TOK_AND;
        else
            tp->type = matchChar('=') ? TOK_BITANDASSIGN : TOK_BITAND;
        goto out;

      case '!':
        if (matchChar('='))
            tp->type = matchChar('=') ? TOK_STRICTNE : TOK_NE;
        else
            tp->type = TOK_NOT;
        goto out;

      case '<':
        // NB: treat HTML begin-comment as comment-till-end-of-line.
        // Each partial match is pushed back in reverse order if the full
        // "<!--" sequence is not present.
        if (matchChar('!')) {
            if (matchChar('-')) {
                if (matchChar('-'))
                    goto skipline;
                ungetChar('-');
            }
            ungetChar('!');
        }
        if (matchChar('<')) {
            tp->type = matchChar('=') ? TOK_LSHASSIGN : TOK_LSH;
        } else {
            tp->type = matchChar('=') ? TOK_LE : TOK_LT;
        }
        goto out;

      case '>':
        if (matchChar('>')) {
            if (matchChar('>'))
                tp->type = matchChar('=') ? TOK_URSHASSIGN : TOK_URSH;
            else
                tp->type = matchChar('=') ? TOK_RSHASSIGN : TOK_RSH;
        } else {
            tp->type = matchChar('=') ? TOK_GE : TOK_GT;
        }
        goto out;

      case '*':
        if (matchChar('*'))
            tp->type = matchChar('=') ? TOK_POWASSIGN : TOK_POW;
        else
            tp->type = matchChar('=') ? TOK_MULASSIGN : TOK_MUL;
        goto out;

      case '/':
        // Look for a single-line comment.
        if (matchChar('/')) {
            // "//@" and "//#" may introduce a directive comment; only the
            // '@' form asks getDirectives() to warn.
            c = peekChar();
            if (c == '@' || c == '#') {
                bool shouldWarn = getChar() == '@';
                if (!getDirectives(false, shouldWarn))
                    goto error;
            }

            // Also the target for the HTML-style "<!--" and "-->" comments.
        skipline:
            while ((c = getChar()) != EOF && c != '\n')
                continue;
            ungetChar(c);
            // NOTE(review): presumably this gives back the token slot claimed
            // by newToken() above, since a comment yields no token -- confirm
            // against newToken().
            cursor = (cursor - 1) & ntokensMask;
            goto retry;
        }

        // Look for a multi-line comment.
        if (matchChar('*')) {
            unsigned linenoBefore = lineno;
            while ((c = getChar()) != EOF &&
                   !(c == '*' && matchChar('/'))) {
                if (c == '@' || c == '#') {
                    bool shouldWarn = c == '@';
                    if (!getDirectives(true, shouldWarn))
                        goto error;
                }
            }
            if (c == EOF) {
                reportError(JSMSG_UNTERMINATED_COMMENT);
                goto error;
            }
            // A comment that spanned a line break counts as an EOL for the
            // stream's flags.
            if (linenoBefore != lineno)
                updateFlagsForEOL();
            cursor = (cursor - 1) & ntokensMask;
            goto retry;
        }

        // Look for a regexp.
        if (modifier == Operand) {
            tokenbuf.clear();

            // Collect the regexp body into tokenbuf. A backslash and the
            // char following it are both appended verbatim.
            bool inCharClass = false;
            for (;;) {
                c = getChar();
                if (c == '\\') {
                    if (!tokenbuf.append(c))
                        goto error;
                    c = getChar();
                } else if (c == '[') {
                    inCharClass = true;
                } else if (c == ']') {
                    inCharClass = false;
                } else if (c == '/' && !inCharClass) {
                    // For compat with IE, allow unescaped / in char classes.
                    break;
                }
                if (c == '\n' || c == EOF) {
                    ungetChar(c);
                    reportError(JSMSG_UNTERMINATED_REGEXP);
                    goto error;
                }
                if (!tokenbuf.append(c))
                    goto error;
            }

            // Consume the flags. Each flag is accepted at most once; a
            // repeated flag ends this loop and is then rejected by the
            // letter check below.
            RegExpFlag reflags = NoFlags;
            unsigned length = tokenbuf.length() + 1;
            while (true) {
                c = peekChar();
                if (c == 'g' && !(reflags & GlobalFlag))
                    reflags = RegExpFlag(reflags | GlobalFlag);
                else if (c == 'i' && !(reflags & IgnoreCaseFlag))
                    reflags = RegExpFlag(reflags | IgnoreCaseFlag);
                else if (c == 'm' && !(reflags & MultilineFlag))
                    reflags = RegExpFlag(reflags | MultilineFlag);
                else if (c == 'y' && !(reflags & StickyFlag))
                    reflags = RegExpFlag(reflags | StickyFlag);
                else if (c == 'u' && !(reflags & UnicodeFlag))
                    reflags = RegExpFlag(reflags | UnicodeFlag);
                else
                    break;
                getChar();
                length++;
            }

            c = peekChar();
            if (JS7_ISLET(c)) {
                char buf[2] = { '\0', '\0' };
                // Move the token's start forward so the error is reported at
                // the offending flag char.
                tp->pos.begin += length + 1;
                buf[0] = char(c);
                reportError(JSMSG_BAD_REGEXP_FLAG, buf);
                (void) getChar();
                goto error;
            }
            tp->type = TOK_REGEXP;
            tp->setRegExpFlags(reflags);
            goto out;
        }

        tp->type = matchChar('=') ? TOK_DIVASSIGN : TOK_DIV;
        goto out;

      case '%':
        tp->type = matchChar('=') ? TOK_MODASSIGN : TOK_MOD;
        goto out;

      case '-':
        if (matchChar('-')) {
            // "-->" is an HTML end-comment only when no token has been
            // produced on the current line yet (flags.isDirtyLine is set at
            // |out| and |error| below).
            if (peekChar() == '>' && !flags.isDirtyLine)
                goto skipline;
            tp->type = TOK_DEC;
        } else {
            tp->type = matchChar('=') ? TOK_SUBASSIGN : TOK_SUB;
        }
        goto out;

        // Reached via goto from the non-ASCII path and the '\\' case, and by
        // any ASCII char with no case of its own.
      badchar:
      default:
        reportError(JSMSG_ILLEGAL_CHARACTER);
        goto error;
    }

    MOZ_CRASH("should have jumped to |out| or |error|");

    // Success exit: finish the token and hand its kind to the caller.
  out:
    if (flags.hitOOM)
        return reportError(JSMSG_OUT_OF_MEMORY);

    flags.isDirtyLine = true;
    tp->pos.end = userbuf.offset();
#ifdef DEBUG
    // Save the modifier used to get this token, so that if an ungetToken()
    // occurs and then the token is re-gotten (or peeked, etc.), we can assert
    // that both gets have used the same modifiers.
    tp->modifier = modifier;
    tp->modifierException = NoException;
#endif
    MOZ_ASSERT(IsTokenSane(tp));
    *ttp = tp->type;
    return true;

    // Failure exit: record the error and leave the token kind undefined.
  error:
    if (flags.hitOOM)
        return reportError(JSMSG_OUT_OF_MEMORY);

    flags.isDirtyLine = true;
    tp->pos.end = userbuf.offset();
    MOZ_MAKE_MEM_UNDEFINED(&tp->type, sizeof(tp->type));
    flags.hadError = true;
#ifdef DEBUG
    // Poisoning userbuf on error establishes an invariant: once an erroneous
    // token has been seen, userbuf will not be consulted again.  This is true
    // because the parser will deal with the illegal token by aborting parsing
    // immediately.
    userbuf.poison();
#endif
    MOZ_MAKE_MEM_UNDEFINED(ttp, sizeof(*ttp));
    return false;
}

// Parses the braced part of a |\u{...}| escape. The next char must be '{'.
//
// Reads hex digits up to the closing '}' and stores the accumulated value in
// |*cp|. Returns false, without reporting an error itself, if EOF or a
// non-hex char is hit, if the braces are empty, or if the value exceeds
// unicode::NonBMPMax. |*cp| is written only on success.
bool
TokenStream::getBracedUnicode(uint32_t* cp)
{
    consumeKnownChar('{');

    bool first = true;
    int32_t c;
    uint32_t code = 0;
    while (true) {
        c = getCharIgnoreEOL();
        if (c == EOF)
            return false;
        if (c == '}') {
            // "\u{}" with no digits is malformed.
            if (first)
                return false;
            break;
        }

        if (!JS7_ISHEX(c))
            return false;

        // Checking the bound after every digit keeps |code| from growing
        // past NonBMPMax before the next shift.
        code = (code << 4) | JS7_UNHEX(c);
        if (code > unicode::NonBMPMax)
            return false;
        first = false;
    }

    *cp = code;
    return true;
}

// Scans the body of a string literal or template string up to |untilChar|,
// decoding escape sequences into tokenbuf, and atomizes the result.
//
// |untilChar| is the closing delimiter; a backtick selects template-string
// rules (raw line terminators are allowed, "${" ends the token, octal
// escapes are always an error). A fresh token is stored in |*tp| before
// scanning starts. On success its type is TOK_STRING, TOK_TEMPLATE_HEAD
// (template stopped at "${") or TOK_NO_SUBS_TEMPLATE, and its atom is set.
// Returns false on an unterminated literal, a malformed escape, or an
// allocation/atomization failure.
bool
TokenStream::getStringOrTemplateToken(int untilChar, Token** tp)
{
    int c;
    int nc = -1;

    bool parsingTemplate = (untilChar == '`');

    *tp = newToken(-1);
    tokenbuf.clear();

    // We need to detect any of these chars:  " or ', \n (or its
    // equivalents), \\, EOF.  Because we detect EOL sequences here and
    // put them back immediately, we can use getCharIgnoreEOL().
    while ((c = getCharIgnoreEOL()) != untilChar) {
        if (c == EOF) {
            ungetCharIgnoreEOL(c);
            reportError(JSMSG_UNTERMINATED_STRING);
            return false;
        }

        if (c == '\\') {
            switch (c = getChar()) {
              case 'b': c = '\b'; break;
              case 'f': c = '\f'; break;
              case 'n': c = '\n'; break;
              case 'r': c = '\r'; break;
              case 't': c = '\t'; break;
              case 'v': c = '\v'; break;

              case '\n':
                // ES5 7.8.4: an escaped line terminator represents
                // no character.
                continue;

              // Unicode character specification.
              case 'u': {
                if (peekChar() == '{') {
                    uint32_t code;
                    if (!getBracedUnicode(&code)) {
                        reportError(JSMSG_MALFORMED_ESCAPE, "Unicode");
                        return false;
                    }

                    MOZ_ASSERT(code <= unicode::NonBMPMax);
                    if (code < unicode::NonBMPMin) {
                        c = code;
                    } else {
                        // Non-BMP code point: append the lead surrogate now;
                        // the trail surrogate is appended at the bottom of
                        // the loop.
                        if (!tokenbuf.append(unicode::LeadSurrogate(code)))
                            return false;
                        c = unicode::TrailSurrogate(code);
                    }
                    break;
                }

                // Classic form: exactly four hex digits.
                char16_t cp[4];
                if (peekChars(4, cp) &&
                    JS7_ISHEX(cp[0]) && JS7_ISHEX(cp[1]) && JS7_ISHEX(cp[2]) && JS7_ISHEX(cp[3]))
                {
                    c = JS7_UNHEX(cp[0]);
                    c = (c << 4) + JS7_UNHEX(cp[1]);
                    c = (c << 4) + JS7_UNHEX(cp[2]);
                    c = (c << 4) + JS7_UNHEX(cp[3]);
                    skipChars(4);
                } else {
                    reportError(JSMSG_MALFORMED_ESCAPE, "Unicode");
                    return false;
                }
                break;
              }

              // Hexadecimal character specification.
              case 'x': {
                char16_t cp[2];
                if (peekChars(2, cp) && JS7_ISHEX(cp[0]) && JS7_ISHEX(cp[1])) {
                    c = (JS7_UNHEX(cp[0]) << 4) + JS7_UNHEX(cp[1]);
                    skipChars(2);
                } else {
                    reportError(JSMSG_MALFORMED_ESCAPE, "hexadecimal");
                    return false;
                }
                break;
              }

              default:
                // Octal character specification.
                // Any other escaped char falls out of the switch unchanged
                // and is appended as itself.
                if (JS7_ISOCT(c)) {
                    int32_t val = JS7_UNOCT(c);

                    c = peekChar();

                    // Strict mode code allows only \0, then a non-digit.
                    if (val != 0 || JS7_ISDEC(c)) {
                        if (parsingTemplate) {
                            reportError(JSMSG_DEPRECATED_OCTAL);
                            return false;
                        }
                        if (!reportStrictModeError(JSMSG_DEPRECATED_OCTAL))
                            return false;
                        flags.sawOctalEscape = true;
                    }

                    // Take up to two more octal digits; the third digit is
                    // consumed only if the value stays within 0xFF.
                    if (JS7_ISOCT(c)) {
                        val = 8 * val + JS7_UNOCT(c);
                        getChar();
                        c = peekChar();
                        if (JS7_ISOCT(c)) {
                            int32_t save = val;
                            val = 8 * val + JS7_UNOCT(c);
                            if (val <= 0xFF)
                                getChar();
                            else
                                val = save;
                        }
                    }

                    c = char16_t(val);
                }
                break;
            }
        } else if (TokenBuf::isRawEOLChar(c)) {
            // A raw line terminator ends an ordinary string with an error,
            // but is part of the content of a template string.
            if (!parsingTemplate) {
                ungetCharIgnoreEOL(c);
                reportError(JSMSG_UNTERMINATED_STRING);
                return false;
            }
            // Store '\r' and "\r\n" as a single '\n'.
            if (c == '\r') {
                c = '\n';
                if (userbuf.peekRawChar() == '\n')
                    skipCharsIgnoreEOL(1);
            }
            updateLineInfoForEOL();
            updateFlagsForEOL();
        } else if (parsingTemplate && c == '$') {
            // "${" ends this template chunk; a lone '$' is ordinary content.
            if ((nc = getCharIgnoreEOL()) == '{')
                break;
            ungetCharIgnoreEOL(nc);
        }

        if (!tokenbuf.append(c)) {
            ReportOutOfMemory(cx);
            return false;
        }
    }

    JSAtom* atom = atomize(cx, tokenbuf);
    if (!atom)
        return false;

    if (!parsingTemplate) {
        (*tp)->type = TOK_STRING;
    } else {
        // The loop ended either at "${" or at the closing backtick.
        if (c == '$' && nc == '{')
            (*tp)->type = TOK_TEMPLATE_HEAD;
        else
            (*tp)->type = TOK_NO_SUBS_TEMPLATE;
    }

    (*tp)->setAtom(atom);
    return true;
}

// fgets-like line reader that recognises "\n", "\r\n" and a lone "\r" as
// line endings.
//
// Reads at most |size - 1| chars from |file| into |buf| and NUL-terminates
// the result. A '\n' ends the line and is kept in |buf|. A '\r' that is not
// followed by '\n' also ends the line: the '\r' is kept and the char after
// it is pushed back onto |file|. Returns the number of chars stored (not
// counting the terminator), or -1 if |size| is less than 1.
JS_FRIEND_API(int)
js_fgets(char* buf, int size, FILE* file)
{
    int n, i, c;
    bool crflag;

    n = size - 1;
    if (n < 0)
        return -1;

    crflag = false;
    for (i = 0; i < n && (c = fast_getc(file)) != EOF; i++) {
        buf[i] = c;
        if (c == '\n') {        // any \n ends a line
            i++;                // keep the \n; we know there is room for \0
            break;
        }
        if (crflag) {           // \r not followed by \n ends line at the \r
            ungetc(c, file);
            break;              // and overwrite c in buf with \0
        }
        crflag = (c == '\r');
    }

    buf[i] = '\0';
    return i;
}

// Returns the description string that FOR_EACH_TOKEN_KIND associates with
// |tt|. TOK_LIMIT is not a real token kind: it asserts, then falls through
// to the "<bad TokenKind>" result, as does any value without a case.
const char*
frontend::TokenKindToDesc(TokenKind tt)
{
    switch (tt) {
#define EMIT_CASE(name, desc) case TOK_##name: return desc;
      FOR_EACH_TOKEN_KIND(EMIT_CASE)
#undef EMIT_CASE
      case TOK_LIMIT:
        MOZ_ASSERT_UNREACHABLE("TOK_LIMIT should not be passed.");
        break;
    }

    return "<bad TokenKind>";
}

#ifdef DEBUG
// Debug-only: returns the enumerator's own spelling ("TOK_" followed by its
// name) for |tt|, or "<bad TokenKind>" for TOK_LIMIT and unknown values.
const char*
TokenKindToString(TokenKind tt)
{
    switch (tt) {
#define EMIT_CASE(name, desc) case TOK_##name: return "TOK_" #name;
      FOR_EACH_TOKEN_KIND(EMIT_CASE)
#undef EMIT_CASE
      case TOK_LIMIT: break;
    }

    return "<bad TokenKind>";
}
#endif