/*
 * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
24 */ 25 26 package jdk.nashorn.internal.parser; 27 28 import static jdk.nashorn.internal.parser.TokenType.ADD; 29 import static jdk.nashorn.internal.parser.TokenType.BINARY_NUMBER; 30 import static jdk.nashorn.internal.parser.TokenType.COMMENT; 31 import static jdk.nashorn.internal.parser.TokenType.DECIMAL; 32 import static jdk.nashorn.internal.parser.TokenType.DIRECTIVE_COMMENT; 33 import static jdk.nashorn.internal.parser.TokenType.EOF; 34 import static jdk.nashorn.internal.parser.TokenType.EOL; 35 import static jdk.nashorn.internal.parser.TokenType.ERROR; 36 import static jdk.nashorn.internal.parser.TokenType.ESCSTRING; 37 import static jdk.nashorn.internal.parser.TokenType.EXECSTRING; 38 import static jdk.nashorn.internal.parser.TokenType.FLOATING; 39 import static jdk.nashorn.internal.parser.TokenType.FUNCTION; 40 import static jdk.nashorn.internal.parser.TokenType.HEXADECIMAL; 41 import static jdk.nashorn.internal.parser.TokenType.LBRACE; 42 import static jdk.nashorn.internal.parser.TokenType.LPAREN; 43 import static jdk.nashorn.internal.parser.TokenType.OCTAL; 44 import static jdk.nashorn.internal.parser.TokenType.OCTAL_LEGACY; 45 import static jdk.nashorn.internal.parser.TokenType.RBRACE; 46 import static jdk.nashorn.internal.parser.TokenType.REGEX; 47 import static jdk.nashorn.internal.parser.TokenType.RPAREN; 48 import static jdk.nashorn.internal.parser.TokenType.STRING; 49 import static jdk.nashorn.internal.parser.TokenType.TEMPLATE; 50 import static jdk.nashorn.internal.parser.TokenType.TEMPLATE_HEAD; 51 import static jdk.nashorn.internal.parser.TokenType.TEMPLATE_MIDDLE; 52 import static jdk.nashorn.internal.parser.TokenType.TEMPLATE_TAIL; 53 import static jdk.nashorn.internal.parser.TokenType.XML; 54 55 import java.io.Serializable; 56 57 import jdk.nashorn.internal.runtime.ECMAErrors; 58 import jdk.nashorn.internal.runtime.ErrorManager; 59 import jdk.nashorn.internal.runtime.JSErrorType; 60 import jdk.nashorn.internal.runtime.JSType; 61 import 
jdk.nashorn.internal.runtime.ParserException; 62 import jdk.nashorn.internal.runtime.Source; 63 import jdk.nashorn.internal.runtime.options.Options; 64 65 /** 66 * Responsible for converting source content into a stream of tokens. 67 * 68 */ 69 @SuppressWarnings("fallthrough") 70 public class Lexer extends Scanner { 71 private static final long MIN_INT_L = Integer.MIN_VALUE; 72 private static final long MAX_INT_L = Integer.MAX_VALUE; 73 74 private static final boolean XML_LITERALS = Options.getBooleanProperty("nashorn.lexer.xmlliterals"); 75 76 /** Content source. */ 77 private final Source source; 78 79 /** Buffered stream for tokens. */ 80 private final TokenStream stream; 81 82 /** True if here and edit strings are supported. */ 83 private final boolean scripting; 84 85 /** True if parsing in ECMAScript 6 mode. */ 86 private final boolean es6; 87 88 /** True if a nested scan. (scan to completion, no EOF.) */ 89 private final boolean nested; 90 91 /** Pending new line number and position. */ 92 int pendingLine; 93 94 /** Position of last EOL + 1. */ 95 private int linePosition; 96 97 /** Type of last token added. 
*/ 98 private TokenType last; 99 100 private final boolean pauseOnFunctionBody; 101 private boolean pauseOnNextLeftBrace; 102 103 private int templateExpressionOpenBraces; 104 105 private static final String JAVASCRIPT_OTHER_WHITESPACE = 106 "\u2028" + // line separator 107 "\u2029" + // paragraph separator 108 "\u00a0" + // Latin-1 space 109 "\u1680" + // Ogham space mark 110 "\u180e" + // separator, Mongolian vowel 111 "\u2000" + // en quad 112 "\u2001" + // em quad 113 "\u2002" + // en space 114 "\u2003" + // em space 115 "\u2004" + // three-per-em space 116 "\u2005" + // four-per-em space 117 "\u2006" + // six-per-em space 118 "\u2007" + // figure space 119 "\u2008" + // punctuation space 120 "\u2009" + // thin space 121 "\u200a" + // hair space 122 "\u202f" + // narrow no-break space 123 "\u205f" + // medium mathematical space 124 "\u3000" + // ideographic space 125 "\ufeff" // byte order mark 126 ; 127 128 private static final String JAVASCRIPT_WHITESPACE_IN_REGEXP = 129 "\\u000a" + // line feed 130 "\\u000d" + // carriage return (ctrl-m) 131 "\\u2028" + // line separator 132 "\\u2029" + // paragraph separator 133 "\\u0009" + // tab 134 "\\u0020" + // ASCII space 135 "\\u000b" + // tabulation line 136 "\\u000c" + // ff (ctrl-l) 137 "\\u00a0" + // Latin-1 space 138 "\\u1680" + // Ogham space mark 139 "\\u180e" + // separator, Mongolian vowel 140 "\\u2000" + // en quad 141 "\\u2001" + // em quad 142 "\\u2002" + // en space 143 "\\u2003" + // em space 144 "\\u2004" + // three-per-em space 145 "\\u2005" + // four-per-em space 146 "\\u2006" + // six-per-em space 147 "\\u2007" + // figure space 148 "\\u2008" + // punctuation space 149 "\\u2009" + // thin space 150 "\\u200a" + // hair space 151 "\\u202f" + // narrow no-break space 152 "\\u205f" + // medium mathematical space 153 "\\u3000" + // ideographic space 154 "\\ufeff" // byte order mark 155 ; 156 unicodeEscape(final char ch)157 static String unicodeEscape(final char ch) { 158 final StringBuilder sb = new 
StringBuilder(); 159 160 sb.append("\\u"); 161 162 final String hex = Integer.toHexString(ch); 163 for (int i = hex.length(); i < 4; i++) { 164 sb.append('0'); 165 } 166 sb.append(hex); 167 168 return sb.toString(); 169 } 170 171 /** 172 * Constructor 173 * 174 * @param source the source 175 * @param stream the token stream to lex 176 */ Lexer(final Source source, final TokenStream stream)177 public Lexer(final Source source, final TokenStream stream) { 178 this(source, stream, false, false); 179 } 180 181 /** 182 * Constructor 183 * 184 * @param source the source 185 * @param stream the token stream to lex 186 * @param scripting are we in scripting mode 187 * @param es6 are we in ECMAScript 6 mode 188 */ Lexer(final Source source, final TokenStream stream, final boolean scripting, final boolean es6)189 public Lexer(final Source source, final TokenStream stream, final boolean scripting, final boolean es6) { 190 this(source, 0, source.getLength(), stream, scripting, es6, false); 191 } 192 193 /** 194 * Constructor 195 * 196 * @param source the source 197 * @param start start position in source from which to start lexing 198 * @param len length of source segment to lex 199 * @param stream token stream to lex 200 * @param scripting are we in scripting mode 201 * @param es6 are we in ECMAScript 6 mode 202 * @param pauseOnFunctionBody if true, lexer will return from {@link #lexify()} when it encounters a 203 * function body. This is used with the feature where the parser is skipping nested function bodies to 204 * avoid reading ahead unnecessarily when we skip the function bodies. 
205 */ Lexer(final Source source, final int start, final int len, final TokenStream stream, final boolean scripting, final boolean es6, final boolean pauseOnFunctionBody)206 public Lexer(final Source source, final int start, final int len, final TokenStream stream, final boolean scripting, final boolean es6, final boolean pauseOnFunctionBody) { 207 super(source.getContent(), 1, start, len); 208 this.source = source; 209 this.stream = stream; 210 this.scripting = scripting; 211 this.es6 = es6; 212 this.nested = false; 213 this.pendingLine = 1; 214 this.last = EOL; 215 216 this.pauseOnFunctionBody = pauseOnFunctionBody; 217 } 218 Lexer(final Lexer lexer, final State state)219 private Lexer(final Lexer lexer, final State state) { 220 super(lexer, state); 221 222 source = lexer.source; 223 stream = lexer.stream; 224 scripting = lexer.scripting; 225 es6 = lexer.es6; 226 nested = true; 227 228 pendingLine = state.pendingLine; 229 linePosition = state.linePosition; 230 last = EOL; 231 pauseOnFunctionBody = false; 232 } 233 234 static class State extends Scanner.State { 235 /** Pending new line number and position. */ 236 public final int pendingLine; 237 238 /** Position of last EOL + 1. */ 239 public final int linePosition; 240 241 /** Type of last token added. */ 242 public final TokenType last; 243 244 /* 245 * Constructor. 246 */ 247 State(final int position, final int limit, final int line, final int pendingLine, final int linePosition, final TokenType last)248 State(final int position, final int limit, final int line, final int pendingLine, final int linePosition, final TokenType last) { 249 super(position, limit, line); 250 251 this.pendingLine = pendingLine; 252 this.linePosition = linePosition; 253 this.last = last; 254 } 255 } 256 257 /** 258 * Save the state of the scan. 259 * 260 * @return Captured state. 
261 */ 262 @Override saveState()263 State saveState() { 264 return new State(position, limit, line, pendingLine, linePosition, last); 265 } 266 267 /** 268 * Restore the state of the scan. 269 * 270 * @param state 271 * Captured state. 272 */ restoreState(final State state)273 void restoreState(final State state) { 274 super.restoreState(state); 275 276 pendingLine = state.pendingLine; 277 linePosition = state.linePosition; 278 last = state.last; 279 } 280 281 /** 282 * Add a new token to the stream. 283 * 284 * @param type 285 * Token type. 286 * @param start 287 * Start position. 288 * @param end 289 * End position. 290 */ add(final TokenType type, final int start, final int end)291 protected void add(final TokenType type, final int start, final int end) { 292 // Record last token. 293 last = type; 294 295 // Only emit the last EOL in a cluster. 296 if (type == EOL) { 297 pendingLine = end; 298 linePosition = start; 299 } else { 300 // Write any pending EOL to stream. 301 if (pendingLine != -1) { 302 stream.put(Token.toDesc(EOL, linePosition, pendingLine)); 303 pendingLine = -1; 304 } 305 306 // Write token to stream. 307 stream.put(Token.toDesc(type, start, end - start)); 308 } 309 } 310 311 /** 312 * Add a new token to the stream. 313 * 314 * @param type 315 * Token type. 316 * @param start 317 * Start position. 318 */ add(final TokenType type, final int start)319 protected void add(final TokenType type, final int start) { 320 add(type, start, position); 321 } 322 323 /** 324 * Return the String of valid whitespace characters for regular 325 * expressions in JavaScript 326 * @return regexp whitespace string 327 */ getWhitespaceRegExp()328 public static String getWhitespaceRegExp() { 329 return JAVASCRIPT_WHITESPACE_IN_REGEXP; 330 } 331 332 /** 333 * Skip end of line. 334 * 335 * @param addEOL true if EOL token should be recorded. 
336 */ skipEOL(final boolean addEOL)337 private void skipEOL(final boolean addEOL) { 338 339 if (ch0 == '\r') { // detect \r\n pattern 340 skip(1); 341 if (ch0 == '\n') { 342 skip(1); 343 } 344 } else { // all other space, ch0 is guaranteed to be EOL or \0 345 skip(1); 346 } 347 348 // bump up line count 349 line++; 350 351 if (addEOL) { 352 // Add an EOL token. 353 add(EOL, position, line); 354 } 355 } 356 357 /** 358 * Skip over rest of line including end of line. 359 * 360 * @param addEOL true if EOL token should be recorded. 361 */ skipLine(final boolean addEOL)362 private void skipLine(final boolean addEOL) { 363 // Ignore characters. 364 while (!isEOL(ch0) && !atEOF()) { 365 skip(1); 366 } 367 // Skip over end of line. 368 skipEOL(addEOL); 369 } 370 371 /** 372 * Test whether a char is valid JavaScript whitespace 373 * @param ch a char 374 * @return true if valid JavaScript whitespace 375 */ isJSWhitespace(final char ch)376 public static boolean isJSWhitespace(final char ch) { 377 return ch == ' ' // space 378 || ch >= '\t' && ch <= '\r' // 0x09..0x0d: tab, line feed, tabulation line, ff, carriage return 379 || ch >= 160 && isOtherJSWhitespace(ch); 380 } 381 isOtherJSWhitespace(final char ch)382 private static boolean isOtherJSWhitespace(final char ch) { 383 return JAVASCRIPT_OTHER_WHITESPACE.indexOf(ch) != -1; 384 } 385 386 /** 387 * Test whether a char is valid JavaScript end of line 388 * @param ch a char 389 * @return true if valid JavaScript end of line 390 */ isJSEOL(final char ch)391 public static boolean isJSEOL(final char ch) { 392 return ch == '\n' // line feed 393 || ch == '\r' // carriage return (ctrl-m) 394 || ch == '\u2028' // line separator 395 || ch == '\u2029'; // paragraph separator 396 } 397 398 /** 399 * Test if char is a string delimiter, e.g. '\' or '"'. 
400 * @param ch a char 401 * @return true if string delimiter 402 */ isStringDelimiter(final char ch)403 protected boolean isStringDelimiter(final char ch) { 404 return ch == '\'' || ch == '"'; 405 } 406 407 /** 408 * Test if char is a template literal delimiter ('`'). 409 */ isTemplateDelimiter(final char ch)410 private static boolean isTemplateDelimiter(final char ch) { 411 return ch == '`'; 412 } 413 414 /** 415 * Test whether a char is valid JavaScript whitespace 416 * @param ch a char 417 * @return true if valid JavaScript whitespace 418 */ isWhitespace(final char ch)419 protected boolean isWhitespace(final char ch) { 420 return Lexer.isJSWhitespace(ch); 421 } 422 423 /** 424 * Test whether a char is valid JavaScript end of line 425 * @param ch a char 426 * @return true if valid JavaScript end of line 427 */ isEOL(final char ch)428 protected boolean isEOL(final char ch) { 429 return Lexer.isJSEOL(ch); 430 } 431 432 /** 433 * Skip over whitespace and detect end of line, adding EOL tokens if 434 * encountered. 435 * 436 * @param addEOL true if EOL tokens should be recorded. 437 */ skipWhitespace(final boolean addEOL)438 private void skipWhitespace(final boolean addEOL) { 439 while (isWhitespace(ch0)) { 440 if (isEOL(ch0)) { 441 skipEOL(addEOL); 442 } else { 443 skip(1); 444 } 445 } 446 } 447 448 /** 449 * Skip over comments. 450 * 451 * @return True if a comment. 452 */ skipComments()453 protected boolean skipComments() { 454 // Save the current position. 455 final int start = position; 456 457 if (ch0 == '/') { 458 // Is it a // comment. 459 if (ch1 == '/') { 460 // Skip over //. 461 skip(2); 462 463 boolean directiveComment = false; 464 if ((ch0 == '#' || ch0 == '@') && (ch1 == ' ')) { 465 directiveComment = true; 466 } 467 468 // Scan for EOL. 469 while (!atEOF() && !isEOL(ch0)) { 470 skip(1); 471 } 472 // Did detect a comment. 473 add(directiveComment? DIRECTIVE_COMMENT : COMMENT, start); 474 return true; 475 } else if (ch1 == '*') { 476 // Skip over /*. 
477 skip(2); 478 // Scan for */. 479 while (!atEOF() && !(ch0 == '*' && ch1 == '/')) { 480 // If end of line handle else skip character. 481 if (isEOL(ch0)) { 482 skipEOL(true); 483 } else { 484 skip(1); 485 } 486 } 487 488 if (atEOF()) { 489 // TODO - Report closing */ missing in parser. 490 add(ERROR, start); 491 } else { 492 // Skip */. 493 skip(2); 494 } 495 496 // Did detect a comment. 497 add(COMMENT, start); 498 return true; 499 } 500 } else if (ch0 == '#') { 501 assert scripting; 502 // shell style comment 503 // Skip over #. 504 skip(1); 505 // Scan for EOL. 506 while (!atEOF() && !isEOL(ch0)) { 507 skip(1); 508 } 509 // Did detect a comment. 510 add(COMMENT, start); 511 return true; 512 } 513 514 // Not a comment. 515 return false; 516 } 517 518 /** 519 * Convert a regex token to a token object. 520 * 521 * @param start Position in source content. 522 * @param length Length of regex token. 523 * @return Regex token object. 524 */ valueOfPattern(final int start, final int length)525 public RegexToken valueOfPattern(final int start, final int length) { 526 // Save the current position. 527 final int savePosition = position; 528 // Reset to beginning of content. 529 reset(start); 530 // Buffer for recording characters. 531 final StringBuilder sb = new StringBuilder(length); 532 533 // Skip /. 534 skip(1); 535 boolean inBrackets = false; 536 // Scan for closing /, stopping at end of line. 537 while (!atEOF() && ch0 != '/' && !isEOL(ch0) || inBrackets) { 538 // Skip over escaped character. 539 if (ch0 == '\\') { 540 sb.append(ch0); 541 sb.append(ch1); 542 skip(2); 543 } else { 544 if (ch0 == '[') { 545 inBrackets = true; 546 } else if (ch0 == ']') { 547 inBrackets = false; 548 } 549 550 // Skip literal character. 551 sb.append(ch0); 552 skip(1); 553 } 554 } 555 556 // Get pattern as string. 557 final String regex = sb.toString(); 558 559 // Skip /. 560 skip(1); 561 562 // Options as string. 
563 final String options = source.getString(position, scanIdentifier()); 564 565 reset(savePosition); 566 567 // Compile the pattern. 568 return new RegexToken(regex, options); 569 } 570 571 /** 572 * Return true if the given token can be the beginning of a literal. 573 * 574 * @param token a token 575 * @return true if token can start a literal. 576 */ canStartLiteral(final TokenType token)577 public boolean canStartLiteral(final TokenType token) { 578 return token.startsWith('/') || ((scripting || XML_LITERALS) && token.startsWith('<')); 579 } 580 581 /** 582 * interface to receive line information for multi-line literals. 583 */ 584 protected interface LineInfoReceiver { 585 /** 586 * Receives line information 587 * @param line last line number 588 * @param linePosition position of last line 589 */ lineInfo(int line, int linePosition)590 public void lineInfo(int line, int linePosition); 591 } 592 593 /** 594 * Check whether the given token represents the beginning of a literal. If so scan 595 * the literal and return <code>true</code>, otherwise return false. 596 * 597 * @param token the token. 598 * @param startTokenType the token type. 599 * @param lir LineInfoReceiver that receives line info for multi-line string literals. 600 * @return True if a literal beginning with startToken was found and scanned. 601 */ scanLiteral(final long token, final TokenType startTokenType, final LineInfoReceiver lir)602 protected boolean scanLiteral(final long token, final TokenType startTokenType, final LineInfoReceiver lir) { 603 // Check if it can be a literal. 604 if (!canStartLiteral(startTokenType)) { 605 return false; 606 } 607 // We break on ambiguous tokens so if we already moved on it can't be a literal. 
608 if (stream.get(stream.last()) != token) { 609 return false; 610 } 611 612 // Record current position in case multiple heredocs start on this line - see JDK-8073653 613 final State state = saveState(); 614 // Rewind to token start position 615 reset(Token.descPosition(token)); 616 617 if (ch0 == '/') { 618 return scanRegEx(); 619 } else if (ch0 == '<') { 620 if (ch1 == '<') { 621 return scanHereString(lir, state); 622 } else if (Character.isJavaIdentifierStart(ch1)) { 623 return scanXMLLiteral(); 624 } 625 } 626 627 return false; 628 } 629 630 /** 631 * Scan over regex literal. 632 * 633 * @return True if a regex literal. 634 */ scanRegEx()635 private boolean scanRegEx() { 636 assert ch0 == '/'; 637 // Make sure it's not a comment. 638 if (ch1 != '/' && ch1 != '*') { 639 // Record beginning of literal. 640 final int start = position; 641 // Skip /. 642 skip(1); 643 boolean inBrackets = false; 644 645 // Scan for closing /, stopping at end of line. 646 while (!atEOF() && (ch0 != '/' || inBrackets) && !isEOL(ch0)) { 647 // Skip over escaped character. 648 if (ch0 == '\\') { 649 skip(1); 650 if (isEOL(ch0)) { 651 reset(start); 652 return false; 653 } 654 skip(1); 655 } else { 656 if (ch0 == '[') { 657 inBrackets = true; 658 } else if (ch0 == ']') { 659 inBrackets = false; 660 } 661 662 // Skip literal character. 663 skip(1); 664 } 665 } 666 667 // If regex literal. 668 if (ch0 == '/') { 669 // Skip /. 670 skip(1); 671 672 // Skip over options. 673 while (!atEOF() && Character.isJavaIdentifierPart(ch0) || ch0 == '\\' && ch1 == 'u') { 674 skip(1); 675 } 676 677 // Add regex token. 678 add(REGEX, start); 679 // Regex literal detected. 680 return true; 681 } 682 683 // False start try again. 684 reset(start); 685 } 686 687 // Regex literal not detected. 688 return false; 689 } 690 691 /** 692 * Convert a digit to a integer. Can't use Character.digit since we are 693 * restricted to ASCII by the spec. 694 * 695 * @param ch Character to convert. 
696 * @param base Numeric base. 697 * 698 * @return The converted digit or -1 if invalid. 699 */ convertDigit(final char ch, final int base)700 protected static int convertDigit(final char ch, final int base) { 701 int digit; 702 703 if ('0' <= ch && ch <= '9') { 704 digit = ch - '0'; 705 } else if ('A' <= ch && ch <= 'Z') { 706 digit = ch - 'A' + 10; 707 } else if ('a' <= ch && ch <= 'z') { 708 digit = ch - 'a' + 10; 709 } else { 710 return -1; 711 } 712 713 return digit < base ? digit : -1; 714 } 715 716 717 /** 718 * Get the value of a hexadecimal numeric sequence. 719 * 720 * @param length Number of digits. 721 * @param type Type of token to report against. 722 * @return Value of sequence or < 0 if no digits. 723 */ hexSequence(final int length, final TokenType type)724 private int hexSequence(final int length, final TokenType type) { 725 int value = 0; 726 727 for (int i = 0; i < length; i++) { 728 final int digit = convertDigit(ch0, 16); 729 730 if (digit == -1) { 731 error(Lexer.message("invalid.hex"), type, position, limit); 732 return i == 0 ? -1 : value; 733 } 734 735 value = digit | value << 4; 736 skip(1); 737 } 738 739 return value; 740 } 741 742 /** 743 * Get the value of an octal numeric sequence. This parses up to 3 digits with a maximum value of 255. 744 * 745 * @return Value of sequence. 746 */ octalSequence()747 private int octalSequence() { 748 int value = 0; 749 750 for (int i = 0; i < 3; i++) { 751 final int digit = convertDigit(ch0, 8); 752 753 if (digit == -1) { 754 break; 755 } 756 value = digit | value << 3; 757 skip(1); 758 759 if (i == 1 && value >= 32) { 760 break; 761 } 762 } 763 return value; 764 } 765 766 /** 767 * Convert a string to a JavaScript identifier. 768 * 769 * @param start Position in source content. 770 * @param length Length of token. 771 * @return Ident string or null if an error. 
772 */ valueOfIdent(final int start, final int length)773 private String valueOfIdent(final int start, final int length) throws RuntimeException { 774 // Save the current position. 775 final int savePosition = position; 776 // End of scan. 777 final int end = start + length; 778 // Reset to beginning of content. 779 reset(start); 780 // Buffer for recording characters. 781 final StringBuilder sb = new StringBuilder(length); 782 783 // Scan until end of line or end of file. 784 while (!atEOF() && position < end && !isEOL(ch0)) { 785 // If escape character. 786 if (ch0 == '\\' && ch1 == 'u') { 787 skip(2); 788 final int ch = hexSequence(4, TokenType.IDENT); 789 assert ! isWhitespace((char)ch); 790 assert ch >= 0; 791 sb.append((char)ch); 792 } else { 793 // Add regular character. 794 sb.append(ch0); 795 skip(1); 796 } 797 } 798 799 // Restore position. 800 reset(savePosition); 801 802 return sb.toString(); 803 } 804 805 /** 806 * Scan over and identifier or keyword. Handles identifiers containing 807 * encoded Unicode chars. 808 * 809 * Example: 810 * 811 * var \u0042 = 44; 812 */ scanIdentifierOrKeyword()813 private void scanIdentifierOrKeyword() { 814 // Record beginning of identifier. 815 final int start = position; 816 // Scan identifier. 817 final int length = scanIdentifier(); 818 // Check to see if it is a keyword. 819 final TokenType type = TokenLookup.lookupKeyword(content, start, length); 820 if (type == FUNCTION && pauseOnFunctionBody) { 821 pauseOnNextLeftBrace = true; 822 } 823 // Add keyword or identifier token. 824 add(type, start); 825 } 826 827 /** 828 * Convert a string to a JavaScript string object. 829 * 830 * @param start Position in source content. 831 * @param length Length of token. 832 * @return JavaScript string object. 
833 */ valueOfString(final int start, final int length, final boolean strict)834 private String valueOfString(final int start, final int length, final boolean strict) throws RuntimeException { 835 // Save the current position. 836 final int savePosition = position; 837 // Calculate the end position. 838 final int end = start + length; 839 // Reset to beginning of string. 840 reset(start); 841 842 // Buffer for recording characters. 843 final StringBuilder sb = new StringBuilder(length); 844 845 // Scan until end of string. 846 while (position < end) { 847 // If escape character. 848 if (ch0 == '\\') { 849 skip(1); 850 851 final char next = ch0; 852 final int afterSlash = position; 853 854 skip(1); 855 856 // Special characters. 857 switch (next) { 858 case '0': 859 case '1': 860 case '2': 861 case '3': 862 case '4': 863 case '5': 864 case '6': 865 case '7': { 866 if (strict) { 867 // "\0" itself is allowed in strict mode. Only other 'real' 868 // octal escape sequences are not allowed (eg. "\02", "\31"). 869 // See section 7.8.4 String literals production EscapeSequence 870 if (next != '0' || (ch0 >= '0' && ch0 <= '9')) { 871 error(Lexer.message("strict.no.octal"), STRING, position, limit); 872 } 873 } 874 reset(afterSlash); 875 // Octal sequence. 
876 final int ch = octalSequence(); 877 878 if (ch < 0) { 879 sb.append('\\'); 880 sb.append('x'); 881 } else { 882 sb.append((char)ch); 883 } 884 break; 885 } 886 case 'n': 887 sb.append('\n'); 888 break; 889 case 't': 890 sb.append('\t'); 891 break; 892 case 'b': 893 sb.append('\b'); 894 break; 895 case 'f': 896 sb.append('\f'); 897 break; 898 case 'r': 899 sb.append('\r'); 900 break; 901 case '\'': 902 sb.append('\''); 903 break; 904 case '\"': 905 sb.append('\"'); 906 break; 907 case '\\': 908 sb.append('\\'); 909 break; 910 case '\r': // CR | CRLF 911 if (ch0 == '\n') { 912 skip(1); 913 } 914 // fall through 915 case '\n': // LF 916 case '\u2028': // LS 917 case '\u2029': // PS 918 // continue on the next line, slash-return continues string 919 // literal 920 break; 921 case 'x': { 922 // Hex sequence. 923 final int ch = hexSequence(2, STRING); 924 925 if (ch < 0) { 926 sb.append('\\'); 927 sb.append('x'); 928 } else { 929 sb.append((char)ch); 930 } 931 } 932 break; 933 case 'u': { 934 // Unicode sequence. 935 final int ch = hexSequence(4, STRING); 936 937 if (ch < 0) { 938 sb.append('\\'); 939 sb.append('u'); 940 } else { 941 sb.append((char)ch); 942 } 943 } 944 break; 945 case 'v': 946 sb.append('\u000B'); 947 break; 948 // All other characters. 949 default: 950 sb.append(next); 951 break; 952 } 953 } else if (ch0 == '\r') { 954 // Convert CR-LF or CR to LF line terminator. 955 sb.append('\n'); 956 skip(ch1 == '\n' ? 2 : 1); 957 } else { 958 // Add regular character. 959 sb.append(ch0); 960 skip(1); 961 } 962 } 963 964 // Restore position. 965 reset(savePosition); 966 967 return sb.toString(); 968 } 969 970 /** 971 * Scan over a string literal. 972 * @param add true if we are not just scanning but should actually modify the token stream 973 */ scanString(final boolean add)974 protected void scanString(final boolean add) { 975 // Type of string. 976 TokenType type = STRING; 977 // Record starting quote. 978 final char quote = ch0; 979 // Skip over quote. 
980 skip(1); 981 982 // Record beginning of string content. 983 final State stringState = saveState(); 984 985 // Scan until close quote or end of line. 986 while (!atEOF() && ch0 != quote && !isEOL(ch0)) { 987 // Skip over escaped character. 988 if (ch0 == '\\') { 989 type = ESCSTRING; 990 skip(1); 991 if (isEOL(ch0)) { 992 // Multiline string literal 993 skipEOL(false); 994 continue; 995 } 996 } 997 // Skip literal character. 998 skip(1); 999 } 1000 1001 // If close quote. 1002 if (ch0 == quote) { 1003 // Skip close quote. 1004 skip(1); 1005 } else { 1006 error(Lexer.message("missing.close.quote"), STRING, position, limit); 1007 } 1008 1009 // If not just scanning. 1010 if (add) { 1011 // Record end of string. 1012 stringState.setLimit(position - 1); 1013 1014 if (scripting && !stringState.isEmpty()) { 1015 switch (quote) { 1016 case '`': 1017 // Mark the beginning of an exec string. 1018 add(EXECSTRING, stringState.position, stringState.limit); 1019 // Frame edit string with left brace. 1020 add(LBRACE, stringState.position, stringState.position); 1021 // Process edit string. 1022 editString(type, stringState); 1023 // Frame edit string with right brace. 1024 add(RBRACE, stringState.limit, stringState.limit); 1025 break; 1026 case '"': 1027 // Only edit double quoted strings. 1028 editString(type, stringState); 1029 break; 1030 case '\'': 1031 // Add string token without editing. 1032 add(type, stringState.position, stringState.limit); 1033 break; 1034 default: 1035 break; 1036 } 1037 } else { 1038 /// Add string token without editing. 1039 add(type, stringState.position, stringState.limit); 1040 } 1041 } 1042 } 1043 1044 /** 1045 * Scan over a template string literal. 1046 */ scanTemplate()1047 private void scanTemplate() { 1048 assert ch0 == '`'; 1049 TokenType type = TEMPLATE; 1050 1051 // Skip over quote and record beginning of string content. 
1052 skip(1); 1053 State stringState = saveState(); 1054 1055 // Scan until close quote 1056 while (!atEOF()) { 1057 // Skip over escaped character. 1058 if (ch0 == '`') { 1059 skip(1); 1060 // Record end of string. 1061 stringState.setLimit(position - 1); 1062 add(type == TEMPLATE ? type : TEMPLATE_TAIL, stringState.position, stringState.limit); 1063 return; 1064 } else if (ch0 == '$' && ch1 == '{') { 1065 skip(2); 1066 stringState.setLimit(position - 2); 1067 add(type == TEMPLATE ? TEMPLATE_HEAD : type, stringState.position, stringState.limit); 1068 1069 // scan to RBRACE 1070 final Lexer expressionLexer = new Lexer(this, saveState()); 1071 expressionLexer.templateExpressionOpenBraces = 1; 1072 expressionLexer.lexify(); 1073 restoreState(expressionLexer.saveState()); 1074 1075 // scan next middle or tail of the template literal 1076 assert ch0 == '}'; 1077 type = TEMPLATE_MIDDLE; 1078 1079 // Skip over rbrace and record beginning of string content. 1080 skip(1); 1081 stringState = saveState(); 1082 1083 continue; 1084 } else if (ch0 == '\\') { 1085 skip(1); 1086 // EscapeSequence 1087 if (isEOL(ch0)) { 1088 // LineContinuation 1089 skipEOL(false); 1090 continue; 1091 } 1092 } else if (isEOL(ch0)) { 1093 // LineTerminatorSequence 1094 skipEOL(false); 1095 continue; 1096 } 1097 1098 // Skip literal character. 1099 skip(1); 1100 } 1101 1102 error(Lexer.message("missing.close.quote"), TEMPLATE, position, limit); 1103 } 1104 1105 /** 1106 * Convert string to number. 1107 * 1108 * @param valueString String to convert. 1109 * @param radix Numeric base. 1110 * @return Converted number. 
1111 */ valueOf(final String valueString, final int radix)1112 private static Number valueOf(final String valueString, final int radix) throws NumberFormatException { 1113 try { 1114 return Integer.parseInt(valueString, radix); 1115 } catch (final NumberFormatException e) { 1116 if (radix == 10) { 1117 return Double.valueOf(valueString); 1118 } 1119 1120 double value = 0.0; 1121 1122 for (int i = 0; i < valueString.length(); i++) { 1123 final char ch = valueString.charAt(i); 1124 // Preverified, should always be a valid digit. 1125 final int digit = convertDigit(ch, radix); 1126 value *= radix; 1127 value += digit; 1128 } 1129 1130 return value; 1131 } 1132 } 1133 1134 /** 1135 * Scan a number. 1136 */ scanNumber()1137 protected void scanNumber() { 1138 // Record beginning of number. 1139 final int start = position; 1140 // Assume value is a decimal. 1141 TokenType type = DECIMAL; 1142 1143 // First digit of number. 1144 int digit = convertDigit(ch0, 10); 1145 1146 // If number begins with 0x. 1147 if (digit == 0 && (ch1 == 'x' || ch1 == 'X') && convertDigit(ch2, 16) != -1) { 1148 // Skip over 0xN. 1149 skip(3); 1150 // Skip over remaining digits. 1151 while (convertDigit(ch0, 16) != -1) { 1152 skip(1); 1153 } 1154 1155 type = HEXADECIMAL; 1156 } else if (digit == 0 && es6 && (ch1 == 'o' || ch1 == 'O') && convertDigit(ch2, 8) != -1) { 1157 // Skip over 0oN. 1158 skip(3); 1159 // Skip over remaining digits. 1160 while (convertDigit(ch0, 8) != -1) { 1161 skip(1); 1162 } 1163 1164 type = OCTAL; 1165 } else if (digit == 0 && es6 && (ch1 == 'b' || ch1 == 'B') && convertDigit(ch2, 2) != -1) { 1166 // Skip over 0bN. 1167 skip(3); 1168 // Skip over remaining digits. 1169 while (convertDigit(ch0, 2) != -1) { 1170 skip(1); 1171 } 1172 1173 type = BINARY_NUMBER; 1174 } else { 1175 // Check for possible octal constant. 1176 boolean octal = digit == 0; 1177 // Skip first digit if not leading '.'. 1178 if (digit != -1) { 1179 skip(1); 1180 } 1181 1182 // Skip remaining digits. 
1183 while ((digit = convertDigit(ch0, 10)) != -1) { 1184 // Check octal only digits. 1185 octal = octal && digit < 8; 1186 // Skip digit. 1187 skip(1); 1188 } 1189 1190 if (octal && position - start > 1) { 1191 type = OCTAL_LEGACY; 1192 } else if (ch0 == '.' || ch0 == 'E' || ch0 == 'e') { 1193 // Must be a double. 1194 if (ch0 == '.') { 1195 // Skip period. 1196 skip(1); 1197 // Skip mantissa. 1198 while (convertDigit(ch0, 10) != -1) { 1199 skip(1); 1200 } 1201 } 1202 1203 // Detect exponent. 1204 if (ch0 == 'E' || ch0 == 'e') { 1205 // Skip E. 1206 skip(1); 1207 // Detect and skip exponent sign. 1208 if (ch0 == '+' || ch0 == '-') { 1209 skip(1); 1210 } 1211 // Skip exponent. 1212 while (convertDigit(ch0, 10) != -1) { 1213 skip(1); 1214 } 1215 } 1216 1217 type = FLOATING; 1218 } 1219 } 1220 1221 if (Character.isJavaIdentifierStart(ch0)) { 1222 error(Lexer.message("missing.space.after.number"), type, position, 1); 1223 } 1224 1225 // Add number token. 1226 add(type, start); 1227 } 1228 1229 /** 1230 * Convert a regex token to a token object. 1231 * 1232 * @param start Position in source content. 1233 * @param length Length of regex token. 1234 * @return Regex token object. 1235 */ valueOfXML(final int start, final int length)1236 XMLToken valueOfXML(final int start, final int length) { 1237 return new XMLToken(source.getString(start, length)); 1238 } 1239 1240 /** 1241 * Scan over a XML token. 1242 * 1243 * @return TRUE if is an XML literal. 1244 */ scanXMLLiteral()1245 private boolean scanXMLLiteral() { 1246 assert ch0 == '<' && Character.isJavaIdentifierStart(ch1); 1247 if (XML_LITERALS) { 1248 // Record beginning of xml expression. 
1249 final int start = position; 1250 1251 int openCount = 0; 1252 1253 do { 1254 if (ch0 == '<') { 1255 if (ch1 == '/' && Character.isJavaIdentifierStart(ch2)) { 1256 skip(3); 1257 openCount--; 1258 } else if (Character.isJavaIdentifierStart(ch1)) { 1259 skip(2); 1260 openCount++; 1261 } else if (ch1 == '?') { 1262 skip(2); 1263 } else if (ch1 == '!' && ch2 == '-' && ch3 == '-') { 1264 skip(4); 1265 } else { 1266 reset(start); 1267 return false; 1268 } 1269 1270 while (!atEOF() && ch0 != '>') { 1271 if (ch0 == '/' && ch1 == '>') { 1272 openCount--; 1273 skip(1); 1274 break; 1275 } else if (ch0 == '\"' || ch0 == '\'') { 1276 scanString(false); 1277 } else { 1278 skip(1); 1279 } 1280 } 1281 1282 if (ch0 != '>') { 1283 reset(start); 1284 return false; 1285 } 1286 1287 skip(1); 1288 } else if (atEOF()) { 1289 reset(start); 1290 return false; 1291 } else { 1292 skip(1); 1293 } 1294 } while (openCount > 0); 1295 1296 add(XML, start); 1297 return true; 1298 } 1299 1300 return false; 1301 } 1302 1303 /** 1304 * Scan over identifier characters. 1305 * 1306 * @return Length of identifier or zero if none found. 1307 */ scanIdentifier()1308 private int scanIdentifier() { 1309 final int start = position; 1310 1311 // Make sure first character is valid start character. 1312 if (ch0 == '\\' && ch1 == 'u') { 1313 skip(2); 1314 final int ch = hexSequence(4, TokenType.IDENT); 1315 1316 if (!Character.isJavaIdentifierStart(ch)) { 1317 error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position); 1318 } 1319 } else if (!Character.isJavaIdentifierStart(ch0)) { 1320 // Not an identifier. 1321 return 0; 1322 } 1323 1324 // Make sure remaining characters are valid part characters. 
1325 while (!atEOF()) { 1326 if (ch0 == '\\' && ch1 == 'u') { 1327 skip(2); 1328 final int ch = hexSequence(4, TokenType.IDENT); 1329 1330 if (!Character.isJavaIdentifierPart(ch)) { 1331 error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position); 1332 } 1333 } else if (Character.isJavaIdentifierPart(ch0)) { 1334 skip(1); 1335 } else { 1336 break; 1337 } 1338 } 1339 1340 // Length of identifier sequence. 1341 return position - start; 1342 } 1343 1344 /** 1345 * Compare two identifiers (in content) for equality. 1346 * 1347 * @param aStart Start of first identifier. 1348 * @param aLength Length of first identifier. 1349 * @param bStart Start of second identifier. 1350 * @param bLength Length of second identifier. 1351 * @return True if equal. 1352 */ identifierEqual(final int aStart, final int aLength, final int bStart, final int bLength)1353 private boolean identifierEqual(final int aStart, final int aLength, final int bStart, final int bLength) { 1354 if (aLength == bLength) { 1355 for (int i = 0; i < aLength; i++) { 1356 if (content[aStart + i] != content[bStart + i]) { 1357 return false; 1358 } 1359 } 1360 1361 return true; 1362 } 1363 1364 return false; 1365 } 1366 1367 /** 1368 * Detect if a line starts with a marker identifier. 1369 * 1370 * @param identStart Start of identifier. 1371 * @param identLength Length of identifier. 1372 * @return True if detected. 1373 */ hasHereMarker(final int identStart, final int identLength)1374 private boolean hasHereMarker(final int identStart, final int identLength) { 1375 // Skip any whitespace. 1376 skipWhitespace(false); 1377 1378 return identifierEqual(identStart, identLength, position, scanIdentifier()); 1379 } 1380 1381 /** 1382 * Lexer to service edit strings. 1383 */ 1384 private static class EditStringLexer extends Lexer { 1385 /** Type of string literals to emit. */ 1386 final TokenType stringType; 1387 1388 /* 1389 * Constructor. 
1390 */ 1391 EditStringLexer(final Lexer lexer, final TokenType stringType, final State stringState)1392 EditStringLexer(final Lexer lexer, final TokenType stringType, final State stringState) { 1393 super(lexer, stringState); 1394 1395 this.stringType = stringType; 1396 } 1397 1398 /** 1399 * Lexify the contents of the string. 1400 */ 1401 @Override lexify()1402 public void lexify() { 1403 // Record start of string position. 1404 int stringStart = position; 1405 // Indicate that the priming first string has not been emitted. 1406 boolean primed = false; 1407 1408 while (true) { 1409 // Detect end of content. 1410 if (atEOF()) { 1411 break; 1412 } 1413 1414 // Honour escapes (should be well formed.) 1415 if (ch0 == '\\' && stringType == ESCSTRING) { 1416 skip(2); 1417 1418 continue; 1419 } 1420 1421 // If start of expression. 1422 if (ch0 == '$' && ch1 == '{') { 1423 if (!primed || stringStart != position) { 1424 if (primed) { 1425 add(ADD, stringStart, stringStart + 1); 1426 } 1427 1428 add(stringType, stringStart, position); 1429 primed = true; 1430 } 1431 1432 // Skip ${ 1433 skip(2); 1434 1435 // Save expression state. 1436 final State expressionState = saveState(); 1437 1438 // Start with one open brace. 1439 int braceCount = 1; 1440 1441 // Scan for the rest of the string. 1442 while (!atEOF()) { 1443 // If closing brace. 1444 if (ch0 == '}') { 1445 // Break only only if matching brace. 1446 if (--braceCount == 0) { 1447 break; 1448 } 1449 } else if (ch0 == '{') { 1450 // Bump up the brace count. 1451 braceCount++; 1452 } 1453 1454 // Skip to next character. 1455 skip(1); 1456 } 1457 1458 // If braces don't match then report an error. 1459 if (braceCount != 0) { 1460 error(Lexer.message("edit.string.missing.brace"), LBRACE, expressionState.position - 1, 1); 1461 } 1462 1463 // Mark end of expression. 1464 expressionState.setLimit(position); 1465 // Skip closing brace. 1466 skip(1); 1467 1468 // Start next string. 
1469 stringStart = position; 1470 1471 // Concatenate expression. 1472 add(ADD, expressionState.position, expressionState.position + 1); 1473 add(LPAREN, expressionState.position, expressionState.position + 1); 1474 1475 // Scan expression. 1476 final Lexer lexer = new Lexer(this, expressionState); 1477 lexer.lexify(); 1478 1479 // Close out expression parenthesis. 1480 add(RPAREN, position - 1, position); 1481 1482 continue; 1483 } 1484 1485 // Next character in string. 1486 skip(1); 1487 } 1488 1489 // If there is any unemitted string portion. 1490 if (stringStart != limit) { 1491 // Concatenate remaining string. 1492 if (primed) { 1493 add(ADD, stringStart, 1); 1494 } 1495 1496 add(stringType, stringStart, limit); 1497 } 1498 } 1499 1500 } 1501 1502 /** 1503 * Edit string for nested expressions. 1504 * 1505 * @param stringType Type of string literals to emit. 1506 * @param stringState State of lexer at start of string. 1507 */ editString(final TokenType stringType, final State stringState)1508 private void editString(final TokenType stringType, final State stringState) { 1509 // Use special lexer to scan string. 1510 final EditStringLexer lexer = new EditStringLexer(this, stringType, stringState); 1511 lexer.lexify(); 1512 1513 // Need to keep lexer informed. 1514 last = stringType; 1515 } 1516 1517 /** 1518 * Scan over a here string. 1519 * 1520 * @return TRUE if is a here string. 1521 */ scanHereString(final LineInfoReceiver lir, final State oldState)1522 private boolean scanHereString(final LineInfoReceiver lir, final State oldState) { 1523 assert ch0 == '<' && ch1 == '<'; 1524 if (scripting) { 1525 // Record beginning of here string. 1526 final State saved = saveState(); 1527 1528 // << or <<< 1529 final boolean excludeLastEOL = ch2 != '<'; 1530 1531 if (excludeLastEOL) { 1532 skip(2); 1533 } else { 1534 skip(3); 1535 } 1536 1537 // Scan identifier. It might be quoted, indicating that no string editing should take place. 
1538 final char quoteChar = ch0; 1539 final boolean noStringEditing = quoteChar == '"' || quoteChar == '\''; 1540 if (noStringEditing) { 1541 skip(1); 1542 } 1543 final int identStart = position; 1544 final int identLength = scanIdentifier(); 1545 if (noStringEditing) { 1546 if (ch0 != quoteChar) { 1547 error(Lexer.message("here.non.matching.delimiter"), last, position, position); 1548 restoreState(saved); 1549 return false; 1550 } 1551 skip(1); 1552 } 1553 1554 // Check for identifier. 1555 if (identLength == 0) { 1556 // Treat as shift. 1557 restoreState(saved); 1558 1559 return false; 1560 } 1561 1562 // Record rest of line. 1563 final State restState = saveState(); 1564 // keep line number updated 1565 int lastLine = line; 1566 1567 skipLine(false); 1568 lastLine++; 1569 int lastLinePosition = position; 1570 restState.setLimit(position); 1571 1572 if (oldState.position > position) { 1573 restoreState(oldState); 1574 skipLine(false); 1575 } 1576 1577 // Record beginning of string. 1578 final State stringState = saveState(); 1579 int stringEnd = position; 1580 1581 // Hunt down marker. 1582 while (!atEOF()) { 1583 // Skip any whitespace. 1584 skipWhitespace(false); 1585 1586 //handle trailing blank lines 1587 lastLinePosition = position; 1588 stringEnd = position; 1589 1590 if (hasHereMarker(identStart, identLength)) { 1591 break; 1592 } 1593 1594 skipLine(false); 1595 lastLine++; 1596 lastLinePosition = position; 1597 stringEnd = position; 1598 } 1599 1600 // notify last line information 1601 lir.lineInfo(lastLine, lastLinePosition); 1602 1603 // Record end of string. 1604 stringState.setLimit(stringEnd); 1605 1606 // If marker is missing. 1607 if (stringState.isEmpty() || atEOF()) { 1608 error(Lexer.message("here.missing.end.marker", source.getString(identStart, identLength)), last, position, position); 1609 restoreState(saved); 1610 1611 return false; 1612 } 1613 1614 // Remove last end of line if specified. 1615 if (excludeLastEOL) { 1616 // Handles \n. 
1617 if (content[stringEnd - 1] == '\n') { 1618 stringEnd--; 1619 } 1620 1621 // Handles \r and \r\n. 1622 if (content[stringEnd - 1] == '\r') { 1623 stringEnd--; 1624 } 1625 1626 // Update end of string. 1627 stringState.setLimit(stringEnd); 1628 } 1629 1630 // Edit string if appropriate. 1631 if (!noStringEditing && !stringState.isEmpty()) { 1632 editString(STRING, stringState); 1633 } else { 1634 // Add here string. 1635 add(STRING, stringState.position, stringState.limit); 1636 } 1637 1638 // Scan rest of original line. 1639 final Lexer restLexer = new Lexer(this, restState); 1640 1641 restLexer.lexify(); 1642 1643 return true; 1644 } 1645 1646 return false; 1647 } 1648 1649 /** 1650 * Breaks source content down into lex units, adding tokens to the token 1651 * stream. The routine scans until the stream buffer is full. Can be called 1652 * repeatedly until EOF is detected. 1653 */ lexify()1654 public void lexify() { 1655 while (!stream.isFull() || nested) { 1656 // Skip over whitespace. 1657 skipWhitespace(true); 1658 1659 // Detect end of file. 1660 if (atEOF()) { 1661 if (!nested) { 1662 // Add an EOF token at the end. 1663 add(EOF, position); 1664 } 1665 1666 break; 1667 } 1668 1669 // Check for comments. Note that we don't scan for regexp and other literals here as 1670 // we may not have enough context to distinguish them from similar looking operators. 1671 // Instead we break on ambiguous operators below and let the parser decide. 1672 if (ch0 == '/' && skipComments()) { 1673 continue; 1674 } 1675 1676 if (scripting && ch0 == '#' && skipComments()) { 1677 continue; 1678 } 1679 1680 // TokenType for lookup of delimiter or operator. 1681 TokenType type; 1682 1683 if (ch0 == '.' && convertDigit(ch1, 10) != -1) { 1684 // '.' followed by digit. 1685 // Scan and add a number. 
1686 scanNumber(); 1687 } else if ((type = TokenLookup.lookupOperator(ch0, ch1, ch2, ch3)) != null) { 1688 if (templateExpressionOpenBraces > 0) { 1689 if (type == LBRACE) { 1690 templateExpressionOpenBraces++; 1691 } else if (type == RBRACE) { 1692 if (--templateExpressionOpenBraces == 0) { 1693 break; 1694 } 1695 } 1696 } 1697 1698 // Get the number of characters in the token. 1699 final int typeLength = type.getLength(); 1700 // Skip that many characters. 1701 skip(typeLength); 1702 // Add operator token. 1703 add(type, position - typeLength); 1704 // Some operator tokens also mark the beginning of regexp, XML, or here string literals. 1705 // We break to let the parser decide what it is. 1706 if (canStartLiteral(type)) { 1707 break; 1708 } else if (type == LBRACE && pauseOnNextLeftBrace) { 1709 pauseOnNextLeftBrace = false; 1710 break; 1711 } 1712 } else if (Character.isJavaIdentifierStart(ch0) || ch0 == '\\' && ch1 == 'u') { 1713 // Scan and add identifier or keyword. 1714 scanIdentifierOrKeyword(); 1715 } else if (isStringDelimiter(ch0)) { 1716 // Scan and add a string. 1717 scanString(true); 1718 } else if (Character.isDigit(ch0)) { 1719 // Scan and add a number. 1720 scanNumber(); 1721 } else if (isTemplateDelimiter(ch0) && es6) { 1722 // Scan and add template in ES6 mode. 1723 scanTemplate(); 1724 } else if (isTemplateDelimiter(ch0) && scripting) { 1725 // Scan and add an exec string ('`') in scripting mode. 1726 scanString(true); 1727 } else { 1728 // Don't recognize this character. 1729 skip(1); 1730 add(ERROR, position - 1); 1731 } 1732 } 1733 } 1734 1735 /** 1736 * Return value of token given its token descriptor. 1737 * 1738 * @param token Token descriptor. 1739 * @return JavaScript value. 
1740 */ getValueOf(final long token, final boolean strict)1741 Object getValueOf(final long token, final boolean strict) { 1742 final int start = Token.descPosition(token); 1743 final int len = Token.descLength(token); 1744 1745 switch (Token.descType(token)) { 1746 case DECIMAL: 1747 return Lexer.valueOf(source.getString(start, len), 10); // number 1748 case HEXADECIMAL: 1749 return Lexer.valueOf(source.getString(start + 2, len - 2), 16); // number 1750 case OCTAL_LEGACY: 1751 return Lexer.valueOf(source.getString(start, len), 8); // number 1752 case OCTAL: 1753 return Lexer.valueOf(source.getString(start + 2, len - 2), 8); // number 1754 case BINARY_NUMBER: 1755 return Lexer.valueOf(source.getString(start + 2, len - 2), 2); // number 1756 case FLOATING: 1757 final String str = source.getString(start, len); 1758 final double value = Double.valueOf(str); 1759 if (str.indexOf('.') != -1) { 1760 return value; //number 1761 } 1762 //anything without an explicit decimal point is still subject to a 1763 //"representable as int or long" check. Then the programmer does not 1764 //explicitly code something as a double. For example new Color(int, int, int) 1765 //and new Color(float, float, float) will get ambiguous for cases like 1766 //new Color(1.0, 1.5, 1.5) if we don't respect the decimal point. 1767 //yet we don't want e.g. 
1e6 to be a double unnecessarily 1768 if (JSType.isStrictlyRepresentableAsInt(value)) { 1769 return (int)value; 1770 } 1771 return value; 1772 case STRING: 1773 return source.getString(start, len); // String 1774 case ESCSTRING: 1775 return valueOfString(start, len, strict); // String 1776 case IDENT: 1777 return valueOfIdent(start, len); // String 1778 case REGEX: 1779 return valueOfPattern(start, len); // RegexToken::LexerToken 1780 case TEMPLATE: 1781 case TEMPLATE_HEAD: 1782 case TEMPLATE_MIDDLE: 1783 case TEMPLATE_TAIL: 1784 return valueOfString(start, len, true); // String 1785 case XML: 1786 return valueOfXML(start, len); // XMLToken::LexerToken 1787 case DIRECTIVE_COMMENT: 1788 return source.getString(start, len); 1789 default: 1790 break; 1791 } 1792 1793 return null; 1794 } 1795 1796 /** 1797 * Get the raw string value of a template literal string part. 1798 * 1799 * @param token template string token 1800 * @return raw string 1801 */ valueOfRawString(final long token)1802 public String valueOfRawString(final long token) { 1803 final int start = Token.descPosition(token); 1804 final int length = Token.descLength(token); 1805 1806 // Save the current position. 1807 final int savePosition = position; 1808 // Calculate the end position. 1809 final int end = start + length; 1810 // Reset to beginning of string. 1811 reset(start); 1812 1813 // Buffer for recording characters. 1814 final StringBuilder sb = new StringBuilder(length); 1815 1816 // Scan until end of string. 1817 while (position < end) { 1818 if (ch0 == '\r') { 1819 // Convert CR-LF or CR to LF line terminator. 1820 sb.append('\n'); 1821 skip(ch1 == '\n' ? 2 : 1); 1822 } else { 1823 // Add regular character. 1824 sb.append(ch0); 1825 skip(1); 1826 } 1827 } 1828 1829 // Restore position. 
1830 reset(savePosition); 1831 1832 return sb.toString(); 1833 } 1834 1835 /** 1836 * Get the correctly localized error message for a given message id format arguments 1837 * @param msgId message id 1838 * @param args format arguments 1839 * @return message 1840 */ message(final String msgId, final String... args)1841 protected static String message(final String msgId, final String... args) { 1842 return ECMAErrors.getMessage("lexer.error." + msgId, args); 1843 } 1844 1845 /** 1846 * Generate a runtime exception 1847 * 1848 * @param message error message 1849 * @param type token type 1850 * @param start start position of lexed error 1851 * @param length length of lexed error 1852 * @throws ParserException unconditionally 1853 */ error(final String message, final TokenType type, final int start, final int length)1854 protected void error(final String message, final TokenType type, final int start, final int length) throws ParserException { 1855 final long token = Token.toDesc(type, start, length); 1856 final int pos = Token.descPosition(token); 1857 final int lineNum = source.getLine(pos); 1858 final int columnNum = source.getColumn(pos); 1859 final String formatted = ErrorManager.format(message, source, lineNum, columnNum, token); 1860 throw new ParserException(JSErrorType.SYNTAX_ERROR, formatted, source, lineNum, columnNum, token); 1861 } 1862 1863 /** 1864 * Helper class for Lexer tokens, e.g XML or RegExp tokens. 
1865 * This is the abstract superclass 1866 */ 1867 public static abstract class LexerToken implements Serializable { 1868 private static final long serialVersionUID = 1L; 1869 1870 private final String expression; 1871 1872 /** 1873 * Constructor 1874 * @param expression token expression 1875 */ LexerToken(final String expression)1876 protected LexerToken(final String expression) { 1877 this.expression = expression; 1878 } 1879 1880 /** 1881 * Get the expression 1882 * @return expression 1883 */ getExpression()1884 public String getExpression() { 1885 return expression; 1886 } 1887 } 1888 1889 /** 1890 * Temporary container for regular expressions. 1891 */ 1892 public static class RegexToken extends LexerToken { 1893 private static final long serialVersionUID = 1L; 1894 1895 /** Options. */ 1896 private final String options; 1897 1898 /** 1899 * Constructor. 1900 * 1901 * @param expression regexp expression 1902 * @param options regexp options 1903 */ RegexToken(final String expression, final String options)1904 public RegexToken(final String expression, final String options) { 1905 super(expression); 1906 this.options = options; 1907 } 1908 1909 /** 1910 * Get regexp options 1911 * @return options 1912 */ getOptions()1913 public String getOptions() { 1914 return options; 1915 } 1916 1917 @Override toString()1918 public String toString() { 1919 return '/' + getExpression() + '/' + options; 1920 } 1921 } 1922 1923 /** 1924 * Temporary container for XML expression. 1925 */ 1926 public static class XMLToken extends LexerToken { 1927 private static final long serialVersionUID = 1L; 1928 1929 /** 1930 * Constructor. 1931 * 1932 * @param expression XML expression 1933 */ XMLToken(final String expression)1934 public XMLToken(final String expression) { 1935 super(expression); 1936 } 1937 } 1938 } 1939