1# -*- coding: utf-8 -*- 2# Copyright JS Foundation and other contributors, https://js.foundation/ 3# 4# Redistribution and use in source and binary forms, with or without 5# modification, are permitted provided that the following conditions are met: 6# 7# * Redistributions of source code must retain the above copyright 8# notice, this list of conditions and the following disclaimer. 9# * Redistributions in binary form must reproduce the above copyright 10# notice, this list of conditions and the following disclaimer in the 11# documentation and/or other materials provided with the distribution. 12# 13# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY 17# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 22# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 24from __future__ import absolute_import, unicode_literals 25 26import re 27 28from .objects import Object 29from .compat import xrange, unicode, uchr, uord 30from .character import Character, HEX_CONV, OCTAL_CONV 31from .messages import Messages 32from .token import Token 33 34 35def hexValue(ch): 36 return HEX_CONV[ch] 37 38 39def octalValue(ch): 40 return OCTAL_CONV[ch] 41 42 43class RegExp(Object): 44 def __init__(self, pattern=None, flags=None): 45 self.pattern = pattern 46 self.flags = flags 47 48 49class Position(Object): 50 def __init__(self, line=None, column=None, offset=None): 51 self.line = line 52 self.column = column 53 self.offset = offset 54 55 56class SourceLocation(Object): 57 def __init__(self, start=None, end=None, source=None): 58 self.start = start 59 self.end = end 60 self.source = source 61 62 63class Comment(Object): 64 def __init__(self, multiLine=None, slice=None, range=None, loc=None): 65 self.multiLine = multiLine 66 self.slice = slice 67 self.range = range 68 self.loc = loc 69 70 71class RawToken(Object): 72 def __init__(self, type=None, value=None, pattern=None, flags=None, regex=None, octal=None, cooked=None, head=None, tail=None, lineNumber=None, lineStart=None, start=None, end=None): 73 self.type = type 74 self.value = value 75 self.pattern = pattern 76 self.flags = flags 77 self.regex = regex 78 self.octal = octal 79 self.cooked = cooked 80 self.head = head 81 self.tail = tail 82 self.lineNumber = lineNumber 83 self.lineStart = lineStart 84 self.start = start 85 self.end = end 86 87 88class ScannerState(Object): 89 def __init__(self, index=None, lineNumber=None, lineStart=None): 90 self.index = index 91 self.lineNumber = lineNumber 92 self.lineStart = lineStart 93 94 95class Octal(object): 96 def __init__(self, octal, code): 97 self.octal = octal 98 self.code = code 99 100 101class Scanner(object): 102 def __init__(self, code, handler): 103 self.source = unicode(code) + '\x00' 104 self.errorHandler = handler 105 self.trackComment = False 106 self.isModule = False 107 108 self.length = len(code) 109 self.index = 0 110 self.lineNumber = 1 if self.length > 0 else 0 111 self.lineStart = 0 112 self.curlyStack = [] 113 114 def saveState(self): 115 return ScannerState( 116 index=self.index, 117 lineNumber=self.lineNumber, 118 lineStart=self.lineStart 119 ) 120 121 def restoreState(self, state): 122 self.index = state.index 123 self.lineNumber = state.lineNumber 124 self.lineStart = state.lineStart 125 126 def eof(self): 127 return self.index >= self.length 128 129 def throwUnexpectedToken(self, message=Messages.UnexpectedTokenIllegal): 130 return self.errorHandler.throwError(self.index, self.lineNumber, 131 self.index - self.lineStart + 1, message) 132 133 def tolerateUnexpectedToken(self, message=Messages.UnexpectedTokenIllegal): 134 self.errorHandler.tolerateError(self.index, self.lineNumber, 135 self.index - self.lineStart + 1, message) 136 137 # https://tc39.github.io/ecma262/#sec-comments 138 139 def skipSingleLineComment(self, offset): 140 comments = [] 141 142 if self.trackComment: 143 start = self.index - offset 144 loc = SourceLocation( 145 start=Position( 146 line=self.lineNumber, 147 column=self.index - self.lineStart - offset 148 ), 149 end=Position() 150 ) 151 152 while not self.eof(): 153 ch = self.source[self.index] 154 self.index += 1 155 if Character.isLineTerminator(ch): 156 if self.trackComment: 157 loc.end = Position( 158 line=self.lineNumber, 159 column=self.index - self.lineStart - 1 160 ) 161 entry = Comment( 162 multiLine=False, 163 slice=[start + offset, self.index - 1], 164 range=[start, self.index - 1], 165 loc=loc 166 ) 167 comments.append(entry) 168 169 if ch == '\r' and self.source[self.index] == '\n': 170 self.index += 1 171 172 self.lineNumber += 1 173 self.lineStart = self.index 174 return comments 175 176 if self.trackComment: 177 loc.end = Position( 178 line=self.lineNumber, 179 column=self.index - self.lineStart 180 ) 181 entry = Comment( 182 multiLine=False, 183 slice=[start + offset, self.index], 184 range=[start, self.index], 185 loc=loc 186 ) 187 comments.append(entry) 188 189 return comments 190 191 def skipMultiLineComment(self): 192 comments = [] 193 194 if self.trackComment: 195 comments = [] 196 start = self.index - 2 197 loc = SourceLocation( 198 start=Position( 199 line=self.lineNumber, 200 column=self.index - self.lineStart - 2 201 ), 202 end=Position() 203 ) 204 205 while not self.eof(): 206 ch = self.source[self.index] 207 if Character.isLineTerminator(ch): 208 if ch == '\r' and self.source[self.index + 1] == '\n': 209 self.index += 1 210 211 self.lineNumber += 1 212 self.index += 1 213 self.lineStart = self.index 214 elif ch == '*': 215 # Block comment ends with '*/'. 216 if self.source[self.index + 1] == '/': 217 self.index += 2 218 if self.trackComment: 219 loc.end = Position( 220 line=self.lineNumber, 221 column=self.index - self.lineStart 222 ) 223 entry = Comment( 224 multiLine=True, 225 slice=[start + 2, self.index - 2], 226 range=[start, self.index], 227 loc=loc 228 ) 229 comments.append(entry) 230 231 return comments 232 233 self.index += 1 234 else: 235 self.index += 1 236 237 # Ran off the end of the file - the whole thing is a comment 238 if self.trackComment: 239 loc.end = Position( 240 line=self.lineNumber, 241 column=self.index - self.lineStart 242 ) 243 entry = Comment( 244 multiLine=True, 245 slice=[start + 2, self.index], 246 range=[start, self.index], 247 loc=loc 248 ) 249 comments.append(entry) 250 251 self.tolerateUnexpectedToken() 252 return comments 253 254 def scanComments(self): 255 comments = [] 256 257 start = self.index == 0 258 while not self.eof(): 259 ch = self.source[self.index] 260 261 if Character.isWhiteSpace(ch): 262 self.index += 1 263 elif Character.isLineTerminator(ch): 264 self.index += 1 265 if ch == '\r' and self.source[self.index] == '\n': 266 self.index += 1 267 268 self.lineNumber += 1 269 self.lineStart = self.index 270 start = True 271 elif ch == '/': # U+002F is '/' 272 ch = self.source[self.index + 1] 273 if ch == '/': 274 self.index += 2 275 comment = self.skipSingleLineComment(2) 276 if self.trackComment: 277 comments.extend(comment) 278 279 start = True 280 elif ch == '*': # U+002A is '*' 281 self.index += 2 282 comment = self.skipMultiLineComment() 283 if self.trackComment: 284 comments.extend(comment) 285 286 else: 287 break 288 289 elif start and ch == '-': # U+002D is '-' 290 # U+003E is '>' 291 if self.source[self.index + 1:self.index + 3] == '->': 292 # '-->' is a single-line comment 293 self.index += 3 294 comment = self.skipSingleLineComment(3) 295 if self.trackComment: 296 comments.extend(comment) 297 298 else: 299 break 300 301 elif ch == '<' and not self.isModule: # U+003C is '<' 302 if self.source[self.index + 1:self.index + 4] == '!--': 303 self.index += 4 # `<!--` 304 comment = self.skipSingleLineComment(4) 305 if self.trackComment: 306 comments.extend(comment) 307 308 else: 309 break 310 311 else: 312 break 313 314 return comments 315 316 # https://tc39.github.io/ecma262/#sec-future-reserved-words 317 318 def isFutureReservedWord(self, id): 319 return id in self.isFutureReservedWord.set 320 isFutureReservedWord.set = set(( 321 'enum', 322 'export', 323 'import', 324 'super', 325 )) 326 327 def isStrictModeReservedWord(self, id): 328 return id in self.isStrictModeReservedWord.set 329 isStrictModeReservedWord.set = set(( 330 'implements', 331 'interface', 332 'package', 333 'private', 334 'protected', 335 'public', 336 'static', 337 'yield', 338 'let', 339 )) 340 341 def isRestrictedWord(self, id): 342 return id in self.isRestrictedWord.set 343 isRestrictedWord.set = set(( 344 'eval', 'arguments', 345 )) 346 347 # https://tc39.github.io/ecma262/#sec-keywords 348 349 def isKeyword(self, id): 350 return id in self.isKeyword.set 351 isKeyword.set = set(( 352 'if', 'in', 'do', 353 354 'var', 'for', 'new', 355 'try', 'let', 356 357 'this', 'else', 'case', 358 'void', 'with', 'enum', 359 360 'while', 'break', 'catch', 361 'throw', 'const', 'yield', 362 'class', 'super', 363 364 'return', 'typeof', 'delete', 365 'switch', 'export', 'import', 366 367 'default', 'finally', 'extends', 368 369 'function', 'continue', 'debugger', 370 371 'instanceof', 372 )) 373 374 def codePointAt(self, i): 375 return uord(self.source[i:i + 2]) 376 377 def scanHexEscape(self, prefix): 378 length = 4 if prefix == 'u' else 2 379 code = 0 380 381 for i in xrange(length): 382 if not self.eof() and Character.isHexDigit(self.source[self.index]): 383 ch = self.source[self.index] 384 self.index += 1 385 code = code * 16 + hexValue(ch) 386 else: 387 return None 388 389 return uchr(code) 390 391 def scanUnicodeCodePointEscape(self): 392 ch = self.source[self.index] 393 code = 0 394 395 # At least, one hex digit is required. 396 if ch == '}': 397 self.throwUnexpectedToken() 398 399 while not self.eof(): 400 ch = self.source[self.index] 401 self.index += 1 402 if not Character.isHexDigit(ch): 403 break 404 405 code = code * 16 + hexValue(ch) 406 407 if code > 0x10FFFF or ch != '}': 408 self.throwUnexpectedToken() 409 410 return Character.fromCodePoint(code) 411 412 def getIdentifier(self): 413 start = self.index 414 self.index += 1 415 while not self.eof(): 416 ch = self.source[self.index] 417 if ch == '\\': 418 # Blackslash (U+005C) marks Unicode escape sequence. 419 self.index = start 420 return self.getComplexIdentifier() 421 else: 422 cp = ord(ch) 423 if cp >= 0xD800 and cp < 0xDFFF: 424 # Need to handle surrogate pairs. 425 self.index = start 426 return self.getComplexIdentifier() 427 428 if Character.isIdentifierPart(ch): 429 self.index += 1 430 else: 431 break 432 433 return self.source[start:self.index] 434 435 def getComplexIdentifier(self): 436 cp = self.codePointAt(self.index) 437 id = Character.fromCodePoint(cp) 438 self.index += len(id) 439 440 # '\u' (U+005C, U+0075) denotes an escaped character. 441 if cp == 0x5C: 442 if self.source[self.index] != 'u': 443 self.throwUnexpectedToken() 444 445 self.index += 1 446 if self.source[self.index] == '{': 447 self.index += 1 448 ch = self.scanUnicodeCodePointEscape() 449 else: 450 ch = self.scanHexEscape('u') 451 if not ch or ch == '\\' or not Character.isIdentifierStart(ch[0]): 452 self.throwUnexpectedToken() 453 454 id = ch 455 456 while not self.eof(): 457 cp = self.codePointAt(self.index) 458 ch = Character.fromCodePoint(cp) 459 if not Character.isIdentifierPart(ch): 460 break 461 462 id += ch 463 self.index += len(ch) 464 465 # '\u' (U+005C, U+0075) denotes an escaped character. 466 if cp == 0x5C: 467 id = id[:-1] 468 if self.source[self.index] != 'u': 469 self.throwUnexpectedToken() 470 471 self.index += 1 472 if self.source[self.index] == '{': 473 self.index += 1 474 ch = self.scanUnicodeCodePointEscape() 475 else: 476 ch = self.scanHexEscape('u') 477 if not ch or ch == '\\' or not Character.isIdentifierPart(ch[0]): 478 self.throwUnexpectedToken() 479 480 id += ch 481 482 return id 483 484 def octalToDecimal(self, ch): 485 # \0 is not octal escape sequence 486 octal = ch != '0' 487 code = octalValue(ch) 488 489 if not self.eof() and Character.isOctalDigit(self.source[self.index]): 490 octal = True 491 code = code * 8 + octalValue(self.source[self.index]) 492 self.index += 1 493 494 # 3 digits are only allowed when string starts 495 # with 0, 1, 2, 3 496 if ch in '0123' and not self.eof() and Character.isOctalDigit(self.source[self.index]): 497 code = code * 8 + octalValue(self.source[self.index]) 498 self.index += 1 499 500 return Octal(octal, code) 501 502 # https://tc39.github.io/ecma262/#sec-names-and-keywords 503 504 def scanIdentifier(self): 505 start = self.index 506 507 # Backslash (U+005C) starts an escaped character. 508 id = self.getComplexIdentifier() if self.source[start] == '\\' else self.getIdentifier() 509 510 # There is no keyword or literal with only one character. 511 # Thus, it must be an identifier. 512 if len(id) == 1: 513 type = Token.Identifier 514 elif self.isKeyword(id): 515 type = Token.Keyword 516 elif id == 'null': 517 type = Token.NullLiteral 518 elif id == 'true' or id == 'false': 519 type = Token.BooleanLiteral 520 else: 521 type = Token.Identifier 522 523 if type is not Token.Identifier and start + len(id) != self.index: 524 restore = self.index 525 self.index = start 526 self.tolerateUnexpectedToken(Messages.InvalidEscapedReservedWord) 527 self.index = restore 528 529 return RawToken( 530 type=type, 531 value=id, 532 lineNumber=self.lineNumber, 533 lineStart=self.lineStart, 534 start=start, 535 end=self.index 536 ) 537 538 # https://tc39.github.io/ecma262/#sec-punctuators 539 540 def scanPunctuator(self): 541 start = self.index 542 543 # Check for most common single-character punctuators. 544 str = self.source[self.index] 545 if str in ( 546 '(', 547 '{', 548 ): 549 if str == '{': 550 self.curlyStack.append('{') 551 552 self.index += 1 553 554 elif str == '.': 555 self.index += 1 556 if self.source[self.index] == '.' and self.source[self.index + 1] == '.': 557 # Spread operator: ... 558 self.index += 2 559 str = '...' 560 561 elif str == '}': 562 self.index += 1 563 if self.curlyStack: 564 self.curlyStack.pop() 565 566 elif str in ( 567 ')', 568 ';', 569 ',', 570 '[', 571 ']', 572 ':', 573 '?', 574 '~', 575 ): 576 self.index += 1 577 578 else: 579 # 4-character punctuator. 580 str = self.source[self.index:self.index + 4] 581 if str == '>>>=': 582 self.index += 4 583 else: 584 585 # 3-character punctuators. 586 str = str[:3] 587 if str in ( 588 '===', '!==', '>>>', 589 '<<=', '>>=', '**=' 590 ): 591 self.index += 3 592 else: 593 594 # 2-character punctuators. 595 str = str[:2] 596 if str in ( 597 '&&', '||', '==', '!=', 598 '+=', '-=', '*=', '/=', 599 '++', '--', '<<', '>>', 600 '&=', '|=', '^=', '%=', 601 '<=', '>=', '=>', '**', 602 ): 603 self.index += 2 604 else: 605 606 # 1-character punctuators. 607 str = self.source[self.index] 608 if str in '<>=!+-*%&|^/': 609 self.index += 1 610 611 if self.index == start: 612 self.throwUnexpectedToken() 613 614 return RawToken( 615 type=Token.Punctuator, 616 value=str, 617 lineNumber=self.lineNumber, 618 lineStart=self.lineStart, 619 start=start, 620 end=self.index 621 ) 622 623 # https://tc39.github.io/ecma262/#sec-literals-numeric-literals 624 625 def scanHexLiteral(self, start): 626 num = '' 627 628 while not self.eof(): 629 if not Character.isHexDigit(self.source[self.index]): 630 break 631 632 num += self.source[self.index] 633 self.index += 1 634 635 if len(num) == 0: 636 self.throwUnexpectedToken() 637 638 if Character.isIdentifierStart(self.source[self.index]): 639 self.throwUnexpectedToken() 640 641 return RawToken( 642 type=Token.NumericLiteral, 643 value=int(num, 16), 644 lineNumber=self.lineNumber, 645 lineStart=self.lineStart, 646 start=start, 647 end=self.index 648 ) 649 650 def scanBinaryLiteral(self, start): 651 num = '' 652 653 while not self.eof(): 654 ch = self.source[self.index] 655 if ch != '0' and ch != '1': 656 break 657 658 num += self.source[self.index] 659 self.index += 1 660 661 if len(num) == 0: 662 # only 0b or 0B 663 self.throwUnexpectedToken() 664 665 if not self.eof(): 666 ch = self.source[self.index] 667 if Character.isIdentifierStart(ch) or Character.isDecimalDigit(ch): 668 self.throwUnexpectedToken() 669 670 return RawToken( 671 type=Token.NumericLiteral, 672 value=int(num, 2), 673 lineNumber=self.lineNumber, 674 lineStart=self.lineStart, 675 start=start, 676 end=self.index 677 ) 678 679 def scanOctalLiteral(self, prefix, start): 680 num = '' 681 octal = False 682 683 if Character.isOctalDigit(prefix[0]): 684 octal = True 685 num = '0' + self.source[self.index] 686 self.index += 1 687 688 while not self.eof(): 689 if not Character.isOctalDigit(self.source[self.index]): 690 break 691 692 num += self.source[self.index] 693 self.index += 1 694 695 if not octal and len(num) == 0: 696 # only 0o or 0O 697 self.throwUnexpectedToken() 698 699 if Character.isIdentifierStart(self.source[self.index]) or Character.isDecimalDigit(self.source[self.index]): 700 self.throwUnexpectedToken() 701 702 return RawToken( 703 type=Token.NumericLiteral, 704 value=int(num, 8), 705 octal=octal, 706 lineNumber=self.lineNumber, 707 lineStart=self.lineStart, 708 start=start, 709 end=self.index 710 ) 711 712 def isImplicitOctalLiteral(self): 713 # Implicit octal, unless there is a non-octal digit. 714 # (Annex B.1.1 on Numeric Literals) 715 for i in xrange(self.index + 1, self.length): 716 ch = self.source[i] 717 if ch in '89': 718 return False 719 if not Character.isOctalDigit(ch): 720 return True 721 return True 722 723 def scanNumericLiteral(self): 724 start = self.index 725 ch = self.source[start] 726 assert Character.isDecimalDigit(ch) or ch == '.', 'Numeric literal must start with a decimal digit or a decimal point' 727 728 num = '' 729 if ch != '.': 730 num = self.source[self.index] 731 self.index += 1 732 ch = self.source[self.index] 733 734 # Hex number starts with '0x'. 735 # Octal number starts with '0'. 736 # Octal number in ES6 starts with '0o'. 737 # Binary number in ES6 starts with '0b'. 738 if num == '0': 739 if ch in ('x', 'X'): 740 self.index += 1 741 return self.scanHexLiteral(start) 742 743 if ch in ('b', 'B'): 744 self.index += 1 745 return self.scanBinaryLiteral(start) 746 747 if ch in ('o', 'O'): 748 return self.scanOctalLiteral(ch, start) 749 750 if ch and Character.isOctalDigit(ch): 751 if self.isImplicitOctalLiteral(): 752 return self.scanOctalLiteral(ch, start) 753 754 while Character.isDecimalDigit(self.source[self.index]): 755 num += self.source[self.index] 756 self.index += 1 757 758 ch = self.source[self.index] 759 760 if ch == '.': 761 num += self.source[self.index] 762 self.index += 1 763 while Character.isDecimalDigit(self.source[self.index]): 764 num += self.source[self.index] 765 self.index += 1 766 767 ch = self.source[self.index] 768 769 if ch in ('e', 'E'): 770 num += self.source[self.index] 771 self.index += 1 772 773 ch = self.source[self.index] 774 if ch in ('+', '-'): 775 num += self.source[self.index] 776 self.index += 1 777 778 if Character.isDecimalDigit(self.source[self.index]): 779 while Character.isDecimalDigit(self.source[self.index]): 780 num += self.source[self.index] 781 self.index += 1 782 783 else: 784 self.throwUnexpectedToken() 785 786 if Character.isIdentifierStart(self.source[self.index]): 787 self.throwUnexpectedToken() 788 789 value = float(num) 790 return RawToken( 791 type=Token.NumericLiteral, 792 value=int(value) if value.is_integer() else value, 793 lineNumber=self.lineNumber, 794 lineStart=self.lineStart, 795 start=start, 796 end=self.index 797 ) 798 799 # https://tc39.github.io/ecma262/#sec-literals-string-literals 800 801 def scanStringLiteral(self): 802 start = self.index 803 quote = self.source[start] 804 assert quote in ('\'', '"'), 'String literal must starts with a quote' 805 806 self.index += 1 807 octal = False 808 str = '' 809 810 while not self.eof(): 811 ch = self.source[self.index] 812 self.index += 1 813 814 if ch == quote: 815 quote = '' 816 break 817 elif ch == '\\': 818 ch = self.source[self.index] 819 self.index += 1 820 if not ch or not Character.isLineTerminator(ch): 821 if ch == 'u': 822 if self.source[self.index] == '{': 823 self.index += 1 824 str += self.scanUnicodeCodePointEscape() 825 else: 826 unescapedChar = self.scanHexEscape(ch) 827 if not unescapedChar: 828 self.throwUnexpectedToken() 829 830 str += unescapedChar 831 832 elif ch == 'x': 833 unescaped = self.scanHexEscape(ch) 834 if not unescaped: 835 self.throwUnexpectedToken(Messages.InvalidHexEscapeSequence) 836 837 str += unescaped 838 elif ch == 'n': 839 str += '\n' 840 elif ch == 'r': 841 str += '\r' 842 elif ch == 't': 843 str += '\t' 844 elif ch == 'b': 845 str += '\b' 846 elif ch == 'f': 847 str += '\f' 848 elif ch == 'v': 849 str += '\x0B' 850 elif ch in ( 851 '8', 852 '9', 853 ): 854 str += ch 855 self.tolerateUnexpectedToken() 856 857 else: 858 if ch and Character.isOctalDigit(ch): 859 octToDec = self.octalToDecimal(ch) 860 861 octal = octToDec.octal or octal 862 str += uchr(octToDec.code) 863 else: 864 str += ch 865 866 else: 867 self.lineNumber += 1 868 if ch == '\r' and self.source[self.index] == '\n': 869 self.index += 1 870 871 self.lineStart = self.index 872 873 elif Character.isLineTerminator(ch): 874 break 875 else: 876 str += ch 877 878 if quote != '': 879 self.index = start 880 self.throwUnexpectedToken() 881 882 return RawToken( 883 type=Token.StringLiteral, 884 value=str, 885 octal=octal, 886 lineNumber=self.lineNumber, 887 lineStart=self.lineStart, 888 start=start, 889 end=self.index 890 ) 891 892 # https://tc39.github.io/ecma262/#sec-template-literal-lexical-components 893 894 def scanTemplate(self): 895 cooked = '' 896 terminated = False 897 start = self.index 898 899 head = self.source[start] == '`' 900 tail = False 901 rawOffset = 2 902 903 self.index += 1 904 905 while not self.eof(): 906 ch = self.source[self.index] 907 self.index += 1 908 if ch == '`': 909 rawOffset = 1 910 tail = True 911 terminated = True 912 break 913 elif ch == '$': 914 if self.source[self.index] == '{': 915 self.curlyStack.append('${') 916 self.index += 1 917 terminated = True 918 break 919 920 cooked += ch 921 elif ch == '\\': 922 ch = self.source[self.index] 923 self.index += 1 924 if not Character.isLineTerminator(ch): 925 if ch == 'n': 926 cooked += '\n' 927 elif ch == 'r': 928 cooked += '\r' 929 elif ch == 't': 930 cooked += '\t' 931 elif ch == 'u': 932 if self.source[self.index] == '{': 933 self.index += 1 934 cooked += self.scanUnicodeCodePointEscape() 935 else: 936 restore = self.index 937 unescapedChar = self.scanHexEscape(ch) 938 if unescapedChar: 939 cooked += unescapedChar 940 else: 941 self.index = restore 942 cooked += ch 943 944 elif ch == 'x': 945 unescaped = self.scanHexEscape(ch) 946 if not unescaped: 947 self.throwUnexpectedToken(Messages.InvalidHexEscapeSequence) 948 949 cooked += unescaped 950 elif ch == 'b': 951 cooked += '\b' 952 elif ch == 'f': 953 cooked += '\f' 954 elif ch == 'v': 955 cooked += '\v' 956 957 else: 958 if ch == '0': 959 if Character.isDecimalDigit(self.source[self.index]): 960 # Illegal: \01 \02 and so on 961 self.throwUnexpectedToken(Messages.TemplateOctalLiteral) 962 963 cooked += '\0' 964 elif Character.isOctalDigit(ch): 965 # Illegal: \1 \2 966 self.throwUnexpectedToken(Messages.TemplateOctalLiteral) 967 else: 968 cooked += ch 969 970 else: 971 self.lineNumber += 1 972 if ch == '\r' and self.source[self.index] == '\n': 973 self.index += 1 974 975 self.lineStart = self.index 976 977 elif Character.isLineTerminator(ch): 978 self.lineNumber += 1 979 if ch == '\r' and self.source[self.index] == '\n': 980 self.index += 1 981 982 self.lineStart = self.index 983 cooked += '\n' 984 else: 985 cooked += ch 986 987 if not terminated: 988 self.throwUnexpectedToken() 989 990 if not head: 991 if self.curlyStack: 992 self.curlyStack.pop() 993 994 return RawToken( 995 type=Token.Template, 996 value=self.source[start + 1:self.index - rawOffset], 997 cooked=cooked, 998 head=head, 999 tail=tail, 1000 lineNumber=self.lineNumber, 1001 lineStart=self.lineStart, 1002 start=start, 1003 end=self.index 1004 ) 1005 1006 # https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals 1007 1008 def testRegExp(self, pattern, flags): 1009 # The BMP character to use as a replacement for astral symbols when 1010 # translating an ES6 "u"-flagged pattern to an ES5-compatible 1011 # approximation. 1012 # Note: replacing with '\uFFFF' enables false positives in unlikely 1013 # scenarios. For example, `[\u{1044f}-\u{10440}]` is an invalid 1014 # pattern that would not be detected by this substitution. 1015 astralSubstitute = '\uFFFF' 1016 1017 # Replace every Unicode escape sequence with the equivalent 1018 # BMP character or a constant ASCII code point in the case of 1019 # astral symbols. (See the above note on `astralSubstitute` 1020 # for more information.) 1021 def astralSub(m): 1022 codePoint = int(m.group(1) or m.group(2), 16) 1023 if codePoint > 0x10FFFF: 1024 self.tolerateUnexpectedToken(Messages.InvalidRegExp) 1025 elif codePoint <= 0xFFFF: 1026 return uchr(codePoint) 1027 return astralSubstitute 1028 pattern = re.sub(r'\\u\{([0-9a-fA-F]+)\}|\\u([a-fA-F0-9]{4})', astralSub, pattern) 1029 1030 # Replace each paired surrogate with a single ASCII symbol to 1031 # avoid throwing on regular expressions that are only valid in 1032 # combination with the "u" flag. 1033 pattern = re.sub(r'[\uD800-\uDBFF][\uDC00-\uDFFF]', astralSubstitute, pattern) 1034 1035 # Return a regular expression object for this pattern-flag pair, or 1036 # `null` in case the current environment doesn't support the flags it 1037 # uses. 1038 pyflags = 0 | re.M if 'm' in flags else 0 | re.I if 'i' in flags else 0 1039 try: 1040 return re.compile(pattern, pyflags) 1041 except Exception: 1042 self.tolerateUnexpectedToken(Messages.InvalidRegExp) 1043 1044 def scanRegExpBody(self): 1045 ch = self.source[self.index] 1046 assert ch == '/', 'Regular expression literal must start with a slash' 1047 1048 str = self.source[self.index] 1049 self.index += 1 1050 classMarker = False 1051 terminated = False 1052 1053 while not self.eof(): 1054 ch = self.source[self.index] 1055 self.index += 1 1056 str += ch 1057 if ch == '\\': 1058 ch = self.source[self.index] 1059 self.index += 1 1060 # https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals 1061 if Character.isLineTerminator(ch): 1062 self.throwUnexpectedToken(Messages.UnterminatedRegExp) 1063 1064 str += ch 1065 elif Character.isLineTerminator(ch): 1066 self.throwUnexpectedToken(Messages.UnterminatedRegExp) 1067 elif classMarker: 1068 if ch == ']': 1069 classMarker = False 1070 1071 else: 1072 if ch == '/': 1073 terminated = True 1074 break 1075 elif ch == '[': 1076 classMarker = True 1077 1078 if not terminated: 1079 self.throwUnexpectedToken(Messages.UnterminatedRegExp) 1080 1081 # Exclude leading and trailing slash. 1082 return str[1:-1] 1083 1084 def scanRegExpFlags(self): 1085 str = '' 1086 flags = '' 1087 while not self.eof(): 1088 ch = self.source[self.index] 1089 if not Character.isIdentifierPart(ch): 1090 break 1091 1092 self.index += 1 1093 if ch == '\\' and not self.eof(): 1094 ch = self.source[self.index] 1095 if ch == 'u': 1096 self.index += 1 1097 restore = self.index 1098 char = self.scanHexEscape('u') 1099 if char: 1100 flags += char 1101 str += '\\u' 1102 while restore < self.index: 1103 str += self.source[restore] 1104 restore += 1 1105 1106 else: 1107 self.index = restore 1108 flags += 'u' 1109 str += '\\u' 1110 1111 self.tolerateUnexpectedToken() 1112 else: 1113 str += '\\' 1114 self.tolerateUnexpectedToken() 1115 1116 else: 1117 flags += ch 1118 str += ch 1119 1120 return flags 1121 1122 def scanRegExp(self): 1123 start = self.index 1124 1125 pattern = self.scanRegExpBody() 1126 flags = self.scanRegExpFlags() 1127 value = self.testRegExp(pattern, flags) 1128 1129 return RawToken( 1130 type=Token.RegularExpression, 1131 value='', 1132 pattern=pattern, 1133 flags=flags, 1134 regex=value, 1135 lineNumber=self.lineNumber, 1136 lineStart=self.lineStart, 1137 start=start, 1138 end=self.index 1139 ) 1140 1141 def lex(self): 1142 if self.eof(): 1143 return RawToken( 1144 type=Token.EOF, 1145 value='', 1146 lineNumber=self.lineNumber, 1147 lineStart=self.lineStart, 1148 start=self.index, 1149 end=self.index 1150 ) 1151 1152 ch = self.source[self.index] 1153 1154 if Character.isIdentifierStart(ch): 1155 return self.scanIdentifier() 1156 1157 # Very common: ( and ) and ; 1158 if ch in ('(', ')', ';'): 1159 return self.scanPunctuator() 1160 1161 # String literal starts with single quote (U+0027) or double quote (U+0022). 1162 if ch in ('\'', '"'): 1163 return self.scanStringLiteral() 1164 1165 # Dot (.) U+002E can also start a floating-point number, hence the need 1166 # to check the next character. 1167 if ch == '.': 1168 if Character.isDecimalDigit(self.source[self.index + 1]): 1169 return self.scanNumericLiteral() 1170 1171 return self.scanPunctuator() 1172 1173 if Character.isDecimalDigit(ch): 1174 return self.scanNumericLiteral() 1175 1176 # Template literals start with ` (U+0060) for template head 1177 # or } (U+007D) for template middle or template tail. 1178 if ch == '`' or (ch == '}' and self.curlyStack and self.curlyStack[-1] == '${'): 1179 return self.scanTemplate() 1180 1181 # Possible identifier start in a surrogate pair. 1182 cp = ord(ch) 1183 if cp >= 0xD800 and cp < 0xDFFF: 1184 cp = self.codePointAt(self.index) 1185 ch = Character.fromCodePoint(cp) 1186 if Character.isIdentifierStart(ch): 1187 return self.scanIdentifier() 1188 1189 return self.scanPunctuator() 1190