1# -*- coding: utf-8 -*-
2# Copyright JS Foundation and other contributors, https://js.foundation/
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are met:
6#
7#   * Redistributions of source code must retain the above copyright
8#     notice, this list of conditions and the following disclaimer.
9#   * Redistributions in binary form must reproduce the above copyright
10#     notice, this list of conditions and the following disclaimer in the
11#     documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
17# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
22# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23
24from __future__ import absolute_import, unicode_literals
25
26import re
27
28from .objects import Object
29from .compat import xrange, unicode, uchr, uord
30from .character import Character, HEX_CONV, OCTAL_CONV
31from .messages import Messages
32from .token import Token
33
34
def hexValue(ch):
    """Return the ``HEX_CONV`` table entry for the character ``ch``.

    The callers visible in this module only pass characters that already
    satisfied ``Character.isHexDigit``.
    """
    # NOTE(review): presumably HEX_CONV maps a hex digit to its integer value
    # (callers accumulate the result base 16) -- confirm in the character module.
    return HEX_CONV[ch]
37
38
def octalValue(ch):
    """Return the ``OCTAL_CONV`` table entry for the character ``ch``.

    The callers visible in this module accumulate the result base 8 while
    decoding octal escape sequences.
    """
    # NOTE(review): presumably OCTAL_CONV maps an octal digit to its integer
    # value -- confirm in the character module.
    return OCTAL_CONV[ch]
41
42
class RegExp(Object):
    """Plain record pairing a ``pattern`` with its ``flags``.

    Both attributes are stored exactly as given and default to ``None``.
    """
    def __init__(self, pattern=None, flags=None):
        self.pattern = pattern
        self.flags = flags
47
48
class Position(Object):
    """Plain record holding a ``line``, a ``column`` and an ``offset``.

    All three default to ``None``; the comment-skipping code in this module
    fills in ``line`` and ``column`` only.
    """
    def __init__(self, line=None, column=None, offset=None):
        self.line = line
        self.column = column
        self.offset = offset
54
55
class SourceLocation(Object):
    """Plain record describing a span by its ``start`` and ``end``.

    The scanner in this module stores ``Position`` instances in ``start`` and
    ``end``; ``source`` is kept as given and defaults to ``None``.
    """
    def __init__(self, start=None, end=None, source=None):
        self.start = start
        self.end = end
        self.source = source
61
62
class Comment(Object):
    """Plain record describing one comment found by the scanner.

    As filled in by the comment-skipping methods of ``Scanner``:

    * ``multiLine`` -- ``True`` for block comments, ``False`` for
      single-line comments.
    * ``slice`` -- two-element list of source indices delimiting the comment
      text without its opening (and, for block comments, closing) marker.
    * ``range`` -- two-element list of source indices delimiting the whole
      comment including its markers.
    * ``loc`` -- ``SourceLocation`` with the start and end positions.
    """
    def __init__(self, multiLine=None, slice=None, range=None, loc=None):
        self.multiLine = multiLine
        self.slice = slice
        self.range = range
        self.loc = loc
69
70
class RawToken(Object):
    """Plain record for a token produced by ``Scanner``.

    Every attribute defaults to ``None`` and is stored exactly as given.
    The scanning methods visible in this module populate ``type`` (a
    ``Token`` member), ``value``, ``lineNumber``, ``lineStart``, ``start``
    and ``end`` (source indices), plus ``octal`` for string and octal
    numeric literals.
    """
    # NOTE(review): pattern/flags/regex and cooked/head/tail are presumably
    # used for regular-expression and template tokens -- confirm against the
    # scanning code for those token kinds.
    def __init__(self, type=None, value=None, pattern=None, flags=None, regex=None, octal=None, cooked=None, head=None, tail=None, lineNumber=None, lineStart=None, start=None, end=None):
        self.type = type
        self.value = value
        self.pattern = pattern
        self.flags = flags
        self.regex = regex
        self.octal = octal
        self.cooked = cooked
        self.head = head
        self.tail = tail
        self.lineNumber = lineNumber
        self.lineStart = lineStart
        self.start = start
        self.end = end
86
87
class ScannerState(Object):
    """Snapshot of a scanner cursor: ``index``, ``lineNumber``, ``lineStart``.

    Created by ``Scanner.saveState`` and consumed by
    ``Scanner.restoreState``.
    """
    def __init__(self, index=None, lineNumber=None, lineStart=None):
        self.index = index
        self.lineNumber = lineNumber
        self.lineStart = lineStart
93
94
class Octal(object):
    """Result of decoding an octal escape.

    ``octal`` tells whether the escape counts as an octal escape sequence and
    ``code`` is the decoded numeric value.
    """

    def __init__(self, octal, code):
        self.octal, self.code = octal, code
99
100
101class Scanner(object):
102    def __init__(self, code, handler):
103        self.source = unicode(code) + '\x00'
104        self.errorHandler = handler
105        self.trackComment = False
106        self.isModule = False
107
108        self.length = len(code)
109        self.index = 0
110        self.lineNumber = 1 if self.length > 0 else 0
111        self.lineStart = 0
112        self.curlyStack = []
113
114    def saveState(self):
115        return ScannerState(
116            index=self.index,
117            lineNumber=self.lineNumber,
118            lineStart=self.lineStart
119        )
120
121    def restoreState(self, state):
122        self.index = state.index
123        self.lineNumber = state.lineNumber
124        self.lineStart = state.lineStart
125
126    def eof(self):
127        return self.index >= self.length
128
129    def throwUnexpectedToken(self, message=Messages.UnexpectedTokenIllegal):
130        return self.errorHandler.throwError(self.index, self.lineNumber,
131            self.index - self.lineStart + 1, message)
132
133    def tolerateUnexpectedToken(self, message=Messages.UnexpectedTokenIllegal):
134        self.errorHandler.tolerateError(self.index, self.lineNumber,
135            self.index - self.lineStart + 1, message)
136
137    # https://tc39.github.io/ecma262/#sec-comments
138
139    def skipSingleLineComment(self, offset):
140        comments = []
141
142        if self.trackComment:
143            start = self.index - offset
144            loc = SourceLocation(
145                start=Position(
146                    line=self.lineNumber,
147                    column=self.index - self.lineStart - offset
148                ),
149                end=Position()
150            )
151
152        while not self.eof():
153            ch = self.source[self.index]
154            self.index += 1
155            if Character.isLineTerminator(ch):
156                if self.trackComment:
157                    loc.end = Position(
158                        line=self.lineNumber,
159                        column=self.index - self.lineStart - 1
160                    )
161                    entry = Comment(
162                        multiLine=False,
163                        slice=[start + offset, self.index - 1],
164                        range=[start, self.index - 1],
165                        loc=loc
166                    )
167                    comments.append(entry)
168
169                if ch == '\r' and self.source[self.index] == '\n':
170                    self.index += 1
171
172                self.lineNumber += 1
173                self.lineStart = self.index
174                return comments
175
176        if self.trackComment:
177            loc.end = Position(
178                line=self.lineNumber,
179                column=self.index - self.lineStart
180            )
181            entry = Comment(
182                multiLine=False,
183                slice=[start + offset, self.index],
184                range=[start, self.index],
185                loc=loc
186            )
187            comments.append(entry)
188
189        return comments
190
191    def skipMultiLineComment(self):
192        comments = []
193
194        if self.trackComment:
195            comments = []
196            start = self.index - 2
197            loc = SourceLocation(
198                start=Position(
199                    line=self.lineNumber,
200                    column=self.index - self.lineStart - 2
201                ),
202                end=Position()
203            )
204
205        while not self.eof():
206            ch = self.source[self.index]
207            if Character.isLineTerminator(ch):
208                if ch == '\r' and self.source[self.index + 1] == '\n':
209                    self.index += 1
210
211                self.lineNumber += 1
212                self.index += 1
213                self.lineStart = self.index
214            elif ch == '*':
215                # Block comment ends with '*/'.
216                if self.source[self.index + 1] == '/':
217                    self.index += 2
218                    if self.trackComment:
219                        loc.end = Position(
220                            line=self.lineNumber,
221                            column=self.index - self.lineStart
222                        )
223                        entry = Comment(
224                            multiLine=True,
225                            slice=[start + 2, self.index - 2],
226                            range=[start, self.index],
227                            loc=loc
228                        )
229                        comments.append(entry)
230
231                    return comments
232
233                self.index += 1
234            else:
235                self.index += 1
236
237        # Ran off the end of the file - the whole thing is a comment
238        if self.trackComment:
239            loc.end = Position(
240                line=self.lineNumber,
241                column=self.index - self.lineStart
242            )
243            entry = Comment(
244                multiLine=True,
245                slice=[start + 2, self.index],
246                range=[start, self.index],
247                loc=loc
248            )
249            comments.append(entry)
250
251        self.tolerateUnexpectedToken()
252        return comments
253
    def scanComments(self):
        """Skip white space, line terminators and comments at the cursor.

        Handles '//' and '/* */' comments, '-->' comments (only while the
        ``start`` flag below is set) and '<!--' comments (only when
        ``isModule`` is false). Scanning stops at the first character that
        begins none of these.

        Returns the list of ``Comment`` objects collected from the helper
        methods when ``trackComment`` is set, otherwise an empty list.
        """
        comments = []

        # `start` gates the '-->' form: it is true when this call begins at
        # offset 0, and becomes true after a line terminator or a '//' comment
        # has been consumed.
        start = self.index == 0
        while not self.eof():
            ch = self.source[self.index]

            if Character.isWhiteSpace(ch):
                self.index += 1
            elif Character.isLineTerminator(ch):
                self.index += 1
                # CRLF is consumed as a single line break.
                if ch == '\r' and self.source[self.index] == '\n':
                    self.index += 1

                self.lineNumber += 1
                self.lineStart = self.index
                start = True
            elif ch == '/':  # U+002F is '/'
                # Look at the character after the slash to tell the comment kind.
                ch = self.source[self.index + 1]
                if ch == '/':
                    self.index += 2
                    comment = self.skipSingleLineComment(2)
                    if self.trackComment:
                        comments.extend(comment)

                    start = True
                elif ch == '*':  # U+002A is '*'
                    self.index += 2
                    comment = self.skipMultiLineComment()
                    if self.trackComment:
                        comments.extend(comment)

                else:
                    # A lone '/' is not a comment; leave it for the caller.
                    break

            elif start and ch == '-':  # U+002D is '-'
                # U+003E is '>'
                if self.source[self.index + 1:self.index + 3] == '->':
                    # '-->' is a single-line comment
                    self.index += 3
                    comment = self.skipSingleLineComment(3)
                    if self.trackComment:
                        comments.extend(comment)

                else:
                    break

            elif ch == '<' and not self.isModule:  # U+003C is '<'
                if self.source[self.index + 1:self.index + 4] == '!--':
                    self.index += 4  # `<!--`
                    comment = self.skipSingleLineComment(4)
                    if self.trackComment:
                        comments.extend(comment)

                else:
                    break

            else:
                break

        return comments
315
316    # https://tc39.github.io/ecma262/#sec-future-reserved-words
317
318    def isFutureReservedWord(self, id):
319        return id in self.isFutureReservedWord.set
320    isFutureReservedWord.set = set((
321        'enum',
322        'export',
323        'import',
324        'super',
325    ))
326
327    def isStrictModeReservedWord(self, id):
328        return id in self.isStrictModeReservedWord.set
329    isStrictModeReservedWord.set = set((
330        'implements',
331        'interface',
332        'package',
333        'private',
334        'protected',
335        'public',
336        'static',
337        'yield',
338        'let',
339    ))
340
341    def isRestrictedWord(self, id):
342        return id in self.isRestrictedWord.set
343    isRestrictedWord.set = set((
344        'eval', 'arguments',
345    ))
346
347    # https://tc39.github.io/ecma262/#sec-keywords
348
349    def isKeyword(self, id):
350        return id in self.isKeyword.set
351    isKeyword.set = set((
352        'if', 'in', 'do',
353
354        'var', 'for', 'new',
355        'try', 'let',
356
357        'this', 'else', 'case',
358        'void', 'with', 'enum',
359
360        'while', 'break', 'catch',
361        'throw', 'const', 'yield',
362        'class', 'super',
363
364        'return', 'typeof', 'delete',
365        'switch', 'export', 'import',
366
367        'default', 'finally', 'extends',
368
369        'function', 'continue', 'debugger',
370
371        'instanceof',
372    ))
373
374    def codePointAt(self, i):
375        return uord(self.source[i:i + 2])
376
377    def scanHexEscape(self, prefix):
378        length = 4 if prefix == 'u' else 2
379        code = 0
380
381        for i in xrange(length):
382            if not self.eof() and Character.isHexDigit(self.source[self.index]):
383                ch = self.source[self.index]
384                self.index += 1
385                code = code * 16 + hexValue(ch)
386            else:
387                return None
388
389        return uchr(code)
390
391    def scanUnicodeCodePointEscape(self):
392        ch = self.source[self.index]
393        code = 0
394
395        # At least, one hex digit is required.
396        if ch == '}':
397            self.throwUnexpectedToken()
398
399        while not self.eof():
400            ch = self.source[self.index]
401            self.index += 1
402            if not Character.isHexDigit(ch):
403                break
404
405            code = code * 16 + hexValue(ch)
406
407        if code > 0x10FFFF or ch != '}':
408            self.throwUnexpectedToken()
409
410        return Character.fromCodePoint(code)
411
412    def getIdentifier(self):
413        start = self.index
414        self.index += 1
415        while not self.eof():
416            ch = self.source[self.index]
417            if ch == '\\':
418                # Blackslash (U+005C) marks Unicode escape sequence.
419                self.index = start
420                return self.getComplexIdentifier()
421            else:
422                cp = ord(ch)
423                if cp >= 0xD800 and cp < 0xDFFF:
424                    # Need to handle surrogate pairs.
425                    self.index = start
426                    return self.getComplexIdentifier()
427
428            if Character.isIdentifierPart(ch):
429                self.index += 1
430            else:
431                break
432
433        return self.source[start:self.index]
434
    def getComplexIdentifier(self):
        """Scan an identifier that may contain Unicode escapes or wide characters.

        Works one code point at a time (``codePointAt`` /
        ``Character.fromCodePoint``) and decodes both the four-hex-digit and
        the braced form of the backslash-u escape. Malformed escapes are
        reported through ``throwUnexpectedToken``. Returns the identifier
        text with the escapes replaced by the characters they denote.
        """
        # The first code point is taken unconditionally.
        cp = self.codePointAt(self.index)
        id = Character.fromCodePoint(cp)
        self.index += len(id)

        # '\u' (U+005C, U+0075) denotes an escaped character.
        if cp == 0x5C:
            if self.source[self.index] != 'u':
                self.throwUnexpectedToken()

            self.index += 1
            if self.source[self.index] == '{':
                self.index += 1
                ch = self.scanUnicodeCodePointEscape()
            else:
                ch = self.scanHexEscape('u')
                # The decoded character must be able to start an identifier
                # and must not itself be a backslash.
                if not ch or ch == '\\' or not Character.isIdentifierStart(ch[0]):
                    self.throwUnexpectedToken()

            # The identifier starts with the decoded character, not the backslash.
            id = ch

        while not self.eof():
            cp = self.codePointAt(self.index)
            ch = Character.fromCodePoint(cp)
            if not Character.isIdentifierPart(ch):
                break

            id += ch
            self.index += len(ch)

            # '\u' (U+005C, U+0075) denotes an escaped character.
            if cp == 0x5C:
                # Drop the backslash that was just appended.
                id = id[:-1]
                if self.source[self.index] != 'u':
                    self.throwUnexpectedToken()

                self.index += 1
                if self.source[self.index] == '{':
                    self.index += 1
                    ch = self.scanUnicodeCodePointEscape()
                else:
                    ch = self.scanHexEscape('u')
                    # The decoded character must be a valid identifier part
                    # and must not itself be a backslash.
                    if not ch or ch == '\\' or not Character.isIdentifierPart(ch[0]):
                        self.throwUnexpectedToken()

                id += ch

        return id
483
484    def octalToDecimal(self, ch):
485        # \0 is not octal escape sequence
486        octal = ch != '0'
487        code = octalValue(ch)
488
489        if not self.eof() and Character.isOctalDigit(self.source[self.index]):
490            octal = True
491            code = code * 8 + octalValue(self.source[self.index])
492            self.index += 1
493
494            # 3 digits are only allowed when string starts
495            # with 0, 1, 2, 3
496            if ch in '0123' and not self.eof() and Character.isOctalDigit(self.source[self.index]):
497                code = code * 8 + octalValue(self.source[self.index])
498                self.index += 1
499
500        return Octal(octal, code)
501
502    # https://tc39.github.io/ecma262/#sec-names-and-keywords
503
504    def scanIdentifier(self):
505        start = self.index
506
507        # Backslash (U+005C) starts an escaped character.
508        id = self.getComplexIdentifier() if self.source[start] == '\\' else self.getIdentifier()
509
510        # There is no keyword or literal with only one character.
511        # Thus, it must be an identifier.
512        if len(id) == 1:
513            type = Token.Identifier
514        elif self.isKeyword(id):
515            type = Token.Keyword
516        elif id == 'null':
517            type = Token.NullLiteral
518        elif id == 'true' or id == 'false':
519            type = Token.BooleanLiteral
520        else:
521            type = Token.Identifier
522
523        if type is not Token.Identifier and start + len(id) != self.index:
524            restore = self.index
525            self.index = start
526            self.tolerateUnexpectedToken(Messages.InvalidEscapedReservedWord)
527            self.index = restore
528
529        return RawToken(
530            type=type,
531            value=id,
532            lineNumber=self.lineNumber,
533            lineStart=self.lineStart,
534            start=start,
535            end=self.index
536        )
537
538    # https://tc39.github.io/ecma262/#sec-punctuators
539
540    def scanPunctuator(self):
541        start = self.index
542
543        # Check for most common single-character punctuators.
544        str = self.source[self.index]
545        if str in (
546            '(',
547            '{',
548        ):
549            if str == '{':
550                self.curlyStack.append('{')
551
552            self.index += 1
553
554        elif str == '.':
555            self.index += 1
556            if self.source[self.index] == '.' and self.source[self.index + 1] == '.':
557                # Spread operator: ...
558                self.index += 2
559                str = '...'
560
561        elif str == '}':
562            self.index += 1
563            if self.curlyStack:
564                self.curlyStack.pop()
565
566        elif str in (
567            ')',
568            ';',
569            ',',
570            '[',
571            ']',
572            ':',
573            '?',
574            '~',
575        ):
576            self.index += 1
577
578        else:
579            # 4-character punctuator.
580            str = self.source[self.index:self.index + 4]
581            if str == '>>>=':
582                self.index += 4
583            else:
584
585                # 3-character punctuators.
586                str = str[:3]
587                if str in (
588                    '===', '!==', '>>>',
589                    '<<=', '>>=', '**='
590                ):
591                    self.index += 3
592                else:
593
594                    # 2-character punctuators.
595                    str = str[:2]
596                    if str in (
597                        '&&', '||', '==', '!=',
598                        '+=', '-=', '*=', '/=',
599                        '++', '--', '<<', '>>',
600                        '&=', '|=', '^=', '%=',
601                        '<=', '>=', '=>', '**',
602                    ):
603                        self.index += 2
604                    else:
605
606                        # 1-character punctuators.
607                        str = self.source[self.index]
608                        if str in '<>=!+-*%&|^/':
609                            self.index += 1
610
611        if self.index == start:
612            self.throwUnexpectedToken()
613
614        return RawToken(
615            type=Token.Punctuator,
616            value=str,
617            lineNumber=self.lineNumber,
618            lineStart=self.lineStart,
619            start=start,
620            end=self.index
621        )
622
623    # https://tc39.github.io/ecma262/#sec-literals-numeric-literals
624
625    def scanHexLiteral(self, start):
626        num = ''
627
628        while not self.eof():
629            if not Character.isHexDigit(self.source[self.index]):
630                break
631
632            num += self.source[self.index]
633            self.index += 1
634
635        if len(num) == 0:
636            self.throwUnexpectedToken()
637
638        if Character.isIdentifierStart(self.source[self.index]):
639            self.throwUnexpectedToken()
640
641        return RawToken(
642            type=Token.NumericLiteral,
643            value=int(num, 16),
644            lineNumber=self.lineNumber,
645            lineStart=self.lineStart,
646            start=start,
647            end=self.index
648        )
649
650    def scanBinaryLiteral(self, start):
651        num = ''
652
653        while not self.eof():
654            ch = self.source[self.index]
655            if ch != '0' and ch != '1':
656                break
657
658            num += self.source[self.index]
659            self.index += 1
660
661        if len(num) == 0:
662            # only 0b or 0B
663            self.throwUnexpectedToken()
664
665        if not self.eof():
666            ch = self.source[self.index]
667            if Character.isIdentifierStart(ch) or Character.isDecimalDigit(ch):
668                self.throwUnexpectedToken()
669
670        return RawToken(
671            type=Token.NumericLiteral,
672            value=int(num, 2),
673            lineNumber=self.lineNumber,
674            lineStart=self.lineStart,
675            start=start,
676            end=self.index
677        )
678
679    def scanOctalLiteral(self, prefix, start):
680        num = ''
681        octal = False
682
683        if Character.isOctalDigit(prefix[0]):
684            octal = True
685            num = '0' + self.source[self.index]
686        self.index += 1
687
688        while not self.eof():
689            if not Character.isOctalDigit(self.source[self.index]):
690                break
691
692            num += self.source[self.index]
693            self.index += 1
694
695        if not octal and len(num) == 0:
696            # only 0o or 0O
697            self.throwUnexpectedToken()
698
699        if Character.isIdentifierStart(self.source[self.index]) or Character.isDecimalDigit(self.source[self.index]):
700            self.throwUnexpectedToken()
701
702        return RawToken(
703            type=Token.NumericLiteral,
704            value=int(num, 8),
705            octal=octal,
706            lineNumber=self.lineNumber,
707            lineStart=self.lineStart,
708            start=start,
709            end=self.index
710        )
711
712    def isImplicitOctalLiteral(self):
713        # Implicit octal, unless there is a non-octal digit.
714        # (Annex B.1.1 on Numeric Literals)
715        for i in xrange(self.index + 1, self.length):
716            ch = self.source[i]
717            if ch in '89':
718                return False
719            if not Character.isOctalDigit(ch):
720                return True
721        return True
722
    def scanNumericLiteral(self):
        """Scan a numeric literal starting at a decimal digit or a '.'.

        A leading '0' followed by x/X, b/B or o/O, or by octal digits that
        form an implicit octal literal, is handed to the dedicated hex,
        binary or octal scanner. Everything else is read as a decimal number
        with optional fraction and exponent.

        Returns a ``RawToken`` of type ``Token.NumericLiteral``; for the
        decimal path the value is an ``int`` when the parsed float is
        integral and a ``float`` otherwise. Malformed exponents and an
        identifier-start character directly after the number are reported
        through ``throwUnexpectedToken``.
        """
        start = self.index
        ch = self.source[start]
        assert Character.isDecimalDigit(ch) or ch == '.', 'Numeric literal must start with a decimal digit or a decimal point'

        num = ''
        if ch != '.':
            # Consume the first digit and look at what follows it.
            num = self.source[self.index]
            self.index += 1
            ch = self.source[self.index]

            # Hex number starts with '0x'.
            # Octal number starts with '0'.
            # Octal number in ES6 starts with '0o'.
            # Binary number in ES6 starts with '0b'.
            if num == '0':
                if ch in ('x', 'X'):
                    self.index += 1
                    return self.scanHexLiteral(start)

                if ch in ('b', 'B'):
                    self.index += 1
                    return self.scanBinaryLiteral(start)

                if ch in ('o', 'O'):
                    # The cursor is left on the 'o'; scanOctalLiteral skips it.
                    return self.scanOctalLiteral(ch, start)

                if ch and Character.isOctalDigit(ch):
                    # A leading zero followed by an 8 or 9 further on falls
                    # through to the decimal path below.
                    if self.isImplicitOctalLiteral():
                        return self.scanOctalLiteral(ch, start)

            # Remaining digits of the integer part.
            while Character.isDecimalDigit(self.source[self.index]):
                num += self.source[self.index]
                self.index += 1

            ch = self.source[self.index]

        if ch == '.':
            # Fraction part.
            num += self.source[self.index]
            self.index += 1
            while Character.isDecimalDigit(self.source[self.index]):
                num += self.source[self.index]
                self.index += 1

            ch = self.source[self.index]

        if ch in ('e', 'E'):
            # Exponent part: optional sign, then at least one digit.
            num += self.source[self.index]
            self.index += 1

            ch = self.source[self.index]
            if ch in ('+', '-'):
                num += self.source[self.index]
                self.index += 1

            if Character.isDecimalDigit(self.source[self.index]):
                while Character.isDecimalDigit(self.source[self.index]):
                    num += self.source[self.index]
                    self.index += 1

            else:
                self.throwUnexpectedToken()

        # A number must not run straight into an identifier.
        if Character.isIdentifierStart(self.source[self.index]):
            self.throwUnexpectedToken()

        value = float(num)
        return RawToken(
            type=Token.NumericLiteral,
            # Integral values are reported as int, everything else as float.
            value=int(value) if value.is_integer() else value,
            lineNumber=self.lineNumber,
            lineStart=self.lineStart,
            start=start,
            end=self.index
        )
798
799    # https://tc39.github.io/ecma262/#sec-literals-string-literals
800
    def scanStringLiteral(self):
        """Scan a single- or double-quoted string literal at the cursor.

        Escape sequences are decoded into the token value; a backslash
        followed by a line terminator is a line continuation and adds
        nothing to the value. An unescaped line terminator or the end of the
        input before the closing quote leaves the literal unterminated, which
        is reported through ``throwUnexpectedToken`` with the cursor moved
        back to the opening quote.

        Returns a ``RawToken`` of type ``Token.StringLiteral`` whose
        ``octal`` flag is set when an octal escape sequence was decoded.
        """
        start = self.index
        quote = self.source[start]
        assert quote in ('\'', '"'), 'String literal must starts with a quote'

        self.index += 1
        octal = False
        str = ''

        while not self.eof():
            ch = self.source[self.index]
            self.index += 1

            if ch == quote:
                # Closing quote found; clearing `quote` marks the literal as
                # terminated for the check after the loop.
                quote = ''
                break
            elif ch == '\\':
                ch = self.source[self.index]
                self.index += 1
                if not ch or not Character.isLineTerminator(ch):
                    if ch == 'u':
                        # Unicode escape: braced code point or four hex digits.
                        if self.source[self.index] == '{':
                            self.index += 1
                            str += self.scanUnicodeCodePointEscape()
                        else:
                            unescapedChar = self.scanHexEscape(ch)
                            if not unescapedChar:
                                self.throwUnexpectedToken()

                            str += unescapedChar

                    elif ch == 'x':
                        # Two-digit hex escape.
                        unescaped = self.scanHexEscape(ch)
                        if not unescaped:
                            self.throwUnexpectedToken(Messages.InvalidHexEscapeSequence)

                        str += unescaped
                    elif ch == 'n':
                        str += '\n'
                    elif ch == 'r':
                        str += '\r'
                    elif ch == 't':
                        str += '\t'
                    elif ch == 'b':
                        str += '\b'
                    elif ch == 'f':
                        str += '\f'
                    elif ch == 'v':
                        str += '\x0B'
                    elif ch in (
                        '8',
                        '9',
                    ):
                        # An escaped 8 or 9 is kept as the digit itself but
                        # reported as a tolerable error.
                        str += ch
                        self.tolerateUnexpectedToken()

                    else:
                        if ch and Character.isOctalDigit(ch):
                            octToDec = self.octalToDecimal(ch)

                            # Remember whether any octal escape was seen.
                            octal = octToDec.octal or octal
                            str += uchr(octToDec.code)
                        else:
                            # Any other escaped character stands for itself.
                            str += ch

                else:
                    # Line continuation: count the new line (CRLF as one) and
                    # add nothing to the value.
                    self.lineNumber += 1
                    if ch == '\r' and self.source[self.index] == '\n':
                        self.index += 1

                    self.lineStart = self.index

            elif Character.isLineTerminator(ch):
                # An unescaped line terminator ends the scan; `quote` is
                # still set, so the literal is treated as unterminated.
                break
            else:
                str += ch

        if quote != '':
            # Unterminated literal: report it at the opening quote.
            self.index = start
            self.throwUnexpectedToken()

        return RawToken(
            type=Token.StringLiteral,
            value=str,
            octal=octal,
            lineNumber=self.lineNumber,
            lineStart=self.lineStart,
            start=start,
            end=self.index
        )
891
892    # https://tc39.github.io/ecma262/#sec-template-literal-lexical-components
893
894    def scanTemplate(self):
895        cooked = ''
896        terminated = False
897        start = self.index
898
899        head = self.source[start] == '`'
900        tail = False
901        rawOffset = 2
902
903        self.index += 1
904
905        while not self.eof():
906            ch = self.source[self.index]
907            self.index += 1
908            if ch == '`':
909                rawOffset = 1
910                tail = True
911                terminated = True
912                break
913            elif ch == '$':
914                if self.source[self.index] == '{':
915                    self.curlyStack.append('${')
916                    self.index += 1
917                    terminated = True
918                    break
919
920                cooked += ch
921            elif ch == '\\':
922                ch = self.source[self.index]
923                self.index += 1
924                if not Character.isLineTerminator(ch):
925                    if ch == 'n':
926                        cooked += '\n'
927                    elif ch == 'r':
928                        cooked += '\r'
929                    elif ch == 't':
930                        cooked += '\t'
931                    elif ch == 'u':
932                        if self.source[self.index] == '{':
933                            self.index += 1
934                            cooked += self.scanUnicodeCodePointEscape()
935                        else:
936                            restore = self.index
937                            unescapedChar = self.scanHexEscape(ch)
938                            if unescapedChar:
939                                cooked += unescapedChar
940                            else:
941                                self.index = restore
942                                cooked += ch
943
944                    elif ch == 'x':
945                        unescaped = self.scanHexEscape(ch)
946                        if not unescaped:
947                            self.throwUnexpectedToken(Messages.InvalidHexEscapeSequence)
948
949                        cooked += unescaped
950                    elif ch == 'b':
951                        cooked += '\b'
952                    elif ch == 'f':
953                        cooked += '\f'
954                    elif ch == 'v':
955                        cooked += '\v'
956
957                    else:
958                        if ch == '0':
959                            if Character.isDecimalDigit(self.source[self.index]):
960                                # Illegal: \01 \02 and so on
961                                self.throwUnexpectedToken(Messages.TemplateOctalLiteral)
962
963                            cooked += '\0'
964                        elif Character.isOctalDigit(ch):
965                            # Illegal: \1 \2
966                            self.throwUnexpectedToken(Messages.TemplateOctalLiteral)
967                        else:
968                            cooked += ch
969
970                else:
971                    self.lineNumber += 1
972                    if ch == '\r' and self.source[self.index] == '\n':
973                        self.index += 1
974
975                    self.lineStart = self.index
976
977            elif Character.isLineTerminator(ch):
978                self.lineNumber += 1
979                if ch == '\r' and self.source[self.index] == '\n':
980                    self.index += 1
981
982                self.lineStart = self.index
983                cooked += '\n'
984            else:
985                cooked += ch
986
987        if not terminated:
988            self.throwUnexpectedToken()
989
990        if not head:
991            if self.curlyStack:
992                self.curlyStack.pop()
993
994        return RawToken(
995            type=Token.Template,
996            value=self.source[start + 1:self.index - rawOffset],
997            cooked=cooked,
998            head=head,
999            tail=tail,
1000            lineNumber=self.lineNumber,
1001            lineStart=self.lineStart,
1002            start=start,
1003            end=self.index
1004        )
1005
1006    # https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals
1007
1008    def testRegExp(self, pattern, flags):
1009        # The BMP character to use as a replacement for astral symbols when
1010        # translating an ES6 "u"-flagged pattern to an ES5-compatible
1011        # approximation.
1012        # Note: replacing with '\uFFFF' enables false positives in unlikely
1013        # scenarios. For example, `[\u{1044f}-\u{10440}]` is an invalid
1014        # pattern that would not be detected by this substitution.
1015        astralSubstitute = '\uFFFF'
1016
1017        # Replace every Unicode escape sequence with the equivalent
1018        # BMP character or a constant ASCII code point in the case of
1019        # astral symbols. (See the above note on `astralSubstitute`
1020        # for more information.)
1021        def astralSub(m):
1022            codePoint = int(m.group(1) or m.group(2), 16)
1023            if codePoint > 0x10FFFF:
1024                self.tolerateUnexpectedToken(Messages.InvalidRegExp)
1025            elif codePoint <= 0xFFFF:
1026                return uchr(codePoint)
1027            return astralSubstitute
1028        pattern = re.sub(r'\\u\{([0-9a-fA-F]+)\}|\\u([a-fA-F0-9]{4})', astralSub, pattern)
1029
1030        # Replace each paired surrogate with a single ASCII symbol to
1031        # avoid throwing on regular expressions that are only valid in
1032        # combination with the "u" flag.
1033        pattern = re.sub(r'[\uD800-\uDBFF][\uDC00-\uDFFF]', astralSubstitute, pattern)
1034
1035        # Return a regular expression object for this pattern-flag pair, or
1036        # `null` in case the current environment doesn't support the flags it
1037        # uses.
1038        pyflags = 0 | re.M if 'm' in flags else 0 | re.I if 'i' in flags else 0
1039        try:
1040            return re.compile(pattern, pyflags)
1041        except Exception:
1042            self.tolerateUnexpectedToken(Messages.InvalidRegExp)
1043
1044    def scanRegExpBody(self):
1045        ch = self.source[self.index]
1046        assert ch == '/', 'Regular expression literal must start with a slash'
1047
1048        str = self.source[self.index]
1049        self.index += 1
1050        classMarker = False
1051        terminated = False
1052
1053        while not self.eof():
1054            ch = self.source[self.index]
1055            self.index += 1
1056            str += ch
1057            if ch == '\\':
1058                ch = self.source[self.index]
1059                self.index += 1
1060                # https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals
1061                if Character.isLineTerminator(ch):
1062                    self.throwUnexpectedToken(Messages.UnterminatedRegExp)
1063
1064                str += ch
1065            elif Character.isLineTerminator(ch):
1066                self.throwUnexpectedToken(Messages.UnterminatedRegExp)
1067            elif classMarker:
1068                if ch == ']':
1069                    classMarker = False
1070
1071            else:
1072                if ch == '/':
1073                    terminated = True
1074                    break
1075                elif ch == '[':
1076                    classMarker = True
1077
1078        if not terminated:
1079            self.throwUnexpectedToken(Messages.UnterminatedRegExp)
1080
1081        # Exclude leading and trailing slash.
1082        return str[1:-1]
1083
1084    def scanRegExpFlags(self):
1085        str = ''
1086        flags = ''
1087        while not self.eof():
1088            ch = self.source[self.index]
1089            if not Character.isIdentifierPart(ch):
1090                break
1091
1092            self.index += 1
1093            if ch == '\\' and not self.eof():
1094                ch = self.source[self.index]
1095                if ch == 'u':
1096                    self.index += 1
1097                    restore = self.index
1098                    char = self.scanHexEscape('u')
1099                    if char:
1100                        flags += char
1101                        str += '\\u'
1102                        while restore < self.index:
1103                            str += self.source[restore]
1104                            restore += 1
1105
1106                    else:
1107                        self.index = restore
1108                        flags += 'u'
1109                        str += '\\u'
1110
1111                    self.tolerateUnexpectedToken()
1112                else:
1113                    str += '\\'
1114                    self.tolerateUnexpectedToken()
1115
1116            else:
1117                flags += ch
1118                str += ch
1119
1120        return flags
1121
1122    def scanRegExp(self):
1123        start = self.index
1124
1125        pattern = self.scanRegExpBody()
1126        flags = self.scanRegExpFlags()
1127        value = self.testRegExp(pattern, flags)
1128
1129        return RawToken(
1130            type=Token.RegularExpression,
1131            value='',
1132            pattern=pattern,
1133            flags=flags,
1134            regex=value,
1135            lineNumber=self.lineNumber,
1136            lineStart=self.lineStart,
1137            start=start,
1138            end=self.index
1139        )
1140
1141    def lex(self):
1142        if self.eof():
1143            return RawToken(
1144                type=Token.EOF,
1145                value='',
1146                lineNumber=self.lineNumber,
1147                lineStart=self.lineStart,
1148                start=self.index,
1149                end=self.index
1150            )
1151
1152        ch = self.source[self.index]
1153
1154        if Character.isIdentifierStart(ch):
1155            return self.scanIdentifier()
1156
1157        # Very common: ( and ) and ;
1158        if ch in ('(', ')', ';'):
1159            return self.scanPunctuator()
1160
1161        # String literal starts with single quote (U+0027) or double quote (U+0022).
1162        if ch in ('\'', '"'):
1163            return self.scanStringLiteral()
1164
1165        # Dot (.) U+002E can also start a floating-point number, hence the need
1166        # to check the next character.
1167        if ch == '.':
1168            if Character.isDecimalDigit(self.source[self.index + 1]):
1169                return self.scanNumericLiteral()
1170
1171            return self.scanPunctuator()
1172
1173        if Character.isDecimalDigit(ch):
1174            return self.scanNumericLiteral()
1175
1176        # Template literals start with ` (U+0060) for template head
1177        # or } (U+007D) for template middle or template tail.
1178        if ch == '`' or (ch == '}' and self.curlyStack and self.curlyStack[-1] == '${'):
1179            return self.scanTemplate()
1180
1181        # Possible identifier start in a surrogate pair.
1182        cp = ord(ch)
1183        if cp >= 0xD800 and cp < 0xDFFF:
1184            cp = self.codePointAt(self.index)
1185            ch = Character.fromCodePoint(cp)
1186            if Character.isIdentifierStart(ch):
1187                return self.scanIdentifier()
1188
1189        return self.scanPunctuator()
1190