esprima/esprima/tokenizer.py

# -*- coding: utf-8 -*-
# Copyright JS Foundation and other contributors, https://js.foundation/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES
# LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import absolute_import, unicode_literals

from collections import deque

from .objects import Object
from .error_handler import ErrorHandler
from .scanner import Scanner, SourceLocation, Position, RegExp
from .token import Token, TokenName


class BufferEntry(Object):
    """One item queued for output by the tokenizer.

    Attributes:
        type: Name of the entry kind (a token type name, or
            ``'BlockComment'`` / ``'LineComment'`` for comments).
        value: Source text covered by the entry.
        regex: Regular-expression details, or None.
        range: Range information for the entry, or None.
        loc: Location information for the entry, or None.
    """

    def __init__(self, type, value, regex=None, range=None, loc=None):
        # Mandatory fields first ...
        self.type, self.value = type, value
        # ... then the optional annotations, which default to None.
        self.regex, self.range, self.loc = regex, range, loc


class Reader(object):
    """Remembers previously seen tokens to disambiguate a forward slash.

    Attributes:
        values: One entry per appended token: the token's value for
            punctuators and keywords, None for every other token type.
        curly: Index in ``values`` of the most recently appended ``{``,
            or -1 when none has been appended yet.
        paren: Index in ``values`` of the most recently appended ``(``,
            or -1 when none has been appended yet.
    """

    def __init__(self):
        self.values = []
        self.curly = self.paren = -1

    # A function following one of those tokens is an expression.
    def beforeFunctionExpression(self, t):
        """Return True when ``t`` is one of the listed tokens."""
        return t in (
            '(', '{', '[', 'in', 'typeof', 'instanceof', 'new',
            'return', 'case', 'delete', 'throw', 'void',
            # assignment operators
            '=', '+=', '-=', '*=', '**=', '/=', '%=', '<<=', '>>=', '>>>=',
            '&=', '|=', '^=', ',',
            # binary/unary operators
            '+', '-', '*', '**', '/', '%', '++', '--', '<<', '>>', '>>>', '&',
            '|', '^', '!', '~', '&&', '||', '?', ':', '===', '==', '>=',
            '<=', '<', '>', '!=', '!=='
        )

    def _valueAt(self, index):
        """Return ``self.values[index]``, or None if ``index`` is out of range.

        Negative indices count as out of range. The lookups in
        ``isRegexStart`` subtract small offsets from ``paren`` / ``curly``
        (which start at -1), so plain list indexing would either wrap
        around to the end of the list or raise IndexError.
        """
        if 0 <= index < len(self.values):
            return self.values[index]
        return None

    # Determine if forward slash (/) is an operator or part of a regular expression
    # https://github.com/mozilla/sweet.js/wiki/design
    def isRegexStart(self):
        """Return True if a ``/`` at the current point may start a regex.

        The decision is based only on the recorded token history:
        with no history the answer is True; after a non-punctuator,
        non-keyword token (recorded as None) it is False; the tokens
        ``this``, ``]``, ``)`` and ``}`` get the special handling below;
        any other punctuator or keyword yields True.
        """
        if not self.values:
            return True

        previous = self.values[-1]
        regex = previous is not None

        if previous in (
            'this',
            ']',
        ):
            regex = False
        elif previous == ')':
            # Look at the token just before the last recorded '('.
            keyword = self._valueAt(self.paren - 1)
            regex = keyword in ('if', 'while', 'for', 'with')

        elif previous == '}':
            # Dividing a function by anything makes little sense,
            # but we have to check for that.
            regex = True
            if self._valueAt(self.curly - 3) == 'function':
                # Anonymous function, e.g. function(){} /42
                check = self._valueAt(self.curly - 4)
                regex = not self.beforeFunctionExpression(check) if check else False
            elif self._valueAt(self.curly - 4) == 'function':
                # Named function, e.g. function f(){} /42/
                check = self._valueAt(self.curly - 5)
                regex = not self.beforeFunctionExpression(check) if check else True

        return regex

    def append(self, token):
        """Record ``token`` in the history.

        Punctuators and keywords are stored by value, and the positions of
        ``{`` and ``(`` are remembered; any other token is stored as None.
        """
        if token.type in (Token.Punctuator, Token.Keyword):
            if token.value == '{':
                self.curly = len(self.values)
            elif token.value == '(':
                self.paren = len(self.values)
            self.values.append(token.value)
        else:
            self.values.append(None)


class Config(Object):
    """Tokenizer settings.

    The four named settings default to None; any additional keyword
    arguments are stored as attributes under their own names.
    """

    def __init__(self, tolerant=None, comment=None, range=None, loc=None, **options):
        self.tolerant, self.comment = tolerant, comment
        self.range, self.loc = range, loc
        # Extra options are attached last, so an extra key always ends up
        # set on the instance after the four named settings.
        for name in options:
            setattr(self, name, options[name])


class Tokenizer(object):
    """Produces buffer entries (tokens and, optionally, comments) one at a time.

    ``options`` is a mapping expanded into ``Config``; its ``tolerant``,
    ``comment``, ``range`` and ``loc`` settings control the error handler,
    comment tracking, and whether entries carry range / location data.
    """

    def __init__(self, code, options):
        settings = Config(**options)

        handler = ErrorHandler()
        handler.tolerant = settings.tolerant

        scanner = Scanner(code, handler)
        scanner.trackComment = settings.comment

        self.config = settings
        self.errorHandler = handler
        self.scanner = scanner

        self.trackRange = settings.range
        self.trackLoc = settings.loc
        self.buffer = deque()
        self.reader = Reader()

    def errors(self):
        """Return the errors collected by the error handler."""
        return self.errorHandler.errors

    def _currentPosition(self):
        # Line / column of the scanner's current index.
        scanner = self.scanner
        return Position(
            line=scanner.lineNumber,
            column=scanner.index - scanner.lineStart
        )

    def _bufferComments(self):
        # Always run the comment scan; queue the results only when
        # comment tracking is enabled.
        scanner = self.scanner
        found = scanner.scanComments()
        if not scanner.trackComment:
            return
        for item in found:
            begin, stop = item.slice[0], item.slice[1]
            self.buffer.append(BufferEntry(
                type='BlockComment' if item.multiLine else 'LineComment',
                value=scanner.source[begin:stop],
                range=item.range if self.trackRange else None,
                loc=item.loc if self.trackLoc else None,
            ))

    def _scanEntry(self):
        # Lex one token at the scanner's current index and wrap it in a
        # BufferEntry. Must only be called when the scanner is not at EOF.
        scanner = self.scanner

        if self.trackLoc:
            # The end position is a placeholder until the token is scanned.
            location = SourceLocation(
                start=self._currentPosition(),
                end=Position(),
            )

        if not (scanner.source[scanner.index] == '/' and self.reader.isRegexStart()):
            token = scanner.lex()
        else:
            checkpoint = scanner.saveState()
            try:
                token = scanner.scanRegExp()
            except Exception:
                # Not scannable as a regex: rewind and lex it normally.
                scanner.restoreState(checkpoint)
                token = scanner.lex()

        self.reader.append(token)

        entry = BufferEntry(
            type=TokenName[token.type],
            value=scanner.source[token.start:token.end]
        )
        if self.trackRange:
            entry.range = [token.start, token.end]
        if self.trackLoc:
            location.end = self._currentPosition()
            entry.loc = location
        if token.type is Token.RegularExpression:
            entry.regex = RegExp(
                pattern=token.pattern,
                flags=token.flags,
            )
        return entry

    def getNextToken(self):
        """Return the next buffered entry, or None when nothing is left.

        When the buffer is empty, comments are scanned first (and queued if
        comment tracking is on), then one token is lexed unless the scanner
        is at end of input.
        """
        pending = self.buffer
        if not pending:
            self._bufferComments()
            if not self.scanner.eof():
                pending.append(self._scanEntry())

        if pending:
            return pending.popleft()
        return None