# Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import itertools import re from dataclasses import dataclass from io import BytesIO from tokenize import detect_encoding as py_tokenize_detect_encoding from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Union from libcst._nodes.whitespace import NEWLINE_RE from libcst._parser.parso.python.token import PythonTokenTypes, TokenType from libcst._parser.parso.utils import split_lines from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig from libcst._parser.types.token import Token from libcst._parser.wrapped_tokenize import tokenize_lines _INDENT: TokenType = PythonTokenTypes.INDENT _NAME: TokenType = PythonTokenTypes.NAME _NEWLINE: TokenType = PythonTokenTypes.NEWLINE _STRING: TokenType = PythonTokenTypes.STRING _FALLBACK_DEFAULT_NEWLINE = "\n" _FALLBACK_DEFAULT_INDENT = " " _CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE) @dataclass(frozen=True) class ConfigDetectionResult: # The config is a set of constant values used by the parser. config: ParserConfig # The tokens iterator is mutated by the parser. tokens: Iterator[Token] def _detect_encoding(source: Union[str, bytes]) -> str: """ Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as specified in PEP 263. If given a string (instead of bytes) the encoding is assumed to be utf-8. """ if isinstance(source, str): return "utf-8" return py_tokenize_detect_encoding(BytesIO(source).readline)[0] def _detect_default_newline(source_str: str) -> str: """ Finds the first newline, and uses that value as the default newline. """ # Don't use `NEWLINE_RE` for this, because it might match multiple newlines as a # single newline. match = NEWLINE_RE.search(source_str) return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE def _detect_indent(tokens: Iterable[Token]) -> str: """ Finds the first INDENT token, and uses that as the value of the default indent. """ try: first_indent = next(t for t in tokens if t.type is _INDENT) except StopIteration: return _FALLBACK_DEFAULT_INDENT first_indent_str = first_indent.relative_indent assert first_indent_str is not None, "INDENT tokens must contain a relative_indent" return first_indent_str def _detect_trailing_newline(source_str: str) -> bool: if len(source_str) == 0 or not NEWLINE_RE.fullmatch(source_str[-1]): return False # Make sure that the last newline wasn't following a continuation return not ( _CONTINUATION_RE.fullmatch(source_str[-2:]) or _CONTINUATION_RE.fullmatch(source_str[-3:]) ) def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]: """ Finds __future__ imports in their proper locations. See `https://www.python.org/dev/peps/pep-0236/`_ """ future_imports: Set[str] = set() state = 0 for tok in tokens: if state == 0 and tok.type in (_STRING, _NEWLINE): continue elif state == 0 and tok.string == "from": state = 1 elif state == 1 and tok.string == "__future__": state = 2 elif state == 2 and tok.string == "import": state = 3 elif state == 3 and tok.string == "as": state = 4 elif state == 3 and tok.type == _NAME: future_imports.add(tok.string) elif state == 4 and tok.type == _NAME: state = 3 elif state == 3 and tok.string in "(),": continue elif state == 3 and tok.type == _NEWLINE: state = 0 else: break return frozenset(future_imports) def detect_config( source: Union[str, bytes], *, partial: PartialParserConfig, detect_trailing_newline: bool, detect_default_newline: bool, ) -> ConfigDetectionResult: """ Computes a ParserConfig given the current source code to be parsed and a partial config. """ python_version = partial.parsed_python_version partial_encoding = partial.encoding encoding = ( _detect_encoding(source) if isinstance(partial_encoding, AutoConfig) else partial_encoding ) source_str = source if isinstance(source, str) else source.decode(encoding) partial_default_newline = partial.default_newline default_newline = ( ( _detect_default_newline(source_str) if detect_default_newline else _FALLBACK_DEFAULT_NEWLINE ) if isinstance(partial_default_newline, AutoConfig) else partial_default_newline ) # HACK: The grammar requires a trailing newline, but python doesn't actually require # a trailing newline. Add one onto the end to make the parser happy. We'll strip it # out again during cst.Module's codegen. # # I think parso relies on error recovery support to handle this, which we don't # have. lib2to3 doesn't handle this case at all AFAICT. has_trailing_newline = detect_trailing_newline and _detect_trailing_newline( source_str ) if detect_trailing_newline and not has_trailing_newline: source_str += default_newline lines = split_lines(source_str, keepends=True) tokens = tokenize_lines(lines, python_version) partial_default_indent = partial.default_indent if isinstance(partial_default_indent, AutoConfig): # We need to clone `tokens` before passing it to `_detect_indent`, because # `_detect_indent` consumes some tokens, mutating `tokens`. # # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce the # size of its FIFO, so this doesn't retain items (leak memory) for `tokens_dup` # once `token_dup` is freed at the end of this method (subject to # GC/refcounting). tokens, tokens_dup = itertools.tee(tokens) default_indent = _detect_indent(tokens_dup) else: default_indent = partial_default_indent partial_future_imports = partial.future_imports if isinstance(partial_future_imports, AutoConfig): # Same note as above re itertools.tee, we will consume tokens. tokens, tokens_dup = itertools.tee(tokens) future_imports = _detect_future_imports(tokens_dup) else: future_imports = partial_future_imports return ConfigDetectionResult( config=ParserConfig( lines=lines, encoding=encoding, default_indent=default_indent, default_newline=default_newline, has_trailing_newline=has_trailing_newline, version=python_version, future_imports=future_imports, ), tokens=tokens, )