# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import itertools
import re
from dataclasses import dataclass
from io import BytesIO
from tokenize import detect_encoding as py_tokenize_detect_encoding
from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Union

from libcst._nodes.whitespace import NEWLINE_RE
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.utils import split_lines
from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig
from libcst._parser.types.token import Token
from libcst._parser.wrapped_tokenize import tokenize_lines

# Token types we need to recognize while scanning the token stream below.
_INDENT: TokenType = PythonTokenTypes.INDENT
_NAME: TokenType = PythonTokenTypes.NAME
_NEWLINE: TokenType = PythonTokenTypes.NEWLINE
_STRING: TokenType = PythonTokenTypes.STRING

# Fallbacks used when the source contains no newline/indent to sniff from.
_FALLBACK_DEFAULT_NEWLINE = "\n"
# NOTE(review): reconstructed as the conventional four-space indent; the
# collapsed source rendered this literal ambiguously — confirm against the
# upstream file.
_FALLBACK_DEFAULT_INDENT = "    "
# A backslash line-continuation: a literal backslash immediately followed by
# one newline sequence ("\r\n", "\r", or "\n").
_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)


@dataclass(frozen=True)
class ConfigDetectionResult:
    """
    The result of :func:`detect_config`: an immutable parser configuration
    plus the (stateful) token stream the parser should consume.
    """

    # The config is a set of constant values used by the parser.
    config: ParserConfig
    # The tokens iterator is mutated by the parser.
    tokens: Iterator[Token]


def _detect_encoding(source: Union[str, bytes]) -> str:
    """
    Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
    specified in PEP 263.

    If given a string (instead of bytes) the encoding is assumed to be utf-8.
    """

    if isinstance(source, str):
        return "utf-8"
    # tokenize.detect_encoding returns (encoding, consumed_lines); we only
    # need the encoding name.
    return py_tokenize_detect_encoding(BytesIO(source).readline)[0]


def _detect_default_newline(source_str: str) -> str:
    """
    Finds the first newline, and uses that value as the default newline.

    Returns ``_FALLBACK_DEFAULT_NEWLINE`` ("\\n") if the source contains no
    newline at all.
    """
    # Don't use `NEWLINE_RE` for this, because it might match multiple newlines as a
    # single newline.
    # NOTE(review): despite the comment above, the code *does* call
    # NEWLINE_RE.search — presumably the first match is always a single
    # newline sequence; confirm against NEWLINE_RE's pattern if changing this.
    match = NEWLINE_RE.search(source_str)
    return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE


def _detect_indent(tokens: Iterable[Token]) -> str:
    """
    Finds the first INDENT token, and uses that as the value of the default indent.

    Consumes tokens from the iterable up to (and including) that first INDENT;
    callers that need the full stream afterwards must pass a duplicate (see
    ``itertools.tee`` usage in :func:`detect_config`). Falls back to
    ``_FALLBACK_DEFAULT_INDENT`` when the source has no indented block.
    """
    try:
        first_indent = next(t for t in tokens if t.type is _INDENT)
    except StopIteration:
        return _FALLBACK_DEFAULT_INDENT
    first_indent_str = first_indent.relative_indent
    assert first_indent_str is not None, "INDENT tokens must contain a relative_indent"
    return first_indent_str


def _detect_trailing_newline(source_str: str) -> bool:
    """
    Returns True if the source ends with a real (non-continuation) newline.
    """
    if len(source_str) == 0 or not NEWLINE_RE.fullmatch(source_str[-1]):
        return False
    # Make sure that the last newline wasn't following a continuation
    # (a trailing "\<newline>" is a line join, not a statement terminator).
    # The -2 slice covers "\\" + one-char newlines; -3 covers "\\" + "\r\n".
    return not (
        _CONTINUATION_RE.fullmatch(source_str[-2:])
        or _CONTINUATION_RE.fullmatch(source_str[-3:])
    )


def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]:
    """
    Finds __future__ imports in their proper locations.

    See `https://www.python.org/dev/peps/pep-0236/`_

    Implemented as a small state machine over the token stream; scanning stops
    at the first token run that cannot belong to the docstring/`__future__`
    prologue. Consumes tokens from the iterable (pass a duplicate if the full
    stream is needed afterwards).
    """
    future_imports: Set[str] = set()
    # States: 0 = start of a logical line (docstrings/blank lines allowed),
    # 1 = saw "from", 2 = saw "__future__", 3 = inside the import-name list,
    # 4 = saw "as" (the next NAME is an alias, not a feature name).
    state = 0
    for tok in tokens:
        if state == 0 and tok.type in (_STRING, _NEWLINE):
            # Module docstring (or other leading string) and blank lines may
            # legally precede __future__ imports.
            continue
        elif state == 0 and tok.string == "from":
            state = 1
        elif state == 1 and tok.string == "__future__":
            state = 2
        elif state == 2 and tok.string == "import":
            state = 3
        elif state == 3 and tok.string == "as":
            state = 4
        elif state == 3 and tok.type == _NAME:
            # Record the feature name itself, never its "as" alias.
            future_imports.add(tok.string)
        elif state == 4 and tok.type == _NAME:
            # Skip the alias name and return to collecting feature names.
            state = 3
        elif state == 3 and tok.string in "(),":
            # Parenthesized/comma-separated import lists are allowed.
            continue
        elif state == 3 and tok.type == _NEWLINE:
            # End of this import statement; another may follow.
            state = 0
        else:
            # Anything else ends the prologue where __future__ imports may appear.
            break
    return frozenset(future_imports)


def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Computes a ParserConfig given the current source code to be parsed and a partial
    config.

    Each config field set to ``AutoConfig`` in ``partial`` is sniffed from the
    source (encoding, default newline, default indent, __future__ imports);
    explicitly-set fields are passed through unchanged. Also returns the token
    stream the parser should consume.
    """

    python_version = partial.parsed_python_version

    partial_encoding = partial.encoding
    encoding = (
        _detect_encoding(source)
        if isinstance(partial_encoding, AutoConfig)
        else partial_encoding
    )

    source_str = source if isinstance(source, str) else source.decode(encoding)

    partial_default_newline = partial.default_newline
    default_newline = (
        (
            _detect_default_newline(source_str)
            if detect_default_newline
            else _FALLBACK_DEFAULT_NEWLINE
        )
        if isinstance(partial_default_newline, AutoConfig)
        else partial_default_newline
    )

    # HACK: The grammar requires a trailing newline, but python doesn't actually require
    # a trailing newline. Add one onto the end to make the parser happy. We'll strip it
    # out again during cst.Module's codegen.
    #
    # I think parso relies on error recovery support to handle this, which we don't
    # have. lib2to3 doesn't handle this case at all AFAICT.
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    if detect_trailing_newline and not has_trailing_newline:
        source_str += default_newline

    lines = split_lines(source_str, keepends=True)

    tokens = tokenize_lines(lines, python_version)

    partial_default_indent = partial.default_indent
    if isinstance(partial_default_indent, AutoConfig):
        # We need to clone `tokens` before passing it to `_detect_indent`, because
        # `_detect_indent` consumes some tokens, mutating `tokens`.
        #
        # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce the
        # size of its FIFO, so this doesn't retain items (leak memory) for `tokens_dup`
        # once `token_dup` is freed at the end of this method (subject to
        # GC/refcounting).
        tokens, tokens_dup = itertools.tee(tokens)
        default_indent = _detect_indent(tokens_dup)
    else:
        default_indent = partial_default_indent

    partial_future_imports = partial.future_imports
    if isinstance(partial_future_imports, AutoConfig):
        # Same note as above re itertools.tee, we will consume tokens.
        tokens, tokens_dup = itertools.tee(tokens)
        future_imports = _detect_future_imports(tokens_dup)
    else:
        future_imports = partial_future_imports

    return ConfigDetectionResult(
        config=ParserConfig(
            lines=lines,
            encoding=encoding,
            default_indent=default_indent,
            default_newline=default_newline,
            has_trailing_newline=has_trailing_newline,
            version=python_version,
            future_imports=future_imports,
        ),
        tokens=tokens,
    )