1# Copyright (c) Facebook, Inc. and its affiliates.
2#
3# This source code is licensed under the MIT license found in the
4# LICENSE file in the root directory of this source tree.
5
6
7import itertools
8import re
9from dataclasses import dataclass
10from io import BytesIO
11from tokenize import detect_encoding as py_tokenize_detect_encoding
12from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Union
13
14from libcst._nodes.whitespace import NEWLINE_RE
15from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
16from libcst._parser.parso.utils import split_lines
17from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig
18from libcst._parser.types.token import Token
19from libcst._parser.wrapped_tokenize import tokenize_lines
20
21_INDENT: TokenType = PythonTokenTypes.INDENT
22_NAME: TokenType = PythonTokenTypes.NAME
23_NEWLINE: TokenType = PythonTokenTypes.NEWLINE
24_STRING: TokenType = PythonTokenTypes.STRING
25
26_FALLBACK_DEFAULT_NEWLINE = "\n"
27_FALLBACK_DEFAULT_INDENT = "    "
28_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)
29
30
@dataclass(frozen=True)
class ConfigDetectionResult:
    """
    The result of :func:`detect_config`: the inferred parser configuration plus
    the token stream that was produced while inferring it.
    """

    # The config is a set of constant values used by the parser.
    config: ParserConfig
    # The tokens iterator is mutated by the parser.
    tokens: Iterator[Token]
37
38
39def _detect_encoding(source: Union[str, bytes]) -> str:
40    """
41    Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
42    specified in PEP 263.
43
44    If given a string (instead of bytes) the encoding is assumed to be utf-8.
45    """
46
47    if isinstance(source, str):
48        return "utf-8"
49    return py_tokenize_detect_encoding(BytesIO(source).readline)[0]
50
51
def _detect_default_newline(source_str: str) -> str:
    """
    Infer the module's default newline style from the first newline in the
    source, falling back to ``"\\n"`` when the source has no newlines at all.
    """
    # NOTE(review): the upstream comment warns that `NEWLINE_RE` might match
    # multiple newlines as one — we only use the first match here, so a single
    # newline sequence is all we ever read.
    first_newline = NEWLINE_RE.search(source_str)
    if first_newline is None:
        return _FALLBACK_DEFAULT_NEWLINE
    return first_newline.group(0)
60
61
def _detect_indent(tokens: Iterable[Token]) -> str:
    """
    Finds the first INDENT token, and uses that as the value of the default indent.

    Consumes tokens from the iterable up to (and including) that first INDENT;
    returns a four-space fallback if the source never indents.
    """
    for tok in tokens:
        if tok.type is _INDENT:
            relative_indent = tok.relative_indent
            assert (
                relative_indent is not None
            ), "INDENT tokens must contain a relative_indent"
            return relative_indent
    return _FALLBACK_DEFAULT_INDENT
73
74
def _detect_trailing_newline(source_str: str) -> bool:
    """
    Return ``True`` when the source ends with a real newline — one that is not
    part of a backslash line-continuation.
    """
    if not source_str:
        return False
    if NEWLINE_RE.fullmatch(source_str[-1]) is None:
        return False
    # A backslash immediately before the final newline makes it a line
    # continuation, not a statement-terminating newline. Check both the 2-char
    # ("\\\n" / "\\\r") and 3-char ("\\\r\n") suffix forms.
    if _CONTINUATION_RE.fullmatch(source_str[-2:]):
        return False
    if _CONTINUATION_RE.fullmatch(source_str[-3:]):
        return False
    return True
83
84
def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]:
    """
    Finds __future__ imports in their proper locations.

    See `https://www.python.org/dev/peps/pep-0236/`_

    Scans tokens with a small state machine and stops at the first token that
    cannot be part of a ``from __future__ import ...`` prologue, since future
    statements must appear before any other code:

    - state 0: start of a logical line (docstring / blank lines allowed)
    - state 1: saw ``from``
    - state 2: saw ``from __future__``
    - state 3: saw ``from __future__ import`` (collecting feature names)
    - state 4: saw ``as`` (skipping the alias name)
    """
    future_imports: Set[str] = set()
    state = 0
    for tok in tokens:
        if state == 0 and tok.type in (_STRING, _NEWLINE):
            # Module docstring or blank line before the imports; keep looking.
            continue
        elif state == 0 and tok.string == "from":
            state = 1
        elif state == 1 and tok.string == "__future__":
            state = 2
        elif state == 2 and tok.string == "import":
            state = 3
        elif state == 3 and tok.string == "as":
            # NOTE: this string check must stay ahead of the _NAME check below,
            # so that "as" is not itself recorded as a feature name.
            state = 4
        elif state == 3 and tok.type == _NAME:
            # A feature name, e.g. "annotations" or "division".
            future_imports.add(tok.string)
        elif state == 4 and tok.type == _NAME:
            # The alias after "as"; the feature itself was already recorded.
            state = 3
        elif state == 3 and tok.string in "(),":
            # Parenthesized import lists and comma separators are allowed.
            continue
        elif state == 3 and tok.type == _NEWLINE:
            # End of one future-import line; another may follow.
            state = 0
        else:
            # First token that doesn't fit the pattern: the prologue is over.
            break
    return frozenset(future_imports)
115
116
def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Computes a ParserConfig given the current source code to be parsed and a partial
    config.

    Each setting is taken from ``partial`` when the caller supplied it, and
    detected from the source otherwise (``AutoConfig`` marks "not supplied").
    """
    python_version = partial.parsed_python_version

    if isinstance(partial.encoding, AutoConfig):
        encoding = _detect_encoding(source)
    else:
        encoding = partial.encoding

    if isinstance(source, str):
        source_str = source
    else:
        source_str = source.decode(encoding)

    if not isinstance(partial.default_newline, AutoConfig):
        default_newline = partial.default_newline
    elif detect_default_newline:
        default_newline = _detect_default_newline(source_str)
    else:
        default_newline = _FALLBACK_DEFAULT_NEWLINE

    # HACK: The grammar requires a trailing newline, but python doesn't actually
    # require one. Append a newline here to keep the parser happy; cst.Module's
    # codegen strips it back out.
    #
    # I think parso relies on error recovery support to handle this, which we don't
    # have. lib2to3 doesn't handle this case at all AFAICT.
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    if detect_trailing_newline and not has_trailing_newline:
        source_str += default_newline

    lines = split_lines(source_str, keepends=True)
    tokens = tokenize_lines(lines, python_version)

    if isinstance(partial.default_indent, AutoConfig):
        # `_detect_indent` consumes tokens, so hand it a teed-off duplicate and
        # keep the original iterator intact for the parser.
        #
        # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce
        # the size of its FIFO, so this doesn't retain items (leak memory) for the
        # duplicate once it is freed at the end of this method (subject to
        # GC/refcounting).
        tokens, indent_tokens = itertools.tee(tokens)
        default_indent = _detect_indent(indent_tokens)
    else:
        default_indent = partial.default_indent

    if isinstance(partial.future_imports, AutoConfig):
        # Same note as above re itertools.tee; `_detect_future_imports` also
        # consumes tokens.
        tokens, future_tokens = itertools.tee(tokens)
        future_imports = _detect_future_imports(future_tokens)
    else:
        future_imports = partial.future_imports

    return ConfigDetectionResult(
        config=ParserConfig(
            lines=lines,
            encoding=encoding,
            default_indent=default_indent,
            default_newline=default_newline,
            has_trailing_newline=has_trailing_newline,
            version=python_version,
            future_imports=future_imports,
        ),
        tokens=tokens,
    )
201