1# Copyright 2020 the authors.
2# This file is part of Hy, which is free software licensed under the Expat
3# license. See the LICENSE.
4
5from __future__ import unicode_literals
6
7import keyword
8import re
9import sys
10import unicodedata
11
12from hy.lex.exceptions import PrematureEndOfInput, LexException  # NOQA
13from hy.models import HyExpression, HySymbol
14
15try:
16    from io import StringIO
17except ImportError:
18    from StringIO import StringIO
19
20
def hy_parse(source, filename='<string>'):
    """Parse a Hy source string into a single top-level ``do`` expression.

    A shebang (``#!``) at the very start of `source` is removed up to the
    end of its line before tokenizing, and a trailing newline is appended
    to the text handed to `tokenize`.

    Parameters
    ----------
    source: string
        Source code to parse.

    filename: string, optional
        File name corresponding to source.  Defaults to "<string>".

    Returns
    -------
    out : HyExpression
        ``(do ...)`` wrapping every tokenized form.  Its ``source``
        attribute holds the original, unmodified `source` and its
        ``filename`` attribute holds `filename`.
    """
    # Only the shebang text is dropped; the newline ending that line stays.
    without_shebang = re.sub(r'\A#!.*', '', source)

    head = [HySymbol("do")]
    forms = tokenize(without_shebang + "\n", filename=filename)

    tree = HyExpression(head + forms)
    tree.source, tree.filename = source, filename
    return tree
43
44
class ParserState(object):
    """Plain holder for the source text and file name of a parse run.

    Instances are passed as the ``state`` argument of ``parser.parse``.
    """

    def __init__(self, source, filename):
        # Both values are stored as given, without validation.
        self.source, self.filename = source, filename
50
def tokenize(source, filename=None):
    """Tokenize a Lisp file or string buffer into internal Hy objects.

    Parameters
    ----------
    source: str
        The source to tokenize.
    filename: str, optional
        The filename corresponding to `source`.

    Returns
    -------
    The result of ``parser.parse`` for the lexed `source`.

    Raises
    ------
    LexException
        When the lexer cannot identify a token (reported with the line and
        column of the failure, each clamped to at least 1).  A
        `LexException` raised by the parser itself propagates unchanged.
    """
    # Imported lazily, at call time rather than at module import.
    from hy.lex.lexer import lexer
    from hy.lex.parser import parser
    from rply.errors import LexingError

    state = ParserState(source, filename)
    try:
        return parser.parse(lexer.lex(source), state=state)
    except LexingError as e:
        pos = e.getsourcepos()
        line = max(pos.lineno, 1)
        column = max(pos.colno, 1)
        raise LexException("Could not identify the next token.",
                           None, filename, source, line, column)
75
76
def parse_one_thing(src_string):
    """Parse the first form from the string. Return it and the
    remainder of the string.

    Tokens are taken from the lexer one at a time; after each new token
    the accumulated prefix is handed to the parser, and the first prefix
    that parses is accepted.

    Parameters
    ----------
    src_string: str
        Source text to read one form from.

    Returns
    -------
    (model, rest) : tuple
        The parsed form and the text of `src_string` that follows it.

    Raises
    ------
    LexingError, LexException
        The most recent parse error, when no prefix of the token stream
        parses.
    ValueError
        "No form found" when the lexer yields no tokens at all.
    """
    from hy.lex.lexer import lexer
    from hy.lex.parser import parser
    from rply.errors import LexingError

    tokens = []
    err = None
    for token in lexer.lex(src_string):
        tokens.append(token)
        try:
            # The one-element unpacking requires exactly one parsed form.
            model, = parser.parse(
                iter(tokens),
                state=ParserState(src_string, filename=None))
        except (LexingError, LexException) as e:
            # Probably just an incomplete form; remember the error and
            # retry with one more token.
            err = e
        else:
            # Locate the end of the form in the text: skip the complete
            # lines before its last line, then `end_column` characters.
            # `.*\n` (not `.+\n`) is needed so that blank lines before or
            # inside the form are matched too; with `.+\n` the match
            # failed on them and `.end()` raised AttributeError.
            # NOTE(review): assumes end_line/end_column are 1-based and
            # refer to the last character of the form - confirm.
            consumed = re.match(
                r'.*\n' * (model.end_line - 1)
                + '.' * model.end_column,
                src_string)
            return model, src_string[consumed.end():]
    if err is not None:
        raise err
    raise ValueError("No form found")
102
103
# Delimiter that `mangle` puts on both sides of each escaped character
# (its Unicode name or `U` + hex code point) and that `unmangle` searches
# for when decoding a `hyx_` name.
mangle_delim = 'X'
105
106
def mangle(s):
    """Stringify the argument and convert it to a valid Python identifier
    according to Hy's mangling rules.

    The steps, in order:

    1. ``-`` becomes ``_``.
    2. Leading underscores are set aside and re-attached at the end.
    3. A trailing ``?`` becomes an ``is_`` prefix.
    4. If the result is still not an identifier, it is prefixed with
       ``hyx_`` and every character that is `mangle_delim` itself or is
       not legal inside an identifier is replaced by its lower-cased
       Unicode name (``-`` -> ``H``, space -> ``_``), or by ``U`` plus its
       hexadecimal code point when it has no name, wrapped in
       `mangle_delim`.

    `s` must be truthy (checked with ``assert``).  Returns the mangled
    string.
    """
    assert s

    s = str(s)
    s = s.replace("-", "_")
    s2 = s.lstrip('_')
    leading_underscores = '_' * (len(s) - len(s2))
    s = s2

    if s.endswith("?"):
        s = 'is_' + s[:-1]

    if not isidentifier(leading_underscores + s):
        def mangle_char(c):
            # We prepend the "S" because some characters aren't
            # allowed at the start of an identifier.
            if c != mangle_delim and isidentifier('S' + c):
                return c
            # Replace the illegal character with its Unicode character
            # name, or its hexadecimal code point if it doesn't have one.
            # `format(ord(c), 'x')` gives the bare hex digits directly;
            # the previous chain of `lstrip` calls on the unicode-escape
            # form stripped character *sets*, not prefixes, and only
            # worked by accident.
            name = (unicodedata.name(c, '').lower()
                    .replace('-', 'H').replace(' ', '_'))
            return '{0}{1}{0}'.format(
                mangle_delim, name or 'U{}'.format(format(ord(c), 'x')))

        s = 'hyx_' + ''.join(mangle_char(c) for c in s)

    s = leading_underscores + s
    assert isidentifier(s)
    return s
143
144
def unmangle(s):
    """Stringify the argument and try to convert it to a pretty unmangled
    form. This may not round-trip, because different Hy symbol names can
    mangle to the same Python identifier.

    Leading underscores are counted and come back as leading hyphens.  A
    ``hyx_`` prefix is removed and each delimited escape after it is
    decoded (``U`` + hex code point, or a Unicode character name).  An
    ``is_`` prefix becomes a trailing ``?``, and remaining underscores
    become hyphens.
    """
    text = str(s)
    body = text.lstrip('_')
    underscore_count = len(text) - len(body)

    if body.startswith('hyx_'):
        def decode(mo):
            # Group 1 is the optional "U" marker, group 2 the payload.
            payload = mo.group(2)
            if mo.group(1):
                return chr(int(payload, base=16))
            char_name = payload.replace('_', ' ').replace('H', '-').upper()
            return unicodedata.lookup(char_name)

        pattern = '{0}(U)?([_a-z0-9H]+?){0}'.format(mangle_delim)
        body = re.sub(pattern, decode, body[len('hyx_'):])

    if body.startswith('is_'):
        body = body[len("is_"):] + "?"

    return '-' * underscore_count + body.replace('_', '-')
169
170
def read(from_file=sys.stdin, eof=""):
    """Read lines from `from_file` until they tokenize, and return the
    first resulting form.

    Lines are accumulated one ``readline()`` at a time.  After each line
    the whole accumulated text is tokenized; a `PrematureEndOfInput` or
    `IndexError` means more input is needed, so another line is read.

    Parameters
    ----------
    from_file: file-like, optional
        Object with a ``readline`` method.  Defaults to ``sys.stdin``.
    eof: str, optional
        A line equal to this value marks end of input (defaults to an
        empty string).

    Returns
    -------
    The first item produced by `tokenize` for the accumulated text, or
    ``None`` when it produces no items.

    Raises
    ------
    EOFError
        When a line equal to `eof` is read before any form is complete.
    """
    pending = ""
    while True:
        line = str(from_file.readline())
        if line == eof:
            raise EOFError("Reached end of file")
        pending += line
        try:
            first_form = next(iter(tokenize(pending)), None)
        except (PrematureEndOfInput, IndexError):
            # Incomplete form so far; keep reading.
            continue
        return first_form
190
191
def read_str(input):
    """Read the first form from the string form of `input`.

    `input` is stringified, wrapped in a `StringIO` and handed to `read`;
    the result (or exception) is whatever `read` produces.
    """
    buffer = StringIO(str(input))
    return read(buffer)
194
195
def isidentifier(x):
    """Return whether the string `x` is accepted as an identifier.

    ``True``, ``False`` and ``None`` are accepted even though they are
    keywords; every other Python keyword is rejected; anything else is
    decided by ``str.isidentifier``.
    """
    accepted_keywords = ('True', 'False', 'None')
    return x in accepted_keywords or (
        not keyword.iskeyword(x) and x.isidentifier())
202