# Copyright 2020 the authors.
# This file is part of Hy, which is free software licensed under the Expat
# license. See the LICENSE.

import keyword
import re
import sys
import unicodedata
from io import StringIO

from hy.lex.exceptions import PrematureEndOfInput, LexException  # NOQA
from hy.models import HyExpression, HySymbol


def hy_parse(source, filename='<string>'):
    """Parse a Hy source string.

    Parameters
    ----------
    source: string
        Source code to parse.

    filename: string, optional
        File name corresponding to source.  Defaults to "<string>".

    Returns
    -------
    out : HyExpression
        A `(do ...)` form wrapping every top-level form in `source`, with
        `.source` and `.filename` attributes attached for error reporting.
    """
    # Strip a leading shebang line, if any, before tokenizing.
    _source = re.sub(r'\A#!.*', '', source)
    res = HyExpression([HySymbol("do")] +
                       tokenize(_source + "\n",
                                filename=filename))
    res.source = source
    res.filename = filename
    return res


class ParserState(object):
    """Carries the raw source and filename through rply's parser callbacks
    so syntax errors can be reported with context."""

    def __init__(self, source, filename):
        self.source = source
        self.filename = filename


def tokenize(source, filename=None):
    """Tokenize a Lisp file or string buffer into internal Hy objects.

    Parameters
    ----------
    source: str
        The source to tokenize.
    filename: str, optional
        The filename corresponding to `source`.

    Raises
    ------
    LexException
        If the lexer cannot identify the next token (rply's LexingError is
        translated into a LexException carrying position information), or
        if the parser itself raises one.
    """
    # Imported lazily to avoid a circular import with hy.lex.lexer/parser.
    from hy.lex.lexer import lexer
    from hy.lex.parser import parser
    from rply.errors import LexingError
    try:
        return parser.parse(lexer.lex(source),
                            state=ParserState(source, filename))
    except LexingError as e:
        pos = e.getsourcepos()
        # rply positions can be 0 or -1 in edge cases; clamp to 1-based.
        raise LexException("Could not identify the next token.",
                           None, filename, source,
                           max(pos.lineno, 1),
                           max(pos.colno, 1))
    # NOTE: a LexException raised by the parser simply propagates; the
    # previous `except LexException as e: raise e` was a no-op re-raise.


def parse_one_thing(src_string):
    """Parse the first form from the string.  Return it and the
    remainder of the string."""
    from hy.lex.lexer import lexer
    from hy.lex.parser import parser
    from rply.errors import LexingError
    tokens = []
    err = None
    # Feed the parser one more token at a time until a complete form parses.
    for token in lexer.lex(src_string):
        tokens.append(token)
        try:
            model, = parser.parse(
                iter(tokens),
                state=ParserState(src_string, filename=None))
        except (LexingError, LexException) as e:
            # Remember the failure; a later token may still complete the form.
            err = e
        else:
            # Slice off everything up to the end of the parsed form:
            # (end_line - 1) full lines plus end_column characters.
            return model, src_string[re.match(
                r'.+\n' * (model.end_line - 1)
                    + '.' * model.end_column,
                src_string).end():]
    if err:
        raise err
    raise ValueError("No form found")


mangle_delim = 'X'


def mangle(s):
    """Stringify the argument and convert it to a valid Python identifier
    according to Hy's mangling rules."""
    def unicode_char_to_hex(uchr):
        # Convert a Unicode character to a hex string, without prefix
        # or leading zeros (e.g. 'é' -> 'e9').
        if len(uchr) == 1 and ord(uchr) < 128:
            return format(ord(uchr), 'x')
        return (uchr.encode('unicode-escape').decode('utf-8')
            .lstrip('\\U').lstrip('\\u').lstrip('\\x').lstrip('0'))

    assert s

    s = str(s)
    s = s.replace("-", "_")
    # Set aside leading underscores so they survive the 'hyx_' prefixing.
    s2 = s.lstrip('_')
    leading_underscores = '_' * (len(s) - len(s2))
    s = s2

    if s.endswith("?"):
        s = 'is_' + s[:-1]
    if not isidentifier(leading_underscores + s):
        # Replace illegal characters with their Unicode character
        # names, or hexadecimal if they don't have one.
        s = 'hyx_' + ''.join(
            c
               if c != mangle_delim and isidentifier('S' + c)
                 # We prepend the "S" because some characters aren't
                 # allowed at the start of an identifier.
               else '{0}{1}{0}'.format(mangle_delim,
                   unicodedata.name(c, '').lower()
                       .replace('-', 'H').replace(' ', '_')
                   or 'U{}'.format(unicode_char_to_hex(c)))
            for c in s)

    s = leading_underscores + s
    assert isidentifier(s)
    return s


def unmangle(s):
    """Stringify the argument and try to convert it to a pretty unmangled
    form.  This may not round-trip, because different Hy symbol names can
    mangle to the same Python identifier."""

    s = str(s)

    s2 = s.lstrip('_')
    leading_underscores = len(s) - len(s2)
    s = s2

    if s.startswith('hyx_'):
        # Undo the X...X escapes: group 1 ('U') marks a raw hex escape,
        # otherwise group 2 is a munged Unicode character name.
        s = re.sub('{0}(U)?([_a-z0-9H]+?){0}'.format(mangle_delim),
            lambda mo:
               chr(int(mo.group(2), base=16))
               if mo.group(1)
               else unicodedata.lookup(
                   mo.group(2).replace('_', ' ').replace('H', '-').upper()),
            s[len('hyx_'):])
    if s.startswith('is_'):
        s = s[len("is_"):] + "?"
    s = s.replace('_', '-')

    return '-' * leading_underscores + s


def read(from_file=sys.stdin, eof=""):
    """Read from input and returns a tokenized string.

    Can take a given input buffer to read from, and a single byte as EOF
    (defaults to an empty string).
    """
    buff = ""
    while True:
        inn = str(from_file.readline())
        if inn == eof:
            raise EOFError("Reached end of file")
        buff += inn
        try:
            parsed = next(iter(tokenize(buff)), None)
        except (PrematureEndOfInput, IndexError):
            # Incomplete form so far; keep accumulating lines.
            pass
        else:
            break
    return parsed


def read_str(input):
    """Like `read`, but reads from the given string instead of a file."""
    return read(StringIO(str(input)))


def isidentifier(x):
    """Return True if `x` is usable as a Python identifier.

    Unlike `str.isidentifier`, keywords are rejected — except `True`,
    `False`, and `None`, which Hy treats as ordinary names.
    """
    if x in ('True', 'False', 'None'):
        return True
    if keyword.iskeyword(x):
        return False
    return x.isidentifier()