1# -*- coding: utf-8 -*- 2"""Lexer for xonsh code. 3 4Written using a hybrid of ``tokenize`` and PLY. 5""" 6import io 7 8# 'keyword' interferes with ast.keyword 9import keyword as kwmod 10 11try: 12 from ply.lex import LexToken 13except ImportError: 14 from xonsh.ply.ply.lex import LexToken 15 16from xonsh.lazyasd import lazyobject 17from xonsh.platform import PYTHON_VERSION_INFO 18from xonsh.tokenize import ( 19 OP, 20 IOREDIRECT, 21 STRING, 22 DOLLARNAME, 23 NUMBER, 24 SEARCHPATH, 25 NEWLINE, 26 INDENT, 27 DEDENT, 28 NL, 29 COMMENT, 30 ENCODING, 31 ENDMARKER, 32 NAME, 33 ERRORTOKEN, 34 GREATER, 35 LESS, 36 RIGHTSHIFT, 37 tokenize, 38 TokenError, 39) 40 41 42@lazyobject 43def token_map(): 44 """Mapping from ``tokenize`` tokens (or token types) to PLY token types. If 45 a simple one-to-one mapping from ``tokenize`` to PLY exists, the lexer will 46 look it up here and generate a single PLY token of the given type. 47 Otherwise, it will fall back to handling that token using one of the 48 handlers in``special_handlers``. 49 """ 50 tm = {} 51 # operators 52 _op_map = { 53 # punctuation 54 ",": "COMMA", 55 ".": "PERIOD", 56 ";": "SEMI", 57 ":": "COLON", 58 "...": "ELLIPSIS", 59 # basic operators 60 "+": "PLUS", 61 "-": "MINUS", 62 "*": "TIMES", 63 "@": "AT", 64 "/": "DIVIDE", 65 "//": "DOUBLEDIV", 66 "%": "MOD", 67 "**": "POW", 68 "|": "PIPE", 69 "~": "TILDE", 70 "^": "XOR", 71 "<<": "LSHIFT", 72 ">>": "RSHIFT", 73 "<": "LT", 74 "<=": "LE", 75 ">": "GT", 76 ">=": "GE", 77 "==": "EQ", 78 "!=": "NE", 79 "->": "RARROW", 80 # assignment operators 81 "=": "EQUALS", 82 "+=": "PLUSEQUAL", 83 "-=": "MINUSEQUAL", 84 "*=": "TIMESEQUAL", 85 "@=": "ATEQUAL", 86 "/=": "DIVEQUAL", 87 "%=": "MODEQUAL", 88 "**=": "POWEQUAL", 89 "<<=": "LSHIFTEQUAL", 90 ">>=": "RSHIFTEQUAL", 91 "&=": "AMPERSANDEQUAL", 92 "^=": "XOREQUAL", 93 "|=": "PIPEEQUAL", 94 "//=": "DOUBLEDIVEQUAL", 95 # extra xonsh operators 96 "?": "QUESTION", 97 "??": "DOUBLE_QUESTION", 98 "@$": "ATDOLLAR", 99 "&": "AMPERSAND", 100 } 101 for (op, typ) in _op_map.items(): 102 tm[(OP, op)] = typ 103 tm[IOREDIRECT] = "IOREDIRECT" 104 tm[STRING] = "STRING" 105 tm[DOLLARNAME] = "DOLLAR_NAME" 106 tm[NUMBER] = "NUMBER" 107 tm[SEARCHPATH] = "SEARCHPATH" 108 tm[NEWLINE] = "NEWLINE" 109 tm[INDENT] = "INDENT" 110 tm[DEDENT] = "DEDENT" 111 if (3, 5, 0) <= PYTHON_VERSION_INFO < (3, 7, 0): 112 from xonsh.tokenize import ASYNC, AWAIT 113 114 tm[ASYNC] = "ASYNC" 115 tm[AWAIT] = "AWAIT" 116 return tm 117 118 119def handle_name(state, token): 120 """Function for handling name tokens""" 121 typ = "NAME" 122 if state["pymode"][-1][0]: 123 if token.string in kwmod.kwlist: 124 typ = token.string.upper() 125 state["last"] = token 126 yield _new_token(typ, token.string, token.start) 127 else: 128 prev = state["last"] 129 state["last"] = token 130 has_whitespace = prev.end != token.start 131 if token.string == "and" and has_whitespace: 132 yield _new_token("AND", token.string, token.start) 133 elif token.string == "or" and has_whitespace: 134 yield _new_token("OR", token.string, token.start) 135 else: 136 yield _new_token("NAME", token.string, token.start) 137 138 139def _end_delimiter(state, token): 140 py = state["pymode"] 141 s = token.string 142 l, c = token.start 143 if len(py) > 1: 144 mode, orig, match, pos = py.pop() 145 if s != match: 146 e = '"{}" at {} ends "{}" at {} (expected "{}")' 147 return e.format(s, (l, c), orig, pos, match) 148 else: 149 return 'Unmatched "{}" at line {}, column {}'.format(s, l, c) 150 151 152def handle_rparen(state, token): 153 """ 154 Function for handling ``)`` 155 """ 156 e = _end_delimiter(state, token) 157 if e is None: 158 state["last"] = token 159 yield _new_token("RPAREN", ")", token.start) 160 else: 161 yield _new_token("ERRORTOKEN", e, token.start) 162 163 164def handle_rbrace(state, token): 165 """Function for handling ``}``""" 166 e = _end_delimiter(state, token) 167 if e is None: 168 state["last"] = token 169 yield _new_token("RBRACE", "}", token.start) 170 else: 171 yield _new_token("ERRORTOKEN", e, token.start) 172 173 174def handle_rbracket(state, token): 175 """ 176 Function for handling ``]`` 177 """ 178 e = _end_delimiter(state, token) 179 if e is None: 180 state["last"] = token 181 yield _new_token("RBRACKET", "]", token.start) 182 else: 183 yield _new_token("ERRORTOKEN", e, token.start) 184 185 186def handle_error_space(state, token): 187 """ 188 Function for handling special whitespace characters in subprocess mode 189 """ 190 if not state["pymode"][-1][0]: 191 state["last"] = token 192 yield _new_token("WS", token.string, token.start) 193 else: 194 yield from [] 195 196 197def handle_error_linecont(state, token): 198 """Function for handling special line continuations as whitespace 199 characters in subprocess mode. 200 """ 201 if state["pymode"][-1][0]: 202 return 203 prev = state["last"] 204 if prev.end != token.start: 205 return # previous token is separated by whitespace 206 state["last"] = token 207 yield _new_token("WS", "\\", token.start) 208 209 210def handle_error_token(state, token): 211 """ 212 Function for handling error tokens 213 """ 214 state["last"] = token 215 if token.string == "!": 216 typ = "BANG" 217 elif not state["pymode"][-1][0]: 218 typ = "NAME" 219 else: 220 typ = "ERRORTOKEN" 221 yield _new_token(typ, token.string, token.start) 222 223 224def handle_ignore(state, token): 225 """Function for handling tokens that should be ignored""" 226 yield from [] 227 228 229def handle_double_amps(state, token): 230 yield _new_token("AND", "and", token.start) 231 232 233def handle_double_pipe(state, token): 234 yield _new_token("OR", "or", token.start) 235 236 237def handle_redirect(state, token): 238 # The parser expects whitespace after a redirection in subproc mode. 239 # If whitespace does not exist, we'll issue an empty whitespace 240 # token before proceeding. 241 state["last"] = token 242 typ = token.type 243 st = token.string 244 key = (typ, st) if (typ, st) in token_map else typ 245 yield _new_token(token_map[key], st, token.start) 246 if state["pymode"][-1][0]: 247 return 248 # add a whitespace token after a redirection, if we need to 249 next_tok = next(state["stream"]) 250 if next_tok.start == token.end: 251 yield _new_token("WS", "", token.end) 252 yield from handle_token(state, next_tok) 253 254 255def _make_matcher_handler(tok, typ, pymode, ender, handlers): 256 matcher = ( 257 ")" 258 if tok.endswith("(") 259 else "}" 260 if tok.endswith("{") 261 else "]" 262 if tok.endswith("[") 263 else None 264 ) 265 266 def _inner_handler(state, token): 267 state["pymode"].append((pymode, tok, matcher, token.start)) 268 state["last"] = token 269 yield _new_token(typ, tok, token.start) 270 271 handlers[(OP, tok)] = _inner_handler 272 273 274@lazyobject 275def special_handlers(): 276 """Mapping from ``tokenize`` tokens (or token types) to the proper 277 function for generating PLY tokens from them. In addition to 278 yielding PLY tokens, these functions may manipulate the Lexer's state. 279 """ 280 sh = { 281 NL: handle_ignore, 282 COMMENT: handle_ignore, 283 ENCODING: handle_ignore, 284 ENDMARKER: handle_ignore, 285 NAME: handle_name, 286 ERRORTOKEN: handle_error_token, 287 LESS: handle_redirect, 288 GREATER: handle_redirect, 289 RIGHTSHIFT: handle_redirect, 290 IOREDIRECT: handle_redirect, 291 (OP, "<"): handle_redirect, 292 (OP, ">"): handle_redirect, 293 (OP, ">>"): handle_redirect, 294 (OP, ")"): handle_rparen, 295 (OP, "}"): handle_rbrace, 296 (OP, "]"): handle_rbracket, 297 (OP, "&&"): handle_double_amps, 298 (OP, "||"): handle_double_pipe, 299 (ERRORTOKEN, " "): handle_error_space, 300 (ERRORTOKEN, "\\\n"): handle_error_linecont, 301 (ERRORTOKEN, "\\\r\n"): handle_error_linecont, 302 } 303 _make_matcher_handler("(", "LPAREN", True, ")", sh) 304 _make_matcher_handler("[", "LBRACKET", True, "]", sh) 305 _make_matcher_handler("{", "LBRACE", True, "}", sh) 306 _make_matcher_handler("$(", "DOLLAR_LPAREN", False, ")", sh) 307 _make_matcher_handler("$[", "DOLLAR_LBRACKET", False, "]", sh) 308 _make_matcher_handler("${", "DOLLAR_LBRACE", True, "}", sh) 309 _make_matcher_handler("!(", "BANG_LPAREN", False, ")", sh) 310 _make_matcher_handler("![", "BANG_LBRACKET", False, "]", sh) 311 _make_matcher_handler("@(", "AT_LPAREN", True, ")", sh) 312 _make_matcher_handler("@$(", "ATDOLLAR_LPAREN", False, ")", sh) 313 return sh 314 315 316def handle_token(state, token): 317 """ 318 General-purpose token handler. Makes use of ``token_map`` or 319 ``special_map`` to yield one or more PLY tokens from the given input. 320 321 Parameters 322 ---------- 323 324 state : 325 The current state of the lexer, including information about whether 326 we are in Python mode or subprocess mode, which changes the lexer's 327 behavior. Also includes the stream of tokens yet to be considered. 328 token : 329 The token (from ``tokenize``) currently under consideration 330 """ 331 typ = token.type 332 st = token.string 333 pymode = state["pymode"][-1][0] 334 if not pymode: 335 if state["last"] is not None and state["last"].end != token.start: 336 cur = token.start 337 old = state["last"].end 338 if cur[0] == old[0] and cur[1] > old[1]: 339 yield _new_token("WS", token.line[old[1] : cur[1]], old) 340 if (typ, st) in special_handlers: 341 yield from special_handlers[(typ, st)](state, token) 342 elif (typ, st) in token_map: 343 state["last"] = token 344 yield _new_token(token_map[(typ, st)], st, token.start) 345 elif typ in special_handlers: 346 yield from special_handlers[typ](state, token) 347 elif typ in token_map: 348 state["last"] = token 349 yield _new_token(token_map[typ], st, token.start) 350 else: 351 m = "Unexpected token: {0}".format(token) 352 yield _new_token("ERRORTOKEN", m, token.start) 353 354 355def get_tokens(s): 356 """ 357 Given a string containing xonsh code, generates a stream of relevant PLY 358 tokens using ``handle_token``. 359 """ 360 state = { 361 "indents": [0], 362 "last": None, 363 "pymode": [(True, "", "", (0, 0))], 364 "stream": tokenize(io.BytesIO(s.encode("utf-8")).readline), 365 } 366 while True: 367 try: 368 token = next(state["stream"]) 369 yield from handle_token(state, token) 370 except StopIteration: 371 if len(state["pymode"]) > 1: 372 pm, o, m, p = state["pymode"][-1] 373 l, c = p 374 e = 'Unmatched "{}" at line {}, column {}' 375 yield _new_token("ERRORTOKEN", e.format(o, l, c), (0, 0)) 376 break 377 except TokenError as e: 378 # this is recoverable in single-line mode (from the shell) 379 # (e.g., EOF while scanning string literal) 380 yield _new_token("ERRORTOKEN", e.args[0], (0, 0)) 381 break 382 except IndentationError as e: 383 # this is never recoverable 384 yield _new_token("ERRORTOKEN", e, (0, 0)) 385 break 386 387 388# synthesize a new PLY token 389def _new_token(type, value, pos): 390 o = LexToken() 391 o.type = type 392 o.value = value 393 o.lineno, o.lexpos = pos 394 return o 395 396 397class Lexer(object): 398 """Implements a lexer for the xonsh language.""" 399 400 _tokens = None 401 402 def __init__(self): 403 """ 404 Attributes 405 ---------- 406 fname : str 407 Filename 408 last : token 409 The last token seen. 410 lineno : int 411 The last line number seen. 412 413 """ 414 self.fname = "" 415 self.last = None 416 self.beforelast = None 417 418 def build(self, **kwargs): 419 """Part of the PLY lexer API.""" 420 pass 421 422 def reset(self): 423 pass 424 425 def input(self, s): 426 """Calls the lexer on the string s.""" 427 self.token_stream = get_tokens(s) 428 429 def token(self): 430 """Retrieves the next token.""" 431 self.beforelast = self.last 432 self.last = next(self.token_stream, None) 433 return self.last 434 435 def __iter__(self): 436 t = self.token() 437 while t is not None: 438 yield t 439 t = self.token() 440 441 def split(self, s): 442 """Splits a string into a list of strings which are whitespace-separated 443 tokens. 444 """ 445 vals = [] 446 self.input(s) 447 l = c = -1 448 ws = "WS" 449 nl = "\n" 450 for t in self: 451 if t.type == ws: 452 continue 453 elif l < t.lineno: 454 vals.append(t.value) 455 elif len(vals) > 0 and c == t.lexpos: 456 vals[-1] = vals[-1] + t.value 457 else: 458 vals.append(t.value) 459 nnl = t.value.count(nl) 460 if nnl == 0: 461 l = t.lineno 462 c = t.lexpos + len(t.value) 463 else: 464 l = t.lineno + nnl 465 c = len(t.value.rpartition(nl)[-1]) 466 return vals 467 468 # 469 # All the tokens recognized by the lexer 470 # 471 @property 472 def tokens(self): 473 if self._tokens is None: 474 t = ( 475 tuple(token_map.values()) 476 + ( 477 "NAME", # name tokens 478 "BANG", # ! tokens 479 "WS", # whitespace in subprocess mode 480 "LPAREN", 481 "RPAREN", # ( ) 482 "LBRACKET", 483 "RBRACKET", # [ ] 484 "LBRACE", 485 "RBRACE", # { } 486 "AT_LPAREN", # @( 487 "BANG_LPAREN", # !( 488 "BANG_LBRACKET", # ![ 489 "DOLLAR_LPAREN", # $( 490 "DOLLAR_LBRACE", # ${ 491 "DOLLAR_LBRACKET", # $[ 492 "ATDOLLAR_LPAREN", # @$( 493 "ERRORTOKEN", # whoops! 494 ) 495 + tuple(i.upper() for i in kwmod.kwlist) 496 ) 497 self._tokens = t 498 return self._tokens 499