"""Lexer that turns GraphQL source text into a stream of ``Token`` objects."""
import json

from six import unichr

from ..error import GraphQLSyntaxError

# Necessary for static type checking
if False:  # flake8: noqa
    from typing import Optional, Any, List
    from .source import Source

__all__ = ["Token", "Lexer", "TokenKind", "get_token_desc", "get_token_kind_desc"]


class Token(object):
    """A single lexical token.

    ``kind`` is one of the ``TokenKind`` constants, ``start``/``end`` are the
    offsets into the source body delimiting the token (``end`` is exclusive),
    and ``value`` holds the token text for NAME, INT, FLOAT and STRING tokens
    (``None`` for punctuators and EOF).
    """

    __slots__ = "kind", "start", "end", "value"

    def __init__(self, kind, start, end, value=None):
        # type: (int, int, int, Optional[str]) -> None
        self.kind = kind
        self.start = start
        self.end = end
        self.value = value

    def __repr__(self):
        # type: () -> str
        return u"<Token kind={} at {}..{} value={}>".format(
            get_token_kind_desc(self.kind), self.start, self.end, repr(self.value)
        )

    def __eq__(self, other):
        # type: (Any) -> bool
        return (
            isinstance(other, Token)
            and self.kind == other.kind
            and self.start == other.start
            and self.end == other.end
            and self.value == other.value
        )

    def __ne__(self, other):
        # type: (Any) -> bool
        # Python 2 does not derive ``!=`` from ``__eq__``; define it explicitly
        # so both operators agree on every supported interpreter.
        return not self.__eq__(other)


class Lexer(object):
    """Stateful wrapper around ``read_token`` that remembers where the
    previously returned token ended."""

    __slots__ = "source", "prev_position"

    def __init__(self, source):
        # type: (Source) -> None
        self.source = source
        self.prev_position = 0

    def next_token(self, reset_position=None):
        # type: (Optional[int]) -> Token
        """Return the next token.

        Lexing resumes at the end of the previously returned token unless
        ``reset_position`` is given, in which case lexing starts there.
        """
        if reset_position is None:
            reset_position = self.prev_position
        token = read_token(self.source, reset_position)
        self.prev_position = token.end
        return token


class TokenKind(object):
    """Integer constants identifying each kind of token."""

    EOF = 1
    BANG = 2
    DOLLAR = 3
    PAREN_L = 4
    PAREN_R = 5
    SPREAD = 6
    COLON = 7
    EQUALS = 8
    AT = 9
    BRACKET_L = 10
    BRACKET_R = 11
    BRACE_L = 12
    PIPE = 13
    BRACE_R = 14
    NAME = 15
    VARIABLE = 16
    INT = 17
    FLOAT = 18
    STRING = 19


def get_token_desc(token):
    # type: (Token) -> str
    """Describe a token for messages: its kind, plus its value in double
    quotes when the value is truthy."""
    if token.value:
        return u'{} "{}"'.format(get_token_kind_desc(token.kind), token.value)
    else:
        return get_token_kind_desc(token.kind)


def get_token_kind_desc(kind):
    # type: (int) -> str
    """Return the human-readable description of a ``TokenKind`` constant.

    Raises ``KeyError`` for a kind that has no entry in ``TOKEN_DESCRIPTION``.
    """
    return TOKEN_DESCRIPTION[kind]


TOKEN_DESCRIPTION = {
    TokenKind.EOF: "EOF",
    TokenKind.BANG: "!",
    TokenKind.DOLLAR: "$",
    TokenKind.PAREN_L: "(",
    TokenKind.PAREN_R: ")",
    TokenKind.SPREAD: "...",
    TokenKind.COLON: ":",
    TokenKind.EQUALS: "=",
    TokenKind.AT: "@",
    TokenKind.BRACKET_L: "[",
    TokenKind.BRACKET_R: "]",
    TokenKind.BRACE_L: "{",
    TokenKind.PIPE: "|",
    TokenKind.BRACE_R: "}",
    TokenKind.NAME: "Name",
    TokenKind.VARIABLE: "Variable",
    TokenKind.INT: "Int",
    TokenKind.FLOAT: "Float",
    TokenKind.STRING: "String",
}


def char_code_at(s, pos):
    # type: (str, int) -> Optional[int]
    """Return the code point of ``s[pos]``, or ``None`` when ``pos`` is
    outside the string (negative positions are treated as out of range)."""
    if 0 <= pos < len(s):
        return ord(s[pos])

    return None


# Single-character punctuators, keyed by character code.
PUNCT_CODE_TO_KIND = {
    ord("!"): TokenKind.BANG,
    ord("$"): TokenKind.DOLLAR,
    ord("("): TokenKind.PAREN_L,
    ord(")"): TokenKind.PAREN_R,
    ord(":"): TokenKind.COLON,
    ord("="): TokenKind.EQUALS,
    ord("@"): TokenKind.AT,
    ord("["): TokenKind.BRACKET_L,
    ord("]"): TokenKind.BRACKET_R,
    ord("{"): TokenKind.BRACE_L,
    ord("|"): TokenKind.PIPE,
    ord("}"): TokenKind.BRACE_R,
}


def print_char_code(code):
    # type: (Optional[int]) -> str
    """Render a character code for use in an error message.

    ``None`` (end of input) becomes ``<EOF>``, codes below 0x7F are rendered
    as a JSON string literal, and everything else as a quoted ``\\uXXXX``
    escape.
    """
    if code is None:
        return "<EOF>"

    if code < 0x007F:
        return json.dumps(unichr(code))

    return '"\\u%04X"' % code


def read_token(source, from_position):
    # type: (Source, int) -> Token
    """Gets the next token from the source starting at the given position.

    This skips over whitespace and comments until it finds the next lexable
    token, then lexes punctuators immediately or calls the appropriate
    helper function for more complicated tokens.

    Raises ``GraphQLSyntaxError`` on a disallowed control character or on a
    character that cannot start any token.
    """
    body = source.body
    body_length = len(body)

    position = position_after_whitespace(body, from_position)

    if position >= body_length:
        return Token(TokenKind.EOF, position, position)

    code = char_code_at(body, position)
    # Compare against None rather than relying on truthiness: a NUL character
    # has code 0 and must still be rejected by the control-character check.
    if code is not None:
        if code < 0x0020 and code not in (0x0009, 0x000A, 0x000D):
            raise GraphQLSyntaxError(
                source, position, u"Invalid character {}.".format(print_char_code(code))
            )

        kind = PUNCT_CODE_TO_KIND.get(code)
        if kind is not None:
            return Token(kind, position, position + 1)

        if code == 46:  # .
            if (
                char_code_at(body, position + 1)
                == char_code_at(body, position + 2)
                == 46
            ):
                return Token(TokenKind.SPREAD, position, position + 3)

        elif 65 <= code <= 90 or code == 95 or 97 <= code <= 122:
            # A-Z, _, a-z
            return read_name(source, position)

        elif code == 45 or 48 <= code <= 57:  # -, 0-9
            return read_number(source, position, code)

        elif code == 34:  # "
            return read_string(source, position)

    raise GraphQLSyntaxError(
        source, position, u"Unexpected character {}.".format(print_char_code(code))
    )


ignored_whitespace_characters = frozenset(
    [
        # BOM
        0xFEFF,
        # White Space
        0x0009,  # tab
        0x0020,  # space
        # Line Terminator
        0x000A,  # new line
        0x000D,  # carriage return
        # Comma
        0x002C,
    ]
)


def position_after_whitespace(body, start_position):
    # type: (str, int) -> int
    """Reads from body starting at start_position until it finds a
    non-whitespace or commented character, then returns the position of
    that character for lexing."""
    body_length = len(body)
    position = start_position
    while position < body_length:
        code = char_code_at(body, position)
        if code in ignored_whitespace_characters:
            position += 1

        elif code == 35:  # #, skip comments
            position += 1
            # A comment runs until a line terminator or a control character
            # other than tab; that character is left for the outer loop.
            while position < body_length:
                code = char_code_at(body, position)
                if not (
                    code is not None
                    and (code > 0x001F or code == 0x0009)
                    and code not in (0x000A, 0x000D)
                ):
                    break

                position += 1
        else:
            break
    return position


def read_number(source, start, first_code):
    # type: (Source, int, Optional[int]) -> Token
    r"""Reads a number token from the source file, either a float
    or an int depending on whether a decimal point appears.

    Int:   -?(0|[1-9][0-9]*)
    Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?

    ``first_code`` is the character code at ``start``. Raises
    ``GraphQLSyntaxError`` when a digit is required but missing, or when a
    leading zero is followed by another digit.
    """
    code = first_code
    body = source.body
    position = start
    is_float = False

    if code == 45:  # -
        position += 1
        code = char_code_at(body, position)

    if code == 48:  # 0
        position += 1
        code = char_code_at(body, position)

        if code is not None and 48 <= code <= 57:
            raise GraphQLSyntaxError(
                source,
                position,
                u"Invalid number, unexpected digit after 0: {}.".format(
                    print_char_code(code)
                ),
            )
    else:
        position = read_digits(source, position, code)
        code = char_code_at(body, position)

    if code == 46:  # .
        is_float = True

        position += 1
        code = char_code_at(body, position)
        position = read_digits(source, position, code)
        code = char_code_at(body, position)

    if code in (69, 101):  # E e
        is_float = True
        position += 1
        code = char_code_at(body, position)
        if code in (43, 45):  # + -
            position += 1
            code = char_code_at(body, position)

        position = read_digits(source, position, code)

    return Token(
        TokenKind.FLOAT if is_float else TokenKind.INT,
        start,
        position,
        body[start:position],
    )


def read_digits(source, start, first_code):
    # type: (Source, int, Optional[int]) -> int
    """Consume a run of one or more digits beginning at ``start`` and return
    the position just past the last digit.

    ``first_code`` is the character code at ``start``. Raises
    ``GraphQLSyntaxError`` if that character is not a digit.
    """
    body = source.body
    position = start
    code = first_code

    if code is not None and 48 <= code <= 57:  # 0 - 9
        while True:
            position += 1
            code = char_code_at(body, position)

            if not (code is not None and 48 <= code <= 57):
                break

        return position

    raise GraphQLSyntaxError(
        source,
        position,
        u"Invalid number, expected digit but got: {}.".format(print_char_code(code)),
    )


# Maps the character code following a backslash to the character it denotes.
ESCAPED_CHAR_CODES = {
    34: '"',
    47: "/",
    92: "\\",
    98: "\b",
    102: "\f",
    110: "\n",
    114: "\r",
    116: "\t",
}


def read_string(source, start):
    # type: (Source, int) -> Token
    r"""Reads a string token from the source file.

    "([^"\\\u000A\u000D\u2028\u2029]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"

    ``start`` is the position of the opening quote. The returned token spans
    both quotes and its value has escape sequences decoded. Raises
    ``GraphQLSyntaxError`` for an unterminated string, a disallowed control
    character, or an invalid escape sequence.
    """
    body = source.body
    body_length = len(body)

    position = start + 1
    chunk_start = position
    value = []  # type: List[str]
    append = value.append

    while position < body_length:
        code = char_code_at(body, position)
        if code in (
            None,
            # LineTerminator
            0x000A,
            0x000D,
            # Quote
            34,
        ):
            break

        if code < 0x0020 and code != 0x0009:  # type: ignore
            raise GraphQLSyntaxError(
                source,
                position,
                u"Invalid character within String: {}.".format(print_char_code(code)),
            )

        position += 1
        if code == 92:  # \
            # Flush the literal text preceding the backslash.
            append(body[chunk_start : position - 1])

            code = char_code_at(body, position)
            if code is None:
                # Backslash was the last character of the input: there is no
                # escape character to report and the string cannot be closed.
                raise GraphQLSyntaxError(source, position, "Unterminated string")

            escaped = ESCAPED_CHAR_CODES.get(code)
            if escaped is not None:
                append(escaped)

            elif code == 117:  # u
                # Missing characters map to 0, which is not a hex digit and
                # therefore makes uni_char_code return a negative number.
                char_code = uni_char_code(
                    char_code_at(body, position + 1) or 0,
                    char_code_at(body, position + 2) or 0,
                    char_code_at(body, position + 3) or 0,
                    char_code_at(body, position + 4) or 0,
                )

                if char_code < 0:
                    raise GraphQLSyntaxError(
                        source,
                        position,
                        u"Invalid character escape sequence: \\u{}.".format(
                            body[position + 1 : position + 5]
                        ),
                    )

                append(unichr(char_code))
                position += 4
            else:
                raise GraphQLSyntaxError(
                    source,
                    position,
                    u"Invalid character escape sequence: \\{}.".format(unichr(code)),
                )

            position += 1
            chunk_start = position

    # Inspect the character at the final position instead of the last code
    # read inside the loop: after an escaped quote at the very end of the
    # input that stale code would be 34 and hide the missing closing quote.
    if char_code_at(body, position) != 34:  # Quote (")
        raise GraphQLSyntaxError(source, position, "Unterminated string")

    append(body[chunk_start:position])
    return Token(TokenKind.STRING, start, position + 1, u"".join(value))


def uni_char_code(a, b, c, d):
    # type: (int, int, int, int) -> int
    """Converts the character codes of four hexadecimal digits to the integer
    that the four-digit string represents. For example, the codes of
    '0', '0', '0', 'f' give 15 and the codes of '0', '0', 'f', 'f' give 255.

    Returns a negative number on error, if a char was invalid.

    This is implemented by noting that char2hex() returns -1 on error,
    which means the result of ORing the char2hex() will also be negative.
    """
    return char2hex(a) << 12 | char2hex(b) << 8 | char2hex(c) << 4 | char2hex(d)


def char2hex(a):
    # type: (int) -> int
    """Converts the character code of a hex digit to its integer value.
    '0' becomes 0, '9' becomes 9
    'A' becomes 10, 'F' becomes 15
    'a' becomes 10, 'f' becomes 15

    Returns -1 on error."""
    if 48 <= a <= 57:  # 0-9
        return a - 48
    elif 65 <= a <= 70:  # A-F
        return a - 55
    elif 97 <= a <= 102:  # a-f
        return a - 87
    return -1


def read_name(source, position):
    # type: (Source, int) -> Token
    """Reads an alphanumeric + underscore name from the source.

    [_A-Za-z][_0-9A-Za-z]*

    The character at ``position`` is taken to be a valid name start; only
    the characters after it are examined.
    """
    body = source.body
    body_length = len(body)
    end = position + 1

    while end != body_length:
        code = char_code_at(body, end)
        if not (
            code is not None
            and (
                code == 95  # _
                or 48 <= code <= 57  # 0-9
                or 65 <= code <= 90  # A-Z
                or 97 <= code <= 122  # a-z
            )
        ):
            break

        end += 1

    return Token(TokenKind.NAME, position, end, body[position:end])