1import json
2
3from six import unichr
4
5from ..error import GraphQLSyntaxError
6
7# Necessary for static type checking
8if False:  # flake8: noqa
9    from typing import Optional, Any, List
10    from .source import Source
11
12__all__ = ["Token", "Lexer", "TokenKind", "get_token_desc", "get_token_kind_desc"]
13
14
class Token(object):
    """A single lexical token: a kind plus the source span it covers.

    Instances use ``__slots__`` and hold exactly four attributes:

    * ``kind``  -- integer token kind (see ``TokenKind``).
    * ``start`` -- offset of the first character of the token.
    * ``end``   -- offset one past the last character of the token.
    * ``value`` -- optional text carried by the token, ``None`` by default.
    """

    __slots__ = "kind", "start", "end", "value"

    def __init__(self, kind, start, end, value=None):
        # type: (int, int, int, Optional[str]) -> None
        self.kind = kind
        self.start = start
        self.end = end
        self.value = value

    def __repr__(self):
        # type: () -> str
        return u"<Token kind={} at {}..{} value={}>".format(
            get_token_kind_desc(self.kind), self.start, self.end, repr(self.value)
        )

    def __eq__(self, other):
        # type: (Any) -> bool
        """Tokens are equal when kind, start, end and value all match."""
        return (
            isinstance(other, Token)
            and self.kind == other.kind
            and self.start == other.start
            and self.end == other.end
            and self.value == other.value
        )

    def __ne__(self, other):
        # type: (Any) -> bool
        """Inverse of ``__eq__``.

        Python 2 does not derive ``!=`` from ``__eq__``; without this method
        two equal tokens would still compare as "not equal" there.
        """
        return not self.__eq__(other)
40
41
class Lexer(object):
    """Produces tokens from a source one at a time.

    ``prev_position`` holds the end offset of the most recently returned
    token (0 before any token has been read).
    """

    __slots__ = "source", "prev_position"

    def __init__(self, source):
        # type: (Source) -> None
        self.prev_position = 0
        self.source = source

    def next_token(self, reset_position=None):
        # type: (Optional[int]) -> Token
        """Read and return the next token.

        Lexing starts at ``reset_position`` when one is given, otherwise at
        the end of the previously returned token. The end of the new token
        is remembered for the following call.
        """
        start = self.prev_position if reset_position is None else reset_position
        next_tok = read_token(self.source, start)
        self.prev_position = next_tok.end
        return next_tok
57
58
class TokenKind(object):
    """Integer constants identifying each kind of token."""

    # End of input.
    EOF = 1

    # Punctuators.
    BANG = 2  # !
    DOLLAR = 3  # $
    PAREN_L = 4  # (
    PAREN_R = 5  # )
    SPREAD = 6  # ...
    COLON = 7  # :
    EQUALS = 8  # =
    AT = 9  # @
    BRACKET_L = 10  # [
    BRACKET_R = 11  # ]
    BRACE_L = 12  # {
    PIPE = 13  # |
    BRACE_R = 14  # }

    # Tokens that carry text.
    NAME = 15
    VARIABLE = 16
    INT = 17
    FLOAT = 18
    STRING = 19
79
80
def get_token_desc(token):
    # type: (Token) -> str
    """Describe a token: its kind, followed by its value in double quotes.

    The quoted value is only included when ``token.value`` is truthy, so a
    token whose value is ``None`` or empty is described by its kind alone.
    """
    kind_desc = get_token_kind_desc(token.kind)
    if not token.value:
        return kind_desc
    return u'{} "{}"'.format(kind_desc, token.value)
87
88
def get_token_kind_desc(kind):
    # type: (int) -> str
    """Return the display text registered for ``kind`` in TOKEN_DESCRIPTION.

    Raises KeyError when ``kind`` has no entry in that table.
    """
    description = TOKEN_DESCRIPTION[kind]
    return description
92
93
# Display text for every token kind, as used when describing tokens.
TOKEN_DESCRIPTION = {
    # End of input
    TokenKind.EOF: "EOF",
    # Punctuators are described by their own lexeme
    TokenKind.BANG: "!",
    TokenKind.DOLLAR: "$",
    TokenKind.PAREN_L: "(",
    TokenKind.PAREN_R: ")",
    TokenKind.SPREAD: "...",
    TokenKind.COLON: ":",
    TokenKind.EQUALS: "=",
    TokenKind.AT: "@",
    TokenKind.BRACKET_L: "[",
    TokenKind.BRACKET_R: "]",
    TokenKind.BRACE_L: "{",
    TokenKind.PIPE: "|",
    TokenKind.BRACE_R: "}",
    # Remaining kinds are described by a capitalised name
    TokenKind.NAME: "Name",
    TokenKind.VARIABLE: "Variable",
    TokenKind.INT: "Int",
    TokenKind.FLOAT: "Float",
    TokenKind.STRING: "String",
}
115
116
def char_code_at(s, pos):
    # type: (str, int) -> Optional[int]
    """Return the code point of ``s[pos]``, or None when ``pos`` is out of range.

    Negative positions count as out of range; they do not index from the end.
    """
    if pos < 0 or pos >= len(s):
        return None
    return ord(s[pos])
123
124
# Maps the character code of each single-character punctuator to its token kind.
PUNCT_CODE_TO_KIND = {
    ord(punctuator): token_kind
    for punctuator, token_kind in (
        ("!", TokenKind.BANG),
        ("$", TokenKind.DOLLAR),
        ("(", TokenKind.PAREN_L),
        (")", TokenKind.PAREN_R),
        (":", TokenKind.COLON),
        ("=", TokenKind.EQUALS),
        ("@", TokenKind.AT),
        ("[", TokenKind.BRACKET_L),
        ("]", TokenKind.BRACKET_R),
        ("{", TokenKind.BRACE_L),
        ("|", TokenKind.PIPE),
        ("}", TokenKind.BRACE_R),
    )
}
139
140
def print_char_code(code):
    # type: (Optional[int]) -> str
    """Render a character code for use in an error message.

    * ``None`` is shown as ``<EOF>``.
    * Codes below 0x7F are shown as the JSON encoding of that character.
    * Everything else is shown as a quoted ``\\uXXXX`` escape (upper-case
      hex, zero-padded to at least four digits).
    """
    if code is None:
        return "<EOF>"

    if code >= 0x007F:
        return '"\\u%04X"' % code

    return json.dumps(unichr(code))
150
151
def read_token(source, from_position):
    # type: (Source, int) -> Token
    """Gets the next token from the source starting at the given position.

    This skips over whitespace and comments until it finds the next lexable
    token, then lexes punctuators immediately or calls the appropriate
    helper function for more complicated tokens.

    Returns an EOF token when nothing but ignored characters remain.

    Raises GraphQLSyntaxError for a control character other than tab, line
    feed or carriage return ("Invalid character"), and for any character
    that cannot start a token ("Unexpected character")."""
    body = source.body
    body_length = len(body)

    position = position_after_whitespace(body, from_position)

    if position >= body_length:
        return Token(TokenKind.EOF, position, position)

    code = char_code_at(body, position)
    # Test against None explicitly: code 0 (NUL) is falsy, yet it is a real
    # control character and must reach the "Invalid character" check below.
    if code is not None:
        if code < 0x0020 and code not in (0x0009, 0x000A, 0x000D):
            raise GraphQLSyntaxError(
                source, position, u"Invalid character {}.".format(print_char_code(code))
            )

        kind = PUNCT_CODE_TO_KIND.get(code)
        if kind is not None:
            return Token(kind, position, position + 1)

        if code == 46:  # .
            # Only "..." is a token; a lone "." falls through to the error.
            if (
                char_code_at(body, position + 1)
                == char_code_at(body, position + 2)
                == 46
            ):
                return Token(TokenKind.SPREAD, position, position + 3)

        elif 65 <= code <= 90 or code == 95 or 97 <= code <= 122:
            # A-Z, _, a-z
            return read_name(source, position)

        elif code == 45 or 48 <= code <= 57:  # -, 0-9
            return read_number(source, position, code)

        elif code == 34:  # "
            return read_string(source, position)

    raise GraphQLSyntaxError(
        source, position, u"Unexpected character {}.".format(print_char_code(code))
    )
199
200
# Character codes that are skipped between tokens.
ignored_whitespace_characters = frozenset(
    (
        0xFEFF,  # byte order mark
        0x0009,  # white space: tab
        0x0020,  # white space: space
        0x000A,  # line terminator: new line
        0x000D,  # line terminator: carriage return
        0x002C,  # comma
    )
)
215
216
def position_after_whitespace(body, start_position):
    # type: (str, int) -> int
    """Return the position of the first character at or after
    ``start_position`` that is neither ignored whitespace nor part of a
    ``#`` comment, or a position at/after the end of ``body`` when there is
    no such character."""
    end = len(body)
    cursor = start_position
    while cursor < end:
        current = char_code_at(body, cursor)
        if current in ignored_whitespace_characters:
            cursor += 1
            continue

        if current != 35:  # anything other than '#' is left for the lexer
            break

        # '#' opens a comment: consume it and everything up to (but not
        # including) the next line terminator or disallowed control character.
        cursor += 1
        while cursor < end:
            current = char_code_at(body, cursor)
            if current is None:
                break
            if current in (0x000A, 0x000D):
                break
            if current <= 0x001F and current != 0x0009:
                break
            cursor += 1

    return cursor
244
245
def read_number(source, start, first_code):
    # type: (Source, int, Optional[int]) -> Token
    r"""Lex an Int or Float token beginning at ``start``.

    ``first_code`` is the character code found at ``start``. The token is a
    Float when a fractional part or an exponent is present, otherwise an Int.

    Int:   -?(0|[1-9][0-9]*)
    Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?

    Raises GraphQLSyntaxError when a digit follows a leading zero, or when
    a digit is required but missing."""
    body = source.body
    cursor = start
    current = first_code
    has_fraction_or_exponent = False

    # Optional sign.
    if current == 45:  # -
        cursor += 1
        current = char_code_at(body, cursor)

    # Integer part: a lone zero, or a run of digits.
    if current != 48:
        cursor = read_digits(source, cursor, current)
        current = char_code_at(body, cursor)
    else:  # 0
        cursor += 1
        current = char_code_at(body, cursor)

        if current is not None and 48 <= current <= 57:
            raise GraphQLSyntaxError(
                source,
                cursor,
                u"Invalid number, unexpected digit after 0: {}.".format(
                    print_char_code(current)
                ),
            )

    # Optional fractional part.
    if current == 46:  # .
        has_fraction_or_exponent = True
        cursor += 1
        cursor = read_digits(source, cursor, char_code_at(body, cursor))
        current = char_code_at(body, cursor)

    # Optional exponent, itself optionally signed.
    if current in (69, 101):  # E e
        has_fraction_or_exponent = True
        cursor += 1
        current = char_code_at(body, cursor)
        if current in (43, 45):  # + -
            cursor += 1
            current = char_code_at(body, cursor)

        cursor = read_digits(source, cursor, current)

    kind = TokenKind.FLOAT if has_fraction_or_exponent else TokenKind.INT
    return Token(kind, start, cursor, body[start:cursor])
302
303
def read_digits(source, start, first_code):
    # type: (Source, int, Optional[int]) -> int
    """Consume a run of one or more decimal digits beginning at ``start``.

    ``first_code`` is the character code found at ``start``. Returns the
    position of the first character after the run. Raises
    GraphQLSyntaxError when ``first_code`` is not a digit."""
    body = source.body

    if first_code is None or not 48 <= first_code <= 57:  # not 0 - 9
        raise GraphQLSyntaxError(
            source,
            start,
            u"Invalid number, expected digit but got: {}.".format(
                print_char_code(first_code)
            ),
        )

    cursor = start + 1
    next_code = char_code_at(body, cursor)
    while next_code is not None and 48 <= next_code <= 57:
        cursor += 1
        next_code = char_code_at(body, cursor)

    return cursor
325
326
# Maps the character code that follows a backslash in a string to the
# character that the escape sequence stands for.
ESCAPED_CHAR_CODES = {
    ord('"'): '"',
    ord("/"): "/",
    ord("\\"): "\\",
    ord("b"): "\b",
    ord("f"): "\f",
    ord("n"): "\n",
    ord("r"): "\r",
    ord("t"): "\t",
}
337
338
def read_string(source, start):
    # type: (Source, int) -> Token
    """Reads a string token from the source file.

    "([^"\\\u000A\u000D\u2028\u2029]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"

    ``start`` is the position of the opening quote. The returned token spans
    both quotes and its value is the string content with escape sequences
    decoded.

    Raises GraphQLSyntaxError for a disallowed control character inside the
    string, an invalid escape sequence, or a missing closing quote.
    """
    body = source.body
    body_length = len(body)

    position = start + 1
    chunk_start = position
    value = []  # type: List[str]
    append = value.append

    while position < body_length:
        code = char_code_at(body, position)
        if code in (
            None,
            # LineTerminator
            0x000A,
            0x000D,
            # Quote
            34,
        ):
            break

        if code < 0x0020 and code != 0x0009:  # type: ignore
            raise GraphQLSyntaxError(
                source,
                position,
                u"Invalid character within String: {}.".format(print_char_code(code)),
            )

        position += 1
        if code == 92:  # backslash
            # Flush the literal text collected before the backslash.
            append(body[chunk_start : position - 1])

            code = char_code_at(body, position)
            if code is None:
                # The backslash was the last character of the input, so there
                # is no escape character to decode or to report.
                raise GraphQLSyntaxError(source, position, "Unterminated string")

            escaped = ESCAPED_CHAR_CODES.get(code)
            if escaped is not None:
                append(escaped)

            elif code == 117:  # u
                # Missing characters become 0, which is not a hex digit and
                # therefore yields a negative result below.
                char_code = uni_char_code(
                    char_code_at(body, position + 1) or 0,
                    char_code_at(body, position + 2) or 0,
                    char_code_at(body, position + 3) or 0,
                    char_code_at(body, position + 4) or 0,
                )

                if char_code < 0:
                    raise GraphQLSyntaxError(
                        source,
                        position,
                        u"Invalid character escape sequence: \\u{}.".format(
                            body[position + 1 : position + 5]
                        ),
                    )

                append(unichr(char_code))
                position += 4
            else:
                raise GraphQLSyntaxError(
                    source,
                    position,
                    u"Invalid character escape sequence: \\{}.".format(unichr(code)),
                )

            position += 1
            chunk_start = position

    # Look at the character actually present at the stopping position. The
    # loop variable cannot be trusted here: after an escaped quote at the very
    # end of the input it still holds the quote's code even though no closing
    # quote exists.
    if char_code_at(body, position) != 34:  # Quote (")
        raise GraphQLSyntaxError(source, position, "Unterminated string")

    append(body[chunk_start:position])
    return Token(TokenKind.STRING, start, position + 1, u"".join(value))
418
419
def uni_char_code(a, b, c, d):
    # type: (int, int, int, int) -> int
    """Combine the character codes of four hexadecimal digits into the
    integer they spell, most significant digit first.

    For example the codes of '0', '0', '0', 'f' give 15 and the codes of
    '0', '0', 'f', 'f' give 255.

    Returns a negative number when any of the four is not a hex digit:
    char2hex() returns -1 for such a code, and OR-ing a negative number
    into the result keeps the result negative.
    """
    combined = char2hex(a) << 12
    combined |= char2hex(b) << 8
    combined |= char2hex(c) << 4
    combined |= char2hex(d)
    return combined
432
433
def char2hex(a):
    # type: (int) -> int
    """Return the numeric value (0-15) of the hex digit whose character
    code is ``a``, or -1 when ``a`` is not the code of a hex digit.

    Accepts '0'-'9', 'A'-'F' and 'a'-'f'."""
    # (lowest code, highest code, amount to subtract) for each digit range
    digit_ranges = (
        (48, 57, 48),  # 0-9
        (65, 70, 55),  # A-F
        (97, 102, 87),  # a-f
    )
    for lowest, highest, offset in digit_ranges:
        if lowest <= a <= highest:
            return a - offset
    return -1
449
450
def read_name(source, position):
    # type: (Source, int) -> Token
    """Lex a Name token whose first character is at ``position``.

    The first character is taken as given; the token then extends over
    every following underscore, digit or ASCII letter.

    [_A-Za-z][_0-9A-Za-z]*"""
    body = source.body
    body_length = len(body)
    end = position + 1

    while end != body_length:
        code = char_code_at(body, end)
        if code is None:
            break

        is_name_char = (
            code == 95  # _
            or 48 <= code <= 57  # 0-9
            or 65 <= code <= 90  # A-Z
            or 97 <= code <= 122  # a-z
        )
        if not is_name_char:
            break

        end += 1

    return Token(TokenKind.NAME, position, end, body[position:end])
476