1<?php
2namespace JmesPath;
3
4/**
5 * Tokenizes JMESPath expressions
6 */
class Lexer
{
    // Token types emitted in the 'type' entry of each token.
    const T_DOT = 'dot';
    const T_STAR = 'star';
    const T_COMMA = 'comma';
    const T_COLON = 'colon';
    const T_CURRENT = 'current';
    const T_EXPREF = 'expref';
    const T_LPAREN = 'lparen';
    const T_RPAREN = 'rparen';
    const T_LBRACE = 'lbrace';
    const T_RBRACE = 'rbrace';
    const T_LBRACKET = 'lbracket';
    const T_RBRACKET = 'rbracket';
    const T_FLATTEN = 'flatten';
    const T_IDENTIFIER = 'identifier';
    const T_NUMBER = 'number';
    const T_QUOTED_IDENTIFIER = 'quoted_identifier';
    const T_UNKNOWN = 'unknown';
    const T_PIPE = 'pipe';
    const T_OR = 'or';
    const T_AND = 'and';
    const T_NOT = 'not';
    const T_FILTER = 'filter';
    const T_LITERAL = 'literal';
    const T_EOF = 'eof';
    const T_COMPARATOR = 'comparator';

    // Lexer states, selected by the first character of a token.
    const STATE_IDENTIFIER = 0;
    const STATE_NUMBER = 1;
    const STATE_SINGLE_CHAR = 2;
    const STATE_WHITESPACE = 3;
    const STATE_STRING_LITERAL = 4;
    const STATE_QUOTED_STRING = 5;
    const STATE_JSON_LITERAL = 6;
    const STATE_LBRACKET = 7;
    const STATE_PIPE = 8;
    const STATE_LT = 9;
    const STATE_GT = 10;
    const STATE_EQ = 11;
    const STATE_NOT = 12;
    const STATE_AND = 13;

    /** @var array Maps the first character of a token to the state that consumes it */
    private static $transitionTable = [
        '<'  => self::STATE_LT,
        '>'  => self::STATE_GT,
        '='  => self::STATE_EQ,
        '!'  => self::STATE_NOT,
        '['  => self::STATE_LBRACKET,
        '|'  => self::STATE_PIPE,
        '&'  => self::STATE_AND,
        '`'  => self::STATE_JSON_LITERAL,
        '"'  => self::STATE_QUOTED_STRING,
        "'"  => self::STATE_STRING_LITERAL,
        '-'  => self::STATE_NUMBER,
        '0'  => self::STATE_NUMBER,
        '1'  => self::STATE_NUMBER,
        '2'  => self::STATE_NUMBER,
        '3'  => self::STATE_NUMBER,
        '4'  => self::STATE_NUMBER,
        '5'  => self::STATE_NUMBER,
        '6'  => self::STATE_NUMBER,
        '7'  => self::STATE_NUMBER,
        '8'  => self::STATE_NUMBER,
        '9'  => self::STATE_NUMBER,
        ' '  => self::STATE_WHITESPACE,
        "\t" => self::STATE_WHITESPACE,
        "\n" => self::STATE_WHITESPACE,
        "\r" => self::STATE_WHITESPACE,
        '.'  => self::STATE_SINGLE_CHAR,
        '*'  => self::STATE_SINGLE_CHAR,
        ']'  => self::STATE_SINGLE_CHAR,
        ','  => self::STATE_SINGLE_CHAR,
        ':'  => self::STATE_SINGLE_CHAR,
        '@'  => self::STATE_SINGLE_CHAR,
        '('  => self::STATE_SINGLE_CHAR,
        ')'  => self::STATE_SINGLE_CHAR,
        '{'  => self::STATE_SINGLE_CHAR,
        '}'  => self::STATE_SINGLE_CHAR,
        '_'  => self::STATE_IDENTIFIER,
        'A'  => self::STATE_IDENTIFIER,
        'B'  => self::STATE_IDENTIFIER,
        'C'  => self::STATE_IDENTIFIER,
        'D'  => self::STATE_IDENTIFIER,
        'E'  => self::STATE_IDENTIFIER,
        'F'  => self::STATE_IDENTIFIER,
        'G'  => self::STATE_IDENTIFIER,
        'H'  => self::STATE_IDENTIFIER,
        'I'  => self::STATE_IDENTIFIER,
        'J'  => self::STATE_IDENTIFIER,
        'K'  => self::STATE_IDENTIFIER,
        'L'  => self::STATE_IDENTIFIER,
        'M'  => self::STATE_IDENTIFIER,
        'N'  => self::STATE_IDENTIFIER,
        'O'  => self::STATE_IDENTIFIER,
        'P'  => self::STATE_IDENTIFIER,
        'Q'  => self::STATE_IDENTIFIER,
        'R'  => self::STATE_IDENTIFIER,
        'S'  => self::STATE_IDENTIFIER,
        'T'  => self::STATE_IDENTIFIER,
        'U'  => self::STATE_IDENTIFIER,
        'V'  => self::STATE_IDENTIFIER,
        'W'  => self::STATE_IDENTIFIER,
        'X'  => self::STATE_IDENTIFIER,
        'Y'  => self::STATE_IDENTIFIER,
        'Z'  => self::STATE_IDENTIFIER,
        'a'  => self::STATE_IDENTIFIER,
        'b'  => self::STATE_IDENTIFIER,
        'c'  => self::STATE_IDENTIFIER,
        'd'  => self::STATE_IDENTIFIER,
        'e'  => self::STATE_IDENTIFIER,
        'f'  => self::STATE_IDENTIFIER,
        'g'  => self::STATE_IDENTIFIER,
        'h'  => self::STATE_IDENTIFIER,
        'i'  => self::STATE_IDENTIFIER,
        'j'  => self::STATE_IDENTIFIER,
        'k'  => self::STATE_IDENTIFIER,
        'l'  => self::STATE_IDENTIFIER,
        'm'  => self::STATE_IDENTIFIER,
        'n'  => self::STATE_IDENTIFIER,
        'o'  => self::STATE_IDENTIFIER,
        'p'  => self::STATE_IDENTIFIER,
        'q'  => self::STATE_IDENTIFIER,
        'r'  => self::STATE_IDENTIFIER,
        's'  => self::STATE_IDENTIFIER,
        't'  => self::STATE_IDENTIFIER,
        'u'  => self::STATE_IDENTIFIER,
        'v'  => self::STATE_IDENTIFIER,
        'w'  => self::STATE_IDENTIFIER,
        'x'  => self::STATE_IDENTIFIER,
        'y'  => self::STATE_IDENTIFIER,
        'z'  => self::STATE_IDENTIFIER,
    ];

    /** @var array Valid identifier characters after first character */
    private $validIdentifier = [
        'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
        'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
        'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
        'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
        'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
        'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
        'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
        'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
        'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
        't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
        'y' => true, 'z' => true, '_' => true, '0' => true, '1' => true,
        '2' => true, '3' => true, '4' => true, '5' => true, '6' => true,
        '7' => true, '8' => true, '9' => true,
    ];

    /** @var array Valid number characters after the first character */
    private $numbers = [
        '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
        '5' => true, '6' => true, '7' => true, '8' => true, '9' => true
    ];

    /** @var array Map of simple single character tokens */
    private $simpleTokens = [
        '.' => self::T_DOT,
        '*' => self::T_STAR,
        ']' => self::T_RBRACKET,
        ',' => self::T_COMMA,
        ':' => self::T_COLON,
        '@' => self::T_CURRENT,
        '(' => self::T_LPAREN,
        ')' => self::T_RPAREN,
        '{' => self::T_LBRACE,
        '}' => self::T_RBRACE,
    ];

    /**
     * Tokenize the JMESPath expression into an array of token hashes that
     * contain a 'type', 'value', and 'pos'.
     *
     * The input is split with str_split(), so 'pos' is the offset of the
     * token's first character within that split. Input the lexer does not
     * recognize is reported as a token of type "unknown" rather than by
     * throwing. The returned list always ends with an "eof" token.
     *
     * @param string $input JMESPath input
     *
     * @return array List of tokens in input order, terminated by an EOF token.
     */
    public function tokenize($input)
    {
        $tokens = [];

        // Before PHP 8.2 str_split('') returns [''], which would be lexed as
        // a bogus "unknown" token, so empty input gets an empty char list.
        $chars = $input === '' ? [] : str_split($input);

        // The array's internal pointer is the read cursor; the helper methods
        // receive $chars by reference and advance the same cursor.
        while (false !== ($current = current($chars))) {

            // Every character must be in the transition character table.
            if (!isset(self::$transitionTable[$current])) {
                $tokens[] = [
                    'type'  => self::T_UNKNOWN,
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
                continue;
            }

            $state = self::$transitionTable[$current];

            if ($state === self::STATE_SINGLE_CHAR) {

                // Consume simple tokens like ".", ",", "@", etc.
                $tokens[] = [
                    'type'  => $this->simpleTokens[$current],
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);

            } elseif ($state === self::STATE_IDENTIFIER) {

                // Consume identifiers
                $start = key($chars);
                $buffer = '';
                do {
                    $buffer .= $current;
                    $current = next($chars);
                } while ($current !== false && isset($this->validIdentifier[$current]));
                $tokens[] = [
                    'type'  => self::T_IDENTIFIER,
                    'value' => $buffer,
                    'pos'   => $start
                ];

            } elseif ($state === self::STATE_WHITESPACE) {

                // Skip whitespace
                next($chars);

            } elseif ($state === self::STATE_LBRACKET) {

                // Consume "[", "[?", and "[]"
                $position = key($chars);
                $actual = next($chars);
                if ($actual === ']') {
                    next($chars);
                    $tokens[] = [
                        'type'  => self::T_FLATTEN,
                        'pos'   => $position,
                        'value' => '[]'
                    ];
                } elseif ($actual === '?') {
                    next($chars);
                    $tokens[] = [
                        'type'  => self::T_FILTER,
                        'pos'   => $position,
                        'value' => '[?'
                    ];
                } else {
                    // Plain "[": the character after it is left for the next
                    // loop iteration.
                    $tokens[] = [
                        'type'  => self::T_LBRACKET,
                        'pos'   => $position,
                        'value' => '['
                    ];
                }

            } elseif ($state === self::STATE_STRING_LITERAL) {

                // Consume raw string literals; only the escaped delimiter is
                // unescaped, the rest of the text is taken verbatim.
                $t = $this->inside($chars, "'", self::T_LITERAL);
                $t['value'] = str_replace("\\'", "'", $t['value']);
                $tokens[] = $t;

            } elseif ($state === self::STATE_PIPE) {

                // Consume pipe and OR
                $tokens[] = $this->matchOr($chars, '|', '|', self::T_OR, self::T_PIPE);

            } elseif ($state === self::STATE_JSON_LITERAL) {

                // Consume JSON literals. An unclosed literal comes back as
                // "unknown" and is passed through without JSON parsing.
                $token = $this->inside($chars, '`', self::T_LITERAL);
                if ($token['type'] === self::T_LITERAL) {
                    $token['value'] = str_replace('\\`', '`', $token['value']);
                    $token = $this->parseJson($token);
                }
                $tokens[] = $token;

            } elseif ($state === self::STATE_NUMBER) {

                // Consume numbers: a leading "-" or digit followed by digits.
                // NOTE(review): a lone "-" is emitted as the number 0 because
                // (int)'-' is 0 -- confirm whether it should be "unknown".
                $start = key($chars);
                $buffer = '';
                do {
                    $buffer .= $current;
                    $current = next($chars);
                } while ($current !== false && isset($this->numbers[$current]));
                $tokens[] = [
                    'type'  => self::T_NUMBER,
                    'value' => (int)$buffer,
                    'pos'   => $start
                ];

            } elseif ($state === self::STATE_QUOTED_STRING) {

                // Consume quoted identifiers. The double quotes are put back
                // so the text can be decoded as a JSON string.
                $token = $this->inside($chars, '"', self::T_QUOTED_IDENTIFIER);
                if ($token['type'] === self::T_QUOTED_IDENTIFIER) {
                    $token['value'] = '"' . $token['value'] . '"';
                    $token = $this->parseJson($token);
                }
                $tokens[] = $token;

            } elseif ($state === self::STATE_EQ) {

                // Consume equals; a single "=" is not a valid token.
                $tokens[] = $this->matchOr($chars, '=', '=', self::T_COMPARATOR, self::T_UNKNOWN);

            } elseif ($state === self::STATE_AND) {

                // Consume AND ("&&") and expression reference ("&")
                $tokens[] = $this->matchOr($chars, '&', '&', self::T_AND, self::T_EXPREF);

            } elseif ($state === self::STATE_NOT) {

                // Consume not equal ("!=") and NOT ("!")
                $tokens[] = $this->matchOr($chars, '!', '=', self::T_COMPARATOR, self::T_NOT);

            } else {

                // Only STATE_LT and STATE_GT remain: consume "<", "<=", ">"
                // and ">=", all of which are comparators.
                $tokens[] = $this->matchOr($chars, $current, '=', self::T_COMPARATOR, self::T_COMPARATOR);

            }
        }

        // NOTE(review): with mbstring this is a character count, while the
        // other positions are offsets into str_split() output; the two differ
        // for multi-byte input -- confirm which one consumers expect.
        // The strlen() fallback keeps the lexer usable without mbstring.
        $tokens[] = [
            'type'  => self::T_EOF,
            'pos'   => function_exists('mb_strlen') ? mb_strlen($input, 'UTF-8') : strlen($input),
            'value' => null
        ];

        return $tokens;
    }

    /**
     * Returns a token based on whether or not the character after the current
     * one matches the expected value. If it does, both characters are consumed
     * and a token of "$type" is returned. Otherwise, only the current
     * character is consumed and a token of "$orElse" type is returned.
     *
     * In both cases 'pos' is the offset of the current (first) character.
     *
     * @param array  $chars    Array of characters by reference.
     * @param string $current  The current character.
     * @param string $expected Expected character.
     * @param string $type     Expected result type.
     * @param string $orElse   Otherwise return a token of this type.
     *
     * @return array Returns a conditional token.
     */
    private function matchOr(array &$chars, $current, $expected, $type, $orElse)
    {
        // Capture the offset before moving the pointer: once the pointer runs
        // past the last element key() returns null, so deriving the position
        // afterwards yields -1 at end of input and is off by one for
        // two-character tokens.
        $position = key($chars);

        if (next($chars) === $expected) {
            next($chars);
            return [
                'type'  => $type,
                'pos'   => $position,
                'value' => $current . $expected
            ];
        }

        return [
            'type'  => $orElse,
            'pos'   => $position,
            'value' => $current
        ];
    }

    /**
     * Returns a token that is the result of consuming inside of delimiter
     * characters. The array pointer must be on the opening delimiter; on
     * success it is left just past the closing delimiter.
     *
     * A backslash is copied into the value together with the character that
     * follows it, so an escaped delimiter does not end the token. Escapes are
     * not removed here; callers unescape the value as needed. If the token is
     * not closed, an "unknown" token holding the text read so far is returned.
     *
     * @param array  $chars Array of characters by reference.
     * @param string $delim The delimiter character.
     * @param string $type  Token type.
     *
     * @return array Returns the consumed token.
     */
    private function inside(array &$chars, $delim, $type)
    {
        $position = key($chars);
        $current = next($chars);
        $buffer = '';

        while ($current !== $delim) {
            if ($current === '\\') {
                // Keep the backslash and fall through to append whatever it
                // escapes, even if that is the delimiter itself.
                $buffer .= '\\';
                $current = next($chars);
            }
            if ($current === false) {
                // Unclosed delimiter
                return [
                    'type'  => self::T_UNKNOWN,
                    'value' => $buffer,
                    'pos'   => $position
                ];
            }
            $buffer .= $current;
            $current = next($chars);
        }

        // Step over the closing delimiter.
        next($chars);

        return ['type' => $type, 'value' => $buffer, 'pos' => $position];
    }

    /**
     * Parses a JSON token or sets the token type to "unknown" on error.
     *
     * @param array $token Token that needs parsing.
     *
     * @return array Returns a token with a parsed value, or the token with
     *               its original value and type "unknown" if decoding fails.
     */
    private function parseJson(array $token)
    {
        $value = json_decode($token['value'], true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            // Legacy support for elided quotes. Try to parse again by adding
            // quotes around the bad input value.
            $value = json_decode('"' . $token['value'] . '"', true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $token['type'] = self::T_UNKNOWN;
                return $token;
            }
        }

        $token['value'] = $value;
        return $token;
    }
}
445