<?php
namespace JmesPath;

/**
 * Tokenizes JMESPath expressions.
 *
 * The lexer walks the expression one byte at a time (the input is split with
 * str_split) and uses the first character of each token to decide how the
 * rest of the token is consumed. Every token is an associative array with
 * three entries: 'type' (one of the T_* constants), 'value' and 'pos' (the
 * zero-based byte offset at which the token starts).
 */
class Lexer
{
    const T_DOT               = 'dot';
    const T_STAR              = 'star';
    const T_COMMA             = 'comma';
    const T_COLON             = 'colon';
    const T_CURRENT           = 'current';
    const T_EXPREF            = 'expref';
    const T_LPAREN            = 'lparen';
    const T_RPAREN            = 'rparen';
    const T_LBRACE            = 'lbrace';
    const T_RBRACE            = 'rbrace';
    const T_LBRACKET          = 'lbracket';
    const T_RBRACKET          = 'rbracket';
    const T_FLATTEN           = 'flatten';
    const T_IDENTIFIER        = 'identifier';
    const T_NUMBER            = 'number';
    const T_QUOTED_IDENTIFIER = 'quoted_identifier';
    const T_UNKNOWN           = 'unknown';
    const T_PIPE              = 'pipe';
    const T_OR                = 'or';
    const T_AND               = 'and';
    const T_NOT               = 'not';
    const T_FILTER            = 'filter';
    const T_LITERAL           = 'literal';
    const T_EOF               = 'eof';
    const T_COMPARATOR        = 'comparator';

    const STATE_IDENTIFIER     = 0;
    const STATE_NUMBER         = 1;
    const STATE_SINGLE_CHAR    = 2;
    const STATE_WHITESPACE     = 3;
    const STATE_STRING_LITERAL = 4;
    const STATE_QUOTED_STRING  = 5;
    const STATE_JSON_LITERAL   = 6;
    const STATE_LBRACKET       = 7;
    const STATE_PIPE           = 8;
    const STATE_LT             = 9;
    const STATE_GT             = 10;
    const STATE_EQ             = 11;
    const STATE_NOT            = 12;
    const STATE_AND            = 13;

    /** @var array Maps the first character of a token to the STATE_* that consumes it */
    private static $transitionTable = [
        // Operators that may span one or two characters
        '<' => self::STATE_LT,
        '>' => self::STATE_GT,
        '=' => self::STATE_EQ,
        '!' => self::STATE_NOT,
        '[' => self::STATE_LBRACKET,
        '|' => self::STATE_PIPE,
        '&' => self::STATE_AND,
        // Delimited tokens
        '`' => self::STATE_JSON_LITERAL,
        '"' => self::STATE_QUOTED_STRING,
        "'" => self::STATE_STRING_LITERAL,
        // Numbers (optionally signed)
        '-' => self::STATE_NUMBER,
        '0' => self::STATE_NUMBER, '1' => self::STATE_NUMBER,
        '2' => self::STATE_NUMBER, '3' => self::STATE_NUMBER,
        '4' => self::STATE_NUMBER, '5' => self::STATE_NUMBER,
        '6' => self::STATE_NUMBER, '7' => self::STATE_NUMBER,
        '8' => self::STATE_NUMBER, '9' => self::STATE_NUMBER,
        // Whitespace
        ' '  => self::STATE_WHITESPACE,
        "\t" => self::STATE_WHITESPACE,
        "\n" => self::STATE_WHITESPACE,
        "\r" => self::STATE_WHITESPACE,
        // Tokens that are always exactly one character
        '.' => self::STATE_SINGLE_CHAR,
        '*' => self::STATE_SINGLE_CHAR,
        ']' => self::STATE_SINGLE_CHAR,
        ',' => self::STATE_SINGLE_CHAR,
        ':' => self::STATE_SINGLE_CHAR,
        '@' => self::STATE_SINGLE_CHAR,
        '(' => self::STATE_SINGLE_CHAR,
        ')' => self::STATE_SINGLE_CHAR,
        '{' => self::STATE_SINGLE_CHAR,
        '}' => self::STATE_SINGLE_CHAR,
        // Identifier start characters
        '_' => self::STATE_IDENTIFIER,
        'A' => self::STATE_IDENTIFIER, 'B' => self::STATE_IDENTIFIER,
        'C' => self::STATE_IDENTIFIER, 'D' => self::STATE_IDENTIFIER,
        'E' => self::STATE_IDENTIFIER, 'F' => self::STATE_IDENTIFIER,
        'G' => self::STATE_IDENTIFIER, 'H' => self::STATE_IDENTIFIER,
        'I' => self::STATE_IDENTIFIER, 'J' => self::STATE_IDENTIFIER,
        'K' => self::STATE_IDENTIFIER, 'L' => self::STATE_IDENTIFIER,
        'M' => self::STATE_IDENTIFIER, 'N' => self::STATE_IDENTIFIER,
        'O' => self::STATE_IDENTIFIER, 'P' => self::STATE_IDENTIFIER,
        'Q' => self::STATE_IDENTIFIER, 'R' => self::STATE_IDENTIFIER,
        'S' => self::STATE_IDENTIFIER, 'T' => self::STATE_IDENTIFIER,
        'U' => self::STATE_IDENTIFIER, 'V' => self::STATE_IDENTIFIER,
        'W' => self::STATE_IDENTIFIER, 'X' => self::STATE_IDENTIFIER,
        'Y' => self::STATE_IDENTIFIER, 'Z' => self::STATE_IDENTIFIER,
        'a' => self::STATE_IDENTIFIER, 'b' => self::STATE_IDENTIFIER,
        'c' => self::STATE_IDENTIFIER, 'd' => self::STATE_IDENTIFIER,
        'e' => self::STATE_IDENTIFIER, 'f' => self::STATE_IDENTIFIER,
        'g' => self::STATE_IDENTIFIER, 'h' => self::STATE_IDENTIFIER,
        'i' => self::STATE_IDENTIFIER, 'j' => self::STATE_IDENTIFIER,
        'k' => self::STATE_IDENTIFIER, 'l' => self::STATE_IDENTIFIER,
        'm' => self::STATE_IDENTIFIER, 'n' => self::STATE_IDENTIFIER,
        'o' => self::STATE_IDENTIFIER, 'p' => self::STATE_IDENTIFIER,
        'q' => self::STATE_IDENTIFIER, 'r' => self::STATE_IDENTIFIER,
        's' => self::STATE_IDENTIFIER, 't' => self::STATE_IDENTIFIER,
        'u' => self::STATE_IDENTIFIER, 'v' => self::STATE_IDENTIFIER,
        'w' => self::STATE_IDENTIFIER, 'x' => self::STATE_IDENTIFIER,
        'y' => self::STATE_IDENTIFIER, 'z' => self::STATE_IDENTIFIER,
    ];

    /** @var array Valid identifier characters after first character */
    private $validIdentifier = [
        'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
        'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
        'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
        'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
        'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
        'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
        'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
        'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
        'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
        't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
        'y' => true, 'z' => true, '_' => true, '0' => true, '1' => true,
        '2' => true, '3' => true, '4' => true, '5' => true, '6' => true,
        '7' => true, '8' => true, '9' => true,
    ];

    /** @var array Valid number characters after the first character */
    private $numbers = [
        '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
        '5' => true, '6' => true, '7' => true, '8' => true, '9' => true
    ];

    /** @var array Map of simple single character tokens */
    private $simpleTokens = [
        '.' => self::T_DOT,
        '*' => self::T_STAR,
        ']' => self::T_RBRACKET,
        ',' => self::T_COMMA,
        ':' => self::T_COLON,
        '@' => self::T_CURRENT,
        '(' => self::T_LPAREN,
        ')' => self::T_RPAREN,
        '{' => self::T_LBRACE,
        '}' => self::T_RBRACE,
    ];

    /**
     * Tokenize the JMESPath expression into an array of token hashes that
     * each contain a 'type', 'value', and 'pos'.
     *
     * The lexer itself never throws: characters that cannot start a token,
     * unterminated delimited tokens, undecodable JSON, a lone "=" and a lone
     * "-" are all reported as tokens of type T_UNKNOWN. The returned list
     * always ends with a single T_EOF token.
     *
     * @param string $input JMESPath input
     *
     * @return array
     */
    public function tokenize($input)
    {
        $tokens = [];

        // str_split('') returns [''] on PHP < 8.2, which would be reported
        // as an unknown token, so the empty expression is special-cased.
        $chars = $input === '' ? [] : str_split($input);

        // The array's internal pointer is the lexer cursor: current() peeks,
        // next() advances, and key() yields the byte offset.
        while (false !== ($current = current($chars))) {

            // Every character must be in the transition character table.
            if (!isset(self::$transitionTable[$current])) {
                $tokens[] = [
                    'type'  => self::T_UNKNOWN,
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
                continue;
            }

            switch (self::$transitionTable[$current]) {
                case self::STATE_SINGLE_CHAR:
                    // Consume simple tokens like ".", ",", "@", etc.
                    $tokens[] = [
                        'type'  => $this->simpleTokens[$current],
                        'pos'   => key($chars),
                        'value' => $current
                    ];
                    next($chars);
                    break;

                case self::STATE_IDENTIFIER:
                    // Consume identifiers
                    $start = key($chars);
                    $buffer = '';
                    do {
                        $buffer .= $current;
                        $current = next($chars);
                    } while ($current !== false && isset($this->validIdentifier[$current]));
                    $tokens[] = [
                        'type'  => self::T_IDENTIFIER,
                        'value' => $buffer,
                        'pos'   => $start
                    ];
                    break;

                case self::STATE_WHITESPACE:
                    // Skip whitespace
                    next($chars);
                    break;

                case self::STATE_LBRACKET:
                    // Consume "[", "[?", and "[]"
                    $position = key($chars);
                    $actual = next($chars);
                    if ($actual === ']') {
                        next($chars);
                        $tokens[] = [
                            'type'  => self::T_FLATTEN,
                            'pos'   => $position,
                            'value' => '[]'
                        ];
                    } elseif ($actual === '?') {
                        next($chars);
                        $tokens[] = [
                            'type'  => self::T_FILTER,
                            'pos'   => $position,
                            'value' => '[?'
                        ];
                    } else {
                        // The character after "[" is left for the next iteration.
                        $tokens[] = [
                            'type'  => self::T_LBRACKET,
                            'pos'   => $position,
                            'value' => '['
                        ];
                    }
                    break;

                case self::STATE_STRING_LITERAL:
                    // Consume raw string literals; only \' is unescaped.
                    $literal = $this->inside($chars, "'", self::T_LITERAL);
                    $literal['value'] = str_replace("\\'", "'", $literal['value']);
                    $tokens[] = $literal;
                    break;

                case self::STATE_PIPE:
                    // Consume pipe and OR
                    $tokens[] = $this->matchOr($chars, '|', '|', self::T_OR, self::T_PIPE);
                    break;

                case self::STATE_JSON_LITERAL:
                    // Consume JSON literals; unterminated ones stay "unknown".
                    $token = $this->inside($chars, '`', self::T_LITERAL);
                    if ($token['type'] === self::T_LITERAL) {
                        $token['value'] = str_replace('\\`', '`', $token['value']);
                        $token = $this->parseJson($token);
                    }
                    $tokens[] = $token;
                    break;

                case self::STATE_NUMBER:
                    // Consume numbers: an optional leading "-" then digits.
                    $start = key($chars);
                    $buffer = '';
                    do {
                        $buffer .= $current;
                        $current = next($chars);
                    } while ($current !== false && isset($this->numbers[$current]));
                    if ($buffer === '-') {
                        // A sign with no digits is not a number; casting it
                        // would silently produce 0.
                        $tokens[] = [
                            'type'  => self::T_UNKNOWN,
                            'value' => $buffer,
                            'pos'   => $start
                        ];
                    } else {
                        $tokens[] = [
                            'type'  => self::T_NUMBER,
                            'value' => (int) $buffer,
                            'pos'   => $start
                        ];
                    }
                    break;

                case self::STATE_QUOTED_STRING:
                    // Consume quoted identifiers. The quotes are restored so
                    // the value can be decoded as a JSON string.
                    $token = $this->inside($chars, '"', self::T_QUOTED_IDENTIFIER);
                    if ($token['type'] === self::T_QUOTED_IDENTIFIER) {
                        $token['value'] = '"' . $token['value'] . '"';
                        $token = $this->parseJson($token);
                    }
                    $tokens[] = $token;
                    break;

                case self::STATE_EQ:
                    // Consume equals; a single "=" is not a valid token.
                    $tokens[] = $this->matchOr($chars, '=', '=', self::T_COMPARATOR, self::T_UNKNOWN);
                    break;

                case self::STATE_AND:
                    // Consume "&&" (and) or "&" (expression reference)
                    $tokens[] = $this->matchOr($chars, '&', '&', self::T_AND, self::T_EXPREF);
                    break;

                case self::STATE_NOT:
                    // Consume not equal ("!=") or not ("!")
                    $tokens[] = $this->matchOr($chars, '!', '=', self::T_COMPARATOR, self::T_NOT);
                    break;

                default:
                    // either '<' or '>'
                    // Consume less than and greater than, with optional "="
                    $tokens[] = $this->matchOr($chars, $current, '=', self::T_COMPARATOR, self::T_COMPARATOR);
                    break;
            }
        }

        // NOTE(review): this is a code-point count, whereas every other 'pos'
        // is a byte offset from str_split(); they differ for multibyte input.
        $tokens[] = [
            'type'  => self::T_EOF,
            'pos'   => mb_strlen($input, 'UTF-8'),
            'value' => null
        ];

        return $tokens;
    }

    /**
     * Returns a token based on whether or not the next character matches the
     * expected value. If it does, a token of "$type" is returned and both
     * characters are consumed. Otherwise, a token of "$orElse" type is
     * returned and only the current character is consumed.
     *
     * The token's 'pos' is always the offset of "$current", i.e. the first
     * character of the operator.
     *
     * @param array  $chars    Array of characters by reference.
     * @param string $current  The current character.
     * @param string $expected Expected character.
     * @param string $type     Expected result type.
     * @param string $orElse   Otherwise return a token of this type.
     *
     * @return array Returns a conditional token.
     */
    private function matchOr(array &$chars, $current, $expected, $type, $orElse)
    {
        // Capture the offset before moving the cursor: once the cursor runs
        // past the last character key() returns null, so deriving the
        // position afterwards is unreliable.
        $position = key($chars);

        if (next($chars) === $expected) {
            next($chars);
            return [
                'type'  => $type,
                'pos'   => $position,
                'value' => $current . $expected
            ];
        }

        return [
            'type'  => $orElse,
            'pos'   => $position,
            'value' => $current
        ];
    }

    /**
     * Returns a token that is the result of consuming the characters found
     * between a pair of delimiter characters.
     *
     * A backslash and the character following it are both copied into the
     * value verbatim, so an escaped delimiter does not end the token; the
     * caller is responsible for unescaping. If the input ends before the
     * closing delimiter, a token of type "unknown" holding the characters
     * read so far is returned.
     *
     * @param array  $chars Array of characters by reference.
     * @param string $delim The delimiter character.
     * @param string $type  Token type.
     *
     * @return array Returns the consumed token.
     */
    private function inside(array &$chars, $delim, $type)
    {
        $position = key($chars);
        $buffer = '';

        // Start on the character after the opening delimiter.
        for ($current = next($chars); $current !== $delim; $current = next($chars)) {
            if ($current === '\\') {
                // Keep the backslash and step onto the escaped character.
                $buffer .= '\\';
                $current = next($chars);
            }
            if ($current === false) {
                // Unclosed delimiter
                return [
                    'type'  => self::T_UNKNOWN,
                    'value' => $buffer,
                    'pos'   => $position
                ];
            }
            $buffer .= $current;
        }

        // Step past the closing delimiter.
        next($chars);

        return ['type' => $type, 'value' => $buffer, 'pos' => $position];
    }

    /**
     * Parses a JSON token or sets the token type to "unknown" on error.
     *
     * @param array $token Token that needs parsing.
     *
     * @return array Returns a token with a parsed value.
     */
    private function parseJson(array $token)
    {
        $value = json_decode($token['value'], true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            // Legacy support for elided quotes. Try to parse again by adding
            // quotes around the bad input value.
            $value = json_decode('"' . $token['value'] . '"', true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $token['type'] = self::T_UNKNOWN;
                return $token;
            }
        }

        $token['value'] = $value;
        return $token;
    }
}