%{
/*-------------------------------------------------------------------------
 *
 * jsonpath_scan.l
 *	Lexical parser for jsonpath datatype
 *
 * Splits jsonpath string into tokens represented as JsonPathString structs.
 * Decodes unicode and hex escaped strings.
 *
 * Copyright (c) 2019-2020, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	src/backend/utils/adt/jsonpath_scan.l
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "mb/pg_wchar.h"
#include "nodes/pg_list.h"

/*
 * Text of the token currently being assembled.  The rules below append to
 * it with addstring()/addchar() and copy it into yylval->str when a token
 * carrying a string value is returned.
 *
 * NOTE(review): JsonPathString and the *_P token codes are not declared in
 * this file; presumably it is compiled as part of the grammar's translation
 * unit -- confirm against the build rules.
 */
static JsonPathString scanstring;

/* Handles to the buffer that the lexer uses internally */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;
static int	scanbuflen;

static void addstring(bool init, char *s, int l);
static void addchar(bool init, char s);
static enum yytokentype checkKeyword(void);
static void parseUnicode(char *s, int l);
static void parseHexChar(char *s);

/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
#undef fprintf
#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)

/*
 * Replacement target for the fprintf() macro above: ignores the format
 * string and raises an ERROR whose text is "msg".
 */
static void
fprintf_to_ereport(const char *fmt, const char *msg)
{
	ereport(ERROR, (errmsg_internal("%s", msg)));
}

/* LCOV_EXCL_START */

%}

%option 8bit
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
%option warn
%option prefix="jsonpath_yy"
%option bison-bridge
%option noyyalloc
%option noyyrealloc
%option noyyfree

/*
 * We use exclusive states for quoted and non-quoted strings,
 * quoted variable names and C-style comments.
 * Exclusive states:
 *  <xq> - quoted strings
 *  <xnq> - non-quoted strings
 *  <xvq> - quoted variable names
 *  <xc> - C-style comment
 */

%x xq
%x xnq
%x xvq
%x xc

/* Single characters with syntactic meaning; each is returned as itself */
special		[\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
blank		[ \t\n\r\f]
/* "other" means anything that's not special, blank, or '\' or '"' */
other		[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]

/*
 * Numeric literals.  The "fail" patterns match prefixes of a longer numeric
 * form that turned out to be incomplete, so that the rules can either back
 * up (decimalfail) or report an error (realfail1, realfail2).
 */
digit		[0-9]
integer		(0|[1-9]{digit}*)
decimal		{integer}\.{digit}+
decimalfail	{integer}\.
real		({integer}|{decimal})[Ee][-+]?{digit}+
realfail1	({integer}|{decimal})[Ee]
realfail2	({integer}|{decimal})[Ee][-+]

/*
 * Escape sequences: \uXXXX, \u{X...} (1 to 6 hex digits) and \xXX, plus
 * patterns matching their truncated forms so that an error can be raised.
 */
hex_dig		[0-9A-Fa-f]
unicode		\\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
unicodefail	\\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
hex_char	\\x{hex_dig}{2}
hex_fail	\\x{hex_dig}{0,1}

%%

<xnq>{other}+					{
									/* more ordinary characters of an unquoted string */
									addstring(false, yytext, yyleng);
								}

<xnq>{blank}+					{
									/* whitespace ends the unquoted string */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq>\/\*						{
									/*
									 * A comment starts right after an unquoted
									 * string.  NOTE(review): no token is returned
									 * on this path; verify what happens to the
									 * pending string.
									 */
									yylval->str = scanstring;
									BEGIN xc;
								}

<xnq>({special}|\")				{
									/*
									 * A special character or quote ends the
									 * unquoted string; push it back so that it is
									 * rescanned in the INITIAL state.
									 */
									yylval->str = scanstring;
									yyless(0);
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq><<EOF>>					{
									/* end of input ends the unquoted string */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq,xq,xvq>\\b				{ addchar(false, '\b'); }

<xnq,xq,xvq>\\f				{ addchar(false, '\f'); }

<xnq,xq,xvq>\\n				{ addchar(false, '\n'); }

<xnq,xq,xvq>\\r				{ addchar(false, '\r'); }

<xnq,xq,xvq>\\t				{ addchar(false, '\t'); }

<xnq,xq,xvq>\\v				{ addchar(false, '\v'); }

<xnq,xq,xvq>{unicode}+		{
								/* a run of adjacent escapes is decoded together */
								parseUnicode(yytext, yyleng);
							}

<xnq,xq,xvq>{hex_char}		{ parseHexChar(yytext); }

<xnq,xq,xvq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }

<xnq,xq,xvq>{hex_fail}		{ yyerror(NULL, "invalid hex character sequence"); }

<xnq,xq,xvq>{unicode}+\\	{
								/* throw back the \\, and treat as unicode */
								yyless(yyleng - 1);
								parseUnicode(yytext, yyleng);
							}

<xnq,xq,xvq>\\.				{
								/* any other escaped character stands for itself */
								addchar(false, yytext[1]);
							}

<xnq,xq,xvq>\\				{ yyerror(NULL, "unexpected end after backslash"); }

<xq,xvq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }

<xq>\"							{
									/* closing quote of a quoted string */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return STRING_P;
								}

<xvq>\"							{
									/* closing quote of a quoted variable name */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return VARIABLE_P;
								}

<xq,xvq>[^\\\"]+				{ addstring(false, yytext, yyleng); }

<xc>\*\/						{ BEGIN INITIAL; }

<xc>[^\*]+						{ /* skip comment text */ }

<xc>\*							{ /* a lone '*' inside a comment */ }

<xc><<EOF>>						{ yyerror(NULL, "unexpected end of comment"); }

\&\&							{ return AND_P; }

\|\|							{ return OR_P; }

\!								{ return NOT_P; }

\*\*							{ return ANY_P; }

\<								{ return LESS_P; }

\<\=							{ return LESSEQUAL_P; }

\=\=							{ return EQUAL_P; }

\<\>							{ return NOTEQUAL_P; }

\!\=							{ return NOTEQUAL_P; }

\>\=							{ return GREATEREQUAL_P; }

\>								{ return GREATER_P; }

\${other}+						{
									/* unquoted variable: drop the leading '$' */
									addstring(true, yytext + 1, yyleng - 1);
									addchar(false, '\0');
									yylval->str = scanstring;
									return VARIABLE_P;
								}

\$\"							{
									/* start of a quoted variable name */
									addchar(true, '\0');
									BEGIN xvq;
								}

{special}						{ return *yytext; }

{blank}+						{ /* ignore */ }

\/\*							{
									addchar(true, '\0');
									BEGIN xc;
								}

{real}							{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return NUMERIC_P;
								}

{decimal}						{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return NUMERIC_P;
								}

{integer}						{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return INT_P;
								}

{decimalfail}					{
									/* throw back the ., and treat as integer */
									yyless(yyleng - 1);
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return INT_P;
								}

({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }

\"								{
									/* start of a quoted string */
									addchar(true, '\0');
									BEGIN xq;
								}

\\								{
									/*
									 * An unquoted string that starts with an
									 * escape: push the backslash back so that the
									 * escape rules of <xnq> process it.
									 */
									yyless(0);
									addchar(true, '\0');
									BEGIN xnq;
								}

{other}+						{
									/* start of an unquoted string */
									addstring(true, yytext, yyleng);
									BEGIN xnq;
								}

<<EOF>>							{ yyterminate(); }

%%

/* LCOV_EXCL_STOP */

/*
 * Report a scanner or parser error via ereport(ERROR) with
 * ERRCODE_SYNTAX_ERROR.
 *
 * "result" is not used.  "message" is the untranslated error text; it is
 * passed through _() before being inserted into the final message.  If the
 * current token text starts with YY_END_OF_BUFFER_CHAR the error is reported
 * as being at the end of the input, otherwise the token text is quoted.
 */
void
jsonpath_yyerror(JsonPathParseResult **result, const char *message)
{
	if (*yytext == YY_END_OF_BUFFER_CHAR)
	{
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 /* translator: %s is typically "syntax error" */
				 errmsg("%s at end of jsonpath input", _(message))));
	}
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 /* translator: first %s is typically "syntax error" */
				 errmsg("%s at or near \"%s\" of jsonpath input",
						_(message), yytext)));
	}
}

/* One entry of the keyword lookup table used by checkKeyword() */
typedef struct JsonPathKeyword
{
	int16		len;			/* length of "keyword" in bytes */
	bool		lowercase;		/* if true, only an exact-case match counts */
	int			val;			/* token code returned for this keyword */
	const char *keyword;		/* keyword text */
} JsonPathKeyword;

/*
 * Array of key words should be sorted by length and then
 * alphabetical order
 *
 * checkKeyword() binary-searches this array using exactly that ordering, so
 * new entries must be inserted at the matching position.
 */
static const JsonPathKeyword keywords[] = {
	{ 2, false,	IS_P,		"is"},
	{ 2, false,	TO_P,		"to"},
	{ 3, false,	ABS_P,		"abs"},
	{ 3, false,	LAX_P,		"lax"},
	{ 4, false,	FLAG_P,		"flag"},
	{ 4, false,	LAST_P,		"last"},
	{ 4, true,	NULL_P,		"null"},
	{ 4, false,	SIZE_P,		"size"},
	{ 4, true,	TRUE_P,		"true"},
	{ 4, false,	TYPE_P,		"type"},
	{ 4, false,	WITH_P,		"with"},
	{ 5, true,	FALSE_P,	"false"},
	{ 5, false,	FLOOR_P,	"floor"},
	{ 6, false,	DOUBLE_P,	"double"},
	{ 6, false,	EXISTS_P,	"exists"},
	{ 6, false,	STARTS_P,	"starts"},
	{ 6, false,	STRICT_P,	"strict"},
	{ 7, false,	CEILING_P,	"ceiling"},
	{ 7, false,	UNKNOWN_P,	"unknown"},
	{ 8, false,	DATETIME_P,	"datetime"},
	{ 8, false,	KEYVALUE_P,	"keyvalue"},
	{ 10,false, LIKE_REGEX_P, "like_regex"},
};

335 /* Check if current scanstring value is a keyword */ 336 static enum yytokentype 337 checkKeyword() 338 { 339 int res = IDENT_P; 340 int diff; 341 const JsonPathKeyword *StopLow = keywords, 342 *StopHigh = keywords + lengthof(keywords), 343 *StopMiddle; 344 345 if (scanstring.len > keywords[lengthof(keywords) - 1].len) 346 return res; 347 348 while (StopLow < StopHigh) 349 { 350 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); 351 352 if (StopMiddle->len == scanstring.len) 353 diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val, 354 scanstring.len); 355 else 356 diff = StopMiddle->len - scanstring.len; 357 358 if (diff < 0) 359 StopLow = StopMiddle + 1; 360 else if (diff > 0) 361 StopHigh = StopMiddle; 362 else 363 { 364 if (StopMiddle->lowercase) 365 diff = strncmp(StopMiddle->keyword, scanstring.val, 366 scanstring.len); 367 368 if (diff == 0) 369 res = StopMiddle->val; 370 371 break; 372 } 373 } 374 375 return res; 376 } 377 378 /* 379 * Called before any actual parsing is done 380 */ 381 static void 382 jsonpath_scanner_init(const char *str, int slen) 383 { 384 if (slen <= 0) 385 slen = strlen(str); 386 387 /* 388 * Might be left over after ereport() 389 */ 390 yy_init_globals(); 391 392 /* 393 * Make a scan buffer with special termination needed by flex. 394 */ 395 396 scanbuflen = slen; 397 scanbuf = palloc(slen + 2); 398 memcpy(scanbuf, str, slen); 399 scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; 400 scanbufhandle = yy_scan_buffer(scanbuf, slen + 2); 401 402 BEGIN(INITIAL); 403 } 404 405 406 /* 407 * Called after parsing is done to clean up after jsonpath_scanner_init() 408 */ 409 static void 410 jsonpath_scanner_finish(void) 411 { 412 yy_delete_buffer(scanbufhandle); 413 pfree(scanbuf); 414 } 415 416 /* 417 * Resize scanstring so that it can append string of given length. 418 * Reinitialize if required. 
419 */ 420 static void 421 resizeString(bool init, int appendLen) 422 { 423 if (init) 424 { 425 scanstring.total = Max(32, appendLen); 426 scanstring.val = (char *) palloc(scanstring.total); 427 scanstring.len = 0; 428 } 429 else 430 { 431 if (scanstring.len + appendLen >= scanstring.total) 432 { 433 while (scanstring.len + appendLen >= scanstring.total) 434 scanstring.total *= 2; 435 scanstring.val = repalloc(scanstring.val, scanstring.total); 436 } 437 } 438 } 439 440 /* Add set of bytes at "s" of length "l" to scanstring */ 441 static void 442 addstring(bool init, char *s, int l) 443 { 444 resizeString(init, l + 1); 445 memcpy(scanstring.val + scanstring.len, s, l); 446 scanstring.len += l; 447 } 448 449 /* Add single byte "c" to scanstring */ 450 static void 451 addchar(bool init, char c) 452 { 453 resizeString(init, 1); 454 scanstring.val[scanstring.len] = c; 455 if (c != '\0') 456 scanstring.len++; 457 } 458 459 /* Interface to jsonpath parser */ 460 JsonPathParseResult * 461 parsejsonpath(const char *str, int len) 462 { 463 JsonPathParseResult *parseresult; 464 465 jsonpath_scanner_init(str, len); 466 467 if (jsonpath_yyparse((void *) &parseresult) != 0) 468 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */ 469 470 jsonpath_scanner_finish(); 471 472 return parseresult; 473 } 474 475 /* Turn hex character into integer */ 476 static int 477 hexval(char c) 478 { 479 if (c >= '0' && c <= '9') 480 return c - '0'; 481 if (c >= 'a' && c <= 'f') 482 return c - 'a' + 0xA; 483 if (c >= 'A' && c <= 'F') 484 return c - 'A' + 0xA; 485 jsonpath_yyerror(NULL, "invalid hexadecimal digit"); 486 return 0; /* not reached */ 487 } 488 489 /* Add given unicode character to scanstring */ 490 static void 491 addUnicodeChar(int ch) 492 { 493 if (ch == 0) 494 { 495 /* We can't allow this, since our TEXT type doesn't */ 496 ereport(ERROR, 497 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), 498 errmsg("unsupported Unicode escape sequence"), 499 errdetail("\\u0000 cannot be 
converted to text."))); 500 } 501 else 502 { 503 char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; 504 505 pg_unicode_to_server(ch, (unsigned char *) cbuf); 506 addstring(false, cbuf, strlen(cbuf)); 507 } 508 } 509 510 /* Add unicode character, processing any surrogate pairs */ 511 static void 512 addUnicode(int ch, int *hi_surrogate) 513 { 514 if (is_utf16_surrogate_first(ch)) 515 { 516 if (*hi_surrogate != -1) 517 ereport(ERROR, 518 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), 519 errmsg("invalid input syntax for type %s", "jsonpath"), 520 errdetail("Unicode high surrogate must not follow " 521 "a high surrogate."))); 522 *hi_surrogate = ch; 523 return; 524 } 525 else if (is_utf16_surrogate_second(ch)) 526 { 527 if (*hi_surrogate == -1) 528 ereport(ERROR, 529 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), 530 errmsg("invalid input syntax for type %s", "jsonpath"), 531 errdetail("Unicode low surrogate must follow a high " 532 "surrogate."))); 533 ch = surrogate_pair_to_codepoint(*hi_surrogate, ch); 534 *hi_surrogate = -1; 535 } 536 else if (*hi_surrogate != -1) 537 { 538 ereport(ERROR, 539 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), 540 errmsg("invalid input syntax for type %s", "jsonpath"), 541 errdetail("Unicode low surrogate must follow a high " 542 "surrogate."))); 543 } 544 545 addUnicodeChar(ch); 546 } 547 548 /* 549 * parseUnicode was adopted from json_lex_string() in 550 * src/backend/utils/adt/json.c 551 */ 552 static void 553 parseUnicode(char *s, int l) 554 { 555 int i = 2; 556 int hi_surrogate = -1; 557 558 for (i = 2; i < l; i += 2) /* skip '\u' */ 559 { 560 int ch = 0; 561 int j; 562 563 if (s[i] == '{') /* parse '\u{XX...}' */ 564 { 565 while (s[++i] != '}' && i < l) 566 ch = (ch << 4) | hexval(s[i]); 567 i++; /* skip '}' */ 568 } 569 else /* parse '\uXXXX' */ 570 { 571 for (j = 0; j < 4 && i < l; j++) 572 ch = (ch << 4) | hexval(s[i++]); 573 } 574 575 addUnicode(ch, &hi_surrogate); 576 } 577 578 if (hi_surrogate != -1) 579 { 580 ereport(ERROR, 581 
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), 582 errmsg("invalid input syntax for type %s", "jsonpath"), 583 errdetail("Unicode low surrogate must follow a high " 584 "surrogate."))); 585 } 586 } 587 588 /* Parse sequence of hex-encoded characters */ 589 static void 590 parseHexChar(char *s) 591 { 592 int ch = (hexval(s[2]) << 4) | 593 hexval(s[3]); 594 595 addUnicodeChar(ch); 596 } 597 598 /* 599 * Interface functions to make flex use palloc() instead of malloc(). 600 * It'd be better to make these static, but flex insists otherwise. 601 */ 602 603 void * 604 jsonpath_yyalloc(yy_size_t bytes) 605 { 606 return palloc(bytes); 607 } 608 609 void * 610 jsonpath_yyrealloc(void *ptr, yy_size_t bytes) 611 { 612 if (ptr) 613 return repalloc(ptr, bytes); 614 else 615 return palloc(bytes); 616 } 617 618 void 619 jsonpath_yyfree(void *ptr) 620 { 621 if (ptr) 622 pfree(ptr); 623 } 624