1 %{
2 /*-------------------------------------------------------------------------
3 *
4 * jsonpath_scan.l
5 * Lexical parser for jsonpath datatype
6 *
7 * Splits jsonpath string into tokens represented as JsonPathString structs.
8 * Decodes unicode and hex escaped strings.
9 *
10 * Copyright (c) 2019, PostgreSQL Global Development Group
11 *
12 * IDENTIFICATION
13 * src/backend/utils/adt/jsonpath_scan.l
14 *
15 *-------------------------------------------------------------------------
16 */
17
18 #include "postgres.h"
19
20 #include "mb/pg_wchar.h"
21 #include "nodes/pg_list.h"
22
23 static JsonPathString scanstring;
24
25 /* Handles to the buffer that the lexer uses internally */
26 static YY_BUFFER_STATE scanbufhandle;
27 static char *scanbuf;
28 static int scanbuflen;
29
30 static void addstring(bool init, char *s, int l);
31 static void addchar(bool init, char s);
32 static enum yytokentype checkKeyword(void);
33 static void parseUnicode(char *s, int l);
34 static void parseHexChar(char *s);
35
36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
37 #undef fprintf
38 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
39
40 static void
fprintf_to_ereport(const char * fmt,const char * msg)41 fprintf_to_ereport(const char *fmt, const char *msg)
42 {
43 ereport(ERROR, (errmsg_internal("%s", msg)));
44 }
45
46 /* LCOV_EXCL_START */
47
48 %}
49
50 %option 8bit
51 %option never-interactive
52 %option nodefault
53 %option noinput
54 %option nounput
55 %option noyywrap
56 %option warn
57 %option prefix="jsonpath_yy"
58 %option bison-bridge
59 %option noyyalloc
60 %option noyyrealloc
61 %option noyyfree
62
63 /*
64 * We use exclusive states for quoted and non-quoted strings,
65 * quoted variable names and C-style comments.
66 * Exclusive states:
67 * <xq> - quoted strings
68 * <xnq> - non-quoted strings
69 * <xvq> - quoted variable names
70 * <xc> - C-style comment
71 */
72
73 %x xq
74 %x xnq
75 %x xvq
76 %x xc
77
/* Punctuation characters; any of these ends an unquoted string */
special         [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
/* Whitespace characters */
blank           [ \t\n\r\f]
/* "other" means anything that's not special, blank, or '\' or '"' */
other           [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]

/*
 * Numeric literals.  An integer has no leading zeros.  The "fail" patterns
 * match literals that stop right after the decimal point, the exponent
 * marker, or the exponent sign.
 */
digit           [0-9]
integer         (0|[1-9]{digit}*)
decimal         {integer}\.{digit}+
decimalfail     {integer}\.
real            ({integer}|{decimal})[Ee][-+]?{digit}+
realfail1       ({integer}|{decimal})[Ee]
realfail2       ({integer}|{decimal})[Ee][-+]

/*
 * Escape sequences.  "unicode" is \u followed by exactly four hex digits or
 * by one to six hex digits in braces; "hex_char" is \x followed by exactly
 * two hex digits.  The "fail" patterns match the incomplete forms.
 */
hex_dig         [0-9A-Fa-f]
unicode         \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
unicodefail     \\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
hex_char        \\x{hex_dig}{2}
hex_fail        \\x{hex_dig}{0,1}
96
97 %%
98
<xnq>{other}+                   {
                                    /* more ordinary characters of the unquoted string */
                                    addstring(false, yytext, yyleng);
                                }

<xnq>{blank}+                   {
                                    /* whitespace ends the unquoted string */
                                    yylval->str = scanstring;
                                    BEGIN INITIAL;
                                    return checkKeyword();
                                }

<xnq>\/\*                       {
                                    /*
                                     * A comment opener ends the unquoted
                                     * string as well.  Throw the opener back
                                     * and return the token collected so far;
                                     * the INITIAL-state rule for the opener
                                     * then enters <xc> on the next call.
                                     * Switching to <xc> here without
                                     * returning would silently drop the
                                     * token.
                                     */
                                    yylval->str = scanstring;
                                    yyless(0);
                                    BEGIN INITIAL;
                                    return checkKeyword();
                                }

<xnq>({special}|\")             {
                                    /*
                                     * Punctuation or a double quote ends the
                                     * unquoted string; throw the character
                                     * back so that it is scanned again in
                                     * INITIAL state.
                                     */
                                    yylval->str = scanstring;
                                    yyless(0);
                                    BEGIN INITIAL;
                                    return checkKeyword();
                                }

<xnq><<EOF>>                    {
                                    /* end of input ends the unquoted string */
                                    yylval->str = scanstring;
                                    BEGIN INITIAL;
                                    return checkKeyword();
                                }
126
<xnq,xq,xvq>\\b                 { /* backslash escapes for control characters */ addchar(false, '\b'); }

<xnq,xq,xvq>\\f                 { addchar(false, '\f'); }

<xnq,xq,xvq>\\n                 { addchar(false, '\n'); }

<xnq,xq,xvq>\\r                 { addchar(false, '\r'); }

<xnq,xq,xvq>\\t                 { addchar(false, '\t'); }

<xnq,xq,xvq>\\v                 { addchar(false, '\v'); }

<xnq,xq,xvq>{unicode}+          {
                                    /*
                                     * A run of consecutive unicode escapes is
                                     * decoded in one call; parseUnicode()
                                     * carries surrogate state from one escape
                                     * to the next.
                                     */
                                    parseUnicode(yytext, yyleng);
                                }

<xnq,xq,xvq>{hex_char}          { /* \x followed by two hex digits */ parseHexChar(yytext); }

<xnq,xq,xvq>{unicode}*{unicodefail} { /* incomplete \u escape, possibly after complete ones */ yyerror(NULL, "invalid unicode sequence"); }

<xnq,xq,xvq>{hex_fail}          { /* \x followed by fewer than two hex digits */ yyerror(NULL, "invalid hex character sequence"); }

<xnq,xq,xvq>{unicode}+\\        {
                                    /* throw back the \\, and treat as unicode */
                                    yyless(yyleng - 1);
                                    parseUnicode(yytext, yyleng);
                                }

<xnq,xq,xvq>\\.                 { /* any other escaped character is added as itself */ addchar(false, yytext[1]); }

<xnq,xq,xvq>\\                  { /* backslash that none of the escape rules above could match */ yyerror(NULL, "unexpected end after backslash"); }

<xq,xvq><<EOF>>                 { /* input ended before the closing double quote */ yyerror(NULL, "unexpected end of quoted string"); }

<xq>\"                          {
                                    /* closing quote of a quoted string */
                                    yylval->str = scanstring;
                                    BEGIN INITIAL;
                                    return STRING_P;
                                }

<xvq>\"                         {
                                    /* closing quote of a quoted variable name */
                                    yylval->str = scanstring;
                                    BEGIN INITIAL;
                                    return VARIABLE_P;
                                }

<xq,xvq>[^\\\"]+                { /* anything up to the next backslash or double quote */ addstring(false, yytext, yyleng); }
172
<xc>\*\/                        { /* end of comment: resume normal scanning */ BEGIN INITIAL; }

<xc>[^\*]+                      { /* skip comment text up to the next '*' */ }

<xc>\*                          { /* a '*' that is not followed by '/' */ }

<xc><<EOF>>                     { /* input ended inside a comment */ yyerror(NULL, "unexpected end of comment"); }
180
\&\&                            { /* multi-character and comparison operators */ return AND_P; }

\|\|                            { return OR_P; }

\!                              { return NOT_P; }

\*\*                            { return ANY_P; }

\<                              { return LESS_P; }

\<\=                            { return LESSEQUAL_P; }

\=\=                            { return EQUAL_P; }

\<\>                            { /* same token as the != spelling below */ return NOTEQUAL_P; }

\!\=                            { return NOTEQUAL_P; }

\>\=                            { return GREATEREQUAL_P; }

\>                              { return GREATER_P; }

\${other}+                      {
                                    /*
                                     * Unquoted variable name: store the text
                                     * after the '$' and write a terminating
                                     * NUL after it (addchar() does not count
                                     * the NUL in the length).
                                     */
                                    addstring(true, yytext + 1, yyleng - 1);
                                    addchar(false, '\0');
                                    yylval->str = scanstring;
                                    return VARIABLE_P;
                                }

\$\"                            {
                                    /* quoted variable name: start with an empty scanstring */
                                    addchar(true, '\0');
                                    BEGIN xvq;
                                }

{special}                       { /* any other punctuation character is its own token */ return *yytext; }

{blank}+                        { /* ignore */ }

\/\*                            {
                                    /* start of a C-style comment */
                                    addchar(true, '\0');
                                    BEGIN xc;
                                }

{real}                          {
                                    /* number with an exponent */
                                    addstring(true, yytext, yyleng);
                                    addchar(false, '\0');
                                    yylval->str = scanstring;
                                    return NUMERIC_P;
                                }

{decimal}                       {
                                    /* number with a fractional part */
                                    addstring(true, yytext, yyleng);
                                    addchar(false, '\0');
                                    yylval->str = scanstring;
                                    return NUMERIC_P;
                                }

{integer}                       {
                                    addstring(true, yytext, yyleng);
                                    addchar(false, '\0');
                                    yylval->str = scanstring;
                                    return INT_P;
                                }

{decimalfail}                   {
                                    /* throw back the ., and treat as integer */
                                    yyless(yyleng - 1);
                                    addstring(true, yytext, yyleng);
                                    addchar(false, '\0');
                                    yylval->str = scanstring;
                                    return INT_P;
                                }

({realfail1}|{realfail2})       { /* exponent marker without any exponent digits */ yyerror(NULL, "invalid floating point number"); }

\"                              {
                                    /* quoted string: start with an empty scanstring */
                                    addchar(true, '\0');
                                    BEGIN xq;
                                }

\\                              {
                                    /*
                                     * Unquoted string that begins with an
                                     * escape: throw the backslash back so the
                                     * <xnq> escape rules process it, and
                                     * start with an empty scanstring.
                                     */
                                    yyless(0);
                                    addchar(true, '\0');
                                    BEGIN xnq;
                                }

{other}+                        {
                                    /* unquoted string: store its first characters */
                                    addstring(true, yytext, yyleng);
                                    BEGIN xnq;
                                }

<<EOF>>                         { yyterminate(); }
273
274 %%
275
276 /* LCOV_EXCL_STOP */
277
278 void
279 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
280 {
281 if (*yytext == YY_END_OF_BUFFER_CHAR)
282 {
283 ereport(ERROR,
284 (errcode(ERRCODE_SYNTAX_ERROR),
285 /* translator: %s is typically "syntax error" */
286 errmsg("%s at end of jsonpath input", _(message))));
287 }
288 else
289 {
290 ereport(ERROR,
291 (errcode(ERRCODE_SYNTAX_ERROR),
292 /* translator: first %s is typically "syntax error" */
293 errmsg("%s at or near \"%s\" of jsonpath input",
294 _(message), yytext)));
295 }
296 }
297
298 typedef struct JsonPathKeyword
299 {
300 int16 len;
301 bool lowercase;
302 int val;
303 const char *keyword;
304 } JsonPathKeyword;
305
306 /*
307 * Array of key words should be sorted by length and then
308 * alphabetical order
309 */
310 static const JsonPathKeyword keywords[] = {
311 { 2, false, IS_P, "is"},
312 { 2, false, TO_P, "to"},
313 { 3, false, ABS_P, "abs"},
314 { 3, false, LAX_P, "lax"},
315 { 4, false, FLAG_P, "flag"},
316 { 4, false, LAST_P, "last"},
317 { 4, true, NULL_P, "null"},
318 { 4, false, SIZE_P, "size"},
319 { 4, true, TRUE_P, "true"},
320 { 4, false, TYPE_P, "type"},
321 { 4, false, WITH_P, "with"},
322 { 5, true, FALSE_P, "false"},
323 { 5, false, FLOOR_P, "floor"},
324 { 6, false, DOUBLE_P, "double"},
325 { 6, false, EXISTS_P, "exists"},
326 { 6, false, STARTS_P, "starts"},
327 { 6, false, STRICT_P, "strict"},
328 { 7, false, CEILING_P, "ceiling"},
329 { 7, false, UNKNOWN_P, "unknown"},
330 { 8, false, KEYVALUE_P, "keyvalue"},
331 { 10,false, LIKE_REGEX_P, "like_regex"},
332 };
333
334 /* Check if current scanstring value is a keyword */
335 static enum yytokentype
336 checkKeyword()
337 {
338 int res = IDENT_P;
339 int diff;
340 const JsonPathKeyword *StopLow = keywords,
341 *StopHigh = keywords + lengthof(keywords),
342 *StopMiddle;
343
344 if (scanstring.len > keywords[lengthof(keywords) - 1].len)
345 return res;
346
347 while (StopLow < StopHigh)
348 {
349 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
350
351 if (StopMiddle->len == scanstring.len)
352 diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
353 scanstring.len);
354 else
355 diff = StopMiddle->len - scanstring.len;
356
357 if (diff < 0)
358 StopLow = StopMiddle + 1;
359 else if (diff > 0)
360 StopHigh = StopMiddle;
361 else
362 {
363 if (StopMiddle->lowercase)
364 diff = strncmp(StopMiddle->keyword, scanstring.val,
365 scanstring.len);
366
367 if (diff == 0)
368 res = StopMiddle->val;
369
370 break;
371 }
372 }
373
374 return res;
375 }
376
377 /*
378 * Called before any actual parsing is done
379 */
380 static void
381 jsonpath_scanner_init(const char *str, int slen)
382 {
383 if (slen <= 0)
384 slen = strlen(str);
385
386 /*
387 * Might be left over after ereport()
388 */
389 yy_init_globals();
390
391 /*
392 * Make a scan buffer with special termination needed by flex.
393 */
394
395 scanbuflen = slen;
396 scanbuf = palloc(slen + 2);
397 memcpy(scanbuf, str, slen);
398 scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
399 scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
400
401 BEGIN(INITIAL);
402 }
403
404
405 /*
406 * Called after parsing is done to clean up after jsonpath_scanner_init()
407 */
408 static void
409 jsonpath_scanner_finish(void)
410 {
411 yy_delete_buffer(scanbufhandle);
412 pfree(scanbuf);
413 }
414
415 /*
416 * Resize scanstring so that it can append string of given length.
417 * Reinitialize if required.
418 */
419 static void
420 resizeString(bool init, int appendLen)
421 {
422 if (init)
423 {
424 scanstring.total = Max(32, appendLen);
425 scanstring.val = (char *) palloc(scanstring.total);
426 scanstring.len = 0;
427 }
428 else
429 {
430 if (scanstring.len + appendLen >= scanstring.total)
431 {
432 while (scanstring.len + appendLen >= scanstring.total)
433 scanstring.total *= 2;
434 scanstring.val = repalloc(scanstring.val, scanstring.total);
435 }
436 }
437 }
438
439 /* Add set of bytes at "s" of length "l" to scanstring */
440 static void
441 addstring(bool init, char *s, int l)
442 {
443 resizeString(init, l + 1);
444 memcpy(scanstring.val + scanstring.len, s, l);
445 scanstring.len += l;
446 }
447
448 /* Add single byte "c" to scanstring */
449 static void
450 addchar(bool init, char c)
451 {
452 resizeString(init, 1);
453 scanstring.val[scanstring.len] = c;
454 if (c != '\0')
455 scanstring.len++;
456 }
457
458 /* Interface to jsonpath parser */
459 JsonPathParseResult *
460 parsejsonpath(const char *str, int len)
461 {
462 JsonPathParseResult *parseresult;
463
464 jsonpath_scanner_init(str, len);
465
466 if (jsonpath_yyparse((void *) &parseresult) != 0)
467 jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
468
469 jsonpath_scanner_finish();
470
471 return parseresult;
472 }
473
/*
 * Return the numeric value (0..15) of the hexadecimal digit "c".
 * Any other character is reported through jsonpath_yyerror().
 */
static int
hexval(char c)
{
    if ('0' <= c && c <= '9')
        return c - '0';
    else if ('a' <= c && c <= 'f')
        return 0xA + (c - 'a');
    else if ('A' <= c && c <= 'F')
        return 0xA + (c - 'A');

    jsonpath_yyerror(NULL, "invalid hexadecimal digit");
    return 0;                   /* not reached */
}
487
488 /* Add given unicode character to scanstring */
489 static void
490 addUnicodeChar(int ch)
491 {
492 /*
493 * For UTF8, replace the escape sequence by the actual
494 * utf8 character in lex->strval. Do this also for other
495 * encodings if the escape designates an ASCII character,
496 * otherwise raise an error.
497 */
498
499 if (ch == 0)
500 {
501 /* We can't allow this, since our TEXT type doesn't */
502 ereport(ERROR,
503 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
504 errmsg("unsupported Unicode escape sequence"),
505 errdetail("\\u0000 cannot be converted to text.")));
506 }
507 else if (GetDatabaseEncoding() == PG_UTF8)
508 {
509 char utf8str[5];
510 int utf8len;
511
512 unicode_to_utf8(ch, (unsigned char *) utf8str);
513 utf8len = pg_utf_mblen((unsigned char *) utf8str);
514 addstring(false, utf8str, utf8len);
515 }
516 else if (ch <= 0x007f)
517 {
518 /*
519 * This is the only way to designate things like a
520 * form feed character in JSON, so it's useful in all
521 * encodings.
522 */
523 addchar(false, (char) ch);
524 }
525 else
526 {
527 ereport(ERROR,
528 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
529 errmsg("invalid input syntax for type %s", "jsonpath"),
530 errdetail("Unicode escape values cannot be used for code "
531 "point values above 007F when the server encoding "
532 "is not UTF8.")));
533 }
534 }
535
536 /* Add unicode character and process its hi surrogate */
537 static void
538 addUnicode(int ch, int *hi_surrogate)
539 {
540 if (ch >= 0xd800 && ch <= 0xdbff)
541 {
542 if (*hi_surrogate != -1)
543 ereport(ERROR,
544 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
545 errmsg("invalid input syntax for type %s", "jsonpath"),
546 errdetail("Unicode high surrogate must not follow "
547 "a high surrogate.")));
548 *hi_surrogate = (ch & 0x3ff) << 10;
549 return;
550 }
551 else if (ch >= 0xdc00 && ch <= 0xdfff)
552 {
553 if (*hi_surrogate == -1)
554 ereport(ERROR,
555 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
556 errmsg("invalid input syntax for type %s", "jsonpath"),
557 errdetail("Unicode low surrogate must follow a high "
558 "surrogate.")));
559 ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
560 *hi_surrogate = -1;
561 }
562 else if (*hi_surrogate != -1)
563 {
564 ereport(ERROR,
565 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
566 errmsg("invalid input syntax for type %s", "jsonpath"),
567 errdetail("Unicode low surrogate must follow a high "
568 "surrogate.")));
569 }
570
571 addUnicodeChar(ch);
572 }
573
574 /*
575 * parseUnicode was adopted from json_lex_string() in
576 * src/backend/utils/adt/json.c
577 */
578 static void
579 parseUnicode(char *s, int l)
580 {
581 int i = 2;
582 int hi_surrogate = -1;
583
584 for (i = 2; i < l; i += 2) /* skip '\u' */
585 {
586 int ch = 0;
587 int j;
588
589 if (s[i] == '{') /* parse '\u{XX...}' */
590 {
591 while (s[++i] != '}' && i < l)
592 ch = (ch << 4) | hexval(s[i]);
593 i++; /* skip '}' */
594 }
595 else /* parse '\uXXXX' */
596 {
597 for (j = 0; j < 4 && i < l; j++)
598 ch = (ch << 4) | hexval(s[i++]);
599 }
600
601 addUnicode(ch, &hi_surrogate);
602 }
603
604 if (hi_surrogate != -1)
605 {
606 ereport(ERROR,
607 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
608 errmsg("invalid input syntax for type %s", "jsonpath"),
609 errdetail("Unicode low surrogate must follow a high "
610 "surrogate.")));
611 }
612 }
613
/*
 * Decode one hex escape and add the resulting character to scanstring.
 * The two hex digits are read from s[2] and s[3].
 */
static void
parseHexChar(char *s)
{
    int         ch;

    ch = (hexval(s[2]) << 4) | hexval(s[3]);

    addUnicodeChar(ch);
}
623
624 /*
625 * Interface functions to make flex use palloc() instead of malloc().
626 * It'd be better to make these static, but flex insists otherwise.
627 */
628
629 void *
630 jsonpath_yyalloc(yy_size_t bytes)
631 {
632 return palloc(bytes);
633 }
634
635 void *
636 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
637 {
638 if (ptr)
639 return repalloc(ptr, bytes);
640 else
641 return palloc(bytes);
642 }
643
/* Release the scanner's block "ptr"; a NULL pointer is ignored. */
void
jsonpath_yyfree(void *ptr)
{
    if (ptr == NULL)
        return;

    pfree(ptr);
}
650