1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * jsonpath_scan.l
5  *	Lexical parser for jsonpath datatype
6  *
7  * Splits jsonpath string into tokens represented as JsonPathString structs.
8  * Decodes unicode and hex escaped strings.
9  *
10  * Copyright (c) 2019, PostgreSQL Global Development Group
11  *
12  * IDENTIFICATION
13  *	src/backend/utils/adt/jsonpath_scan.l
14  *
15  *-------------------------------------------------------------------------
16  */
17 
18 #include "postgres.h"
19 
20 #include "mb/pg_wchar.h"
21 #include "nodes/pg_list.h"
22 
23 static JsonPathString scanstring;
24 
25 /* Handles to the buffer that the lexer uses internally */
26 static YY_BUFFER_STATE scanbufhandle;
27 static char *scanbuf;
28 static int	scanbuflen;
29 
30 static void addstring(bool init, char *s, int l);
31 static void addchar(bool init, char s);
32 static enum yytokentype checkKeyword(void);
33 static void parseUnicode(char *s, int l);
34 static void parseHexChar(char *s);
35 
36 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
37 #undef fprintf
38 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
39 
40 static void
fprintf_to_ereport(const char * fmt,const char * msg)41 fprintf_to_ereport(const char *fmt, const char *msg)
42 {
43 	ereport(ERROR, (errmsg_internal("%s", msg)));
44 }
45 
46 /* LCOV_EXCL_START */
47 
48 %}
49 
50 %option 8bit
51 %option never-interactive
52 %option nodefault
53 %option noinput
54 %option nounput
55 %option noyywrap
56 %option warn
57 %option prefix="jsonpath_yy"
58 %option bison-bridge
59 %option noyyalloc
60 %option noyyrealloc
61 %option noyyfree
62 
63 /*
64  * We use exclusive states for quoted and non-quoted strings,
65  * quoted variable names and C-style comments.
66  * Exclusive states:
67  *  <xq> - quoted strings
68  *  <xnq> - non-quoted strings
69  *  <xvq> - quoted variable names
70  *  <xc> - C-style comment
71  */
72 
73 %x xq
74 %x xnq
75 %x xvq
76 %x xc
77 
/*
 * Named patterns used by the rules section: character classes, numeric
 * literal forms (the *fail variants match incomplete literals), and the
 * \u and \x escape forms together with their incomplete variants.
 */
78 special		[\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
79 blank		[ \t\n\r\f]
80 /* "other" means anything that's not special, blank, or '\' or '"' */
81 other		[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
82 
83 digit		[0-9]
84 integer		(0|[1-9]{digit}*)
85 decimal		{integer}\.{digit}+
86 decimalfail	{integer}\.
87 real		({integer}|{decimal})[Ee][-+]?{digit}+
88 realfail1	({integer}|{decimal})[Ee]
89 realfail2	({integer}|{decimal})[Ee][-+]
90 
91 hex_dig		[0-9A-Fa-f]
92 unicode		\\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
93 unicodefail	\\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
94 hex_char	\\x{hex_dig}{2}
95 hex_fail	\\x{hex_dig}{0,1}
96 
97 %%
98 
99 <xnq>{other}+					{
									/* Rules for state xnq (non-quoted string): append another run of ordinary characters to scanstring. */
100 									addstring(false, yytext, yyleng);
101 								}
102 
103 <xnq>{blank}+					{
									/* Whitespace ends the string; checkKeyword() decides which token it is. */
104 									yylval->str = scanstring;
105 									BEGIN INITIAL;
106 									return checkKeyword();
107 								}
108 
109 <xnq>\/\*						{
									/* NOTE(review): the string is stored in yylval but no token is returned before switching to the comment state -- confirm this is intended. */
110 									yylval->str = scanstring;
111 									BEGIN xc;
112 								}
113 
114 <xnq>({special}|\")				{
115 									yylval->str = scanstring;
116 									yyless(0);
117 									BEGIN INITIAL;
118 									return checkKeyword();
119 								}
120 
121 <xnq><<EOF>>					{
122 									yylval->str = scanstring;
123 									BEGIN INITIAL;
124 									return checkKeyword();
125 								}
126 
127 <xnq,xq,xvq>\\b				{ /* Escape handling shared by states xnq, xq and xvq: each escape appends the decoded character to scanstring. */ addchar(false, '\b'); }
128 
129 <xnq,xq,xvq>\\f				{ addchar(false, '\f'); }
130 
131 <xnq,xq,xvq>\\n				{ addchar(false, '\n'); }
132 
133 <xnq,xq,xvq>\\r				{ addchar(false, '\r'); }
134 
135 <xnq,xq,xvq>\\t				{ addchar(false, '\t'); }
136 
137 <xnq,xq,xvq>\\v				{ addchar(false, '\v'); }
138 
139 <xnq,xq,xvq>{unicode}+		{ parseUnicode(yytext, yyleng); }
140 
141 <xnq,xq,xvq>{hex_char}		{ parseHexChar(yytext); }
142 
143 <xnq,xq,xvq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }
144 
145 <xnq,xq,xvq>{hex_fail}		{ yyerror(NULL, "invalid hex character sequence"); }
146 
147 <xnq,xq,xvq>{unicode}+\\	{
148 								/* throw back the \\, and treat as unicode */
								/* yyless() shortens the match, so yyleng below no longer counts the backslash */
149 								yyless(yyleng - 1);
150 								parseUnicode(yytext, yyleng);
151 							}
152 
153 <xnq,xq,xvq>\\.				{ /* any other escaped character is appended as itself */ addchar(false, yytext[1]); }
154 
155 <xnq,xq,xvq>\\				{ yyerror(NULL, "unexpected end after backslash"); }
156 
157 <xq,xvq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
158 
159 <xq>\"							{
160 									yylval->str = scanstring;
161 									BEGIN INITIAL;
162 									return STRING_P;
163 								}
164 
165 <xvq>\"							{
166 									yylval->str = scanstring;
167 									BEGIN INITIAL;
168 									return VARIABLE_P;
169 								}
170 
171 <xq,xvq>[^\\\"]+				{ addstring(false, yytext, yyleng); }
172 
173 <xc>\*\/						{ /* end of C-style comment: resume normal scanning */ BEGIN INITIAL; }
174 
175 <xc>[^\*]+						{ /* discard comment text */ }
176 
177 <xc>\*							{ /* a star not followed by a slash is still comment text */ }
178 
179 <xc><<EOF>>						{ yyerror(NULL, "unexpected end of comment"); }
180 
181 \&\&							{ /* Rules for the INITIAL state start here: operators first. */ return AND_P; }
182 
183 \|\|							{ return OR_P; }
184 
185 \!								{ return NOT_P; }
186 
187 \*\*							{ return ANY_P; }
188 
189 \<								{ return LESS_P; }
190 
191 \<\=							{ return LESSEQUAL_P; }
192 
193 \=\=							{ return EQUAL_P; }
194 
195 \<\>							{ return NOTEQUAL_P; }
196 
197 \!\=							{ return NOTEQUAL_P; }
198 
199 \>\=							{ return GREATEREQUAL_P; }
200 
201 \>								{ return GREATER_P; }
202 
203 \${other}+						{
									/* Unquoted variable: keep the text after the dollar sign, NUL-terminated. */
204 									addstring(true, yytext + 1, yyleng - 1);
205 									addchar(false, '\0');
206 									yylval->str = scanstring;
207 									return VARIABLE_P;
208 								}
209 
210 \$\"							{
211 									addchar(true, '\0');
212 									BEGIN xvq;
213 								}
214 
215 {special}						{ return *yytext; }
216 
217 {blank}+						{ /* ignore */ }
218 
219 \/\*							{
220 									addchar(true, '\0');
221 									BEGIN xc;
222 								}
223 
224 {real}							{
225 									addstring(true, yytext, yyleng);
226 									addchar(false, '\0');
227 									yylval->str = scanstring;
228 									return NUMERIC_P;
229 								}
230 
231 {decimal}						{
232 									addstring(true, yytext, yyleng);
233 									addchar(false, '\0');
234 									yylval->str = scanstring;
235 									return NUMERIC_P;
236 								}
237 
238 {integer}						{
239 									addstring(true, yytext, yyleng);
240 									addchar(false, '\0');
241 									yylval->str = scanstring;
242 									return INT_P;
243 								}
244 
245 {decimalfail}					{
246 									/* throw back the ., and treat as integer */
247 									yyless(yyleng - 1);
248 									addstring(true, yytext, yyleng);
249 									addchar(false, '\0');
250 									yylval->str = scanstring;
251 									return INT_P;
252 								}
253 
254 ({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }
255 
256 \"								{
257 									addchar(true, '\0');
258 									BEGIN xq;
259 								}
260 
261 \\								{
									/* Push the backslash back so it is re-scanned by the escape rules of state xnq. */
262 									yyless(0);
263 									addchar(true, '\0');
264 									BEGIN xnq;
265 								}
266 
267 {other}+						{
									/* Start of a non-quoted string; later pieces are appended in state xnq. */
268 									addstring(true, yytext, yyleng);
269 									BEGIN xnq;
270 								}
271 
272 <<EOF>>							{ yyterminate(); }
273 
274 %%
275 
276 /* LCOV_EXCL_STOP */
277 
278 void
279 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
280 {
281 	if (*yytext == YY_END_OF_BUFFER_CHAR)
282 	{
283 		ereport(ERROR,
284 				(errcode(ERRCODE_SYNTAX_ERROR),
285 				 /* translator: %s is typically "syntax error" */
286 				 errmsg("%s at end of jsonpath input", _(message))));
287 	}
288 	else
289 	{
290 		ereport(ERROR,
291 				(errcode(ERRCODE_SYNTAX_ERROR),
292 				 /* translator: first %s is typically "syntax error" */
293 				 errmsg("%s at or near \"%s\" of jsonpath input",
294 						_(message), yytext)));
295 	}
296 }
297 
298 typedef struct JsonPathKeyword
299 {
300 	int16		len;
301 	bool		lowercase;
302 	int			val;
303 	const char *keyword;
304 } JsonPathKeyword;
305 
306 /*
307  * Array of key words should be sorted by length and then
308  * alphabetical order
309  */
310 static const JsonPathKeyword keywords[] = {
311 	{ 2, false,	IS_P,		"is"},
312 	{ 2, false,	TO_P,		"to"},
313 	{ 3, false,	ABS_P,		"abs"},
314 	{ 3, false,	LAX_P,		"lax"},
315 	{ 4, false,	FLAG_P,		"flag"},
316 	{ 4, false,	LAST_P,		"last"},
317 	{ 4, true,	NULL_P,		"null"},
318 	{ 4, false,	SIZE_P,		"size"},
319 	{ 4, true,	TRUE_P,		"true"},
320 	{ 4, false,	TYPE_P,		"type"},
321 	{ 4, false,	WITH_P,		"with"},
322 	{ 5, true,	FALSE_P,	"false"},
323 	{ 5, false,	FLOOR_P,	"floor"},
324 	{ 6, false,	DOUBLE_P,	"double"},
325 	{ 6, false,	EXISTS_P,	"exists"},
326 	{ 6, false,	STARTS_P,	"starts"},
327 	{ 6, false,	STRICT_P,	"strict"},
328 	{ 7, false,	CEILING_P,	"ceiling"},
329 	{ 7, false,	UNKNOWN_P,	"unknown"},
330 	{ 8, false,	KEYVALUE_P,	"keyvalue"},
331 	{ 10,false, LIKE_REGEX_P, "like_regex"},
332 };
333 
334 /* Check if current scanstring value is a keyword */
335 static enum yytokentype
336 checkKeyword()
337 {
338 	int						res = IDENT_P;
339 	int						diff;
340 	const JsonPathKeyword  *StopLow = keywords,
341 						   *StopHigh = keywords + lengthof(keywords),
342 						   *StopMiddle;
343 
344 	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
345 		return res;
346 
347 	while (StopLow < StopHigh)
348 	{
349 		StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
350 
351 		if (StopMiddle->len == scanstring.len)
352 			diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
353 								  scanstring.len);
354 		else
355 			diff = StopMiddle->len - scanstring.len;
356 
357 		if (diff < 0)
358 			StopLow = StopMiddle + 1;
359 		else if (diff > 0)
360 			StopHigh = StopMiddle;
361 		else
362 		{
363 			if (StopMiddle->lowercase)
364 				diff = strncmp(StopMiddle->keyword, scanstring.val,
365 							   scanstring.len);
366 
367 			if (diff == 0)
368 				res = StopMiddle->val;
369 
370 			break;
371 		}
372 	}
373 
374 	return res;
375 }
376 
377 /*
378  * Called before any actual parsing is done
379  */
380 static void
381 jsonpath_scanner_init(const char *str, int slen)
382 {
383 	if (slen <= 0)
384 		slen = strlen(str);
385 
386 	/*
387 	 * Might be left over after ereport()
388 	 */
389 	yy_init_globals();
390 
391 	/*
392 	 * Make a scan buffer with special termination needed by flex.
393 	 */
394 
395 	scanbuflen = slen;
396 	scanbuf = palloc(slen + 2);
397 	memcpy(scanbuf, str, slen);
398 	scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
399 	scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
400 
401 	BEGIN(INITIAL);
402 }
403 
404 
405 /*
406  * Called after parsing is done to clean up after jsonpath_scanner_init()
407  */
408 static void
409 jsonpath_scanner_finish(void)
410 {
411 	yy_delete_buffer(scanbufhandle);
412 	pfree(scanbuf);
413 }
414 
415 /*
416  * Resize scanstring so that it can append string of given length.
417  * Reinitialize if required.
418  */
419 static void
420 resizeString(bool init, int appendLen)
421 {
422 	if (init)
423 	{
424 		scanstring.total = Max(32, appendLen);
425 		scanstring.val = (char *) palloc(scanstring.total);
426 		scanstring.len = 0;
427 	}
428 	else
429 	{
430 		if (scanstring.len + appendLen >= scanstring.total)
431 		{
432 			while (scanstring.len + appendLen >= scanstring.total)
433 				scanstring.total *= 2;
434 			scanstring.val = repalloc(scanstring.val, scanstring.total);
435 		}
436 	}
437 }
438 
439 /* Add set of bytes at "s" of length "l" to scanstring */
440 static void
441 addstring(bool init, char *s, int l)
442 {
443 	resizeString(init, l + 1);
444 	memcpy(scanstring.val + scanstring.len, s, l);
445 	scanstring.len += l;
446 }
447 
448 /* Add single byte "c" to scanstring */
449 static void
450 addchar(bool init, char c)
451 {
452 	resizeString(init, 1);
453 	scanstring.val[scanstring.len] = c;
454 	if (c != '\0')
455 		scanstring.len++;
456 }
457 
458 /* Interface to jsonpath parser */
459 JsonPathParseResult *
460 parsejsonpath(const char *str, int len)
461 {
462 	JsonPathParseResult	*parseresult;
463 
464 	jsonpath_scanner_init(str, len);
465 
466 	if (jsonpath_yyparse((void *) &parseresult) != 0)
467 		jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
468 
469 	jsonpath_scanner_finish();
470 
471 	return parseresult;
472 }
473 
474 /* Turn hex character into integer */
475 static int
476 hexval(char c)
477 {
478 	if (c >= '0' && c <= '9')
479 		return c - '0';
480 	if (c >= 'a' && c <= 'f')
481 		return c - 'a' + 0xA;
482 	if (c >= 'A' && c <= 'F')
483 		return c - 'A' + 0xA;
484 	jsonpath_yyerror(NULL, "invalid hexadecimal digit");
485 	return 0; /* not reached */
486 }
487 
488 /* Add given unicode character to scanstring */
489 static void
490 addUnicodeChar(int ch)
491 {
492 	/*
493 	 * For UTF8, replace the escape sequence by the actual
494 	 * utf8 character in lex->strval. Do this also for other
495 	 * encodings if the escape designates an ASCII character,
496 	 * otherwise raise an error.
497 	 */
498 
499 	if (ch == 0)
500 	{
501 		/* We can't allow this, since our TEXT type doesn't */
502 		ereport(ERROR,
503 				(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
504 				 errmsg("unsupported Unicode escape sequence"),
505 				  errdetail("\\u0000 cannot be converted to text.")));
506 	}
507 	else if (GetDatabaseEncoding() == PG_UTF8)
508 	{
509 		char utf8str[5];
510 		int utf8len;
511 
512 		unicode_to_utf8(ch, (unsigned char *) utf8str);
513 		utf8len = pg_utf_mblen((unsigned char *) utf8str);
514 		addstring(false, utf8str, utf8len);
515 	}
516 	else if (ch <= 0x007f)
517 	{
518 		/*
519 		 * This is the only way to designate things like a
520 		 * form feed character in JSON, so it's useful in all
521 		 * encodings.
522 		 */
523 		addchar(false, (char) ch);
524 	}
525 	else
526 	{
527 		ereport(ERROR,
528 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
529 				 errmsg("invalid input syntax for type %s", "jsonpath"),
530 				 errdetail("Unicode escape values cannot be used for code "
531 						   "point values above 007F when the server encoding "
532 						   "is not UTF8.")));
533 	}
534 }
535 
536 /* Add unicode character and process its hi surrogate */
537 static void
538 addUnicode(int ch, int *hi_surrogate)
539 {
540 	if (ch >= 0xd800 && ch <= 0xdbff)
541 	{
542 		if (*hi_surrogate != -1)
543 			ereport(ERROR,
544 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
545 					 errmsg("invalid input syntax for type %s", "jsonpath"),
546 					 errdetail("Unicode high surrogate must not follow "
547 							   "a high surrogate.")));
548 		*hi_surrogate = (ch & 0x3ff) << 10;
549 		return;
550 	}
551 	else if (ch >= 0xdc00 && ch <= 0xdfff)
552 	{
553 		if (*hi_surrogate == -1)
554 			ereport(ERROR,
555 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
556 					 errmsg("invalid input syntax for type %s", "jsonpath"),
557 					 errdetail("Unicode low surrogate must follow a high "
558 							   "surrogate.")));
559 		ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
560 		*hi_surrogate = -1;
561 	}
562 	else if (*hi_surrogate != -1)
563 	{
564 		ereport(ERROR,
565 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
566 				 errmsg("invalid input syntax for type %s", "jsonpath"),
567 				 errdetail("Unicode low surrogate must follow a high "
568 						   "surrogate.")));
569 	}
570 
571 	addUnicodeChar(ch);
572 }
573 
574 /*
575  * parseUnicode was adopted from json_lex_string() in
576  * src/backend/utils/adt/json.c
577  */
578 static void
579 parseUnicode(char *s, int l)
580 {
581 	int			i = 2;
582 	int			hi_surrogate = -1;
583 
584 	for (i = 2; i < l; i += 2)	/* skip '\u' */
585 	{
586 		int			ch = 0;
587 		int			j;
588 
589 		if (s[i] == '{')	/* parse '\u{XX...}' */
590 		{
591 			while (s[++i] != '}' && i < l)
592 				ch = (ch << 4) | hexval(s[i]);
593 			i++;	/* skip '}' */
594 		}
595 		else		/* parse '\uXXXX' */
596 		{
597 			for (j = 0; j < 4 && i < l; j++)
598 				ch = (ch << 4) | hexval(s[i++]);
599 		}
600 
601 		addUnicode(ch, &hi_surrogate);
602 	}
603 
604 	if (hi_surrogate != -1)
605 	{
606 		ereport(ERROR,
607 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
608 				 errmsg("invalid input syntax for type %s", "jsonpath"),
609 				 errdetail("Unicode low surrogate must follow a high "
610 						   "surrogate.")));
611 	}
612 }
613 
/*
 * Decode a single \xXX escape starting at "s" and append the character it
 * designates to scanstring.  s[2] and s[3] are the two hex digits.
 */
static void
parseHexChar(char *s)
{
	int			high_nibble = hexval(s[2]);
	int			low_nibble = hexval(s[3]);

	addUnicodeChar((high_nibble << 4) | low_nibble);
}
623 
624 /*
625  * Interface functions to make flex use palloc() instead of malloc().
626  * It'd be better to make these static, but flex insists otherwise.
627  */
628 
629 void *
630 jsonpath_yyalloc(yy_size_t bytes)
631 {
632 	return palloc(bytes);
633 }
634 
635 void *
636 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
637 {
638 	if (ptr)
639 		return repalloc(ptr, bytes);
640 	else
641 		return palloc(bytes);
642 }
643 
644 void
645 jsonpath_yyfree(void *ptr)
646 {
647 	if (ptr)
648 		pfree(ptr);
649 }
650