1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * jsonpath_scan.l
5  *	Lexical parser for jsonpath datatype
6  *
7  * Splits jsonpath string into tokens represented as JsonPathString structs.
8  * Decodes unicode and hex escaped strings.
9  *
10  * Copyright (c) 2019-2020, PostgreSQL Global Development Group
11  *
12  * IDENTIFICATION
13  *	src/backend/utils/adt/jsonpath_scan.l
14  *
15  *-------------------------------------------------------------------------
16  */
17 
18 #include "postgres.h"
19 
20 #include "mb/pg_wchar.h"
21 #include "nodes/pg_list.h"
22 
/*
 * Text of the token currently being assembled.  It is filled in by
 * addstring()/addchar() and handed to the parser by the lexer rules through
 * yylval->str.
 */
static JsonPathString scanstring;

/*
 * Handles to the buffer that the lexer uses internally.
 *
 * scanbuf is the palloc'd copy of the input made by jsonpath_scanner_init(),
 * scanbuflen is the input length (not counting the two terminator bytes
 * appended to the copy), and scanbufhandle is what yy_scan_buffer() returned
 * for it.
 */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;
static int	scanbuflen;

/* Forward declarations of helpers used by the rule actions below */
static void addstring(bool init, char *s, int l);
static void addchar(bool init, char s);
static enum yytokentype checkKeyword(void);
static void parseUnicode(char *s, int l);
static void parseHexChar(char *s);

/*
 * Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error).
 *
 * The macro below drops the stream argument of any three-argument fprintf()
 * call that follows and forwards the format and message to
 * fprintf_to_ereport() instead.
 */
#undef fprintf
#define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
39 
/*
 * Replacement target for the fprintf() macro defined in this file: raise
 * "msg" as an ERROR through ereport() instead of printing it.
 *
 * fmt: the format string of the intercepted fprintf() call; it is not used.
 * msg: the message text, reported verbatim via errmsg_internal().
 */
static void
fprintf_to_ereport(const char *fmt, const char *msg)
{
	ereport(ERROR, (errmsg_internal("%s", msg)));
}
45 
46 /* LCOV_EXCL_START */
47 
48 %}
49 
50 %option 8bit
51 %option never-interactive
52 %option nodefault
53 %option noinput
54 %option nounput
55 %option noyywrap
56 %option warn
57 %option prefix="jsonpath_yy"
58 %option bison-bridge
59 %option noyyalloc
60 %option noyyrealloc
61 %option noyyfree
62 
63 /*
64  * We use exclusive states for quoted and non-quoted strings,
65  * quoted variable names and C-style comments.
66  * Exclusive states:
67  *  <xq> - quoted strings
68  *  <xnq> - non-quoted strings
69  *  <xvq> - quoted variable names
70  *  <xc> - C-style comment
71  */
72 
73 %x xq
74 %x xnq
75 %x xvq
76 %x xc
77 
/* Single punctuation characters; the {special} rule returns each as its own token */
special		[\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
/* Whitespace: space, tab, newline, carriage return, form feed */
blank		[ \t\n\r\f]
/* "other" means anything that's not special, blank, or '\' or '"' */
other		[^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]

/*
 * Numeric literals.  An integer is either a lone 0 or starts with a nonzero
 * digit.  The "fail" patterns match incomplete literals: decimalfail is an
 * integer followed by a dot with no fraction digits, realfail1 stops after
 * the exponent letter, realfail2 stops after the exponent sign.
 */
digit		[0-9]
integer		(0|[1-9]{digit}*)
decimal		{integer}\.{digit}+
decimalfail	{integer}\.
real		({integer}|{decimal})[Ee][-+]?{digit}+
realfail1	({integer}|{decimal})[Ee]
realfail2	({integer}|{decimal})[Ee][-+]

/*
 * Escape sequences.  unicode is \u followed by exactly four hex digits, or
 * by one to six hex digits in braces; hex_char is \x followed by exactly two
 * hex digits.  unicodefail and hex_fail match the same prefixes with too few
 * digits (or a missing closing brace) so they can be reported as errors.
 */
hex_dig		[0-9A-Fa-f]
unicode		\\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
unicodefail	\\u({hex_dig}{0,3}|\{{hex_dig}{0,6})
hex_char	\\x{hex_dig}{2}
hex_fail	\\x{hex_dig}{0,1}
96 
97 %%
98 
	/*
	 * Rules for <xnq>, the non-quoted string state.  The text collected so
	 * far is in scanstring; these rules either extend it or end the token
	 * and let checkKeyword() decide which token code to return.
	 */

<xnq>{other}+					{
									/* more ordinary characters: append them */
									addstring(false, yytext, yyleng);
								}

<xnq>{blank}+					{
									/* whitespace ends the token and is consumed */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq>\/\*						{
									/*
									 * A comment opener ends the string and
									 * switches to the comment state.
									 *
									 * NOTE(review): no token is returned
									 * here, and the comment-closing rule goes
									 * straight back to INITIAL, so the text
									 * collected so far looks like it is
									 * dropped -- confirm this is intended.
									 */
									yylval->str = scanstring;
									BEGIN xc;
								}

<xnq>({special}|\")				{
									/*
									 * A special character or double quote
									 * ends the token; yyless(0) pushes it
									 * back so INITIAL rescans it.
									 */
									yylval->str = scanstring;
									yyless(0);
									BEGIN INITIAL;
									return checkKeyword();
								}

<xnq><<EOF>>					{
									/* end of input also ends the token */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return checkKeyword();
								}
126 
	/*
	 * Backslash escapes, shared by non-quoted strings, quoted strings and
	 * quoted variable names.  Each single-letter escape appends the
	 * corresponding control character to scanstring.
	 */

<xnq,xq,xvq>\\b				{ addchar(false, '\b'); }

<xnq,xq,xvq>\\f				{ addchar(false, '\f'); }

<xnq,xq,xvq>\\n				{ addchar(false, '\n'); }

<xnq,xq,xvq>\\r				{ addchar(false, '\r'); }

<xnq,xq,xvq>\\t				{ addchar(false, '\t'); }

<xnq,xq,xvq>\\v				{ addchar(false, '\v'); }

	/* A run of one or more well-formed unicode escapes is decoded together */

<xnq,xq,xvq>{unicode}+		{ parseUnicode(yytext, yyleng); }

	/* A well-formed two-digit hex escape */

<xnq,xq,xvq>{hex_char}		{ parseHexChar(yytext); }

	/* Incomplete unicode or hex escapes are reported as errors */

<xnq,xq,xvq>{unicode}*{unicodefail}	{ yyerror(NULL, "invalid unicode sequence"); }

<xnq,xq,xvq>{hex_fail}		{ yyerror(NULL, "invalid hex character sequence"); }

<xnq,xq,xvq>{unicode}+\\	{
								/* throw back the \\, and treat as unicode */
								yyless(yyleng - 1);
								parseUnicode(yytext, yyleng);
							}

	/* Any other escaped character stands for itself */

<xnq,xq,xvq>\\.				{ addchar(false, yytext[1]); }

	/* A backslash that the rules above could not pair with anything */

<xnq,xq,xvq>\\				{ yyerror(NULL, "unexpected end after backslash"); }

	/* Input ended before the closing double quote */

<xq,xvq><<EOF>>				{ yyerror(NULL, "unexpected end of quoted string"); }
158 
<xq>\"							{
									/* closing quote of a quoted string */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return STRING_P;
								}

<xvq>\"							{
									/* closing quote of a quoted variable name */
									yylval->str = scanstring;
									BEGIN INITIAL;
									return VARIABLE_P;
								}

	/* Inside quotes, anything except backslash and double quote is literal */

<xq,xvq>[^\\\"]+				{ addstring(false, yytext, yyleng); }

	/*
	 * <xc>: inside a C-style comment.  The terminator returns to INITIAL;
	 * everything else, including a lone asterisk, is discarded.
	 */

<xc>\*\/						{ BEGIN INITIAL; }

<xc>[^\*]+						{ }

<xc>\*							{ }

<xc><<EOF>>						{ yyerror(NULL, "unexpected end of comment"); }
180 
	/*
	 * INITIAL state: multi-character and comparison operators.  Note that
	 * both the "<>" and "!=" spellings return NOTEQUAL_P.
	 */

\&\&							{ return AND_P; }

\|\|							{ return OR_P; }

\!								{ return NOT_P; }

\*\*							{ return ANY_P; }

\<								{ return LESS_P; }

\<\=							{ return LESSEQUAL_P; }

\=\=							{ return EQUAL_P; }

\<\>							{ return NOTEQUAL_P; }

\!\=							{ return NOTEQUAL_P; }

\>\=							{ return GREATEREQUAL_P; }

\>								{ return GREATER_P; }

\${other}+						{
									/*
									 * Unquoted variable: store the name
									 * without the leading '$' and
									 * NUL-terminate it.
									 */
									addstring(true, yytext + 1, yyleng - 1);
									addchar(false, '\0');
									yylval->str = scanstring;
									return VARIABLE_P;
								}

\$\"							{
									/*
									 * Quoted variable: start with an empty
									 * scanstring and collect the name in
									 * the xvq state.
									 */
									addchar(true, '\0');
									BEGIN xvq;
								}

	/* Any other special character is returned as its own character code */

{special}						{ return *yytext; }

{blank}+						{ /* ignore */ }

\/\*							{
									/*
									 * Comment opener: reinitialize
									 * scanstring and skip the comment in
									 * the xc state.
									 */
									addchar(true, '\0');
									BEGIN xc;
								}
223 
	/*
	 * Numeric literals.  The matched text is copied into a fresh scanstring
	 * and NUL-terminated; real and decimal forms yield NUMERIC_P, plain
	 * integers yield INT_P.
	 */

{real}							{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return NUMERIC_P;
								}

{decimal}						{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return NUMERIC_P;
								}

{integer}						{
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return INT_P;
								}

{decimalfail}					{
									/* throw back the ., and treat as integer */
									yyless(yyleng - 1);
									addstring(true, yytext, yyleng);
									addchar(false, '\0');
									yylval->str = scanstring;
									return INT_P;
								}

	/* A number that stops after the exponent letter or exponent sign */

({realfail1}|{realfail2})		{ yyerror(NULL, "invalid floating point number"); }

\"								{
									/*
									 * Opening quote: start with an empty
									 * scanstring and collect the string in
									 * the xq state.
									 */
									addchar(true, '\0');
									BEGIN xq;
								}

\\								{
									/*
									 * A backslash starts a non-quoted
									 * string; push it back so the escape
									 * rules of the xnq state decode it.
									 */
									yyless(0);
									addchar(true, '\0');
									BEGIN xnq;
								}

{other}+						{
									/*
									 * Start of a non-quoted string; the
									 * xnq state collects the rest and
									 * decides where it ends.
									 */
									addstring(true, yytext, yyleng);
									BEGIN xnq;
								}

<<EOF>>							{ yyterminate(); }
273 
274 %%
275 
276 /* LCOV_EXCL_STOP */
277 
278 void
279 jsonpath_yyerror(JsonPathParseResult **result, const char *message)
280 {
281 	if (*yytext == YY_END_OF_BUFFER_CHAR)
282 	{
283 		ereport(ERROR,
284 				(errcode(ERRCODE_SYNTAX_ERROR),
285 				 /* translator: %s is typically "syntax error" */
286 				 errmsg("%s at end of jsonpath input", _(message))));
287 	}
288 	else
289 	{
290 		ereport(ERROR,
291 				(errcode(ERRCODE_SYNTAX_ERROR),
292 				 /* translator: first %s is typically "syntax error" */
293 				 errmsg("%s at or near \"%s\" of jsonpath input",
294 						_(message), yytext)));
295 	}
296 }
297 
298 typedef struct JsonPathKeyword
299 {
300 	int16		len;
301 	bool		lowercase;
302 	int			val;
303 	const char *keyword;
304 } JsonPathKeyword;
305 
306 /*
307  * Array of key words should be sorted by length and then
308  * alphabetical order
309  */
310 static const JsonPathKeyword keywords[] = {
311 	{ 2, false,	IS_P,		"is"},
312 	{ 2, false,	TO_P,		"to"},
313 	{ 3, false,	ABS_P,		"abs"},
314 	{ 3, false,	LAX_P,		"lax"},
315 	{ 4, false,	FLAG_P,		"flag"},
316 	{ 4, false,	LAST_P,		"last"},
317 	{ 4, true,	NULL_P,		"null"},
318 	{ 4, false,	SIZE_P,		"size"},
319 	{ 4, true,	TRUE_P,		"true"},
320 	{ 4, false,	TYPE_P,		"type"},
321 	{ 4, false,	WITH_P,		"with"},
322 	{ 5, true,	FALSE_P,	"false"},
323 	{ 5, false,	FLOOR_P,	"floor"},
324 	{ 6, false,	DOUBLE_P,	"double"},
325 	{ 6, false,	EXISTS_P,	"exists"},
326 	{ 6, false,	STARTS_P,	"starts"},
327 	{ 6, false,	STRICT_P,	"strict"},
328 	{ 7, false,	CEILING_P,	"ceiling"},
329 	{ 7, false,	UNKNOWN_P,	"unknown"},
330 	{ 8, false,	DATETIME_P,	"datetime"},
331 	{ 8, false,	KEYVALUE_P,	"keyvalue"},
332 	{ 10,false, LIKE_REGEX_P, "like_regex"},
333 };
334 
335 /* Check if current scanstring value is a keyword */
336 static enum yytokentype
337 checkKeyword()
338 {
339 	int						res = IDENT_P;
340 	int						diff;
341 	const JsonPathKeyword  *StopLow = keywords,
342 						   *StopHigh = keywords + lengthof(keywords),
343 						   *StopMiddle;
344 
345 	if (scanstring.len > keywords[lengthof(keywords) - 1].len)
346 		return res;
347 
348 	while (StopLow < StopHigh)
349 	{
350 		StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
351 
352 		if (StopMiddle->len == scanstring.len)
353 			diff = pg_strncasecmp(StopMiddle->keyword, scanstring.val,
354 								  scanstring.len);
355 		else
356 			diff = StopMiddle->len - scanstring.len;
357 
358 		if (diff < 0)
359 			StopLow = StopMiddle + 1;
360 		else if (diff > 0)
361 			StopHigh = StopMiddle;
362 		else
363 		{
364 			if (StopMiddle->lowercase)
365 				diff = strncmp(StopMiddle->keyword, scanstring.val,
366 							   scanstring.len);
367 
368 			if (diff == 0)
369 				res = StopMiddle->val;
370 
371 			break;
372 		}
373 	}
374 
375 	return res;
376 }
377 
378 /*
379  * Called before any actual parsing is done
380  */
381 static void
382 jsonpath_scanner_init(const char *str, int slen)
383 {
384 	if (slen <= 0)
385 		slen = strlen(str);
386 
387 	/*
388 	 * Might be left over after ereport()
389 	 */
390 	yy_init_globals();
391 
392 	/*
393 	 * Make a scan buffer with special termination needed by flex.
394 	 */
395 
396 	scanbuflen = slen;
397 	scanbuf = palloc(slen + 2);
398 	memcpy(scanbuf, str, slen);
399 	scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
400 	scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
401 
402 	BEGIN(INITIAL);
403 }
404 
405 
/*
 * Called after parsing is done to clean up after jsonpath_scanner_init()
 *
 * Releases the buffer handle obtained from yy_scan_buffer() and then the
 * palloc'd scan buffer itself.
 */
static void
jsonpath_scanner_finish(void)
{
	yy_delete_buffer(scanbufhandle);
	pfree(scanbuf);
}
415 
416 /*
417  * Resize scanstring so that it can append string of given length.
418  * Reinitialize if required.
419  */
420 static void
421 resizeString(bool init, int appendLen)
422 {
423 	if (init)
424 	{
425 		scanstring.total = Max(32, appendLen);
426 		scanstring.val = (char *) palloc(scanstring.total);
427 		scanstring.len = 0;
428 	}
429 	else
430 	{
431 		if (scanstring.len + appendLen >= scanstring.total)
432 		{
433 			while (scanstring.len + appendLen >= scanstring.total)
434 				scanstring.total *= 2;
435 			scanstring.val = repalloc(scanstring.val, scanstring.total);
436 		}
437 	}
438 }
439 
440 /* Add set of bytes at "s" of length "l" to scanstring */
441 static void
442 addstring(bool init, char *s, int l)
443 {
444 	resizeString(init, l + 1);
445 	memcpy(scanstring.val + scanstring.len, s, l);
446 	scanstring.len += l;
447 }
448 
449 /* Add single byte "c" to scanstring */
450 static void
451 addchar(bool init, char c)
452 {
453 	resizeString(init, 1);
454 	scanstring.val[scanstring.len] = c;
455 	if (c != '\0')
456 		scanstring.len++;
457 }
458 
459 /* Interface to jsonpath parser */
460 JsonPathParseResult *
461 parsejsonpath(const char *str, int len)
462 {
463 	JsonPathParseResult	*parseresult;
464 
465 	jsonpath_scanner_init(str, len);
466 
467 	if (jsonpath_yyparse((void *) &parseresult) != 0)
468 		jsonpath_yyerror(NULL, "bogus input"); /* shouldn't happen */
469 
470 	jsonpath_scanner_finish();
471 
472 	return parseresult;
473 }
474 
/*
 * Return the numeric value (0..15) of the hexadecimal digit "c".
 *
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'; anything else is reported through
 * jsonpath_yyerror().
 */
static int
hexval(char c)
{
	int			digit = 0;

	if (c >= '0' && c <= '9')
		digit = c - '0';
	else if (c >= 'a' && c <= 'f')
		digit = 0xA + (c - 'a');
	else if (c >= 'A' && c <= 'F')
		digit = 0xA + (c - 'A');
	else
		jsonpath_yyerror(NULL, "invalid hexadecimal digit");	/* not expected to return */

	return digit;
}
488 
489 /* Add given unicode character to scanstring */
490 static void
491 addUnicodeChar(int ch)
492 {
493 	if (ch == 0)
494 	{
495 		/* We can't allow this, since our TEXT type doesn't */
496 		ereport(ERROR,
497 				(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
498 				 errmsg("unsupported Unicode escape sequence"),
499 				  errdetail("\\u0000 cannot be converted to text.")));
500 	}
501 	else
502 	{
503 		char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
504 
505 		pg_unicode_to_server(ch, (unsigned char *) cbuf);
506 		addstring(false, cbuf, strlen(cbuf));
507 	}
508 }
509 
510 /* Add unicode character, processing any surrogate pairs */
511 static void
512 addUnicode(int ch, int *hi_surrogate)
513 {
514 	if (is_utf16_surrogate_first(ch))
515 	{
516 		if (*hi_surrogate != -1)
517 			ereport(ERROR,
518 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
519 					 errmsg("invalid input syntax for type %s", "jsonpath"),
520 					 errdetail("Unicode high surrogate must not follow "
521 							   "a high surrogate.")));
522 		*hi_surrogate = ch;
523 		return;
524 	}
525 	else if (is_utf16_surrogate_second(ch))
526 	{
527 		if (*hi_surrogate == -1)
528 			ereport(ERROR,
529 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
530 					 errmsg("invalid input syntax for type %s", "jsonpath"),
531 					 errdetail("Unicode low surrogate must follow a high "
532 							   "surrogate.")));
533 		ch = surrogate_pair_to_codepoint(*hi_surrogate, ch);
534 		*hi_surrogate = -1;
535 	}
536 	else if (*hi_surrogate != -1)
537 	{
538 		ereport(ERROR,
539 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
540 				 errmsg("invalid input syntax for type %s", "jsonpath"),
541 				 errdetail("Unicode low surrogate must follow a high "
542 						   "surrogate.")));
543 	}
544 
545 	addUnicodeChar(ch);
546 }
547 
548 /*
549  * parseUnicode was adopted from json_lex_string() in
550  * src/backend/utils/adt/json.c
551  */
552 static void
553 parseUnicode(char *s, int l)
554 {
555 	int			i = 2;
556 	int			hi_surrogate = -1;
557 
558 	for (i = 2; i < l; i += 2)	/* skip '\u' */
559 	{
560 		int			ch = 0;
561 		int			j;
562 
563 		if (s[i] == '{')	/* parse '\u{XX...}' */
564 		{
565 			while (s[++i] != '}' && i < l)
566 				ch = (ch << 4) | hexval(s[i]);
567 			i++;	/* skip '}' */
568 		}
569 		else		/* parse '\uXXXX' */
570 		{
571 			for (j = 0; j < 4 && i < l; j++)
572 				ch = (ch << 4) | hexval(s[i++]);
573 		}
574 
575 		addUnicode(ch, &hi_surrogate);
576 	}
577 
578 	if (hi_surrogate != -1)
579 	{
580 		ereport(ERROR,
581 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
582 				 errmsg("invalid input syntax for type %s", "jsonpath"),
583 				 errdetail("Unicode low surrogate must follow a high "
584 						   "surrogate.")));
585 	}
586 }
587 
/*
 * Decode one \xXX escape and append the character to scanstring.
 *
 * s points at the backslash; the two hex digits are s[2] and s[3].  The
 * resulting value is added through addUnicodeChar().
 */
static void
parseHexChar(char *s)
{
	int			hi = hexval(s[2]);
	int			lo = hexval(s[3]);

	addUnicodeChar((hi << 4) | lo);
}
597 
598 /*
599  * Interface functions to make flex use palloc() instead of malloc().
600  * It'd be better to make these static, but flex insists otherwise.
601  */
602 
/* Allocate "bytes" bytes for the scanner using palloc(). */
void *
jsonpath_yyalloc(yy_size_t bytes)
{
	return palloc(bytes);
}
608 
609 void *
610 jsonpath_yyrealloc(void *ptr, yy_size_t bytes)
611 {
612 	if (ptr)
613 		return repalloc(ptr, bytes);
614 	else
615 		return palloc(bytes);
616 }
617 
/* Release a scanner allocation; a null "ptr" is silently ignored. */
void
jsonpath_yyfree(void *ptr)
{
	if (!ptr)
		return;

	pfree(ptr);
}
624