1 /*-------------------------------------------------------------------------
2  *
3  * parser.c
4  *		Main entry point/driver for PostgreSQL grammar
5  *
6  * Note that the grammar is not allowed to perform any table access
7  * (since we need to be able to do basic parsing even while inside an
8  * aborted transaction).  Therefore, the data structures returned by
9  * the grammar are "raw" parsetrees that still need to be analyzed by
10  * analyze.c and related files.
11  *
12  *
13  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  * IDENTIFICATION
17  *	  src/backend/parser/parser.c
18  *
19  *-------------------------------------------------------------------------
20  */
21 
22 #include "postgres.h"
23 
24 #include "mb/pg_wchar.h"
25 #include "parser/gramparse.h"
26 #include "parser/parser.h"
27 #include "parser/scansup.h"
28 
/* Forward declarations of file-local helpers that are defined further down */
static bool check_uescapechar(unsigned char escape);
static char *str_udeescape(const char *str, char escape,
						   int position, core_yyscan_t yyscanner);
32 
33 
34 /*
35  * raw_parser
36  *		Given a query in string form, do lexical and grammatical analysis.
37  *
38  * Returns a list of raw (un-analyzed) parse trees.  The contents of the
39  * list have the form required by the specified RawParseMode.
40  */
41 List *
raw_parser(const char * str,RawParseMode mode)42 raw_parser(const char *str, RawParseMode mode)
43 {
44 	core_yyscan_t yyscanner;
45 	base_yy_extra_type yyextra;
46 	int			yyresult;
47 
48 	/* initialize the flex scanner */
49 	yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 							 &ScanKeywords, ScanKeywordTokens);
51 
52 	/* base_yylex() only needs us to initialize the lookahead token, if any */
53 	if (mode == RAW_PARSE_DEFAULT)
54 		yyextra.have_lookahead = false;
55 	else
56 	{
57 		/* this array is indexed by RawParseMode enum */
58 		static const int mode_token[] = {
59 			0,					/* RAW_PARSE_DEFAULT */
60 			MODE_TYPE_NAME,		/* RAW_PARSE_TYPE_NAME */
61 			MODE_PLPGSQL_EXPR,	/* RAW_PARSE_PLPGSQL_EXPR */
62 			MODE_PLPGSQL_ASSIGN1,	/* RAW_PARSE_PLPGSQL_ASSIGN1 */
63 			MODE_PLPGSQL_ASSIGN2,	/* RAW_PARSE_PLPGSQL_ASSIGN2 */
64 			MODE_PLPGSQL_ASSIGN3	/* RAW_PARSE_PLPGSQL_ASSIGN3 */
65 		};
66 
67 		yyextra.have_lookahead = true;
68 		yyextra.lookahead_token = mode_token[mode];
69 		yyextra.lookahead_yylloc = 0;
70 		yyextra.lookahead_end = NULL;
71 	}
72 
73 	/* initialize the bison parser */
74 	parser_init(&yyextra);
75 
76 	/* Parse! */
77 	yyresult = base_yyparse(yyscanner);
78 
79 	/* Clean up (release memory) */
80 	scanner_finish(yyscanner);
81 
82 	if (yyresult)				/* error */
83 		return NIL;
84 
85 	return yyextra.parsetree;
86 }
87 
88 
89 /*
90  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91  *
92  * This filter is needed because in some cases the standard SQL grammar
93  * requires more than one token lookahead.  We reduce these cases to one-token
94  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95  *
96  * Using a filter is simpler than trying to recognize multiword tokens
97  * directly in scan.l, because we'd have to allow for comments between the
98  * words.  Furthermore it's not clear how to do that without re-introducing
99  * scanner backtrack, which would cost more performance than this filter
100  * layer does.
101  *
102  * We also use this filter to convert UIDENT and USCONST sequences into
103  * plain IDENT and SCONST tokens.  While that could be handled by additional
104  * productions in the main grammar, it's more efficient to do it like this.
105  *
106  * The filter also provides a convenient place to translate between
107  * the core_YYSTYPE and YYSTYPE representations (which are really the
108  * same thing anyway, but notationally they're different).
109  */
110 int
base_yylex(YYSTYPE * lvalp,YYLTYPE * llocp,core_yyscan_t yyscanner)111 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 {
113 	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 	int			cur_token;
115 	int			next_token;
116 	int			cur_token_length;
117 	YYLTYPE		cur_yylloc;
118 
119 	/* Get next token --- we might already have it */
120 	if (yyextra->have_lookahead)
121 	{
122 		cur_token = yyextra->lookahead_token;
123 		lvalp->core_yystype = yyextra->lookahead_yylval;
124 		*llocp = yyextra->lookahead_yylloc;
125 		if (yyextra->lookahead_end)
126 			*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 		yyextra->have_lookahead = false;
128 	}
129 	else
130 		cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 
132 	/*
133 	 * If this token isn't one that requires lookahead, just return it.  If it
134 	 * does, determine the token length.  (We could get that via strlen(), but
135 	 * since we have such a small set of possibilities, hardwiring seems
136 	 * feasible and more efficient --- at least for the fixed-length cases.)
137 	 */
138 	switch (cur_token)
139 	{
140 		case NOT:
141 			cur_token_length = 3;
142 			break;
143 		case NULLS_P:
144 			cur_token_length = 5;
145 			break;
146 		case WITH:
147 			cur_token_length = 4;
148 			break;
149 		case UIDENT:
150 		case USCONST:
151 			cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
152 			break;
153 		default:
154 			return cur_token;
155 	}
156 
157 	/*
158 	 * Identify end+1 of current token.  core_yylex() has temporarily stored a
159 	 * '\0' here, and will undo that when we call it again.  We need to redo
160 	 * it to fully revert the lookahead call for error reporting purposes.
161 	 */
162 	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
163 		*llocp + cur_token_length;
164 	Assert(*(yyextra->lookahead_end) == '\0');
165 
166 	/*
167 	 * Save and restore *llocp around the call.  It might look like we could
168 	 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
169 	 * does not work because flex actually holds onto the last-passed pointer
170 	 * internally, and will use that for error reporting.  We need any error
171 	 * reports to point to the current token, not the next one.
172 	 */
173 	cur_yylloc = *llocp;
174 
175 	/* Get next token, saving outputs into lookahead variables */
176 	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
177 	yyextra->lookahead_token = next_token;
178 	yyextra->lookahead_yylloc = *llocp;
179 
180 	*llocp = cur_yylloc;
181 
182 	/* Now revert the un-truncation of the current token */
183 	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
184 	*(yyextra->lookahead_end) = '\0';
185 
186 	yyextra->have_lookahead = true;
187 
188 	/* Replace cur_token if needed, based on lookahead */
189 	switch (cur_token)
190 	{
191 		case NOT:
192 			/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
193 			switch (next_token)
194 			{
195 				case BETWEEN:
196 				case IN_P:
197 				case LIKE:
198 				case ILIKE:
199 				case SIMILAR:
200 					cur_token = NOT_LA;
201 					break;
202 			}
203 			break;
204 
205 		case NULLS_P:
206 			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
207 			switch (next_token)
208 			{
209 				case FIRST_P:
210 				case LAST_P:
211 					cur_token = NULLS_LA;
212 					break;
213 			}
214 			break;
215 
216 		case WITH:
217 			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
218 			switch (next_token)
219 			{
220 				case TIME:
221 				case ORDINALITY:
222 					cur_token = WITH_LA;
223 					break;
224 			}
225 			break;
226 
227 		case UIDENT:
228 		case USCONST:
229 			/* Look ahead for UESCAPE */
230 			if (next_token == UESCAPE)
231 			{
232 				/* Yup, so get third token, which had better be SCONST */
233 				const char *escstr;
234 
235 				/* Again save and restore *llocp */
236 				cur_yylloc = *llocp;
237 
238 				/* Un-truncate current token so errors point to third token */
239 				*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
240 
241 				/* Get third token */
242 				next_token = core_yylex(&(yyextra->lookahead_yylval),
243 										llocp, yyscanner);
244 
245 				/* If we throw error here, it will point to third token */
246 				if (next_token != SCONST)
247 					scanner_yyerror("UESCAPE must be followed by a simple string literal",
248 									yyscanner);
249 
250 				escstr = yyextra->lookahead_yylval.str;
251 				if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
252 					scanner_yyerror("invalid Unicode escape character",
253 									yyscanner);
254 
255 				/* Now restore *llocp; errors will point to first token */
256 				*llocp = cur_yylloc;
257 
258 				/* Apply Unicode conversion */
259 				lvalp->core_yystype.str =
260 					str_udeescape(lvalp->core_yystype.str,
261 								  escstr[0],
262 								  *llocp,
263 								  yyscanner);
264 
265 				/*
266 				 * We don't need to revert the un-truncation of UESCAPE.  What
267 				 * we do want to do is clear have_lookahead, thereby consuming
268 				 * all three tokens.
269 				 */
270 				yyextra->have_lookahead = false;
271 			}
272 			else
273 			{
274 				/* No UESCAPE, so convert using default escape character */
275 				lvalp->core_yystype.str =
276 					str_udeescape(lvalp->core_yystype.str,
277 								  '\\',
278 								  *llocp,
279 								  yyscanner);
280 			}
281 
282 			if (cur_token == UIDENT)
283 			{
284 				/* It's an identifier, so truncate as appropriate */
285 				truncate_identifier(lvalp->core_yystype.str,
286 									strlen(lvalp->core_yystype.str),
287 									true);
288 				cur_token = IDENT;
289 			}
290 			else if (cur_token == USCONST)
291 			{
292 				cur_token = SCONST;
293 			}
294 			break;
295 	}
296 
297 	return cur_token;
298 }
299 
300 /* convert hex digit (caller should have verified that) to value */
301 static unsigned int
hexval(unsigned char c)302 hexval(unsigned char c)
303 {
304 	if (c >= '0' && c <= '9')
305 		return c - '0';
306 	if (c >= 'a' && c <= 'f')
307 		return c - 'a' + 0xA;
308 	if (c >= 'A' && c <= 'F')
309 		return c - 'A' + 0xA;
310 	elog(ERROR, "invalid hexadecimal digit");
311 	return 0;					/* not reached */
312 }
313 
314 /* is Unicode code point acceptable? */
315 static void
check_unicode_value(pg_wchar c)316 check_unicode_value(pg_wchar c)
317 {
318 	if (!is_valid_unicode_codepoint(c))
319 		ereport(ERROR,
320 				(errcode(ERRCODE_SYNTAX_ERROR),
321 				 errmsg("invalid Unicode escape value")));
322 }
323 
/*
 * Decide whether "escape" may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Returns false for hexadecimal digits, '+', single and double quotes, and
 * anything scanner_isspace() treats as whitespace; true for everything else.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;
	if (scanner_isspace(escape))
		return false;

	return true;
}
337 
338 /*
339  * Process Unicode escapes in "str", producing a palloc'd plain string
340  *
341  * escape: the escape character to use
342  * position: start position of U&'' or U&"" string token
343  * yyscanner: context information needed for error reports
344  */
345 static char *
str_udeescape(const char * str,char escape,int position,core_yyscan_t yyscanner)346 str_udeescape(const char *str, char escape,
347 			  int position, core_yyscan_t yyscanner)
348 {
349 	const char *in;
350 	char	   *new,
351 			   *out;
352 	size_t		new_len;
353 	pg_wchar	pair_first = 0;
354 	ScannerCallbackState scbstate;
355 
356 	/*
357 	 * Guesstimate that result will be no longer than input, but allow enough
358 	 * padding for Unicode conversion.
359 	 */
360 	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
361 	new = palloc(new_len);
362 
363 	in = str;
364 	out = new;
365 	while (*in)
366 	{
367 		/* Enlarge string if needed */
368 		size_t		out_dist = out - new;
369 
370 		if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
371 		{
372 			new_len *= 2;
373 			new = repalloc(new, new_len);
374 			out = new + out_dist;
375 		}
376 
377 		if (in[0] == escape)
378 		{
379 			/*
380 			 * Any errors reported while processing this escape sequence will
381 			 * have an error cursor pointing at the escape.
382 			 */
383 			setup_scanner_errposition_callback(&scbstate, yyscanner,
384 											   in - str + position + 3);	/* 3 for U&" */
385 			if (in[1] == escape)
386 			{
387 				if (pair_first)
388 					goto invalid_pair;
389 				*out++ = escape;
390 				in += 2;
391 			}
392 			else if (isxdigit((unsigned char) in[1]) &&
393 					 isxdigit((unsigned char) in[2]) &&
394 					 isxdigit((unsigned char) in[3]) &&
395 					 isxdigit((unsigned char) in[4]))
396 			{
397 				pg_wchar	unicode;
398 
399 				unicode = (hexval(in[1]) << 12) +
400 					(hexval(in[2]) << 8) +
401 					(hexval(in[3]) << 4) +
402 					hexval(in[4]);
403 				check_unicode_value(unicode);
404 				if (pair_first)
405 				{
406 					if (is_utf16_surrogate_second(unicode))
407 					{
408 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
409 						pair_first = 0;
410 					}
411 					else
412 						goto invalid_pair;
413 				}
414 				else if (is_utf16_surrogate_second(unicode))
415 					goto invalid_pair;
416 
417 				if (is_utf16_surrogate_first(unicode))
418 					pair_first = unicode;
419 				else
420 				{
421 					pg_unicode_to_server(unicode, (unsigned char *) out);
422 					out += strlen(out);
423 				}
424 				in += 5;
425 			}
426 			else if (in[1] == '+' &&
427 					 isxdigit((unsigned char) in[2]) &&
428 					 isxdigit((unsigned char) in[3]) &&
429 					 isxdigit((unsigned char) in[4]) &&
430 					 isxdigit((unsigned char) in[5]) &&
431 					 isxdigit((unsigned char) in[6]) &&
432 					 isxdigit((unsigned char) in[7]))
433 			{
434 				pg_wchar	unicode;
435 
436 				unicode = (hexval(in[2]) << 20) +
437 					(hexval(in[3]) << 16) +
438 					(hexval(in[4]) << 12) +
439 					(hexval(in[5]) << 8) +
440 					(hexval(in[6]) << 4) +
441 					hexval(in[7]);
442 				check_unicode_value(unicode);
443 				if (pair_first)
444 				{
445 					if (is_utf16_surrogate_second(unicode))
446 					{
447 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
448 						pair_first = 0;
449 					}
450 					else
451 						goto invalid_pair;
452 				}
453 				else if (is_utf16_surrogate_second(unicode))
454 					goto invalid_pair;
455 
456 				if (is_utf16_surrogate_first(unicode))
457 					pair_first = unicode;
458 				else
459 				{
460 					pg_unicode_to_server(unicode, (unsigned char *) out);
461 					out += strlen(out);
462 				}
463 				in += 8;
464 			}
465 			else
466 				ereport(ERROR,
467 						(errcode(ERRCODE_SYNTAX_ERROR),
468 						 errmsg("invalid Unicode escape"),
469 						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
470 
471 			cancel_scanner_errposition_callback(&scbstate);
472 		}
473 		else
474 		{
475 			if (pair_first)
476 				goto invalid_pair;
477 
478 			*out++ = *in++;
479 		}
480 	}
481 
482 	/* unfinished surrogate pair? */
483 	if (pair_first)
484 		goto invalid_pair;
485 
486 	*out = '\0';
487 	return new;
488 
489 	/*
490 	 * We might get here with the error callback active, or not.  Call
491 	 * scanner_errposition to make sure an error cursor appears; if the
492 	 * callback is active, this is duplicative but harmless.
493 	 */
494 invalid_pair:
495 	ereport(ERROR,
496 			(errcode(ERRCODE_SYNTAX_ERROR),
497 			 errmsg("invalid Unicode surrogate pair"),
498 			 scanner_errposition(in - str + position + 3,	/* 3 for U&" */
499 								 yyscanner)));
500 	return NULL;				/* keep compiler quiet */
501 }
502