1 /*-------------------------------------------------------------------------
2  *
3  * parser.c
4  *		Main entry point/driver for PostgreSQL grammar
5  *
6  * Note that the grammar is not allowed to perform any table access
7  * (since we need to be able to do basic parsing even while inside an
8  * aborted transaction).  Therefore, the data structures returned by
9  * the grammar are "raw" parsetrees that still need to be analyzed by
10  * analyze.c and related files.
11  *
12  *
13  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  * IDENTIFICATION
17  *	  src/backend/parser/parser.c
18  *
19  *-------------------------------------------------------------------------
20  */
21 
22 #include "postgres.h"
23 
24 #include "mb/pg_wchar.h"
25 #include "parser/gramparse.h"
26 #include "parser/parser.h"
27 #include "parser/scansup.h"
28 
29 static bool check_uescapechar(unsigned char escape);
30 static char *str_udeescape(const char *str, char escape,
31 						   int position, core_yyscan_t yyscanner);
32 
33 
34 /*
35  * raw_parser
36  *		Given a query in string form, do lexical and grammatical analysis.
37  *
38  * Returns a list of raw (un-analyzed) parse trees.  The immediate elements
39  * of the list are always RawStmt nodes.
40  */
41 List *
raw_parser(const char * str)42 raw_parser(const char *str)
43 {
44 	core_yyscan_t yyscanner;
45 	base_yy_extra_type yyextra;
46 	int			yyresult;
47 
48 	/* initialize the flex scanner */
49 	yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 							 &ScanKeywords, ScanKeywordTokens);
51 
52 	/* base_yylex() only needs this much initialization */
53 	yyextra.have_lookahead = false;
54 
55 	/* initialize the bison parser */
56 	parser_init(&yyextra);
57 
58 	/* Parse! */
59 	yyresult = base_yyparse(yyscanner);
60 
61 	/* Clean up (release memory) */
62 	scanner_finish(yyscanner);
63 
64 	if (yyresult)				/* error */
65 		return NIL;
66 
67 	return yyextra.parsetree;
68 }
69 
70 
71 /*
72  * Intermediate filter between parser and core lexer (core_yylex in scan.l).
73  *
74  * This filter is needed because in some cases the standard SQL grammar
75  * requires more than one token lookahead.  We reduce these cases to one-token
76  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
77  *
78  * Using a filter is simpler than trying to recognize multiword tokens
79  * directly in scan.l, because we'd have to allow for comments between the
80  * words.  Furthermore it's not clear how to do that without re-introducing
81  * scanner backtrack, which would cost more performance than this filter
82  * layer does.
83  *
84  * We also use this filter to convert UIDENT and USCONST sequences into
85  * plain IDENT and SCONST tokens.  While that could be handled by additional
86  * productions in the main grammar, it's more efficient to do it like this.
87  *
88  * The filter also provides a convenient place to translate between
89  * the core_YYSTYPE and YYSTYPE representations (which are really the
90  * same thing anyway, but notationally they're different).
91  */
92 int
base_yylex(YYSTYPE * lvalp,YYLTYPE * llocp,core_yyscan_t yyscanner)93 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
94 {
95 	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
96 	int			cur_token;
97 	int			next_token;
98 	int			cur_token_length;
99 	YYLTYPE		cur_yylloc;
100 
101 	/* Get next token --- we might already have it */
102 	if (yyextra->have_lookahead)
103 	{
104 		cur_token = yyextra->lookahead_token;
105 		lvalp->core_yystype = yyextra->lookahead_yylval;
106 		*llocp = yyextra->lookahead_yylloc;
107 		*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
108 		yyextra->have_lookahead = false;
109 	}
110 	else
111 		cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
112 
113 	/*
114 	 * If this token isn't one that requires lookahead, just return it.  If it
115 	 * does, determine the token length.  (We could get that via strlen(), but
116 	 * since we have such a small set of possibilities, hardwiring seems
117 	 * feasible and more efficient --- at least for the fixed-length cases.)
118 	 */
119 	switch (cur_token)
120 	{
121 		case NOT:
122 			cur_token_length = 3;
123 			break;
124 		case NULLS_P:
125 			cur_token_length = 5;
126 			break;
127 		case WITH:
128 			cur_token_length = 4;
129 			break;
130 		case UIDENT:
131 		case USCONST:
132 			cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
133 			break;
134 		default:
135 			return cur_token;
136 	}
137 
138 	/*
139 	 * Identify end+1 of current token.  core_yylex() has temporarily stored a
140 	 * '\0' here, and will undo that when we call it again.  We need to redo
141 	 * it to fully revert the lookahead call for error reporting purposes.
142 	 */
143 	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
144 		*llocp + cur_token_length;
145 	Assert(*(yyextra->lookahead_end) == '\0');
146 
147 	/*
148 	 * Save and restore *llocp around the call.  It might look like we could
149 	 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
150 	 * does not work because flex actually holds onto the last-passed pointer
151 	 * internally, and will use that for error reporting.  We need any error
152 	 * reports to point to the current token, not the next one.
153 	 */
154 	cur_yylloc = *llocp;
155 
156 	/* Get next token, saving outputs into lookahead variables */
157 	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
158 	yyextra->lookahead_token = next_token;
159 	yyextra->lookahead_yylloc = *llocp;
160 
161 	*llocp = cur_yylloc;
162 
163 	/* Now revert the un-truncation of the current token */
164 	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
165 	*(yyextra->lookahead_end) = '\0';
166 
167 	yyextra->have_lookahead = true;
168 
169 	/* Replace cur_token if needed, based on lookahead */
170 	switch (cur_token)
171 	{
172 		case NOT:
173 			/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
174 			switch (next_token)
175 			{
176 				case BETWEEN:
177 				case IN_P:
178 				case LIKE:
179 				case ILIKE:
180 				case SIMILAR:
181 					cur_token = NOT_LA;
182 					break;
183 			}
184 			break;
185 
186 		case NULLS_P:
187 			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
188 			switch (next_token)
189 			{
190 				case FIRST_P:
191 				case LAST_P:
192 					cur_token = NULLS_LA;
193 					break;
194 			}
195 			break;
196 
197 		case WITH:
198 			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
199 			switch (next_token)
200 			{
201 				case TIME:
202 				case ORDINALITY:
203 					cur_token = WITH_LA;
204 					break;
205 			}
206 			break;
207 
208 		case UIDENT:
209 		case USCONST:
210 			/* Look ahead for UESCAPE */
211 			if (next_token == UESCAPE)
212 			{
213 				/* Yup, so get third token, which had better be SCONST */
214 				const char *escstr;
215 
216 				/* Again save and restore *llocp */
217 				cur_yylloc = *llocp;
218 
219 				/* Un-truncate current token so errors point to third token */
220 				*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
221 
222 				/* Get third token */
223 				next_token = core_yylex(&(yyextra->lookahead_yylval),
224 										llocp, yyscanner);
225 
226 				/* If we throw error here, it will point to third token */
227 				if (next_token != SCONST)
228 					scanner_yyerror("UESCAPE must be followed by a simple string literal",
229 									yyscanner);
230 
231 				escstr = yyextra->lookahead_yylval.str;
232 				if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
233 					scanner_yyerror("invalid Unicode escape character",
234 									yyscanner);
235 
236 				/* Now restore *llocp; errors will point to first token */
237 				*llocp = cur_yylloc;
238 
239 				/* Apply Unicode conversion */
240 				lvalp->core_yystype.str =
241 					str_udeescape(lvalp->core_yystype.str,
242 								  escstr[0],
243 								  *llocp,
244 								  yyscanner);
245 
246 				/*
247 				 * We don't need to revert the un-truncation of UESCAPE.  What
248 				 * we do want to do is clear have_lookahead, thereby consuming
249 				 * all three tokens.
250 				 */
251 				yyextra->have_lookahead = false;
252 			}
253 			else
254 			{
255 				/* No UESCAPE, so convert using default escape character */
256 				lvalp->core_yystype.str =
257 					str_udeescape(lvalp->core_yystype.str,
258 								  '\\',
259 								  *llocp,
260 								  yyscanner);
261 			}
262 
263 			if (cur_token == UIDENT)
264 			{
265 				/* It's an identifier, so truncate as appropriate */
266 				truncate_identifier(lvalp->core_yystype.str,
267 									strlen(lvalp->core_yystype.str),
268 									true);
269 				cur_token = IDENT;
270 			}
271 			else if (cur_token == USCONST)
272 			{
273 				cur_token = SCONST;
274 			}
275 			break;
276 	}
277 
278 	return cur_token;
279 }
280 
281 /* convert hex digit (caller should have verified that) to value */
282 static unsigned int
hexval(unsigned char c)283 hexval(unsigned char c)
284 {
285 	if (c >= '0' && c <= '9')
286 		return c - '0';
287 	if (c >= 'a' && c <= 'f')
288 		return c - 'a' + 0xA;
289 	if (c >= 'A' && c <= 'F')
290 		return c - 'A' + 0xA;
291 	elog(ERROR, "invalid hexadecimal digit");
292 	return 0;					/* not reached */
293 }
294 
295 /* is Unicode code point acceptable? */
296 static void
check_unicode_value(pg_wchar c)297 check_unicode_value(pg_wchar c)
298 {
299 	if (!is_valid_unicode_codepoint(c))
300 		ereport(ERROR,
301 				(errcode(ERRCODE_SYNTAX_ERROR),
302 				 errmsg("invalid Unicode escape value")));
303 }
304 
/*
 * check_uescapechar
 *		Can "escape" serve as the escape character in a UESCAPE clause?
 *
 * Hex digits, '+', single and double quotes, and whitespace (as judged by
 * scanner_isspace) are rejected; every other character is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;
	if (scanner_isspace(escape))
		return false;

	return true;
}
318 
319 /*
320  * Process Unicode escapes in "str", producing a palloc'd plain string
321  *
322  * escape: the escape character to use
323  * position: start position of U&'' or U&"" string token
324  * yyscanner: context information needed for error reports
325  */
326 static char *
str_udeescape(const char * str,char escape,int position,core_yyscan_t yyscanner)327 str_udeescape(const char *str, char escape,
328 			  int position, core_yyscan_t yyscanner)
329 {
330 	const char *in;
331 	char	   *new,
332 			   *out;
333 	size_t		new_len;
334 	pg_wchar	pair_first = 0;
335 	ScannerCallbackState scbstate;
336 
337 	/*
338 	 * Guesstimate that result will be no longer than input, but allow enough
339 	 * padding for Unicode conversion.
340 	 */
341 	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
342 	new = palloc(new_len);
343 
344 	in = str;
345 	out = new;
346 	while (*in)
347 	{
348 		/* Enlarge string if needed */
349 		size_t		out_dist = out - new;
350 
351 		if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
352 		{
353 			new_len *= 2;
354 			new = repalloc(new, new_len);
355 			out = new + out_dist;
356 		}
357 
358 		if (in[0] == escape)
359 		{
360 			/*
361 			 * Any errors reported while processing this escape sequence will
362 			 * have an error cursor pointing at the escape.
363 			 */
364 			setup_scanner_errposition_callback(&scbstate, yyscanner,
365 											   in - str + position + 3);	/* 3 for U&" */
366 			if (in[1] == escape)
367 			{
368 				if (pair_first)
369 					goto invalid_pair;
370 				*out++ = escape;
371 				in += 2;
372 			}
373 			else if (isxdigit((unsigned char) in[1]) &&
374 					 isxdigit((unsigned char) in[2]) &&
375 					 isxdigit((unsigned char) in[3]) &&
376 					 isxdigit((unsigned char) in[4]))
377 			{
378 				pg_wchar	unicode;
379 
380 				unicode = (hexval(in[1]) << 12) +
381 					(hexval(in[2]) << 8) +
382 					(hexval(in[3]) << 4) +
383 					hexval(in[4]);
384 				check_unicode_value(unicode);
385 				if (pair_first)
386 				{
387 					if (is_utf16_surrogate_second(unicode))
388 					{
389 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
390 						pair_first = 0;
391 					}
392 					else
393 						goto invalid_pair;
394 				}
395 				else if (is_utf16_surrogate_second(unicode))
396 					goto invalid_pair;
397 
398 				if (is_utf16_surrogate_first(unicode))
399 					pair_first = unicode;
400 				else
401 				{
402 					pg_unicode_to_server(unicode, (unsigned char *) out);
403 					out += strlen(out);
404 				}
405 				in += 5;
406 			}
407 			else if (in[1] == '+' &&
408 					 isxdigit((unsigned char) in[2]) &&
409 					 isxdigit((unsigned char) in[3]) &&
410 					 isxdigit((unsigned char) in[4]) &&
411 					 isxdigit((unsigned char) in[5]) &&
412 					 isxdigit((unsigned char) in[6]) &&
413 					 isxdigit((unsigned char) in[7]))
414 			{
415 				pg_wchar	unicode;
416 
417 				unicode = (hexval(in[2]) << 20) +
418 					(hexval(in[3]) << 16) +
419 					(hexval(in[4]) << 12) +
420 					(hexval(in[5]) << 8) +
421 					(hexval(in[6]) << 4) +
422 					hexval(in[7]);
423 				check_unicode_value(unicode);
424 				if (pair_first)
425 				{
426 					if (is_utf16_surrogate_second(unicode))
427 					{
428 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
429 						pair_first = 0;
430 					}
431 					else
432 						goto invalid_pair;
433 				}
434 				else if (is_utf16_surrogate_second(unicode))
435 					goto invalid_pair;
436 
437 				if (is_utf16_surrogate_first(unicode))
438 					pair_first = unicode;
439 				else
440 				{
441 					pg_unicode_to_server(unicode, (unsigned char *) out);
442 					out += strlen(out);
443 				}
444 				in += 8;
445 			}
446 			else
447 				ereport(ERROR,
448 						(errcode(ERRCODE_SYNTAX_ERROR),
449 						 errmsg("invalid Unicode escape"),
450 						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
451 
452 			cancel_scanner_errposition_callback(&scbstate);
453 		}
454 		else
455 		{
456 			if (pair_first)
457 				goto invalid_pair;
458 
459 			*out++ = *in++;
460 		}
461 	}
462 
463 	/* unfinished surrogate pair? */
464 	if (pair_first)
465 		goto invalid_pair;
466 
467 	*out = '\0';
468 	return new;
469 
470 	/*
471 	 * We might get here with the error callback active, or not.  Call
472 	 * scanner_errposition to make sure an error cursor appears; if the
473 	 * callback is active, this is duplicative but harmless.
474 	 */
475 invalid_pair:
476 	ereport(ERROR,
477 			(errcode(ERRCODE_SYNTAX_ERROR),
478 			 errmsg("invalid Unicode surrogate pair"),
479 			 scanner_errposition(in - str + position + 3,	/* 3 for U&" */
480 								 yyscanner)));
481 	return NULL;				/* keep compiler quiet */
482 }
483