/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * Note that the grammar is not allowed to perform any table access
 * (since we need to be able to do basic parsing even while inside an
 * aborted transaction).  Therefore, the data structures returned by
 * the grammar are "raw" parsetrees that still need to be analyzed by
 * analyze.c and related files.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/parser/parser.c
 *
 *-------------------------------------------------------------------------
 */

22 #include "postgres.h"
23
24 #include "mb/pg_wchar.h"
25 #include "parser/gramparse.h"
26 #include "parser/parser.h"
27 #include "parser/scansup.h"
28
29 static bool check_uescapechar(unsigned char escape);
30 static char *str_udeescape(const char *str, char escape,
31 int position, core_yyscan_t yyscanner);
32
33
34 /*
35 * raw_parser
36 * Given a query in string form, do lexical and grammatical analysis.
37 *
38 * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 * list have the form required by the specified RawParseMode.
40 */
41 List *
raw_parser(const char * str,RawParseMode mode)42 raw_parser(const char *str, RawParseMode mode)
43 {
44 core_yyscan_t yyscanner;
45 base_yy_extra_type yyextra;
46 int yyresult;
47
48 /* initialize the flex scanner */
49 yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 &ScanKeywords, ScanKeywordTokens);
51
52 /* base_yylex() only needs us to initialize the lookahead token, if any */
53 if (mode == RAW_PARSE_DEFAULT)
54 yyextra.have_lookahead = false;
55 else
56 {
57 /* this array is indexed by RawParseMode enum */
58 static const int mode_token[] = {
59 0, /* RAW_PARSE_DEFAULT */
60 MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
61 MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
62 MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
63 MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
64 MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
65 };
66
67 yyextra.have_lookahead = true;
68 yyextra.lookahead_token = mode_token[mode];
69 yyextra.lookahead_yylloc = 0;
70 yyextra.lookahead_end = NULL;
71 }
72
73 /* initialize the bison parser */
74 parser_init(&yyextra);
75
76 /* Parse! */
77 yyresult = base_yyparse(yyscanner);
78
79 /* Clean up (release memory) */
80 scanner_finish(yyscanner);
81
82 if (yyresult) /* error */
83 return NIL;
84
85 return yyextra.parsetree;
86 }
87
88
89 /*
90 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 *
92 * This filter is needed because in some cases the standard SQL grammar
93 * requires more than one token lookahead. We reduce these cases to one-token
94 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 *
96 * Using a filter is simpler than trying to recognize multiword tokens
97 * directly in scan.l, because we'd have to allow for comments between the
98 * words. Furthermore it's not clear how to do that without re-introducing
99 * scanner backtrack, which would cost more performance than this filter
100 * layer does.
101 *
102 * We also use this filter to convert UIDENT and USCONST sequences into
103 * plain IDENT and SCONST tokens. While that could be handled by additional
104 * productions in the main grammar, it's more efficient to do it like this.
105 *
106 * The filter also provides a convenient place to translate between
107 * the core_YYSTYPE and YYSTYPE representations (which are really the
108 * same thing anyway, but notationally they're different).
109 */
110 int
base_yylex(YYSTYPE * lvalp,YYLTYPE * llocp,core_yyscan_t yyscanner)111 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 {
113 base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 int cur_token;
115 int next_token;
116 int cur_token_length;
117 YYLTYPE cur_yylloc;
118
119 /* Get next token --- we might already have it */
120 if (yyextra->have_lookahead)
121 {
122 cur_token = yyextra->lookahead_token;
123 lvalp->core_yystype = yyextra->lookahead_yylval;
124 *llocp = yyextra->lookahead_yylloc;
125 if (yyextra->lookahead_end)
126 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 yyextra->have_lookahead = false;
128 }
129 else
130 cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131
132 /*
133 * If this token isn't one that requires lookahead, just return it. If it
134 * does, determine the token length. (We could get that via strlen(), but
135 * since we have such a small set of possibilities, hardwiring seems
136 * feasible and more efficient --- at least for the fixed-length cases.)
137 */
138 switch (cur_token)
139 {
140 case NOT:
141 cur_token_length = 3;
142 break;
143 case NULLS_P:
144 cur_token_length = 5;
145 break;
146 case WITH:
147 cur_token_length = 4;
148 break;
149 case UIDENT:
150 case USCONST:
151 cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
152 break;
153 default:
154 return cur_token;
155 }
156
157 /*
158 * Identify end+1 of current token. core_yylex() has temporarily stored a
159 * '\0' here, and will undo that when we call it again. We need to redo
160 * it to fully revert the lookahead call for error reporting purposes.
161 */
162 yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
163 *llocp + cur_token_length;
164 Assert(*(yyextra->lookahead_end) == '\0');
165
166 /*
167 * Save and restore *llocp around the call. It might look like we could
168 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
169 * does not work because flex actually holds onto the last-passed pointer
170 * internally, and will use that for error reporting. We need any error
171 * reports to point to the current token, not the next one.
172 */
173 cur_yylloc = *llocp;
174
175 /* Get next token, saving outputs into lookahead variables */
176 next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
177 yyextra->lookahead_token = next_token;
178 yyextra->lookahead_yylloc = *llocp;
179
180 *llocp = cur_yylloc;
181
182 /* Now revert the un-truncation of the current token */
183 yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
184 *(yyextra->lookahead_end) = '\0';
185
186 yyextra->have_lookahead = true;
187
188 /* Replace cur_token if needed, based on lookahead */
189 switch (cur_token)
190 {
191 case NOT:
192 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
193 switch (next_token)
194 {
195 case BETWEEN:
196 case IN_P:
197 case LIKE:
198 case ILIKE:
199 case SIMILAR:
200 cur_token = NOT_LA;
201 break;
202 }
203 break;
204
205 case NULLS_P:
206 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
207 switch (next_token)
208 {
209 case FIRST_P:
210 case LAST_P:
211 cur_token = NULLS_LA;
212 break;
213 }
214 break;
215
216 case WITH:
217 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
218 switch (next_token)
219 {
220 case TIME:
221 case ORDINALITY:
222 cur_token = WITH_LA;
223 break;
224 }
225 break;
226
227 case UIDENT:
228 case USCONST:
229 /* Look ahead for UESCAPE */
230 if (next_token == UESCAPE)
231 {
232 /* Yup, so get third token, which had better be SCONST */
233 const char *escstr;
234
235 /* Again save and restore *llocp */
236 cur_yylloc = *llocp;
237
238 /* Un-truncate current token so errors point to third token */
239 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
240
241 /* Get third token */
242 next_token = core_yylex(&(yyextra->lookahead_yylval),
243 llocp, yyscanner);
244
245 /* If we throw error here, it will point to third token */
246 if (next_token != SCONST)
247 scanner_yyerror("UESCAPE must be followed by a simple string literal",
248 yyscanner);
249
250 escstr = yyextra->lookahead_yylval.str;
251 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
252 scanner_yyerror("invalid Unicode escape character",
253 yyscanner);
254
255 /* Now restore *llocp; errors will point to first token */
256 *llocp = cur_yylloc;
257
258 /* Apply Unicode conversion */
259 lvalp->core_yystype.str =
260 str_udeescape(lvalp->core_yystype.str,
261 escstr[0],
262 *llocp,
263 yyscanner);
264
265 /*
266 * We don't need to revert the un-truncation of UESCAPE. What
267 * we do want to do is clear have_lookahead, thereby consuming
268 * all three tokens.
269 */
270 yyextra->have_lookahead = false;
271 }
272 else
273 {
274 /* No UESCAPE, so convert using default escape character */
275 lvalp->core_yystype.str =
276 str_udeescape(lvalp->core_yystype.str,
277 '\\',
278 *llocp,
279 yyscanner);
280 }
281
282 if (cur_token == UIDENT)
283 {
284 /* It's an identifier, so truncate as appropriate */
285 truncate_identifier(lvalp->core_yystype.str,
286 strlen(lvalp->core_yystype.str),
287 true);
288 cur_token = IDENT;
289 }
290 else if (cur_token == USCONST)
291 {
292 cur_token = SCONST;
293 }
294 break;
295 }
296
297 return cur_token;
298 }
299
300 /* convert hex digit (caller should have verified that) to value */
301 static unsigned int
hexval(unsigned char c)302 hexval(unsigned char c)
303 {
304 if (c >= '0' && c <= '9')
305 return c - '0';
306 if (c >= 'a' && c <= 'f')
307 return c - 'a' + 0xA;
308 if (c >= 'A' && c <= 'F')
309 return c - 'A' + 0xA;
310 elog(ERROR, "invalid hexadecimal digit");
311 return 0; /* not reached */
312 }
313
314 /* is Unicode code point acceptable? */
315 static void
check_unicode_value(pg_wchar c)316 check_unicode_value(pg_wchar c)
317 {
318 if (!is_valid_unicode_codepoint(c))
319 ereport(ERROR,
320 (errcode(ERRCODE_SYNTAX_ERROR),
321 errmsg("invalid Unicode escape value")));
322 }
323
/*
 * check_uescapechar
 *		Is 'escape' acceptable as Unicode escape character (UESCAPE syntax)?
 *
 * Returns false for hexadecimal digits, '+', single quote, double quote
 * and anything scanner_isspace() treats as whitespace; true for every
 * other character.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape)
		|| escape == '+'
		|| escape == '\''
		|| escape == '"'
		|| scanner_isspace(escape))
		return false;
	else
		return true;
}

338 /*
339 * Process Unicode escapes in "str", producing a palloc'd plain string
340 *
341 * escape: the escape character to use
342 * position: start position of U&'' or U&"" string token
343 * yyscanner: context information needed for error reports
344 */
345 static char *
str_udeescape(const char * str,char escape,int position,core_yyscan_t yyscanner)346 str_udeescape(const char *str, char escape,
347 int position, core_yyscan_t yyscanner)
348 {
349 const char *in;
350 char *new,
351 *out;
352 size_t new_len;
353 pg_wchar pair_first = 0;
354 ScannerCallbackState scbstate;
355
356 /*
357 * Guesstimate that result will be no longer than input, but allow enough
358 * padding for Unicode conversion.
359 */
360 new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
361 new = palloc(new_len);
362
363 in = str;
364 out = new;
365 while (*in)
366 {
367 /* Enlarge string if needed */
368 size_t out_dist = out - new;
369
370 if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
371 {
372 new_len *= 2;
373 new = repalloc(new, new_len);
374 out = new + out_dist;
375 }
376
377 if (in[0] == escape)
378 {
379 /*
380 * Any errors reported while processing this escape sequence will
381 * have an error cursor pointing at the escape.
382 */
383 setup_scanner_errposition_callback(&scbstate, yyscanner,
384 in - str + position + 3); /* 3 for U&" */
385 if (in[1] == escape)
386 {
387 if (pair_first)
388 goto invalid_pair;
389 *out++ = escape;
390 in += 2;
391 }
392 else if (isxdigit((unsigned char) in[1]) &&
393 isxdigit((unsigned char) in[2]) &&
394 isxdigit((unsigned char) in[3]) &&
395 isxdigit((unsigned char) in[4]))
396 {
397 pg_wchar unicode;
398
399 unicode = (hexval(in[1]) << 12) +
400 (hexval(in[2]) << 8) +
401 (hexval(in[3]) << 4) +
402 hexval(in[4]);
403 check_unicode_value(unicode);
404 if (pair_first)
405 {
406 if (is_utf16_surrogate_second(unicode))
407 {
408 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
409 pair_first = 0;
410 }
411 else
412 goto invalid_pair;
413 }
414 else if (is_utf16_surrogate_second(unicode))
415 goto invalid_pair;
416
417 if (is_utf16_surrogate_first(unicode))
418 pair_first = unicode;
419 else
420 {
421 pg_unicode_to_server(unicode, (unsigned char *) out);
422 out += strlen(out);
423 }
424 in += 5;
425 }
426 else if (in[1] == '+' &&
427 isxdigit((unsigned char) in[2]) &&
428 isxdigit((unsigned char) in[3]) &&
429 isxdigit((unsigned char) in[4]) &&
430 isxdigit((unsigned char) in[5]) &&
431 isxdigit((unsigned char) in[6]) &&
432 isxdigit((unsigned char) in[7]))
433 {
434 pg_wchar unicode;
435
436 unicode = (hexval(in[2]) << 20) +
437 (hexval(in[3]) << 16) +
438 (hexval(in[4]) << 12) +
439 (hexval(in[5]) << 8) +
440 (hexval(in[6]) << 4) +
441 hexval(in[7]);
442 check_unicode_value(unicode);
443 if (pair_first)
444 {
445 if (is_utf16_surrogate_second(unicode))
446 {
447 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
448 pair_first = 0;
449 }
450 else
451 goto invalid_pair;
452 }
453 else if (is_utf16_surrogate_second(unicode))
454 goto invalid_pair;
455
456 if (is_utf16_surrogate_first(unicode))
457 pair_first = unicode;
458 else
459 {
460 pg_unicode_to_server(unicode, (unsigned char *) out);
461 out += strlen(out);
462 }
463 in += 8;
464 }
465 else
466 ereport(ERROR,
467 (errcode(ERRCODE_SYNTAX_ERROR),
468 errmsg("invalid Unicode escape"),
469 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
470
471 cancel_scanner_errposition_callback(&scbstate);
472 }
473 else
474 {
475 if (pair_first)
476 goto invalid_pair;
477
478 *out++ = *in++;
479 }
480 }
481
482 /* unfinished surrogate pair? */
483 if (pair_first)
484 goto invalid_pair;
485
486 *out = '\0';
487 return new;
488
489 /*
490 * We might get here with the error callback active, or not. Call
491 * scanner_errposition to make sure an error cursor appears; if the
492 * callback is active, this is duplicative but harmless.
493 */
494 invalid_pair:
495 ereport(ERROR,
496 (errcode(ERRCODE_SYNTAX_ERROR),
497 errmsg("invalid Unicode surrogate pair"),
498 scanner_errposition(in - str + position + 3, /* 3 for U&" */
499 yyscanner)));
500 return NULL; /* keep compiler quiet */
501 }
502