1 /*-------------------------------------------------------------------------
2 *
3 * parser.c
4 * Main entry point/driver for PostgreSQL grammar
5 *
6 * Note that the grammar is not allowed to perform any table access
7 * (since we need to be able to do basic parsing even while inside an
8 * aborted transaction). Therefore, the data structures returned by
9 * the grammar are "raw" parsetrees that still need to be analyzed by
10 * analyze.c and related files.
11 *
12 *
13 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
15 *
16 * IDENTIFICATION
17 * src/backend/parser/parser.c
18 *
19 *-------------------------------------------------------------------------
20 */
21
22 #include "postgres.h"
23
24 #include "mb/pg_wchar.h"
25 #include "parser/gramparse.h"
26 #include "parser/parser.h"
27 #include "parser/scansup.h"
28
/*
 * Forward declarations for file-local helpers that base_yylex() calls
 * before their definitions appear further down in this file.
 */
29 static bool check_uescapechar(unsigned char escape);
30 static char *str_udeescape(const char *str, char escape,
31 int position, core_yyscan_t yyscanner);
32
33
34 /*
35 * raw_parser
36 * Given a query in string form, do lexical and grammatical analysis.
37 *
38 * Returns a list of raw (un-analyzed) parse trees. The immediate elements
39 * of the list are always RawStmt nodes.
40 */
41 List *
raw_parser(const char * str)42 raw_parser(const char *str)
43 {
44 core_yyscan_t yyscanner;
45 base_yy_extra_type yyextra;
46 int yyresult;
47
48 /* initialize the flex scanner */
49 yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 &ScanKeywords, ScanKeywordTokens);
51
52 /* base_yylex() only needs this much initialization */
53 yyextra.have_lookahead = false;
54
55 /* initialize the bison parser */
56 parser_init(&yyextra);
57
58 /* Parse! */
59 yyresult = base_yyparse(yyscanner);
60
61 /* Clean up (release memory) */
62 scanner_finish(yyscanner);
63
64 if (yyresult) /* error */
65 return NIL;
66
67 return yyextra.parsetree;
68 }
69
70
71 /*
72 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
73 *
74 * This filter is needed because in some cases the standard SQL grammar
75 * requires more than one token lookahead. We reduce these cases to one-token
76 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
77 *
78 * Using a filter is simpler than trying to recognize multiword tokens
79 * directly in scan.l, because we'd have to allow for comments between the
80 * words. Furthermore it's not clear how to do that without re-introducing
81 * scanner backtrack, which would cost more performance than this filter
82 * layer does.
83 *
84 * We also use this filter to convert UIDENT and USCONST sequences into
85 * plain IDENT and SCONST tokens. While that could be handled by additional
86 * productions in the main grammar, it's more efficient to do it like this.
87 *
88 * The filter also provides a convenient place to translate between
89 * the core_YYSTYPE and YYSTYPE representations (which are really the
90 * same thing anyway, but notationally they're different).
91 */
92 int
base_yylex(YYSTYPE * lvalp,YYLTYPE * llocp,core_yyscan_t yyscanner)93 base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
94 {
95 base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
96 int cur_token;
97 int next_token;
98 int cur_token_length;
99 YYLTYPE cur_yylloc;
100
101 /* Get next token --- we might already have it */
102 if (yyextra->have_lookahead)
103 {
104 cur_token = yyextra->lookahead_token;
105 lvalp->core_yystype = yyextra->lookahead_yylval;
106 *llocp = yyextra->lookahead_yylloc;
107 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
108 yyextra->have_lookahead = false;
109 }
110 else
111 cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
112
113 /*
114 * If this token isn't one that requires lookahead, just return it. If it
115 * does, determine the token length. (We could get that via strlen(), but
116 * since we have such a small set of possibilities, hardwiring seems
117 * feasible and more efficient --- at least for the fixed-length cases.)
118 */
119 switch (cur_token)
120 {
121 case NOT:
122 cur_token_length = 3;
123 break;
124 case NULLS_P:
125 cur_token_length = 5;
126 break;
127 case WITH:
128 cur_token_length = 4;
129 break;
130 case UIDENT:
131 case USCONST:
132 cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
133 break;
134 default:
135 return cur_token;
136 }
137
138 /*
139 * Identify end+1 of current token. core_yylex() has temporarily stored a
140 * '\0' here, and will undo that when we call it again. We need to redo
141 * it to fully revert the lookahead call for error reporting purposes.
142 */
143 yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
144 *llocp + cur_token_length;
145 Assert(*(yyextra->lookahead_end) == '\0');
146
147 /*
148 * Save and restore *llocp around the call. It might look like we could
149 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
150 * does not work because flex actually holds onto the last-passed pointer
151 * internally, and will use that for error reporting. We need any error
152 * reports to point to the current token, not the next one.
153 */
154 cur_yylloc = *llocp;
155
156 /* Get next token, saving outputs into lookahead variables */
157 next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
158 yyextra->lookahead_token = next_token;
159 yyextra->lookahead_yylloc = *llocp;
160
161 *llocp = cur_yylloc;
162
163 /* Now revert the un-truncation of the current token */
164 yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
165 *(yyextra->lookahead_end) = '\0';
166
167 yyextra->have_lookahead = true;
168
169 /* Replace cur_token if needed, based on lookahead */
170 switch (cur_token)
171 {
172 case NOT:
173 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
174 switch (next_token)
175 {
176 case BETWEEN:
177 case IN_P:
178 case LIKE:
179 case ILIKE:
180 case SIMILAR:
181 cur_token = NOT_LA;
182 break;
183 }
184 break;
185
186 case NULLS_P:
187 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
188 switch (next_token)
189 {
190 case FIRST_P:
191 case LAST_P:
192 cur_token = NULLS_LA;
193 break;
194 }
195 break;
196
197 case WITH:
198 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
199 switch (next_token)
200 {
201 case TIME:
202 case ORDINALITY:
203 cur_token = WITH_LA;
204 break;
205 }
206 break;
207
208 case UIDENT:
209 case USCONST:
210 /* Look ahead for UESCAPE */
211 if (next_token == UESCAPE)
212 {
213 /* Yup, so get third token, which had better be SCONST */
214 const char *escstr;
215
216 /* Again save and restore *llocp */
217 cur_yylloc = *llocp;
218
219 /* Un-truncate current token so errors point to third token */
220 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
221
222 /* Get third token */
223 next_token = core_yylex(&(yyextra->lookahead_yylval),
224 llocp, yyscanner);
225
226 /* If we throw error here, it will point to third token */
227 if (next_token != SCONST)
228 scanner_yyerror("UESCAPE must be followed by a simple string literal",
229 yyscanner);
230
231 escstr = yyextra->lookahead_yylval.str;
232 if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
233 scanner_yyerror("invalid Unicode escape character",
234 yyscanner);
235
236 /* Now restore *llocp; errors will point to first token */
237 *llocp = cur_yylloc;
238
239 /* Apply Unicode conversion */
240 lvalp->core_yystype.str =
241 str_udeescape(lvalp->core_yystype.str,
242 escstr[0],
243 *llocp,
244 yyscanner);
245
246 /*
247 * We don't need to revert the un-truncation of UESCAPE. What
248 * we do want to do is clear have_lookahead, thereby consuming
249 * all three tokens.
250 */
251 yyextra->have_lookahead = false;
252 }
253 else
254 {
255 /* No UESCAPE, so convert using default escape character */
256 lvalp->core_yystype.str =
257 str_udeescape(lvalp->core_yystype.str,
258 '\\',
259 *llocp,
260 yyscanner);
261 }
262
263 if (cur_token == UIDENT)
264 {
265 /* It's an identifier, so truncate as appropriate */
266 truncate_identifier(lvalp->core_yystype.str,
267 strlen(lvalp->core_yystype.str),
268 true);
269 cur_token = IDENT;
270 }
271 else if (cur_token == USCONST)
272 {
273 cur_token = SCONST;
274 }
275 break;
276 }
277
278 return cur_token;
279 }
280
281 /* convert hex digit (caller should have verified that) to value */
282 static unsigned int
hexval(unsigned char c)283 hexval(unsigned char c)
284 {
285 if (c >= '0' && c <= '9')
286 return c - '0';
287 if (c >= 'a' && c <= 'f')
288 return c - 'a' + 0xA;
289 if (c >= 'A' && c <= 'F')
290 return c - 'A' + 0xA;
291 elog(ERROR, "invalid hexadecimal digit");
292 return 0; /* not reached */
293 }
294
295 /* is Unicode code point acceptable? */
296 static void
check_unicode_value(pg_wchar c)297 check_unicode_value(pg_wchar c)
298 {
299 if (!is_valid_unicode_codepoint(c))
300 ereport(ERROR,
301 (errcode(ERRCODE_SYNTAX_ERROR),
302 errmsg("invalid Unicode escape value")));
303 }
304
/*
 * Decide whether 'escape' may be used as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Hexadecimal digits, '+', the single and double quote characters, and
 * anything scanner_isspace() treats as whitespace are rejected; every other
 * character is accepted.
 *
 * Returns true if the character is acceptable, false otherwise.
 */
static bool
check_uescapechar(unsigned char escape)
{
	bool		forbidden;

	forbidden = isxdigit(escape) ||
		escape == '+' ||
		escape == '\'' ||
		escape == '"' ||
		scanner_isspace(escape);

	return !forbidden;
}
318
319 /*
320 * Process Unicode escapes in "str", producing a palloc'd plain string
321 *
322 * escape: the escape character to use
323 * position: start position of U&'' or U&"" string token
324 * yyscanner: context information needed for error reports
325 */
326 static char *
str_udeescape(const char * str,char escape,int position,core_yyscan_t yyscanner)327 str_udeescape(const char *str, char escape,
328 int position, core_yyscan_t yyscanner)
329 {
330 const char *in;
331 char *new,
332 *out;
333 size_t new_len;
334 pg_wchar pair_first = 0;
335 ScannerCallbackState scbstate;
336
337 /*
338 * Guesstimate that result will be no longer than input, but allow enough
339 * padding for Unicode conversion.
340 */
341 new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
342 new = palloc(new_len);
343
344 in = str;
345 out = new;
346 while (*in)
347 {
348 /* Enlarge string if needed */
349 size_t out_dist = out - new;
350
351 if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
352 {
353 new_len *= 2;
354 new = repalloc(new, new_len);
355 out = new + out_dist;
356 }
357
358 if (in[0] == escape)
359 {
360 /*
361 * Any errors reported while processing this escape sequence will
362 * have an error cursor pointing at the escape.
363 */
364 setup_scanner_errposition_callback(&scbstate, yyscanner,
365 in - str + position + 3); /* 3 for U&" */
366 if (in[1] == escape)
367 {
368 if (pair_first)
369 goto invalid_pair;
370 *out++ = escape;
371 in += 2;
372 }
373 else if (isxdigit((unsigned char) in[1]) &&
374 isxdigit((unsigned char) in[2]) &&
375 isxdigit((unsigned char) in[3]) &&
376 isxdigit((unsigned char) in[4]))
377 {
378 pg_wchar unicode;
379
380 unicode = (hexval(in[1]) << 12) +
381 (hexval(in[2]) << 8) +
382 (hexval(in[3]) << 4) +
383 hexval(in[4]);
384 check_unicode_value(unicode);
385 if (pair_first)
386 {
387 if (is_utf16_surrogate_second(unicode))
388 {
389 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
390 pair_first = 0;
391 }
392 else
393 goto invalid_pair;
394 }
395 else if (is_utf16_surrogate_second(unicode))
396 goto invalid_pair;
397
398 if (is_utf16_surrogate_first(unicode))
399 pair_first = unicode;
400 else
401 {
402 pg_unicode_to_server(unicode, (unsigned char *) out);
403 out += strlen(out);
404 }
405 in += 5;
406 }
407 else if (in[1] == '+' &&
408 isxdigit((unsigned char) in[2]) &&
409 isxdigit((unsigned char) in[3]) &&
410 isxdigit((unsigned char) in[4]) &&
411 isxdigit((unsigned char) in[5]) &&
412 isxdigit((unsigned char) in[6]) &&
413 isxdigit((unsigned char) in[7]))
414 {
415 pg_wchar unicode;
416
417 unicode = (hexval(in[2]) << 20) +
418 (hexval(in[3]) << 16) +
419 (hexval(in[4]) << 12) +
420 (hexval(in[5]) << 8) +
421 (hexval(in[6]) << 4) +
422 hexval(in[7]);
423 check_unicode_value(unicode);
424 if (pair_first)
425 {
426 if (is_utf16_surrogate_second(unicode))
427 {
428 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
429 pair_first = 0;
430 }
431 else
432 goto invalid_pair;
433 }
434 else if (is_utf16_surrogate_second(unicode))
435 goto invalid_pair;
436
437 if (is_utf16_surrogate_first(unicode))
438 pair_first = unicode;
439 else
440 {
441 pg_unicode_to_server(unicode, (unsigned char *) out);
442 out += strlen(out);
443 }
444 in += 8;
445 }
446 else
447 ereport(ERROR,
448 (errcode(ERRCODE_SYNTAX_ERROR),
449 errmsg("invalid Unicode escape"),
450 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
451
452 cancel_scanner_errposition_callback(&scbstate);
453 }
454 else
455 {
456 if (pair_first)
457 goto invalid_pair;
458
459 *out++ = *in++;
460 }
461 }
462
463 /* unfinished surrogate pair? */
464 if (pair_first)
465 goto invalid_pair;
466
467 *out = '\0';
468 return new;
469
470 /*
471 * We might get here with the error callback active, or not. Call
472 * scanner_errposition to make sure an error cursor appears; if the
473 * callback is active, this is duplicative but harmless.
474 */
475 invalid_pair:
476 ereport(ERROR,
477 (errcode(ERRCODE_SYNTAX_ERROR),
478 errmsg("invalid Unicode surrogate pair"),
479 scanner_errposition(in - str + position + 3, /* 3 for U&" */
480 yyscanner)));
481 return NULL; /* keep compiler quiet */
482 }
483