1 /*-------------------------------------------------------------------------
2 *
3 * pl_scanner.c
4 * lexical scanning for PL/pgSQL
5 *
6 *
7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 *
11 * IDENTIFICATION
12 * src/pl/plpgsql/src/pl_scanner.c
13 *
14 *-------------------------------------------------------------------------
15 */
16 #include "postgres.h"
17
18 #include "mb/pg_wchar.h"
19 #include "parser/scanner.h"
20
21 #include "plpgsql.h"
22 #include "pl_gram.h" /* must be after parser/scanner.h */
23
24
/*
 * Klugy flag to tell scanner how to look up identifiers.
 *
 * The flag starts out as IDENTIFIER_LOOKUP_NORMAL and is put back to that
 * value by plpgsql_scanner_init().  It is not declared static here, so it
 * has external linkage.
 */
IdentifierLookup plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
27
28 /*
29 * A word about keywords:
30 *
31 * We keep reserved and unreserved keywords in separate headers. Be careful
32 * not to put the same word in both headers. Also be sure that pl_gram.y's
33 * unreserved_keyword production agrees with the unreserved header. The
34 * reserved keywords are passed to the core scanner, so they will be
35 * recognized before (and instead of) any variable name. Unreserved words
36 * are checked for separately, usually after determining that the identifier
37 * isn't a known variable name. If plpgsql_IdentifierLookup is DECLARE then
38 * no variable names will be recognized, so the unreserved words always work.
39 * (Note in particular that this helps us avoid reserving keywords that are
40 * only needed in DECLARE sections.)
41 *
42 * In certain contexts it is desirable to prefer recognizing an unreserved
43 * keyword over recognizing a variable name. In particular, at the start
44 * of a statement we should prefer unreserved keywords unless the statement
45 * looks like an assignment (i.e., first token is followed by ':=' or '[').
46 * This rule allows most statement-introducing keywords to be kept unreserved.
47 * (We still have to reserve initial keywords that might follow a block
48 * label, unfortunately, since the method used to determine if we are at
49 * start of statement doesn't recognize such cases. We'd also have to
50 * reserve any keyword that could legitimately be followed by ':=' or '['.)
51 * Some additional cases are handled in pl_gram.y using tok_is_keyword().
52 *
53 * We try to avoid reserving more keywords than we have to; but there's
54 * little point in not reserving a word if it's reserved in the core grammar.
55 * Currently, the following words are reserved here but not in the core:
56 * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE
57 */
58
/* ScanKeywordList lookup data for PL/pgSQL keywords */
#include "pl_reserved_kwlist_d.h"
#include "pl_unreserved_kwlist_d.h"

/*
 * Token codes for PL/pgSQL keywords.
 *
 * While the keyword list headers are included below, PG_KEYWORD() expands
 * to just its "value" argument followed by a comma, so each array receives
 * one token code per PG_KEYWORD() entry, in the order the header lists them.
 *
 * ReservedPLKeywordTokens is handed to scanner_init() together with
 * ReservedPLKeywords.  UnreservedPLKeywordTokens is indexed by the keyword
 * number that ScanKeywordLookup() returns for UnreservedPLKeywords.
 */
#define PG_KEYWORD(kwname, value) value,

static const uint16 ReservedPLKeywordTokens[] = {
#include "pl_reserved_kwlist.h"
};

static const uint16 UnreservedPLKeywordTokens[] = {
#include "pl_unreserved_kwlist.h"
};

/* PG_KEYWORD was only needed to build the two tables above */
#undef PG_KEYWORD
75
/*
 * This macro must recognize all tokens that can immediately precede a
 * PL/pgSQL executable statement (that is, proc_sect or proc_stmt in the
 * grammar).  Fortunately, there are not very many, so hard-coding in this
 * fashion seems sufficient.
 *
 * The argument is the code of the previously returned token; plpgsql_yylex()
 * passes plpgsql_yytoken.  Note that the argument is evaluated more than
 * once, so it must not have side effects.
 */
#define AT_STMT_START(prev_token) \
	((prev_token) == ';' || \
	 (prev_token) == K_BEGIN || \
	 (prev_token) == K_THEN || \
	 (prev_token) == K_ELSE || \
	 (prev_token) == K_LOOP)
88
89
/*
 * Auxiliary data about a token (other than the token type).
 *
 * One of these travels with every token code through internal_yylex() and
 * the pushback stack, so that a pushed-back token can be returned again
 * with the same value, location and length.
 */
typedef struct
{
	YYSTYPE		lval;			/* semantic information */
	YYLTYPE		lloc;			/* offset in scanbuf */
	int			leng;			/* length in bytes */
} TokenAuxData;
97
/*
 * Scanner working state.  At some point we might wish to fold all this
 * into a YY_EXTRA struct.  For the moment, there is no need for plpgsql's
 * lexer to be re-entrant, and the notational burden of passing a yyscanner
 * pointer around is great enough to not want to do it without need.
 */

/*
 * The stuff the core lexer needs.  yyscanner is obtained from scanner_init()
 * in plpgsql_scanner_init() and reset to NULL by plpgsql_scanner_finish().
 */
static core_yyscan_t yyscanner = NULL;
static core_yy_extra_type core_yy;

/*
 * The original input string.  Set by plpgsql_scanner_init(), reset to NULL
 * by plpgsql_scanner_finish().
 */
static const char *scanorig;

/* Current token's length (corresponds to plpgsql_yylval and plpgsql_yylloc) */
static int	plpgsql_yyleng;

/* Current token's code (corresponds to plpgsql_yylval and plpgsql_yylloc) */
static int	plpgsql_yytoken;

/*
 * Token pushback stack.  internal_yylex() pops the most recently pushed
 * entry before asking the core lexer for more input; push_back_token()
 * raises an error rather than exceed MAX_PUSHBACKS pending entries.
 */
#define MAX_PUSHBACKS 4

static int	num_pushbacks;		/* number of entries currently stacked */
static int	pushback_token[MAX_PUSHBACKS];
static TokenAuxData pushback_auxdata[MAX_PUSHBACKS];

/*
 * State for plpgsql_location_to_lineno().  cur_line_end points at the
 * newline that ends the current line, or is NULL when strchr() found no
 * further newline.
 */
static const char *cur_line_start;
static const char *cur_line_end;
static int	cur_line_num;

/* Internal functions */
static int	internal_yylex(TokenAuxData *auxdata);
static void push_back_token(int token, TokenAuxData *auxdata);
static void location_lineno_init(void);
134
135
136 /*
137 * This is the yylex routine called from the PL/pgSQL grammar.
138 * It is a wrapper around the core lexer, with the ability to recognize
139 * PL/pgSQL variables and return them as special T_DATUM tokens. If a
140 * word or compound word does not match any variable name, or if matching
141 * is turned off by plpgsql_IdentifierLookup, it is returned as
142 * T_WORD or T_CWORD respectively, or as an unreserved keyword if it
143 * matches one of those.
144 */
145 int
plpgsql_yylex(void)146 plpgsql_yylex(void)
147 {
148 int tok1;
149 TokenAuxData aux1;
150 int kwnum;
151
152 tok1 = internal_yylex(&aux1);
153 if (tok1 == IDENT || tok1 == PARAM)
154 {
155 int tok2;
156 TokenAuxData aux2;
157
158 tok2 = internal_yylex(&aux2);
159 if (tok2 == '.')
160 {
161 int tok3;
162 TokenAuxData aux3;
163
164 tok3 = internal_yylex(&aux3);
165 if (tok3 == IDENT)
166 {
167 int tok4;
168 TokenAuxData aux4;
169
170 tok4 = internal_yylex(&aux4);
171 if (tok4 == '.')
172 {
173 int tok5;
174 TokenAuxData aux5;
175
176 tok5 = internal_yylex(&aux5);
177 if (tok5 == IDENT)
178 {
179 if (plpgsql_parse_tripword(aux1.lval.str,
180 aux3.lval.str,
181 aux5.lval.str,
182 &aux1.lval.wdatum,
183 &aux1.lval.cword))
184 tok1 = T_DATUM;
185 else
186 tok1 = T_CWORD;
187 }
188 else
189 {
190 /* not A.B.C, so just process A.B */
191 push_back_token(tok5, &aux5);
192 push_back_token(tok4, &aux4);
193 if (plpgsql_parse_dblword(aux1.lval.str,
194 aux3.lval.str,
195 &aux1.lval.wdatum,
196 &aux1.lval.cword))
197 tok1 = T_DATUM;
198 else
199 tok1 = T_CWORD;
200 }
201 }
202 else
203 {
204 /* not A.B.C, so just process A.B */
205 push_back_token(tok4, &aux4);
206 if (plpgsql_parse_dblword(aux1.lval.str,
207 aux3.lval.str,
208 &aux1.lval.wdatum,
209 &aux1.lval.cword))
210 tok1 = T_DATUM;
211 else
212 tok1 = T_CWORD;
213 }
214 }
215 else
216 {
217 /* not A.B, so just process A */
218 push_back_token(tok3, &aux3);
219 push_back_token(tok2, &aux2);
220 if (plpgsql_parse_word(aux1.lval.str,
221 core_yy.scanbuf + aux1.lloc,
222 true,
223 &aux1.lval.wdatum,
224 &aux1.lval.word))
225 tok1 = T_DATUM;
226 else if (!aux1.lval.word.quoted &&
227 (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
228 &UnreservedPLKeywords)) >= 0)
229 {
230 aux1.lval.keyword = GetScanKeyword(kwnum,
231 &UnreservedPLKeywords);
232 tok1 = UnreservedPLKeywordTokens[kwnum];
233 }
234 else
235 tok1 = T_WORD;
236 }
237 }
238 else
239 {
240 /* not A.B, so just process A */
241 push_back_token(tok2, &aux2);
242
243 /*
244 * See if it matches a variable name, except in the context where
245 * we are at start of statement and the next token isn't
246 * assignment or '['. In that case, it couldn't validly be a
247 * variable name, and skipping the lookup allows variable names to
248 * be used that would conflict with plpgsql or core keywords that
249 * introduce statements (e.g., "comment"). Without this special
250 * logic, every statement-introducing keyword would effectively be
251 * reserved in PL/pgSQL, which would be unpleasant.
252 *
253 * If it isn't a variable name, try to match against unreserved
254 * plpgsql keywords. If not one of those either, it's T_WORD.
255 *
256 * Note: we must call plpgsql_parse_word even if we don't want to
257 * do variable lookup, because it sets up aux1.lval.word for the
258 * non-variable cases.
259 */
260 if (plpgsql_parse_word(aux1.lval.str,
261 core_yy.scanbuf + aux1.lloc,
262 (!AT_STMT_START(plpgsql_yytoken) ||
263 (tok2 == '=' || tok2 == COLON_EQUALS ||
264 tok2 == '[')),
265 &aux1.lval.wdatum,
266 &aux1.lval.word))
267 tok1 = T_DATUM;
268 else if (!aux1.lval.word.quoted &&
269 (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
270 &UnreservedPLKeywords)) >= 0)
271 {
272 aux1.lval.keyword = GetScanKeyword(kwnum,
273 &UnreservedPLKeywords);
274 tok1 = UnreservedPLKeywordTokens[kwnum];
275 }
276 else
277 tok1 = T_WORD;
278 }
279 }
280 else
281 {
282 /*
283 * Not a potential plpgsql variable name, just return the data.
284 *
285 * Note that we also come through here if the grammar pushed back a
286 * T_DATUM, T_CWORD, T_WORD, or unreserved-keyword token returned by a
287 * previous lookup cycle; thus, pushbacks do not incur extra lookup
288 * work, since we'll never do the above code twice for the same token.
289 * This property also makes it safe to rely on the old value of
290 * plpgsql_yytoken in the is-this-start-of-statement test above.
291 */
292 }
293
294 plpgsql_yylval = aux1.lval;
295 plpgsql_yylloc = aux1.lloc;
296 plpgsql_yyleng = aux1.leng;
297 plpgsql_yytoken = tok1;
298 return tok1;
299 }
300
301 /*
302 * Internal yylex function. This wraps the core lexer and adds one feature:
303 * a token pushback stack. We also make a couple of trivial single-token
304 * translations from what the core lexer does to what we want, in particular
305 * interfacing from the core_YYSTYPE to YYSTYPE union.
306 */
307 static int
internal_yylex(TokenAuxData * auxdata)308 internal_yylex(TokenAuxData *auxdata)
309 {
310 int token;
311 const char *yytext;
312
313 if (num_pushbacks > 0)
314 {
315 num_pushbacks--;
316 token = pushback_token[num_pushbacks];
317 *auxdata = pushback_auxdata[num_pushbacks];
318 }
319 else
320 {
321 token = core_yylex(&auxdata->lval.core_yystype,
322 &auxdata->lloc,
323 yyscanner);
324
325 /* remember the length of yytext before it gets changed */
326 yytext = core_yy.scanbuf + auxdata->lloc;
327 auxdata->leng = strlen(yytext);
328
329 /* Check for << >> and #, which the core considers operators */
330 if (token == Op)
331 {
332 if (strcmp(auxdata->lval.str, "<<") == 0)
333 token = LESS_LESS;
334 else if (strcmp(auxdata->lval.str, ">>") == 0)
335 token = GREATER_GREATER;
336 else if (strcmp(auxdata->lval.str, "#") == 0)
337 token = '#';
338 }
339
340 /* The core returns PARAM as ival, but we treat it like IDENT */
341 else if (token == PARAM)
342 {
343 auxdata->lval.str = pstrdup(yytext);
344 }
345 }
346
347 return token;
348 }
349
350 /*
351 * Push back a token to be re-read by next internal_yylex() call.
352 */
353 static void
push_back_token(int token,TokenAuxData * auxdata)354 push_back_token(int token, TokenAuxData *auxdata)
355 {
356 if (num_pushbacks >= MAX_PUSHBACKS)
357 elog(ERROR, "too many tokens pushed back");
358 pushback_token[num_pushbacks] = token;
359 pushback_auxdata[num_pushbacks] = *auxdata;
360 num_pushbacks++;
361 }
362
363 /*
364 * Push back a single token to be re-read by next plpgsql_yylex() call.
365 *
366 * NOTE: this does not cause yylval or yylloc to "back up". Also, it
367 * is not a good idea to push back a token code other than what you read.
368 */
369 void
plpgsql_push_back_token(int token)370 plpgsql_push_back_token(int token)
371 {
372 TokenAuxData auxdata;
373
374 auxdata.lval = plpgsql_yylval;
375 auxdata.lloc = plpgsql_yylloc;
376 auxdata.leng = plpgsql_yyleng;
377 push_back_token(token, &auxdata);
378 }
379
380 /*
381 * Tell whether a token is an unreserved keyword.
382 *
383 * (If it is, its lowercased form was returned as the token value, so we
384 * do not need to offer that data here.)
385 */
386 bool
plpgsql_token_is_unreserved_keyword(int token)387 plpgsql_token_is_unreserved_keyword(int token)
388 {
389 int i;
390
391 for (i = 0; i < lengthof(UnreservedPLKeywordTokens); i++)
392 {
393 if (UnreservedPLKeywordTokens[i] == token)
394 return true;
395 }
396 return false;
397 }
398
399 /*
400 * Append the function text starting at startlocation and extending to
401 * (not including) endlocation onto the existing contents of "buf".
402 */
403 void
plpgsql_append_source_text(StringInfo buf,int startlocation,int endlocation)404 plpgsql_append_source_text(StringInfo buf,
405 int startlocation, int endlocation)
406 {
407 Assert(startlocation <= endlocation);
408 appendBinaryStringInfo(buf, scanorig + startlocation,
409 endlocation - startlocation);
410 }
411
412 /*
413 * Peek one token ahead in the input stream. Only the token code is
414 * made available, not any of the auxiliary info such as location.
415 *
416 * NB: no variable or unreserved keyword lookup is performed here, they will
417 * be returned as IDENT. Reserved keywords are resolved as usual.
418 */
419 int
plpgsql_peek(void)420 plpgsql_peek(void)
421 {
422 int tok1;
423 TokenAuxData aux1;
424
425 tok1 = internal_yylex(&aux1);
426 push_back_token(tok1, &aux1);
427 return tok1;
428 }
429
430 /*
431 * Peek two tokens ahead in the input stream. The first token and its
432 * location in the query are returned in *tok1_p and *tok1_loc, second token
433 * and its location in *tok2_p and *tok2_loc.
434 *
435 * NB: no variable or unreserved keyword lookup is performed here, they will
436 * be returned as IDENT. Reserved keywords are resolved as usual.
437 */
438 void
plpgsql_peek2(int * tok1_p,int * tok2_p,int * tok1_loc,int * tok2_loc)439 plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
440 {
441 int tok1,
442 tok2;
443 TokenAuxData aux1,
444 aux2;
445
446 tok1 = internal_yylex(&aux1);
447 tok2 = internal_yylex(&aux2);
448
449 *tok1_p = tok1;
450 if (tok1_loc)
451 *tok1_loc = aux1.lloc;
452 *tok2_p = tok2;
453 if (tok2_loc)
454 *tok2_loc = aux2.lloc;
455
456 push_back_token(tok2, &aux2);
457 push_back_token(tok1, &aux1);
458 }
459
460 /*
461 * plpgsql_scanner_errposition
462 * Report an error cursor position, if possible.
463 *
464 * This is expected to be used within an ereport() call. The return value
465 * is a dummy (always 0, in fact).
466 *
467 * Note that this can only be used for messages emitted during initial
468 * parsing of a plpgsql function, since it requires the scanorig string
469 * to still be available.
470 */
471 int
plpgsql_scanner_errposition(int location)472 plpgsql_scanner_errposition(int location)
473 {
474 int pos;
475
476 if (location < 0 || scanorig == NULL)
477 return 0; /* no-op if location is unknown */
478
479 /* Convert byte offset to character number */
480 pos = pg_mbstrlen_with_len(scanorig, location) + 1;
481 /* And pass it to the ereport mechanism */
482 (void) internalerrposition(pos);
483 /* Also pass the function body string */
484 return internalerrquery(scanorig);
485 }
486
487 /*
488 * plpgsql_yyerror
489 * Report a lexer or grammar error.
490 *
491 * The message's cursor position refers to the current token (the one
492 * last returned by plpgsql_yylex()).
493 * This is OK for syntax error messages from the Bison parser, because Bison
494 * parsers report error as soon as the first unparsable token is reached.
495 * Beware of using yyerror for other purposes, as the cursor position might
496 * be misleading!
497 */
498 void
plpgsql_yyerror(const char * message)499 plpgsql_yyerror(const char *message)
500 {
501 char *yytext = core_yy.scanbuf + plpgsql_yylloc;
502
503 if (*yytext == '\0')
504 {
505 ereport(ERROR,
506 (errcode(ERRCODE_SYNTAX_ERROR),
507 /* translator: %s is typically the translation of "syntax error" */
508 errmsg("%s at end of input", _(message)),
509 plpgsql_scanner_errposition(plpgsql_yylloc)));
510 }
511 else
512 {
513 /*
514 * If we have done any lookahead then flex will have restored the
515 * character after the end-of-token. Zap it again so that we report
516 * only the single token here. This modifies scanbuf but we no longer
517 * care about that.
518 */
519 yytext[plpgsql_yyleng] = '\0';
520
521 ereport(ERROR,
522 (errcode(ERRCODE_SYNTAX_ERROR),
523 /* translator: first %s is typically the translation of "syntax error" */
524 errmsg("%s at or near \"%s\"", _(message), yytext),
525 plpgsql_scanner_errposition(plpgsql_yylloc)));
526 }
527 }
528
529 /*
530 * Given a location (a byte offset in the function source text),
531 * return a line number.
532 *
533 * We expect that this is typically called for a sequence of increasing
534 * location values, so optimize accordingly by tracking the endpoints
535 * of the "current" line.
536 */
537 int
plpgsql_location_to_lineno(int location)538 plpgsql_location_to_lineno(int location)
539 {
540 const char *loc;
541
542 if (location < 0 || scanorig == NULL)
543 return 0; /* garbage in, garbage out */
544 loc = scanorig + location;
545
546 /* be correct, but not fast, if input location goes backwards */
547 if (loc < cur_line_start)
548 location_lineno_init();
549
550 while (cur_line_end != NULL && loc > cur_line_end)
551 {
552 cur_line_start = cur_line_end + 1;
553 cur_line_num++;
554 cur_line_end = strchr(cur_line_start, '\n');
555 }
556
557 return cur_line_num;
558 }
559
560 /* initialize or reset the state for plpgsql_location_to_lineno */
561 static void
location_lineno_init(void)562 location_lineno_init(void)
563 {
564 cur_line_start = scanorig;
565 cur_line_num = 1;
566
567 cur_line_end = strchr(cur_line_start, '\n');
568 }
569
570 /* return the most recently computed lineno */
571 int
plpgsql_latest_lineno(void)572 plpgsql_latest_lineno(void)
573 {
574 return cur_line_num;
575 }
576
577
578 /*
579 * Called before any actual parsing is done
580 *
581 * Note: the passed "str" must remain valid until plpgsql_scanner_finish().
582 * Although it is not fed directly to flex, we need the original string
583 * to cite in error messages.
584 */
585 void
plpgsql_scanner_init(const char * str)586 plpgsql_scanner_init(const char *str)
587 {
588 /* Start up the core scanner */
589 yyscanner = scanner_init(str, &core_yy,
590 &ReservedPLKeywords, ReservedPLKeywordTokens);
591
592 /*
593 * scanorig points to the original string, which unlike the scanner's
594 * scanbuf won't be modified on-the-fly by flex. Notice that although
595 * yytext points into scanbuf, we rely on being able to apply locations
596 * (offsets from string start) to scanorig as well.
597 */
598 scanorig = str;
599
600 /* Other setup */
601 plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
602 plpgsql_yytoken = 0;
603
604 num_pushbacks = 0;
605
606 location_lineno_init();
607 }
608
609 /*
610 * Called after parsing is done to clean up after plpgsql_scanner_init()
611 */
612 void
plpgsql_scanner_finish(void)613 plpgsql_scanner_finish(void)
614 {
615 /* release storage */
616 scanner_finish(yyscanner);
617 /* avoid leaving any dangling pointers */
618 yyscanner = NULL;
619 scanorig = NULL;
620 }
621