1 /*-------------------------------------------------------------------------
2  *
3  * pl_scanner.c
4  *	  lexical scanning for PL/pgSQL
5  *
6  *
7  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  *
11  * IDENTIFICATION
12  *	  src/pl/plpgsql/src/pl_scanner.c
13  *
14  *-------------------------------------------------------------------------
15  */
16 #include "postgres.h"
17 
18 #include "mb/pg_wchar.h"
19 #include "parser/scanner.h"
20 
21 #include "plpgsql.h"
22 #include "pl_gram.h"			/* must be after parser/scanner.h */
23 
24 
/*
 * Klugy flag to tell scanner how to look up identifiers.
 *
 * It starts out as IDENTIFIER_LOOKUP_NORMAL, and plpgsql_scanner_init()
 * resets it to that value at the start of every scan.  The variable is not
 * static, so code outside this file can change it while parsing (presumably
 * the grammar does so around DECLARE sections -- confirm in pl_gram.y).
 */
IdentifierLookup plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
27 
28 /*
29  * A word about keywords:
30  *
31  * We keep reserved and unreserved keywords in separate headers.  Be careful
32  * not to put the same word in both headers.  Also be sure that pl_gram.y's
33  * unreserved_keyword production agrees with the unreserved header.  The
34  * reserved keywords are passed to the core scanner, so they will be
35  * recognized before (and instead of) any variable name.  Unreserved words
36  * are checked for separately, usually after determining that the identifier
37  * isn't a known variable name.  If plpgsql_IdentifierLookup is DECLARE then
38  * no variable names will be recognized, so the unreserved words always work.
39  * (Note in particular that this helps us avoid reserving keywords that are
40  * only needed in DECLARE sections.)
41  *
 * In certain contexts it is desirable to prefer recognizing an unreserved
 * keyword over recognizing a variable name.  In particular, at the start
 * of a statement we should prefer unreserved keywords unless the statement
 * looks like an assignment (i.e., first token is followed by ':=', '=',
 * or '[').
 * This rule allows most statement-introducing keywords to be kept unreserved.
 * (We still have to reserve initial keywords that might follow a block
 * label, unfortunately, since the method used to determine if we are at
 * start of statement doesn't recognize such cases.  We'd also have to
 * reserve any keyword that could legitimately be followed by ':=', '=',
 * or '['.)
 * Some additional cases are handled in pl_gram.y using tok_is_keyword().
52  *
53  * We try to avoid reserving more keywords than we have to; but there's
54  * little point in not reserving a word if it's reserved in the core grammar.
55  * Currently, the following words are reserved here but not in the core:
56  * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE
57  */
58 
59 /* ScanKeywordList lookup data for PL/pgSQL keywords */
60 #include "pl_reserved_kwlist_d.h"
61 #include "pl_unreserved_kwlist_d.h"
62 
/*
 * Token codes for PL/pgSQL keywords.
 *
 * PG_KEYWORD is temporarily defined to expand to just the entry's token
 * value followed by a comma, so that including a keyword-list header inside
 * an array initializer yields the token codes in list order.
 *
 * ReservedPLKeywordTokens is handed to the core scanner in
 * plpgsql_scanner_init().  UnreservedPLKeywordTokens is indexed in this file
 * by the keyword number that ScanKeywordLookup() returns for
 * UnreservedPLKeywords, so each array is presumably required to stay in the
 * same order as the matching *_kwlist_d.h lookup data (confirm against the
 * kwlist generation rules).
 */
#define PG_KEYWORD(kwname, value) value,

static const uint16 ReservedPLKeywordTokens[] = {
#include "pl_reserved_kwlist.h"
};

static const uint16 UnreservedPLKeywordTokens[] = {
#include "pl_unreserved_kwlist.h"
};

/* Don't let the temporary definition leak into later code */
#undef PG_KEYWORD
75 
/*
 * This macro must recognize all tokens that can immediately precede a
 * PL/pgSQL executable statement (that is, proc_sect or proc_stmt in the
 * grammar).  Fortunately, there are not very many, so hard-coding in this
 * fashion seems sufficient.
 *
 * plpgsql_yylex() applies it to plpgsql_yytoken, i.e. the token it returned
 * on its previous call.  The argument is expanded several times, so pass
 * only a side-effect-free expression.
 */
#define AT_STMT_START(prev_token) \
	((prev_token) == ';' || \
	 (prev_token) == K_BEGIN || \
	 (prev_token) == K_THEN || \
	 (prev_token) == K_ELSE || \
	 (prev_token) == K_LOOP)
88 
89 
/*
 * Auxiliary data about a token (other than the token type).
 *
 * One of these travels with every token code through internal_yylex() and
 * the pushback stack, so that a token that is pushed back and re-read comes
 * back with the same value, location and length it had originally.
 */
typedef struct
{
	YYSTYPE		lval;			/* semantic information */
	YYLTYPE		lloc;			/* offset in scanbuf */
	int			leng;			/* length in bytes */
} TokenAuxData;
97 
98 /*
99  * Scanner working state.  At some point we might wish to fold all this
100  * into a YY_EXTRA struct.  For the moment, there is no need for plpgsql's
101  * lexer to be re-entrant, and the notational burden of passing a yyscanner
102  * pointer around is great enough to not want to do it without need.
103  */
104 
/* The stuff the core lexer needs; set up by plpgsql_scanner_init() */
static core_yyscan_t yyscanner = NULL;
static core_yy_extra_type core_yy;

/*
 * The original input string.  NULL when no scan is in progress
 * (plpgsql_scanner_finish() resets it).
 */
static const char *scanorig;

/* Current token's length (corresponds to plpgsql_yylval and plpgsql_yylloc) */
static int	plpgsql_yyleng;

/*
 * Current token's code (corresponds to plpgsql_yylval and plpgsql_yylloc).
 * While plpgsql_yylex() is working on the next token this still holds the
 * previously returned one, which is what the start-of-statement test uses.
 */
static int	plpgsql_yytoken;

/*
 * Token pushback stack.  Tokens come back off it in last-in, first-out
 * order; push_back_token() raises an error rather than exceed MAX_PUSHBACKS.
 */
#define MAX_PUSHBACKS 4

static int	num_pushbacks;
static int	pushback_token[MAX_PUSHBACKS];
static TokenAuxData pushback_auxdata[MAX_PUSHBACKS];

/*
 * State for plpgsql_location_to_lineno(): the start of the line most
 * recently located, the newline that ends it (NULL if that line is the last
 * one), and its 1-based line number.
 */
static const char *cur_line_start;
static const char *cur_line_end;
static int	cur_line_num;

/* Internal functions */
static int	internal_yylex(TokenAuxData *auxdata);
static void push_back_token(int token, TokenAuxData *auxdata);
static void location_lineno_init(void);
134 
135 
136 /*
137  * This is the yylex routine called from the PL/pgSQL grammar.
138  * It is a wrapper around the core lexer, with the ability to recognize
139  * PL/pgSQL variables and return them as special T_DATUM tokens.  If a
140  * word or compound word does not match any variable name, or if matching
141  * is turned off by plpgsql_IdentifierLookup, it is returned as
142  * T_WORD or T_CWORD respectively, or as an unreserved keyword if it
143  * matches one of those.
144  */
145 int
plpgsql_yylex(void)146 plpgsql_yylex(void)
147 {
148 	int			tok1;
149 	TokenAuxData aux1;
150 	int			kwnum;
151 
152 	tok1 = internal_yylex(&aux1);
153 	if (tok1 == IDENT || tok1 == PARAM)
154 	{
155 		int			tok2;
156 		TokenAuxData aux2;
157 
158 		tok2 = internal_yylex(&aux2);
159 		if (tok2 == '.')
160 		{
161 			int			tok3;
162 			TokenAuxData aux3;
163 
164 			tok3 = internal_yylex(&aux3);
165 			if (tok3 == IDENT)
166 			{
167 				int			tok4;
168 				TokenAuxData aux4;
169 
170 				tok4 = internal_yylex(&aux4);
171 				if (tok4 == '.')
172 				{
173 					int			tok5;
174 					TokenAuxData aux5;
175 
176 					tok5 = internal_yylex(&aux5);
177 					if (tok5 == IDENT)
178 					{
179 						if (plpgsql_parse_tripword(aux1.lval.str,
180 												   aux3.lval.str,
181 												   aux5.lval.str,
182 												   &aux1.lval.wdatum,
183 												   &aux1.lval.cword))
184 							tok1 = T_DATUM;
185 						else
186 							tok1 = T_CWORD;
187 					}
188 					else
189 					{
190 						/* not A.B.C, so just process A.B */
191 						push_back_token(tok5, &aux5);
192 						push_back_token(tok4, &aux4);
193 						if (plpgsql_parse_dblword(aux1.lval.str,
194 												  aux3.lval.str,
195 												  &aux1.lval.wdatum,
196 												  &aux1.lval.cword))
197 							tok1 = T_DATUM;
198 						else
199 							tok1 = T_CWORD;
200 					}
201 				}
202 				else
203 				{
204 					/* not A.B.C, so just process A.B */
205 					push_back_token(tok4, &aux4);
206 					if (plpgsql_parse_dblword(aux1.lval.str,
207 											  aux3.lval.str,
208 											  &aux1.lval.wdatum,
209 											  &aux1.lval.cword))
210 						tok1 = T_DATUM;
211 					else
212 						tok1 = T_CWORD;
213 				}
214 			}
215 			else
216 			{
217 				/* not A.B, so just process A */
218 				push_back_token(tok3, &aux3);
219 				push_back_token(tok2, &aux2);
220 				if (plpgsql_parse_word(aux1.lval.str,
221 									   core_yy.scanbuf + aux1.lloc,
222 									   true,
223 									   &aux1.lval.wdatum,
224 									   &aux1.lval.word))
225 					tok1 = T_DATUM;
226 				else if (!aux1.lval.word.quoted &&
227 						 (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
228 													&UnreservedPLKeywords)) >= 0)
229 				{
230 					aux1.lval.keyword = GetScanKeyword(kwnum,
231 													   &UnreservedPLKeywords);
232 					tok1 = UnreservedPLKeywordTokens[kwnum];
233 				}
234 				else
235 					tok1 = T_WORD;
236 			}
237 		}
238 		else
239 		{
240 			/* not A.B, so just process A */
241 			push_back_token(tok2, &aux2);
242 
243 			/*
244 			 * See if it matches a variable name, except in the context where
245 			 * we are at start of statement and the next token isn't
246 			 * assignment or '['.  In that case, it couldn't validly be a
247 			 * variable name, and skipping the lookup allows variable names to
248 			 * be used that would conflict with plpgsql or core keywords that
249 			 * introduce statements (e.g., "comment").  Without this special
250 			 * logic, every statement-introducing keyword would effectively be
251 			 * reserved in PL/pgSQL, which would be unpleasant.
252 			 *
253 			 * If it isn't a variable name, try to match against unreserved
254 			 * plpgsql keywords.  If not one of those either, it's T_WORD.
255 			 *
256 			 * Note: we must call plpgsql_parse_word even if we don't want to
257 			 * do variable lookup, because it sets up aux1.lval.word for the
258 			 * non-variable cases.
259 			 */
260 			if (plpgsql_parse_word(aux1.lval.str,
261 								   core_yy.scanbuf + aux1.lloc,
262 								   (!AT_STMT_START(plpgsql_yytoken) ||
263 									(tok2 == '=' || tok2 == COLON_EQUALS ||
264 									 tok2 == '[')),
265 								   &aux1.lval.wdatum,
266 								   &aux1.lval.word))
267 				tok1 = T_DATUM;
268 			else if (!aux1.lval.word.quoted &&
269 					 (kwnum = ScanKeywordLookup(aux1.lval.word.ident,
270 												&UnreservedPLKeywords)) >= 0)
271 			{
272 				aux1.lval.keyword = GetScanKeyword(kwnum,
273 												   &UnreservedPLKeywords);
274 				tok1 = UnreservedPLKeywordTokens[kwnum];
275 			}
276 			else
277 				tok1 = T_WORD;
278 		}
279 	}
280 	else
281 	{
282 		/*
283 		 * Not a potential plpgsql variable name, just return the data.
284 		 *
285 		 * Note that we also come through here if the grammar pushed back a
286 		 * T_DATUM, T_CWORD, T_WORD, or unreserved-keyword token returned by a
287 		 * previous lookup cycle; thus, pushbacks do not incur extra lookup
288 		 * work, since we'll never do the above code twice for the same token.
289 		 * This property also makes it safe to rely on the old value of
290 		 * plpgsql_yytoken in the is-this-start-of-statement test above.
291 		 */
292 	}
293 
294 	plpgsql_yylval = aux1.lval;
295 	plpgsql_yylloc = aux1.lloc;
296 	plpgsql_yyleng = aux1.leng;
297 	plpgsql_yytoken = tok1;
298 	return tok1;
299 }
300 
301 /*
302  * Internal yylex function.  This wraps the core lexer and adds one feature:
303  * a token pushback stack.  We also make a couple of trivial single-token
304  * translations from what the core lexer does to what we want, in particular
305  * interfacing from the core_YYSTYPE to YYSTYPE union.
306  */
307 static int
internal_yylex(TokenAuxData * auxdata)308 internal_yylex(TokenAuxData *auxdata)
309 {
310 	int			token;
311 	const char *yytext;
312 
313 	if (num_pushbacks > 0)
314 	{
315 		num_pushbacks--;
316 		token = pushback_token[num_pushbacks];
317 		*auxdata = pushback_auxdata[num_pushbacks];
318 	}
319 	else
320 	{
321 		token = core_yylex(&auxdata->lval.core_yystype,
322 						   &auxdata->lloc,
323 						   yyscanner);
324 
325 		/* remember the length of yytext before it gets changed */
326 		yytext = core_yy.scanbuf + auxdata->lloc;
327 		auxdata->leng = strlen(yytext);
328 
329 		/* Check for << >> and #, which the core considers operators */
330 		if (token == Op)
331 		{
332 			if (strcmp(auxdata->lval.str, "<<") == 0)
333 				token = LESS_LESS;
334 			else if (strcmp(auxdata->lval.str, ">>") == 0)
335 				token = GREATER_GREATER;
336 			else if (strcmp(auxdata->lval.str, "#") == 0)
337 				token = '#';
338 		}
339 
340 		/* The core returns PARAM as ival, but we treat it like IDENT */
341 		else if (token == PARAM)
342 		{
343 			auxdata->lval.str = pstrdup(yytext);
344 		}
345 	}
346 
347 	return token;
348 }
349 
350 /*
351  * Push back a token to be re-read by next internal_yylex() call.
352  */
353 static void
push_back_token(int token,TokenAuxData * auxdata)354 push_back_token(int token, TokenAuxData *auxdata)
355 {
356 	if (num_pushbacks >= MAX_PUSHBACKS)
357 		elog(ERROR, "too many tokens pushed back");
358 	pushback_token[num_pushbacks] = token;
359 	pushback_auxdata[num_pushbacks] = *auxdata;
360 	num_pushbacks++;
361 }
362 
363 /*
364  * Push back a single token to be re-read by next plpgsql_yylex() call.
365  *
366  * NOTE: this does not cause yylval or yylloc to "back up".  Also, it
367  * is not a good idea to push back a token code other than what you read.
368  */
369 void
plpgsql_push_back_token(int token)370 plpgsql_push_back_token(int token)
371 {
372 	TokenAuxData auxdata;
373 
374 	auxdata.lval = plpgsql_yylval;
375 	auxdata.lloc = plpgsql_yylloc;
376 	auxdata.leng = plpgsql_yyleng;
377 	push_back_token(token, &auxdata);
378 }
379 
380 /*
381  * Tell whether a token is an unreserved keyword.
382  *
383  * (If it is, its lowercased form was returned as the token value, so we
384  * do not need to offer that data here.)
385  */
386 bool
plpgsql_token_is_unreserved_keyword(int token)387 plpgsql_token_is_unreserved_keyword(int token)
388 {
389 	int			i;
390 
391 	for (i = 0; i < lengthof(UnreservedPLKeywordTokens); i++)
392 	{
393 		if (UnreservedPLKeywordTokens[i] == token)
394 			return true;
395 	}
396 	return false;
397 }
398 
399 /*
400  * Append the function text starting at startlocation and extending to
401  * (not including) endlocation onto the existing contents of "buf".
402  */
403 void
plpgsql_append_source_text(StringInfo buf,int startlocation,int endlocation)404 plpgsql_append_source_text(StringInfo buf,
405 						   int startlocation, int endlocation)
406 {
407 	Assert(startlocation <= endlocation);
408 	appendBinaryStringInfo(buf, scanorig + startlocation,
409 						   endlocation - startlocation);
410 }
411 
412 /*
413  * Peek one token ahead in the input stream.  Only the token code is
414  * made available, not any of the auxiliary info such as location.
415  *
416  * NB: no variable or unreserved keyword lookup is performed here, they will
417  * be returned as IDENT. Reserved keywords are resolved as usual.
418  */
419 int
plpgsql_peek(void)420 plpgsql_peek(void)
421 {
422 	int			tok1;
423 	TokenAuxData aux1;
424 
425 	tok1 = internal_yylex(&aux1);
426 	push_back_token(tok1, &aux1);
427 	return tok1;
428 }
429 
430 /*
431  * Peek two tokens ahead in the input stream. The first token and its
432  * location in the query are returned in *tok1_p and *tok1_loc, second token
433  * and its location in *tok2_p and *tok2_loc.
434  *
435  * NB: no variable or unreserved keyword lookup is performed here, they will
436  * be returned as IDENT. Reserved keywords are resolved as usual.
437  */
438 void
plpgsql_peek2(int * tok1_p,int * tok2_p,int * tok1_loc,int * tok2_loc)439 plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc)
440 {
441 	int			tok1,
442 				tok2;
443 	TokenAuxData aux1,
444 				aux2;
445 
446 	tok1 = internal_yylex(&aux1);
447 	tok2 = internal_yylex(&aux2);
448 
449 	*tok1_p = tok1;
450 	if (tok1_loc)
451 		*tok1_loc = aux1.lloc;
452 	*tok2_p = tok2;
453 	if (tok2_loc)
454 		*tok2_loc = aux2.lloc;
455 
456 	push_back_token(tok2, &aux2);
457 	push_back_token(tok1, &aux1);
458 }
459 
460 /*
461  * plpgsql_scanner_errposition
462  *		Report an error cursor position, if possible.
463  *
464  * This is expected to be used within an ereport() call.  The return value
465  * is a dummy (always 0, in fact).
466  *
467  * Note that this can only be used for messages emitted during initial
468  * parsing of a plpgsql function, since it requires the scanorig string
469  * to still be available.
470  */
471 int
plpgsql_scanner_errposition(int location)472 plpgsql_scanner_errposition(int location)
473 {
474 	int			pos;
475 
476 	if (location < 0 || scanorig == NULL)
477 		return 0;				/* no-op if location is unknown */
478 
479 	/* Convert byte offset to character number */
480 	pos = pg_mbstrlen_with_len(scanorig, location) + 1;
481 	/* And pass it to the ereport mechanism */
482 	(void) internalerrposition(pos);
483 	/* Also pass the function body string */
484 	return internalerrquery(scanorig);
485 }
486 
487 /*
488  * plpgsql_yyerror
489  *		Report a lexer or grammar error.
490  *
491  * The message's cursor position refers to the current token (the one
492  * last returned by plpgsql_yylex()).
493  * This is OK for syntax error messages from the Bison parser, because Bison
494  * parsers report error as soon as the first unparsable token is reached.
495  * Beware of using yyerror for other purposes, as the cursor position might
496  * be misleading!
497  */
498 void
plpgsql_yyerror(const char * message)499 plpgsql_yyerror(const char *message)
500 {
501 	char	   *yytext = core_yy.scanbuf + plpgsql_yylloc;
502 
503 	if (*yytext == '\0')
504 	{
505 		ereport(ERROR,
506 				(errcode(ERRCODE_SYNTAX_ERROR),
507 		/* translator: %s is typically the translation of "syntax error" */
508 				 errmsg("%s at end of input", _(message)),
509 				 plpgsql_scanner_errposition(plpgsql_yylloc)));
510 	}
511 	else
512 	{
513 		/*
514 		 * If we have done any lookahead then flex will have restored the
515 		 * character after the end-of-token.  Zap it again so that we report
516 		 * only the single token here.  This modifies scanbuf but we no longer
517 		 * care about that.
518 		 */
519 		yytext[plpgsql_yyleng] = '\0';
520 
521 		ereport(ERROR,
522 				(errcode(ERRCODE_SYNTAX_ERROR),
523 		/* translator: first %s is typically the translation of "syntax error" */
524 				 errmsg("%s at or near \"%s\"", _(message), yytext),
525 				 plpgsql_scanner_errposition(plpgsql_yylloc)));
526 	}
527 }
528 
529 /*
530  * Given a location (a byte offset in the function source text),
531  * return a line number.
532  *
533  * We expect that this is typically called for a sequence of increasing
534  * location values, so optimize accordingly by tracking the endpoints
535  * of the "current" line.
536  */
537 int
plpgsql_location_to_lineno(int location)538 plpgsql_location_to_lineno(int location)
539 {
540 	const char *loc;
541 
542 	if (location < 0 || scanorig == NULL)
543 		return 0;				/* garbage in, garbage out */
544 	loc = scanorig + location;
545 
546 	/* be correct, but not fast, if input location goes backwards */
547 	if (loc < cur_line_start)
548 		location_lineno_init();
549 
550 	while (cur_line_end != NULL && loc > cur_line_end)
551 	{
552 		cur_line_start = cur_line_end + 1;
553 		cur_line_num++;
554 		cur_line_end = strchr(cur_line_start, '\n');
555 	}
556 
557 	return cur_line_num;
558 }
559 
560 /* initialize or reset the state for plpgsql_location_to_lineno */
561 static void
location_lineno_init(void)562 location_lineno_init(void)
563 {
564 	cur_line_start = scanorig;
565 	cur_line_num = 1;
566 
567 	cur_line_end = strchr(cur_line_start, '\n');
568 }
569 
570 /* return the most recently computed lineno */
571 int
plpgsql_latest_lineno(void)572 plpgsql_latest_lineno(void)
573 {
574 	return cur_line_num;
575 }
576 
577 
578 /*
579  * Called before any actual parsing is done
580  *
581  * Note: the passed "str" must remain valid until plpgsql_scanner_finish().
582  * Although it is not fed directly to flex, we need the original string
583  * to cite in error messages.
584  */
585 void
plpgsql_scanner_init(const char * str)586 plpgsql_scanner_init(const char *str)
587 {
588 	/* Start up the core scanner */
589 	yyscanner = scanner_init(str, &core_yy,
590 							 &ReservedPLKeywords, ReservedPLKeywordTokens);
591 
592 	/*
593 	 * scanorig points to the original string, which unlike the scanner's
594 	 * scanbuf won't be modified on-the-fly by flex.  Notice that although
595 	 * yytext points into scanbuf, we rely on being able to apply locations
596 	 * (offsets from string start) to scanorig as well.
597 	 */
598 	scanorig = str;
599 
600 	/* Other setup */
601 	plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL;
602 	plpgsql_yytoken = 0;
603 
604 	num_pushbacks = 0;
605 
606 	location_lineno_init();
607 }
608 
609 /*
610  * Called after parsing is done to clean up after plpgsql_scanner_init()
611  */
612 void
plpgsql_scanner_finish(void)613 plpgsql_scanner_finish(void)
614 {
615 	/* release storage */
616 	scanner_finish(yyscanner);
617 	/* avoid leaving any dangling pointers */
618 	yyscanner = NULL;
619 	scanorig = NULL;
620 }
621