1 /*------------------------------------------------------------------------- 2 * 3 * pl_scanner.c 4 * lexical scanning for PL/pgSQL 5 * 6 * 7 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * 11 * IDENTIFICATION 12 * src/pl/plpgsql/src/pl_scanner.c 13 * 14 *------------------------------------------------------------------------- 15 */ 16 #include "postgres.h" 17 18 #include "mb/pg_wchar.h" 19 #include "parser/scanner.h" 20 21 #include "plpgsql.h" 22 #include "pl_gram.h" /* must be after parser/scanner.h */ 23 24 25 /* Klugy flag to tell scanner how to look up identifiers */ 26 IdentifierLookup plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL; 27 28 /* 29 * A word about keywords: 30 * 31 * We keep reserved and unreserved keywords in separate headers. Be careful 32 * not to put the same word in both headers. Also be sure that pl_gram.y's 33 * unreserved_keyword production agrees with the unreserved header. The 34 * reserved keywords are passed to the core scanner, so they will be 35 * recognized before (and instead of) any variable name. Unreserved words 36 * are checked for separately, usually after determining that the identifier 37 * isn't a known variable name. If plpgsql_IdentifierLookup is DECLARE then 38 * no variable names will be recognized, so the unreserved words always work. 39 * (Note in particular that this helps us avoid reserving keywords that are 40 * only needed in DECLARE sections.) 41 * 42 * In certain contexts it is desirable to prefer recognizing an unreserved 43 * keyword over recognizing a variable name. In particular, at the start 44 * of a statement we should prefer unreserved keywords unless the statement 45 * looks like an assignment (i.e., first token is followed by ':=' or '['). 46 * This rule allows most statement-introducing keywords to be kept unreserved. 47 * (We still have to reserve initial keywords that might follow a block 48 * label, unfortunately, since the method used to determine if we are at 49 * start of statement doesn't recognize such cases. We'd also have to 50 * reserve any keyword that could legitimately be followed by ':=' or '['.) 51 * Some additional cases are handled in pl_gram.y using tok_is_keyword(). 52 * 53 * We try to avoid reserving more keywords than we have to; but there's 54 * little point in not reserving a word if it's reserved in the core grammar. 55 * Currently, the following words are reserved here but not in the core: 56 * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE 57 */ 58 59 /* ScanKeywordList lookup data for PL/pgSQL keywords */ 60 #include "pl_reserved_kwlist_d.h" 61 #include "pl_unreserved_kwlist_d.h" 62 63 /* Token codes for PL/pgSQL keywords */ 64 #define PG_KEYWORD(kwname, value) value, 65 66 static const uint16 ReservedPLKeywordTokens[] = { 67 #include "pl_reserved_kwlist.h" 68 }; 69 70 static const uint16 UnreservedPLKeywordTokens[] = { 71 #include "pl_unreserved_kwlist.h" 72 }; 73 74 #undef PG_KEYWORD 75 76 /* 77 * This macro must recognize all tokens that can immediately precede a 78 * PL/pgSQL executable statement (that is, proc_sect or proc_stmt in the 79 * grammar). Fortunately, there are not very many, so hard-coding in this 80 * fashion seems sufficient. 81 */ 82 #define AT_STMT_START(prev_token) \ 83 ((prev_token) == ';' || \ 84 (prev_token) == K_BEGIN || \ 85 (prev_token) == K_THEN || \ 86 (prev_token) == K_ELSE || \ 87 (prev_token) == K_LOOP) 88 89 90 /* Auxiliary data about a token (other than the token type) */ 91 typedef struct 92 { 93 YYSTYPE lval; /* semantic information */ 94 YYLTYPE lloc; /* offset in scanbuf */ 95 int leng; /* length in bytes */ 96 } TokenAuxData; 97 98 /* 99 * Scanner working state. At some point we might wish to fold all this 100 * into a YY_EXTRA struct. For the moment, there is no need for plpgsql's 101 * lexer to be re-entrant, and the notational burden of passing a yyscanner 102 * pointer around is great enough to not want to do it without need. 103 */ 104 105 /* The stuff the core lexer needs */ 106 static core_yyscan_t yyscanner = NULL; 107 static core_yy_extra_type core_yy; 108 109 /* The original input string */ 110 static const char *scanorig; 111 112 /* Current token's length (corresponds to plpgsql_yylval and plpgsql_yylloc) */ 113 static int plpgsql_yyleng; 114 115 /* Current token's code (corresponds to plpgsql_yylval and plpgsql_yylloc) */ 116 static int plpgsql_yytoken; 117 118 /* Token pushback stack */ 119 #define MAX_PUSHBACKS 4 120 121 static int num_pushbacks; 122 static int pushback_token[MAX_PUSHBACKS]; 123 static TokenAuxData pushback_auxdata[MAX_PUSHBACKS]; 124 125 /* State for plpgsql_location_to_lineno() */ 126 static const char *cur_line_start; 127 static const char *cur_line_end; 128 static int cur_line_num; 129 130 /* Internal functions */ 131 static int internal_yylex(TokenAuxData *auxdata); 132 static void push_back_token(int token, TokenAuxData *auxdata); 133 static void location_lineno_init(void); 134 135 136 /* 137 * This is the yylex routine called from the PL/pgSQL grammar. 138 * It is a wrapper around the core lexer, with the ability to recognize 139 * PL/pgSQL variables and return them as special T_DATUM tokens. If a 140 * word or compound word does not match any variable name, or if matching 141 * is turned off by plpgsql_IdentifierLookup, it is returned as 142 * T_WORD or T_CWORD respectively, or as an unreserved keyword if it 143 * matches one of those. 144 */ 145 int 146 plpgsql_yylex(void) 147 { 148 int tok1; 149 TokenAuxData aux1; 150 int kwnum; 151 152 tok1 = internal_yylex(&aux1); 153 if (tok1 == IDENT || tok1 == PARAM) 154 { 155 int tok2; 156 TokenAuxData aux2; 157 158 tok2 = internal_yylex(&aux2); 159 if (tok2 == '.') 160 { 161 int tok3; 162 TokenAuxData aux3; 163 164 tok3 = internal_yylex(&aux3); 165 if (tok3 == IDENT) 166 { 167 int tok4; 168 TokenAuxData aux4; 169 170 tok4 = internal_yylex(&aux4); 171 if (tok4 == '.') 172 { 173 int tok5; 174 TokenAuxData aux5; 175 176 tok5 = internal_yylex(&aux5); 177 if (tok5 == IDENT) 178 { 179 if (plpgsql_parse_tripword(aux1.lval.str, 180 aux3.lval.str, 181 aux5.lval.str, 182 &aux1.lval.wdatum, 183 &aux1.lval.cword)) 184 tok1 = T_DATUM; 185 else 186 tok1 = T_CWORD; 187 } 188 else 189 { 190 /* not A.B.C, so just process A.B */ 191 push_back_token(tok5, &aux5); 192 push_back_token(tok4, &aux4); 193 if (plpgsql_parse_dblword(aux1.lval.str, 194 aux3.lval.str, 195 &aux1.lval.wdatum, 196 &aux1.lval.cword)) 197 tok1 = T_DATUM; 198 else 199 tok1 = T_CWORD; 200 } 201 } 202 else 203 { 204 /* not A.B.C, so just process A.B */ 205 push_back_token(tok4, &aux4); 206 if (plpgsql_parse_dblword(aux1.lval.str, 207 aux3.lval.str, 208 &aux1.lval.wdatum, 209 &aux1.lval.cword)) 210 tok1 = T_DATUM; 211 else 212 tok1 = T_CWORD; 213 } 214 } 215 else 216 { 217 /* not A.B, so just process A */ 218 push_back_token(tok3, &aux3); 219 push_back_token(tok2, &aux2); 220 if (plpgsql_parse_word(aux1.lval.str, 221 core_yy.scanbuf + aux1.lloc, 222 true, 223 &aux1.lval.wdatum, 224 &aux1.lval.word)) 225 tok1 = T_DATUM; 226 else if (!aux1.lval.word.quoted && 227 (kwnum = ScanKeywordLookup(aux1.lval.word.ident, 228 &UnreservedPLKeywords)) >= 0) 229 { 230 aux1.lval.keyword = GetScanKeyword(kwnum, 231 &UnreservedPLKeywords); 232 tok1 = UnreservedPLKeywordTokens[kwnum]; 233 } 234 else 235 tok1 = T_WORD; 236 } 237 } 238 else 239 { 240 /* not A.B, so just process A */ 241 push_back_token(tok2, &aux2); 242 243 /* 244 * See if it matches a variable name, except in the context where 245 * we are at start of statement and the next token isn't 246 * assignment or '['. In that case, it couldn't validly be a 247 * variable name, and skipping the lookup allows variable names to 248 * be used that would conflict with plpgsql or core keywords that 249 * introduce statements (e.g., "comment"). Without this special 250 * logic, every statement-introducing keyword would effectively be 251 * reserved in PL/pgSQL, which would be unpleasant. 252 * 253 * If it isn't a variable name, try to match against unreserved 254 * plpgsql keywords. If not one of those either, it's T_WORD. 255 * 256 * Note: we must call plpgsql_parse_word even if we don't want to 257 * do variable lookup, because it sets up aux1.lval.word for the 258 * non-variable cases. 259 */ 260 if (plpgsql_parse_word(aux1.lval.str, 261 core_yy.scanbuf + aux1.lloc, 262 (!AT_STMT_START(plpgsql_yytoken) || 263 (tok2 == '=' || tok2 == COLON_EQUALS || 264 tok2 == '[')), 265 &aux1.lval.wdatum, 266 &aux1.lval.word)) 267 tok1 = T_DATUM; 268 else if (!aux1.lval.word.quoted && 269 (kwnum = ScanKeywordLookup(aux1.lval.word.ident, 270 &UnreservedPLKeywords)) >= 0) 271 { 272 aux1.lval.keyword = GetScanKeyword(kwnum, 273 &UnreservedPLKeywords); 274 tok1 = UnreservedPLKeywordTokens[kwnum]; 275 } 276 else 277 tok1 = T_WORD; 278 } 279 } 280 else 281 { 282 /* 283 * Not a potential plpgsql variable name, just return the data. 284 * 285 * Note that we also come through here if the grammar pushed back a 286 * T_DATUM, T_CWORD, T_WORD, or unreserved-keyword token returned by a 287 * previous lookup cycle; thus, pushbacks do not incur extra lookup 288 * work, since we'll never do the above code twice for the same token. 289 * This property also makes it safe to rely on the old value of 290 * plpgsql_yytoken in the is-this-start-of-statement test above. 291 */ 292 } 293 294 plpgsql_yylval = aux1.lval; 295 plpgsql_yylloc = aux1.lloc; 296 plpgsql_yyleng = aux1.leng; 297 plpgsql_yytoken = tok1; 298 return tok1; 299 } 300 301 /* 302 * Internal yylex function. This wraps the core lexer and adds one feature: 303 * a token pushback stack. We also make a couple of trivial single-token 304 * translations from what the core lexer does to what we want, in particular 305 * interfacing from the core_YYSTYPE to YYSTYPE union. 306 */ 307 static int 308 internal_yylex(TokenAuxData *auxdata) 309 { 310 int token; 311 const char *yytext; 312 313 if (num_pushbacks > 0) 314 { 315 num_pushbacks--; 316 token = pushback_token[num_pushbacks]; 317 *auxdata = pushback_auxdata[num_pushbacks]; 318 } 319 else 320 { 321 token = core_yylex(&auxdata->lval.core_yystype, 322 &auxdata->lloc, 323 yyscanner); 324 325 /* remember the length of yytext before it gets changed */ 326 yytext = core_yy.scanbuf + auxdata->lloc; 327 auxdata->leng = strlen(yytext); 328 329 /* Check for << >> and #, which the core considers operators */ 330 if (token == Op) 331 { 332 if (strcmp(auxdata->lval.str, "<<") == 0) 333 token = LESS_LESS; 334 else if (strcmp(auxdata->lval.str, ">>") == 0) 335 token = GREATER_GREATER; 336 else if (strcmp(auxdata->lval.str, "#") == 0) 337 token = '#'; 338 } 339 340 /* The core returns PARAM as ival, but we treat it like IDENT */ 341 else if (token == PARAM) 342 { 343 auxdata->lval.str = pstrdup(yytext); 344 } 345 } 346 347 return token; 348 } 349 350 /* 351 * Push back a token to be re-read by next internal_yylex() call. 352 */ 353 static void 354 push_back_token(int token, TokenAuxData *auxdata) 355 { 356 if (num_pushbacks >= MAX_PUSHBACKS) 357 elog(ERROR, "too many tokens pushed back"); 358 pushback_token[num_pushbacks] = token; 359 pushback_auxdata[num_pushbacks] = *auxdata; 360 num_pushbacks++; 361 } 362 363 /* 364 * Push back a single token to be re-read by next plpgsql_yylex() call. 365 * 366 * NOTE: this does not cause yylval or yylloc to "back up". Also, it 367 * is not a good idea to push back a token code other than what you read. 368 */ 369 void 370 plpgsql_push_back_token(int token) 371 { 372 TokenAuxData auxdata; 373 374 auxdata.lval = plpgsql_yylval; 375 auxdata.lloc = plpgsql_yylloc; 376 auxdata.leng = plpgsql_yyleng; 377 push_back_token(token, &auxdata); 378 } 379 380 /* 381 * Tell whether a token is an unreserved keyword. 382 * 383 * (If it is, its lowercased form was returned as the token value, so we 384 * do not need to offer that data here.) 385 */ 386 bool 387 plpgsql_token_is_unreserved_keyword(int token) 388 { 389 int i; 390 391 for (i = 0; i < lengthof(UnreservedPLKeywordTokens); i++) 392 { 393 if (UnreservedPLKeywordTokens[i] == token) 394 return true; 395 } 396 return false; 397 } 398 399 /* 400 * Append the function text starting at startlocation and extending to 401 * (not including) endlocation onto the existing contents of "buf". 402 */ 403 void 404 plpgsql_append_source_text(StringInfo buf, 405 int startlocation, int endlocation) 406 { 407 Assert(startlocation <= endlocation); 408 appendBinaryStringInfo(buf, scanorig + startlocation, 409 endlocation - startlocation); 410 } 411 412 /* 413 * Peek one token ahead in the input stream. Only the token code is 414 * made available, not any of the auxiliary info such as location. 415 * 416 * NB: no variable or unreserved keyword lookup is performed here, they will 417 * be returned as IDENT. Reserved keywords are resolved as usual. 418 */ 419 int 420 plpgsql_peek(void) 421 { 422 int tok1; 423 TokenAuxData aux1; 424 425 tok1 = internal_yylex(&aux1); 426 push_back_token(tok1, &aux1); 427 return tok1; 428 } 429 430 /* 431 * Peek two tokens ahead in the input stream. The first token and its 432 * location in the query are returned in *tok1_p and *tok1_loc, second token 433 * and its location in *tok2_p and *tok2_loc. 434 * 435 * NB: no variable or unreserved keyword lookup is performed here, they will 436 * be returned as IDENT. Reserved keywords are resolved as usual. 437 */ 438 void 439 plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, int *tok2_loc) 440 { 441 int tok1, 442 tok2; 443 TokenAuxData aux1, 444 aux2; 445 446 tok1 = internal_yylex(&aux1); 447 tok2 = internal_yylex(&aux2); 448 449 *tok1_p = tok1; 450 if (tok1_loc) 451 *tok1_loc = aux1.lloc; 452 *tok2_p = tok2; 453 if (tok2_loc) 454 *tok2_loc = aux2.lloc; 455 456 push_back_token(tok2, &aux2); 457 push_back_token(tok1, &aux1); 458 } 459 460 /* 461 * plpgsql_scanner_errposition 462 * Report an error cursor position, if possible. 463 * 464 * This is expected to be used within an ereport() call. The return value 465 * is a dummy (always 0, in fact). 466 * 467 * Note that this can only be used for messages emitted during initial 468 * parsing of a plpgsql function, since it requires the scanorig string 469 * to still be available. 470 */ 471 int 472 plpgsql_scanner_errposition(int location) 473 { 474 int pos; 475 476 if (location < 0 || scanorig == NULL) 477 return 0; /* no-op if location is unknown */ 478 479 /* Convert byte offset to character number */ 480 pos = pg_mbstrlen_with_len(scanorig, location) + 1; 481 /* And pass it to the ereport mechanism */ 482 (void) internalerrposition(pos); 483 /* Also pass the function body string */ 484 return internalerrquery(scanorig); 485 } 486 487 /* 488 * plpgsql_yyerror 489 * Report a lexer or grammar error. 490 * 491 * The message's cursor position refers to the current token (the one 492 * last returned by plpgsql_yylex()). 493 * This is OK for syntax error messages from the Bison parser, because Bison 494 * parsers report error as soon as the first unparsable token is reached. 495 * Beware of using yyerror for other purposes, as the cursor position might 496 * be misleading! 497 */ 498 void 499 plpgsql_yyerror(const char *message) 500 { 501 char *yytext = core_yy.scanbuf + plpgsql_yylloc; 502 503 if (*yytext == '\0') 504 { 505 ereport(ERROR, 506 (errcode(ERRCODE_SYNTAX_ERROR), 507 /* translator: %s is typically the translation of "syntax error" */ 508 errmsg("%s at end of input", _(message)), 509 plpgsql_scanner_errposition(plpgsql_yylloc))); 510 } 511 else 512 { 513 /* 514 * If we have done any lookahead then flex will have restored the 515 * character after the end-of-token. Zap it again so that we report 516 * only the single token here. This modifies scanbuf but we no longer 517 * care about that. 518 */ 519 yytext[plpgsql_yyleng] = '\0'; 520 521 ereport(ERROR, 522 (errcode(ERRCODE_SYNTAX_ERROR), 523 /* translator: first %s is typically the translation of "syntax error" */ 524 errmsg("%s at or near \"%s\"", _(message), yytext), 525 plpgsql_scanner_errposition(plpgsql_yylloc))); 526 } 527 } 528 529 /* 530 * Given a location (a byte offset in the function source text), 531 * return a line number. 532 * 533 * We expect that this is typically called for a sequence of increasing 534 * location values, so optimize accordingly by tracking the endpoints 535 * of the "current" line. 536 */ 537 int 538 plpgsql_location_to_lineno(int location) 539 { 540 const char *loc; 541 542 if (location < 0 || scanorig == NULL) 543 return 0; /* garbage in, garbage out */ 544 loc = scanorig + location; 545 546 /* be correct, but not fast, if input location goes backwards */ 547 if (loc < cur_line_start) 548 location_lineno_init(); 549 550 while (cur_line_end != NULL && loc > cur_line_end) 551 { 552 cur_line_start = cur_line_end + 1; 553 cur_line_num++; 554 cur_line_end = strchr(cur_line_start, '\n'); 555 } 556 557 return cur_line_num; 558 } 559 560 /* initialize or reset the state for plpgsql_location_to_lineno */ 561 static void 562 location_lineno_init(void) 563 { 564 cur_line_start = scanorig; 565 cur_line_num = 1; 566 567 cur_line_end = strchr(cur_line_start, '\n'); 568 } 569 570 /* return the most recently computed lineno */ 571 int 572 plpgsql_latest_lineno(void) 573 { 574 return cur_line_num; 575 } 576 577 578 /* 579 * Called before any actual parsing is done 580 * 581 * Note: the passed "str" must remain valid until plpgsql_scanner_finish(). 582 * Although it is not fed directly to flex, we need the original string 583 * to cite in error messages. 584 */ 585 void 586 plpgsql_scanner_init(const char *str) 587 { 588 /* Start up the core scanner */ 589 yyscanner = scanner_init(str, &core_yy, 590 &ReservedPLKeywords, ReservedPLKeywordTokens); 591 592 /* 593 * scanorig points to the original string, which unlike the scanner's 594 * scanbuf won't be modified on-the-fly by flex. Notice that although 595 * yytext points into scanbuf, we rely on being able to apply locations 596 * (offsets from string start) to scanorig as well. 597 */ 598 scanorig = str; 599 600 /* Other setup */ 601 plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL; 602 plpgsql_yytoken = 0; 603 604 num_pushbacks = 0; 605 606 location_lineno_init(); 607 } 608 609 /* 610 * Called after parsing is done to clean up after plpgsql_scanner_init() 611 */ 612 void 613 plpgsql_scanner_finish(void) 614 { 615 /* release storage */ 616 scanner_finish(yyscanner); 617 /* avoid leaving any dangling pointers */ 618 yyscanner = NULL; 619 scanorig = NULL; 620 } 621