%top{
/*-------------------------------------------------------------------------
 *
 * psqlscan.l
 *	  lexical scanner for SQL commands
 *
 * This lexer used to be part of psql, and that heritage is reflected in
 * the file name as well as function and typedef names, though it can now
 * be used by other frontend programs as well.  It's also possible to extend
 * this lexer with a compatible add-on lexer to handle program-specific
 * backslash commands.
 *
 * This code is mainly concerned with determining where the end of a SQL
 * statement is: we are looking for semicolons that are not within quotes,
 * comments, or parentheses.  The most reliable way to handle this is to
 * borrow the backend's flex lexer rules, lock, stock, and barrel.  The rules
 * below are (except for a few) the same as the backend's, but their actions
 * are just ECHO whereas the backend's actions generally do other things.
 *
 * XXX The rules in this file must be kept in sync with the backend lexer!!!
 *
 * XXX Avoid creating backtracking cases --- see the backend lexer for info.
 *
 * See psqlscan_int.h for additional commentary.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/fe_utils/psqlscan.l
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include "fe_utils/psqlscan.h"

#include "libpq-fe.h"
}

%{
#include "fe_utils/psqlscan_int.h"

/*
 * We must have a typedef YYSTYPE for yylex's first argument, but this lexer
 * doesn't presently make use of that argument, so just declare it as int.
 */
typedef int YYSTYPE;

/*
 * Set the type of yyextra; we use it as a pointer back to the containing
 * PsqlScanState.
 */
#define YY_EXTRA_TYPE PsqlScanState


/* Return values from yylex() */
#define LEXRES_EOL			0	/* end of input */
#define LEXRES_SEMI			1	/* command-terminating semicolon found */
#define LEXRES_BACKSLASH	2	/* backslash command start */


#define ECHO psqlscan_emit(cur_state, yytext, yyleng)

/*
 * Work around a bug in flex 2.5.35: it emits a couple of functions that
 * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
 * this would cause warnings.  Providing our own declarations should be
 * harmless even when the bug gets fixed.
 */
extern int	psql_yyget_column(yyscan_t yyscanner);
extern void psql_yyset_column(int column_no, yyscan_t yyscanner);

%}

%option reentrant
%option bison-bridge
%option 8bit
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
%option warn
%option prefix="psql_yy"

/*
 * All of the following definitions and rules should exactly match
 * src/backend/parser/scan.l so far as the flex patterns are concerned.
 * The rule bodies are just ECHO as opposed to what the backend does,
 * however.  (But be sure to duplicate code that affects the lexing process,
 * such as BEGIN() and yyless().)  Also, psqlscan uses a single <<EOF>> rule
 * whereas scan.l has a separate one for each exclusive state.
 */

/*
 * OK, here is a short description of lex/flex rules behavior.
 * The longest pattern which matches an input string is always chosen.
 * For equal-length patterns, the first occurring in the rules list is chosen.
 * INITIAL is the starting state, to which all non-conditional rules apply.
 * Exclusive states change parsing rules while the state is active.  When in
 * an exclusive state, only those rules defined for that state apply.
 *
 * We use exclusive states for quoted strings, extended comments,
 * and to eliminate parsing troubles for numeric strings.
 * Exclusive states:
 *  <xb> bit string literal
 *  <xc> extended C-style comments
 *  <xd> delimited identifiers (double-quoted identifiers)
 *  <xh> hexadecimal numeric string
 *  <xq> standard quoted strings
 *  <xe> extended quoted strings (support backslash escape sequences)
 *  <xdolq> $foo$ quoted strings
 *  <xui> quoted identifier with Unicode escapes
 *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
 *  <xus> quoted string with Unicode escapes
 *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
 *
 * Note: we intentionally don't mimic the backend's <xeu> state; we have
 * no need to distinguish it from <xe> state, and no good way to get out
 * of it in error cases.  The backend just throws yyerror() in those
 * cases, but that's not an option here.
 */

%x xb
%x xc
%x xd
%x xh
%x xe
%x xq
%x xdolq
%x xui
%x xuiend
%x xus
%x xusend

/*
 * In order to make the world safe for Windows and Mac clients as well as
 * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 * sequence will be seen as two successive newlines, but that doesn't cause
 * any problems.  Comments that start with -- and extend to the next
 * newline are treated as equivalent to a single whitespace character.
 *
 * NOTE a fine point: if there is no newline following --, we will absorb
 * everything to the end of the input as a comment.  This is correct.  Older
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
 * XXX perhaps \f (formfeed) should be treated as a newline as well?
 *
 * XXX if you change the set of whitespace characters, fix scanner_isspace()
 * to agree, and see also the plpgsql lexer.
*/

space			[ \t\n\r\f]
horiz_space		[ \t\f]
newline			[\n\r]
non_newline		[^\n\r]

comment			("--"{non_newline}*)

whitespace		({space}+|{comment})

/*
 * SQL requires at least one newline in the whitespace separating
 * string literals that are to be concatenated.  Silly, but who are we
 * to argue?  Note that {whitespace_with_newline} should not have * after
 * it, whereas {whitespace} should generally have a * after it...
 */

special_whitespace		({space}+|{comment}{newline})
horiz_whitespace		({horiz_space}|{comment})
whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

/*
 * To ensure that {quotecontinue} can be scanned without having to back up
 * if the full pattern isn't matched, we include trailing whitespace in
 * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
 * except for {quote} followed by whitespace and just one "-" (not two,
 * which would start a {comment}).  To cover that we have {quotefail}.
 * The actions for {quotestop} and {quotefail} must throw back characters
 * beyond the quote proper.
 */
quote			'
quotestop		{quote}{whitespace}*
quotecontinue	{quote}{whitespace_with_newline}{quote}
quotefail		{quote}{whitespace}*"-"

/* Bit string
 * It is tempting to scan the string for only those characters
 * which are allowed.  However, this leads to silently swallowed
 * characters if illegal characters are included in the string.
 * For example, if xbinside is [01] then B'ABCD' is interpreted
 * as a zero-length string, and the ABCD' is lost!
 * Better to pass the string forward and let the input routines
 * validate the contents.
 */
xbstart			[bB]{quote}
xbinside		[^']*

/* Hexadecimal number */
xhstart			[xX]{quote}
xhinside		[^']*

/* National character */
xnstart			[nN]{quote}

/* Quoted string that allows backslash escapes */
xestart			[eE]{quote}
xeinside		[^\\']+
xeescape		[\\][^0-7]
xeoctesc		[\\][0-7]{1,3}
xehexesc		[\\]x[0-9A-Fa-f]{1,2}
xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})

/* Extended quote
 * xqdouble implements embedded quote, ''''
 */
xqstart			{quote}
xqdouble		{quote}{quote}
xqinside		[^']+

/* $foo$ style quotes ("dollar quoting")
 * The quoted string starts with $foo$ where "foo" is an optional string
 * in the form of an identifier, except that it may not contain "$",
 * and extends to the first occurrence of an identical string.
 * There is *no* processing of the quoted text.
 *
 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
 * fails to match its trailing "$".
 */
dolq_start		[A-Za-z\200-\377_]
dolq_cont		[A-Za-z\200-\377_0-9]
dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
dolqfailed		\${dolq_start}{dolq_cont}*
dolqinside		[^$]+

/* Double quote
 * Allows embedded spaces and other special characters into identifiers.
 */
dquote			\"
xdstart			{dquote}
xdstop			{dquote}
xddouble		{dquote}{dquote}
xdinside		[^"]+

/* Unicode escapes */
uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
/* error rule to avoid backup */
uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]

/* Quoted identifier with Unicode escapes */
xuistart		[uU]&{dquote}

/* Quoted string with Unicode escapes */
xusstart		[uU]&{quote}

/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
xustop1		{uescapefail}?
xustop2		{uescape}

/* error rule to avoid backup */
xufailed		[uU]&


/* C-style comments
 *
 * The "extended comment" syntax closely resembles allowable operator syntax.
 * The tricky part here is to get lex to recognize a string starting with
 * slash-star as a comment, when interpreting it as an operator would produce
 * a longer match --- remember lex will prefer a longer match!  Also, if we
 * have something like plus-slash-star, lex will think this is a 3-character
 * operator whereas we want to see it as a + operator and a comment start.
 * The solution is two-fold:
 * 1. append {op_chars}* to xcstart so that it matches as much text as
 *    {operator} would. Then the tie-breaker (first matching rule of same
 *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 *    in case it contains a star-slash that should terminate the comment.
 * 2. In the operator rule, check for slash-star within the operator, and
 *    if found throw it back with yyless().  This handles the plus-slash-star
 *    problem.
 * Dash-dash comments have similar interactions with the operator rule.
 */
xcstart			\/\*{op_chars}*
xcstop			\*+\/
xcinside		[^*/]+

digit			[0-9]
ident_start		[A-Za-z\200-\377_]
ident_cont		[A-Za-z\200-\377_0-9\$]

identifier		{ident_start}{ident_cont}*

/* Assorted special-case operators and operator-like tokens */
typecast		"::"
dot_dot			\.\.
colon_equals	":="

/*
 * These operator-like tokens (unlike the above ones) also match the {operator}
 * rule, which means that they might be overridden by a longer match if they
 * are followed by a comment start or a + or - character. Accordingly, if you
 * add to this list, you must also add corresponding code to the {operator}
 * block to return the correct token in such cases. (This is not needed in
 * psqlscan.l since the token value is ignored there.)
 */
equals_greater	"=>"
less_equals		"<="
greater_equals	">="
less_greater	"<>"
not_equals		"!="

/*
 * "self" is the set of chars that should be returned as single-character
 * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 * which can be one or more characters long (but if a single-char token
 * appears in the "self" set, it is not to be returned as an Op).  Note
 * that the sets overlap, but each has some chars that are not in the other.
 *
 * If you change either set, adjust the character lists appearing in the
 * rule for "operator"!
 */
self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator		{op_chars}+

/* we no longer allow unary minus in numbers.
 * instead we pass it separately to parser. there it gets
 * coerced via doNegate() -- Leon aug 20 1999
 *
 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
 *
 * {realfail1} and {realfail2} are added to prevent the need for scanner
 * backup when the {real} rule fails to match completely.
 */

integer			{digit}+
decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
decimalfail		{digit}+\.\.
real			({integer}|{decimal})[Ee][-+]?{digit}+
realfail1		({integer}|{decimal})[Ee]
realfail2		({integer}|{decimal})[Ee][-+]

param			\${integer}

/* psql-specific: characters allowed in variable names */
variable_char	[A-Za-z\200-\377_0-9]

other			.

/*
 * Dollar quoted strings are totally opaque, and no escaping is done on them.
 * Other quoted strings must allow some special characters such as single-quote
 *  and newline.
 * Embedded single-quotes are implemented both in the SQL standard
 *  style of two adjacent single quotes "''" and in the Postgres/Java style
 *  of escaped-quote "\'".
 * Other embedded escaped characters are matched explicitly and the leading
 *  backslash is dropped from the string.
 * Note that xcstart must appear before operator, as explained above!
 *  Also whitespace (comment) must appear before operator.
 */

%%

%{
		/* Declare some local variables inside yylex(), for convenience */
		PsqlScanState cur_state = yyextra;
		PQExpBuffer output_buf = cur_state->output_buf;

		/*
		 * Force flex into the state indicated by start_state.  This has a
		 * couple of purposes: it lets some of the functions below set a new
		 * starting state without ugly direct access to flex variables, and it
		 * allows us to transition from one flex lexer to another so that we
		 * can lex different parts of the source string using separate lexers.
		 */
		BEGIN(cur_state->start_state);
%}

{whitespace}	{
					/*
					 * Note that the whitespace rule includes both true
					 * whitespace and single-line ("--" style) comments.
					 * We suppress whitespace at the start of the query
					 * buffer.  We also suppress all single-line comments,
					 * which is pretty dubious but is the historical
					 * behavior.
*/
					if (!(output_buf->len == 0 || yytext[0] == '-'))
						ECHO;
				}

{xcstart}		{
					cur_state->xcdepth = 0;
					BEGIN(xc);
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstart}	{
					cur_state->xcdepth++;
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstop}	{
					/* xcdepth counts nested comments; zero means outermost */
					if (cur_state->xcdepth <= 0)
						BEGIN(INITIAL);
					else
						cur_state->xcdepth--;
					ECHO;
				}

<xc>{xcinside}	{
					ECHO;
				}

<xc>{op_chars}	{
					ECHO;
				}

<xc>\*+			{
					ECHO;
				}

{xbstart}		{
					BEGIN(xb);
					ECHO;
				}
<xb>{quotestop}	|
<xb>{quotefail} {
					/* throw back all but the quote */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xh>{xhinside}	|
<xb>{xbinside}	{
					ECHO;
				}
<xh>{quotecontinue}	|
<xb>{quotecontinue}	{
					ECHO;
				}

{xhstart}		{
					/* Hexadecimal bit type.
					 * At some point we should simply pass the string
					 * forward to the parser and label it there.
					 * In the meantime, place a leading "x" on the string
					 * to mark it for the input routine as a hex string.
					 */
					BEGIN(xh);
					ECHO;
				}
<xh>{quotestop}	|
<xh>{quotefail} {
					/* throw back all but the quote */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}

{xnstart}		{
					yyless(1);	/* eat only 'n' this time */
					ECHO;
				}

{xqstart}		{
					/* plain quotes allow backslash escapes unless std_strings */
					if (cur_state->std_strings)
						BEGIN(xq);
					else
						BEGIN(xe);
					ECHO;
				}
{xestart}		{
					BEGIN(xe);
					ECHO;
				}
{xusstart}		{
					BEGIN(xus);
					ECHO;
				}
<xq,xe>{quotestop}	|
<xq,xe>{quotefail} {
					/* throw back all but the quote */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xus>{quotestop} |
<xus>{quotefail} {
					/* throw back all but the quote */
					yyless(1);
					BEGIN(xusend);
					ECHO;
				}
<xusend>{whitespace} {
					ECHO;
				}
<xusend>{other} |
<xusend>{xustop1} {
					/* no UESCAPE clause: rescan this text in INITIAL state */
					yyless(0);
					BEGIN(INITIAL);
					ECHO;
				}
<xusend>{xustop2} {
					BEGIN(INITIAL);
					ECHO;
				}
<xq,xe,xus>{xqdouble} {
					ECHO;
				}
<xq,xus>{xqinside}  {
					ECHO;
				}
<xe>{xeinside}  {
					ECHO;
				}
<xe>{xeunicode} {
					ECHO;
				}
<xe>{xeunicodefail}	{
					ECHO;
				}
<xe>{xeescape}  {
					ECHO;
				}
<xe>{xeoctesc}  {
					ECHO;
				}
<xe>{xehexesc}  {
					ECHO;
				}
<xq,xe,xus>{quotecontinue} {
					ECHO;
				}
<xe>.			{
					/* This is only needed for \ just before EOF */
					ECHO;
				}

{dolqdelim}		{
					cur_state->dolqstart = pg_strdup(yytext);
					BEGIN(xdolq);
					ECHO;
				}
{dolqfailed}	{
					/* throw back all but the initial "$" */
					yyless(1);
					ECHO;
				}
<xdolq>{dolqdelim} {
					if (strcmp(yytext, cur_state->dolqstart) == 0)
					{
						free(cur_state->dolqstart);
						cur_state->dolqstart = NULL;
						BEGIN(INITIAL);
					}
					else
					{
						/*
						 * When we fail to match $...$ to dolqstart, transfer
						 * the $... part to the output, but put back the final
						 * $ for rescanning.
* Consider $delim$...$junk$delim$
						 */
						yyless(yyleng - 1);
					}
					ECHO;
				}
<xdolq>{dolqinside} {
					ECHO;
				}
<xdolq>{dolqfailed} {
					ECHO;
				}
<xdolq>.		{
					/* This is only needed for $ inside the quoted text */
					ECHO;
				}

{xdstart}		{
					BEGIN(xd);
					ECHO;
				}
{xuistart}		{
					BEGIN(xui);
					ECHO;
				}
<xd>{xdstop}	{
					BEGIN(INITIAL);
					ECHO;
				}
<xui>{dquote} {
					yyless(1);
					BEGIN(xuiend);
					ECHO;
				}
<xuiend>{whitespace} {
					ECHO;
				}
<xuiend>{other} |
<xuiend>{xustop1} {
					/* no UESCAPE clause: rescan this text in INITIAL state */
					yyless(0);
					BEGIN(INITIAL);
					ECHO;
				}
<xuiend>{xustop2}	{
					BEGIN(INITIAL);
					ECHO;
				}
<xd,xui>{xddouble}	{
					ECHO;
				}
<xd,xui>{xdinside}	{
					ECHO;
				}

{xufailed}	{
					/* throw back all but the initial u/U */
					yyless(1);
					ECHO;
				}

{typecast}		{
					ECHO;
				}

{dot_dot}		{
					ECHO;
				}

{colon_equals}	{
					ECHO;
				}

{equals_greater} {
					ECHO;
				}

{less_equals}	{
					ECHO;
				}

{greater_equals} {
					ECHO;
				}

{less_greater}	{
					ECHO;
				}

{not_equals}	{
					ECHO;
				}

	/*
	 * These rules are specific to psql --- they implement parenthesis
	 * counting and detection of command-ending semicolon.  These must
	 * appear before the {self} rule so that they take precedence over it.
	 */

"("				{
					cur_state->paren_depth++;
					ECHO;
				}

")"				{
					/* never let an unbalanced ")" drive the depth negative */
					if (cur_state->paren_depth > 0)
						cur_state->paren_depth--;
					ECHO;
				}

";"				{
					ECHO;
					/* a semicolon inside parentheses does not end the command */
					if (cur_state->paren_depth == 0)
					{
						/* Terminate lexing temporarily */
						cur_state->start_state = YY_START;
						return LEXRES_SEMI;
					}
				}

	/*
	 * psql-specific rules to handle backslash commands and variable
	 * substitution.  We want these before {self}, also.
	 */

"\\"[;:]		{
					/* Force a semicolon or colon into the query buffer */
					psqlscan_emit(cur_state, yytext + 1, 1);
				}

"\\"			{
					/* Terminate lexing temporarily */
					cur_state->start_state = YY_START;
					return LEXRES_BACKSLASH;
				}

:{variable_char}+	{
					/* Possible psql variable substitution */
					char	   *varname;
					char	   *value;

					varname = psqlscan_extract_substring(cur_state,
														 yytext + 1,
														 yyleng - 1);
					if (cur_state->callbacks->get_variable)
						value = cur_state->callbacks->get_variable(varname,
																   false,
																   false);
					else
						value = NULL;

					if (value)
					{
						/* It is a variable, check for recursion */
						if (psqlscan_var_is_current_source(cur_state, varname))
						{
							/* Recursive expansion --- don't go there */
							cur_state->callbacks->write_error("skipping recursive expansion of variable \"%s\"\n",
															  varname);
							/* Instead copy the string as is */
							ECHO;
						}
						else
						{
							/* OK, perform substitution */
							psqlscan_push_new_buffer(cur_state, value, varname);
							/* yy_scan_string already made buffer active */
						}
						free(value);
					}
					else
					{
						/*
						 * if the variable doesn't exist we'll copy the string
						 * as is
						 */
						ECHO;
					}

					free(varname);
				}

:'{variable_char}+'	{
					psqlscan_escape_variable(cur_state, yytext, yyleng, false);
				}

:\"{variable_char}+\"	{
					psqlscan_escape_variable(cur_state, yytext, yyleng, true);
				}

	/*
	 * These rules just avoid the need for scanner backup if one of the
	 * two rules above fails to match completely.
	 */

:'{variable_char}*	{
					/* Throw back everything but the colon */
					yyless(1);
					ECHO;
				}

:\"{variable_char}*	{
					/* Throw back everything but the colon */
					yyless(1);
					ECHO;
				}

	/*
	 * Back to backend-compatible rules.
*/

{self}			{
					ECHO;
				}

{operator}		{
					/*
					 * Check for embedded slash-star or dash-dash; those
					 * are comment starts, so operator must stop there.
					 * Note that slash-star or dash-dash at the first
					 * character will match a prior rule, not this one.
					 */
					int		nchars = yyleng;
					char   *slashstar = strstr(yytext, "/*");
					char   *dashdash = strstr(yytext, "--");

					if (slashstar && dashdash)
					{
						/* if both appear, take the first one */
						if (slashstar > dashdash)
							slashstar = dashdash;
					}
					else if (!slashstar)
						slashstar = dashdash;
					if (slashstar)
						nchars = slashstar - yytext;

					/*
					 * For SQL compatibility, '+' and '-' cannot be the
					 * last char of a multi-char operator unless the operator
					 * contains chars that are not in SQL operators.
					 * The idea is to lex '=-' as two operators, but not
					 * to forbid operator names like '?-' that could not be
					 * sequences of SQL operators.
					 */
					if (nchars > 1 &&
						(yytext[nchars - 1] == '+' ||
						 yytext[nchars - 1] == '-'))
					{
						int		ic;

						/* scan backwards for a non-SQL operator character */
						for (ic = nchars - 2; ic >= 0; ic--)
						{
							char c = yytext[ic];
							if (c == '~' || c == '!' || c == '@' ||
								c == '#' || c == '^' || c == '&' ||
								c == '|' || c == '`' || c == '?' ||
								c == '%')
								break;
						}
						if (ic < 0)
						{
							/*
							 * didn't find a qualifying character, so remove
							 * all trailing [+-]
							 */
							do {
								nchars--;
							} while (nchars > 1 &&
								 (yytext[nchars - 1] == '+' ||
								  yytext[nchars - 1] == '-'));
						}
					}

					if (nchars < yyleng)
					{
						/* Strip the unwanted chars from the token */
						yyless(nchars);
					}
					ECHO;
				}

{param}			{
					ECHO;
				}

{integer}		{
					ECHO;
				}
{decimal}		{
					ECHO;
				}
{decimalfail}	{
					/* throw back the .., and treat as integer */
					yyless(yyleng - 2);
					ECHO;
				}
{real}			{
					ECHO;
				}
{realfail1}		{
					/*
					 * throw back the [Ee], and treat as {decimal}.  Note
					 * that it is possible the input is actually {integer},
					 * but since this case will almost certainly lead to a
					 * syntax error anyway, we don't bother to distinguish.
					 */
					yyless(yyleng - 1);
					ECHO;
				}
{realfail2}		{
					/* throw back the [Ee][+-], and proceed as above */
					yyless(yyleng - 2);
					ECHO;
				}


{identifier}	{
					ECHO;
				}

{other}			{
					ECHO;
				}

<<EOF>>			{
					if (cur_state->buffer_stack == NULL)
					{
						cur_state->start_state = YY_START;
						return LEXRES_EOL;		/* end of input reached */
					}

					/*
					 * We were expanding a variable, so pop the inclusion
					 * stack and keep lexing
					 */
					psqlscan_pop_buffer_stack(cur_state);
					psqlscan_select_top_buffer(cur_state);
				}

%%

/*
 * Create a lexer working state struct.
 *
 * callbacks is a struct of function pointers that encapsulate some
 * behavior we need from the surrounding program.  This struct must
 * remain valid for the lifespan of the PsqlScanState.
913 */ 914 PsqlScanState 915 psql_scan_create(const PsqlScanCallbacks *callbacks) 916 { 917 PsqlScanState state; 918 919 state = (PsqlScanStateData *) pg_malloc0(sizeof(PsqlScanStateData)); 920 921 state->callbacks = callbacks; 922 923 yylex_init(&state->scanner); 924 925 yyset_extra(state, state->scanner); 926 927 psql_scan_reset(state); 928 929 return state; 930 } 931 932 /* 933 * Destroy a lexer working state struct, releasing all resources. 934 */ 935 void 936 psql_scan_destroy(PsqlScanState state) 937 { 938 psql_scan_finish(state); 939 940 psql_scan_reset(state); 941 942 yylex_destroy(state->scanner); 943 944 free(state); 945 } 946 947 /* 948 * Set up to perform lexing of the given input line. 949 * 950 * The text at *line, extending for line_len bytes, will be scanned by 951 * subsequent calls to the psql_scan routines. psql_scan_finish should 952 * be called when scanning is complete. Note that the lexer retains 953 * a pointer to the storage at *line --- this string must not be altered 954 * or freed until after psql_scan_finish is called. 955 * 956 * encoding is the libpq identifier for the character encoding in use, 957 * and std_strings says whether standard_conforming_strings is on. 958 */ 959 void 960 psql_scan_setup(PsqlScanState state, 961 const char *line, int line_len, 962 int encoding, bool std_strings) 963 { 964 /* Mustn't be scanning already */ 965 Assert(state->scanbufhandle == NULL); 966 Assert(state->buffer_stack == NULL); 967 968 /* Do we need to hack the character set encoding? 
*/ 969 state->encoding = encoding; 970 state->safe_encoding = pg_valid_server_encoding_id(encoding); 971 972 /* Save standard-strings flag as well */ 973 state->std_strings = std_strings; 974 975 /* Set up flex input buffer with appropriate translation and padding */ 976 state->scanbufhandle = psqlscan_prepare_buffer(state, line, line_len, 977 &state->scanbuf); 978 state->scanline = line; 979 980 /* Set lookaside data in case we have to map unsafe encoding */ 981 state->curline = state->scanbuf; 982 state->refline = state->scanline; 983 } 984 985 /* 986 * Do lexical analysis of SQL command text. 987 * 988 * The text previously passed to psql_scan_setup is scanned, and appended 989 * (possibly with transformation) to query_buf. 990 * 991 * The return value indicates the condition that stopped scanning: 992 * 993 * PSCAN_SEMICOLON: found a command-ending semicolon. (The semicolon is 994 * transferred to query_buf.) The command accumulated in query_buf should 995 * be executed, then clear query_buf and call again to scan the remainder 996 * of the line. 997 * 998 * PSCAN_BACKSLASH: found a backslash that starts a special command. 999 * Any previous data on the line has been transferred to query_buf. 1000 * The caller will typically next apply a separate flex lexer to scan 1001 * the special command. 1002 * 1003 * PSCAN_INCOMPLETE: the end of the line was reached, but we have an 1004 * incomplete SQL command. *prompt is set to the appropriate prompt type. 1005 * 1006 * PSCAN_EOL: the end of the line was reached, and there is no lexical 1007 * reason to consider the command incomplete. The caller may or may not 1008 * choose to send it. *prompt is set to the appropriate prompt type if 1009 * the caller chooses to collect more input. 1010 * 1011 * In the PSCAN_INCOMPLETE and PSCAN_EOL cases, psql_scan_finish() should 1012 * be called next, then the cycle may be repeated with a fresh input line. 
1013 * 1014 * In all cases, *prompt is set to an appropriate prompt type code for the 1015 * next line-input operation. 1016 */ 1017 PsqlScanResult 1018 psql_scan(PsqlScanState state, 1019 PQExpBuffer query_buf, 1020 promptStatus_t *prompt) 1021 { 1022 PsqlScanResult result; 1023 int lexresult; 1024 1025 /* Must be scanning already */ 1026 Assert(state->scanbufhandle != NULL); 1027 1028 /* Set current output target */ 1029 state->output_buf = query_buf; 1030 1031 /* Set input source */ 1032 if (state->buffer_stack != NULL) 1033 yy_switch_to_buffer(state->buffer_stack->buf, state->scanner); 1034 else 1035 yy_switch_to_buffer(state->scanbufhandle, state->scanner); 1036 1037 /* And lex. */ 1038 lexresult = yylex(NULL, state->scanner); 1039 1040 /* 1041 * Check termination state and return appropriate result info. 1042 */ 1043 switch (lexresult) 1044 { 1045 case LEXRES_EOL: /* end of input */ 1046 switch (state->start_state) 1047 { 1048 case INITIAL: 1049 case xuiend: /* we treat these like INITIAL */ 1050 case xusend: 1051 if (state->paren_depth > 0) 1052 { 1053 result = PSCAN_INCOMPLETE; 1054 *prompt = PROMPT_PAREN; 1055 } 1056 else if (query_buf->len > 0) 1057 { 1058 result = PSCAN_EOL; 1059 *prompt = PROMPT_CONTINUE; 1060 } 1061 else 1062 { 1063 /* never bother to send an empty buffer */ 1064 result = PSCAN_INCOMPLETE; 1065 *prompt = PROMPT_READY; 1066 } 1067 break; 1068 case xb: 1069 result = PSCAN_INCOMPLETE; 1070 *prompt = PROMPT_SINGLEQUOTE; 1071 break; 1072 case xc: 1073 result = PSCAN_INCOMPLETE; 1074 *prompt = PROMPT_COMMENT; 1075 break; 1076 case xd: 1077 result = PSCAN_INCOMPLETE; 1078 *prompt = PROMPT_DOUBLEQUOTE; 1079 break; 1080 case xh: 1081 result = PSCAN_INCOMPLETE; 1082 *prompt = PROMPT_SINGLEQUOTE; 1083 break; 1084 case xe: 1085 result = PSCAN_INCOMPLETE; 1086 *prompt = PROMPT_SINGLEQUOTE; 1087 break; 1088 case xq: 1089 result = PSCAN_INCOMPLETE; 1090 *prompt = PROMPT_SINGLEQUOTE; 1091 break; 1092 case xdolq: 1093 result = PSCAN_INCOMPLETE; 1094 
*prompt = PROMPT_DOLLARQUOTE; 1095 break; 1096 case xui: 1097 result = PSCAN_INCOMPLETE; 1098 *prompt = PROMPT_DOUBLEQUOTE; 1099 break; 1100 case xus: 1101 result = PSCAN_INCOMPLETE; 1102 *prompt = PROMPT_SINGLEQUOTE; 1103 break; 1104 default: 1105 /* can't get here */ 1106 fprintf(stderr, "invalid YY_START\n"); 1107 exit(1); 1108 } 1109 break; 1110 case LEXRES_SEMI: /* semicolon */ 1111 result = PSCAN_SEMICOLON; 1112 *prompt = PROMPT_READY; 1113 break; 1114 case LEXRES_BACKSLASH: /* backslash */ 1115 result = PSCAN_BACKSLASH; 1116 *prompt = PROMPT_READY; 1117 break; 1118 default: 1119 /* can't get here */ 1120 fprintf(stderr, "invalid yylex result\n"); 1121 exit(1); 1122 } 1123 1124 return result; 1125 } 1126 1127 /* 1128 * Clean up after scanning a string. This flushes any unread input and 1129 * releases resources (but not the PsqlScanState itself). Note however 1130 * that this does not reset the lexer scan state; that can be done by 1131 * psql_scan_reset(), which is an orthogonal operation. 1132 * 1133 * It is legal to call this when not scanning anything (makes it easier 1134 * to deal with error recovery). 1135 */ 1136 void 1137 psql_scan_finish(PsqlScanState state) 1138 { 1139 /* Drop any incomplete variable expansions. */ 1140 while (state->buffer_stack != NULL) 1141 psqlscan_pop_buffer_stack(state); 1142 1143 /* Done with the outer scan buffer, too */ 1144 if (state->scanbufhandle) 1145 yy_delete_buffer(state->scanbufhandle, state->scanner); 1146 state->scanbufhandle = NULL; 1147 if (state->scanbuf) 1148 free(state->scanbuf); 1149 state->scanbuf = NULL; 1150 } 1151 1152 /* 1153 * Reset lexer scanning state to start conditions. This is appropriate 1154 * for executing \r psql commands (or any other time that we discard the 1155 * prior contents of query_buf). 
It is not, however, necessary to do this 1156 * when we execute and clear the buffer after getting a PSCAN_SEMICOLON or 1157 * PSCAN_EOL scan result, because the scan state must be INITIAL when those 1158 * conditions are returned. 1159 * 1160 * Note that this is unrelated to flushing unread input; that task is 1161 * done by psql_scan_finish(). 1162 */ 1163 void 1164 psql_scan_reset(PsqlScanState state) 1165 { 1166 state->start_state = INITIAL; 1167 state->paren_depth = 0; 1168 state->xcdepth = 0; /* not really necessary */ 1169 if (state->dolqstart) 1170 free(state->dolqstart); 1171 state->dolqstart = NULL; 1172 } 1173 1174 /* 1175 * Reselect this lexer (psqlscan.l) after using another one. 1176 * 1177 * Currently and for foreseeable uses, it's sufficient to reset to INITIAL 1178 * state, because we'd never switch to another lexer in a different state. 1179 * However, we don't want to reset e.g. paren_depth, so this can't be 1180 * the same as psql_scan_reset(). 1181 * 1182 * Note: psql setjmp error recovery just calls psql_scan_reset(), so that 1183 * must be a superset of this. 1184 * 1185 * Note: it seems likely that other lexers could just assign INITIAL for 1186 * themselves, since that probably has the value zero in every flex-generated 1187 * lexer. But let's not assume that. 1188 */ 1189 void 1190 psql_scan_reselect_sql_lexer(PsqlScanState state) 1191 { 1192 state->start_state = INITIAL; 1193 } 1194 1195 /* 1196 * Return true if lexer is currently in an "inside quotes" state. 1197 * 1198 * This is pretty grotty but is needed to preserve the old behavior 1199 * that mainloop.c drops blank lines not inside quotes without even 1200 * echoing them. 1201 */ 1202 bool 1203 psql_scan_in_quote(PsqlScanState state) 1204 { 1205 return state->start_state != INITIAL; 1206 } 1207 1208 /* 1209 * Push the given string onto the stack of stuff to scan. 1210 * 1211 * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer. 
1212 */ 1213 void 1214 psqlscan_push_new_buffer(PsqlScanState state, const char *newstr, 1215 const char *varname) 1216 { 1217 StackElem *stackelem; 1218 1219 stackelem = (StackElem *) pg_malloc(sizeof(StackElem)); 1220 1221 /* 1222 * In current usage, the passed varname points at the current flex input 1223 * buffer; we must copy it before calling psqlscan_prepare_buffer() 1224 * because that will change the buffer state. 1225 */ 1226 stackelem->varname = varname ? pg_strdup(varname) : NULL; 1227 1228 stackelem->buf = psqlscan_prepare_buffer(state, newstr, strlen(newstr), 1229 &stackelem->bufstring); 1230 state->curline = stackelem->bufstring; 1231 if (state->safe_encoding) 1232 { 1233 stackelem->origstring = NULL; 1234 state->refline = stackelem->bufstring; 1235 } 1236 else 1237 { 1238 stackelem->origstring = pg_strdup(newstr); 1239 state->refline = stackelem->origstring; 1240 } 1241 stackelem->next = state->buffer_stack; 1242 state->buffer_stack = stackelem; 1243 } 1244 1245 /* 1246 * Pop the topmost buffer stack item (there must be one!) 1247 * 1248 * NB: after this, the flex input state is unspecified; caller must 1249 * switch to an appropriate buffer to continue lexing. 1250 * See psqlscan_select_top_buffer(). 1251 */ 1252 void 1253 psqlscan_pop_buffer_stack(PsqlScanState state) 1254 { 1255 StackElem *stackelem = state->buffer_stack; 1256 1257 state->buffer_stack = stackelem->next; 1258 yy_delete_buffer(stackelem->buf, state->scanner); 1259 free(stackelem->bufstring); 1260 if (stackelem->origstring) 1261 free(stackelem->origstring); 1262 if (stackelem->varname) 1263 free(stackelem->varname); 1264 free(stackelem); 1265 } 1266 1267 /* 1268 * Select the topmost surviving buffer as the active input. 
1269 */ 1270 void 1271 psqlscan_select_top_buffer(PsqlScanState state) 1272 { 1273 StackElem *stackelem = state->buffer_stack; 1274 1275 if (stackelem != NULL) 1276 { 1277 yy_switch_to_buffer(stackelem->buf, state->scanner); 1278 state->curline = stackelem->bufstring; 1279 state->refline = stackelem->origstring ? stackelem->origstring : stackelem->bufstring; 1280 } 1281 else 1282 { 1283 yy_switch_to_buffer(state->scanbufhandle, state->scanner); 1284 state->curline = state->scanbuf; 1285 state->refline = state->scanline; 1286 } 1287 } 1288 1289 /* 1290 * Check if specified variable name is the source for any string 1291 * currently being scanned 1292 */ 1293 bool 1294 psqlscan_var_is_current_source(PsqlScanState state, const char *varname) 1295 { 1296 StackElem *stackelem; 1297 1298 for (stackelem = state->buffer_stack; 1299 stackelem != NULL; 1300 stackelem = stackelem->next) 1301 { 1302 if (stackelem->varname && strcmp(stackelem->varname, varname) == 0) 1303 return true; 1304 } 1305 return false; 1306 } 1307 1308 /* 1309 * Set up a flex input buffer to scan the given data. We always make a 1310 * copy of the data. If working in an unsafe encoding, the copy has 1311 * multibyte sequences replaced by FFs to avoid fooling the lexer rules. 1312 * 1313 * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer. 1314 */ 1315 YY_BUFFER_STATE 1316 psqlscan_prepare_buffer(PsqlScanState state, const char *txt, int len, 1317 char **txtcopy) 1318 { 1319 char *newtxt; 1320 1321 /* Flex wants two \0 characters after the actual data */ 1322 newtxt = pg_malloc(len + 2); 1323 *txtcopy = newtxt; 1324 newtxt[len] = newtxt[len + 1] = YY_END_OF_BUFFER_CHAR; 1325 1326 if (state->safe_encoding) 1327 memcpy(newtxt, txt, len); 1328 else 1329 { 1330 /* Gotta do it the hard way */ 1331 int i = 0; 1332 1333 while (i < len) 1334 { 1335 int thislen = PQmblen(txt + i, state->encoding); 1336 1337 /* first byte should always be okay... 
*/ 1338 newtxt[i] = txt[i]; 1339 i++; 1340 while (--thislen > 0 && i < len) 1341 newtxt[i++] = (char) 0xFF; 1342 } 1343 } 1344 1345 return yy_scan_buffer(newtxt, len + 2, state->scanner); 1346 } 1347 1348 /* 1349 * psqlscan_emit() --- body for ECHO macro 1350 * 1351 * NB: this must be used for ALL and ONLY the text copied from the flex 1352 * input data. If you pass it something that is not part of the yytext 1353 * string, you are making a mistake. Internally generated text can be 1354 * appended directly to state->output_buf. 1355 */ 1356 void 1357 psqlscan_emit(PsqlScanState state, const char *txt, int len) 1358 { 1359 PQExpBuffer output_buf = state->output_buf; 1360 1361 if (state->safe_encoding) 1362 appendBinaryPQExpBuffer(output_buf, txt, len); 1363 else 1364 { 1365 /* Gotta do it the hard way */ 1366 const char *reference = state->refline; 1367 int i; 1368 1369 reference += (txt - state->curline); 1370 1371 for (i = 0; i < len; i++) 1372 { 1373 char ch = txt[i]; 1374 1375 if (ch == (char) 0xFF) 1376 ch = reference[i]; 1377 appendPQExpBufferChar(output_buf, ch); 1378 } 1379 } 1380 } 1381 1382 /* 1383 * psqlscan_extract_substring --- fetch value of (part of) the current token 1384 * 1385 * This is like psqlscan_emit(), except that the data is returned as a 1386 * malloc'd string rather than being pushed directly to state->output_buf. 
1387 */ 1388 char * 1389 psqlscan_extract_substring(PsqlScanState state, const char *txt, int len) 1390 { 1391 char *result = (char *) pg_malloc(len + 1); 1392 1393 if (state->safe_encoding) 1394 memcpy(result, txt, len); 1395 else 1396 { 1397 /* Gotta do it the hard way */ 1398 const char *reference = state->refline; 1399 int i; 1400 1401 reference += (txt - state->curline); 1402 1403 for (i = 0; i < len; i++) 1404 { 1405 char ch = txt[i]; 1406 1407 if (ch == (char) 0xFF) 1408 ch = reference[i]; 1409 result[i] = ch; 1410 } 1411 } 1412 result[len] = '\0'; 1413 return result; 1414 } 1415 1416 /* 1417 * psqlscan_escape_variable --- process :'VARIABLE' or :"VARIABLE" 1418 * 1419 * If the variable name is found, escape its value using the appropriate 1420 * quoting method and emit the value to output_buf. (Since the result is 1421 * surely quoted, there is never any reason to rescan it.) If we don't 1422 * find the variable or escaping fails, emit the token as-is. 1423 */ 1424 void 1425 psqlscan_escape_variable(PsqlScanState state, const char *txt, int len, 1426 bool as_ident) 1427 { 1428 char *varname; 1429 char *value; 1430 1431 /* Variable lookup. */ 1432 varname = psqlscan_extract_substring(state, txt + 2, len - 3); 1433 if (state->callbacks->get_variable) 1434 value = state->callbacks->get_variable(varname, true, as_ident); 1435 else 1436 value = NULL; 1437 free(varname); 1438 1439 if (value) 1440 { 1441 /* Emit the suitably-escaped value */ 1442 appendPQExpBufferStr(state->output_buf, value); 1443 free(value); 1444 } 1445 else 1446 { 1447 /* Emit original token as-is */ 1448 psqlscan_emit(state, txt, len); 1449 } 1450 } 1451