1 %top{ 2 /*------------------------------------------------------------------------- 3 * 4 * scan.l 5 * lexical scanner for PostgreSQL 6 * 7 * NOTE NOTE NOTE: 8 * 9 * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l 10 * and src/interfaces/ecpg/preproc/pgc.l! 11 * 12 * The rules are designed so that the scanner never has to backtrack, 13 * in the sense that there is always a rule that can match the input 14 * consumed so far (the rule action may internally throw back some input 15 * with yyless(), however). As explained in the flex manual, this makes 16 * for a useful speed increase --- about a third faster than a plain -CF 17 * lexer, in simple testing. The extra complexity is mostly in the rules 18 * for handling float numbers and continued string literals. If you change 19 * the lexical rules, verify that you haven't broken the no-backtrack 20 * property by running flex with the "-b" option and checking that the 21 * resulting "lex.backup" file says that no backing up is needed. (As of 22 * Postgres 9.2, this check is made automatically by the Makefile.) 
23 * 24 * 25 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group 26 * Portions Copyright (c) 1994, Regents of the University of California 27 * 28 * IDENTIFICATION 29 * src/backend/parser/scan.l 30 * 31 *------------------------------------------------------------------------- 32 */ 33 #include "postgres.h" 34 35 #include <ctype.h> 36 #include <unistd.h> 37 38 #include "common/string.h" 39 #include "parser/gramparse.h" 40 #include "parser/parser.h" /* only needed for GUC variables */ 41 #include "parser/scansup.h" 42 #include "mb/pg_wchar.h" 43 } 44 45 %{ 46 47 /* LCOV_EXCL_START */ 48 49 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ 50 #undef fprintf 51 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) 52 53 static void 54 fprintf_to_ereport(const char *fmt, const char *msg) 55 { 56 ereport(ERROR, (errmsg_internal("%s", msg))); 57 } 58 59 /* 60 * GUC variables. This is a DIRECT violation of the warning given at the 61 * head of gram.y, ie flex/bison code must not depend on any GUC variables; 62 * as such, changing their values can induce very unintuitive behavior. 63 * But we shall have to live with it until we can remove these variables. 64 */ 65 int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING; 66 bool escape_string_warning = true; 67 bool standard_conforming_strings = true; 68 69 /* 70 * Constant data exported from this file. This array maps from the 71 * zero-based keyword numbers returned by ScanKeywordLookup to the 72 * Bison token numbers needed by gram.y. This is exported because 73 * callers need to pass it to scanner_init, if they are using the 74 * standard keyword list ScanKeywords. 75 */ 76 #define PG_KEYWORD(kwname, value, category) value, 77 78 const uint16 ScanKeywordTokens[] = { 79 #include "parser/kwlist.h" 80 }; 81 82 #undef PG_KEYWORD 83 84 /* 85 * Set the type of YYSTYPE. 86 */ 87 #define YYSTYPE core_YYSTYPE 88 89 /* 90 * Set the type of yyextra. 
All state variables used by the scanner should 91 * be in yyextra, *not* statically allocated. 92 */ 93 #define YY_EXTRA_TYPE core_yy_extra_type * 94 95 /* 96 * Each call to yylex must set yylloc to the location of the found token 97 * (expressed as a byte offset from the start of the input text). 98 * When we parse a token that requires multiple lexer rules to process, 99 * this should be done in the first such rule, else yylloc will point 100 * into the middle of the token. 101 */ 102 #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf) 103 104 /* 105 * Advance yylloc by the given number of bytes. 106 */ 107 #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) ) 108 109 #define startlit() ( yyextra->literallen = 0 ) 110 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner); 111 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); 112 static char *litbufdup(core_yyscan_t yyscanner); 113 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner); 114 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); 115 static int process_integer_literal(const char *token, YYSTYPE *lval); 116 static bool is_utf16_surrogate_first(pg_wchar c); 117 static bool is_utf16_surrogate_second(pg_wchar c); 118 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); 119 static void addunicode(pg_wchar c, yyscan_t yyscanner); 120 static bool check_uescapechar(unsigned char escape); 121 122 #define yyerror(msg) scanner_yyerror(msg, yyscanner) 123 124 #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner) 125 126 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner); 127 static void check_escape_warning(core_yyscan_t yyscanner); 128 129 /* 130 * Work around a bug in flex 2.5.35: it emits a couple of functions that 131 * it forgets to emit declarations for. Since we use -Wmissing-prototypes, 132 * this would cause warnings. 
Providing our own declarations should be 133 * harmless even when the bug gets fixed. 134 */ 135 extern int core_yyget_column(yyscan_t yyscanner); 136 extern void core_yyset_column(int column_no, yyscan_t yyscanner); 137 138 %} 139 140 %option reentrant 141 %option bison-bridge 142 %option bison-locations 143 %option 8bit 144 %option never-interactive 145 %option nodefault 146 %option noinput 147 %option nounput 148 %option noyywrap 149 %option noyyalloc 150 %option noyyrealloc 151 %option noyyfree 152 %option warn 153 %option prefix="core_yy" 154 155 /* 156 * OK, here is a short description of lex/flex rules behavior. 157 * The longest pattern which matches an input string is always chosen. 158 * For equal-length patterns, the first occurring in the rules list is chosen. 159 * INITIAL is the starting state, to which all non-conditional rules apply. 160 * Exclusive states change parsing rules while the state is active. When in 161 * an exclusive state, only those rules defined for that state apply. 162 * 163 * We use exclusive states for quoted strings, extended comments, 164 * and to eliminate parsing troubles for numeric strings. 165 * Exclusive states: 166 * <xb> bit string literal 167 * <xc> extended C-style comments 168 * <xd> delimited identifiers (double-quoted identifiers) 169 * <xh> hexadecimal numeric string 170 * <xq> standard quoted strings 171 * <xe> extended quoted strings (support backslash escape sequences) 172 * <xdolq> $foo$ quoted strings 173 * <xui> quoted identifier with Unicode escapes 174 * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow 175 * <xus> quoted string with Unicode escapes 176 * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow 177 * <xeu> Unicode surrogate pair in extended quoted string 178 * 179 * Remember to add an <<EOF>> case whenever you add a new exclusive state! 180 * The default one is probably not the right thing. 
181 */ 182 183 %x xb 184 %x xc 185 %x xd 186 %x xh 187 %x xq 188 %x xe 189 %x xdolq 190 %x xui 191 %x xuiend 192 %x xus 193 %x xusend 194 %x xeu 195 196 /* 197 * In order to make the world safe for Windows and Mac clients as well as 198 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n 199 * sequence will be seen as two successive newlines, but that doesn't cause 200 * any problems. Comments that start with -- and extend to the next 201 * newline are treated as equivalent to a single whitespace character. 202 * 203 * NOTE a fine point: if there is no newline following --, we will absorb 204 * everything to the end of the input as a comment. This is correct. Older 205 * versions of Postgres failed to recognize -- as a comment if the input 206 * did not end with a newline. 207 * 208 * XXX perhaps \f (formfeed) should be treated as a newline as well? 209 * 210 * XXX if you change the set of whitespace characters, fix scanner_isspace() 211 * to agree. 212 */ 213 214 space [ \t\n\r\f] 215 horiz_space [ \t\f] 216 newline [\n\r] 217 non_newline [^\n\r] 218 219 comment ("--"{non_newline}*) 220 221 whitespace ({space}+|{comment}) 222 223 /* 224 * SQL requires at least one newline in the whitespace separating 225 * string literals that are to be concatenated. Silly, but who are we 226 * to argue? Note that {whitespace_with_newline} should not have * after 227 * it, whereas {whitespace} should generally have a * after it... 228 */ 229 230 special_whitespace ({space}+|{comment}{newline}) 231 horiz_whitespace ({horiz_space}|{comment}) 232 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) 233 234 /* 235 * To ensure that {quotecontinue} can be scanned without having to back up 236 * if the full pattern isn't matched, we include trailing whitespace in 237 * {quotestop}. 
This matches all cases where {quotecontinue} fails to match, 238 * except for {quote} followed by whitespace and just one "-" (not two, 239 * which would start a {comment}). To cover that we have {quotefail}. 240 * The actions for {quotestop} and {quotefail} must throw back characters 241 * beyond the quote proper. 242 */ 243 quote ' 244 quotestop {quote}{whitespace}* 245 quotecontinue {quote}{whitespace_with_newline}{quote} 246 quotefail {quote}{whitespace}*"-" 247 248 /* Bit string 249 * It is tempting to scan the string for only those characters 250 * which are allowed. However, this leads to silently swallowed 251 * characters if illegal characters are included in the string. 252 * For example, if xbinside is [01] then B'ABCD' is interpreted 253 * as a zero-length string, and the ABCD' is lost! 254 * Better to pass the string forward and let the input routines 255 * validate the contents. 256 */ 257 xbstart [bB]{quote} 258 xbinside [^']* 259 260 /* Hexadecimal number */ 261 xhstart [xX]{quote} 262 xhinside [^']* 263 264 /* National character */ 265 xnstart [nN]{quote} 266 267 /* Quoted string that allows backslash escapes */ 268 xestart [eE]{quote} 269 xeinside [^\\']+ 270 xeescape [\\][^0-7] 271 xeoctesc [\\][0-7]{1,3} 272 xehexesc [\\]x[0-9A-Fa-f]{1,2} 273 xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) 274 xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7}) 275 276 /* Extended quote 277 * xqdouble implements embedded quote, '''' 278 */ 279 xqstart {quote} 280 xqdouble {quote}{quote} 281 xqinside [^']+ 282 283 /* $foo$ style quotes ("dollar quoting") 284 * The quoted string starts with $foo$ where "foo" is an optional string 285 * in the form of an identifier, except that it may not contain "$", 286 * and extends to the first occurrence of an identical string. 287 * There is *no* processing of the quoted text. 288 * 289 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim} 290 * fails to match its trailing "$". 
291 */ 292 dolq_start [A-Za-z\200-\377_] 293 dolq_cont [A-Za-z\200-\377_0-9] 294 dolqdelim \$({dolq_start}{dolq_cont}*)?\$ 295 dolqfailed \${dolq_start}{dolq_cont}* 296 dolqinside [^$]+ 297 298 /* Double quote 299 * Allows embedded spaces and other special characters into identifiers. 300 */ 301 dquote \" 302 xdstart {dquote} 303 xdstop {dquote} 304 xddouble {dquote}{dquote} 305 xdinside [^"]+ 306 307 /* Unicode escapes */ 308 uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} 309 /* error rule to avoid backup */ 310 uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] 311 312 /* Quoted identifier with Unicode escapes */ 313 xuistart [uU]&{dquote} 314 315 /* Quoted string with Unicode escapes */ 316 xusstart [uU]&{quote} 317 318 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ 319 xustop1 {uescapefail}? 320 xustop2 {uescape} 321 322 /* error rule to avoid backup */ 323 xufailed [uU]& 324 325 326 /* C-style comments 327 * 328 * The "extended comment" syntax closely resembles allowable operator syntax. 329 * The tricky part here is to get lex to recognize a string starting with 330 * slash-star as a comment, when interpreting it as an operator would produce 331 * a longer match --- remember lex will prefer a longer match! Also, if we 332 * have something like plus-slash-star, lex will think this is a 3-character 333 * operator whereas we want to see it as a + operator and a comment start. 334 * The solution is two-fold: 335 * 1. append {op_chars}* to xcstart so that it matches as much text as 336 * {operator} would. Then the tie-breaker (first matching rule of same 337 * length) ensures xcstart wins. 
We put back the extra stuff with yyless() 338 * in case it contains a star-slash that should terminate the comment. 339 * 2. In the operator rule, check for slash-star within the operator, and 340 * if found throw it back with yyless(). This handles the plus-slash-star 341 * problem. 342 * Dash-dash comments have similar interactions with the operator rule. 343 */ 344 xcstart \/\*{op_chars}* 345 xcstop \*+\/ 346 xcinside [^*/]+ 347 348 digit [0-9] 349 ident_start [A-Za-z\200-\377_] 350 ident_cont [A-Za-z\200-\377_0-9\$] 351 352 identifier {ident_start}{ident_cont}* 353 354 /* Assorted special-case operators and operator-like tokens */ 355 typecast "::" 356 dot_dot \.\. 357 colon_equals ":=" 358 359 /* 360 * These operator-like tokens (unlike the above ones) also match the {operator} 361 * rule, which means that they might be overridden by a longer match if they 362 * are followed by a comment start or a + or - character. Accordingly, if you 363 * add to this list, you must also add corresponding code to the {operator} 364 * block to return the correct token in such cases. (This is not needed in 365 * psqlscan.l since the token value is ignored there.) 366 */ 367 equals_greater "=>" 368 less_equals "<=" 369 greater_equals ">=" 370 less_greater "<>" 371 not_equals "!=" 372 373 /* 374 * "self" is the set of chars that should be returned as single-character 375 * tokens. "op_chars" is the set of chars that can make up "Op" tokens, 376 * which can be one or more characters long (but if a single-char token 377 * appears in the "self" set, it is not to be returned as an Op). Note 378 * that the sets overlap, but each has some chars that are not in the other. 379 * 380 * If you change either set, adjust the character lists appearing in the 381 * rule for "operator"! 382 */ 383 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=] 384 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=] 385 operator {op_chars}+ 386 387 /* we no longer allow unary minus in numbers. 
388 * instead we pass it separately to parser. there it gets 389 * coerced via doNegate() -- Leon aug 20 1999 390 * 391 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10. 392 * 393 * {realfail1} and {realfail2} are added to prevent the need for scanner 394 * backup when the {real} rule fails to match completely. 395 */ 396 397 integer {digit}+ 398 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*)) 399 decimalfail {digit}+\.\. 400 real ({integer}|{decimal})[Ee][-+]?{digit}+ 401 realfail1 ({integer}|{decimal})[Ee] 402 realfail2 ({integer}|{decimal})[Ee][-+] 403 404 param \${integer} 405 406 other . 407 408 /* 409 * Dollar quoted strings are totally opaque, and no escaping is done on them. 410 * Other quoted strings must allow some special characters such as single-quote 411 * and newline. 412 * Embedded single-quotes are implemented both in the SQL standard 413 * style of two adjacent single quotes "''" and in the Postgres/Java style 414 * of escaped-quote "\'". 415 * Other embedded escaped characters are matched explicitly and the leading 416 * backslash is dropped from the string. 417 * Note that xcstart must appear before operator, as explained above! 418 * Also whitespace (comment) must appear before operator. 
419 */ 420 421 %% 422 423 {whitespace} { 424 /* ignore */ 425 } 426 427 {xcstart} { 428 /* Set location in case of syntax error in comment */ 429 SET_YYLLOC(); 430 yyextra->xcdepth = 0; 431 BEGIN(xc); 432 /* Put back any characters past slash-star; see above */ 433 yyless(2); 434 } 435 436 <xc>{ 437 {xcstart} { 438 (yyextra->xcdepth)++; 439 /* Put back any characters past slash-star; see above */ 440 yyless(2); 441 } 442 443 {xcstop} { 444 if (yyextra->xcdepth <= 0) 445 BEGIN(INITIAL); 446 else 447 (yyextra->xcdepth)--; 448 } 449 450 {xcinside} { 451 /* ignore */ 452 } 453 454 {op_chars} { 455 /* ignore */ 456 } 457 458 \*+ { 459 /* ignore */ 460 } 461 462 <<EOF>> { 463 yyerror("unterminated /* comment"); 464 } 465 } /* <xc> */ 466 467 {xbstart} { 468 /* Binary bit type. 469 * At some point we should simply pass the string 470 * forward to the parser and label it there. 471 * In the meantime, place a leading "b" on the string 472 * to mark it for the input routine as a binary string. 473 */ 474 SET_YYLLOC(); 475 BEGIN(xb); 476 startlit(); 477 addlitchar('b', yyscanner); 478 } 479 <xb>{quotestop} | 480 <xb>{quotefail} { 481 yyless(1); 482 BEGIN(INITIAL); 483 yylval->str = litbufdup(yyscanner); 484 return BCONST; 485 } 486 <xh>{xhinside} | 487 <xb>{xbinside} { 488 addlit(yytext, yyleng, yyscanner); 489 } 490 <xh>{quotecontinue} | 491 <xb>{quotecontinue} { 492 /* ignore */ 493 } 494 <xb><<EOF>> { yyerror("unterminated bit string literal"); } 495 496 {xhstart} { 497 /* Hexadecimal bit type. 498 * At some point we should simply pass the string 499 * forward to the parser and label it there. 500 * In the meantime, place a leading "x" on the string 501 * to mark it for the input routine as a hex string. 
502 */ 503 SET_YYLLOC(); 504 BEGIN(xh); 505 startlit(); 506 addlitchar('x', yyscanner); 507 } 508 <xh>{quotestop} | 509 <xh>{quotefail} { 510 yyless(1); 511 BEGIN(INITIAL); 512 yylval->str = litbufdup(yyscanner); 513 return XCONST; 514 } 515 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); } 516 517 {xnstart} { 518 /* National character. 519 * We will pass this along as a normal character string, 520 * but preceded with an internally-generated "NCHAR". 521 */ 522 int kwnum; 523 524 SET_YYLLOC(); 525 yyless(1); /* eat only 'n' this time */ 526 527 kwnum = ScanKeywordLookup("nchar", 528 yyextra->keywordlist); 529 if (kwnum >= 0) 530 { 531 yylval->keyword = GetScanKeyword(kwnum, 532 yyextra->keywordlist); 533 return yyextra->keyword_tokens[kwnum]; 534 } 535 else 536 { 537 /* If NCHAR isn't a keyword, just return "n" */ 538 yylval->str = pstrdup("n"); 539 return IDENT; 540 } 541 } 542 543 {xqstart} { 544 yyextra->warn_on_first_escape = true; 545 yyextra->saw_non_ascii = false; 546 SET_YYLLOC(); 547 if (yyextra->standard_conforming_strings) 548 BEGIN(xq); 549 else 550 BEGIN(xe); 551 startlit(); 552 } 553 {xestart} { 554 yyextra->warn_on_first_escape = false; 555 yyextra->saw_non_ascii = false; 556 SET_YYLLOC(); 557 BEGIN(xe); 558 startlit(); 559 } 560 {xusstart} { 561 SET_YYLLOC(); 562 if (!yyextra->standard_conforming_strings) 563 ereport(ERROR, 564 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), 565 errmsg("unsafe use of string constant with Unicode escapes"), 566 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."), 567 lexer_errposition())); 568 BEGIN(xus); 569 startlit(); 570 } 571 <xq,xe>{quotestop} | 572 <xq,xe>{quotefail} { 573 yyless(1); 574 BEGIN(INITIAL); 575 /* 576 * check that the data remains valid if it might have been 577 * made invalid by unescaping any chars. 
578 */ 579 if (yyextra->saw_non_ascii) 580 pg_verifymbstr(yyextra->literalbuf, 581 yyextra->literallen, 582 false); 583 yylval->str = litbufdup(yyscanner); 584 return SCONST; 585 } 586 <xus>{quotestop} | 587 <xus>{quotefail} { 588 /* throw back all but the quote */ 589 yyless(1); 590 /* xusend state looks for possible UESCAPE */ 591 BEGIN(xusend); 592 } 593 <xusend>{whitespace} { 594 /* stay in xusend state over whitespace */ 595 } 596 <xusend><<EOF>> | 597 <xusend>{other} | 598 <xusend>{xustop1} { 599 /* no UESCAPE after the quote, throw back everything */ 600 yyless(0); 601 BEGIN(INITIAL); 602 yylval->str = litbuf_udeescape('\\', yyscanner); 603 return SCONST; 604 } 605 <xusend>{xustop2} { 606 /* found UESCAPE after the end quote */ 607 BEGIN(INITIAL); 608 if (!check_uescapechar(yytext[yyleng - 2])) 609 { 610 SET_YYLLOC(); 611 ADVANCE_YYLLOC(yyleng - 2); 612 yyerror("invalid Unicode escape character"); 613 } 614 yylval->str = litbuf_udeescape(yytext[yyleng - 2], 615 yyscanner); 616 return SCONST; 617 } 618 <xq,xe,xus>{xqdouble} { 619 addlitchar('\'', yyscanner); 620 } 621 <xq,xus>{xqinside} { 622 addlit(yytext, yyleng, yyscanner); 623 } 624 <xe>{xeinside} { 625 addlit(yytext, yyleng, yyscanner); 626 } 627 <xe>{xeunicode} { 628 pg_wchar c = strtoul(yytext + 2, NULL, 16); 629 630 check_escape_warning(yyscanner); 631 632 if (is_utf16_surrogate_first(c)) 633 { 634 yyextra->utf16_first_part = c; 635 BEGIN(xeu); 636 } 637 else if (is_utf16_surrogate_second(c)) 638 yyerror("invalid Unicode surrogate pair"); 639 else 640 addunicode(c, yyscanner); 641 } 642 <xeu>{xeunicode} { 643 pg_wchar c = strtoul(yytext + 2, NULL, 16); 644 645 if (!is_utf16_surrogate_second(c)) 646 yyerror("invalid Unicode surrogate pair"); 647 648 c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c); 649 650 addunicode(c, yyscanner); 651 652 BEGIN(xe); 653 } 654 <xeu>. 
{ yyerror("invalid Unicode surrogate pair"); } 655 <xeu>\n { yyerror("invalid Unicode surrogate pair"); } 656 <xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); } 657 <xe,xeu>{xeunicodefail} { 658 ereport(ERROR, 659 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), 660 errmsg("invalid Unicode escape"), 661 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."), 662 lexer_errposition())); 663 } 664 <xe>{xeescape} { 665 if (yytext[1] == '\'') 666 { 667 if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF || 668 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING && 669 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding()))) 670 ereport(ERROR, 671 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), 672 errmsg("unsafe use of \\' in a string literal"), 673 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."), 674 lexer_errposition())); 675 } 676 check_string_escape_warning(yytext[1], yyscanner); 677 addlitchar(unescape_single_char(yytext[1], yyscanner), 678 yyscanner); 679 } 680 <xe>{xeoctesc} { 681 unsigned char c = strtoul(yytext + 1, NULL, 8); 682 683 check_escape_warning(yyscanner); 684 addlitchar(c, yyscanner); 685 if (c == '\0' || IS_HIGHBIT_SET(c)) 686 yyextra->saw_non_ascii = true; 687 } 688 <xe>{xehexesc} { 689 unsigned char c = strtoul(yytext + 2, NULL, 16); 690 691 check_escape_warning(yyscanner); 692 addlitchar(c, yyscanner); 693 if (c == '\0' || IS_HIGHBIT_SET(c)) 694 yyextra->saw_non_ascii = true; 695 } 696 <xq,xe,xus>{quotecontinue} { 697 /* ignore */ 698 } 699 <xe>. 
{ 700 /* This is only needed for \ just before EOF */ 701 addlitchar(yytext[0], yyscanner); 702 } 703 <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); } 704 705 {dolqdelim} { 706 SET_YYLLOC(); 707 yyextra->dolqstart = pstrdup(yytext); 708 BEGIN(xdolq); 709 startlit(); 710 } 711 {dolqfailed} { 712 SET_YYLLOC(); 713 /* throw back all but the initial "$" */ 714 yyless(1); 715 /* and treat it as {other} */ 716 return yytext[0]; 717 } 718 <xdolq>{dolqdelim} { 719 if (strcmp(yytext, yyextra->dolqstart) == 0) 720 { 721 pfree(yyextra->dolqstart); 722 yyextra->dolqstart = NULL; 723 BEGIN(INITIAL); 724 yylval->str = litbufdup(yyscanner); 725 return SCONST; 726 } 727 else 728 { 729 /* 730 * When we fail to match $...$ to dolqstart, transfer 731 * the $... part to the output, but put back the final 732 * $ for rescanning. Consider $delim$...$junk$delim$ 733 */ 734 addlit(yytext, yyleng - 1, yyscanner); 735 yyless(yyleng - 1); 736 } 737 } 738 <xdolq>{dolqinside} { 739 addlit(yytext, yyleng, yyscanner); 740 } 741 <xdolq>{dolqfailed} { 742 addlit(yytext, yyleng, yyscanner); 743 } 744 <xdolq>. 
{ 745 /* This is only needed for $ inside the quoted text */ 746 addlitchar(yytext[0], yyscanner); 747 } 748 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); } 749 750 {xdstart} { 751 SET_YYLLOC(); 752 BEGIN(xd); 753 startlit(); 754 } 755 {xuistart} { 756 SET_YYLLOC(); 757 BEGIN(xui); 758 startlit(); 759 } 760 <xd>{xdstop} { 761 char *ident; 762 763 BEGIN(INITIAL); 764 if (yyextra->literallen == 0) 765 yyerror("zero-length delimited identifier"); 766 ident = litbufdup(yyscanner); 767 if (yyextra->literallen >= NAMEDATALEN) 768 truncate_identifier(ident, yyextra->literallen, true); 769 yylval->str = ident; 770 return IDENT; 771 } 772 <xui>{dquote} { 773 yyless(1); 774 /* xuiend state looks for possible UESCAPE */ 775 BEGIN(xuiend); 776 } 777 <xuiend>{whitespace} { 778 /* stay in xuiend state over whitespace */ 779 } 780 <xuiend><<EOF>> | 781 <xuiend>{other} | 782 <xuiend>{xustop1} { 783 /* no UESCAPE after the quote, throw back everything */ 784 char *ident; 785 int identlen; 786 787 yyless(0); 788 789 BEGIN(INITIAL); 790 if (yyextra->literallen == 0) 791 yyerror("zero-length delimited identifier"); 792 ident = litbuf_udeescape('\\', yyscanner); 793 identlen = strlen(ident); 794 if (identlen >= NAMEDATALEN) 795 truncate_identifier(ident, identlen, true); 796 yylval->str = ident; 797 return IDENT; 798 } 799 <xuiend>{xustop2} { 800 /* found UESCAPE after the end quote */ 801 char *ident; 802 int identlen; 803 804 BEGIN(INITIAL); 805 if (yyextra->literallen == 0) 806 yyerror("zero-length delimited identifier"); 807 if (!check_uescapechar(yytext[yyleng - 2])) 808 { 809 SET_YYLLOC(); 810 ADVANCE_YYLLOC(yyleng - 2); 811 yyerror("invalid Unicode escape character"); 812 } 813 ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); 814 identlen = strlen(ident); 815 if (identlen >= NAMEDATALEN) 816 truncate_identifier(ident, identlen, true); 817 yylval->str = ident; 818 return IDENT; 819 } 820 <xd,xui>{xddouble} { 821 addlitchar('"', yyscanner); 822 } 823 
<xd,xui>{xdinside}	{
					addlit(yytext, yyleng, yyscanner);
				}
<xd,xui><<EOF>>		{ yyerror("unterminated quoted identifier"); }

{xufailed}	{
					char	   *ident;

					SET_YYLLOC();
					/* throw back all but the initial u/U */
					yyless(1);
					/* and treat it as {identifier} */
					ident = downcase_truncate_identifier(yytext, yyleng, true);
					yylval->str = ident;
					return IDENT;
				}

{typecast}		{
					SET_YYLLOC();
					return TYPECAST;
				}

{dot_dot}		{
					SET_YYLLOC();
					return DOT_DOT;
				}

{colon_equals}	{
					SET_YYLLOC();
					return COLON_EQUALS;
				}

{equals_greater} {
					SET_YYLLOC();
					return EQUALS_GREATER;
				}

{less_equals}	{
					SET_YYLLOC();
					return LESS_EQUALS;
				}

{greater_equals} {
					SET_YYLLOC();
					return GREATER_EQUALS;
				}

{less_greater}	{
					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
					SET_YYLLOC();
					return NOT_EQUALS;
				}

{not_equals}	{
					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
					SET_YYLLOC();
					return NOT_EQUALS;
				}

{self}			{
					SET_YYLLOC();
					return yytext[0];
				}

{operator}		{
					/*
					 * Check for embedded slash-star or dash-dash; those
					 * are comment starts, so operator must stop there.
					 * Note that slash-star or dash-dash at the first
					 * character will match a prior rule, not this one.
					 */
					int			nchars = yyleng;
					char	   *slashstar = strstr(yytext, "/*");
					char	   *dashdash = strstr(yytext, "--");

					if (slashstar && dashdash)
					{
						/* if both appear, take the first one */
						if (slashstar > dashdash)
							slashstar = dashdash;
					}
					else if (!slashstar)
						slashstar = dashdash;
					if (slashstar)
						nchars = slashstar - yytext;

					/*
					 * For SQL compatibility, '+' and '-' cannot be the
					 * last char of a multi-char operator unless the operator
					 * contains chars that are not in SQL operators.
					 * The idea is to lex '=-' as two operators, but not
					 * to forbid operator names like '?-' that could not be
					 * sequences of SQL operators.
					 */
					if (nchars > 1 &&
						(yytext[nchars - 1] == '+' ||
						 yytext[nchars - 1] == '-'))
					{
						int			ic;

						for (ic = nchars - 2; ic >= 0; ic--)
						{
							char c = yytext[ic];
							if (c == '~' || c == '!' || c == '@' ||
								c == '#' || c == '^' || c == '&' ||
								c == '|' || c == '`' || c == '?' ||
								c == '%')
								break;
						}
						if (ic < 0)
						{
							/*
							 * didn't find a qualifying character, so remove
							 * all trailing [+-]
							 */
							do {
								nchars--;
							} while (nchars > 1 &&
								 (yytext[nchars - 1] == '+' ||
								  yytext[nchars - 1] == '-'));
						}
					}

					SET_YYLLOC();

					if (nchars < yyleng)
					{
						/* Strip the unwanted chars from the token */
						yyless(nchars);
						/*
						 * If what we have left is only one char, and it's
						 * one of the characters matching "self", then
						 * return it as a character token the same way
						 * that the "self" rule would have.
						 */
						if (nchars == 1 &&
							strchr(",()[].;:+-*/%^<>=", yytext[0]))
							return yytext[0];
						/*
						 * Likewise, if what we have left is two chars, and
						 * those match the tokens ">=", "<=", "=>", "<>" or
						 * "!=", then we must return the appropriate token
						 * rather than the generic Op.
						 */
						if (nchars == 2)
						{
							if (yytext[0] == '=' && yytext[1] == '>')
								return EQUALS_GREATER;
							if (yytext[0] == '>' && yytext[1] == '=')
								return GREATER_EQUALS;
							if (yytext[0] == '<' && yytext[1] == '=')
								return LESS_EQUALS;
							if (yytext[0] == '<' && yytext[1] == '>')
								return NOT_EQUALS;
							if (yytext[0] == '!' && yytext[1] == '=')
								return NOT_EQUALS;
						}
					}

					/*
					 * Complain if operator is too long.  Unlike the case
					 * for identifiers, we make this an error not a notice-
					 * and-truncate, because the odds are we are looking at
					 * a syntactic mistake anyway.
					 */
					if (nchars >= NAMEDATALEN)
						yyerror("operator too long");

					yylval->str = pstrdup(yytext);
					return Op;
				}

{param}			{
					long		paramno;

					SET_YYLLOC();
					/*
					 * The text after "$" is all digits, so the conversion
					 * can only fail by being too large.  Reject a number
					 * that overflows long or does not survive being stored
					 * in ival, instead of silently truncating it to some
					 * other parameter number.
					 */
					errno = 0;
					paramno = strtol(yytext + 1, NULL, 10);
					yylval->ival = paramno;
					if (errno == ERANGE || yylval->ival != paramno)
						yyerror("parameter number too large");
					return PARAM;
				}

{integer}		{
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{decimal}		{
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{decimalfail}	{
					/* throw back the .., and treat as integer */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{real}			{
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{realfail1}		{
					/*
					 * throw back the [Ee], and figure out whether what
					 * remains is an {integer} or {decimal}.
					 */
					yyless(yyleng - 1);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{realfail2}		{
					/* throw back the [Ee][+-], and proceed as above */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}


{identifier}	{
					int			kwnum;
					char	   *ident;

					SET_YYLLOC();

					/* Is it a keyword? */
					kwnum = ScanKeywordLookup(yytext,
											  yyextra->keywordlist);
					if (kwnum >= 0)
					{
						yylval->keyword = GetScanKeyword(kwnum,
														 yyextra->keywordlist);
						return yyextra->keyword_tokens[kwnum];
					}

					/*
					 * No.  Convert the identifier to lower case, and truncate
					 * if necessary.
					 */
					ident = downcase_truncate_identifier(yytext, yyleng, true);
					yylval->str = ident;
					return IDENT;
				}

{other}			{
					SET_YYLLOC();
					return yytext[0];
				}

<<EOF>>			{
					SET_YYLLOC();
					yyterminate();
				}

%%

/* LCOV_EXCL_STOP */

/*
 * Arrange access to yyextra for subroutines of the main yylex() function.
1079 * We expect each subroutine to have a yyscanner parameter. Rather than 1080 * use the yyget_xxx functions, which might or might not get inlined by the 1081 * compiler, we cheat just a bit and cast yyscanner to the right type. 1082 */ 1083 #undef yyextra 1084 #define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r) 1085 1086 /* Likewise for a couple of other things we need. */ 1087 #undef yylloc 1088 #define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r) 1089 #undef yyleng 1090 #define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r) 1091 1092 1093 /* 1094 * scanner_errposition 1095 * Report a lexer or grammar error cursor position, if possible. 1096 * 1097 * This is expected to be used within an ereport() call. The return value 1098 * is a dummy (always 0, in fact). 1099 * 1100 * Note that this can only be used for messages emitted during raw parsing 1101 * (essentially, scan.l and gram.y), since it requires the yyscanner struct 1102 * to still be available. 1103 */ 1104 int 1105 scanner_errposition(int location, core_yyscan_t yyscanner) 1106 { 1107 int pos; 1108 1109 if (location < 0) 1110 return 0; /* no-op if location is unknown */ 1111 1112 /* Convert byte offset to character number */ 1113 pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1; 1114 /* And pass it to the ereport mechanism */ 1115 return errposition(pos); 1116 } 1117 1118 /* 1119 * scanner_yyerror 1120 * Report a lexer or grammar error. 1121 * 1122 * The message's cursor position is whatever YYLLOC was last set to, 1123 * ie, the start of the current token if called within yylex(), or the 1124 * most recently lexed token if called from the grammar. 1125 * This is OK for syntax error messages from the Bison parser, because Bison 1126 * parsers report error as soon as the first unparsable token is reached. 1127 * Beware of using yyerror for other purposes, as the cursor position might 1128 * be misleading! 
1129 */ 1130 void 1131 scanner_yyerror(const char *message, core_yyscan_t yyscanner) 1132 { 1133 const char *loc = yyextra->scanbuf + *yylloc; 1134 1135 if (*loc == YY_END_OF_BUFFER_CHAR) 1136 { 1137 ereport(ERROR, 1138 (errcode(ERRCODE_SYNTAX_ERROR), 1139 /* translator: %s is typically the translation of "syntax error" */ 1140 errmsg("%s at end of input", _(message)), 1141 lexer_errposition())); 1142 } 1143 else 1144 { 1145 ereport(ERROR, 1146 (errcode(ERRCODE_SYNTAX_ERROR), 1147 /* translator: first %s is typically the translation of "syntax error" */ 1148 errmsg("%s at or near \"%s\"", _(message), loc), 1149 lexer_errposition())); 1150 } 1151 } 1152 1153 1154 /* 1155 * Called before any actual parsing is done 1156 */ 1157 core_yyscan_t 1158 scanner_init(const char *str, 1159 core_yy_extra_type *yyext, 1160 const ScanKeywordList *keywordlist, 1161 const uint16 *keyword_tokens) 1162 { 1163 Size slen = strlen(str); 1164 yyscan_t scanner; 1165 1166 if (yylex_init(&scanner) != 0) 1167 elog(ERROR, "yylex_init() failed: %m"); 1168 1169 core_yyset_extra(yyext, scanner); 1170 1171 yyext->keywordlist = keywordlist; 1172 yyext->keyword_tokens = keyword_tokens; 1173 1174 yyext->backslash_quote = backslash_quote; 1175 yyext->escape_string_warning = escape_string_warning; 1176 yyext->standard_conforming_strings = standard_conforming_strings; 1177 1178 /* 1179 * Make a scan buffer with special termination needed by flex. 
1180 */ 1181 yyext->scanbuf = (char *) palloc(slen + 2); 1182 yyext->scanbuflen = slen; 1183 memcpy(yyext->scanbuf, str, slen); 1184 yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; 1185 yy_scan_buffer(yyext->scanbuf, slen + 2, scanner); 1186 1187 /* initialize literal buffer to a reasonable but expansible size */ 1188 yyext->literalalloc = 1024; 1189 yyext->literalbuf = (char *) palloc(yyext->literalalloc); 1190 yyext->literallen = 0; 1191 1192 return scanner; 1193 } 1194 1195 1196 /* 1197 * Called after parsing is done to clean up after scanner_init() 1198 */ 1199 void 1200 scanner_finish(core_yyscan_t yyscanner) 1201 { 1202 /* 1203 * We don't bother to call yylex_destroy(), because all it would do is 1204 * pfree a small amount of control storage. It's cheaper to leak the 1205 * storage until the parsing context is destroyed. The amount of space 1206 * involved is usually negligible compared to the output parse tree 1207 * anyway. 1208 * 1209 * We do bother to pfree the scanbuf and literal buffer, but only if they 1210 * represent a nontrivial amount of space. The 8K cutoff is arbitrary. 
1211 */ 1212 if (yyextra->scanbuflen >= 8192) 1213 pfree(yyextra->scanbuf); 1214 if (yyextra->literalalloc >= 8192) 1215 pfree(yyextra->literalbuf); 1216 } 1217 1218 1219 static void 1220 addlit(char *ytext, int yleng, core_yyscan_t yyscanner) 1221 { 1222 /* enlarge buffer if needed */ 1223 if ((yyextra->literallen + yleng) >= yyextra->literalalloc) 1224 { 1225 do 1226 { 1227 yyextra->literalalloc *= 2; 1228 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc); 1229 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf, 1230 yyextra->literalalloc); 1231 } 1232 /* append new data */ 1233 memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng); 1234 yyextra->literallen += yleng; 1235 } 1236 1237 1238 static void 1239 addlitchar(unsigned char ychar, core_yyscan_t yyscanner) 1240 { 1241 /* enlarge buffer if needed */ 1242 if ((yyextra->literallen + 1) >= yyextra->literalalloc) 1243 { 1244 yyextra->literalalloc *= 2; 1245 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf, 1246 yyextra->literalalloc); 1247 } 1248 /* append new data */ 1249 yyextra->literalbuf[yyextra->literallen] = ychar; 1250 yyextra->literallen += 1; 1251 } 1252 1253 1254 /* 1255 * Create a palloc'd copy of literalbuf, adding a trailing null. 1256 */ 1257 static char * 1258 litbufdup(core_yyscan_t yyscanner) 1259 { 1260 int llen = yyextra->literallen; 1261 char *new; 1262 1263 new = palloc(llen + 1); 1264 memcpy(new, yyextra->literalbuf, llen); 1265 new[llen] = '\0'; 1266 return new; 1267 } 1268 1269 /* 1270 * Process {integer}. Note this will also do the right thing with {decimal}, 1271 * ie digits and a decimal point. 
1272 */ 1273 static int 1274 process_integer_literal(const char *token, YYSTYPE *lval) 1275 { 1276 int val; 1277 char *endptr; 1278 1279 errno = 0; 1280 val = strtoint(token, &endptr, 10); 1281 if (*endptr != '\0' || errno == ERANGE) 1282 { 1283 /* integer too large (or contains decimal pt), treat it as a float */ 1284 lval->str = pstrdup(token); 1285 return FCONST; 1286 } 1287 lval->ival = val; 1288 return ICONST; 1289 } 1290 1291 static unsigned int 1292 hexval(unsigned char c) 1293 { 1294 if (c >= '0' && c <= '9') 1295 return c - '0'; 1296 if (c >= 'a' && c <= 'f') 1297 return c - 'a' + 0xA; 1298 if (c >= 'A' && c <= 'F') 1299 return c - 'A' + 0xA; 1300 elog(ERROR, "invalid hexadecimal digit"); 1301 return 0; /* not reached */ 1302 } 1303 1304 static void 1305 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner) 1306 { 1307 if (GetDatabaseEncoding() == PG_UTF8) 1308 return; 1309 1310 if (c > 0x7F) 1311 { 1312 ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */ 1313 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); 1314 } 1315 } 1316 1317 static bool 1318 is_utf16_surrogate_first(pg_wchar c) 1319 { 1320 return (c >= 0xD800 && c <= 0xDBFF); 1321 } 1322 1323 static bool 1324 is_utf16_surrogate_second(pg_wchar c) 1325 { 1326 return (c >= 0xDC00 && c <= 0xDFFF); 1327 } 1328 1329 static pg_wchar 1330 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) 1331 { 1332 return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); 1333 } 1334 1335 static void 1336 addunicode(pg_wchar c, core_yyscan_t yyscanner) 1337 { 1338 char buf[8]; 1339 1340 if (c == 0 || c > 0x10FFFF) 1341 yyerror("invalid Unicode escape value"); 1342 if (c > 0x7F) 1343 { 1344 if (GetDatabaseEncoding() != PG_UTF8) 1345 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); 1346 yyextra->saw_non_ascii = true; 1347 } 1348 unicode_to_utf8(c, 
(unsigned char *) buf); 1349 addlit(buf, pg_mblen(buf), yyscanner); 1350 } 1351 1352 /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ 1353 static bool 1354 check_uescapechar(unsigned char escape) 1355 { 1356 if (isxdigit(escape) 1357 || escape == '+' 1358 || escape == '\'' 1359 || escape == '"' 1360 || scanner_isspace(escape)) 1361 { 1362 return false; 1363 } 1364 else 1365 return true; 1366 } 1367 1368 /* like litbufdup, but handle unicode escapes */ 1369 static char * 1370 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) 1371 { 1372 char *new; 1373 char *litbuf, 1374 *in, 1375 *out; 1376 pg_wchar pair_first = 0; 1377 1378 /* Make literalbuf null-terminated to simplify the scanning loop */ 1379 litbuf = yyextra->literalbuf; 1380 litbuf[yyextra->literallen] = '\0'; 1381 1382 /* 1383 * This relies on the subtle assumption that a UTF-8 expansion cannot be 1384 * longer than its escaped representation. 1385 */ 1386 new = palloc(yyextra->literallen + 1); 1387 1388 in = litbuf; 1389 out = new; 1390 while (*in) 1391 { 1392 if (in[0] == escape) 1393 { 1394 if (in[1] == escape) 1395 { 1396 if (pair_first) 1397 { 1398 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ 1399 yyerror("invalid Unicode surrogate pair"); 1400 } 1401 *out++ = escape; 1402 in += 2; 1403 } 1404 else if (isxdigit((unsigned char) in[1]) && 1405 isxdigit((unsigned char) in[2]) && 1406 isxdigit((unsigned char) in[3]) && 1407 isxdigit((unsigned char) in[4])) 1408 { 1409 pg_wchar unicode; 1410 1411 unicode = (hexval(in[1]) << 12) + 1412 (hexval(in[2]) << 8) + 1413 (hexval(in[3]) << 4) + 1414 hexval(in[4]); 1415 check_unicode_value(unicode, in, yyscanner); 1416 if (pair_first) 1417 { 1418 if (is_utf16_surrogate_second(unicode)) 1419 { 1420 unicode = surrogate_pair_to_codepoint(pair_first, unicode); 1421 pair_first = 0; 1422 } 1423 else 1424 { 1425 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ 1426 yyerror("invalid Unicode surrogate pair"); 1427 } 1428 } 1429 else 
if (is_utf16_surrogate_second(unicode)) 1430 yyerror("invalid Unicode surrogate pair"); 1431 1432 if (is_utf16_surrogate_first(unicode)) 1433 pair_first = unicode; 1434 else 1435 { 1436 unicode_to_utf8(unicode, (unsigned char *) out); 1437 out += pg_mblen(out); 1438 } 1439 in += 5; 1440 } 1441 else if (in[1] == '+' && 1442 isxdigit((unsigned char) in[2]) && 1443 isxdigit((unsigned char) in[3]) && 1444 isxdigit((unsigned char) in[4]) && 1445 isxdigit((unsigned char) in[5]) && 1446 isxdigit((unsigned char) in[6]) && 1447 isxdigit((unsigned char) in[7])) 1448 { 1449 pg_wchar unicode; 1450 1451 unicode = (hexval(in[2]) << 20) + 1452 (hexval(in[3]) << 16) + 1453 (hexval(in[4]) << 12) + 1454 (hexval(in[5]) << 8) + 1455 (hexval(in[6]) << 4) + 1456 hexval(in[7]); 1457 check_unicode_value(unicode, in, yyscanner); 1458 if (pair_first) 1459 { 1460 if (is_utf16_surrogate_second(unicode)) 1461 { 1462 unicode = surrogate_pair_to_codepoint(pair_first, unicode); 1463 pair_first = 0; 1464 } 1465 else 1466 { 1467 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ 1468 yyerror("invalid Unicode surrogate pair"); 1469 } 1470 } 1471 else if (is_utf16_surrogate_second(unicode)) 1472 yyerror("invalid Unicode surrogate pair"); 1473 1474 if (is_utf16_surrogate_first(unicode)) 1475 pair_first = unicode; 1476 else 1477 { 1478 unicode_to_utf8(unicode, (unsigned char *) out); 1479 out += pg_mblen(out); 1480 } 1481 in += 8; 1482 } 1483 else 1484 { 1485 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ 1486 yyerror("invalid Unicode escape value"); 1487 } 1488 } 1489 else 1490 { 1491 if (pair_first) 1492 { 1493 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ 1494 yyerror("invalid Unicode surrogate pair"); 1495 } 1496 *out++ = *in++; 1497 } 1498 } 1499 1500 /* unfinished surrogate pair? 
*/ 1501 if (pair_first) 1502 { 1503 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ 1504 yyerror("invalid Unicode surrogate pair"); 1505 } 1506 1507 *out = '\0'; 1508 1509 /* 1510 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII 1511 * codes; but it's probably not worth the trouble, since this isn't likely 1512 * to be a performance-critical path. 1513 */ 1514 pg_verifymbstr(new, out - new, false); 1515 return new; 1516 } 1517 1518 static unsigned char 1519 unescape_single_char(unsigned char c, core_yyscan_t yyscanner) 1520 { 1521 switch (c) 1522 { 1523 case 'b': 1524 return '\b'; 1525 case 'f': 1526 return '\f'; 1527 case 'n': 1528 return '\n'; 1529 case 'r': 1530 return '\r'; 1531 case 't': 1532 return '\t'; 1533 default: 1534 /* check for backslash followed by non-7-bit-ASCII */ 1535 if (c == '\0' || IS_HIGHBIT_SET(c)) 1536 yyextra->saw_non_ascii = true; 1537 1538 return c; 1539 } 1540 } 1541 1542 static void 1543 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner) 1544 { 1545 if (ychar == '\'') 1546 { 1547 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) 1548 ereport(WARNING, 1549 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), 1550 errmsg("nonstandard use of \\' in a string literal"), 1551 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."), 1552 lexer_errposition())); 1553 yyextra->warn_on_first_escape = false; /* warn only once per string */ 1554 } 1555 else if (ychar == '\\') 1556 { 1557 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) 1558 ereport(WARNING, 1559 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), 1560 errmsg("nonstandard use of \\\\ in a string literal"), 1561 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."), 1562 lexer_errposition())); 1563 yyextra->warn_on_first_escape = false; /* warn only once per string */ 1564 } 1565 else 1566 check_escape_warning(yyscanner); 1567 } 1568 1569 static 
void 1570 check_escape_warning(core_yyscan_t yyscanner) 1571 { 1572 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning) 1573 ereport(WARNING, 1574 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER), 1575 errmsg("nonstandard use of escape in a string literal"), 1576 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."), 1577 lexer_errposition())); 1578 yyextra->warn_on_first_escape = false; /* warn only once per string */ 1579 } 1580 1581 /* 1582 * Interface functions to make flex use palloc() instead of malloc(). 1583 * It'd be better to make these static, but flex insists otherwise. 1584 */ 1585 1586 void * 1587 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner) 1588 { 1589 return palloc(bytes); 1590 } 1591 1592 void * 1593 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner) 1594 { 1595 if (ptr) 1596 return repalloc(ptr, bytes); 1597 else 1598 return palloc(bytes); 1599 } 1600 1601 void 1602 core_yyfree(void *ptr, core_yyscan_t yyscanner) 1603 { 1604 if (ptr) 1605 pfree(ptr); 1606 } 1607