1 %top{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l
5  *	  lexical scanner for PostgreSQL
6  *
7  * NOTE NOTE NOTE:
8  *
9  * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
10  *
11  * The rules are designed so that the scanner never has to backtrack,
12  * in the sense that there is always a rule that can match the input
13  * consumed so far (the rule action may internally throw back some input
14  * with yyless(), however).  As explained in the flex manual, this makes
15  * for a useful speed increase --- about a third faster than a plain -CF
16  * lexer, in simple testing.  The extra complexity is mostly in the rules
17  * for handling float numbers and continued string literals.  If you change
18  * the lexical rules, verify that you haven't broken the no-backtrack
19  * property by running flex with the "-b" option and checking that the
20  * resulting "lex.backup" file says that no backing up is needed.  (As of
21  * Postgres 9.2, this check is made automatically by the Makefile.)
22  *
23  *
24  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
25  * Portions Copyright (c) 1994, Regents of the University of California
26  *
27  * IDENTIFICATION
28  *	  src/backend/parser/scan.l
29  *
30  *-------------------------------------------------------------------------
31  */
32 #include "postgres.h"
33 
34 #include <ctype.h>
35 #include <unistd.h>
36 
37 #include "common/string.h"
38 #include "parser/gramparse.h"
39 #include "parser/parser.h"		/* only needed for GUC variables */
40 #include "parser/scansup.h"
41 #include "mb/pg_wchar.h"
42 }
43 
44 %{
45 
46 /* LCOV_EXCL_START */
47 
48 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
49 #undef fprintf
50 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
51 
/*
 * Stand-in for fprintf() within the flex-generated scanner code.
 *
 * The fprintf() macro defined just above routes the scanner's
 * three-argument fprintf(file, fmt, msg) calls to this function, so that a
 * fatal scanner error is raised through ereport(ERROR) rather than ending
 * in exit() (see yy_fatal_error).
 *
 * fmt: format string passed by the caller.  Deliberately unused: the
 *		message is always emitted through a fixed "%s" format, so nothing
 *		in msg is ever interpreted as a conversion specification.
 * msg: text of the error to report.
 */
static void
fprintf_to_ereport(const char *fmt, const char *msg)
{
	ereport(ERROR, (errmsg_internal("%s", msg)));
}
57 
58 /*
59  * GUC variables.  This is a DIRECT violation of the warning given at the
60  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
61  * as such, changing their values can induce very unintuitive behavior.
62  * But we shall have to live with it until we can remove these variables.
63  */
64 int			backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
65 bool		escape_string_warning = true;
66 bool		standard_conforming_strings = true;
67 
68 /*
69  * Set the type of YYSTYPE.
70  */
71 #define YYSTYPE core_YYSTYPE
72 
73 /*
74  * Set the type of yyextra.  All state variables used by the scanner should
75  * be in yyextra, *not* statically allocated.
76  */
77 #define YY_EXTRA_TYPE core_yy_extra_type *
78 
79 /*
80  * Each call to yylex must set yylloc to the location of the found token
81  * (expressed as a byte offset from the start of the input text).
82  * When we parse a token that requires multiple lexer rules to process,
83  * this should be done in the first such rule, else yylloc will point
84  * into the middle of the token.
85  */
86 #define SET_YYLLOC()  (*(yylloc) = yytext - yyextra->scanbuf)
87 
88 /*
89  * Advance yylloc by the given number of bytes.
90  */
91 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )
92 
93 #define startlit()	( yyextra->literallen = 0 )
94 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
95 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
96 static char *litbufdup(core_yyscan_t yyscanner);
97 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
98 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
99 static int	process_integer_literal(const char *token, YYSTYPE *lval);
100 static bool is_utf16_surrogate_first(pg_wchar c);
101 static bool is_utf16_surrogate_second(pg_wchar c);
102 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
103 static void addunicode(pg_wchar c, yyscan_t yyscanner);
104 static bool check_uescapechar(unsigned char escape);
105 
106 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
107 
108 #define lexer_errposition()  scanner_errposition(*(yylloc), yyscanner)
109 
110 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
111 static void check_escape_warning(core_yyscan_t yyscanner);
112 
113 /*
114  * Work around a bug in flex 2.5.35: it emits a couple of functions that
115  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
116  * this would cause warnings.  Providing our own declarations should be
117  * harmless even when the bug gets fixed.
118  */
119 extern int	core_yyget_column(yyscan_t yyscanner);
120 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
121 
122 %}
123 
124 %option reentrant
125 %option bison-bridge
126 %option bison-locations
127 %option 8bit
128 %option never-interactive
129 %option nodefault
130 %option noinput
131 %option nounput
132 %option noyywrap
133 %option noyyalloc
134 %option noyyrealloc
135 %option noyyfree
136 %option warn
137 %option prefix="core_yy"
138 
139 /*
140  * OK, here is a short description of lex/flex rules behavior.
141  * The longest pattern which matches an input string is always chosen.
142  * For equal-length patterns, the first occurring in the rules list is chosen.
143  * INITIAL is the starting state, to which all non-conditional rules apply.
144  * Exclusive states change parsing rules while the state is active.  When in
145  * an exclusive state, only those rules defined for that state apply.
146  *
147  * We use exclusive states for quoted strings, extended comments,
148  * and to eliminate parsing troubles for numeric strings.
149  * Exclusive states:
150  *  <xb> bit string literal
151  *  <xc> extended C-style comments
152  *  <xd> delimited identifiers (double-quoted identifiers)
153  *  <xh> hexadecimal numeric string
154  *  <xq> standard quoted strings
155  *  <xe> extended quoted strings (support backslash escape sequences)
156  *  <xdolq> $foo$ quoted strings
157  *  <xui> quoted identifier with Unicode escapes
158  *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
159  *  <xus> quoted string with Unicode escapes
160  *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
161  *  <xeu> Unicode surrogate pair in extended quoted string
162  *
163  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
164  * The default one is probably not the right thing.
165  */
166 
167 %x xb
168 %x xc
169 %x xd
170 %x xh
171 %x xe
172 %x xq
173 %x xdolq
174 %x xui
175 %x xuiend
176 %x xus
177 %x xusend
178 %x xeu
179 
180 /*
181  * In order to make the world safe for Windows and Mac clients as well as
182  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
183  * sequence will be seen as two successive newlines, but that doesn't cause
184  * any problems.  Comments that start with -- and extend to the next
185  * newline are treated as equivalent to a single whitespace character.
186  *
187  * NOTE a fine point: if there is no newline following --, we will absorb
188  * everything to the end of the input as a comment.  This is correct.  Older
189  * versions of Postgres failed to recognize -- as a comment if the input
190  * did not end with a newline.
191  *
192  * XXX perhaps \f (formfeed) should be treated as a newline as well?
193  *
194  * XXX if you change the set of whitespace characters, fix scanner_isspace()
195  * to agree, and see also the plpgsql lexer.
196  */
197 
198 space			[ \t\n\r\f]
199 horiz_space		[ \t\f]
200 newline			[\n\r]
201 non_newline		[^\n\r]
202 
203 comment			("--"{non_newline}*)
204 
205 whitespace		({space}+|{comment})
206 
207 /*
208  * SQL requires at least one newline in the whitespace separating
209  * string literals that are to be concatenated.  Silly, but who are we
210  * to argue?  Note that {whitespace_with_newline} should not have * after
211  * it, whereas {whitespace} should generally have a * after it...
212  */
213 
214 special_whitespace		({space}+|{comment}{newline})
215 horiz_whitespace		({horiz_space}|{comment})
216 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
217 
218 /*
219  * To ensure that {quotecontinue} can be scanned without having to back up
220  * if the full pattern isn't matched, we include trailing whitespace in
221  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
222  * except for {quote} followed by whitespace and just one "-" (not two,
223  * which would start a {comment}).  To cover that we have {quotefail}.
224  * The actions for {quotestop} and {quotefail} must throw back characters
225  * beyond the quote proper.
226  */
227 quote			'
228 quotestop		{quote}{whitespace}*
229 quotecontinue	{quote}{whitespace_with_newline}{quote}
230 quotefail		{quote}{whitespace}*"-"
231 
232 /* Bit string
233  * It is tempting to scan the string for only those characters
234  * which are allowed. However, this leads to silently swallowed
235  * characters if illegal characters are included in the string.
236  * For example, if xbinside is [01] then B'ABCD' is interpreted
237  * as a zero-length string, and the ABCD' is lost!
238  * Better to pass the string forward and let the input routines
239  * validate the contents.
240  */
241 xbstart			[bB]{quote}
242 xbinside		[^']*
243 
244 /* Hexadecimal number */
245 xhstart			[xX]{quote}
246 xhinside		[^']*
247 
248 /* National character */
249 xnstart			[nN]{quote}
250 
251 /* Quoted string that allows backslash escapes */
252 xestart			[eE]{quote}
253 xeinside		[^\\']+
254 xeescape		[\\][^0-7]
255 xeoctesc		[\\][0-7]{1,3}
256 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
257 xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
258 xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
259 
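/* Standard quoted string
 * These definitions are also used by the <xe> and <xus> states.
 * xqdouble implements an embedded quote: two adjacent quotes inside a
 * literal stand for one quote character, so '''' is a one-quote string.
 */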
263 xqstart			{quote}
264 xqdouble		{quote}{quote}
265 xqinside		[^']+
266 
267 /* $foo$ style quotes ("dollar quoting")
268  * The quoted string starts with $foo$ where "foo" is an optional string
269  * in the form of an identifier, except that it may not contain "$",
270  * and extends to the first occurrence of an identical string.
271  * There is *no* processing of the quoted text.
272  *
273  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
274  * fails to match its trailing "$".
275  */
276 dolq_start		[A-Za-z\200-\377_]
277 dolq_cont		[A-Za-z\200-\377_0-9]
278 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
279 dolqfailed		\${dolq_start}{dolq_cont}*
280 dolqinside		[^$]+
281 
282 /* Double quote
283  * Allows embedded spaces and other special characters into identifiers.
284  */
285 dquote			\"
286 xdstart			{dquote}
287 xdstop			{dquote}
288 xddouble		{dquote}{dquote}
289 xdinside		[^"]+
290 
291 /* Unicode escapes */
292 uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
293 /* error rule to avoid backup */
294 uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
295 
296 /* Quoted identifier with Unicode escapes */
297 xuistart		[uU]&{dquote}
298 
299 /* Quoted string with Unicode escapes */
300 xusstart		[uU]&{quote}
301 
302 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
303 xustop1		{uescapefail}?
304 xustop2		{uescape}
305 
306 /* error rule to avoid backup */
307 xufailed		[uU]&
308 
309 
310 /* C-style comments
311  *
312  * The "extended comment" syntax closely resembles allowable operator syntax.
313  * The tricky part here is to get lex to recognize a string starting with
314  * slash-star as a comment, when interpreting it as an operator would produce
315  * a longer match --- remember lex will prefer a longer match!  Also, if we
316  * have something like plus-slash-star, lex will think this is a 3-character
317  * operator whereas we want to see it as a + operator and a comment start.
318  * The solution is two-fold:
319  * 1. append {op_chars}* to xcstart so that it matches as much text as
320  *    {operator} would. Then the tie-breaker (first matching rule of same
321  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
322  *    in case it contains a star-slash that should terminate the comment.
323  * 2. In the operator rule, check for slash-star within the operator, and
324  *    if found throw it back with yyless().  This handles the plus-slash-star
325  *    problem.
326  * Dash-dash comments have similar interactions with the operator rule.
327  */
328 xcstart			\/\*{op_chars}*
329 xcstop			\*+\/
330 xcinside		[^*/]+
331 
332 digit			[0-9]
333 ident_start		[A-Za-z\200-\377_]
334 ident_cont		[A-Za-z\200-\377_0-9\$]
335 
336 identifier		{ident_start}{ident_cont}*
337 
338 /* Assorted special-case operators and operator-like tokens */
339 typecast		"::"
340 dot_dot			\.\.
341 colon_equals	":="
342 
343 /*
344  * These operator-like tokens (unlike the above ones) also match the {operator}
345  * rule, which means that they might be overridden by a longer match if they
346  * are followed by a comment start or a + or - character. Accordingly, if you
347  * add to this list, you must also add corresponding code to the {operator}
348  * block to return the correct token in such cases. (This is not needed in
349  * psqlscan.l since the token value is ignored there.)
350  */
351 equals_greater	"=>"
352 less_equals		"<="
353 greater_equals	">="
354 less_greater	"<>"
355 not_equals		"!="
356 
357 /*
358  * "self" is the set of chars that should be returned as single-character
359  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
360  * which can be one or more characters long (but if a single-char token
361  * appears in the "self" set, it is not to be returned as an Op).  Note
362  * that the sets overlap, but each has some chars that are not in the other.
363  *
364  * If you change either set, adjust the character lists appearing in the
365  * rule for "operator"!
366  */
367 self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
368 op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
369 operator		{op_chars}+
370 
/*
 * We no longer allow unary minus in numbers.  Instead we pass it
 * separately to the parser, where it gets coerced via doNegate()
 * -- Leon aug 20 1999
 *
 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
 *
 * {realfail1} and {realfail2} are added to prevent the need for scanner
 * backup when the {real} rule fails to match completely.
 */
380 
381 integer			{digit}+
382 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
383 decimalfail		{digit}+\.\.
384 real			({integer}|{decimal})[Ee][-+]?{digit}+
385 realfail1		({integer}|{decimal})[Ee]
386 realfail2		({integer}|{decimal})[Ee][-+]
387 
388 param			\${integer}
389 
390 other			.
391 
392 /*
393  * Dollar quoted strings are totally opaque, and no escaping is done on them.
394  * Other quoted strings must allow some special characters such as single-quote
395  *  and newline.
396  * Embedded single-quotes are implemented both in the SQL standard
397  *  style of two adjacent single quotes "''" and in the Postgres/Java style
398  *  of escaped-quote "\'".
399  * Other embedded escaped characters are matched explicitly and the leading
400  *  backslash is dropped from the string.
401  * Note that xcstart must appear before operator, as explained above!
402  *  Also whitespace (comment) must appear before operator.
403  */
404 
405 %%
406 
407 {whitespace}	{
408 					/* ignore */
409 				}
410 
411 {xcstart}		{
412 					/* Set location in case of syntax error in comment */
413 					SET_YYLLOC();
414 					yyextra->xcdepth = 0;
415 					BEGIN(xc);
416 					/* Put back any characters past slash-star; see above */
417 					yyless(2);
418 				}
419 
420 <xc>{xcstart}	{
421 					(yyextra->xcdepth)++;
422 					/* Put back any characters past slash-star; see above */
423 					yyless(2);
424 				}
425 
<xc>{xcstop}	{
					/*
					 * A star-slash closes the innermost open comment: drop
					 * one nesting level if any are pending, otherwise leave
					 * the comment state altogether.
					 */
					if (yyextra->xcdepth > 0)
						(yyextra->xcdepth)--;
					else
						BEGIN(INITIAL);
				}
432 
433 <xc>{xcinside}	{
434 					/* ignore */
435 				}
436 
437 <xc>{op_chars}	{
438 					/* ignore */
439 				}
440 
441 <xc>\*+			{
442 					/* ignore */
443 				}
444 
445 <xc><<EOF>>		{ yyerror("unterminated /* comment"); }
446 
447 {xbstart}		{
448 					/* Binary bit type.
449 					 * At some point we should simply pass the string
450 					 * forward to the parser and label it there.
451 					 * In the meantime, place a leading "b" on the string
452 					 * to mark it for the input routine as a binary string.
453 					 */
454 					SET_YYLLOC();
455 					BEGIN(xb);
456 					startlit();
457 					addlitchar('b', yyscanner);
458 				}
459 <xb>{quotestop}	|
460 <xb>{quotefail} {
461 					yyless(1);
462 					BEGIN(INITIAL);
463 					yylval->str = litbufdup(yyscanner);
464 					return BCONST;
465 				}
466 <xh>{xhinside}	|
467 <xb>{xbinside}	{
468 					addlit(yytext, yyleng, yyscanner);
469 				}
470 <xh>{quotecontinue}	|
471 <xb>{quotecontinue}	{
472 					/* ignore */
473 				}
474 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
475 
476 {xhstart}		{
477 					/* Hexadecimal bit type.
478 					 * At some point we should simply pass the string
479 					 * forward to the parser and label it there.
480 					 * In the meantime, place a leading "x" on the string
481 					 * to mark it for the input routine as a hex string.
482 					 */
483 					SET_YYLLOC();
484 					BEGIN(xh);
485 					startlit();
486 					addlitchar('x', yyscanner);
487 				}
488 <xh>{quotestop}	|
489 <xh>{quotefail} {
490 					yyless(1);
491 					BEGIN(INITIAL);
492 					yylval->str = litbufdup(yyscanner);
493 					return XCONST;
494 				}
495 <xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }
496 
{xnstart}		{
					/*
					 * National character string.
					 * Only the leading "n" is consumed here; the quote is
					 * thrown back so that the rest is scanned as a normal
					 * character string.  The "n" is reported as the NCHAR
					 * keyword when the keyword list has one, and as the
					 * plain identifier "n" otherwise.
					 */
					const ScanKeyword *nchar_kw;

					SET_YYLLOC();
					yyless(1);	/* keep just the 'n' for this token */

					nchar_kw = ScanKeywordLookup("nchar",
												 yyextra->keywords,
												 yyextra->num_keywords);
					if (nchar_kw == NULL)
					{
						/* NCHAR isn't a keyword here, so hand back "n" */
						yylval->str = pstrdup("n");
						return IDENT;
					}

					yylval->keyword = nchar_kw->name;
					return nchar_kw->value;
				}
522 
523 {xqstart}		{
524 					yyextra->warn_on_first_escape = true;
525 					yyextra->saw_non_ascii = false;
526 					SET_YYLLOC();
527 					if (yyextra->standard_conforming_strings)
528 						BEGIN(xq);
529 					else
530 						BEGIN(xe);
531 					startlit();
532 				}
533 {xestart}		{
534 					yyextra->warn_on_first_escape = false;
535 					yyextra->saw_non_ascii = false;
536 					SET_YYLLOC();
537 					BEGIN(xe);
538 					startlit();
539 				}
540 {xusstart}		{
541 					SET_YYLLOC();
542 					if (!yyextra->standard_conforming_strings)
543 						ereport(ERROR,
544 								(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
545 								 errmsg("unsafe use of string constant with Unicode escapes"),
546 								 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
547 								 lexer_errposition()));
548 					BEGIN(xus);
549 					startlit();
550 				}
551 <xq,xe>{quotestop}	|
552 <xq,xe>{quotefail} {
553 					yyless(1);
554 					BEGIN(INITIAL);
555 					/*
556 					 * check that the data remains valid if it might have been
557 					 * made invalid by unescaping any chars.
558 					 */
559 					if (yyextra->saw_non_ascii)
560 						pg_verifymbstr(yyextra->literalbuf,
561 									   yyextra->literallen,
562 									   false);
563 					yylval->str = litbufdup(yyscanner);
564 					return SCONST;
565 				}
566 <xus>{quotestop} |
567 <xus>{quotefail} {
568 					/* throw back all but the quote */
569 					yyless(1);
570 					/* xusend state looks for possible UESCAPE */
571 					BEGIN(xusend);
572 				}
573 <xusend>{whitespace} {
574 					/* stay in xusend state over whitespace */
575 				}
576 <xusend><<EOF>> |
577 <xusend>{other} |
578 <xusend>{xustop1} {
579 					/* no UESCAPE after the quote, throw back everything */
580 					yyless(0);
581 					BEGIN(INITIAL);
582 					yylval->str = litbuf_udeescape('\\', yyscanner);
583 					return SCONST;
584 				}
585 <xusend>{xustop2} {
586 					/* found UESCAPE after the end quote */
587 					BEGIN(INITIAL);
588 					if (!check_uescapechar(yytext[yyleng - 2]))
589 					{
590 						SET_YYLLOC();
591 						ADVANCE_YYLLOC(yyleng - 2);
592 						yyerror("invalid Unicode escape character");
593 					}
594 					yylval->str = litbuf_udeescape(yytext[yyleng - 2],
595 												   yyscanner);
596 					return SCONST;
597 				}
598 <xq,xe,xus>{xqdouble} {
599 					addlitchar('\'', yyscanner);
600 				}
601 <xq,xus>{xqinside}  {
602 					addlit(yytext, yyleng, yyscanner);
603 				}
604 <xe>{xeinside}  {
605 					addlit(yytext, yyleng, yyscanner);
606 				}
607 <xe>{xeunicode} {
608 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);
609 
610 					check_escape_warning(yyscanner);
611 
612 					if (is_utf16_surrogate_first(c))
613 					{
614 						yyextra->utf16_first_part = c;
615 						BEGIN(xeu);
616 					}
617 					else if (is_utf16_surrogate_second(c))
618 						yyerror("invalid Unicode surrogate pair");
619 					else
620 						addunicode(c, yyscanner);
621 				}
622 <xeu>{xeunicode} {
623 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);
624 
625 					if (!is_utf16_surrogate_second(c))
626 						yyerror("invalid Unicode surrogate pair");
627 
628 					c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
629 
630 					addunicode(c, yyscanner);
631 
632 					BEGIN(xe);
633 				}
634 <xeu>.			{ yyerror("invalid Unicode surrogate pair"); }
635 <xeu>\n			{ yyerror("invalid Unicode surrogate pair"); }
636 <xeu><<EOF>>	{ yyerror("invalid Unicode surrogate pair"); }
637 <xe,xeu>{xeunicodefail}	{
638 					ereport(ERROR,
639 							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
640 							 errmsg("invalid Unicode escape"),
641 							 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
642 							 lexer_errposition()));
643 				}
644 <xe>{xeescape}  {
645 					if (yytext[1] == '\'')
646 					{
647 						if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
648 							(yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
649 							 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
650 							ereport(ERROR,
651 									(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
652 									 errmsg("unsafe use of \\' in a string literal"),
653 									 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
654 									 lexer_errposition()));
655 					}
656 					check_string_escape_warning(yytext[1], yyscanner);
657 					addlitchar(unescape_single_char(yytext[1], yyscanner),
658 							   yyscanner);
659 				}
660 <xe>{xeoctesc}  {
661 					unsigned char c = strtoul(yytext + 1, NULL, 8);
662 
663 					check_escape_warning(yyscanner);
664 					addlitchar(c, yyscanner);
665 					if (c == '\0' || IS_HIGHBIT_SET(c))
666 						yyextra->saw_non_ascii = true;
667 				}
668 <xe>{xehexesc}  {
669 					unsigned char c = strtoul(yytext + 2, NULL, 16);
670 
671 					check_escape_warning(yyscanner);
672 					addlitchar(c, yyscanner);
673 					if (c == '\0' || IS_HIGHBIT_SET(c))
674 						yyextra->saw_non_ascii = true;
675 				}
676 <xq,xe,xus>{quotecontinue} {
677 					/* ignore */
678 				}
679 <xe>.			{
680 					/* This is only needed for \ just before EOF */
681 					addlitchar(yytext[0], yyscanner);
682 				}
683 <xq,xe,xus><<EOF>>		{ yyerror("unterminated quoted string"); }
684 
685 {dolqdelim}		{
686 					SET_YYLLOC();
687 					yyextra->dolqstart = pstrdup(yytext);
688 					BEGIN(xdolq);
689 					startlit();
690 				}
691 {dolqfailed}	{
692 					SET_YYLLOC();
693 					/* throw back all but the initial "$" */
694 					yyless(1);
695 					/* and treat it as {other} */
696 					return yytext[0];
697 				}
<xdolq>{dolqdelim} {
					/*
					 * A $...$ sequence inside a dollar-quoted string either
					 * is the closing delimiter (it equals the opening one
					 * saved in dolqstart) or is just part of the text.
					 */
					if (strcmp(yytext, yyextra->dolqstart) != 0)
					{
						/*
						 * Not our delimiter: copy the $... part to the
						 * literal, but push the trailing $ back so it is
						 * rescanned, since it may begin the real closing
						 * delimiter.  Consider $delim$...$junk$delim$
						 */
						addlit(yytext, yyleng - 1, yyscanner);
						yyless(yyleng - 1);
					}
					else
					{
						/* Closing delimiter: release the saved opener */
						pfree(yyextra->dolqstart);
						yyextra->dolqstart = NULL;
						BEGIN(INITIAL);
						yylval->str = litbufdup(yyscanner);
						return SCONST;
					}
				}
718 <xdolq>{dolqinside} {
719 					addlit(yytext, yyleng, yyscanner);
720 				}
721 <xdolq>{dolqfailed} {
722 					addlit(yytext, yyleng, yyscanner);
723 				}
724 <xdolq>.		{
725 					/* This is only needed for $ inside the quoted text */
726 					addlitchar(yytext[0], yyscanner);
727 				}
728 <xdolq><<EOF>>	{ yyerror("unterminated dollar-quoted string"); }
729 
730 {xdstart}		{
731 					SET_YYLLOC();
732 					BEGIN(xd);
733 					startlit();
734 				}
735 {xuistart}		{
736 					SET_YYLLOC();
737 					BEGIN(xui);
738 					startlit();
739 				}
740 <xd>{xdstop}	{
741 					char	   *ident;
742 
743 					BEGIN(INITIAL);
744 					if (yyextra->literallen == 0)
745 						yyerror("zero-length delimited identifier");
746 					ident = litbufdup(yyscanner);
747 					if (yyextra->literallen >= NAMEDATALEN)
748 						truncate_identifier(ident, yyextra->literallen, true);
749 					yylval->str = ident;
750 					return IDENT;
751 				}
752 <xui>{dquote} {
753 					yyless(1);
754 					/* xuiend state looks for possible UESCAPE */
755 					BEGIN(xuiend);
756 				}
757 <xuiend>{whitespace} {
758 					/* stay in xuiend state over whitespace */
759 				}
760 <xuiend><<EOF>> |
761 <xuiend>{other} |
762 <xuiend>{xustop1} {
763 					/* no UESCAPE after the quote, throw back everything */
764 					char	   *ident;
765 					int			identlen;
766 
767 					yyless(0);
768 
769 					BEGIN(INITIAL);
770 					if (yyextra->literallen == 0)
771 						yyerror("zero-length delimited identifier");
772 					ident = litbuf_udeescape('\\', yyscanner);
773 					identlen = strlen(ident);
774 					if (identlen >= NAMEDATALEN)
775 						truncate_identifier(ident, identlen, true);
776 					yylval->str = ident;
777 					return IDENT;
778 				}
779 <xuiend>{xustop2}	{
780 					/* found UESCAPE after the end quote */
781 					char	   *ident;
782 					int			identlen;
783 
784 					BEGIN(INITIAL);
785 					if (yyextra->literallen == 0)
786 						yyerror("zero-length delimited identifier");
787 					if (!check_uescapechar(yytext[yyleng - 2]))
788 					{
789 						SET_YYLLOC();
790 						ADVANCE_YYLLOC(yyleng - 2);
791 						yyerror("invalid Unicode escape character");
792 					}
793 					ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
794 					identlen = strlen(ident);
795 					if (identlen >= NAMEDATALEN)
796 						truncate_identifier(ident, identlen, true);
797 					yylval->str = ident;
798 					return IDENT;
799 				}
800 <xd,xui>{xddouble}	{
801 					addlitchar('"', yyscanner);
802 				}
803 <xd,xui>{xdinside}	{
804 					addlit(yytext, yyleng, yyscanner);
805 				}
806 <xd,xui><<EOF>>		{ yyerror("unterminated quoted identifier"); }
807 
808 {xufailed}	{
809 					char	   *ident;
810 
811 					SET_YYLLOC();
812 					/* throw back all but the initial u/U */
813 					yyless(1);
814 					/* and treat it as {identifier} */
815 					ident = downcase_truncate_identifier(yytext, yyleng, true);
816 					yylval->str = ident;
817 					return IDENT;
818 				}
819 
820 {typecast}		{
821 					SET_YYLLOC();
822 					return TYPECAST;
823 				}
824 
825 {dot_dot}		{
826 					SET_YYLLOC();
827 					return DOT_DOT;
828 				}
829 
830 {colon_equals}	{
831 					SET_YYLLOC();
832 					return COLON_EQUALS;
833 				}
834 
835 {equals_greater} {
836 					SET_YYLLOC();
837 					return EQUALS_GREATER;
838 				}
839 
840 {less_equals}	{
841 					SET_YYLLOC();
842 					return LESS_EQUALS;
843 				}
844 
845 {greater_equals} {
846 					SET_YYLLOC();
847 					return GREATER_EQUALS;
848 				}
849 
850 {less_greater}	{
851 					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
852 					SET_YYLLOC();
853 					return NOT_EQUALS;
854 				}
855 
856 {not_equals}	{
857 					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
858 					SET_YYLLOC();
859 					return NOT_EQUALS;
860 				}
861 
862 {self}			{
863 					SET_YYLLOC();
864 					return yytext[0];
865 				}
866 
867 {operator}		{
868 					/*
869 					 * Check for embedded slash-star or dash-dash; those
870 					 * are comment starts, so operator must stop there.
871 					 * Note that slash-star or dash-dash at the first
872 					 * character will match a prior rule, not this one.
873 					 */
874 					int			nchars = yyleng;
875 					char	   *slashstar = strstr(yytext, "/*");
876 					char	   *dashdash = strstr(yytext, "--");
877 
878 					if (slashstar && dashdash)
879 					{
880 						/* if both appear, take the first one */
881 						if (slashstar > dashdash)
882 							slashstar = dashdash;
883 					}
884 					else if (!slashstar)
885 						slashstar = dashdash;
886 					if (slashstar)
887 						nchars = slashstar - yytext;
888 
889 					/*
890 					 * For SQL compatibility, '+' and '-' cannot be the
891 					 * last char of a multi-char operator unless the operator
892 					 * contains chars that are not in SQL operators.
893 					 * The idea is to lex '=-' as two operators, but not
894 					 * to forbid operator names like '?-' that could not be
895 					 * sequences of SQL operators.
896 					 */
897 					if (nchars > 1 &&
898 						(yytext[nchars - 1] == '+' ||
899 						 yytext[nchars - 1] == '-'))
900 					{
901 						int			ic;
902 
903 						for (ic = nchars - 2; ic >= 0; ic--)
904 						{
905 							char c = yytext[ic];
906 							if (c == '~' || c == '!' || c == '@' ||
907 								c == '#' || c == '^' || c == '&' ||
908 								c == '|' || c == '`' || c == '?' ||
909 								c == '%')
910 								break;
911 						}
912 						if (ic < 0)
913 						{
914 							/*
915 							 * didn't find a qualifying character, so remove
916 							 * all trailing [+-]
917 							 */
918 							do {
919 								nchars--;
920 							} while (nchars > 1 &&
921 								 (yytext[nchars - 1] == '+' ||
922 								  yytext[nchars - 1] == '-'));
923 						}
924 					}
925 
926 					SET_YYLLOC();
927 
928 					if (nchars < yyleng)
929 					{
930 						/* Strip the unwanted chars from the token */
931 						yyless(nchars);
932 						/*
933 						 * If what we have left is only one char, and it's
934 						 * one of the characters matching "self", then
935 						 * return it as a character token the same way
936 						 * that the "self" rule would have.
937 						 */
938 						if (nchars == 1 &&
939 							strchr(",()[].;:+-*/%^<>=", yytext[0]))
940 							return yytext[0];
941 						/*
942 						 * Likewise, if what we have left is two chars, and
943 						 * those match the tokens ">=", "<=", "=>", "<>" or
944 						 * "!=", then we must return the appropriate token
945 						 * rather than the generic Op.
946 						 */
947 						if (nchars == 2)
948 						{
949 							if (yytext[0] == '=' && yytext[1] == '>')
950 								return EQUALS_GREATER;
951 							if (yytext[0] == '>' && yytext[1] == '=')
952 								return GREATER_EQUALS;
953 							if (yytext[0] == '<' && yytext[1] == '=')
954 								return LESS_EQUALS;
955 							if (yytext[0] == '<' && yytext[1] == '>')
956 								return NOT_EQUALS;
957 							if (yytext[0] == '!' && yytext[1] == '=')
958 								return NOT_EQUALS;
959 						}
960 					}
961 
962 					/*
963 					 * Complain if operator is too long.  Unlike the case
964 					 * for identifiers, we make this an error not a notice-
965 					 * and-truncate, because the odds are we are looking at
966 					 * a syntactic mistake anyway.
967 					 */
968 					if (nchars >= NAMEDATALEN)
969 						yyerror("operator too long");
970 
971 					yylval->str = pstrdup(yytext);
972 					return Op;
973 				}
974 
{param}			{
					/*
					 * Positional parameter reference.  The parameter number
					 * follows a one-character prefix, hence yytext + 1
					 * (presumably '$'; the {param} pattern itself is defined
					 * earlier in this file).
					 *
					 * Parse with strtoint() rather than atol(): atol() has
					 * undefined behavior on out-of-range input, and its long
					 * result would be silently narrowed when stored into
					 * ival, so an absurdly large parameter number could end
					 * up aliasing a small one.  Report overflow instead.
					 */
					char	   *endptr;
					int			paramno;

					SET_YYLLOC();
					errno = 0;
					paramno = strtoint(yytext + 1, &endptr, 10);
					if (errno == ERANGE)
						yyerror("parameter number too large");
					yylval->ival = paramno;
					return PARAM;
				}
980 
{integer}		{
					/*
					 * Integer-looking literal.  process_integer_literal()
					 * chooses the token type: ICONST when the text converts
					 * to an int, otherwise FCONST carrying the text.
					 */
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{decimal}		{
					/* Hand the literal's text to the grammar unconverted. */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{decimalfail}	{
					/* throw back the .., and treat as integer */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{real}			{
					/* Same treatment as {decimal}: return the text as FCONST. */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{realfail1}		{
					/*
					 * throw back the [Ee], and treat as {decimal}.  Note
					 * that it is possible the input is actually {integer},
					 * but since this case will almost certainly lead to a
					 * syntax error anyway, we don't bother to distinguish.
					 */
					yyless(yyleng - 1);
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{realfail2}		{
					/* throw back the [Ee][+-], and proceed as above */
					yyless(yyleng - 2);
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
1020 
1021 
{identifier}	{
					const ScanKeyword *kw;

					SET_YYLLOC();

					/*
					 * Look the word up in the keyword table this scanner
					 * was initialized with.  Anything not found there is an
					 * ordinary identifier, which gets folded to lower case
					 * and truncated if it is too long.
					 */
					kw = ScanKeywordLookup(yytext,
										   yyextra->keywords,
										   yyextra->num_keywords);
					if (kw == NULL)
					{
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* Keyword: return its token code and canonical name. */
					yylval->keyword = kw->name;
					return kw->value;
				}
1046 
{other}			{
					/*
					 * Fallback rule: return the first character of the match
					 * as its own token code.
					 */
					SET_YYLLOC();
					return yytext[0];
				}
1051 
<<EOF>>			{
					/*
					 * End of input: record the location first, so that an
					 * error reported at this point has a cursor position,
					 * then end the scan.
					 */
					SET_YYLLOC();
					yyterminate();
				}
1056 
1057 %%
1058 
1059 /* LCOV_EXCL_STOP */
1060 
/*
 * Arrange access to yyextra for subroutines of the main yylex() function.
 * We expect each subroutine to have a yyscanner parameter.  Rather than
 * use the yyget_xxx functions, which might or might not get inlined by the
 * compiler, we cheat just a bit and cast yyscanner to the right type.
 *
 * Because these macros expand to an expression mentioning "yyscanner", any
 * function below that uses yyextra, yylloc or yyleng must have a variable
 * of exactly that name in scope.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

/*
 * Likewise for a couple of other things we need: the yylloc_r and yyleng_r
 * fields of the same struct.
 */
#undef yylloc
#define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
#undef yyleng
#define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)
1075 
1076 
1077 /*
1078  * scanner_errposition
1079  *		Report a lexer or grammar error cursor position, if possible.
1080  *
1081  * This is expected to be used within an ereport() call.  The return value
1082  * is a dummy (always 0, in fact).
1083  *
1084  * Note that this can only be used for messages emitted during raw parsing
1085  * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1086  * to still be available.
1087  */
1088 int
1089 scanner_errposition(int location, core_yyscan_t yyscanner)
1090 {
1091 	int			pos;
1092 
1093 	if (location < 0)
1094 		return 0;				/* no-op if location is unknown */
1095 
1096 	/* Convert byte offset to character number */
1097 	pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1098 	/* And pass it to the ereport mechanism */
1099 	return errposition(pos);
1100 }
1101 
1102 /*
1103  * scanner_yyerror
1104  *		Report a lexer or grammar error.
1105  *
1106  * The message's cursor position is whatever YYLLOC was last set to,
1107  * ie, the start of the current token if called within yylex(), or the
1108  * most recently lexed token if called from the grammar.
1109  * This is OK for syntax error messages from the Bison parser, because Bison
1110  * parsers report error as soon as the first unparsable token is reached.
1111  * Beware of using yyerror for other purposes, as the cursor position might
1112  * be misleading!
1113  */
1114 void
1115 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1116 {
1117 	const char *loc = yyextra->scanbuf + *yylloc;
1118 
1119 	if (*loc == YY_END_OF_BUFFER_CHAR)
1120 	{
1121 		ereport(ERROR,
1122 				(errcode(ERRCODE_SYNTAX_ERROR),
1123 		/* translator: %s is typically the translation of "syntax error" */
1124 				 errmsg("%s at end of input", _(message)),
1125 				 lexer_errposition()));
1126 	}
1127 	else
1128 	{
1129 		ereport(ERROR,
1130 				(errcode(ERRCODE_SYNTAX_ERROR),
1131 		/* translator: first %s is typically the translation of "syntax error" */
1132 				 errmsg("%s at or near \"%s\"", _(message), loc),
1133 				 lexer_errposition()));
1134 	}
1135 }
1136 
1137 
1138 /*
1139  * Called before any actual parsing is done
1140  */
1141 core_yyscan_t
1142 scanner_init(const char *str,
1143 			 core_yy_extra_type *yyext,
1144 			 const ScanKeyword *keywords,
1145 			 int num_keywords)
1146 {
1147 	Size		slen = strlen(str);
1148 	yyscan_t	scanner;
1149 
1150 	if (yylex_init(&scanner) != 0)
1151 		elog(ERROR, "yylex_init() failed: %m");
1152 
1153 	core_yyset_extra(yyext, scanner);
1154 
1155 	yyext->keywords = keywords;
1156 	yyext->num_keywords = num_keywords;
1157 
1158 	yyext->backslash_quote = backslash_quote;
1159 	yyext->escape_string_warning = escape_string_warning;
1160 	yyext->standard_conforming_strings = standard_conforming_strings;
1161 
1162 	/*
1163 	 * Make a scan buffer with special termination needed by flex.
1164 	 */
1165 	yyext->scanbuf = (char *) palloc(slen + 2);
1166 	yyext->scanbuflen = slen;
1167 	memcpy(yyext->scanbuf, str, slen);
1168 	yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1169 	yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1170 
1171 	/* initialize literal buffer to a reasonable but expansible size */
1172 	yyext->literalalloc = 1024;
1173 	yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1174 	yyext->literallen = 0;
1175 
1176 	return scanner;
1177 }
1178 
1179 
1180 /*
1181  * Called after parsing is done to clean up after scanner_init()
1182  */
1183 void
1184 scanner_finish(core_yyscan_t yyscanner)
1185 {
1186 	/*
1187 	 * We don't bother to call yylex_destroy(), because all it would do is
1188 	 * pfree a small amount of control storage.  It's cheaper to leak the
1189 	 * storage until the parsing context is destroyed.  The amount of space
1190 	 * involved is usually negligible compared to the output parse tree
1191 	 * anyway.
1192 	 *
1193 	 * We do bother to pfree the scanbuf and literal buffer, but only if they
1194 	 * represent a nontrivial amount of space.  The 8K cutoff is arbitrary.
1195 	 */
1196 	if (yyextra->scanbuflen >= 8192)
1197 		pfree(yyextra->scanbuf);
1198 	if (yyextra->literalalloc >= 8192)
1199 		pfree(yyextra->literalbuf);
1200 }
1201 
1202 
1203 static void
1204 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1205 {
1206 	/* enlarge buffer if needed */
1207 	if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1208 	{
1209 		do
1210 		{
1211 			yyextra->literalalloc *= 2;
1212 		} while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1213 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1214 												yyextra->literalalloc);
1215 	}
1216 	/* append new data */
1217 	memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1218 	yyextra->literallen += yleng;
1219 }
1220 
1221 
1222 static void
1223 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1224 {
1225 	/* enlarge buffer if needed */
1226 	if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1227 	{
1228 		yyextra->literalalloc *= 2;
1229 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1230 												yyextra->literalalloc);
1231 	}
1232 	/* append new data */
1233 	yyextra->literalbuf[yyextra->literallen] = ychar;
1234 	yyextra->literallen += 1;
1235 }
1236 
1237 
1238 /*
1239  * Create a palloc'd copy of literalbuf, adding a trailing null.
1240  */
1241 static char *
1242 litbufdup(core_yyscan_t yyscanner)
1243 {
1244 	int			llen = yyextra->literallen;
1245 	char	   *new;
1246 
1247 	new = palloc(llen + 1);
1248 	memcpy(new, yyextra->literalbuf, llen);
1249 	new[llen] = '\0';
1250 	return new;
1251 }
1252 
1253 static int
1254 process_integer_literal(const char *token, YYSTYPE *lval)
1255 {
1256 	int			val;
1257 	char	   *endptr;
1258 
1259 	errno = 0;
1260 	val = strtoint(token, &endptr, 10);
1261 	if (*endptr != '\0' || errno == ERANGE)
1262 	{
1263 		/* integer too large, treat it as a float */
1264 		lval->str = pstrdup(token);
1265 		return FCONST;
1266 	}
1267 	lval->ival = val;
1268 	return ICONST;
1269 }
1270 
1271 static unsigned int
1272 hexval(unsigned char c)
1273 {
1274 	if (c >= '0' && c <= '9')
1275 		return c - '0';
1276 	if (c >= 'a' && c <= 'f')
1277 		return c - 'a' + 0xA;
1278 	if (c >= 'A' && c <= 'F')
1279 		return c - 'A' + 0xA;
1280 	elog(ERROR, "invalid hexadecimal digit");
1281 	return 0;					/* not reached */
1282 }
1283 
1284 static void
1285 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1286 {
1287 	if (GetDatabaseEncoding() == PG_UTF8)
1288 		return;
1289 
1290 	if (c > 0x7F)
1291 	{
1292 		ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);	/* 3 for U&" */
1293 		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1294 	}
1295 }
1296 
1297 static bool
1298 is_utf16_surrogate_first(pg_wchar c)
1299 {
1300 	return (c >= 0xD800 && c <= 0xDBFF);
1301 }
1302 
1303 static bool
1304 is_utf16_surrogate_second(pg_wchar c)
1305 {
1306 	return (c >= 0xDC00 && c <= 0xDFFF);
1307 }
1308 
1309 static pg_wchar
1310 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1311 {
1312 	return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1313 }
1314 
1315 static void
1316 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1317 {
1318 	char		buf[8];
1319 
1320 	if (c == 0 || c > 0x10FFFF)
1321 		yyerror("invalid Unicode escape value");
1322 	if (c > 0x7F)
1323 	{
1324 		if (GetDatabaseEncoding() != PG_UTF8)
1325 			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1326 		yyextra->saw_non_ascii = true;
1327 	}
1328 	unicode_to_utf8(c, (unsigned char *) buf);
1329 	addlit(buf, pg_mblen(buf), yyscanner);
1330 }
1331 
/*
 * Is 'escape' acceptable as a Unicode escape character (UESCAPE syntax)?
 *
 * Hexadecimal digits, '+', single and double quotes, and whitespace are all
 * refused; any other character is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;
	if (scanner_isspace(escape))
		return false;

	return true;
}
1347 
1348 /* like litbufdup, but handle unicode escapes */
1349 static char *
1350 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1351 {
1352 	char	   *new;
1353 	char	   *litbuf,
1354 			   *in,
1355 			   *out;
1356 	pg_wchar	pair_first = 0;
1357 
1358 	/* Make literalbuf null-terminated to simplify the scanning loop */
1359 	litbuf = yyextra->literalbuf;
1360 	litbuf[yyextra->literallen] = '\0';
1361 
1362 	/*
1363 	 * This relies on the subtle assumption that a UTF-8 expansion cannot be
1364 	 * longer than its escaped representation.
1365 	 */
1366 	new = palloc(yyextra->literallen + 1);
1367 
1368 	in = litbuf;
1369 	out = new;
1370 	while (*in)
1371 	{
1372 		if (in[0] == escape)
1373 		{
1374 			if (in[1] == escape)
1375 			{
1376 				if (pair_first)
1377 				{
1378 					ADVANCE_YYLLOC(in - litbuf + 3);	/* 3 for U&" */
1379 					yyerror("invalid Unicode surrogate pair");
1380 				}
1381 				*out++ = escape;
1382 				in += 2;
1383 			}
1384 			else if (isxdigit((unsigned char) in[1]) &&
1385 					 isxdigit((unsigned char) in[2]) &&
1386 					 isxdigit((unsigned char) in[3]) &&
1387 					 isxdigit((unsigned char) in[4]))
1388 			{
1389 				pg_wchar	unicode;
1390 
1391 				unicode = (hexval(in[1]) << 12) +
1392 					(hexval(in[2]) << 8) +
1393 					(hexval(in[3]) << 4) +
1394 					hexval(in[4]);
1395 				check_unicode_value(unicode, in, yyscanner);
1396 				if (pair_first)
1397 				{
1398 					if (is_utf16_surrogate_second(unicode))
1399 					{
1400 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1401 						pair_first = 0;
1402 					}
1403 					else
1404 					{
1405 						ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1406 						yyerror("invalid Unicode surrogate pair");
1407 					}
1408 				}
1409 				else if (is_utf16_surrogate_second(unicode))
1410 					yyerror("invalid Unicode surrogate pair");
1411 
1412 				if (is_utf16_surrogate_first(unicode))
1413 					pair_first = unicode;
1414 				else
1415 				{
1416 					unicode_to_utf8(unicode, (unsigned char *) out);
1417 					out += pg_mblen(out);
1418 				}
1419 				in += 5;
1420 			}
1421 			else if (in[1] == '+' &&
1422 					 isxdigit((unsigned char) in[2]) &&
1423 					 isxdigit((unsigned char) in[3]) &&
1424 					 isxdigit((unsigned char) in[4]) &&
1425 					 isxdigit((unsigned char) in[5]) &&
1426 					 isxdigit((unsigned char) in[6]) &&
1427 					 isxdigit((unsigned char) in[7]))
1428 			{
1429 				pg_wchar	unicode;
1430 
1431 				unicode = (hexval(in[2]) << 20) +
1432 					(hexval(in[3]) << 16) +
1433 					(hexval(in[4]) << 12) +
1434 					(hexval(in[5]) << 8) +
1435 					(hexval(in[6]) << 4) +
1436 					hexval(in[7]);
1437 				check_unicode_value(unicode, in, yyscanner);
1438 				if (pair_first)
1439 				{
1440 					if (is_utf16_surrogate_second(unicode))
1441 					{
1442 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1443 						pair_first = 0;
1444 					}
1445 					else
1446 					{
1447 						ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1448 						yyerror("invalid Unicode surrogate pair");
1449 					}
1450 				}
1451 				else if (is_utf16_surrogate_second(unicode))
1452 					yyerror("invalid Unicode surrogate pair");
1453 
1454 				if (is_utf16_surrogate_first(unicode))
1455 					pair_first = unicode;
1456 				else
1457 				{
1458 					unicode_to_utf8(unicode, (unsigned char *) out);
1459 					out += pg_mblen(out);
1460 				}
1461 				in += 8;
1462 			}
1463 			else
1464 			{
1465 				ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1466 				yyerror("invalid Unicode escape value");
1467 			}
1468 		}
1469 		else
1470 		{
1471 			if (pair_first)
1472 			{
1473 				ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1474 				yyerror("invalid Unicode surrogate pair");
1475 			}
1476 			*out++ = *in++;
1477 		}
1478 	}
1479 
1480 	/* unfinished surrogate pair? */
1481 	if (pair_first)
1482 	{
1483 		ADVANCE_YYLLOC(in - litbuf + 3);				/* 3 for U&" */
1484 		yyerror("invalid Unicode surrogate pair");
1485 	}
1486 
1487 	*out = '\0';
1488 
1489 	/*
1490 	 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1491 	 * codes; but it's probably not worth the trouble, since this isn't likely
1492 	 * to be a performance-critical path.
1493 	 */
1494 	pg_verifymbstr(new, out - new, false);
1495 	return new;
1496 }
1497 
1498 static unsigned char
1499 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1500 {
1501 	switch (c)
1502 	{
1503 		case 'b':
1504 			return '\b';
1505 		case 'f':
1506 			return '\f';
1507 		case 'n':
1508 			return '\n';
1509 		case 'r':
1510 			return '\r';
1511 		case 't':
1512 			return '\t';
1513 		default:
1514 			/* check for backslash followed by non-7-bit-ASCII */
1515 			if (c == '\0' || IS_HIGHBIT_SET(c))
1516 				yyextra->saw_non_ascii = true;
1517 
1518 			return c;
1519 	}
1520 }
1521 
1522 static void
1523 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1524 {
1525 	if (ychar == '\'')
1526 	{
1527 		if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1528 			ereport(WARNING,
1529 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1530 					 errmsg("nonstandard use of \\' in a string literal"),
1531 					 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1532 					 lexer_errposition()));
1533 		yyextra->warn_on_first_escape = false;	/* warn only once per string */
1534 	}
1535 	else if (ychar == '\\')
1536 	{
1537 		if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1538 			ereport(WARNING,
1539 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1540 					 errmsg("nonstandard use of \\\\ in a string literal"),
1541 					 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1542 					 lexer_errposition()));
1543 		yyextra->warn_on_first_escape = false;	/* warn only once per string */
1544 	}
1545 	else
1546 		check_escape_warning(yyscanner);
1547 }
1548 
1549 static void
1550 check_escape_warning(core_yyscan_t yyscanner)
1551 {
1552 	if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1553 		ereport(WARNING,
1554 				(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1555 				 errmsg("nonstandard use of escape in a string literal"),
1556 		errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1557 				 lexer_errposition()));
1558 	yyextra->warn_on_first_escape = false;		/* warn only once per string */
1559 }
1560 
1561 /*
1562  * Interface functions to make flex use palloc() instead of malloc().
1563  * It'd be better to make these static, but flex insists otherwise.
1564  */
1565 
1566 void *
1567 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1568 {
1569 	return palloc(bytes);
1570 }
1571 
1572 void *
1573 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1574 {
1575 	if (ptr)
1576 		return repalloc(ptr, bytes);
1577 	else
1578 		return palloc(bytes);
1579 }
1580 
1581 void
1582 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1583 {
1584 	if (ptr)
1585 		pfree(ptr);
1586 }
1587