1 %top{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l
5  *	  lexical scanner for PostgreSQL
6  *
7  * NOTE NOTE NOTE:
8  *
9  * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
10  *
11  * The rules are designed so that the scanner never has to backtrack,
12  * in the sense that there is always a rule that can match the input
13  * consumed so far (the rule action may internally throw back some input
14  * with yyless(), however).  As explained in the flex manual, this makes
15  * for a useful speed increase --- about a third faster than a plain -CF
16  * lexer, in simple testing.  The extra complexity is mostly in the rules
17  * for handling float numbers and continued string literals.  If you change
18  * the lexical rules, verify that you haven't broken the no-backtrack
19  * property by running flex with the "-b" option and checking that the
20  * resulting "lex.backup" file says that no backing up is needed.  (As of
21  * Postgres 9.2, this check is made automatically by the Makefile.)
22  *
23  *
24  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
25  * Portions Copyright (c) 1994, Regents of the University of California
26  *
27  * IDENTIFICATION
28  *	  src/backend/parser/scan.l
29  *
30  *-------------------------------------------------------------------------
31  */
32 #include "postgres.h"
33 
34 #include <ctype.h>
35 #include <unistd.h>
36 
37 #include "parser/gramparse.h"
38 #include "parser/parser.h"		/* only needed for GUC variables */
39 #include "parser/scansup.h"
40 #include "mb/pg_wchar.h"
41 }
42 
43 %{
44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
45 #undef fprintf
46 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
47 
48 static void
fprintf_to_ereport(const char * fmt,const char * msg)49 fprintf_to_ereport(const char *fmt, const char *msg)
50 {
51 	ereport(ERROR, (errmsg_internal("%s", msg)));
52 }
53 
54 /*
55  * GUC variables.  This is a DIRECT violation of the warning given at the
56  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
57  * as such, changing their values can induce very unintuitive behavior.
58  * But we shall have to live with it until we can remove these variables.
59  */
60 int			backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
61 bool		escape_string_warning = true;
62 bool		standard_conforming_strings = true;
63 
64 /*
65  * Set the type of YYSTYPE.
66  */
67 #define YYSTYPE core_YYSTYPE
68 
69 /*
70  * Set the type of yyextra.  All state variables used by the scanner should
71  * be in yyextra, *not* statically allocated.
72  */
73 #define YY_EXTRA_TYPE core_yy_extra_type *
74 
75 /*
76  * Each call to yylex must set yylloc to the location of the found token
77  * (expressed as a byte offset from the start of the input text).
78  * When we parse a token that requires multiple lexer rules to process,
79  * this should be done in the first such rule, else yylloc will point
80  * into the middle of the token.
81  */
82 #define SET_YYLLOC()  (*(yylloc) = yytext - yyextra->scanbuf)
83 
84 /*
85  * Advance yylloc by the given number of bytes.
86  */
87 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )
88 
89 #define startlit()	( yyextra->literallen = 0 )
90 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
91 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
92 static char *litbufdup(core_yyscan_t yyscanner);
93 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
94 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
95 static int	process_integer_literal(const char *token, YYSTYPE *lval);
96 static bool is_utf16_surrogate_first(pg_wchar c);
97 static bool is_utf16_surrogate_second(pg_wchar c);
98 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
99 static void addunicode(pg_wchar c, yyscan_t yyscanner);
100 static bool check_uescapechar(unsigned char escape);
101 
102 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
103 
104 #define lexer_errposition()  scanner_errposition(*(yylloc), yyscanner)
105 
106 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
107 static void check_escape_warning(core_yyscan_t yyscanner);
108 
109 /*
110  * Work around a bug in flex 2.5.35: it emits a couple of functions that
111  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
112  * this would cause warnings.  Providing our own declarations should be
113  * harmless even when the bug gets fixed.
114  */
115 extern int	core_yyget_column(yyscan_t yyscanner);
116 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
117 
118 %}
119 
120 %option reentrant
121 %option bison-bridge
122 %option bison-locations
123 %option 8bit
124 %option never-interactive
125 %option nodefault
126 %option noinput
127 %option nounput
128 %option noyywrap
129 %option noyyalloc
130 %option noyyrealloc
131 %option noyyfree
132 %option warn
133 %option prefix="core_yy"
134 
135 /*
136  * OK, here is a short description of lex/flex rules behavior.
137  * The longest pattern which matches an input string is always chosen.
138  * For equal-length patterns, the first occurring in the rules list is chosen.
139  * INITIAL is the starting state, to which all non-conditional rules apply.
140  * Exclusive states change parsing rules while the state is active.  When in
141  * an exclusive state, only those rules defined for that state apply.
142  *
143  * We use exclusive states for quoted strings, extended comments,
144  * and to eliminate parsing troubles for numeric strings.
145  * Exclusive states:
146  *  <xb> bit string literal
147  *  <xc> extended C-style comments
148  *  <xd> delimited identifiers (double-quoted identifiers)
149  *  <xh> hexadecimal numeric string
150  *  <xq> standard quoted strings
151  *  <xe> extended quoted strings (support backslash escape sequences)
152  *  <xdolq> $foo$ quoted strings
153  *  <xui> quoted identifier with Unicode escapes
154  *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
155  *  <xus> quoted string with Unicode escapes
156  *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
157  *  <xeu> Unicode surrogate pair in extended quoted string
158  *
159  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
160  * The default one is probably not the right thing.
161  */
162 
163 %x xb
164 %x xc
165 %x xd
166 %x xh
167 %x xe
168 %x xq
169 %x xdolq
170 %x xui
171 %x xuiend
172 %x xus
173 %x xusend
174 %x xeu
175 
176 /*
177  * In order to make the world safe for Windows and Mac clients as well as
178  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
179  * sequence will be seen as two successive newlines, but that doesn't cause
180  * any problems.  Comments that start with -- and extend to the next
181  * newline are treated as equivalent to a single whitespace character.
182  *
183  * NOTE a fine point: if there is no newline following --, we will absorb
184  * everything to the end of the input as a comment.  This is correct.  Older
185  * versions of Postgres failed to recognize -- as a comment if the input
186  * did not end with a newline.
187  *
188  * XXX perhaps \f (formfeed) should be treated as a newline as well?
189  *
190  * XXX if you change the set of whitespace characters, fix scanner_isspace()
191  * to agree, and see also the plpgsql lexer.
192  */
193 
194 space			[ \t\n\r\f]
195 horiz_space		[ \t\f]
196 newline			[\n\r]
197 non_newline		[^\n\r]
198 
199 comment			("--"{non_newline}*)
200 
201 whitespace		({space}+|{comment})
202 
203 /*
204  * SQL requires at least one newline in the whitespace separating
205  * string literals that are to be concatenated.  Silly, but who are we
206  * to argue?  Note that {whitespace_with_newline} should not have * after
207  * it, whereas {whitespace} should generally have a * after it...
208  */
209 
210 special_whitespace		({space}+|{comment}{newline})
211 horiz_whitespace		({horiz_space}|{comment})
212 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
213 
214 /*
215  * To ensure that {quotecontinue} can be scanned without having to back up
216  * if the full pattern isn't matched, we include trailing whitespace in
217  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
218  * except for {quote} followed by whitespace and just one "-" (not two,
219  * which would start a {comment}).  To cover that we have {quotefail}.
220  * The actions for {quotestop} and {quotefail} must throw back characters
221  * beyond the quote proper.
222  */
223 quote			'
224 quotestop		{quote}{whitespace}*
225 quotecontinue	{quote}{whitespace_with_newline}{quote}
226 quotefail		{quote}{whitespace}*"-"
227 
228 /* Bit string
229  * It is tempting to scan the string for only those characters
230  * which are allowed. However, this leads to silently swallowed
231  * characters if illegal characters are included in the string.
232  * For example, if xbinside is [01] then B'ABCD' is interpreted
233  * as a zero-length string, and the ABCD' is lost!
234  * Better to pass the string forward and let the input routines
235  * validate the contents.
236  */
237 xbstart			[bB]{quote}
238 xbinside		[^']*
239 
240 /* Hexadecimal number */
241 xhstart			[xX]{quote}
242 xhinside		[^']*
243 
244 /* National character */
245 xnstart			[nN]{quote}
246 
247 /* Quoted string that allows backslash escapes */
248 xestart			[eE]{quote}
249 xeinside		[^\\']+
250 xeescape		[\\][^0-7]
251 xeoctesc		[\\][0-7]{1,3}
252 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
253 xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
254 xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
255 
/* Ordinary quoted string
 * xqdouble implements an embedded quote, written as two adjacent quotes ('')
258  */
259 xqstart			{quote}
260 xqdouble		{quote}{quote}
261 xqinside		[^']+
262 
263 /* $foo$ style quotes ("dollar quoting")
264  * The quoted string starts with $foo$ where "foo" is an optional string
265  * in the form of an identifier, except that it may not contain "$",
266  * and extends to the first occurrence of an identical string.
267  * There is *no* processing of the quoted text.
268  *
269  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
270  * fails to match its trailing "$".
271  */
272 dolq_start		[A-Za-z\200-\377_]
273 dolq_cont		[A-Za-z\200-\377_0-9]
274 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
275 dolqfailed		\${dolq_start}{dolq_cont}*
276 dolqinside		[^$]+
277 
278 /* Double quote
 * Allows embedded spaces and other special characters in identifiers.
280  */
281 dquote			\"
282 xdstart			{dquote}
283 xdstop			{dquote}
284 xddouble		{dquote}{dquote}
285 xdinside		[^"]+
286 
287 /* Unicode escapes */
288 uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
289 /* error rule to avoid backup */
290 uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
291 
292 /* Quoted identifier with Unicode escapes */
293 xuistart		[uU]&{dquote}
294 
295 /* Quoted string with Unicode escapes */
296 xusstart		[uU]&{quote}
297 
298 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
299 xustop1		{uescapefail}?
300 xustop2		{uescape}
301 
302 /* error rule to avoid backup */
303 xufailed		[uU]&
304 
305 
306 /* C-style comments
307  *
308  * The "extended comment" syntax closely resembles allowable operator syntax.
309  * The tricky part here is to get lex to recognize a string starting with
310  * slash-star as a comment, when interpreting it as an operator would produce
311  * a longer match --- remember lex will prefer a longer match!  Also, if we
312  * have something like plus-slash-star, lex will think this is a 3-character
313  * operator whereas we want to see it as a + operator and a comment start.
314  * The solution is two-fold:
315  * 1. append {op_chars}* to xcstart so that it matches as much text as
316  *    {operator} would. Then the tie-breaker (first matching rule of same
317  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
318  *    in case it contains a star-slash that should terminate the comment.
319  * 2. In the operator rule, check for slash-star within the operator, and
320  *    if found throw it back with yyless().  This handles the plus-slash-star
321  *    problem.
322  * Dash-dash comments have similar interactions with the operator rule.
323  */
324 xcstart			\/\*{op_chars}*
325 xcstop			\*+\/
326 xcinside		[^*/]+
327 
328 digit			[0-9]
329 ident_start		[A-Za-z\200-\377_]
330 ident_cont		[A-Za-z\200-\377_0-9\$]
331 
332 identifier		{ident_start}{ident_cont}*
333 
334 /* Assorted special-case operators and operator-like tokens */
335 typecast		"::"
336 dot_dot			\.\.
337 colon_equals	":="
338 
339 /*
340  * These operator-like tokens (unlike the above ones) also match the {operator}
341  * rule, which means that they might be overridden by a longer match if they
342  * are followed by a comment start or a + or - character. Accordingly, if you
343  * add to this list, you must also add corresponding code to the {operator}
344  * block to return the correct token in such cases. (This is not needed in
345  * psqlscan.l since the token value is ignored there.)
346  */
347 equals_greater	"=>"
348 less_equals		"<="
349 greater_equals	">="
350 less_greater	"<>"
351 not_equals		"!="
352 
353 /*
354  * "self" is the set of chars that should be returned as single-character
355  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
356  * which can be one or more characters long (but if a single-char token
357  * appears in the "self" set, it is not to be returned as an Op).  Note
358  * that the sets overlap, but each has some chars that are not in the other.
359  *
360  * If you change either set, adjust the character lists appearing in the
361  * rule for "operator"!
362  */
363 self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
364 op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
365 operator		{op_chars}+
366 
/* We no longer allow unary minus in numbers.
 * Instead we pass it separately to the parser, where it gets
 * coerced via doNegate() -- Leon aug 20 1999
370  *
371  * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
372  *
373  * {realfail1} and {realfail2} are added to prevent the need for scanner
374  * backup when the {real} rule fails to match completely.
375  */
376 
377 integer			{digit}+
378 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
379 decimalfail		{digit}+\.\.
380 real			({integer}|{decimal})[Ee][-+]?{digit}+
381 realfail1		({integer}|{decimal})[Ee]
382 realfail2		({integer}|{decimal})[Ee][-+]
383 
384 param			\${integer}
385 
386 other			.
387 
388 /*
389  * Dollar quoted strings are totally opaque, and no escaping is done on them.
390  * Other quoted strings must allow some special characters such as single-quote
391  *  and newline.
392  * Embedded single-quotes are implemented both in the SQL standard
393  *  style of two adjacent single quotes "''" and in the Postgres/Java style
394  *  of escaped-quote "\'".
395  * Other embedded escaped characters are matched explicitly and the leading
396  *  backslash is dropped from the string.
397  * Note that xcstart must appear before operator, as explained above!
398  *  Also whitespace (comment) must appear before operator.
399  */
400 
401 %%
402 
{whitespace}	{
					/* Blanks and dash-dash comments yield no token; skip them. */
				}

{xcstart}		{
					/*
					 * Start of a C-style comment.  Remember where it began so
					 * that a syntax error inside it can be located, and start
					 * counting nesting from zero.
					 */
					SET_YYLLOC();
					BEGIN(xc);
					yyextra->xcdepth = 0;
					/*
					 * The pattern may have swallowed operator characters after
					 * the slash-star; hand everything beyond those two
					 * characters back to the scanner.
					 */
					yyless(2);
				}

<xc>{xcstart}	{
					/* A nested comment opens: go one level deeper. */
					yyextra->xcdepth += 1;
					/* Keep only the slash-star itself, as for the outer start. */
					yyless(2);
				}

<xc>{xcstop}	{
					/*
					 * A terminator closes the innermost open comment; once no
					 * nested level remains, normal scanning resumes.
					 */
					if (yyextra->xcdepth > 0)
						yyextra->xcdepth -= 1;
					else
						BEGIN(INITIAL);
				}

<xc>{xcinside}	{
					/* comment text: discard */
				}

<xc>{op_chars}	{
					/* lone operator character inside a comment: discard */
				}

<xc>\*+			{
					/* run of stars that does not end the comment: discard */
				}

<xc><<EOF>>		{ yyerror("unterminated /* comment"); }
442 
{xbstart}		{
					/*
					 * Binary bit-string literal.  The collected text is given
					 * a leading "b" so the input routine can recognize it as a
					 * binary string; passing the string forward and labeling
					 * it in the parser remains a possible later cleanup.
					 */
					SET_YYLLOC();
					startlit();
					addlitchar('b', yyscanner);
					BEGIN(xb);
				}
<xb>{quotestop}	|
<xb>{quotefail} {
					/* Closing quote: keep only the quote, emit the literal. */
					char	   *bitstr = litbufdup(yyscanner);

					yyless(1);
					BEGIN(INITIAL);
					yylval->str = bitstr;
					return BCONST;
				}
<xh>{xhinside}	|
<xb>{xbinside}	{
					/* Literal body is collected verbatim, unvalidated. */
					addlit(yytext, yyleng, yyscanner);
				}
<xh>{quotecontinue}	|
<xb>{quotecontinue}	{
					/* literal continues after the break: nothing to add */
				}
<xb><<EOF>>		{ yyerror("unterminated bit string literal"); }

{xhstart}		{
					/*
					 * Hexadecimal bit-string literal.  Handled like the binary
					 * form, except that the marker prepended for the input
					 * routine is "x".
					 */
					SET_YYLLOC();
					startlit();
					addlitchar('x', yyscanner);
					BEGIN(xh);
				}
<xh>{quotestop}	|
<xh>{quotefail} {
					/* Closing quote: keep only the quote, emit the literal. */
					char	   *hexstr = litbufdup(yyscanner);

					yyless(1);
					BEGIN(INITIAL);
					yylval->str = hexstr;
					return XCONST;
				}
<xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }
492 
{xnstart}		{
					/*
					 * National character literal.  Only the leading letter is
					 * consumed here, so the quoted part is scanned next as an
					 * ordinary string; the letter itself is reported as the
					 * internally generated keyword NCHAR when the keyword
					 * list has one, and as the identifier "n" otherwise.
					 */
					const ScanKeyword *nchar_kw;

					SET_YYLLOC();
					yyless(1);	/* consume only the letter this time */

					nchar_kw = ScanKeywordLookup("nchar",
												 yyextra->keywords,
												 yyextra->num_keywords);
					if (nchar_kw == NULL)
					{
						/* NCHAR is not a keyword here: plain identifier */
						yylval->str = pstrdup("n");
						return IDENT;
					}

					yylval->keyword = nchar_kw->name;
					return nchar_kw->value;
				}
518 
{xqstart}		{
					/*
					 * Plain quoted string.  It is scanned as a standard
					 * string when standard_conforming_strings is set and as
					 * an escape-processing string otherwise; in the latter
					 * case the first escape may draw a warning.
					 */
					SET_YYLLOC();
					yyextra->saw_non_ascii = false;
					yyextra->warn_on_first_escape = true;
					startlit();
					BEGIN(yyextra->standard_conforming_strings ? xq : xe);
				}
{xestart}		{
					/* Explicit E-prefixed string: escapes never warn. */
					SET_YYLLOC();
					yyextra->saw_non_ascii = false;
					yyextra->warn_on_first_escape = false;
					startlit();
					BEGIN(xe);
				}
{xusstart}		{
					/*
					 * String with Unicode escapes; rejected outright unless
					 * standard_conforming_strings is on.
					 */
					SET_YYLLOC();
					if (!yyextra->standard_conforming_strings)
						ereport(ERROR,
								(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
								 errmsg("unsafe use of string constant with Unicode escapes"),
								 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
								 lexer_errposition()));
					startlit();
					BEGIN(xus);
				}
<xq,xe>{quotestop}	|
<xq,xe>{quotefail} {
					/* End of the string: give back all but the quote. */
					BEGIN(INITIAL);
					yyless(1);
					/*
					 * Unescaping may have produced bytes that do not form
					 * valid multibyte text, so re-verify in that case.
					 */
					if (yyextra->saw_non_ascii)
						pg_verifymbstr(yyextra->literalbuf,
									   yyextra->literallen,
									   false);
					yylval->str = litbufdup(yyscanner);
					return SCONST;
				}
<xus>{quotestop} |
<xus>{quotefail} {
					/*
					 * Closing quote of a Unicode-escape string.  Keep only
					 * the quote and move to xusend, which checks whether a
					 * UESCAPE clause follows.
					 */
					BEGIN(xusend);
					yyless(1);
				}
<xusend>{whitespace} {
					/* whitespace does not end the search for UESCAPE */
				}
<xusend><<EOF>> |
<xusend>{other} |
<xusend>{xustop1} {
					/*
					 * No UESCAPE clause: push the whole match back and decode
					 * with backslash as the escape character.
					 */
					char	   *decoded;

					yyless(0);
					BEGIN(INITIAL);
					decoded = litbuf_udeescape('\\', yyscanner);
					yylval->str = decoded;
					return SCONST;
				}
<xusend>{xustop2} {
					/*
					 * UESCAPE clause present: the escape character is the one
					 * just before the final quote of the match.
					 */
					int			escape_pos = yyleng - 2;

					BEGIN(INITIAL);
					if (!check_uescapechar(yytext[escape_pos]))
					{
						/* point the error at the offending character */
						SET_YYLLOC();
						ADVANCE_YYLLOC(escape_pos);
						yyerror("invalid Unicode escape character");
					}
					yylval->str = litbuf_udeescape(yytext[escape_pos],
												   yyscanner);
					return SCONST;
				}
<xq,xe,xus>{xqdouble} {
					/* a doubled quote stands for one literal quote */
					addlitchar('\'', yyscanner);
				}
<xq,xus>{xqinside}  |
<xe>{xeinside}  {
					/* ordinary string text is collected as-is */
					addlit(yytext, yyleng, yyscanner);
				}
<xe>{xeunicode} {
					/* Unicode escape: the hex digits follow the two-char prefix */
					pg_wchar	codepoint = strtoul(yytext + 2, NULL, 16);

					check_escape_warning(yyscanner);

					if (is_utf16_surrogate_first(codepoint))
					{
						/* hold the first half until its partner arrives */
						yyextra->utf16_first_part = codepoint;
						BEGIN(xeu);
					}
					else if (is_utf16_surrogate_second(codepoint))
						yyerror("invalid Unicode surrogate pair");
					else
						addunicode(codepoint, yyscanner);
				}
<xeu>{xeunicode} {
					/* this escape must be the second half of a surrogate pair */
					pg_wchar	second_half = strtoul(yytext + 2, NULL, 16);
					pg_wchar	combined;

					if (!is_utf16_surrogate_second(second_half))
						yyerror("invalid Unicode surrogate pair");

					combined = surrogate_pair_to_codepoint(yyextra->utf16_first_part,
														   second_half);
					addunicode(combined, yyscanner);

					BEGIN(xe);
				}
<xeu>.			|
<xeu>\n			|
<xeu><<EOF>>	{ yyerror("invalid Unicode surrogate pair"); }
<xe,xeu>{xeunicodefail}	{
					/* too few hex digits after the Unicode escape prefix */
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
							 errmsg("invalid Unicode escape"),
							 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
							 lexer_errposition()));
				}
<xe>{xeescape}  {
					/* backslash followed by a single non-octal character */
					char		escaped = yytext[1];

					/*
					 * A backslash-quote is refused when backslash_quote is
					 * off, or when it is in safe-encoding mode and the client
					 * encoding is a client-only one.
					 */
					if (escaped == '\'' &&
						(yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
						 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
						  PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding()))))
						ereport(ERROR,
								(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
								 errmsg("unsafe use of \\' in a string literal"),
								 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
								 lexer_errposition()));

					check_string_escape_warning(escaped, yyscanner);
					addlitchar(unescape_single_char(escaped, yyscanner),
							   yyscanner);
				}
<xe>{xeoctesc}  {
					/* octal escape: digits start right after the backslash */
					unsigned char octbyte = strtoul(yytext + 1, NULL, 8);

					check_escape_warning(yyscanner);
					addlitchar(octbyte, yyscanner);
					/* remember that the result needs encoding verification */
					if (octbyte == '\0' || IS_HIGHBIT_SET(octbyte))
						yyextra->saw_non_ascii = true;
				}
<xe>{xehexesc}  {
					/* hex escape: digits start after the backslash and x */
					unsigned char hexbyte = strtoul(yytext + 2, NULL, 16);

					check_escape_warning(yyscanner);
					addlitchar(hexbyte, yyscanner);
					/* remember that the result needs encoding verification */
					if (hexbyte == '\0' || IS_HIGHBIT_SET(hexbyte))
						yyextra->saw_non_ascii = true;
				}
<xq,xe,xus>{quotecontinue} {
					/* literal continues after the break: nothing to add */
				}
<xe>.			{
					/* Only reached for a backslash directly before EOF */
					addlitchar(yytext[0], yyscanner);
				}
<xq,xe,xus><<EOF>>		{ yyerror("unterminated quoted string"); }
680 
{dolqdelim}		{
					/* Opening delimiter: keep a copy to match the closer. */
					SET_YYLLOC();
					startlit();
					yyextra->dolqstart = pstrdup(yytext);
					BEGIN(xdolq);
				}
{dolqfailed}	{
					/*
					 * Looked like a delimiter but lacks its closing dollar
					 * sign: consume just the first character and return it
					 * the way the catch-all rule would.
					 */
					SET_YYLLOC();
					yyless(1);
					return yytext[0];
				}
<xdolq>{dolqdelim} {
					if (strcmp(yytext, yyextra->dolqstart) != 0)
					{
						/*
						 * Not our delimiter.  Everything except the final
						 * dollar sign is string content; that last character
						 * is rescanned because it may begin the real closing
						 * delimiter, as in $delim$...$junk$delim$
						 */
						int			keep = yyleng - 1;

						addlit(yytext, keep, yyscanner);
						yyless(keep);
					}
					else
					{
						/* Matching delimiter: the string is complete. */
						pfree(yyextra->dolqstart);
						yyextra->dolqstart = NULL;
						BEGIN(INITIAL);
						yylval->str = litbufdup(yyscanner);
						return SCONST;
					}
				}
<xdolq>{dolqinside} |
<xdolq>{dolqfailed} {
					/* string content, including an unfinished delimiter */
					addlit(yytext, yyleng, yyscanner);
				}
<xdolq>.		{
					/* Only reached for a dollar sign inside the quoted text */
					addlitchar(yytext[0], yyscanner);
				}
<xdolq><<EOF>>	{ yyerror("unterminated dollar-quoted string"); }
725 
{xdstart}		{
					/* start of a double-quoted identifier */
					SET_YYLLOC();
					startlit();
					BEGIN(xd);
				}
{xuistart}		{
					/* start of a double-quoted identifier with Unicode escapes */
					SET_YYLLOC();
					startlit();
					BEGIN(xui);
				}
<xd>{xdstop}	{
					/*
					 * End of a double-quoted identifier: it must not be
					 * empty, and it is truncated once it reaches
					 * NAMEDATALEN bytes.
					 */
					BEGIN(INITIAL);
					if (yyextra->literallen == 0)
						yyerror("zero-length delimited identifier");
					yylval->str = litbufdup(yyscanner);
					if (yyextra->literallen >= NAMEDATALEN)
						truncate_identifier(yylval->str,
											yyextra->literallen,
											true);
					return IDENT;
				}
<xui>{dquote} {
					/* closing quote; xuiend checks for a UESCAPE clause */
					BEGIN(xuiend);
					yyless(1);
				}
<xuiend>{whitespace} {
					/* whitespace does not end the search for UESCAPE */
				}
<xuiend><<EOF>> |
<xuiend>{other} |
<xuiend>{xustop1} {
					/*
					 * No UESCAPE clause: push the whole match back and decode
					 * with backslash as the escape character.
					 */
					char	   *uident;
					int			uident_len;

					yyless(0);

					BEGIN(INITIAL);
					if (yyextra->literallen == 0)
						yyerror("zero-length delimited identifier");
					uident = litbuf_udeescape('\\', yyscanner);
					uident_len = strlen(uident);
					if (uident_len >= NAMEDATALEN)
						truncate_identifier(uident, uident_len, true);
					yylval->str = uident;
					return IDENT;
				}
<xuiend>{xustop2}	{
					/*
					 * UESCAPE clause present: the escape character is the one
					 * just before the final quote of the match.
					 */
					char	   *uident;
					int			uident_len;
					int			escape_pos = yyleng - 2;

					BEGIN(INITIAL);
					if (yyextra->literallen == 0)
						yyerror("zero-length delimited identifier");
					if (!check_uescapechar(yytext[escape_pos]))
					{
						/* point the error at the offending character */
						SET_YYLLOC();
						ADVANCE_YYLLOC(escape_pos);
						yyerror("invalid Unicode escape character");
					}
					uident = litbuf_udeescape(yytext[escape_pos], yyscanner);
					uident_len = strlen(uident);
					if (uident_len >= NAMEDATALEN)
						truncate_identifier(uident, uident_len, true);
					yylval->str = uident;
					return IDENT;
				}
<xd,xui>{xddouble}	{
					/* a doubled double-quote stands for one literal one */
					addlitchar('"', yyscanner);
				}
<xd,xui>{xdinside}	{
					/* identifier text is collected as-is */
					addlit(yytext, yyleng, yyscanner);
				}
<xd,xui><<EOF>>		{ yyerror("unterminated quoted identifier"); }
803 
{xufailed}	{
					/*
					 * A u or U followed by an ampersand that does not go on
					 * to open a Unicode-escape literal.  Consume only the
					 * letter and return it as an ordinary identifier.
					 */
					SET_YYLLOC();
					yyless(1);
					/* yyleng now covers just the single letter */
					yylval->str = downcase_truncate_identifier(yytext,
															   yyleng,
															   true);
					return IDENT;
				}
815 
{typecast}		{ SET_YYLLOC(); return TYPECAST; }

{dot_dot}		{ SET_YYLLOC(); return DOT_DOT; }

{colon_equals}	{ SET_YYLLOC(); return COLON_EQUALS; }

{equals_greater} { SET_YYLLOC(); return EQUALS_GREATER; }

{less_equals}	{ SET_YYLLOC(); return LESS_EQUALS; }

{greater_equals} { SET_YYLLOC(); return GREATER_EQUALS; }

{less_greater}	|
{not_equals}	{
					/* Both "<>" and "!=" are accepted as meaning NOT_EQUALS */
					SET_YYLLOC();
					return NOT_EQUALS;
				}

{self}			{
					/* single-character token: the character is the token code */
					SET_YYLLOC();
					return yytext[0];
				}
862 
{operator}		{
					/*
					 * A comment start (slash-star or dash-dash) embedded in
					 * the matched text ends the operator there.  One at the
					 * very first character is matched by an earlier rule and
					 * never reaches this action.
					 */
					char	   *slash_star = strstr(yytext, "/*");
					char	   *dash_dash = strstr(yytext, "--");
					char	   *comment_start;
					int			oplen = yyleng;

					/* whichever comment start comes first wins */
					if (slash_star == NULL)
						comment_start = dash_dash;
					else if (dash_dash == NULL || slash_star < dash_dash)
						comment_start = slash_star;
					else
						comment_start = dash_dash;

					if (comment_start != NULL)
						oplen = comment_start - yytext;

					/*
					 * For SQL compatibility, a multi-character operator may
					 * end in '+' or '-' only if it also contains a character
					 * that cannot occur in SQL operators.  Thus '=-' lexes as
					 * two operators, while a name such as '?-' stays whole.
					 */
					if (oplen > 1 &&
						(yytext[oplen - 1] == '+' ||
						 yytext[oplen - 1] == '-'))
					{
						bool		has_non_sql_char = false;
						int			pos;

						for (pos = oplen - 2; pos >= 0 && !has_non_sql_char; pos--)
						{
							switch (yytext[pos])
							{
								case '~':
								case '!':
								case '@':
								case '#':
								case '^':
								case '&':
								case '|':
								case '`':
								case '?':
								case '%':
									has_non_sql_char = true;
									break;
								default:
									break;
							}
						}

						if (!has_non_sql_char)
						{
							/* no qualifying character: drop every trailing [+-] */
							while (oplen > 1 &&
								   (yytext[oplen - 1] == '+' ||
									yytext[oplen - 1] == '-'))
								oplen--;
						}
					}

					SET_YYLLOC();

					if (oplen < yyleng)
					{
						/* Give the characters we do not want back to the scanner */
						yyless(oplen);

						/*
						 * A single remaining character that belongs to the
						 * "self" set is returned as a character token, just
						 * as the "self" rule would have done.
						 */
						if (oplen == 1 &&
							strchr(",()[].;:+-*/%^<>=", yytext[0]))
							return yytext[0];

						/*
						 * Two remaining characters that spell ">=", "<=",
						 * "=>", "<>" or "!=" get their dedicated token
						 * instead of the generic Op.
						 */
						if (oplen == 2)
						{
							char		first = yytext[0];
							char		second = yytext[1];

							if (first == '=' && second == '>')
								return EQUALS_GREATER;
							if (first == '>' && second == '=')
								return GREATER_EQUALS;
							if (first == '<' && second == '=')
								return LESS_EQUALS;
							if (first == '<' && second == '>')
								return NOT_EQUALS;
							if (first == '!' && second == '=')
								return NOT_EQUALS;
						}
					}

					/*
					 * An over-long operator is an error rather than a
					 * notice-and-truncate as for identifiers, since it is
					 * most likely a syntactic mistake anyway.
					 */
					if (oplen >= NAMEDATALEN)
						yyerror("operator too long");

					yylval->str = pstrdup(yytext);
					return Op;
				}
970 
{param}			{
					long		paramno;

					SET_YYLLOC();

					/*
					 * Convert the digits that follow the first character of
					 * the token.  strtol() is used rather than atol() so that
					 * a value that does not fit can be detected: silently
					 * wrapping around would make the token refer to some
					 * other parameter number.  The int32 round-trip test is
					 * the same one process_integer_literal() applies; it is
					 * a no-op where long is only 32 bits wide, in which case
					 * ERANGE alone catches the overflow.
					 */
					errno = 0;
					paramno = strtol(yytext + 1, NULL, 10);
					if (errno == ERANGE ||
						paramno != (long) ((int32) paramno))
						yyerror("parameter number too large");

					yylval->ival = (int) paramno;
					return PARAM;
				}
976 
{integer}		{
					/*
					 * A plain digit string.  process_integer_literal()
					 * decides whether it is returned as ICONST or, when it
					 * does not fit in an int32, as FCONST.
					 */
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{decimal}		{
					/* hand the token text to the grammar unconverted */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{decimalfail}	{
					/* throw back the .., and treat as integer */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{real}			{
					/* as for {decimal}: the text is passed on as a string */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{realfail1}		{
					/*
					 * throw back the [Ee], and treat as {decimal}.  Note
					 * that it is possible the input is actually {integer},
					 * but since this case will almost certainly lead to a
					 * syntax error anyway, we don't bother to distinguish.
					 */
					yyless(yyleng - 1);
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{realfail2}		{
					/* throw back the [Ee][+-], and proceed as above */
					yyless(yyleng - 2);
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
1016 
1017 
{identifier}	{
					/*
					 * An unquoted word: return the keyword's own token code
					 * if it is found in the keyword list supplied to
					 * scanner_init(), otherwise return a generic IDENT.
					 */
					const ScanKeyword *keyword;
					char	   *ident;

					SET_YYLLOC();

					/* Is it a keyword? */
					keyword = ScanKeywordLookup(yytext,
												yyextra->keywords,
												yyextra->num_keywords);
					if (keyword != NULL)
					{
						/* semantic value is the keyword table's name entry */
						yylval->keyword = keyword->name;
						return keyword->value;
					}

					/*
					 * No.  Convert the identifier to lower case, and truncate
					 * if necessary.
					 */
					ident = downcase_truncate_identifier(yytext, yyleng, true);
					yylval->str = ident;
					return IDENT;
				}
1042 
{other}			{
					/*
					 * Fallback rule: the first character of the match is
					 * returned as its own token code.
					 */
					SET_YYLLOC();
					return yytext[0];
				}

<<EOF>>			{
					/*
					 * Record the location before terminating, so that an
					 * error reported at end of input has a position to
					 * point at (scanner_yyerror reads *yylloc).
					 */
					SET_YYLLOC();
					yyterminate();
				}
1052 
1053 %%
1054 
/*
 * Arrange access to yyextra for subroutines of the main yylex() function.
 * We expect each subroutine to have a yyscanner parameter.  Rather than
 * use the yyget_xxx functions, which might or might not get inlined by the
 * compiler, we cheat just a bit and cast yyscanner to the right type.
 *
 * Because the macro expands to an expression mentioning "yyscanner", any
 * function below that uses yyextra must have a variable of exactly that
 * name in scope.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

/*
 * Likewise for a couple of other things we need.  The same "yyscanner"
 * naming requirement applies to users of yylloc and yyleng.
 */
#undef yylloc
#define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
#undef yyleng
#define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)
1069 
1070 
1071 /*
1072  * scanner_errposition
1073  *		Report a lexer or grammar error cursor position, if possible.
1074  *
1075  * This is expected to be used within an ereport() call.  The return value
1076  * is a dummy (always 0, in fact).
1077  *
1078  * Note that this can only be used for messages emitted during raw parsing
1079  * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1080  * to still be available.
1081  */
1082 int
1083 scanner_errposition(int location, core_yyscan_t yyscanner)
1084 {
1085 	int			pos;
1086 
1087 	if (location < 0)
1088 		return 0;				/* no-op if location is unknown */
1089 
1090 	/* Convert byte offset to character number */
1091 	pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1092 	/* And pass it to the ereport mechanism */
1093 	return errposition(pos);
1094 }
1095 
1096 /*
1097  * scanner_yyerror
1098  *		Report a lexer or grammar error.
1099  *
1100  * The message's cursor position is whatever YYLLOC was last set to,
1101  * ie, the start of the current token if called within yylex(), or the
1102  * most recently lexed token if called from the grammar.
1103  * This is OK for syntax error messages from the Bison parser, because Bison
1104  * parsers report error as soon as the first unparsable token is reached.
1105  * Beware of using yyerror for other purposes, as the cursor position might
1106  * be misleading!
1107  */
1108 void
1109 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1110 {
1111 	const char *loc = yyextra->scanbuf + *yylloc;
1112 
1113 	if (*loc == YY_END_OF_BUFFER_CHAR)
1114 	{
1115 		ereport(ERROR,
1116 				(errcode(ERRCODE_SYNTAX_ERROR),
1117 		/* translator: %s is typically the translation of "syntax error" */
1118 				 errmsg("%s at end of input", _(message)),
1119 				 lexer_errposition()));
1120 	}
1121 	else
1122 	{
1123 		ereport(ERROR,
1124 				(errcode(ERRCODE_SYNTAX_ERROR),
1125 		/* translator: first %s is typically the translation of "syntax error" */
1126 				 errmsg("%s at or near \"%s\"", _(message), loc),
1127 				 lexer_errposition()));
1128 	}
1129 }
1130 
1131 
1132 /*
1133  * Called before any actual parsing is done
1134  */
1135 core_yyscan_t
1136 scanner_init(const char *str,
1137 			 core_yy_extra_type *yyext,
1138 			 const ScanKeyword *keywords,
1139 			 int num_keywords)
1140 {
1141 	Size		slen = strlen(str);
1142 	yyscan_t	scanner;
1143 
1144 	if (yylex_init(&scanner) != 0)
1145 		elog(ERROR, "yylex_init() failed: %m");
1146 
1147 	core_yyset_extra(yyext, scanner);
1148 
1149 	yyext->keywords = keywords;
1150 	yyext->num_keywords = num_keywords;
1151 
1152 	yyext->backslash_quote = backslash_quote;
1153 	yyext->escape_string_warning = escape_string_warning;
1154 	yyext->standard_conforming_strings = standard_conforming_strings;
1155 
1156 	/*
1157 	 * Make a scan buffer with special termination needed by flex.
1158 	 */
1159 	yyext->scanbuf = (char *) palloc(slen + 2);
1160 	yyext->scanbuflen = slen;
1161 	memcpy(yyext->scanbuf, str, slen);
1162 	yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1163 	yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1164 
1165 	/* initialize literal buffer to a reasonable but expansible size */
1166 	yyext->literalalloc = 1024;
1167 	yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1168 	yyext->literallen = 0;
1169 
1170 	return scanner;
1171 }
1172 
1173 
1174 /*
1175  * Called after parsing is done to clean up after scanner_init()
1176  */
1177 void
1178 scanner_finish(core_yyscan_t yyscanner)
1179 {
1180 	/*
1181 	 * We don't bother to call yylex_destroy(), because all it would do is
1182 	 * pfree a small amount of control storage.  It's cheaper to leak the
1183 	 * storage until the parsing context is destroyed.  The amount of space
1184 	 * involved is usually negligible compared to the output parse tree
1185 	 * anyway.
1186 	 *
1187 	 * We do bother to pfree the scanbuf and literal buffer, but only if they
1188 	 * represent a nontrivial amount of space.  The 8K cutoff is arbitrary.
1189 	 */
1190 	if (yyextra->scanbuflen >= 8192)
1191 		pfree(yyextra->scanbuf);
1192 	if (yyextra->literalalloc >= 8192)
1193 		pfree(yyextra->literalbuf);
1194 }
1195 
1196 
1197 static void
1198 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1199 {
1200 	/* enlarge buffer if needed */
1201 	if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1202 	{
1203 		do
1204 		{
1205 			yyextra->literalalloc *= 2;
1206 		} while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1207 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1208 												yyextra->literalalloc);
1209 	}
1210 	/* append new data */
1211 	memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1212 	yyextra->literallen += yleng;
1213 }
1214 
1215 
1216 static void
1217 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1218 {
1219 	/* enlarge buffer if needed */
1220 	if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1221 	{
1222 		yyextra->literalalloc *= 2;
1223 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1224 												yyextra->literalalloc);
1225 	}
1226 	/* append new data */
1227 	yyextra->literalbuf[yyextra->literallen] = ychar;
1228 	yyextra->literallen += 1;
1229 }
1230 
1231 
1232 /*
1233  * Create a palloc'd copy of literalbuf, adding a trailing null.
1234  */
1235 static char *
1236 litbufdup(core_yyscan_t yyscanner)
1237 {
1238 	int			llen = yyextra->literallen;
1239 	char	   *new;
1240 
1241 	new = palloc(llen + 1);
1242 	memcpy(new, yyextra->literalbuf, llen);
1243 	new[llen] = '\0';
1244 	return new;
1245 }
1246 
1247 static int
1248 process_integer_literal(const char *token, YYSTYPE *lval)
1249 {
1250 	long		val;
1251 	char	   *endptr;
1252 
1253 	errno = 0;
1254 	val = strtol(token, &endptr, 10);
1255 	if (*endptr != '\0' || errno == ERANGE
1256 #ifdef HAVE_LONG_INT_64
1257 	/* if long > 32 bits, check for overflow of int4 */
1258 		|| val != (long) ((int32) val)
1259 #endif
1260 		)
1261 	{
1262 		/* integer too large, treat it as a float */
1263 		lval->str = pstrdup(token);
1264 		return FCONST;
1265 	}
1266 	lval->ival = val;
1267 	return ICONST;
1268 }
1269 
1270 static unsigned int
1271 hexval(unsigned char c)
1272 {
1273 	if (c >= '0' && c <= '9')
1274 		return c - '0';
1275 	if (c >= 'a' && c <= 'f')
1276 		return c - 'a' + 0xA;
1277 	if (c >= 'A' && c <= 'F')
1278 		return c - 'A' + 0xA;
1279 	elog(ERROR, "invalid hexadecimal digit");
1280 	return 0;					/* not reached */
1281 }
1282 
1283 static void
1284 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1285 {
1286 	if (GetDatabaseEncoding() == PG_UTF8)
1287 		return;
1288 
1289 	if (c > 0x7F)
1290 	{
1291 		ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);	/* 3 for U&" */
1292 		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1293 	}
1294 }
1295 
1296 static bool
1297 is_utf16_surrogate_first(pg_wchar c)
1298 {
1299 	return (c >= 0xD800 && c <= 0xDBFF);
1300 }
1301 
1302 static bool
1303 is_utf16_surrogate_second(pg_wchar c)
1304 {
1305 	return (c >= 0xDC00 && c <= 0xDFFF);
1306 }
1307 
1308 static pg_wchar
1309 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1310 {
1311 	return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1312 }
1313 
1314 static void
1315 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1316 {
1317 	char		buf[8];
1318 
1319 	if (c == 0 || c > 0x10FFFF)
1320 		yyerror("invalid Unicode escape value");
1321 	if (c > 0x7F)
1322 	{
1323 		if (GetDatabaseEncoding() != PG_UTF8)
1324 			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1325 		yyextra->saw_non_ascii = true;
1326 	}
1327 	unicode_to_utf8(c, (unsigned char *) buf);
1328 	addlit(buf, pg_mblen(buf), yyscanner);
1329 }
1330 
/*
 * Is 'escape' acceptable as a Unicode escape character (UESCAPE syntax)?
 *
 * Hexadecimal digits, '+', single and double quotes, and whitespace are
 * not acceptable; any other character is.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;
	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	return !scanner_isspace(escape);
}
1346 
1347 /* like litbufdup, but handle unicode escapes */
1348 static char *
1349 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1350 {
1351 	char	   *new;
1352 	char	   *litbuf,
1353 			   *in,
1354 			   *out;
1355 	pg_wchar	pair_first = 0;
1356 
1357 	/* Make literalbuf null-terminated to simplify the scanning loop */
1358 	litbuf = yyextra->literalbuf;
1359 	litbuf[yyextra->literallen] = '\0';
1360 
1361 	/*
1362 	 * This relies on the subtle assumption that a UTF-8 expansion cannot be
1363 	 * longer than its escaped representation.
1364 	 */
1365 	new = palloc(yyextra->literallen + 1);
1366 
1367 	in = litbuf;
1368 	out = new;
1369 	while (*in)
1370 	{
1371 		if (in[0] == escape)
1372 		{
1373 			if (in[1] == escape)
1374 			{
1375 				if (pair_first)
1376 				{
1377 					ADVANCE_YYLLOC(in - litbuf + 3);	/* 3 for U&" */
1378 					yyerror("invalid Unicode surrogate pair");
1379 				}
1380 				*out++ = escape;
1381 				in += 2;
1382 			}
1383 			else if (isxdigit((unsigned char) in[1]) &&
1384 					 isxdigit((unsigned char) in[2]) &&
1385 					 isxdigit((unsigned char) in[3]) &&
1386 					 isxdigit((unsigned char) in[4]))
1387 			{
1388 				pg_wchar	unicode;
1389 
1390 				unicode = (hexval(in[1]) << 12) +
1391 					(hexval(in[2]) << 8) +
1392 					(hexval(in[3]) << 4) +
1393 					hexval(in[4]);
1394 				check_unicode_value(unicode, in, yyscanner);
1395 				if (pair_first)
1396 				{
1397 					if (is_utf16_surrogate_second(unicode))
1398 					{
1399 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1400 						pair_first = 0;
1401 					}
1402 					else
1403 					{
1404 						ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1405 						yyerror("invalid Unicode surrogate pair");
1406 					}
1407 				}
1408 				else if (is_utf16_surrogate_second(unicode))
1409 					yyerror("invalid Unicode surrogate pair");
1410 
1411 				if (is_utf16_surrogate_first(unicode))
1412 					pair_first = unicode;
1413 				else
1414 				{
1415 					unicode_to_utf8(unicode, (unsigned char *) out);
1416 					out += pg_mblen(out);
1417 				}
1418 				in += 5;
1419 			}
1420 			else if (in[1] == '+' &&
1421 					 isxdigit((unsigned char) in[2]) &&
1422 					 isxdigit((unsigned char) in[3]) &&
1423 					 isxdigit((unsigned char) in[4]) &&
1424 					 isxdigit((unsigned char) in[5]) &&
1425 					 isxdigit((unsigned char) in[6]) &&
1426 					 isxdigit((unsigned char) in[7]))
1427 			{
1428 				pg_wchar	unicode;
1429 
1430 				unicode = (hexval(in[2]) << 20) +
1431 					(hexval(in[3]) << 16) +
1432 					(hexval(in[4]) << 12) +
1433 					(hexval(in[5]) << 8) +
1434 					(hexval(in[6]) << 4) +
1435 					hexval(in[7]);
1436 				check_unicode_value(unicode, in, yyscanner);
1437 				if (pair_first)
1438 				{
1439 					if (is_utf16_surrogate_second(unicode))
1440 					{
1441 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1442 						pair_first = 0;
1443 					}
1444 					else
1445 					{
1446 						ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1447 						yyerror("invalid Unicode surrogate pair");
1448 					}
1449 				}
1450 				else if (is_utf16_surrogate_second(unicode))
1451 					yyerror("invalid Unicode surrogate pair");
1452 
1453 				if (is_utf16_surrogate_first(unicode))
1454 					pair_first = unicode;
1455 				else
1456 				{
1457 					unicode_to_utf8(unicode, (unsigned char *) out);
1458 					out += pg_mblen(out);
1459 				}
1460 				in += 8;
1461 			}
1462 			else
1463 			{
1464 				ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1465 				yyerror("invalid Unicode escape value");
1466 			}
1467 		}
1468 		else
1469 		{
1470 			if (pair_first)
1471 			{
1472 				ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1473 				yyerror("invalid Unicode surrogate pair");
1474 			}
1475 			*out++ = *in++;
1476 		}
1477 	}
1478 
1479 	/* unfinished surrogate pair? */
1480 	if (pair_first)
1481 	{
1482 		ADVANCE_YYLLOC(in - litbuf + 3);				/* 3 for U&" */
1483 		yyerror("invalid Unicode surrogate pair");
1484 	}
1485 
1486 	*out = '\0';
1487 
1488 	/*
1489 	 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1490 	 * codes; but it's probably not worth the trouble, since this isn't likely
1491 	 * to be a performance-critical path.
1492 	 */
1493 	pg_verifymbstr(new, out - new, false);
1494 	return new;
1495 }
1496 
1497 static unsigned char
1498 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1499 {
1500 	switch (c)
1501 	{
1502 		case 'b':
1503 			return '\b';
1504 		case 'f':
1505 			return '\f';
1506 		case 'n':
1507 			return '\n';
1508 		case 'r':
1509 			return '\r';
1510 		case 't':
1511 			return '\t';
1512 		default:
1513 			/* check for backslash followed by non-7-bit-ASCII */
1514 			if (c == '\0' || IS_HIGHBIT_SET(c))
1515 				yyextra->saw_non_ascii = true;
1516 
1517 			return c;
1518 	}
1519 }
1520 
1521 static void
1522 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1523 {
1524 	if (ychar == '\'')
1525 	{
1526 		if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1527 			ereport(WARNING,
1528 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1529 					 errmsg("nonstandard use of \\' in a string literal"),
1530 					 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1531 					 lexer_errposition()));
1532 		yyextra->warn_on_first_escape = false;	/* warn only once per string */
1533 	}
1534 	else if (ychar == '\\')
1535 	{
1536 		if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1537 			ereport(WARNING,
1538 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1539 					 errmsg("nonstandard use of \\\\ in a string literal"),
1540 					 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1541 					 lexer_errposition()));
1542 		yyextra->warn_on_first_escape = false;	/* warn only once per string */
1543 	}
1544 	else
1545 		check_escape_warning(yyscanner);
1546 }
1547 
1548 static void
1549 check_escape_warning(core_yyscan_t yyscanner)
1550 {
1551 	if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1552 		ereport(WARNING,
1553 				(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1554 				 errmsg("nonstandard use of escape in a string literal"),
1555 		errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1556 				 lexer_errposition()));
1557 	yyextra->warn_on_first_escape = false;		/* warn only once per string */
1558 }
1559 
1560 /*
1561  * Interface functions to make flex use palloc() instead of malloc().
1562  * It'd be better to make these static, but flex insists otherwise.
1563  */
1564 
1565 void *
1566 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1567 {
1568 	return palloc(bytes);
1569 }
1570 
1571 void *
1572 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1573 {
1574 	if (ptr)
1575 		return repalloc(ptr, bytes);
1576 	else
1577 		return palloc(bytes);
1578 }
1579 
1580 void
1581 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1582 {
1583 	if (ptr)
1584 		pfree(ptr);
1585 }
1586