1 %top{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l
5  *	  lexical scanner for PostgreSQL
6  *
7  * NOTE NOTE NOTE:
8  *
9  * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l
10  * and src/interfaces/ecpg/preproc/pgc.l!
11  *
12  * The rules are designed so that the scanner never has to backtrack,
13  * in the sense that there is always a rule that can match the input
14  * consumed so far (the rule action may internally throw back some input
15  * with yyless(), however).  As explained in the flex manual, this makes
16  * for a useful speed increase --- about a third faster than a plain -CF
17  * lexer, in simple testing.  The extra complexity is mostly in the rules
18  * for handling float numbers and continued string literals.  If you change
19  * the lexical rules, verify that you haven't broken the no-backtrack
20  * property by running flex with the "-b" option and checking that the
21  * resulting "lex.backup" file says that no backing up is needed.  (As of
22  * Postgres 9.2, this check is made automatically by the Makefile.)
23  *
24  *
25  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
26  * Portions Copyright (c) 1994, Regents of the University of California
27  *
28  * IDENTIFICATION
29  *	  src/backend/parser/scan.l
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include <ctype.h>
36 #include <unistd.h>
37 
38 #include "common/string.h"
39 #include "parser/gramparse.h"
40 #include "parser/parser.h"		/* only needed for GUC variables */
41 #include "parser/scansup.h"
42 #include "mb/pg_wchar.h"
43 }
44 
45 %{
46 
47 /* LCOV_EXCL_START */
48 
49 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error): fprintf() calls below this point are rerouted to fprintf_to_ereport() */
50 #undef fprintf
51 #define fprintf(file, fmt, msg)  fprintf_to_ereport(fmt, msg)
52 
53 static void
54 fprintf_to_ereport(const char *fmt, const char *msg)
55 {
56 	ereport(ERROR, (errmsg_internal("%s", msg)));	/* fmt is not used; only msg is reported */
57 }
58 
59 /*
60  * GUC variables.  This is a DIRECT violation of the warning given at the
61  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
62  * as such, changing their values can induce very unintuitive behavior.
63  * But we shall have to live with it until we can remove these variables.
64  */
65 int			backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
66 bool		escape_string_warning = true;
67 bool		standard_conforming_strings = true;
68 
69 /*
70  * Constant data exported from this file.  This array maps from the
71  * zero-based keyword numbers returned by ScanKeywordLookup to the
72  * Bison token numbers needed by gram.y.  This is exported because
73  * callers need to pass it to scanner_init, if they are using the
74  * standard keyword list ScanKeywords.
75  */
76 #define PG_KEYWORD(kwname, value, category) value,
77 
78 const uint16 ScanKeywordTokens[] = {
79 #include "parser/kwlist.h"
80 };
81 
82 #undef PG_KEYWORD
83 
84 /*
85  * Set the type of YYSTYPE.
86  */
87 #define YYSTYPE core_YYSTYPE
88 
89 /*
90  * Set the type of yyextra.  All state variables used by the scanner should
91  * be in yyextra, *not* statically allocated.
92  */
93 #define YY_EXTRA_TYPE core_yy_extra_type *
94 
95 /*
96  * Each call to yylex must set yylloc to the location of the found token
97  * (expressed as a byte offset from the start of the input text).
98  * When we parse a token that requires multiple lexer rules to process,
99  * this should be done in the first such rule, else yylloc will point
100  * into the middle of the token.
101  */
102 #define SET_YYLLOC()  (*(yylloc) = yytext - yyextra->scanbuf)
103 
104 /*
105  * Advance yylloc by the given number of bytes.
106  */
107 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )
108 
109 #define startlit()	( yyextra->literallen = 0 )
110 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
111 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
112 static char *litbufdup(core_yyscan_t yyscanner);
113 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
114 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
115 static int	process_integer_literal(const char *token, YYSTYPE *lval);
116 static bool is_utf16_surrogate_first(pg_wchar c);
117 static bool is_utf16_surrogate_second(pg_wchar c);
118 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
119 static void addunicode(pg_wchar c, yyscan_t yyscanner);
120 static bool check_uescapechar(unsigned char escape);
121 
122 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
123 
124 #define lexer_errposition()  scanner_errposition(*(yylloc), yyscanner)
125 
126 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
127 static void check_escape_warning(core_yyscan_t yyscanner);
128 
129 /*
130  * Work around a bug in flex 2.5.35: it emits a couple of functions that
131  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
132  * this would cause warnings.  Providing our own declarations should be
133  * harmless even when the bug gets fixed.
134  */
135 extern int	core_yyget_column(yyscan_t yyscanner);
136 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
137 
138 %}
139 
140 %option reentrant
141 %option bison-bridge
142 %option bison-locations
143 %option 8bit
144 %option never-interactive
145 %option nodefault
146 %option noinput
147 %option nounput
148 %option noyywrap
149 %option noyyalloc
150 %option noyyrealloc
151 %option noyyfree
152 %option warn
153 %option prefix="core_yy"
154 
155 /*
156  * OK, here is a short description of lex/flex rules behavior.
157  * The longest pattern which matches an input string is always chosen.
158  * For equal-length patterns, the first occurring in the rules list is chosen.
159  * INITIAL is the starting state, to which all non-conditional rules apply.
160  * Exclusive states change parsing rules while the state is active.  When in
161  * an exclusive state, only those rules defined for that state apply.
162  *
163  * We use exclusive states for quoted strings, extended comments,
164  * and to eliminate parsing troubles for numeric strings.
165  * Exclusive states:
166  *  <xb> bit string literal
167  *  <xc> extended C-style comments
168  *  <xd> delimited identifiers (double-quoted identifiers)
169  *  <xh> hexadecimal numeric string
170  *  <xq> standard quoted strings
171  *  <xe> extended quoted strings (support backslash escape sequences)
172  *  <xdolq> $foo$ quoted strings
173  *  <xui> quoted identifier with Unicode escapes
174  *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
175  *  <xus> quoted string with Unicode escapes
176  *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
177  *  <xeu> Unicode surrogate pair in extended quoted string
178  *
179  * Remember to add an <<EOF>> case whenever you add a new exclusive state!
180  * The default one is probably not the right thing.
181  */
182 
183 %x xb
184 %x xc
185 %x xd
186 %x xh
187 %x xq
188 %x xe
189 %x xdolq
190 %x xui
191 %x xuiend
192 %x xus
193 %x xusend
194 %x xeu
195 
196 /*
197  * In order to make the world safe for Windows and Mac clients as well as
198  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
199  * sequence will be seen as two successive newlines, but that doesn't cause
200  * any problems.  Comments that start with -- and extend to the next
201  * newline are treated as equivalent to a single whitespace character.
202  *
203  * NOTE a fine point: if there is no newline following --, we will absorb
204  * everything to the end of the input as a comment.  This is correct.  Older
205  * versions of Postgres failed to recognize -- as a comment if the input
206  * did not end with a newline.
207  *
208  * XXX perhaps \f (formfeed) should be treated as a newline as well?
209  *
210  * XXX if you change the set of whitespace characters, fix scanner_isspace()
211  * to agree.
212  */
213 
214 space			[ \t\n\r\f]
215 horiz_space		[ \t\f]
216 newline			[\n\r]
217 non_newline		[^\n\r]
218 
219 comment			("--"{non_newline}*)
220 
221 whitespace		({space}+|{comment})
222 
223 /*
224  * SQL requires at least one newline in the whitespace separating
225  * string literals that are to be concatenated.  Silly, but who are we
226  * to argue?  Note that {whitespace_with_newline} should not have * after
227  * it, whereas {whitespace} should generally have a * after it...
228  */
229 
230 special_whitespace		({space}+|{comment}{newline})
231 horiz_whitespace		({horiz_space}|{comment})
232 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
233 
234 /*
235  * To ensure that {quotecontinue} can be scanned without having to back up
236  * if the full pattern isn't matched, we include trailing whitespace in
237  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
238  * except for {quote} followed by whitespace and just one "-" (not two,
239  * which would start a {comment}).  To cover that we have {quotefail}.
240  * The actions for {quotestop} and {quotefail} must throw back characters
241  * beyond the quote proper.
242  */
243 quote			'
244 quotestop		{quote}{whitespace}*
245 quotecontinue	{quote}{whitespace_with_newline}{quote}
246 quotefail		{quote}{whitespace}*"-"
247 
248 /* Bit string
249  * It is tempting to scan the string for only those characters
250  * which are allowed. However, this leads to silently swallowed
251  * characters if illegal characters are included in the string.
252  * For example, if xbinside is [01] then B'ABCD' is interpreted
253  * as a zero-length string, and the ABCD' is lost!
254  * Better to pass the string forward and let the input routines
255  * validate the contents.
256  */
257 xbstart			[bB]{quote}
258 xbinside		[^']*
259 
260 /* Hexadecimal number */
261 xhstart			[xX]{quote}
262 xhinside		[^']*
263 
264 /* National character */
265 xnstart			[nN]{quote}
266 
267 /* Quoted string that allows backslash escapes */
268 xestart			[eE]{quote}
269 xeinside		[^\\']+
270 xeescape		[\\][^0-7]
271 xeoctesc		[\\][0-7]{1,3}
272 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
273 xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
274 xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
275 
276 /* Extended quote
277  * xqdouble implements embedded quote, ''''
278  */
279 xqstart			{quote}
280 xqdouble		{quote}{quote}
281 xqinside		[^']+
282 
283 /* $foo$ style quotes ("dollar quoting")
284  * The quoted string starts with $foo$ where "foo" is an optional string
285  * in the form of an identifier, except that it may not contain "$",
286  * and extends to the first occurrence of an identical string.
287  * There is *no* processing of the quoted text.
288  *
289  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
290  * fails to match its trailing "$".
291  */
292 dolq_start		[A-Za-z\200-\377_]
293 dolq_cont		[A-Za-z\200-\377_0-9]
294 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
295 dolqfailed		\${dolq_start}{dolq_cont}*
296 dolqinside		[^$]+
297 
298 /* Double quote
299  * Allows embedded spaces and other special characters into identifiers.
300  */
301 dquote			\"
302 xdstart			{dquote}
303 xdstop			{dquote}
304 xddouble		{dquote}{dquote}
305 xdinside		[^"]+
306 
307 /* Unicode escapes */
308 uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
309 /* error rule to avoid backup */
310 uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
311 
312 /* Quoted identifier with Unicode escapes */
313 xuistart		[uU]&{dquote}
314 
315 /* Quoted string with Unicode escapes */
316 xusstart		[uU]&{quote}
317 
318 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
319 xustop1		{uescapefail}?
320 xustop2		{uescape}
321 
322 /* error rule to avoid backup */
323 xufailed		[uU]&
324 
325 
326 /* C-style comments
327  *
328  * The "extended comment" syntax closely resembles allowable operator syntax.
329  * The tricky part here is to get lex to recognize a string starting with
330  * slash-star as a comment, when interpreting it as an operator would produce
331  * a longer match --- remember lex will prefer a longer match!  Also, if we
332  * have something like plus-slash-star, lex will think this is a 3-character
333  * operator whereas we want to see it as a + operator and a comment start.
334  * The solution is two-fold:
335  * 1. append {op_chars}* to xcstart so that it matches as much text as
336  *    {operator} would. Then the tie-breaker (first matching rule of same
337  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
338  *    in case it contains a star-slash that should terminate the comment.
339  * 2. In the operator rule, check for slash-star within the operator, and
340  *    if found throw it back with yyless().  This handles the plus-slash-star
341  *    problem.
342  * Dash-dash comments have similar interactions with the operator rule.
343  */
344 xcstart			\/\*{op_chars}*
345 xcstop			\*+\/
346 xcinside		[^*/]+
347 
348 digit			[0-9]
349 ident_start		[A-Za-z\200-\377_]
350 ident_cont		[A-Za-z\200-\377_0-9\$]
351 
352 identifier		{ident_start}{ident_cont}*
353 
354 /* Assorted special-case operators and operator-like tokens */
355 typecast		"::"
356 dot_dot			\.\.
357 colon_equals	":="
358 
359 /*
360  * These operator-like tokens (unlike the above ones) also match the {operator}
361  * rule, which means that they might be overridden by a longer match if they
362  * are followed by a comment start or a + or - character. Accordingly, if you
363  * add to this list, you must also add corresponding code to the {operator}
364  * block to return the correct token in such cases. (This is not needed in
365  * psqlscan.l since the token value is ignored there.)
366  */
367 equals_greater	"=>"
368 less_equals		"<="
369 greater_equals	">="
370 less_greater	"<>"
371 not_equals		"!="
372 
373 /*
374  * "self" is the set of chars that should be returned as single-character
375  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
376  * which can be one or more characters long (but if a single-char token
377  * appears in the "self" set, it is not to be returned as an Op).  Note
378  * that the sets overlap, but each has some chars that are not in the other.
379  *
380  * If you change either set, adjust the character lists appearing in the
381  * rule for "operator"!
382  */
383 self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
384 op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
385 operator		{op_chars}+
386 
387 /* We no longer allow unary minus in numbers.
388  * Instead we pass it separately to the parser, where it gets
389  * coerced via doNegate() -- Leon aug 20 1999
390  *
391  * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
392  *
393  * {realfail1} and {realfail2} are added to prevent the need for scanner
394  * backup when the {real} rule fails to match completely.
395  */
396 
397 integer			{digit}+
398 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
399 decimalfail		{digit}+\.\.
400 real			({integer}|{decimal})[Ee][-+]?{digit}+
401 realfail1		({integer}|{decimal})[Ee]
402 realfail2		({integer}|{decimal})[Ee][-+]
403 
404 param			\${integer}
405 
406 other			.
407 
408 /*
409  * Dollar quoted strings are totally opaque, and no escaping is done on them.
410  * Other quoted strings must allow some special characters such as single-quote
411  *  and newline.
412  * Embedded single-quotes are implemented both in the SQL standard
413  *  style of two adjacent single quotes "''" and in the Postgres/Java style
414  *  of escaped-quote "\'".
415  * Other embedded escaped characters are matched explicitly and the leading
416  *  backslash is dropped from the string.
417  * Note that xcstart must appear before operator, as explained above!
418  *  Also whitespace (comment) must appear before operator.
419  */
420 
421 %%
422 
423 {whitespace}	{
424 					/* ignore: runs of space characters and -- comments produce no token */
425 				}
426 
427 {xcstart}		{
428 					/* Set location in case of syntax error in comment */
429 					SET_YYLLOC();
430 					yyextra->xcdepth = 0;	/* outermost comment: nesting depth starts at zero */
431 					BEGIN(xc);
432 					/* Put back any characters past slash-star; see above */
433 					yyless(2);
434 				}
435 
436 <xc>{
437 {xcstart}		{
438 					(yyextra->xcdepth)++;	/* nested comment start: one more level to close */
439 					/* Put back any characters past slash-star; see above */
440 					yyless(2);
441 				}
442 
443 {xcstop}		{
444 					if (yyextra->xcdepth <= 0)
445 						BEGIN(INITIAL);	/* that closed the outermost comment */
446 					else
447 						(yyextra->xcdepth)--;	/* closed a nested comment; remain in <xc> */
448 				}
449 
450 {xcinside}		{
451 					/* ignore: comment text containing neither star nor slash */
452 				}
453 
454 {op_chars}		{
455 					/* ignore: a single operator character that starts no comment delimiter */
456 				}
457 
458 \*+				{
459 					/* ignore: a run of stars; if a slash followed, the longer {xcstop} match would win */
460 				}
461 
462 <<EOF>>			{
463 					yyerror("unterminated /* comment");
464 				}
465 } /* <xc> */
466 
467 {xbstart}		{
468 					/* Binary bit type.
469 					 * At some point we should simply pass the string
470 					 * forward to the parser and label it there.
471 					 * In the meantime, place a leading "b" on the string
472 					 * to mark it for the input routine as a binary string.
473 					 */
474 					SET_YYLLOC();
475 					BEGIN(xb);
476 					startlit();	/* resets yyextra->literallen to zero */
477 					addlitchar('b', yyscanner);
478 				}
479 <xb>{quotestop}	|
480 <xb>{quotefail} {
481 					yyless(1);	/* keep only the closing quote; whatever else was matched is rescanned */
482 					BEGIN(INITIAL);
483 					yylval->str = litbufdup(yyscanner);
484 					return BCONST;
485 				}
486 <xh>{xhinside}	|
487 <xb>{xbinside}	{
488 					addlit(yytext, yyleng, yyscanner);	/* appended as-is; the characters are not validated here */
489 				}
490 <xh>{quotecontinue}	|
491 <xb>{quotecontinue}	{
492 					/* ignore: quote, whitespace containing a newline, quote -- the literal continues */
493 				}
494 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
495 
496 {xhstart}		{
497 					/* Hexadecimal bit type.
498 					 * At some point we should simply pass the string
499 					 * forward to the parser and label it there.
500 					 * In the meantime, place a leading "x" on the string
501 					 * to mark it for the input routine as a hex string.
502 					 */
503 					SET_YYLLOC();
504 					BEGIN(xh);
505 					startlit();	/* resets yyextra->literallen to zero */
506 					addlitchar('x', yyscanner);
507 				}
508 <xh>{quotestop}	|
509 <xh>{quotefail} {
510 					yyless(1);	/* keep only the closing quote; whatever else was matched is rescanned */
511 					BEGIN(INITIAL);
512 					yylval->str = litbufdup(yyscanner);
513 					return XCONST;
514 				}
515 <xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }
516 
517 {xnstart}		{
518 					/* National character.
519 					 * We will pass this along as a normal character string,
520 					 * but preceded with an internally-generated "NCHAR".
521 					 */
522 					int		kwnum;
523 
524 					SET_YYLLOC();
525 					yyless(1);	/* eat only 'n' this time; the quote is put back and rescanned */
526 
527 					kwnum = ScanKeywordLookup("nchar",
528 											  yyextra->keywordlist);
529 					if (kwnum >= 0)	/* "nchar" was found in the scanner's keyword list */
530 					{
531 						yylval->keyword = GetScanKeyword(kwnum,
532 														 yyextra->keywordlist);
533 						return yyextra->keyword_tokens[kwnum];
534 					}
535 					else
536 					{
537 						/* If NCHAR isn't a keyword, just return "n" */
538 						yylval->str = pstrdup("n");
539 						return IDENT;
540 					}
541 				}
542 
543 {xqstart}		{
544 					yyextra->warn_on_first_escape = true;
545 					yyextra->saw_non_ascii = false;	/* set by octal/hex escapes; tested when the string ends */
546 					SET_YYLLOC();
547 					if (yyextra->standard_conforming_strings)
548 						BEGIN(xq);	/* no backslash-escape rules apply in <xq> */
549 					else
550 						BEGIN(xe);	/* backslash escapes are processed in <xe> */
551 					startlit();
552 				}
553 {xestart}		{
554 					yyextra->warn_on_first_escape = false;
555 					yyextra->saw_non_ascii = false;	/* set by octal/hex escapes; tested when the string ends */
556 					SET_YYLLOC();
557 					BEGIN(xe);	/* E'...' always uses the escape-processing state */
558 					startlit();
559 				}
560 {xusstart}		{
561 					SET_YYLLOC();
562 					if (!yyextra->standard_conforming_strings)
563 						ereport(ERROR,
564 								(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
565 								 errmsg("unsafe use of string constant with Unicode escapes"),
566 								 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
567 								 lexer_errposition()));
568 					BEGIN(xus);	/* text is collected raw; de-escaping happens in the <xusend> rules */
569 					startlit();
570 				}
571 <xq,xe>{quotestop}	|
572 <xq,xe>{quotefail} {
573 					yyless(1);	/* keep only the closing quote; whatever else was matched is rescanned */
574 					BEGIN(INITIAL);
575 					/*
576 					 * check that the data remains valid if it might have been
577 					 * made invalid by unescaping any chars.
578 					 */
579 					if (yyextra->saw_non_ascii)
580 						pg_verifymbstr(yyextra->literalbuf,
581 									   yyextra->literallen,
582 									   false);
583 					yylval->str = litbufdup(yyscanner);
584 					return SCONST;
585 				}
586 <xus>{quotestop} |
587 <xus>{quotefail} {
588 					/* throw back all but the quote */
589 					yyless(1);
590 					/* xusend state looks for possible UESCAPE */
591 					BEGIN(xusend);
592 				}
593 <xusend>{whitespace} {
594 					/* stay in xusend state over whitespace */
595 				}
596 <xusend><<EOF>> |
597 <xusend>{other} |
598 <xusend>{xustop1} {
599 					/* no UESCAPE after the quote, throw back everything */
600 					yyless(0);
601 					BEGIN(INITIAL);
602 					yylval->str = litbuf_udeescape('\\', yyscanner);	/* backslash is the escape character when none is given */
603 					return SCONST;
604 				}
605 <xusend>{xustop2} {
606 					/* found UESCAPE after the end quote */
607 					BEGIN(INITIAL);
608 					if (!check_uescapechar(yytext[yyleng - 2]))	/* the escape char sits just before the final quote */
609 					{
610 						SET_YYLLOC();
611 						ADVANCE_YYLLOC(yyleng - 2);	/* make the error position point at the escape character */
612 						yyerror("invalid Unicode escape character");
613 					}
614 					yylval->str = litbuf_udeescape(yytext[yyleng - 2],
615 												   yyscanner);
616 					return SCONST;
617 				}
618 <xq,xe,xus>{xqdouble} {
619 					addlitchar('\'', yyscanner);	/* two adjacent quotes stand for one literal quote */
620 				}
621 <xq,xus>{xqinside}  {
622 					addlit(yytext, yyleng, yyscanner);	/* any run of non-quote characters, backslashes included */
623 				}
624 <xe>{xeinside}  {
625 					addlit(yytext, yyleng, yyscanner);	/* run containing neither quote nor backslash */
626 				}
627 <xe>{xeunicode} {
628 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);	/* skip the backslash and the u/U */
629 
630 					check_escape_warning(yyscanner);
631 
632 					if (is_utf16_surrogate_first(c))
633 					{
634 						yyextra->utf16_first_part = c;	/* save the first half; <xeu> accepts only another escape next */
635 						BEGIN(xeu);
636 					}
637 					else if (is_utf16_surrogate_second(c))
638 						yyerror("invalid Unicode surrogate pair");
639 					else
640 						addunicode(c, yyscanner);
641 				}
642 <xeu>{xeunicode} {
643 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);	/* skip the backslash and the u/U */
644 
645 					if (!is_utf16_surrogate_second(c))
646 						yyerror("invalid Unicode surrogate pair");
647 
648 					c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
649 
650 					addunicode(c, yyscanner);
651 
652 					BEGIN(xe);	/* pair handled; back to ordinary extended-string scanning */
653 				}
654 <xeu>.			{ yyerror("invalid Unicode surrogate pair"); }
655 <xeu>\n			{ yyerror("invalid Unicode surrogate pair"); }
656 <xeu><<EOF>>	{ yyerror("invalid Unicode surrogate pair"); }
657 <xe,xeu>{xeunicodefail}	{
658 					ereport(ERROR,
659 							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
660 							 errmsg("invalid Unicode escape"),
661 							 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
662 							 lexer_errposition()));
663 				}
664 <xe>{xeescape}  {
665 					if (yytext[1] == '\'')	/* backslash-quote: allowed only under the backslash_quote setting tested below */
666 					{
667 						if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
668 							(yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
669 							 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
670 							ereport(ERROR,
671 									(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
672 									 errmsg("unsafe use of \\' in a string literal"),
673 									 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
674 									 lexer_errposition()));
675 					}
676 					check_string_escape_warning(yytext[1], yyscanner);
677 					addlitchar(unescape_single_char(yytext[1], yyscanner),	/* yytext[1] is the character after the backslash */
678 							   yyscanner);
679 				}
680 <xe>{xeoctesc}  {
681 					unsigned char c = strtoul(yytext + 1, NULL, 8);	/* digits start right after the backslash */
682 
683 					check_escape_warning(yyscanner);
684 					addlitchar(c, yyscanner);
685 					if (c == '\0' || IS_HIGHBIT_SET(c))	/* flag it so the end-of-string rule runs pg_verifymbstr() */
686 						yyextra->saw_non_ascii = true;
687 				}
688 <xe>{xehexesc}  {
689 					unsigned char c = strtoul(yytext + 2, NULL, 16);	/* digits start after the backslash and 'x' */
690 
691 					check_escape_warning(yyscanner);
692 					addlitchar(c, yyscanner);
693 					if (c == '\0' || IS_HIGHBIT_SET(c))	/* flag it so the end-of-string rule runs pg_verifymbstr() */
694 						yyextra->saw_non_ascii = true;
695 				}
696 <xq,xe,xus>{quotecontinue} {
697 					/* ignore: quote, whitespace containing a newline, quote -- the literal continues */
698 				}
699 <xe>.			{
700 					/* This is only needed for \ just before EOF */
701 					addlitchar(yytext[0], yyscanner);
702 				}
703 <xq,xe,xus><<EOF>>		{ yyerror("unterminated quoted string"); }
704 
705 {dolqdelim}		{
706 					SET_YYLLOC();
707 					yyextra->dolqstart = pstrdup(yytext);	/* keep the whole opening delimiter to compare against the closer */
708 					BEGIN(xdolq);
709 					startlit();
710 				}
711 {dolqfailed}	{
712 					SET_YYLLOC();
713 					/* throw back all but the initial "$" */
714 					yyless(1);
715 					/* and treat it as {other} */
716 					return yytext[0];
717 				}
718 <xdolq>{dolqdelim} {
719 					if (strcmp(yytext, yyextra->dolqstart) == 0)	/* same delimiter as the opener: the string ends here */
720 					{
721 						pfree(yyextra->dolqstart);
722 						yyextra->dolqstart = NULL;
723 						BEGIN(INITIAL);
724 						yylval->str = litbufdup(yyscanner);
725 						return SCONST;
726 					}
727 					else
728 					{
729 						/*
730 						 * When we fail to match $...$ to dolqstart, transfer
731 						 * the $... part to the output, but put back the final
732 						 * $ for rescanning.  Consider $delim$...$junk$delim$
733 						 */
734 						addlit(yytext, yyleng - 1, yyscanner);
735 						yyless(yyleng - 1);
736 					}
737 				}
738 <xdolq>{dolqinside} {
739 					addlit(yytext, yyleng, yyscanner);	/* run of characters containing no "$" */
740 				}
741 <xdolq>{dolqfailed} {
742 					addlit(yytext, yyleng, yyscanner);	/* "$" plus identifier text with no closing "$": plain content */
743 				}
744 <xdolq>.		{
745 					/* This is only needed for $ inside the quoted text */
746 					addlitchar(yytext[0], yyscanner);
747 				}
748 <xdolq><<EOF>>	{ yyerror("unterminated dollar-quoted string"); }
749 
750 {xdstart}		{
751 					SET_YYLLOC();
752 					BEGIN(xd);
753 					startlit();	/* resets yyextra->literallen to zero */
754 				}
755 {xuistart}		{
756 					SET_YYLLOC();
757 					BEGIN(xui);	/* like <xd>, but the closing quote leads to <xuiend> */
758 					startlit();
759 				}
760 <xd>{xdstop}	{
761 					char	   *ident;
762 
763 					BEGIN(INITIAL);
764 					if (yyextra->literallen == 0)	/* nothing between the double quotes */
765 						yyerror("zero-length delimited identifier");
766 					ident = litbufdup(yyscanner);
767 					if (yyextra->literallen >= NAMEDATALEN)	/* NAMEDATALEN bytes or more: hand to truncate_identifier() */
768 						truncate_identifier(ident, yyextra->literallen, true);
769 					yylval->str = ident;
770 					return IDENT;
771 				}
772 <xui>{dquote} {
773 					yyless(1);	/* the match is the single quote character, so this throws nothing back */
774 					/* xuiend state looks for possible UESCAPE */
775 					BEGIN(xuiend);
776 				}
777 <xuiend>{whitespace} {
778 					/* stay in xuiend state over whitespace */
779 				}
780 <xuiend><<EOF>> |
781 <xuiend>{other} |
782 <xuiend>{xustop1} {
783 					/* no UESCAPE after the quote, throw back everything */
784 					char	   *ident;
785 					int			identlen;
786 
787 					yyless(0);
788 
789 					BEGIN(INITIAL);
790 					if (yyextra->literallen == 0)
791 						yyerror("zero-length delimited identifier");
792 					ident = litbuf_udeescape('\\', yyscanner);	/* backslash is the escape character when none is given */
793 					identlen = strlen(ident);	/* length of the de-escaped result, not of the raw literal buffer */
794 					if (identlen >= NAMEDATALEN)
795 						truncate_identifier(ident, identlen, true);
796 					yylval->str = ident;
797 					return IDENT;
798 				}
799 <xuiend>{xustop2}	{
800 					/* found UESCAPE after the end quote */
801 					char	   *ident;
802 					int			identlen;
803 
804 					BEGIN(INITIAL);
805 					if (yyextra->literallen == 0)
806 						yyerror("zero-length delimited identifier");
807 					if (!check_uescapechar(yytext[yyleng - 2]))	/* the escape char sits just before the final quote */
808 					{
809 						SET_YYLLOC();
810 						ADVANCE_YYLLOC(yyleng - 2);	/* make the error position point at the escape character */
811 						yyerror("invalid Unicode escape character");
812 					}
813 					ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
814 					identlen = strlen(ident);	/* length of the de-escaped result, not of the raw literal buffer */
815 					if (identlen >= NAMEDATALEN)
816 						truncate_identifier(ident, identlen, true);
817 					yylval->str = ident;
818 					return IDENT;
819 				}
820 <xd,xui>{xddouble}	{
821 					addlitchar('"', yyscanner);	/* two adjacent double quotes stand for one literal double quote */
822 				}
823 <xd,xui>{xdinside}	{
824 					addlit(yytext, yyleng, yyscanner);	/* run of characters containing no double quote */
825 				}
826 <xd,xui><<EOF>>		{ yyerror("unterminated quoted identifier"); }
827 
828 {xufailed}	{
829 					char	   *ident;
830 
831 					SET_YYLLOC();
832 					/* throw back all but the initial u/U */
833 					yyless(1);
834 					/* and treat it as {identifier} */
835 					ident = downcase_truncate_identifier(yytext, yyleng, true);	/* yyleng is 1 after the yyless() above */
836 					yylval->str = ident;
837 					return IDENT;
838 				}
839 
840 {typecast}		{
841 					SET_YYLLOC();
842 					return TYPECAST;
843 				}
844 
845 {dot_dot}		{
846 					SET_YYLLOC();
847 					return DOT_DOT;
848 				}
849 
850 {colon_equals}	{
851 					SET_YYLLOC();
852 					return COLON_EQUALS;
853 				}
854 
855 {equals_greater} {
856 					SET_YYLLOC();
857 					return EQUALS_GREATER;	/* the {operator} rule returns this same token when it trims down to "=>" */
858 				}
859 
860 {less_equals}	{
861 					SET_YYLLOC();
862 					return LESS_EQUALS;
863 				}
864 
865 {greater_equals} {
866 					SET_YYLLOC();
867 					return GREATER_EQUALS;
868 				}
869 
870 {less_greater}	{
871 					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
872 					SET_YYLLOC();
873 					return NOT_EQUALS;
874 				}
875 
876 {not_equals}	{
877 					/* We accept both "<>" and "!=" as meaning NOT_EQUALS */
878 					SET_YYLLOC();
879 					return NOT_EQUALS;
880 				}
881 
882 {self}			{
883 					SET_YYLLOC();
884 					return yytext[0];	/* the character's own value serves as the token code */
885 				}
886 
887 {operator}		{
888 					/*
889 					 * Check for embedded slash-star or dash-dash; those
890 					 * are comment starts, so operator must stop there.
891 					 * Note that slash-star or dash-dash at the first
892 					 * character will match a prior rule, not this one.
893 					 */
894 					int			nchars = yyleng;	/* how many leading bytes of yytext form the operator */
895 					char	   *slashstar = strstr(yytext, "/*");
896 					char	   *dashdash = strstr(yytext, "--");
897 
898 					if (slashstar && dashdash)
899 					{
900 						/* if both appear, take the first one */
901 						if (slashstar > dashdash)
902 							slashstar = dashdash;
903 					}
904 					else if (!slashstar)
905 						slashstar = dashdash;	/* from here on slashstar is the earliest comment start, or NULL */
906 					if (slashstar)
907 						nchars = slashstar - yytext;
908 
909 					/*
910 					 * For SQL compatibility, '+' and '-' cannot be the
911 					 * last char of a multi-char operator unless the operator
912 					 * contains chars that are not in SQL operators.
913 					 * The idea is to lex '=-' as two operators, but not
914 					 * to forbid operator names like '?-' that could not be
915 					 * sequences of SQL operators.
916 					 */
917 					if (nchars > 1 &&
918 						(yytext[nchars - 1] == '+' ||
919 						 yytext[nchars - 1] == '-'))
920 					{
921 						int			ic;
922 
923 						for (ic = nchars - 2; ic >= 0; ic--)	/* search backwards for a qualifying character */
924 						{
925 							char c = yytext[ic];
926 							if (c == '~' || c == '!' || c == '@' ||
927 								c == '#' || c == '^' || c == '&' ||
928 								c == '|' || c == '`' || c == '?' ||
929 								c == '%')
930 								break;
931 						}
932 						if (ic < 0)
933 						{
934 							/*
935 							 * didn't find a qualifying character, so remove
936 							 * all trailing [+-]
937 							 */
938 							do {
939 								nchars--;
940 							} while (nchars > 1 &&
941 								 (yytext[nchars - 1] == '+' ||
942 								  yytext[nchars - 1] == '-'));
943 						}
944 					}
945 
946 					SET_YYLLOC();
947 
948 					if (nchars < yyleng)
949 					{
950 						/* Strip the unwanted chars from the token */
951 						yyless(nchars);
952 						/*
953 						 * If what we have left is only one char, and it's
954 						 * one of the characters matching "self", then
955 						 * return it as a character token the same way
956 						 * that the "self" rule would have.
957 						 */
958 						if (nchars == 1 &&
959 							strchr(",()[].;:+-*/%^<>=", yytext[0]))
960 							return yytext[0];
961 						/*
962 						 * Likewise, if what we have left is two chars, and
963 						 * those match the tokens ">=", "<=", "=>", "<>" or
964 						 * "!=", then we must return the appropriate token
965 						 * rather than the generic Op.
966 						 */
967 						if (nchars == 2)
968 						{
969 							if (yytext[0] == '=' && yytext[1] == '>')
970 								return EQUALS_GREATER;
971 							if (yytext[0] == '>' && yytext[1] == '=')
972 								return GREATER_EQUALS;
973 							if (yytext[0] == '<' && yytext[1] == '=')
974 								return LESS_EQUALS;
975 							if (yytext[0] == '<' && yytext[1] == '>')
976 								return NOT_EQUALS;
977 							if (yytext[0] == '!' && yytext[1] == '=')
978 								return NOT_EQUALS;
979 						}
980 					}
981 
982 					/*
983 					 * Complain if operator is too long.  Unlike the case
984 					 * for identifiers, we make this an error not a notice-
985 					 * and-truncate, because the odds are we are looking at
986 					 * a syntactic mistake anyway.
987 					 */
988 					if (nchars >= NAMEDATALEN)
989 						yyerror("operator too long");
990 
991 					yylval->str = pstrdup(yytext);	/* yytext was already cut to nchars by yyless() where needed */
992 					return Op;
993 				}
994 
{param}			{
					int			paramno;
					char	   *endptr;

					SET_YYLLOC();

					/*
					 * Convert the digits following the '$'.  strtoint flags
					 * values that do not fit in an int via ERANGE; report
					 * that as an error rather than silently storing a
					 * truncated number into the int-sized ival field, which
					 * would make the query reference the wrong parameter.
					 */
					errno = 0;
					paramno = strtoint(yytext + 1, &endptr, 10);
					if (errno == ERANGE)
						yyerror("parameter number too large");

					yylval->ival = paramno;
					return PARAM;
				}
1000 
{integer}		{
					/*
					 * Let process_integer_literal classify the text: it
					 * returns ICONST when the value is an in-range integer
					 * and FCONST (with the text copied) otherwise.
					 */
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{decimal}		{
					/*
					 * Return the token text, copied with pstrdup, as a
					 * float constant; no numeric conversion is done here.
					 */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{decimalfail}	{
					/*
					 * throw back the .., and treat as integer.  The two
					 * trailing characters are returned to the input with
					 * yyless so that they are scanned again as part of the
					 * next token.
					 */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{real}			{
					/*
					 * Return the token text, copied with pstrdup, as a
					 * float constant; no numeric conversion is done here.
					 */
					SET_YYLLOC();
					yylval->str = pstrdup(yytext);
					return FCONST;
				}
{realfail1}		{
					/*
					 * throw back the [Ee], and figure out whether what
					 * remains is an {integer} or {decimal}.
					 *
					 * process_integer_literal makes that decision: it
					 * returns ICONST only if the whole remaining text
					 * parses as an in-range integer, and FCONST otherwise
					 * (for example when parsing stops at a decimal point).
					 */
					yyless(yyleng - 1);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
{realfail2}		{
					/*
					 * throw back the [Ee][+-], and proceed as above: the
					 * remaining text is classified as ICONST or FCONST by
					 * process_integer_literal.
					 */
					yyless(yyleng - 2);
					SET_YYLLOC();
					return process_integer_literal(yytext, yylval);
				}
1036 
1037 
{identifier}	{
					int			kw;

					SET_YYLLOC();

					/* Look the text up in the scanner's keyword list. */
					kw = ScanKeywordLookup(yytext, yyextra->keywordlist);
					if (kw < 0)
					{
						/*
						 * Not a keyword, so it is an ordinary identifier:
						 * fold it to lower case, truncating if necessary.
						 */
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* Keyword: hand back its name and its own token code. */
					yylval->keyword = GetScanKeyword(kw, yyextra->keywordlist);
					return yyextra->keyword_tokens[kw];
				}
1062 
{other}			{
					/*
					 * Return the first character of the match itself as the
					 * token code.
					 */
					SET_YYLLOC();
					return yytext[0];
				}
1067 
<<EOF>>			{
					/*
					 * End of input: record the location first, so that an
					 * error reported at this point has a position to refer
					 * to, then end the scan.
					 */
					SET_YYLLOC();
					yyterminate();
				}
1072 
1073 %%
1074 
1075 /* LCOV_EXCL_STOP */
1076 
/*
 * Arrange access to yyextra for subroutines of the main yylex() function.
 * We expect each subroutine to have a yyscanner parameter.  Rather than
 * use the yyget_xxx functions, which might or might not get inlined by the
 * compiler, we cheat just a bit and cast yyscanner to the right type.
 *
 * Because the replacement macros expand to an expression that mentions
 * "yyscanner" by name, they can only be used inside functions that have a
 * variable or parameter with exactly that name in scope.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

/*
 * Likewise for a couple of other things we need: the current token
 * location and the current token length.
 */
#undef yylloc
#define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
#undef yyleng
#define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)
1091 
1092 
1093 /*
1094  * scanner_errposition
1095  *		Report a lexer or grammar error cursor position, if possible.
1096  *
1097  * This is expected to be used within an ereport() call.  The return value
1098  * is a dummy (always 0, in fact).
1099  *
1100  * Note that this can only be used for messages emitted during raw parsing
1101  * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1102  * to still be available.
1103  */
1104 int
1105 scanner_errposition(int location, core_yyscan_t yyscanner)
1106 {
1107 	int			pos;
1108 
1109 	if (location < 0)
1110 		return 0;				/* no-op if location is unknown */
1111 
1112 	/* Convert byte offset to character number */
1113 	pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1114 	/* And pass it to the ereport mechanism */
1115 	return errposition(pos);
1116 }
1117 
1118 /*
1119  * scanner_yyerror
1120  *		Report a lexer or grammar error.
1121  *
1122  * The message's cursor position is whatever YYLLOC was last set to,
1123  * ie, the start of the current token if called within yylex(), or the
1124  * most recently lexed token if called from the grammar.
1125  * This is OK for syntax error messages from the Bison parser, because Bison
1126  * parsers report error as soon as the first unparsable token is reached.
1127  * Beware of using yyerror for other purposes, as the cursor position might
1128  * be misleading!
1129  */
1130 void
1131 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1132 {
1133 	const char *loc = yyextra->scanbuf + *yylloc;
1134 
1135 	if (*loc == YY_END_OF_BUFFER_CHAR)
1136 	{
1137 		ereport(ERROR,
1138 				(errcode(ERRCODE_SYNTAX_ERROR),
1139 		/* translator: %s is typically the translation of "syntax error" */
1140 				 errmsg("%s at end of input", _(message)),
1141 				 lexer_errposition()));
1142 	}
1143 	else
1144 	{
1145 		ereport(ERROR,
1146 				(errcode(ERRCODE_SYNTAX_ERROR),
1147 		/* translator: first %s is typically the translation of "syntax error" */
1148 				 errmsg("%s at or near \"%s\"", _(message), loc),
1149 				 lexer_errposition()));
1150 	}
1151 }
1152 
1153 
1154 /*
1155  * Called before any actual parsing is done
1156  */
1157 core_yyscan_t
1158 scanner_init(const char *str,
1159 			 core_yy_extra_type *yyext,
1160 			 const ScanKeywordList *keywordlist,
1161 			 const uint16 *keyword_tokens)
1162 {
1163 	Size		slen = strlen(str);
1164 	yyscan_t	scanner;
1165 
1166 	if (yylex_init(&scanner) != 0)
1167 		elog(ERROR, "yylex_init() failed: %m");
1168 
1169 	core_yyset_extra(yyext, scanner);
1170 
1171 	yyext->keywordlist = keywordlist;
1172 	yyext->keyword_tokens = keyword_tokens;
1173 
1174 	yyext->backslash_quote = backslash_quote;
1175 	yyext->escape_string_warning = escape_string_warning;
1176 	yyext->standard_conforming_strings = standard_conforming_strings;
1177 
1178 	/*
1179 	 * Make a scan buffer with special termination needed by flex.
1180 	 */
1181 	yyext->scanbuf = (char *) palloc(slen + 2);
1182 	yyext->scanbuflen = slen;
1183 	memcpy(yyext->scanbuf, str, slen);
1184 	yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1185 	yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1186 
1187 	/* initialize literal buffer to a reasonable but expansible size */
1188 	yyext->literalalloc = 1024;
1189 	yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1190 	yyext->literallen = 0;
1191 
1192 	return scanner;
1193 }
1194 
1195 
1196 /*
1197  * Called after parsing is done to clean up after scanner_init()
1198  */
1199 void
1200 scanner_finish(core_yyscan_t yyscanner)
1201 {
1202 	/*
1203 	 * We don't bother to call yylex_destroy(), because all it would do is
1204 	 * pfree a small amount of control storage.  It's cheaper to leak the
1205 	 * storage until the parsing context is destroyed.  The amount of space
1206 	 * involved is usually negligible compared to the output parse tree
1207 	 * anyway.
1208 	 *
1209 	 * We do bother to pfree the scanbuf and literal buffer, but only if they
1210 	 * represent a nontrivial amount of space.  The 8K cutoff is arbitrary.
1211 	 */
1212 	if (yyextra->scanbuflen >= 8192)
1213 		pfree(yyextra->scanbuf);
1214 	if (yyextra->literalalloc >= 8192)
1215 		pfree(yyextra->literalbuf);
1216 }
1217 
1218 
1219 static void
1220 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1221 {
1222 	/* enlarge buffer if needed */
1223 	if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1224 	{
1225 		do
1226 		{
1227 			yyextra->literalalloc *= 2;
1228 		} while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1229 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1230 												yyextra->literalalloc);
1231 	}
1232 	/* append new data */
1233 	memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1234 	yyextra->literallen += yleng;
1235 }
1236 
1237 
1238 static void
1239 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1240 {
1241 	/* enlarge buffer if needed */
1242 	if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1243 	{
1244 		yyextra->literalalloc *= 2;
1245 		yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1246 												yyextra->literalalloc);
1247 	}
1248 	/* append new data */
1249 	yyextra->literalbuf[yyextra->literallen] = ychar;
1250 	yyextra->literallen += 1;
1251 }
1252 
1253 
1254 /*
1255  * Create a palloc'd copy of literalbuf, adding a trailing null.
1256  */
1257 static char *
1258 litbufdup(core_yyscan_t yyscanner)
1259 {
1260 	int			llen = yyextra->literallen;
1261 	char	   *new;
1262 
1263 	new = palloc(llen + 1);
1264 	memcpy(new, yyextra->literalbuf, llen);
1265 	new[llen] = '\0';
1266 	return new;
1267 }
1268 
1269 /*
1270  * Process {integer}.  Note this will also do the right thing with {decimal},
1271  * ie digits and a decimal point.
1272  */
1273 static int
1274 process_integer_literal(const char *token, YYSTYPE *lval)
1275 {
1276 	int			val;
1277 	char	   *endptr;
1278 
1279 	errno = 0;
1280 	val = strtoint(token, &endptr, 10);
1281 	if (*endptr != '\0' || errno == ERANGE)
1282 	{
1283 		/* integer too large (or contains decimal pt), treat it as a float */
1284 		lval->str = pstrdup(token);
1285 		return FCONST;
1286 	}
1287 	lval->ival = val;
1288 	return ICONST;
1289 }
1290 
1291 static unsigned int
1292 hexval(unsigned char c)
1293 {
1294 	if (c >= '0' && c <= '9')
1295 		return c - '0';
1296 	if (c >= 'a' && c <= 'f')
1297 		return c - 'a' + 0xA;
1298 	if (c >= 'A' && c <= 'F')
1299 		return c - 'A' + 0xA;
1300 	elog(ERROR, "invalid hexadecimal digit");
1301 	return 0;					/* not reached */
1302 }
1303 
1304 static void
1305 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1306 {
1307 	if (GetDatabaseEncoding() == PG_UTF8)
1308 		return;
1309 
1310 	if (c > 0x7F)
1311 	{
1312 		ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3);	/* 3 for U&" */
1313 		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1314 	}
1315 }
1316 
1317 static bool
1318 is_utf16_surrogate_first(pg_wchar c)
1319 {
1320 	return (c >= 0xD800 && c <= 0xDBFF);
1321 }
1322 
1323 static bool
1324 is_utf16_surrogate_second(pg_wchar c)
1325 {
1326 	return (c >= 0xDC00 && c <= 0xDFFF);
1327 }
1328 
1329 static pg_wchar
1330 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1331 {
1332 	return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1333 }
1334 
1335 static void
1336 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1337 {
1338 	char		buf[8];
1339 
1340 	if (c == 0 || c > 0x10FFFF)
1341 		yyerror("invalid Unicode escape value");
1342 	if (c > 0x7F)
1343 	{
1344 		if (GetDatabaseEncoding() != PG_UTF8)
1345 			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1346 		yyextra->saw_non_ascii = true;
1347 	}
1348 	unicode_to_utf8(c, (unsigned char *) buf);
1349 	addlit(buf, pg_mblen(buf), yyscanner);
1350 }
1351 
1352 /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
1353 static bool
1354 check_uescapechar(unsigned char escape)
1355 {
1356 	if (isxdigit(escape)
1357 		|| escape == '+'
1358 		|| escape == '\''
1359 		|| escape == '"'
1360 		|| scanner_isspace(escape))
1361 	{
1362 		return false;
1363 	}
1364 	else
1365 		return true;
1366 }
1367 
1368 /* like litbufdup, but handle unicode escapes */
1369 static char *
1370 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1371 {
1372 	char	   *new;
1373 	char	   *litbuf,
1374 			   *in,
1375 			   *out;
1376 	pg_wchar	pair_first = 0;
1377 
1378 	/* Make literalbuf null-terminated to simplify the scanning loop */
1379 	litbuf = yyextra->literalbuf;
1380 	litbuf[yyextra->literallen] = '\0';
1381 
1382 	/*
1383 	 * This relies on the subtle assumption that a UTF-8 expansion cannot be
1384 	 * longer than its escaped representation.
1385 	 */
1386 	new = palloc(yyextra->literallen + 1);
1387 
1388 	in = litbuf;
1389 	out = new;
1390 	while (*in)
1391 	{
1392 		if (in[0] == escape)
1393 		{
1394 			if (in[1] == escape)
1395 			{
1396 				if (pair_first)
1397 				{
1398 					ADVANCE_YYLLOC(in - litbuf + 3);	/* 3 for U&" */
1399 					yyerror("invalid Unicode surrogate pair");
1400 				}
1401 				*out++ = escape;
1402 				in += 2;
1403 			}
1404 			else if (isxdigit((unsigned char) in[1]) &&
1405 					 isxdigit((unsigned char) in[2]) &&
1406 					 isxdigit((unsigned char) in[3]) &&
1407 					 isxdigit((unsigned char) in[4]))
1408 			{
1409 				pg_wchar	unicode;
1410 
1411 				unicode = (hexval(in[1]) << 12) +
1412 					(hexval(in[2]) << 8) +
1413 					(hexval(in[3]) << 4) +
1414 					hexval(in[4]);
1415 				check_unicode_value(unicode, in, yyscanner);
1416 				if (pair_first)
1417 				{
1418 					if (is_utf16_surrogate_second(unicode))
1419 					{
1420 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1421 						pair_first = 0;
1422 					}
1423 					else
1424 					{
1425 						ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1426 						yyerror("invalid Unicode surrogate pair");
1427 					}
1428 				}
1429 				else if (is_utf16_surrogate_second(unicode))
1430 					yyerror("invalid Unicode surrogate pair");
1431 
1432 				if (is_utf16_surrogate_first(unicode))
1433 					pair_first = unicode;
1434 				else
1435 				{
1436 					unicode_to_utf8(unicode, (unsigned char *) out);
1437 					out += pg_mblen(out);
1438 				}
1439 				in += 5;
1440 			}
1441 			else if (in[1] == '+' &&
1442 					 isxdigit((unsigned char) in[2]) &&
1443 					 isxdigit((unsigned char) in[3]) &&
1444 					 isxdigit((unsigned char) in[4]) &&
1445 					 isxdigit((unsigned char) in[5]) &&
1446 					 isxdigit((unsigned char) in[6]) &&
1447 					 isxdigit((unsigned char) in[7]))
1448 			{
1449 				pg_wchar	unicode;
1450 
1451 				unicode = (hexval(in[2]) << 20) +
1452 					(hexval(in[3]) << 16) +
1453 					(hexval(in[4]) << 12) +
1454 					(hexval(in[5]) << 8) +
1455 					(hexval(in[6]) << 4) +
1456 					hexval(in[7]);
1457 				check_unicode_value(unicode, in, yyscanner);
1458 				if (pair_first)
1459 				{
1460 					if (is_utf16_surrogate_second(unicode))
1461 					{
1462 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1463 						pair_first = 0;
1464 					}
1465 					else
1466 					{
1467 						ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1468 						yyerror("invalid Unicode surrogate pair");
1469 					}
1470 				}
1471 				else if (is_utf16_surrogate_second(unicode))
1472 					yyerror("invalid Unicode surrogate pair");
1473 
1474 				if (is_utf16_surrogate_first(unicode))
1475 					pair_first = unicode;
1476 				else
1477 				{
1478 					unicode_to_utf8(unicode, (unsigned char *) out);
1479 					out += pg_mblen(out);
1480 				}
1481 				in += 8;
1482 			}
1483 			else
1484 			{
1485 				ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1486 				yyerror("invalid Unicode escape value");
1487 			}
1488 		}
1489 		else
1490 		{
1491 			if (pair_first)
1492 			{
1493 				ADVANCE_YYLLOC(in - litbuf + 3);		/* 3 for U&" */
1494 				yyerror("invalid Unicode surrogate pair");
1495 			}
1496 			*out++ = *in++;
1497 		}
1498 	}
1499 
1500 	/* unfinished surrogate pair? */
1501 	if (pair_first)
1502 	{
1503 		ADVANCE_YYLLOC(in - litbuf + 3);				/* 3 for U&" */
1504 		yyerror("invalid Unicode surrogate pair");
1505 	}
1506 
1507 	*out = '\0';
1508 
1509 	/*
1510 	 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1511 	 * codes; but it's probably not worth the trouble, since this isn't likely
1512 	 * to be a performance-critical path.
1513 	 */
1514 	pg_verifymbstr(new, out - new, false);
1515 	return new;
1516 }
1517 
1518 static unsigned char
1519 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1520 {
1521 	switch (c)
1522 	{
1523 		case 'b':
1524 			return '\b';
1525 		case 'f':
1526 			return '\f';
1527 		case 'n':
1528 			return '\n';
1529 		case 'r':
1530 			return '\r';
1531 		case 't':
1532 			return '\t';
1533 		default:
1534 			/* check for backslash followed by non-7-bit-ASCII */
1535 			if (c == '\0' || IS_HIGHBIT_SET(c))
1536 				yyextra->saw_non_ascii = true;
1537 
1538 			return c;
1539 	}
1540 }
1541 
1542 static void
1543 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1544 {
1545 	if (ychar == '\'')
1546 	{
1547 		if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1548 			ereport(WARNING,
1549 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1550 					 errmsg("nonstandard use of \\' in a string literal"),
1551 					 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1552 					 lexer_errposition()));
1553 		yyextra->warn_on_first_escape = false;	/* warn only once per string */
1554 	}
1555 	else if (ychar == '\\')
1556 	{
1557 		if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1558 			ereport(WARNING,
1559 					(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1560 					 errmsg("nonstandard use of \\\\ in a string literal"),
1561 					 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1562 					 lexer_errposition()));
1563 		yyextra->warn_on_first_escape = false;	/* warn only once per string */
1564 	}
1565 	else
1566 		check_escape_warning(yyscanner);
1567 }
1568 
1569 static void
1570 check_escape_warning(core_yyscan_t yyscanner)
1571 {
1572 	if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1573 		ereport(WARNING,
1574 				(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1575 				 errmsg("nonstandard use of escape in a string literal"),
1576 		errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1577 				 lexer_errposition()));
1578 	yyextra->warn_on_first_escape = false;		/* warn only once per string */
1579 }
1580 
1581 /*
1582  * Interface functions to make flex use palloc() instead of malloc().
1583  * It'd be better to make these static, but flex insists otherwise.
1584  */
1585 
1586 void *
1587 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1588 {
1589 	return palloc(bytes);
1590 }
1591 
1592 void *
1593 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1594 {
1595 	if (ptr)
1596 		return repalloc(ptr, bytes);
1597 	else
1598 		return palloc(bytes);
1599 }
1600 
1601 void
1602 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1603 {
1604 	if (ptr)
1605 		pfree(ptr);
1606 }
1607