1 %top{
2 /*-------------------------------------------------------------------------
3 *
4 * scan.l
5 * lexical scanner for PostgreSQL
6 *
7 * NOTE NOTE NOTE:
8 *
9 * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
10 *
11 * The rules are designed so that the scanner never has to backtrack,
12 * in the sense that there is always a rule that can match the input
13 * consumed so far (the rule action may internally throw back some input
14 * with yyless(), however). As explained in the flex manual, this makes
15 * for a useful speed increase --- about a third faster than a plain -CF
16 * lexer, in simple testing. The extra complexity is mostly in the rules
17 * for handling float numbers and continued string literals. If you change
18 * the lexical rules, verify that you haven't broken the no-backtrack
19 * property by running flex with the "-b" option and checking that the
20 * resulting "lex.backup" file says that no backing up is needed. (As of
21 * Postgres 9.2, this check is made automatically by the Makefile.)
22 *
23 *
24 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
25 * Portions Copyright (c) 1994, Regents of the University of California
26 *
27 * IDENTIFICATION
28 * src/backend/parser/scan.l
29 *
30 *-------------------------------------------------------------------------
31 */
#include "postgres.h"

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>

#include "common/string.h"
#include "parser/gramparse.h"
#include "parser/parser.h"		/* only needed for GUC variables */
#include "parser/scansup.h"
#include "mb/pg_wchar.h"
42 }
43
44 %{
45
46 /* LCOV_EXCL_START */
47
48 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
49 #undef fprintf
50 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
51
52 static void
fprintf_to_ereport(const char * fmt,const char * msg)53 fprintf_to_ereport(const char *fmt, const char *msg)
54 {
55 ereport(ERROR, (errmsg_internal("%s", msg)));
56 }
57
58 /*
59 * GUC variables. This is a DIRECT violation of the warning given at the
60 * head of gram.y, ie flex/bison code must not depend on any GUC variables;
61 * as such, changing their values can induce very unintuitive behavior.
62 * But we shall have to live with it until we can remove these variables.
63 */
64 int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
65 bool escape_string_warning = true;
66 bool standard_conforming_strings = true;
67
68 /*
69 * Set the type of YYSTYPE.
70 */
71 #define YYSTYPE core_YYSTYPE
72
73 /*
74 * Set the type of yyextra. All state variables used by the scanner should
75 * be in yyextra, *not* statically allocated.
76 */
77 #define YY_EXTRA_TYPE core_yy_extra_type *
78
79 /*
80 * Each call to yylex must set yylloc to the location of the found token
81 * (expressed as a byte offset from the start of the input text).
82 * When we parse a token that requires multiple lexer rules to process,
83 * this should be done in the first such rule, else yylloc will point
84 * into the middle of the token.
85 */
86 #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
87
88 /*
89 * Advance yylloc by the given number of bytes.
90 */
91 #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
92
93 #define startlit() ( yyextra->literallen = 0 )
94 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
95 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
96 static char *litbufdup(core_yyscan_t yyscanner);
97 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
98 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
99 static int process_integer_literal(const char *token, YYSTYPE *lval);
100 static bool is_utf16_surrogate_first(pg_wchar c);
101 static bool is_utf16_surrogate_second(pg_wchar c);
102 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
103 static void addunicode(pg_wchar c, yyscan_t yyscanner);
104 static bool check_uescapechar(unsigned char escape);
105
106 #define yyerror(msg) scanner_yyerror(msg, yyscanner)
107
108 #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
109
110 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
111 static void check_escape_warning(core_yyscan_t yyscanner);
112
113 /*
114 * Work around a bug in flex 2.5.35: it emits a couple of functions that
115 * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
116 * this would cause warnings. Providing our own declarations should be
117 * harmless even when the bug gets fixed.
118 */
119 extern int core_yyget_column(yyscan_t yyscanner);
120 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
121
122 %}
123
124 %option reentrant
125 %option bison-bridge
126 %option bison-locations
127 %option 8bit
128 %option never-interactive
129 %option nodefault
130 %option noinput
131 %option nounput
132 %option noyywrap
133 %option noyyalloc
134 %option noyyrealloc
135 %option noyyfree
136 %option warn
137 %option prefix="core_yy"
138
139 /*
140 * OK, here is a short description of lex/flex rules behavior.
141 * The longest pattern which matches an input string is always chosen.
142 * For equal-length patterns, the first occurring in the rules list is chosen.
143 * INITIAL is the starting state, to which all non-conditional rules apply.
144 * Exclusive states change parsing rules while the state is active. When in
145 * an exclusive state, only those rules defined for that state apply.
146 *
147 * We use exclusive states for quoted strings, extended comments,
148 * and to eliminate parsing troubles for numeric strings.
149 * Exclusive states:
150 * <xb> bit string literal
151 * <xc> extended C-style comments
152 * <xd> delimited identifiers (double-quoted identifiers)
153 * <xh> hexadecimal numeric string
154 * <xq> standard quoted strings
155 * <xe> extended quoted strings (support backslash escape sequences)
156 * <xdolq> $foo$ quoted strings
157 * <xui> quoted identifier with Unicode escapes
158 * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
159 * <xus> quoted string with Unicode escapes
160 * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
161 * <xeu> Unicode surrogate pair in extended quoted string
162 *
163 * Remember to add an <<EOF>> case whenever you add a new exclusive state!
164 * The default one is probably not the right thing.
165 */
166
167 %x xb
168 %x xc
169 %x xd
170 %x xh
171 %x xe
172 %x xq
173 %x xdolq
174 %x xui
175 %x xuiend
176 %x xus
177 %x xusend
178 %x xeu
179
180 /*
181 * In order to make the world safe for Windows and Mac clients as well as
182 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
183 * sequence will be seen as two successive newlines, but that doesn't cause
184 * any problems. Comments that start with -- and extend to the next
185 * newline are treated as equivalent to a single whitespace character.
186 *
187 * NOTE a fine point: if there is no newline following --, we will absorb
188 * everything to the end of the input as a comment. This is correct. Older
189 * versions of Postgres failed to recognize -- as a comment if the input
190 * did not end with a newline.
191 *
192 * XXX perhaps \f (formfeed) should be treated as a newline as well?
193 *
194 * XXX if you change the set of whitespace characters, fix scanner_isspace()
195 * to agree, and see also the plpgsql lexer.
196 */
197
198 space [ \t\n\r\f]
199 horiz_space [ \t\f]
200 newline [\n\r]
201 non_newline [^\n\r]
202
203 comment ("--"{non_newline}*)
204
205 whitespace ({space}+|{comment})
206
207 /*
208 * SQL requires at least one newline in the whitespace separating
209 * string literals that are to be concatenated. Silly, but who are we
210 * to argue? Note that {whitespace_with_newline} should not have * after
211 * it, whereas {whitespace} should generally have a * after it...
212 */
213
214 special_whitespace ({space}+|{comment}{newline})
215 horiz_whitespace ({horiz_space}|{comment})
216 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
217
218 /*
219 * To ensure that {quotecontinue} can be scanned without having to back up
220 * if the full pattern isn't matched, we include trailing whitespace in
221 * {quotestop}. This matches all cases where {quotecontinue} fails to match,
222 * except for {quote} followed by whitespace and just one "-" (not two,
223 * which would start a {comment}). To cover that we have {quotefail}.
224 * The actions for {quotestop} and {quotefail} must throw back characters
225 * beyond the quote proper.
226 */
227 quote '
228 quotestop {quote}{whitespace}*
229 quotecontinue {quote}{whitespace_with_newline}{quote}
230 quotefail {quote}{whitespace}*"-"
231
232 /* Bit string
233 * It is tempting to scan the string for only those characters
234 * which are allowed. However, this leads to silently swallowed
235 * characters if illegal characters are included in the string.
236 * For example, if xbinside is [01] then B'ABCD' is interpreted
237 * as a zero-length string, and the ABCD' is lost!
238 * Better to pass the string forward and let the input routines
239 * validate the contents.
240 */
241 xbstart [bB]{quote}
242 xbinside [^']*
243
244 /* Hexadecimal number */
245 xhstart [xX]{quote}
246 xhinside [^']*
247
248 /* National character */
249 xnstart [nN]{quote}
250
251 /* Quoted string that allows backslash escapes */
252 xestart [eE]{quote}
253 xeinside [^\\']+
254 xeescape [\\][^0-7]
255 xeoctesc [\\][0-7]{1,3}
256 xehexesc [\\]x[0-9A-Fa-f]{1,2}
257 xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
258 xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
259
260 /* Extended quote
261 * xqdouble implements embedded quote, ''''
262 */
263 xqstart {quote}
264 xqdouble {quote}{quote}
265 xqinside [^']+
266
267 /* $foo$ style quotes ("dollar quoting")
268 * The quoted string starts with $foo$ where "foo" is an optional string
269 * in the form of an identifier, except that it may not contain "$",
270 * and extends to the first occurrence of an identical string.
271 * There is *no* processing of the quoted text.
272 *
273 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
274 * fails to match its trailing "$".
275 */
276 dolq_start [A-Za-z\200-\377_]
277 dolq_cont [A-Za-z\200-\377_0-9]
278 dolqdelim \$({dolq_start}{dolq_cont}*)?\$
279 dolqfailed \${dolq_start}{dolq_cont}*
280 dolqinside [^$]+
281
282 /* Double quote
283 * Allows embedded spaces and other special characters into identifiers.
284 */
285 dquote \"
286 xdstart {dquote}
287 xdstop {dquote}
288 xddouble {dquote}{dquote}
289 xdinside [^"]+
290
291 /* Unicode escapes */
292 uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
293 /* error rule to avoid backup */
294 uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
295
296 /* Quoted identifier with Unicode escapes */
297 xuistart [uU]&{dquote}
298
299 /* Quoted string with Unicode escapes */
300 xusstart [uU]&{quote}
301
302 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
303 xustop1 {uescapefail}?
304 xustop2 {uescape}
305
306 /* error rule to avoid backup */
307 xufailed [uU]&
308
309
310 /* C-style comments
311 *
312 * The "extended comment" syntax closely resembles allowable operator syntax.
313 * The tricky part here is to get lex to recognize a string starting with
314 * slash-star as a comment, when interpreting it as an operator would produce
315 * a longer match --- remember lex will prefer a longer match! Also, if we
316 * have something like plus-slash-star, lex will think this is a 3-character
317 * operator whereas we want to see it as a + operator and a comment start.
318 * The solution is two-fold:
319 * 1. append {op_chars}* to xcstart so that it matches as much text as
320 * {operator} would. Then the tie-breaker (first matching rule of same
321 * length) ensures xcstart wins. We put back the extra stuff with yyless()
322 * in case it contains a star-slash that should terminate the comment.
323 * 2. In the operator rule, check for slash-star within the operator, and
324 * if found throw it back with yyless(). This handles the plus-slash-star
325 * problem.
326 * Dash-dash comments have similar interactions with the operator rule.
327 */
328 xcstart \/\*{op_chars}*
329 xcstop \*+\/
330 xcinside [^*/]+
331
332 digit [0-9]
333 ident_start [A-Za-z\200-\377_]
334 ident_cont [A-Za-z\200-\377_0-9\$]
335
336 identifier {ident_start}{ident_cont}*
337
338 /* Assorted special-case operators and operator-like tokens */
339 typecast "::"
340 dot_dot \.\.
341 colon_equals ":="
342
343 /*
344 * These operator-like tokens (unlike the above ones) also match the {operator}
345 * rule, which means that they might be overridden by a longer match if they
346 * are followed by a comment start or a + or - character. Accordingly, if you
347 * add to this list, you must also add corresponding code to the {operator}
348 * block to return the correct token in such cases. (This is not needed in
349 * psqlscan.l since the token value is ignored there.)
350 */
351 equals_greater "=>"
352 less_equals "<="
353 greater_equals ">="
354 less_greater "<>"
355 not_equals "!="
356
357 /*
358 * "self" is the set of chars that should be returned as single-character
359 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
360 * which can be one or more characters long (but if a single-char token
361 * appears in the "self" set, it is not to be returned as an Op). Note
362 * that the sets overlap, but each has some chars that are not in the other.
363 *
364 * If you change either set, adjust the character lists appearing in the
365 * rule for "operator"!
366 */
367 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
368 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
369 operator {op_chars}+
370
371 /* we no longer allow unary minus in numbers.
372 * instead we pass it separately to parser. there it gets
373 * coerced via doNegate() -- Leon aug 20 1999
374 *
375 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
376 *
377 * {realfail1} and {realfail2} are added to prevent the need for scanner
378 * backup when the {real} rule fails to match completely.
379 */
380
381 integer {digit}+
382 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
383 decimalfail {digit}+\.\.
384 real ({integer}|{decimal})[Ee][-+]?{digit}+
385 realfail1 ({integer}|{decimal})[Ee]
386 realfail2 ({integer}|{decimal})[Ee][-+]
387
388 param \${integer}
389
390 other .
391
392 /*
393 * Dollar quoted strings are totally opaque, and no escaping is done on them.
394 * Other quoted strings must allow some special characters such as single-quote
395 * and newline.
396 * Embedded single-quotes are implemented both in the SQL standard
397 * style of two adjacent single quotes "''" and in the Postgres/Java style
398 * of escaped-quote "\'".
399 * Other embedded escaped characters are matched explicitly and the leading
400 * backslash is dropped from the string.
401 * Note that xcstart must appear before operator, as explained above!
402 * Also whitespace (comment) must appear before operator.
403 */
404
405 %%
406
407 {whitespace} {
408 /* ignore */
409 }
410
{xcstart}		{
					/* Record where the comment opens, for error reports */
					SET_YYLLOC();
					/* This is the outermost comment: no nesting yet */
					yyextra->xcdepth = 0;
					BEGIN(xc);
					/* Keep just the slash-star and rescan what followed it */
					yyless(2);
				}

<xc>{xcstart}	{
					/* A comment opener inside a comment nests one level deeper */
					yyextra->xcdepth += 1;
					/* Keep just the slash-star and rescan what followed it */
					yyless(2);
				}

<xc>{xcstop}	{
					/*
					 * A terminator closes the innermost open comment; only
					 * when no nested comment is open do we leave xc state.
					 */
					if (yyextra->xcdepth > 0)
						yyextra->xcdepth -= 1;
					else
						BEGIN(INITIAL);
				}

<xc>{xcinside}	|
<xc>{op_chars}	|
<xc>\*+			{
					/* comment text of any kind is discarded */
				}

<xc><<EOF>>		{ yyerror("unterminated /* comment"); }
446
447 {xbstart} {
448 /* Binary bit type.
449 * At some point we should simply pass the string
450 * forward to the parser and label it there.
451 * In the meantime, place a leading "b" on the string
452 * to mark it for the input routine as a binary string.
453 */
454 SET_YYLLOC();
455 BEGIN(xb);
456 startlit();
457 addlitchar('b', yyscanner);
458 }
459 <xb>{quotestop} |
460 <xb>{quotefail} {
461 yyless(1);
462 BEGIN(INITIAL);
463 yylval->str = litbufdup(yyscanner);
464 return BCONST;
465 }
466 <xh>{xhinside} |
467 <xb>{xbinside} {
468 addlit(yytext, yyleng, yyscanner);
469 }
470 <xh>{quotecontinue} |
471 <xb>{quotecontinue} {
472 /* ignore */
473 }
474 <xb><<EOF>> { yyerror("unterminated bit string literal"); }
475
476 {xhstart} {
477 /* Hexadecimal bit type.
478 * At some point we should simply pass the string
479 * forward to the parser and label it there.
480 * In the meantime, place a leading "x" on the string
481 * to mark it for the input routine as a hex string.
482 */
483 SET_YYLLOC();
484 BEGIN(xh);
485 startlit();
486 addlitchar('x', yyscanner);
487 }
488 <xh>{quotestop} |
489 <xh>{quotefail} {
490 yyless(1);
491 BEGIN(INITIAL);
492 yylval->str = litbufdup(yyscanner);
493 return XCONST;
494 }
495 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
496
497 {xnstart} {
498 /* National character.
499 * We will pass this along as a normal character string,
500 * but preceded with an internally-generated "NCHAR".
501 */
502 const ScanKeyword *keyword;
503
504 SET_YYLLOC();
505 yyless(1); /* eat only 'n' this time */
506
507 keyword = ScanKeywordLookup("nchar",
508 yyextra->keywords,
509 yyextra->num_keywords);
510 if (keyword != NULL)
511 {
512 yylval->keyword = keyword->name;
513 return keyword->value;
514 }
515 else
516 {
517 /* If NCHAR isn't a keyword, just return "n" */
518 yylval->str = pstrdup("n");
519 return IDENT;
520 }
521 }
522
523 {xqstart} {
524 yyextra->warn_on_first_escape = true;
525 yyextra->saw_non_ascii = false;
526 SET_YYLLOC();
527 if (yyextra->standard_conforming_strings)
528 BEGIN(xq);
529 else
530 BEGIN(xe);
531 startlit();
532 }
533 {xestart} {
534 yyextra->warn_on_first_escape = false;
535 yyextra->saw_non_ascii = false;
536 SET_YYLLOC();
537 BEGIN(xe);
538 startlit();
539 }
540 {xusstart} {
541 SET_YYLLOC();
542 if (!yyextra->standard_conforming_strings)
543 ereport(ERROR,
544 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
545 errmsg("unsafe use of string constant with Unicode escapes"),
546 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
547 lexer_errposition()));
548 BEGIN(xus);
549 startlit();
550 }
551 <xq,xe>{quotestop} |
552 <xq,xe>{quotefail} {
553 yyless(1);
554 BEGIN(INITIAL);
555 /*
556 * check that the data remains valid if it might have been
557 * made invalid by unescaping any chars.
558 */
559 if (yyextra->saw_non_ascii)
560 pg_verifymbstr(yyextra->literalbuf,
561 yyextra->literallen,
562 false);
563 yylval->str = litbufdup(yyscanner);
564 return SCONST;
565 }
566 <xus>{quotestop} |
567 <xus>{quotefail} {
568 /* throw back all but the quote */
569 yyless(1);
570 /* xusend state looks for possible UESCAPE */
571 BEGIN(xusend);
572 }
573 <xusend>{whitespace} {
574 /* stay in xusend state over whitespace */
575 }
576 <xusend><<EOF>> |
577 <xusend>{other} |
578 <xusend>{xustop1} {
579 /* no UESCAPE after the quote, throw back everything */
580 yyless(0);
581 BEGIN(INITIAL);
582 yylval->str = litbuf_udeescape('\\', yyscanner);
583 return SCONST;
584 }
585 <xusend>{xustop2} {
586 /* found UESCAPE after the end quote */
587 BEGIN(INITIAL);
588 if (!check_uescapechar(yytext[yyleng - 2]))
589 {
590 SET_YYLLOC();
591 ADVANCE_YYLLOC(yyleng - 2);
592 yyerror("invalid Unicode escape character");
593 }
594 yylval->str = litbuf_udeescape(yytext[yyleng - 2],
595 yyscanner);
596 return SCONST;
597 }
598 <xq,xe,xus>{xqdouble} {
599 addlitchar('\'', yyscanner);
600 }
601 <xq,xus>{xqinside} {
602 addlit(yytext, yyleng, yyscanner);
603 }
604 <xe>{xeinside} {
605 addlit(yytext, yyleng, yyscanner);
606 }
607 <xe>{xeunicode} {
608 pg_wchar c = strtoul(yytext + 2, NULL, 16);
609
610 check_escape_warning(yyscanner);
611
612 if (is_utf16_surrogate_first(c))
613 {
614 yyextra->utf16_first_part = c;
615 BEGIN(xeu);
616 }
617 else if (is_utf16_surrogate_second(c))
618 yyerror("invalid Unicode surrogate pair");
619 else
620 addunicode(c, yyscanner);
621 }
622 <xeu>{xeunicode} {
623 pg_wchar c = strtoul(yytext + 2, NULL, 16);
624
625 if (!is_utf16_surrogate_second(c))
626 yyerror("invalid Unicode surrogate pair");
627
628 c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
629
630 addunicode(c, yyscanner);
631
632 BEGIN(xe);
633 }
634 <xeu>. { yyerror("invalid Unicode surrogate pair"); }
635 <xeu>\n { yyerror("invalid Unicode surrogate pair"); }
636 <xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
637 <xe,xeu>{xeunicodefail} {
638 ereport(ERROR,
639 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
640 errmsg("invalid Unicode escape"),
641 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
642 lexer_errposition()));
643 }
644 <xe>{xeescape} {
645 if (yytext[1] == '\'')
646 {
647 if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
648 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
649 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
650 ereport(ERROR,
651 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
652 errmsg("unsafe use of \\' in a string literal"),
653 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
654 lexer_errposition()));
655 }
656 check_string_escape_warning(yytext[1], yyscanner);
657 addlitchar(unescape_single_char(yytext[1], yyscanner),
658 yyscanner);
659 }
660 <xe>{xeoctesc} {
661 unsigned char c = strtoul(yytext + 1, NULL, 8);
662
663 check_escape_warning(yyscanner);
664 addlitchar(c, yyscanner);
665 if (c == '\0' || IS_HIGHBIT_SET(c))
666 yyextra->saw_non_ascii = true;
667 }
668 <xe>{xehexesc} {
669 unsigned char c = strtoul(yytext + 2, NULL, 16);
670
671 check_escape_warning(yyscanner);
672 addlitchar(c, yyscanner);
673 if (c == '\0' || IS_HIGHBIT_SET(c))
674 yyextra->saw_non_ascii = true;
675 }
676 <xq,xe,xus>{quotecontinue} {
677 /* ignore */
678 }
679 <xe>. {
680 /* This is only needed for \ just before EOF */
681 addlitchar(yytext[0], yyscanner);
682 }
683 <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
684
685 {dolqdelim} {
686 SET_YYLLOC();
687 yyextra->dolqstart = pstrdup(yytext);
688 BEGIN(xdolq);
689 startlit();
690 }
691 {dolqfailed} {
692 SET_YYLLOC();
693 /* throw back all but the initial "$" */
694 yyless(1);
695 /* and treat it as {other} */
696 return yytext[0];
697 }
698 <xdolq>{dolqdelim} {
699 if (strcmp(yytext, yyextra->dolqstart) == 0)
700 {
701 pfree(yyextra->dolqstart);
702 yyextra->dolqstart = NULL;
703 BEGIN(INITIAL);
704 yylval->str = litbufdup(yyscanner);
705 return SCONST;
706 }
707 else
708 {
709 /*
710 * When we fail to match $...$ to dolqstart, transfer
711 * the $... part to the output, but put back the final
712 * $ for rescanning. Consider $delim$...$junk$delim$
713 */
714 addlit(yytext, yyleng - 1, yyscanner);
715 yyless(yyleng - 1);
716 }
717 }
718 <xdolq>{dolqinside} {
719 addlit(yytext, yyleng, yyscanner);
720 }
721 <xdolq>{dolqfailed} {
722 addlit(yytext, yyleng, yyscanner);
723 }
724 <xdolq>. {
725 /* This is only needed for $ inside the quoted text */
726 addlitchar(yytext[0], yyscanner);
727 }
728 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
729
730 {xdstart} {
731 SET_YYLLOC();
732 BEGIN(xd);
733 startlit();
734 }
735 {xuistart} {
736 SET_YYLLOC();
737 BEGIN(xui);
738 startlit();
739 }
740 <xd>{xdstop} {
741 char *ident;
742
743 BEGIN(INITIAL);
744 if (yyextra->literallen == 0)
745 yyerror("zero-length delimited identifier");
746 ident = litbufdup(yyscanner);
747 if (yyextra->literallen >= NAMEDATALEN)
748 truncate_identifier(ident, yyextra->literallen, true);
749 yylval->str = ident;
750 return IDENT;
751 }
752 <xui>{dquote} {
753 yyless(1);
754 /* xuiend state looks for possible UESCAPE */
755 BEGIN(xuiend);
756 }
757 <xuiend>{whitespace} {
758 /* stay in xuiend state over whitespace */
759 }
760 <xuiend><<EOF>> |
761 <xuiend>{other} |
762 <xuiend>{xustop1} {
763 /* no UESCAPE after the quote, throw back everything */
764 char *ident;
765 int identlen;
766
767 yyless(0);
768
769 BEGIN(INITIAL);
770 if (yyextra->literallen == 0)
771 yyerror("zero-length delimited identifier");
772 ident = litbuf_udeescape('\\', yyscanner);
773 identlen = strlen(ident);
774 if (identlen >= NAMEDATALEN)
775 truncate_identifier(ident, identlen, true);
776 yylval->str = ident;
777 return IDENT;
778 }
779 <xuiend>{xustop2} {
780 /* found UESCAPE after the end quote */
781 char *ident;
782 int identlen;
783
784 BEGIN(INITIAL);
785 if (yyextra->literallen == 0)
786 yyerror("zero-length delimited identifier");
787 if (!check_uescapechar(yytext[yyleng - 2]))
788 {
789 SET_YYLLOC();
790 ADVANCE_YYLLOC(yyleng - 2);
791 yyerror("invalid Unicode escape character");
792 }
793 ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
794 identlen = strlen(ident);
795 if (identlen >= NAMEDATALEN)
796 truncate_identifier(ident, identlen, true);
797 yylval->str = ident;
798 return IDENT;
799 }
800 <xd,xui>{xddouble} {
801 addlitchar('"', yyscanner);
802 }
803 <xd,xui>{xdinside} {
804 addlit(yytext, yyleng, yyscanner);
805 }
806 <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
807
808 {xufailed} {
809 char *ident;
810
811 SET_YYLLOC();
812 /* throw back all but the initial u/U */
813 yyless(1);
814 /* and treat it as {identifier} */
815 ident = downcase_truncate_identifier(yytext, yyleng, true);
816 yylval->str = ident;
817 return IDENT;
818 }
819
820 {typecast} {
821 SET_YYLLOC();
822 return TYPECAST;
823 }
824
825 {dot_dot} {
826 SET_YYLLOC();
827 return DOT_DOT;
828 }
829
830 {colon_equals} {
831 SET_YYLLOC();
832 return COLON_EQUALS;
833 }
834
835 {equals_greater} {
836 SET_YYLLOC();
837 return EQUALS_GREATER;
838 }
839
840 {less_equals} {
841 SET_YYLLOC();
842 return LESS_EQUALS;
843 }
844
845 {greater_equals} {
846 SET_YYLLOC();
847 return GREATER_EQUALS;
848 }
849
850 {less_greater} {
851 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
852 SET_YYLLOC();
853 return NOT_EQUALS;
854 }
855
856 {not_equals} {
857 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
858 SET_YYLLOC();
859 return NOT_EQUALS;
860 }
861
862 {self} {
863 SET_YYLLOC();
864 return yytext[0];
865 }
866
{operator}		{
					/*
					 * An operator must end where an embedded comment
					 * begins, so look for slash-star and dash-dash inside
					 * the match.  (A comment start at the very first
					 * character is taken by an earlier rule, never by
					 * this one.)
					 */
					int			oplen = yyleng;
					char	   *cstyle = strstr(yytext, "/*");
					char	   *dashes = strstr(yytext, "--");
					char	   *cut;

					/* whichever comment start comes first, if there is one */
					if (cstyle == NULL)
						cut = dashes;
					else if (dashes == NULL || cstyle < dashes)
						cut = cstyle;
					else
						cut = dashes;

					if (cut != NULL)
						oplen = cut - yytext;

					/*
					 * SQL compatibility: a multi-character operator may
					 * end in '+' or '-' only if it also contains some
					 * character that cannot appear in an SQL operator.
					 * Thus '=-' is lexed as two operators, while a name
					 * such as '?-' remains a single operator.
					 */
					if (oplen > 1 &&
						(yytext[oplen - 1] == '+' ||
						 yytext[oplen - 1] == '-'))
					{
						bool		has_nonsql_char = false;
						int			i;

						/* examine everything before the final character */
						for (i = 0; i < oplen - 1; i++)
						{
							if (strchr("~!@#^&|`?%", yytext[i]) != NULL)
							{
								has_nonsql_char = true;
								break;
							}
						}

						if (!has_nonsql_char)
						{
							/*
							 * No qualifying character: drop every
							 * trailing '+' and '-', but always keep at
							 * least one character.
							 */
							oplen--;
							while (oplen > 1 &&
								   (yytext[oplen - 1] == '+' ||
									yytext[oplen - 1] == '-'))
								oplen--;
						}
					}

					SET_YYLLOC();

					if (oplen < yyleng)
					{
						/* Give back the characters that are not ours */
						yyless(oplen);

						/*
						 * A single leftover character that belongs to
						 * the "self" set is returned exactly the way the
						 * "self" rule would return it.
						 */
						if (oplen == 1 &&
							strchr(",()[].;:+-*/%^<>=", yytext[0]))
							return yytext[0];

						/*
						 * Two leftover characters spelling ">=", "<=",
						 * "=>", "<>" or "!=" get their dedicated token
						 * instead of the generic Op.
						 */
						if (oplen == 2)
						{
							switch (yytext[0])
							{
								case '=':
									if (yytext[1] == '>')
										return EQUALS_GREATER;
									break;
								case '>':
									if (yytext[1] == '=')
										return GREATER_EQUALS;
									break;
								case '<':
									if (yytext[1] == '=')
										return LESS_EQUALS;
									if (yytext[1] == '>')
										return NOT_EQUALS;
									break;
								case '!':
									if (yytext[1] == '=')
										return NOT_EQUALS;
									break;
								default:
									break;
							}
						}
					}

					/*
					 * An over-long operator is an error, not a
					 * notice-and-truncate as for identifiers: it is
					 * almost certainly a syntactic mistake.
					 */
					if (oplen >= NAMEDATALEN)
						yyerror("operator too long");

					yylval->str = pstrdup(yytext);
					return Op;
				}
974
{param}			{
					/*
					 * Positional parameter reference.  The pattern
					 * guarantees that yytext is "$" followed by one or
					 * more decimal digits, so the only thing that can go
					 * wrong in the conversion is overflow.  atol() cannot
					 * report that, and its long result would be narrowed
					 * silently on assignment, letting an oversized number
					 * alias some other parameter.  Convert with strtol()
					 * and reject anything that does not fit instead.
					 *
					 * NOTE(review): assumes yylval->ival is an int --
					 * confirm against the core_YYSTYPE declaration.
					 */
					long		paramno;

					SET_YYLLOC();
					errno = 0;
					paramno = strtol(yytext + 1, NULL, 10);
					if (errno == ERANGE || paramno > INT_MAX)
						yyerror("parameter number too large");
					yylval->ival = (int) paramno;
					return PARAM;
				}
980
981 {integer} {
982 SET_YYLLOC();
983 return process_integer_literal(yytext, yylval);
984 }
985 {decimal} {
986 SET_YYLLOC();
987 yylval->str = pstrdup(yytext);
988 return FCONST;
989 }
990 {decimalfail} {
991 /* throw back the .., and treat as integer */
992 yyless(yyleng - 2);
993 SET_YYLLOC();
994 return process_integer_literal(yytext, yylval);
995 }
996 {real} {
997 SET_YYLLOC();
998 yylval->str = pstrdup(yytext);
999 return FCONST;
1000 }
1001 {realfail1} {
1002 /*
1003 * throw back the [Ee], and treat as {decimal}. Note
1004 * that it is possible the input is actually {integer},
1005 * but since this case will almost certainly lead to a
1006 * syntax error anyway, we don't bother to distinguish.
1007 */
1008 yyless(yyleng - 1);
1009 SET_YYLLOC();
1010 yylval->str = pstrdup(yytext);
1011 return FCONST;
1012 }
1013 {realfail2} {
1014 /* throw back the [Ee][+-], and proceed as above */
1015 yyless(yyleng - 2);
1016 SET_YYLLOC();
1017 yylval->str = pstrdup(yytext);
1018 return FCONST;
1019 }
1020
1021
{identifier}	{
					const ScanKeyword *kw;

					SET_YYLLOC();

					/* Look the text up in the scanner's keyword table */
					kw = ScanKeywordLookup(yytext,
										   yyextra->keywords,
										   yyextra->num_keywords);
					if (kw == NULL)
					{
						/*
						 * Not a keyword: hand back an ordinary
						 * identifier, lower-cased and truncated if
						 * necessary.
						 */
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* A keyword: return its own token code and name */
					yylval->keyword = kw->name;
					return kw->value;
				}
1046
1047 {other} {
1048 SET_YYLLOC();
1049 return yytext[0];
1050 }
1051
1052 <<EOF>> {
1053 SET_YYLLOC();
1054 yyterminate();
1055 }
1056
1057 %%
1058
1059 /* LCOV_EXCL_STOP */
1060
/*
 * Subroutines of the main yylex() function need access to yyextra, yylloc
 * and yyleng as well.  Each such subroutine is expected to take a yyscanner
 * parameter.  Instead of going through the yyget_xxx functions, which the
 * compiler might or might not inline, we cheat a little: the macros below
 * cast yyscanner to the right struct type and reach into it directly.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

#undef yyleng
#define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)

#undef yylloc
#define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
1075
1076
1077 /*
1078 * scanner_errposition
1079 * Report a lexer or grammar error cursor position, if possible.
1080 *
1081 * This is expected to be used within an ereport() call. The return value
1082 * is a dummy (always 0, in fact).
1083 *
1084 * Note that this can only be used for messages emitted during raw parsing
1085 * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1086 * to still be available.
1087 */
1088 int
1089 scanner_errposition(int location, core_yyscan_t yyscanner)
1090 {
1091 int pos;
1092
1093 if (location < 0)
1094 return 0; /* no-op if location is unknown */
1095
1096 /* Convert byte offset to character number */
1097 pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1098 /* And pass it to the ereport mechanism */
1099 return errposition(pos);
1100 }
1101
1102 /*
1103 * scanner_yyerror
1104 * Report a lexer or grammar error.
1105 *
1106 * The message's cursor position is whatever YYLLOC was last set to,
1107 * ie, the start of the current token if called within yylex(), or the
1108 * most recently lexed token if called from the grammar.
1109 * This is OK for syntax error messages from the Bison parser, because Bison
1110 * parsers report error as soon as the first unparsable token is reached.
1111 * Beware of using yyerror for other purposes, as the cursor position might
1112 * be misleading!
1113 */
1114 void
1115 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1116 {
1117 const char *loc = yyextra->scanbuf + *yylloc;
1118
1119 if (*loc == YY_END_OF_BUFFER_CHAR)
1120 {
1121 ereport(ERROR,
1122 (errcode(ERRCODE_SYNTAX_ERROR),
1123 /* translator: %s is typically the translation of "syntax error" */
1124 errmsg("%s at end of input", _(message)),
1125 lexer_errposition()));
1126 }
1127 else
1128 {
1129 ereport(ERROR,
1130 (errcode(ERRCODE_SYNTAX_ERROR),
1131 /* translator: first %s is typically the translation of "syntax error" */
1132 errmsg("%s at or near \"%s\"", _(message), loc),
1133 lexer_errposition()));
1134 }
1135 }
1136
1137
1138 /*
1139 * Called before any actual parsing is done
1140 */
1141 core_yyscan_t
1142 scanner_init(const char *str,
1143 core_yy_extra_type *yyext,
1144 const ScanKeyword *keywords,
1145 int num_keywords)
1146 {
1147 Size slen = strlen(str);
1148 yyscan_t scanner;
1149
1150 if (yylex_init(&scanner) != 0)
1151 elog(ERROR, "yylex_init() failed: %m");
1152
1153 core_yyset_extra(yyext, scanner);
1154
1155 yyext->keywords = keywords;
1156 yyext->num_keywords = num_keywords;
1157
1158 yyext->backslash_quote = backslash_quote;
1159 yyext->escape_string_warning = escape_string_warning;
1160 yyext->standard_conforming_strings = standard_conforming_strings;
1161
1162 /*
1163 * Make a scan buffer with special termination needed by flex.
1164 */
1165 yyext->scanbuf = (char *) palloc(slen + 2);
1166 yyext->scanbuflen = slen;
1167 memcpy(yyext->scanbuf, str, slen);
1168 yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1169 yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1170
1171 /* initialize literal buffer to a reasonable but expansible size */
1172 yyext->literalalloc = 1024;
1173 yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1174 yyext->literallen = 0;
1175
1176 return scanner;
1177 }
1178
1179
1180 /*
1181 * Called after parsing is done to clean up after scanner_init()
1182 */
1183 void
1184 scanner_finish(core_yyscan_t yyscanner)
1185 {
1186 /*
1187 * We don't bother to call yylex_destroy(), because all it would do is
1188 * pfree a small amount of control storage. It's cheaper to leak the
1189 * storage until the parsing context is destroyed. The amount of space
1190 * involved is usually negligible compared to the output parse tree
1191 * anyway.
1192 *
1193 * We do bother to pfree the scanbuf and literal buffer, but only if they
1194 * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
1195 */
1196 if (yyextra->scanbuflen >= 8192)
1197 pfree(yyextra->scanbuf);
1198 if (yyextra->literalalloc >= 8192)
1199 pfree(yyextra->literalbuf);
1200 }
1201
1202
1203 static void
1204 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1205 {
1206 /* enlarge buffer if needed */
1207 if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1208 {
1209 do
1210 {
1211 yyextra->literalalloc *= 2;
1212 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1213 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1214 yyextra->literalalloc);
1215 }
1216 /* append new data */
1217 memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1218 yyextra->literallen += yleng;
1219 }
1220
1221
1222 static void
1223 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1224 {
1225 /* enlarge buffer if needed */
1226 if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1227 {
1228 yyextra->literalalloc *= 2;
1229 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1230 yyextra->literalalloc);
1231 }
1232 /* append new data */
1233 yyextra->literalbuf[yyextra->literallen] = ychar;
1234 yyextra->literallen += 1;
1235 }
1236
1237
1238 /*
1239 * Create a palloc'd copy of literalbuf, adding a trailing null.
1240 */
1241 static char *
1242 litbufdup(core_yyscan_t yyscanner)
1243 {
1244 int llen = yyextra->literallen;
1245 char *new;
1246
1247 new = palloc(llen + 1);
1248 memcpy(new, yyextra->literalbuf, llen);
1249 new[llen] = '\0';
1250 return new;
1251 }
1252
1253 static int
1254 process_integer_literal(const char *token, YYSTYPE *lval)
1255 {
1256 int val;
1257 char *endptr;
1258
1259 errno = 0;
1260 val = strtoint(token, &endptr, 10);
1261 if (*endptr != '\0' || errno == ERANGE)
1262 {
1263 /* integer too large, treat it as a float */
1264 lval->str = pstrdup(token);
1265 return FCONST;
1266 }
1267 lval->ival = val;
1268 return ICONST;
1269 }
1270
1271 static unsigned int
1272 hexval(unsigned char c)
1273 {
1274 if (c >= '0' && c <= '9')
1275 return c - '0';
1276 if (c >= 'a' && c <= 'f')
1277 return c - 'a' + 0xA;
1278 if (c >= 'A' && c <= 'F')
1279 return c - 'A' + 0xA;
1280 elog(ERROR, "invalid hexadecimal digit");
1281 return 0; /* not reached */
1282 }
1283
1284 static void
1285 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1286 {
1287 if (GetDatabaseEncoding() == PG_UTF8)
1288 return;
1289
1290 if (c > 0x7F)
1291 {
1292 ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */
1293 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1294 }
1295 }
1296
1297 static bool
1298 is_utf16_surrogate_first(pg_wchar c)
1299 {
1300 return (c >= 0xD800 && c <= 0xDBFF);
1301 }
1302
1303 static bool
1304 is_utf16_surrogate_second(pg_wchar c)
1305 {
1306 return (c >= 0xDC00 && c <= 0xDFFF);
1307 }
1308
1309 static pg_wchar
1310 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1311 {
1312 return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1313 }
1314
1315 static void
1316 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1317 {
1318 char buf[8];
1319
1320 if (c == 0 || c > 0x10FFFF)
1321 yyerror("invalid Unicode escape value");
1322 if (c > 0x7F)
1323 {
1324 if (GetDatabaseEncoding() != PG_UTF8)
1325 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1326 yyextra->saw_non_ascii = true;
1327 }
1328 unicode_to_utf8(c, (unsigned char *) buf);
1329 addlit(buf, pg_mblen(buf), yyscanner);
1330 }
1331
/*
 * check_uescapechar
 *		Is "escape" acceptable as a Unicode escape character (UESCAPE syntax)?
 *
 * Hexadecimal digits, the plus sign, single and double quotes, and
 * whitespace are not acceptable; anything else is.
 */
static bool
check_uescapechar(unsigned char escape)
{
	bool		forbidden;

	forbidden = isxdigit(escape) ||
		escape == '+' ||
		escape == '\'' ||
		escape == '"' ||
		scanner_isspace(escape);

	return !forbidden;
}
1347
1348 /* like litbufdup, but handle unicode escapes */
1349 static char *
1350 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1351 {
1352 char *new;
1353 char *litbuf,
1354 *in,
1355 *out;
1356 pg_wchar pair_first = 0;
1357
1358 /* Make literalbuf null-terminated to simplify the scanning loop */
1359 litbuf = yyextra->literalbuf;
1360 litbuf[yyextra->literallen] = '\0';
1361
1362 /*
1363 * This relies on the subtle assumption that a UTF-8 expansion cannot be
1364 * longer than its escaped representation.
1365 */
1366 new = palloc(yyextra->literallen + 1);
1367
1368 in = litbuf;
1369 out = new;
1370 while (*in)
1371 {
1372 if (in[0] == escape)
1373 {
1374 if (in[1] == escape)
1375 {
1376 if (pair_first)
1377 {
1378 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1379 yyerror("invalid Unicode surrogate pair");
1380 }
1381 *out++ = escape;
1382 in += 2;
1383 }
1384 else if (isxdigit((unsigned char) in[1]) &&
1385 isxdigit((unsigned char) in[2]) &&
1386 isxdigit((unsigned char) in[3]) &&
1387 isxdigit((unsigned char) in[4]))
1388 {
1389 pg_wchar unicode;
1390
1391 unicode = (hexval(in[1]) << 12) +
1392 (hexval(in[2]) << 8) +
1393 (hexval(in[3]) << 4) +
1394 hexval(in[4]);
1395 check_unicode_value(unicode, in, yyscanner);
1396 if (pair_first)
1397 {
1398 if (is_utf16_surrogate_second(unicode))
1399 {
1400 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1401 pair_first = 0;
1402 }
1403 else
1404 {
1405 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1406 yyerror("invalid Unicode surrogate pair");
1407 }
1408 }
1409 else if (is_utf16_surrogate_second(unicode))
1410 yyerror("invalid Unicode surrogate pair");
1411
1412 if (is_utf16_surrogate_first(unicode))
1413 pair_first = unicode;
1414 else
1415 {
1416 unicode_to_utf8(unicode, (unsigned char *) out);
1417 out += pg_mblen(out);
1418 }
1419 in += 5;
1420 }
1421 else if (in[1] == '+' &&
1422 isxdigit((unsigned char) in[2]) &&
1423 isxdigit((unsigned char) in[3]) &&
1424 isxdigit((unsigned char) in[4]) &&
1425 isxdigit((unsigned char) in[5]) &&
1426 isxdigit((unsigned char) in[6]) &&
1427 isxdigit((unsigned char) in[7]))
1428 {
1429 pg_wchar unicode;
1430
1431 unicode = (hexval(in[2]) << 20) +
1432 (hexval(in[3]) << 16) +
1433 (hexval(in[4]) << 12) +
1434 (hexval(in[5]) << 8) +
1435 (hexval(in[6]) << 4) +
1436 hexval(in[7]);
1437 check_unicode_value(unicode, in, yyscanner);
1438 if (pair_first)
1439 {
1440 if (is_utf16_surrogate_second(unicode))
1441 {
1442 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1443 pair_first = 0;
1444 }
1445 else
1446 {
1447 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1448 yyerror("invalid Unicode surrogate pair");
1449 }
1450 }
1451 else if (is_utf16_surrogate_second(unicode))
1452 yyerror("invalid Unicode surrogate pair");
1453
1454 if (is_utf16_surrogate_first(unicode))
1455 pair_first = unicode;
1456 else
1457 {
1458 unicode_to_utf8(unicode, (unsigned char *) out);
1459 out += pg_mblen(out);
1460 }
1461 in += 8;
1462 }
1463 else
1464 {
1465 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1466 yyerror("invalid Unicode escape value");
1467 }
1468 }
1469 else
1470 {
1471 if (pair_first)
1472 {
1473 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1474 yyerror("invalid Unicode surrogate pair");
1475 }
1476 *out++ = *in++;
1477 }
1478 }
1479
1480 /* unfinished surrogate pair? */
1481 if (pair_first)
1482 {
1483 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1484 yyerror("invalid Unicode surrogate pair");
1485 }
1486
1487 *out = '\0';
1488
1489 /*
1490 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1491 * codes; but it's probably not worth the trouble, since this isn't likely
1492 * to be a performance-critical path.
1493 */
1494 pg_verifymbstr(new, out - new, false);
1495 return new;
1496 }
1497
1498 static unsigned char
1499 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1500 {
1501 switch (c)
1502 {
1503 case 'b':
1504 return '\b';
1505 case 'f':
1506 return '\f';
1507 case 'n':
1508 return '\n';
1509 case 'r':
1510 return '\r';
1511 case 't':
1512 return '\t';
1513 default:
1514 /* check for backslash followed by non-7-bit-ASCII */
1515 if (c == '\0' || IS_HIGHBIT_SET(c))
1516 yyextra->saw_non_ascii = true;
1517
1518 return c;
1519 }
1520 }
1521
1522 static void
1523 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1524 {
1525 if (ychar == '\'')
1526 {
1527 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1528 ereport(WARNING,
1529 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1530 errmsg("nonstandard use of \\' in a string literal"),
1531 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1532 lexer_errposition()));
1533 yyextra->warn_on_first_escape = false; /* warn only once per string */
1534 }
1535 else if (ychar == '\\')
1536 {
1537 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1538 ereport(WARNING,
1539 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1540 errmsg("nonstandard use of \\\\ in a string literal"),
1541 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1542 lexer_errposition()));
1543 yyextra->warn_on_first_escape = false; /* warn only once per string */
1544 }
1545 else
1546 check_escape_warning(yyscanner);
1547 }
1548
1549 static void
1550 check_escape_warning(core_yyscan_t yyscanner)
1551 {
1552 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1553 ereport(WARNING,
1554 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1555 errmsg("nonstandard use of escape in a string literal"),
1556 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1557 lexer_errposition()));
1558 yyextra->warn_on_first_escape = false; /* warn only once per string */
1559 }
1560
1561 /*
1562 * Interface functions to make flex use palloc() instead of malloc().
1563 * It'd be better to make these static, but flex insists otherwise.
1564 */
1565
1566 void *
1567 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1568 {
1569 return palloc(bytes);
1570 }
1571
1572 void *
1573 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1574 {
1575 if (ptr)
1576 return repalloc(ptr, bytes);
1577 else
1578 return palloc(bytes);
1579 }
1580
1581 void
1582 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1583 {
1584 if (ptr)
1585 pfree(ptr);
1586 }
1587