1 %top{
2 /*-------------------------------------------------------------------------
3 *
4 * scan.l
5 * lexical scanner for PostgreSQL
6 *
7 * NOTE NOTE NOTE:
8 *
9 * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
10 *
11 * The rules are designed so that the scanner never has to backtrack,
12 * in the sense that there is always a rule that can match the input
13 * consumed so far (the rule action may internally throw back some input
14 * with yyless(), however). As explained in the flex manual, this makes
15 * for a useful speed increase --- about a third faster than a plain -CF
16 * lexer, in simple testing. The extra complexity is mostly in the rules
17 * for handling float numbers and continued string literals. If you change
18 * the lexical rules, verify that you haven't broken the no-backtrack
19 * property by running flex with the "-b" option and checking that the
20 * resulting "lex.backup" file says that no backing up is needed. (As of
21 * Postgres 9.2, this check is made automatically by the Makefile.)
22 *
23 *
24 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
25 * Portions Copyright (c) 1994, Regents of the University of California
26 *
27 * IDENTIFICATION
28 * src/backend/parser/scan.l
29 *
30 *-------------------------------------------------------------------------
31 */
#include "postgres.h"

#include <ctype.h>
#include <errno.h>
#include <unistd.h>

#include "parser/gramparse.h"
#include "parser/parser.h"		/* only needed for GUC variables */
#include "parser/scansup.h"
#include "mb/pg_wchar.h"
41 }
42
43 %{
44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
45 #undef fprintf
46 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
47
48 static void
fprintf_to_ereport(const char * fmt,const char * msg)49 fprintf_to_ereport(const char *fmt, const char *msg)
50 {
51 ereport(ERROR, (errmsg_internal("%s", msg)));
52 }
53
54 /*
55 * GUC variables. This is a DIRECT violation of the warning given at the
56 * head of gram.y, ie flex/bison code must not depend on any GUC variables;
57 * as such, changing their values can induce very unintuitive behavior.
58 * But we shall have to live with it until we can remove these variables.
59 */
60 int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
61 bool escape_string_warning = true;
62 bool standard_conforming_strings = true;
63
64 /*
65 * Set the type of YYSTYPE.
66 */
67 #define YYSTYPE core_YYSTYPE
68
69 /*
70 * Set the type of yyextra. All state variables used by the scanner should
71 * be in yyextra, *not* statically allocated.
72 */
73 #define YY_EXTRA_TYPE core_yy_extra_type *
74
75 /*
76 * Each call to yylex must set yylloc to the location of the found token
77 * (expressed as a byte offset from the start of the input text).
78 * When we parse a token that requires multiple lexer rules to process,
79 * this should be done in the first such rule, else yylloc will point
80 * into the middle of the token.
81 */
82 #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
83
84 /*
85 * Advance yylloc by the given number of bytes.
86 */
87 #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
88
89 #define startlit() ( yyextra->literallen = 0 )
90 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
91 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
92 static char *litbufdup(core_yyscan_t yyscanner);
93 static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
94 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
95 static int process_integer_literal(const char *token, YYSTYPE *lval);
96 static bool is_utf16_surrogate_first(pg_wchar c);
97 static bool is_utf16_surrogate_second(pg_wchar c);
98 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
99 static void addunicode(pg_wchar c, yyscan_t yyscanner);
100 static bool check_uescapechar(unsigned char escape);
101
102 #define yyerror(msg) scanner_yyerror(msg, yyscanner)
103
104 #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
105
106 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
107 static void check_escape_warning(core_yyscan_t yyscanner);
108
109 /*
110 * Work around a bug in flex 2.5.35: it emits a couple of functions that
111 * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
112 * this would cause warnings. Providing our own declarations should be
113 * harmless even when the bug gets fixed.
114 */
115 extern int core_yyget_column(yyscan_t yyscanner);
116 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
117
118 %}
119
120 %option reentrant
121 %option bison-bridge
122 %option bison-locations
123 %option 8bit
124 %option never-interactive
125 %option nodefault
126 %option noinput
127 %option nounput
128 %option noyywrap
129 %option noyyalloc
130 %option noyyrealloc
131 %option noyyfree
132 %option warn
133 %option prefix="core_yy"
134
135 /*
136 * OK, here is a short description of lex/flex rules behavior.
137 * The longest pattern which matches an input string is always chosen.
138 * For equal-length patterns, the first occurring in the rules list is chosen.
139 * INITIAL is the starting state, to which all non-conditional rules apply.
140 * Exclusive states change parsing rules while the state is active. When in
141 * an exclusive state, only those rules defined for that state apply.
142 *
143 * We use exclusive states for quoted strings, extended comments,
144 * and to eliminate parsing troubles for numeric strings.
145 * Exclusive states:
146 * <xb> bit string literal
147 * <xc> extended C-style comments
148 * <xd> delimited identifiers (double-quoted identifiers)
149 * <xh> hexadecimal numeric string
150 * <xq> standard quoted strings
151 * <xe> extended quoted strings (support backslash escape sequences)
152 * <xdolq> $foo$ quoted strings
153 * <xui> quoted identifier with Unicode escapes
154 * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
155 * <xus> quoted string with Unicode escapes
156 * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
157 * <xeu> Unicode surrogate pair in extended quoted string
158 *
159 * Remember to add an <<EOF>> case whenever you add a new exclusive state!
160 * The default one is probably not the right thing.
161 */
162
163 %x xb
164 %x xc
165 %x xd
166 %x xh
167 %x xe
168 %x xq
169 %x xdolq
170 %x xui
171 %x xuiend
172 %x xus
173 %x xusend
174 %x xeu
175
176 /*
177 * In order to make the world safe for Windows and Mac clients as well as
178 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
179 * sequence will be seen as two successive newlines, but that doesn't cause
180 * any problems. Comments that start with -- and extend to the next
181 * newline are treated as equivalent to a single whitespace character.
182 *
183 * NOTE a fine point: if there is no newline following --, we will absorb
184 * everything to the end of the input as a comment. This is correct. Older
185 * versions of Postgres failed to recognize -- as a comment if the input
186 * did not end with a newline.
187 *
188 * XXX perhaps \f (formfeed) should be treated as a newline as well?
189 *
190 * XXX if you change the set of whitespace characters, fix scanner_isspace()
191 * to agree, and see also the plpgsql lexer.
192 */
193
194 space [ \t\n\r\f]
195 horiz_space [ \t\f]
196 newline [\n\r]
197 non_newline [^\n\r]
198
199 comment ("--"{non_newline}*)
200
201 whitespace ({space}+|{comment})
202
203 /*
204 * SQL requires at least one newline in the whitespace separating
205 * string literals that are to be concatenated. Silly, but who are we
206 * to argue? Note that {whitespace_with_newline} should not have * after
207 * it, whereas {whitespace} should generally have a * after it...
208 */
209
210 special_whitespace ({space}+|{comment}{newline})
211 horiz_whitespace ({horiz_space}|{comment})
212 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
213
214 /*
215 * To ensure that {quotecontinue} can be scanned without having to back up
216 * if the full pattern isn't matched, we include trailing whitespace in
217 * {quotestop}. This matches all cases where {quotecontinue} fails to match,
218 * except for {quote} followed by whitespace and just one "-" (not two,
219 * which would start a {comment}). To cover that we have {quotefail}.
220 * The actions for {quotestop} and {quotefail} must throw back characters
221 * beyond the quote proper.
222 */
223 quote '
224 quotestop {quote}{whitespace}*
225 quotecontinue {quote}{whitespace_with_newline}{quote}
226 quotefail {quote}{whitespace}*"-"
227
228 /* Bit string
229 * It is tempting to scan the string for only those characters
230 * which are allowed. However, this leads to silently swallowed
231 * characters if illegal characters are included in the string.
232 * For example, if xbinside is [01] then B'ABCD' is interpreted
233 * as a zero-length string, and the ABCD' is lost!
234 * Better to pass the string forward and let the input routines
235 * validate the contents.
236 */
237 xbstart [bB]{quote}
238 xbinside [^']*
239
240 /* Hexadecimal number */
241 xhstart [xX]{quote}
242 xhinside [^']*
243
244 /* National character */
245 xnstart [nN]{quote}
246
247 /* Quoted string that allows backslash escapes */
248 xestart [eE]{quote}
249 xeinside [^\\']+
250 xeescape [\\][^0-7]
251 xeoctesc [\\][0-7]{1,3}
252 xehexesc [\\]x[0-9A-Fa-f]{1,2}
253 xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
254 xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
255
256 /* Extended quote
257 * xqdouble implements embedded quote, ''''
258 */
259 xqstart {quote}
260 xqdouble {quote}{quote}
261 xqinside [^']+
262
263 /* $foo$ style quotes ("dollar quoting")
264 * The quoted string starts with $foo$ where "foo" is an optional string
265 * in the form of an identifier, except that it may not contain "$",
266 * and extends to the first occurrence of an identical string.
267 * There is *no* processing of the quoted text.
268 *
269 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
270 * fails to match its trailing "$".
271 */
272 dolq_start [A-Za-z\200-\377_]
273 dolq_cont [A-Za-z\200-\377_0-9]
274 dolqdelim \$({dolq_start}{dolq_cont}*)?\$
275 dolqfailed \${dolq_start}{dolq_cont}*
276 dolqinside [^$]+
277
278 /* Double quote
279 * Allows embedded spaces and other special characters into identifiers.
280 */
281 dquote \"
282 xdstart {dquote}
283 xdstop {dquote}
284 xddouble {dquote}{dquote}
285 xdinside [^"]+
286
287 /* Unicode escapes */
288 uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
289 /* error rule to avoid backup */
290 uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
291
292 /* Quoted identifier with Unicode escapes */
293 xuistart [uU]&{dquote}
294
295 /* Quoted string with Unicode escapes */
296 xusstart [uU]&{quote}
297
298 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
299 xustop1 {uescapefail}?
300 xustop2 {uescape}
301
302 /* error rule to avoid backup */
303 xufailed [uU]&
304
305
306 /* C-style comments
307 *
308 * The "extended comment" syntax closely resembles allowable operator syntax.
309 * The tricky part here is to get lex to recognize a string starting with
310 * slash-star as a comment, when interpreting it as an operator would produce
311 * a longer match --- remember lex will prefer a longer match! Also, if we
312 * have something like plus-slash-star, lex will think this is a 3-character
313 * operator whereas we want to see it as a + operator and a comment start.
314 * The solution is two-fold:
315 * 1. append {op_chars}* to xcstart so that it matches as much text as
316 * {operator} would. Then the tie-breaker (first matching rule of same
317 * length) ensures xcstart wins. We put back the extra stuff with yyless()
318 * in case it contains a star-slash that should terminate the comment.
319 * 2. In the operator rule, check for slash-star within the operator, and
320 * if found throw it back with yyless(). This handles the plus-slash-star
321 * problem.
322 * Dash-dash comments have similar interactions with the operator rule.
323 */
324 xcstart \/\*{op_chars}*
325 xcstop \*+\/
326 xcinside [^*/]+
327
328 digit [0-9]
329 ident_start [A-Za-z\200-\377_]
330 ident_cont [A-Za-z\200-\377_0-9\$]
331
332 identifier {ident_start}{ident_cont}*
333
334 /* Assorted special-case operators and operator-like tokens */
335 typecast "::"
336 dot_dot \.\.
337 colon_equals ":="
338
339 /*
340 * These operator-like tokens (unlike the above ones) also match the {operator}
341 * rule, which means that they might be overridden by a longer match if they
342 * are followed by a comment start or a + or - character. Accordingly, if you
343 * add to this list, you must also add corresponding code to the {operator}
344 * block to return the correct token in such cases. (This is not needed in
345 * psqlscan.l since the token value is ignored there.)
346 */
347 equals_greater "=>"
348 less_equals "<="
349 greater_equals ">="
350 less_greater "<>"
351 not_equals "!="
352
353 /*
354 * "self" is the set of chars that should be returned as single-character
355 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
356 * which can be one or more characters long (but if a single-char token
357 * appears in the "self" set, it is not to be returned as an Op). Note
358 * that the sets overlap, but each has some chars that are not in the other.
359 *
360 * If you change either set, adjust the character lists appearing in the
361 * rule for "operator"!
362 */
363 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
364 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
365 operator {op_chars}+
366
/*
 * We no longer allow unary minus in numbers.  Instead, the minus sign is
 * passed to the parser as a separate token, and the parser coerces the
 * constant via doNegate() -- Leon aug 20 1999
 *
 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
 *
 * {realfail1} and {realfail2} are added to prevent the need for scanner
 * backup when the {real} rule fails to match completely.
 */
376
377 integer {digit}+
378 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
379 decimalfail {digit}+\.\.
380 real ({integer}|{decimal})[Ee][-+]?{digit}+
381 realfail1 ({integer}|{decimal})[Ee]
382 realfail2 ({integer}|{decimal})[Ee][-+]
383
384 param \${integer}
385
386 other .
387
388 /*
389 * Dollar quoted strings are totally opaque, and no escaping is done on them.
390 * Other quoted strings must allow some special characters such as single-quote
391 * and newline.
392 * Embedded single-quotes are implemented both in the SQL standard
393 * style of two adjacent single quotes "''" and in the Postgres/Java style
394 * of escaped-quote "\'".
395 * Other embedded escaped characters are matched explicitly and the leading
396 * backslash is dropped from the string.
397 * Note that xcstart must appear before operator, as explained above!
398 * Also whitespace (comment) must appear before operator.
399 */
400
401 %%
402
403 {whitespace} {
404 /* ignore */
405 }
406
407 {xcstart} {
408 /* Set location in case of syntax error in comment */
409 SET_YYLLOC();
410 yyextra->xcdepth = 0;
411 BEGIN(xc);
412 /* Put back any characters past slash-star; see above */
413 yyless(2);
414 }
415
416 <xc>{xcstart} {
417 (yyextra->xcdepth)++;
418 /* Put back any characters past slash-star; see above */
419 yyless(2);
420 }
421
422 <xc>{xcstop} {
423 if (yyextra->xcdepth <= 0)
424 BEGIN(INITIAL);
425 else
426 (yyextra->xcdepth)--;
427 }
428
429 <xc>{xcinside} {
430 /* ignore */
431 }
432
433 <xc>{op_chars} {
434 /* ignore */
435 }
436
437 <xc>\*+ {
438 /* ignore */
439 }
440
441 <xc><<EOF>> { yyerror("unterminated /* comment"); }
442
443 {xbstart} {
444 /* Binary bit type.
445 * At some point we should simply pass the string
446 * forward to the parser and label it there.
447 * In the meantime, place a leading "b" on the string
448 * to mark it for the input routine as a binary string.
449 */
450 SET_YYLLOC();
451 BEGIN(xb);
452 startlit();
453 addlitchar('b', yyscanner);
454 }
455 <xb>{quotestop} |
456 <xb>{quotefail} {
457 yyless(1);
458 BEGIN(INITIAL);
459 yylval->str = litbufdup(yyscanner);
460 return BCONST;
461 }
462 <xh>{xhinside} |
463 <xb>{xbinside} {
464 addlit(yytext, yyleng, yyscanner);
465 }
466 <xh>{quotecontinue} |
467 <xb>{quotecontinue} {
468 /* ignore */
469 }
470 <xb><<EOF>> { yyerror("unterminated bit string literal"); }
471
472 {xhstart} {
473 /* Hexadecimal bit type.
474 * At some point we should simply pass the string
475 * forward to the parser and label it there.
476 * In the meantime, place a leading "x" on the string
477 * to mark it for the input routine as a hex string.
478 */
479 SET_YYLLOC();
480 BEGIN(xh);
481 startlit();
482 addlitchar('x', yyscanner);
483 }
484 <xh>{quotestop} |
485 <xh>{quotefail} {
486 yyless(1);
487 BEGIN(INITIAL);
488 yylval->str = litbufdup(yyscanner);
489 return XCONST;
490 }
491 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
492
493 {xnstart} {
494 /* National character.
495 * We will pass this along as a normal character string,
496 * but preceded with an internally-generated "NCHAR".
497 */
498 const ScanKeyword *keyword;
499
500 SET_YYLLOC();
501 yyless(1); /* eat only 'n' this time */
502
503 keyword = ScanKeywordLookup("nchar",
504 yyextra->keywords,
505 yyextra->num_keywords);
506 if (keyword != NULL)
507 {
508 yylval->keyword = keyword->name;
509 return keyword->value;
510 }
511 else
512 {
513 /* If NCHAR isn't a keyword, just return "n" */
514 yylval->str = pstrdup("n");
515 return IDENT;
516 }
517 }
518
519 {xqstart} {
520 yyextra->warn_on_first_escape = true;
521 yyextra->saw_non_ascii = false;
522 SET_YYLLOC();
523 if (yyextra->standard_conforming_strings)
524 BEGIN(xq);
525 else
526 BEGIN(xe);
527 startlit();
528 }
529 {xestart} {
530 yyextra->warn_on_first_escape = false;
531 yyextra->saw_non_ascii = false;
532 SET_YYLLOC();
533 BEGIN(xe);
534 startlit();
535 }
536 {xusstart} {
537 SET_YYLLOC();
538 if (!yyextra->standard_conforming_strings)
539 ereport(ERROR,
540 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
541 errmsg("unsafe use of string constant with Unicode escapes"),
542 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
543 lexer_errposition()));
544 BEGIN(xus);
545 startlit();
546 }
547 <xq,xe>{quotestop} |
548 <xq,xe>{quotefail} {
549 yyless(1);
550 BEGIN(INITIAL);
551 /*
552 * check that the data remains valid if it might have been
553 * made invalid by unescaping any chars.
554 */
555 if (yyextra->saw_non_ascii)
556 pg_verifymbstr(yyextra->literalbuf,
557 yyextra->literallen,
558 false);
559 yylval->str = litbufdup(yyscanner);
560 return SCONST;
561 }
562 <xus>{quotestop} |
563 <xus>{quotefail} {
564 /* throw back all but the quote */
565 yyless(1);
566 /* xusend state looks for possible UESCAPE */
567 BEGIN(xusend);
568 }
569 <xusend>{whitespace} {
570 /* stay in xusend state over whitespace */
571 }
572 <xusend><<EOF>> |
573 <xusend>{other} |
574 <xusend>{xustop1} {
575 /* no UESCAPE after the quote, throw back everything */
576 yyless(0);
577 BEGIN(INITIAL);
578 yylval->str = litbuf_udeescape('\\', yyscanner);
579 return SCONST;
580 }
581 <xusend>{xustop2} {
582 /* found UESCAPE after the end quote */
583 BEGIN(INITIAL);
584 if (!check_uescapechar(yytext[yyleng - 2]))
585 {
586 SET_YYLLOC();
587 ADVANCE_YYLLOC(yyleng - 2);
588 yyerror("invalid Unicode escape character");
589 }
590 yylval->str = litbuf_udeescape(yytext[yyleng - 2],
591 yyscanner);
592 return SCONST;
593 }
594 <xq,xe,xus>{xqdouble} {
595 addlitchar('\'', yyscanner);
596 }
597 <xq,xus>{xqinside} {
598 addlit(yytext, yyleng, yyscanner);
599 }
600 <xe>{xeinside} {
601 addlit(yytext, yyleng, yyscanner);
602 }
603 <xe>{xeunicode} {
604 pg_wchar c = strtoul(yytext + 2, NULL, 16);
605
606 check_escape_warning(yyscanner);
607
608 if (is_utf16_surrogate_first(c))
609 {
610 yyextra->utf16_first_part = c;
611 BEGIN(xeu);
612 }
613 else if (is_utf16_surrogate_second(c))
614 yyerror("invalid Unicode surrogate pair");
615 else
616 addunicode(c, yyscanner);
617 }
618 <xeu>{xeunicode} {
619 pg_wchar c = strtoul(yytext + 2, NULL, 16);
620
621 if (!is_utf16_surrogate_second(c))
622 yyerror("invalid Unicode surrogate pair");
623
624 c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
625
626 addunicode(c, yyscanner);
627
628 BEGIN(xe);
629 }
630 <xeu>. { yyerror("invalid Unicode surrogate pair"); }
631 <xeu>\n { yyerror("invalid Unicode surrogate pair"); }
632 <xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
633 <xe,xeu>{xeunicodefail} {
634 ereport(ERROR,
635 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
636 errmsg("invalid Unicode escape"),
637 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
638 lexer_errposition()));
639 }
640 <xe>{xeescape} {
641 if (yytext[1] == '\'')
642 {
643 if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
644 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
645 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
646 ereport(ERROR,
647 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
648 errmsg("unsafe use of \\' in a string literal"),
649 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
650 lexer_errposition()));
651 }
652 check_string_escape_warning(yytext[1], yyscanner);
653 addlitchar(unescape_single_char(yytext[1], yyscanner),
654 yyscanner);
655 }
656 <xe>{xeoctesc} {
657 unsigned char c = strtoul(yytext + 1, NULL, 8);
658
659 check_escape_warning(yyscanner);
660 addlitchar(c, yyscanner);
661 if (c == '\0' || IS_HIGHBIT_SET(c))
662 yyextra->saw_non_ascii = true;
663 }
664 <xe>{xehexesc} {
665 unsigned char c = strtoul(yytext + 2, NULL, 16);
666
667 check_escape_warning(yyscanner);
668 addlitchar(c, yyscanner);
669 if (c == '\0' || IS_HIGHBIT_SET(c))
670 yyextra->saw_non_ascii = true;
671 }
672 <xq,xe,xus>{quotecontinue} {
673 /* ignore */
674 }
675 <xe>. {
676 /* This is only needed for \ just before EOF */
677 addlitchar(yytext[0], yyscanner);
678 }
679 <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
680
681 {dolqdelim} {
682 SET_YYLLOC();
683 yyextra->dolqstart = pstrdup(yytext);
684 BEGIN(xdolq);
685 startlit();
686 }
687 {dolqfailed} {
688 SET_YYLLOC();
689 /* throw back all but the initial "$" */
690 yyless(1);
691 /* and treat it as {other} */
692 return yytext[0];
693 }
694 <xdolq>{dolqdelim} {
695 if (strcmp(yytext, yyextra->dolqstart) == 0)
696 {
697 pfree(yyextra->dolqstart);
698 yyextra->dolqstart = NULL;
699 BEGIN(INITIAL);
700 yylval->str = litbufdup(yyscanner);
701 return SCONST;
702 }
703 else
704 {
705 /*
706 * When we fail to match $...$ to dolqstart, transfer
707 * the $... part to the output, but put back the final
708 * $ for rescanning. Consider $delim$...$junk$delim$
709 */
710 addlit(yytext, yyleng - 1, yyscanner);
711 yyless(yyleng - 1);
712 }
713 }
714 <xdolq>{dolqinside} {
715 addlit(yytext, yyleng, yyscanner);
716 }
717 <xdolq>{dolqfailed} {
718 addlit(yytext, yyleng, yyscanner);
719 }
720 <xdolq>. {
721 /* This is only needed for $ inside the quoted text */
722 addlitchar(yytext[0], yyscanner);
723 }
724 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
725
726 {xdstart} {
727 SET_YYLLOC();
728 BEGIN(xd);
729 startlit();
730 }
731 {xuistart} {
732 SET_YYLLOC();
733 BEGIN(xui);
734 startlit();
735 }
736 <xd>{xdstop} {
737 char *ident;
738
739 BEGIN(INITIAL);
740 if (yyextra->literallen == 0)
741 yyerror("zero-length delimited identifier");
742 ident = litbufdup(yyscanner);
743 if (yyextra->literallen >= NAMEDATALEN)
744 truncate_identifier(ident, yyextra->literallen, true);
745 yylval->str = ident;
746 return IDENT;
747 }
748 <xui>{dquote} {
749 yyless(1);
750 /* xuiend state looks for possible UESCAPE */
751 BEGIN(xuiend);
752 }
753 <xuiend>{whitespace} {
754 /* stay in xuiend state over whitespace */
755 }
756 <xuiend><<EOF>> |
757 <xuiend>{other} |
758 <xuiend>{xustop1} {
759 /* no UESCAPE after the quote, throw back everything */
760 char *ident;
761 int identlen;
762
763 yyless(0);
764
765 BEGIN(INITIAL);
766 if (yyextra->literallen == 0)
767 yyerror("zero-length delimited identifier");
768 ident = litbuf_udeescape('\\', yyscanner);
769 identlen = strlen(ident);
770 if (identlen >= NAMEDATALEN)
771 truncate_identifier(ident, identlen, true);
772 yylval->str = ident;
773 return IDENT;
774 }
775 <xuiend>{xustop2} {
776 /* found UESCAPE after the end quote */
777 char *ident;
778 int identlen;
779
780 BEGIN(INITIAL);
781 if (yyextra->literallen == 0)
782 yyerror("zero-length delimited identifier");
783 if (!check_uescapechar(yytext[yyleng - 2]))
784 {
785 SET_YYLLOC();
786 ADVANCE_YYLLOC(yyleng - 2);
787 yyerror("invalid Unicode escape character");
788 }
789 ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
790 identlen = strlen(ident);
791 if (identlen >= NAMEDATALEN)
792 truncate_identifier(ident, identlen, true);
793 yylval->str = ident;
794 return IDENT;
795 }
796 <xd,xui>{xddouble} {
797 addlitchar('"', yyscanner);
798 }
799 <xd,xui>{xdinside} {
800 addlit(yytext, yyleng, yyscanner);
801 }
802 <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
803
804 {xufailed} {
805 char *ident;
806
807 SET_YYLLOC();
808 /* throw back all but the initial u/U */
809 yyless(1);
810 /* and treat it as {identifier} */
811 ident = downcase_truncate_identifier(yytext, yyleng, true);
812 yylval->str = ident;
813 return IDENT;
814 }
815
816 {typecast} {
817 SET_YYLLOC();
818 return TYPECAST;
819 }
820
821 {dot_dot} {
822 SET_YYLLOC();
823 return DOT_DOT;
824 }
825
826 {colon_equals} {
827 SET_YYLLOC();
828 return COLON_EQUALS;
829 }
830
831 {equals_greater} {
832 SET_YYLLOC();
833 return EQUALS_GREATER;
834 }
835
836 {less_equals} {
837 SET_YYLLOC();
838 return LESS_EQUALS;
839 }
840
841 {greater_equals} {
842 SET_YYLLOC();
843 return GREATER_EQUALS;
844 }
845
846 {less_greater} {
847 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
848 SET_YYLLOC();
849 return NOT_EQUALS;
850 }
851
852 {not_equals} {
853 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
854 SET_YYLLOC();
855 return NOT_EQUALS;
856 }
857
858 {self} {
859 SET_YYLLOC();
860 return yytext[0];
861 }
862
863 {operator} {
864 /*
865 * Check for embedded slash-star or dash-dash; those
866 * are comment starts, so operator must stop there.
867 * Note that slash-star or dash-dash at the first
868 * character will match a prior rule, not this one.
869 */
870 int nchars = yyleng;
871 char *slashstar = strstr(yytext, "/*");
872 char *dashdash = strstr(yytext, "--");
873
874 if (slashstar && dashdash)
875 {
876 /* if both appear, take the first one */
877 if (slashstar > dashdash)
878 slashstar = dashdash;
879 }
880 else if (!slashstar)
881 slashstar = dashdash;
882 if (slashstar)
883 nchars = slashstar - yytext;
884
885 /*
886 * For SQL compatibility, '+' and '-' cannot be the
887 * last char of a multi-char operator unless the operator
888 * contains chars that are not in SQL operators.
889 * The idea is to lex '=-' as two operators, but not
890 * to forbid operator names like '?-' that could not be
891 * sequences of SQL operators.
892 */
893 if (nchars > 1 &&
894 (yytext[nchars - 1] == '+' ||
895 yytext[nchars - 1] == '-'))
896 {
897 int ic;
898
899 for (ic = nchars - 2; ic >= 0; ic--)
900 {
901 char c = yytext[ic];
902 if (c == '~' || c == '!' || c == '@' ||
903 c == '#' || c == '^' || c == '&' ||
904 c == '|' || c == '`' || c == '?' ||
905 c == '%')
906 break;
907 }
908 if (ic < 0)
909 {
910 /*
911 * didn't find a qualifying character, so remove
912 * all trailing [+-]
913 */
914 do {
915 nchars--;
916 } while (nchars > 1 &&
917 (yytext[nchars - 1] == '+' ||
918 yytext[nchars - 1] == '-'));
919 }
920 }
921
922 SET_YYLLOC();
923
924 if (nchars < yyleng)
925 {
926 /* Strip the unwanted chars from the token */
927 yyless(nchars);
928 /*
929 * If what we have left is only one char, and it's
930 * one of the characters matching "self", then
931 * return it as a character token the same way
932 * that the "self" rule would have.
933 */
934 if (nchars == 1 &&
935 strchr(",()[].;:+-*/%^<>=", yytext[0]))
936 return yytext[0];
937 /*
938 * Likewise, if what we have left is two chars, and
939 * those match the tokens ">=", "<=", "=>", "<>" or
940 * "!=", then we must return the appropriate token
941 * rather than the generic Op.
942 */
943 if (nchars == 2)
944 {
945 if (yytext[0] == '=' && yytext[1] == '>')
946 return EQUALS_GREATER;
947 if (yytext[0] == '>' && yytext[1] == '=')
948 return GREATER_EQUALS;
949 if (yytext[0] == '<' && yytext[1] == '=')
950 return LESS_EQUALS;
951 if (yytext[0] == '<' && yytext[1] == '>')
952 return NOT_EQUALS;
953 if (yytext[0] == '!' && yytext[1] == '=')
954 return NOT_EQUALS;
955 }
956 }
957
958 /*
959 * Complain if operator is too long. Unlike the case
960 * for identifiers, we make this an error not a notice-
961 * and-truncate, because the odds are we are looking at
962 * a syntactic mistake anyway.
963 */
964 if (nchars >= NAMEDATALEN)
965 yyerror("operator too long");
966
967 yylval->str = pstrdup(yytext);
968 return Op;
969 }
970
{param}			{
					/*
					 * Positional parameter, "$" followed by digits.
					 *
					 * Convert the digits with strtol() rather than atol():
					 * atol() has no way to report overflow, so an oversized
					 * number would silently turn into some other parameter
					 * number.  Reject anything that overflows long or that
					 * does not survive narrowing to int.
					 */
					long		paramno;

					SET_YYLLOC();
					errno = 0;
					paramno = strtol(yytext + 1, NULL, 10);
					if (errno == ERANGE ||
						paramno != (long) ((int) paramno))
						yyerror("parameter number too large");
					yylval->ival = (int) paramno;
					return PARAM;
				}
976
977 {integer} {
978 SET_YYLLOC();
979 return process_integer_literal(yytext, yylval);
980 }
981 {decimal} {
982 SET_YYLLOC();
983 yylval->str = pstrdup(yytext);
984 return FCONST;
985 }
986 {decimalfail} {
987 /* throw back the .., and treat as integer */
988 yyless(yyleng - 2);
989 SET_YYLLOC();
990 return process_integer_literal(yytext, yylval);
991 }
992 {real} {
993 SET_YYLLOC();
994 yylval->str = pstrdup(yytext);
995 return FCONST;
996 }
997 {realfail1} {
998 /*
999 * throw back the [Ee], and treat as {decimal}. Note
1000 * that it is possible the input is actually {integer},
1001 * but since this case will almost certainly lead to a
1002 * syntax error anyway, we don't bother to distinguish.
1003 */
1004 yyless(yyleng - 1);
1005 SET_YYLLOC();
1006 yylval->str = pstrdup(yytext);
1007 return FCONST;
1008 }
1009 {realfail2} {
1010 /* throw back the [Ee][+-], and proceed as above */
1011 yyless(yyleng - 2);
1012 SET_YYLLOC();
1013 yylval->str = pstrdup(yytext);
1014 return FCONST;
1015 }
1016
1017
{identifier}	{
					const ScanKeyword *kw;

					SET_YYLLOC();

					/* Look the text up in the scanner's keyword table. */
					kw = ScanKeywordLookup(yytext,
										   yyextra->keywords,
										   yyextra->num_keywords);
					if (kw == NULL)
					{
						/*
						 * Not a keyword: hand back an ordinary identifier,
						 * folded to lower case and truncated if necessary.
						 */
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* Keyword: return its name and its own token code. */
					yylval->keyword = kw->name;
					return kw->value;
				}
1042
1043 {other} {
1044 SET_YYLLOC();
1045 return yytext[0];
1046 }
1047
1048 <<EOF>> {
1049 SET_YYLLOC();
1050 yyterminate();
1051 }
1052
1053 %%
1054
1055 /*
1056 * Arrange access to yyextra for subroutines of the main yylex() function.
1057 * We expect each subroutine to have a yyscanner parameter. Rather than
1058 * use the yyget_xxx functions, which might or might not get inlined by the
1059 * compiler, we cheat just a bit and cast yyscanner to the right type.
1060 */
1061 #undef yyextra
1062 #define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r)
1063
1064 /* Likewise for a couple of other things we need. */
1065 #undef yylloc
1066 #define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r)
1067 #undef yyleng
1068 #define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r)
1069
1070
1071 /*
1072 * scanner_errposition
1073 * Report a lexer or grammar error cursor position, if possible.
1074 *
1075 * This is expected to be used within an ereport() call. The return value
1076 * is a dummy (always 0, in fact).
1077 *
1078 * Note that this can only be used for messages emitted during raw parsing
1079 * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1080 * to still be available.
1081 */
1082 int
1083 scanner_errposition(int location, core_yyscan_t yyscanner)
1084 {
1085 int pos;
1086
1087 if (location < 0)
1088 return 0; /* no-op if location is unknown */
1089
1090 /* Convert byte offset to character number */
1091 pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1092 /* And pass it to the ereport mechanism */
1093 return errposition(pos);
1094 }
1095
1096 /*
1097 * scanner_yyerror
1098 * Report a lexer or grammar error.
1099 *
1100 * The message's cursor position is whatever YYLLOC was last set to,
1101 * ie, the start of the current token if called within yylex(), or the
1102 * most recently lexed token if called from the grammar.
1103 * This is OK for syntax error messages from the Bison parser, because Bison
1104 * parsers report error as soon as the first unparsable token is reached.
1105 * Beware of using yyerror for other purposes, as the cursor position might
1106 * be misleading!
1107 */
1108 void
1109 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1110 {
1111 const char *loc = yyextra->scanbuf + *yylloc;
1112
1113 if (*loc == YY_END_OF_BUFFER_CHAR)
1114 {
1115 ereport(ERROR,
1116 (errcode(ERRCODE_SYNTAX_ERROR),
1117 /* translator: %s is typically the translation of "syntax error" */
1118 errmsg("%s at end of input", _(message)),
1119 lexer_errposition()));
1120 }
1121 else
1122 {
1123 ereport(ERROR,
1124 (errcode(ERRCODE_SYNTAX_ERROR),
1125 /* translator: first %s is typically the translation of "syntax error" */
1126 errmsg("%s at or near \"%s\"", _(message), loc),
1127 lexer_errposition()));
1128 }
1129 }
1130
1131
1132 /*
1133 * Called before any actual parsing is done
1134 */
1135 core_yyscan_t
1136 scanner_init(const char *str,
1137 core_yy_extra_type *yyext,
1138 const ScanKeyword *keywords,
1139 int num_keywords)
1140 {
1141 Size slen = strlen(str);
1142 yyscan_t scanner;
1143
1144 if (yylex_init(&scanner) != 0)
1145 elog(ERROR, "yylex_init() failed: %m");
1146
1147 core_yyset_extra(yyext, scanner);
1148
1149 yyext->keywords = keywords;
1150 yyext->num_keywords = num_keywords;
1151
1152 yyext->backslash_quote = backslash_quote;
1153 yyext->escape_string_warning = escape_string_warning;
1154 yyext->standard_conforming_strings = standard_conforming_strings;
1155
1156 /*
1157 * Make a scan buffer with special termination needed by flex.
1158 */
1159 yyext->scanbuf = (char *) palloc(slen + 2);
1160 yyext->scanbuflen = slen;
1161 memcpy(yyext->scanbuf, str, slen);
1162 yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1163 yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1164
1165 /* initialize literal buffer to a reasonable but expansible size */
1166 yyext->literalalloc = 1024;
1167 yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1168 yyext->literallen = 0;
1169
1170 return scanner;
1171 }
1172
1173
1174 /*
1175 * Called after parsing is done to clean up after scanner_init()
1176 */
1177 void
1178 scanner_finish(core_yyscan_t yyscanner)
1179 {
1180 /*
1181 * We don't bother to call yylex_destroy(), because all it would do is
1182 * pfree a small amount of control storage. It's cheaper to leak the
1183 * storage until the parsing context is destroyed. The amount of space
1184 * involved is usually negligible compared to the output parse tree
1185 * anyway.
1186 *
1187 * We do bother to pfree the scanbuf and literal buffer, but only if they
1188 * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
1189 */
1190 if (yyextra->scanbuflen >= 8192)
1191 pfree(yyextra->scanbuf);
1192 if (yyextra->literalalloc >= 8192)
1193 pfree(yyextra->literalbuf);
1194 }
1195
1196
1197 static void
1198 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1199 {
1200 /* enlarge buffer if needed */
1201 if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1202 {
1203 do
1204 {
1205 yyextra->literalalloc *= 2;
1206 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1207 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1208 yyextra->literalalloc);
1209 }
1210 /* append new data */
1211 memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1212 yyextra->literallen += yleng;
1213 }
1214
1215
1216 static void
1217 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1218 {
1219 /* enlarge buffer if needed */
1220 if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1221 {
1222 yyextra->literalalloc *= 2;
1223 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1224 yyextra->literalalloc);
1225 }
1226 /* append new data */
1227 yyextra->literalbuf[yyextra->literallen] = ychar;
1228 yyextra->literallen += 1;
1229 }
1230
1231
1232 /*
1233 * Create a palloc'd copy of literalbuf, adding a trailing null.
1234 */
1235 static char *
1236 litbufdup(core_yyscan_t yyscanner)
1237 {
1238 int llen = yyextra->literallen;
1239 char *new;
1240
1241 new = palloc(llen + 1);
1242 memcpy(new, yyextra->literalbuf, llen);
1243 new[llen] = '\0';
1244 return new;
1245 }
1246
1247 static int
1248 process_integer_literal(const char *token, YYSTYPE *lval)
1249 {
1250 long val;
1251 char *endptr;
1252
1253 errno = 0;
1254 val = strtol(token, &endptr, 10);
1255 if (*endptr != '\0' || errno == ERANGE
1256 #ifdef HAVE_LONG_INT_64
1257 /* if long > 32 bits, check for overflow of int4 */
1258 || val != (long) ((int32) val)
1259 #endif
1260 )
1261 {
1262 /* integer too large, treat it as a float */
1263 lval->str = pstrdup(token);
1264 return FCONST;
1265 }
1266 lval->ival = val;
1267 return ICONST;
1268 }
1269
1270 static unsigned int
1271 hexval(unsigned char c)
1272 {
1273 if (c >= '0' && c <= '9')
1274 return c - '0';
1275 if (c >= 'a' && c <= 'f')
1276 return c - 'a' + 0xA;
1277 if (c >= 'A' && c <= 'F')
1278 return c - 'A' + 0xA;
1279 elog(ERROR, "invalid hexadecimal digit");
1280 return 0; /* not reached */
1281 }
1282
1283 static void
1284 check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1285 {
1286 if (GetDatabaseEncoding() == PG_UTF8)
1287 return;
1288
1289 if (c > 0x7F)
1290 {
1291 ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */
1292 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1293 }
1294 }
1295
1296 static bool
1297 is_utf16_surrogate_first(pg_wchar c)
1298 {
1299 return (c >= 0xD800 && c <= 0xDBFF);
1300 }
1301
1302 static bool
1303 is_utf16_surrogate_second(pg_wchar c)
1304 {
1305 return (c >= 0xDC00 && c <= 0xDFFF);
1306 }
1307
1308 static pg_wchar
1309 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1310 {
1311 return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1312 }
1313
1314 static void
1315 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1316 {
1317 char buf[8];
1318
1319 if (c == 0 || c > 0x10FFFF)
1320 yyerror("invalid Unicode escape value");
1321 if (c > 0x7F)
1322 {
1323 if (GetDatabaseEncoding() != PG_UTF8)
1324 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1325 yyextra->saw_non_ascii = true;
1326 }
1327 unicode_to_utf8(c, (unsigned char *) buf);
1328 addlit(buf, pg_mblen(buf), yyscanner);
1329 }
1330
/*
 * Is 'escape' acceptable as a Unicode escape character (UESCAPE syntax)?
 *
 * Hexadecimal digits, '+', both quote characters, and whitespace (as
 * judged by scanner_isspace) are refused; anything else is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	bool		refused;

	refused = isxdigit(escape) ||
		escape == '+' ||
		escape == '\'' ||
		escape == '"' ||
		scanner_isspace(escape);

	return !refused;
}
1346
1347 /* like litbufdup, but handle unicode escapes */
1348 static char *
1349 litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1350 {
1351 char *new;
1352 char *litbuf,
1353 *in,
1354 *out;
1355 pg_wchar pair_first = 0;
1356
1357 /* Make literalbuf null-terminated to simplify the scanning loop */
1358 litbuf = yyextra->literalbuf;
1359 litbuf[yyextra->literallen] = '\0';
1360
1361 /*
1362 * This relies on the subtle assumption that a UTF-8 expansion cannot be
1363 * longer than its escaped representation.
1364 */
1365 new = palloc(yyextra->literallen + 1);
1366
1367 in = litbuf;
1368 out = new;
1369 while (*in)
1370 {
1371 if (in[0] == escape)
1372 {
1373 if (in[1] == escape)
1374 {
1375 if (pair_first)
1376 {
1377 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1378 yyerror("invalid Unicode surrogate pair");
1379 }
1380 *out++ = escape;
1381 in += 2;
1382 }
1383 else if (isxdigit((unsigned char) in[1]) &&
1384 isxdigit((unsigned char) in[2]) &&
1385 isxdigit((unsigned char) in[3]) &&
1386 isxdigit((unsigned char) in[4]))
1387 {
1388 pg_wchar unicode;
1389
1390 unicode = (hexval(in[1]) << 12) +
1391 (hexval(in[2]) << 8) +
1392 (hexval(in[3]) << 4) +
1393 hexval(in[4]);
1394 check_unicode_value(unicode, in, yyscanner);
1395 if (pair_first)
1396 {
1397 if (is_utf16_surrogate_second(unicode))
1398 {
1399 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1400 pair_first = 0;
1401 }
1402 else
1403 {
1404 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1405 yyerror("invalid Unicode surrogate pair");
1406 }
1407 }
1408 else if (is_utf16_surrogate_second(unicode))
1409 yyerror("invalid Unicode surrogate pair");
1410
1411 if (is_utf16_surrogate_first(unicode))
1412 pair_first = unicode;
1413 else
1414 {
1415 unicode_to_utf8(unicode, (unsigned char *) out);
1416 out += pg_mblen(out);
1417 }
1418 in += 5;
1419 }
1420 else if (in[1] == '+' &&
1421 isxdigit((unsigned char) in[2]) &&
1422 isxdigit((unsigned char) in[3]) &&
1423 isxdigit((unsigned char) in[4]) &&
1424 isxdigit((unsigned char) in[5]) &&
1425 isxdigit((unsigned char) in[6]) &&
1426 isxdigit((unsigned char) in[7]))
1427 {
1428 pg_wchar unicode;
1429
1430 unicode = (hexval(in[2]) << 20) +
1431 (hexval(in[3]) << 16) +
1432 (hexval(in[4]) << 12) +
1433 (hexval(in[5]) << 8) +
1434 (hexval(in[6]) << 4) +
1435 hexval(in[7]);
1436 check_unicode_value(unicode, in, yyscanner);
1437 if (pair_first)
1438 {
1439 if (is_utf16_surrogate_second(unicode))
1440 {
1441 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1442 pair_first = 0;
1443 }
1444 else
1445 {
1446 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1447 yyerror("invalid Unicode surrogate pair");
1448 }
1449 }
1450 else if (is_utf16_surrogate_second(unicode))
1451 yyerror("invalid Unicode surrogate pair");
1452
1453 if (is_utf16_surrogate_first(unicode))
1454 pair_first = unicode;
1455 else
1456 {
1457 unicode_to_utf8(unicode, (unsigned char *) out);
1458 out += pg_mblen(out);
1459 }
1460 in += 8;
1461 }
1462 else
1463 {
1464 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1465 yyerror("invalid Unicode escape value");
1466 }
1467 }
1468 else
1469 {
1470 if (pair_first)
1471 {
1472 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1473 yyerror("invalid Unicode surrogate pair");
1474 }
1475 *out++ = *in++;
1476 }
1477 }
1478
1479 /* unfinished surrogate pair? */
1480 if (pair_first)
1481 {
1482 ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1483 yyerror("invalid Unicode surrogate pair");
1484 }
1485
1486 *out = '\0';
1487
1488 /*
1489 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1490 * codes; but it's probably not worth the trouble, since this isn't likely
1491 * to be a performance-critical path.
1492 */
1493 pg_verifymbstr(new, out - new, false);
1494 return new;
1495 }
1496
1497 static unsigned char
1498 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1499 {
1500 switch (c)
1501 {
1502 case 'b':
1503 return '\b';
1504 case 'f':
1505 return '\f';
1506 case 'n':
1507 return '\n';
1508 case 'r':
1509 return '\r';
1510 case 't':
1511 return '\t';
1512 default:
1513 /* check for backslash followed by non-7-bit-ASCII */
1514 if (c == '\0' || IS_HIGHBIT_SET(c))
1515 yyextra->saw_non_ascii = true;
1516
1517 return c;
1518 }
1519 }
1520
1521 static void
1522 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1523 {
1524 if (ychar == '\'')
1525 {
1526 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1527 ereport(WARNING,
1528 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1529 errmsg("nonstandard use of \\' in a string literal"),
1530 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1531 lexer_errposition()));
1532 yyextra->warn_on_first_escape = false; /* warn only once per string */
1533 }
1534 else if (ychar == '\\')
1535 {
1536 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1537 ereport(WARNING,
1538 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1539 errmsg("nonstandard use of \\\\ in a string literal"),
1540 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1541 lexer_errposition()));
1542 yyextra->warn_on_first_escape = false; /* warn only once per string */
1543 }
1544 else
1545 check_escape_warning(yyscanner);
1546 }
1547
1548 static void
1549 check_escape_warning(core_yyscan_t yyscanner)
1550 {
1551 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1552 ereport(WARNING,
1553 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1554 errmsg("nonstandard use of escape in a string literal"),
1555 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1556 lexer_errposition()));
1557 yyextra->warn_on_first_escape = false; /* warn only once per string */
1558 }
1559
1560 /*
1561 * Interface functions to make flex use palloc() instead of malloc().
1562 * It'd be better to make these static, but flex insists otherwise.
1563 */
1564
1565 void *
1566 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1567 {
1568 return palloc(bytes);
1569 }
1570
1571 void *
1572 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1573 {
1574 if (ptr)
1575 return repalloc(ptr, bytes);
1576 else
1577 return palloc(bytes);
1578 }
1579
1580 void
1581 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1582 {
1583 if (ptr)
1584 pfree(ptr);
1585 }
1586