1 %top{
2 /*-------------------------------------------------------------------------
3 *
4 * scan.l
5 * lexical scanner for PostgreSQL
6 *
7 * NOTE NOTE NOTE:
8 *
9 * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l
10 * and src/interfaces/ecpg/preproc/pgc.l!
11 *
12 * The rules are designed so that the scanner never has to backtrack,
13 * in the sense that there is always a rule that can match the input
14 * consumed so far (the rule action may internally throw back some input
15 * with yyless(), however). As explained in the flex manual, this makes
16 * for a useful speed increase --- several percent faster when measuring
17 * raw parsing (Flex + Bison). The extra complexity is mostly in the rules
18 * for handling float numbers and continued string literals. If you change
19 * the lexical rules, verify that you haven't broken the no-backtrack
20 * property by running flex with the "-b" option and checking that the
21 * resulting "lex.backup" file says that no backing up is needed. (As of
22 * Postgres 9.2, this check is made automatically by the Makefile.)
23 *
24 *
25 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
26 * Portions Copyright (c) 1994, Regents of the University of California
27 *
28 * IDENTIFICATION
29 * src/backend/parser/scan.l
30 *
31 *-------------------------------------------------------------------------
32 */
33 #include "postgres.h"
34
35 #include <ctype.h>
36 #include <unistd.h>
37
38 #include "common/string.h"
39 #include "parser/gramparse.h"
40 #include "parser/parser.h" /* only needed for GUC variables */
41 #include "parser/scansup.h"
42 #include "mb/pg_wchar.h"
43 }
44
45 %{
46
47 /* LCOV_EXCL_START */
48
49 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
50 #undef fprintf
51 #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
52
53 static void
fprintf_to_ereport(const char * fmt,const char * msg)54 fprintf_to_ereport(const char *fmt, const char *msg)
55 {
56 ereport(ERROR, (errmsg_internal("%s", msg)));
57 }
58
59 /*
60 * GUC variables. This is a DIRECT violation of the warning given at the
61 * head of gram.y, ie flex/bison code must not depend on any GUC variables;
62 * as such, changing their values can induce very unintuitive behavior.
63 * But we shall have to live with it until we can remove these variables.
64 */
65 int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
66 bool escape_string_warning = true;
67 bool standard_conforming_strings = true;
68
69 /*
70 * Constant data exported from this file. This array maps from the
71 * zero-based keyword numbers returned by ScanKeywordLookup to the
72 * Bison token numbers needed by gram.y. This is exported because
73 * callers need to pass it to scanner_init, if they are using the
74 * standard keyword list ScanKeywords.
75 */
76 #define PG_KEYWORD(kwname, value, category, collabel) value,
77
78 const uint16 ScanKeywordTokens[] = {
79 #include "parser/kwlist.h"
80 };
81
82 #undef PG_KEYWORD
83
84 /*
85 * Set the type of YYSTYPE.
86 */
87 #define YYSTYPE core_YYSTYPE
88
89 /*
90 * Set the type of yyextra. All state variables used by the scanner should
91 * be in yyextra, *not* statically allocated.
92 */
93 #define YY_EXTRA_TYPE core_yy_extra_type *
94
95 /*
96 * Each call to yylex must set yylloc to the location of the found token
97 * (expressed as a byte offset from the start of the input text).
98 * When we parse a token that requires multiple lexer rules to process,
99 * this should be done in the first such rule, else yylloc will point
100 * into the middle of the token.
101 */
102 #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
103
104 /*
105 * Advance yylloc by the given number of bytes.
106 */
107 #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
108
109 /*
110 * Sometimes, we do want yylloc to point into the middle of a token; this is
111 * useful for instance to throw an error about an escape sequence within a
112 * string literal. But if we find no error there, we want to revert yylloc
113 * to the token start, so that that's the location reported to the parser.
114 * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
115 * (Currently the implied "stack" is just one location, but someday we might
116 * need to nest these.)
117 */
118 #define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc))
119 #define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc)
120
121 #define startlit() ( yyextra->literallen = 0 )
122 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
123 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
124 static char *litbufdup(core_yyscan_t yyscanner);
125 static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
126 static int process_integer_literal(const char *token, YYSTYPE *lval);
127 static void addunicode(pg_wchar c, yyscan_t yyscanner);
128
129 #define yyerror(msg) scanner_yyerror(msg, yyscanner)
130
131 #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
132
133 static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
134 static void check_escape_warning(core_yyscan_t yyscanner);
135
136 /*
137 * Work around a bug in flex 2.5.35: it emits a couple of functions that
138 * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
139 * this would cause warnings. Providing our own declarations should be
140 * harmless even when the bug gets fixed.
141 */
142 extern int core_yyget_column(yyscan_t yyscanner);
143 extern void core_yyset_column(int column_no, yyscan_t yyscanner);
144
145 %}
146
147 %option reentrant
148 %option bison-bridge
149 %option bison-locations
150 %option 8bit
151 %option never-interactive
152 %option nodefault
153 %option noinput
154 %option nounput
155 %option noyywrap
156 %option noyyalloc
157 %option noyyrealloc
158 %option noyyfree
159 %option warn
160 %option prefix="core_yy"
161
162 /*
163 * OK, here is a short description of lex/flex rules behavior.
164 * The longest pattern which matches an input string is always chosen.
165 * For equal-length patterns, the first occurring in the rules list is chosen.
166 * INITIAL is the starting state, to which all non-conditional rules apply.
167 * Exclusive states change parsing rules while the state is active. When in
168 * an exclusive state, only those rules defined for that state apply.
169 *
170 * We use exclusive states for quoted strings, extended comments,
171 * and to eliminate parsing troubles for numeric strings.
172 * Exclusive states:
173 * <xb> bit string literal
174 * <xc> extended C-style comments
175 * <xd> delimited identifiers (double-quoted identifiers)
176 * <xh> hexadecimal numeric string
177 * <xq> standard quoted strings
178 * <xqs> quote stop (detect continued strings)
179 * <xe> extended quoted strings (support backslash escape sequences)
180 * <xdolq> $foo$ quoted strings
181 * <xui> quoted identifier with Unicode escapes
182 * <xus> quoted string with Unicode escapes
183 * <xeu> Unicode surrogate pair in extended quoted string
184 *
185 * Remember to add an <<EOF>> case whenever you add a new exclusive state!
186 * The default one is probably not the right thing.
187 */
188
189 %x xb
190 %x xc
191 %x xd
192 %x xh
193 %x xq
194 %x xqs
195 %x xe
196 %x xdolq
197 %x xui
198 %x xus
199 %x xeu
200
201 /*
202 * In order to make the world safe for Windows and Mac clients as well as
203 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
204 * sequence will be seen as two successive newlines, but that doesn't cause
205 * any problems. Comments that start with -- and extend to the next
206 * newline are treated as equivalent to a single whitespace character.
207 *
208 * NOTE a fine point: if there is no newline following --, we will absorb
209 * everything to the end of the input as a comment. This is correct. Older
210 * versions of Postgres failed to recognize -- as a comment if the input
211 * did not end with a newline.
212 *
213 * XXX perhaps \f (formfeed) should be treated as a newline as well?
214 *
215 * XXX if you change the set of whitespace characters, fix scanner_isspace()
216 * to agree.
217 */
218
219 space [ \t\n\r\f]
220 horiz_space [ \t\f]
221 newline [\n\r]
222 non_newline [^\n\r]
223
224 comment ("--"{non_newline}*)
225
226 whitespace ({space}+|{comment})
227
228 /*
229 * SQL requires at least one newline in the whitespace separating
230 * string literals that are to be concatenated. Silly, but who are we
231 * to argue? Note that {whitespace_with_newline} should not have * after
232 * it, whereas {whitespace} should generally have a * after it...
233 */
234
235 special_whitespace ({space}+|{comment}{newline})
236 horiz_whitespace ({horiz_space}|{comment})
237 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
238
239 quote '
240 /* If we see {quote} then {quotecontinue}, the quoted string continues */
241 quotecontinue {whitespace_with_newline}{quote}
242
243 /*
244 * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
245 * {quotecontinue}. It might seem that this could just be {whitespace}*,
246 * but if there's a dash after {whitespace_with_newline}, it must be consumed
247 * to see if there's another dash --- which would start a {comment} and thus
248 * allow continuation of the {quotecontinue} token.
249 */
250 quotecontinuefail {whitespace}*"-"?
251
252 /* Bit string
253 * It is tempting to scan the string for only those characters
254 * which are allowed. However, this leads to silently swallowed
255 * characters if illegal characters are included in the string.
256 * For example, if xbinside is [01] then B'ABCD' is interpreted
257 * as a zero-length string, and the ABCD' is lost!
258 * Better to pass the string forward and let the input routines
259 * validate the contents.
260 */
261 xbstart [bB]{quote}
262 xbinside [^']*
263
264 /* Hexadecimal number */
265 xhstart [xX]{quote}
266 xhinside [^']*
267
268 /* National character */
269 xnstart [nN]{quote}
270
271 /* Quoted string that allows backslash escapes */
272 xestart [eE]{quote}
273 xeinside [^\\']+
274 xeescape [\\][^0-7]
275 xeoctesc [\\][0-7]{1,3}
276 xehexesc [\\]x[0-9A-Fa-f]{1,2}
277 xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
278 xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
279
280 /* Extended quote
281 * xqdouble implements embedded quote, ''''
282 */
283 xqstart {quote}
284 xqdouble {quote}{quote}
285 xqinside [^']+
286
287 /* $foo$ style quotes ("dollar quoting")
288 * The quoted string starts with $foo$ where "foo" is an optional string
289 * in the form of an identifier, except that it may not contain "$",
290 * and extends to the first occurrence of an identical string.
291 * There is *no* processing of the quoted text.
292 *
293 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
294 * fails to match its trailing "$".
295 */
296 dolq_start [A-Za-z\200-\377_]
297 dolq_cont [A-Za-z\200-\377_0-9]
298 dolqdelim \$({dolq_start}{dolq_cont}*)?\$
299 dolqfailed \${dolq_start}{dolq_cont}*
300 dolqinside [^$]+
301
302 /* Double quote
303 * Allows embedded spaces and other special characters into identifiers.
304 */
305 dquote \"
306 xdstart {dquote}
307 xdstop {dquote}
308 xddouble {dquote}{dquote}
309 xdinside [^"]+
310
311 /* Quoted identifier with Unicode escapes */
312 xuistart [uU]&{dquote}
313
314 /* Quoted string with Unicode escapes */
315 xusstart [uU]&{quote}
316
317 /* error rule to avoid backup */
318 xufailed [uU]&
319
320
321 /* C-style comments
322 *
323 * The "extended comment" syntax closely resembles allowable operator syntax.
324 * The tricky part here is to get lex to recognize a string starting with
325 * slash-star as a comment, when interpreting it as an operator would produce
326 * a longer match --- remember lex will prefer a longer match! Also, if we
327 * have something like plus-slash-star, lex will think this is a 3-character
328 * operator whereas we want to see it as a + operator and a comment start.
329 * The solution is two-fold:
330 * 1. append {op_chars}* to xcstart so that it matches as much text as
331 * {operator} would. Then the tie-breaker (first matching rule of same
332 * length) ensures xcstart wins. We put back the extra stuff with yyless()
333 * in case it contains a star-slash that should terminate the comment.
334 * 2. In the operator rule, check for slash-star within the operator, and
335 * if found throw it back with yyless(). This handles the plus-slash-star
336 * problem.
337 * Dash-dash comments have similar interactions with the operator rule.
338 */
339 xcstart \/\*{op_chars}*
340 xcstop \*+\/
341 xcinside [^*/]+
342
343 digit [0-9]
344 ident_start [A-Za-z\200-\377_]
345 ident_cont [A-Za-z\200-\377_0-9\$]
346
347 identifier {ident_start}{ident_cont}*
348
349 /* Assorted special-case operators and operator-like tokens */
350 typecast "::"
351 dot_dot \.\.
352 colon_equals ":="
353
354 /*
355 * These operator-like tokens (unlike the above ones) also match the {operator}
356 * rule, which means that they might be overridden by a longer match if they
357 * are followed by a comment start or a + or - character. Accordingly, if you
358 * add to this list, you must also add corresponding code to the {operator}
359 * block to return the correct token in such cases. (This is not needed in
360 * psqlscan.l since the token value is ignored there.)
361 */
362 equals_greater "=>"
363 less_equals "<="
364 greater_equals ">="
365 less_greater "<>"
366 not_equals "!="
367
368 /*
369 * "self" is the set of chars that should be returned as single-character
370 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
371 * which can be one or more characters long (but if a single-char token
372 * appears in the "self" set, it is not to be returned as an Op). Note
373 * that the sets overlap, but each has some chars that are not in the other.
374 *
375 * If you change either set, adjust the character lists appearing in the
376 * rule for "operator"!
377 */
378 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
379 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
380 operator {op_chars}+
381
382 /* we no longer allow unary minus in numbers.
383 * instead we pass it separately to parser. there it gets
384 * coerced via doNegate() -- Leon aug 20 1999
385 *
386 * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
387 *
388 * {realfail1} and {realfail2} are added to prevent the need for scanner
389 * backup when the {real} rule fails to match completely.
390 */
391
392 integer {digit}+
393 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
394 decimalfail {digit}+\.\.
395 real ({integer}|{decimal})[Ee][-+]?{digit}+
396 realfail1 ({integer}|{decimal})[Ee]
397 realfail2 ({integer}|{decimal})[Ee][-+]
398
399 param \${integer}
400
401 other .
402
403 /*
404 * Dollar quoted strings are totally opaque, and no escaping is done on them.
405 * Other quoted strings must allow some special characters such as single-quote
406 * and newline.
407 * Embedded single-quotes are implemented both in the SQL standard
408 * style of two adjacent single quotes "''" and in the Postgres/Java style
409 * of escaped-quote "\'".
410 * Other embedded escaped characters are matched explicitly and the leading
411 * backslash is dropped from the string.
412 * Note that xcstart must appear before operator, as explained above!
413 * Also whitespace (comment) must appear before operator.
414 */
415
416 %%
417
418 {whitespace} {
419 /* ignore */
420 }
421
422 {xcstart} {
423 /* Set location in case of syntax error in comment */
424 SET_YYLLOC();
425 yyextra->xcdepth = 0;
426 BEGIN(xc);
427 /* Put back any characters past slash-star; see above */
428 yyless(2);
429 }
430
431 <xc>{
432 {xcstart} {
433 (yyextra->xcdepth)++;
434 /* Put back any characters past slash-star; see above */
435 yyless(2);
436 }
437
438 {xcstop} {
439 if (yyextra->xcdepth <= 0)
440 BEGIN(INITIAL);
441 else
442 (yyextra->xcdepth)--;
443 }
444
445 {xcinside} {
446 /* ignore */
447 }
448
449 {op_chars} {
450 /* ignore */
451 }
452
453 \*+ {
454 /* ignore */
455 }
456
457 <<EOF>> {
458 yyerror("unterminated /* comment");
459 }
460 } /* <xc> */
461
462 {xbstart} {
463 /* Binary bit type.
464 * At some point we should simply pass the string
465 * forward to the parser and label it there.
466 * In the meantime, place a leading "b" on the string
467 * to mark it for the input routine as a binary string.
468 */
469 SET_YYLLOC();
470 BEGIN(xb);
471 startlit();
472 addlitchar('b', yyscanner);
473 }
474 <xh>{xhinside} |
475 <xb>{xbinside} {
476 addlit(yytext, yyleng, yyscanner);
477 }
478 <xb><<EOF>> { yyerror("unterminated bit string literal"); }
479
480 {xhstart} {
481 /* Hexadecimal bit type.
482 * At some point we should simply pass the string
483 * forward to the parser and label it there.
484 * In the meantime, place a leading "x" on the string
485 * to mark it for the input routine as a hex string.
486 */
487 SET_YYLLOC();
488 BEGIN(xh);
489 startlit();
490 addlitchar('x', yyscanner);
491 }
492 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
493
{xnstart}		{
					/*
					 * National character literal.  Only the leading "n" is
					 * consumed by this rule, and it is reported as though
					 * the keyword NCHAR had been written; the quoted part
					 * stays in the input and is scanned as a normal
					 * character string afterwards.
					 */
					int			nchar_kw;

					SET_YYLLOC();
					yyless(1);		/* keep just the 'n' for this token */

					nchar_kw = ScanKeywordLookup("nchar",
												 yyextra->keywordlist);
					if (nchar_kw < 0)
					{
						/* NCHAR is not a keyword here, so hand back plain "n" */
						yylval->str = pstrdup("n");
						return IDENT;
					}

					yylval->keyword = GetScanKeyword(nchar_kw,
													 yyextra->keywordlist);
					return yyextra->keyword_tokens[nchar_kw];
				}
519
520 {xqstart} {
521 yyextra->warn_on_first_escape = true;
522 yyextra->saw_non_ascii = false;
523 SET_YYLLOC();
524 if (yyextra->standard_conforming_strings)
525 BEGIN(xq);
526 else
527 BEGIN(xe);
528 startlit();
529 }
530 {xestart} {
531 yyextra->warn_on_first_escape = false;
532 yyextra->saw_non_ascii = false;
533 SET_YYLLOC();
534 BEGIN(xe);
535 startlit();
536 }
537 {xusstart} {
538 SET_YYLLOC();
539 if (!yyextra->standard_conforming_strings)
540 ereport(ERROR,
541 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
542 errmsg("unsafe use of string constant with Unicode escapes"),
543 errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
544 lexer_errposition()));
545 BEGIN(xus);
546 startlit();
547 }
548
549 <xb,xh,xq,xe,xus>{quote} {
550 /*
551 * When we are scanning a quoted string and see an end
552 * quote, we must look ahead for a possible continuation.
553 * If we don't see one, we know the end quote was in fact
554 * the end of the string. To reduce the lexer table size,
555 * we use a single "xqs" state to do the lookahead for all
556 * types of strings.
557 */
558 yyextra->state_before_str_stop = YYSTATE;
559 BEGIN(xqs);
560 }
561 <xqs>{quotecontinue} {
562 /*
563 * Found a quote continuation, so return to the in-quote
564 * state and continue scanning the literal. Nothing is
565 * added to the literal's contents.
566 */
567 BEGIN(yyextra->state_before_str_stop);
568 }
<xqs>{quotecontinuefail} |
<xqs>{other} |
<xqs><<EOF>>	{
					/*
					 * No quote continuation follows the end quote, so the
					 * literal is complete.  Push back everything that was
					 * read after the end quote, leave the string states,
					 * and emit the token that corresponds to the kind of
					 * literal we were scanning before the end quote.
					 */
					yyless(0);
					BEGIN(INITIAL);

					if (yyextra->state_before_str_stop == xb)
					{
						yylval->str = litbufdup(yyscanner);
						return BCONST;
					}
					else if (yyextra->state_before_str_stop == xh)
					{
						yylval->str = litbufdup(yyscanner);
						return XCONST;
					}
					else if (yyextra->state_before_str_stop == xq ||
							 yyextra->state_before_str_stop == xe)
					{
						/*
						 * Unescaping may have produced invalid data;
						 * recheck the literal if that is possible.
						 */
						if (yyextra->saw_non_ascii)
							pg_verifymbstr(yyextra->literalbuf,
										   yyextra->literallen,
										   false);
						yylval->str = litbufdup(yyscanner);
						return SCONST;
					}
					else if (yyextra->state_before_str_stop == xus)
					{
						yylval->str = litbufdup(yyscanner);
						return USCONST;
					}
					else
						yyerror("unhandled previous state in xqs");
				}
607
608 <xq,xe,xus>{xqdouble} {
609 addlitchar('\'', yyscanner);
610 }
611 <xq,xus>{xqinside} {
612 addlit(yytext, yyleng, yyscanner);
613 }
614 <xe>{xeinside} {
615 addlit(yytext, yyleng, yyscanner);
616 }
<xe>{xeunicode} {
					/* hex digits start after the backslash and the u/U */
					pg_wchar	codepoint = strtoul(yytext + 2, NULL, 16);

					/*
					 * Any escape warning is issued with the cursor still at
					 * the start of the string, as other productions do.
					 * We might want to change that, someday.
					 */
					check_escape_warning(yyscanner);

					/*
					 * Save the location of the overall string token, then
					 * point the error cursor at this escape sequence.
					 */
					PUSH_YYLLOC();
					SET_YYLLOC();

					if (is_utf16_surrogate_first(codepoint))
					{
						/* first half of a pair: wait for the second half */
						yyextra->utf16_first_part = codepoint;
						BEGIN(xeu);
					}
					else if (!is_utf16_surrogate_second(codepoint))
						addunicode(codepoint, yyscanner);
					else
						yyerror("invalid Unicode surrogate pair");

					/* Put yylloc back at the start of the string token */
					POP_YYLLOC();
				}
<xeu>{xeunicode} {
					/* second escape of a pair whose first half is saved */
					pg_wchar	second_half = strtoul(yytext + 2, NULL, 16);

					/*
					 * Save the location of the overall string token, then
					 * point the error cursor at this escape sequence.
					 */
					PUSH_YYLLOC();
					SET_YYLLOC();

					if (!is_utf16_surrogate_second(second_half))
						yyerror("invalid Unicode surrogate pair");

					/* Combine both halves and append the resulting character */
					addunicode(surrogate_pair_to_codepoint(yyextra->utf16_first_part,
														   second_half),
							   yyscanner);

					/* Put yylloc back at the start of the string token */
					POP_YYLLOC();

					BEGIN(xe);
				}
665 <xeu>. |
666 <xeu>\n |
667 <xeu><<EOF>> {
668 /* Set the error cursor to point at missing esc seq */
669 SET_YYLLOC();
670 yyerror("invalid Unicode surrogate pair");
671 }
672 <xe,xeu>{xeunicodefail} {
673 /* Set the error cursor to point at malformed esc seq */
674 SET_YYLLOC();
675 ereport(ERROR,
676 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
677 errmsg("invalid Unicode escape"),
678 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
679 lexer_errposition()));
680 }
<xe>{xeescape}  {
					/* the character that follows the backslash */
					unsigned char escaped = (unsigned char) yytext[1];

					/*
					 * A backslash-quote is rejected when backslash_quote is
					 * off, or when it is in safe-encoding mode and the
					 * client encoding is a client-only one.
					 */
					if (escaped == '\'' &&
						(yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
						 (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
						  PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding()))))
						ereport(ERROR,
								(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
								 errmsg("unsafe use of \\' in a string literal"),
								 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
								 lexer_errposition()));

					check_string_escape_warning(escaped, yyscanner);
					addlitchar(unescape_single_char(escaped, yyscanner),
							   yyscanner);
				}
697 <xe>{xeoctesc} {
698 unsigned char c = strtoul(yytext + 1, NULL, 8);
699
700 check_escape_warning(yyscanner);
701 addlitchar(c, yyscanner);
702 if (c == '\0' || IS_HIGHBIT_SET(c))
703 yyextra->saw_non_ascii = true;
704 }
705 <xe>{xehexesc} {
706 unsigned char c = strtoul(yytext + 2, NULL, 16);
707
708 check_escape_warning(yyscanner);
709 addlitchar(c, yyscanner);
710 if (c == '\0' || IS_HIGHBIT_SET(c))
711 yyextra->saw_non_ascii = true;
712 }
713 <xe>. {
714 /* This is only needed for \ just before EOF */
715 addlitchar(yytext[0], yyscanner);
716 }
717 <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
718
719 {dolqdelim} {
720 SET_YYLLOC();
721 yyextra->dolqstart = pstrdup(yytext);
722 BEGIN(xdolq);
723 startlit();
724 }
725 {dolqfailed} {
726 SET_YYLLOC();
727 /* throw back all but the initial "$" */
728 yyless(1);
729 /* and treat it as {other} */
730 return yytext[0];
731 }
<xdolq>{dolqdelim} {
					if (strcmp(yytext, yyextra->dolqstart) != 0)
					{
						/*
						 * This $...$ is not the delimiter that opened the
						 * string.  Copy the $... part into the literal but
						 * return the final $ to the input so that it is
						 * rescanned.  Consider $delim$...$junk$delim$
						 */
						addlit(yytext, yyleng - 1, yyscanner);
						yyless(yyleng - 1);
					}
					else
					{
						/* matching delimiter: the string is complete */
						pfree(yyextra->dolqstart);
						yyextra->dolqstart = NULL;
						BEGIN(INITIAL);
						yylval->str = litbufdup(yyscanner);
						return SCONST;
					}
				}
752 <xdolq>{dolqinside} {
753 addlit(yytext, yyleng, yyscanner);
754 }
755 <xdolq>{dolqfailed} {
756 addlit(yytext, yyleng, yyscanner);
757 }
758 <xdolq>. {
759 /* This is only needed for $ inside the quoted text */
760 addlitchar(yytext[0], yyscanner);
761 }
762 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
763
764 {xdstart} {
765 SET_YYLLOC();
766 BEGIN(xd);
767 startlit();
768 }
769 {xuistart} {
770 SET_YYLLOC();
771 BEGIN(xui);
772 startlit();
773 }
<xd>{xdstop}	{
					/* closing double quote of a delimited identifier */
					BEGIN(INITIAL);
					if (yyextra->literallen == 0)
						yyerror("zero-length delimited identifier");

					yylval->str = litbufdup(yyscanner);
					/* names of NAMEDATALEN bytes or more get truncated */
					if (yyextra->literallen >= NAMEDATALEN)
						truncate_identifier(yylval->str,
											yyextra->literallen,
											true);
					return IDENT;
				}
786 <xui>{dquote} {
787 BEGIN(INITIAL);
788 if (yyextra->literallen == 0)
789 yyerror("zero-length delimited identifier");
790 /* can't truncate till after we de-escape the ident */
791 yylval->str = litbufdup(yyscanner);
792 return UIDENT;
793 }
794 <xd,xui>{xddouble} {
795 addlitchar('"', yyscanner);
796 }
797 <xd,xui>{xdinside} {
798 addlit(yytext, yyleng, yyscanner);
799 }
800 <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
801
{xufailed}		{
					/*
					 * u& or U& that is followed by neither kind of quote:
					 * return everything after the first letter to the input
					 * and report that letter the way {identifier} would.
					 */
					SET_YYLLOC();
					yyless(1);
					yylval->str = downcase_truncate_identifier(yytext,
															   yyleng,
															   true);
					return IDENT;
				}
813
814 {typecast} {
815 SET_YYLLOC();
816 return TYPECAST;
817 }
818
819 {dot_dot} {
820 SET_YYLLOC();
821 return DOT_DOT;
822 }
823
824 {colon_equals} {
825 SET_YYLLOC();
826 return COLON_EQUALS;
827 }
828
829 {equals_greater} {
830 SET_YYLLOC();
831 return EQUALS_GREATER;
832 }
833
834 {less_equals} {
835 SET_YYLLOC();
836 return LESS_EQUALS;
837 }
838
839 {greater_equals} {
840 SET_YYLLOC();
841 return GREATER_EQUALS;
842 }
843
844 {less_greater} {
845 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
846 SET_YYLLOC();
847 return NOT_EQUALS;
848 }
849
850 {not_equals} {
851 /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
852 SET_YYLLOC();
853 return NOT_EQUALS;
854 }
855
856 {self} {
857 SET_YYLLOC();
858 return yytext[0];
859 }
860
{operator}		{
					/*
					 * An operator has to end where an embedded comment
					 * begins, so look for slash-star or dash-dash inside
					 * the match.  (A comment start at the very first
					 * character matches an earlier rule, never this one.)
					 */
					int			oplen = yyleng;
					char	   *comment_start = strstr(yytext, "/*");
					char	   *dash_comment = strstr(yytext, "--");

					/* when both kinds occur, the earlier one counts */
					if (dash_comment != NULL &&
						(comment_start == NULL || dash_comment < comment_start))
						comment_start = dash_comment;
					if (comment_start != NULL)
						oplen = comment_start - yytext;

					/*
					 * SQL compatibility: a multi-character operator may not
					 * end in '+' or '-' unless it contains at least one
					 * character that cannot appear in an SQL operator.
					 * That way '=-' is lexed as two operators, while names
					 * such as '?-', which cannot be a sequence of SQL
					 * operators, remain usable.
					 */
					if (oplen > 1 &&
						(yytext[oplen - 1] == '+' ||
						 yytext[oplen - 1] == '-'))
					{
						bool		has_nonsql_char = false;
						int			i;

						/* examine every character before the last one */
						for (i = 0; i < oplen - 1; i++)
						{
							if (strchr("~!@#^&|`?%", yytext[i]) != NULL)
							{
								has_nonsql_char = true;
								break;
							}
						}

						if (!has_nonsql_char)
						{
							/*
							 * no qualifying character was found, so drop
							 * all trailing [+-], keeping at least one char
							 */
							oplen--;
							while (oplen > 1 &&
								   (yytext[oplen - 1] == '+' ||
									yytext[oplen - 1] == '-'))
								oplen--;
						}
					}

					SET_YYLLOC();

					if (oplen < yyleng)
					{
						/* Return the unwanted tail of the match to the input */
						yyless(oplen);

						/*
						 * A single remaining character that belongs to the
						 * "self" set is returned as a character token, just
						 * as the "self" rule would have done.
						 */
						if (oplen == 1 &&
							strchr(",()[].;:+-*/%^<>=", yytext[0]))
							return yytext[0];

						/*
						 * Two remaining characters that spell ">=", "<=",
						 * "=>", "<>" or "!=" must be returned as their own
						 * token rather than as a generic Op.
						 */
						if (oplen == 2)
						{
							switch (yytext[0])
							{
								case '=':
									if (yytext[1] == '>')
										return EQUALS_GREATER;
									break;
								case '>':
									if (yytext[1] == '=')
										return GREATER_EQUALS;
									break;
								case '<':
									if (yytext[1] == '=')
										return LESS_EQUALS;
									if (yytext[1] == '>')
										return NOT_EQUALS;
									break;
								case '!':
									if (yytext[1] == '=')
										return NOT_EQUALS;
									break;
								default:
									break;
							}
						}
					}

					/*
					 * An over-length operator is an error, not a
					 * notice-and-truncate as for identifiers, since it is
					 * most likely a syntactic mistake anyway.
					 */
					if (oplen >= NAMEDATALEN)
						yyerror("operator too long");

					yylval->str = pstrdup(yytext);
					return Op;
				}
968
969 {param} {
970 SET_YYLLOC();
971 yylval->ival = atol(yytext + 1);
972 return PARAM;
973 }
974
975 {integer} {
976 SET_YYLLOC();
977 return process_integer_literal(yytext, yylval);
978 }
979 {decimal} {
980 SET_YYLLOC();
981 yylval->str = pstrdup(yytext);
982 return FCONST;
983 }
984 {decimalfail} {
985 /* throw back the .., and treat as integer */
986 yyless(yyleng - 2);
987 SET_YYLLOC();
988 return process_integer_literal(yytext, yylval);
989 }
990 {real} {
991 SET_YYLLOC();
992 yylval->str = pstrdup(yytext);
993 return FCONST;
994 }
995 {realfail1} {
996 /*
997 * throw back the [Ee], and figure out whether what
998 * remains is an {integer} or {decimal}.
999 */
1000 yyless(yyleng - 1);
1001 SET_YYLLOC();
1002 return process_integer_literal(yytext, yylval);
1003 }
1004 {realfail2} {
1005 /* throw back the [Ee][+-], and proceed as above */
1006 yyless(yyleng - 2);
1007 SET_YYLLOC();
1008 return process_integer_literal(yytext, yylval);
1009 }
1010
1011
{identifier}	{
					int			kwnum;

					SET_YYLLOC();

					/* Look the word up in the keyword list first */
					kwnum = ScanKeywordLookup(yytext,
											  yyextra->keywordlist);
					if (kwnum < 0)
					{
						/*
						 * Not a keyword: return it as an identifier,
						 * converted to lower case and truncated if
						 * necessary.
						 */
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* A keyword: return its name and its token number */
					yylval->keyword = GetScanKeyword(kwnum,
													 yyextra->keywordlist);
					return yyextra->keyword_tokens[kwnum];
				}
1036
1037 {other} {
1038 SET_YYLLOC();
1039 return yytext[0];
1040 }
1041
1042 <<EOF>> {
1043 SET_YYLLOC();
1044 yyterminate();
1045 }
1046
1047 %%
1048
1049 /* LCOV_EXCL_STOP */
1050
1051 /*
1052 * Arrange access to yyextra for subroutines of the main yylex() function.
1053 * We expect each subroutine to have a yyscanner parameter. Rather than
1054 * use the yyget_xxx functions, which might or might not get inlined by the
1055 * compiler, we cheat just a bit and cast yyscanner to the right type.
1056 */
1057 #undef yyextra
1058 #define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r)
1059
1060 /* Likewise for a couple of other things we need. */
1061 #undef yylloc
1062 #define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r)
1063 #undef yyleng
1064 #define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r)
1065
1066
1067 /*
1068 * scanner_errposition
1069 * Report a lexer or grammar error cursor position, if possible.
1070 *
1071 * This is expected to be used within an ereport() call, or via an error
1072 * callback such as setup_scanner_errposition_callback(). The return value
1073 * is a dummy (always 0, in fact).
1074 *
1075 * Note that this can only be used for messages emitted during raw parsing
1076 * (essentially, scan.l, parser.c, and gram.y), since it requires the
1077 * yyscanner struct to still be available.
1078 */
1079 int
1080 scanner_errposition(int location, core_yyscan_t yyscanner)
1081 {
1082 int pos;
1083
1084 if (location < 0)
1085 return 0; /* no-op if location is unknown */
1086
1087 /* Convert byte offset to character number */
1088 pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1089 /* And pass it to the ereport mechanism */
1090 return errposition(pos);
1091 }
1092
1093 /*
1094 * Error context callback for inserting scanner error location.
1095 *
1096 * Note that this will be called for *any* error occurring while the
1097 * callback is installed. We avoid inserting an irrelevant error location
1098 * if the error is a query cancel --- are there any other important cases?
1099 */
1100 static void
1101 scb_error_callback(void *arg)
1102 {
1103 ScannerCallbackState *scbstate = (ScannerCallbackState *) arg;
1104
1105 if (geterrcode() != ERRCODE_QUERY_CANCELED)
1106 (void) scanner_errposition(scbstate->location, scbstate->yyscanner);
1107 }
1108
1109 /*
1110 * setup_scanner_errposition_callback
1111 * Arrange for non-scanner errors to report an error position
1112 *
1113 * Sometimes the scanner calls functions that aren't part of the scanner
1114 * subsystem and can't reasonably be passed the yyscanner pointer; yet
1115 * we would like any errors thrown in those functions to be tagged with an
1116 * error location. Use this function to set up an error context stack
1117 * entry that will accomplish that. Usage pattern:
1118 *
1119 * declare a local variable "ScannerCallbackState scbstate"
1120 * ...
1121 * setup_scanner_errposition_callback(&scbstate, yyscanner, location);
1122 * call function that might throw error;
1123 * cancel_scanner_errposition_callback(&scbstate);
1124 */
1125 void
1126 setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
1127 core_yyscan_t yyscanner,
1128 int location)
1129 {
1130 /* Setup error traceback support for ereport() */
1131 scbstate->yyscanner = yyscanner;
1132 scbstate->location = location;
1133 scbstate->errcallback.callback = scb_error_callback;
1134 scbstate->errcallback.arg = (void *) scbstate;
1135 scbstate->errcallback.previous = error_context_stack;
1136 error_context_stack = &scbstate->errcallback;
1137 }
1138
/*
 * Cancel a previously-set-up errposition callback.
 *
 * "scbstate" must be the state that was passed to
 * setup_scanner_errposition_callback().  The error context stack is reset
 * to the entry that was saved as "previous" at setup time.
 *
 * NOTE(review): this presumably requires that any entries pushed after
 * ours have already been removed, since they would be dropped as well ---
 * confirm against callers.
 */
void
cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
{
	/* Pop the error context stack */
	error_context_stack = scbstate->errcallback.previous;
}
1148
1149 /*
1150 * scanner_yyerror
1151 * Report a lexer or grammar error.
1152 *
1153 * The message's cursor position is whatever YYLLOC was last set to,
1154 * ie, the start of the current token if called within yylex(), or the
1155 * most recently lexed token if called from the grammar.
1156 * This is OK for syntax error messages from the Bison parser, because Bison
1157 * parsers report error as soon as the first unparsable token is reached.
1158 * Beware of using yyerror for other purposes, as the cursor position might
1159 * be misleading!
1160 */
1161 void
1162 scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1163 {
1164 const char *loc = yyextra->scanbuf + *yylloc;
1165
1166 if (*loc == YY_END_OF_BUFFER_CHAR)
1167 {
1168 ereport(ERROR,
1169 (errcode(ERRCODE_SYNTAX_ERROR),
1170 /* translator: %s is typically the translation of "syntax error" */
1171 errmsg("%s at end of input", _(message)),
1172 lexer_errposition()));
1173 }
1174 else
1175 {
1176 ereport(ERROR,
1177 (errcode(ERRCODE_SYNTAX_ERROR),
1178 /* translator: first %s is typically the translation of "syntax error" */
1179 errmsg("%s at or near \"%s\"", _(message), loc),
1180 lexer_errposition()));
1181 }
1182 }
1183
1184
1185 /*
1186 * Called before any actual parsing is done
1187 */
1188 core_yyscan_t
1189 scanner_init(const char *str,
1190 core_yy_extra_type *yyext,
1191 const ScanKeywordList *keywordlist,
1192 const uint16 *keyword_tokens)
1193 {
1194 Size slen = strlen(str);
1195 yyscan_t scanner;
1196
1197 if (yylex_init(&scanner) != 0)
1198 elog(ERROR, "yylex_init() failed: %m");
1199
1200 core_yyset_extra(yyext, scanner);
1201
1202 yyext->keywordlist = keywordlist;
1203 yyext->keyword_tokens = keyword_tokens;
1204
1205 yyext->backslash_quote = backslash_quote;
1206 yyext->escape_string_warning = escape_string_warning;
1207 yyext->standard_conforming_strings = standard_conforming_strings;
1208
1209 /*
1210 * Make a scan buffer with special termination needed by flex.
1211 */
1212 yyext->scanbuf = (char *) palloc(slen + 2);
1213 yyext->scanbuflen = slen;
1214 memcpy(yyext->scanbuf, str, slen);
1215 yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1216 yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1217
1218 /* initialize literal buffer to a reasonable but expansible size */
1219 yyext->literalalloc = 1024;
1220 yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1221 yyext->literallen = 0;
1222
1223 return scanner;
1224 }
1225
1226
1227 /*
1228 * Called after parsing is done to clean up after scanner_init()
1229 */
1230 void
1231 scanner_finish(core_yyscan_t yyscanner)
1232 {
1233 /*
1234 * We don't bother to call yylex_destroy(), because all it would do is
1235 * pfree a small amount of control storage. It's cheaper to leak the
1236 * storage until the parsing context is destroyed. The amount of space
1237 * involved is usually negligible compared to the output parse tree
1238 * anyway.
1239 *
1240 * We do bother to pfree the scanbuf and literal buffer, but only if they
1241 * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
1242 */
1243 if (yyextra->scanbuflen >= 8192)
1244 pfree(yyextra->scanbuf);
1245 if (yyextra->literalalloc >= 8192)
1246 pfree(yyextra->literalbuf);
1247 }
1248
1249
1250 static void
1251 addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1252 {
1253 /* enlarge buffer if needed */
1254 if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1255 {
1256 do
1257 {
1258 yyextra->literalalloc *= 2;
1259 } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1260 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1261 yyextra->literalalloc);
1262 }
1263 /* append new data */
1264 memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1265 yyextra->literallen += yleng;
1266 }
1267
1268
1269 static void
1270 addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1271 {
1272 /* enlarge buffer if needed */
1273 if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1274 {
1275 yyextra->literalalloc *= 2;
1276 yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1277 yyextra->literalalloc);
1278 }
1279 /* append new data */
1280 yyextra->literalbuf[yyextra->literallen] = ychar;
1281 yyextra->literallen += 1;
1282 }
1283
1284
1285 /*
1286 * Create a palloc'd copy of literalbuf, adding a trailing null.
1287 */
1288 static char *
1289 litbufdup(core_yyscan_t yyscanner)
1290 {
1291 int llen = yyextra->literallen;
1292 char *new;
1293
1294 new = palloc(llen + 1);
1295 memcpy(new, yyextra->literalbuf, llen);
1296 new[llen] = '\0';
1297 return new;
1298 }
1299
1300 /*
1301 * Process {integer}. Note this will also do the right thing with {decimal},
1302 * ie digits and a decimal point.
1303 */
1304 static int
1305 process_integer_literal(const char *token, YYSTYPE *lval)
1306 {
1307 int val;
1308 char *endptr;
1309
1310 errno = 0;
1311 val = strtoint(token, &endptr, 10);
1312 if (*endptr != '\0' || errno == ERANGE)
1313 {
1314 /* integer too large (or contains decimal pt), treat it as a float */
1315 lval->str = pstrdup(token);
1316 return FCONST;
1317 }
1318 lval->ival = val;
1319 return ICONST;
1320 }
1321
1322 static void
1323 addunicode(pg_wchar c, core_yyscan_t yyscanner)
1324 {
1325 ScannerCallbackState scbstate;
1326 char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
1327
1328 if (!is_valid_unicode_codepoint(c))
1329 yyerror("invalid Unicode escape value");
1330
1331 /*
1332 * We expect that pg_unicode_to_server() will complain about any
1333 * unconvertible code point, so we don't have to set saw_non_ascii.
1334 */
1335 setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc));
1336 pg_unicode_to_server(c, (unsigned char *) buf);
1337 cancel_scanner_errposition_callback(&scbstate);
1338 addlit(buf, strlen(buf), yyscanner);
1339 }
1340
1341 static unsigned char
1342 unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1343 {
1344 switch (c)
1345 {
1346 case 'b':
1347 return '\b';
1348 case 'f':
1349 return '\f';
1350 case 'n':
1351 return '\n';
1352 case 'r':
1353 return '\r';
1354 case 't':
1355 return '\t';
1356 default:
1357 /* check for backslash followed by non-7-bit-ASCII */
1358 if (c == '\0' || IS_HIGHBIT_SET(c))
1359 yyextra->saw_non_ascii = true;
1360
1361 return c;
1362 }
1363 }
1364
1365 static void
1366 check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1367 {
1368 if (ychar == '\'')
1369 {
1370 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1371 ereport(WARNING,
1372 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1373 errmsg("nonstandard use of \\' in a string literal"),
1374 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1375 lexer_errposition()));
1376 yyextra->warn_on_first_escape = false; /* warn only once per string */
1377 }
1378 else if (ychar == '\\')
1379 {
1380 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1381 ereport(WARNING,
1382 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1383 errmsg("nonstandard use of \\\\ in a string literal"),
1384 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1385 lexer_errposition()));
1386 yyextra->warn_on_first_escape = false; /* warn only once per string */
1387 }
1388 else
1389 check_escape_warning(yyscanner);
1390 }
1391
1392 static void
1393 check_escape_warning(core_yyscan_t yyscanner)
1394 {
1395 if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1396 ereport(WARNING,
1397 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1398 errmsg("nonstandard use of escape in a string literal"),
1399 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1400 lexer_errposition()));
1401 yyextra->warn_on_first_escape = false; /* warn only once per string */
1402 }
1403
1404 /*
1405 * Interface functions to make flex use palloc() instead of malloc().
1406 * It'd be better to make these static, but flex insists otherwise.
1407 */
1408
1409 void *
1410 core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1411 {
1412 return palloc(bytes);
1413 }
1414
1415 void *
1416 core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1417 {
1418 if (ptr)
1419 return repalloc(ptr, bytes);
1420 else
1421 return palloc(bytes);
1422 }
1423
1424 void
1425 core_yyfree(void *ptr, core_yyscan_t yyscanner)
1426 {
1427 if (ptr)
1428 pfree(ptr);
1429 }
1430