1 %top{
2 /*-------------------------------------------------------------------------
3  *
4  * psqlscan.l
5  *	  lexical scanner for SQL commands
6  *
7  * This lexer used to be part of psql, and that heritage is reflected in
8  * the file name as well as function and typedef names, though it can now
9  * be used by other frontend programs as well.  It's also possible to extend
10  * this lexer with a compatible add-on lexer to handle program-specific
11  * backslash commands.
12  *
13  * This code is mainly concerned with determining where the end of a SQL
14  * statement is: we are looking for semicolons that are not within quotes,
15  * comments, or parentheses.  The most reliable way to handle this is to
16  * borrow the backend's flex lexer rules, lock, stock, and barrel.  The rules
17  * below are (except for a few) the same as the backend's, but their actions
18  * are just ECHO whereas the backend's actions generally do other things.
19  *
20  * XXX The rules in this file must be kept in sync with the backend lexer!!!
21  *
22  * XXX Avoid creating backtracking cases --- see the backend lexer for info.
23  *
24  * See psqlscan_int.h for additional commentary.
25  *
26  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
27  * Portions Copyright (c) 1994, Regents of the University of California
28  *
29  * IDENTIFICATION
30  *	  src/fe_utils/psqlscan.l
31  *
32  *-------------------------------------------------------------------------
33  */
34 #include "postgres_fe.h"
35 
36 #include "fe_utils/psqlscan.h"
37 
38 #include "libpq-fe.h"
39 }
40 
41 %{
42 #include "fe_utils/psqlscan_int.h"
43 
44 /*
45  * We must have a typedef YYSTYPE for yylex's first argument, but this lexer
46  * doesn't presently make use of that argument, so just declare it as int.
47  */
48 typedef int YYSTYPE;
49 
50 /*
51  * Set the type of yyextra; we use it as a pointer back to the containing
52  * PsqlScanState.
53  */
54 #define YY_EXTRA_TYPE PsqlScanState
55 
56 
/* Return values from yylex() */
#define LEXRES_EOL			0	/* end of input */
#define LEXRES_SEMI			1	/* command-terminating semicolon found */
#define LEXRES_BACKSLASH	2	/* backslash command start */


/*
 * ECHO, as used by the rule actions below, hands the current token
 * (yytext, yyleng) to psqlscan_emit().  The expansion refers to a variable
 * named cur_state, so ECHO can only be used where one is in scope (the
 * rules section declares it at the top of yylex()).
 */
#define ECHO psqlscan_emit(cur_state, yytext, yyleng)
64 
65 /*
66  * Work around a bug in flex 2.5.35: it emits a couple of functions that
67  * it forgets to emit declarations for.  Since we use -Wmissing-prototypes,
68  * this would cause warnings.  Providing our own declarations should be
69  * harmless even when the bug gets fixed.
70  */
71 extern int	psql_yyget_column(yyscan_t yyscanner);
72 extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
73 
74 %}
75 
76 %option reentrant
77 %option bison-bridge
78 %option 8bit
79 %option never-interactive
80 %option nodefault
81 %option noinput
82 %option nounput
83 %option noyywrap
84 %option warn
85 %option prefix="psql_yy"
86 
87 /*
88  * All of the following definitions and rules should exactly match
89  * src/backend/parser/scan.l so far as the flex patterns are concerned.
90  * The rule bodies are just ECHO as opposed to what the backend does,
91  * however.  (But be sure to duplicate code that affects the lexing process,
92  * such as BEGIN() and yyless().)  Also, psqlscan uses a single <<EOF>> rule
93  * whereas scan.l has a separate one for each exclusive state.
94  */
95 
96 /*
97  * OK, here is a short description of lex/flex rules behavior.
98  * The longest pattern which matches an input string is always chosen.
99  * For equal-length patterns, the first occurring in the rules list is chosen.
100  * INITIAL is the starting state, to which all non-conditional rules apply.
101  * Exclusive states change parsing rules while the state is active.  When in
102  * an exclusive state, only those rules defined for that state apply.
103  *
104  * We use exclusive states for quoted strings, extended comments,
105  * and to eliminate parsing troubles for numeric strings.
106  * Exclusive states:
107  *  <xb> bit string literal
108  *  <xc> extended C-style comments
109  *  <xd> delimited identifiers (double-quoted identifiers)
110  *  <xh> hexadecimal numeric string
111  *  <xq> standard quoted strings
112  *  <xe> extended quoted strings (support backslash escape sequences)
113  *  <xdolq> $foo$ quoted strings
114  *  <xui> quoted identifier with Unicode escapes
115  *  <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
116  *  <xus> quoted string with Unicode escapes
117  *  <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
118  *
119  * Note: we intentionally don't mimic the backend's <xeu> state; we have
120  * no need to distinguish it from <xe> state, and no good way to get out
121  * of it in error cases.  The backend just throws yyerror() in those
122  * cases, but that's not an option here.
123  */
124 
125 %x xb
126 %x xc
127 %x xd
128 %x xh
129 %x xe
130 %x xq
131 %x xdolq
132 %x xui
133 %x xuiend
134 %x xus
135 %x xusend
136 
137 /*
138  * In order to make the world safe for Windows and Mac clients as well as
139  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
140  * sequence will be seen as two successive newlines, but that doesn't cause
141  * any problems.  Comments that start with -- and extend to the next
142  * newline are treated as equivalent to a single whitespace character.
143  *
144  * NOTE a fine point: if there is no newline following --, we will absorb
145  * everything to the end of the input as a comment.  This is correct.  Older
146  * versions of Postgres failed to recognize -- as a comment if the input
147  * did not end with a newline.
148  *
149  * XXX perhaps \f (formfeed) should be treated as a newline as well?
150  *
151  * XXX if you change the set of whitespace characters, fix scanner_isspace()
152  * to agree, and see also the plpgsql lexer.
153  */
154 
155 space			[ \t\n\r\f]
156 horiz_space		[ \t\f]
157 newline			[\n\r]
158 non_newline		[^\n\r]
159 
160 comment			("--"{non_newline}*)
161 
162 whitespace		({space}+|{comment})
163 
164 /*
165  * SQL requires at least one newline in the whitespace separating
166  * string literals that are to be concatenated.  Silly, but who are we
167  * to argue?  Note that {whitespace_with_newline} should not have * after
168  * it, whereas {whitespace} should generally have a * after it...
169  */
170 
171 special_whitespace		({space}+|{comment}{newline})
172 horiz_whitespace		({horiz_space}|{comment})
173 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
174 
175 /*
176  * To ensure that {quotecontinue} can be scanned without having to back up
177  * if the full pattern isn't matched, we include trailing whitespace in
178  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
179  * except for {quote} followed by whitespace and just one "-" (not two,
180  * which would start a {comment}).  To cover that we have {quotefail}.
181  * The actions for {quotestop} and {quotefail} must throw back characters
182  * beyond the quote proper.
183  */
184 quote			'
185 quotestop		{quote}{whitespace}*
186 quotecontinue	{quote}{whitespace_with_newline}{quote}
187 quotefail		{quote}{whitespace}*"-"
188 
189 /* Bit string
190  * It is tempting to scan the string for only those characters
191  * which are allowed. However, this leads to silently swallowed
192  * characters if illegal characters are included in the string.
193  * For example, if xbinside is [01] then B'ABCD' is interpreted
194  * as a zero-length string, and the ABCD' is lost!
195  * Better to pass the string forward and let the input routines
196  * validate the contents.
197  */
198 xbstart			[bB]{quote}
199 xbinside		[^']*
200 
201 /* Hexadecimal number */
202 xhstart			[xX]{quote}
203 xhinside		[^']*
204 
205 /* National character */
206 xnstart			[nN]{quote}
207 
208 /* Quoted string that allows backslash escapes */
209 xestart			[eE]{quote}
210 xeinside		[^\\']+
211 xeescape		[\\][^0-7]
212 xeoctesc		[\\][0-7]{1,3}
213 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
214 xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
215 xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
216 
217 /* Extended quote
218  * xqdouble implements embedded quote, ''''
219  */
220 xqstart			{quote}
221 xqdouble		{quote}{quote}
222 xqinside		[^']+
223 
224 /* $foo$ style quotes ("dollar quoting")
225  * The quoted string starts with $foo$ where "foo" is an optional string
226  * in the form of an identifier, except that it may not contain "$",
227  * and extends to the first occurrence of an identical string.
228  * There is *no* processing of the quoted text.
229  *
230  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
231  * fails to match its trailing "$".
232  */
233 dolq_start		[A-Za-z\200-\377_]
234 dolq_cont		[A-Za-z\200-\377_0-9]
235 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
236 dolqfailed		\${dolq_start}{dolq_cont}*
237 dolqinside		[^$]+
238 
239 /* Double quote
240  * Allows embedded spaces and other special characters into identifiers.
241  */
242 dquote			\"
243 xdstart			{dquote}
244 xdstop			{dquote}
245 xddouble		{dquote}{dquote}
246 xdinside		[^"]+
247 
248 /* Unicode escapes */
249 uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
250 /* error rule to avoid backup */
251 uescapefail		[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
252 
253 /* Quoted identifier with Unicode escapes */
254 xuistart		[uU]&{dquote}
255 
256 /* Quoted string with Unicode escapes */
257 xusstart		[uU]&{quote}
258 
259 /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
260 xustop1		{uescapefail}?
261 xustop2		{uescape}
262 
263 /* error rule to avoid backup */
264 xufailed		[uU]&
265 
266 
267 /* C-style comments
268  *
269  * The "extended comment" syntax closely resembles allowable operator syntax.
270  * The tricky part here is to get lex to recognize a string starting with
271  * slash-star as a comment, when interpreting it as an operator would produce
272  * a longer match --- remember lex will prefer a longer match!  Also, if we
273  * have something like plus-slash-star, lex will think this is a 3-character
274  * operator whereas we want to see it as a + operator and a comment start.
275  * The solution is two-fold:
276  * 1. append {op_chars}* to xcstart so that it matches as much text as
277  *    {operator} would. Then the tie-breaker (first matching rule of same
278  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
279  *    in case it contains a star-slash that should terminate the comment.
280  * 2. In the operator rule, check for slash-star within the operator, and
281  *    if found throw it back with yyless().  This handles the plus-slash-star
282  *    problem.
283  * Dash-dash comments have similar interactions with the operator rule.
284  */
285 xcstart			\/\*{op_chars}*
286 xcstop			\*+\/
287 xcinside		[^*/]+
288 
289 digit			[0-9]
290 ident_start		[A-Za-z\200-\377_]
291 ident_cont		[A-Za-z\200-\377_0-9\$]
292 
293 identifier		{ident_start}{ident_cont}*
294 
295 /* Assorted special-case operators and operator-like tokens */
296 typecast		"::"
297 dot_dot			\.\.
298 colon_equals	":="
299 
300 /*
301  * These operator-like tokens (unlike the above ones) also match the {operator}
302  * rule, which means that they might be overridden by a longer match if they
303  * are followed by a comment start or a + or - character. Accordingly, if you
304  * add to this list, you must also add corresponding code to the {operator}
305  * block to return the correct token in such cases. (This is not needed in
306  * psqlscan.l since the token value is ignored there.)
307  */
308 equals_greater	"=>"
309 less_equals		"<="
310 greater_equals	">="
311 less_greater	"<>"
312 not_equals		"!="
313 
314 /*
315  * "self" is the set of chars that should be returned as single-character
316  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
317  * which can be one or more characters long (but if a single-char token
318  * appears in the "self" set, it is not to be returned as an Op).  Note
319  * that the sets overlap, but each has some chars that are not in the other.
320  *
321  * If you change either set, adjust the character lists appearing in the
322  * rule for "operator"!
323  */
324 self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
325 op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
326 operator		{op_chars}+
327 
/* We no longer allow unary minus in numbers.
 * Instead we pass it separately to the parser, where it gets
 * coerced via doNegate() -- Leon aug 20 1999
331  *
332  * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
333  *
334  * {realfail1} and {realfail2} are added to prevent the need for scanner
335  * backup when the {real} rule fails to match completely.
336  */
337 
338 integer			{digit}+
339 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
340 decimalfail		{digit}+\.\.
341 real			({integer}|{decimal})[Ee][-+]?{digit}+
342 realfail1		({integer}|{decimal})[Ee]
343 realfail2		({integer}|{decimal})[Ee][-+]
344 
345 param			\${integer}
346 
347 /* psql-specific: characters allowed in variable names */
348 variable_char	[A-Za-z\200-\377_0-9]
349 
350 other			.
351 
352 /*
353  * Dollar quoted strings are totally opaque, and no escaping is done on them.
354  * Other quoted strings must allow some special characters such as single-quote
355  *  and newline.
356  * Embedded single-quotes are implemented both in the SQL standard
357  *  style of two adjacent single quotes "''" and in the Postgres/Java style
358  *  of escaped-quote "\'".
359  * Other embedded escaped characters are matched explicitly and the leading
360  *  backslash is dropped from the string.
361  * Note that xcstart must appear before operator, as explained above!
362  *  Also whitespace (comment) must appear before operator.
363  */
364 
365 %%
366 
%{
		/*
		 * Declare some local variables inside yylex(), for convenience.
		 * cur_state is the variable the ECHO macro refers to; output_buf
		 * is consulted by the {whitespace} rule.
		 */
		PsqlScanState cur_state = yyextra;
		PQExpBuffer output_buf = cur_state->output_buf;

		/*
		 * Force flex into the state indicated by start_state.  This has a
		 * couple of purposes: it lets some of the functions below set a new
		 * starting state without ugly direct access to flex variables, and it
		 * allows us to transition from one flex lexer to another so that we
		 * can lex different parts of the source string using separate lexers.
		 *
		 * The rules that return from yylex() store YY_START into start_state
		 * first, so the next call resumes in the same state.
		 */
		BEGIN(cur_state->start_state);
%}
381 
{whitespace}	{
					/*
					 * Note that the whitespace rule includes both true
					 * whitespace and single-line ("--" style) comments.
					 * We suppress whitespace at the start of the query
					 * buffer.  We also suppress all single-line comments,
					 * which is pretty dubious but is the historical
					 * behavior.
					 *
					 * The yytext[0] == '-' test is what identifies a
					 * comment: of the two alternatives in {whitespace},
					 * only {comment} can begin with a dash.
					 */
					if (!(output_buf->len == 0 || yytext[0] == '-'))
						ECHO;
				}
394 
{xcstart}		{
					/*
					 * Start of an outermost C-style comment: reset the
					 * nesting counter and enter the <xc> state.
					 */
					cur_state->xcdepth = 0;
					BEGIN(xc);
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstart}	{
					/* A comment start inside a comment nests one level deeper */
					cur_state->xcdepth++;
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstop}	{
					/*
					 * At nesting depth zero this closes the outermost
					 * comment, so return to normal lexing; otherwise it
					 * only closes one nested level.
					 */
					if (cur_state->xcdepth <= 0)
						BEGIN(INITIAL);
					else
						cur_state->xcdepth--;
					ECHO;
				}

<xc>{xcinside}	{
					/* Comment text containing neither '*' nor '/' */
					ECHO;
				}

<xc>{op_chars}	{
					/*
					 * {xcinside} excludes '*' and '/'; a lone one of those
					 * that does not form a comment start or stop is
					 * consumed here.
					 */
					ECHO;
				}

<xc>\*+			{
					/* A run of stars that is not followed by a slash */
					ECHO;
				}
429 
{xbstart}		{
					/* b'...' or B'...': start of a bit string literal */
					BEGIN(xb);
					ECHO;
				}
<xb>{quotestop}	|
<xb>{quotefail} {
					/*
					 * End of the literal.  Both patterns can match text
					 * beyond the closing quote, so keep only the quote
					 * itself and let the rest be rescanned.
					 */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xh>{xhinside}	|
<xb>{xbinside}	{
					/* Literal contents; anything but a quote is accepted */
					ECHO;
				}
<xh>{quotecontinue}	|
<xb>{quotecontinue}	{
					/*
					 * Quote, whitespace containing a newline, quote: the
					 * literal continues, so stay in the current state.
					 */
					ECHO;
				}

{xhstart}		{
					/* Hexadecimal bit type.
					 * At some point we should simply pass the string
					 * forward to the parser and label it there.
					 * In the meantime, place a leading "x" on the string
					 * to mark it for the input routine as a hex string.
					 *
					 * NOTE(review): the remark about placing a leading "x"
					 * appears to be inherited from the backend lexer; the
					 * action here only switches state and echoes the token.
					 */
					BEGIN(xh);
					ECHO;
				}
<xh>{quotestop}	|
<xh>{quotefail} {
					/* As for <xb>: keep only the closing quote */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}

{xnstart}		{
					/*
					 * National character literal.  The quote is pushed back
					 * so that it is rescanned and starts an ordinary quoted
					 * string.
					 */
					yyless(1);	/* eat only 'n' this time */
					ECHO;
				}
470 
{xqstart}		{
					/*
					 * A plain quote starts a standard string if
					 * std_strings is set, otherwise a string in which
					 * backslash escapes are recognized.
					 */
					if (cur_state->std_strings)
						BEGIN(xq);
					else
						BEGIN(xe);
					ECHO;
				}
{xestart}		{
					/* e'...' or E'...': backslash escapes always recognized */
					BEGIN(xe);
					ECHO;
				}
{xusstart}		{
					/* u&'...' or U&'...': string with Unicode escapes */
					BEGIN(xus);
					ECHO;
				}
<xq,xe>{quotestop}	|
<xq,xe>{quotefail} {
					/*
					 * End of the string.  Keep only the closing quote and
					 * rescan whatever else the pattern matched.
					 */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xus>{quotestop} |
<xus>{quotefail} {
					/*
					 * throw back all but the quote; then move to <xusend>,
					 * where an optional UESCAPE clause may follow
					 */
					yyless(1);
					BEGIN(xusend);
					ECHO;
				}
<xusend>{whitespace} {
					/* Whitespace may separate the string from UESCAPE */
					ECHO;
				}
<xusend>{other} |
<xusend>{xustop1} {
					/*
					 * No complete UESCAPE clause follows.  Push back the
					 * whole match so it is rescanned in INITIAL state.
					 */
					yyless(0);
					BEGIN(INITIAL);
					ECHO;
				}
<xusend>{xustop2} {
					/* A complete UESCAPE clause: pass it through */
					BEGIN(INITIAL);
					ECHO;
				}
<xq,xe,xus>{xqdouble} {
					/* Doubled quote: an embedded quote, not the end */
					ECHO;
				}
<xq,xus>{xqinside}  {
					/* Ordinary string contents */
					ECHO;
				}
<xe>{xeinside}  {
					/* Contents with neither backslash nor quote */
					ECHO;
				}
<xe>{xeunicode} {
					/* \uXXXX or \UXXXXXXXX escape */
					ECHO;
				}
<xe>{xeunicodefail}	{
					/* Incomplete Unicode escape; matched to avoid backup */
					ECHO;
				}
<xe>{xeescape}  {
					/*
					 * Backslash followed by any non-octal-digit character,
					 * which includes an escaped quote
					 */
					ECHO;
				}
<xe>{xeoctesc}  {
					/* Backslash followed by one to three octal digits */
					ECHO;
				}
<xe>{xehexesc}  {
					/* \x followed by one or two hex digits */
					ECHO;
				}
<xq,xe,xus>{quotecontinue} {
					/*
					 * Quote, whitespace containing a newline, quote: the
					 * string continues, so stay in the current state.
					 */
					ECHO;
				}
<xe>.			{
					/* This is only needed for \ just before EOF */
					ECHO;
				}
543 
{dolqdelim}		{
					/*
					 * Opening delimiter of a dollar-quoted string.  Keep a
					 * copy so the matching closing delimiter can be
					 * recognized later.
					 */
					cur_state->dolqstart = pg_strdup(yytext);
					BEGIN(xdolq);
					ECHO;
				}
{dolqfailed}	{
					/* throw back all but the initial "$" */
					yyless(1);
					ECHO;
				}
<xdolq>{dolqdelim} {
					/*
					 * Something shaped like a delimiter inside the quoted
					 * text.  Only an exact match of the saved opening
					 * delimiter ends the string.
					 */
					if (strcmp(yytext, cur_state->dolqstart) == 0)
					{
						free(cur_state->dolqstart);
						cur_state->dolqstart = NULL;
						BEGIN(INITIAL);
					}
					else
					{
						/*
						 * When we fail to match $...$ to dolqstart, transfer
						 * the $... part to the output, but put back the final
						 * $ for rescanning.  Consider $delim$...$junk$delim$
						 */
						yyless(yyleng - 1);
					}
					ECHO;
				}
<xdolq>{dolqinside} {
					/* Quoted text containing no "$" */
					ECHO;
				}
<xdolq>{dolqfailed} {
					/* "$" plus identifier characters, but no closing "$" */
					ECHO;
				}
<xdolq>.		{
					/* This is only needed for $ inside the quoted text */
					ECHO;
				}
582 
{xdstart}		{
					/* Start of a double-quoted identifier */
					BEGIN(xd);
					ECHO;
				}
{xuistart}		{
					/* u&"..." or U&"...": identifier with Unicode escapes */
					BEGIN(xui);
					ECHO;
				}
<xd>{xdstop}	{
					/* Closing double quote ends the identifier */
					BEGIN(INITIAL);
					ECHO;
				}
<xui>{dquote} {
					/*
					 * Closing double quote; move to <xuiend>, where an
					 * optional UESCAPE clause may follow.
					 */
					yyless(1);
					BEGIN(xuiend);
					ECHO;
				}
<xuiend>{whitespace} {
					/* Whitespace may separate the identifier from UESCAPE */
					ECHO;
				}
<xuiend>{other} |
<xuiend>{xustop1} {
					/*
					 * No complete UESCAPE clause follows.  Push back the
					 * whole match so it is rescanned in INITIAL state.
					 */
					yyless(0);
					BEGIN(INITIAL);
					ECHO;
				}
<xuiend>{xustop2}	{
					/* A complete UESCAPE clause: pass it through */
					BEGIN(INITIAL);
					ECHO;
				}
<xd,xui>{xddouble}	{
					/* Doubled double quote: embedded quote, not the end */
					ECHO;
				}
<xd,xui>{xdinside}	{
					/* Ordinary identifier contents */
					ECHO;
				}

{xufailed}	{
					/* throw back all but the initial u/U */
					yyless(1);
					ECHO;
				}
625 
{typecast}		{
					/* "::" */
					ECHO;
				}

{dot_dot}		{
					/* ".." */
					ECHO;
				}

{colon_equals}	{
					/* ":=" */
					ECHO;
				}

{equals_greater} {
					/* "=>" */
					ECHO;
				}

{less_equals}	{
					/* "<=" */
					ECHO;
				}

{greater_equals} {
					/* ">=" */
					ECHO;
				}

{less_greater}	{
					/* "<>" */
					ECHO;
				}

{not_equals}	{
					/* "!=" */
					ECHO;
				}
657 
	/*
	 * These rules are specific to psql --- they implement parenthesis
	 * counting and detection of command-ending semicolon.  These must
	 * appear before the {self} rule so that they take precedence over it.
	 */

"("				{
					cur_state->paren_depth++;
					ECHO;
				}

")"				{
					/* An unmatched ")" must not drive the depth negative */
					if (cur_state->paren_depth > 0)
						cur_state->paren_depth--;
					ECHO;
				}

";"				{
					ECHO;
					/* A semicolon inside parentheses does not end the command */
					if (cur_state->paren_depth == 0)
					{
						/*
						 * Terminate lexing temporarily; remember the flex
						 * state so the next yylex() call resumes in it
						 */
						cur_state->start_state = YY_START;
						return LEXRES_SEMI;
					}
				}
684 
	/*
	 * psql-specific rules to handle backslash commands and variable
	 * substitution.  We want these before {self}, also.
	 */

"\\"[;:]		{
					/*
					 * Force a semicolon or colon into the query buffer:
					 * only the character after the backslash is emitted.
					 */
					psqlscan_emit(cur_state, yytext + 1, 1);
				}

"\\"			{
					/*
					 * Terminate lexing temporarily; remember the flex state
					 * so the next yylex() call resumes in it.  Nothing is
					 * emitted for the backslash here.
					 */
					cur_state->start_state = YY_START;
					return LEXRES_BACKSLASH;
				}
700 
701 :{variable_char}+	{
702 					/* Possible psql variable substitution */
703 					char	   *varname;
704 					char	   *value;
705 
706 					varname = psqlscan_extract_substring(cur_state,
707 														 yytext + 1,
708 														 yyleng - 1);
709 					if (cur_state->callbacks->get_variable)
710 						value = cur_state->callbacks->get_variable(varname,
711 																   false,
712 																   false);
713 					else
714 						value = NULL;
715 
716 					if (value)
717 					{
718 						/* It is a variable, check for recursion */
719 						if (psqlscan_var_is_current_source(cur_state, varname))
720 						{
721 							/* Recursive expansion --- don't go there */
722 							cur_state->callbacks->write_error("skipping recursive expansion of variable \"%s\"\n",
723 															  varname);
724 							/* Instead copy the string as is */
725 							ECHO;
726 						}
727 						else
728 						{
729 							/* OK, perform substitution */
730 							psqlscan_push_new_buffer(cur_state, value, varname);
731 							/* yy_scan_string already made buffer active */
732 						}
733 						free(value);
734 					}
735 					else
736 					{
737 						/*
738 						 * if the variable doesn't exist we'll copy the string
739 						 * as is
740 						 */
741 						ECHO;
742 					}
743 
744 					free(varname);
745 				}
746 
:'{variable_char}+'	{
					/*
					 * :'name' form.  NOTE(review): the final argument
					 * presumably selects literal (false) versus identifier
					 * (true) quoting; confirm in psqlscan_escape_variable().
					 */
					psqlscan_escape_variable(cur_state, yytext, yyleng, false);
				}

:\"{variable_char}+\"	{
					/* :"name" form; see the note on the rule above */
					psqlscan_escape_variable(cur_state, yytext, yyleng, true);
				}

	/*
	 * These rules just avoid the need for scanner backup if one of the
	 * two rules above fails to match completely.
	 */

:'{variable_char}*	{
					/* Throw back everything but the colon */
					yyless(1);
					ECHO;
				}

:\"{variable_char}*	{
					/* Throw back everything but the colon */
					yyless(1);
					ECHO;
				}

	/*
	 * Back to backend-compatible rules.
	 */

{self}			{
					/* Single-character token */
					ECHO;
				}
779 
{operator}		{
					/*
					 * An operator must stop at an embedded comment start
					 * (slash-star or dash-dash).  A comment start at the
					 * very first character is matched by an earlier rule,
					 * never by this one.
					 */
					int			keep = yyleng;
					char	   *cstart = strstr(yytext, "/*");
					char	   *linecomment = strstr(yytext, "--");

					/* Whichever comment start comes first wins */
					if (linecomment != NULL &&
						(cstart == NULL || linecomment < cstart))
						cstart = linecomment;
					if (cstart != NULL)
						keep = cstart - yytext;

					/*
					 * SQL compatibility: a multi-char operator may end in
					 * '+' or '-' only if it also contains a character that
					 * cannot appear in SQL operators.  So '=-' lexes as two
					 * operators, while a name such as '?-' stays whole.
					 */
					if (keep > 1 &&
						(yytext[keep - 1] == '+' ||
						 yytext[keep - 1] == '-'))
					{
						int			pos;

						/* Look for a qualifying character before the last one */
						for (pos = 0; pos < keep - 1; pos++)
						{
							if (strchr("~!@#^&|`?%", yytext[pos]) != NULL)
								break;
						}
						if (pos == keep - 1)
						{
							/* None found, so drop all trailing [+-] */
							keep--;
							while (keep > 1 &&
								   (yytext[keep - 1] == '+' ||
									yytext[keep - 1] == '-'))
								keep--;
						}
					}

					/* Return any stripped characters to the input */
					if (keep < yyleng)
						yyless(keep);
					ECHO;
				}
846 
{param}			{
					/* "$" followed by digits */
					ECHO;
				}

{integer}		{
					ECHO;
				}
{decimal}		{
					ECHO;
				}
{decimalfail}	{
					/* throw back the .., and treat as integer */
					yyless(yyleng - 2);
					ECHO;
				}
{real}			{
					ECHO;
				}
{realfail1}		{
					/*
					 * throw back the [Ee], and treat as {decimal}.  Note
					 * that it is possible the input is actually {integer},
					 * but since this case will almost certainly lead to a
					 * syntax error anyway, we don't bother to distinguish.
					 */
					yyless(yyleng - 1);
					ECHO;
				}
{realfail2}		{
					/* throw back the [Ee][+-], and proceed as above */
					yyless(yyleng - 2);
					ECHO;
				}


{identifier}	{
					ECHO;
				}

{other}			{
					/* Any single character no earlier rule matched */
					ECHO;
				}

<<EOF>>			{
					/*
					 * With no variable-expansion buffers stacked, this is
					 * the real end of the input: remember the flex state
					 * for the caller and stop.
					 */
					if (cur_state->buffer_stack == NULL)
					{
						cur_state->start_state = YY_START;
						return LEXRES_EOL;		/* end of input reached */
					}

					/*
					 * We were expanding a variable, so pop the inclusion
					 * stack and keep lexing
					 */
					psqlscan_pop_buffer_stack(cur_state);
					psqlscan_select_top_buffer(cur_state);
				}
904 
905 %%
906 
907 /*
908  * Create a lexer working state struct.
909  *
910  * callbacks is a struct of function pointers that encapsulate some
911  * behavior we need from the surrounding program.  This struct must
912  * remain valid for the lifespan of the PsqlScanState.
913  */
914 PsqlScanState
915 psql_scan_create(const PsqlScanCallbacks *callbacks)
916 {
917 	PsqlScanState state;
918 
919 	state = (PsqlScanStateData *) pg_malloc0(sizeof(PsqlScanStateData));
920 
921 	state->callbacks = callbacks;
922 
923 	yylex_init(&state->scanner);
924 
925 	yyset_extra(state, state->scanner);
926 
927 	psql_scan_reset(state);
928 
929 	return state;
930 }
931 
932 /*
933  * Destroy a lexer working state struct, releasing all resources.
934  */
935 void
936 psql_scan_destroy(PsqlScanState state)
937 {
938 	psql_scan_finish(state);
939 
940 	psql_scan_reset(state);
941 
942 	yylex_destroy(state->scanner);
943 
944 	free(state);
945 }
946 
947 /*
948  * Set up to perform lexing of the given input line.
949  *
950  * The text at *line, extending for line_len bytes, will be scanned by
951  * subsequent calls to the psql_scan routines.  psql_scan_finish should
952  * be called when scanning is complete.  Note that the lexer retains
953  * a pointer to the storage at *line --- this string must not be altered
954  * or freed until after psql_scan_finish is called.
955  *
956  * encoding is the libpq identifier for the character encoding in use,
957  * and std_strings says whether standard_conforming_strings is on.
958  */
959 void
960 psql_scan_setup(PsqlScanState state,
961 				const char *line, int line_len,
962 				int encoding, bool std_strings)
963 {
964 	/* Mustn't be scanning already */
965 	Assert(state->scanbufhandle == NULL);
966 	Assert(state->buffer_stack == NULL);
967 
968 	/* Do we need to hack the character set encoding? */
969 	state->encoding = encoding;
970 	state->safe_encoding = pg_valid_server_encoding_id(encoding);
971 
972 	/* Save standard-strings flag as well */
973 	state->std_strings = std_strings;
974 
975 	/* Set up flex input buffer with appropriate translation and padding */
976 	state->scanbufhandle = psqlscan_prepare_buffer(state, line, line_len,
977 												   &state->scanbuf);
978 	state->scanline = line;
979 
980 	/* Set lookaside data in case we have to map unsafe encoding */
981 	state->curline = state->scanbuf;
982 	state->refline = state->scanline;
983 }
984 
985 /*
986  * Do lexical analysis of SQL command text.
987  *
988  * The text previously passed to psql_scan_setup is scanned, and appended
989  * (possibly with transformation) to query_buf.
990  *
991  * The return value indicates the condition that stopped scanning:
992  *
993  * PSCAN_SEMICOLON: found a command-ending semicolon.  (The semicolon is
994  * transferred to query_buf.)  The command accumulated in query_buf should
995  * be executed, then clear query_buf and call again to scan the remainder
996  * of the line.
997  *
998  * PSCAN_BACKSLASH: found a backslash that starts a special command.
999  * Any previous data on the line has been transferred to query_buf.
1000  * The caller will typically next apply a separate flex lexer to scan
1001  * the special command.
1002  *
1003  * PSCAN_INCOMPLETE: the end of the line was reached, but we have an
1004  * incomplete SQL command.  *prompt is set to the appropriate prompt type.
1005  *
1006  * PSCAN_EOL: the end of the line was reached, and there is no lexical
1007  * reason to consider the command incomplete.  The caller may or may not
1008  * choose to send it.  *prompt is set to the appropriate prompt type if
1009  * the caller chooses to collect more input.
1010  *
1011  * In the PSCAN_INCOMPLETE and PSCAN_EOL cases, psql_scan_finish() should
1012  * be called next, then the cycle may be repeated with a fresh input line.
1013  *
1014  * In all cases, *prompt is set to an appropriate prompt type code for the
1015  * next line-input operation.
1016  */
1017 PsqlScanResult
1018 psql_scan(PsqlScanState state,
1019 		  PQExpBuffer query_buf,
1020 		  promptStatus_t *prompt)
1021 {
1022 	PsqlScanResult result;
1023 	int			lexresult;
1024 
1025 	/* Must be scanning already */
1026 	Assert(state->scanbufhandle != NULL);
1027 
1028 	/* Set current output target */
1029 	state->output_buf = query_buf;
1030 
1031 	/* Set input source */
1032 	if (state->buffer_stack != NULL)
1033 		yy_switch_to_buffer(state->buffer_stack->buf, state->scanner);
1034 	else
1035 		yy_switch_to_buffer(state->scanbufhandle, state->scanner);
1036 
1037 	/* And lex. */
1038 	lexresult = yylex(NULL, state->scanner);
1039 
1040 	/*
1041 	 * Check termination state and return appropriate result info.
1042 	 */
1043 	switch (lexresult)
1044 	{
1045 		case LEXRES_EOL:		/* end of input */
1046 			switch (state->start_state)
1047 			{
1048 				case INITIAL:
1049 				case xuiend:	/* we treat these like INITIAL */
1050 				case xusend:
1051 					if (state->paren_depth > 0)
1052 					{
1053 						result = PSCAN_INCOMPLETE;
1054 						*prompt = PROMPT_PAREN;
1055 					}
1056 					else if (query_buf->len > 0)
1057 					{
1058 						result = PSCAN_EOL;
1059 						*prompt = PROMPT_CONTINUE;
1060 					}
1061 					else
1062 					{
1063 						/* never bother to send an empty buffer */
1064 						result = PSCAN_INCOMPLETE;
1065 						*prompt = PROMPT_READY;
1066 					}
1067 					break;
1068 				case xb:
1069 					result = PSCAN_INCOMPLETE;
1070 					*prompt = PROMPT_SINGLEQUOTE;
1071 					break;
1072 				case xc:
1073 					result = PSCAN_INCOMPLETE;
1074 					*prompt = PROMPT_COMMENT;
1075 					break;
1076 				case xd:
1077 					result = PSCAN_INCOMPLETE;
1078 					*prompt = PROMPT_DOUBLEQUOTE;
1079 					break;
1080 				case xh:
1081 					result = PSCAN_INCOMPLETE;
1082 					*prompt = PROMPT_SINGLEQUOTE;
1083 					break;
1084 				case xe:
1085 					result = PSCAN_INCOMPLETE;
1086 					*prompt = PROMPT_SINGLEQUOTE;
1087 					break;
1088 				case xq:
1089 					result = PSCAN_INCOMPLETE;
1090 					*prompt = PROMPT_SINGLEQUOTE;
1091 					break;
1092 				case xdolq:
1093 					result = PSCAN_INCOMPLETE;
1094 					*prompt = PROMPT_DOLLARQUOTE;
1095 					break;
1096 				case xui:
1097 					result = PSCAN_INCOMPLETE;
1098 					*prompt = PROMPT_DOUBLEQUOTE;
1099 					break;
1100 				case xus:
1101 					result = PSCAN_INCOMPLETE;
1102 					*prompt = PROMPT_SINGLEQUOTE;
1103 					break;
1104 				default:
1105 					/* can't get here */
1106 					fprintf(stderr, "invalid YY_START\n");
1107 					exit(1);
1108 			}
1109 			break;
1110 		case LEXRES_SEMI:		/* semicolon */
1111 			result = PSCAN_SEMICOLON;
1112 			*prompt = PROMPT_READY;
1113 			break;
1114 		case LEXRES_BACKSLASH:	/* backslash */
1115 			result = PSCAN_BACKSLASH;
1116 			*prompt = PROMPT_READY;
1117 			break;
1118 		default:
1119 			/* can't get here */
1120 			fprintf(stderr, "invalid yylex result\n");
1121 			exit(1);
1122 	}
1123 
1124 	return result;
1125 }
1126 
1127 /*
1128  * Clean up after scanning a string.  This flushes any unread input and
1129  * releases resources (but not the PsqlScanState itself).  Note however
1130  * that this does not reset the lexer scan state; that can be done by
1131  * psql_scan_reset(), which is an orthogonal operation.
1132  *
1133  * It is legal to call this when not scanning anything (makes it easier
1134  * to deal with error recovery).
1135  */
1136 void
1137 psql_scan_finish(PsqlScanState state)
1138 {
1139 	/* Drop any incomplete variable expansions. */
1140 	while (state->buffer_stack != NULL)
1141 		psqlscan_pop_buffer_stack(state);
1142 
1143 	/* Done with the outer scan buffer, too */
1144 	if (state->scanbufhandle)
1145 		yy_delete_buffer(state->scanbufhandle, state->scanner);
1146 	state->scanbufhandle = NULL;
1147 	if (state->scanbuf)
1148 		free(state->scanbuf);
1149 	state->scanbuf = NULL;
1150 }
1151 
1152 /*
1153  * Reset lexer scanning state to start conditions.  This is appropriate
1154  * for executing \r psql commands (or any other time that we discard the
1155  * prior contents of query_buf).  It is not, however, necessary to do this
1156  * when we execute and clear the buffer after getting a PSCAN_SEMICOLON or
1157  * PSCAN_EOL scan result, because the scan state must be INITIAL when those
1158  * conditions are returned.
1159  *
1160  * Note that this is unrelated to flushing unread input; that task is
1161  * done by psql_scan_finish().
1162  */
1163 void
1164 psql_scan_reset(PsqlScanState state)
1165 {
1166 	state->start_state = INITIAL;
1167 	state->paren_depth = 0;
1168 	state->xcdepth = 0;			/* not really necessary */
1169 	if (state->dolqstart)
1170 		free(state->dolqstart);
1171 	state->dolqstart = NULL;
1172 }
1173 
/*
 * Reselect this lexer (psqlscan.l) after using another one.
 *
 * Currently and for foreseeable uses, it's sufficient to reset to INITIAL
 * state, because we'd never switch to another lexer in a different state.
 * However, we don't want to reset e.g. paren_depth, so this can't be
 * the same as psql_scan_reset().
 *
 * Note: psql setjmp error recovery just calls psql_scan_reset(), so that
 * must be a superset of this.
 *
 * Note: it seems likely that other lexers could just assign INITIAL for
 * themselves, since that probably has the value zero in every flex-generated
 * lexer.  But let's not assume that.
 */
void
psql_scan_reselect_sql_lexer(PsqlScanState state)
{
	/* Only the start state is changed; all other fields are left alone */
	state->start_state = INITIAL;
}
1194 
/*
 * Return true if lexer is currently in an "inside quotes" state.
 *
 * This is pretty grotty but is needed to preserve the old behavior
 * that mainloop.c drops blank lines not inside quotes without even
 * echoing them.
 *
 * Note that the test is simply "start state is not INITIAL", so every
 * other start state counts as "inside quotes" here, including ones that
 * are not quote states as such (for instance xc, for which psql_scan()
 * reports PROMPT_COMMENT).
 */
bool
psql_scan_in_quote(PsqlScanState state)
{
	/* anything other than INITIAL is treated as "in quote" */
	return state->start_state != INITIAL;
}
1207 
1208 /*
1209  * Push the given string onto the stack of stuff to scan.
1210  *
1211  * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer.
1212  */
1213 void
1214 psqlscan_push_new_buffer(PsqlScanState state, const char *newstr,
1215 						 const char *varname)
1216 {
1217 	StackElem  *stackelem;
1218 
1219 	stackelem = (StackElem *) pg_malloc(sizeof(StackElem));
1220 
1221 	/*
1222 	 * In current usage, the passed varname points at the current flex input
1223 	 * buffer; we must copy it before calling psqlscan_prepare_buffer()
1224 	 * because that will change the buffer state.
1225 	 */
1226 	stackelem->varname = varname ? pg_strdup(varname) : NULL;
1227 
1228 	stackelem->buf = psqlscan_prepare_buffer(state, newstr, strlen(newstr),
1229 											 &stackelem->bufstring);
1230 	state->curline = stackelem->bufstring;
1231 	if (state->safe_encoding)
1232 	{
1233 		stackelem->origstring = NULL;
1234 		state->refline = stackelem->bufstring;
1235 	}
1236 	else
1237 	{
1238 		stackelem->origstring = pg_strdup(newstr);
1239 		state->refline = stackelem->origstring;
1240 	}
1241 	stackelem->next = state->buffer_stack;
1242 	state->buffer_stack = stackelem;
1243 }
1244 
1245 /*
1246  * Pop the topmost buffer stack item (there must be one!)
1247  *
1248  * NB: after this, the flex input state is unspecified; caller must
1249  * switch to an appropriate buffer to continue lexing.
1250  * See psqlscan_select_top_buffer().
1251  */
1252 void
1253 psqlscan_pop_buffer_stack(PsqlScanState state)
1254 {
1255 	StackElem  *stackelem = state->buffer_stack;
1256 
1257 	state->buffer_stack = stackelem->next;
1258 	yy_delete_buffer(stackelem->buf, state->scanner);
1259 	free(stackelem->bufstring);
1260 	if (stackelem->origstring)
1261 		free(stackelem->origstring);
1262 	if (stackelem->varname)
1263 		free(stackelem->varname);
1264 	free(stackelem);
1265 }
1266 
1267 /*
1268  * Select the topmost surviving buffer as the active input.
1269  */
1270 void
1271 psqlscan_select_top_buffer(PsqlScanState state)
1272 {
1273 	StackElem  *stackelem = state->buffer_stack;
1274 
1275 	if (stackelem != NULL)
1276 	{
1277 		yy_switch_to_buffer(stackelem->buf, state->scanner);
1278 		state->curline = stackelem->bufstring;
1279 		state->refline = stackelem->origstring ? stackelem->origstring : stackelem->bufstring;
1280 	}
1281 	else
1282 	{
1283 		yy_switch_to_buffer(state->scanbufhandle, state->scanner);
1284 		state->curline = state->scanbuf;
1285 		state->refline = state->scanline;
1286 	}
1287 }
1288 
1289 /*
1290  * Check if specified variable name is the source for any string
1291  * currently being scanned
1292  */
1293 bool
1294 psqlscan_var_is_current_source(PsqlScanState state, const char *varname)
1295 {
1296 	StackElem  *stackelem;
1297 
1298 	for (stackelem = state->buffer_stack;
1299 		 stackelem != NULL;
1300 		 stackelem = stackelem->next)
1301 	{
1302 		if (stackelem->varname && strcmp(stackelem->varname, varname) == 0)
1303 			return true;
1304 	}
1305 	return false;
1306 }
1307 
1308 /*
1309  * Set up a flex input buffer to scan the given data.  We always make a
1310  * copy of the data.  If working in an unsafe encoding, the copy has
1311  * multibyte sequences replaced by FFs to avoid fooling the lexer rules.
1312  *
1313  * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer.
1314  */
1315 YY_BUFFER_STATE
1316 psqlscan_prepare_buffer(PsqlScanState state, const char *txt, int len,
1317 						char **txtcopy)
1318 {
1319 	char	   *newtxt;
1320 
1321 	/* Flex wants two \0 characters after the actual data */
1322 	newtxt = pg_malloc(len + 2);
1323 	*txtcopy = newtxt;
1324 	newtxt[len] = newtxt[len + 1] = YY_END_OF_BUFFER_CHAR;
1325 
1326 	if (state->safe_encoding)
1327 		memcpy(newtxt, txt, len);
1328 	else
1329 	{
1330 		/* Gotta do it the hard way */
1331 		int			i = 0;
1332 
1333 		while (i < len)
1334 		{
1335 			int			thislen = PQmblen(txt + i, state->encoding);
1336 
1337 			/* first byte should always be okay... */
1338 			newtxt[i] = txt[i];
1339 			i++;
1340 			while (--thislen > 0 && i < len)
1341 				newtxt[i++] = (char) 0xFF;
1342 		}
1343 	}
1344 
1345 	return yy_scan_buffer(newtxt, len + 2, state->scanner);
1346 }
1347 
1348 /*
1349  * psqlscan_emit() --- body for ECHO macro
1350  *
1351  * NB: this must be used for ALL and ONLY the text copied from the flex
1352  * input data.  If you pass it something that is not part of the yytext
1353  * string, you are making a mistake.  Internally generated text can be
1354  * appended directly to state->output_buf.
1355  */
1356 void
1357 psqlscan_emit(PsqlScanState state, const char *txt, int len)
1358 {
1359 	PQExpBuffer output_buf = state->output_buf;
1360 
1361 	if (state->safe_encoding)
1362 		appendBinaryPQExpBuffer(output_buf, txt, len);
1363 	else
1364 	{
1365 		/* Gotta do it the hard way */
1366 		const char *reference = state->refline;
1367 		int			i;
1368 
1369 		reference += (txt - state->curline);
1370 
1371 		for (i = 0; i < len; i++)
1372 		{
1373 			char		ch = txt[i];
1374 
1375 			if (ch == (char) 0xFF)
1376 				ch = reference[i];
1377 			appendPQExpBufferChar(output_buf, ch);
1378 		}
1379 	}
1380 }
1381 
1382 /*
1383  * psqlscan_extract_substring --- fetch value of (part of) the current token
1384  *
1385  * This is like psqlscan_emit(), except that the data is returned as a
1386  * malloc'd string rather than being pushed directly to state->output_buf.
1387  */
1388 char *
1389 psqlscan_extract_substring(PsqlScanState state, const char *txt, int len)
1390 {
1391 	char	   *result = (char *) pg_malloc(len + 1);
1392 
1393 	if (state->safe_encoding)
1394 		memcpy(result, txt, len);
1395 	else
1396 	{
1397 		/* Gotta do it the hard way */
1398 		const char *reference = state->refline;
1399 		int			i;
1400 
1401 		reference += (txt - state->curline);
1402 
1403 		for (i = 0; i < len; i++)
1404 		{
1405 			char		ch = txt[i];
1406 
1407 			if (ch == (char) 0xFF)
1408 				ch = reference[i];
1409 			result[i] = ch;
1410 		}
1411 	}
1412 	result[len] = '\0';
1413 	return result;
1414 }
1415 
1416 /*
1417  * psqlscan_escape_variable --- process :'VARIABLE' or :"VARIABLE"
1418  *
1419  * If the variable name is found, escape its value using the appropriate
1420  * quoting method and emit the value to output_buf.  (Since the result is
1421  * surely quoted, there is never any reason to rescan it.)	If we don't
1422  * find the variable or escaping fails, emit the token as-is.
1423  */
1424 void
1425 psqlscan_escape_variable(PsqlScanState state, const char *txt, int len,
1426 						 bool as_ident)
1427 {
1428 	char	   *varname;
1429 	char	   *value;
1430 
1431 	/* Variable lookup. */
1432 	varname = psqlscan_extract_substring(state, txt + 2, len - 3);
1433 	if (state->callbacks->get_variable)
1434 		value = state->callbacks->get_variable(varname, true, as_ident);
1435 	else
1436 		value = NULL;
1437 	free(varname);
1438 
1439 	if (value)
1440 	{
1441 		/* Emit the suitably-escaped value */
1442 		appendPQExpBufferStr(state->output_buf, value);
1443 		free(value);
1444 	}
1445 	else
1446 	{
1447 		/* Emit original token as-is */
1448 		psqlscan_emit(state, txt, len);
1449 	}
1450 }
1451