/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * This should match src/backend/parser/parser.c, except that we do not
 * need to bother with re-entrant interfaces.
 *
 * Note: ECPG doesn't report error location like the backend does.
 * This file will need work if we ever want it to.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/interfaces/ecpg/preproc/parser.c
 *
 *-------------------------------------------------------------------------
 */
21 
22 #include "postgres_fe.h"
23 
24 #include "preproc_extern.h"
25 #include "preproc.h"
26 
27 
28 static bool have_lookahead;		/* is lookahead info valid? */
29 static int	lookahead_token;	/* one-token lookahead */
30 static YYSTYPE lookahead_yylval;	/* yylval for lookahead token */
31 static YYLTYPE lookahead_yylloc;	/* yylloc for lookahead token */
32 static char *lookahead_yytext;	/* start current token */
33 
34 static bool check_uescapechar(unsigned char escape);
35 static bool ecpg_isspace(char ch);
36 
37 
38 /*
39  * Intermediate filter between parser and base lexer (base_yylex in scan.l).
40  *
41  * This filter is needed because in some cases the standard SQL grammar
42  * requires more than one token lookahead.  We reduce these cases to one-token
43  * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
44  *
45  * Using a filter is simpler than trying to recognize multiword tokens
46  * directly in scan.l, because we'd have to allow for comments between the
47  * words.  Furthermore it's not clear how to do that without re-introducing
48  * scanner backtrack, which would cost more performance than this filter
49  * layer does.
50  *
51  * We also use this filter to convert UIDENT and USCONST sequences into
52  * plain IDENT and SCONST tokens.  While that could be handled by additional
53  * productions in the main grammar, it's more efficient to do it like this.
54  */
55 int
filtered_base_yylex(void)56 filtered_base_yylex(void)
57 {
58 	int			cur_token;
59 	int			next_token;
60 	YYSTYPE		cur_yylval;
61 	YYLTYPE		cur_yylloc;
62 	char	   *cur_yytext;
63 
64 	/* Get next token --- we might already have it */
65 	if (have_lookahead)
66 	{
67 		cur_token = lookahead_token;
68 		base_yylval = lookahead_yylval;
69 		base_yylloc = lookahead_yylloc;
70 		base_yytext = lookahead_yytext;
71 		have_lookahead = false;
72 	}
73 	else
74 		cur_token = base_yylex();
75 
76 	/*
77 	 * If this token isn't one that requires lookahead, just return it.
78 	 */
79 	switch (cur_token)
80 	{
81 		case NOT:
82 		case NULLS_P:
83 		case WITH:
84 		case UIDENT:
85 		case USCONST:
86 			break;
87 		default:
88 			return cur_token;
89 	}
90 
91 	/* Save and restore lexer output variables around the call */
92 	cur_yylval = base_yylval;
93 	cur_yylloc = base_yylloc;
94 	cur_yytext = base_yytext;
95 
96 	/* Get next token, saving outputs into lookahead variables */
97 	next_token = base_yylex();
98 
99 	lookahead_token = next_token;
100 	lookahead_yylval = base_yylval;
101 	lookahead_yylloc = base_yylloc;
102 	lookahead_yytext = base_yytext;
103 
104 	base_yylval = cur_yylval;
105 	base_yylloc = cur_yylloc;
106 	base_yytext = cur_yytext;
107 
108 	have_lookahead = true;
109 
110 	/* Replace cur_token if needed, based on lookahead */
111 	switch (cur_token)
112 	{
113 		case NOT:
114 			/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
115 			switch (next_token)
116 			{
117 				case BETWEEN:
118 				case IN_P:
119 				case LIKE:
120 				case ILIKE:
121 				case SIMILAR:
122 					cur_token = NOT_LA;
123 					break;
124 			}
125 			break;
126 
127 		case NULLS_P:
128 			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
129 			switch (next_token)
130 			{
131 				case FIRST_P:
132 				case LAST_P:
133 					cur_token = NULLS_LA;
134 					break;
135 			}
136 			break;
137 
138 		case WITH:
139 			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
140 			switch (next_token)
141 			{
142 				case TIME:
143 				case ORDINALITY:
144 					cur_token = WITH_LA;
145 					break;
146 			}
147 			break;
148 		case UIDENT:
149 		case USCONST:
150 			/* Look ahead for UESCAPE */
151 			if (next_token == UESCAPE)
152 			{
153 				/* Yup, so get third token, which had better be SCONST */
154 				const char *escstr;
155 
156 				/*
157 				 * Again save and restore lexer output variables around the
158 				 * call
159 				 */
160 				cur_yylval = base_yylval;
161 				cur_yylloc = base_yylloc;
162 				cur_yytext = base_yytext;
163 
164 				/* Get third token */
165 				next_token = base_yylex();
166 
167 				if (next_token != SCONST)
168 					mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
169 
170 				/*
171 				 * Save and check escape string, which the scanner returns
172 				 * with quotes
173 				 */
174 				escstr = base_yylval.str;
175 				if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
176 					mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
177 
178 				base_yylval = cur_yylval;
179 				base_yylloc = cur_yylloc;
180 				base_yytext = cur_yytext;
181 
182 				/* Combine 3 tokens into 1 */
183 				base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
184 
185 				/* Clear have_lookahead, thereby consuming all three tokens */
186 				have_lookahead = false;
187 			}
188 
189 			if (cur_token == UIDENT)
190 				cur_token = IDENT;
191 			else if (cur_token == USCONST)
192 				cur_token = SCONST;
193 			break;
194 	}
195 
196 	return cur_token;
197 }
198 
/*
 * check_uescapechar() and ecpg_isspace() should match their equivalents
 * in pgc.l.
 */
203 
/*
 * check_uescapechar() --- is 'escape' acceptable as Unicode escape character
 * (UESCAPE syntax)?
 *
 * Returns false if 'escape' is a hexadecimal digit, '+', a single quote,
 * a double quote, or a character that ecpg_isspace() reports as whitespace.
 * Returns true for every other character.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape)
		|| escape == '+'
		|| escape == '\''
		|| escape == '"'
		|| ecpg_isspace(escape))
		return false;
	else
		return true;
}
217 
/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * The characters accepted here are space, tab, newline, carriage return
 * and form feed.  Anything else, including vertical tab, yields false.
 */
static bool
ecpg_isspace(char ch)
{
	if (ch == ' ' ||
		ch == '\t' ||
		ch == '\n' ||
		ch == '\r' ||
		ch == '\f')
		return true;
	return false;
}
232