1 /*-------------------------------------------------------------------------
2 *
3 * parser.c
4 * Main entry point/driver for PostgreSQL grammar
5 *
6 * This should match src/backend/parser/parser.c, except that we do not
7 * need to bother with re-entrant interfaces.
8 *
9 * Note: ECPG doesn't report error location like the backend does.
10 * This file will need work if we ever want it to.
11 *
12 *
13 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
15 *
16 * IDENTIFICATION
17 * src/interfaces/ecpg/preproc/parser.c
18 *
19 *-------------------------------------------------------------------------
20 */
21
22 #include "postgres_fe.h"
23
24 #include "preproc_extern.h"
25 #include "preproc.h"
26
27
/*
 * One-token lookahead buffer used by filtered_base_yylex().  The
 * lookahead_* variables are meaningful only while have_lookahead is true;
 * they hold the lexer outputs (token code, base_yylval, base_yylloc,
 * base_yytext) captured for a token that has been read from base_yylex()
 * but not yet returned to the parser.
 */
static bool have_lookahead; /* is lookahead info valid? */
static int lookahead_token; /* one-token lookahead */
static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
static char *lookahead_yytext; /* yytext for lookahead token */

/* Helpers for validating the UESCAPE character; defined later in this file */
static bool check_uescapechar(unsigned char escape);
static bool ecpg_isspace(char ch);
36
37
38 /*
39 * Intermediate filter between parser and base lexer (base_yylex in scan.l).
40 *
41 * This filter is needed because in some cases the standard SQL grammar
42 * requires more than one token lookahead. We reduce these cases to one-token
43 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
44 *
45 * Using a filter is simpler than trying to recognize multiword tokens
46 * directly in scan.l, because we'd have to allow for comments between the
47 * words. Furthermore it's not clear how to do that without re-introducing
48 * scanner backtrack, which would cost more performance than this filter
49 * layer does.
50 *
51 * We also use this filter to convert UIDENT and USCONST sequences into
52 * plain IDENT and SCONST tokens. While that could be handled by additional
53 * productions in the main grammar, it's more efficient to do it like this.
54 */
55 int
filtered_base_yylex(void)56 filtered_base_yylex(void)
57 {
58 int cur_token;
59 int next_token;
60 YYSTYPE cur_yylval;
61 YYLTYPE cur_yylloc;
62 char *cur_yytext;
63
64 /* Get next token --- we might already have it */
65 if (have_lookahead)
66 {
67 cur_token = lookahead_token;
68 base_yylval = lookahead_yylval;
69 base_yylloc = lookahead_yylloc;
70 base_yytext = lookahead_yytext;
71 have_lookahead = false;
72 }
73 else
74 cur_token = base_yylex();
75
76 /*
77 * If this token isn't one that requires lookahead, just return it.
78 */
79 switch (cur_token)
80 {
81 case NOT:
82 case NULLS_P:
83 case WITH:
84 case UIDENT:
85 case USCONST:
86 break;
87 default:
88 return cur_token;
89 }
90
91 /* Save and restore lexer output variables around the call */
92 cur_yylval = base_yylval;
93 cur_yylloc = base_yylloc;
94 cur_yytext = base_yytext;
95
96 /* Get next token, saving outputs into lookahead variables */
97 next_token = base_yylex();
98
99 lookahead_token = next_token;
100 lookahead_yylval = base_yylval;
101 lookahead_yylloc = base_yylloc;
102 lookahead_yytext = base_yytext;
103
104 base_yylval = cur_yylval;
105 base_yylloc = cur_yylloc;
106 base_yytext = cur_yytext;
107
108 have_lookahead = true;
109
110 /* Replace cur_token if needed, based on lookahead */
111 switch (cur_token)
112 {
113 case NOT:
114 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
115 switch (next_token)
116 {
117 case BETWEEN:
118 case IN_P:
119 case LIKE:
120 case ILIKE:
121 case SIMILAR:
122 cur_token = NOT_LA;
123 break;
124 }
125 break;
126
127 case NULLS_P:
128 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
129 switch (next_token)
130 {
131 case FIRST_P:
132 case LAST_P:
133 cur_token = NULLS_LA;
134 break;
135 }
136 break;
137
138 case WITH:
139 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
140 switch (next_token)
141 {
142 case TIME:
143 case ORDINALITY:
144 cur_token = WITH_LA;
145 break;
146 }
147 break;
148 case UIDENT:
149 case USCONST:
150 /* Look ahead for UESCAPE */
151 if (next_token == UESCAPE)
152 {
153 /* Yup, so get third token, which had better be SCONST */
154 const char *escstr;
155
156 /*
157 * Again save and restore lexer output variables around the
158 * call
159 */
160 cur_yylval = base_yylval;
161 cur_yylloc = base_yylloc;
162 cur_yytext = base_yytext;
163
164 /* Get third token */
165 next_token = base_yylex();
166
167 if (next_token != SCONST)
168 mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
169
170 /*
171 * Save and check escape string, which the scanner returns
172 * with quotes
173 */
174 escstr = base_yylval.str;
175 if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
176 mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
177
178 base_yylval = cur_yylval;
179 base_yylloc = cur_yylloc;
180 base_yytext = cur_yytext;
181
182 /* Combine 3 tokens into 1 */
183 base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
184
185 /* Clear have_lookahead, thereby consuming all three tokens */
186 have_lookahead = false;
187 }
188
189 if (cur_token == UIDENT)
190 cur_token = IDENT;
191 else if (cur_token == USCONST)
192 cur_token = SCONST;
193 break;
194 }
195
196 return cur_token;
197 }
198
199 /*
200 * check_uescapechar() and ecpg_isspace() should match their equivalents
201 * in pgc.l.
202 */
203
/*
 * check_uescapechar() --- may 'escape' be used as the Unicode escape
 * character in a UESCAPE clause?
 *
 * Hexadecimal digits, '+', single quote, double quote, and anything
 * ecpg_isspace() treats as whitespace are rejected (false); every other
 * character is accepted (true).
 */
static bool
check_uescapechar(unsigned char escape)
{
	return !(isxdigit(escape) ||
			 escape == '+' ||
			 escape == '\'' ||
			 escape == '"' ||
			 ecpg_isspace(escape));
}
217
/*
 * ecpg_isspace() --- report whether 'ch' is one of the characters the flex
 * scanner is meant to treat as whitespace: space, tab, newline, carriage
 * return, or form feed.
 */
static bool
ecpg_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
232