1 /*-------------------------------------------------------------------------
2 *
3 * tsvector_parser.c
4 * Parser for tsvector
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_parser.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_utils.h"
19
20
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * 'word' is a scratch buffer owned by the parser: it is allocated by
 * init_tsvector_parser, enlarged on demand by RESIZEPRSBUF, and released by
 * close_tsvector_parser.  Tokens returned by gettoken_tsvector point into it.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! & | ( ) < as delimiters? (the set
								 * tested by ISOPERATOR) */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
37
38
39 /*
40 * Initializes parser for the input string. If oprisdelim is set, the
41 * following characters are treated as delimiters in addition to whitespace:
42 * ! | & ( )
43 */
44 TSVectorParseState
init_tsvector_parser(char * input,bool oprisdelim,bool is_tsquery)45 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
46 {
47 TSVectorParseState state;
48
49 state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
50 state->prsbuf = input;
51 state->bufstart = input;
52 state->len = 32;
53 state->word = (char *) palloc(state->len);
54 state->eml = pg_database_encoding_max_length();
55 state->oprisdelim = oprisdelim;
56 state->is_tsquery = is_tsquery;
57
58 return state;
59 }
60
61 /*
62 * Reinitializes parser to parse 'input', instead of previous input.
63 */
64 void
reset_tsvector_parser(TSVectorParseState state,char * input)65 reset_tsvector_parser(TSVectorParseState state, char *input)
66 {
67 state->prsbuf = input;
68 }
69
70 /*
71 * Shuts down a tsvector parser.
72 */
73 void
close_tsvector_parser(TSVectorParseState state)74 close_tsvector_parser(TSVectorParseState state)
75 {
76 pfree(state->word);
77 pfree(state);
78 }
79
/*
 * Make sure 'word' can take one more character (at most state->eml bytes)
 * and still leave a byte to spare; if not, double the buffer.
 *
 * Expects 'state' and 'curpos' to be in scope.  Because repalloc may move
 * the buffer, curpos is re-derived from its offset after enlargement.
 */
#define RESIZEPRSBUF \
do { \
	int			used = curpos - state->word; \
	\
	if (state->len <= used + state->eml) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = &state->word[used]; \
	} \
} while (0)
91
/*
 * True if the character at x is a single-byte operator character:
 * one of ! & | ( ) or '<' (the phrase operator begins with '<').
 *
 * Note that x is evaluated more than once.
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && \
	  ( (x)[0] == '!' || (x)[0] == '&' || (x)[0] == '|' || \
		(x)[0] == '(' || (x)[0] == ')' || (x)[0] == '<' ) )
101
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expects the function's parameters (state, strval, lenval, pos_ptr, poslen,
 * endptr) and its locals (curpos, pos, npos) to be in scope.
 *
 * The positions array is handed over to the caller when pos_ptr is given;
 * otherwise it is freed here, since nobody else would.  Every output
 * pointer, including poslen, is checked individually so that a caller may
 * pass NULL for any output it does not want.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
121
122
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* skipping whitespace before a token */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* previous character was a backslash */
	WAITENDCMPLX = 4,			/* inside a single-quoted word */
	WAITPOSINFO = 5,			/* quoted word ended; is there a ':' ? */
	INPOSINFO = 6,				/* expecting the first digit of a position */
	WAITPOSDELIM = 7,			/* after a position: ',' weight or end */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};

/* Report a syntax error for the parser 'state' in scope; does not return */
#define PRSSYNTAXERROR prssyntaxerror(state)
134
135 static void
prssyntaxerror(TSVectorParseState state)136 prssyntaxerror(TSVectorParseState state)
137 {
138 ereport(ERROR,
139 (errcode(ERRCODE_SYNTAX_ERROR),
140 state->is_tsquery ?
141 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
142 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
143 }
144
145
146 /*
147 * Get next token from string being parsed. Returns true if successful,
148 * false if end of input string is reached. On success, these output
149 * parameters are filled in:
150 *
151 * *strval pointer to token
152 * *lenval length of *strval
153 * *pos_ptr pointer to a palloc'd array of positions and weights
154 * associated with the token. If the caller is not interested
155 * in the information, NULL can be supplied. Otherwise
156 * the caller is responsible for pfreeing the array.
157 * *poslen number of elements in *pos_ptr
158 * *endptr scan resumption point
159 *
160 * Pass NULL for unwanted output parameters.
161 */
162 bool
gettoken_tsvector(TSVectorParseState state,char ** strval,int * lenval,WordEntryPos ** pos_ptr,int * poslen,char ** endptr)163 gettoken_tsvector(TSVectorParseState state,
164 char **strval, int *lenval,
165 WordEntryPos **pos_ptr, int *poslen,
166 char **endptr)
167 {
168 int oldstate = 0;
169 char *curpos = state->word;
170 int statecode = WAITWORD;
171
172 /*
173 * pos is for collecting the comma delimited list of positions followed by
174 * the actual token.
175 */
176 WordEntryPos *pos = NULL;
177 int npos = 0; /* elements of pos used */
178 int posalen = 0; /* allocated size of pos */
179
180 while (1)
181 {
182 if (statecode == WAITWORD)
183 {
184 if (*(state->prsbuf) == '\0')
185 return false;
186 else if (t_iseq(state->prsbuf, '\''))
187 statecode = WAITENDCMPLX;
188 else if (t_iseq(state->prsbuf, '\\'))
189 {
190 statecode = WAITNEXTCHAR;
191 oldstate = WAITENDWORD;
192 }
193 else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
194 PRSSYNTAXERROR;
195 else if (!t_isspace(state->prsbuf))
196 {
197 COPYCHAR(curpos, state->prsbuf);
198 curpos += pg_mblen(state->prsbuf);
199 statecode = WAITENDWORD;
200 }
201 }
202 else if (statecode == WAITNEXTCHAR)
203 {
204 if (*(state->prsbuf) == '\0')
205 ereport(ERROR,
206 (errcode(ERRCODE_SYNTAX_ERROR),
207 errmsg("there is no escaped character: \"%s\"",
208 state->bufstart)));
209 else
210 {
211 RESIZEPRSBUF;
212 COPYCHAR(curpos, state->prsbuf);
213 curpos += pg_mblen(state->prsbuf);
214 Assert(oldstate != 0);
215 statecode = oldstate;
216 }
217 }
218 else if (statecode == WAITENDWORD)
219 {
220 if (t_iseq(state->prsbuf, '\\'))
221 {
222 statecode = WAITNEXTCHAR;
223 oldstate = WAITENDWORD;
224 }
225 else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
226 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
227 {
228 RESIZEPRSBUF;
229 if (curpos == state->word)
230 PRSSYNTAXERROR;
231 *(curpos) = '\0';
232 RETURN_TOKEN;
233 }
234 else if (t_iseq(state->prsbuf, ':'))
235 {
236 if (curpos == state->word)
237 PRSSYNTAXERROR;
238 *(curpos) = '\0';
239 if (state->oprisdelim)
240 RETURN_TOKEN;
241 else
242 statecode = INPOSINFO;
243 }
244 else
245 {
246 RESIZEPRSBUF;
247 COPYCHAR(curpos, state->prsbuf);
248 curpos += pg_mblen(state->prsbuf);
249 }
250 }
251 else if (statecode == WAITENDCMPLX)
252 {
253 if (t_iseq(state->prsbuf, '\''))
254 {
255 statecode = WAITCHARCMPLX;
256 }
257 else if (t_iseq(state->prsbuf, '\\'))
258 {
259 statecode = WAITNEXTCHAR;
260 oldstate = WAITENDCMPLX;
261 }
262 else if (*(state->prsbuf) == '\0')
263 PRSSYNTAXERROR;
264 else
265 {
266 RESIZEPRSBUF;
267 COPYCHAR(curpos, state->prsbuf);
268 curpos += pg_mblen(state->prsbuf);
269 }
270 }
271 else if (statecode == WAITCHARCMPLX)
272 {
273 if (t_iseq(state->prsbuf, '\''))
274 {
275 RESIZEPRSBUF;
276 COPYCHAR(curpos, state->prsbuf);
277 curpos += pg_mblen(state->prsbuf);
278 statecode = WAITENDCMPLX;
279 }
280 else
281 {
282 RESIZEPRSBUF;
283 *(curpos) = '\0';
284 if (curpos == state->word)
285 PRSSYNTAXERROR;
286 if (state->oprisdelim)
287 {
288 /* state->prsbuf+=pg_mblen(state->prsbuf); */
289 RETURN_TOKEN;
290 }
291 else
292 statecode = WAITPOSINFO;
293 continue; /* recheck current character */
294 }
295 }
296 else if (statecode == WAITPOSINFO)
297 {
298 if (t_iseq(state->prsbuf, ':'))
299 statecode = INPOSINFO;
300 else
301 RETURN_TOKEN;
302 }
303 else if (statecode == INPOSINFO)
304 {
305 if (t_isdigit(state->prsbuf))
306 {
307 if (posalen == 0)
308 {
309 posalen = 4;
310 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
311 npos = 0;
312 }
313 else if (npos + 1 >= posalen)
314 {
315 posalen *= 2;
316 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
317 }
318 npos++;
319 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
320 /* we cannot get here in tsquery, so no need for 2 errmsgs */
321 if (WEP_GETPOS(pos[npos - 1]) == 0)
322 ereport(ERROR,
323 (errcode(ERRCODE_SYNTAX_ERROR),
324 errmsg("wrong position info in tsvector: \"%s\"",
325 state->bufstart)));
326 WEP_SETWEIGHT(pos[npos - 1], 0);
327 statecode = WAITPOSDELIM;
328 }
329 else
330 PRSSYNTAXERROR;
331 }
332 else if (statecode == WAITPOSDELIM)
333 {
334 if (t_iseq(state->prsbuf, ','))
335 statecode = INPOSINFO;
336 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
337 {
338 if (WEP_GETWEIGHT(pos[npos - 1]))
339 PRSSYNTAXERROR;
340 WEP_SETWEIGHT(pos[npos - 1], 3);
341 }
342 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
343 {
344 if (WEP_GETWEIGHT(pos[npos - 1]))
345 PRSSYNTAXERROR;
346 WEP_SETWEIGHT(pos[npos - 1], 2);
347 }
348 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
349 {
350 if (WEP_GETWEIGHT(pos[npos - 1]))
351 PRSSYNTAXERROR;
352 WEP_SETWEIGHT(pos[npos - 1], 1);
353 }
354 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
355 {
356 if (WEP_GETWEIGHT(pos[npos - 1]))
357 PRSSYNTAXERROR;
358 WEP_SETWEIGHT(pos[npos - 1], 0);
359 }
360 else if (t_isspace(state->prsbuf) ||
361 *(state->prsbuf) == '\0')
362 RETURN_TOKEN;
363 else if (!t_isdigit(state->prsbuf))
364 PRSSYNTAXERROR;
365 }
366 else /* internal error */
367 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
368 statecode);
369
370 /* get next char */
371 state->prsbuf += pg_mblen(state->prsbuf);
372 }
373 }
374