1 /*-------------------------------------------------------------------------
2  *
3  * tsvector_parser.c
4  *	  Parser for tsvector
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsvector_parser.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
#include "postgres.h"

#include <limits.h>

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
19 
20 
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  Two of the flags are said to be
 * both true or both false in current usage (NOTE(review): presumably
 * oprisdelim and is_tsquery -- confirm against the callers), but they are
 * kept separate for clarity.
 *
 * is_tsquery affects *only* the content of error messages.
 *
 * is_web changes the tokenizing rules in gettoken_tsvector: single quotes and
 * backslashes lose their special meaning, and a double quote acts as a word
 * delimiter.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ISOPERATOR() chars as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
};
38 
39 
40 /*
41  * Initializes parser for the input string. If oprisdelim is set, the
42  * following characters are treated as delimiters in addition to whitespace:
43  * ! | & ( )
44  */
45 TSVectorParseState
init_tsvector_parser(char * input,int flags)46 init_tsvector_parser(char *input, int flags)
47 {
48 	TSVectorParseState state;
49 
50 	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
51 	state->prsbuf = input;
52 	state->bufstart = input;
53 	state->len = 32;
54 	state->word = (char *) palloc(state->len);
55 	state->eml = pg_database_encoding_max_length();
56 	state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
57 	state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
58 	state->is_web = (flags & P_TSV_IS_WEB) != 0;
59 
60 	return state;
61 }
62 
/*
 * Repositions the parser so that the next gettoken_tsvector() call starts
 * scanning at 'input'.
 *
 * Only the scan cursor (prsbuf) is changed.  bufstart, the string quoted in
 * error messages, is deliberately left as set by init_tsvector_parser(), and
 * the word buffer is kept as is.
 */
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
	state->prsbuf = input;
}
71 
/*
 * Shuts down a tsvector parser.
 *
 * Frees the word buffer and then the state struct itself; 'state' must not
 * be used afterwards.  The input string is not owned by the parser and is
 * not freed here.
 */
void
close_tsvector_parser(TSVectorParseState state)
{
	pfree(state->word);
	pfree(state);
}
81 
/*
 * Make sure 'word' can take one more character (up to state->eml bytes) and
 * still leave a spare byte, doubling the buffer when it cannot.
 *
 * Expands in the scope of gettoken_tsvector: it reads and updates the local
 * write pointer 'curpos', which is re-based after the buffer is repalloc'd.
 */
#define RESIZEPRSBUF \
do { \
	int used = curpos - state->word; \
	if ( state->len <= used + state->eml ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used; \
	} \
} while (0)
93 
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expands in the scope of gettoken_tsvector and uses its parameters and the
 * locals 'curpos', 'pos' and 'npos'.  Every output pointer is optional and is
 * written only when it is non-NULL; in particular *poslen is checked on its
 * own, so a caller that passes pos_ptr but no poslen does not crash.
 *
 * When the caller did not ask for positions (pos_ptr == NULL), any position
 * array collected while parsing is freed here, since nobody else could.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
113 
114 
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* skipping whitespace before a word */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: copy next char as is */
	WAITENDCMPLX = 4,			/* inside a single-quoted word */
	WAITPOSINFO = 5,			/* quoted word ended: is there a ':'? */
	INPOSINFO = 6,				/* expecting the first digit of a position */
	WAITPOSDELIM = 7,			/* after a position: digits, weight, ',' or
								 * end */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
124 
125 #define PRSSYNTAXERROR prssyntaxerror(state)
126 
127 static void
prssyntaxerror(TSVectorParseState state)128 prssyntaxerror(TSVectorParseState state)
129 {
130 	ereport(ERROR,
131 			(errcode(ERRCODE_SYNTAX_ERROR),
132 			 state->is_tsquery ?
133 			 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
134 			 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
135 }
136 
137 
138 /*
139  * Get next token from string being parsed. Returns true if successful,
140  * false if end of input string is reached.  On success, these output
141  * parameters are filled in:
142  *
143  * *strval		pointer to token
144  * *lenval		length of *strval
145  * *pos_ptr		pointer to a palloc'd array of positions and weights
146  *				associated with the token. If the caller is not interested
147  *				in the information, NULL can be supplied. Otherwise
148  *				the caller is responsible for pfreeing the array.
149  * *poslen		number of elements in *pos_ptr
150  * *endptr		scan resumption point
151  *
152  * Pass NULL for unwanted output parameters.
153  */
154 bool
gettoken_tsvector(TSVectorParseState state,char ** strval,int * lenval,WordEntryPos ** pos_ptr,int * poslen,char ** endptr)155 gettoken_tsvector(TSVectorParseState state,
156 				  char **strval, int *lenval,
157 				  WordEntryPos **pos_ptr, int *poslen,
158 				  char **endptr)
159 {
160 	int			oldstate = 0;
161 	char	   *curpos = state->word;
162 	int			statecode = WAITWORD;
163 
164 	/*
165 	 * pos is for collecting the comma delimited list of positions followed by
166 	 * the actual token.
167 	 */
168 	WordEntryPos *pos = NULL;
169 	int			npos = 0;		/* elements of pos used */
170 	int			posalen = 0;	/* allocated size of pos */
171 
172 	while (1)
173 	{
174 		if (statecode == WAITWORD)
175 		{
176 			if (*(state->prsbuf) == '\0')
177 				return false;
178 			else if (!state->is_web && t_iseq(state->prsbuf, '\''))
179 				statecode = WAITENDCMPLX;
180 			else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
181 			{
182 				statecode = WAITNEXTCHAR;
183 				oldstate = WAITENDWORD;
184 			}
185 			else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
186 					 (state->is_web && t_iseq(state->prsbuf, '"')))
187 				PRSSYNTAXERROR;
188 			else if (!t_isspace(state->prsbuf))
189 			{
190 				COPYCHAR(curpos, state->prsbuf);
191 				curpos += pg_mblen(state->prsbuf);
192 				statecode = WAITENDWORD;
193 			}
194 		}
195 		else if (statecode == WAITNEXTCHAR)
196 		{
197 			if (*(state->prsbuf) == '\0')
198 				ereport(ERROR,
199 						(errcode(ERRCODE_SYNTAX_ERROR),
200 						 errmsg("there is no escaped character: \"%s\"",
201 								state->bufstart)));
202 			else
203 			{
204 				RESIZEPRSBUF;
205 				COPYCHAR(curpos, state->prsbuf);
206 				curpos += pg_mblen(state->prsbuf);
207 				Assert(oldstate != 0);
208 				statecode = oldstate;
209 			}
210 		}
211 		else if (statecode == WAITENDWORD)
212 		{
213 			if (!state->is_web && t_iseq(state->prsbuf, '\\'))
214 			{
215 				statecode = WAITNEXTCHAR;
216 				oldstate = WAITENDWORD;
217 			}
218 			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
219 					 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
220 					 (state->is_web && t_iseq(state->prsbuf, '"')))
221 			{
222 				RESIZEPRSBUF;
223 				if (curpos == state->word)
224 					PRSSYNTAXERROR;
225 				*(curpos) = '\0';
226 				RETURN_TOKEN;
227 			}
228 			else if (t_iseq(state->prsbuf, ':'))
229 			{
230 				if (curpos == state->word)
231 					PRSSYNTAXERROR;
232 				*(curpos) = '\0';
233 				if (state->oprisdelim)
234 					RETURN_TOKEN;
235 				else
236 					statecode = INPOSINFO;
237 			}
238 			else
239 			{
240 				RESIZEPRSBUF;
241 				COPYCHAR(curpos, state->prsbuf);
242 				curpos += pg_mblen(state->prsbuf);
243 			}
244 		}
245 		else if (statecode == WAITENDCMPLX)
246 		{
247 			if (!state->is_web && t_iseq(state->prsbuf, '\''))
248 			{
249 				statecode = WAITCHARCMPLX;
250 			}
251 			else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
252 			{
253 				statecode = WAITNEXTCHAR;
254 				oldstate = WAITENDCMPLX;
255 			}
256 			else if (*(state->prsbuf) == '\0')
257 				PRSSYNTAXERROR;
258 			else
259 			{
260 				RESIZEPRSBUF;
261 				COPYCHAR(curpos, state->prsbuf);
262 				curpos += pg_mblen(state->prsbuf);
263 			}
264 		}
265 		else if (statecode == WAITCHARCMPLX)
266 		{
267 			if (!state->is_web && t_iseq(state->prsbuf, '\''))
268 			{
269 				RESIZEPRSBUF;
270 				COPYCHAR(curpos, state->prsbuf);
271 				curpos += pg_mblen(state->prsbuf);
272 				statecode = WAITENDCMPLX;
273 			}
274 			else
275 			{
276 				RESIZEPRSBUF;
277 				*(curpos) = '\0';
278 				if (curpos == state->word)
279 					PRSSYNTAXERROR;
280 				if (state->oprisdelim)
281 				{
282 					/* state->prsbuf+=pg_mblen(state->prsbuf); */
283 					RETURN_TOKEN;
284 				}
285 				else
286 					statecode = WAITPOSINFO;
287 				continue;		/* recheck current character */
288 			}
289 		}
290 		else if (statecode == WAITPOSINFO)
291 		{
292 			if (t_iseq(state->prsbuf, ':'))
293 				statecode = INPOSINFO;
294 			else
295 				RETURN_TOKEN;
296 		}
297 		else if (statecode == INPOSINFO)
298 		{
299 			if (t_isdigit(state->prsbuf))
300 			{
301 				if (posalen == 0)
302 				{
303 					posalen = 4;
304 					pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
305 					npos = 0;
306 				}
307 				else if (npos + 1 >= posalen)
308 				{
309 					posalen *= 2;
310 					pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
311 				}
312 				npos++;
313 				WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
314 				/* we cannot get here in tsquery, so no need for 2 errmsgs */
315 				if (WEP_GETPOS(pos[npos - 1]) == 0)
316 					ereport(ERROR,
317 							(errcode(ERRCODE_SYNTAX_ERROR),
318 							 errmsg("wrong position info in tsvector: \"%s\"",
319 									state->bufstart)));
320 				WEP_SETWEIGHT(pos[npos - 1], 0);
321 				statecode = WAITPOSDELIM;
322 			}
323 			else
324 				PRSSYNTAXERROR;
325 		}
326 		else if (statecode == WAITPOSDELIM)
327 		{
328 			if (t_iseq(state->prsbuf, ','))
329 				statecode = INPOSINFO;
330 			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
331 			{
332 				if (WEP_GETWEIGHT(pos[npos - 1]))
333 					PRSSYNTAXERROR;
334 				WEP_SETWEIGHT(pos[npos - 1], 3);
335 			}
336 			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
337 			{
338 				if (WEP_GETWEIGHT(pos[npos - 1]))
339 					PRSSYNTAXERROR;
340 				WEP_SETWEIGHT(pos[npos - 1], 2);
341 			}
342 			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
343 			{
344 				if (WEP_GETWEIGHT(pos[npos - 1]))
345 					PRSSYNTAXERROR;
346 				WEP_SETWEIGHT(pos[npos - 1], 1);
347 			}
348 			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
349 			{
350 				if (WEP_GETWEIGHT(pos[npos - 1]))
351 					PRSSYNTAXERROR;
352 				WEP_SETWEIGHT(pos[npos - 1], 0);
353 			}
354 			else if (t_isspace(state->prsbuf) ||
355 					 *(state->prsbuf) == '\0')
356 				RETURN_TOKEN;
357 			else if (!t_isdigit(state->prsbuf))
358 				PRSSYNTAXERROR;
359 		}
360 		else					/* internal error */
361 			elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
362 				 statecode);
363 
364 		/* get next char */
365 		state->prsbuf += pg_mblen(state->prsbuf);
366 	}
367 }
368