1 /*-------------------------------------------------------------------------
2  *
3  * tsvector_parser.c
4  *	  Parser for tsvector
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsvector_parser.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
#include "postgres.h"

#include <limits.h>

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
19 
20 
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * oprisdelim changes parsing in two ways: the characters tested by
 * ISOPERATOR end (or may not start) a word, and a token is returned as soon
 * as the word ends, without parsing a ":position" list after it.
 *
 * 'word' is owned by the parser: it is allocated in init_tsvector_parser,
 * grown by RESIZEPRSBUF, and released in close_tsvector_parser.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! & | ( ) < as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
37 
38 
39 /*
40  * Initializes parser for the input string. If oprisdelim is set, the
41  * following characters are treated as delimiters in addition to whitespace:
42  * ! | & ( )
43  */
44 TSVectorParseState
init_tsvector_parser(char * input,bool oprisdelim,bool is_tsquery)45 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
46 {
47 	TSVectorParseState state;
48 
49 	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
50 	state->prsbuf = input;
51 	state->bufstart = input;
52 	state->len = 32;
53 	state->word = (char *) palloc(state->len);
54 	state->eml = pg_database_encoding_max_length();
55 	state->oprisdelim = oprisdelim;
56 	state->is_tsquery = is_tsquery;
57 
58 	return state;
59 }
60 
/*
 * Reinitializes parser to parse 'input', instead of previous input.
 *
 * Only the scan position is moved.  bufstart is left as it was, so syntax
 * error messages keep quoting the string given to init_tsvector_parser; the
 * word buffer and the flags are likewise unchanged.
 */
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
	state->prsbuf = input;
}
69 
/*
 * Shuts down a tsvector parser.
 *
 * Frees the word buffer and the state itself.  The input string is not
 * touched.  Token pointers previously returned by gettoken_tsvector point
 * into the word buffer and must not be used afterwards.
 */
void
close_tsvector_parser(TSVectorParseState state)
{
	/* the word buffer must go first, while 'state' is still readable */
	pfree(state->word);
	pfree(state);
}
79 
/*
 * Ensure 'word' has room for one more character (up to state->eml bytes)
 * with at least one byte to spare.  If not, the buffer size is doubled.
 *
 * Expects 'state' and 'curpos' to be in scope at the point of use; curpos is
 * re-pointed into the reallocated buffer, since repalloc may move it.
 */
#define RESIZEPRSBUF \
do { \
	int wordused = curpos - state->word; \
	if ( state->len <= wordused + state->eml ) \
	{ \
		state->len += state->len; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = wordused + state->word; \
	} \
} while (0)
91 
/*
 * True if pg_mblen reports a one-byte character at x and that byte is one of
 * the operator characters  ! & | ( ) <  (the phrase operator begins with
 * '<').
 *
 * x is evaluated more than once, so it must be free of side effects.
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && ( *(x) == '!' ||	\
							*(x) == '&' ||	\
							*(x) == '|' ||	\
							*(x) == '(' ||	\
							*(x) == ')' ||	\
							*(x) == '<'		\
						  ) )
101 
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Usable only inside gettoken_tsvector: it reads that function's parameters
 * (strval, lenval, pos_ptr, poslen, endptr) and its locals (state, curpos,
 * pos, npos), and it executes "return".
 *
 * Every output pointer is optional.  If the caller did not ask for the
 * position array, any array collected for this token is freed here rather
 * than handed out.  poslen is written only when pos_ptr was supplied, and is
 * itself checked for NULL so that a caller may request the array alone.
 *
 * *strval points at the parser's own word buffer, and *endptr at the current
 * scan position.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
121 
122 
/*
 * State codes used in gettoken_tsvector.
 *
 * Values start at 1 so that 0 can stand for "no saved state" in that
 * function's oldstate variable.
 */
enum
{
	WAITWORD = 1,				/* before a token: skipping whitespace */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: take next char as-is */
	WAITENDCMPLX = 4,			/* inside a quoted word */
	WAITPOSINFO = 5,			/* quoted word ended: ':' may follow */
	INPOSINFO = 6,				/* after ':' or ',': a position must follow */
	WAITPOSDELIM = 7,			/* after a position: weight, ',' or end */
	WAITCHARCMPLX = 8			/* saw a quote while inside a quoted word */
};
132 
133 #define PRSSYNTAXERROR prssyntaxerror(state)
134 
135 static void
prssyntaxerror(TSVectorParseState state)136 prssyntaxerror(TSVectorParseState state)
137 {
138 	ereport(ERROR,
139 			(errcode(ERRCODE_SYNTAX_ERROR),
140 			 state->is_tsquery ?
141 			 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
142 			 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
143 }
144 
145 
146 /*
147  * Get next token from string being parsed. Returns true if successful,
148  * false if end of input string is reached.  On success, these output
149  * parameters are filled in:
150  *
151  * *strval		pointer to token
152  * *lenval		length of *strval
153  * *pos_ptr		pointer to a palloc'd array of positions and weights
154  *				associated with the token. If the caller is not interested
155  *				in the information, NULL can be supplied. Otherwise
156  *				the caller is responsible for pfreeing the array.
157  * *poslen		number of elements in *pos_ptr
158  * *endptr		scan resumption point
159  *
160  * Pass NULL for unwanted output parameters.
161  */
162 bool
gettoken_tsvector(TSVectorParseState state,char ** strval,int * lenval,WordEntryPos ** pos_ptr,int * poslen,char ** endptr)163 gettoken_tsvector(TSVectorParseState state,
164 				  char **strval, int *lenval,
165 				  WordEntryPos **pos_ptr, int *poslen,
166 				  char **endptr)
167 {
168 	int			oldstate = 0;
169 	char	   *curpos = state->word;
170 	int			statecode = WAITWORD;
171 
172 	/*
173 	 * pos is for collecting the comma delimited list of positions followed by
174 	 * the actual token.
175 	 */
176 	WordEntryPos *pos = NULL;
177 	int			npos = 0;		/* elements of pos used */
178 	int			posalen = 0;	/* allocated size of pos */
179 
180 	while (1)
181 	{
182 		if (statecode == WAITWORD)
183 		{
184 			if (*(state->prsbuf) == '\0')
185 				return false;
186 			else if (t_iseq(state->prsbuf, '\''))
187 				statecode = WAITENDCMPLX;
188 			else if (t_iseq(state->prsbuf, '\\'))
189 			{
190 				statecode = WAITNEXTCHAR;
191 				oldstate = WAITENDWORD;
192 			}
193 			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
194 				PRSSYNTAXERROR;
195 			else if (!t_isspace(state->prsbuf))
196 			{
197 				COPYCHAR(curpos, state->prsbuf);
198 				curpos += pg_mblen(state->prsbuf);
199 				statecode = WAITENDWORD;
200 			}
201 		}
202 		else if (statecode == WAITNEXTCHAR)
203 		{
204 			if (*(state->prsbuf) == '\0')
205 				ereport(ERROR,
206 						(errcode(ERRCODE_SYNTAX_ERROR),
207 						 errmsg("there is no escaped character: \"%s\"",
208 								state->bufstart)));
209 			else
210 			{
211 				RESIZEPRSBUF;
212 				COPYCHAR(curpos, state->prsbuf);
213 				curpos += pg_mblen(state->prsbuf);
214 				Assert(oldstate != 0);
215 				statecode = oldstate;
216 			}
217 		}
218 		else if (statecode == WAITENDWORD)
219 		{
220 			if (t_iseq(state->prsbuf, '\\'))
221 			{
222 				statecode = WAITNEXTCHAR;
223 				oldstate = WAITENDWORD;
224 			}
225 			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
226 					 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
227 			{
228 				RESIZEPRSBUF;
229 				if (curpos == state->word)
230 					PRSSYNTAXERROR;
231 				*(curpos) = '\0';
232 				RETURN_TOKEN;
233 			}
234 			else if (t_iseq(state->prsbuf, ':'))
235 			{
236 				if (curpos == state->word)
237 					PRSSYNTAXERROR;
238 				*(curpos) = '\0';
239 				if (state->oprisdelim)
240 					RETURN_TOKEN;
241 				else
242 					statecode = INPOSINFO;
243 			}
244 			else
245 			{
246 				RESIZEPRSBUF;
247 				COPYCHAR(curpos, state->prsbuf);
248 				curpos += pg_mblen(state->prsbuf);
249 			}
250 		}
251 		else if (statecode == WAITENDCMPLX)
252 		{
253 			if (t_iseq(state->prsbuf, '\''))
254 			{
255 				statecode = WAITCHARCMPLX;
256 			}
257 			else if (t_iseq(state->prsbuf, '\\'))
258 			{
259 				statecode = WAITNEXTCHAR;
260 				oldstate = WAITENDCMPLX;
261 			}
262 			else if (*(state->prsbuf) == '\0')
263 				PRSSYNTAXERROR;
264 			else
265 			{
266 				RESIZEPRSBUF;
267 				COPYCHAR(curpos, state->prsbuf);
268 				curpos += pg_mblen(state->prsbuf);
269 			}
270 		}
271 		else if (statecode == WAITCHARCMPLX)
272 		{
273 			if (t_iseq(state->prsbuf, '\''))
274 			{
275 				RESIZEPRSBUF;
276 				COPYCHAR(curpos, state->prsbuf);
277 				curpos += pg_mblen(state->prsbuf);
278 				statecode = WAITENDCMPLX;
279 			}
280 			else
281 			{
282 				RESIZEPRSBUF;
283 				*(curpos) = '\0';
284 				if (curpos == state->word)
285 					PRSSYNTAXERROR;
286 				if (state->oprisdelim)
287 				{
288 					/* state->prsbuf+=pg_mblen(state->prsbuf); */
289 					RETURN_TOKEN;
290 				}
291 				else
292 					statecode = WAITPOSINFO;
293 				continue;		/* recheck current character */
294 			}
295 		}
296 		else if (statecode == WAITPOSINFO)
297 		{
298 			if (t_iseq(state->prsbuf, ':'))
299 				statecode = INPOSINFO;
300 			else
301 				RETURN_TOKEN;
302 		}
303 		else if (statecode == INPOSINFO)
304 		{
305 			if (t_isdigit(state->prsbuf))
306 			{
307 				if (posalen == 0)
308 				{
309 					posalen = 4;
310 					pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
311 					npos = 0;
312 				}
313 				else if (npos + 1 >= posalen)
314 				{
315 					posalen *= 2;
316 					pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
317 				}
318 				npos++;
319 				WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
320 				/* we cannot get here in tsquery, so no need for 2 errmsgs */
321 				if (WEP_GETPOS(pos[npos - 1]) == 0)
322 					ereport(ERROR,
323 							(errcode(ERRCODE_SYNTAX_ERROR),
324 							 errmsg("wrong position info in tsvector: \"%s\"",
325 									state->bufstart)));
326 				WEP_SETWEIGHT(pos[npos - 1], 0);
327 				statecode = WAITPOSDELIM;
328 			}
329 			else
330 				PRSSYNTAXERROR;
331 		}
332 		else if (statecode == WAITPOSDELIM)
333 		{
334 			if (t_iseq(state->prsbuf, ','))
335 				statecode = INPOSINFO;
336 			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
337 			{
338 				if (WEP_GETWEIGHT(pos[npos - 1]))
339 					PRSSYNTAXERROR;
340 				WEP_SETWEIGHT(pos[npos - 1], 3);
341 			}
342 			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
343 			{
344 				if (WEP_GETWEIGHT(pos[npos - 1]))
345 					PRSSYNTAXERROR;
346 				WEP_SETWEIGHT(pos[npos - 1], 2);
347 			}
348 			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
349 			{
350 				if (WEP_GETWEIGHT(pos[npos - 1]))
351 					PRSSYNTAXERROR;
352 				WEP_SETWEIGHT(pos[npos - 1], 1);
353 			}
354 			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
355 			{
356 				if (WEP_GETWEIGHT(pos[npos - 1]))
357 					PRSSYNTAXERROR;
358 				WEP_SETWEIGHT(pos[npos - 1], 0);
359 			}
360 			else if (t_isspace(state->prsbuf) ||
361 					 *(state->prsbuf) == '\0')
362 				RETURN_TOKEN;
363 			else if (!t_isdigit(state->prsbuf))
364 				PRSSYNTAXERROR;
365 		}
366 		else	/* internal error */
367 			elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
368 				 statecode);
369 
370 		/* get next char */
371 		state->prsbuf += pg_mblen(state->prsbuf);
372 	}
373 }
374