1 /*-------------------------------------------------------------------------
2  *
3  * tsquery.c
4  *	  I/O functions for tsquery
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsquery.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "miscadmin.h"
19 #include "tsearch/ts_type.h"
20 #include "tsearch/ts_locale.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23 #include "utils/memutils.h"
24 #include "utils/pg_crc.h"
25 
26 /* FTS operator priorities, see ts_type.h */
27 const int	tsearch_op_priority[OP_COUNT] =
28 {
29 	4,							/* OP_NOT */
30 	2,							/* OP_AND */
31 	1,							/* OP_OR */
32 	3							/* OP_PHRASE */
33 };
34 
35 /*
36  * parser's states
37  */
38 typedef enum
39 {
40 	WAITOPERAND = 1,
41 	WAITOPERATOR = 2,
42 	WAITFIRSTOPERAND = 3
43 } ts_parserstate;
44 
45 /*
46  * token types for parsing
47  */
48 typedef enum
49 {
50 	PT_END = 0,
51 	PT_ERR = 1,
52 	PT_VAL = 2,
53 	PT_OPR = 3,
54 	PT_OPEN = 4,
55 	PT_CLOSE = 5
56 } ts_tokentype;
57 
58 /*
59  * get token from query string
60  *
61  * *operator is filled in with OP_* when return values is PT_OPR,
62  * but *weight could contain a distance value in case of phrase operator.
63  * *strval, *lenval and *weight are filled in when return value is PT_VAL
64  *
65  */
66 typedef ts_tokentype (*ts_tokenizer) (TSQueryParserState state, int8 *operator,
67 									  int *lenval, char **strval,
68 									  int16 *weight, bool *prefix);
69 
70 struct TSQueryParserStateData
71 {
72 	/* Tokenizer used for parsing tsquery */
73 	ts_tokenizer gettoken;
74 
75 	/* State of tokenizer function */
76 	char	   *buffer;			/* entire string we are scanning */
77 	char	   *buf;			/* current scan point */
78 	int			count;			/* nesting count, incremented by (,
79 								 * decremented by ) */
80 	bool		in_quotes;		/* phrase in quotes "" */
81 	ts_parserstate state;
82 
83 	/* polish (prefix) notation in list, filled in by push* functions */
84 	List	   *polstr;
85 
86 	/*
87 	 * Strings from operands are collected in op. curop is a pointer to the
88 	 * end of used space of op.
89 	 */
90 	char	   *op;
91 	char	   *curop;
92 	int			lenop;			/* allocated size of op */
93 	int			sumlen;			/* used size of op */
94 
95 	/* state for value's parser */
96 	TSVectorParseState valstate;
97 };
98 
99 /*
100  * subroutine to parse the modifiers (weight and prefix flag currently)
101  * part, like ':AB*' of a query.
102  */
103 static char *
get_modifiers(char * buf,int16 * weight,bool * prefix)104 get_modifiers(char *buf, int16 *weight, bool *prefix)
105 {
106 	*weight = 0;
107 	*prefix = false;
108 
109 	if (!t_iseq(buf, ':'))
110 		return buf;
111 
112 	buf++;
113 	while (*buf && pg_mblen(buf) == 1)
114 	{
115 		switch (*buf)
116 		{
117 			case 'a':
118 			case 'A':
119 				*weight |= 1 << 3;
120 				break;
121 			case 'b':
122 			case 'B':
123 				*weight |= 1 << 2;
124 				break;
125 			case 'c':
126 			case 'C':
127 				*weight |= 1 << 1;
128 				break;
129 			case 'd':
130 			case 'D':
131 				*weight |= 1;
132 				break;
133 			case '*':
134 				*prefix = true;
135 				break;
136 			default:
137 				return buf;
138 		}
139 		buf++;
140 	}
141 
142 	return buf;
143 }
144 
145 /*
146  * Parse phrase operator. The operator
147  * may take the following forms:
148  *
149  *		a <N> b (distance is exactly N lexemes)
150  *		a <-> b (default distance = 1)
151  *
152  * The buffer should begin with '<' char
153  */
154 static bool
parse_phrase_operator(TSQueryParserState pstate,int16 * distance)155 parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
156 {
157 	enum
158 	{
159 		PHRASE_OPEN = 0,
160 		PHRASE_DIST,
161 		PHRASE_CLOSE,
162 		PHRASE_FINISH
163 	}			state = PHRASE_OPEN;
164 	char	   *ptr = pstate->buf;
165 	char	   *endptr;
166 	long		l = 1;			/* default distance */
167 
168 	while (*ptr)
169 	{
170 		switch (state)
171 		{
172 			case PHRASE_OPEN:
173 				if (t_iseq(ptr, '<'))
174 				{
175 					state = PHRASE_DIST;
176 					ptr++;
177 				}
178 				else
179 					return false;
180 				break;
181 
182 			case PHRASE_DIST:
183 				if (t_iseq(ptr, '-'))
184 				{
185 					state = PHRASE_CLOSE;
186 					ptr++;
187 					continue;
188 				}
189 
190 				if (!t_isdigit(ptr))
191 					return false;
192 
193 				errno = 0;
194 				l = strtol(ptr, &endptr, 10);
195 				if (ptr == endptr)
196 					return false;
197 				else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS)
198 					ereport(ERROR,
199 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
200 							 errmsg("distance in phrase operator must be an integer value between zero and %d inclusive",
201 									MAXENTRYPOS)));
202 				else
203 				{
204 					state = PHRASE_CLOSE;
205 					ptr = endptr;
206 				}
207 				break;
208 
209 			case PHRASE_CLOSE:
210 				if (t_iseq(ptr, '>'))
211 				{
212 					state = PHRASE_FINISH;
213 					ptr++;
214 				}
215 				else
216 					return false;
217 				break;
218 
219 			case PHRASE_FINISH:
220 				*distance = (int16) l;
221 				pstate->buf = ptr;
222 				return true;
223 		}
224 	}
225 
226 	return false;
227 }
228 
229 /*
230  * Parse OR operator used in websearch_to_tsquery(), returns true if we
231  * believe that "OR" literal could be an operator OR
232  */
233 static bool
parse_or_operator(TSQueryParserState pstate)234 parse_or_operator(TSQueryParserState pstate)
235 {
236 	char	   *ptr = pstate->buf;
237 
238 	if (pstate->in_quotes)
239 		return false;
240 
241 	/* it should begin with "OR" literal */
242 	if (pg_strncasecmp(ptr, "or", 2) != 0)
243 		return false;
244 
245 	ptr += 2;
246 
247 	/*
248 	 * it shouldn't be a part of any word but somewhere later it should be
249 	 * some operand
250 	 */
251 	if (*ptr == '\0')			/* no operand */
252 		return false;
253 
254 	/* it shouldn't be a part of any word */
255 	if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
256 		return false;
257 
258 	for (;;)
259 	{
260 		ptr += pg_mblen(ptr);
261 
262 		if (*ptr == '\0')		/* got end of string without operand */
263 			return false;
264 
265 		/*
266 		 * Suppose, we found an operand, but could be a not correct operand.
267 		 * So we still treat OR literal as operation with possibly incorrect
268 		 * operand and  will not search it as lexeme
269 		 */
270 		if (!t_isspace(ptr))
271 			break;
272 	}
273 
274 	pstate->buf += 2;
275 	return true;
276 }
277 
278 static ts_tokentype
gettoken_query_standard(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)279 gettoken_query_standard(TSQueryParserState state, int8 *operator,
280 						int *lenval, char **strval,
281 						int16 *weight, bool *prefix)
282 {
283 	*weight = 0;
284 	*prefix = false;
285 
286 	while (true)
287 	{
288 		switch (state->state)
289 		{
290 			case WAITFIRSTOPERAND:
291 			case WAITOPERAND:
292 				if (t_iseq(state->buf, '!'))
293 				{
294 					state->buf++;
295 					state->state = WAITOPERAND;
296 					*operator = OP_NOT;
297 					return PT_OPR;
298 				}
299 				else if (t_iseq(state->buf, '('))
300 				{
301 					state->buf++;
302 					state->state = WAITOPERAND;
303 					state->count++;
304 					return PT_OPEN;
305 				}
306 				else if (t_iseq(state->buf, ':'))
307 				{
308 					ereport(ERROR,
309 							(errcode(ERRCODE_SYNTAX_ERROR),
310 							 errmsg("syntax error in tsquery: \"%s\"",
311 									state->buffer)));
312 				}
313 				else if (!t_isspace(state->buf))
314 				{
315 					/*
316 					 * We rely on the tsvector parser to parse the value for
317 					 * us
318 					 */
319 					reset_tsvector_parser(state->valstate, state->buf);
320 					if (gettoken_tsvector(state->valstate, strval, lenval,
321 										  NULL, NULL, &state->buf))
322 					{
323 						state->buf = get_modifiers(state->buf, weight, prefix);
324 						state->state = WAITOPERATOR;
325 						return PT_VAL;
326 					}
327 					else if (state->state == WAITFIRSTOPERAND)
328 					{
329 						return PT_END;
330 					}
331 					else
332 						ereport(ERROR,
333 								(errcode(ERRCODE_SYNTAX_ERROR),
334 								 errmsg("no operand in tsquery: \"%s\"",
335 										state->buffer)));
336 				}
337 				break;
338 
339 			case WAITOPERATOR:
340 				if (t_iseq(state->buf, '&'))
341 				{
342 					state->buf++;
343 					state->state = WAITOPERAND;
344 					*operator = OP_AND;
345 					return PT_OPR;
346 				}
347 				else if (t_iseq(state->buf, '|'))
348 				{
349 					state->buf++;
350 					state->state = WAITOPERAND;
351 					*operator = OP_OR;
352 					return PT_OPR;
353 				}
354 				else if (parse_phrase_operator(state, weight))
355 				{
356 					/* weight var is used as storage for distance */
357 					state->state = WAITOPERAND;
358 					*operator = OP_PHRASE;
359 					return PT_OPR;
360 				}
361 				else if (t_iseq(state->buf, ')'))
362 				{
363 					state->buf++;
364 					state->count--;
365 					return (state->count < 0) ? PT_ERR : PT_CLOSE;
366 				}
367 				else if (*state->buf == '\0')
368 				{
369 					return (state->count) ? PT_ERR : PT_END;
370 				}
371 				else if (!t_isspace(state->buf))
372 				{
373 					return PT_ERR;
374 				}
375 				break;
376 		}
377 
378 		state->buf += pg_mblen(state->buf);
379 	}
380 }
381 
382 static ts_tokentype
gettoken_query_websearch(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)383 gettoken_query_websearch(TSQueryParserState state, int8 *operator,
384 						 int *lenval, char **strval,
385 						 int16 *weight, bool *prefix)
386 {
387 	*weight = 0;
388 	*prefix = false;
389 
390 	while (true)
391 	{
392 		switch (state->state)
393 		{
394 			case WAITFIRSTOPERAND:
395 			case WAITOPERAND:
396 				if (t_iseq(state->buf, '-'))
397 				{
398 					state->buf++;
399 					state->state = WAITOPERAND;
400 
401 					if (state->in_quotes)
402 						continue;
403 
404 					*operator = OP_NOT;
405 					return PT_OPR;
406 				}
407 				else if (t_iseq(state->buf, '"'))
408 				{
409 					state->buf++;
410 
411 					if (!state->in_quotes)
412 					{
413 						state->state = WAITOPERAND;
414 
415 						if (strchr(state->buf, '"'))
416 						{
417 							/* quoted text should be ordered <-> */
418 							state->in_quotes = true;
419 							return PT_OPEN;
420 						}
421 
422 						/* web search tolerates missing quotes */
423 						continue;
424 					}
425 					else
426 					{
427 						/* we have to provide an operand */
428 						state->in_quotes = false;
429 						state->state = WAITOPERATOR;
430 						pushStop(state);
431 						return PT_CLOSE;
432 					}
433 				}
434 				else if (ISOPERATOR(state->buf))
435 				{
436 					/* or else gettoken_tsvector() will raise an error */
437 					state->buf++;
438 					state->state = WAITOPERAND;
439 					continue;
440 				}
441 				else if (!t_isspace(state->buf))
442 				{
443 					/*
444 					 * We rely on the tsvector parser to parse the value for
445 					 * us
446 					 */
447 					reset_tsvector_parser(state->valstate, state->buf);
448 					if (gettoken_tsvector(state->valstate, strval, lenval,
449 										  NULL, NULL, &state->buf))
450 					{
451 						state->state = WAITOPERATOR;
452 						return PT_VAL;
453 					}
454 					else if (state->state == WAITFIRSTOPERAND)
455 					{
456 						return PT_END;
457 					}
458 					else
459 					{
460 						/* finally, we have to provide an operand */
461 						pushStop(state);
462 						return PT_END;
463 					}
464 				}
465 				break;
466 
467 			case WAITOPERATOR:
468 				if (t_iseq(state->buf, '"'))
469 				{
470 					if (!state->in_quotes)
471 					{
472 						/*
473 						 * put implicit AND after an operand and handle this
474 						 * quote in WAITOPERAND
475 						 */
476 						state->state = WAITOPERAND;
477 						*operator = OP_AND;
478 						return PT_OPR;
479 					}
480 					else
481 					{
482 						state->buf++;
483 
484 						/* just close quotes */
485 						state->in_quotes = false;
486 						return PT_CLOSE;
487 					}
488 				}
489 				else if (parse_or_operator(state))
490 				{
491 					state->state = WAITOPERAND;
492 					*operator = OP_OR;
493 					return PT_OPR;
494 				}
495 				else if (*state->buf == '\0')
496 				{
497 					return PT_END;
498 				}
499 				else if (!t_isspace(state->buf))
500 				{
501 					if (state->in_quotes)
502 					{
503 						/* put implicit <-> after an operand */
504 						*operator = OP_PHRASE;
505 						*weight = 1;
506 					}
507 					else
508 					{
509 						/* put implicit AND after an operand */
510 						*operator = OP_AND;
511 					}
512 
513 					state->state = WAITOPERAND;
514 					return PT_OPR;
515 				}
516 				break;
517 		}
518 
519 		state->buf += pg_mblen(state->buf);
520 	}
521 }
522 
523 static ts_tokentype
gettoken_query_plain(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)524 gettoken_query_plain(TSQueryParserState state, int8 *operator,
525 					 int *lenval, char **strval,
526 					 int16 *weight, bool *prefix)
527 {
528 	*weight = 0;
529 	*prefix = false;
530 
531 	if (*state->buf == '\0')
532 		return PT_END;
533 
534 	*strval = state->buf;
535 	*lenval = strlen(state->buf);
536 	state->buf += *lenval;
537 	state->count++;
538 	return PT_VAL;
539 }
540 
541 /*
542  * Push an operator to state->polstr
543  */
544 void
pushOperator(TSQueryParserState state,int8 oper,int16 distance)545 pushOperator(TSQueryParserState state, int8 oper, int16 distance)
546 {
547 	QueryOperator *tmp;
548 
549 	Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);
550 
551 	tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
552 	tmp->type = QI_OPR;
553 	tmp->oper = oper;
554 	tmp->distance = (oper == OP_PHRASE) ? distance : 0;
555 	/* left is filled in later with findoprnd */
556 
557 	state->polstr = lcons(tmp, state->polstr);
558 }
559 
560 static void
pushValue_internal(TSQueryParserState state,pg_crc32 valcrc,int distance,int lenval,int weight,bool prefix)561 pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
562 {
563 	QueryOperand *tmp;
564 
565 	if (distance >= MAXSTRPOS)
566 		ereport(ERROR,
567 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
568 				 errmsg("value is too big in tsquery: \"%s\"",
569 						state->buffer)));
570 	if (lenval >= MAXSTRLEN)
571 		ereport(ERROR,
572 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
573 				 errmsg("operand is too long in tsquery: \"%s\"",
574 						state->buffer)));
575 
576 	tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
577 	tmp->type = QI_VAL;
578 	tmp->weight = weight;
579 	tmp->prefix = prefix;
580 	tmp->valcrc = (int32) valcrc;
581 	tmp->length = lenval;
582 	tmp->distance = distance;
583 
584 	state->polstr = lcons(tmp, state->polstr);
585 }
586 
587 /*
588  * Push an operand to state->polstr.
589  *
590  * strval must point to a string equal to state->curop. lenval is the length
591  * of the string.
592  */
593 void
pushValue(TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)594 pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
595 {
596 	pg_crc32	valcrc;
597 
598 	if (lenval >= MAXSTRLEN)
599 		ereport(ERROR,
600 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
601 				 errmsg("word is too long in tsquery: \"%s\"",
602 						state->buffer)));
603 
604 	INIT_LEGACY_CRC32(valcrc);
605 	COMP_LEGACY_CRC32(valcrc, strval, lenval);
606 	FIN_LEGACY_CRC32(valcrc);
607 	pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight, prefix);
608 
609 	/* append the value string to state.op, enlarging buffer if needed first */
610 	while (state->curop - state->op + lenval + 1 >= state->lenop)
611 	{
612 		int			used = state->curop - state->op;
613 
614 		state->lenop *= 2;
615 		state->op = (char *) repalloc((void *) state->op, state->lenop);
616 		state->curop = state->op + used;
617 	}
618 	memcpy((void *) state->curop, (void *) strval, lenval);
619 	state->curop += lenval;
620 	*(state->curop) = '\0';
621 	state->curop++;
622 	state->sumlen += lenval + 1 /* \0 */ ;
623 }
624 
625 
626 /*
627  * Push a stopword placeholder to state->polstr
628  */
629 void
pushStop(TSQueryParserState state)630 pushStop(TSQueryParserState state)
631 {
632 	QueryOperand *tmp;
633 
634 	tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
635 	tmp->type = QI_VALSTOP;
636 
637 	state->polstr = lcons(tmp, state->polstr);
638 }
639 
640 
641 #define STACKDEPTH	32
642 
643 typedef struct OperatorElement
644 {
645 	int8		op;
646 	int16		distance;
647 } OperatorElement;
648 
649 static void
pushOpStack(OperatorElement * stack,int * lenstack,int8 op,int16 distance)650 pushOpStack(OperatorElement *stack, int *lenstack, int8 op, int16 distance)
651 {
652 	if (*lenstack == STACKDEPTH)	/* internal error */
653 		elog(ERROR, "tsquery stack too small");
654 
655 	stack[*lenstack].op = op;
656 	stack[*lenstack].distance = distance;
657 
658 	(*lenstack)++;
659 }
660 
661 static void
cleanOpStack(TSQueryParserState state,OperatorElement * stack,int * lenstack,int8 op)662 cleanOpStack(TSQueryParserState state,
663 			 OperatorElement *stack, int *lenstack, int8 op)
664 {
665 	int			opPriority = OP_PRIORITY(op);
666 
667 	while (*lenstack)
668 	{
669 		/* NOT is right associative unlike to others */
670 		if ((op != OP_NOT && opPriority > OP_PRIORITY(stack[*lenstack - 1].op)) ||
671 			(op == OP_NOT && opPriority >= OP_PRIORITY(stack[*lenstack - 1].op)))
672 			break;
673 
674 		(*lenstack)--;
675 		pushOperator(state, stack[*lenstack].op,
676 					 stack[*lenstack].distance);
677 	}
678 }
679 
680 /*
681  * Make polish (prefix) notation of query.
682  *
683  * See parse_tsquery for explanation of pushval.
684  */
685 static void
makepol(TSQueryParserState state,PushFunction pushval,Datum opaque)686 makepol(TSQueryParserState state,
687 		PushFunction pushval,
688 		Datum opaque)
689 {
690 	int8		operator = 0;
691 	ts_tokentype type;
692 	int			lenval = 0;
693 	char	   *strval = NULL;
694 	OperatorElement opstack[STACKDEPTH];
695 	int			lenstack = 0;
696 	int16		weight = 0;
697 	bool		prefix;
698 
699 	/* since this function recurses, it could be driven to stack overflow */
700 	check_stack_depth();
701 
702 	while ((type = state->gettoken(state, &operator,
703 								   &lenval, &strval,
704 								   &weight, &prefix)) != PT_END)
705 	{
706 		switch (type)
707 		{
708 			case PT_VAL:
709 				pushval(opaque, state, strval, lenval, weight, prefix);
710 				break;
711 			case PT_OPR:
712 				cleanOpStack(state, opstack, &lenstack, operator);
713 				pushOpStack(opstack, &lenstack, operator, weight);
714 				break;
715 			case PT_OPEN:
716 				makepol(state, pushval, opaque);
717 				break;
718 			case PT_CLOSE:
719 				cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
720 				return;
721 			case PT_ERR:
722 			default:
723 				ereport(ERROR,
724 						(errcode(ERRCODE_SYNTAX_ERROR),
725 						 errmsg("syntax error in tsquery: \"%s\"",
726 								state->buffer)));
727 		}
728 	}
729 
730 	cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
731 }
732 
733 static void
findoprnd_recurse(QueryItem * ptr,uint32 * pos,int nnodes,bool * needcleanup)734 findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
735 {
736 	/* since this function recurses, it could be driven to stack overflow. */
737 	check_stack_depth();
738 
739 	if (*pos >= nnodes)
740 		elog(ERROR, "malformed tsquery: operand not found");
741 
742 	if (ptr[*pos].type == QI_VAL)
743 	{
744 		(*pos)++;
745 	}
746 	else if (ptr[*pos].type == QI_VALSTOP)
747 	{
748 		*needcleanup = true;	/* we'll have to remove stop words */
749 		(*pos)++;
750 	}
751 	else
752 	{
753 		Assert(ptr[*pos].type == QI_OPR);
754 
755 		if (ptr[*pos].qoperator.oper == OP_NOT)
756 		{
757 			ptr[*pos].qoperator.left = 1;	/* fixed offset */
758 			(*pos)++;
759 
760 			/* process the only argument */
761 			findoprnd_recurse(ptr, pos, nnodes, needcleanup);
762 		}
763 		else
764 		{
765 			QueryOperator *curitem = &ptr[*pos].qoperator;
766 			int			tmp = *pos; /* save current position */
767 
768 			Assert(curitem->oper == OP_AND ||
769 				   curitem->oper == OP_OR ||
770 				   curitem->oper == OP_PHRASE);
771 
772 			(*pos)++;
773 
774 			/* process RIGHT argument */
775 			findoprnd_recurse(ptr, pos, nnodes, needcleanup);
776 
777 			curitem->left = *pos - tmp; /* set LEFT arg's offset */
778 
779 			/* process LEFT argument */
780 			findoprnd_recurse(ptr, pos, nnodes, needcleanup);
781 		}
782 	}
783 }
784 
785 
786 /*
787  * Fill in the left-fields previously left unfilled.
788  * The input QueryItems must be in polish (prefix) notation.
789  * Also, set *needcleanup to true if there are any QI_VALSTOP nodes.
790  */
791 static void
findoprnd(QueryItem * ptr,int size,bool * needcleanup)792 findoprnd(QueryItem *ptr, int size, bool *needcleanup)
793 {
794 	uint32		pos;
795 
796 	*needcleanup = false;
797 	pos = 0;
798 	findoprnd_recurse(ptr, &pos, size, needcleanup);
799 
800 	if (pos != size)
801 		elog(ERROR, "malformed tsquery: extra nodes");
802 }
803 
804 
805 /*
806  * Each value (operand) in the query is passed to pushval. pushval can
807  * transform the simple value to an arbitrarily complex expression using
808  * pushValue and pushOperator. It must push a single value with pushValue,
809  * a complete expression with all operands, or a stopword placeholder
810  * with pushStop, otherwise the prefix notation representation will be broken,
811  * having an operator with no operand.
812  *
813  * opaque is passed on to pushval as is, pushval can use it to store its
814  * private state.
815  */
816 TSQuery
parse_tsquery(char * buf,PushFunction pushval,Datum opaque,int flags)817 parse_tsquery(char *buf,
818 			  PushFunction pushval,
819 			  Datum opaque,
820 			  int flags)
821 {
822 	struct TSQueryParserStateData state;
823 	int			i;
824 	TSQuery		query;
825 	int			commonlen;
826 	QueryItem  *ptr;
827 	ListCell   *cell;
828 	bool		needcleanup;
829 	int			tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY;
830 
831 	/* plain should not be used with web */
832 	Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB));
833 
834 	/* select suitable tokenizer */
835 	if (flags & P_TSQ_PLAIN)
836 		state.gettoken = gettoken_query_plain;
837 	else if (flags & P_TSQ_WEB)
838 	{
839 		state.gettoken = gettoken_query_websearch;
840 		tsv_flags |= P_TSV_IS_WEB;
841 	}
842 	else
843 		state.gettoken = gettoken_query_standard;
844 
845 	/* init state */
846 	state.buffer = buf;
847 	state.buf = buf;
848 	state.count = 0;
849 	state.in_quotes = false;
850 	state.state = WAITFIRSTOPERAND;
851 	state.polstr = NIL;
852 
853 	/* init value parser's state */
854 	state.valstate = init_tsvector_parser(state.buffer, tsv_flags);
855 
856 	/* init list of operand */
857 	state.sumlen = 0;
858 	state.lenop = 64;
859 	state.curop = state.op = (char *) palloc(state.lenop);
860 	*(state.curop) = '\0';
861 
862 	/* parse query & make polish notation (postfix, but in reverse order) */
863 	makepol(&state, pushval, opaque);
864 
865 	close_tsvector_parser(state.valstate);
866 
867 	if (list_length(state.polstr) == 0)
868 	{
869 		ereport(NOTICE,
870 				(errmsg("text-search query doesn't contain lexemes: \"%s\"",
871 						state.buffer)));
872 		query = (TSQuery) palloc(HDRSIZETQ);
873 		SET_VARSIZE(query, HDRSIZETQ);
874 		query->size = 0;
875 		return query;
876 	}
877 
878 	if (TSQUERY_TOO_BIG(list_length(state.polstr), state.sumlen))
879 		ereport(ERROR,
880 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
881 				 errmsg("tsquery is too large")));
882 	commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen);
883 
884 	/* Pack the QueryItems in the final TSQuery struct to return to caller */
885 	query = (TSQuery) palloc0(commonlen);
886 	SET_VARSIZE(query, commonlen);
887 	query->size = list_length(state.polstr);
888 	ptr = GETQUERY(query);
889 
890 	/* Copy QueryItems to TSQuery */
891 	i = 0;
892 	foreach(cell, state.polstr)
893 	{
894 		QueryItem  *item = (QueryItem *) lfirst(cell);
895 
896 		switch (item->type)
897 		{
898 			case QI_VAL:
899 				memcpy(&ptr[i], item, sizeof(QueryOperand));
900 				break;
901 			case QI_VALSTOP:
902 				ptr[i].type = QI_VALSTOP;
903 				break;
904 			case QI_OPR:
905 				memcpy(&ptr[i], item, sizeof(QueryOperator));
906 				break;
907 			default:
908 				elog(ERROR, "unrecognized QueryItem type: %d", item->type);
909 		}
910 		i++;
911 	}
912 
913 	/* Copy all the operand strings to TSQuery */
914 	memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
915 	pfree(state.op);
916 
917 	/*
918 	 * Set left operand pointers for every operator.  While we're at it,
919 	 * detect whether there are any QI_VALSTOP nodes.
920 	 */
921 	findoprnd(ptr, query->size, &needcleanup);
922 
923 	/*
924 	 * If there are QI_VALSTOP nodes, delete them and simplify the tree.
925 	 */
926 	if (needcleanup)
927 		query = cleanup_tsquery_stopwords(query);
928 
929 	return query;
930 }
931 
932 static void
pushval_asis(Datum opaque,TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)933 pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
934 			 int16 weight, bool prefix)
935 {
936 	pushValue(state, strval, lenval, weight, prefix);
937 }
938 
939 /*
940  * in without morphology
941  */
942 Datum
tsqueryin(PG_FUNCTION_ARGS)943 tsqueryin(PG_FUNCTION_ARGS)
944 {
945 	char	   *in = PG_GETARG_CSTRING(0);
946 
947 	PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0));
948 }
949 
950 /*
951  * out function
952  */
953 typedef struct
954 {
955 	QueryItem  *curpol;
956 	char	   *buf;
957 	char	   *cur;
958 	char	   *op;
959 	int			buflen;
960 } INFIX;
961 
962 /* Makes sure inf->buf is large enough for adding 'addsize' bytes */
963 #define RESIZEBUF(inf, addsize) \
964 while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
965 { \
966 	int len = (inf)->cur - (inf)->buf; \
967 	(inf)->buflen *= 2; \
968 	(inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \
969 	(inf)->cur = (inf)->buf + len; \
970 }
971 
972 /*
973  * recursively traverse the tree and
974  * print it in infix (human-readable) form
975  */
976 static void
infix(INFIX * in,int parentPriority,bool rightPhraseOp)977 infix(INFIX *in, int parentPriority, bool rightPhraseOp)
978 {
979 	/* since this function recurses, it could be driven to stack overflow. */
980 	check_stack_depth();
981 
982 	if (in->curpol->type == QI_VAL)
983 	{
984 		QueryOperand *curpol = &in->curpol->qoperand;
985 		char	   *op = in->op + curpol->distance;
986 		int			clen;
987 
988 		RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
989 		*(in->cur) = '\'';
990 		in->cur++;
991 		while (*op)
992 		{
993 			if (t_iseq(op, '\''))
994 			{
995 				*(in->cur) = '\'';
996 				in->cur++;
997 			}
998 			else if (t_iseq(op, '\\'))
999 			{
1000 				*(in->cur) = '\\';
1001 				in->cur++;
1002 			}
1003 			COPYCHAR(in->cur, op);
1004 
1005 			clen = pg_mblen(op);
1006 			op += clen;
1007 			in->cur += clen;
1008 		}
1009 		*(in->cur) = '\'';
1010 		in->cur++;
1011 		if (curpol->weight || curpol->prefix)
1012 		{
1013 			*(in->cur) = ':';
1014 			in->cur++;
1015 			if (curpol->prefix)
1016 			{
1017 				*(in->cur) = '*';
1018 				in->cur++;
1019 			}
1020 			if (curpol->weight & (1 << 3))
1021 			{
1022 				*(in->cur) = 'A';
1023 				in->cur++;
1024 			}
1025 			if (curpol->weight & (1 << 2))
1026 			{
1027 				*(in->cur) = 'B';
1028 				in->cur++;
1029 			}
1030 			if (curpol->weight & (1 << 1))
1031 			{
1032 				*(in->cur) = 'C';
1033 				in->cur++;
1034 			}
1035 			if (curpol->weight & 1)
1036 			{
1037 				*(in->cur) = 'D';
1038 				in->cur++;
1039 			}
1040 		}
1041 		*(in->cur) = '\0';
1042 		in->curpol++;
1043 	}
1044 	else if (in->curpol->qoperator.oper == OP_NOT)
1045 	{
1046 		int			priority = QO_PRIORITY(in->curpol);
1047 
1048 		if (priority < parentPriority)
1049 		{
1050 			RESIZEBUF(in, 2);
1051 			sprintf(in->cur, "( ");
1052 			in->cur = strchr(in->cur, '\0');
1053 		}
1054 		RESIZEBUF(in, 1);
1055 		*(in->cur) = '!';
1056 		in->cur++;
1057 		*(in->cur) = '\0';
1058 		in->curpol++;
1059 
1060 		infix(in, priority, false);
1061 		if (priority < parentPriority)
1062 		{
1063 			RESIZEBUF(in, 2);
1064 			sprintf(in->cur, " )");
1065 			in->cur = strchr(in->cur, '\0');
1066 		}
1067 	}
1068 	else
1069 	{
1070 		int8		op = in->curpol->qoperator.oper;
1071 		int			priority = QO_PRIORITY(in->curpol);
1072 		int16		distance = in->curpol->qoperator.distance;
1073 		INFIX		nrm;
1074 		bool		needParenthesis = false;
1075 
1076 		in->curpol++;
1077 		if (priority < parentPriority ||
1078 		/* phrase operator depends on order */
1079 			(op == OP_PHRASE && rightPhraseOp))
1080 		{
1081 			needParenthesis = true;
1082 			RESIZEBUF(in, 2);
1083 			sprintf(in->cur, "( ");
1084 			in->cur = strchr(in->cur, '\0');
1085 		}
1086 
1087 		nrm.curpol = in->curpol;
1088 		nrm.op = in->op;
1089 		nrm.buflen = 16;
1090 		nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1091 
1092 		/* get right operand */
1093 		infix(&nrm, priority, (op == OP_PHRASE));
1094 
1095 		/* get & print left operand */
1096 		in->curpol = nrm.curpol;
1097 		infix(in, priority, false);
1098 
1099 		/* print operator & right operand */
1100 		RESIZEBUF(in, 3 + (2 + 10 /* distance */ ) + (nrm.cur - nrm.buf));
1101 		switch (op)
1102 		{
1103 			case OP_OR:
1104 				sprintf(in->cur, " | %s", nrm.buf);
1105 				break;
1106 			case OP_AND:
1107 				sprintf(in->cur, " & %s", nrm.buf);
1108 				break;
1109 			case OP_PHRASE:
1110 				if (distance != 1)
1111 					sprintf(in->cur, " <%d> %s", distance, nrm.buf);
1112 				else
1113 					sprintf(in->cur, " <-> %s", nrm.buf);
1114 				break;
1115 			default:
1116 				/* OP_NOT is handled in above if-branch */
1117 				elog(ERROR, "unrecognized operator type: %d", op);
1118 		}
1119 		in->cur = strchr(in->cur, '\0');
1120 		pfree(nrm.buf);
1121 
1122 		if (needParenthesis)
1123 		{
1124 			RESIZEBUF(in, 2);
1125 			sprintf(in->cur, " )");
1126 			in->cur = strchr(in->cur, '\0');
1127 		}
1128 	}
1129 }
1130 
1131 Datum
tsqueryout(PG_FUNCTION_ARGS)1132 tsqueryout(PG_FUNCTION_ARGS)
1133 {
1134 	TSQuery		query = PG_GETARG_TSQUERY(0);
1135 	INFIX		nrm;
1136 
1137 	if (query->size == 0)
1138 	{
1139 		char	   *b = palloc(1);
1140 
1141 		*b = '\0';
1142 		PG_RETURN_POINTER(b);
1143 	}
1144 	nrm.curpol = GETQUERY(query);
1145 	nrm.buflen = 32;
1146 	nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1147 	*(nrm.cur) = '\0';
1148 	nrm.op = GETOPERAND(query);
1149 	infix(&nrm, -1 /* lowest priority */ , false);
1150 
1151 	PG_FREE_IF_COPY(query, 0);
1152 	PG_RETURN_CSTRING(nrm.buf);
1153 }
1154 
1155 /*
1156  * Binary Input / Output functions. The binary format is as follows:
1157  *
1158  * uint32	 number of operators/operands in the query
1159  *
1160  * Followed by the operators and operands, in prefix notation. For each
1161  * operand:
1162  *
1163  * uint8	type, QI_VAL
1164  * uint8	weight
1165  *			operand text in client encoding, null-terminated
1166  * uint8	prefix
1167  *
1168  * For each operator:
1169  * uint8	type, QI_OPR
1170  * uint8	operator, one of OP_AND, OP_PHRASE OP_OR, OP_NOT.
1171  * uint16	distance (only for OP_PHRASE)
1172  */
1173 Datum
tsquerysend(PG_FUNCTION_ARGS)1174 tsquerysend(PG_FUNCTION_ARGS)
1175 {
1176 	TSQuery		query = PG_GETARG_TSQUERY(0);
1177 	StringInfoData buf;
1178 	int			i;
1179 	QueryItem  *item = GETQUERY(query);
1180 
1181 	pq_begintypsend(&buf);
1182 
1183 	pq_sendint32(&buf, query->size);
1184 	for (i = 0; i < query->size; i++)
1185 	{
1186 		pq_sendint8(&buf, item->type);
1187 
1188 		switch (item->type)
1189 		{
1190 			case QI_VAL:
1191 				pq_sendint8(&buf, item->qoperand.weight);
1192 				pq_sendint8(&buf, item->qoperand.prefix);
1193 				pq_sendstring(&buf, GETOPERAND(query) + item->qoperand.distance);
1194 				break;
1195 			case QI_OPR:
1196 				pq_sendint8(&buf, item->qoperator.oper);
1197 				if (item->qoperator.oper == OP_PHRASE)
1198 					pq_sendint16(&buf, item->qoperator.distance);
1199 				break;
1200 			default:
1201 				elog(ERROR, "unrecognized tsquery node type: %d", item->type);
1202 		}
1203 		item++;
1204 	}
1205 
1206 	PG_FREE_IF_COPY(query, 0);
1207 
1208 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
1209 }
1210 
1211 Datum
tsqueryrecv(PG_FUNCTION_ARGS)1212 tsqueryrecv(PG_FUNCTION_ARGS)
1213 {
1214 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
1215 	TSQuery		query;
1216 	int			i,
1217 				len;
1218 	QueryItem  *item;
1219 	int			datalen;
1220 	char	   *ptr;
1221 	uint32		size;
1222 	const char **operands;
1223 	bool		needcleanup;
1224 
1225 	size = pq_getmsgint(buf, sizeof(uint32));
1226 	if (size > (MaxAllocSize / sizeof(QueryItem)))
1227 		elog(ERROR, "invalid size of tsquery");
1228 
1229 	/* Allocate space to temporarily hold operand strings */
1230 	operands = palloc(size * sizeof(char *));
1231 
1232 	/* Allocate space for all the QueryItems. */
1233 	len = HDRSIZETQ + sizeof(QueryItem) * size;
1234 	query = (TSQuery) palloc0(len);
1235 	query->size = size;
1236 	item = GETQUERY(query);
1237 
1238 	datalen = 0;
1239 	for (i = 0; i < size; i++)
1240 	{
1241 		item->type = (int8) pq_getmsgint(buf, sizeof(int8));
1242 
1243 		if (item->type == QI_VAL)
1244 		{
1245 			size_t		val_len;	/* length after recoding to server
1246 									 * encoding */
1247 			uint8		weight;
1248 			uint8		prefix;
1249 			const char *val;
1250 			pg_crc32	valcrc;
1251 
1252 			weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
1253 			prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
1254 			val = pq_getmsgstring(buf);
1255 			val_len = strlen(val);
1256 
1257 			/* Sanity checks */
1258 
1259 			if (weight > 0xF)
1260 				elog(ERROR, "invalid tsquery: invalid weight bitmap");
1261 
1262 			if (val_len > MAXSTRLEN)
1263 				elog(ERROR, "invalid tsquery: operand too long");
1264 
1265 			if (datalen > MAXSTRPOS)
1266 				elog(ERROR, "invalid tsquery: total operand length exceeded");
1267 
1268 			/* Looks valid. */
1269 
1270 			INIT_LEGACY_CRC32(valcrc);
1271 			COMP_LEGACY_CRC32(valcrc, val, val_len);
1272 			FIN_LEGACY_CRC32(valcrc);
1273 
1274 			item->qoperand.weight = weight;
1275 			item->qoperand.prefix = (prefix) ? true : false;
1276 			item->qoperand.valcrc = (int32) valcrc;
1277 			item->qoperand.length = val_len;
1278 			item->qoperand.distance = datalen;
1279 
1280 			/*
1281 			 * Operand strings are copied to the final struct after this loop;
1282 			 * here we just collect them to an array
1283 			 */
1284 			operands[i] = val;
1285 
1286 			datalen += val_len + 1; /* + 1 for the '\0' terminator */
1287 		}
1288 		else if (item->type == QI_OPR)
1289 		{
1290 			int8		oper;
1291 
1292 			oper = (int8) pq_getmsgint(buf, sizeof(int8));
1293 			if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
1294 				elog(ERROR, "invalid tsquery: unrecognized operator type %d",
1295 					 (int) oper);
1296 			if (i == size - 1)
1297 				elog(ERROR, "invalid pointer to right operand");
1298 
1299 			item->qoperator.oper = oper;
1300 			if (oper == OP_PHRASE)
1301 				item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
1302 		}
1303 		else
1304 			elog(ERROR, "unrecognized tsquery node type: %d", item->type);
1305 
1306 		item++;
1307 	}
1308 
1309 	/* Enlarge buffer to make room for the operand values. */
1310 	query = (TSQuery) repalloc(query, len + datalen);
1311 	item = GETQUERY(query);
1312 	ptr = GETOPERAND(query);
1313 
1314 	/*
1315 	 * Fill in the left-pointers. Checks that the tree is well-formed as a
1316 	 * side-effect.
1317 	 */
1318 	findoprnd(item, size, &needcleanup);
1319 
1320 	/* Can't have found any QI_VALSTOP nodes */
1321 	Assert(!needcleanup);
1322 
1323 	/* Copy operands to output struct */
1324 	for (i = 0; i < size; i++)
1325 	{
1326 		if (item->type == QI_VAL)
1327 		{
1328 			memcpy(ptr, operands[i], item->qoperand.length + 1);
1329 			ptr += item->qoperand.length + 1;
1330 		}
1331 		item++;
1332 	}
1333 
1334 	pfree(operands);
1335 
1336 	Assert(ptr - GETOPERAND(query) == datalen);
1337 
1338 	SET_VARSIZE(query, len + datalen);
1339 
1340 	PG_RETURN_TSQUERY(query);
1341 }
1342 
1343 /*
1344  * debug function, used only for view query
1345  * which will be executed in non-leaf pages in index
1346  */
1347 Datum
tsquerytree(PG_FUNCTION_ARGS)1348 tsquerytree(PG_FUNCTION_ARGS)
1349 {
1350 	TSQuery		query = PG_GETARG_TSQUERY(0);
1351 	INFIX		nrm;
1352 	text	   *res;
1353 	QueryItem  *q;
1354 	int			len;
1355 
1356 	if (query->size == 0)
1357 	{
1358 		res = (text *) palloc(VARHDRSZ);
1359 		SET_VARSIZE(res, VARHDRSZ);
1360 		PG_RETURN_POINTER(res);
1361 	}
1362 
1363 	q = clean_NOT(GETQUERY(query), &len);
1364 
1365 	if (!q)
1366 	{
1367 		res = cstring_to_text("T");
1368 	}
1369 	else
1370 	{
1371 		nrm.curpol = q;
1372 		nrm.buflen = 32;
1373 		nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1374 		*(nrm.cur) = '\0';
1375 		nrm.op = GETOPERAND(query);
1376 		infix(&nrm, -1, false);
1377 		res = cstring_to_text_with_len(nrm.buf, nrm.cur - nrm.buf);
1378 		pfree(q);
1379 	}
1380 
1381 	PG_FREE_IF_COPY(query, 0);
1382 
1383 	PG_RETURN_TEXT_P(res);
1384 }
1385