1 /*-------------------------------------------------------------------------
2  *
3  * tsquery.c
4  *	  I/O functions for tsquery
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsquery.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "miscadmin.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23 #include "utils/memutils.h"
24 #include "utils/pg_crc.h"
25 
26 /* FTS operator priorities, see ts_type.h */
27 const int	tsearch_op_priority[OP_COUNT] =
28 {
29 	4,							/* OP_NOT */
30 	2,							/* OP_AND */
31 	1,							/* OP_OR */
32 	3							/* OP_PHRASE */
33 };
34 
35 /*
36  * parser's states
37  */
38 typedef enum
39 {
40 	WAITOPERAND = 1,
41 	WAITOPERATOR = 2,
42 	WAITFIRSTOPERAND = 3
43 } ts_parserstate;
44 
45 /*
46  * token types for parsing
47  */
48 typedef enum
49 {
50 	PT_END = 0,
51 	PT_ERR = 1,
52 	PT_VAL = 2,
53 	PT_OPR = 3,
54 	PT_OPEN = 4,
55 	PT_CLOSE = 5
56 } ts_tokentype;
57 
58 /*
59  * get token from query string
60  *
61  * *operator is filled in with OP_* when return values is PT_OPR,
62  * but *weight could contain a distance value in case of phrase operator.
63  * *strval, *lenval and *weight are filled in when return value is PT_VAL
64  *
65  */
66 typedef ts_tokentype (*ts_tokenizer) (TSQueryParserState state, int8 *operator,
67 									  int *lenval, char **strval,
68 									  int16 *weight, bool *prefix);
69 
70 struct TSQueryParserStateData
71 {
72 	/* Tokenizer used for parsing tsquery */
73 	ts_tokenizer gettoken;
74 
75 	/* State of tokenizer function */
76 	char	   *buffer;			/* entire string we are scanning */
77 	char	   *buf;			/* current scan point */
78 	int			count;			/* nesting count, incremented by (,
79 								 * decremented by ) */
80 	ts_parserstate state;
81 
82 	/* polish (prefix) notation in list, filled in by push* functions */
83 	List	   *polstr;
84 
85 	/*
86 	 * Strings from operands are collected in op. curop is a pointer to the
87 	 * end of used space of op.
88 	 */
89 	char	   *op;
90 	char	   *curop;
91 	int			lenop;			/* allocated size of op */
92 	int			sumlen;			/* used size of op */
93 
94 	/* state for value's parser */
95 	TSVectorParseState valstate;
96 };
97 
98 /*
99  * subroutine to parse the modifiers (weight and prefix flag currently)
100  * part, like ':AB*' of a query.
101  */
102 static char *
get_modifiers(char * buf,int16 * weight,bool * prefix)103 get_modifiers(char *buf, int16 *weight, bool *prefix)
104 {
105 	*weight = 0;
106 	*prefix = false;
107 
108 	if (!t_iseq(buf, ':'))
109 		return buf;
110 
111 	buf++;
112 	while (*buf && pg_mblen(buf) == 1)
113 	{
114 		switch (*buf)
115 		{
116 			case 'a':
117 			case 'A':
118 				*weight |= 1 << 3;
119 				break;
120 			case 'b':
121 			case 'B':
122 				*weight |= 1 << 2;
123 				break;
124 			case 'c':
125 			case 'C':
126 				*weight |= 1 << 1;
127 				break;
128 			case 'd':
129 			case 'D':
130 				*weight |= 1;
131 				break;
132 			case '*':
133 				*prefix = true;
134 				break;
135 			default:
136 				return buf;
137 		}
138 		buf++;
139 	}
140 
141 	return buf;
142 }
143 
144 /*
145  * Parse phrase operator. The operator
146  * may take the following forms:
147  *
148  *		a <N> b (distance is exactly N lexemes)
149  *		a <-> b (default distance = 1)
150  *
151  * The buffer should begin with '<' char
152  */
153 static bool
parse_phrase_operator(TSQueryParserState pstate,int16 * distance)154 parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
155 {
156 	enum
157 	{
158 		PHRASE_OPEN = 0,
159 		PHRASE_DIST,
160 		PHRASE_CLOSE,
161 		PHRASE_FINISH
162 	}			state = PHRASE_OPEN;
163 	char	   *ptr = pstate->buf;
164 	char	   *endptr;
165 	long		l = 1;			/* default distance */
166 
167 	while (*ptr)
168 	{
169 		switch (state)
170 		{
171 			case PHRASE_OPEN:
172 				if (t_iseq(ptr, '<'))
173 				{
174 					state = PHRASE_DIST;
175 					ptr++;
176 				}
177 				else
178 					return false;
179 				break;
180 
181 			case PHRASE_DIST:
182 				if (t_iseq(ptr, '-'))
183 				{
184 					state = PHRASE_CLOSE;
185 					ptr++;
186 					continue;
187 				}
188 
189 				if (!t_isdigit(ptr))
190 					return false;
191 
192 				errno = 0;
193 				l = strtol(ptr, &endptr, 10);
194 				if (ptr == endptr)
195 					return false;
196 				else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS)
197 					ereport(ERROR,
198 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
199 							 errmsg("distance in phrase operator must be an integer value between zero and %d inclusive",
200 									MAXENTRYPOS)));
201 				else
202 				{
203 					state = PHRASE_CLOSE;
204 					ptr = endptr;
205 				}
206 				break;
207 
208 			case PHRASE_CLOSE:
209 				if (t_iseq(ptr, '>'))
210 				{
211 					state = PHRASE_FINISH;
212 					ptr++;
213 				}
214 				else
215 					return false;
216 				break;
217 
218 			case PHRASE_FINISH:
219 				*distance = (int16) l;
220 				pstate->buf = ptr;
221 				return true;
222 		}
223 	}
224 
225 	return false;
226 }
227 
228 /*
229  * Parse OR operator used in websearch_to_tsquery(), returns true if we
230  * believe that "OR" literal could be an operator OR
231  */
232 static bool
parse_or_operator(TSQueryParserState pstate)233 parse_or_operator(TSQueryParserState pstate)
234 {
235 	char	   *ptr = pstate->buf;
236 
237 	/* it should begin with "OR" literal */
238 	if (pg_strncasecmp(ptr, "or", 2) != 0)
239 		return false;
240 
241 	ptr += 2;
242 
243 	/*
244 	 * it shouldn't be a part of any word but somewhere later it should be
245 	 * some operand
246 	 */
247 	if (*ptr == '\0')			/* no operand */
248 		return false;
249 
250 	/* it shouldn't be a part of any word */
251 	if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
252 		return false;
253 
254 	for (;;)
255 	{
256 		ptr += pg_mblen(ptr);
257 
258 		if (*ptr == '\0')		/* got end of string without operand */
259 			return false;
260 
261 		/*
262 		 * Suppose, we found an operand, but could be a not correct operand.
263 		 * So we still treat OR literal as operation with possibly incorrect
264 		 * operand and  will not search it as lexeme
265 		 */
266 		if (!t_isspace(ptr))
267 			break;
268 	}
269 
270 	pstate->buf += 2;
271 	return true;
272 }
273 
274 static ts_tokentype
gettoken_query_standard(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)275 gettoken_query_standard(TSQueryParserState state, int8 *operator,
276 						int *lenval, char **strval,
277 						int16 *weight, bool *prefix)
278 {
279 	*weight = 0;
280 	*prefix = false;
281 
282 	while (true)
283 	{
284 		switch (state->state)
285 		{
286 			case WAITFIRSTOPERAND:
287 			case WAITOPERAND:
288 				if (t_iseq(state->buf, '!'))
289 				{
290 					state->buf++;
291 					state->state = WAITOPERAND;
292 					*operator = OP_NOT;
293 					return PT_OPR;
294 				}
295 				else if (t_iseq(state->buf, '('))
296 				{
297 					state->buf++;
298 					state->state = WAITOPERAND;
299 					state->count++;
300 					return PT_OPEN;
301 				}
302 				else if (t_iseq(state->buf, ':'))
303 				{
304 					ereport(ERROR,
305 							(errcode(ERRCODE_SYNTAX_ERROR),
306 							 errmsg("syntax error in tsquery: \"%s\"",
307 									state->buffer)));
308 				}
309 				else if (!t_isspace(state->buf))
310 				{
311 					/*
312 					 * We rely on the tsvector parser to parse the value for
313 					 * us
314 					 */
315 					reset_tsvector_parser(state->valstate, state->buf);
316 					if (gettoken_tsvector(state->valstate, strval, lenval,
317 										  NULL, NULL, &state->buf))
318 					{
319 						state->buf = get_modifiers(state->buf, weight, prefix);
320 						state->state = WAITOPERATOR;
321 						return PT_VAL;
322 					}
323 					else if (state->state == WAITFIRSTOPERAND)
324 					{
325 						return PT_END;
326 					}
327 					else
328 						ereport(ERROR,
329 								(errcode(ERRCODE_SYNTAX_ERROR),
330 								 errmsg("no operand in tsquery: \"%s\"",
331 										state->buffer)));
332 				}
333 				break;
334 
335 			case WAITOPERATOR:
336 				if (t_iseq(state->buf, '&'))
337 				{
338 					state->buf++;
339 					state->state = WAITOPERAND;
340 					*operator = OP_AND;
341 					return PT_OPR;
342 				}
343 				else if (t_iseq(state->buf, '|'))
344 				{
345 					state->buf++;
346 					state->state = WAITOPERAND;
347 					*operator = OP_OR;
348 					return PT_OPR;
349 				}
350 				else if (parse_phrase_operator(state, weight))
351 				{
352 					/* weight var is used as storage for distance */
353 					state->state = WAITOPERAND;
354 					*operator = OP_PHRASE;
355 					return PT_OPR;
356 				}
357 				else if (t_iseq(state->buf, ')'))
358 				{
359 					state->buf++;
360 					state->count--;
361 					return (state->count < 0) ? PT_ERR : PT_CLOSE;
362 				}
363 				else if (*state->buf == '\0')
364 				{
365 					return (state->count) ? PT_ERR : PT_END;
366 				}
367 				else if (!t_isspace(state->buf))
368 				{
369 					return PT_ERR;
370 				}
371 				break;
372 		}
373 
374 		state->buf += pg_mblen(state->buf);
375 	}
376 }
377 
378 static ts_tokentype
gettoken_query_websearch(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)379 gettoken_query_websearch(TSQueryParserState state, int8 *operator,
380 						 int *lenval, char **strval,
381 						 int16 *weight, bool *prefix)
382 {
383 	*weight = 0;
384 	*prefix = false;
385 
386 	while (true)
387 	{
388 		switch (state->state)
389 		{
390 			case WAITFIRSTOPERAND:
391 			case WAITOPERAND:
392 				if (t_iseq(state->buf, '-'))
393 				{
394 					state->buf++;
395 					state->state = WAITOPERAND;
396 
397 					*operator = OP_NOT;
398 					return PT_OPR;
399 				}
400 				else if (t_iseq(state->buf, '"'))
401 				{
402 					/* Everything in quotes is processed as a single token */
403 
404 					/* skip opening quote */
405 					state->buf++;
406 					*strval = state->buf;
407 
408 					/* iterate to the closing quote or end of the string */
409 					while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
410 						state->buf++;
411 					*lenval = state->buf - *strval;
412 
413 					/* skip closing quote if not end of the string */
414 					if (*state->buf != '\0')
415 						state->buf++;
416 
417 					state->state = WAITOPERATOR;
418 					state->count++;
419 					return PT_VAL;
420 				}
421 				else if (ISOPERATOR(state->buf))
422 				{
423 					/* or else gettoken_tsvector() will raise an error */
424 					state->buf++;
425 					state->state = WAITOPERAND;
426 					continue;
427 				}
428 				else if (!t_isspace(state->buf))
429 				{
430 					/*
431 					 * We rely on the tsvector parser to parse the value for
432 					 * us
433 					 */
434 					reset_tsvector_parser(state->valstate, state->buf);
435 					if (gettoken_tsvector(state->valstate, strval, lenval,
436 										  NULL, NULL, &state->buf))
437 					{
438 						state->state = WAITOPERATOR;
439 						return PT_VAL;
440 					}
441 					else if (state->state == WAITFIRSTOPERAND)
442 					{
443 						return PT_END;
444 					}
445 					else
446 					{
447 						/* finally, we have to provide an operand */
448 						pushStop(state);
449 						return PT_END;
450 					}
451 				}
452 				break;
453 
454 			case WAITOPERATOR:
455 				if (t_iseq(state->buf, '"'))
456 				{
457 					/*
458 					 * put implicit AND after an operand and handle this quote
459 					 * in WAITOPERAND
460 					 */
461 					state->state = WAITOPERAND;
462 					*operator = OP_AND;
463 					return PT_OPR;
464 				}
465 				else if (parse_or_operator(state))
466 				{
467 					state->state = WAITOPERAND;
468 					*operator = OP_OR;
469 					return PT_OPR;
470 				}
471 				else if (*state->buf == '\0')
472 				{
473 					return PT_END;
474 				}
475 				else if (!t_isspace(state->buf))
476 				{
477 					/* put implicit AND after an operand */
478 					*operator = OP_AND;
479 					state->state = WAITOPERAND;
480 					return PT_OPR;
481 				}
482 				break;
483 		}
484 
485 		state->buf += pg_mblen(state->buf);
486 	}
487 }
488 
489 static ts_tokentype
gettoken_query_plain(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)490 gettoken_query_plain(TSQueryParserState state, int8 *operator,
491 					 int *lenval, char **strval,
492 					 int16 *weight, bool *prefix)
493 {
494 	*weight = 0;
495 	*prefix = false;
496 
497 	if (*state->buf == '\0')
498 		return PT_END;
499 
500 	*strval = state->buf;
501 	*lenval = strlen(state->buf);
502 	state->buf += *lenval;
503 	state->count++;
504 	return PT_VAL;
505 }
506 
507 /*
508  * Push an operator to state->polstr
509  */
510 void
pushOperator(TSQueryParserState state,int8 oper,int16 distance)511 pushOperator(TSQueryParserState state, int8 oper, int16 distance)
512 {
513 	QueryOperator *tmp;
514 
515 	Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);
516 
517 	tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
518 	tmp->type = QI_OPR;
519 	tmp->oper = oper;
520 	tmp->distance = (oper == OP_PHRASE) ? distance : 0;
521 	/* left is filled in later with findoprnd */
522 
523 	state->polstr = lcons(tmp, state->polstr);
524 }
525 
526 static void
pushValue_internal(TSQueryParserState state,pg_crc32 valcrc,int distance,int lenval,int weight,bool prefix)527 pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
528 {
529 	QueryOperand *tmp;
530 
531 	if (distance >= MAXSTRPOS)
532 		ereport(ERROR,
533 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
534 				 errmsg("value is too big in tsquery: \"%s\"",
535 						state->buffer)));
536 	if (lenval >= MAXSTRLEN)
537 		ereport(ERROR,
538 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
539 				 errmsg("operand is too long in tsquery: \"%s\"",
540 						state->buffer)));
541 
542 	tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
543 	tmp->type = QI_VAL;
544 	tmp->weight = weight;
545 	tmp->prefix = prefix;
546 	tmp->valcrc = (int32) valcrc;
547 	tmp->length = lenval;
548 	tmp->distance = distance;
549 
550 	state->polstr = lcons(tmp, state->polstr);
551 }
552 
553 /*
554  * Push an operand to state->polstr.
555  *
556  * strval must point to a string equal to state->curop. lenval is the length
557  * of the string.
558  */
559 void
pushValue(TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)560 pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
561 {
562 	pg_crc32	valcrc;
563 
564 	if (lenval >= MAXSTRLEN)
565 		ereport(ERROR,
566 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
567 				 errmsg("word is too long in tsquery: \"%s\"",
568 						state->buffer)));
569 
570 	INIT_LEGACY_CRC32(valcrc);
571 	COMP_LEGACY_CRC32(valcrc, strval, lenval);
572 	FIN_LEGACY_CRC32(valcrc);
573 	pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight, prefix);
574 
575 	/* append the value string to state.op, enlarging buffer if needed first */
576 	while (state->curop - state->op + lenval + 1 >= state->lenop)
577 	{
578 		int			used = state->curop - state->op;
579 
580 		state->lenop *= 2;
581 		state->op = (char *) repalloc((void *) state->op, state->lenop);
582 		state->curop = state->op + used;
583 	}
584 	memcpy((void *) state->curop, (void *) strval, lenval);
585 	state->curop += lenval;
586 	*(state->curop) = '\0';
587 	state->curop++;
588 	state->sumlen += lenval + 1 /* \0 */ ;
589 }
590 
591 
592 /*
593  * Push a stopword placeholder to state->polstr
594  */
595 void
pushStop(TSQueryParserState state)596 pushStop(TSQueryParserState state)
597 {
598 	QueryOperand *tmp;
599 
600 	tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
601 	tmp->type = QI_VALSTOP;
602 
603 	state->polstr = lcons(tmp, state->polstr);
604 }
605 
606 
607 #define STACKDEPTH	32
608 
609 typedef struct OperatorElement
610 {
611 	int8		op;
612 	int16		distance;
613 } OperatorElement;
614 
615 static void
pushOpStack(OperatorElement * stack,int * lenstack,int8 op,int16 distance)616 pushOpStack(OperatorElement *stack, int *lenstack, int8 op, int16 distance)
617 {
618 	if (*lenstack == STACKDEPTH)	/* internal error */
619 		elog(ERROR, "tsquery stack too small");
620 
621 	stack[*lenstack].op = op;
622 	stack[*lenstack].distance = distance;
623 
624 	(*lenstack)++;
625 }
626 
627 static void
cleanOpStack(TSQueryParserState state,OperatorElement * stack,int * lenstack,int8 op)628 cleanOpStack(TSQueryParserState state,
629 			 OperatorElement *stack, int *lenstack, int8 op)
630 {
631 	int			opPriority = OP_PRIORITY(op);
632 
633 	while (*lenstack)
634 	{
635 		/* NOT is right associative unlike to others */
636 		if ((op != OP_NOT && opPriority > OP_PRIORITY(stack[*lenstack - 1].op)) ||
637 			(op == OP_NOT && opPriority >= OP_PRIORITY(stack[*lenstack - 1].op)))
638 			break;
639 
640 		(*lenstack)--;
641 		pushOperator(state, stack[*lenstack].op,
642 					 stack[*lenstack].distance);
643 	}
644 }
645 
646 /*
647  * Make polish (prefix) notation of query.
648  *
649  * See parse_tsquery for explanation of pushval.
650  */
651 static void
makepol(TSQueryParserState state,PushFunction pushval,Datum opaque)652 makepol(TSQueryParserState state,
653 		PushFunction pushval,
654 		Datum opaque)
655 {
656 	int8		operator = 0;
657 	ts_tokentype type;
658 	int			lenval = 0;
659 	char	   *strval = NULL;
660 	OperatorElement opstack[STACKDEPTH];
661 	int			lenstack = 0;
662 	int16		weight = 0;
663 	bool		prefix;
664 
665 	/* since this function recurses, it could be driven to stack overflow */
666 	check_stack_depth();
667 
668 	while ((type = state->gettoken(state, &operator,
669 								   &lenval, &strval,
670 								   &weight, &prefix)) != PT_END)
671 	{
672 		switch (type)
673 		{
674 			case PT_VAL:
675 				pushval(opaque, state, strval, lenval, weight, prefix);
676 				break;
677 			case PT_OPR:
678 				cleanOpStack(state, opstack, &lenstack, operator);
679 				pushOpStack(opstack, &lenstack, operator, weight);
680 				break;
681 			case PT_OPEN:
682 				makepol(state, pushval, opaque);
683 				break;
684 			case PT_CLOSE:
685 				cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
686 				return;
687 			case PT_ERR:
688 			default:
689 				ereport(ERROR,
690 						(errcode(ERRCODE_SYNTAX_ERROR),
691 						 errmsg("syntax error in tsquery: \"%s\"",
692 								state->buffer)));
693 		}
694 	}
695 
696 	cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
697 }
698 
699 static void
findoprnd_recurse(QueryItem * ptr,uint32 * pos,int nnodes,bool * needcleanup)700 findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
701 {
702 	/* since this function recurses, it could be driven to stack overflow. */
703 	check_stack_depth();
704 
705 	if (*pos >= nnodes)
706 		elog(ERROR, "malformed tsquery: operand not found");
707 
708 	if (ptr[*pos].type == QI_VAL)
709 	{
710 		(*pos)++;
711 	}
712 	else if (ptr[*pos].type == QI_VALSTOP)
713 	{
714 		*needcleanup = true;	/* we'll have to remove stop words */
715 		(*pos)++;
716 	}
717 	else
718 	{
719 		Assert(ptr[*pos].type == QI_OPR);
720 
721 		if (ptr[*pos].qoperator.oper == OP_NOT)
722 		{
723 			ptr[*pos].qoperator.left = 1;	/* fixed offset */
724 			(*pos)++;
725 
726 			/* process the only argument */
727 			findoprnd_recurse(ptr, pos, nnodes, needcleanup);
728 		}
729 		else
730 		{
731 			QueryOperator *curitem = &ptr[*pos].qoperator;
732 			int			tmp = *pos; /* save current position */
733 
734 			Assert(curitem->oper == OP_AND ||
735 				   curitem->oper == OP_OR ||
736 				   curitem->oper == OP_PHRASE);
737 
738 			(*pos)++;
739 
740 			/* process RIGHT argument */
741 			findoprnd_recurse(ptr, pos, nnodes, needcleanup);
742 
743 			curitem->left = *pos - tmp; /* set LEFT arg's offset */
744 
745 			/* process LEFT argument */
746 			findoprnd_recurse(ptr, pos, nnodes, needcleanup);
747 		}
748 	}
749 }
750 
751 
752 /*
753  * Fill in the left-fields previously left unfilled.
754  * The input QueryItems must be in polish (prefix) notation.
755  * Also, set *needcleanup to true if there are any QI_VALSTOP nodes.
756  */
757 static void
findoprnd(QueryItem * ptr,int size,bool * needcleanup)758 findoprnd(QueryItem *ptr, int size, bool *needcleanup)
759 {
760 	uint32		pos;
761 
762 	*needcleanup = false;
763 	pos = 0;
764 	findoprnd_recurse(ptr, &pos, size, needcleanup);
765 
766 	if (pos != size)
767 		elog(ERROR, "malformed tsquery: extra nodes");
768 }
769 
770 
771 /*
772  * Each value (operand) in the query is passed to pushval. pushval can
773  * transform the simple value to an arbitrarily complex expression using
774  * pushValue and pushOperator. It must push a single value with pushValue,
775  * a complete expression with all operands, or a stopword placeholder
776  * with pushStop, otherwise the prefix notation representation will be broken,
777  * having an operator with no operand.
778  *
779  * opaque is passed on to pushval as is, pushval can use it to store its
780  * private state.
781  */
782 TSQuery
parse_tsquery(char * buf,PushFunction pushval,Datum opaque,int flags)783 parse_tsquery(char *buf,
784 			  PushFunction pushval,
785 			  Datum opaque,
786 			  int flags)
787 {
788 	struct TSQueryParserStateData state;
789 	int			i;
790 	TSQuery		query;
791 	int			commonlen;
792 	QueryItem  *ptr;
793 	ListCell   *cell;
794 	bool		needcleanup;
795 	int			tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY;
796 
797 	/* plain should not be used with web */
798 	Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB));
799 
800 	/* select suitable tokenizer */
801 	if (flags & P_TSQ_PLAIN)
802 		state.gettoken = gettoken_query_plain;
803 	else if (flags & P_TSQ_WEB)
804 	{
805 		state.gettoken = gettoken_query_websearch;
806 		tsv_flags |= P_TSV_IS_WEB;
807 	}
808 	else
809 		state.gettoken = gettoken_query_standard;
810 
811 	/* init state */
812 	state.buffer = buf;
813 	state.buf = buf;
814 	state.count = 0;
815 	state.state = WAITFIRSTOPERAND;
816 	state.polstr = NIL;
817 
818 	/* init value parser's state */
819 	state.valstate = init_tsvector_parser(state.buffer, tsv_flags);
820 
821 	/* init list of operand */
822 	state.sumlen = 0;
823 	state.lenop = 64;
824 	state.curop = state.op = (char *) palloc(state.lenop);
825 	*(state.curop) = '\0';
826 
827 	/* parse query & make polish notation (postfix, but in reverse order) */
828 	makepol(&state, pushval, opaque);
829 
830 	close_tsvector_parser(state.valstate);
831 
832 	if (list_length(state.polstr) == 0)
833 	{
834 		ereport(NOTICE,
835 				(errmsg("text-search query doesn't contain lexemes: \"%s\"",
836 						state.buffer)));
837 		query = (TSQuery) palloc(HDRSIZETQ);
838 		SET_VARSIZE(query, HDRSIZETQ);
839 		query->size = 0;
840 		return query;
841 	}
842 
843 	if (TSQUERY_TOO_BIG(list_length(state.polstr), state.sumlen))
844 		ereport(ERROR,
845 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
846 				 errmsg("tsquery is too large")));
847 	commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen);
848 
849 	/* Pack the QueryItems in the final TSQuery struct to return to caller */
850 	query = (TSQuery) palloc0(commonlen);
851 	SET_VARSIZE(query, commonlen);
852 	query->size = list_length(state.polstr);
853 	ptr = GETQUERY(query);
854 
855 	/* Copy QueryItems to TSQuery */
856 	i = 0;
857 	foreach(cell, state.polstr)
858 	{
859 		QueryItem  *item = (QueryItem *) lfirst(cell);
860 
861 		switch (item->type)
862 		{
863 			case QI_VAL:
864 				memcpy(&ptr[i], item, sizeof(QueryOperand));
865 				break;
866 			case QI_VALSTOP:
867 				ptr[i].type = QI_VALSTOP;
868 				break;
869 			case QI_OPR:
870 				memcpy(&ptr[i], item, sizeof(QueryOperator));
871 				break;
872 			default:
873 				elog(ERROR, "unrecognized QueryItem type: %d", item->type);
874 		}
875 		i++;
876 	}
877 
878 	/* Copy all the operand strings to TSQuery */
879 	memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
880 	pfree(state.op);
881 
882 	/*
883 	 * Set left operand pointers for every operator.  While we're at it,
884 	 * detect whether there are any QI_VALSTOP nodes.
885 	 */
886 	findoprnd(ptr, query->size, &needcleanup);
887 
888 	/*
889 	 * If there are QI_VALSTOP nodes, delete them and simplify the tree.
890 	 */
891 	if (needcleanup)
892 		query = cleanup_tsquery_stopwords(query);
893 
894 	return query;
895 }
896 
897 static void
pushval_asis(Datum opaque,TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)898 pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
899 			 int16 weight, bool prefix)
900 {
901 	pushValue(state, strval, lenval, weight, prefix);
902 }
903 
904 /*
905  * in without morphology
906  */
907 Datum
tsqueryin(PG_FUNCTION_ARGS)908 tsqueryin(PG_FUNCTION_ARGS)
909 {
910 	char	   *in = PG_GETARG_CSTRING(0);
911 
912 	PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0));
913 }
914 
915 /*
916  * out function
917  */
918 typedef struct
919 {
920 	QueryItem  *curpol;
921 	char	   *buf;
922 	char	   *cur;
923 	char	   *op;
924 	int			buflen;
925 } INFIX;
926 
927 /* Makes sure inf->buf is large enough for adding 'addsize' bytes */
928 #define RESIZEBUF(inf, addsize) \
929 while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
930 { \
931 	int len = (inf)->cur - (inf)->buf; \
932 	(inf)->buflen *= 2; \
933 	(inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \
934 	(inf)->cur = (inf)->buf + len; \
935 }
936 
937 /*
938  * recursively traverse the tree and
939  * print it in infix (human-readable) form
940  */
941 static void
infix(INFIX * in,int parentPriority,bool rightPhraseOp)942 infix(INFIX *in, int parentPriority, bool rightPhraseOp)
943 {
944 	/* since this function recurses, it could be driven to stack overflow. */
945 	check_stack_depth();
946 
947 	if (in->curpol->type == QI_VAL)
948 	{
949 		QueryOperand *curpol = &in->curpol->qoperand;
950 		char	   *op = in->op + curpol->distance;
951 		int			clen;
952 
953 		RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
954 		*(in->cur) = '\'';
955 		in->cur++;
956 		while (*op)
957 		{
958 			if (t_iseq(op, '\''))
959 			{
960 				*(in->cur) = '\'';
961 				in->cur++;
962 			}
963 			else if (t_iseq(op, '\\'))
964 			{
965 				*(in->cur) = '\\';
966 				in->cur++;
967 			}
968 			COPYCHAR(in->cur, op);
969 
970 			clen = pg_mblen(op);
971 			op += clen;
972 			in->cur += clen;
973 		}
974 		*(in->cur) = '\'';
975 		in->cur++;
976 		if (curpol->weight || curpol->prefix)
977 		{
978 			*(in->cur) = ':';
979 			in->cur++;
980 			if (curpol->prefix)
981 			{
982 				*(in->cur) = '*';
983 				in->cur++;
984 			}
985 			if (curpol->weight & (1 << 3))
986 			{
987 				*(in->cur) = 'A';
988 				in->cur++;
989 			}
990 			if (curpol->weight & (1 << 2))
991 			{
992 				*(in->cur) = 'B';
993 				in->cur++;
994 			}
995 			if (curpol->weight & (1 << 1))
996 			{
997 				*(in->cur) = 'C';
998 				in->cur++;
999 			}
1000 			if (curpol->weight & 1)
1001 			{
1002 				*(in->cur) = 'D';
1003 				in->cur++;
1004 			}
1005 		}
1006 		*(in->cur) = '\0';
1007 		in->curpol++;
1008 	}
1009 	else if (in->curpol->qoperator.oper == OP_NOT)
1010 	{
1011 		int			priority = QO_PRIORITY(in->curpol);
1012 
1013 		if (priority < parentPriority)
1014 		{
1015 			RESIZEBUF(in, 2);
1016 			sprintf(in->cur, "( ");
1017 			in->cur = strchr(in->cur, '\0');
1018 		}
1019 		RESIZEBUF(in, 1);
1020 		*(in->cur) = '!';
1021 		in->cur++;
1022 		*(in->cur) = '\0';
1023 		in->curpol++;
1024 
1025 		infix(in, priority, false);
1026 		if (priority < parentPriority)
1027 		{
1028 			RESIZEBUF(in, 2);
1029 			sprintf(in->cur, " )");
1030 			in->cur = strchr(in->cur, '\0');
1031 		}
1032 	}
1033 	else
1034 	{
1035 		int8		op = in->curpol->qoperator.oper;
1036 		int			priority = QO_PRIORITY(in->curpol);
1037 		int16		distance = in->curpol->qoperator.distance;
1038 		INFIX		nrm;
1039 		bool		needParenthesis = false;
1040 
1041 		in->curpol++;
1042 		if (priority < parentPriority ||
1043 		/* phrase operator depends on order */
1044 			(op == OP_PHRASE && rightPhraseOp))
1045 		{
1046 			needParenthesis = true;
1047 			RESIZEBUF(in, 2);
1048 			sprintf(in->cur, "( ");
1049 			in->cur = strchr(in->cur, '\0');
1050 		}
1051 
1052 		nrm.curpol = in->curpol;
1053 		nrm.op = in->op;
1054 		nrm.buflen = 16;
1055 		nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1056 
1057 		/* get right operand */
1058 		infix(&nrm, priority, (op == OP_PHRASE));
1059 
1060 		/* get & print left operand */
1061 		in->curpol = nrm.curpol;
1062 		infix(in, priority, false);
1063 
1064 		/* print operator & right operand */
1065 		RESIZEBUF(in, 3 + (2 + 10 /* distance */ ) + (nrm.cur - nrm.buf));
1066 		switch (op)
1067 		{
1068 			case OP_OR:
1069 				sprintf(in->cur, " | %s", nrm.buf);
1070 				break;
1071 			case OP_AND:
1072 				sprintf(in->cur, " & %s", nrm.buf);
1073 				break;
1074 			case OP_PHRASE:
1075 				if (distance != 1)
1076 					sprintf(in->cur, " <%d> %s", distance, nrm.buf);
1077 				else
1078 					sprintf(in->cur, " <-> %s", nrm.buf);
1079 				break;
1080 			default:
1081 				/* OP_NOT is handled in above if-branch */
1082 				elog(ERROR, "unrecognized operator type: %d", op);
1083 		}
1084 		in->cur = strchr(in->cur, '\0');
1085 		pfree(nrm.buf);
1086 
1087 		if (needParenthesis)
1088 		{
1089 			RESIZEBUF(in, 2);
1090 			sprintf(in->cur, " )");
1091 			in->cur = strchr(in->cur, '\0');
1092 		}
1093 	}
1094 }
1095 
1096 Datum
tsqueryout(PG_FUNCTION_ARGS)1097 tsqueryout(PG_FUNCTION_ARGS)
1098 {
1099 	TSQuery		query = PG_GETARG_TSQUERY(0);
1100 	INFIX		nrm;
1101 
1102 	if (query->size == 0)
1103 	{
1104 		char	   *b = palloc(1);
1105 
1106 		*b = '\0';
1107 		PG_RETURN_POINTER(b);
1108 	}
1109 	nrm.curpol = GETQUERY(query);
1110 	nrm.buflen = 32;
1111 	nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1112 	*(nrm.cur) = '\0';
1113 	nrm.op = GETOPERAND(query);
1114 	infix(&nrm, -1 /* lowest priority */ , false);
1115 
1116 	PG_FREE_IF_COPY(query, 0);
1117 	PG_RETURN_CSTRING(nrm.buf);
1118 }
1119 
1120 /*
1121  * Binary Input / Output functions. The binary format is as follows:
1122  *
1123  * uint32	 number of operators/operands in the query
1124  *
1125  * Followed by the operators and operands, in prefix notation. For each
1126  * operand:
1127  *
1128  * uint8	type, QI_VAL
1129  * uint8	weight
1130  *			operand text in client encoding, null-terminated
1131  * uint8	prefix
1132  *
1133  * For each operator:
1134  * uint8	type, QI_OPR
1135  * uint8	operator, one of OP_AND, OP_PHRASE OP_OR, OP_NOT.
1136  * uint16	distance (only for OP_PHRASE)
1137  */
1138 Datum
tsquerysend(PG_FUNCTION_ARGS)1139 tsquerysend(PG_FUNCTION_ARGS)
1140 {
1141 	TSQuery		query = PG_GETARG_TSQUERY(0);
1142 	StringInfoData buf;
1143 	int			i;
1144 	QueryItem  *item = GETQUERY(query);
1145 
1146 	pq_begintypsend(&buf);
1147 
1148 	pq_sendint32(&buf, query->size);
1149 	for (i = 0; i < query->size; i++)
1150 	{
1151 		pq_sendint8(&buf, item->type);
1152 
1153 		switch (item->type)
1154 		{
1155 			case QI_VAL:
1156 				pq_sendint8(&buf, item->qoperand.weight);
1157 				pq_sendint8(&buf, item->qoperand.prefix);
1158 				pq_sendstring(&buf, GETOPERAND(query) + item->qoperand.distance);
1159 				break;
1160 			case QI_OPR:
1161 				pq_sendint8(&buf, item->qoperator.oper);
1162 				if (item->qoperator.oper == OP_PHRASE)
1163 					pq_sendint16(&buf, item->qoperator.distance);
1164 				break;
1165 			default:
1166 				elog(ERROR, "unrecognized tsquery node type: %d", item->type);
1167 		}
1168 		item++;
1169 	}
1170 
1171 	PG_FREE_IF_COPY(query, 0);
1172 
1173 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
1174 }
1175 
1176 Datum
tsqueryrecv(PG_FUNCTION_ARGS)1177 tsqueryrecv(PG_FUNCTION_ARGS)
1178 {
1179 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
1180 	TSQuery		query;
1181 	int			i,
1182 				len;
1183 	QueryItem  *item;
1184 	int			datalen;
1185 	char	   *ptr;
1186 	uint32		size;
1187 	const char **operands;
1188 	bool		needcleanup;
1189 
1190 	size = pq_getmsgint(buf, sizeof(uint32));
1191 	if (size > (MaxAllocSize / sizeof(QueryItem)))
1192 		elog(ERROR, "invalid size of tsquery");
1193 
1194 	/* Allocate space to temporarily hold operand strings */
1195 	operands = palloc(size * sizeof(char *));
1196 
1197 	/* Allocate space for all the QueryItems. */
1198 	len = HDRSIZETQ + sizeof(QueryItem) * size;
1199 	query = (TSQuery) palloc0(len);
1200 	query->size = size;
1201 	item = GETQUERY(query);
1202 
1203 	datalen = 0;
1204 	for (i = 0; i < size; i++)
1205 	{
1206 		item->type = (int8) pq_getmsgint(buf, sizeof(int8));
1207 
1208 		if (item->type == QI_VAL)
1209 		{
1210 			size_t		val_len;	/* length after recoding to server
1211 									 * encoding */
1212 			uint8		weight;
1213 			uint8		prefix;
1214 			const char *val;
1215 			pg_crc32	valcrc;
1216 
1217 			weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
1218 			prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
1219 			val = pq_getmsgstring(buf);
1220 			val_len = strlen(val);
1221 
1222 			/* Sanity checks */
1223 
1224 			if (weight > 0xF)
1225 				elog(ERROR, "invalid tsquery: invalid weight bitmap");
1226 
1227 			if (val_len > MAXSTRLEN)
1228 				elog(ERROR, "invalid tsquery: operand too long");
1229 
1230 			if (datalen > MAXSTRPOS)
1231 				elog(ERROR, "invalid tsquery: total operand length exceeded");
1232 
1233 			/* Looks valid. */
1234 
1235 			INIT_LEGACY_CRC32(valcrc);
1236 			COMP_LEGACY_CRC32(valcrc, val, val_len);
1237 			FIN_LEGACY_CRC32(valcrc);
1238 
1239 			item->qoperand.weight = weight;
1240 			item->qoperand.prefix = (prefix) ? true : false;
1241 			item->qoperand.valcrc = (int32) valcrc;
1242 			item->qoperand.length = val_len;
1243 			item->qoperand.distance = datalen;
1244 
1245 			/*
1246 			 * Operand strings are copied to the final struct after this loop;
1247 			 * here we just collect them to an array
1248 			 */
1249 			operands[i] = val;
1250 
1251 			datalen += val_len + 1; /* + 1 for the '\0' terminator */
1252 		}
1253 		else if (item->type == QI_OPR)
1254 		{
1255 			int8		oper;
1256 
1257 			oper = (int8) pq_getmsgint(buf, sizeof(int8));
1258 			if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
1259 				elog(ERROR, "invalid tsquery: unrecognized operator type %d",
1260 					 (int) oper);
1261 			if (i == size - 1)
1262 				elog(ERROR, "invalid pointer to right operand");
1263 
1264 			item->qoperator.oper = oper;
1265 			if (oper == OP_PHRASE)
1266 				item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
1267 		}
1268 		else
1269 			elog(ERROR, "unrecognized tsquery node type: %d", item->type);
1270 
1271 		item++;
1272 	}
1273 
1274 	/* Enlarge buffer to make room for the operand values. */
1275 	query = (TSQuery) repalloc(query, len + datalen);
1276 	item = GETQUERY(query);
1277 	ptr = GETOPERAND(query);
1278 
1279 	/*
1280 	 * Fill in the left-pointers. Checks that the tree is well-formed as a
1281 	 * side-effect.
1282 	 */
1283 	findoprnd(item, size, &needcleanup);
1284 
1285 	/* Can't have found any QI_VALSTOP nodes */
1286 	Assert(!needcleanup);
1287 
1288 	/* Copy operands to output struct */
1289 	for (i = 0; i < size; i++)
1290 	{
1291 		if (item->type == QI_VAL)
1292 		{
1293 			memcpy(ptr, operands[i], item->qoperand.length + 1);
1294 			ptr += item->qoperand.length + 1;
1295 		}
1296 		item++;
1297 	}
1298 
1299 	pfree(operands);
1300 
1301 	Assert(ptr - GETOPERAND(query) == datalen);
1302 
1303 	SET_VARSIZE(query, len + datalen);
1304 
1305 	PG_RETURN_TSQUERY(query);
1306 }
1307 
1308 /*
1309  * debug function, used only for view query
1310  * which will be executed in non-leaf pages in index
1311  */
1312 Datum
tsquerytree(PG_FUNCTION_ARGS)1313 tsquerytree(PG_FUNCTION_ARGS)
1314 {
1315 	TSQuery		query = PG_GETARG_TSQUERY(0);
1316 	INFIX		nrm;
1317 	text	   *res;
1318 	QueryItem  *q;
1319 	int			len;
1320 
1321 	if (query->size == 0)
1322 	{
1323 		res = (text *) palloc(VARHDRSZ);
1324 		SET_VARSIZE(res, VARHDRSZ);
1325 		PG_RETURN_POINTER(res);
1326 	}
1327 
1328 	q = clean_NOT(GETQUERY(query), &len);
1329 
1330 	if (!q)
1331 	{
1332 		res = cstring_to_text("T");
1333 	}
1334 	else
1335 	{
1336 		nrm.curpol = q;
1337 		nrm.buflen = 32;
1338 		nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1339 		*(nrm.cur) = '\0';
1340 		nrm.op = GETOPERAND(query);
1341 		infix(&nrm, -1, false);
1342 		res = cstring_to_text_with_len(nrm.buf, nrm.cur - nrm.buf);
1343 		pfree(q);
1344 	}
1345 
1346 	PG_FREE_IF_COPY(query, 0);
1347 
1348 	PG_RETURN_TEXT_P(res);
1349 }
1350