1 /*-------------------------------------------------------------------------
2 *
3 * tsquery.c
4 * I/O functions for tsquery
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsquery.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include "libpq/pqformat.h"
18 #include "miscadmin.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23 #include "utils/memutils.h"
24 #include "utils/pg_crc.h"
25
26 /* FTS operator priorities, see ts_type.h */
27 const int tsearch_op_priority[OP_COUNT] =
28 {
29 4, /* OP_NOT */
30 2, /* OP_AND */
31 1, /* OP_OR */
32 3 /* OP_PHRASE */
33 };
34
35 /*
36 * parser's states
37 */
38 typedef enum
39 {
40 WAITOPERAND = 1,
41 WAITOPERATOR = 2,
42 WAITFIRSTOPERAND = 3
43 } ts_parserstate;
44
45 /*
46 * token types for parsing
47 */
48 typedef enum
49 {
50 PT_END = 0,
51 PT_ERR = 1,
52 PT_VAL = 2,
53 PT_OPR = 3,
54 PT_OPEN = 4,
55 PT_CLOSE = 5
56 } ts_tokentype;
57
58 /*
59 * get token from query string
60 *
61 * *operator is filled in with OP_* when return values is PT_OPR,
62 * but *weight could contain a distance value in case of phrase operator.
63 * *strval, *lenval and *weight are filled in when return value is PT_VAL
64 *
65 */
66 typedef ts_tokentype (*ts_tokenizer) (TSQueryParserState state, int8 *operator,
67 int *lenval, char **strval,
68 int16 *weight, bool *prefix);
69
70 struct TSQueryParserStateData
71 {
72 /* Tokenizer used for parsing tsquery */
73 ts_tokenizer gettoken;
74
75 /* State of tokenizer function */
76 char *buffer; /* entire string we are scanning */
77 char *buf; /* current scan point */
78 int count; /* nesting count, incremented by (,
79 * decremented by ) */
80 ts_parserstate state;
81
82 /* polish (prefix) notation in list, filled in by push* functions */
83 List *polstr;
84
85 /*
86 * Strings from operands are collected in op. curop is a pointer to the
87 * end of used space of op.
88 */
89 char *op;
90 char *curop;
91 int lenop; /* allocated size of op */
92 int sumlen; /* used size of op */
93
94 /* state for value's parser */
95 TSVectorParseState valstate;
96 };
97
98 /*
99 * subroutine to parse the modifiers (weight and prefix flag currently)
100 * part, like ':AB*' of a query.
101 */
102 static char *
get_modifiers(char * buf,int16 * weight,bool * prefix)103 get_modifiers(char *buf, int16 *weight, bool *prefix)
104 {
105 *weight = 0;
106 *prefix = false;
107
108 if (!t_iseq(buf, ':'))
109 return buf;
110
111 buf++;
112 while (*buf && pg_mblen(buf) == 1)
113 {
114 switch (*buf)
115 {
116 case 'a':
117 case 'A':
118 *weight |= 1 << 3;
119 break;
120 case 'b':
121 case 'B':
122 *weight |= 1 << 2;
123 break;
124 case 'c':
125 case 'C':
126 *weight |= 1 << 1;
127 break;
128 case 'd':
129 case 'D':
130 *weight |= 1;
131 break;
132 case '*':
133 *prefix = true;
134 break;
135 default:
136 return buf;
137 }
138 buf++;
139 }
140
141 return buf;
142 }
143
144 /*
145 * Parse phrase operator. The operator
146 * may take the following forms:
147 *
148 * a <N> b (distance is exactly N lexemes)
149 * a <-> b (default distance = 1)
150 *
151 * The buffer should begin with '<' char
152 */
153 static bool
parse_phrase_operator(TSQueryParserState pstate,int16 * distance)154 parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
155 {
156 enum
157 {
158 PHRASE_OPEN = 0,
159 PHRASE_DIST,
160 PHRASE_CLOSE,
161 PHRASE_FINISH
162 } state = PHRASE_OPEN;
163 char *ptr = pstate->buf;
164 char *endptr;
165 long l = 1; /* default distance */
166
167 while (*ptr)
168 {
169 switch (state)
170 {
171 case PHRASE_OPEN:
172 if (t_iseq(ptr, '<'))
173 {
174 state = PHRASE_DIST;
175 ptr++;
176 }
177 else
178 return false;
179 break;
180
181 case PHRASE_DIST:
182 if (t_iseq(ptr, '-'))
183 {
184 state = PHRASE_CLOSE;
185 ptr++;
186 continue;
187 }
188
189 if (!t_isdigit(ptr))
190 return false;
191
192 errno = 0;
193 l = strtol(ptr, &endptr, 10);
194 if (ptr == endptr)
195 return false;
196 else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS)
197 ereport(ERROR,
198 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
199 errmsg("distance in phrase operator must be an integer value between zero and %d inclusive",
200 MAXENTRYPOS)));
201 else
202 {
203 state = PHRASE_CLOSE;
204 ptr = endptr;
205 }
206 break;
207
208 case PHRASE_CLOSE:
209 if (t_iseq(ptr, '>'))
210 {
211 state = PHRASE_FINISH;
212 ptr++;
213 }
214 else
215 return false;
216 break;
217
218 case PHRASE_FINISH:
219 *distance = (int16) l;
220 pstate->buf = ptr;
221 return true;
222 }
223 }
224
225 return false;
226 }
227
228 /*
229 * Parse OR operator used in websearch_to_tsquery(), returns true if we
230 * believe that "OR" literal could be an operator OR
231 */
232 static bool
parse_or_operator(TSQueryParserState pstate)233 parse_or_operator(TSQueryParserState pstate)
234 {
235 char *ptr = pstate->buf;
236
237 /* it should begin with "OR" literal */
238 if (pg_strncasecmp(ptr, "or", 2) != 0)
239 return false;
240
241 ptr += 2;
242
243 /*
244 * it shouldn't be a part of any word but somewhere later it should be
245 * some operand
246 */
247 if (*ptr == '\0') /* no operand */
248 return false;
249
250 /* it shouldn't be a part of any word */
251 if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
252 return false;
253
254 for (;;)
255 {
256 ptr += pg_mblen(ptr);
257
258 if (*ptr == '\0') /* got end of string without operand */
259 return false;
260
261 /*
262 * Suppose, we found an operand, but could be a not correct operand.
263 * So we still treat OR literal as operation with possibly incorrect
264 * operand and will not search it as lexeme
265 */
266 if (!t_isspace(ptr))
267 break;
268 }
269
270 pstate->buf += 2;
271 return true;
272 }
273
274 static ts_tokentype
gettoken_query_standard(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)275 gettoken_query_standard(TSQueryParserState state, int8 *operator,
276 int *lenval, char **strval,
277 int16 *weight, bool *prefix)
278 {
279 *weight = 0;
280 *prefix = false;
281
282 while (true)
283 {
284 switch (state->state)
285 {
286 case WAITFIRSTOPERAND:
287 case WAITOPERAND:
288 if (t_iseq(state->buf, '!'))
289 {
290 state->buf++;
291 state->state = WAITOPERAND;
292 *operator = OP_NOT;
293 return PT_OPR;
294 }
295 else if (t_iseq(state->buf, '('))
296 {
297 state->buf++;
298 state->state = WAITOPERAND;
299 state->count++;
300 return PT_OPEN;
301 }
302 else if (t_iseq(state->buf, ':'))
303 {
304 ereport(ERROR,
305 (errcode(ERRCODE_SYNTAX_ERROR),
306 errmsg("syntax error in tsquery: \"%s\"",
307 state->buffer)));
308 }
309 else if (!t_isspace(state->buf))
310 {
311 /*
312 * We rely on the tsvector parser to parse the value for
313 * us
314 */
315 reset_tsvector_parser(state->valstate, state->buf);
316 if (gettoken_tsvector(state->valstate, strval, lenval,
317 NULL, NULL, &state->buf))
318 {
319 state->buf = get_modifiers(state->buf, weight, prefix);
320 state->state = WAITOPERATOR;
321 return PT_VAL;
322 }
323 else if (state->state == WAITFIRSTOPERAND)
324 {
325 return PT_END;
326 }
327 else
328 ereport(ERROR,
329 (errcode(ERRCODE_SYNTAX_ERROR),
330 errmsg("no operand in tsquery: \"%s\"",
331 state->buffer)));
332 }
333 break;
334
335 case WAITOPERATOR:
336 if (t_iseq(state->buf, '&'))
337 {
338 state->buf++;
339 state->state = WAITOPERAND;
340 *operator = OP_AND;
341 return PT_OPR;
342 }
343 else if (t_iseq(state->buf, '|'))
344 {
345 state->buf++;
346 state->state = WAITOPERAND;
347 *operator = OP_OR;
348 return PT_OPR;
349 }
350 else if (parse_phrase_operator(state, weight))
351 {
352 /* weight var is used as storage for distance */
353 state->state = WAITOPERAND;
354 *operator = OP_PHRASE;
355 return PT_OPR;
356 }
357 else if (t_iseq(state->buf, ')'))
358 {
359 state->buf++;
360 state->count--;
361 return (state->count < 0) ? PT_ERR : PT_CLOSE;
362 }
363 else if (*state->buf == '\0')
364 {
365 return (state->count) ? PT_ERR : PT_END;
366 }
367 else if (!t_isspace(state->buf))
368 {
369 return PT_ERR;
370 }
371 break;
372 }
373
374 state->buf += pg_mblen(state->buf);
375 }
376 }
377
378 static ts_tokentype
gettoken_query_websearch(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)379 gettoken_query_websearch(TSQueryParserState state, int8 *operator,
380 int *lenval, char **strval,
381 int16 *weight, bool *prefix)
382 {
383 *weight = 0;
384 *prefix = false;
385
386 while (true)
387 {
388 switch (state->state)
389 {
390 case WAITFIRSTOPERAND:
391 case WAITOPERAND:
392 if (t_iseq(state->buf, '-'))
393 {
394 state->buf++;
395 state->state = WAITOPERAND;
396
397 *operator = OP_NOT;
398 return PT_OPR;
399 }
400 else if (t_iseq(state->buf, '"'))
401 {
402 /* Everything in quotes is processed as a single token */
403
404 /* skip opening quote */
405 state->buf++;
406 *strval = state->buf;
407
408 /* iterate to the closing quote or end of the string */
409 while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
410 state->buf++;
411 *lenval = state->buf - *strval;
412
413 /* skip closing quote if not end of the string */
414 if (*state->buf != '\0')
415 state->buf++;
416
417 state->state = WAITOPERATOR;
418 state->count++;
419 return PT_VAL;
420 }
421 else if (ISOPERATOR(state->buf))
422 {
423 /* or else gettoken_tsvector() will raise an error */
424 state->buf++;
425 state->state = WAITOPERAND;
426 continue;
427 }
428 else if (!t_isspace(state->buf))
429 {
430 /*
431 * We rely on the tsvector parser to parse the value for
432 * us
433 */
434 reset_tsvector_parser(state->valstate, state->buf);
435 if (gettoken_tsvector(state->valstate, strval, lenval,
436 NULL, NULL, &state->buf))
437 {
438 state->state = WAITOPERATOR;
439 return PT_VAL;
440 }
441 else if (state->state == WAITFIRSTOPERAND)
442 {
443 return PT_END;
444 }
445 else
446 {
447 /* finally, we have to provide an operand */
448 pushStop(state);
449 return PT_END;
450 }
451 }
452 break;
453
454 case WAITOPERATOR:
455 if (t_iseq(state->buf, '"'))
456 {
457 /*
458 * put implicit AND after an operand and handle this quote
459 * in WAITOPERAND
460 */
461 state->state = WAITOPERAND;
462 *operator = OP_AND;
463 return PT_OPR;
464 }
465 else if (parse_or_operator(state))
466 {
467 state->state = WAITOPERAND;
468 *operator = OP_OR;
469 return PT_OPR;
470 }
471 else if (*state->buf == '\0')
472 {
473 return PT_END;
474 }
475 else if (!t_isspace(state->buf))
476 {
477 /* put implicit AND after an operand */
478 *operator = OP_AND;
479 state->state = WAITOPERAND;
480 return PT_OPR;
481 }
482 break;
483 }
484
485 state->buf += pg_mblen(state->buf);
486 }
487 }
488
489 static ts_tokentype
gettoken_query_plain(TSQueryParserState state,int8 * operator,int * lenval,char ** strval,int16 * weight,bool * prefix)490 gettoken_query_plain(TSQueryParserState state, int8 *operator,
491 int *lenval, char **strval,
492 int16 *weight, bool *prefix)
493 {
494 *weight = 0;
495 *prefix = false;
496
497 if (*state->buf == '\0')
498 return PT_END;
499
500 *strval = state->buf;
501 *lenval = strlen(state->buf);
502 state->buf += *lenval;
503 state->count++;
504 return PT_VAL;
505 }
506
507 /*
508 * Push an operator to state->polstr
509 */
510 void
pushOperator(TSQueryParserState state,int8 oper,int16 distance)511 pushOperator(TSQueryParserState state, int8 oper, int16 distance)
512 {
513 QueryOperator *tmp;
514
515 Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);
516
517 tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
518 tmp->type = QI_OPR;
519 tmp->oper = oper;
520 tmp->distance = (oper == OP_PHRASE) ? distance : 0;
521 /* left is filled in later with findoprnd */
522
523 state->polstr = lcons(tmp, state->polstr);
524 }
525
526 static void
pushValue_internal(TSQueryParserState state,pg_crc32 valcrc,int distance,int lenval,int weight,bool prefix)527 pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
528 {
529 QueryOperand *tmp;
530
531 if (distance >= MAXSTRPOS)
532 ereport(ERROR,
533 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
534 errmsg("value is too big in tsquery: \"%s\"",
535 state->buffer)));
536 if (lenval >= MAXSTRLEN)
537 ereport(ERROR,
538 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
539 errmsg("operand is too long in tsquery: \"%s\"",
540 state->buffer)));
541
542 tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
543 tmp->type = QI_VAL;
544 tmp->weight = weight;
545 tmp->prefix = prefix;
546 tmp->valcrc = (int32) valcrc;
547 tmp->length = lenval;
548 tmp->distance = distance;
549
550 state->polstr = lcons(tmp, state->polstr);
551 }
552
553 /*
554 * Push an operand to state->polstr.
555 *
556 * strval must point to a string equal to state->curop. lenval is the length
557 * of the string.
558 */
559 void
pushValue(TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)560 pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
561 {
562 pg_crc32 valcrc;
563
564 if (lenval >= MAXSTRLEN)
565 ereport(ERROR,
566 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
567 errmsg("word is too long in tsquery: \"%s\"",
568 state->buffer)));
569
570 INIT_LEGACY_CRC32(valcrc);
571 COMP_LEGACY_CRC32(valcrc, strval, lenval);
572 FIN_LEGACY_CRC32(valcrc);
573 pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight, prefix);
574
575 /* append the value string to state.op, enlarging buffer if needed first */
576 while (state->curop - state->op + lenval + 1 >= state->lenop)
577 {
578 int used = state->curop - state->op;
579
580 state->lenop *= 2;
581 state->op = (char *) repalloc((void *) state->op, state->lenop);
582 state->curop = state->op + used;
583 }
584 memcpy((void *) state->curop, (void *) strval, lenval);
585 state->curop += lenval;
586 *(state->curop) = '\0';
587 state->curop++;
588 state->sumlen += lenval + 1 /* \0 */ ;
589 }
590
591
592 /*
593 * Push a stopword placeholder to state->polstr
594 */
595 void
pushStop(TSQueryParserState state)596 pushStop(TSQueryParserState state)
597 {
598 QueryOperand *tmp;
599
600 tmp = (QueryOperand *) palloc0(sizeof(QueryOperand));
601 tmp->type = QI_VALSTOP;
602
603 state->polstr = lcons(tmp, state->polstr);
604 }
605
606
607 #define STACKDEPTH 32
608
609 typedef struct OperatorElement
610 {
611 int8 op;
612 int16 distance;
613 } OperatorElement;
614
615 static void
pushOpStack(OperatorElement * stack,int * lenstack,int8 op,int16 distance)616 pushOpStack(OperatorElement *stack, int *lenstack, int8 op, int16 distance)
617 {
618 if (*lenstack == STACKDEPTH) /* internal error */
619 elog(ERROR, "tsquery stack too small");
620
621 stack[*lenstack].op = op;
622 stack[*lenstack].distance = distance;
623
624 (*lenstack)++;
625 }
626
627 static void
cleanOpStack(TSQueryParserState state,OperatorElement * stack,int * lenstack,int8 op)628 cleanOpStack(TSQueryParserState state,
629 OperatorElement *stack, int *lenstack, int8 op)
630 {
631 int opPriority = OP_PRIORITY(op);
632
633 while (*lenstack)
634 {
635 /* NOT is right associative unlike to others */
636 if ((op != OP_NOT && opPriority > OP_PRIORITY(stack[*lenstack - 1].op)) ||
637 (op == OP_NOT && opPriority >= OP_PRIORITY(stack[*lenstack - 1].op)))
638 break;
639
640 (*lenstack)--;
641 pushOperator(state, stack[*lenstack].op,
642 stack[*lenstack].distance);
643 }
644 }
645
646 /*
647 * Make polish (prefix) notation of query.
648 *
649 * See parse_tsquery for explanation of pushval.
650 */
651 static void
makepol(TSQueryParserState state,PushFunction pushval,Datum opaque)652 makepol(TSQueryParserState state,
653 PushFunction pushval,
654 Datum opaque)
655 {
656 int8 operator = 0;
657 ts_tokentype type;
658 int lenval = 0;
659 char *strval = NULL;
660 OperatorElement opstack[STACKDEPTH];
661 int lenstack = 0;
662 int16 weight = 0;
663 bool prefix;
664
665 /* since this function recurses, it could be driven to stack overflow */
666 check_stack_depth();
667
668 while ((type = state->gettoken(state, &operator,
669 &lenval, &strval,
670 &weight, &prefix)) != PT_END)
671 {
672 switch (type)
673 {
674 case PT_VAL:
675 pushval(opaque, state, strval, lenval, weight, prefix);
676 break;
677 case PT_OPR:
678 cleanOpStack(state, opstack, &lenstack, operator);
679 pushOpStack(opstack, &lenstack, operator, weight);
680 break;
681 case PT_OPEN:
682 makepol(state, pushval, opaque);
683 break;
684 case PT_CLOSE:
685 cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
686 return;
687 case PT_ERR:
688 default:
689 ereport(ERROR,
690 (errcode(ERRCODE_SYNTAX_ERROR),
691 errmsg("syntax error in tsquery: \"%s\"",
692 state->buffer)));
693 }
694 }
695
696 cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
697 }
698
699 static void
findoprnd_recurse(QueryItem * ptr,uint32 * pos,int nnodes,bool * needcleanup)700 findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
701 {
702 /* since this function recurses, it could be driven to stack overflow. */
703 check_stack_depth();
704
705 if (*pos >= nnodes)
706 elog(ERROR, "malformed tsquery: operand not found");
707
708 if (ptr[*pos].type == QI_VAL)
709 {
710 (*pos)++;
711 }
712 else if (ptr[*pos].type == QI_VALSTOP)
713 {
714 *needcleanup = true; /* we'll have to remove stop words */
715 (*pos)++;
716 }
717 else
718 {
719 Assert(ptr[*pos].type == QI_OPR);
720
721 if (ptr[*pos].qoperator.oper == OP_NOT)
722 {
723 ptr[*pos].qoperator.left = 1; /* fixed offset */
724 (*pos)++;
725
726 /* process the only argument */
727 findoprnd_recurse(ptr, pos, nnodes, needcleanup);
728 }
729 else
730 {
731 QueryOperator *curitem = &ptr[*pos].qoperator;
732 int tmp = *pos; /* save current position */
733
734 Assert(curitem->oper == OP_AND ||
735 curitem->oper == OP_OR ||
736 curitem->oper == OP_PHRASE);
737
738 (*pos)++;
739
740 /* process RIGHT argument */
741 findoprnd_recurse(ptr, pos, nnodes, needcleanup);
742
743 curitem->left = *pos - tmp; /* set LEFT arg's offset */
744
745 /* process LEFT argument */
746 findoprnd_recurse(ptr, pos, nnodes, needcleanup);
747 }
748 }
749 }
750
751
752 /*
753 * Fill in the left-fields previously left unfilled.
754 * The input QueryItems must be in polish (prefix) notation.
755 * Also, set *needcleanup to true if there are any QI_VALSTOP nodes.
756 */
757 static void
findoprnd(QueryItem * ptr,int size,bool * needcleanup)758 findoprnd(QueryItem *ptr, int size, bool *needcleanup)
759 {
760 uint32 pos;
761
762 *needcleanup = false;
763 pos = 0;
764 findoprnd_recurse(ptr, &pos, size, needcleanup);
765
766 if (pos != size)
767 elog(ERROR, "malformed tsquery: extra nodes");
768 }
769
770
771 /*
772 * Each value (operand) in the query is passed to pushval. pushval can
773 * transform the simple value to an arbitrarily complex expression using
774 * pushValue and pushOperator. It must push a single value with pushValue,
775 * a complete expression with all operands, or a stopword placeholder
776 * with pushStop, otherwise the prefix notation representation will be broken,
777 * having an operator with no operand.
778 *
779 * opaque is passed on to pushval as is, pushval can use it to store its
780 * private state.
781 */
782 TSQuery
parse_tsquery(char * buf,PushFunction pushval,Datum opaque,int flags)783 parse_tsquery(char *buf,
784 PushFunction pushval,
785 Datum opaque,
786 int flags)
787 {
788 struct TSQueryParserStateData state;
789 int i;
790 TSQuery query;
791 int commonlen;
792 QueryItem *ptr;
793 ListCell *cell;
794 bool needcleanup;
795 int tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY;
796
797 /* plain should not be used with web */
798 Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB));
799
800 /* select suitable tokenizer */
801 if (flags & P_TSQ_PLAIN)
802 state.gettoken = gettoken_query_plain;
803 else if (flags & P_TSQ_WEB)
804 {
805 state.gettoken = gettoken_query_websearch;
806 tsv_flags |= P_TSV_IS_WEB;
807 }
808 else
809 state.gettoken = gettoken_query_standard;
810
811 /* init state */
812 state.buffer = buf;
813 state.buf = buf;
814 state.count = 0;
815 state.state = WAITFIRSTOPERAND;
816 state.polstr = NIL;
817
818 /* init value parser's state */
819 state.valstate = init_tsvector_parser(state.buffer, tsv_flags);
820
821 /* init list of operand */
822 state.sumlen = 0;
823 state.lenop = 64;
824 state.curop = state.op = (char *) palloc(state.lenop);
825 *(state.curop) = '\0';
826
827 /* parse query & make polish notation (postfix, but in reverse order) */
828 makepol(&state, pushval, opaque);
829
830 close_tsvector_parser(state.valstate);
831
832 if (list_length(state.polstr) == 0)
833 {
834 ereport(NOTICE,
835 (errmsg("text-search query doesn't contain lexemes: \"%s\"",
836 state.buffer)));
837 query = (TSQuery) palloc(HDRSIZETQ);
838 SET_VARSIZE(query, HDRSIZETQ);
839 query->size = 0;
840 return query;
841 }
842
843 if (TSQUERY_TOO_BIG(list_length(state.polstr), state.sumlen))
844 ereport(ERROR,
845 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
846 errmsg("tsquery is too large")));
847 commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen);
848
849 /* Pack the QueryItems in the final TSQuery struct to return to caller */
850 query = (TSQuery) palloc0(commonlen);
851 SET_VARSIZE(query, commonlen);
852 query->size = list_length(state.polstr);
853 ptr = GETQUERY(query);
854
855 /* Copy QueryItems to TSQuery */
856 i = 0;
857 foreach(cell, state.polstr)
858 {
859 QueryItem *item = (QueryItem *) lfirst(cell);
860
861 switch (item->type)
862 {
863 case QI_VAL:
864 memcpy(&ptr[i], item, sizeof(QueryOperand));
865 break;
866 case QI_VALSTOP:
867 ptr[i].type = QI_VALSTOP;
868 break;
869 case QI_OPR:
870 memcpy(&ptr[i], item, sizeof(QueryOperator));
871 break;
872 default:
873 elog(ERROR, "unrecognized QueryItem type: %d", item->type);
874 }
875 i++;
876 }
877
878 /* Copy all the operand strings to TSQuery */
879 memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
880 pfree(state.op);
881
882 /*
883 * Set left operand pointers for every operator. While we're at it,
884 * detect whether there are any QI_VALSTOP nodes.
885 */
886 findoprnd(ptr, query->size, &needcleanup);
887
888 /*
889 * If there are QI_VALSTOP nodes, delete them and simplify the tree.
890 */
891 if (needcleanup)
892 query = cleanup_tsquery_stopwords(query);
893
894 return query;
895 }
896
897 static void
pushval_asis(Datum opaque,TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)898 pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
899 int16 weight, bool prefix)
900 {
901 pushValue(state, strval, lenval, weight, prefix);
902 }
903
904 /*
905 * in without morphology
906 */
907 Datum
tsqueryin(PG_FUNCTION_ARGS)908 tsqueryin(PG_FUNCTION_ARGS)
909 {
910 char *in = PG_GETARG_CSTRING(0);
911
912 PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0));
913 }
914
915 /*
916 * out function
917 */
918 typedef struct
919 {
920 QueryItem *curpol;
921 char *buf;
922 char *cur;
923 char *op;
924 int buflen;
925 } INFIX;
926
927 /* Makes sure inf->buf is large enough for adding 'addsize' bytes */
928 #define RESIZEBUF(inf, addsize) \
929 while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
930 { \
931 int len = (inf)->cur - (inf)->buf; \
932 (inf)->buflen *= 2; \
933 (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \
934 (inf)->cur = (inf)->buf + len; \
935 }
936
937 /*
938 * recursively traverse the tree and
939 * print it in infix (human-readable) form
940 */
941 static void
infix(INFIX * in,int parentPriority,bool rightPhraseOp)942 infix(INFIX *in, int parentPriority, bool rightPhraseOp)
943 {
944 /* since this function recurses, it could be driven to stack overflow. */
945 check_stack_depth();
946
947 if (in->curpol->type == QI_VAL)
948 {
949 QueryOperand *curpol = &in->curpol->qoperand;
950 char *op = in->op + curpol->distance;
951 int clen;
952
953 RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
954 *(in->cur) = '\'';
955 in->cur++;
956 while (*op)
957 {
958 if (t_iseq(op, '\''))
959 {
960 *(in->cur) = '\'';
961 in->cur++;
962 }
963 else if (t_iseq(op, '\\'))
964 {
965 *(in->cur) = '\\';
966 in->cur++;
967 }
968 COPYCHAR(in->cur, op);
969
970 clen = pg_mblen(op);
971 op += clen;
972 in->cur += clen;
973 }
974 *(in->cur) = '\'';
975 in->cur++;
976 if (curpol->weight || curpol->prefix)
977 {
978 *(in->cur) = ':';
979 in->cur++;
980 if (curpol->prefix)
981 {
982 *(in->cur) = '*';
983 in->cur++;
984 }
985 if (curpol->weight & (1 << 3))
986 {
987 *(in->cur) = 'A';
988 in->cur++;
989 }
990 if (curpol->weight & (1 << 2))
991 {
992 *(in->cur) = 'B';
993 in->cur++;
994 }
995 if (curpol->weight & (1 << 1))
996 {
997 *(in->cur) = 'C';
998 in->cur++;
999 }
1000 if (curpol->weight & 1)
1001 {
1002 *(in->cur) = 'D';
1003 in->cur++;
1004 }
1005 }
1006 *(in->cur) = '\0';
1007 in->curpol++;
1008 }
1009 else if (in->curpol->qoperator.oper == OP_NOT)
1010 {
1011 int priority = QO_PRIORITY(in->curpol);
1012
1013 if (priority < parentPriority)
1014 {
1015 RESIZEBUF(in, 2);
1016 sprintf(in->cur, "( ");
1017 in->cur = strchr(in->cur, '\0');
1018 }
1019 RESIZEBUF(in, 1);
1020 *(in->cur) = '!';
1021 in->cur++;
1022 *(in->cur) = '\0';
1023 in->curpol++;
1024
1025 infix(in, priority, false);
1026 if (priority < parentPriority)
1027 {
1028 RESIZEBUF(in, 2);
1029 sprintf(in->cur, " )");
1030 in->cur = strchr(in->cur, '\0');
1031 }
1032 }
1033 else
1034 {
1035 int8 op = in->curpol->qoperator.oper;
1036 int priority = QO_PRIORITY(in->curpol);
1037 int16 distance = in->curpol->qoperator.distance;
1038 INFIX nrm;
1039 bool needParenthesis = false;
1040
1041 in->curpol++;
1042 if (priority < parentPriority ||
1043 /* phrase operator depends on order */
1044 (op == OP_PHRASE && rightPhraseOp))
1045 {
1046 needParenthesis = true;
1047 RESIZEBUF(in, 2);
1048 sprintf(in->cur, "( ");
1049 in->cur = strchr(in->cur, '\0');
1050 }
1051
1052 nrm.curpol = in->curpol;
1053 nrm.op = in->op;
1054 nrm.buflen = 16;
1055 nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1056
1057 /* get right operand */
1058 infix(&nrm, priority, (op == OP_PHRASE));
1059
1060 /* get & print left operand */
1061 in->curpol = nrm.curpol;
1062 infix(in, priority, false);
1063
1064 /* print operator & right operand */
1065 RESIZEBUF(in, 3 + (2 + 10 /* distance */ ) + (nrm.cur - nrm.buf));
1066 switch (op)
1067 {
1068 case OP_OR:
1069 sprintf(in->cur, " | %s", nrm.buf);
1070 break;
1071 case OP_AND:
1072 sprintf(in->cur, " & %s", nrm.buf);
1073 break;
1074 case OP_PHRASE:
1075 if (distance != 1)
1076 sprintf(in->cur, " <%d> %s", distance, nrm.buf);
1077 else
1078 sprintf(in->cur, " <-> %s", nrm.buf);
1079 break;
1080 default:
1081 /* OP_NOT is handled in above if-branch */
1082 elog(ERROR, "unrecognized operator type: %d", op);
1083 }
1084 in->cur = strchr(in->cur, '\0');
1085 pfree(nrm.buf);
1086
1087 if (needParenthesis)
1088 {
1089 RESIZEBUF(in, 2);
1090 sprintf(in->cur, " )");
1091 in->cur = strchr(in->cur, '\0');
1092 }
1093 }
1094 }
1095
1096 Datum
tsqueryout(PG_FUNCTION_ARGS)1097 tsqueryout(PG_FUNCTION_ARGS)
1098 {
1099 TSQuery query = PG_GETARG_TSQUERY(0);
1100 INFIX nrm;
1101
1102 if (query->size == 0)
1103 {
1104 char *b = palloc(1);
1105
1106 *b = '\0';
1107 PG_RETURN_POINTER(b);
1108 }
1109 nrm.curpol = GETQUERY(query);
1110 nrm.buflen = 32;
1111 nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1112 *(nrm.cur) = '\0';
1113 nrm.op = GETOPERAND(query);
1114 infix(&nrm, -1 /* lowest priority */ , false);
1115
1116 PG_FREE_IF_COPY(query, 0);
1117 PG_RETURN_CSTRING(nrm.buf);
1118 }
1119
1120 /*
1121 * Binary Input / Output functions. The binary format is as follows:
1122 *
1123 * uint32 number of operators/operands in the query
1124 *
1125 * Followed by the operators and operands, in prefix notation. For each
1126 * operand:
1127 *
1128 * uint8 type, QI_VAL
1129 * uint8 weight
1130 * operand text in client encoding, null-terminated
1131 * uint8 prefix
1132 *
1133 * For each operator:
1134 * uint8 type, QI_OPR
1135 * uint8 operator, one of OP_AND, OP_PHRASE OP_OR, OP_NOT.
1136 * uint16 distance (only for OP_PHRASE)
1137 */
1138 Datum
tsquerysend(PG_FUNCTION_ARGS)1139 tsquerysend(PG_FUNCTION_ARGS)
1140 {
1141 TSQuery query = PG_GETARG_TSQUERY(0);
1142 StringInfoData buf;
1143 int i;
1144 QueryItem *item = GETQUERY(query);
1145
1146 pq_begintypsend(&buf);
1147
1148 pq_sendint32(&buf, query->size);
1149 for (i = 0; i < query->size; i++)
1150 {
1151 pq_sendint8(&buf, item->type);
1152
1153 switch (item->type)
1154 {
1155 case QI_VAL:
1156 pq_sendint8(&buf, item->qoperand.weight);
1157 pq_sendint8(&buf, item->qoperand.prefix);
1158 pq_sendstring(&buf, GETOPERAND(query) + item->qoperand.distance);
1159 break;
1160 case QI_OPR:
1161 pq_sendint8(&buf, item->qoperator.oper);
1162 if (item->qoperator.oper == OP_PHRASE)
1163 pq_sendint16(&buf, item->qoperator.distance);
1164 break;
1165 default:
1166 elog(ERROR, "unrecognized tsquery node type: %d", item->type);
1167 }
1168 item++;
1169 }
1170
1171 PG_FREE_IF_COPY(query, 0);
1172
1173 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
1174 }
1175
1176 Datum
tsqueryrecv(PG_FUNCTION_ARGS)1177 tsqueryrecv(PG_FUNCTION_ARGS)
1178 {
1179 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
1180 TSQuery query;
1181 int i,
1182 len;
1183 QueryItem *item;
1184 int datalen;
1185 char *ptr;
1186 uint32 size;
1187 const char **operands;
1188 bool needcleanup;
1189
1190 size = pq_getmsgint(buf, sizeof(uint32));
1191 if (size > (MaxAllocSize / sizeof(QueryItem)))
1192 elog(ERROR, "invalid size of tsquery");
1193
1194 /* Allocate space to temporarily hold operand strings */
1195 operands = palloc(size * sizeof(char *));
1196
1197 /* Allocate space for all the QueryItems. */
1198 len = HDRSIZETQ + sizeof(QueryItem) * size;
1199 query = (TSQuery) palloc0(len);
1200 query->size = size;
1201 item = GETQUERY(query);
1202
1203 datalen = 0;
1204 for (i = 0; i < size; i++)
1205 {
1206 item->type = (int8) pq_getmsgint(buf, sizeof(int8));
1207
1208 if (item->type == QI_VAL)
1209 {
1210 size_t val_len; /* length after recoding to server
1211 * encoding */
1212 uint8 weight;
1213 uint8 prefix;
1214 const char *val;
1215 pg_crc32 valcrc;
1216
1217 weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
1218 prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
1219 val = pq_getmsgstring(buf);
1220 val_len = strlen(val);
1221
1222 /* Sanity checks */
1223
1224 if (weight > 0xF)
1225 elog(ERROR, "invalid tsquery: invalid weight bitmap");
1226
1227 if (val_len > MAXSTRLEN)
1228 elog(ERROR, "invalid tsquery: operand too long");
1229
1230 if (datalen > MAXSTRPOS)
1231 elog(ERROR, "invalid tsquery: total operand length exceeded");
1232
1233 /* Looks valid. */
1234
1235 INIT_LEGACY_CRC32(valcrc);
1236 COMP_LEGACY_CRC32(valcrc, val, val_len);
1237 FIN_LEGACY_CRC32(valcrc);
1238
1239 item->qoperand.weight = weight;
1240 item->qoperand.prefix = (prefix) ? true : false;
1241 item->qoperand.valcrc = (int32) valcrc;
1242 item->qoperand.length = val_len;
1243 item->qoperand.distance = datalen;
1244
1245 /*
1246 * Operand strings are copied to the final struct after this loop;
1247 * here we just collect them to an array
1248 */
1249 operands[i] = val;
1250
1251 datalen += val_len + 1; /* + 1 for the '\0' terminator */
1252 }
1253 else if (item->type == QI_OPR)
1254 {
1255 int8 oper;
1256
1257 oper = (int8) pq_getmsgint(buf, sizeof(int8));
1258 if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
1259 elog(ERROR, "invalid tsquery: unrecognized operator type %d",
1260 (int) oper);
1261 if (i == size - 1)
1262 elog(ERROR, "invalid pointer to right operand");
1263
1264 item->qoperator.oper = oper;
1265 if (oper == OP_PHRASE)
1266 item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
1267 }
1268 else
1269 elog(ERROR, "unrecognized tsquery node type: %d", item->type);
1270
1271 item++;
1272 }
1273
1274 /* Enlarge buffer to make room for the operand values. */
1275 query = (TSQuery) repalloc(query, len + datalen);
1276 item = GETQUERY(query);
1277 ptr = GETOPERAND(query);
1278
1279 /*
1280 * Fill in the left-pointers. Checks that the tree is well-formed as a
1281 * side-effect.
1282 */
1283 findoprnd(item, size, &needcleanup);
1284
1285 /* Can't have found any QI_VALSTOP nodes */
1286 Assert(!needcleanup);
1287
1288 /* Copy operands to output struct */
1289 for (i = 0; i < size; i++)
1290 {
1291 if (item->type == QI_VAL)
1292 {
1293 memcpy(ptr, operands[i], item->qoperand.length + 1);
1294 ptr += item->qoperand.length + 1;
1295 }
1296 item++;
1297 }
1298
1299 pfree(operands);
1300
1301 Assert(ptr - GETOPERAND(query) == datalen);
1302
1303 SET_VARSIZE(query, len + datalen);
1304
1305 PG_RETURN_TSQUERY(query);
1306 }
1307
1308 /*
1309 * debug function, used only for view query
1310 * which will be executed in non-leaf pages in index
1311 */
1312 Datum
tsquerytree(PG_FUNCTION_ARGS)1313 tsquerytree(PG_FUNCTION_ARGS)
1314 {
1315 TSQuery query = PG_GETARG_TSQUERY(0);
1316 INFIX nrm;
1317 text *res;
1318 QueryItem *q;
1319 int len;
1320
1321 if (query->size == 0)
1322 {
1323 res = (text *) palloc(VARHDRSZ);
1324 SET_VARSIZE(res, VARHDRSZ);
1325 PG_RETURN_POINTER(res);
1326 }
1327
1328 q = clean_NOT(GETQUERY(query), &len);
1329
1330 if (!q)
1331 {
1332 res = cstring_to_text("T");
1333 }
1334 else
1335 {
1336 nrm.curpol = q;
1337 nrm.buflen = 32;
1338 nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
1339 *(nrm.cur) = '\0';
1340 nrm.op = GETOPERAND(query);
1341 infix(&nrm, -1, false);
1342 res = cstring_to_text_with_len(nrm.buf, nrm.cur - nrm.buf);
1343 pfree(q);
1344 }
1345
1346 PG_FREE_IF_COPY(query, 0);
1347
1348 PG_RETURN_TEXT_P(res);
1349 }
1350