1 /*-------------------------------------------------------------------------
2 *
3 * tsvector_op.c
4 * operations over tsvector
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_op.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include <limits.h>
17
18 #include "access/htup_details.h"
19 #include "catalog/namespace.h"
20 #include "catalog/pg_type.h"
21 #include "commands/trigger.h"
22 #include "executor/spi.h"
23 #include "funcapi.h"
24 #include "lib/qunique.h"
25 #include "mb/pg_wchar.h"
26 #include "miscadmin.h"
27 #include "parser/parse_coerce.h"
28 #include "tsearch/ts_utils.h"
29 #include "utils/array.h"
30 #include "utils/builtins.h"
31 #include "utils/lsyscache.h"
32 #include "utils/regproc.h"
33 #include "utils/rel.h"
34
35
36 typedef struct
37 {
38 WordEntry *arrb;
39 WordEntry *arre;
40 char *values;
41 char *operand;
42 } CHKVAL;
43
44
45 typedef struct StatEntry
46 {
47 uint32 ndoc; /* zero indicates that we were already here
48 * while walking through the tree */
49 uint32 nentry;
50 struct StatEntry *left;
51 struct StatEntry *right;
52 uint32 lenlexeme;
53 char lexeme[FLEXIBLE_ARRAY_MEMBER];
54 } StatEntry;
55
56 #define STATENTRYHDRSZ (offsetof(StatEntry, lexeme))
57
58 typedef struct
59 {
60 int32 weight;
61
62 uint32 maxdepth;
63
64 StatEntry **stack;
65 uint32 stackpos;
66
67 StatEntry *root;
68 } TSVectorStat;
69
70
71 static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg,
72 uint32 flags,
73 TSExecuteCallback chkcond);
74 static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
75 static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
76
77
78 /*
79 * Order: haspos, len, word, for all positions (pos, weight)
80 */
81 static int
silly_cmp_tsvector(const TSVector a,const TSVector b)82 silly_cmp_tsvector(const TSVector a, const TSVector b)
83 {
84 if (VARSIZE(a) < VARSIZE(b))
85 return -1;
86 else if (VARSIZE(a) > VARSIZE(b))
87 return 1;
88 else if (a->size < b->size)
89 return -1;
90 else if (a->size > b->size)
91 return 1;
92 else
93 {
94 WordEntry *aptr = ARRPTR(a);
95 WordEntry *bptr = ARRPTR(b);
96 int i = 0;
97 int res;
98
99
100 for (i = 0; i < a->size; i++)
101 {
102 if (aptr->haspos != bptr->haspos)
103 {
104 return (aptr->haspos > bptr->haspos) ? -1 : 1;
105 }
106 else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
107 {
108 return res;
109 }
110 else if (aptr->haspos)
111 {
112 WordEntryPos *ap = POSDATAPTR(a, aptr);
113 WordEntryPos *bp = POSDATAPTR(b, bptr);
114 int j;
115
116 if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
117 return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
118
119 for (j = 0; j < POSDATALEN(a, aptr); j++)
120 {
121 if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
122 {
123 return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
124 }
125 else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
126 {
127 return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
128 }
129 ap++, bp++;
130 }
131 }
132
133 aptr++;
134 bptr++;
135 }
136 }
137
138 return 0;
139 }
140
141 #define TSVECTORCMPFUNC( type, action, ret ) \
142 Datum \
143 tsvector_##type(PG_FUNCTION_ARGS) \
144 { \
145 TSVector a = PG_GETARG_TSVECTOR(0); \
146 TSVector b = PG_GETARG_TSVECTOR(1); \
147 int res = silly_cmp_tsvector(a, b); \
148 PG_FREE_IF_COPY(a,0); \
149 PG_FREE_IF_COPY(b,1); \
150 PG_RETURN_##ret( res action 0 ); \
151 } \
152 /* keep compiler quiet - no extra ; */ \
153 extern int no_such_variable
154
155 TSVECTORCMPFUNC(lt, <, BOOL);
156 TSVECTORCMPFUNC(le, <=, BOOL);
157 TSVECTORCMPFUNC(eq, ==, BOOL);
158 TSVECTORCMPFUNC(ge, >=, BOOL);
159 TSVECTORCMPFUNC(gt, >, BOOL);
160 TSVECTORCMPFUNC(ne, !=, BOOL);
161 TSVECTORCMPFUNC(cmp, +, INT32);
162
163 Datum
tsvector_strip(PG_FUNCTION_ARGS)164 tsvector_strip(PG_FUNCTION_ARGS)
165 {
166 TSVector in = PG_GETARG_TSVECTOR(0);
167 TSVector out;
168 int i,
169 len = 0;
170 WordEntry *arrin = ARRPTR(in),
171 *arrout;
172 char *cur;
173
174 for (i = 0; i < in->size; i++)
175 len += arrin[i].len;
176
177 len = CALCDATASIZE(in->size, len);
178 out = (TSVector) palloc0(len);
179 SET_VARSIZE(out, len);
180 out->size = in->size;
181 arrout = ARRPTR(out);
182 cur = STRPTR(out);
183 for (i = 0; i < in->size; i++)
184 {
185 memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
186 arrout[i].haspos = 0;
187 arrout[i].len = arrin[i].len;
188 arrout[i].pos = cur - STRPTR(out);
189 cur += arrout[i].len;
190 }
191
192 PG_FREE_IF_COPY(in, 0);
193 PG_RETURN_POINTER(out);
194 }
195
196 Datum
tsvector_length(PG_FUNCTION_ARGS)197 tsvector_length(PG_FUNCTION_ARGS)
198 {
199 TSVector in = PG_GETARG_TSVECTOR(0);
200 int32 ret = in->size;
201
202 PG_FREE_IF_COPY(in, 0);
203 PG_RETURN_INT32(ret);
204 }
205
206 Datum
tsvector_setweight(PG_FUNCTION_ARGS)207 tsvector_setweight(PG_FUNCTION_ARGS)
208 {
209 TSVector in = PG_GETARG_TSVECTOR(0);
210 char cw = PG_GETARG_CHAR(1);
211 TSVector out;
212 int i,
213 j;
214 WordEntry *entry;
215 WordEntryPos *p;
216 int w = 0;
217
218 switch (cw)
219 {
220 case 'A':
221 case 'a':
222 w = 3;
223 break;
224 case 'B':
225 case 'b':
226 w = 2;
227 break;
228 case 'C':
229 case 'c':
230 w = 1;
231 break;
232 case 'D':
233 case 'd':
234 w = 0;
235 break;
236 default:
237 /* internal error */
238 elog(ERROR, "unrecognized weight: %d", cw);
239 }
240
241 out = (TSVector) palloc(VARSIZE(in));
242 memcpy(out, in, VARSIZE(in));
243 entry = ARRPTR(out);
244 i = out->size;
245 while (i--)
246 {
247 if ((j = POSDATALEN(out, entry)) != 0)
248 {
249 p = POSDATAPTR(out, entry);
250 while (j--)
251 {
252 WEP_SETWEIGHT(*p, w);
253 p++;
254 }
255 }
256 entry++;
257 }
258
259 PG_FREE_IF_COPY(in, 0);
260 PG_RETURN_POINTER(out);
261 }
262
263 /*
264 * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
265 *
266 * Assign weight w to elements of tsin that are listed in lexemes.
267 */
268 Datum
tsvector_setweight_by_filter(PG_FUNCTION_ARGS)269 tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
270 {
271 TSVector tsin = PG_GETARG_TSVECTOR(0);
272 char char_weight = PG_GETARG_CHAR(1);
273 ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
274
275 TSVector tsout;
276 int i,
277 j,
278 nlexemes,
279 weight;
280 WordEntry *entry;
281 Datum *dlexemes;
282 bool *nulls;
283
284 switch (char_weight)
285 {
286 case 'A':
287 case 'a':
288 weight = 3;
289 break;
290 case 'B':
291 case 'b':
292 weight = 2;
293 break;
294 case 'C':
295 case 'c':
296 weight = 1;
297 break;
298 case 'D':
299 case 'd':
300 weight = 0;
301 break;
302 default:
303 /* internal error */
304 elog(ERROR, "unrecognized weight: %c", char_weight);
305 }
306
307 tsout = (TSVector) palloc(VARSIZE(tsin));
308 memcpy(tsout, tsin, VARSIZE(tsin));
309 entry = ARRPTR(tsout);
310
311 deconstruct_array(lexemes, TEXTOID, -1, false, TYPALIGN_INT,
312 &dlexemes, &nulls, &nlexemes);
313
314 /*
315 * Assuming that lexemes array is significantly shorter than tsvector we
316 * can iterate through lexemes performing binary search of each lexeme
317 * from lexemes in tsvector.
318 */
319 for (i = 0; i < nlexemes; i++)
320 {
321 char *lex;
322 int lex_len,
323 lex_pos;
324
325 if (nulls[i])
326 ereport(ERROR,
327 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
328 errmsg("lexeme array may not contain nulls")));
329
330 lex = VARDATA(dlexemes[i]);
331 lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
332 lex_pos = tsvector_bsearch(tsout, lex, lex_len);
333
334 if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
335 {
336 WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
337
338 while (j--)
339 {
340 WEP_SETWEIGHT(*p, weight);
341 p++;
342 }
343 }
344 }
345
346 PG_FREE_IF_COPY(tsin, 0);
347 PG_FREE_IF_COPY(lexemes, 2);
348
349 PG_RETURN_POINTER(tsout);
350 }
351
352 #define compareEntry(pa, a, pb, b) \
353 tsCompareString((pa) + (a)->pos, (a)->len, \
354 (pb) + (b)->pos, (b)->len, \
355 false)
356
357 /*
358 * Add positions from src to dest after offsetting them by maxpos.
359 * Return the number added (might be less than expected due to overflow)
360 */
361 static int32
add_pos(TSVector src,WordEntry * srcptr,TSVector dest,WordEntry * destptr,int32 maxpos)362 add_pos(TSVector src, WordEntry *srcptr,
363 TSVector dest, WordEntry *destptr,
364 int32 maxpos)
365 {
366 uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
367 int i;
368 uint16 slen = POSDATALEN(src, srcptr),
369 startlen;
370 WordEntryPos *spos = POSDATAPTR(src, srcptr),
371 *dpos = POSDATAPTR(dest, destptr);
372
373 if (!destptr->haspos)
374 *clen = 0;
375
376 startlen = *clen;
377 for (i = 0;
378 i < slen && *clen < MAXNUMPOS &&
379 (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
380 i++)
381 {
382 WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
383 WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
384 (*clen)++;
385 }
386
387 if (*clen != startlen)
388 destptr->haspos = 1;
389 return *clen - startlen;
390 }
391
392 /*
393 * Perform binary search of given lexeme in TSVector.
394 * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
395 * found.
396 */
397 static int
tsvector_bsearch(const TSVector tsv,char * lexeme,int lexeme_len)398 tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
399 {
400 WordEntry *arrin = ARRPTR(tsv);
401 int StopLow = 0,
402 StopHigh = tsv->size,
403 StopMiddle,
404 cmp;
405
406 while (StopLow < StopHigh)
407 {
408 StopMiddle = (StopLow + StopHigh) / 2;
409
410 cmp = tsCompareString(lexeme, lexeme_len,
411 STRPTR(tsv) + arrin[StopMiddle].pos,
412 arrin[StopMiddle].len,
413 false);
414
415 if (cmp < 0)
416 StopHigh = StopMiddle;
417 else if (cmp > 0)
418 StopLow = StopMiddle + 1;
419 else /* found it */
420 return StopMiddle;
421 }
422
423 return -1;
424 }
425
426 /*
427 * qsort comparator functions
428 */
429
430 static int
compare_int(const void * va,const void * vb)431 compare_int(const void *va, const void *vb)
432 {
433 int a = *((const int *) va);
434 int b = *((const int *) vb);
435
436 if (a == b)
437 return 0;
438 return (a > b) ? 1 : -1;
439 }
440
441 static int
compare_text_lexemes(const void * va,const void * vb)442 compare_text_lexemes(const void *va, const void *vb)
443 {
444 Datum a = *((const Datum *) va);
445 Datum b = *((const Datum *) vb);
446 char *alex = VARDATA_ANY(a);
447 int alex_len = VARSIZE_ANY_EXHDR(a);
448 char *blex = VARDATA_ANY(b);
449 int blex_len = VARSIZE_ANY_EXHDR(b);
450
451 return tsCompareString(alex, alex_len, blex, blex_len, false);
452 }
453
454 /*
455 * Internal routine to delete lexemes from TSVector by array of offsets.
456 *
457 * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
458 * int indices_count -- size of that array
459 *
460 * Returns new TSVector without given lexemes along with their positions
461 * and weights.
462 */
463 static TSVector
tsvector_delete_by_indices(TSVector tsv,int * indices_to_delete,int indices_count)464 tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
465 int indices_count)
466 {
467 TSVector tsout;
468 WordEntry *arrin = ARRPTR(tsv),
469 *arrout;
470 char *data = STRPTR(tsv),
471 *dataout;
472 int i, /* index in arrin */
473 j, /* index in arrout */
474 k, /* index in indices_to_delete */
475 curoff; /* index in dataout area */
476
477 /*
478 * Sort the filter array to simplify membership checks below. Also, get
479 * rid of any duplicate entries, so that we can assume that indices_count
480 * is exactly equal to the number of lexemes that will be removed.
481 */
482 if (indices_count > 1)
483 {
484 qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
485 indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
486 compare_int);
487 }
488
489 /*
490 * Here we overestimate tsout size, since we don't know how much space is
491 * used by the deleted lexeme(s). We will set exact size below.
492 */
493 tsout = (TSVector) palloc0(VARSIZE(tsv));
494
495 /* This count must be correct because STRPTR(tsout) relies on it. */
496 tsout->size = tsv->size - indices_count;
497
498 /*
499 * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
500 */
501 arrout = ARRPTR(tsout);
502 dataout = STRPTR(tsout);
503 curoff = 0;
504 for (i = j = k = 0; i < tsv->size; i++)
505 {
506 /*
507 * If current i is present in indices_to_delete, skip this lexeme.
508 * Since indices_to_delete is already sorted, we only need to check
509 * the current (k'th) entry.
510 */
511 if (k < indices_count && i == indices_to_delete[k])
512 {
513 k++;
514 continue;
515 }
516
517 /* Copy lexeme and its positions and weights */
518 memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
519 arrout[j].haspos = arrin[i].haspos;
520 arrout[j].len = arrin[i].len;
521 arrout[j].pos = curoff;
522 curoff += arrin[i].len;
523 if (arrin[i].haspos)
524 {
525 int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
526 + sizeof(uint16);
527
528 curoff = SHORTALIGN(curoff);
529 memcpy(dataout + curoff,
530 STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
531 len);
532 curoff += len;
533 }
534
535 j++;
536 }
537
538 /*
539 * k should now be exactly equal to indices_count. If it isn't then the
540 * caller provided us with indices outside of [0, tsv->size) range and
541 * estimation of tsout's size is wrong.
542 */
543 Assert(k == indices_count);
544
545 SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
546 return tsout;
547 }
548
549 /*
550 * Delete given lexeme from tsvector.
551 * Implementation of user-level ts_delete(tsvector, text).
552 */
553 Datum
tsvector_delete_str(PG_FUNCTION_ARGS)554 tsvector_delete_str(PG_FUNCTION_ARGS)
555 {
556 TSVector tsin = PG_GETARG_TSVECTOR(0),
557 tsout;
558 text *tlexeme = PG_GETARG_TEXT_PP(1);
559 char *lexeme = VARDATA_ANY(tlexeme);
560 int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
561 skip_index;
562
563 if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
564 PG_RETURN_POINTER(tsin);
565
566 tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
567
568 PG_FREE_IF_COPY(tsin, 0);
569 PG_FREE_IF_COPY(tlexeme, 1);
570 PG_RETURN_POINTER(tsout);
571 }
572
573 /*
574 * Delete given array of lexemes from tsvector.
575 * Implementation of user-level ts_delete(tsvector, text[]).
576 */
577 Datum
tsvector_delete_arr(PG_FUNCTION_ARGS)578 tsvector_delete_arr(PG_FUNCTION_ARGS)
579 {
580 TSVector tsin = PG_GETARG_TSVECTOR(0),
581 tsout;
582 ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
583 int i,
584 nlex,
585 skip_count,
586 *skip_indices;
587 Datum *dlexemes;
588 bool *nulls;
589
590 deconstruct_array(lexemes, TEXTOID, -1, false, TYPALIGN_INT,
591 &dlexemes, &nulls, &nlex);
592
593 /*
594 * In typical use case array of lexemes to delete is relatively small. So
595 * here we optimize things for that scenario: iterate through lexarr
596 * performing binary search of each lexeme from lexarr in tsvector.
597 */
598 skip_indices = palloc0(nlex * sizeof(int));
599 for (i = skip_count = 0; i < nlex; i++)
600 {
601 char *lex;
602 int lex_len,
603 lex_pos;
604
605 if (nulls[i])
606 ereport(ERROR,
607 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
608 errmsg("lexeme array may not contain nulls")));
609
610 lex = VARDATA(dlexemes[i]);
611 lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
612 lex_pos = tsvector_bsearch(tsin, lex, lex_len);
613
614 if (lex_pos >= 0)
615 skip_indices[skip_count++] = lex_pos;
616 }
617
618 tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
619
620 pfree(skip_indices);
621 PG_FREE_IF_COPY(tsin, 0);
622 PG_FREE_IF_COPY(lexemes, 1);
623
624 PG_RETURN_POINTER(tsout);
625 }
626
627 /*
628 * Expand tsvector as table with following columns:
629 * lexeme: lexeme text
630 * positions: integer array of lexeme positions
631 * weights: char array of weights corresponding to positions
632 */
633 Datum
tsvector_unnest(PG_FUNCTION_ARGS)634 tsvector_unnest(PG_FUNCTION_ARGS)
635 {
636 FuncCallContext *funcctx;
637 TSVector tsin;
638
639 if (SRF_IS_FIRSTCALL())
640 {
641 MemoryContext oldcontext;
642 TupleDesc tupdesc;
643
644 funcctx = SRF_FIRSTCALL_INIT();
645 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
646
647 tupdesc = CreateTemplateTupleDesc(3);
648 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
649 TEXTOID, -1, 0);
650 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
651 INT2ARRAYOID, -1, 0);
652 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
653 TEXTARRAYOID, -1, 0);
654 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
655
656 funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
657
658 MemoryContextSwitchTo(oldcontext);
659 }
660
661 funcctx = SRF_PERCALL_SETUP();
662 tsin = (TSVector) funcctx->user_fctx;
663
664 if (funcctx->call_cntr < tsin->size)
665 {
666 WordEntry *arrin = ARRPTR(tsin);
667 char *data = STRPTR(tsin);
668 HeapTuple tuple;
669 int j,
670 i = funcctx->call_cntr;
671 bool nulls[] = {false, false, false};
672 Datum values[3];
673
674 values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
675
676 if (arrin[i].haspos)
677 {
678 WordEntryPosVector *posv;
679 Datum *positions;
680 Datum *weights;
681 char weight;
682
683 /*
684 * Internally tsvector stores position and weight in the same
685 * uint16 (2 bits for weight, 14 for position). Here we extract
686 * that in two separate arrays.
687 */
688 posv = _POSVECPTR(tsin, arrin + i);
689 positions = palloc(posv->npos * sizeof(Datum));
690 weights = palloc(posv->npos * sizeof(Datum));
691 for (j = 0; j < posv->npos; j++)
692 {
693 positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
694 weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
695 weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight,
696 1));
697 }
698
699 values[1] = PointerGetDatum(construct_array(positions, posv->npos,
700 INT2OID, 2, true, TYPALIGN_SHORT));
701 values[2] = PointerGetDatum(construct_array(weights, posv->npos,
702 TEXTOID, -1, false, TYPALIGN_INT));
703 }
704 else
705 {
706 nulls[1] = nulls[2] = true;
707 }
708
709 tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
710 SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
711 }
712 else
713 {
714 SRF_RETURN_DONE(funcctx);
715 }
716 }
717
718 /*
719 * Convert tsvector to array of lexemes.
720 */
721 Datum
tsvector_to_array(PG_FUNCTION_ARGS)722 tsvector_to_array(PG_FUNCTION_ARGS)
723 {
724 TSVector tsin = PG_GETARG_TSVECTOR(0);
725 WordEntry *arrin = ARRPTR(tsin);
726 Datum *elements;
727 int i;
728 ArrayType *array;
729
730 elements = palloc(tsin->size * sizeof(Datum));
731
732 for (i = 0; i < tsin->size; i++)
733 {
734 elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos,
735 arrin[i].len));
736 }
737
738 array = construct_array(elements, tsin->size, TEXTOID, -1, false, TYPALIGN_INT);
739
740 pfree(elements);
741 PG_FREE_IF_COPY(tsin, 0);
742 PG_RETURN_POINTER(array);
743 }
744
745 /*
746 * Build tsvector from array of lexemes.
747 */
748 Datum
array_to_tsvector(PG_FUNCTION_ARGS)749 array_to_tsvector(PG_FUNCTION_ARGS)
750 {
751 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
752 TSVector tsout;
753 Datum *dlexemes;
754 WordEntry *arrout;
755 bool *nulls;
756 int nitems,
757 i,
758 tslen,
759 datalen = 0;
760 char *cur;
761
762 deconstruct_array(v, TEXTOID, -1, false, TYPALIGN_INT, &dlexemes, &nulls, &nitems);
763
764 /* Reject nulls (maybe we should just ignore them, instead?) */
765 for (i = 0; i < nitems; i++)
766 {
767 if (nulls[i])
768 ereport(ERROR,
769 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
770 errmsg("lexeme array may not contain nulls")));
771 }
772
773 /* Sort and de-dup, because this is required for a valid tsvector. */
774 if (nitems > 1)
775 {
776 qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
777 nitems = qunique(dlexemes, nitems, sizeof(Datum),
778 compare_text_lexemes);
779 }
780
781 /* Calculate space needed for surviving lexemes. */
782 for (i = 0; i < nitems; i++)
783 datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
784 tslen = CALCDATASIZE(nitems, datalen);
785
786 /* Allocate and fill tsvector. */
787 tsout = (TSVector) palloc0(tslen);
788 SET_VARSIZE(tsout, tslen);
789 tsout->size = nitems;
790
791 arrout = ARRPTR(tsout);
792 cur = STRPTR(tsout);
793 for (i = 0; i < nitems; i++)
794 {
795 char *lex = VARDATA(dlexemes[i]);
796 int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
797
798 memcpy(cur, lex, lex_len);
799 arrout[i].haspos = 0;
800 arrout[i].len = lex_len;
801 arrout[i].pos = cur - STRPTR(tsout);
802 cur += lex_len;
803 }
804
805 PG_FREE_IF_COPY(v, 0);
806 PG_RETURN_POINTER(tsout);
807 }
808
809 /*
810 * ts_filter(): keep only lexemes with given weights in tsvector.
811 */
812 Datum
tsvector_filter(PG_FUNCTION_ARGS)813 tsvector_filter(PG_FUNCTION_ARGS)
814 {
815 TSVector tsin = PG_GETARG_TSVECTOR(0),
816 tsout;
817 ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
818 WordEntry *arrin = ARRPTR(tsin),
819 *arrout;
820 char *datain = STRPTR(tsin),
821 *dataout;
822 Datum *dweights;
823 bool *nulls;
824 int nweights;
825 int i,
826 j;
827 int cur_pos = 0;
828 char mask = 0;
829
830 deconstruct_array(weights, CHAROID, 1, true, TYPALIGN_CHAR,
831 &dweights, &nulls, &nweights);
832
833 for (i = 0; i < nweights; i++)
834 {
835 char char_weight;
836
837 if (nulls[i])
838 ereport(ERROR,
839 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
840 errmsg("weight array may not contain nulls")));
841
842 char_weight = DatumGetChar(dweights[i]);
843 switch (char_weight)
844 {
845 case 'A':
846 case 'a':
847 mask = mask | 8;
848 break;
849 case 'B':
850 case 'b':
851 mask = mask | 4;
852 break;
853 case 'C':
854 case 'c':
855 mask = mask | 2;
856 break;
857 case 'D':
858 case 'd':
859 mask = mask | 1;
860 break;
861 default:
862 ereport(ERROR,
863 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
864 errmsg("unrecognized weight: \"%c\"", char_weight)));
865 }
866 }
867
868 tsout = (TSVector) palloc0(VARSIZE(tsin));
869 tsout->size = tsin->size;
870 arrout = ARRPTR(tsout);
871 dataout = STRPTR(tsout);
872
873 for (i = j = 0; i < tsin->size; i++)
874 {
875 WordEntryPosVector *posvin,
876 *posvout;
877 int npos = 0;
878 int k;
879
880 if (!arrin[i].haspos)
881 continue;
882
883 posvin = _POSVECPTR(tsin, arrin + i);
884 posvout = (WordEntryPosVector *)
885 (dataout + SHORTALIGN(cur_pos + arrin[i].len));
886
887 for (k = 0; k < posvin->npos; k++)
888 {
889 if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
890 posvout->pos[npos++] = posvin->pos[k];
891 }
892
893 /* if no satisfactory positions found, skip lexeme */
894 if (!npos)
895 continue;
896
897 arrout[j].haspos = true;
898 arrout[j].len = arrin[i].len;
899 arrout[j].pos = cur_pos;
900
901 memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
902 posvout->npos = npos;
903 cur_pos += SHORTALIGN(arrin[i].len);
904 cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
905 sizeof(uint16);
906 j++;
907 }
908
909 tsout->size = j;
910 if (dataout != STRPTR(tsout))
911 memmove(STRPTR(tsout), dataout, cur_pos);
912
913 SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
914
915 PG_FREE_IF_COPY(tsin, 0);
916 PG_RETURN_POINTER(tsout);
917 }
918
919 Datum
tsvector_concat(PG_FUNCTION_ARGS)920 tsvector_concat(PG_FUNCTION_ARGS)
921 {
922 TSVector in1 = PG_GETARG_TSVECTOR(0);
923 TSVector in2 = PG_GETARG_TSVECTOR(1);
924 TSVector out;
925 WordEntry *ptr;
926 WordEntry *ptr1,
927 *ptr2;
928 WordEntryPos *p;
929 int maxpos = 0,
930 i,
931 j,
932 i1,
933 i2,
934 dataoff,
935 output_bytes,
936 output_size;
937 char *data,
938 *data1,
939 *data2;
940
941 /* Get max position in in1; we'll need this to offset in2's positions */
942 ptr = ARRPTR(in1);
943 i = in1->size;
944 while (i--)
945 {
946 if ((j = POSDATALEN(in1, ptr)) != 0)
947 {
948 p = POSDATAPTR(in1, ptr);
949 while (j--)
950 {
951 if (WEP_GETPOS(*p) > maxpos)
952 maxpos = WEP_GETPOS(*p);
953 p++;
954 }
955 }
956 ptr++;
957 }
958
959 ptr1 = ARRPTR(in1);
960 ptr2 = ARRPTR(in2);
961 data1 = STRPTR(in1);
962 data2 = STRPTR(in2);
963 i1 = in1->size;
964 i2 = in2->size;
965
966 /*
967 * Conservative estimate of space needed. We might need all the data in
968 * both inputs, and conceivably add a pad byte before position data for
969 * each item where there was none before.
970 */
971 output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
972
973 out = (TSVector) palloc0(output_bytes);
974 SET_VARSIZE(out, output_bytes);
975
976 /*
977 * We must make out->size valid so that STRPTR(out) is sensible. We'll
978 * collapse out any unused space at the end.
979 */
980 out->size = in1->size + in2->size;
981
982 ptr = ARRPTR(out);
983 data = STRPTR(out);
984 dataoff = 0;
985 while (i1 && i2)
986 {
987 int cmp = compareEntry(data1, ptr1, data2, ptr2);
988
989 if (cmp < 0)
990 { /* in1 first */
991 ptr->haspos = ptr1->haspos;
992 ptr->len = ptr1->len;
993 memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
994 ptr->pos = dataoff;
995 dataoff += ptr1->len;
996 if (ptr->haspos)
997 {
998 dataoff = SHORTALIGN(dataoff);
999 memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
1000 dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
1001 }
1002
1003 ptr++;
1004 ptr1++;
1005 i1--;
1006 }
1007 else if (cmp > 0)
1008 { /* in2 first */
1009 ptr->haspos = ptr2->haspos;
1010 ptr->len = ptr2->len;
1011 memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
1012 ptr->pos = dataoff;
1013 dataoff += ptr2->len;
1014 if (ptr->haspos)
1015 {
1016 int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
1017
1018 if (addlen == 0)
1019 ptr->haspos = 0;
1020 else
1021 {
1022 dataoff = SHORTALIGN(dataoff);
1023 dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
1024 }
1025 }
1026
1027 ptr++;
1028 ptr2++;
1029 i2--;
1030 }
1031 else
1032 {
1033 ptr->haspos = ptr1->haspos | ptr2->haspos;
1034 ptr->len = ptr1->len;
1035 memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
1036 ptr->pos = dataoff;
1037 dataoff += ptr1->len;
1038 if (ptr->haspos)
1039 {
1040 if (ptr1->haspos)
1041 {
1042 dataoff = SHORTALIGN(dataoff);
1043 memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
1044 dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
1045 if (ptr2->haspos)
1046 dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
1047 }
1048 else /* must have ptr2->haspos */
1049 {
1050 int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
1051
1052 if (addlen == 0)
1053 ptr->haspos = 0;
1054 else
1055 {
1056 dataoff = SHORTALIGN(dataoff);
1057 dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
1058 }
1059 }
1060 }
1061
1062 ptr++;
1063 ptr1++;
1064 ptr2++;
1065 i1--;
1066 i2--;
1067 }
1068 }
1069
1070 while (i1)
1071 {
1072 ptr->haspos = ptr1->haspos;
1073 ptr->len = ptr1->len;
1074 memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
1075 ptr->pos = dataoff;
1076 dataoff += ptr1->len;
1077 if (ptr->haspos)
1078 {
1079 dataoff = SHORTALIGN(dataoff);
1080 memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
1081 dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
1082 }
1083
1084 ptr++;
1085 ptr1++;
1086 i1--;
1087 }
1088
1089 while (i2)
1090 {
1091 ptr->haspos = ptr2->haspos;
1092 ptr->len = ptr2->len;
1093 memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
1094 ptr->pos = dataoff;
1095 dataoff += ptr2->len;
1096 if (ptr->haspos)
1097 {
1098 int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
1099
1100 if (addlen == 0)
1101 ptr->haspos = 0;
1102 else
1103 {
1104 dataoff = SHORTALIGN(dataoff);
1105 dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
1106 }
1107 }
1108
1109 ptr++;
1110 ptr2++;
1111 i2--;
1112 }
1113
1114 /*
1115 * Instead of checking each offset individually, we check for overflow of
1116 * pos fields once at the end.
1117 */
1118 if (dataoff > MAXSTRPOS)
1119 ereport(ERROR,
1120 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1121 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));
1122
1123 /*
1124 * Adjust sizes (asserting that we didn't overrun the original estimates)
1125 * and collapse out any unused array entries.
1126 */
1127 output_size = ptr - ARRPTR(out);
1128 Assert(output_size <= out->size);
1129 out->size = output_size;
1130 if (data != STRPTR(out))
1131 memmove(STRPTR(out), data, dataoff);
1132 output_bytes = CALCDATASIZE(out->size, dataoff);
1133 Assert(output_bytes <= VARSIZE(out));
1134 SET_VARSIZE(out, output_bytes);
1135
1136 PG_FREE_IF_COPY(in1, 0);
1137 PG_FREE_IF_COPY(in2, 1);
1138 PG_RETURN_POINTER(out);
1139 }
1140
1141 /*
1142 * Compare two strings by tsvector rules.
1143 *
1144 * if prefix = true then it returns zero value iff b has prefix a
1145 */
1146 int32
tsCompareString(char * a,int lena,char * b,int lenb,bool prefix)1147 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
1148 {
1149 int cmp;
1150
1151 if (lena == 0)
1152 {
1153 if (prefix)
1154 cmp = 0; /* empty string is prefix of anything */
1155 else
1156 cmp = (lenb > 0) ? -1 : 0;
1157 }
1158 else if (lenb == 0)
1159 {
1160 cmp = (lena > 0) ? 1 : 0;
1161 }
1162 else
1163 {
1164 cmp = memcmp(a, b, Min(lena, lenb));
1165
1166 if (prefix)
1167 {
1168 if (cmp == 0 && lena > lenb)
1169 cmp = 1; /* a is longer, so not a prefix of b */
1170 }
1171 else if (cmp == 0 && lena != lenb)
1172 {
1173 cmp = (lena < lenb) ? -1 : 1;
1174 }
1175 }
1176
1177 return cmp;
1178 }
1179
1180 /*
1181 * Check weight info or/and fill 'data' with the required positions
1182 */
1183 static TSTernaryValue
checkclass_str(CHKVAL * chkval,WordEntry * entry,QueryOperand * val,ExecPhraseData * data)1184 checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
1185 ExecPhraseData *data)
1186 {
1187 TSTernaryValue result = TS_NO;
1188
1189 Assert(data == NULL || data->npos == 0);
1190
1191 if (entry->haspos)
1192 {
1193 WordEntryPosVector *posvec;
1194
1195 /*
1196 * We can't use the _POSVECPTR macro here because the pointer to the
1197 * tsvector's lexeme storage is already contained in chkval->values.
1198 */
1199 posvec = (WordEntryPosVector *)
1200 (chkval->values + SHORTALIGN(entry->pos + entry->len));
1201
1202 if (val->weight && data)
1203 {
1204 WordEntryPos *posvec_iter = posvec->pos;
1205 WordEntryPos *dptr;
1206
1207 /*
1208 * Filter position information by weights
1209 */
1210 dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
1211 data->allocated = true;
1212
1213 /* Is there a position with a matching weight? */
1214 while (posvec_iter < posvec->pos + posvec->npos)
1215 {
1216 /* If true, append this position to the data->pos */
1217 if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
1218 {
1219 *dptr = WEP_GETPOS(*posvec_iter);
1220 dptr++;
1221 }
1222
1223 posvec_iter++;
1224 }
1225
1226 data->npos = dptr - data->pos;
1227
1228 if (data->npos > 0)
1229 result = TS_YES;
1230 else
1231 {
1232 pfree(data->pos);
1233 data->pos = NULL;
1234 data->allocated = false;
1235 }
1236 }
1237 else if (val->weight)
1238 {
1239 WordEntryPos *posvec_iter = posvec->pos;
1240
1241 /* Is there a position with a matching weight? */
1242 while (posvec_iter < posvec->pos + posvec->npos)
1243 {
1244 if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
1245 {
1246 result = TS_YES;
1247 break; /* no need to go further */
1248 }
1249
1250 posvec_iter++;
1251 }
1252 }
1253 else if (data)
1254 {
1255 data->npos = posvec->npos;
1256 data->pos = posvec->pos;
1257 data->allocated = false;
1258 result = TS_YES;
1259 }
1260 else
1261 {
1262 /* simplest case: no weight check, positions not needed */
1263 result = TS_YES;
1264 }
1265 }
1266 else
1267 {
1268 /*
1269 * Position info is lacking, so if the caller requires it, we can only
1270 * say that maybe there is a match.
1271 *
1272 * Notice, however, that we *don't* check val->weight here.
1273 * Historically, stripped tsvectors are considered to match queries
1274 * whether or not the query has a weight restriction; that's a little
1275 * dubious but we'll preserve the behavior.
1276 */
1277 if (data)
1278 result = TS_MAYBE;
1279 else
1280 result = TS_YES;
1281 }
1282
1283 return result;
1284 }
1285
1286 /*
1287 * TS_execute callback for matching a tsquery operand to plain tsvector data
1288 */
1289 static TSTernaryValue
checkcondition_str(void * checkval,QueryOperand * val,ExecPhraseData * data)1290 checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
1291 {
1292 CHKVAL *chkval = (CHKVAL *) checkval;
1293 WordEntry *StopLow = chkval->arrb;
1294 WordEntry *StopHigh = chkval->arre;
1295 WordEntry *StopMiddle = StopHigh;
1296 TSTernaryValue res = TS_NO;
1297
1298 /* Loop invariant: StopLow <= val < StopHigh */
1299 while (StopLow < StopHigh)
1300 {
1301 int difference;
1302
1303 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
1304 difference = tsCompareString(chkval->operand + val->distance,
1305 val->length,
1306 chkval->values + StopMiddle->pos,
1307 StopMiddle->len,
1308 false);
1309
1310 if (difference == 0)
1311 {
1312 /* Check weight info & fill 'data' with positions */
1313 res = checkclass_str(chkval, StopMiddle, val, data);
1314 break;
1315 }
1316 else if (difference > 0)
1317 StopLow = StopMiddle + 1;
1318 else
1319 StopHigh = StopMiddle;
1320 }
1321
1322 /*
1323 * If it's a prefix search, we should also consider lexemes that the
1324 * search term is a prefix of (which will necessarily immediately follow
1325 * the place we found in the above loop). But we can skip them if there
1326 * was a definite match on the exact term AND the caller doesn't need
1327 * position info.
1328 */
1329 if (val->prefix && (res != TS_YES || data))
1330 {
1331 WordEntryPos *allpos = NULL;
1332 int npos = 0,
1333 totalpos = 0;
1334
1335 /* adjust start position for corner case */
1336 if (StopLow >= StopHigh)
1337 StopMiddle = StopHigh;
1338
1339 /* we don't try to re-use any data from the initial match */
1340 if (data)
1341 {
1342 if (data->allocated)
1343 pfree(data->pos);
1344 data->pos = NULL;
1345 data->allocated = false;
1346 data->npos = 0;
1347 }
1348 res = TS_NO;
1349
1350 while ((res != TS_YES || data) &&
1351 StopMiddle < chkval->arre &&
1352 tsCompareString(chkval->operand + val->distance,
1353 val->length,
1354 chkval->values + StopMiddle->pos,
1355 StopMiddle->len,
1356 true) == 0)
1357 {
1358 TSTernaryValue subres;
1359
1360 subres = checkclass_str(chkval, StopMiddle, val, data);
1361
1362 if (subres != TS_NO)
1363 {
1364 if (data)
1365 {
1366 /*
1367 * We need to join position information
1368 */
1369 if (subres == TS_MAYBE)
1370 {
1371 /*
1372 * No position info for this match, so we must report
1373 * MAYBE overall.
1374 */
1375 res = TS_MAYBE;
1376 /* forget any previous positions */
1377 npos = 0;
1378 /* don't leak storage */
1379 if (allpos)
1380 pfree(allpos);
1381 break;
1382 }
1383
1384 while (npos + data->npos > totalpos)
1385 {
1386 if (totalpos == 0)
1387 {
1388 totalpos = 256;
1389 allpos = palloc(sizeof(WordEntryPos) * totalpos);
1390 }
1391 else
1392 {
1393 totalpos *= 2;
1394 allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos);
1395 }
1396 }
1397
1398 memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
1399 npos += data->npos;
1400
1401 /* don't leak storage from individual matches */
1402 if (data->allocated)
1403 pfree(data->pos);
1404 data->pos = NULL;
1405 data->allocated = false;
1406 /* it's important to reset data->npos before next loop */
1407 data->npos = 0;
1408 }
1409 else
1410 {
1411 /* Don't need positions, just handle YES/MAYBE */
1412 if (subres == TS_YES || res == TS_NO)
1413 res = subres;
1414 }
1415 }
1416
1417 StopMiddle++;
1418 }
1419
1420 if (data && npos > 0)
1421 {
1422 /* Sort and make unique array of found positions */
1423 data->pos = allpos;
1424 qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
1425 data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
1426 compareWordEntryPos);
1427 data->allocated = true;
1428 res = TS_YES;
1429 }
1430 }
1431
1432 return res;
1433 }
1434
1435 /*
1436 * Compute output position list for a tsquery operator in phrase mode.
1437 *
1438 * Merge the position lists in Ldata and Rdata as specified by "emit",
1439 * returning the result list into *data. The input position lists must be
1440 * sorted and unique, and the output will be as well.
1441 *
1442 * data: pointer to initially-all-zeroes output struct, or NULL
1443 * Ldata, Rdata: input position lists
1444 * emit: bitmask of TSPO_XXX flags
1445 * Loffset: offset to be added to Ldata positions before comparing/outputting
1446 * Roffset: offset to be added to Rdata positions before comparing/outputting
1447 * max_npos: maximum possible required size of output position array
1448 *
1449 * Loffset and Roffset should not be negative, else we risk trying to output
1450 * negative positions, which won't fit into WordEntryPos.
1451 *
1452 * The result is boolean (TS_YES or TS_NO), but for the caller's convenience
1453 * we return it as TSTernaryValue.
1454 *
1455 * Returns TS_YES if any positions were emitted to *data; or if data is NULL,
1456 * returns TS_YES if any positions would have been emitted.
1457 */
1458 #define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
1459 #define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
1460 #define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
1461
1462 static TSTernaryValue
TS_phrase_output(ExecPhraseData * data,ExecPhraseData * Ldata,ExecPhraseData * Rdata,int emit,int Loffset,int Roffset,int max_npos)1463 TS_phrase_output(ExecPhraseData *data,
1464 ExecPhraseData *Ldata,
1465 ExecPhraseData *Rdata,
1466 int emit,
1467 int Loffset,
1468 int Roffset,
1469 int max_npos)
1470 {
1471 int Lindex,
1472 Rindex;
1473
1474 /* Loop until both inputs are exhausted */
1475 Lindex = Rindex = 0;
1476 while (Lindex < Ldata->npos || Rindex < Rdata->npos)
1477 {
1478 int Lpos,
1479 Rpos;
1480 int output_pos = 0;
1481
1482 /*
1483 * Fetch current values to compare. WEP_GETPOS() is needed because
1484 * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
1485 */
1486 if (Lindex < Ldata->npos)
1487 Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
1488 else
1489 {
1490 /* L array exhausted, so we're done if R_ONLY isn't set */
1491 if (!(emit & TSPO_R_ONLY))
1492 break;
1493 Lpos = INT_MAX;
1494 }
1495 if (Rindex < Rdata->npos)
1496 Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
1497 else
1498 {
1499 /* R array exhausted, so we're done if L_ONLY isn't set */
1500 if (!(emit & TSPO_L_ONLY))
1501 break;
1502 Rpos = INT_MAX;
1503 }
1504
1505 /* Merge-join the two input lists */
1506 if (Lpos < Rpos)
1507 {
1508 /* Lpos is not matched in Rdata, should we output it? */
1509 if (emit & TSPO_L_ONLY)
1510 output_pos = Lpos;
1511 Lindex++;
1512 }
1513 else if (Lpos == Rpos)
1514 {
1515 /* Lpos and Rpos match ... should we output it? */
1516 if (emit & TSPO_BOTH)
1517 output_pos = Rpos;
1518 Lindex++;
1519 Rindex++;
1520 }
1521 else /* Lpos > Rpos */
1522 {
1523 /* Rpos is not matched in Ldata, should we output it? */
1524 if (emit & TSPO_R_ONLY)
1525 output_pos = Rpos;
1526 Rindex++;
1527 }
1528
1529 if (output_pos > 0)
1530 {
1531 if (data)
1532 {
1533 /* Store position, first allocating output array if needed */
1534 if (data->pos == NULL)
1535 {
1536 data->pos = (WordEntryPos *)
1537 palloc(max_npos * sizeof(WordEntryPos));
1538 data->allocated = true;
1539 }
1540 data->pos[data->npos++] = output_pos;
1541 }
1542 else
1543 {
1544 /*
1545 * Exact positions not needed, so return TS_YES as soon as we
1546 * know there is at least one.
1547 */
1548 return TS_YES;
1549 }
1550 }
1551 }
1552
1553 if (data && data->npos > 0)
1554 {
1555 /* Let's assert we didn't overrun the array */
1556 Assert(data->npos <= max_npos);
1557 return TS_YES;
1558 }
1559 return TS_NO;
1560 }
1561
1562 /*
1563 * Execute tsquery at or below an OP_PHRASE operator.
1564 *
1565 * This handles tsquery execution at recursion levels where we need to care
1566 * about match locations.
1567 *
1568 * In addition to the same arguments used for TS_execute, the caller may pass
1569 * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
1570 * match position info on success. data == NULL if no position data need be
1571 * returned. (In practice, outside callers pass NULL, and only the internal
1572 * recursion cases pass a data pointer.)
1573 * Note: the function assumes data != NULL for operators other than OP_PHRASE.
1574 * This is OK because an outside call always starts from an OP_PHRASE node.
1575 *
1576 * The detailed semantics of the match data, given that the function returned
1577 * TS_YES (successful match), are:
1578 *
1579 * npos > 0, negate = false:
1580 * query is matched at specified position(s) (and only those positions)
1581 * npos > 0, negate = true:
1582 * query is matched at all positions *except* specified position(s)
1583 * npos = 0, negate = true:
1584 * query is matched at all positions
1585 * npos = 0, negate = false:
1586 * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate)
1587 *
1588 * Successful matches also return a "width" value which is the match width in
1589 * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
1590 * and is the sum of the phrase operator distances for phrase matches. Note
1591 * that when width > 0, the listed positions represent the ends of matches not
1592 * the starts. (This unintuitive rule is needed to avoid possibly generating
1593 * negative positions, which wouldn't fit into the WordEntryPos arrays.)
1594 *
1595 * If the TSExecuteCallback function reports that an operand is present
1596 * but fails to provide position(s) for it, we will return TS_MAYBE when
1597 * it is possible but not certain that the query is matched.
1598 *
1599 * When the function returns TS_NO or TS_MAYBE, it must return npos = 0,
1600 * negate = false (which is the state initialized by the caller); but the
1601 * "width" output in such cases is undefined.
1602 */
1603 static TSTernaryValue
TS_phrase_execute(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond,ExecPhraseData * data)1604 TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
1605 TSExecuteCallback chkcond,
1606 ExecPhraseData *data)
1607 {
1608 ExecPhraseData Ldata,
1609 Rdata;
1610 TSTernaryValue lmatch,
1611 rmatch;
1612 int Loffset,
1613 Roffset,
1614 maxwidth;
1615
1616 /* since this function recurses, it could be driven to stack overflow */
1617 check_stack_depth();
1618
1619 if (curitem->type == QI_VAL)
1620 return chkcond(arg, (QueryOperand *) curitem, data);
1621
1622 switch (curitem->qoperator.oper)
1623 {
1624 case OP_NOT:
1625
1626 /*
1627 * We need not touch data->width, since a NOT operation does not
1628 * change the match width.
1629 */
1630 if (flags & TS_EXEC_SKIP_NOT)
1631 {
1632 /* with SKIP_NOT, report NOT as "match everywhere" */
1633 Assert(data->npos == 0 && !data->negate);
1634 data->negate = true;
1635 return TS_YES;
1636 }
1637 switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
1638 {
1639 case TS_NO:
1640 /* change "match nowhere" to "match everywhere" */
1641 Assert(data->npos == 0 && !data->negate);
1642 data->negate = true;
1643 return TS_YES;
1644 case TS_YES:
1645 if (data->npos > 0)
1646 {
1647 /* we have some positions, invert negate flag */
1648 data->negate = !data->negate;
1649 return TS_YES;
1650 }
1651 else if (data->negate)
1652 {
1653 /* change "match everywhere" to "match nowhere" */
1654 data->negate = false;
1655 return TS_NO;
1656 }
1657 /* Should not get here if result was TS_YES */
1658 Assert(false);
1659 break;
1660 case TS_MAYBE:
1661 /* match positions are, and remain, uncertain */
1662 return TS_MAYBE;
1663 }
1664 break;
1665
1666 case OP_PHRASE:
1667 case OP_AND:
1668 memset(&Ldata, 0, sizeof(Ldata));
1669 memset(&Rdata, 0, sizeof(Rdata));
1670
1671 lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
1672 arg, flags, chkcond, &Ldata);
1673 if (lmatch == TS_NO)
1674 return TS_NO;
1675
1676 rmatch = TS_phrase_execute(curitem + 1,
1677 arg, flags, chkcond, &Rdata);
1678 if (rmatch == TS_NO)
1679 return TS_NO;
1680
1681 /*
1682 * If either operand has no position information, then we can't
1683 * return reliable position data, only a MAYBE result.
1684 */
1685 if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
1686 return TS_MAYBE;
1687
1688 if (curitem->qoperator.oper == OP_PHRASE)
1689 {
1690 /*
1691 * Compute Loffset and Roffset suitable for phrase match, and
1692 * compute overall width of whole phrase match.
1693 */
1694 Loffset = curitem->qoperator.distance + Rdata.width;
1695 Roffset = 0;
1696 if (data)
1697 data->width = curitem->qoperator.distance +
1698 Ldata.width + Rdata.width;
1699 }
1700 else
1701 {
1702 /*
1703 * For OP_AND, set output width and alignment like OP_OR (see
1704 * comment below)
1705 */
1706 maxwidth = Max(Ldata.width, Rdata.width);
1707 Loffset = maxwidth - Ldata.width;
1708 Roffset = maxwidth - Rdata.width;
1709 if (data)
1710 data->width = maxwidth;
1711 }
1712
1713 if (Ldata.negate && Rdata.negate)
1714 {
1715 /* !L & !R: treat as !(L | R) */
1716 (void) TS_phrase_output(data, &Ldata, &Rdata,
1717 TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1718 Loffset, Roffset,
1719 Ldata.npos + Rdata.npos);
1720 if (data)
1721 data->negate = true;
1722 return TS_YES;
1723 }
1724 else if (Ldata.negate)
1725 {
1726 /* !L & R */
1727 return TS_phrase_output(data, &Ldata, &Rdata,
1728 TSPO_R_ONLY,
1729 Loffset, Roffset,
1730 Rdata.npos);
1731 }
1732 else if (Rdata.negate)
1733 {
1734 /* L & !R */
1735 return TS_phrase_output(data, &Ldata, &Rdata,
1736 TSPO_L_ONLY,
1737 Loffset, Roffset,
1738 Ldata.npos);
1739 }
1740 else
1741 {
1742 /* straight AND */
1743 return TS_phrase_output(data, &Ldata, &Rdata,
1744 TSPO_BOTH,
1745 Loffset, Roffset,
1746 Min(Ldata.npos, Rdata.npos));
1747 }
1748
1749 case OP_OR:
1750 memset(&Ldata, 0, sizeof(Ldata));
1751 memset(&Rdata, 0, sizeof(Rdata));
1752
1753 lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
1754 arg, flags, chkcond, &Ldata);
1755 rmatch = TS_phrase_execute(curitem + 1,
1756 arg, flags, chkcond, &Rdata);
1757
1758 if (lmatch == TS_NO && rmatch == TS_NO)
1759 return TS_NO;
1760
1761 /*
1762 * If either operand has no position information, then we can't
1763 * return reliable position data, only a MAYBE result.
1764 */
1765 if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
1766 return TS_MAYBE;
1767
1768 /*
1769 * Cope with undefined output width from failed submatch. (This
1770 * takes less code than trying to ensure that all failure returns
1771 * set data->width to zero.)
1772 */
1773 if (lmatch == TS_NO)
1774 Ldata.width = 0;
1775 if (rmatch == TS_NO)
1776 Rdata.width = 0;
1777
1778 /*
1779 * For OP_AND and OP_OR, report the width of the wider of the two
1780 * inputs, and align the narrower input's positions to the right
1781 * end of that width. This rule deals at least somewhat
1782 * reasonably with cases like "x <-> (y | z <-> q)".
1783 */
1784 maxwidth = Max(Ldata.width, Rdata.width);
1785 Loffset = maxwidth - Ldata.width;
1786 Roffset = maxwidth - Rdata.width;
1787 data->width = maxwidth;
1788
1789 if (Ldata.negate && Rdata.negate)
1790 {
1791 /* !L | !R: treat as !(L & R) */
1792 (void) TS_phrase_output(data, &Ldata, &Rdata,
1793 TSPO_BOTH,
1794 Loffset, Roffset,
1795 Min(Ldata.npos, Rdata.npos));
1796 data->negate = true;
1797 return TS_YES;
1798 }
1799 else if (Ldata.negate)
1800 {
1801 /* !L | R: treat as !(L & !R) */
1802 (void) TS_phrase_output(data, &Ldata, &Rdata,
1803 TSPO_L_ONLY,
1804 Loffset, Roffset,
1805 Ldata.npos);
1806 data->negate = true;
1807 return TS_YES;
1808 }
1809 else if (Rdata.negate)
1810 {
1811 /* L | !R: treat as !(!L & R) */
1812 (void) TS_phrase_output(data, &Ldata, &Rdata,
1813 TSPO_R_ONLY,
1814 Loffset, Roffset,
1815 Rdata.npos);
1816 data->negate = true;
1817 return TS_YES;
1818 }
1819 else
1820 {
1821 /* straight OR */
1822 return TS_phrase_output(data, &Ldata, &Rdata,
1823 TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1824 Loffset, Roffset,
1825 Ldata.npos + Rdata.npos);
1826 }
1827
1828 default:
1829 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1830 }
1831
1832 /* not reachable, but keep compiler quiet */
1833 return TS_NO;
1834 }
1835
1836
1837 /*
1838 * Evaluate tsquery boolean expression.
1839 *
1840 * curitem: current tsquery item (initially, the first one)
1841 * arg: opaque value to pass through to callback function
1842 * flags: bitmask of flag bits shown in ts_utils.h
1843 * chkcond: callback function to check whether a primitive value is present
1844 */
1845 bool
TS_execute(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond)1846 TS_execute(QueryItem *curitem, void *arg, uint32 flags,
1847 TSExecuteCallback chkcond)
1848 {
1849 /*
1850 * If we get TS_MAYBE from the recursion, return true. We could only see
1851 * that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no
1852 * need to check again.
1853 */
1854 return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO;
1855 }
1856
1857 /*
1858 * Evaluate tsquery boolean expression.
1859 *
1860 * This is the same as TS_execute except that TS_MAYBE is returned as-is.
1861 */
1862 TSTernaryValue
TS_execute_ternary(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond)1863 TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags,
1864 TSExecuteCallback chkcond)
1865 {
1866 return TS_execute_recurse(curitem, arg, flags, chkcond);
1867 }
1868
1869 /*
1870 * TS_execute recursion for operators above any phrase operator. Here we do
1871 * not need to worry about lexeme positions. As soon as we hit an OP_PHRASE
1872 * operator, we pass it off to TS_phrase_execute which does worry.
1873 */
1874 static TSTernaryValue
TS_execute_recurse(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond)1875 TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
1876 TSExecuteCallback chkcond)
1877 {
1878 TSTernaryValue lmatch;
1879
1880 /* since this function recurses, it could be driven to stack overflow */
1881 check_stack_depth();
1882
1883 /* ... and let's check for query cancel while we're at it */
1884 CHECK_FOR_INTERRUPTS();
1885
1886 if (curitem->type == QI_VAL)
1887 return chkcond(arg, (QueryOperand *) curitem,
1888 NULL /* don't need position info */ );
1889
1890 switch (curitem->qoperator.oper)
1891 {
1892 case OP_NOT:
1893 if (flags & TS_EXEC_SKIP_NOT)
1894 return TS_YES;
1895 switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1896 {
1897 case TS_NO:
1898 return TS_YES;
1899 case TS_YES:
1900 return TS_NO;
1901 case TS_MAYBE:
1902 return TS_MAYBE;
1903 }
1904 break;
1905
1906 case OP_AND:
1907 lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
1908 flags, chkcond);
1909 if (lmatch == TS_NO)
1910 return TS_NO;
1911 switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1912 {
1913 case TS_NO:
1914 return TS_NO;
1915 case TS_YES:
1916 return lmatch;
1917 case TS_MAYBE:
1918 return TS_MAYBE;
1919 }
1920 break;
1921
1922 case OP_OR:
1923 lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
1924 flags, chkcond);
1925 if (lmatch == TS_YES)
1926 return TS_YES;
1927 switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1928 {
1929 case TS_NO:
1930 return lmatch;
1931 case TS_YES:
1932 return TS_YES;
1933 case TS_MAYBE:
1934 return TS_MAYBE;
1935 }
1936 break;
1937
1938 case OP_PHRASE:
1939
1940 /*
1941 * If we get a MAYBE result, and the caller doesn't want that,
1942 * convert it to NO. It would be more consistent, perhaps, to
1943 * return the result of TS_phrase_execute() verbatim and then
1944 * convert MAYBE results at the top of the recursion. But
1945 * converting at the topmost phrase operator gives results that
1946 * are bug-compatible with the old implementation, so do it like
1947 * this for now.
1948 */
1949 switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL))
1950 {
1951 case TS_NO:
1952 return TS_NO;
1953 case TS_YES:
1954 return TS_YES;
1955 case TS_MAYBE:
1956 return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO;
1957 }
1958 break;
1959
1960 default:
1961 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1962 }
1963
1964 /* not reachable, but keep compiler quiet */
1965 return TS_NO;
1966 }
1967
1968 /*
1969 * Detect whether a tsquery boolean expression requires any positive matches
1970 * to values shown in the tsquery.
1971 *
1972 * This is needed to know whether a GIN index search requires full index scan.
1973 * For example, 'x & !y' requires a match of x, so it's sufficient to scan
1974 * entries for x; but 'x | !y' could match rows containing neither x nor y.
1975 */
1976 bool
tsquery_requires_match(QueryItem * curitem)1977 tsquery_requires_match(QueryItem *curitem)
1978 {
1979 /* since this function recurses, it could be driven to stack overflow */
1980 check_stack_depth();
1981
1982 if (curitem->type == QI_VAL)
1983 return true;
1984
1985 switch (curitem->qoperator.oper)
1986 {
1987 case OP_NOT:
1988
1989 /*
1990 * Assume there are no required matches underneath a NOT. For
1991 * some cases with nested NOTs, we could prove there's a required
1992 * match, but it seems unlikely to be worth the trouble.
1993 */
1994 return false;
1995
1996 case OP_PHRASE:
1997
1998 /*
1999 * Treat OP_PHRASE as OP_AND here
2000 */
2001 case OP_AND:
2002 /* If either side requires a match, we're good */
2003 if (tsquery_requires_match(curitem + curitem->qoperator.left))
2004 return true;
2005 else
2006 return tsquery_requires_match(curitem + 1);
2007
2008 case OP_OR:
2009 /* Both sides must require a match */
2010 if (tsquery_requires_match(curitem + curitem->qoperator.left))
2011 return tsquery_requires_match(curitem + 1);
2012 else
2013 return false;
2014
2015 default:
2016 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
2017 }
2018
2019 /* not reachable, but keep compiler quiet */
2020 return false;
2021 }
2022
2023 /*
2024 * boolean operations
2025 */
2026 Datum
ts_match_qv(PG_FUNCTION_ARGS)2027 ts_match_qv(PG_FUNCTION_ARGS)
2028 {
2029 PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
2030 PG_GETARG_DATUM(1),
2031 PG_GETARG_DATUM(0)));
2032 }
2033
2034 Datum
ts_match_vq(PG_FUNCTION_ARGS)2035 ts_match_vq(PG_FUNCTION_ARGS)
2036 {
2037 TSVector val = PG_GETARG_TSVECTOR(0);
2038 TSQuery query = PG_GETARG_TSQUERY(1);
2039 CHKVAL chkval;
2040 bool result;
2041
2042 /* empty query matches nothing */
2043 if (!query->size)
2044 {
2045 PG_FREE_IF_COPY(val, 0);
2046 PG_FREE_IF_COPY(query, 1);
2047 PG_RETURN_BOOL(false);
2048 }
2049
2050 chkval.arrb = ARRPTR(val);
2051 chkval.arre = chkval.arrb + val->size;
2052 chkval.values = STRPTR(val);
2053 chkval.operand = GETOPERAND(query);
2054 result = TS_execute(GETQUERY(query),
2055 &chkval,
2056 TS_EXEC_EMPTY,
2057 checkcondition_str);
2058
2059 PG_FREE_IF_COPY(val, 0);
2060 PG_FREE_IF_COPY(query, 1);
2061 PG_RETURN_BOOL(result);
2062 }
2063
2064 Datum
ts_match_tt(PG_FUNCTION_ARGS)2065 ts_match_tt(PG_FUNCTION_ARGS)
2066 {
2067 TSVector vector;
2068 TSQuery query;
2069 bool res;
2070
2071 vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
2072 PG_GETARG_DATUM(0)));
2073 query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
2074 PG_GETARG_DATUM(1)));
2075
2076 res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
2077 TSVectorGetDatum(vector),
2078 TSQueryGetDatum(query)));
2079
2080 pfree(vector);
2081 pfree(query);
2082
2083 PG_RETURN_BOOL(res);
2084 }
2085
2086 Datum
ts_match_tq(PG_FUNCTION_ARGS)2087 ts_match_tq(PG_FUNCTION_ARGS)
2088 {
2089 TSVector vector;
2090 TSQuery query = PG_GETARG_TSQUERY(1);
2091 bool res;
2092
2093 vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
2094 PG_GETARG_DATUM(0)));
2095
2096 res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
2097 TSVectorGetDatum(vector),
2098 TSQueryGetDatum(query)));
2099
2100 pfree(vector);
2101 PG_FREE_IF_COPY(query, 1);
2102
2103 PG_RETURN_BOOL(res);
2104 }
2105
2106 /*
2107 * ts_stat statistic function support
2108 */
2109
2110
2111 /*
2112 * Returns the number of positions in value 'wptr' within tsvector 'txt',
2113 * that have a weight equal to one of the weights in 'weight' bitmask.
2114 */
2115 static int
check_weight(TSVector txt,WordEntry * wptr,int8 weight)2116 check_weight(TSVector txt, WordEntry *wptr, int8 weight)
2117 {
2118 int len = POSDATALEN(txt, wptr);
2119 int num = 0;
2120 WordEntryPos *ptr = POSDATAPTR(txt, wptr);
2121
2122 while (len--)
2123 {
2124 if (weight & (1 << WEP_GETWEIGHT(*ptr)))
2125 num++;
2126 ptr++;
2127 }
2128 return num;
2129 }
2130
2131 #define compareStatWord(a,e,t) \
2132 tsCompareString((a)->lexeme, (a)->lenlexeme, \
2133 STRPTR(t) + (e)->pos, (e)->len, \
2134 false)
2135
2136 static void
insertStatEntry(MemoryContext persistentContext,TSVectorStat * stat,TSVector txt,uint32 off)2137 insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
2138 {
2139 WordEntry *we = ARRPTR(txt) + off;
2140 StatEntry *node = stat->root,
2141 *pnode = NULL;
2142 int n,
2143 res = 0;
2144 uint32 depth = 1;
2145
2146 if (stat->weight == 0)
2147 n = (we->haspos) ? POSDATALEN(txt, we) : 1;
2148 else
2149 n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
2150
2151 if (n == 0)
2152 return; /* nothing to insert */
2153
2154 while (node)
2155 {
2156 res = compareStatWord(node, we, txt);
2157
2158 if (res == 0)
2159 {
2160 break;
2161 }
2162 else
2163 {
2164 pnode = node;
2165 node = (res < 0) ? node->left : node->right;
2166 }
2167 depth++;
2168 }
2169
2170 if (depth > stat->maxdepth)
2171 stat->maxdepth = depth;
2172
2173 if (node == NULL)
2174 {
2175 node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
2176 node->left = node->right = NULL;
2177 node->ndoc = 1;
2178 node->nentry = n;
2179 node->lenlexeme = we->len;
2180 memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
2181
2182 if (pnode == NULL)
2183 {
2184 stat->root = node;
2185 }
2186 else
2187 {
2188 if (res < 0)
2189 pnode->left = node;
2190 else
2191 pnode->right = node;
2192 }
2193
2194 }
2195 else
2196 {
2197 node->ndoc++;
2198 node->nentry += n;
2199 }
2200 }
2201
2202 static void
chooseNextStatEntry(MemoryContext persistentContext,TSVectorStat * stat,TSVector txt,uint32 low,uint32 high,uint32 offset)2203 chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt,
2204 uint32 low, uint32 high, uint32 offset)
2205 {
2206 uint32 pos;
2207 uint32 middle = (low + high) >> 1;
2208
2209 pos = (low + middle) >> 1;
2210 if (low != middle && pos >= offset && pos - offset < txt->size)
2211 insertStatEntry(persistentContext, stat, txt, pos - offset);
2212 pos = (high + middle + 1) >> 1;
2213 if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
2214 insertStatEntry(persistentContext, stat, txt, pos - offset);
2215
2216 if (low != middle)
2217 chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset);
2218 if (high != middle + 1)
2219 chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset);
2220 }
2221
2222 /*
2223 * This is written like a custom aggregate function, because the
2224 * original plan was to do just that. Unfortunately, an aggregate function
2225 * can't return a set, so that plan was abandoned. If that limitation is
2226 * lifted in the future, ts_stat could be a real aggregate function so that
2227 * you could use it like this:
2228 *
2229 * SELECT ts_stat(vector_column) FROM vector_table;
2230 *
2231 * where vector_column is a tsvector-type column in vector_table.
2232 */
2233
2234 static TSVectorStat *
ts_accum(MemoryContext persistentContext,TSVectorStat * stat,Datum data)2235 ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
2236 {
2237 TSVector txt = DatumGetTSVector(data);
2238 uint32 i,
2239 nbit = 0,
2240 offset;
2241
2242 if (stat == NULL)
2243 { /* Init in first */
2244 stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2245 stat->maxdepth = 1;
2246 }
2247
2248 /* simple check of correctness */
2249 if (txt == NULL || txt->size == 0)
2250 {
2251 if (txt && txt != (TSVector) DatumGetPointer(data))
2252 pfree(txt);
2253 return stat;
2254 }
2255
2256 i = txt->size - 1;
2257 for (; i > 0; i >>= 1)
2258 nbit++;
2259
2260 nbit = 1 << nbit;
2261 offset = (nbit - txt->size) / 2;
2262
2263 insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
2264 chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
2265
2266 return stat;
2267 }
2268
2269 static void
ts_setup_firstcall(FunctionCallInfo fcinfo,FuncCallContext * funcctx,TSVectorStat * stat)2270 ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
2271 TSVectorStat *stat)
2272 {
2273 TupleDesc tupdesc;
2274 MemoryContext oldcontext;
2275 StatEntry *node;
2276
2277 funcctx->user_fctx = (void *) stat;
2278
2279 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
2280
2281 stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1));
2282 stat->stackpos = 0;
2283
2284 node = stat->root;
2285 /* find leftmost value */
2286 if (node == NULL)
2287 stat->stack[stat->stackpos] = NULL;
2288 else
2289 for (;;)
2290 {
2291 stat->stack[stat->stackpos] = node;
2292 if (node->left)
2293 {
2294 stat->stackpos++;
2295 node = node->left;
2296 }
2297 else
2298 break;
2299 }
2300 Assert(stat->stackpos <= stat->maxdepth);
2301
2302 tupdesc = CreateTemplateTupleDesc(3);
2303 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word",
2304 TEXTOID, -1, 0);
2305 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc",
2306 INT4OID, -1, 0);
2307 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry",
2308 INT4OID, -1, 0);
2309 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
2310 funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
2311
2312 MemoryContextSwitchTo(oldcontext);
2313 }
2314
2315 static StatEntry *
walkStatEntryTree(TSVectorStat * stat)2316 walkStatEntryTree(TSVectorStat *stat)
2317 {
2318 StatEntry *node = stat->stack[stat->stackpos];
2319
2320 if (node == NULL)
2321 return NULL;
2322
2323 if (node->ndoc != 0)
2324 {
2325 /* return entry itself: we already was at left sublink */
2326 return node;
2327 }
2328 else if (node->right && node->right != stat->stack[stat->stackpos + 1])
2329 {
2330 /* go on right sublink */
2331 stat->stackpos++;
2332 node = node->right;
2333
2334 /* find most-left value */
2335 for (;;)
2336 {
2337 stat->stack[stat->stackpos] = node;
2338 if (node->left)
2339 {
2340 stat->stackpos++;
2341 node = node->left;
2342 }
2343 else
2344 break;
2345 }
2346 Assert(stat->stackpos <= stat->maxdepth);
2347 }
2348 else
2349 {
2350 /* we already return all left subtree, itself and right subtree */
2351 if (stat->stackpos == 0)
2352 return NULL;
2353
2354 stat->stackpos--;
2355 return walkStatEntryTree(stat);
2356 }
2357
2358 return node;
2359 }
2360
2361 static Datum
ts_process_call(FuncCallContext * funcctx)2362 ts_process_call(FuncCallContext *funcctx)
2363 {
2364 TSVectorStat *st;
2365 StatEntry *entry;
2366
2367 st = (TSVectorStat *) funcctx->user_fctx;
2368
2369 entry = walkStatEntryTree(st);
2370
2371 if (entry != NULL)
2372 {
2373 Datum result;
2374 char *values[3];
2375 char ndoc[16];
2376 char nentry[16];
2377 HeapTuple tuple;
2378
2379 values[0] = palloc(entry->lenlexeme + 1);
2380 memcpy(values[0], entry->lexeme, entry->lenlexeme);
2381 (values[0])[entry->lenlexeme] = '\0';
2382 sprintf(ndoc, "%d", entry->ndoc);
2383 values[1] = ndoc;
2384 sprintf(nentry, "%d", entry->nentry);
2385 values[2] = nentry;
2386
2387 tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
2388 result = HeapTupleGetDatum(tuple);
2389
2390 pfree(values[0]);
2391
2392 /* mark entry as already visited */
2393 entry->ndoc = 0;
2394
2395 return result;
2396 }
2397
2398 return (Datum) 0;
2399 }
2400
2401 static TSVectorStat *
ts_stat_sql(MemoryContext persistentContext,text * txt,text * ws)2402 ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
2403 {
2404 char *query = text_to_cstring(txt);
2405 TSVectorStat *stat;
2406 bool isnull;
2407 Portal portal;
2408 SPIPlanPtr plan;
2409
2410 if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
2411 /* internal error */
2412 elog(ERROR, "SPI_prepare(\"%s\") failed", query);
2413
2414 if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL)
2415 /* internal error */
2416 elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
2417
2418 SPI_cursor_fetch(portal, true, 100);
2419
2420 if (SPI_tuptable == NULL ||
2421 SPI_tuptable->tupdesc->natts != 1 ||
2422 !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
2423 TSVECTOROID))
2424 ereport(ERROR,
2425 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2426 errmsg("ts_stat query must return one tsvector column")));
2427
2428 stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2429 stat->maxdepth = 1;
2430
2431 if (ws)
2432 {
2433 char *buf;
2434
2435 buf = VARDATA_ANY(ws);
2436 while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
2437 {
2438 if (pg_mblen(buf) == 1)
2439 {
2440 switch (*buf)
2441 {
2442 case 'A':
2443 case 'a':
2444 stat->weight |= 1 << 3;
2445 break;
2446 case 'B':
2447 case 'b':
2448 stat->weight |= 1 << 2;
2449 break;
2450 case 'C':
2451 case 'c':
2452 stat->weight |= 1 << 1;
2453 break;
2454 case 'D':
2455 case 'd':
2456 stat->weight |= 1;
2457 break;
2458 default:
2459 stat->weight |= 0;
2460 }
2461 }
2462 buf += pg_mblen(buf);
2463 }
2464 }
2465
2466 while (SPI_processed > 0)
2467 {
2468 uint64 i;
2469
2470 for (i = 0; i < SPI_processed; i++)
2471 {
2472 Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
2473
2474 if (!isnull)
2475 stat = ts_accum(persistentContext, stat, data);
2476 }
2477
2478 SPI_freetuptable(SPI_tuptable);
2479 SPI_cursor_fetch(portal, true, 100);
2480 }
2481
2482 SPI_freetuptable(SPI_tuptable);
2483 SPI_cursor_close(portal);
2484 SPI_freeplan(plan);
2485 pfree(query);
2486
2487 return stat;
2488 }
2489
2490 Datum
ts_stat1(PG_FUNCTION_ARGS)2491 ts_stat1(PG_FUNCTION_ARGS)
2492 {
2493 FuncCallContext *funcctx;
2494 Datum result;
2495
2496 if (SRF_IS_FIRSTCALL())
2497 {
2498 TSVectorStat *stat;
2499 text *txt = PG_GETARG_TEXT_PP(0);
2500
2501 funcctx = SRF_FIRSTCALL_INIT();
2502 SPI_connect();
2503 stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL);
2504 PG_FREE_IF_COPY(txt, 0);
2505 ts_setup_firstcall(fcinfo, funcctx, stat);
2506 SPI_finish();
2507 }
2508
2509 funcctx = SRF_PERCALL_SETUP();
2510 if ((result = ts_process_call(funcctx)) != (Datum) 0)
2511 SRF_RETURN_NEXT(funcctx, result);
2512 SRF_RETURN_DONE(funcctx);
2513 }
2514
2515 Datum
ts_stat2(PG_FUNCTION_ARGS)2516 ts_stat2(PG_FUNCTION_ARGS)
2517 {
2518 FuncCallContext *funcctx;
2519 Datum result;
2520
2521 if (SRF_IS_FIRSTCALL())
2522 {
2523 TSVectorStat *stat;
2524 text *txt = PG_GETARG_TEXT_PP(0);
2525 text *ws = PG_GETARG_TEXT_PP(1);
2526
2527 funcctx = SRF_FIRSTCALL_INIT();
2528 SPI_connect();
2529 stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws);
2530 PG_FREE_IF_COPY(txt, 0);
2531 PG_FREE_IF_COPY(ws, 1);
2532 ts_setup_firstcall(fcinfo, funcctx, stat);
2533 SPI_finish();
2534 }
2535
2536 funcctx = SRF_PERCALL_SETUP();
2537 if ((result = ts_process_call(funcctx)) != (Datum) 0)
2538 SRF_RETURN_NEXT(funcctx, result);
2539 SRF_RETURN_DONE(funcctx);
2540 }
2541
2542
2543 /*
2544 * Triggers for automatic update of a tsvector column from text column(s)
2545 *
2546 * Trigger arguments are either
2547 * name of tsvector col, name of tsconfig to use, name(s) of text col(s)
2548 * name of tsvector col, name of regconfig col, name(s) of text col(s)
2549 * ie, tsconfig can either be specified by name, or indirectly as the
2550 * contents of a regconfig field in the row. If the name is used, it must
2551 * be explicitly schema-qualified.
2552 */
2553 Datum
tsvector_update_trigger_byid(PG_FUNCTION_ARGS)2554 tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
2555 {
2556 return tsvector_update_trigger(fcinfo, false);
2557 }
2558
2559 Datum
tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)2560 tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
2561 {
2562 return tsvector_update_trigger(fcinfo, true);
2563 }
2564
2565 static Datum
tsvector_update_trigger(PG_FUNCTION_ARGS,bool config_column)2566 tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
2567 {
2568 TriggerData *trigdata;
2569 Trigger *trigger;
2570 Relation rel;
2571 HeapTuple rettuple = NULL;
2572 int tsvector_attr_num,
2573 i;
2574 ParsedText prs;
2575 Datum datum;
2576 bool isnull;
2577 text *txt;
2578 Oid cfgId;
2579 bool update_needed;
2580
2581 /* Check call context */
2582 if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
2583 elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
2584
2585 trigdata = (TriggerData *) fcinfo->context;
2586 if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event))
2587 elog(ERROR, "tsvector_update_trigger: must be fired for row");
2588 if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event))
2589 elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
2590
2591 if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
2592 {
2593 rettuple = trigdata->tg_trigtuple;
2594 update_needed = true;
2595 }
2596 else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
2597 {
2598 rettuple = trigdata->tg_newtuple;
2599 update_needed = false; /* computed below */
2600 }
2601 else
2602 elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
2603
2604 trigger = trigdata->tg_trigger;
2605 rel = trigdata->tg_relation;
2606
2607 if (trigger->tgnargs < 3)
2608 elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
2609
2610 /* Find the target tsvector column */
2611 tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
2612 if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
2613 ereport(ERROR,
2614 (errcode(ERRCODE_UNDEFINED_COLUMN),
2615 errmsg("tsvector column \"%s\" does not exist",
2616 trigger->tgargs[0])));
2617 /* This will effectively reject system columns, so no separate test: */
2618 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num),
2619 TSVECTOROID))
2620 ereport(ERROR,
2621 (errcode(ERRCODE_DATATYPE_MISMATCH),
2622 errmsg("column \"%s\" is not of tsvector type",
2623 trigger->tgargs[0])));
2624
2625 /* Find the configuration to use */
2626 if (config_column)
2627 {
2628 int config_attr_num;
2629
2630 config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
2631 if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
2632 ereport(ERROR,
2633 (errcode(ERRCODE_UNDEFINED_COLUMN),
2634 errmsg("configuration column \"%s\" does not exist",
2635 trigger->tgargs[1])));
2636 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num),
2637 REGCONFIGOID))
2638 ereport(ERROR,
2639 (errcode(ERRCODE_DATATYPE_MISMATCH),
2640 errmsg("column \"%s\" is not of regconfig type",
2641 trigger->tgargs[1])));
2642
2643 datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
2644 if (isnull)
2645 ereport(ERROR,
2646 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
2647 errmsg("configuration column \"%s\" must not be null",
2648 trigger->tgargs[1])));
2649 cfgId = DatumGetObjectId(datum);
2650 }
2651 else
2652 {
2653 List *names;
2654
2655 names = stringToQualifiedNameList(trigger->tgargs[1]);
2656 /* require a schema so that results are not search path dependent */
2657 if (list_length(names) < 2)
2658 ereport(ERROR,
2659 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2660 errmsg("text search configuration name \"%s\" must be schema-qualified",
2661 trigger->tgargs[1])));
2662 cfgId = get_ts_config_oid(names, false);
2663 }
2664
2665 /* initialize parse state */
2666 prs.lenwords = 32;
2667 prs.curwords = 0;
2668 prs.pos = 0;
2669 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
2670
2671 /* find all words in indexable column(s) */
2672 for (i = 2; i < trigger->tgnargs; i++)
2673 {
2674 int numattr;
2675
2676 numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
2677 if (numattr == SPI_ERROR_NOATTRIBUTE)
2678 ereport(ERROR,
2679 (errcode(ERRCODE_UNDEFINED_COLUMN),
2680 errmsg("column \"%s\" does not exist",
2681 trigger->tgargs[i])));
2682 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID))
2683 ereport(ERROR,
2684 (errcode(ERRCODE_DATATYPE_MISMATCH),
2685 errmsg("column \"%s\" is not of a character type",
2686 trigger->tgargs[i])));
2687
2688 if (bms_is_member(numattr - FirstLowInvalidHeapAttributeNumber, trigdata->tg_updatedcols))
2689 update_needed = true;
2690
2691 datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
2692 if (isnull)
2693 continue;
2694
2695 txt = DatumGetTextPP(datum);
2696
2697 parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
2698
2699 if (txt != (text *) DatumGetPointer(datum))
2700 pfree(txt);
2701 }
2702
2703 if (update_needed)
2704 {
2705 /* make tsvector value */
2706 datum = TSVectorGetDatum(make_tsvector(&prs));
2707 isnull = false;
2708
2709 /* and insert it into tuple */
2710 rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
2711 1, &tsvector_attr_num,
2712 &datum, &isnull);
2713
2714 pfree(DatumGetPointer(datum));
2715 }
2716
2717 return PointerGetDatum(rettuple);
2718 }
2719