1 /*-------------------------------------------------------------------------
2 *
3 * tsvector_op.c
4 * operations over tsvector
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_op.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include <limits.h>
17
18 #include "access/htup_details.h"
19 #include "catalog/namespace.h"
20 #include "catalog/pg_type.h"
21 #include "commands/trigger.h"
22 #include "executor/spi.h"
23 #include "funcapi.h"
24 #include "mb/pg_wchar.h"
25 #include "miscadmin.h"
26 #include "parser/parse_coerce.h"
27 #include "tsearch/ts_utils.h"
28 #include "utils/builtins.h"
29 #include "utils/lsyscache.h"
30 #include "utils/regproc.h"
31 #include "utils/rel.h"
32
33
34 typedef struct
35 {
36 WordEntry *arrb;
37 WordEntry *arre;
38 char *values;
39 char *operand;
40 } CHKVAL;
41
42
43 typedef struct StatEntry
44 {
45 uint32 ndoc; /* zero indicates that we were already here
46 * while walking through the tree */
47 uint32 nentry;
48 struct StatEntry *left;
49 struct StatEntry *right;
50 uint32 lenlexeme;
51 char lexeme[FLEXIBLE_ARRAY_MEMBER];
52 } StatEntry;
53
54 #define STATENTRYHDRSZ (offsetof(StatEntry, lexeme))
55
56 typedef struct
57 {
58 int32 weight;
59
60 uint32 maxdepth;
61
62 StatEntry **stack;
63 uint32 stackpos;
64
65 StatEntry *root;
66 } TSVectorStat;
67
68 #define STATHDRSIZE (offsetof(TSVectorStat, data))
69
70 /* TS_execute requires ternary logic to handle NOT with phrase matches */
71 typedef enum
72 {
73 TS_NO, /* definitely no match */
74 TS_YES, /* definitely does match */
75 TS_MAYBE /* can't verify match for lack of pos data */
76 } TSTernaryValue;
77
78
79 static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg,
80 uint32 flags,
81 TSExecuteCallback chkcond);
82 static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
83 static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
84
85
86 /*
87 * Order: haspos, len, word, for all positions (pos, weight)
88 */
89 static int
silly_cmp_tsvector(const TSVector a,const TSVector b)90 silly_cmp_tsvector(const TSVector a, const TSVector b)
91 {
92 if (VARSIZE(a) < VARSIZE(b))
93 return -1;
94 else if (VARSIZE(a) > VARSIZE(b))
95 return 1;
96 else if (a->size < b->size)
97 return -1;
98 else if (a->size > b->size)
99 return 1;
100 else
101 {
102 WordEntry *aptr = ARRPTR(a);
103 WordEntry *bptr = ARRPTR(b);
104 int i = 0;
105 int res;
106
107
108 for (i = 0; i < a->size; i++)
109 {
110 if (aptr->haspos != bptr->haspos)
111 {
112 return (aptr->haspos > bptr->haspos) ? -1 : 1;
113 }
114 else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
115 {
116 return res;
117 }
118 else if (aptr->haspos)
119 {
120 WordEntryPos *ap = POSDATAPTR(a, aptr);
121 WordEntryPos *bp = POSDATAPTR(b, bptr);
122 int j;
123
124 if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
125 return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
126
127 for (j = 0; j < POSDATALEN(a, aptr); j++)
128 {
129 if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
130 {
131 return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
132 }
133 else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
134 {
135 return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
136 }
137 ap++, bp++;
138 }
139 }
140
141 aptr++;
142 bptr++;
143 }
144 }
145
146 return 0;
147 }
148
149 #define TSVECTORCMPFUNC( type, action, ret ) \
150 Datum \
151 tsvector_##type(PG_FUNCTION_ARGS) \
152 { \
153 TSVector a = PG_GETARG_TSVECTOR(0); \
154 TSVector b = PG_GETARG_TSVECTOR(1); \
155 int res = silly_cmp_tsvector(a, b); \
156 PG_FREE_IF_COPY(a,0); \
157 PG_FREE_IF_COPY(b,1); \
158 PG_RETURN_##ret( res action 0 ); \
159 } \
160 /* keep compiler quiet - no extra ; */ \
161 extern int no_such_variable
162
163 TSVECTORCMPFUNC(lt, <, BOOL);
164 TSVECTORCMPFUNC(le, <=, BOOL);
165 TSVECTORCMPFUNC(eq, ==, BOOL);
166 TSVECTORCMPFUNC(ge, >=, BOOL);
167 TSVECTORCMPFUNC(gt, >, BOOL);
168 TSVECTORCMPFUNC(ne, !=, BOOL);
169 TSVECTORCMPFUNC(cmp, +, INT32);
170
171 Datum
tsvector_strip(PG_FUNCTION_ARGS)172 tsvector_strip(PG_FUNCTION_ARGS)
173 {
174 TSVector in = PG_GETARG_TSVECTOR(0);
175 TSVector out;
176 int i,
177 len = 0;
178 WordEntry *arrin = ARRPTR(in),
179 *arrout;
180 char *cur;
181
182 for (i = 0; i < in->size; i++)
183 len += arrin[i].len;
184
185 len = CALCDATASIZE(in->size, len);
186 out = (TSVector) palloc0(len);
187 SET_VARSIZE(out, len);
188 out->size = in->size;
189 arrout = ARRPTR(out);
190 cur = STRPTR(out);
191 for (i = 0; i < in->size; i++)
192 {
193 memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
194 arrout[i].haspos = 0;
195 arrout[i].len = arrin[i].len;
196 arrout[i].pos = cur - STRPTR(out);
197 cur += arrout[i].len;
198 }
199
200 PG_FREE_IF_COPY(in, 0);
201 PG_RETURN_POINTER(out);
202 }
203
204 Datum
tsvector_length(PG_FUNCTION_ARGS)205 tsvector_length(PG_FUNCTION_ARGS)
206 {
207 TSVector in = PG_GETARG_TSVECTOR(0);
208 int32 ret = in->size;
209
210 PG_FREE_IF_COPY(in, 0);
211 PG_RETURN_INT32(ret);
212 }
213
214 Datum
tsvector_setweight(PG_FUNCTION_ARGS)215 tsvector_setweight(PG_FUNCTION_ARGS)
216 {
217 TSVector in = PG_GETARG_TSVECTOR(0);
218 char cw = PG_GETARG_CHAR(1);
219 TSVector out;
220 int i,
221 j;
222 WordEntry *entry;
223 WordEntryPos *p;
224 int w = 0;
225
226 switch (cw)
227 {
228 case 'A':
229 case 'a':
230 w = 3;
231 break;
232 case 'B':
233 case 'b':
234 w = 2;
235 break;
236 case 'C':
237 case 'c':
238 w = 1;
239 break;
240 case 'D':
241 case 'd':
242 w = 0;
243 break;
244 default:
245 /* internal error */
246 elog(ERROR, "unrecognized weight: %d", cw);
247 }
248
249 out = (TSVector) palloc(VARSIZE(in));
250 memcpy(out, in, VARSIZE(in));
251 entry = ARRPTR(out);
252 i = out->size;
253 while (i--)
254 {
255 if ((j = POSDATALEN(out, entry)) != 0)
256 {
257 p = POSDATAPTR(out, entry);
258 while (j--)
259 {
260 WEP_SETWEIGHT(*p, w);
261 p++;
262 }
263 }
264 entry++;
265 }
266
267 PG_FREE_IF_COPY(in, 0);
268 PG_RETURN_POINTER(out);
269 }
270
271 /*
272 * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
273 *
274 * Assign weight w to elements of tsin that are listed in lexemes.
275 */
276 Datum
tsvector_setweight_by_filter(PG_FUNCTION_ARGS)277 tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
278 {
279 TSVector tsin = PG_GETARG_TSVECTOR(0);
280 char char_weight = PG_GETARG_CHAR(1);
281 ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
282
283 TSVector tsout;
284 int i,
285 j,
286 nlexemes,
287 weight;
288 WordEntry *entry;
289 Datum *dlexemes;
290 bool *nulls;
291
292 switch (char_weight)
293 {
294 case 'A':
295 case 'a':
296 weight = 3;
297 break;
298 case 'B':
299 case 'b':
300 weight = 2;
301 break;
302 case 'C':
303 case 'c':
304 weight = 1;
305 break;
306 case 'D':
307 case 'd':
308 weight = 0;
309 break;
310 default:
311 /* internal error */
312 elog(ERROR, "unrecognized weight: %c", char_weight);
313 }
314
315 tsout = (TSVector) palloc(VARSIZE(tsin));
316 memcpy(tsout, tsin, VARSIZE(tsin));
317 entry = ARRPTR(tsout);
318
319 deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
320 &dlexemes, &nulls, &nlexemes);
321
322 /*
323 * Assuming that lexemes array is significantly shorter than tsvector we
324 * can iterate through lexemes performing binary search of each lexeme
325 * from lexemes in tsvector.
326 */
327 for (i = 0; i < nlexemes; i++)
328 {
329 char *lex;
330 int lex_len,
331 lex_pos;
332
333 if (nulls[i])
334 ereport(ERROR,
335 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
336 errmsg("lexeme array may not contain nulls")));
337
338 lex = VARDATA(dlexemes[i]);
339 lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
340 lex_pos = tsvector_bsearch(tsout, lex, lex_len);
341
342 if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
343 {
344 WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
345
346 while (j--)
347 {
348 WEP_SETWEIGHT(*p, weight);
349 p++;
350 }
351 }
352 }
353
354 PG_FREE_IF_COPY(tsin, 0);
355 PG_FREE_IF_COPY(lexemes, 2);
356
357 PG_RETURN_POINTER(tsout);
358 }
359
360 #define compareEntry(pa, a, pb, b) \
361 tsCompareString((pa) + (a)->pos, (a)->len, \
362 (pb) + (b)->pos, (b)->len, \
363 false)
364
365 /*
366 * Add positions from src to dest after offsetting them by maxpos.
367 * Return the number added (might be less than expected due to overflow)
368 */
369 static int32
add_pos(TSVector src,WordEntry * srcptr,TSVector dest,WordEntry * destptr,int32 maxpos)370 add_pos(TSVector src, WordEntry *srcptr,
371 TSVector dest, WordEntry *destptr,
372 int32 maxpos)
373 {
374 uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
375 int i;
376 uint16 slen = POSDATALEN(src, srcptr),
377 startlen;
378 WordEntryPos *spos = POSDATAPTR(src, srcptr),
379 *dpos = POSDATAPTR(dest, destptr);
380
381 if (!destptr->haspos)
382 *clen = 0;
383
384 startlen = *clen;
385 for (i = 0;
386 i < slen && *clen < MAXNUMPOS &&
387 (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
388 i++)
389 {
390 WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
391 WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
392 (*clen)++;
393 }
394
395 if (*clen != startlen)
396 destptr->haspos = 1;
397 return *clen - startlen;
398 }
399
400 /*
401 * Perform binary search of given lexeme in TSVector.
402 * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
403 * found.
404 */
405 static int
tsvector_bsearch(const TSVector tsv,char * lexeme,int lexeme_len)406 tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
407 {
408 WordEntry *arrin = ARRPTR(tsv);
409 int StopLow = 0,
410 StopHigh = tsv->size,
411 StopMiddle,
412 cmp;
413
414 while (StopLow < StopHigh)
415 {
416 StopMiddle = (StopLow + StopHigh) / 2;
417
418 cmp = tsCompareString(lexeme, lexeme_len,
419 STRPTR(tsv) + arrin[StopMiddle].pos,
420 arrin[StopMiddle].len,
421 false);
422
423 if (cmp < 0)
424 StopHigh = StopMiddle;
425 else if (cmp > 0)
426 StopLow = StopMiddle + 1;
427 else /* found it */
428 return StopMiddle;
429 }
430
431 return -1;
432 }
433
434 /*
435 * qsort comparator functions
436 */
437
438 static int
compare_int(const void * va,const void * vb)439 compare_int(const void *va, const void *vb)
440 {
441 int a = *((const int *) va);
442 int b = *((const int *) vb);
443
444 if (a == b)
445 return 0;
446 return (a > b) ? 1 : -1;
447 }
448
449 static int
compare_text_lexemes(const void * va,const void * vb)450 compare_text_lexemes(const void *va, const void *vb)
451 {
452 Datum a = *((const Datum *) va);
453 Datum b = *((const Datum *) vb);
454 char *alex = VARDATA_ANY(a);
455 int alex_len = VARSIZE_ANY_EXHDR(a);
456 char *blex = VARDATA_ANY(b);
457 int blex_len = VARSIZE_ANY_EXHDR(b);
458
459 return tsCompareString(alex, alex_len, blex, blex_len, false);
460 }
461
462 /*
463 * Internal routine to delete lexemes from TSVector by array of offsets.
464 *
465 * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
466 * int indices_count -- size of that array
467 *
468 * Returns new TSVector without given lexemes along with their positions
469 * and weights.
470 */
471 static TSVector
tsvector_delete_by_indices(TSVector tsv,int * indices_to_delete,int indices_count)472 tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
473 int indices_count)
474 {
475 TSVector tsout;
476 WordEntry *arrin = ARRPTR(tsv),
477 *arrout;
478 char *data = STRPTR(tsv),
479 *dataout;
480 int i, /* index in arrin */
481 j, /* index in arrout */
482 k, /* index in indices_to_delete */
483 curoff; /* index in dataout area */
484
485 /*
486 * Sort the filter array to simplify membership checks below. Also, get
487 * rid of any duplicate entries, so that we can assume that indices_count
488 * is exactly equal to the number of lexemes that will be removed.
489 */
490 if (indices_count > 1)
491 {
492 int kp;
493
494 qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
495 kp = 0;
496 for (k = 1; k < indices_count; k++)
497 {
498 if (indices_to_delete[k] != indices_to_delete[kp])
499 indices_to_delete[++kp] = indices_to_delete[k];
500 }
501 indices_count = ++kp;
502 }
503
504 /*
505 * Here we overestimate tsout size, since we don't know how much space is
506 * used by the deleted lexeme(s). We will set exact size below.
507 */
508 tsout = (TSVector) palloc0(VARSIZE(tsv));
509
510 /* This count must be correct because STRPTR(tsout) relies on it. */
511 tsout->size = tsv->size - indices_count;
512
513 /*
514 * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
515 */
516 arrout = ARRPTR(tsout);
517 dataout = STRPTR(tsout);
518 curoff = 0;
519 for (i = j = k = 0; i < tsv->size; i++)
520 {
521 /*
522 * If current i is present in indices_to_delete, skip this lexeme.
523 * Since indices_to_delete is already sorted, we only need to check
524 * the current (k'th) entry.
525 */
526 if (k < indices_count && i == indices_to_delete[k])
527 {
528 k++;
529 continue;
530 }
531
532 /* Copy lexeme and its positions and weights */
533 memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
534 arrout[j].haspos = arrin[i].haspos;
535 arrout[j].len = arrin[i].len;
536 arrout[j].pos = curoff;
537 curoff += arrin[i].len;
538 if (arrin[i].haspos)
539 {
540 int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
541 + sizeof(uint16);
542
543 curoff = SHORTALIGN(curoff);
544 memcpy(dataout + curoff,
545 STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
546 len);
547 curoff += len;
548 }
549
550 j++;
551 }
552
553 /*
554 * k should now be exactly equal to indices_count. If it isn't then the
555 * caller provided us with indices outside of [0, tsv->size) range and
556 * estimation of tsout's size is wrong.
557 */
558 Assert(k == indices_count);
559
560 SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
561 return tsout;
562 }
563
564 /*
565 * Delete given lexeme from tsvector.
566 * Implementation of user-level ts_delete(tsvector, text).
567 */
568 Datum
tsvector_delete_str(PG_FUNCTION_ARGS)569 tsvector_delete_str(PG_FUNCTION_ARGS)
570 {
571 TSVector tsin = PG_GETARG_TSVECTOR(0),
572 tsout;
573 text *tlexeme = PG_GETARG_TEXT_PP(1);
574 char *lexeme = VARDATA_ANY(tlexeme);
575 int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
576 skip_index;
577
578 if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
579 PG_RETURN_POINTER(tsin);
580
581 tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
582
583 PG_FREE_IF_COPY(tsin, 0);
584 PG_FREE_IF_COPY(tlexeme, 1);
585 PG_RETURN_POINTER(tsout);
586 }
587
588 /*
589 * Delete given array of lexemes from tsvector.
590 * Implementation of user-level ts_delete(tsvector, text[]).
591 */
592 Datum
tsvector_delete_arr(PG_FUNCTION_ARGS)593 tsvector_delete_arr(PG_FUNCTION_ARGS)
594 {
595 TSVector tsin = PG_GETARG_TSVECTOR(0),
596 tsout;
597 ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
598 int i,
599 nlex,
600 skip_count,
601 *skip_indices;
602 Datum *dlexemes;
603 bool *nulls;
604
605 deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
606 &dlexemes, &nulls, &nlex);
607
608 /*
609 * In typical use case array of lexemes to delete is relatively small. So
610 * here we optimize things for that scenario: iterate through lexarr
611 * performing binary search of each lexeme from lexarr in tsvector.
612 */
613 skip_indices = palloc0(nlex * sizeof(int));
614 for (i = skip_count = 0; i < nlex; i++)
615 {
616 char *lex;
617 int lex_len,
618 lex_pos;
619
620 if (nulls[i])
621 ereport(ERROR,
622 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
623 errmsg("lexeme array may not contain nulls")));
624
625 lex = VARDATA(dlexemes[i]);
626 lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
627 lex_pos = tsvector_bsearch(tsin, lex, lex_len);
628
629 if (lex_pos >= 0)
630 skip_indices[skip_count++] = lex_pos;
631 }
632
633 tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
634
635 pfree(skip_indices);
636 PG_FREE_IF_COPY(tsin, 0);
637 PG_FREE_IF_COPY(lexemes, 1);
638
639 PG_RETURN_POINTER(tsout);
640 }
641
642 /*
643 * Expand tsvector as table with following columns:
644 * lexeme: lexeme text
645 * positions: integer array of lexeme positions
646 * weights: char array of weights corresponding to positions
647 */
648 Datum
tsvector_unnest(PG_FUNCTION_ARGS)649 tsvector_unnest(PG_FUNCTION_ARGS)
650 {
651 FuncCallContext *funcctx;
652 TSVector tsin;
653
654 if (SRF_IS_FIRSTCALL())
655 {
656 MemoryContext oldcontext;
657 TupleDesc tupdesc;
658
659 funcctx = SRF_FIRSTCALL_INIT();
660 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
661
662 tupdesc = CreateTemplateTupleDesc(3, false);
663 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
664 TEXTOID, -1, 0);
665 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
666 INT2ARRAYOID, -1, 0);
667 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
668 TEXTARRAYOID, -1, 0);
669 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
670
671 funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
672
673 MemoryContextSwitchTo(oldcontext);
674 }
675
676 funcctx = SRF_PERCALL_SETUP();
677 tsin = (TSVector) funcctx->user_fctx;
678
679 if (funcctx->call_cntr < tsin->size)
680 {
681 WordEntry *arrin = ARRPTR(tsin);
682 char *data = STRPTR(tsin);
683 HeapTuple tuple;
684 int j,
685 i = funcctx->call_cntr;
686 bool nulls[] = {false, false, false};
687 Datum values[3];
688
689 values[0] = PointerGetDatum(
690 cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
691 );
692
693 if (arrin[i].haspos)
694 {
695 WordEntryPosVector *posv;
696 Datum *positions;
697 Datum *weights;
698 char weight;
699
700 /*
701 * Internally tsvector stores position and weight in the same
702 * uint16 (2 bits for weight, 14 for position). Here we extract
703 * that in two separate arrays.
704 */
705 posv = _POSVECPTR(tsin, arrin + i);
706 positions = palloc(posv->npos * sizeof(Datum));
707 weights = palloc(posv->npos * sizeof(Datum));
708 for (j = 0; j < posv->npos; j++)
709 {
710 positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
711 weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
712 weights[j] = PointerGetDatum(
713 cstring_to_text_with_len(&weight, 1)
714 );
715 }
716
717 values[1] = PointerGetDatum(
718 construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
719 values[2] = PointerGetDatum(
720 construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
721 }
722 else
723 {
724 nulls[1] = nulls[2] = true;
725 }
726
727 tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
728 SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
729 }
730 else
731 {
732 pfree(tsin);
733 SRF_RETURN_DONE(funcctx);
734 }
735 }
736
737 /*
738 * Convert tsvector to array of lexemes.
739 */
740 Datum
tsvector_to_array(PG_FUNCTION_ARGS)741 tsvector_to_array(PG_FUNCTION_ARGS)
742 {
743 TSVector tsin = PG_GETARG_TSVECTOR(0);
744 WordEntry *arrin = ARRPTR(tsin);
745 Datum *elements;
746 int i;
747 ArrayType *array;
748
749 elements = palloc(tsin->size * sizeof(Datum));
750
751 for (i = 0; i < tsin->size; i++)
752 {
753 elements[i] = PointerGetDatum(
754 cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
755 );
756 }
757
758 array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
759
760 pfree(elements);
761 PG_FREE_IF_COPY(tsin, 0);
762 PG_RETURN_POINTER(array);
763 }
764
765 /*
766 * Build tsvector from array of lexemes.
767 */
768 Datum
array_to_tsvector(PG_FUNCTION_ARGS)769 array_to_tsvector(PG_FUNCTION_ARGS)
770 {
771 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
772 TSVector tsout;
773 Datum *dlexemes;
774 WordEntry *arrout;
775 bool *nulls;
776 int nitems,
777 i,
778 j,
779 tslen,
780 datalen = 0;
781 char *cur;
782
783 deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
784
785 /* Reject nulls (maybe we should just ignore them, instead?) */
786 for (i = 0; i < nitems; i++)
787 {
788 if (nulls[i])
789 ereport(ERROR,
790 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
791 errmsg("lexeme array may not contain nulls")));
792 }
793
794 /* Sort and de-dup, because this is required for a valid tsvector. */
795 if (nitems > 1)
796 {
797 qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
798 j = 0;
799 for (i = 1; i < nitems; i++)
800 {
801 if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
802 dlexemes[++j] = dlexemes[i];
803 }
804 nitems = ++j;
805 }
806
807 /* Calculate space needed for surviving lexemes. */
808 for (i = 0; i < nitems; i++)
809 datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
810 tslen = CALCDATASIZE(nitems, datalen);
811
812 /* Allocate and fill tsvector. */
813 tsout = (TSVector) palloc0(tslen);
814 SET_VARSIZE(tsout, tslen);
815 tsout->size = nitems;
816
817 arrout = ARRPTR(tsout);
818 cur = STRPTR(tsout);
819 for (i = 0; i < nitems; i++)
820 {
821 char *lex = VARDATA(dlexemes[i]);
822 int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
823
824 memcpy(cur, lex, lex_len);
825 arrout[i].haspos = 0;
826 arrout[i].len = lex_len;
827 arrout[i].pos = cur - STRPTR(tsout);
828 cur += lex_len;
829 }
830
831 PG_FREE_IF_COPY(v, 0);
832 PG_RETURN_POINTER(tsout);
833 }
834
835 /*
836 * ts_filter(): keep only lexemes with given weights in tsvector.
837 */
838 Datum
tsvector_filter(PG_FUNCTION_ARGS)839 tsvector_filter(PG_FUNCTION_ARGS)
840 {
841 TSVector tsin = PG_GETARG_TSVECTOR(0),
842 tsout;
843 ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
844 WordEntry *arrin = ARRPTR(tsin),
845 *arrout;
846 char *datain = STRPTR(tsin),
847 *dataout;
848 Datum *dweights;
849 bool *nulls;
850 int nweights;
851 int i,
852 j;
853 int cur_pos = 0;
854 char mask = 0;
855
856 deconstruct_array(weights, CHAROID, 1, true, 'c',
857 &dweights, &nulls, &nweights);
858
859 for (i = 0; i < nweights; i++)
860 {
861 char char_weight;
862
863 if (nulls[i])
864 ereport(ERROR,
865 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
866 errmsg("weight array may not contain nulls")));
867
868 char_weight = DatumGetChar(dweights[i]);
869 switch (char_weight)
870 {
871 case 'A':
872 case 'a':
873 mask = mask | 8;
874 break;
875 case 'B':
876 case 'b':
877 mask = mask | 4;
878 break;
879 case 'C':
880 case 'c':
881 mask = mask | 2;
882 break;
883 case 'D':
884 case 'd':
885 mask = mask | 1;
886 break;
887 default:
888 ereport(ERROR,
889 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
890 errmsg("unrecognized weight: \"%c\"", char_weight)));
891 }
892 }
893
894 tsout = (TSVector) palloc0(VARSIZE(tsin));
895 tsout->size = tsin->size;
896 arrout = ARRPTR(tsout);
897 dataout = STRPTR(tsout);
898
899 for (i = j = 0; i < tsin->size; i++)
900 {
901 WordEntryPosVector *posvin,
902 *posvout;
903 int npos = 0;
904 int k;
905
906 if (!arrin[i].haspos)
907 continue;
908
909 posvin = _POSVECPTR(tsin, arrin + i);
910 posvout = (WordEntryPosVector *)
911 (dataout + SHORTALIGN(cur_pos + arrin[i].len));
912
913 for (k = 0; k < posvin->npos; k++)
914 {
915 if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
916 posvout->pos[npos++] = posvin->pos[k];
917 }
918
919 /* if no satisfactory positions found, skip lexeme */
920 if (!npos)
921 continue;
922
923 arrout[j].haspos = true;
924 arrout[j].len = arrin[i].len;
925 arrout[j].pos = cur_pos;
926
927 memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
928 posvout->npos = npos;
929 cur_pos += SHORTALIGN(arrin[i].len);
930 cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
931 sizeof(uint16);
932 j++;
933 }
934
935 tsout->size = j;
936 if (dataout != STRPTR(tsout))
937 memmove(STRPTR(tsout), dataout, cur_pos);
938
939 SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
940
941 PG_FREE_IF_COPY(tsin, 0);
942 PG_RETURN_POINTER(tsout);
943 }
944
945 Datum
tsvector_concat(PG_FUNCTION_ARGS)946 tsvector_concat(PG_FUNCTION_ARGS)
947 {
948 TSVector in1 = PG_GETARG_TSVECTOR(0);
949 TSVector in2 = PG_GETARG_TSVECTOR(1);
950 TSVector out;
951 WordEntry *ptr;
952 WordEntry *ptr1,
953 *ptr2;
954 WordEntryPos *p;
955 int maxpos = 0,
956 i,
957 j,
958 i1,
959 i2,
960 dataoff,
961 output_bytes,
962 output_size;
963 char *data,
964 *data1,
965 *data2;
966
967 /* Get max position in in1; we'll need this to offset in2's positions */
968 ptr = ARRPTR(in1);
969 i = in1->size;
970 while (i--)
971 {
972 if ((j = POSDATALEN(in1, ptr)) != 0)
973 {
974 p = POSDATAPTR(in1, ptr);
975 while (j--)
976 {
977 if (WEP_GETPOS(*p) > maxpos)
978 maxpos = WEP_GETPOS(*p);
979 p++;
980 }
981 }
982 ptr++;
983 }
984
985 ptr1 = ARRPTR(in1);
986 ptr2 = ARRPTR(in2);
987 data1 = STRPTR(in1);
988 data2 = STRPTR(in2);
989 i1 = in1->size;
990 i2 = in2->size;
991
992 /*
993 * Conservative estimate of space needed. We might need all the data in
994 * both inputs, and conceivably add a pad byte before position data for
995 * each item where there was none before.
996 */
997 output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
998
999 out = (TSVector) palloc0(output_bytes);
1000 SET_VARSIZE(out, output_bytes);
1001
1002 /*
1003 * We must make out->size valid so that STRPTR(out) is sensible. We'll
1004 * collapse out any unused space at the end.
1005 */
1006 out->size = in1->size + in2->size;
1007
1008 ptr = ARRPTR(out);
1009 data = STRPTR(out);
1010 dataoff = 0;
1011 while (i1 && i2)
1012 {
1013 int cmp = compareEntry(data1, ptr1, data2, ptr2);
1014
1015 if (cmp < 0)
1016 { /* in1 first */
1017 ptr->haspos = ptr1->haspos;
1018 ptr->len = ptr1->len;
1019 memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
1020 ptr->pos = dataoff;
1021 dataoff += ptr1->len;
1022 if (ptr->haspos)
1023 {
1024 dataoff = SHORTALIGN(dataoff);
1025 memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
1026 dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
1027 }
1028
1029 ptr++;
1030 ptr1++;
1031 i1--;
1032 }
1033 else if (cmp > 0)
1034 { /* in2 first */
1035 ptr->haspos = ptr2->haspos;
1036 ptr->len = ptr2->len;
1037 memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
1038 ptr->pos = dataoff;
1039 dataoff += ptr2->len;
1040 if (ptr->haspos)
1041 {
1042 int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
1043
1044 if (addlen == 0)
1045 ptr->haspos = 0;
1046 else
1047 {
1048 dataoff = SHORTALIGN(dataoff);
1049 dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
1050 }
1051 }
1052
1053 ptr++;
1054 ptr2++;
1055 i2--;
1056 }
1057 else
1058 {
1059 ptr->haspos = ptr1->haspos | ptr2->haspos;
1060 ptr->len = ptr1->len;
1061 memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
1062 ptr->pos = dataoff;
1063 dataoff += ptr1->len;
1064 if (ptr->haspos)
1065 {
1066 if (ptr1->haspos)
1067 {
1068 dataoff = SHORTALIGN(dataoff);
1069 memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
1070 dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
1071 if (ptr2->haspos)
1072 dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
1073 }
1074 else /* must have ptr2->haspos */
1075 {
1076 int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
1077
1078 if (addlen == 0)
1079 ptr->haspos = 0;
1080 else
1081 {
1082 dataoff = SHORTALIGN(dataoff);
1083 dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
1084 }
1085 }
1086 }
1087
1088 ptr++;
1089 ptr1++;
1090 ptr2++;
1091 i1--;
1092 i2--;
1093 }
1094 }
1095
1096 while (i1)
1097 {
1098 ptr->haspos = ptr1->haspos;
1099 ptr->len = ptr1->len;
1100 memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
1101 ptr->pos = dataoff;
1102 dataoff += ptr1->len;
1103 if (ptr->haspos)
1104 {
1105 dataoff = SHORTALIGN(dataoff);
1106 memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
1107 dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
1108 }
1109
1110 ptr++;
1111 ptr1++;
1112 i1--;
1113 }
1114
1115 while (i2)
1116 {
1117 ptr->haspos = ptr2->haspos;
1118 ptr->len = ptr2->len;
1119 memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
1120 ptr->pos = dataoff;
1121 dataoff += ptr2->len;
1122 if (ptr->haspos)
1123 {
1124 int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
1125
1126 if (addlen == 0)
1127 ptr->haspos = 0;
1128 else
1129 {
1130 dataoff = SHORTALIGN(dataoff);
1131 dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
1132 }
1133 }
1134
1135 ptr++;
1136 ptr2++;
1137 i2--;
1138 }
1139
1140 /*
1141 * Instead of checking each offset individually, we check for overflow of
1142 * pos fields once at the end.
1143 */
1144 if (dataoff > MAXSTRPOS)
1145 ereport(ERROR,
1146 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1147 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));
1148
1149 /*
1150 * Adjust sizes (asserting that we didn't overrun the original estimates)
1151 * and collapse out any unused array entries.
1152 */
1153 output_size = ptr - ARRPTR(out);
1154 Assert(output_size <= out->size);
1155 out->size = output_size;
1156 if (data != STRPTR(out))
1157 memmove(STRPTR(out), data, dataoff);
1158 output_bytes = CALCDATASIZE(out->size, dataoff);
1159 Assert(output_bytes <= VARSIZE(out));
1160 SET_VARSIZE(out, output_bytes);
1161
1162 PG_FREE_IF_COPY(in1, 0);
1163 PG_FREE_IF_COPY(in2, 1);
1164 PG_RETURN_POINTER(out);
1165 }
1166
1167 /*
1168 * Compare two strings by tsvector rules.
1169 *
1170 * if isPrefix = true then it returns zero value iff b has prefix a
1171 */
1172 int32
tsCompareString(char * a,int lena,char * b,int lenb,bool prefix)1173 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
1174 {
1175 int cmp;
1176
1177 if (lena == 0)
1178 {
1179 if (prefix)
1180 cmp = 0; /* empty string is prefix of anything */
1181 else
1182 cmp = (lenb > 0) ? -1 : 0;
1183 }
1184 else if (lenb == 0)
1185 {
1186 cmp = (lena > 0) ? 1 : 0;
1187 }
1188 else
1189 {
1190 cmp = memcmp(a, b, Min(lena, lenb));
1191
1192 if (prefix)
1193 {
1194 if (cmp == 0 && lena > lenb)
1195 cmp = 1; /* a is longer, so not a prefix of b */
1196 }
1197 else if (cmp == 0 && lena != lenb)
1198 {
1199 cmp = (lena < lenb) ? -1 : 1;
1200 }
1201 }
1202
1203 return cmp;
1204 }
1205
1206 /*
1207 * Check weight info or/and fill 'data' with the required positions
1208 */
1209 static bool
checkclass_str(CHKVAL * chkval,WordEntry * entry,QueryOperand * val,ExecPhraseData * data)1210 checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
1211 ExecPhraseData *data)
1212 {
1213 bool result = false;
1214
1215 if (entry->haspos && (val->weight || data))
1216 {
1217 WordEntryPosVector *posvec;
1218
1219 /*
1220 * We can't use the _POSVECPTR macro here because the pointer to the
1221 * tsvector's lexeme storage is already contained in chkval->values.
1222 */
1223 posvec = (WordEntryPosVector *)
1224 (chkval->values + SHORTALIGN(entry->pos + entry->len));
1225
1226 if (val->weight && data)
1227 {
1228 WordEntryPos *posvec_iter = posvec->pos;
1229 WordEntryPos *dptr;
1230
1231 /*
1232 * Filter position information by weights
1233 */
1234 dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
1235 data->allocated = true;
1236
1237 /* Is there a position with a matching weight? */
1238 while (posvec_iter < posvec->pos + posvec->npos)
1239 {
1240 /* If true, append this position to the data->pos */
1241 if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
1242 {
1243 *dptr = WEP_GETPOS(*posvec_iter);
1244 dptr++;
1245 }
1246
1247 posvec_iter++;
1248 }
1249
1250 data->npos = dptr - data->pos;
1251
1252 if (data->npos > 0)
1253 result = true;
1254 }
1255 else if (val->weight)
1256 {
1257 WordEntryPos *posvec_iter = posvec->pos;
1258
1259 /* Is there a position with a matching weight? */
1260 while (posvec_iter < posvec->pos + posvec->npos)
1261 {
1262 if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
1263 {
1264 result = true;
1265 break; /* no need to go further */
1266 }
1267
1268 posvec_iter++;
1269 }
1270 }
1271 else /* data != NULL */
1272 {
1273 data->npos = posvec->npos;
1274 data->pos = posvec->pos;
1275 data->allocated = false;
1276 result = true;
1277 }
1278 }
1279 else
1280 {
1281 result = true;
1282 }
1283
1284 return result;
1285 }
1286
1287 /*
1288 * Removes duplicate pos entries. We can't use uniquePos() from
1289 * tsvector.c because array might be longer than MAXENTRYPOS
1290 *
1291 * Returns new length.
1292 */
1293 static int
uniqueLongPos(WordEntryPos * pos,int npos)1294 uniqueLongPos(WordEntryPos *pos, int npos)
1295 {
1296 WordEntryPos *pos_iter,
1297 *result;
1298
1299 if (npos <= 1)
1300 return npos;
1301
1302 qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
1303
1304 result = pos;
1305 pos_iter = pos + 1;
1306 while (pos_iter < pos + npos)
1307 {
1308 if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
1309 {
1310 result++;
1311 *result = WEP_GETPOS(*pos_iter);
1312 }
1313
1314 pos_iter++;
1315 }
1316
1317 return result + 1 - pos;
1318 }
1319
1320 /*
1321 * is there value 'val' in array or not ?
1322 */
1323 static bool
checkcondition_str(void * checkval,QueryOperand * val,ExecPhraseData * data)1324 checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
1325 {
1326 CHKVAL *chkval = (CHKVAL *) checkval;
1327 WordEntry *StopLow = chkval->arrb;
1328 WordEntry *StopHigh = chkval->arre;
1329 WordEntry *StopMiddle = StopHigh;
1330 bool res = false;
1331
1332 /* Loop invariant: StopLow <= val < StopHigh */
1333 while (StopLow < StopHigh)
1334 {
1335 int difference;
1336
1337 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
1338 difference = tsCompareString(chkval->operand + val->distance,
1339 val->length,
1340 chkval->values + StopMiddle->pos,
1341 StopMiddle->len,
1342 false);
1343
1344 if (difference == 0)
1345 {
1346 /* Check weight info & fill 'data' with positions */
1347 res = checkclass_str(chkval, StopMiddle, val, data);
1348 break;
1349 }
1350 else if (difference > 0)
1351 StopLow = StopMiddle + 1;
1352 else
1353 StopHigh = StopMiddle;
1354 }
1355
1356 if ((!res || data) && val->prefix)
1357 {
1358 WordEntryPos *allpos = NULL;
1359 int npos = 0,
1360 totalpos = 0;
1361
1362 /*
1363 * there was a failed exact search, so we should scan further to find
1364 * a prefix match. We also need to do so if caller needs position info
1365 */
1366 if (StopLow >= StopHigh)
1367 StopMiddle = StopHigh;
1368
1369 while ((!res || data) && StopMiddle < chkval->arre &&
1370 tsCompareString(chkval->operand + val->distance,
1371 val->length,
1372 chkval->values + StopMiddle->pos,
1373 StopMiddle->len,
1374 true) == 0)
1375 {
1376 if (data)
1377 {
1378 /*
1379 * We need to join position information
1380 */
1381 res = checkclass_str(chkval, StopMiddle, val, data);
1382
1383 if (res)
1384 {
1385 while (npos + data->npos >= totalpos)
1386 {
1387 if (totalpos == 0)
1388 {
1389 totalpos = 256;
1390 allpos = palloc(sizeof(WordEntryPos) * totalpos);
1391 }
1392 else
1393 {
1394 totalpos *= 2;
1395 allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos);
1396 }
1397 }
1398
1399 memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
1400 npos += data->npos;
1401 }
1402 else
1403 {
1404 /* at loop exit, res must be true if we found matches */
1405 res = (npos > 0);
1406 }
1407 }
1408 else
1409 {
1410 res = checkclass_str(chkval, StopMiddle, val, NULL);
1411 }
1412
1413 StopMiddle++;
1414 }
1415
1416 if (res && data)
1417 {
1418 /* Sort and make unique array of found positions */
1419 data->pos = allpos;
1420 data->npos = uniqueLongPos(allpos, npos);
1421 data->allocated = true;
1422 }
1423 }
1424
1425 return res;
1426 }
1427
1428 /*
1429 * Compute output position list for a tsquery operator in phrase mode.
1430 *
1431 * Merge the position lists in Ldata and Rdata as specified by "emit",
1432 * returning the result list into *data. The input position lists must be
1433 * sorted and unique, and the output will be as well.
1434 *
1435 * data: pointer to initially-all-zeroes output struct, or NULL
1436 * Ldata, Rdata: input position lists
1437 * emit: bitmask of TSPO_XXX flags
1438 * Loffset: offset to be added to Ldata positions before comparing/outputting
1439 * Roffset: offset to be added to Rdata positions before comparing/outputting
1440 * max_npos: maximum possible required size of output position array
1441 *
1442 * Loffset and Roffset should not be negative, else we risk trying to output
1443 * negative positions, which won't fit into WordEntryPos.
1444 *
1445 * The result is boolean (TS_YES or TS_NO), but for the caller's convenience
1446 * we return it as TSTernaryValue.
1447 *
1448 * Returns TS_YES if any positions were emitted to *data; or if data is NULL,
1449 * returns TS_YES if any positions would have been emitted.
1450 */
1451 #define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
1452 #define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
1453 #define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
1454
1455 static TSTernaryValue
TS_phrase_output(ExecPhraseData * data,ExecPhraseData * Ldata,ExecPhraseData * Rdata,int emit,int Loffset,int Roffset,int max_npos)1456 TS_phrase_output(ExecPhraseData *data,
1457 ExecPhraseData *Ldata,
1458 ExecPhraseData *Rdata,
1459 int emit,
1460 int Loffset,
1461 int Roffset,
1462 int max_npos)
1463 {
1464 int Lindex,
1465 Rindex;
1466
1467 /* Loop until both inputs are exhausted */
1468 Lindex = Rindex = 0;
1469 while (Lindex < Ldata->npos || Rindex < Rdata->npos)
1470 {
1471 int Lpos,
1472 Rpos;
1473 int output_pos = 0;
1474
1475 /*
1476 * Fetch current values to compare. WEP_GETPOS() is needed because
1477 * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
1478 */
1479 if (Lindex < Ldata->npos)
1480 Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
1481 else
1482 {
1483 /* L array exhausted, so we're done if R_ONLY isn't set */
1484 if (!(emit & TSPO_R_ONLY))
1485 break;
1486 Lpos = INT_MAX;
1487 }
1488 if (Rindex < Rdata->npos)
1489 Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
1490 else
1491 {
1492 /* R array exhausted, so we're done if L_ONLY isn't set */
1493 if (!(emit & TSPO_L_ONLY))
1494 break;
1495 Rpos = INT_MAX;
1496 }
1497
1498 /* Merge-join the two input lists */
1499 if (Lpos < Rpos)
1500 {
1501 /* Lpos is not matched in Rdata, should we output it? */
1502 if (emit & TSPO_L_ONLY)
1503 output_pos = Lpos;
1504 Lindex++;
1505 }
1506 else if (Lpos == Rpos)
1507 {
1508 /* Lpos and Rpos match ... should we output it? */
1509 if (emit & TSPO_BOTH)
1510 output_pos = Rpos;
1511 Lindex++;
1512 Rindex++;
1513 }
1514 else /* Lpos > Rpos */
1515 {
1516 /* Rpos is not matched in Ldata, should we output it? */
1517 if (emit & TSPO_R_ONLY)
1518 output_pos = Rpos;
1519 Rindex++;
1520 }
1521
1522 if (output_pos > 0)
1523 {
1524 if (data)
1525 {
1526 /* Store position, first allocating output array if needed */
1527 if (data->pos == NULL)
1528 {
1529 data->pos = (WordEntryPos *)
1530 palloc(max_npos * sizeof(WordEntryPos));
1531 data->allocated = true;
1532 }
1533 data->pos[data->npos++] = output_pos;
1534 }
1535 else
1536 {
1537 /*
1538 * Exact positions not needed, so return TS_YES as soon as we
1539 * know there is at least one.
1540 */
1541 return TS_YES;
1542 }
1543 }
1544 }
1545
1546 if (data && data->npos > 0)
1547 {
1548 /* Let's assert we didn't overrun the array */
1549 Assert(data->npos <= max_npos);
1550 return TS_YES;
1551 }
1552 return TS_NO;
1553 }
1554
1555 /*
1556 * Execute tsquery at or below an OP_PHRASE operator.
1557 *
1558 * This handles tsquery execution at recursion levels where we need to care
1559 * about match locations.
1560 *
1561 * In addition to the same arguments used for TS_execute, the caller may pass
1562 * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
1563 * match position info on success. data == NULL if no position data need be
1564 * returned. (In practice, outside callers pass NULL, and only the internal
1565 * recursion cases pass a data pointer.)
1566 * Note: the function assumes data != NULL for operators other than OP_PHRASE.
1567 * This is OK because an outside call always starts from an OP_PHRASE node.
1568 *
1569 * The detailed semantics of the match data, given that the function returned
1570 * TS_YES (successful match), are:
1571 *
1572 * npos > 0, negate = false:
1573 * query is matched at specified position(s) (and only those positions)
1574 * npos > 0, negate = true:
1575 * query is matched at all positions *except* specified position(s)
1576 * npos = 0, negate = true:
1577 * query is matched at all positions
1578 * npos = 0, negate = false:
1579 * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate)
1580 *
1581 * Successful matches also return a "width" value which is the match width in
1582 * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
1583 * and is the sum of the phrase operator distances for phrase matches. Note
1584 * that when width > 0, the listed positions represent the ends of matches not
1585 * the starts. (This unintuitive rule is needed to avoid possibly generating
1586 * negative positions, which wouldn't fit into the WordEntryPos arrays.)
1587 *
1588 * If the TSExecuteCallback function reports that an operand is present
1589 * but fails to provide position(s) for it, we will return TS_MAYBE when
1590 * it is possible but not certain that the query is matched.
1591 *
1592 * When the function returns TS_NO or TS_MAYBE, it must return npos = 0,
1593 * negate = false (which is the state initialized by the caller); but the
1594 * "width" output in such cases is undefined.
1595 */
1596 static TSTernaryValue
TS_phrase_execute(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond,ExecPhraseData * data)1597 TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
1598 TSExecuteCallback chkcond,
1599 ExecPhraseData *data)
1600 {
1601 ExecPhraseData Ldata,
1602 Rdata;
1603 TSTernaryValue lmatch,
1604 rmatch;
1605 int Loffset,
1606 Roffset,
1607 maxwidth;
1608
1609 /* since this function recurses, it could be driven to stack overflow */
1610 check_stack_depth();
1611
1612 if (curitem->type == QI_VAL)
1613 {
1614 if (!chkcond(arg, (QueryOperand *) curitem, data))
1615 return TS_NO;
1616 if (data->npos > 0 || data->negate)
1617 return TS_YES;
1618 /* If we have no position data, we must return TS_MAYBE */
1619 return TS_MAYBE;
1620 }
1621
1622 switch (curitem->qoperator.oper)
1623 {
1624 case OP_NOT:
1625
1626 /*
1627 * We need not touch data->width, since a NOT operation does not
1628 * change the match width.
1629 */
1630 if (!(flags & TS_EXEC_CALC_NOT))
1631 {
1632 /* without CALC_NOT, report NOT as "match everywhere" */
1633 Assert(data->npos == 0 && !data->negate);
1634 data->negate = true;
1635 return TS_YES;
1636 }
1637 switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
1638 {
1639 case TS_NO:
1640 /* change "match nowhere" to "match everywhere" */
1641 Assert(data->npos == 0 && !data->negate);
1642 data->negate = true;
1643 return TS_YES;
1644 case TS_YES:
1645 if (data->npos > 0)
1646 {
1647 /* we have some positions, invert negate flag */
1648 data->negate = !data->negate;
1649 return TS_YES;
1650 }
1651 else if (data->negate)
1652 {
1653 /* change "match everywhere" to "match nowhere" */
1654 data->negate = false;
1655 return TS_NO;
1656 }
1657 /* Should not get here if result was TS_YES */
1658 Assert(false);
1659 break;
1660 case TS_MAYBE:
1661 /* match positions are, and remain, uncertain */
1662 return TS_MAYBE;
1663 }
1664 break;
1665
1666 case OP_PHRASE:
1667 case OP_AND:
1668 memset(&Ldata, 0, sizeof(Ldata));
1669 memset(&Rdata, 0, sizeof(Rdata));
1670
1671 lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
1672 arg, flags, chkcond, &Ldata);
1673 if (lmatch == TS_NO)
1674 return TS_NO;
1675
1676 rmatch = TS_phrase_execute(curitem + 1,
1677 arg, flags, chkcond, &Rdata);
1678 if (rmatch == TS_NO)
1679 return TS_NO;
1680
1681 /*
1682 * If either operand has no position information, then we can't
1683 * return reliable position data, only a MAYBE result.
1684 */
1685 if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
1686 return TS_MAYBE;
1687
1688 if (curitem->qoperator.oper == OP_PHRASE)
1689 {
1690 /*
1691 * Compute Loffset and Roffset suitable for phrase match, and
1692 * compute overall width of whole phrase match.
1693 */
1694 Loffset = curitem->qoperator.distance + Rdata.width;
1695 Roffset = 0;
1696 if (data)
1697 data->width = curitem->qoperator.distance +
1698 Ldata.width + Rdata.width;
1699 }
1700 else
1701 {
1702 /*
1703 * For OP_AND, set output width and alignment like OP_OR (see
1704 * comment below)
1705 */
1706 maxwidth = Max(Ldata.width, Rdata.width);
1707 Loffset = maxwidth - Ldata.width;
1708 Roffset = maxwidth - Rdata.width;
1709 if (data)
1710 data->width = maxwidth;
1711 }
1712
1713 if (Ldata.negate && Rdata.negate)
1714 {
1715 /* !L & !R: treat as !(L | R) */
1716 (void) TS_phrase_output(data, &Ldata, &Rdata,
1717 TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1718 Loffset, Roffset,
1719 Ldata.npos + Rdata.npos);
1720 if (data)
1721 data->negate = true;
1722 return TS_YES;
1723 }
1724 else if (Ldata.negate)
1725 {
1726 /* !L & R */
1727 return TS_phrase_output(data, &Ldata, &Rdata,
1728 TSPO_R_ONLY,
1729 Loffset, Roffset,
1730 Rdata.npos);
1731 }
1732 else if (Rdata.negate)
1733 {
1734 /* L & !R */
1735 return TS_phrase_output(data, &Ldata, &Rdata,
1736 TSPO_L_ONLY,
1737 Loffset, Roffset,
1738 Ldata.npos);
1739 }
1740 else
1741 {
1742 /* straight AND */
1743 return TS_phrase_output(data, &Ldata, &Rdata,
1744 TSPO_BOTH,
1745 Loffset, Roffset,
1746 Min(Ldata.npos, Rdata.npos));
1747 }
1748
1749 case OP_OR:
1750 memset(&Ldata, 0, sizeof(Ldata));
1751 memset(&Rdata, 0, sizeof(Rdata));
1752
1753 lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
1754 arg, flags, chkcond, &Ldata);
1755 rmatch = TS_phrase_execute(curitem + 1,
1756 arg, flags, chkcond, &Rdata);
1757
1758 if (lmatch == TS_NO && rmatch == TS_NO)
1759 return TS_NO;
1760
1761 /*
1762 * If either operand has no position information, then we can't
1763 * return reliable position data, only a MAYBE result.
1764 */
1765 if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
1766 return TS_MAYBE;
1767
1768 /*
1769 * Cope with undefined output width from failed submatch. (This
1770 * takes less code than trying to ensure that all failure returns
1771 * set data->width to zero.)
1772 */
1773 if (lmatch == TS_NO)
1774 Ldata.width = 0;
1775 if (rmatch == TS_NO)
1776 Rdata.width = 0;
1777
1778 /*
1779 * For OP_AND and OP_OR, report the width of the wider of the two
1780 * inputs, and align the narrower input's positions to the right
1781 * end of that width. This rule deals at least somewhat
1782 * reasonably with cases like "x <-> (y | z <-> q)".
1783 */
1784 maxwidth = Max(Ldata.width, Rdata.width);
1785 Loffset = maxwidth - Ldata.width;
1786 Roffset = maxwidth - Rdata.width;
1787 data->width = maxwidth;
1788
1789 if (Ldata.negate && Rdata.negate)
1790 {
1791 /* !L | !R: treat as !(L & R) */
1792 (void) TS_phrase_output(data, &Ldata, &Rdata,
1793 TSPO_BOTH,
1794 Loffset, Roffset,
1795 Min(Ldata.npos, Rdata.npos));
1796 data->negate = true;
1797 return TS_YES;
1798 }
1799 else if (Ldata.negate)
1800 {
1801 /* !L | R: treat as !(L & !R) */
1802 (void) TS_phrase_output(data, &Ldata, &Rdata,
1803 TSPO_L_ONLY,
1804 Loffset, Roffset,
1805 Ldata.npos);
1806 data->negate = true;
1807 return TS_YES;
1808 }
1809 else if (Rdata.negate)
1810 {
1811 /* L | !R: treat as !(!L & R) */
1812 (void) TS_phrase_output(data, &Ldata, &Rdata,
1813 TSPO_R_ONLY,
1814 Loffset, Roffset,
1815 Rdata.npos);
1816 data->negate = true;
1817 return TS_YES;
1818 }
1819 else
1820 {
1821 /* straight OR */
1822 return TS_phrase_output(data, &Ldata, &Rdata,
1823 TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1824 Loffset, Roffset,
1825 Ldata.npos + Rdata.npos);
1826 }
1827
1828 default:
1829 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1830 }
1831
1832 /* not reachable, but keep compiler quiet */
1833 return TS_NO;
1834 }
1835
1836
1837 /*
1838 * Evaluate tsquery boolean expression.
1839 *
1840 * curitem: current tsquery item (initially, the first one)
1841 * arg: opaque value to pass through to callback function
1842 * flags: bitmask of flag bits shown in ts_utils.h
1843 * chkcond: callback function to check whether a primitive value is present
1844 */
1845 bool
TS_execute(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond)1846 TS_execute(QueryItem *curitem, void *arg, uint32 flags,
1847 TSExecuteCallback chkcond)
1848 {
1849 /*
1850 * If we get TS_MAYBE from the recursion, return true. We could only see
1851 * that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no
1852 * need to check again.
1853 */
1854 return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO;
1855 }
1856
1857 /*
1858 * TS_execute recursion for operators above any phrase operator. Here we do
1859 * not need to worry about lexeme positions. As soon as we hit an OP_PHRASE
1860 * operator, we pass it off to TS_phrase_execute which does worry.
1861 */
1862 static TSTernaryValue
TS_execute_recurse(QueryItem * curitem,void * arg,uint32 flags,TSExecuteCallback chkcond)1863 TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
1864 TSExecuteCallback chkcond)
1865 {
1866 TSTernaryValue lmatch;
1867
1868 /* since this function recurses, it could be driven to stack overflow */
1869 check_stack_depth();
1870
1871 /* ... and let's check for query cancel while we're at it */
1872 CHECK_FOR_INTERRUPTS();
1873
1874 if (curitem->type == QI_VAL)
1875 return chkcond(arg, (QueryOperand *) curitem,
1876 NULL /* don't need position info */ ) ? TS_YES : TS_NO;
1877
1878 switch (curitem->qoperator.oper)
1879 {
1880 case OP_NOT:
1881 if (!(flags & TS_EXEC_CALC_NOT))
1882 return TS_YES;
1883 switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1884 {
1885 case TS_NO:
1886 return TS_YES;
1887 case TS_YES:
1888 return TS_NO;
1889 case TS_MAYBE:
1890 return TS_MAYBE;
1891 }
1892 break;
1893
1894 case OP_AND:
1895 lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
1896 flags, chkcond);
1897 if (lmatch == TS_NO)
1898 return TS_NO;
1899 switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1900 {
1901 case TS_NO:
1902 return TS_NO;
1903 case TS_YES:
1904 return lmatch;
1905 case TS_MAYBE:
1906 return TS_MAYBE;
1907 }
1908 break;
1909
1910 case OP_OR:
1911 lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
1912 flags, chkcond);
1913 if (lmatch == TS_YES)
1914 return TS_YES;
1915 switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1916 {
1917 case TS_NO:
1918 return lmatch;
1919 case TS_YES:
1920 return TS_YES;
1921 case TS_MAYBE:
1922 return TS_MAYBE;
1923 }
1924 break;
1925
1926 case OP_PHRASE:
1927
1928 /*
1929 * If we get a MAYBE result, and the caller doesn't want that,
1930 * convert it to NO. It would be more consistent, perhaps, to
1931 * return the result of TS_phrase_execute() verbatim and then
1932 * convert MAYBE results at the top of the recursion. But
1933 * converting at the topmost phrase operator gives results that
1934 * are bug-compatible with the old implementation, so do it like
1935 * this for now.
1936 */
1937 switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL))
1938 {
1939 case TS_NO:
1940 return TS_NO;
1941 case TS_YES:
1942 return TS_YES;
1943 case TS_MAYBE:
1944 return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO;
1945 }
1946 break;
1947
1948 default:
1949 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1950 }
1951
1952 /* not reachable, but keep compiler quiet */
1953 return TS_NO;
1954 }
1955
1956 /*
1957 * Detect whether a tsquery boolean expression requires any positive matches
1958 * to values shown in the tsquery.
1959 *
1960 * This is needed to know whether a GIN index search requires full index scan.
1961 * For example, 'x & !y' requires a match of x, so it's sufficient to scan
1962 * entries for x; but 'x | !y' could match rows containing neither x nor y.
1963 */
1964 bool
tsquery_requires_match(QueryItem * curitem)1965 tsquery_requires_match(QueryItem *curitem)
1966 {
1967 /* since this function recurses, it could be driven to stack overflow */
1968 check_stack_depth();
1969
1970 if (curitem->type == QI_VAL)
1971 return true;
1972
1973 switch (curitem->qoperator.oper)
1974 {
1975 case OP_NOT:
1976
1977 /*
1978 * Assume there are no required matches underneath a NOT. For
1979 * some cases with nested NOTs, we could prove there's a required
1980 * match, but it seems unlikely to be worth the trouble.
1981 */
1982 return false;
1983
1984 case OP_PHRASE:
1985
1986 /*
1987 * Treat OP_PHRASE as OP_AND here
1988 */
1989 case OP_AND:
1990 /* If either side requires a match, we're good */
1991 if (tsquery_requires_match(curitem + curitem->qoperator.left))
1992 return true;
1993 else
1994 return tsquery_requires_match(curitem + 1);
1995
1996 case OP_OR:
1997 /* Both sides must require a match */
1998 if (tsquery_requires_match(curitem + curitem->qoperator.left))
1999 return tsquery_requires_match(curitem + 1);
2000 else
2001 return false;
2002
2003 default:
2004 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
2005 }
2006
2007 /* not reachable, but keep compiler quiet */
2008 return false;
2009 }
2010
2011 /*
2012 * boolean operations
2013 */
2014 Datum
ts_match_qv(PG_FUNCTION_ARGS)2015 ts_match_qv(PG_FUNCTION_ARGS)
2016 {
2017 PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
2018 PG_GETARG_DATUM(1),
2019 PG_GETARG_DATUM(0)));
2020 }
2021
2022 Datum
ts_match_vq(PG_FUNCTION_ARGS)2023 ts_match_vq(PG_FUNCTION_ARGS)
2024 {
2025 TSVector val = PG_GETARG_TSVECTOR(0);
2026 TSQuery query = PG_GETARG_TSQUERY(1);
2027 CHKVAL chkval;
2028 bool result;
2029
2030 /* empty query matches nothing */
2031 if (!query->size)
2032 {
2033 PG_FREE_IF_COPY(val, 0);
2034 PG_FREE_IF_COPY(query, 1);
2035 PG_RETURN_BOOL(false);
2036 }
2037
2038 chkval.arrb = ARRPTR(val);
2039 chkval.arre = chkval.arrb + val->size;
2040 chkval.values = STRPTR(val);
2041 chkval.operand = GETOPERAND(query);
2042 result = TS_execute(GETQUERY(query),
2043 &chkval,
2044 TS_EXEC_CALC_NOT,
2045 checkcondition_str);
2046
2047 PG_FREE_IF_COPY(val, 0);
2048 PG_FREE_IF_COPY(query, 1);
2049 PG_RETURN_BOOL(result);
2050 }
2051
2052 Datum
ts_match_tt(PG_FUNCTION_ARGS)2053 ts_match_tt(PG_FUNCTION_ARGS)
2054 {
2055 TSVector vector;
2056 TSQuery query;
2057 bool res;
2058
2059 vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
2060 PG_GETARG_DATUM(0)));
2061 query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
2062 PG_GETARG_DATUM(1)));
2063
2064 res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
2065 TSVectorGetDatum(vector),
2066 TSQueryGetDatum(query)));
2067
2068 pfree(vector);
2069 pfree(query);
2070
2071 PG_RETURN_BOOL(res);
2072 }
2073
2074 Datum
ts_match_tq(PG_FUNCTION_ARGS)2075 ts_match_tq(PG_FUNCTION_ARGS)
2076 {
2077 TSVector vector;
2078 TSQuery query = PG_GETARG_TSQUERY(1);
2079 bool res;
2080
2081 vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
2082 PG_GETARG_DATUM(0)));
2083
2084 res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
2085 TSVectorGetDatum(vector),
2086 TSQueryGetDatum(query)));
2087
2088 pfree(vector);
2089 PG_FREE_IF_COPY(query, 1);
2090
2091 PG_RETURN_BOOL(res);
2092 }
2093
2094 /*
2095 * ts_stat statistic function support
2096 */
2097
2098
2099 /*
2100 * Returns the number of positions in value 'wptr' within tsvector 'txt',
2101 * that have a weight equal to one of the weights in 'weight' bitmask.
2102 */
2103 static int
check_weight(TSVector txt,WordEntry * wptr,int8 weight)2104 check_weight(TSVector txt, WordEntry *wptr, int8 weight)
2105 {
2106 int len = POSDATALEN(txt, wptr);
2107 int num = 0;
2108 WordEntryPos *ptr = POSDATAPTR(txt, wptr);
2109
2110 while (len--)
2111 {
2112 if (weight & (1 << WEP_GETWEIGHT(*ptr)))
2113 num++;
2114 ptr++;
2115 }
2116 return num;
2117 }
2118
2119 #define compareStatWord(a,e,t) \
2120 tsCompareString((a)->lexeme, (a)->lenlexeme, \
2121 STRPTR(t) + (e)->pos, (e)->len, \
2122 false)
2123
2124 static void
insertStatEntry(MemoryContext persistentContext,TSVectorStat * stat,TSVector txt,uint32 off)2125 insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
2126 {
2127 WordEntry *we = ARRPTR(txt) + off;
2128 StatEntry *node = stat->root,
2129 *pnode = NULL;
2130 int n,
2131 res = 0;
2132 uint32 depth = 1;
2133
2134 if (stat->weight == 0)
2135 n = (we->haspos) ? POSDATALEN(txt, we) : 1;
2136 else
2137 n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
2138
2139 if (n == 0)
2140 return; /* nothing to insert */
2141
2142 while (node)
2143 {
2144 res = compareStatWord(node, we, txt);
2145
2146 if (res == 0)
2147 {
2148 break;
2149 }
2150 else
2151 {
2152 pnode = node;
2153 node = (res < 0) ? node->left : node->right;
2154 }
2155 depth++;
2156 }
2157
2158 if (depth > stat->maxdepth)
2159 stat->maxdepth = depth;
2160
2161 if (node == NULL)
2162 {
2163 node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
2164 node->left = node->right = NULL;
2165 node->ndoc = 1;
2166 node->nentry = n;
2167 node->lenlexeme = we->len;
2168 memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
2169
2170 if (pnode == NULL)
2171 {
2172 stat->root = node;
2173 }
2174 else
2175 {
2176 if (res < 0)
2177 pnode->left = node;
2178 else
2179 pnode->right = node;
2180 }
2181
2182 }
2183 else
2184 {
2185 node->ndoc++;
2186 node->nentry += n;
2187 }
2188 }
2189
2190 static void
chooseNextStatEntry(MemoryContext persistentContext,TSVectorStat * stat,TSVector txt,uint32 low,uint32 high,uint32 offset)2191 chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt,
2192 uint32 low, uint32 high, uint32 offset)
2193 {
2194 uint32 pos;
2195 uint32 middle = (low + high) >> 1;
2196
2197 pos = (low + middle) >> 1;
2198 if (low != middle && pos >= offset && pos - offset < txt->size)
2199 insertStatEntry(persistentContext, stat, txt, pos - offset);
2200 pos = (high + middle + 1) >> 1;
2201 if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
2202 insertStatEntry(persistentContext, stat, txt, pos - offset);
2203
2204 if (low != middle)
2205 chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset);
2206 if (high != middle + 1)
2207 chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset);
2208 }
2209
2210 /*
2211 * This is written like a custom aggregate function, because the
2212 * original plan was to do just that. Unfortunately, an aggregate function
2213 * can't return a set, so that plan was abandoned. If that limitation is
2214 * lifted in the future, ts_stat could be a real aggregate function so that
2215 * you could use it like this:
2216 *
2217 * SELECT ts_stat(vector_column) FROM vector_table;
2218 *
2219 * where vector_column is a tsvector-type column in vector_table.
2220 */
2221
2222 static TSVectorStat *
ts_accum(MemoryContext persistentContext,TSVectorStat * stat,Datum data)2223 ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
2224 {
2225 TSVector txt = DatumGetTSVector(data);
2226 uint32 i,
2227 nbit = 0,
2228 offset;
2229
2230 if (stat == NULL)
2231 { /* Init in first */
2232 stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2233 stat->maxdepth = 1;
2234 }
2235
2236 /* simple check of correctness */
2237 if (txt == NULL || txt->size == 0)
2238 {
2239 if (txt && txt != (TSVector) DatumGetPointer(data))
2240 pfree(txt);
2241 return stat;
2242 }
2243
2244 i = txt->size - 1;
2245 for (; i > 0; i >>= 1)
2246 nbit++;
2247
2248 nbit = 1 << nbit;
2249 offset = (nbit - txt->size) / 2;
2250
2251 insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
2252 chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
2253
2254 return stat;
2255 }
2256
2257 static void
ts_setup_firstcall(FunctionCallInfo fcinfo,FuncCallContext * funcctx,TSVectorStat * stat)2258 ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
2259 TSVectorStat *stat)
2260 {
2261 TupleDesc tupdesc;
2262 MemoryContext oldcontext;
2263 StatEntry *node;
2264
2265 funcctx->user_fctx = (void *) stat;
2266
2267 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
2268
2269 stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1));
2270 stat->stackpos = 0;
2271
2272 node = stat->root;
2273 /* find leftmost value */
2274 if (node == NULL)
2275 stat->stack[stat->stackpos] = NULL;
2276 else
2277 for (;;)
2278 {
2279 stat->stack[stat->stackpos] = node;
2280 if (node->left)
2281 {
2282 stat->stackpos++;
2283 node = node->left;
2284 }
2285 else
2286 break;
2287 }
2288 Assert(stat->stackpos <= stat->maxdepth);
2289
2290 tupdesc = CreateTemplateTupleDesc(3, false);
2291 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word",
2292 TEXTOID, -1, 0);
2293 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc",
2294 INT4OID, -1, 0);
2295 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry",
2296 INT4OID, -1, 0);
2297 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
2298 funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
2299
2300 MemoryContextSwitchTo(oldcontext);
2301 }
2302
2303 static StatEntry *
walkStatEntryTree(TSVectorStat * stat)2304 walkStatEntryTree(TSVectorStat *stat)
2305 {
2306 StatEntry *node = stat->stack[stat->stackpos];
2307
2308 if (node == NULL)
2309 return NULL;
2310
2311 if (node->ndoc != 0)
2312 {
2313 /* return entry itself: we already was at left sublink */
2314 return node;
2315 }
2316 else if (node->right && node->right != stat->stack[stat->stackpos + 1])
2317 {
2318 /* go on right sublink */
2319 stat->stackpos++;
2320 node = node->right;
2321
2322 /* find most-left value */
2323 for (;;)
2324 {
2325 stat->stack[stat->stackpos] = node;
2326 if (node->left)
2327 {
2328 stat->stackpos++;
2329 node = node->left;
2330 }
2331 else
2332 break;
2333 }
2334 Assert(stat->stackpos <= stat->maxdepth);
2335 }
2336 else
2337 {
2338 /* we already return all left subtree, itself and right subtree */
2339 if (stat->stackpos == 0)
2340 return NULL;
2341
2342 stat->stackpos--;
2343 return walkStatEntryTree(stat);
2344 }
2345
2346 return node;
2347 }
2348
2349 static Datum
ts_process_call(FuncCallContext * funcctx)2350 ts_process_call(FuncCallContext *funcctx)
2351 {
2352 TSVectorStat *st;
2353 StatEntry *entry;
2354
2355 st = (TSVectorStat *) funcctx->user_fctx;
2356
2357 entry = walkStatEntryTree(st);
2358
2359 if (entry != NULL)
2360 {
2361 Datum result;
2362 char *values[3];
2363 char ndoc[16];
2364 char nentry[16];
2365 HeapTuple tuple;
2366
2367 values[0] = palloc(entry->lenlexeme + 1);
2368 memcpy(values[0], entry->lexeme, entry->lenlexeme);
2369 (values[0])[entry->lenlexeme] = '\0';
2370 sprintf(ndoc, "%d", entry->ndoc);
2371 values[1] = ndoc;
2372 sprintf(nentry, "%d", entry->nentry);
2373 values[2] = nentry;
2374
2375 tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
2376 result = HeapTupleGetDatum(tuple);
2377
2378 pfree(values[0]);
2379
2380 /* mark entry as already visited */
2381 entry->ndoc = 0;
2382
2383 return result;
2384 }
2385
2386 return (Datum) 0;
2387 }
2388
2389 static TSVectorStat *
ts_stat_sql(MemoryContext persistentContext,text * txt,text * ws)2390 ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
2391 {
2392 char *query = text_to_cstring(txt);
2393 TSVectorStat *stat;
2394 bool isnull;
2395 Portal portal;
2396 SPIPlanPtr plan;
2397
2398 if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
2399 /* internal error */
2400 elog(ERROR, "SPI_prepare(\"%s\") failed", query);
2401
2402 if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL)
2403 /* internal error */
2404 elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
2405
2406 SPI_cursor_fetch(portal, true, 100);
2407
2408 if (SPI_tuptable == NULL ||
2409 SPI_tuptable->tupdesc->natts != 1 ||
2410 !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
2411 TSVECTOROID))
2412 ereport(ERROR,
2413 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2414 errmsg("ts_stat query must return one tsvector column")));
2415
2416 stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2417 stat->maxdepth = 1;
2418
2419 if (ws)
2420 {
2421 char *buf;
2422
2423 buf = VARDATA_ANY(ws);
2424 while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
2425 {
2426 if (pg_mblen(buf) == 1)
2427 {
2428 switch (*buf)
2429 {
2430 case 'A':
2431 case 'a':
2432 stat->weight |= 1 << 3;
2433 break;
2434 case 'B':
2435 case 'b':
2436 stat->weight |= 1 << 2;
2437 break;
2438 case 'C':
2439 case 'c':
2440 stat->weight |= 1 << 1;
2441 break;
2442 case 'D':
2443 case 'd':
2444 stat->weight |= 1;
2445 break;
2446 default:
2447 stat->weight |= 0;
2448 }
2449 }
2450 buf += pg_mblen(buf);
2451 }
2452 }
2453
2454 while (SPI_processed > 0)
2455 {
2456 uint64 i;
2457
2458 for (i = 0; i < SPI_processed; i++)
2459 {
2460 Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
2461
2462 if (!isnull)
2463 stat = ts_accum(persistentContext, stat, data);
2464 }
2465
2466 SPI_freetuptable(SPI_tuptable);
2467 SPI_cursor_fetch(portal, true, 100);
2468 }
2469
2470 SPI_freetuptable(SPI_tuptable);
2471 SPI_cursor_close(portal);
2472 SPI_freeplan(plan);
2473 pfree(query);
2474
2475 return stat;
2476 }
2477
2478 Datum
ts_stat1(PG_FUNCTION_ARGS)2479 ts_stat1(PG_FUNCTION_ARGS)
2480 {
2481 FuncCallContext *funcctx;
2482 Datum result;
2483
2484 if (SRF_IS_FIRSTCALL())
2485 {
2486 TSVectorStat *stat;
2487 text *txt = PG_GETARG_TEXT_PP(0);
2488
2489 funcctx = SRF_FIRSTCALL_INIT();
2490 SPI_connect();
2491 stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL);
2492 PG_FREE_IF_COPY(txt, 0);
2493 ts_setup_firstcall(fcinfo, funcctx, stat);
2494 SPI_finish();
2495 }
2496
2497 funcctx = SRF_PERCALL_SETUP();
2498 if ((result = ts_process_call(funcctx)) != (Datum) 0)
2499 SRF_RETURN_NEXT(funcctx, result);
2500 SRF_RETURN_DONE(funcctx);
2501 }
2502
2503 Datum
ts_stat2(PG_FUNCTION_ARGS)2504 ts_stat2(PG_FUNCTION_ARGS)
2505 {
2506 FuncCallContext *funcctx;
2507 Datum result;
2508
2509 if (SRF_IS_FIRSTCALL())
2510 {
2511 TSVectorStat *stat;
2512 text *txt = PG_GETARG_TEXT_PP(0);
2513 text *ws = PG_GETARG_TEXT_PP(1);
2514
2515 funcctx = SRF_FIRSTCALL_INIT();
2516 SPI_connect();
2517 stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws);
2518 PG_FREE_IF_COPY(txt, 0);
2519 PG_FREE_IF_COPY(ws, 1);
2520 ts_setup_firstcall(fcinfo, funcctx, stat);
2521 SPI_finish();
2522 }
2523
2524 funcctx = SRF_PERCALL_SETUP();
2525 if ((result = ts_process_call(funcctx)) != (Datum) 0)
2526 SRF_RETURN_NEXT(funcctx, result);
2527 SRF_RETURN_DONE(funcctx);
2528 }
2529
2530
2531 /*
2532 * Triggers for automatic update of a tsvector column from text column(s)
2533 *
2534 * Trigger arguments are either
2535 * name of tsvector col, name of tsconfig to use, name(s) of text col(s)
2536 * name of tsvector col, name of regconfig col, name(s) of text col(s)
2537 * ie, tsconfig can either be specified by name, or indirectly as the
2538 * contents of a regconfig field in the row. If the name is used, it must
2539 * be explicitly schema-qualified.
2540 */
2541 Datum
tsvector_update_trigger_byid(PG_FUNCTION_ARGS)2542 tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
2543 {
2544 return tsvector_update_trigger(fcinfo, false);
2545 }
2546
2547 Datum
tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)2548 tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
2549 {
2550 return tsvector_update_trigger(fcinfo, true);
2551 }
2552
2553 static Datum
tsvector_update_trigger(PG_FUNCTION_ARGS,bool config_column)2554 tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
2555 {
2556 TriggerData *trigdata;
2557 Trigger *trigger;
2558 Relation rel;
2559 HeapTuple rettuple = NULL;
2560 int tsvector_attr_num,
2561 i;
2562 ParsedText prs;
2563 Datum datum;
2564 bool isnull;
2565 text *txt;
2566 Oid cfgId;
2567
2568 /* Check call context */
2569 if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
2570 elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
2571
2572 trigdata = (TriggerData *) fcinfo->context;
2573 if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event))
2574 elog(ERROR, "tsvector_update_trigger: must be fired for row");
2575 if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event))
2576 elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
2577
2578 if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
2579 rettuple = trigdata->tg_trigtuple;
2580 else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
2581 rettuple = trigdata->tg_newtuple;
2582 else
2583 elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
2584
2585 trigger = trigdata->tg_trigger;
2586 rel = trigdata->tg_relation;
2587
2588 if (trigger->tgnargs < 3)
2589 elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
2590
2591 /* Find the target tsvector column */
2592 tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
2593 if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
2594 ereport(ERROR,
2595 (errcode(ERRCODE_UNDEFINED_COLUMN),
2596 errmsg("tsvector column \"%s\" does not exist",
2597 trigger->tgargs[0])));
2598 /* This will effectively reject system columns, so no separate test: */
2599 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num),
2600 TSVECTOROID))
2601 ereport(ERROR,
2602 (errcode(ERRCODE_DATATYPE_MISMATCH),
2603 errmsg("column \"%s\" is not of tsvector type",
2604 trigger->tgargs[0])));
2605
2606 /* Find the configuration to use */
2607 if (config_column)
2608 {
2609 int config_attr_num;
2610
2611 config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
2612 if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
2613 ereport(ERROR,
2614 (errcode(ERRCODE_UNDEFINED_COLUMN),
2615 errmsg("configuration column \"%s\" does not exist",
2616 trigger->tgargs[1])));
2617 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num),
2618 REGCONFIGOID))
2619 ereport(ERROR,
2620 (errcode(ERRCODE_DATATYPE_MISMATCH),
2621 errmsg("column \"%s\" is not of regconfig type",
2622 trigger->tgargs[1])));
2623
2624 datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
2625 if (isnull)
2626 ereport(ERROR,
2627 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
2628 errmsg("configuration column \"%s\" must not be null",
2629 trigger->tgargs[1])));
2630 cfgId = DatumGetObjectId(datum);
2631 }
2632 else
2633 {
2634 List *names;
2635
2636 names = stringToQualifiedNameList(trigger->tgargs[1]);
2637 /* require a schema so that results are not search path dependent */
2638 if (list_length(names) < 2)
2639 ereport(ERROR,
2640 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2641 errmsg("text search configuration name \"%s\" must be schema-qualified",
2642 trigger->tgargs[1])));
2643 cfgId = get_ts_config_oid(names, false);
2644 }
2645
2646 /* initialize parse state */
2647 prs.lenwords = 32;
2648 prs.curwords = 0;
2649 prs.pos = 0;
2650 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
2651
2652 /* find all words in indexable column(s) */
2653 for (i = 2; i < trigger->tgnargs; i++)
2654 {
2655 int numattr;
2656
2657 numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
2658 if (numattr == SPI_ERROR_NOATTRIBUTE)
2659 ereport(ERROR,
2660 (errcode(ERRCODE_UNDEFINED_COLUMN),
2661 errmsg("column \"%s\" does not exist",
2662 trigger->tgargs[i])));
2663 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID))
2664 ereport(ERROR,
2665 (errcode(ERRCODE_DATATYPE_MISMATCH),
2666 errmsg("column \"%s\" is not of a character type",
2667 trigger->tgargs[i])));
2668
2669 datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
2670 if (isnull)
2671 continue;
2672
2673 txt = DatumGetTextPP(datum);
2674
2675 parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
2676
2677 if (txt != (text *) DatumGetPointer(datum))
2678 pfree(txt);
2679 }
2680
2681 /* make tsvector value */
2682 datum = TSVectorGetDatum(make_tsvector(&prs));
2683 isnull = false;
2684
2685 /* and insert it into tuple */
2686 rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
2687 1, &tsvector_attr_num,
2688 &datum, &isnull);
2689
2690 pfree(DatumGetPointer(datum));
2691
2692 return PointerGetDatum(rettuple);
2693 }
2694