1 /*-------------------------------------------------------------------------
2 *
3 * to_tsany.c
4 * to_ts* function definitions
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/to_tsany.c
11 *
12 *-------------------------------------------------------------------------
13 */
#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/jsonapi.h"
#include "utils/memutils.h"
20
21
22 typedef struct MorphOpaque
23 {
24 Oid cfg_id;
25 int qoperator; /* query operator */
26 } MorphOpaque;
27
28 typedef struct TSVectorBuildState
29 {
30 ParsedText *prs;
31 Oid cfgId;
32 } TSVectorBuildState;
33
/* Per-element callback used by the json(b) worker functions. */
static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
35
36
37 Datum
get_current_ts_config(PG_FUNCTION_ARGS)38 get_current_ts_config(PG_FUNCTION_ARGS)
39 {
40 PG_RETURN_OID(getTSCurrentConfig(true));
41 }
42
43 /*
44 * to_tsvector
45 */
46 static int
compareWORD(const void * a,const void * b)47 compareWORD(const void *a, const void *b)
48 {
49 int res;
50
51 res = tsCompareString(
52 ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
53 ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
54 false);
55
56 if (res == 0)
57 {
58 if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
59 return 0;
60
61 res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
62 }
63
64 return res;
65 }
66
67 static int
uniqueWORD(ParsedWord * a,int32 l)68 uniqueWORD(ParsedWord *a, int32 l)
69 {
70 ParsedWord *ptr,
71 *res;
72 int tmppos;
73
74 if (l == 1)
75 {
76 tmppos = LIMITPOS(a->pos.pos);
77 a->alen = 2;
78 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
79 a->pos.apos[0] = 1;
80 a->pos.apos[1] = tmppos;
81 return l;
82 }
83
84 res = a;
85 ptr = a + 1;
86
87 /*
88 * Sort words with its positions
89 */
90 qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
91
92 /*
93 * Initialize first word and its first position
94 */
95 tmppos = LIMITPOS(a->pos.pos);
96 a->alen = 2;
97 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
98 a->pos.apos[0] = 1;
99 a->pos.apos[1] = tmppos;
100
101 /*
102 * Summarize position information for each word
103 */
104 while (ptr - a < l)
105 {
106 if (!(ptr->len == res->len &&
107 strncmp(ptr->word, res->word, res->len) == 0))
108 {
109 /*
110 * Got a new word, so put it in result
111 */
112 res++;
113 res->len = ptr->len;
114 res->word = ptr->word;
115 tmppos = LIMITPOS(ptr->pos.pos);
116 res->alen = 2;
117 res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
118 res->pos.apos[0] = 1;
119 res->pos.apos[1] = tmppos;
120 }
121 else
122 {
123 /*
124 * The word already exists, so adjust position information. But
125 * before we should check size of position's array, max allowed
126 * value for position and uniqueness of position
127 */
128 pfree(ptr->word);
129 if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
130 res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
131 {
132 if (res->pos.apos[0] + 1 >= res->alen)
133 {
134 res->alen *= 2;
135 res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
136 }
137 if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
138 {
139 res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
140 res->pos.apos[0]++;
141 }
142 }
143 }
144 ptr++;
145 }
146
147 return res + 1 - a;
148 }
149
150 /*
151 * make value of tsvector, given parsed text
152 *
153 * Note: frees prs->words and subsidiary data.
154 */
155 TSVector
make_tsvector(ParsedText * prs)156 make_tsvector(ParsedText *prs)
157 {
158 int i,
159 j,
160 lenstr = 0,
161 totallen;
162 TSVector in;
163 WordEntry *ptr;
164 char *str;
165 int stroff;
166
167 /* Merge duplicate words */
168 if (prs->curwords > 0)
169 prs->curwords = uniqueWORD(prs->words, prs->curwords);
170
171 /* Determine space needed */
172 for (i = 0; i < prs->curwords; i++)
173 {
174 lenstr += prs->words[i].len;
175 if (prs->words[i].alen)
176 {
177 lenstr = SHORTALIGN(lenstr);
178 lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
179 }
180 }
181
182 if (lenstr > MAXSTRPOS)
183 ereport(ERROR,
184 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
185 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
186
187 totallen = CALCDATASIZE(prs->curwords, lenstr);
188 in = (TSVector) palloc0(totallen);
189 SET_VARSIZE(in, totallen);
190 in->size = prs->curwords;
191
192 ptr = ARRPTR(in);
193 str = STRPTR(in);
194 stroff = 0;
195 for (i = 0; i < prs->curwords; i++)
196 {
197 ptr->len = prs->words[i].len;
198 ptr->pos = stroff;
199 memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
200 stroff += prs->words[i].len;
201 pfree(prs->words[i].word);
202 if (prs->words[i].alen)
203 {
204 int k = prs->words[i].pos.apos[0];
205 WordEntryPos *wptr;
206
207 if (k > 0xFFFF)
208 elog(ERROR, "positions array too long");
209
210 ptr->haspos = 1;
211 stroff = SHORTALIGN(stroff);
212 *(uint16 *) (str + stroff) = (uint16) k;
213 wptr = POSDATAPTR(in, ptr);
214 for (j = 0; j < k; j++)
215 {
216 WEP_SETWEIGHT(wptr[j], 0);
217 WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
218 }
219 stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
220 pfree(prs->words[i].pos.apos);
221 }
222 else
223 ptr->haspos = 0;
224 ptr++;
225 }
226
227 if (prs->words)
228 pfree(prs->words);
229
230 return in;
231 }
232
233 Datum
to_tsvector_byid(PG_FUNCTION_ARGS)234 to_tsvector_byid(PG_FUNCTION_ARGS)
235 {
236 Oid cfgId = PG_GETARG_OID(0);
237 text *in = PG_GETARG_TEXT_PP(1);
238 ParsedText prs;
239 TSVector out;
240
241 prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's
242 * number */
243 if (prs.lenwords < 2)
244 prs.lenwords = 2;
245 prs.curwords = 0;
246 prs.pos = 0;
247 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
248
249 parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
250
251 PG_FREE_IF_COPY(in, 1);
252
253 out = make_tsvector(&prs);
254
255 PG_RETURN_TSVECTOR(out);
256 }
257
258 Datum
to_tsvector(PG_FUNCTION_ARGS)259 to_tsvector(PG_FUNCTION_ARGS)
260 {
261 text *in = PG_GETARG_TEXT_PP(0);
262 Oid cfgId;
263
264 cfgId = getTSCurrentConfig(true);
265 PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
266 ObjectIdGetDatum(cfgId),
267 PointerGetDatum(in)));
268 }
269
270 /*
271 * Worker function for jsonb(_string)_to_tsvector(_byid)
272 */
273 static TSVector
jsonb_to_tsvector_worker(Oid cfgId,Jsonb * jb,uint32 flags)274 jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
275 {
276 TSVectorBuildState state;
277 ParsedText prs;
278
279 prs.words = NULL;
280 prs.curwords = 0;
281 state.prs = &prs;
282 state.cfgId = cfgId;
283
284 iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
285
286 return make_tsvector(&prs);
287 }
288
289 Datum
jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)290 jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
291 {
292 Oid cfgId = PG_GETARG_OID(0);
293 Jsonb *jb = PG_GETARG_JSONB_P(1);
294 TSVector result;
295
296 result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
297 PG_FREE_IF_COPY(jb, 1);
298
299 PG_RETURN_TSVECTOR(result);
300 }
301
302 Datum
jsonb_string_to_tsvector(PG_FUNCTION_ARGS)303 jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
304 {
305 Jsonb *jb = PG_GETARG_JSONB_P(0);
306 Oid cfgId;
307 TSVector result;
308
309 cfgId = getTSCurrentConfig(true);
310 result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
311 PG_FREE_IF_COPY(jb, 0);
312
313 PG_RETURN_TSVECTOR(result);
314 }
315
316 Datum
jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)317 jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
318 {
319 Oid cfgId = PG_GETARG_OID(0);
320 Jsonb *jb = PG_GETARG_JSONB_P(1);
321 Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
322 TSVector result;
323 uint32 flags = parse_jsonb_index_flags(jbFlags);
324
325 result = jsonb_to_tsvector_worker(cfgId, jb, flags);
326 PG_FREE_IF_COPY(jb, 1);
327 PG_FREE_IF_COPY(jbFlags, 2);
328
329 PG_RETURN_TSVECTOR(result);
330 }
331
332 Datum
jsonb_to_tsvector(PG_FUNCTION_ARGS)333 jsonb_to_tsvector(PG_FUNCTION_ARGS)
334 {
335 Jsonb *jb = PG_GETARG_JSONB_P(0);
336 Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
337 Oid cfgId;
338 TSVector result;
339 uint32 flags = parse_jsonb_index_flags(jbFlags);
340
341 cfgId = getTSCurrentConfig(true);
342 result = jsonb_to_tsvector_worker(cfgId, jb, flags);
343 PG_FREE_IF_COPY(jb, 0);
344 PG_FREE_IF_COPY(jbFlags, 1);
345
346 PG_RETURN_TSVECTOR(result);
347 }
348
349 /*
350 * Worker function for json(_string)_to_tsvector(_byid)
351 */
352 static TSVector
json_to_tsvector_worker(Oid cfgId,text * json,uint32 flags)353 json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
354 {
355 TSVectorBuildState state;
356 ParsedText prs;
357
358 prs.words = NULL;
359 prs.curwords = 0;
360 state.prs = &prs;
361 state.cfgId = cfgId;
362
363 iterate_json_values(json, flags, &state, add_to_tsvector);
364
365 return make_tsvector(&prs);
366 }
367
368 Datum
json_string_to_tsvector_byid(PG_FUNCTION_ARGS)369 json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
370 {
371 Oid cfgId = PG_GETARG_OID(0);
372 text *json = PG_GETARG_TEXT_P(1);
373 TSVector result;
374
375 result = json_to_tsvector_worker(cfgId, json, jtiString);
376 PG_FREE_IF_COPY(json, 1);
377
378 PG_RETURN_TSVECTOR(result);
379 }
380
381 Datum
json_string_to_tsvector(PG_FUNCTION_ARGS)382 json_string_to_tsvector(PG_FUNCTION_ARGS)
383 {
384 text *json = PG_GETARG_TEXT_P(0);
385 Oid cfgId;
386 TSVector result;
387
388 cfgId = getTSCurrentConfig(true);
389 result = json_to_tsvector_worker(cfgId, json, jtiString);
390 PG_FREE_IF_COPY(json, 0);
391
392 PG_RETURN_TSVECTOR(result);
393 }
394
395 Datum
json_to_tsvector_byid(PG_FUNCTION_ARGS)396 json_to_tsvector_byid(PG_FUNCTION_ARGS)
397 {
398 Oid cfgId = PG_GETARG_OID(0);
399 text *json = PG_GETARG_TEXT_P(1);
400 Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
401 TSVector result;
402 uint32 flags = parse_jsonb_index_flags(jbFlags);
403
404 result = json_to_tsvector_worker(cfgId, json, flags);
405 PG_FREE_IF_COPY(json, 1);
406 PG_FREE_IF_COPY(jbFlags, 2);
407
408 PG_RETURN_TSVECTOR(result);
409 }
410
411 Datum
json_to_tsvector(PG_FUNCTION_ARGS)412 json_to_tsvector(PG_FUNCTION_ARGS)
413 {
414 text *json = PG_GETARG_TEXT_P(0);
415 Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
416 Oid cfgId;
417 TSVector result;
418 uint32 flags = parse_jsonb_index_flags(jbFlags);
419
420 cfgId = getTSCurrentConfig(true);
421 result = json_to_tsvector_worker(cfgId, json, flags);
422 PG_FREE_IF_COPY(json, 0);
423 PG_FREE_IF_COPY(jbFlags, 1);
424
425 PG_RETURN_TSVECTOR(result);
426 }
427
428 /*
429 * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
430 */
431 static void
add_to_tsvector(void * _state,char * elem_value,int elem_len)432 add_to_tsvector(void *_state, char *elem_value, int elem_len)
433 {
434 TSVectorBuildState *state = (TSVectorBuildState *) _state;
435 ParsedText *prs = state->prs;
436 int32 prevwords;
437
438 if (prs->words == NULL)
439 {
440 /*
441 * First time through: initialize words array to a reasonable size.
442 * (parsetext() will realloc it bigger as needed.)
443 */
444 prs->lenwords = 16;
445 prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
446 prs->curwords = 0;
447 prs->pos = 0;
448 }
449
450 prevwords = prs->curwords;
451
452 parsetext(state->cfgId, prs, elem_value, elem_len);
453
454 /*
455 * If we extracted any words from this JSON element, advance pos to create
456 * an artificial break between elements. This is because we don't want
457 * phrase searches to think that the last word in this element is adjacent
458 * to the first word in the next one.
459 */
460 if (prs->curwords > prevwords)
461 prs->pos += 1;
462 }
463
464
465 /*
466 * to_tsquery
467 */
468
469
470 /*
471 * This function is used for morph parsing.
472 *
473 * The value is passed to parsetext which will call the right dictionary to
474 * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
475 * to the stack.
476 *
477 * All words belonging to the same variant are pushed as an ANDed list,
478 * and different variants are ORed together.
479 */
480 static void
pushval_morph(Datum opaque,TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)481 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
482 {
483 int32 count = 0;
484 ParsedText prs;
485 uint32 variant,
486 pos = 0,
487 cntvar = 0,
488 cntpos = 0,
489 cnt = 0;
490 MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
491
492 prs.lenwords = 4;
493 prs.curwords = 0;
494 prs.pos = 0;
495 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
496
497 parsetext(data->cfg_id, &prs, strval, lenval);
498
499 if (prs.curwords > 0)
500 {
501 while (count < prs.curwords)
502 {
503 /*
504 * Were any stop words removed? If so, fill empty positions with
505 * placeholders linked by an appropriate operator.
506 */
507 if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
508 {
509 while (pos + 1 < prs.words[count].pos.pos)
510 {
511 /* put placeholders for each missing stop word */
512 pushStop(state);
513 if (cntpos)
514 pushOperator(state, data->qoperator, 1);
515 cntpos++;
516 pos++;
517 }
518 }
519
520 /* save current word's position */
521 pos = prs.words[count].pos.pos;
522
523 /* Go through all variants obtained from this token */
524 cntvar = 0;
525 while (count < prs.curwords && pos == prs.words[count].pos.pos)
526 {
527 variant = prs.words[count].nvariant;
528
529 /* Push all words belonging to the same variant */
530 cnt = 0;
531 while (count < prs.curwords &&
532 pos == prs.words[count].pos.pos &&
533 variant == prs.words[count].nvariant)
534 {
535 pushValue(state,
536 prs.words[count].word,
537 prs.words[count].len,
538 weight,
539 ((prs.words[count].flags & TSL_PREFIX) || prefix));
540 pfree(prs.words[count].word);
541 if (cnt)
542 pushOperator(state, OP_AND, 0);
543 cnt++;
544 count++;
545 }
546
547 if (cntvar)
548 pushOperator(state, OP_OR, 0);
549 cntvar++;
550 }
551
552 if (cntpos)
553 {
554 /* distance may be useful */
555 pushOperator(state, data->qoperator, 1);
556 }
557
558 cntpos++;
559 }
560
561 pfree(prs.words);
562
563 }
564 else
565 pushStop(state);
566 }
567
568 Datum
to_tsquery_byid(PG_FUNCTION_ARGS)569 to_tsquery_byid(PG_FUNCTION_ARGS)
570 {
571 text *in = PG_GETARG_TEXT_PP(1);
572 TSQuery query;
573 MorphOpaque data;
574
575 data.cfg_id = PG_GETARG_OID(0);
576 data.qoperator = OP_AND;
577
578 query = parse_tsquery(text_to_cstring(in),
579 pushval_morph,
580 PointerGetDatum(&data),
581 0);
582
583 PG_RETURN_TSQUERY(query);
584 }
585
586 Datum
to_tsquery(PG_FUNCTION_ARGS)587 to_tsquery(PG_FUNCTION_ARGS)
588 {
589 text *in = PG_GETARG_TEXT_PP(0);
590 Oid cfgId;
591
592 cfgId = getTSCurrentConfig(true);
593 PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
594 ObjectIdGetDatum(cfgId),
595 PointerGetDatum(in)));
596 }
597
598 Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)599 plainto_tsquery_byid(PG_FUNCTION_ARGS)
600 {
601 text *in = PG_GETARG_TEXT_PP(1);
602 TSQuery query;
603 MorphOpaque data;
604
605 data.cfg_id = PG_GETARG_OID(0);
606 data.qoperator = OP_AND;
607
608 query = parse_tsquery(text_to_cstring(in),
609 pushval_morph,
610 PointerGetDatum(&data),
611 P_TSQ_PLAIN);
612
613 PG_RETURN_POINTER(query);
614 }
615
616 Datum
plainto_tsquery(PG_FUNCTION_ARGS)617 plainto_tsquery(PG_FUNCTION_ARGS)
618 {
619 text *in = PG_GETARG_TEXT_PP(0);
620 Oid cfgId;
621
622 cfgId = getTSCurrentConfig(true);
623 PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
624 ObjectIdGetDatum(cfgId),
625 PointerGetDatum(in)));
626 }
627
628
629 Datum
phraseto_tsquery_byid(PG_FUNCTION_ARGS)630 phraseto_tsquery_byid(PG_FUNCTION_ARGS)
631 {
632 text *in = PG_GETARG_TEXT_PP(1);
633 TSQuery query;
634 MorphOpaque data;
635
636 data.cfg_id = PG_GETARG_OID(0);
637 data.qoperator = OP_PHRASE;
638
639 query = parse_tsquery(text_to_cstring(in),
640 pushval_morph,
641 PointerGetDatum(&data),
642 P_TSQ_PLAIN);
643
644 PG_RETURN_TSQUERY(query);
645 }
646
647 Datum
phraseto_tsquery(PG_FUNCTION_ARGS)648 phraseto_tsquery(PG_FUNCTION_ARGS)
649 {
650 text *in = PG_GETARG_TEXT_PP(0);
651 Oid cfgId;
652
653 cfgId = getTSCurrentConfig(true);
654 PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
655 ObjectIdGetDatum(cfgId),
656 PointerGetDatum(in)));
657 }
658
659 Datum
websearch_to_tsquery_byid(PG_FUNCTION_ARGS)660 websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
661 {
662 text *in = PG_GETARG_TEXT_PP(1);
663 MorphOpaque data;
664 TSQuery query = NULL;
665
666 data.cfg_id = PG_GETARG_OID(0);
667
668 data.qoperator = OP_AND;
669
670 query = parse_tsquery(text_to_cstring(in),
671 pushval_morph,
672 PointerGetDatum(&data),
673 P_TSQ_WEB);
674
675 PG_RETURN_TSQUERY(query);
676 }
677
678 Datum
websearch_to_tsquery(PG_FUNCTION_ARGS)679 websearch_to_tsquery(PG_FUNCTION_ARGS)
680 {
681 text *in = PG_GETARG_TEXT_PP(0);
682 Oid cfgId;
683
684 cfgId = getTSCurrentConfig(true);
685 PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
686 ObjectIdGetDatum(cfgId),
687 PointerGetDatum(in)));
688
689 }
690