1 /*-------------------------------------------------------------------------
2  *
3  * to_tsany.c
4  *		to_ts* function definitions
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/to_tsany.c
11  *
12  *-------------------------------------------------------------------------
13  */
#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/jsonapi.h"
#include "utils/memutils.h"
20 
21 
/*
 * Opaque data handed to pushval_morph() through parse_tsquery().
 */
typedef struct MorphOpaque
{
	Oid			cfg_id;			/* configuration OID passed to parsetext() */
	int			qoperator;		/* query operator used by pushval_morph() to
								 * join successive word positions (OP_AND or
								 * OP_PHRASE in this file) */
} MorphOpaque;
27 
/*
 * Callback state used by add_to_tsvector() while json(b) values are iterated.
 */
typedef struct TSVectorBuildState
{
	ParsedText *prs;			/* accumulates the words of all elements */
	Oid			cfgId;			/* configuration OID passed to parsetext() */
} TSVectorBuildState;
33 
/* Callback handed to iterate_jsonb_values() and iterate_json_values(). */
static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
35 
36 
37 Datum
38 get_current_ts_config(PG_FUNCTION_ARGS)
39 {
40 	PG_RETURN_OID(getTSCurrentConfig(true));
41 }
42 
43 /*
44  * to_tsvector
45  */
46 static int
47 compareWORD(const void *a, const void *b)
48 {
49 	int			res;
50 
51 	res = tsCompareString(
52 						  ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
53 						  ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
54 						  false);
55 
56 	if (res == 0)
57 	{
58 		if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
59 			return 0;
60 
61 		res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
62 	}
63 
64 	return res;
65 }
66 
67 static int
68 uniqueWORD(ParsedWord *a, int32 l)
69 {
70 	ParsedWord *ptr,
71 			   *res;
72 	int			tmppos;
73 
74 	if (l == 1)
75 	{
76 		tmppos = LIMITPOS(a->pos.pos);
77 		a->alen = 2;
78 		a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
79 		a->pos.apos[0] = 1;
80 		a->pos.apos[1] = tmppos;
81 		return l;
82 	}
83 
84 	res = a;
85 	ptr = a + 1;
86 
87 	/*
88 	 * Sort words with its positions
89 	 */
90 	qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
91 
92 	/*
93 	 * Initialize first word and its first position
94 	 */
95 	tmppos = LIMITPOS(a->pos.pos);
96 	a->alen = 2;
97 	a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
98 	a->pos.apos[0] = 1;
99 	a->pos.apos[1] = tmppos;
100 
101 	/*
102 	 * Summarize position information for each word
103 	 */
104 	while (ptr - a < l)
105 	{
106 		if (!(ptr->len == res->len &&
107 			  strncmp(ptr->word, res->word, res->len) == 0))
108 		{
109 			/*
110 			 * Got a new word, so put it in result
111 			 */
112 			res++;
113 			res->len = ptr->len;
114 			res->word = ptr->word;
115 			tmppos = LIMITPOS(ptr->pos.pos);
116 			res->alen = 2;
117 			res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
118 			res->pos.apos[0] = 1;
119 			res->pos.apos[1] = tmppos;
120 		}
121 		else
122 		{
123 			/*
124 			 * The word already exists, so adjust position information. But
125 			 * before we should check size of position's array, max allowed
126 			 * value for position and uniqueness of position
127 			 */
128 			pfree(ptr->word);
129 			if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
130 				res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
131 			{
132 				if (res->pos.apos[0] + 1 >= res->alen)
133 				{
134 					res->alen *= 2;
135 					res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
136 				}
137 				if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
138 				{
139 					res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
140 					res->pos.apos[0]++;
141 				}
142 			}
143 		}
144 		ptr++;
145 	}
146 
147 	return res + 1 - a;
148 }
149 
150 /*
151  * make value of tsvector, given parsed text
152  *
153  * Note: frees prs->words and subsidiary data.
154  */
155 TSVector
156 make_tsvector(ParsedText *prs)
157 {
158 	int			i,
159 				j,
160 				lenstr = 0,
161 				totallen;
162 	TSVector	in;
163 	WordEntry  *ptr;
164 	char	   *str;
165 	int			stroff;
166 
167 	/* Merge duplicate words */
168 	if (prs->curwords > 0)
169 		prs->curwords = uniqueWORD(prs->words, prs->curwords);
170 
171 	/* Determine space needed */
172 	for (i = 0; i < prs->curwords; i++)
173 	{
174 		lenstr += prs->words[i].len;
175 		if (prs->words[i].alen)
176 		{
177 			lenstr = SHORTALIGN(lenstr);
178 			lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
179 		}
180 	}
181 
182 	if (lenstr > MAXSTRPOS)
183 		ereport(ERROR,
184 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
185 				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
186 
187 	totallen = CALCDATASIZE(prs->curwords, lenstr);
188 	in = (TSVector) palloc0(totallen);
189 	SET_VARSIZE(in, totallen);
190 	in->size = prs->curwords;
191 
192 	ptr = ARRPTR(in);
193 	str = STRPTR(in);
194 	stroff = 0;
195 	for (i = 0; i < prs->curwords; i++)
196 	{
197 		ptr->len = prs->words[i].len;
198 		ptr->pos = stroff;
199 		memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
200 		stroff += prs->words[i].len;
201 		pfree(prs->words[i].word);
202 		if (prs->words[i].alen)
203 		{
204 			int			k = prs->words[i].pos.apos[0];
205 			WordEntryPos *wptr;
206 
207 			if (k > 0xFFFF)
208 				elog(ERROR, "positions array too long");
209 
210 			ptr->haspos = 1;
211 			stroff = SHORTALIGN(stroff);
212 			*(uint16 *) (str + stroff) = (uint16) k;
213 			wptr = POSDATAPTR(in, ptr);
214 			for (j = 0; j < k; j++)
215 			{
216 				WEP_SETWEIGHT(wptr[j], 0);
217 				WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
218 			}
219 			stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
220 			pfree(prs->words[i].pos.apos);
221 		}
222 		else
223 			ptr->haspos = 0;
224 		ptr++;
225 	}
226 
227 	if (prs->words)
228 		pfree(prs->words);
229 
230 	return in;
231 }
232 
233 Datum
234 to_tsvector_byid(PG_FUNCTION_ARGS)
235 {
236 	Oid			cfgId = PG_GETARG_OID(0);
237 	text	   *in = PG_GETARG_TEXT_PP(1);
238 	ParsedText	prs;
239 	TSVector	out;
240 
241 	prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6;	/* just estimation of word's
242 												 * number */
243 	if (prs.lenwords < 2)
244 		prs.lenwords = 2;
245 	prs.curwords = 0;
246 	prs.pos = 0;
247 	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
248 
249 	parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
250 
251 	PG_FREE_IF_COPY(in, 1);
252 
253 	out = make_tsvector(&prs);
254 
255 	PG_RETURN_TSVECTOR(out);
256 }
257 
258 Datum
259 to_tsvector(PG_FUNCTION_ARGS)
260 {
261 	text	   *in = PG_GETARG_TEXT_PP(0);
262 	Oid			cfgId;
263 
264 	cfgId = getTSCurrentConfig(true);
265 	PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
266 										ObjectIdGetDatum(cfgId),
267 										PointerGetDatum(in)));
268 }
269 
270 /*
271  * Worker function for jsonb(_string)_to_tsvector(_byid)
272  */
273 static TSVector
274 jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
275 {
276 	TSVectorBuildState state;
277 	ParsedText	prs;
278 
279 	prs.words = NULL;
280 	prs.curwords = 0;
281 	state.prs = &prs;
282 	state.cfgId = cfgId;
283 
284 	iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
285 
286 	return make_tsvector(&prs);
287 }
288 
289 Datum
290 jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
291 {
292 	Oid			cfgId = PG_GETARG_OID(0);
293 	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
294 	TSVector	result;
295 
296 	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
297 	PG_FREE_IF_COPY(jb, 1);
298 
299 	PG_RETURN_TSVECTOR(result);
300 }
301 
302 Datum
303 jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
304 {
305 	Jsonb	   *jb = PG_GETARG_JSONB_P(0);
306 	Oid			cfgId;
307 	TSVector	result;
308 
309 	cfgId = getTSCurrentConfig(true);
310 	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
311 	PG_FREE_IF_COPY(jb, 0);
312 
313 	PG_RETURN_TSVECTOR(result);
314 }
315 
316 Datum
317 jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
318 {
319 	Oid			cfgId = PG_GETARG_OID(0);
320 	Jsonb	   *jb = PG_GETARG_JSONB_P(1);
321 	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(2);
322 	TSVector	result;
323 	uint32		flags = parse_jsonb_index_flags(jbFlags);
324 
325 	result = jsonb_to_tsvector_worker(cfgId, jb, flags);
326 	PG_FREE_IF_COPY(jb, 1);
327 	PG_FREE_IF_COPY(jbFlags, 2);
328 
329 	PG_RETURN_TSVECTOR(result);
330 }
331 
332 Datum
333 jsonb_to_tsvector(PG_FUNCTION_ARGS)
334 {
335 	Jsonb	   *jb = PG_GETARG_JSONB_P(0);
336 	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(1);
337 	Oid			cfgId;
338 	TSVector	result;
339 	uint32		flags = parse_jsonb_index_flags(jbFlags);
340 
341 	cfgId = getTSCurrentConfig(true);
342 	result = jsonb_to_tsvector_worker(cfgId, jb, flags);
343 	PG_FREE_IF_COPY(jb, 0);
344 	PG_FREE_IF_COPY(jbFlags, 1);
345 
346 	PG_RETURN_TSVECTOR(result);
347 }
348 
349 /*
350  * Worker function for json(_string)_to_tsvector(_byid)
351  */
352 static TSVector
353 json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
354 {
355 	TSVectorBuildState state;
356 	ParsedText	prs;
357 
358 	prs.words = NULL;
359 	prs.curwords = 0;
360 	state.prs = &prs;
361 	state.cfgId = cfgId;
362 
363 	iterate_json_values(json, flags, &state, add_to_tsvector);
364 
365 	return make_tsvector(&prs);
366 }
367 
368 Datum
369 json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
370 {
371 	Oid			cfgId = PG_GETARG_OID(0);
372 	text	   *json = PG_GETARG_TEXT_P(1);
373 	TSVector	result;
374 
375 	result = json_to_tsvector_worker(cfgId, json, jtiString);
376 	PG_FREE_IF_COPY(json, 1);
377 
378 	PG_RETURN_TSVECTOR(result);
379 }
380 
381 Datum
382 json_string_to_tsvector(PG_FUNCTION_ARGS)
383 {
384 	text	   *json = PG_GETARG_TEXT_P(0);
385 	Oid			cfgId;
386 	TSVector	result;
387 
388 	cfgId = getTSCurrentConfig(true);
389 	result = json_to_tsvector_worker(cfgId, json, jtiString);
390 	PG_FREE_IF_COPY(json, 0);
391 
392 	PG_RETURN_TSVECTOR(result);
393 }
394 
395 Datum
396 json_to_tsvector_byid(PG_FUNCTION_ARGS)
397 {
398 	Oid			cfgId = PG_GETARG_OID(0);
399 	text	   *json = PG_GETARG_TEXT_P(1);
400 	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(2);
401 	TSVector	result;
402 	uint32		flags = parse_jsonb_index_flags(jbFlags);
403 
404 	result = json_to_tsvector_worker(cfgId, json, flags);
405 	PG_FREE_IF_COPY(json, 1);
406 	PG_FREE_IF_COPY(jbFlags, 2);
407 
408 	PG_RETURN_TSVECTOR(result);
409 }
410 
411 Datum
412 json_to_tsvector(PG_FUNCTION_ARGS)
413 {
414 	text	   *json = PG_GETARG_TEXT_P(0);
415 	Jsonb	   *jbFlags = PG_GETARG_JSONB_P(1);
416 	Oid			cfgId;
417 	TSVector	result;
418 	uint32		flags = parse_jsonb_index_flags(jbFlags);
419 
420 	cfgId = getTSCurrentConfig(true);
421 	result = json_to_tsvector_worker(cfgId, json, flags);
422 	PG_FREE_IF_COPY(json, 0);
423 	PG_FREE_IF_COPY(jbFlags, 1);
424 
425 	PG_RETURN_TSVECTOR(result);
426 }
427 
428 /*
429  * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
430  */
431 static void
432 add_to_tsvector(void *_state, char *elem_value, int elem_len)
433 {
434 	TSVectorBuildState *state = (TSVectorBuildState *) _state;
435 	ParsedText *prs = state->prs;
436 	int32		prevwords;
437 
438 	if (prs->words == NULL)
439 	{
440 		/*
441 		 * First time through: initialize words array to a reasonable size.
442 		 * (parsetext() will realloc it bigger as needed.)
443 		 */
444 		prs->lenwords = 16;
445 		prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
446 		prs->curwords = 0;
447 		prs->pos = 0;
448 	}
449 
450 	prevwords = prs->curwords;
451 
452 	parsetext(state->cfgId, prs, elem_value, elem_len);
453 
454 	/*
455 	 * If we extracted any words from this JSON element, advance pos to create
456 	 * an artificial break between elements.  This is because we don't want
457 	 * phrase searches to think that the last word in this element is adjacent
458 	 * to the first word in the next one.
459 	 */
460 	if (prs->curwords > prevwords)
461 		prs->pos += 1;
462 }
463 
464 
465 /*
466  * to_tsquery
467  */
468 
469 
470 /*
471  * This function is used for morph parsing.
472  *
473  * The value is passed to parsetext which will call the right dictionary to
474  * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
475  * to the stack.
476  *
477  * All words belonging to the same variant are pushed as an ANDed list,
478  * and different variants are ORed together.
479  */
480 static void
481 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
482 {
483 	int32		count = 0;
484 	ParsedText	prs;
485 	uint32		variant,
486 				pos = 0,
487 				cntvar = 0,
488 				cntpos = 0,
489 				cnt = 0;
490 	MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
491 
492 	prs.lenwords = 4;
493 	prs.curwords = 0;
494 	prs.pos = 0;
495 	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
496 
497 	parsetext(data->cfg_id, &prs, strval, lenval);
498 
499 	if (prs.curwords > 0)
500 	{
501 		while (count < prs.curwords)
502 		{
503 			/*
504 			 * Were any stop words removed? If so, fill empty positions with
505 			 * placeholders linked by an appropriate operator.
506 			 */
507 			if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
508 			{
509 				while (pos + 1 < prs.words[count].pos.pos)
510 				{
511 					/* put placeholders for each missing stop word */
512 					pushStop(state);
513 					if (cntpos)
514 						pushOperator(state, data->qoperator, 1);
515 					cntpos++;
516 					pos++;
517 				}
518 			}
519 
520 			/* save current word's position */
521 			pos = prs.words[count].pos.pos;
522 
523 			/* Go through all variants obtained from this token */
524 			cntvar = 0;
525 			while (count < prs.curwords && pos == prs.words[count].pos.pos)
526 			{
527 				variant = prs.words[count].nvariant;
528 
529 				/* Push all words belonging to the same variant */
530 				cnt = 0;
531 				while (count < prs.curwords &&
532 					   pos == prs.words[count].pos.pos &&
533 					   variant == prs.words[count].nvariant)
534 				{
535 					pushValue(state,
536 							  prs.words[count].word,
537 							  prs.words[count].len,
538 							  weight,
539 							  ((prs.words[count].flags & TSL_PREFIX) || prefix));
540 					pfree(prs.words[count].word);
541 					if (cnt)
542 						pushOperator(state, OP_AND, 0);
543 					cnt++;
544 					count++;
545 				}
546 
547 				if (cntvar)
548 					pushOperator(state, OP_OR, 0);
549 				cntvar++;
550 			}
551 
552 			if (cntpos)
553 			{
554 				/* distance may be useful */
555 				pushOperator(state, data->qoperator, 1);
556 			}
557 
558 			cntpos++;
559 		}
560 
561 		pfree(prs.words);
562 
563 	}
564 	else
565 		pushStop(state);
566 }
567 
568 Datum
569 to_tsquery_byid(PG_FUNCTION_ARGS)
570 {
571 	text	   *in = PG_GETARG_TEXT_PP(1);
572 	TSQuery		query;
573 	MorphOpaque data;
574 
575 	data.cfg_id = PG_GETARG_OID(0);
576 	data.qoperator = OP_AND;
577 
578 	query = parse_tsquery(text_to_cstring(in),
579 						  pushval_morph,
580 						  PointerGetDatum(&data),
581 						  0);
582 
583 	PG_RETURN_TSQUERY(query);
584 }
585 
586 Datum
587 to_tsquery(PG_FUNCTION_ARGS)
588 {
589 	text	   *in = PG_GETARG_TEXT_PP(0);
590 	Oid			cfgId;
591 
592 	cfgId = getTSCurrentConfig(true);
593 	PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
594 										ObjectIdGetDatum(cfgId),
595 										PointerGetDatum(in)));
596 }
597 
598 Datum
599 plainto_tsquery_byid(PG_FUNCTION_ARGS)
600 {
601 	text	   *in = PG_GETARG_TEXT_PP(1);
602 	TSQuery		query;
603 	MorphOpaque data;
604 
605 	data.cfg_id = PG_GETARG_OID(0);
606 	data.qoperator = OP_AND;
607 
608 	query = parse_tsquery(text_to_cstring(in),
609 						  pushval_morph,
610 						  PointerGetDatum(&data),
611 						  P_TSQ_PLAIN);
612 
613 	PG_RETURN_POINTER(query);
614 }
615 
616 Datum
617 plainto_tsquery(PG_FUNCTION_ARGS)
618 {
619 	text	   *in = PG_GETARG_TEXT_PP(0);
620 	Oid			cfgId;
621 
622 	cfgId = getTSCurrentConfig(true);
623 	PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
624 										ObjectIdGetDatum(cfgId),
625 										PointerGetDatum(in)));
626 }
627 
628 
629 Datum
630 phraseto_tsquery_byid(PG_FUNCTION_ARGS)
631 {
632 	text	   *in = PG_GETARG_TEXT_PP(1);
633 	TSQuery		query;
634 	MorphOpaque data;
635 
636 	data.cfg_id = PG_GETARG_OID(0);
637 	data.qoperator = OP_PHRASE;
638 
639 	query = parse_tsquery(text_to_cstring(in),
640 						  pushval_morph,
641 						  PointerGetDatum(&data),
642 						  P_TSQ_PLAIN);
643 
644 	PG_RETURN_TSQUERY(query);
645 }
646 
647 Datum
648 phraseto_tsquery(PG_FUNCTION_ARGS)
649 {
650 	text	   *in = PG_GETARG_TEXT_PP(0);
651 	Oid			cfgId;
652 
653 	cfgId = getTSCurrentConfig(true);
654 	PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
655 										ObjectIdGetDatum(cfgId),
656 										PointerGetDatum(in)));
657 }
658 
659 Datum
660 websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
661 {
662 	text	   *in = PG_GETARG_TEXT_PP(1);
663 	MorphOpaque data;
664 	TSQuery		query = NULL;
665 
666 	data.cfg_id = PG_GETARG_OID(0);
667 
668 	data.qoperator = OP_AND;
669 
670 	query = parse_tsquery(text_to_cstring(in),
671 						  pushval_morph,
672 						  PointerGetDatum(&data),
673 						  P_TSQ_WEB);
674 
675 	PG_RETURN_TSQUERY(query);
676 }
677 
678 Datum
679 websearch_to_tsquery(PG_FUNCTION_ARGS)
680 {
681 	text	   *in = PG_GETARG_TEXT_PP(0);
682 	Oid			cfgId;
683 
684 	cfgId = getTSCurrentConfig(true);
685 	PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
686 										ObjectIdGetDatum(cfgId),
687 										PointerGetDatum(in)));
688 
689 }
690