1 /*
2  * contrib/pg_trgm/trgm_op.c
3  */
4 #include "postgres.h"
5 
6 #include <ctype.h>
7 
8 #include "catalog/pg_type.h"
9 #include "lib/qunique.h"
10 #include "trgm.h"
11 #include "tsearch/ts_locale.h"
12 #include "utils/lsyscache.h"
13 #include "utils/memutils.h"
14 #include "utils/pg_crc.h"
15 
16 PG_MODULE_MAGIC;
17 
18 /* GUC variables */
19 double		similarity_threshold = 0.3f;
20 double		word_similarity_threshold = 0.6f;
21 double		strict_word_similarity_threshold = 0.5f;
22 
23 void		_PG_init(void);
24 
25 PG_FUNCTION_INFO_V1(set_limit);
26 PG_FUNCTION_INFO_V1(show_limit);
27 PG_FUNCTION_INFO_V1(show_trgm);
28 PG_FUNCTION_INFO_V1(similarity);
29 PG_FUNCTION_INFO_V1(word_similarity);
30 PG_FUNCTION_INFO_V1(strict_word_similarity);
31 PG_FUNCTION_INFO_V1(similarity_dist);
32 PG_FUNCTION_INFO_V1(similarity_op);
33 PG_FUNCTION_INFO_V1(word_similarity_op);
34 PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
35 PG_FUNCTION_INFO_V1(word_similarity_dist_op);
36 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
37 PG_FUNCTION_INFO_V1(strict_word_similarity_op);
38 PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
39 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
40 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
41 
42 /* Trigram with position */
43 typedef struct
44 {
45 	trgm		trg;
46 	int			index;
47 } pos_trgm;
48 
49 /* Trigram bound type */
50 typedef uint8 TrgmBound;
51 #define TRGM_BOUND_LEFT				0x01	/* trigram is left bound of word */
52 #define TRGM_BOUND_RIGHT			0x02	/* trigram is right bound of word */
53 
54 /* Word similarity flags */
55 #define WORD_SIMILARITY_CHECK_ONLY	0x01	/* only check existence of similar
56 											 * search pattern in text */
57 #define WORD_SIMILARITY_STRICT		0x02	/* force bounds of extent to match
58 											 * word bounds */
59 
60 /*
61  * Module load callback
62  */
63 void
_PG_init(void)64 _PG_init(void)
65 {
66 	/* Define custom GUC variables. */
67 	DefineCustomRealVariable("pg_trgm.similarity_threshold",
68 							 "Sets the threshold used by the % operator.",
69 							 "Valid range is 0.0 .. 1.0.",
70 							 &similarity_threshold,
71 							 0.3,
72 							 0.0,
73 							 1.0,
74 							 PGC_USERSET,
75 							 0,
76 							 NULL,
77 							 NULL,
78 							 NULL);
79 	DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
80 							 "Sets the threshold used by the <% operator.",
81 							 "Valid range is 0.0 .. 1.0.",
82 							 &word_similarity_threshold,
83 							 0.6,
84 							 0.0,
85 							 1.0,
86 							 PGC_USERSET,
87 							 0,
88 							 NULL,
89 							 NULL,
90 							 NULL);
91 	DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
92 							 "Sets the threshold used by the <<% operator.",
93 							 "Valid range is 0.0 .. 1.0.",
94 							 &strict_word_similarity_threshold,
95 							 0.5,
96 							 0.0,
97 							 1.0,
98 							 PGC_USERSET,
99 							 0,
100 							 NULL,
101 							 NULL,
102 							 NULL);
103 }
104 
105 /*
106  * Deprecated function.
107  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
108  */
109 Datum
set_limit(PG_FUNCTION_ARGS)110 set_limit(PG_FUNCTION_ARGS)
111 {
112 	float4		nlimit = PG_GETARG_FLOAT4(0);
113 	char	   *nlimit_str;
114 	Oid			func_out_oid;
115 	bool		is_varlena;
116 
117 	getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
118 
119 	nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
120 
121 	SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
122 					PGC_USERSET, PGC_S_SESSION);
123 
124 	PG_RETURN_FLOAT4(similarity_threshold);
125 }
126 
127 
128 /*
129  * Get similarity threshold for given index scan strategy number.
130  */
131 double
index_strategy_get_limit(StrategyNumber strategy)132 index_strategy_get_limit(StrategyNumber strategy)
133 {
134 	switch (strategy)
135 	{
136 		case SimilarityStrategyNumber:
137 			return similarity_threshold;
138 		case WordSimilarityStrategyNumber:
139 			return word_similarity_threshold;
140 		case StrictWordSimilarityStrategyNumber:
141 			return strict_word_similarity_threshold;
142 		default:
143 			elog(ERROR, "unrecognized strategy number: %d", strategy);
144 			break;
145 	}
146 
147 	return 0.0;					/* keep compiler quiet */
148 }
149 
150 /*
151  * Deprecated function.
152  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
153  */
154 Datum
show_limit(PG_FUNCTION_ARGS)155 show_limit(PG_FUNCTION_ARGS)
156 {
157 	PG_RETURN_FLOAT4(similarity_threshold);
158 }
159 
160 static int
comp_trgm(const void * a,const void * b)161 comp_trgm(const void *a, const void *b)
162 {
163 	return CMPTRGM(a, b);
164 }
165 
166 /*
167  * Finds first word in string, returns pointer to the word,
168  * endword points to the character after word
169  */
170 static char *
find_word(char * str,int lenstr,char ** endword,int * charlen)171 find_word(char *str, int lenstr, char **endword, int *charlen)
172 {
173 	char	   *beginword = str;
174 
175 	while (beginword - str < lenstr && !ISWORDCHR(beginword))
176 		beginword += pg_mblen(beginword);
177 
178 	if (beginword - str >= lenstr)
179 		return NULL;
180 
181 	*endword = beginword;
182 	*charlen = 0;
183 	while (*endword - str < lenstr && ISWORDCHR(*endword))
184 	{
185 		*endword += pg_mblen(*endword);
186 		(*charlen)++;
187 	}
188 
189 	return beginword;
190 }
191 
192 /*
193  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
194  * which is always exactly three bytes.  If we have three single-byte
195  * characters, we just use them as-is; otherwise we form a hash value.
196  */
197 void
compact_trigram(trgm * tptr,char * str,int bytelen)198 compact_trigram(trgm *tptr, char *str, int bytelen)
199 {
200 	if (bytelen == 3)
201 	{
202 		CPTRGM(tptr, str);
203 	}
204 	else
205 	{
206 		pg_crc32	crc;
207 
208 		INIT_LEGACY_CRC32(crc);
209 		COMP_LEGACY_CRC32(crc, str, bytelen);
210 		FIN_LEGACY_CRC32(crc);
211 
212 		/*
213 		 * use only 3 upper bytes from crc, hope, it's good enough hashing
214 		 */
215 		CPTRGM(tptr, &crc);
216 	}
217 }
218 
219 /*
220  * Adds trigrams from words (already padded).
221  */
222 static trgm *
make_trigrams(trgm * tptr,char * str,int bytelen,int charlen)223 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
224 {
225 	char	   *ptr = str;
226 
227 	if (charlen < 3)
228 		return tptr;
229 
230 	if (bytelen > charlen)
231 	{
232 		/* Find multibyte character boundaries and apply compact_trigram */
233 		int			lenfirst = pg_mblen(str),
234 					lenmiddle = pg_mblen(str + lenfirst),
235 					lenlast = pg_mblen(str + lenfirst + lenmiddle);
236 
237 		while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
238 		{
239 			compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
240 
241 			ptr += lenfirst;
242 			tptr++;
243 
244 			lenfirst = lenmiddle;
245 			lenmiddle = lenlast;
246 			lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
247 		}
248 	}
249 	else
250 	{
251 		/* Fast path when there are no multibyte characters */
252 		Assert(bytelen == charlen);
253 
254 		while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
255 		{
256 			CPTRGM(tptr, ptr);
257 			ptr++;
258 			tptr++;
259 		}
260 	}
261 
262 	return tptr;
263 }
264 
265 /*
266  * Make array of trigrams without sorting and removing duplicate items.
267  *
268  * trg: where to return the array of trigrams.
269  * str: source string, of length slen bytes.
270  * bounds: where to return bounds of trigrams (if needed).
271  *
272  * Returns length of the generated array.
273  */
274 static int
generate_trgm_only(trgm * trg,char * str,int slen,TrgmBound * bounds)275 generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
276 {
277 	trgm	   *tptr;
278 	char	   *buf;
279 	int			charlen,
280 				bytelen;
281 	char	   *bword,
282 			   *eword;
283 
284 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
285 		return 0;
286 
287 	tptr = trg;
288 
289 	/* Allocate a buffer for case-folded, blank-padded words */
290 	buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
291 
292 	if (LPADDING > 0)
293 	{
294 		*buf = ' ';
295 		if (LPADDING > 1)
296 			*(buf + 1) = ' ';
297 	}
298 
299 	eword = str;
300 	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
301 	{
302 #ifdef IGNORECASE
303 		bword = lowerstr_with_len(bword, eword - bword);
304 		bytelen = strlen(bword);
305 #else
306 		bytelen = eword - bword;
307 #endif
308 
309 		memcpy(buf + LPADDING, bword, bytelen);
310 
311 #ifdef IGNORECASE
312 		pfree(bword);
313 #endif
314 
315 		buf[LPADDING + bytelen] = ' ';
316 		buf[LPADDING + bytelen + 1] = ' ';
317 
318 		/* Calculate trigrams marking their bounds if needed */
319 		if (bounds)
320 			bounds[tptr - trg] |= TRGM_BOUND_LEFT;
321 		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
322 							 charlen + LPADDING + RPADDING);
323 		if (bounds)
324 			bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
325 	}
326 
327 	pfree(buf);
328 
329 	return tptr - trg;
330 }
331 
332 /*
333  * Guard against possible overflow in the palloc requests below.  (We
334  * don't worry about the additive constants, since palloc can detect
335  * requests that are a little above MaxAllocSize --- we just need to
336  * prevent integer overflow in the multiplications.)
337  */
338 static void
protect_out_of_mem(int slen)339 protect_out_of_mem(int slen)
340 {
341 	if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
342 		(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
343 		ereport(ERROR,
344 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
345 				 errmsg("out of memory")));
346 }
347 
348 /*
349  * Make array of trigrams with sorting and removing duplicate items.
350  *
351  * str: source string, of length slen bytes.
352  *
353  * Returns the sorted array of unique trigrams.
354  */
355 TRGM *
generate_trgm(char * str,int slen)356 generate_trgm(char *str, int slen)
357 {
358 	TRGM	   *trg;
359 	int			len;
360 
361 	protect_out_of_mem(slen);
362 
363 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
364 	trg->flag = ARRKEY;
365 
366 	len = generate_trgm_only(GETARR(trg), str, slen, NULL);
367 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
368 
369 	if (len == 0)
370 		return trg;
371 
372 	/*
373 	 * Make trigrams unique.
374 	 */
375 	if (len > 1)
376 	{
377 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
378 		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
379 	}
380 
381 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
382 
383 	return trg;
384 }
385 
386 /*
387  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
388  *
389  * trg1: trigram array of search pattern, of length len1. trg1 is required
390  *		 word which positions don't matter and replaced with -1.
391  * trg2: trigram array of text, of length len2. trg2 is haystack where we
392  *		 search and have to store its positions.
393  *
394  * Returns concatenated trigram array.
395  */
396 static pos_trgm *
make_positional_trgm(trgm * trg1,int len1,trgm * trg2,int len2)397 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
398 {
399 	pos_trgm   *result;
400 	int			i,
401 				len = len1 + len2;
402 
403 	result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
404 
405 	for (i = 0; i < len1; i++)
406 	{
407 		memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
408 		result[i].index = -1;
409 	}
410 
411 	for (i = 0; i < len2; i++)
412 	{
413 		memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
414 		result[i + len1].index = i;
415 	}
416 
417 	return result;
418 }
419 
420 /*
421  * Compare position trigrams: compare trigrams first and position second.
422  */
423 static int
comp_ptrgm(const void * v1,const void * v2)424 comp_ptrgm(const void *v1, const void *v2)
425 {
426 	const pos_trgm *p1 = (const pos_trgm *) v1;
427 	const pos_trgm *p2 = (const pos_trgm *) v2;
428 	int			cmp;
429 
430 	cmp = CMPTRGM(p1->trg, p2->trg);
431 	if (cmp != 0)
432 		return cmp;
433 
434 	if (p1->index < p2->index)
435 		return -1;
436 	else if (p1->index == p2->index)
437 		return 0;
438 	else
439 		return 1;
440 }
441 
442 /*
443  * Iterative search function which calculates maximum similarity with word in
444  * the string. But maximum similarity is calculated only if check_only == false.
445  *
446  * trg2indexes: array which stores indexes of the array "found".
447  * found: array which stores true of false values.
448  * ulen1: count of unique trigrams of array "trg1".
449  * len2: length of array "trg2" and array "trg2indexes".
450  * len: length of the array "found".
451  * lags: set of boolean flags parameterizing similarity calculation.
452  * bounds: whether each trigram is left/right bound of word.
453  *
454  * Returns word similarity.
455  */
456 static float4
iterate_word_similarity(int * trg2indexes,bool * found,int ulen1,int len2,int len,uint8 flags,TrgmBound * bounds)457 iterate_word_similarity(int *trg2indexes,
458 						bool *found,
459 						int ulen1,
460 						int len2,
461 						int len,
462 						uint8 flags,
463 						TrgmBound *bounds)
464 {
465 	int		   *lastpos,
466 				i,
467 				ulen2 = 0,
468 				count = 0,
469 				upper = -1,
470 				lower;
471 	float4		smlr_cur,
472 				smlr_max = 0.0f;
473 	double		threshold;
474 
475 	Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
476 
477 	/* Select appropriate threshold */
478 	threshold = (flags & WORD_SIMILARITY_STRICT) ?
479 		strict_word_similarity_threshold :
480 		word_similarity_threshold;
481 
482 	/*
483 	 * Consider first trigram as initial lower bound for strict word
484 	 * similarity, or initialize it later with first trigram present for plain
485 	 * word similarity.
486 	 */
487 	lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
488 
489 	/* Memorise last position of each trigram */
490 	lastpos = (int *) palloc(sizeof(int) * len);
491 	memset(lastpos, -1, sizeof(int) * len);
492 
493 	for (i = 0; i < len2; i++)
494 	{
495 		/* Get index of next trigram */
496 		int			trgindex = trg2indexes[i];
497 
498 		/* Update last position of this trigram */
499 		if (lower >= 0 || found[trgindex])
500 		{
501 			if (lastpos[trgindex] < 0)
502 			{
503 				ulen2++;
504 				if (found[trgindex])
505 					count++;
506 			}
507 			lastpos[trgindex] = i;
508 		}
509 
510 		/*
511 		 * Adjust upper bound if trigram is upper bound of word for strict
512 		 * word similarity, or if trigram is present in required substring for
513 		 * plain word similarity
514 		 */
515 		if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
516 			: found[trgindex])
517 		{
518 			int			prev_lower,
519 						tmp_ulen2,
520 						tmp_lower,
521 						tmp_count;
522 
523 			upper = i;
524 			if (lower == -1)
525 			{
526 				lower = i;
527 				ulen2 = 1;
528 			}
529 
530 			smlr_cur = CALCSML(count, ulen1, ulen2);
531 
532 			/* Also try to adjust lower bound for greater similarity */
533 			tmp_count = count;
534 			tmp_ulen2 = ulen2;
535 			prev_lower = lower;
536 			for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
537 			{
538 				float		smlr_tmp;
539 				int			tmp_trgindex;
540 
541 				/*
542 				 * Adjust lower bound only if trigram is lower bound of word
543 				 * for strict word similarity, or consider every trigram as
544 				 * lower bound for plain word similarity.
545 				 */
546 				if (!(flags & WORD_SIMILARITY_STRICT)
547 					|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
548 				{
549 					smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
550 					if (smlr_tmp > smlr_cur)
551 					{
552 						smlr_cur = smlr_tmp;
553 						ulen2 = tmp_ulen2;
554 						lower = tmp_lower;
555 						count = tmp_count;
556 					}
557 
558 					/*
559 					 * If we only check that word similarity is greater than
560 					 * threshold we do not need to calculate a maximum
561 					 * similarity.
562 					 */
563 					if ((flags & WORD_SIMILARITY_CHECK_ONLY)
564 						&& smlr_cur >= threshold)
565 						break;
566 				}
567 
568 				tmp_trgindex = trg2indexes[tmp_lower];
569 				if (lastpos[tmp_trgindex] == tmp_lower)
570 				{
571 					tmp_ulen2--;
572 					if (found[tmp_trgindex])
573 						tmp_count--;
574 				}
575 			}
576 
577 			smlr_max = Max(smlr_max, smlr_cur);
578 
579 			/*
580 			 * if we only check that word similarity is greater than threshold
581 			 * we do not need to calculate a maximum similarity.
582 			 */
583 			if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
584 				break;
585 
586 			for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
587 			{
588 				int			tmp_trgindex;
589 
590 				tmp_trgindex = trg2indexes[tmp_lower];
591 				if (lastpos[tmp_trgindex] == tmp_lower)
592 					lastpos[tmp_trgindex] = -1;
593 			}
594 		}
595 	}
596 
597 	pfree(lastpos);
598 
599 	return smlr_max;
600 }
601 
602 /*
603  * Calculate word similarity.
604  * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
605  * are used to calculate word similarity using iterate_word_similarity().
606  *
607  * "trg2indexes" is array which stores indexes of the array "found".
608  * In other words:
609  * trg2indexes[j] = i;
610  * found[i] = true (or false);
611  * If found[i] == true then there is trigram trg2[j] in array "trg1".
612  * If found[i] == false then there is not trigram trg2[j] in array "trg1".
613  *
614  * str1: search pattern string, of length slen1 bytes.
615  * str2: text in which we are looking for a word, of length slen2 bytes.
616  * flags: set of boolean flags parameterizing similarity calculation.
617  *
618  * Returns word similarity.
619  */
620 static float4
calc_word_similarity(char * str1,int slen1,char * str2,int slen2,uint8 flags)621 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
622 					 uint8 flags)
623 {
624 	bool	   *found;
625 	pos_trgm   *ptrg;
626 	trgm	   *trg1;
627 	trgm	   *trg2;
628 	int			len1,
629 				len2,
630 				len,
631 				i,
632 				j,
633 				ulen1;
634 	int		   *trg2indexes;
635 	float4		result;
636 	TrgmBound  *bounds;
637 
638 	protect_out_of_mem(slen1 + slen2);
639 
640 	/* Make positional trigrams */
641 	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
642 	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
643 	if (flags & WORD_SIMILARITY_STRICT)
644 		bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
645 	else
646 		bounds = NULL;
647 
648 	len1 = generate_trgm_only(trg1, str1, slen1, NULL);
649 	len2 = generate_trgm_only(trg2, str2, slen2, bounds);
650 
651 	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
652 	len = len1 + len2;
653 	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
654 
655 	pfree(trg1);
656 	pfree(trg2);
657 
658 	/*
659 	 * Merge positional trigrams array: enumerate each trigram and find its
660 	 * presence in required word.
661 	 */
662 	trg2indexes = (int *) palloc(sizeof(int) * len2);
663 	found = (bool *) palloc0(sizeof(bool) * len);
664 
665 	ulen1 = 0;
666 	j = 0;
667 	for (i = 0; i < len; i++)
668 	{
669 		if (i > 0)
670 		{
671 			int			cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
672 
673 			if (cmp != 0)
674 			{
675 				if (found[j])
676 					ulen1++;
677 				j++;
678 			}
679 		}
680 
681 		if (ptrg[i].index >= 0)
682 		{
683 			trg2indexes[ptrg[i].index] = j;
684 		}
685 		else
686 		{
687 			found[j] = true;
688 		}
689 	}
690 	if (found[j])
691 		ulen1++;
692 
693 	/* Run iterative procedure to find maximum similarity with word */
694 	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
695 									 flags, bounds);
696 
697 	pfree(trg2indexes);
698 	pfree(found);
699 	pfree(ptrg);
700 
701 	return result;
702 }
703 
704 
705 /*
706  * Extract the next non-wildcard part of a search string, i.e. a word bounded
707  * by '_' or '%' meta-characters, non-word characters or string end.
708  *
709  * str: source string, of length lenstr bytes (need not be null-terminated)
710  * buf: where to return the substring (must be long enough)
711  * *bytelen: receives byte length of the found substring
712  * *charlen: receives character length of the found substring
713  *
714  * Returns pointer to end+1 of the found substring in the source string.
715  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
716  *
717  * If the found word is bounded by non-word characters or string boundaries
718  * then this function will include corresponding padding spaces into buf.
719  */
720 static const char *
get_wildcard_part(const char * str,int lenstr,char * buf,int * bytelen,int * charlen)721 get_wildcard_part(const char *str, int lenstr,
722 				  char *buf, int *bytelen, int *charlen)
723 {
724 	const char *beginword = str;
725 	const char *endword;
726 	char	   *s = buf;
727 	bool		in_leading_wildcard_meta = false;
728 	bool		in_trailing_wildcard_meta = false;
729 	bool		in_escape = false;
730 	int			clen;
731 
732 	/*
733 	 * Find the first word character, remembering whether preceding character
734 	 * was wildcard meta-character.  Note that the in_escape state persists
735 	 * from this loop to the next one, since we may exit at a word character
736 	 * that is in_escape.
737 	 */
738 	while (beginword - str < lenstr)
739 	{
740 		if (in_escape)
741 		{
742 			if (ISWORDCHR(beginword))
743 				break;
744 			in_escape = false;
745 			in_leading_wildcard_meta = false;
746 		}
747 		else
748 		{
749 			if (ISESCAPECHAR(beginword))
750 				in_escape = true;
751 			else if (ISWILDCARDCHAR(beginword))
752 				in_leading_wildcard_meta = true;
753 			else if (ISWORDCHR(beginword))
754 				break;
755 			else
756 				in_leading_wildcard_meta = false;
757 		}
758 		beginword += pg_mblen(beginword);
759 	}
760 
761 	/*
762 	 * Handle string end.
763 	 */
764 	if (beginword - str >= lenstr)
765 		return NULL;
766 
767 	/*
768 	 * Add left padding spaces if preceding character wasn't wildcard
769 	 * meta-character.
770 	 */
771 	*charlen = 0;
772 	if (!in_leading_wildcard_meta)
773 	{
774 		if (LPADDING > 0)
775 		{
776 			*s++ = ' ';
777 			(*charlen)++;
778 			if (LPADDING > 1)
779 			{
780 				*s++ = ' ';
781 				(*charlen)++;
782 			}
783 		}
784 	}
785 
786 	/*
787 	 * Copy data into buf until wildcard meta-character, non-word character or
788 	 * string boundary.  Strip escapes during copy.
789 	 */
790 	endword = beginword;
791 	while (endword - str < lenstr)
792 	{
793 		clen = pg_mblen(endword);
794 		if (in_escape)
795 		{
796 			if (ISWORDCHR(endword))
797 			{
798 				memcpy(s, endword, clen);
799 				(*charlen)++;
800 				s += clen;
801 			}
802 			else
803 			{
804 				/*
805 				 * Back up endword to the escape character when stopping at an
806 				 * escaped char, so that subsequent get_wildcard_part will
807 				 * restart from the escape character.  We assume here that
808 				 * escape chars are single-byte.
809 				 */
810 				endword--;
811 				break;
812 			}
813 			in_escape = false;
814 		}
815 		else
816 		{
817 			if (ISESCAPECHAR(endword))
818 				in_escape = true;
819 			else if (ISWILDCARDCHAR(endword))
820 			{
821 				in_trailing_wildcard_meta = true;
822 				break;
823 			}
824 			else if (ISWORDCHR(endword))
825 			{
826 				memcpy(s, endword, clen);
827 				(*charlen)++;
828 				s += clen;
829 			}
830 			else
831 				break;
832 		}
833 		endword += clen;
834 	}
835 
836 	/*
837 	 * Add right padding spaces if next character isn't wildcard
838 	 * meta-character.
839 	 */
840 	if (!in_trailing_wildcard_meta)
841 	{
842 		if (RPADDING > 0)
843 		{
844 			*s++ = ' ';
845 			(*charlen)++;
846 			if (RPADDING > 1)
847 			{
848 				*s++ = ' ';
849 				(*charlen)++;
850 			}
851 		}
852 	}
853 
854 	*bytelen = s - buf;
855 	return endword;
856 }
857 
858 /*
859  * Generates trigrams for wildcard search string.
860  *
861  * Returns array of trigrams that must occur in any string that matches the
862  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
863  * " a", "bcd" would be extracted.
864  */
865 TRGM *
generate_wildcard_trgm(const char * str,int slen)866 generate_wildcard_trgm(const char *str, int slen)
867 {
868 	TRGM	   *trg;
869 	char	   *buf,
870 			   *buf2;
871 	trgm	   *tptr;
872 	int			len,
873 				charlen,
874 				bytelen;
875 	const char *eword;
876 
877 	protect_out_of_mem(slen);
878 
879 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
880 	trg->flag = ARRKEY;
881 	SET_VARSIZE(trg, TRGMHDRSIZE);
882 
883 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
884 		return trg;
885 
886 	tptr = GETARR(trg);
887 
888 	/* Allocate a buffer for blank-padded, but not yet case-folded, words */
889 	buf = palloc(sizeof(char) * (slen + 4));
890 
891 	/*
892 	 * Extract trigrams from each substring extracted by get_wildcard_part.
893 	 */
894 	eword = str;
895 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
896 									  buf, &bytelen, &charlen)) != NULL)
897 	{
898 #ifdef IGNORECASE
899 		buf2 = lowerstr_with_len(buf, bytelen);
900 		bytelen = strlen(buf2);
901 #else
902 		buf2 = buf;
903 #endif
904 
905 		/*
906 		 * count trigrams
907 		 */
908 		tptr = make_trigrams(tptr, buf2, bytelen, charlen);
909 
910 #ifdef IGNORECASE
911 		pfree(buf2);
912 #endif
913 	}
914 
915 	pfree(buf);
916 
917 	if ((len = tptr - GETARR(trg)) == 0)
918 		return trg;
919 
920 	/*
921 	 * Make trigrams unique.
922 	 */
923 	if (len > 1)
924 	{
925 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
926 		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
927 	}
928 
929 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
930 
931 	return trg;
932 }
933 
934 uint32
trgm2int(trgm * ptr)935 trgm2int(trgm *ptr)
936 {
937 	uint32		val = 0;
938 
939 	val |= *(((unsigned char *) ptr));
940 	val <<= 8;
941 	val |= *(((unsigned char *) ptr) + 1);
942 	val <<= 8;
943 	val |= *(((unsigned char *) ptr) + 2);
944 
945 	return val;
946 }
947 
948 Datum
show_trgm(PG_FUNCTION_ARGS)949 show_trgm(PG_FUNCTION_ARGS)
950 {
951 	text	   *in = PG_GETARG_TEXT_PP(0);
952 	TRGM	   *trg;
953 	Datum	   *d;
954 	ArrayType  *a;
955 	trgm	   *ptr;
956 	int			i;
957 
958 	trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
959 	d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
960 
961 	for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
962 	{
963 		text	   *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
964 
965 		if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
966 		{
967 			snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
968 			SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
969 		}
970 		else
971 		{
972 			SET_VARSIZE(item, VARHDRSZ + 3);
973 			CPTRGM(VARDATA(item), ptr);
974 		}
975 		d[i] = PointerGetDatum(item);
976 	}
977 
978 	a = construct_array(d,
979 						ARRNELEM(trg),
980 						TEXTOID,
981 						-1,
982 						false,
983 						TYPALIGN_INT);
984 
985 	for (i = 0; i < ARRNELEM(trg); i++)
986 		pfree(DatumGetPointer(d[i]));
987 
988 	pfree(d);
989 	pfree(trg);
990 	PG_FREE_IF_COPY(in, 0);
991 
992 	PG_RETURN_POINTER(a);
993 }
994 
995 float4
cnt_sml(TRGM * trg1,TRGM * trg2,bool inexact)996 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
997 {
998 	trgm	   *ptr1,
999 			   *ptr2;
1000 	int			count = 0;
1001 	int			len1,
1002 				len2;
1003 
1004 	ptr1 = GETARR(trg1);
1005 	ptr2 = GETARR(trg2);
1006 
1007 	len1 = ARRNELEM(trg1);
1008 	len2 = ARRNELEM(trg2);
1009 
1010 	/* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1011 	if (len1 <= 0 || len2 <= 0)
1012 		return (float4) 0.0;
1013 
1014 	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1015 	{
1016 		int			res = CMPTRGM(ptr1, ptr2);
1017 
1018 		if (res < 0)
1019 			ptr1++;
1020 		else if (res > 0)
1021 			ptr2++;
1022 		else
1023 		{
1024 			ptr1++;
1025 			ptr2++;
1026 			count++;
1027 		}
1028 	}
1029 
1030 	/*
1031 	 * If inexact then len2 is equal to count, because we don't know actual
1032 	 * length of second string in inexact search and we can assume that count
1033 	 * is a lower bound of len2.
1034 	 */
1035 	return CALCSML(count, len1, inexact ? count : len2);
1036 }
1037 
1038 
1039 /*
1040  * Returns whether trg2 contains all trigrams in trg1.
1041  * This relies on the trigram arrays being sorted.
1042  */
1043 bool
trgm_contained_by(TRGM * trg1,TRGM * trg2)1044 trgm_contained_by(TRGM *trg1, TRGM *trg2)
1045 {
1046 	trgm	   *ptr1,
1047 			   *ptr2;
1048 	int			len1,
1049 				len2;
1050 
1051 	ptr1 = GETARR(trg1);
1052 	ptr2 = GETARR(trg2);
1053 
1054 	len1 = ARRNELEM(trg1);
1055 	len2 = ARRNELEM(trg2);
1056 
1057 	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1058 	{
1059 		int			res = CMPTRGM(ptr1, ptr2);
1060 
1061 		if (res < 0)
1062 			return false;
1063 		else if (res > 0)
1064 			ptr2++;
1065 		else
1066 		{
1067 			ptr1++;
1068 			ptr2++;
1069 		}
1070 	}
1071 	if (ptr1 - GETARR(trg1) < len1)
1072 		return false;
1073 	else
1074 		return true;
1075 }
1076 
1077 /*
1078  * Return a palloc'd boolean array showing, for each trigram in "query",
1079  * whether it is present in the trigram array "key".
1080  * This relies on the "key" array being sorted, but "query" need not be.
1081  */
1082 bool *
trgm_presence_map(TRGM * query,TRGM * key)1083 trgm_presence_map(TRGM *query, TRGM *key)
1084 {
1085 	bool	   *result;
1086 	trgm	   *ptrq = GETARR(query),
1087 			   *ptrk = GETARR(key);
1088 	int			lenq = ARRNELEM(query),
1089 				lenk = ARRNELEM(key),
1090 				i;
1091 
1092 	result = (bool *) palloc0(lenq * sizeof(bool));
1093 
1094 	/* for each query trigram, do a binary search in the key array */
1095 	for (i = 0; i < lenq; i++)
1096 	{
1097 		int			lo = 0;
1098 		int			hi = lenk;
1099 
1100 		while (lo < hi)
1101 		{
1102 			int			mid = (lo + hi) / 2;
1103 			int			res = CMPTRGM(ptrq, ptrk + mid);
1104 
1105 			if (res < 0)
1106 				hi = mid;
1107 			else if (res > 0)
1108 				lo = mid + 1;
1109 			else
1110 			{
1111 				result[i] = true;
1112 				break;
1113 			}
1114 		}
1115 		ptrq++;
1116 	}
1117 
1118 	return result;
1119 }
1120 
1121 Datum
similarity(PG_FUNCTION_ARGS)1122 similarity(PG_FUNCTION_ARGS)
1123 {
1124 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1125 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1126 	TRGM	   *trg1,
1127 			   *trg2;
1128 	float4		res;
1129 
1130 	trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1131 	trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1132 
1133 	res = cnt_sml(trg1, trg2, false);
1134 
1135 	pfree(trg1);
1136 	pfree(trg2);
1137 	PG_FREE_IF_COPY(in1, 0);
1138 	PG_FREE_IF_COPY(in2, 1);
1139 
1140 	PG_RETURN_FLOAT4(res);
1141 }
1142 
1143 Datum
word_similarity(PG_FUNCTION_ARGS)1144 word_similarity(PG_FUNCTION_ARGS)
1145 {
1146 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1147 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1148 	float4		res;
1149 
1150 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1151 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1152 							   0);
1153 
1154 	PG_FREE_IF_COPY(in1, 0);
1155 	PG_FREE_IF_COPY(in2, 1);
1156 	PG_RETURN_FLOAT4(res);
1157 }
1158 
1159 Datum
strict_word_similarity(PG_FUNCTION_ARGS)1160 strict_word_similarity(PG_FUNCTION_ARGS)
1161 {
1162 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1163 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1164 	float4		res;
1165 
1166 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1167 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1168 							   WORD_SIMILARITY_STRICT);
1169 
1170 	PG_FREE_IF_COPY(in1, 0);
1171 	PG_FREE_IF_COPY(in2, 1);
1172 	PG_RETURN_FLOAT4(res);
1173 }
1174 
1175 Datum
similarity_dist(PG_FUNCTION_ARGS)1176 similarity_dist(PG_FUNCTION_ARGS)
1177 {
1178 	float4		res = DatumGetFloat4(DirectFunctionCall2(similarity,
1179 														 PG_GETARG_DATUM(0),
1180 														 PG_GETARG_DATUM(1)));
1181 
1182 	PG_RETURN_FLOAT4(1.0 - res);
1183 }
1184 
1185 Datum
similarity_op(PG_FUNCTION_ARGS)1186 similarity_op(PG_FUNCTION_ARGS)
1187 {
1188 	float4		res = DatumGetFloat4(DirectFunctionCall2(similarity,
1189 														 PG_GETARG_DATUM(0),
1190 														 PG_GETARG_DATUM(1)));
1191 
1192 	PG_RETURN_BOOL(res >= similarity_threshold);
1193 }
1194 
1195 Datum
word_similarity_op(PG_FUNCTION_ARGS)1196 word_similarity_op(PG_FUNCTION_ARGS)
1197 {
1198 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1199 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1200 	float4		res;
1201 
1202 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1203 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1204 							   WORD_SIMILARITY_CHECK_ONLY);
1205 
1206 	PG_FREE_IF_COPY(in1, 0);
1207 	PG_FREE_IF_COPY(in2, 1);
1208 	PG_RETURN_BOOL(res >= word_similarity_threshold);
1209 }
1210 
1211 Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)1212 word_similarity_commutator_op(PG_FUNCTION_ARGS)
1213 {
1214 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1215 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1216 	float4		res;
1217 
1218 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1219 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1220 							   WORD_SIMILARITY_CHECK_ONLY);
1221 
1222 	PG_FREE_IF_COPY(in1, 0);
1223 	PG_FREE_IF_COPY(in2, 1);
1224 	PG_RETURN_BOOL(res >= word_similarity_threshold);
1225 }
1226 
1227 Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)1228 word_similarity_dist_op(PG_FUNCTION_ARGS)
1229 {
1230 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1231 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1232 	float4		res;
1233 
1234 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1235 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1236 							   0);
1237 
1238 	PG_FREE_IF_COPY(in1, 0);
1239 	PG_FREE_IF_COPY(in2, 1);
1240 	PG_RETURN_FLOAT4(1.0 - res);
1241 }
1242 
1243 Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)1244 word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1245 {
1246 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1247 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1248 	float4		res;
1249 
1250 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1251 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1252 							   0);
1253 
1254 	PG_FREE_IF_COPY(in1, 0);
1255 	PG_FREE_IF_COPY(in2, 1);
1256 	PG_RETURN_FLOAT4(1.0 - res);
1257 }
1258 
1259 Datum
strict_word_similarity_op(PG_FUNCTION_ARGS)1260 strict_word_similarity_op(PG_FUNCTION_ARGS)
1261 {
1262 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1263 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1264 	float4		res;
1265 
1266 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1267 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1268 							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1269 
1270 	PG_FREE_IF_COPY(in1, 0);
1271 	PG_FREE_IF_COPY(in2, 1);
1272 	PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1273 }
1274 
1275 Datum
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)1276 strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1277 {
1278 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1279 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1280 	float4		res;
1281 
1282 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1283 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1284 							   WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1285 
1286 	PG_FREE_IF_COPY(in1, 0);
1287 	PG_FREE_IF_COPY(in2, 1);
1288 	PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1289 }
1290 
1291 Datum
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)1292 strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1293 {
1294 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1295 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1296 	float4		res;
1297 
1298 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1299 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1300 							   WORD_SIMILARITY_STRICT);
1301 
1302 	PG_FREE_IF_COPY(in1, 0);
1303 	PG_FREE_IF_COPY(in2, 1);
1304 	PG_RETURN_FLOAT4(1.0 - res);
1305 }
1306 
1307 Datum
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)1308 strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1309 {
1310 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1311 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1312 	float4		res;
1313 
1314 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1315 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1316 							   WORD_SIMILARITY_STRICT);
1317 
1318 	PG_FREE_IF_COPY(in1, 0);
1319 	PG_FREE_IF_COPY(in2, 1);
1320 	PG_RETURN_FLOAT4(1.0 - res);
1321 }
1322