1 /*
2  * contrib/pg_trgm/trgm_op.c
3  */
4 #include "postgres.h"
5 
6 #include <ctype.h>
7 
8 #include "trgm.h"
9 
10 #include "catalog/pg_type.h"
11 #include "tsearch/ts_locale.h"
12 #include "utils/lsyscache.h"
13 #include "utils/memutils.h"
14 #include "utils/pg_crc.h"
15 
16 PG_MODULE_MAGIC;
17 
18 /* GUC variables */
19 double		similarity_threshold = 0.3f;
20 double		word_similarity_threshold = 0.6f;
21 
22 void		_PG_init(void);
23 
24 PG_FUNCTION_INFO_V1(set_limit);
25 PG_FUNCTION_INFO_V1(show_limit);
26 PG_FUNCTION_INFO_V1(show_trgm);
27 PG_FUNCTION_INFO_V1(similarity);
28 PG_FUNCTION_INFO_V1(word_similarity);
29 PG_FUNCTION_INFO_V1(similarity_dist);
30 PG_FUNCTION_INFO_V1(similarity_op);
31 PG_FUNCTION_INFO_V1(word_similarity_op);
32 PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
33 PG_FUNCTION_INFO_V1(word_similarity_dist_op);
34 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
35 
36 /* Trigram with position */
37 typedef struct
38 {
39 	trgm		trg;
40 	int			index;
41 } pos_trgm;
42 
43 /*
44  * Module load callback
45  */
46 void
_PG_init(void)47 _PG_init(void)
48 {
49 	/* Define custom GUC variables. */
50 	DefineCustomRealVariable("pg_trgm.similarity_threshold",
51 							 "Sets the threshold used by the % operator.",
52 							 "Valid range is 0.0 .. 1.0.",
53 							 &similarity_threshold,
54 							 0.3,
55 							 0.0,
56 							 1.0,
57 							 PGC_USERSET,
58 							 0,
59 							 NULL,
60 							 NULL,
61 							 NULL);
62 	DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
63 							 "Sets the threshold used by the <% operator.",
64 							 "Valid range is 0.0 .. 1.0.",
65 							 &word_similarity_threshold,
66 							 0.6,
67 							 0.0,
68 							 1.0,
69 							 PGC_USERSET,
70 							 0,
71 							 NULL,
72 							 NULL,
73 							 NULL);
74 }
75 
76 /*
77  * Deprecated function.
78  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
79  */
80 Datum
set_limit(PG_FUNCTION_ARGS)81 set_limit(PG_FUNCTION_ARGS)
82 {
83 	float4		nlimit = PG_GETARG_FLOAT4(0);
84 	char	   *nlimit_str;
85 	Oid			func_out_oid;
86 	bool		is_varlena;
87 
88 	getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
89 
90 	nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
91 
92 	SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
93 					PGC_USERSET, PGC_S_SESSION);
94 
95 	PG_RETURN_FLOAT4(similarity_threshold);
96 }
97 
98 /*
99  * Deprecated function.
100  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
101  */
102 Datum
show_limit(PG_FUNCTION_ARGS)103 show_limit(PG_FUNCTION_ARGS)
104 {
105 	PG_RETURN_FLOAT4(similarity_threshold);
106 }
107 
108 static int
comp_trgm(const void * a,const void * b)109 comp_trgm(const void *a, const void *b)
110 {
111 	return CMPTRGM(a, b);
112 }
113 
114 static int
unique_array(trgm * a,int len)115 unique_array(trgm *a, int len)
116 {
117 	trgm	   *curend,
118 			   *tmp;
119 
120 	curend = tmp = a;
121 	while (tmp - a < len)
122 		if (CMPTRGM(tmp, curend))
123 		{
124 			curend++;
125 			CPTRGM(curend, tmp);
126 			tmp++;
127 		}
128 		else
129 			tmp++;
130 
131 	return curend + 1 - a;
132 }
133 
134 /*
135  * Finds first word in string, returns pointer to the word,
136  * endword points to the character after word
137  */
138 static char *
find_word(char * str,int lenstr,char ** endword,int * charlen)139 find_word(char *str, int lenstr, char **endword, int *charlen)
140 {
141 	char	   *beginword = str;
142 
143 	while (beginword - str < lenstr && !ISWORDCHR(beginword))
144 		beginword += pg_mblen(beginword);
145 
146 	if (beginword - str >= lenstr)
147 		return NULL;
148 
149 	*endword = beginword;
150 	*charlen = 0;
151 	while (*endword - str < lenstr && ISWORDCHR(*endword))
152 	{
153 		*endword += pg_mblen(*endword);
154 		(*charlen)++;
155 	}
156 
157 	return beginword;
158 }
159 
160 /*
161  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
162  * which is always exactly three bytes.  If we have three single-byte
163  * characters, we just use them as-is; otherwise we form a hash value.
164  */
165 void
compact_trigram(trgm * tptr,char * str,int bytelen)166 compact_trigram(trgm *tptr, char *str, int bytelen)
167 {
168 	if (bytelen == 3)
169 	{
170 		CPTRGM(tptr, str);
171 	}
172 	else
173 	{
174 		pg_crc32	crc;
175 
176 		INIT_LEGACY_CRC32(crc);
177 		COMP_LEGACY_CRC32(crc, str, bytelen);
178 		FIN_LEGACY_CRC32(crc);
179 
180 		/*
181 		 * use only 3 upper bytes from crc, hope, it's good enough hashing
182 		 */
183 		CPTRGM(tptr, &crc);
184 	}
185 }
186 
187 /*
188  * Adds trigrams from words (already padded).
189  */
190 static trgm *
make_trigrams(trgm * tptr,char * str,int bytelen,int charlen)191 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
192 {
193 	char	   *ptr = str;
194 
195 	if (charlen < 3)
196 		return tptr;
197 
198 	if (bytelen > charlen)
199 	{
200 		/* Find multibyte character boundaries and apply compact_trigram */
201 		int			lenfirst = pg_mblen(str),
202 					lenmiddle = pg_mblen(str + lenfirst),
203 					lenlast = pg_mblen(str + lenfirst + lenmiddle);
204 
205 		while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
206 		{
207 			compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
208 
209 			ptr += lenfirst;
210 			tptr++;
211 
212 			lenfirst = lenmiddle;
213 			lenmiddle = lenlast;
214 			lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
215 		}
216 	}
217 	else
218 	{
219 		/* Fast path when there are no multibyte characters */
220 		Assert(bytelen == charlen);
221 
222 		while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
223 		{
224 			CPTRGM(tptr, ptr);
225 			ptr++;
226 			tptr++;
227 		}
228 	}
229 
230 	return tptr;
231 }
232 
233 /*
234  * Make array of trigrams without sorting and removing duplicate items.
235  *
236  * trg: where to return the array of trigrams.
237  * str: source string, of length slen bytes.
238  *
239  * Returns length of the generated array.
240  */
241 static int
generate_trgm_only(trgm * trg,char * str,int slen)242 generate_trgm_only(trgm *trg, char *str, int slen)
243 {
244 	trgm	   *tptr;
245 	char	   *buf;
246 	int			charlen,
247 				bytelen;
248 	char	   *bword,
249 			   *eword;
250 
251 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
252 		return 0;
253 
254 	tptr = trg;
255 
256 	/* Allocate a buffer for case-folded, blank-padded words */
257 	buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
258 
259 	if (LPADDING > 0)
260 	{
261 		*buf = ' ';
262 		if (LPADDING > 1)
263 			*(buf + 1) = ' ';
264 	}
265 
266 	eword = str;
267 	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
268 	{
269 #ifdef IGNORECASE
270 		bword = lowerstr_with_len(bword, eword - bword);
271 		bytelen = strlen(bword);
272 #else
273 		bytelen = eword - bword;
274 #endif
275 
276 		memcpy(buf + LPADDING, bword, bytelen);
277 
278 #ifdef IGNORECASE
279 		pfree(bword);
280 #endif
281 
282 		buf[LPADDING + bytelen] = ' ';
283 		buf[LPADDING + bytelen + 1] = ' ';
284 
285 		/*
286 		 * count trigrams
287 		 */
288 		tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
289 							 charlen + LPADDING + RPADDING);
290 	}
291 
292 	pfree(buf);
293 
294 	return tptr - trg;
295 }
296 
297 /*
298  * Guard against possible overflow in the palloc requests below.  (We
299  * don't worry about the additive constants, since palloc can detect
300  * requests that are a little above MaxAllocSize --- we just need to
301  * prevent integer overflow in the multiplications.)
302  */
303 static void
protect_out_of_mem(int slen)304 protect_out_of_mem(int slen)
305 {
306 	if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
307 		(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
308 		ereport(ERROR,
309 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
310 				 errmsg("out of memory")));
311 }
312 
313 /*
314  * Make array of trigrams with sorting and removing duplicate items.
315  *
316  * str: source string, of length slen bytes.
317  *
318  * Returns the sorted array of unique trigrams.
319  */
320 TRGM *
generate_trgm(char * str,int slen)321 generate_trgm(char *str, int slen)
322 {
323 	TRGM	   *trg;
324 	int			len;
325 
326 	protect_out_of_mem(slen);
327 
328 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
329 	trg->flag = ARRKEY;
330 
331 	len = generate_trgm_only(GETARR(trg), str, slen);
332 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
333 
334 	if (len == 0)
335 		return trg;
336 
337 	/*
338 	 * Make trigrams unique.
339 	 */
340 	if (len > 1)
341 	{
342 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
343 		len = unique_array(GETARR(trg), len);
344 	}
345 
346 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
347 
348 	return trg;
349 }
350 
351 /*
352  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
353  *
354  * trg1: trigram array of search pattern, of length len1. trg1 is required
355  *		 word which positions don't matter and replaced with -1.
356  * trg2: trigram array of text, of length len2. trg2 is haystack where we
357  *		 search and have to store its positions.
358  *
359  * Returns concatenated trigram array.
360  */
361 static pos_trgm *
make_positional_trgm(trgm * trg1,int len1,trgm * trg2,int len2)362 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
363 {
364 	pos_trgm   *result;
365 	int			i,
366 				len = len1 + len2;
367 
368 	result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
369 
370 	for (i = 0; i < len1; i++)
371 	{
372 		memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
373 		result[i].index = -1;
374 	}
375 
376 	for (i = 0; i < len2; i++)
377 	{
378 		memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
379 		result[i + len1].index = i;
380 	}
381 
382 	return result;
383 }
384 
385 /*
386  * Compare position trigrams: compare trigrams first and position second.
387  */
388 static int
comp_ptrgm(const void * v1,const void * v2)389 comp_ptrgm(const void *v1, const void *v2)
390 {
391 	const pos_trgm *p1 = (const pos_trgm *) v1;
392 	const pos_trgm *p2 = (const pos_trgm *) v2;
393 	int			cmp;
394 
395 	cmp = CMPTRGM(p1->trg, p2->trg);
396 	if (cmp != 0)
397 		return cmp;
398 
399 	if (p1->index < p2->index)
400 		return -1;
401 	else if (p1->index == p2->index)
402 		return 0;
403 	else
404 		return 1;
405 }
406 
407 /*
408  * Iterative search function which calculates maximum similarity with word in
409  * the string. But maximum similarity is calculated only if check_only == false.
410  *
411  * trg2indexes: array which stores indexes of the array "found".
412  * found: array which stores true of false values.
413  * ulen1: count of unique trigrams of array "trg1".
414  * len2: length of array "trg2" and array "trg2indexes".
415  * len: length of the array "found".
416  * check_only: if true then only check existence of similar search pattern in
417  *			   text.
418  *
419  * Returns word similarity.
420  */
421 static float4
iterate_word_similarity(int * trg2indexes,bool * found,int ulen1,int len2,int len,bool check_only)422 iterate_word_similarity(int *trg2indexes,
423 						bool *found,
424 						int ulen1,
425 						int len2,
426 						int len,
427 						bool check_only)
428 {
429 	int		   *lastpos,
430 				i,
431 				ulen2 = 0,
432 				count = 0,
433 				upper = -1,
434 				lower = -1;
435 	float4		smlr_cur,
436 				smlr_max = 0.0f;
437 
438 	/* Memorise last position of each trigram */
439 	lastpos = (int *) palloc(sizeof(int) * len);
440 	memset(lastpos, -1, sizeof(int) * len);
441 
442 	for (i = 0; i < len2; i++)
443 	{
444 		/* Get index of next trigram */
445 		int			trgindex = trg2indexes[i];
446 
447 		/* Update last position of this trigram */
448 		if (lower >= 0 || found[trgindex])
449 		{
450 			if (lastpos[trgindex] < 0)
451 			{
452 				ulen2++;
453 				if (found[trgindex])
454 					count++;
455 			}
456 			lastpos[trgindex] = i;
457 		}
458 
459 		/* Adjust upper bound if this trigram is present in required substring */
460 		if (found[trgindex])
461 		{
462 			int			prev_lower,
463 						tmp_ulen2,
464 						tmp_lower,
465 						tmp_count;
466 
467 			upper = i;
468 			if (lower == -1)
469 			{
470 				lower = i;
471 				ulen2 = 1;
472 			}
473 
474 			smlr_cur = CALCSML(count, ulen1, ulen2);
475 
476 			/* Also try to adjust lower bound for greater similarity */
477 			tmp_count = count;
478 			tmp_ulen2 = ulen2;
479 			prev_lower = lower;
480 			for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
481 			{
482 				float		smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
483 				int			tmp_trgindex;
484 
485 				if (smlr_tmp > smlr_cur)
486 				{
487 					smlr_cur = smlr_tmp;
488 					ulen2 = tmp_ulen2;
489 					lower = tmp_lower;
490 					count = tmp_count;
491 				}
492 
493 				/*
494 				 * if we only check that word similarity is greater than
495 				 * pg_trgm.word_similarity_threshold we do not need to
496 				 * calculate a maximum similarity.
497 				 */
498 				if (check_only && smlr_cur >= word_similarity_threshold)
499 					break;
500 
501 				tmp_trgindex = trg2indexes[tmp_lower];
502 				if (lastpos[tmp_trgindex] == tmp_lower)
503 				{
504 					tmp_ulen2--;
505 					if (found[tmp_trgindex])
506 						tmp_count--;
507 				}
508 			}
509 
510 			smlr_max = Max(smlr_max, smlr_cur);
511 
512 			/*
513 			 * if we only check that word similarity is greater than
514 			 * pg_trgm.word_similarity_threshold we do not need to calculate a
515 			 * maximum similarity
516 			 */
517 			if (check_only && smlr_max >= word_similarity_threshold)
518 				break;
519 
520 			for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
521 			{
522 				int			tmp_trgindex;
523 
524 				tmp_trgindex = trg2indexes[tmp_lower];
525 				if (lastpos[tmp_trgindex] == tmp_lower)
526 					lastpos[tmp_trgindex] = -1;
527 			}
528 		}
529 	}
530 
531 	pfree(lastpos);
532 
533 	return smlr_max;
534 }
535 
536 /*
537  * Calculate word similarity.
538  * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
539  * are used to calculate word similarity using iterate_word_similarity().
540  *
541  * "trg2indexes" is array which stores indexes of the array "found".
542  * In other words:
543  * trg2indexes[j] = i;
544  * found[i] = true (or false);
545  * If found[i] == true then there is trigram trg2[j] in array "trg1".
546  * If found[i] == false then there is not trigram trg2[j] in array "trg1".
547  *
548  * str1: search pattern string, of length slen1 bytes.
549  * str2: text in which we are looking for a word, of length slen2 bytes.
550  * check_only: if true then only check existence of similar search pattern in
551  *			   text.
552  *
553  * Returns word similarity.
554  */
555 static float4
calc_word_similarity(char * str1,int slen1,char * str2,int slen2,bool check_only)556 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
557 					 bool check_only)
558 {
559 	bool	   *found;
560 	pos_trgm   *ptrg;
561 	trgm	   *trg1;
562 	trgm	   *trg2;
563 	int			len1,
564 				len2,
565 				len,
566 				i,
567 				j,
568 				ulen1;
569 	int		   *trg2indexes;
570 	float4		result;
571 
572 	protect_out_of_mem(slen1 + slen2);
573 
574 	/* Make positional trigrams */
575 	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) *3);
576 	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) *3);
577 
578 	len1 = generate_trgm_only(trg1, str1, slen1);
579 	len2 = generate_trgm_only(trg2, str2, slen2);
580 
581 	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
582 	len = len1 + len2;
583 	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
584 
585 	pfree(trg1);
586 	pfree(trg2);
587 
588 	/*
589 	 * Merge positional trigrams array: enumerate each trigram and find its
590 	 * presence in required word.
591 	 */
592 	trg2indexes = (int *) palloc(sizeof(int) * len2);
593 	found = (bool *) palloc0(sizeof(bool) * len);
594 
595 	ulen1 = 0;
596 	j = 0;
597 	for (i = 0; i < len; i++)
598 	{
599 		if (i > 0)
600 		{
601 			int			cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
602 
603 			if (cmp != 0)
604 			{
605 				if (found[j])
606 					ulen1++;
607 				j++;
608 			}
609 		}
610 
611 		if (ptrg[i].index >= 0)
612 		{
613 			trg2indexes[ptrg[i].index] = j;
614 		}
615 		else
616 		{
617 			found[j] = true;
618 		}
619 	}
620 	if (found[j])
621 		ulen1++;
622 
623 	/* Run iterative procedure to find maximum similarity with word */
624 	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
625 									 check_only);
626 
627 	pfree(trg2indexes);
628 	pfree(found);
629 	pfree(ptrg);
630 
631 	return result;
632 }
633 
634 
635 /*
636  * Extract the next non-wildcard part of a search string, i.e. a word bounded
637  * by '_' or '%' meta-characters, non-word characters or string end.
638  *
639  * str: source string, of length lenstr bytes (need not be null-terminated)
640  * buf: where to return the substring (must be long enough)
641  * *bytelen: receives byte length of the found substring
642  * *charlen: receives character length of the found substring
643  *
644  * Returns pointer to end+1 of the found substring in the source string.
645  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
646  *
647  * If the found word is bounded by non-word characters or string boundaries
648  * then this function will include corresponding padding spaces into buf.
649  */
650 static const char *
get_wildcard_part(const char * str,int lenstr,char * buf,int * bytelen,int * charlen)651 get_wildcard_part(const char *str, int lenstr,
652 				  char *buf, int *bytelen, int *charlen)
653 {
654 	const char *beginword = str;
655 	const char *endword;
656 	char	   *s = buf;
657 	bool		in_leading_wildcard_meta = false;
658 	bool		in_trailing_wildcard_meta = false;
659 	bool		in_escape = false;
660 	int			clen;
661 
662 	/*
663 	 * Find the first word character, remembering whether preceding character
664 	 * was wildcard meta-character.  Note that the in_escape state persists
665 	 * from this loop to the next one, since we may exit at a word character
666 	 * that is in_escape.
667 	 */
668 	while (beginword - str < lenstr)
669 	{
670 		if (in_escape)
671 		{
672 			if (ISWORDCHR(beginword))
673 				break;
674 			in_escape = false;
675 			in_leading_wildcard_meta = false;
676 		}
677 		else
678 		{
679 			if (ISESCAPECHAR(beginword))
680 				in_escape = true;
681 			else if (ISWILDCARDCHAR(beginword))
682 				in_leading_wildcard_meta = true;
683 			else if (ISWORDCHR(beginword))
684 				break;
685 			else
686 				in_leading_wildcard_meta = false;
687 		}
688 		beginword += pg_mblen(beginword);
689 	}
690 
691 	/*
692 	 * Handle string end.
693 	 */
694 	if (beginword - str >= lenstr)
695 		return NULL;
696 
697 	/*
698 	 * Add left padding spaces if preceding character wasn't wildcard
699 	 * meta-character.
700 	 */
701 	*charlen = 0;
702 	if (!in_leading_wildcard_meta)
703 	{
704 		if (LPADDING > 0)
705 		{
706 			*s++ = ' ';
707 			(*charlen)++;
708 			if (LPADDING > 1)
709 			{
710 				*s++ = ' ';
711 				(*charlen)++;
712 			}
713 		}
714 	}
715 
716 	/*
717 	 * Copy data into buf until wildcard meta-character, non-word character or
718 	 * string boundary.  Strip escapes during copy.
719 	 */
720 	endword = beginword;
721 	while (endword - str < lenstr)
722 	{
723 		clen = pg_mblen(endword);
724 		if (in_escape)
725 		{
726 			if (ISWORDCHR(endword))
727 			{
728 				memcpy(s, endword, clen);
729 				(*charlen)++;
730 				s += clen;
731 			}
732 			else
733 			{
734 				/*
735 				 * Back up endword to the escape character when stopping at an
736 				 * escaped char, so that subsequent get_wildcard_part will
737 				 * restart from the escape character.  We assume here that
738 				 * escape chars are single-byte.
739 				 */
740 				endword--;
741 				break;
742 			}
743 			in_escape = false;
744 		}
745 		else
746 		{
747 			if (ISESCAPECHAR(endword))
748 				in_escape = true;
749 			else if (ISWILDCARDCHAR(endword))
750 			{
751 				in_trailing_wildcard_meta = true;
752 				break;
753 			}
754 			else if (ISWORDCHR(endword))
755 			{
756 				memcpy(s, endword, clen);
757 				(*charlen)++;
758 				s += clen;
759 			}
760 			else
761 				break;
762 		}
763 		endword += clen;
764 	}
765 
766 	/*
767 	 * Add right padding spaces if next character isn't wildcard
768 	 * meta-character.
769 	 */
770 	if (!in_trailing_wildcard_meta)
771 	{
772 		if (RPADDING > 0)
773 		{
774 			*s++ = ' ';
775 			(*charlen)++;
776 			if (RPADDING > 1)
777 			{
778 				*s++ = ' ';
779 				(*charlen)++;
780 			}
781 		}
782 	}
783 
784 	*bytelen = s - buf;
785 	return endword;
786 }
787 
788 /*
789  * Generates trigrams for wildcard search string.
790  *
791  * Returns array of trigrams that must occur in any string that matches the
792  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
793  * " a", "bcd" would be extracted.
794  */
795 TRGM *
generate_wildcard_trgm(const char * str,int slen)796 generate_wildcard_trgm(const char *str, int slen)
797 {
798 	TRGM	   *trg;
799 	char	   *buf,
800 			   *buf2;
801 	trgm	   *tptr;
802 	int			len,
803 				charlen,
804 				bytelen;
805 	const char *eword;
806 
807 	protect_out_of_mem(slen);
808 
809 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
810 	trg->flag = ARRKEY;
811 	SET_VARSIZE(trg, TRGMHDRSIZE);
812 
813 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
814 		return trg;
815 
816 	tptr = GETARR(trg);
817 
818 	/* Allocate a buffer for blank-padded, but not yet case-folded, words */
819 	buf = palloc(sizeof(char) * (slen + 4));
820 
821 	/*
822 	 * Extract trigrams from each substring extracted by get_wildcard_part.
823 	 */
824 	eword = str;
825 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
826 									  buf, &bytelen, &charlen)) != NULL)
827 	{
828 #ifdef IGNORECASE
829 		buf2 = lowerstr_with_len(buf, bytelen);
830 		bytelen = strlen(buf2);
831 #else
832 		buf2 = buf;
833 #endif
834 
835 		/*
836 		 * count trigrams
837 		 */
838 		tptr = make_trigrams(tptr, buf2, bytelen, charlen);
839 
840 #ifdef IGNORECASE
841 		pfree(buf2);
842 #endif
843 	}
844 
845 	pfree(buf);
846 
847 	if ((len = tptr - GETARR(trg)) == 0)
848 		return trg;
849 
850 	/*
851 	 * Make trigrams unique.
852 	 */
853 	if (len > 1)
854 	{
855 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
856 		len = unique_array(GETARR(trg), len);
857 	}
858 
859 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
860 
861 	return trg;
862 }
863 
864 uint32
trgm2int(trgm * ptr)865 trgm2int(trgm *ptr)
866 {
867 	uint32		val = 0;
868 
869 	val |= *(((unsigned char *) ptr));
870 	val <<= 8;
871 	val |= *(((unsigned char *) ptr) + 1);
872 	val <<= 8;
873 	val |= *(((unsigned char *) ptr) + 2);
874 
875 	return val;
876 }
877 
878 Datum
show_trgm(PG_FUNCTION_ARGS)879 show_trgm(PG_FUNCTION_ARGS)
880 {
881 	text	   *in = PG_GETARG_TEXT_P(0);
882 	TRGM	   *trg;
883 	Datum	   *d;
884 	ArrayType  *a;
885 	trgm	   *ptr;
886 	int			i;
887 
888 	trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
889 	d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
890 
891 	for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
892 	{
893 		text	   *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
894 
895 		if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
896 		{
897 			snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
898 			SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
899 		}
900 		else
901 		{
902 			SET_VARSIZE(item, VARHDRSZ + 3);
903 			CPTRGM(VARDATA(item), ptr);
904 		}
905 		d[i] = PointerGetDatum(item);
906 	}
907 
908 	a = construct_array(
909 						d,
910 						ARRNELEM(trg),
911 						TEXTOID,
912 						-1,
913 						false,
914 						'i'
915 		);
916 
917 	for (i = 0; i < ARRNELEM(trg); i++)
918 		pfree(DatumGetPointer(d[i]));
919 
920 	pfree(d);
921 	pfree(trg);
922 	PG_FREE_IF_COPY(in, 0);
923 
924 	PG_RETURN_POINTER(a);
925 }
926 
927 float4
cnt_sml(TRGM * trg1,TRGM * trg2,bool inexact)928 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
929 {
930 	trgm	   *ptr1,
931 			   *ptr2;
932 	int			count = 0;
933 	int			len1,
934 				len2;
935 
936 	ptr1 = GETARR(trg1);
937 	ptr2 = GETARR(trg2);
938 
939 	len1 = ARRNELEM(trg1);
940 	len2 = ARRNELEM(trg2);
941 
942 	/* explicit test is needed to avoid 0/0 division when both lengths are 0 */
943 	if (len1 <= 0 || len2 <= 0)
944 		return (float4) 0.0;
945 
946 	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
947 	{
948 		int			res = CMPTRGM(ptr1, ptr2);
949 
950 		if (res < 0)
951 			ptr1++;
952 		else if (res > 0)
953 			ptr2++;
954 		else
955 		{
956 			ptr1++;
957 			ptr2++;
958 			count++;
959 		}
960 	}
961 
962 	/*
963 	 * If inexact then len2 is equal to count, because we don't know actual
964 	 * length of second string in inexact search and we can assume that count
965 	 * is a lower bound of len2.
966 	 */
967 	return CALCSML(count, len1, inexact ? count : len2);
968 }
969 
970 
971 /*
972  * Returns whether trg2 contains all trigrams in trg1.
973  * This relies on the trigram arrays being sorted.
974  */
975 bool
trgm_contained_by(TRGM * trg1,TRGM * trg2)976 trgm_contained_by(TRGM *trg1, TRGM *trg2)
977 {
978 	trgm	   *ptr1,
979 			   *ptr2;
980 	int			len1,
981 				len2;
982 
983 	ptr1 = GETARR(trg1);
984 	ptr2 = GETARR(trg2);
985 
986 	len1 = ARRNELEM(trg1);
987 	len2 = ARRNELEM(trg2);
988 
989 	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
990 	{
991 		int			res = CMPTRGM(ptr1, ptr2);
992 
993 		if (res < 0)
994 			return false;
995 		else if (res > 0)
996 			ptr2++;
997 		else
998 		{
999 			ptr1++;
1000 			ptr2++;
1001 		}
1002 	}
1003 	if (ptr1 - GETARR(trg1) < len1)
1004 		return false;
1005 	else
1006 		return true;
1007 }
1008 
1009 /*
1010  * Return a palloc'd boolean array showing, for each trigram in "query",
1011  * whether it is present in the trigram array "key".
1012  * This relies on the "key" array being sorted, but "query" need not be.
1013  */
1014 bool *
trgm_presence_map(TRGM * query,TRGM * key)1015 trgm_presence_map(TRGM *query, TRGM *key)
1016 {
1017 	bool	   *result;
1018 	trgm	   *ptrq = GETARR(query),
1019 			   *ptrk = GETARR(key);
1020 	int			lenq = ARRNELEM(query),
1021 				lenk = ARRNELEM(key),
1022 				i;
1023 
1024 	result = (bool *) palloc0(lenq * sizeof(bool));
1025 
1026 	/* for each query trigram, do a binary search in the key array */
1027 	for (i = 0; i < lenq; i++)
1028 	{
1029 		int			lo = 0;
1030 		int			hi = lenk;
1031 
1032 		while (lo < hi)
1033 		{
1034 			int			mid = (lo + hi) / 2;
1035 			int			res = CMPTRGM(ptrq, ptrk + mid);
1036 
1037 			if (res < 0)
1038 				hi = mid;
1039 			else if (res > 0)
1040 				lo = mid + 1;
1041 			else
1042 			{
1043 				result[i] = true;
1044 				break;
1045 			}
1046 		}
1047 		ptrq++;
1048 	}
1049 
1050 	return result;
1051 }
1052 
1053 Datum
similarity(PG_FUNCTION_ARGS)1054 similarity(PG_FUNCTION_ARGS)
1055 {
1056 	text	   *in1 = PG_GETARG_TEXT_P(0);
1057 	text	   *in2 = PG_GETARG_TEXT_P(1);
1058 	TRGM	   *trg1,
1059 			   *trg2;
1060 	float4		res;
1061 
1062 	trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
1063 	trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
1064 
1065 	res = cnt_sml(trg1, trg2, false);
1066 
1067 	pfree(trg1);
1068 	pfree(trg2);
1069 	PG_FREE_IF_COPY(in1, 0);
1070 	PG_FREE_IF_COPY(in2, 1);
1071 
1072 	PG_RETURN_FLOAT4(res);
1073 }
1074 
1075 Datum
word_similarity(PG_FUNCTION_ARGS)1076 word_similarity(PG_FUNCTION_ARGS)
1077 {
1078 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1079 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1080 	float4		res;
1081 
1082 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1083 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1084 							   false);
1085 
1086 	PG_FREE_IF_COPY(in1, 0);
1087 	PG_FREE_IF_COPY(in2, 1);
1088 	PG_RETURN_FLOAT4(res);
1089 }
1090 
1091 Datum
similarity_dist(PG_FUNCTION_ARGS)1092 similarity_dist(PG_FUNCTION_ARGS)
1093 {
1094 	float4		res = DatumGetFloat4(DirectFunctionCall2(similarity,
1095 														 PG_GETARG_DATUM(0),
1096 														 PG_GETARG_DATUM(1)));
1097 
1098 	PG_RETURN_FLOAT4(1.0 - res);
1099 }
1100 
1101 Datum
similarity_op(PG_FUNCTION_ARGS)1102 similarity_op(PG_FUNCTION_ARGS)
1103 {
1104 	float4		res = DatumGetFloat4(DirectFunctionCall2(similarity,
1105 														 PG_GETARG_DATUM(0),
1106 														 PG_GETARG_DATUM(1)));
1107 
1108 	PG_RETURN_BOOL(res >= similarity_threshold);
1109 }
1110 
1111 Datum
word_similarity_op(PG_FUNCTION_ARGS)1112 word_similarity_op(PG_FUNCTION_ARGS)
1113 {
1114 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1115 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1116 	float4		res;
1117 
1118 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1119 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1120 							   true);
1121 
1122 	PG_FREE_IF_COPY(in1, 0);
1123 	PG_FREE_IF_COPY(in2, 1);
1124 	PG_RETURN_BOOL(res >= word_similarity_threshold);
1125 }
1126 
1127 Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)1128 word_similarity_commutator_op(PG_FUNCTION_ARGS)
1129 {
1130 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1131 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1132 	float4		res;
1133 
1134 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1135 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1136 							   true);
1137 
1138 	PG_FREE_IF_COPY(in1, 0);
1139 	PG_FREE_IF_COPY(in2, 1);
1140 	PG_RETURN_BOOL(res >= word_similarity_threshold);
1141 }
1142 
1143 Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)1144 word_similarity_dist_op(PG_FUNCTION_ARGS)
1145 {
1146 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1147 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1148 	float4		res;
1149 
1150 	res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1151 							   VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1152 							   false);
1153 
1154 	PG_FREE_IF_COPY(in1, 0);
1155 	PG_FREE_IF_COPY(in2, 1);
1156 	PG_RETURN_FLOAT4(1.0 - res);
1157 }
1158 
1159 Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)1160 word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1161 {
1162 	text	   *in1 = PG_GETARG_TEXT_PP(0);
1163 	text	   *in2 = PG_GETARG_TEXT_PP(1);
1164 	float4		res;
1165 
1166 	res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1167 							   VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1168 							   false);
1169 
1170 	PG_FREE_IF_COPY(in1, 0);
1171 	PG_FREE_IF_COPY(in2, 1);
1172 	PG_RETURN_FLOAT4(1.0 - res);
1173 }
1174