1 /*
2 * contrib/pg_trgm/trgm_op.c
3 */
4 #include "postgres.h"
5
6 #include <ctype.h>
7
8 #include "trgm.h"
9
10 #include "catalog/pg_type.h"
11 #include "tsearch/ts_locale.h"
12 #include "utils/lsyscache.h"
13 #include "utils/memutils.h"
14 #include "utils/pg_crc.h"
15
16 PG_MODULE_MAGIC;
17
18 /* GUC variables */
19 double similarity_threshold = 0.3f;
20 double word_similarity_threshold = 0.6f;
21
22 void _PG_init(void);
23
24 PG_FUNCTION_INFO_V1(set_limit);
25 PG_FUNCTION_INFO_V1(show_limit);
26 PG_FUNCTION_INFO_V1(show_trgm);
27 PG_FUNCTION_INFO_V1(similarity);
28 PG_FUNCTION_INFO_V1(word_similarity);
29 PG_FUNCTION_INFO_V1(similarity_dist);
30 PG_FUNCTION_INFO_V1(similarity_op);
31 PG_FUNCTION_INFO_V1(word_similarity_op);
32 PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
33 PG_FUNCTION_INFO_V1(word_similarity_dist_op);
34 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
35
36 /* Trigram with position */
37 typedef struct
38 {
39 trgm trg;
40 int index;
41 } pos_trgm;
42
43 /*
44 * Module load callback
45 */
46 void
_PG_init(void)47 _PG_init(void)
48 {
49 /* Define custom GUC variables. */
50 DefineCustomRealVariable("pg_trgm.similarity_threshold",
51 "Sets the threshold used by the % operator.",
52 "Valid range is 0.0 .. 1.0.",
53 &similarity_threshold,
54 0.3,
55 0.0,
56 1.0,
57 PGC_USERSET,
58 0,
59 NULL,
60 NULL,
61 NULL);
62 DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
63 "Sets the threshold used by the <% operator.",
64 "Valid range is 0.0 .. 1.0.",
65 &word_similarity_threshold,
66 0.6,
67 0.0,
68 1.0,
69 PGC_USERSET,
70 0,
71 NULL,
72 NULL,
73 NULL);
74 }
75
76 /*
77 * Deprecated function.
78 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
79 */
80 Datum
set_limit(PG_FUNCTION_ARGS)81 set_limit(PG_FUNCTION_ARGS)
82 {
83 float4 nlimit = PG_GETARG_FLOAT4(0);
84 char *nlimit_str;
85 Oid func_out_oid;
86 bool is_varlena;
87
88 getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
89
90 nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
91
92 SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
93 PGC_USERSET, PGC_S_SESSION);
94
95 PG_RETURN_FLOAT4(similarity_threshold);
96 }
97
98 /*
99 * Deprecated function.
100 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
101 */
102 Datum
show_limit(PG_FUNCTION_ARGS)103 show_limit(PG_FUNCTION_ARGS)
104 {
105 PG_RETURN_FLOAT4(similarity_threshold);
106 }
107
108 static int
comp_trgm(const void * a,const void * b)109 comp_trgm(const void *a, const void *b)
110 {
111 return CMPTRGM(a, b);
112 }
113
114 static int
unique_array(trgm * a,int len)115 unique_array(trgm *a, int len)
116 {
117 trgm *curend,
118 *tmp;
119
120 curend = tmp = a;
121 while (tmp - a < len)
122 if (CMPTRGM(tmp, curend))
123 {
124 curend++;
125 CPTRGM(curend, tmp);
126 tmp++;
127 }
128 else
129 tmp++;
130
131 return curend + 1 - a;
132 }
133
134 /*
135 * Finds first word in string, returns pointer to the word,
136 * endword points to the character after word
137 */
138 static char *
find_word(char * str,int lenstr,char ** endword,int * charlen)139 find_word(char *str, int lenstr, char **endword, int *charlen)
140 {
141 char *beginword = str;
142
143 while (beginword - str < lenstr && !ISWORDCHR(beginword))
144 beginword += pg_mblen(beginword);
145
146 if (beginword - str >= lenstr)
147 return NULL;
148
149 *endword = beginword;
150 *charlen = 0;
151 while (*endword - str < lenstr && ISWORDCHR(*endword))
152 {
153 *endword += pg_mblen(*endword);
154 (*charlen)++;
155 }
156
157 return beginword;
158 }
159
160 /*
161 * Reduce a trigram (three possibly multi-byte characters) to a trgm,
162 * which is always exactly three bytes. If we have three single-byte
163 * characters, we just use them as-is; otherwise we form a hash value.
164 */
165 void
compact_trigram(trgm * tptr,char * str,int bytelen)166 compact_trigram(trgm *tptr, char *str, int bytelen)
167 {
168 if (bytelen == 3)
169 {
170 CPTRGM(tptr, str);
171 }
172 else
173 {
174 pg_crc32 crc;
175
176 INIT_LEGACY_CRC32(crc);
177 COMP_LEGACY_CRC32(crc, str, bytelen);
178 FIN_LEGACY_CRC32(crc);
179
180 /*
181 * use only 3 upper bytes from crc, hope, it's good enough hashing
182 */
183 CPTRGM(tptr, &crc);
184 }
185 }
186
187 /*
188 * Adds trigrams from words (already padded).
189 */
190 static trgm *
make_trigrams(trgm * tptr,char * str,int bytelen,int charlen)191 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
192 {
193 char *ptr = str;
194
195 if (charlen < 3)
196 return tptr;
197
198 if (bytelen > charlen)
199 {
200 /* Find multibyte character boundaries and apply compact_trigram */
201 int lenfirst = pg_mblen(str),
202 lenmiddle = pg_mblen(str + lenfirst),
203 lenlast = pg_mblen(str + lenfirst + lenmiddle);
204
205 while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
206 {
207 compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
208
209 ptr += lenfirst;
210 tptr++;
211
212 lenfirst = lenmiddle;
213 lenmiddle = lenlast;
214 lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
215 }
216 }
217 else
218 {
219 /* Fast path when there are no multibyte characters */
220 Assert(bytelen == charlen);
221
222 while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
223 {
224 CPTRGM(tptr, ptr);
225 ptr++;
226 tptr++;
227 }
228 }
229
230 return tptr;
231 }
232
233 /*
234 * Make array of trigrams without sorting and removing duplicate items.
235 *
236 * trg: where to return the array of trigrams.
237 * str: source string, of length slen bytes.
238 *
239 * Returns length of the generated array.
240 */
241 static int
generate_trgm_only(trgm * trg,char * str,int slen)242 generate_trgm_only(trgm *trg, char *str, int slen)
243 {
244 trgm *tptr;
245 char *buf;
246 int charlen,
247 bytelen;
248 char *bword,
249 *eword;
250
251 if (slen + LPADDING + RPADDING < 3 || slen == 0)
252 return 0;
253
254 tptr = trg;
255
256 /* Allocate a buffer for case-folded, blank-padded words */
257 buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
258
259 if (LPADDING > 0)
260 {
261 *buf = ' ';
262 if (LPADDING > 1)
263 *(buf + 1) = ' ';
264 }
265
266 eword = str;
267 while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
268 {
269 #ifdef IGNORECASE
270 bword = lowerstr_with_len(bword, eword - bword);
271 bytelen = strlen(bword);
272 #else
273 bytelen = eword - bword;
274 #endif
275
276 memcpy(buf + LPADDING, bword, bytelen);
277
278 #ifdef IGNORECASE
279 pfree(bword);
280 #endif
281
282 buf[LPADDING + bytelen] = ' ';
283 buf[LPADDING + bytelen + 1] = ' ';
284
285 /*
286 * count trigrams
287 */
288 tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
289 charlen + LPADDING + RPADDING);
290 }
291
292 pfree(buf);
293
294 return tptr - trg;
295 }
296
297 /*
298 * Guard against possible overflow in the palloc requests below. (We
299 * don't worry about the additive constants, since palloc can detect
300 * requests that are a little above MaxAllocSize --- we just need to
301 * prevent integer overflow in the multiplications.)
302 */
303 static void
protect_out_of_mem(int slen)304 protect_out_of_mem(int slen)
305 {
306 if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
307 (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
308 ereport(ERROR,
309 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
310 errmsg("out of memory")));
311 }
312
313 /*
314 * Make array of trigrams with sorting and removing duplicate items.
315 *
316 * str: source string, of length slen bytes.
317 *
318 * Returns the sorted array of unique trigrams.
319 */
320 TRGM *
generate_trgm(char * str,int slen)321 generate_trgm(char *str, int slen)
322 {
323 TRGM *trg;
324 int len;
325
326 protect_out_of_mem(slen);
327
328 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
329 trg->flag = ARRKEY;
330
331 len = generate_trgm_only(GETARR(trg), str, slen);
332 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
333
334 if (len == 0)
335 return trg;
336
337 /*
338 * Make trigrams unique.
339 */
340 if (len > 1)
341 {
342 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
343 len = unique_array(GETARR(trg), len);
344 }
345
346 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
347
348 return trg;
349 }
350
351 /*
352 * Make array of positional trigrams from two trigram arrays trg1 and trg2.
353 *
354 * trg1: trigram array of search pattern, of length len1. trg1 is required
355 * word which positions don't matter and replaced with -1.
356 * trg2: trigram array of text, of length len2. trg2 is haystack where we
357 * search and have to store its positions.
358 *
359 * Returns concatenated trigram array.
360 */
361 static pos_trgm *
make_positional_trgm(trgm * trg1,int len1,trgm * trg2,int len2)362 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
363 {
364 pos_trgm *result;
365 int i,
366 len = len1 + len2;
367
368 result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
369
370 for (i = 0; i < len1; i++)
371 {
372 memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
373 result[i].index = -1;
374 }
375
376 for (i = 0; i < len2; i++)
377 {
378 memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
379 result[i + len1].index = i;
380 }
381
382 return result;
383 }
384
385 /*
386 * Compare position trigrams: compare trigrams first and position second.
387 */
388 static int
comp_ptrgm(const void * v1,const void * v2)389 comp_ptrgm(const void *v1, const void *v2)
390 {
391 const pos_trgm *p1 = (const pos_trgm *) v1;
392 const pos_trgm *p2 = (const pos_trgm *) v2;
393 int cmp;
394
395 cmp = CMPTRGM(p1->trg, p2->trg);
396 if (cmp != 0)
397 return cmp;
398
399 if (p1->index < p2->index)
400 return -1;
401 else if (p1->index == p2->index)
402 return 0;
403 else
404 return 1;
405 }
406
407 /*
408 * Iterative search function which calculates maximum similarity with word in
409 * the string. But maximum similarity is calculated only if check_only == false.
410 *
411 * trg2indexes: array which stores indexes of the array "found".
412 * found: array which stores true of false values.
413 * ulen1: count of unique trigrams of array "trg1".
414 * len2: length of array "trg2" and array "trg2indexes".
415 * len: length of the array "found".
416 * check_only: if true then only check existence of similar search pattern in
417 * text.
418 *
419 * Returns word similarity.
420 */
421 static float4
iterate_word_similarity(int * trg2indexes,bool * found,int ulen1,int len2,int len,bool check_only)422 iterate_word_similarity(int *trg2indexes,
423 bool *found,
424 int ulen1,
425 int len2,
426 int len,
427 bool check_only)
428 {
429 int *lastpos,
430 i,
431 ulen2 = 0,
432 count = 0,
433 upper = -1,
434 lower = -1;
435 float4 smlr_cur,
436 smlr_max = 0.0f;
437
438 /* Memorise last position of each trigram */
439 lastpos = (int *) palloc(sizeof(int) * len);
440 memset(lastpos, -1, sizeof(int) * len);
441
442 for (i = 0; i < len2; i++)
443 {
444 /* Get index of next trigram */
445 int trgindex = trg2indexes[i];
446
447 /* Update last position of this trigram */
448 if (lower >= 0 || found[trgindex])
449 {
450 if (lastpos[trgindex] < 0)
451 {
452 ulen2++;
453 if (found[trgindex])
454 count++;
455 }
456 lastpos[trgindex] = i;
457 }
458
459 /* Adjust upper bound if this trigram is present in required substring */
460 if (found[trgindex])
461 {
462 int prev_lower,
463 tmp_ulen2,
464 tmp_lower,
465 tmp_count;
466
467 upper = i;
468 if (lower == -1)
469 {
470 lower = i;
471 ulen2 = 1;
472 }
473
474 smlr_cur = CALCSML(count, ulen1, ulen2);
475
476 /* Also try to adjust lower bound for greater similarity */
477 tmp_count = count;
478 tmp_ulen2 = ulen2;
479 prev_lower = lower;
480 for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
481 {
482 float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
483 int tmp_trgindex;
484
485 if (smlr_tmp > smlr_cur)
486 {
487 smlr_cur = smlr_tmp;
488 ulen2 = tmp_ulen2;
489 lower = tmp_lower;
490 count = tmp_count;
491 }
492
493 /*
494 * if we only check that word similarity is greater than
495 * pg_trgm.word_similarity_threshold we do not need to
496 * calculate a maximum similarity.
497 */
498 if (check_only && smlr_cur >= word_similarity_threshold)
499 break;
500
501 tmp_trgindex = trg2indexes[tmp_lower];
502 if (lastpos[tmp_trgindex] == tmp_lower)
503 {
504 tmp_ulen2--;
505 if (found[tmp_trgindex])
506 tmp_count--;
507 }
508 }
509
510 smlr_max = Max(smlr_max, smlr_cur);
511
512 /*
513 * if we only check that word similarity is greater than
514 * pg_trgm.word_similarity_threshold we do not need to calculate a
515 * maximum similarity
516 */
517 if (check_only && smlr_max >= word_similarity_threshold)
518 break;
519
520 for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
521 {
522 int tmp_trgindex;
523
524 tmp_trgindex = trg2indexes[tmp_lower];
525 if (lastpos[tmp_trgindex] == tmp_lower)
526 lastpos[tmp_trgindex] = -1;
527 }
528 }
529 }
530
531 pfree(lastpos);
532
533 return smlr_max;
534 }
535
536 /*
537 * Calculate word similarity.
538 * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
539 * are used to calculate word similarity using iterate_word_similarity().
540 *
541 * "trg2indexes" is array which stores indexes of the array "found".
542 * In other words:
543 * trg2indexes[j] = i;
544 * found[i] = true (or false);
545 * If found[i] == true then there is trigram trg2[j] in array "trg1".
546 * If found[i] == false then there is not trigram trg2[j] in array "trg1".
547 *
548 * str1: search pattern string, of length slen1 bytes.
549 * str2: text in which we are looking for a word, of length slen2 bytes.
550 * check_only: if true then only check existence of similar search pattern in
551 * text.
552 *
553 * Returns word similarity.
554 */
555 static float4
calc_word_similarity(char * str1,int slen1,char * str2,int slen2,bool check_only)556 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
557 bool check_only)
558 {
559 bool *found;
560 pos_trgm *ptrg;
561 trgm *trg1;
562 trgm *trg2;
563 int len1,
564 len2,
565 len,
566 i,
567 j,
568 ulen1;
569 int *trg2indexes;
570 float4 result;
571
572 protect_out_of_mem(slen1 + slen2);
573
574 /* Make positional trigrams */
575 trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) *3);
576 trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) *3);
577
578 len1 = generate_trgm_only(trg1, str1, slen1);
579 len2 = generate_trgm_only(trg2, str2, slen2);
580
581 ptrg = make_positional_trgm(trg1, len1, trg2, len2);
582 len = len1 + len2;
583 qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
584
585 pfree(trg1);
586 pfree(trg2);
587
588 /*
589 * Merge positional trigrams array: enumerate each trigram and find its
590 * presence in required word.
591 */
592 trg2indexes = (int *) palloc(sizeof(int) * len2);
593 found = (bool *) palloc0(sizeof(bool) * len);
594
595 ulen1 = 0;
596 j = 0;
597 for (i = 0; i < len; i++)
598 {
599 if (i > 0)
600 {
601 int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
602
603 if (cmp != 0)
604 {
605 if (found[j])
606 ulen1++;
607 j++;
608 }
609 }
610
611 if (ptrg[i].index >= 0)
612 {
613 trg2indexes[ptrg[i].index] = j;
614 }
615 else
616 {
617 found[j] = true;
618 }
619 }
620 if (found[j])
621 ulen1++;
622
623 /* Run iterative procedure to find maximum similarity with word */
624 result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
625 check_only);
626
627 pfree(trg2indexes);
628 pfree(found);
629 pfree(ptrg);
630
631 return result;
632 }
633
634
635 /*
636 * Extract the next non-wildcard part of a search string, i.e. a word bounded
637 * by '_' or '%' meta-characters, non-word characters or string end.
638 *
639 * str: source string, of length lenstr bytes (need not be null-terminated)
640 * buf: where to return the substring (must be long enough)
641 * *bytelen: receives byte length of the found substring
642 * *charlen: receives character length of the found substring
643 *
644 * Returns pointer to end+1 of the found substring in the source string.
645 * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
646 *
647 * If the found word is bounded by non-word characters or string boundaries
648 * then this function will include corresponding padding spaces into buf.
649 */
650 static const char *
get_wildcard_part(const char * str,int lenstr,char * buf,int * bytelen,int * charlen)651 get_wildcard_part(const char *str, int lenstr,
652 char *buf, int *bytelen, int *charlen)
653 {
654 const char *beginword = str;
655 const char *endword;
656 char *s = buf;
657 bool in_leading_wildcard_meta = false;
658 bool in_trailing_wildcard_meta = false;
659 bool in_escape = false;
660 int clen;
661
662 /*
663 * Find the first word character, remembering whether preceding character
664 * was wildcard meta-character. Note that the in_escape state persists
665 * from this loop to the next one, since we may exit at a word character
666 * that is in_escape.
667 */
668 while (beginword - str < lenstr)
669 {
670 if (in_escape)
671 {
672 if (ISWORDCHR(beginword))
673 break;
674 in_escape = false;
675 in_leading_wildcard_meta = false;
676 }
677 else
678 {
679 if (ISESCAPECHAR(beginword))
680 in_escape = true;
681 else if (ISWILDCARDCHAR(beginword))
682 in_leading_wildcard_meta = true;
683 else if (ISWORDCHR(beginword))
684 break;
685 else
686 in_leading_wildcard_meta = false;
687 }
688 beginword += pg_mblen(beginword);
689 }
690
691 /*
692 * Handle string end.
693 */
694 if (beginword - str >= lenstr)
695 return NULL;
696
697 /*
698 * Add left padding spaces if preceding character wasn't wildcard
699 * meta-character.
700 */
701 *charlen = 0;
702 if (!in_leading_wildcard_meta)
703 {
704 if (LPADDING > 0)
705 {
706 *s++ = ' ';
707 (*charlen)++;
708 if (LPADDING > 1)
709 {
710 *s++ = ' ';
711 (*charlen)++;
712 }
713 }
714 }
715
716 /*
717 * Copy data into buf until wildcard meta-character, non-word character or
718 * string boundary. Strip escapes during copy.
719 */
720 endword = beginword;
721 while (endword - str < lenstr)
722 {
723 clen = pg_mblen(endword);
724 if (in_escape)
725 {
726 if (ISWORDCHR(endword))
727 {
728 memcpy(s, endword, clen);
729 (*charlen)++;
730 s += clen;
731 }
732 else
733 {
734 /*
735 * Back up endword to the escape character when stopping at an
736 * escaped char, so that subsequent get_wildcard_part will
737 * restart from the escape character. We assume here that
738 * escape chars are single-byte.
739 */
740 endword--;
741 break;
742 }
743 in_escape = false;
744 }
745 else
746 {
747 if (ISESCAPECHAR(endword))
748 in_escape = true;
749 else if (ISWILDCARDCHAR(endword))
750 {
751 in_trailing_wildcard_meta = true;
752 break;
753 }
754 else if (ISWORDCHR(endword))
755 {
756 memcpy(s, endword, clen);
757 (*charlen)++;
758 s += clen;
759 }
760 else
761 break;
762 }
763 endword += clen;
764 }
765
766 /*
767 * Add right padding spaces if next character isn't wildcard
768 * meta-character.
769 */
770 if (!in_trailing_wildcard_meta)
771 {
772 if (RPADDING > 0)
773 {
774 *s++ = ' ';
775 (*charlen)++;
776 if (RPADDING > 1)
777 {
778 *s++ = ' ';
779 (*charlen)++;
780 }
781 }
782 }
783
784 *bytelen = s - buf;
785 return endword;
786 }
787
788 /*
789 * Generates trigrams for wildcard search string.
790 *
791 * Returns array of trigrams that must occur in any string that matches the
792 * wildcard string. For example, given pattern "a%bcd%" the trigrams
793 * " a", "bcd" would be extracted.
794 */
795 TRGM *
generate_wildcard_trgm(const char * str,int slen)796 generate_wildcard_trgm(const char *str, int slen)
797 {
798 TRGM *trg;
799 char *buf,
800 *buf2;
801 trgm *tptr;
802 int len,
803 charlen,
804 bytelen;
805 const char *eword;
806
807 protect_out_of_mem(slen);
808
809 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
810 trg->flag = ARRKEY;
811 SET_VARSIZE(trg, TRGMHDRSIZE);
812
813 if (slen + LPADDING + RPADDING < 3 || slen == 0)
814 return trg;
815
816 tptr = GETARR(trg);
817
818 /* Allocate a buffer for blank-padded, but not yet case-folded, words */
819 buf = palloc(sizeof(char) * (slen + 4));
820
821 /*
822 * Extract trigrams from each substring extracted by get_wildcard_part.
823 */
824 eword = str;
825 while ((eword = get_wildcard_part(eword, slen - (eword - str),
826 buf, &bytelen, &charlen)) != NULL)
827 {
828 #ifdef IGNORECASE
829 buf2 = lowerstr_with_len(buf, bytelen);
830 bytelen = strlen(buf2);
831 #else
832 buf2 = buf;
833 #endif
834
835 /*
836 * count trigrams
837 */
838 tptr = make_trigrams(tptr, buf2, bytelen, charlen);
839
840 #ifdef IGNORECASE
841 pfree(buf2);
842 #endif
843 }
844
845 pfree(buf);
846
847 if ((len = tptr - GETARR(trg)) == 0)
848 return trg;
849
850 /*
851 * Make trigrams unique.
852 */
853 if (len > 1)
854 {
855 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
856 len = unique_array(GETARR(trg), len);
857 }
858
859 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
860
861 return trg;
862 }
863
864 uint32
trgm2int(trgm * ptr)865 trgm2int(trgm *ptr)
866 {
867 uint32 val = 0;
868
869 val |= *(((unsigned char *) ptr));
870 val <<= 8;
871 val |= *(((unsigned char *) ptr) + 1);
872 val <<= 8;
873 val |= *(((unsigned char *) ptr) + 2);
874
875 return val;
876 }
877
878 Datum
show_trgm(PG_FUNCTION_ARGS)879 show_trgm(PG_FUNCTION_ARGS)
880 {
881 text *in = PG_GETARG_TEXT_P(0);
882 TRGM *trg;
883 Datum *d;
884 ArrayType *a;
885 trgm *ptr;
886 int i;
887
888 trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
889 d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
890
891 for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
892 {
893 text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
894
895 if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
896 {
897 snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
898 SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
899 }
900 else
901 {
902 SET_VARSIZE(item, VARHDRSZ + 3);
903 CPTRGM(VARDATA(item), ptr);
904 }
905 d[i] = PointerGetDatum(item);
906 }
907
908 a = construct_array(
909 d,
910 ARRNELEM(trg),
911 TEXTOID,
912 -1,
913 false,
914 'i'
915 );
916
917 for (i = 0; i < ARRNELEM(trg); i++)
918 pfree(DatumGetPointer(d[i]));
919
920 pfree(d);
921 pfree(trg);
922 PG_FREE_IF_COPY(in, 0);
923
924 PG_RETURN_POINTER(a);
925 }
926
927 float4
cnt_sml(TRGM * trg1,TRGM * trg2,bool inexact)928 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
929 {
930 trgm *ptr1,
931 *ptr2;
932 int count = 0;
933 int len1,
934 len2;
935
936 ptr1 = GETARR(trg1);
937 ptr2 = GETARR(trg2);
938
939 len1 = ARRNELEM(trg1);
940 len2 = ARRNELEM(trg2);
941
942 /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
943 if (len1 <= 0 || len2 <= 0)
944 return (float4) 0.0;
945
946 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
947 {
948 int res = CMPTRGM(ptr1, ptr2);
949
950 if (res < 0)
951 ptr1++;
952 else if (res > 0)
953 ptr2++;
954 else
955 {
956 ptr1++;
957 ptr2++;
958 count++;
959 }
960 }
961
962 /*
963 * If inexact then len2 is equal to count, because we don't know actual
964 * length of second string in inexact search and we can assume that count
965 * is a lower bound of len2.
966 */
967 return CALCSML(count, len1, inexact ? count : len2);
968 }
969
970
971 /*
972 * Returns whether trg2 contains all trigrams in trg1.
973 * This relies on the trigram arrays being sorted.
974 */
975 bool
trgm_contained_by(TRGM * trg1,TRGM * trg2)976 trgm_contained_by(TRGM *trg1, TRGM *trg2)
977 {
978 trgm *ptr1,
979 *ptr2;
980 int len1,
981 len2;
982
983 ptr1 = GETARR(trg1);
984 ptr2 = GETARR(trg2);
985
986 len1 = ARRNELEM(trg1);
987 len2 = ARRNELEM(trg2);
988
989 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
990 {
991 int res = CMPTRGM(ptr1, ptr2);
992
993 if (res < 0)
994 return false;
995 else if (res > 0)
996 ptr2++;
997 else
998 {
999 ptr1++;
1000 ptr2++;
1001 }
1002 }
1003 if (ptr1 - GETARR(trg1) < len1)
1004 return false;
1005 else
1006 return true;
1007 }
1008
1009 /*
1010 * Return a palloc'd boolean array showing, for each trigram in "query",
1011 * whether it is present in the trigram array "key".
1012 * This relies on the "key" array being sorted, but "query" need not be.
1013 */
1014 bool *
trgm_presence_map(TRGM * query,TRGM * key)1015 trgm_presence_map(TRGM *query, TRGM *key)
1016 {
1017 bool *result;
1018 trgm *ptrq = GETARR(query),
1019 *ptrk = GETARR(key);
1020 int lenq = ARRNELEM(query),
1021 lenk = ARRNELEM(key),
1022 i;
1023
1024 result = (bool *) palloc0(lenq * sizeof(bool));
1025
1026 /* for each query trigram, do a binary search in the key array */
1027 for (i = 0; i < lenq; i++)
1028 {
1029 int lo = 0;
1030 int hi = lenk;
1031
1032 while (lo < hi)
1033 {
1034 int mid = (lo + hi) / 2;
1035 int res = CMPTRGM(ptrq, ptrk + mid);
1036
1037 if (res < 0)
1038 hi = mid;
1039 else if (res > 0)
1040 lo = mid + 1;
1041 else
1042 {
1043 result[i] = true;
1044 break;
1045 }
1046 }
1047 ptrq++;
1048 }
1049
1050 return result;
1051 }
1052
1053 Datum
similarity(PG_FUNCTION_ARGS)1054 similarity(PG_FUNCTION_ARGS)
1055 {
1056 text *in1 = PG_GETARG_TEXT_P(0);
1057 text *in2 = PG_GETARG_TEXT_P(1);
1058 TRGM *trg1,
1059 *trg2;
1060 float4 res;
1061
1062 trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
1063 trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
1064
1065 res = cnt_sml(trg1, trg2, false);
1066
1067 pfree(trg1);
1068 pfree(trg2);
1069 PG_FREE_IF_COPY(in1, 0);
1070 PG_FREE_IF_COPY(in2, 1);
1071
1072 PG_RETURN_FLOAT4(res);
1073 }
1074
1075 Datum
word_similarity(PG_FUNCTION_ARGS)1076 word_similarity(PG_FUNCTION_ARGS)
1077 {
1078 text *in1 = PG_GETARG_TEXT_PP(0);
1079 text *in2 = PG_GETARG_TEXT_PP(1);
1080 float4 res;
1081
1082 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1083 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1084 false);
1085
1086 PG_FREE_IF_COPY(in1, 0);
1087 PG_FREE_IF_COPY(in2, 1);
1088 PG_RETURN_FLOAT4(res);
1089 }
1090
1091 Datum
similarity_dist(PG_FUNCTION_ARGS)1092 similarity_dist(PG_FUNCTION_ARGS)
1093 {
1094 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1095 PG_GETARG_DATUM(0),
1096 PG_GETARG_DATUM(1)));
1097
1098 PG_RETURN_FLOAT4(1.0 - res);
1099 }
1100
1101 Datum
similarity_op(PG_FUNCTION_ARGS)1102 similarity_op(PG_FUNCTION_ARGS)
1103 {
1104 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1105 PG_GETARG_DATUM(0),
1106 PG_GETARG_DATUM(1)));
1107
1108 PG_RETURN_BOOL(res >= similarity_threshold);
1109 }
1110
1111 Datum
word_similarity_op(PG_FUNCTION_ARGS)1112 word_similarity_op(PG_FUNCTION_ARGS)
1113 {
1114 text *in1 = PG_GETARG_TEXT_PP(0);
1115 text *in2 = PG_GETARG_TEXT_PP(1);
1116 float4 res;
1117
1118 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1119 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1120 true);
1121
1122 PG_FREE_IF_COPY(in1, 0);
1123 PG_FREE_IF_COPY(in2, 1);
1124 PG_RETURN_BOOL(res >= word_similarity_threshold);
1125 }
1126
1127 Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)1128 word_similarity_commutator_op(PG_FUNCTION_ARGS)
1129 {
1130 text *in1 = PG_GETARG_TEXT_PP(0);
1131 text *in2 = PG_GETARG_TEXT_PP(1);
1132 float4 res;
1133
1134 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1135 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1136 true);
1137
1138 PG_FREE_IF_COPY(in1, 0);
1139 PG_FREE_IF_COPY(in2, 1);
1140 PG_RETURN_BOOL(res >= word_similarity_threshold);
1141 }
1142
1143 Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)1144 word_similarity_dist_op(PG_FUNCTION_ARGS)
1145 {
1146 text *in1 = PG_GETARG_TEXT_PP(0);
1147 text *in2 = PG_GETARG_TEXT_PP(1);
1148 float4 res;
1149
1150 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1151 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1152 false);
1153
1154 PG_FREE_IF_COPY(in1, 0);
1155 PG_FREE_IF_COPY(in2, 1);
1156 PG_RETURN_FLOAT4(1.0 - res);
1157 }
1158
1159 Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)1160 word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1161 {
1162 text *in1 = PG_GETARG_TEXT_PP(0);
1163 text *in2 = PG_GETARG_TEXT_PP(1);
1164 float4 res;
1165
1166 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1167 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1168 false);
1169
1170 PG_FREE_IF_COPY(in1, 0);
1171 PG_FREE_IF_COPY(in2, 1);
1172 PG_RETURN_FLOAT4(1.0 - res);
1173 }
1174