1 /*
2 * contrib/pg_trgm/trgm_op.c
3 */
4 #include "postgres.h"
5
6 #include <ctype.h>
7
8 #include "catalog/pg_type.h"
9 #include "lib/qunique.h"
10 #include "trgm.h"
11 #include "tsearch/ts_locale.h"
12 #include "utils/lsyscache.h"
13 #include "utils/memutils.h"
14 #include "utils/pg_crc.h"
15
16 PG_MODULE_MAGIC;
17
18 /* GUC variables */
19 double similarity_threshold = 0.3f;
20 double word_similarity_threshold = 0.6f;
21 double strict_word_similarity_threshold = 0.5f;
22
23 void _PG_init(void);
24
25 PG_FUNCTION_INFO_V1(set_limit);
26 PG_FUNCTION_INFO_V1(show_limit);
27 PG_FUNCTION_INFO_V1(show_trgm);
28 PG_FUNCTION_INFO_V1(similarity);
29 PG_FUNCTION_INFO_V1(word_similarity);
30 PG_FUNCTION_INFO_V1(strict_word_similarity);
31 PG_FUNCTION_INFO_V1(similarity_dist);
32 PG_FUNCTION_INFO_V1(similarity_op);
33 PG_FUNCTION_INFO_V1(word_similarity_op);
34 PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
35 PG_FUNCTION_INFO_V1(word_similarity_dist_op);
36 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
37 PG_FUNCTION_INFO_V1(strict_word_similarity_op);
38 PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
39 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
40 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
41
42 /* Trigram with position */
43 typedef struct
44 {
45 trgm trg;
46 int index;
47 } pos_trgm;
48
49 /* Trigram bound type */
50 typedef uint8 TrgmBound;
51 #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
52 #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
53
54 /* Word similarity flags */
55 #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
56 * search pattern in text */
57 #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
58 * word bounds */
59
60 /*
61 * Module load callback
62 */
63 void
_PG_init(void)64 _PG_init(void)
65 {
66 /* Define custom GUC variables. */
67 DefineCustomRealVariable("pg_trgm.similarity_threshold",
68 "Sets the threshold used by the % operator.",
69 "Valid range is 0.0 .. 1.0.",
70 &similarity_threshold,
71 0.3,
72 0.0,
73 1.0,
74 PGC_USERSET,
75 0,
76 NULL,
77 NULL,
78 NULL);
79 DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
80 "Sets the threshold used by the <% operator.",
81 "Valid range is 0.0 .. 1.0.",
82 &word_similarity_threshold,
83 0.6,
84 0.0,
85 1.0,
86 PGC_USERSET,
87 0,
88 NULL,
89 NULL,
90 NULL);
91 DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
92 "Sets the threshold used by the <<% operator.",
93 "Valid range is 0.0 .. 1.0.",
94 &strict_word_similarity_threshold,
95 0.5,
96 0.0,
97 1.0,
98 PGC_USERSET,
99 0,
100 NULL,
101 NULL,
102 NULL);
103 }
104
105 /*
106 * Deprecated function.
107 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
108 */
109 Datum
set_limit(PG_FUNCTION_ARGS)110 set_limit(PG_FUNCTION_ARGS)
111 {
112 float4 nlimit = PG_GETARG_FLOAT4(0);
113 char *nlimit_str;
114 Oid func_out_oid;
115 bool is_varlena;
116
117 getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
118
119 nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
120
121 SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
122 PGC_USERSET, PGC_S_SESSION);
123
124 PG_RETURN_FLOAT4(similarity_threshold);
125 }
126
127
128 /*
129 * Get similarity threshold for given index scan strategy number.
130 */
131 double
index_strategy_get_limit(StrategyNumber strategy)132 index_strategy_get_limit(StrategyNumber strategy)
133 {
134 switch (strategy)
135 {
136 case SimilarityStrategyNumber:
137 return similarity_threshold;
138 case WordSimilarityStrategyNumber:
139 return word_similarity_threshold;
140 case StrictWordSimilarityStrategyNumber:
141 return strict_word_similarity_threshold;
142 default:
143 elog(ERROR, "unrecognized strategy number: %d", strategy);
144 break;
145 }
146
147 return 0.0; /* keep compiler quiet */
148 }
149
150 /*
151 * Deprecated function.
152 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
153 */
154 Datum
show_limit(PG_FUNCTION_ARGS)155 show_limit(PG_FUNCTION_ARGS)
156 {
157 PG_RETURN_FLOAT4(similarity_threshold);
158 }
159
160 static int
comp_trgm(const void * a,const void * b)161 comp_trgm(const void *a, const void *b)
162 {
163 return CMPTRGM(a, b);
164 }
165
166 /*
167 * Finds first word in string, returns pointer to the word,
168 * endword points to the character after word
169 */
170 static char *
find_word(char * str,int lenstr,char ** endword,int * charlen)171 find_word(char *str, int lenstr, char **endword, int *charlen)
172 {
173 char *beginword = str;
174
175 while (beginword - str < lenstr && !ISWORDCHR(beginword))
176 beginword += pg_mblen(beginword);
177
178 if (beginword - str >= lenstr)
179 return NULL;
180
181 *endword = beginword;
182 *charlen = 0;
183 while (*endword - str < lenstr && ISWORDCHR(*endword))
184 {
185 *endword += pg_mblen(*endword);
186 (*charlen)++;
187 }
188
189 return beginword;
190 }
191
192 /*
193 * Reduce a trigram (three possibly multi-byte characters) to a trgm,
194 * which is always exactly three bytes. If we have three single-byte
195 * characters, we just use them as-is; otherwise we form a hash value.
196 */
197 void
compact_trigram(trgm * tptr,char * str,int bytelen)198 compact_trigram(trgm *tptr, char *str, int bytelen)
199 {
200 if (bytelen == 3)
201 {
202 CPTRGM(tptr, str);
203 }
204 else
205 {
206 pg_crc32 crc;
207
208 INIT_LEGACY_CRC32(crc);
209 COMP_LEGACY_CRC32(crc, str, bytelen);
210 FIN_LEGACY_CRC32(crc);
211
212 /*
213 * use only 3 upper bytes from crc, hope, it's good enough hashing
214 */
215 CPTRGM(tptr, &crc);
216 }
217 }
218
219 /*
220 * Adds trigrams from words (already padded).
221 */
222 static trgm *
make_trigrams(trgm * tptr,char * str,int bytelen,int charlen)223 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
224 {
225 char *ptr = str;
226
227 if (charlen < 3)
228 return tptr;
229
230 if (bytelen > charlen)
231 {
232 /* Find multibyte character boundaries and apply compact_trigram */
233 int lenfirst = pg_mblen(str),
234 lenmiddle = pg_mblen(str + lenfirst),
235 lenlast = pg_mblen(str + lenfirst + lenmiddle);
236
237 while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
238 {
239 compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
240
241 ptr += lenfirst;
242 tptr++;
243
244 lenfirst = lenmiddle;
245 lenmiddle = lenlast;
246 lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
247 }
248 }
249 else
250 {
251 /* Fast path when there are no multibyte characters */
252 Assert(bytelen == charlen);
253
254 while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
255 {
256 CPTRGM(tptr, ptr);
257 ptr++;
258 tptr++;
259 }
260 }
261
262 return tptr;
263 }
264
265 /*
266 * Make array of trigrams without sorting and removing duplicate items.
267 *
268 * trg: where to return the array of trigrams.
269 * str: source string, of length slen bytes.
270 * bounds: where to return bounds of trigrams (if needed).
271 *
272 * Returns length of the generated array.
273 */
274 static int
generate_trgm_only(trgm * trg,char * str,int slen,TrgmBound * bounds)275 generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
276 {
277 trgm *tptr;
278 char *buf;
279 int charlen,
280 bytelen;
281 char *bword,
282 *eword;
283
284 if (slen + LPADDING + RPADDING < 3 || slen == 0)
285 return 0;
286
287 tptr = trg;
288
289 /* Allocate a buffer for case-folded, blank-padded words */
290 buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
291
292 if (LPADDING > 0)
293 {
294 *buf = ' ';
295 if (LPADDING > 1)
296 *(buf + 1) = ' ';
297 }
298
299 eword = str;
300 while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
301 {
302 #ifdef IGNORECASE
303 bword = lowerstr_with_len(bword, eword - bword);
304 bytelen = strlen(bword);
305 #else
306 bytelen = eword - bword;
307 #endif
308
309 memcpy(buf + LPADDING, bword, bytelen);
310
311 #ifdef IGNORECASE
312 pfree(bword);
313 #endif
314
315 buf[LPADDING + bytelen] = ' ';
316 buf[LPADDING + bytelen + 1] = ' ';
317
318 /* Calculate trigrams marking their bounds if needed */
319 if (bounds)
320 bounds[tptr - trg] |= TRGM_BOUND_LEFT;
321 tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
322 charlen + LPADDING + RPADDING);
323 if (bounds)
324 bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
325 }
326
327 pfree(buf);
328
329 return tptr - trg;
330 }
331
332 /*
333 * Guard against possible overflow in the palloc requests below. (We
334 * don't worry about the additive constants, since palloc can detect
335 * requests that are a little above MaxAllocSize --- we just need to
336 * prevent integer overflow in the multiplications.)
337 */
338 static void
protect_out_of_mem(int slen)339 protect_out_of_mem(int slen)
340 {
341 if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
342 (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
343 ereport(ERROR,
344 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
345 errmsg("out of memory")));
346 }
347
348 /*
349 * Make array of trigrams with sorting and removing duplicate items.
350 *
351 * str: source string, of length slen bytes.
352 *
353 * Returns the sorted array of unique trigrams.
354 */
355 TRGM *
generate_trgm(char * str,int slen)356 generate_trgm(char *str, int slen)
357 {
358 TRGM *trg;
359 int len;
360
361 protect_out_of_mem(slen);
362
363 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
364 trg->flag = ARRKEY;
365
366 len = generate_trgm_only(GETARR(trg), str, slen, NULL);
367 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
368
369 if (len == 0)
370 return trg;
371
372 /*
373 * Make trigrams unique.
374 */
375 if (len > 1)
376 {
377 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
378 len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
379 }
380
381 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
382
383 return trg;
384 }
385
386 /*
387 * Make array of positional trigrams from two trigram arrays trg1 and trg2.
388 *
389 * trg1: trigram array of search pattern, of length len1. trg1 is required
390 * word which positions don't matter and replaced with -1.
391 * trg2: trigram array of text, of length len2. trg2 is haystack where we
392 * search and have to store its positions.
393 *
394 * Returns concatenated trigram array.
395 */
396 static pos_trgm *
make_positional_trgm(trgm * trg1,int len1,trgm * trg2,int len2)397 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
398 {
399 pos_trgm *result;
400 int i,
401 len = len1 + len2;
402
403 result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
404
405 for (i = 0; i < len1; i++)
406 {
407 memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
408 result[i].index = -1;
409 }
410
411 for (i = 0; i < len2; i++)
412 {
413 memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
414 result[i + len1].index = i;
415 }
416
417 return result;
418 }
419
420 /*
421 * Compare position trigrams: compare trigrams first and position second.
422 */
423 static int
comp_ptrgm(const void * v1,const void * v2)424 comp_ptrgm(const void *v1, const void *v2)
425 {
426 const pos_trgm *p1 = (const pos_trgm *) v1;
427 const pos_trgm *p2 = (const pos_trgm *) v2;
428 int cmp;
429
430 cmp = CMPTRGM(p1->trg, p2->trg);
431 if (cmp != 0)
432 return cmp;
433
434 if (p1->index < p2->index)
435 return -1;
436 else if (p1->index == p2->index)
437 return 0;
438 else
439 return 1;
440 }
441
442 /*
443 * Iterative search function which calculates maximum similarity with word in
444 * the string. But maximum similarity is calculated only if check_only == false.
445 *
446 * trg2indexes: array which stores indexes of the array "found".
447 * found: array which stores true of false values.
448 * ulen1: count of unique trigrams of array "trg1".
449 * len2: length of array "trg2" and array "trg2indexes".
450 * len: length of the array "found".
451 * lags: set of boolean flags parameterizing similarity calculation.
452 * bounds: whether each trigram is left/right bound of word.
453 *
454 * Returns word similarity.
455 */
456 static float4
iterate_word_similarity(int * trg2indexes,bool * found,int ulen1,int len2,int len,uint8 flags,TrgmBound * bounds)457 iterate_word_similarity(int *trg2indexes,
458 bool *found,
459 int ulen1,
460 int len2,
461 int len,
462 uint8 flags,
463 TrgmBound *bounds)
464 {
465 int *lastpos,
466 i,
467 ulen2 = 0,
468 count = 0,
469 upper = -1,
470 lower;
471 float4 smlr_cur,
472 smlr_max = 0.0f;
473 double threshold;
474
475 Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
476
477 /* Select appropriate threshold */
478 threshold = (flags & WORD_SIMILARITY_STRICT) ?
479 strict_word_similarity_threshold :
480 word_similarity_threshold;
481
482 /*
483 * Consider first trigram as initial lower bound for strict word
484 * similarity, or initialize it later with first trigram present for plain
485 * word similarity.
486 */
487 lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
488
489 /* Memorise last position of each trigram */
490 lastpos = (int *) palloc(sizeof(int) * len);
491 memset(lastpos, -1, sizeof(int) * len);
492
493 for (i = 0; i < len2; i++)
494 {
495 /* Get index of next trigram */
496 int trgindex = trg2indexes[i];
497
498 /* Update last position of this trigram */
499 if (lower >= 0 || found[trgindex])
500 {
501 if (lastpos[trgindex] < 0)
502 {
503 ulen2++;
504 if (found[trgindex])
505 count++;
506 }
507 lastpos[trgindex] = i;
508 }
509
510 /*
511 * Adjust upper bound if trigram is upper bound of word for strict
512 * word similarity, or if trigram is present in required substring for
513 * plain word similarity
514 */
515 if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
516 : found[trgindex])
517 {
518 int prev_lower,
519 tmp_ulen2,
520 tmp_lower,
521 tmp_count;
522
523 upper = i;
524 if (lower == -1)
525 {
526 lower = i;
527 ulen2 = 1;
528 }
529
530 smlr_cur = CALCSML(count, ulen1, ulen2);
531
532 /* Also try to adjust lower bound for greater similarity */
533 tmp_count = count;
534 tmp_ulen2 = ulen2;
535 prev_lower = lower;
536 for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
537 {
538 float smlr_tmp;
539 int tmp_trgindex;
540
541 /*
542 * Adjust lower bound only if trigram is lower bound of word
543 * for strict word similarity, or consider every trigram as
544 * lower bound for plain word similarity.
545 */
546 if (!(flags & WORD_SIMILARITY_STRICT)
547 || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
548 {
549 smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
550 if (smlr_tmp > smlr_cur)
551 {
552 smlr_cur = smlr_tmp;
553 ulen2 = tmp_ulen2;
554 lower = tmp_lower;
555 count = tmp_count;
556 }
557
558 /*
559 * If we only check that word similarity is greater than
560 * threshold we do not need to calculate a maximum
561 * similarity.
562 */
563 if ((flags & WORD_SIMILARITY_CHECK_ONLY)
564 && smlr_cur >= threshold)
565 break;
566 }
567
568 tmp_trgindex = trg2indexes[tmp_lower];
569 if (lastpos[tmp_trgindex] == tmp_lower)
570 {
571 tmp_ulen2--;
572 if (found[tmp_trgindex])
573 tmp_count--;
574 }
575 }
576
577 smlr_max = Max(smlr_max, smlr_cur);
578
579 /*
580 * if we only check that word similarity is greater than threshold
581 * we do not need to calculate a maximum similarity.
582 */
583 if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
584 break;
585
586 for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
587 {
588 int tmp_trgindex;
589
590 tmp_trgindex = trg2indexes[tmp_lower];
591 if (lastpos[tmp_trgindex] == tmp_lower)
592 lastpos[tmp_trgindex] = -1;
593 }
594 }
595 }
596
597 pfree(lastpos);
598
599 return smlr_max;
600 }
601
602 /*
603 * Calculate word similarity.
604 * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
605 * are used to calculate word similarity using iterate_word_similarity().
606 *
607 * "trg2indexes" is array which stores indexes of the array "found".
608 * In other words:
609 * trg2indexes[j] = i;
610 * found[i] = true (or false);
611 * If found[i] == true then there is trigram trg2[j] in array "trg1".
612 * If found[i] == false then there is not trigram trg2[j] in array "trg1".
613 *
614 * str1: search pattern string, of length slen1 bytes.
615 * str2: text in which we are looking for a word, of length slen2 bytes.
616 * flags: set of boolean flags parameterizing similarity calculation.
617 *
618 * Returns word similarity.
619 */
620 static float4
calc_word_similarity(char * str1,int slen1,char * str2,int slen2,uint8 flags)621 calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
622 uint8 flags)
623 {
624 bool *found;
625 pos_trgm *ptrg;
626 trgm *trg1;
627 trgm *trg2;
628 int len1,
629 len2,
630 len,
631 i,
632 j,
633 ulen1;
634 int *trg2indexes;
635 float4 result;
636 TrgmBound *bounds;
637
638 protect_out_of_mem(slen1 + slen2);
639
640 /* Make positional trigrams */
641 trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
642 trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
643 if (flags & WORD_SIMILARITY_STRICT)
644 bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
645 else
646 bounds = NULL;
647
648 len1 = generate_trgm_only(trg1, str1, slen1, NULL);
649 len2 = generate_trgm_only(trg2, str2, slen2, bounds);
650
651 ptrg = make_positional_trgm(trg1, len1, trg2, len2);
652 len = len1 + len2;
653 qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
654
655 pfree(trg1);
656 pfree(trg2);
657
658 /*
659 * Merge positional trigrams array: enumerate each trigram and find its
660 * presence in required word.
661 */
662 trg2indexes = (int *) palloc(sizeof(int) * len2);
663 found = (bool *) palloc0(sizeof(bool) * len);
664
665 ulen1 = 0;
666 j = 0;
667 for (i = 0; i < len; i++)
668 {
669 if (i > 0)
670 {
671 int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
672
673 if (cmp != 0)
674 {
675 if (found[j])
676 ulen1++;
677 j++;
678 }
679 }
680
681 if (ptrg[i].index >= 0)
682 {
683 trg2indexes[ptrg[i].index] = j;
684 }
685 else
686 {
687 found[j] = true;
688 }
689 }
690 if (found[j])
691 ulen1++;
692
693 /* Run iterative procedure to find maximum similarity with word */
694 result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
695 flags, bounds);
696
697 pfree(trg2indexes);
698 pfree(found);
699 pfree(ptrg);
700
701 return result;
702 }
703
704
705 /*
706 * Extract the next non-wildcard part of a search string, i.e. a word bounded
707 * by '_' or '%' meta-characters, non-word characters or string end.
708 *
709 * str: source string, of length lenstr bytes (need not be null-terminated)
710 * buf: where to return the substring (must be long enough)
711 * *bytelen: receives byte length of the found substring
712 * *charlen: receives character length of the found substring
713 *
714 * Returns pointer to end+1 of the found substring in the source string.
715 * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
716 *
717 * If the found word is bounded by non-word characters or string boundaries
718 * then this function will include corresponding padding spaces into buf.
719 */
720 static const char *
get_wildcard_part(const char * str,int lenstr,char * buf,int * bytelen,int * charlen)721 get_wildcard_part(const char *str, int lenstr,
722 char *buf, int *bytelen, int *charlen)
723 {
724 const char *beginword = str;
725 const char *endword;
726 char *s = buf;
727 bool in_leading_wildcard_meta = false;
728 bool in_trailing_wildcard_meta = false;
729 bool in_escape = false;
730 int clen;
731
732 /*
733 * Find the first word character, remembering whether preceding character
734 * was wildcard meta-character. Note that the in_escape state persists
735 * from this loop to the next one, since we may exit at a word character
736 * that is in_escape.
737 */
738 while (beginword - str < lenstr)
739 {
740 if (in_escape)
741 {
742 if (ISWORDCHR(beginword))
743 break;
744 in_escape = false;
745 in_leading_wildcard_meta = false;
746 }
747 else
748 {
749 if (ISESCAPECHAR(beginword))
750 in_escape = true;
751 else if (ISWILDCARDCHAR(beginword))
752 in_leading_wildcard_meta = true;
753 else if (ISWORDCHR(beginword))
754 break;
755 else
756 in_leading_wildcard_meta = false;
757 }
758 beginword += pg_mblen(beginword);
759 }
760
761 /*
762 * Handle string end.
763 */
764 if (beginword - str >= lenstr)
765 return NULL;
766
767 /*
768 * Add left padding spaces if preceding character wasn't wildcard
769 * meta-character.
770 */
771 *charlen = 0;
772 if (!in_leading_wildcard_meta)
773 {
774 if (LPADDING > 0)
775 {
776 *s++ = ' ';
777 (*charlen)++;
778 if (LPADDING > 1)
779 {
780 *s++ = ' ';
781 (*charlen)++;
782 }
783 }
784 }
785
786 /*
787 * Copy data into buf until wildcard meta-character, non-word character or
788 * string boundary. Strip escapes during copy.
789 */
790 endword = beginword;
791 while (endword - str < lenstr)
792 {
793 clen = pg_mblen(endword);
794 if (in_escape)
795 {
796 if (ISWORDCHR(endword))
797 {
798 memcpy(s, endword, clen);
799 (*charlen)++;
800 s += clen;
801 }
802 else
803 {
804 /*
805 * Back up endword to the escape character when stopping at an
806 * escaped char, so that subsequent get_wildcard_part will
807 * restart from the escape character. We assume here that
808 * escape chars are single-byte.
809 */
810 endword--;
811 break;
812 }
813 in_escape = false;
814 }
815 else
816 {
817 if (ISESCAPECHAR(endword))
818 in_escape = true;
819 else if (ISWILDCARDCHAR(endword))
820 {
821 in_trailing_wildcard_meta = true;
822 break;
823 }
824 else if (ISWORDCHR(endword))
825 {
826 memcpy(s, endword, clen);
827 (*charlen)++;
828 s += clen;
829 }
830 else
831 break;
832 }
833 endword += clen;
834 }
835
836 /*
837 * Add right padding spaces if next character isn't wildcard
838 * meta-character.
839 */
840 if (!in_trailing_wildcard_meta)
841 {
842 if (RPADDING > 0)
843 {
844 *s++ = ' ';
845 (*charlen)++;
846 if (RPADDING > 1)
847 {
848 *s++ = ' ';
849 (*charlen)++;
850 }
851 }
852 }
853
854 *bytelen = s - buf;
855 return endword;
856 }
857
858 /*
859 * Generates trigrams for wildcard search string.
860 *
861 * Returns array of trigrams that must occur in any string that matches the
862 * wildcard string. For example, given pattern "a%bcd%" the trigrams
863 * " a", "bcd" would be extracted.
864 */
865 TRGM *
generate_wildcard_trgm(const char * str,int slen)866 generate_wildcard_trgm(const char *str, int slen)
867 {
868 TRGM *trg;
869 char *buf,
870 *buf2;
871 trgm *tptr;
872 int len,
873 charlen,
874 bytelen;
875 const char *eword;
876
877 protect_out_of_mem(slen);
878
879 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
880 trg->flag = ARRKEY;
881 SET_VARSIZE(trg, TRGMHDRSIZE);
882
883 if (slen + LPADDING + RPADDING < 3 || slen == 0)
884 return trg;
885
886 tptr = GETARR(trg);
887
888 /* Allocate a buffer for blank-padded, but not yet case-folded, words */
889 buf = palloc(sizeof(char) * (slen + 4));
890
891 /*
892 * Extract trigrams from each substring extracted by get_wildcard_part.
893 */
894 eword = str;
895 while ((eword = get_wildcard_part(eword, slen - (eword - str),
896 buf, &bytelen, &charlen)) != NULL)
897 {
898 #ifdef IGNORECASE
899 buf2 = lowerstr_with_len(buf, bytelen);
900 bytelen = strlen(buf2);
901 #else
902 buf2 = buf;
903 #endif
904
905 /*
906 * count trigrams
907 */
908 tptr = make_trigrams(tptr, buf2, bytelen, charlen);
909
910 #ifdef IGNORECASE
911 pfree(buf2);
912 #endif
913 }
914
915 pfree(buf);
916
917 if ((len = tptr - GETARR(trg)) == 0)
918 return trg;
919
920 /*
921 * Make trigrams unique.
922 */
923 if (len > 1)
924 {
925 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
926 len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
927 }
928
929 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
930
931 return trg;
932 }
933
934 uint32
trgm2int(trgm * ptr)935 trgm2int(trgm *ptr)
936 {
937 uint32 val = 0;
938
939 val |= *(((unsigned char *) ptr));
940 val <<= 8;
941 val |= *(((unsigned char *) ptr) + 1);
942 val <<= 8;
943 val |= *(((unsigned char *) ptr) + 2);
944
945 return val;
946 }
947
948 Datum
show_trgm(PG_FUNCTION_ARGS)949 show_trgm(PG_FUNCTION_ARGS)
950 {
951 text *in = PG_GETARG_TEXT_PP(0);
952 TRGM *trg;
953 Datum *d;
954 ArrayType *a;
955 trgm *ptr;
956 int i;
957
958 trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
959 d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
960
961 for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
962 {
963 text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
964
965 if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
966 {
967 snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
968 SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
969 }
970 else
971 {
972 SET_VARSIZE(item, VARHDRSZ + 3);
973 CPTRGM(VARDATA(item), ptr);
974 }
975 d[i] = PointerGetDatum(item);
976 }
977
978 a = construct_array(d,
979 ARRNELEM(trg),
980 TEXTOID,
981 -1,
982 false,
983 TYPALIGN_INT);
984
985 for (i = 0; i < ARRNELEM(trg); i++)
986 pfree(DatumGetPointer(d[i]));
987
988 pfree(d);
989 pfree(trg);
990 PG_FREE_IF_COPY(in, 0);
991
992 PG_RETURN_POINTER(a);
993 }
994
995 float4
cnt_sml(TRGM * trg1,TRGM * trg2,bool inexact)996 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
997 {
998 trgm *ptr1,
999 *ptr2;
1000 int count = 0;
1001 int len1,
1002 len2;
1003
1004 ptr1 = GETARR(trg1);
1005 ptr2 = GETARR(trg2);
1006
1007 len1 = ARRNELEM(trg1);
1008 len2 = ARRNELEM(trg2);
1009
1010 /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1011 if (len1 <= 0 || len2 <= 0)
1012 return (float4) 0.0;
1013
1014 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1015 {
1016 int res = CMPTRGM(ptr1, ptr2);
1017
1018 if (res < 0)
1019 ptr1++;
1020 else if (res > 0)
1021 ptr2++;
1022 else
1023 {
1024 ptr1++;
1025 ptr2++;
1026 count++;
1027 }
1028 }
1029
1030 /*
1031 * If inexact then len2 is equal to count, because we don't know actual
1032 * length of second string in inexact search and we can assume that count
1033 * is a lower bound of len2.
1034 */
1035 return CALCSML(count, len1, inexact ? count : len2);
1036 }
1037
1038
1039 /*
1040 * Returns whether trg2 contains all trigrams in trg1.
1041 * This relies on the trigram arrays being sorted.
1042 */
1043 bool
trgm_contained_by(TRGM * trg1,TRGM * trg2)1044 trgm_contained_by(TRGM *trg1, TRGM *trg2)
1045 {
1046 trgm *ptr1,
1047 *ptr2;
1048 int len1,
1049 len2;
1050
1051 ptr1 = GETARR(trg1);
1052 ptr2 = GETARR(trg2);
1053
1054 len1 = ARRNELEM(trg1);
1055 len2 = ARRNELEM(trg2);
1056
1057 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1058 {
1059 int res = CMPTRGM(ptr1, ptr2);
1060
1061 if (res < 0)
1062 return false;
1063 else if (res > 0)
1064 ptr2++;
1065 else
1066 {
1067 ptr1++;
1068 ptr2++;
1069 }
1070 }
1071 if (ptr1 - GETARR(trg1) < len1)
1072 return false;
1073 else
1074 return true;
1075 }
1076
1077 /*
1078 * Return a palloc'd boolean array showing, for each trigram in "query",
1079 * whether it is present in the trigram array "key".
1080 * This relies on the "key" array being sorted, but "query" need not be.
1081 */
1082 bool *
trgm_presence_map(TRGM * query,TRGM * key)1083 trgm_presence_map(TRGM *query, TRGM *key)
1084 {
1085 bool *result;
1086 trgm *ptrq = GETARR(query),
1087 *ptrk = GETARR(key);
1088 int lenq = ARRNELEM(query),
1089 lenk = ARRNELEM(key),
1090 i;
1091
1092 result = (bool *) palloc0(lenq * sizeof(bool));
1093
1094 /* for each query trigram, do a binary search in the key array */
1095 for (i = 0; i < lenq; i++)
1096 {
1097 int lo = 0;
1098 int hi = lenk;
1099
1100 while (lo < hi)
1101 {
1102 int mid = (lo + hi) / 2;
1103 int res = CMPTRGM(ptrq, ptrk + mid);
1104
1105 if (res < 0)
1106 hi = mid;
1107 else if (res > 0)
1108 lo = mid + 1;
1109 else
1110 {
1111 result[i] = true;
1112 break;
1113 }
1114 }
1115 ptrq++;
1116 }
1117
1118 return result;
1119 }
1120
1121 Datum
similarity(PG_FUNCTION_ARGS)1122 similarity(PG_FUNCTION_ARGS)
1123 {
1124 text *in1 = PG_GETARG_TEXT_PP(0);
1125 text *in2 = PG_GETARG_TEXT_PP(1);
1126 TRGM *trg1,
1127 *trg2;
1128 float4 res;
1129
1130 trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1131 trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1132
1133 res = cnt_sml(trg1, trg2, false);
1134
1135 pfree(trg1);
1136 pfree(trg2);
1137 PG_FREE_IF_COPY(in1, 0);
1138 PG_FREE_IF_COPY(in2, 1);
1139
1140 PG_RETURN_FLOAT4(res);
1141 }
1142
1143 Datum
word_similarity(PG_FUNCTION_ARGS)1144 word_similarity(PG_FUNCTION_ARGS)
1145 {
1146 text *in1 = PG_GETARG_TEXT_PP(0);
1147 text *in2 = PG_GETARG_TEXT_PP(1);
1148 float4 res;
1149
1150 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1151 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1152 0);
1153
1154 PG_FREE_IF_COPY(in1, 0);
1155 PG_FREE_IF_COPY(in2, 1);
1156 PG_RETURN_FLOAT4(res);
1157 }
1158
1159 Datum
strict_word_similarity(PG_FUNCTION_ARGS)1160 strict_word_similarity(PG_FUNCTION_ARGS)
1161 {
1162 text *in1 = PG_GETARG_TEXT_PP(0);
1163 text *in2 = PG_GETARG_TEXT_PP(1);
1164 float4 res;
1165
1166 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1167 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1168 WORD_SIMILARITY_STRICT);
1169
1170 PG_FREE_IF_COPY(in1, 0);
1171 PG_FREE_IF_COPY(in2, 1);
1172 PG_RETURN_FLOAT4(res);
1173 }
1174
1175 Datum
similarity_dist(PG_FUNCTION_ARGS)1176 similarity_dist(PG_FUNCTION_ARGS)
1177 {
1178 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1179 PG_GETARG_DATUM(0),
1180 PG_GETARG_DATUM(1)));
1181
1182 PG_RETURN_FLOAT4(1.0 - res);
1183 }
1184
1185 Datum
similarity_op(PG_FUNCTION_ARGS)1186 similarity_op(PG_FUNCTION_ARGS)
1187 {
1188 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1189 PG_GETARG_DATUM(0),
1190 PG_GETARG_DATUM(1)));
1191
1192 PG_RETURN_BOOL(res >= similarity_threshold);
1193 }
1194
1195 Datum
word_similarity_op(PG_FUNCTION_ARGS)1196 word_similarity_op(PG_FUNCTION_ARGS)
1197 {
1198 text *in1 = PG_GETARG_TEXT_PP(0);
1199 text *in2 = PG_GETARG_TEXT_PP(1);
1200 float4 res;
1201
1202 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1203 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1204 WORD_SIMILARITY_CHECK_ONLY);
1205
1206 PG_FREE_IF_COPY(in1, 0);
1207 PG_FREE_IF_COPY(in2, 1);
1208 PG_RETURN_BOOL(res >= word_similarity_threshold);
1209 }
1210
1211 Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)1212 word_similarity_commutator_op(PG_FUNCTION_ARGS)
1213 {
1214 text *in1 = PG_GETARG_TEXT_PP(0);
1215 text *in2 = PG_GETARG_TEXT_PP(1);
1216 float4 res;
1217
1218 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1219 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1220 WORD_SIMILARITY_CHECK_ONLY);
1221
1222 PG_FREE_IF_COPY(in1, 0);
1223 PG_FREE_IF_COPY(in2, 1);
1224 PG_RETURN_BOOL(res >= word_similarity_threshold);
1225 }
1226
1227 Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)1228 word_similarity_dist_op(PG_FUNCTION_ARGS)
1229 {
1230 text *in1 = PG_GETARG_TEXT_PP(0);
1231 text *in2 = PG_GETARG_TEXT_PP(1);
1232 float4 res;
1233
1234 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1235 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1236 0);
1237
1238 PG_FREE_IF_COPY(in1, 0);
1239 PG_FREE_IF_COPY(in2, 1);
1240 PG_RETURN_FLOAT4(1.0 - res);
1241 }
1242
1243 Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)1244 word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1245 {
1246 text *in1 = PG_GETARG_TEXT_PP(0);
1247 text *in2 = PG_GETARG_TEXT_PP(1);
1248 float4 res;
1249
1250 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1251 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1252 0);
1253
1254 PG_FREE_IF_COPY(in1, 0);
1255 PG_FREE_IF_COPY(in2, 1);
1256 PG_RETURN_FLOAT4(1.0 - res);
1257 }
1258
1259 Datum
strict_word_similarity_op(PG_FUNCTION_ARGS)1260 strict_word_similarity_op(PG_FUNCTION_ARGS)
1261 {
1262 text *in1 = PG_GETARG_TEXT_PP(0);
1263 text *in2 = PG_GETARG_TEXT_PP(1);
1264 float4 res;
1265
1266 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1267 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1268 WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1269
1270 PG_FREE_IF_COPY(in1, 0);
1271 PG_FREE_IF_COPY(in2, 1);
1272 PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1273 }
1274
1275 Datum
strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)1276 strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1277 {
1278 text *in1 = PG_GETARG_TEXT_PP(0);
1279 text *in2 = PG_GETARG_TEXT_PP(1);
1280 float4 res;
1281
1282 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1283 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1284 WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1285
1286 PG_FREE_IF_COPY(in1, 0);
1287 PG_FREE_IF_COPY(in2, 1);
1288 PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1289 }
1290
1291 Datum
strict_word_similarity_dist_op(PG_FUNCTION_ARGS)1292 strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1293 {
1294 text *in1 = PG_GETARG_TEXT_PP(0);
1295 text *in2 = PG_GETARG_TEXT_PP(1);
1296 float4 res;
1297
1298 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1299 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1300 WORD_SIMILARITY_STRICT);
1301
1302 PG_FREE_IF_COPY(in1, 0);
1303 PG_FREE_IF_COPY(in2, 1);
1304 PG_RETURN_FLOAT4(1.0 - res);
1305 }
1306
1307 Datum
strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)1308 strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1309 {
1310 text *in1 = PG_GETARG_TEXT_PP(0);
1311 text *in2 = PG_GETARG_TEXT_PP(1);
1312 float4 res;
1313
1314 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1315 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1316 WORD_SIMILARITY_STRICT);
1317
1318 PG_FREE_IF_COPY(in1, 0);
1319 PG_FREE_IF_COPY(in2, 1);
1320 PG_RETURN_FLOAT4(1.0 - res);
1321 }
1322