1 /* 2 * contrib/pg_trgm/trgm_op.c 3 */ 4 #include "postgres.h" 5 6 #include <ctype.h> 7 8 #include "catalog/pg_type.h" 9 #include "lib/qunique.h" 10 #include "trgm.h" 11 #include "tsearch/ts_locale.h" 12 #include "utils/lsyscache.h" 13 #include "utils/memutils.h" 14 #include "utils/pg_crc.h" 15 16 PG_MODULE_MAGIC; 17 18 /* GUC variables */ 19 double similarity_threshold = 0.3f; 20 double word_similarity_threshold = 0.6f; 21 double strict_word_similarity_threshold = 0.5f; 22 23 void _PG_init(void); 24 25 PG_FUNCTION_INFO_V1(set_limit); 26 PG_FUNCTION_INFO_V1(show_limit); 27 PG_FUNCTION_INFO_V1(show_trgm); 28 PG_FUNCTION_INFO_V1(similarity); 29 PG_FUNCTION_INFO_V1(word_similarity); 30 PG_FUNCTION_INFO_V1(strict_word_similarity); 31 PG_FUNCTION_INFO_V1(similarity_dist); 32 PG_FUNCTION_INFO_V1(similarity_op); 33 PG_FUNCTION_INFO_V1(word_similarity_op); 34 PG_FUNCTION_INFO_V1(word_similarity_commutator_op); 35 PG_FUNCTION_INFO_V1(word_similarity_dist_op); 36 PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op); 37 PG_FUNCTION_INFO_V1(strict_word_similarity_op); 38 PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op); 39 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op); 40 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op); 41 42 /* Trigram with position */ 43 typedef struct 44 { 45 trgm trg; 46 int index; 47 } pos_trgm; 48 49 /* Trigram bound type */ 50 typedef uint8 TrgmBound; 51 #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */ 52 #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */ 53 54 /* Word similarity flags */ 55 #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar 56 * search pattern in text */ 57 #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match 58 * word bounds */ 59 60 /* 61 * Module load callback 62 */ 63 void 64 _PG_init(void) 65 { 66 /* Define custom GUC variables. */ 67 DefineCustomRealVariable("pg_trgm.similarity_threshold", 68 "Sets the threshold used by the % operator.", 69 "Valid range is 0.0 .. 1.0.", 70 &similarity_threshold, 71 0.3, 72 0.0, 73 1.0, 74 PGC_USERSET, 75 0, 76 NULL, 77 NULL, 78 NULL); 79 DefineCustomRealVariable("pg_trgm.word_similarity_threshold", 80 "Sets the threshold used by the <% operator.", 81 "Valid range is 0.0 .. 1.0.", 82 &word_similarity_threshold, 83 0.6, 84 0.0, 85 1.0, 86 PGC_USERSET, 87 0, 88 NULL, 89 NULL, 90 NULL); 91 DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold", 92 "Sets the threshold used by the <<% operator.", 93 "Valid range is 0.0 .. 1.0.", 94 &strict_word_similarity_threshold, 95 0.5, 96 0.0, 97 1.0, 98 PGC_USERSET, 99 0, 100 NULL, 101 NULL, 102 NULL); 103 } 104 105 /* 106 * Deprecated function. 107 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function. 108 */ 109 Datum 110 set_limit(PG_FUNCTION_ARGS) 111 { 112 float4 nlimit = PG_GETARG_FLOAT4(0); 113 char *nlimit_str; 114 Oid func_out_oid; 115 bool is_varlena; 116 117 getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena); 118 119 nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit)); 120 121 SetConfigOption("pg_trgm.similarity_threshold", nlimit_str, 122 PGC_USERSET, PGC_S_SESSION); 123 124 PG_RETURN_FLOAT4(similarity_threshold); 125 } 126 127 128 /* 129 * Get similarity threshold for given index scan strategy number. 130 */ 131 double 132 index_strategy_get_limit(StrategyNumber strategy) 133 { 134 switch (strategy) 135 { 136 case SimilarityStrategyNumber: 137 return similarity_threshold; 138 case WordSimilarityStrategyNumber: 139 return word_similarity_threshold; 140 case StrictWordSimilarityStrategyNumber: 141 return strict_word_similarity_threshold; 142 default: 143 elog(ERROR, "unrecognized strategy number: %d", strategy); 144 break; 145 } 146 147 return 0.0; /* keep compiler quiet */ 148 } 149 150 /* 151 * Deprecated function. 152 * Use "pg_trgm.similarity_threshold" GUC variable instead of this function. 153 */ 154 Datum 155 show_limit(PG_FUNCTION_ARGS) 156 { 157 PG_RETURN_FLOAT4(similarity_threshold); 158 } 159 160 static int 161 comp_trgm(const void *a, const void *b) 162 { 163 return CMPTRGM(a, b); 164 } 165 166 /* 167 * Finds first word in string, returns pointer to the word, 168 * endword points to the character after word 169 */ 170 static char * 171 find_word(char *str, int lenstr, char **endword, int *charlen) 172 { 173 char *beginword = str; 174 175 while (beginword - str < lenstr && !ISWORDCHR(beginword)) 176 beginword += pg_mblen(beginword); 177 178 if (beginword - str >= lenstr) 179 return NULL; 180 181 *endword = beginword; 182 *charlen = 0; 183 while (*endword - str < lenstr && ISWORDCHR(*endword)) 184 { 185 *endword += pg_mblen(*endword); 186 (*charlen)++; 187 } 188 189 return beginword; 190 } 191 192 /* 193 * Reduce a trigram (three possibly multi-byte characters) to a trgm, 194 * which is always exactly three bytes. If we have three single-byte 195 * characters, we just use them as-is; otherwise we form a hash value. 196 */ 197 void 198 compact_trigram(trgm *tptr, char *str, int bytelen) 199 { 200 if (bytelen == 3) 201 { 202 CPTRGM(tptr, str); 203 } 204 else 205 { 206 pg_crc32 crc; 207 208 INIT_LEGACY_CRC32(crc); 209 COMP_LEGACY_CRC32(crc, str, bytelen); 210 FIN_LEGACY_CRC32(crc); 211 212 /* 213 * use only 3 upper bytes from crc, hope, it's good enough hashing 214 */ 215 CPTRGM(tptr, &crc); 216 } 217 } 218 219 /* 220 * Adds trigrams from words (already padded). 221 */ 222 static trgm * 223 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) 224 { 225 char *ptr = str; 226 227 if (charlen < 3) 228 return tptr; 229 230 if (bytelen > charlen) 231 { 232 /* Find multibyte character boundaries and apply compact_trigram */ 233 int lenfirst = pg_mblen(str), 234 lenmiddle = pg_mblen(str + lenfirst), 235 lenlast = pg_mblen(str + lenfirst + lenmiddle); 236 237 while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) 238 { 239 compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast); 240 241 ptr += lenfirst; 242 tptr++; 243 244 lenfirst = lenmiddle; 245 lenmiddle = lenlast; 246 lenlast = pg_mblen(ptr + lenfirst + lenmiddle); 247 } 248 } 249 else 250 { 251 /* Fast path when there are no multibyte characters */ 252 Assert(bytelen == charlen); 253 254 while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ ) 255 { 256 CPTRGM(tptr, ptr); 257 ptr++; 258 tptr++; 259 } 260 } 261 262 return tptr; 263 } 264 265 /* 266 * Make array of trigrams without sorting and removing duplicate items. 267 * 268 * trg: where to return the array of trigrams. 269 * str: source string, of length slen bytes. 270 * bounds: where to return bounds of trigrams (if needed). 271 * 272 * Returns length of the generated array. 273 */ 274 static int 275 generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) 276 { 277 trgm *tptr; 278 char *buf; 279 int charlen, 280 bytelen; 281 char *bword, 282 *eword; 283 284 if (slen + LPADDING + RPADDING < 3 || slen == 0) 285 return 0; 286 287 tptr = trg; 288 289 /* Allocate a buffer for case-folded, blank-padded words */ 290 buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4); 291 292 if (LPADDING > 0) 293 { 294 *buf = ' '; 295 if (LPADDING > 1) 296 *(buf + 1) = ' '; 297 } 298 299 eword = str; 300 while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL) 301 { 302 #ifdef IGNORECASE 303 bword = lowerstr_with_len(bword, eword - bword); 304 bytelen = strlen(bword); 305 #else 306 bytelen = eword - bword; 307 #endif 308 309 memcpy(buf + LPADDING, bword, bytelen); 310 311 #ifdef IGNORECASE 312 pfree(bword); 313 #endif 314 315 buf[LPADDING + bytelen] = ' '; 316 buf[LPADDING + bytelen + 1] = ' '; 317 318 /* Calculate trigrams marking their bounds if needed */ 319 if (bounds) 320 bounds[tptr - trg] |= TRGM_BOUND_LEFT; 321 tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING, 322 charlen + LPADDING + RPADDING); 323 if (bounds) 324 bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT; 325 } 326 327 pfree(buf); 328 329 return tptr - trg; 330 } 331 332 /* 333 * Guard against possible overflow in the palloc requests below. (We 334 * don't worry about the additive constants, since palloc can detect 335 * requests that are a little above MaxAllocSize --- we just need to 336 * prevent integer overflow in the multiplications.) 337 */ 338 static void 339 protect_out_of_mem(int slen) 340 { 341 if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) || 342 (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length())) 343 ereport(ERROR, 344 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), 345 errmsg("out of memory"))); 346 } 347 348 /* 349 * Make array of trigrams with sorting and removing duplicate items. 350 * 351 * str: source string, of length slen bytes. 352 * 353 * Returns the sorted array of unique trigrams. 354 */ 355 TRGM * 356 generate_trgm(char *str, int slen) 357 { 358 TRGM *trg; 359 int len; 360 361 protect_out_of_mem(slen); 362 363 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); 364 trg->flag = ARRKEY; 365 366 len = generate_trgm_only(GETARR(trg), str, slen, NULL); 367 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); 368 369 if (len == 0) 370 return trg; 371 372 /* 373 * Make trigrams unique. 374 */ 375 if (len > 1) 376 { 377 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); 378 len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm); 379 } 380 381 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); 382 383 return trg; 384 } 385 386 /* 387 * Make array of positional trigrams from two trigram arrays trg1 and trg2. 388 * 389 * trg1: trigram array of search pattern, of length len1. trg1 is required 390 * word which positions don't matter and replaced with -1. 391 * trg2: trigram array of text, of length len2. trg2 is haystack where we 392 * search and have to store its positions. 393 * 394 * Returns concatenated trigram array. 395 */ 396 static pos_trgm * 397 make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2) 398 { 399 pos_trgm *result; 400 int i, 401 len = len1 + len2; 402 403 result = (pos_trgm *) palloc(sizeof(pos_trgm) * len); 404 405 for (i = 0; i < len1; i++) 406 { 407 memcpy(&result[i].trg, &trg1[i], sizeof(trgm)); 408 result[i].index = -1; 409 } 410 411 for (i = 0; i < len2; i++) 412 { 413 memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm)); 414 result[i + len1].index = i; 415 } 416 417 return result; 418 } 419 420 /* 421 * Compare position trigrams: compare trigrams first and position second. 422 */ 423 static int 424 comp_ptrgm(const void *v1, const void *v2) 425 { 426 const pos_trgm *p1 = (const pos_trgm *) v1; 427 const pos_trgm *p2 = (const pos_trgm *) v2; 428 int cmp; 429 430 cmp = CMPTRGM(p1->trg, p2->trg); 431 if (cmp != 0) 432 return cmp; 433 434 if (p1->index < p2->index) 435 return -1; 436 else if (p1->index == p2->index) 437 return 0; 438 else 439 return 1; 440 } 441 442 /* 443 * Iterative search function which calculates maximum similarity with word in 444 * the string. But maximum similarity is calculated only if check_only == false. 445 * 446 * trg2indexes: array which stores indexes of the array "found". 447 * found: array which stores true of false values. 448 * ulen1: count of unique trigrams of array "trg1". 449 * len2: length of array "trg2" and array "trg2indexes". 450 * len: length of the array "found". 451 * lags: set of boolean flags parameterizing similarity calculation. 452 * bounds: whether each trigram is left/right bound of word. 453 * 454 * Returns word similarity. 455 */ 456 static float4 457 iterate_word_similarity(int *trg2indexes, 458 bool *found, 459 int ulen1, 460 int len2, 461 int len, 462 uint8 flags, 463 TrgmBound *bounds) 464 { 465 int *lastpos, 466 i, 467 ulen2 = 0, 468 count = 0, 469 upper = -1, 470 lower; 471 float4 smlr_cur, 472 smlr_max = 0.0f; 473 double threshold; 474 475 Assert(bounds || !(flags & WORD_SIMILARITY_STRICT)); 476 477 /* Select appropriate threshold */ 478 threshold = (flags & WORD_SIMILARITY_STRICT) ? 479 strict_word_similarity_threshold : 480 word_similarity_threshold; 481 482 /* 483 * Consider first trigram as initial lower bound for strict word 484 * similarity, or initialize it later with first trigram present for plain 485 * word similarity. 486 */ 487 lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1; 488 489 /* Memorise last position of each trigram */ 490 lastpos = (int *) palloc(sizeof(int) * len); 491 memset(lastpos, -1, sizeof(int) * len); 492 493 for (i = 0; i < len2; i++) 494 { 495 /* Get index of next trigram */ 496 int trgindex = trg2indexes[i]; 497 498 /* Update last position of this trigram */ 499 if (lower >= 0 || found[trgindex]) 500 { 501 if (lastpos[trgindex] < 0) 502 { 503 ulen2++; 504 if (found[trgindex]) 505 count++; 506 } 507 lastpos[trgindex] = i; 508 } 509 510 /* 511 * Adjust upper bound if trigram is upper bound of word for strict 512 * word similarity, or if trigram is present in required substring for 513 * plain word similarity 514 */ 515 if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT) 516 : found[trgindex]) 517 { 518 int prev_lower, 519 tmp_ulen2, 520 tmp_lower, 521 tmp_count; 522 523 upper = i; 524 if (lower == -1) 525 { 526 lower = i; 527 ulen2 = 1; 528 } 529 530 smlr_cur = CALCSML(count, ulen1, ulen2); 531 532 /* Also try to adjust lower bound for greater similarity */ 533 tmp_count = count; 534 tmp_ulen2 = ulen2; 535 prev_lower = lower; 536 for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++) 537 { 538 float smlr_tmp; 539 int tmp_trgindex; 540 541 /* 542 * Adjust lower bound only if trigram is lower bound of word 543 * for strict word similarity, or consider every trigram as 544 * lower bound for plain word similarity. 545 */ 546 if (!(flags & WORD_SIMILARITY_STRICT) 547 || (bounds[tmp_lower] & TRGM_BOUND_LEFT)) 548 { 549 smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2); 550 if (smlr_tmp > smlr_cur) 551 { 552 smlr_cur = smlr_tmp; 553 ulen2 = tmp_ulen2; 554 lower = tmp_lower; 555 count = tmp_count; 556 } 557 558 /* 559 * If we only check that word similarity is greater than 560 * threshold we do not need to calculate a maximum 561 * similarity. 562 */ 563 if ((flags & WORD_SIMILARITY_CHECK_ONLY) 564 && smlr_cur >= threshold) 565 break; 566 } 567 568 tmp_trgindex = trg2indexes[tmp_lower]; 569 if (lastpos[tmp_trgindex] == tmp_lower) 570 { 571 tmp_ulen2--; 572 if (found[tmp_trgindex]) 573 tmp_count--; 574 } 575 } 576 577 smlr_max = Max(smlr_max, smlr_cur); 578 579 /* 580 * if we only check that word similarity is greater than threshold 581 * we do not need to calculate a maximum similarity. 582 */ 583 if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold) 584 break; 585 586 for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++) 587 { 588 int tmp_trgindex; 589 590 tmp_trgindex = trg2indexes[tmp_lower]; 591 if (lastpos[tmp_trgindex] == tmp_lower) 592 lastpos[tmp_trgindex] = -1; 593 } 594 } 595 } 596 597 pfree(lastpos); 598 599 return smlr_max; 600 } 601 602 /* 603 * Calculate word similarity. 604 * This function prepare two arrays: "trg2indexes" and "found". Then this arrays 605 * are used to calculate word similarity using iterate_word_similarity(). 606 * 607 * "trg2indexes" is array which stores indexes of the array "found". 608 * In other words: 609 * trg2indexes[j] = i; 610 * found[i] = true (or false); 611 * If found[i] == true then there is trigram trg2[j] in array "trg1". 612 * If found[i] == false then there is not trigram trg2[j] in array "trg1". 613 * 614 * str1: search pattern string, of length slen1 bytes. 615 * str2: text in which we are looking for a word, of length slen2 bytes. 616 * flags: set of boolean flags parameterizing similarity calculation. 617 * 618 * Returns word similarity. 619 */ 620 static float4 621 calc_word_similarity(char *str1, int slen1, char *str2, int slen2, 622 uint8 flags) 623 { 624 bool *found; 625 pos_trgm *ptrg; 626 trgm *trg1; 627 trgm *trg2; 628 int len1, 629 len2, 630 len, 631 i, 632 j, 633 ulen1; 634 int *trg2indexes; 635 float4 result; 636 TrgmBound *bounds; 637 638 protect_out_of_mem(slen1 + slen2); 639 640 /* Make positional trigrams */ 641 trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3); 642 trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3); 643 if (flags & WORD_SIMILARITY_STRICT) 644 bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3); 645 else 646 bounds = NULL; 647 648 len1 = generate_trgm_only(trg1, str1, slen1, NULL); 649 len2 = generate_trgm_only(trg2, str2, slen2, bounds); 650 651 ptrg = make_positional_trgm(trg1, len1, trg2, len2); 652 len = len1 + len2; 653 qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm); 654 655 pfree(trg1); 656 pfree(trg2); 657 658 /* 659 * Merge positional trigrams array: enumerate each trigram and find its 660 * presence in required word. 661 */ 662 trg2indexes = (int *) palloc(sizeof(int) * len2); 663 found = (bool *) palloc0(sizeof(bool) * len); 664 665 ulen1 = 0; 666 j = 0; 667 for (i = 0; i < len; i++) 668 { 669 if (i > 0) 670 { 671 int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg); 672 673 if (cmp != 0) 674 { 675 if (found[j]) 676 ulen1++; 677 j++; 678 } 679 } 680 681 if (ptrg[i].index >= 0) 682 { 683 trg2indexes[ptrg[i].index] = j; 684 } 685 else 686 { 687 found[j] = true; 688 } 689 } 690 if (found[j]) 691 ulen1++; 692 693 /* Run iterative procedure to find maximum similarity with word */ 694 result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len, 695 flags, bounds); 696 697 pfree(trg2indexes); 698 pfree(found); 699 pfree(ptrg); 700 701 return result; 702 } 703 704 705 /* 706 * Extract the next non-wildcard part of a search string, i.e. a word bounded 707 * by '_' or '%' meta-characters, non-word characters or string end. 708 * 709 * str: source string, of length lenstr bytes (need not be null-terminated) 710 * buf: where to return the substring (must be long enough) 711 * *bytelen: receives byte length of the found substring 712 * *charlen: receives character length of the found substring 713 * 714 * Returns pointer to end+1 of the found substring in the source string. 715 * Returns NULL if no word found (in which case buf, bytelen, charlen not set) 716 * 717 * If the found word is bounded by non-word characters or string boundaries 718 * then this function will include corresponding padding spaces into buf. 719 */ 720 static const char * 721 get_wildcard_part(const char *str, int lenstr, 722 char *buf, int *bytelen, int *charlen) 723 { 724 const char *beginword = str; 725 const char *endword; 726 char *s = buf; 727 bool in_leading_wildcard_meta = false; 728 bool in_trailing_wildcard_meta = false; 729 bool in_escape = false; 730 int clen; 731 732 /* 733 * Find the first word character, remembering whether preceding character 734 * was wildcard meta-character. Note that the in_escape state persists 735 * from this loop to the next one, since we may exit at a word character 736 * that is in_escape. 737 */ 738 while (beginword - str < lenstr) 739 { 740 if (in_escape) 741 { 742 if (ISWORDCHR(beginword)) 743 break; 744 in_escape = false; 745 in_leading_wildcard_meta = false; 746 } 747 else 748 { 749 if (ISESCAPECHAR(beginword)) 750 in_escape = true; 751 else if (ISWILDCARDCHAR(beginword)) 752 in_leading_wildcard_meta = true; 753 else if (ISWORDCHR(beginword)) 754 break; 755 else 756 in_leading_wildcard_meta = false; 757 } 758 beginword += pg_mblen(beginword); 759 } 760 761 /* 762 * Handle string end. 763 */ 764 if (beginword - str >= lenstr) 765 return NULL; 766 767 /* 768 * Add left padding spaces if preceding character wasn't wildcard 769 * meta-character. 770 */ 771 *charlen = 0; 772 if (!in_leading_wildcard_meta) 773 { 774 if (LPADDING > 0) 775 { 776 *s++ = ' '; 777 (*charlen)++; 778 if (LPADDING > 1) 779 { 780 *s++ = ' '; 781 (*charlen)++; 782 } 783 } 784 } 785 786 /* 787 * Copy data into buf until wildcard meta-character, non-word character or 788 * string boundary. Strip escapes during copy. 789 */ 790 endword = beginword; 791 while (endword - str < lenstr) 792 { 793 clen = pg_mblen(endword); 794 if (in_escape) 795 { 796 if (ISWORDCHR(endword)) 797 { 798 memcpy(s, endword, clen); 799 (*charlen)++; 800 s += clen; 801 } 802 else 803 { 804 /* 805 * Back up endword to the escape character when stopping at an 806 * escaped char, so that subsequent get_wildcard_part will 807 * restart from the escape character. We assume here that 808 * escape chars are single-byte. 809 */ 810 endword--; 811 break; 812 } 813 in_escape = false; 814 } 815 else 816 { 817 if (ISESCAPECHAR(endword)) 818 in_escape = true; 819 else if (ISWILDCARDCHAR(endword)) 820 { 821 in_trailing_wildcard_meta = true; 822 break; 823 } 824 else if (ISWORDCHR(endword)) 825 { 826 memcpy(s, endword, clen); 827 (*charlen)++; 828 s += clen; 829 } 830 else 831 break; 832 } 833 endword += clen; 834 } 835 836 /* 837 * Add right padding spaces if next character isn't wildcard 838 * meta-character. 839 */ 840 if (!in_trailing_wildcard_meta) 841 { 842 if (RPADDING > 0) 843 { 844 *s++ = ' '; 845 (*charlen)++; 846 if (RPADDING > 1) 847 { 848 *s++ = ' '; 849 (*charlen)++; 850 } 851 } 852 } 853 854 *bytelen = s - buf; 855 return endword; 856 } 857 858 /* 859 * Generates trigrams for wildcard search string. 860 * 861 * Returns array of trigrams that must occur in any string that matches the 862 * wildcard string. For example, given pattern "a%bcd%" the trigrams 863 * " a", "bcd" would be extracted. 864 */ 865 TRGM * 866 generate_wildcard_trgm(const char *str, int slen) 867 { 868 TRGM *trg; 869 char *buf, 870 *buf2; 871 trgm *tptr; 872 int len, 873 charlen, 874 bytelen; 875 const char *eword; 876 877 protect_out_of_mem(slen); 878 879 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); 880 trg->flag = ARRKEY; 881 SET_VARSIZE(trg, TRGMHDRSIZE); 882 883 if (slen + LPADDING + RPADDING < 3 || slen == 0) 884 return trg; 885 886 tptr = GETARR(trg); 887 888 /* Allocate a buffer for blank-padded, but not yet case-folded, words */ 889 buf = palloc(sizeof(char) * (slen + 4)); 890 891 /* 892 * Extract trigrams from each substring extracted by get_wildcard_part. 893 */ 894 eword = str; 895 while ((eword = get_wildcard_part(eword, slen - (eword - str), 896 buf, &bytelen, &charlen)) != NULL) 897 { 898 #ifdef IGNORECASE 899 buf2 = lowerstr_with_len(buf, bytelen); 900 bytelen = strlen(buf2); 901 #else 902 buf2 = buf; 903 #endif 904 905 /* 906 * count trigrams 907 */ 908 tptr = make_trigrams(tptr, buf2, bytelen, charlen); 909 910 #ifdef IGNORECASE 911 pfree(buf2); 912 #endif 913 } 914 915 pfree(buf); 916 917 if ((len = tptr - GETARR(trg)) == 0) 918 return trg; 919 920 /* 921 * Make trigrams unique. 922 */ 923 if (len > 1) 924 { 925 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); 926 len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm); 927 } 928 929 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); 930 931 return trg; 932 } 933 934 uint32 935 trgm2int(trgm *ptr) 936 { 937 uint32 val = 0; 938 939 val |= *(((unsigned char *) ptr)); 940 val <<= 8; 941 val |= *(((unsigned char *) ptr) + 1); 942 val <<= 8; 943 val |= *(((unsigned char *) ptr) + 2); 944 945 return val; 946 } 947 948 Datum 949 show_trgm(PG_FUNCTION_ARGS) 950 { 951 text *in = PG_GETARG_TEXT_PP(0); 952 TRGM *trg; 953 Datum *d; 954 ArrayType *a; 955 trgm *ptr; 956 int i; 957 958 trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); 959 d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg))); 960 961 for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++) 962 { 963 text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3)); 964 965 if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr)) 966 { 967 snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr)); 968 SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item))); 969 } 970 else 971 { 972 SET_VARSIZE(item, VARHDRSZ + 3); 973 CPTRGM(VARDATA(item), ptr); 974 } 975 d[i] = PointerGetDatum(item); 976 } 977 978 a = construct_array(d, 979 ARRNELEM(trg), 980 TEXTOID, 981 -1, 982 false, 983 TYPALIGN_INT); 984 985 for (i = 0; i < ARRNELEM(trg); i++) 986 pfree(DatumGetPointer(d[i])); 987 988 pfree(d); 989 pfree(trg); 990 PG_FREE_IF_COPY(in, 0); 991 992 PG_RETURN_POINTER(a); 993 } 994 995 float4 996 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact) 997 { 998 trgm *ptr1, 999 *ptr2; 1000 int count = 0; 1001 int len1, 1002 len2; 1003 1004 ptr1 = GETARR(trg1); 1005 ptr2 = GETARR(trg2); 1006 1007 len1 = ARRNELEM(trg1); 1008 len2 = ARRNELEM(trg2); 1009 1010 /* explicit test is needed to avoid 0/0 division when both lengths are 0 */ 1011 if (len1 <= 0 || len2 <= 0) 1012 return (float4) 0.0; 1013 1014 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2) 1015 { 1016 int res = CMPTRGM(ptr1, ptr2); 1017 1018 if (res < 0) 1019 ptr1++; 1020 else if (res > 0) 1021 ptr2++; 1022 else 1023 { 1024 ptr1++; 1025 ptr2++; 1026 count++; 1027 } 1028 } 1029 1030 /* 1031 * If inexact then len2 is equal to count, because we don't know actual 1032 * length of second string in inexact search and we can assume that count 1033 * is a lower bound of len2. 1034 */ 1035 return CALCSML(count, len1, inexact ? count : len2); 1036 } 1037 1038 1039 /* 1040 * Returns whether trg2 contains all trigrams in trg1. 1041 * This relies on the trigram arrays being sorted. 1042 */ 1043 bool 1044 trgm_contained_by(TRGM *trg1, TRGM *trg2) 1045 { 1046 trgm *ptr1, 1047 *ptr2; 1048 int len1, 1049 len2; 1050 1051 ptr1 = GETARR(trg1); 1052 ptr2 = GETARR(trg2); 1053 1054 len1 = ARRNELEM(trg1); 1055 len2 = ARRNELEM(trg2); 1056 1057 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2) 1058 { 1059 int res = CMPTRGM(ptr1, ptr2); 1060 1061 if (res < 0) 1062 return false; 1063 else if (res > 0) 1064 ptr2++; 1065 else 1066 { 1067 ptr1++; 1068 ptr2++; 1069 } 1070 } 1071 if (ptr1 - GETARR(trg1) < len1) 1072 return false; 1073 else 1074 return true; 1075 } 1076 1077 /* 1078 * Return a palloc'd boolean array showing, for each trigram in "query", 1079 * whether it is present in the trigram array "key". 1080 * This relies on the "key" array being sorted, but "query" need not be. 1081 */ 1082 bool * 1083 trgm_presence_map(TRGM *query, TRGM *key) 1084 { 1085 bool *result; 1086 trgm *ptrq = GETARR(query), 1087 *ptrk = GETARR(key); 1088 int lenq = ARRNELEM(query), 1089 lenk = ARRNELEM(key), 1090 i; 1091 1092 result = (bool *) palloc0(lenq * sizeof(bool)); 1093 1094 /* for each query trigram, do a binary search in the key array */ 1095 for (i = 0; i < lenq; i++) 1096 { 1097 int lo = 0; 1098 int hi = lenk; 1099 1100 while (lo < hi) 1101 { 1102 int mid = (lo + hi) / 2; 1103 int res = CMPTRGM(ptrq, ptrk + mid); 1104 1105 if (res < 0) 1106 hi = mid; 1107 else if (res > 0) 1108 lo = mid + 1; 1109 else 1110 { 1111 result[i] = true; 1112 break; 1113 } 1114 } 1115 ptrq++; 1116 } 1117 1118 return result; 1119 } 1120 1121 Datum 1122 similarity(PG_FUNCTION_ARGS) 1123 { 1124 text *in1 = PG_GETARG_TEXT_PP(0); 1125 text *in2 = PG_GETARG_TEXT_PP(1); 1126 TRGM *trg1, 1127 *trg2; 1128 float4 res; 1129 1130 trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1)); 1131 trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2)); 1132 1133 res = cnt_sml(trg1, trg2, false); 1134 1135 pfree(trg1); 1136 pfree(trg2); 1137 PG_FREE_IF_COPY(in1, 0); 1138 PG_FREE_IF_COPY(in2, 1); 1139 1140 PG_RETURN_FLOAT4(res); 1141 } 1142 1143 Datum 1144 word_similarity(PG_FUNCTION_ARGS) 1145 { 1146 text *in1 = PG_GETARG_TEXT_PP(0); 1147 text *in2 = PG_GETARG_TEXT_PP(1); 1148 float4 res; 1149 1150 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1151 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1152 0); 1153 1154 PG_FREE_IF_COPY(in1, 0); 1155 PG_FREE_IF_COPY(in2, 1); 1156 PG_RETURN_FLOAT4(res); 1157 } 1158 1159 Datum 1160 strict_word_similarity(PG_FUNCTION_ARGS) 1161 { 1162 text *in1 = PG_GETARG_TEXT_PP(0); 1163 text *in2 = PG_GETARG_TEXT_PP(1); 1164 float4 res; 1165 1166 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1167 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1168 WORD_SIMILARITY_STRICT); 1169 1170 PG_FREE_IF_COPY(in1, 0); 1171 PG_FREE_IF_COPY(in2, 1); 1172 PG_RETURN_FLOAT4(res); 1173 } 1174 1175 Datum 1176 similarity_dist(PG_FUNCTION_ARGS) 1177 { 1178 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity, 1179 PG_GETARG_DATUM(0), 1180 PG_GETARG_DATUM(1))); 1181 1182 PG_RETURN_FLOAT4(1.0 - res); 1183 } 1184 1185 Datum 1186 similarity_op(PG_FUNCTION_ARGS) 1187 { 1188 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity, 1189 PG_GETARG_DATUM(0), 1190 PG_GETARG_DATUM(1))); 1191 1192 PG_RETURN_BOOL(res >= similarity_threshold); 1193 } 1194 1195 Datum 1196 word_similarity_op(PG_FUNCTION_ARGS) 1197 { 1198 text *in1 = PG_GETARG_TEXT_PP(0); 1199 text *in2 = PG_GETARG_TEXT_PP(1); 1200 float4 res; 1201 1202 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1203 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1204 WORD_SIMILARITY_CHECK_ONLY); 1205 1206 PG_FREE_IF_COPY(in1, 0); 1207 PG_FREE_IF_COPY(in2, 1); 1208 PG_RETURN_BOOL(res >= word_similarity_threshold); 1209 } 1210 1211 Datum 1212 word_similarity_commutator_op(PG_FUNCTION_ARGS) 1213 { 1214 text *in1 = PG_GETARG_TEXT_PP(0); 1215 text *in2 = PG_GETARG_TEXT_PP(1); 1216 float4 res; 1217 1218 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1219 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1220 WORD_SIMILARITY_CHECK_ONLY); 1221 1222 PG_FREE_IF_COPY(in1, 0); 1223 PG_FREE_IF_COPY(in2, 1); 1224 PG_RETURN_BOOL(res >= word_similarity_threshold); 1225 } 1226 1227 Datum 1228 word_similarity_dist_op(PG_FUNCTION_ARGS) 1229 { 1230 text *in1 = PG_GETARG_TEXT_PP(0); 1231 text *in2 = PG_GETARG_TEXT_PP(1); 1232 float4 res; 1233 1234 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1235 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1236 0); 1237 1238 PG_FREE_IF_COPY(in1, 0); 1239 PG_FREE_IF_COPY(in2, 1); 1240 PG_RETURN_FLOAT4(1.0 - res); 1241 } 1242 1243 Datum 1244 word_similarity_dist_commutator_op(PG_FUNCTION_ARGS) 1245 { 1246 text *in1 = PG_GETARG_TEXT_PP(0); 1247 text *in2 = PG_GETARG_TEXT_PP(1); 1248 float4 res; 1249 1250 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1251 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1252 0); 1253 1254 PG_FREE_IF_COPY(in1, 0); 1255 PG_FREE_IF_COPY(in2, 1); 1256 PG_RETURN_FLOAT4(1.0 - res); 1257 } 1258 1259 Datum 1260 strict_word_similarity_op(PG_FUNCTION_ARGS) 1261 { 1262 text *in1 = PG_GETARG_TEXT_PP(0); 1263 text *in2 = PG_GETARG_TEXT_PP(1); 1264 float4 res; 1265 1266 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1267 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1268 WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT); 1269 1270 PG_FREE_IF_COPY(in1, 0); 1271 PG_FREE_IF_COPY(in2, 1); 1272 PG_RETURN_BOOL(res >= strict_word_similarity_threshold); 1273 } 1274 1275 Datum 1276 strict_word_similarity_commutator_op(PG_FUNCTION_ARGS) 1277 { 1278 text *in1 = PG_GETARG_TEXT_PP(0); 1279 text *in2 = PG_GETARG_TEXT_PP(1); 1280 float4 res; 1281 1282 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1283 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1284 WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT); 1285 1286 PG_FREE_IF_COPY(in1, 0); 1287 PG_FREE_IF_COPY(in2, 1); 1288 PG_RETURN_BOOL(res >= strict_word_similarity_threshold); 1289 } 1290 1291 Datum 1292 strict_word_similarity_dist_op(PG_FUNCTION_ARGS) 1293 { 1294 text *in1 = PG_GETARG_TEXT_PP(0); 1295 text *in2 = PG_GETARG_TEXT_PP(1); 1296 float4 res; 1297 1298 res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1299 VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1300 WORD_SIMILARITY_STRICT); 1301 1302 PG_FREE_IF_COPY(in1, 0); 1303 PG_FREE_IF_COPY(in2, 1); 1304 PG_RETURN_FLOAT4(1.0 - res); 1305 } 1306 1307 Datum 1308 strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS) 1309 { 1310 text *in1 = PG_GETARG_TEXT_PP(0); 1311 text *in2 = PG_GETARG_TEXT_PP(1); 1312 float4 res; 1313 1314 res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2), 1315 VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1), 1316 WORD_SIMILARITY_STRICT); 1317 1318 PG_FREE_IF_COPY(in1, 0); 1319 PG_FREE_IF_COPY(in2, 1); 1320 PG_RETURN_FLOAT4(1.0 - res); 1321 } 1322