1 /* 2 * contrib/pg_trgm/trgm_gist.c 3 */ 4 #include "postgres.h" 5 6 #include "trgm.h" 7 8 #include "access/stratnum.h" 9 #include "fmgr.h" 10 #include "port/pg_bitutils.h" 11 12 13 typedef struct 14 { 15 /* most recent inputs to gtrgm_consistent */ 16 StrategyNumber strategy; 17 text *query; 18 /* extracted trigrams for query */ 19 TRGM *trigrams; 20 /* if a regex operator, the extracted graph */ 21 TrgmPackedGraph *graph; 22 23 /* 24 * The "query" and "trigrams" are stored in the same palloc block as this 25 * cache struct, at MAXALIGN'ed offsets. The graph however isn't. 26 */ 27 } gtrgm_consistent_cache; 28 29 #define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) 30 31 32 PG_FUNCTION_INFO_V1(gtrgm_in); 33 PG_FUNCTION_INFO_V1(gtrgm_out); 34 PG_FUNCTION_INFO_V1(gtrgm_compress); 35 PG_FUNCTION_INFO_V1(gtrgm_decompress); 36 PG_FUNCTION_INFO_V1(gtrgm_consistent); 37 PG_FUNCTION_INFO_V1(gtrgm_distance); 38 PG_FUNCTION_INFO_V1(gtrgm_union); 39 PG_FUNCTION_INFO_V1(gtrgm_same); 40 PG_FUNCTION_INFO_V1(gtrgm_penalty); 41 PG_FUNCTION_INFO_V1(gtrgm_picksplit); 42 43 44 Datum 45 gtrgm_in(PG_FUNCTION_ARGS) 46 { 47 elog(ERROR, "not implemented"); 48 PG_RETURN_DATUM(0); 49 } 50 51 Datum 52 gtrgm_out(PG_FUNCTION_ARGS) 53 { 54 elog(ERROR, "not implemented"); 55 PG_RETURN_DATUM(0); 56 } 57 58 static void 59 makesign(BITVECP sign, TRGM *a) 60 { 61 int32 k, 62 len = ARRNELEM(a); 63 trgm *ptr = GETARR(a); 64 int32 tmp = 0; 65 66 MemSet((void *) sign, 0, sizeof(BITVEC)); 67 SETBIT(sign, SIGLENBIT); /* set last unused bit */ 68 for (k = 0; k < len; k++) 69 { 70 CPTRGM(((char *) &tmp), ptr + k); 71 HASH(sign, tmp); 72 } 73 } 74 75 Datum 76 gtrgm_compress(PG_FUNCTION_ARGS) 77 { 78 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); 79 GISTENTRY *retval = entry; 80 81 if (entry->leafkey) 82 { /* trgm */ 83 TRGM *res; 84 text *val = DatumGetTextPP(entry->key); 85 86 res = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val)); 87 retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); 88 gistentryinit(*retval, PointerGetDatum(res), 89 entry->rel, entry->page, 90 entry->offset, false); 91 } 92 else if (ISSIGNKEY(DatumGetPointer(entry->key)) && 93 !ISALLTRUE(DatumGetPointer(entry->key))) 94 { 95 int32 i, 96 len; 97 TRGM *res; 98 BITVECP sign = GETSIGN(DatumGetPointer(entry->key)); 99 100 LOOPBYTE 101 { 102 if ((sign[i] & 0xff) != 0xff) 103 PG_RETURN_POINTER(retval); 104 } 105 106 len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0); 107 res = (TRGM *) palloc(len); 108 SET_VARSIZE(res, len); 109 res->flag = SIGNKEY | ALLISTRUE; 110 111 retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); 112 gistentryinit(*retval, PointerGetDatum(res), 113 entry->rel, entry->page, 114 entry->offset, false); 115 } 116 PG_RETURN_POINTER(retval); 117 } 118 119 Datum 120 gtrgm_decompress(PG_FUNCTION_ARGS) 121 { 122 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); 123 GISTENTRY *retval; 124 text *key; 125 126 key = DatumGetTextPP(entry->key); 127 128 if (key != (text *) DatumGetPointer(entry->key)) 129 { 130 /* need to pass back the decompressed item */ 131 retval = palloc(sizeof(GISTENTRY)); 132 gistentryinit(*retval, PointerGetDatum(key), 133 entry->rel, entry->page, entry->offset, entry->leafkey); 134 PG_RETURN_POINTER(retval); 135 } 136 else 137 { 138 /* we can return the entry as-is */ 139 PG_RETURN_POINTER(entry); 140 } 141 } 142 143 static int32 144 cnt_sml_sign_common(TRGM *qtrg, BITVECP sign) 145 { 146 int32 count = 0; 147 int32 k, 148 len = ARRNELEM(qtrg); 149 trgm *ptr = GETARR(qtrg); 150 int32 tmp = 0; 151 152 for (k = 0; k < len; k++) 153 { 154 CPTRGM(((char *) &tmp), ptr + k); 155 count += GETBIT(sign, HASHVAL(tmp)); 156 } 157 158 return count; 159 } 160 161 Datum 162 gtrgm_consistent(PG_FUNCTION_ARGS) 163 { 164 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); 165 text *query = PG_GETARG_TEXT_P(1); 166 StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); 167 168 /* Oid subtype = PG_GETARG_OID(3); */ 169 bool *recheck = (bool *) PG_GETARG_POINTER(4); 170 TRGM *key = (TRGM *) DatumGetPointer(entry->key); 171 TRGM *qtrg; 172 bool res; 173 Size querysize = VARSIZE(query); 174 gtrgm_consistent_cache *cache; 175 double nlimit; 176 177 /* 178 * We keep the extracted trigrams in cache, because trigram extraction is 179 * relatively CPU-expensive. When trying to reuse a cached value, check 180 * strategy number not just query itself, because trigram extraction 181 * depends on strategy. 182 * 183 * The cached structure is a single palloc chunk containing the 184 * gtrgm_consistent_cache header, then the input query (4-byte length 185 * word, uncompressed, starting at a MAXALIGN boundary), then the TRGM 186 * value (also starting at a MAXALIGN boundary). However we don't try to 187 * include the regex graph (if any) in that struct. (XXX currently, this 188 * approach can leak regex graphs across index rescans. Not clear if 189 * that's worth fixing.) 190 */ 191 cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra; 192 if (cache == NULL || 193 cache->strategy != strategy || 194 VARSIZE(cache->query) != querysize || 195 memcmp((char *) cache->query, (char *) query, querysize) != 0) 196 { 197 gtrgm_consistent_cache *newcache; 198 TrgmPackedGraph *graph = NULL; 199 Size qtrgsize; 200 201 switch (strategy) 202 { 203 case SimilarityStrategyNumber: 204 case WordSimilarityStrategyNumber: 205 case StrictWordSimilarityStrategyNumber: 206 qtrg = generate_trgm(VARDATA(query), 207 querysize - VARHDRSZ); 208 break; 209 case ILikeStrategyNumber: 210 #ifndef IGNORECASE 211 elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); 212 #endif 213 /* FALL THRU */ 214 case LikeStrategyNumber: 215 qtrg = generate_wildcard_trgm(VARDATA(query), 216 querysize - VARHDRSZ); 217 break; 218 case RegExpICaseStrategyNumber: 219 #ifndef IGNORECASE 220 elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); 221 #endif 222 /* FALL THRU */ 223 case RegExpStrategyNumber: 224 qtrg = createTrgmNFA(query, PG_GET_COLLATION(), 225 &graph, fcinfo->flinfo->fn_mcxt); 226 /* just in case an empty array is returned ... */ 227 if (qtrg && ARRNELEM(qtrg) <= 0) 228 { 229 pfree(qtrg); 230 qtrg = NULL; 231 } 232 break; 233 default: 234 elog(ERROR, "unrecognized strategy number: %d", strategy); 235 qtrg = NULL; /* keep compiler quiet */ 236 break; 237 } 238 239 qtrgsize = qtrg ? VARSIZE(qtrg) : 0; 240 241 newcache = (gtrgm_consistent_cache *) 242 MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, 243 MAXALIGN(sizeof(gtrgm_consistent_cache)) + 244 MAXALIGN(querysize) + 245 qtrgsize); 246 247 newcache->strategy = strategy; 248 newcache->query = (text *) 249 ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache))); 250 memcpy((char *) newcache->query, (char *) query, querysize); 251 if (qtrg) 252 { 253 newcache->trigrams = (TRGM *) 254 ((char *) newcache->query + MAXALIGN(querysize)); 255 memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize); 256 /* release qtrg in case it was made in fn_mcxt */ 257 pfree(qtrg); 258 } 259 else 260 newcache->trigrams = NULL; 261 newcache->graph = graph; 262 263 if (cache) 264 pfree(cache); 265 fcinfo->flinfo->fn_extra = (void *) newcache; 266 cache = newcache; 267 } 268 269 qtrg = cache->trigrams; 270 271 switch (strategy) 272 { 273 case SimilarityStrategyNumber: 274 case WordSimilarityStrategyNumber: 275 case StrictWordSimilarityStrategyNumber: 276 277 /* 278 * Similarity search is exact. (Strict) word similarity search is 279 * inexact 280 */ 281 *recheck = (strategy != SimilarityStrategyNumber); 282 283 nlimit = index_strategy_get_limit(strategy); 284 285 if (GIST_LEAF(entry)) 286 { /* all leafs contains orig trgm */ 287 double tmpsml = cnt_sml(qtrg, key, *recheck); 288 289 res = (tmpsml >= nlimit); 290 } 291 else if (ISALLTRUE(key)) 292 { /* non-leaf contains signature */ 293 res = true; 294 } 295 else 296 { /* non-leaf contains signature */ 297 int32 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); 298 int32 len = ARRNELEM(qtrg); 299 300 if (len == 0) 301 res = false; 302 else 303 res = (((((float8) count) / ((float8) len))) >= nlimit); 304 } 305 break; 306 case ILikeStrategyNumber: 307 #ifndef IGNORECASE 308 elog(ERROR, "cannot handle ~~* with case-sensitive trigrams"); 309 #endif 310 /* FALL THRU */ 311 case LikeStrategyNumber: 312 /* Wildcard search is inexact */ 313 *recheck = true; 314 315 /* 316 * Check if all the extracted trigrams can be present in child 317 * nodes. 318 */ 319 if (GIST_LEAF(entry)) 320 { /* all leafs contains orig trgm */ 321 res = trgm_contained_by(qtrg, key); 322 } 323 else if (ISALLTRUE(key)) 324 { /* non-leaf contains signature */ 325 res = true; 326 } 327 else 328 { /* non-leaf contains signature */ 329 int32 k, 330 tmp = 0, 331 len = ARRNELEM(qtrg); 332 trgm *ptr = GETARR(qtrg); 333 BITVECP sign = GETSIGN(key); 334 335 res = true; 336 for (k = 0; k < len; k++) 337 { 338 CPTRGM(((char *) &tmp), ptr + k); 339 if (!GETBIT(sign, HASHVAL(tmp))) 340 { 341 res = false; 342 break; 343 } 344 } 345 } 346 break; 347 case RegExpICaseStrategyNumber: 348 #ifndef IGNORECASE 349 elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); 350 #endif 351 /* FALL THRU */ 352 case RegExpStrategyNumber: 353 /* Regexp search is inexact */ 354 *recheck = true; 355 356 /* Check regex match as much as we can with available info */ 357 if (qtrg) 358 { 359 if (GIST_LEAF(entry)) 360 { /* all leafs contains orig trgm */ 361 bool *check; 362 363 check = trgm_presence_map(qtrg, key); 364 res = trigramsMatchGraph(cache->graph, check); 365 pfree(check); 366 } 367 else if (ISALLTRUE(key)) 368 { /* non-leaf contains signature */ 369 res = true; 370 } 371 else 372 { /* non-leaf contains signature */ 373 int32 k, 374 tmp = 0, 375 len = ARRNELEM(qtrg); 376 trgm *ptr = GETARR(qtrg); 377 BITVECP sign = GETSIGN(key); 378 bool *check; 379 380 /* 381 * GETBIT() tests may give false positives, due to limited 382 * size of the sign array. But since trigramsMatchGraph() 383 * implements a monotone boolean function, false positives 384 * in the check array can't lead to false negative answer. 385 * So we can apply trigramsMatchGraph despite uncertainty, 386 * and that usefully improves the quality of the search. 387 */ 388 check = (bool *) palloc(len * sizeof(bool)); 389 for (k = 0; k < len; k++) 390 { 391 CPTRGM(((char *) &tmp), ptr + k); 392 check[k] = GETBIT(sign, HASHVAL(tmp)); 393 } 394 res = trigramsMatchGraph(cache->graph, check); 395 pfree(check); 396 } 397 } 398 else 399 { 400 /* trigram-free query must be rechecked everywhere */ 401 res = true; 402 } 403 break; 404 default: 405 elog(ERROR, "unrecognized strategy number: %d", strategy); 406 res = false; /* keep compiler quiet */ 407 break; 408 } 409 410 PG_RETURN_BOOL(res); 411 } 412 413 Datum 414 gtrgm_distance(PG_FUNCTION_ARGS) 415 { 416 GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); 417 text *query = PG_GETARG_TEXT_P(1); 418 StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); 419 420 /* Oid subtype = PG_GETARG_OID(3); */ 421 bool *recheck = (bool *) PG_GETARG_POINTER(4); 422 TRGM *key = (TRGM *) DatumGetPointer(entry->key); 423 TRGM *qtrg; 424 float8 res; 425 Size querysize = VARSIZE(query); 426 char *cache = (char *) fcinfo->flinfo->fn_extra; 427 428 /* 429 * Cache the generated trigrams across multiple calls with the same query. 430 */ 431 if (cache == NULL || 432 VARSIZE(cache) != querysize || 433 memcmp(cache, query, querysize) != 0) 434 { 435 char *newcache; 436 437 qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ); 438 439 newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, 440 MAXALIGN(querysize) + 441 VARSIZE(qtrg)); 442 443 memcpy(newcache, query, querysize); 444 memcpy(newcache + MAXALIGN(querysize), qtrg, VARSIZE(qtrg)); 445 446 if (cache) 447 pfree(cache); 448 fcinfo->flinfo->fn_extra = newcache; 449 cache = newcache; 450 } 451 452 qtrg = (TRGM *) (cache + MAXALIGN(querysize)); 453 454 switch (strategy) 455 { 456 case DistanceStrategyNumber: 457 case WordDistanceStrategyNumber: 458 case StrictWordDistanceStrategyNumber: 459 /* Only plain trigram distance is exact */ 460 *recheck = (strategy != DistanceStrategyNumber); 461 if (GIST_LEAF(entry)) 462 { /* all leafs contains orig trgm */ 463 464 /* 465 * Prevent gcc optimizing the sml variable using volatile 466 * keyword. Otherwise res can differ from the 467 * word_similarity_dist_op() function. 468 */ 469 float4 volatile sml = cnt_sml(qtrg, key, *recheck); 470 471 res = 1.0 - sml; 472 } 473 else if (ISALLTRUE(key)) 474 { /* all leafs contains orig trgm */ 475 res = 0.0; 476 } 477 else 478 { /* non-leaf contains signature */ 479 int32 count = cnt_sml_sign_common(qtrg, GETSIGN(key)); 480 int32 len = ARRNELEM(qtrg); 481 482 res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len); 483 } 484 break; 485 default: 486 elog(ERROR, "unrecognized strategy number: %d", strategy); 487 res = 0; /* keep compiler quiet */ 488 break; 489 } 490 491 PG_RETURN_FLOAT8(res); 492 } 493 494 static int32 495 unionkey(BITVECP sbase, TRGM *add) 496 { 497 int32 i; 498 499 if (ISSIGNKEY(add)) 500 { 501 BITVECP sadd = GETSIGN(add); 502 503 if (ISALLTRUE(add)) 504 return 1; 505 506 LOOPBYTE 507 sbase[i] |= sadd[i]; 508 } 509 else 510 { 511 trgm *ptr = GETARR(add); 512 int32 tmp = 0; 513 514 for (i = 0; i < ARRNELEM(add); i++) 515 { 516 CPTRGM(((char *) &tmp), ptr + i); 517 HASH(sbase, tmp); 518 } 519 } 520 return 0; 521 } 522 523 524 Datum 525 gtrgm_union(PG_FUNCTION_ARGS) 526 { 527 GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); 528 int32 len = entryvec->n; 529 int *size = (int *) PG_GETARG_POINTER(1); 530 BITVEC base; 531 int32 i; 532 int32 flag = 0; 533 TRGM *result; 534 535 MemSet((void *) base, 0, sizeof(BITVEC)); 536 for (i = 0; i < len; i++) 537 { 538 if (unionkey(base, GETENTRY(entryvec, i))) 539 { 540 flag = ALLISTRUE; 541 break; 542 } 543 } 544 545 flag |= SIGNKEY; 546 len = CALCGTSIZE(flag, 0); 547 result = (TRGM *) palloc(len); 548 SET_VARSIZE(result, len); 549 result->flag = flag; 550 if (!ISALLTRUE(result)) 551 memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC)); 552 *size = len; 553 554 PG_RETURN_POINTER(result); 555 } 556 557 Datum 558 gtrgm_same(PG_FUNCTION_ARGS) 559 { 560 TRGM *a = (TRGM *) PG_GETARG_POINTER(0); 561 TRGM *b = (TRGM *) PG_GETARG_POINTER(1); 562 bool *result = (bool *) PG_GETARG_POINTER(2); 563 564 if (ISSIGNKEY(a)) 565 { /* then b also ISSIGNKEY */ 566 if (ISALLTRUE(a) && ISALLTRUE(b)) 567 *result = true; 568 else if (ISALLTRUE(a)) 569 *result = false; 570 else if (ISALLTRUE(b)) 571 *result = false; 572 else 573 { 574 int32 i; 575 BITVECP sa = GETSIGN(a), 576 sb = GETSIGN(b); 577 578 *result = true; 579 LOOPBYTE 580 { 581 if (sa[i] != sb[i]) 582 { 583 *result = false; 584 break; 585 } 586 } 587 } 588 } 589 else 590 { /* a and b ISARRKEY */ 591 int32 lena = ARRNELEM(a), 592 lenb = ARRNELEM(b); 593 594 if (lena != lenb) 595 *result = false; 596 else 597 { 598 trgm *ptra = GETARR(a), 599 *ptrb = GETARR(b); 600 int32 i; 601 602 *result = true; 603 for (i = 0; i < lena; i++) 604 if (CMPTRGM(ptra + i, ptrb + i)) 605 { 606 *result = false; 607 break; 608 } 609 } 610 } 611 612 PG_RETURN_POINTER(result); 613 } 614 615 static int32 616 sizebitvec(BITVECP sign) 617 { 618 return pg_popcount(sign, SIGLEN); 619 } 620 621 static int 622 hemdistsign(BITVECP a, BITVECP b) 623 { 624 int i, 625 diff, 626 dist = 0; 627 628 LOOPBYTE 629 { 630 diff = (unsigned char) (a[i] ^ b[i]); 631 /* Using the popcount functions here isn't likely to win */ 632 dist += pg_number_of_ones[diff]; 633 } 634 return dist; 635 } 636 637 static int 638 hemdist(TRGM *a, TRGM *b) 639 { 640 if (ISALLTRUE(a)) 641 { 642 if (ISALLTRUE(b)) 643 return 0; 644 else 645 return SIGLENBIT - sizebitvec(GETSIGN(b)); 646 } 647 else if (ISALLTRUE(b)) 648 return SIGLENBIT - sizebitvec(GETSIGN(a)); 649 650 return hemdistsign(GETSIGN(a), GETSIGN(b)); 651 } 652 653 Datum 654 gtrgm_penalty(PG_FUNCTION_ARGS) 655 { 656 GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */ 657 GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); 658 float *penalty = (float *) PG_GETARG_POINTER(2); 659 TRGM *origval = (TRGM *) DatumGetPointer(origentry->key); 660 TRGM *newval = (TRGM *) DatumGetPointer(newentry->key); 661 BITVECP orig = GETSIGN(origval); 662 663 *penalty = 0.0; 664 665 if (ISARRKEY(newval)) 666 { 667 char *cache = (char *) fcinfo->flinfo->fn_extra; 668 TRGM *cachedVal = (TRGM *) (cache + MAXALIGN(sizeof(BITVEC))); 669 Size newvalsize = VARSIZE(newval); 670 BITVECP sign; 671 672 /* 673 * Cache the sign data across multiple calls with the same newval. 674 */ 675 if (cache == NULL || 676 VARSIZE(cachedVal) != newvalsize || 677 memcmp(cachedVal, newval, newvalsize) != 0) 678 { 679 char *newcache; 680 681 newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, 682 MAXALIGN(sizeof(BITVEC)) + 683 newvalsize); 684 685 makesign((BITVECP) newcache, newval); 686 687 cachedVal = (TRGM *) (newcache + MAXALIGN(sizeof(BITVEC))); 688 memcpy(cachedVal, newval, newvalsize); 689 690 if (cache) 691 pfree(cache); 692 fcinfo->flinfo->fn_extra = newcache; 693 cache = newcache; 694 } 695 696 sign = (BITVECP) cache; 697 698 if (ISALLTRUE(origval)) 699 *penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1); 700 else 701 *penalty = hemdistsign(sign, orig); 702 } 703 else 704 *penalty = hemdist(origval, newval); 705 PG_RETURN_POINTER(penalty); 706 } 707 708 typedef struct 709 { 710 bool allistrue; 711 BITVEC sign; 712 } CACHESIGN; 713 714 static void 715 fillcache(CACHESIGN *item, TRGM *key) 716 { 717 item->allistrue = false; 718 if (ISARRKEY(key)) 719 makesign(item->sign, key); 720 else if (ISALLTRUE(key)) 721 item->allistrue = true; 722 else 723 memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC)); 724 } 725 726 #define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) ) 727 typedef struct 728 { 729 OffsetNumber pos; 730 int32 cost; 731 } SPLITCOST; 732 733 static int 734 comparecost(const void *a, const void *b) 735 { 736 if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost) 737 return 0; 738 else 739 return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1; 740 } 741 742 743 static int 744 hemdistcache(CACHESIGN *a, CACHESIGN *b) 745 { 746 if (a->allistrue) 747 { 748 if (b->allistrue) 749 return 0; 750 else 751 return SIGLENBIT - sizebitvec(b->sign); 752 } 753 else if (b->allistrue) 754 return SIGLENBIT - sizebitvec(a->sign); 755 756 return hemdistsign(a->sign, b->sign); 757 } 758 759 Datum 760 gtrgm_picksplit(PG_FUNCTION_ARGS) 761 { 762 GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); 763 OffsetNumber maxoff = entryvec->n - 1; 764 GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); 765 OffsetNumber k, 766 j; 767 TRGM *datum_l, 768 *datum_r; 769 BITVECP union_l, 770 union_r; 771 int32 size_alpha, 772 size_beta; 773 int32 size_waste, 774 waste = -1; 775 int32 nbytes; 776 OffsetNumber seed_1 = 0, 777 seed_2 = 0; 778 OffsetNumber *left, 779 *right; 780 BITVECP ptr; 781 int i; 782 CACHESIGN *cache; 783 SPLITCOST *costvector; 784 785 /* cache the sign data for each existing item */ 786 cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 1)); 787 for (k = FirstOffsetNumber; k <= maxoff; k = OffsetNumberNext(k)) 788 fillcache(&cache[k], GETENTRY(entryvec, k)); 789 790 /* now find the two furthest-apart items */ 791 for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k)) 792 { 793 for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j)) 794 { 795 size_waste = hemdistcache(&(cache[j]), &(cache[k])); 796 if (size_waste > waste) 797 { 798 waste = size_waste; 799 seed_1 = k; 800 seed_2 = j; 801 } 802 } 803 } 804 805 /* just in case we didn't make a selection ... */ 806 if (seed_1 == 0 || seed_2 == 0) 807 { 808 seed_1 = 1; 809 seed_2 = 2; 810 } 811 812 /* initialize the result vectors */ 813 nbytes = maxoff * sizeof(OffsetNumber); 814 v->spl_left = left = (OffsetNumber *) palloc(nbytes); 815 v->spl_right = right = (OffsetNumber *) palloc(nbytes); 816 v->spl_nleft = 0; 817 v->spl_nright = 0; 818 819 /* form initial .. */ 820 if (cache[seed_1].allistrue) 821 { 822 datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); 823 SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); 824 datum_l->flag = SIGNKEY | ALLISTRUE; 825 } 826 else 827 { 828 datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0)); 829 SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY, 0)); 830 datum_l->flag = SIGNKEY; 831 memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC)); 832 } 833 if (cache[seed_2].allistrue) 834 { 835 datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); 836 SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); 837 datum_r->flag = SIGNKEY | ALLISTRUE; 838 } 839 else 840 { 841 datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0)); 842 SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY, 0)); 843 datum_r->flag = SIGNKEY; 844 memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC)); 845 } 846 847 union_l = GETSIGN(datum_l); 848 union_r = GETSIGN(datum_r); 849 850 /* sort before ... */ 851 costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); 852 for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) 853 { 854 costvector[j - 1].pos = j; 855 size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j])); 856 size_beta = hemdistcache(&(cache[seed_2]), &(cache[j])); 857 costvector[j - 1].cost = abs(size_alpha - size_beta); 858 } 859 qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); 860 861 for (k = 0; k < maxoff; k++) 862 { 863 j = costvector[k].pos; 864 if (j == seed_1) 865 { 866 *left++ = j; 867 v->spl_nleft++; 868 continue; 869 } 870 else if (j == seed_2) 871 { 872 *right++ = j; 873 v->spl_nright++; 874 continue; 875 } 876 877 if (ISALLTRUE(datum_l) || cache[j].allistrue) 878 { 879 if (ISALLTRUE(datum_l) && cache[j].allistrue) 880 size_alpha = 0; 881 else 882 size_alpha = SIGLENBIT - sizebitvec( 883 (cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign) 884 ); 885 } 886 else 887 size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l)); 888 889 if (ISALLTRUE(datum_r) || cache[j].allistrue) 890 { 891 if (ISALLTRUE(datum_r) && cache[j].allistrue) 892 size_beta = 0; 893 else 894 size_beta = SIGLENBIT - sizebitvec( 895 (cache[j].allistrue) ? GETSIGN(datum_r) : GETSIGN(cache[j].sign) 896 ); 897 } 898 else 899 size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r)); 900 901 if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1)) 902 { 903 if (ISALLTRUE(datum_l) || cache[j].allistrue) 904 { 905 if (!ISALLTRUE(datum_l)) 906 MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC)); 907 } 908 else 909 { 910 ptr = cache[j].sign; 911 LOOPBYTE 912 union_l[i] |= ptr[i]; 913 } 914 *left++ = j; 915 v->spl_nleft++; 916 } 917 else 918 { 919 if (ISALLTRUE(datum_r) || cache[j].allistrue) 920 { 921 if (!ISALLTRUE(datum_r)) 922 MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC)); 923 } 924 else 925 { 926 ptr = cache[j].sign; 927 LOOPBYTE 928 union_r[i] |= ptr[i]; 929 } 930 *right++ = j; 931 v->spl_nright++; 932 } 933 } 934 935 v->spl_ldatum = PointerGetDatum(datum_l); 936 v->spl_rdatum = PointerGetDatum(datum_r); 937 938 PG_RETURN_POINTER(v); 939 } 940