1 /*------------------------------------------------------------------------- 2 * 3 * hashutil.c 4 * Utility code for Postgres hash implementation. 5 * 6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 7 * Portions Copyright (c) 1994, Regents of the University of California 8 * 9 * 10 * IDENTIFICATION 11 * src/backend/access/hash/hashutil.c 12 * 13 *------------------------------------------------------------------------- 14 */ 15 #include "postgres.h" 16 17 #include "access/hash.h" 18 #include "access/reloptions.h" 19 #include "access/relscan.h" 20 #include "port/pg_bitutils.h" 21 #include "storage/buf_internals.h" 22 #include "utils/lsyscache.h" 23 #include "utils/rel.h" 24 25 #define CALC_NEW_BUCKET(old_bucket, lowmask) \ 26 old_bucket | (lowmask + 1) 27 28 /* 29 * _hash_checkqual -- does the index tuple satisfy the scan conditions? 30 */ 31 bool 32 _hash_checkqual(IndexScanDesc scan, IndexTuple itup) 33 { 34 /* 35 * Currently, we can't check any of the scan conditions since we do not 36 * have the original index entry value to supply to the sk_func. Always 37 * return true; we expect that hashgettuple already set the recheck flag 38 * to make the main indexscan code do it. 
39 */ 40 #ifdef NOT_USED 41 TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); 42 ScanKey key = scan->keyData; 43 int scanKeySize = scan->numberOfKeys; 44 45 while (scanKeySize > 0) 46 { 47 Datum datum; 48 bool isNull; 49 Datum test; 50 51 datum = index_getattr(itup, 52 key->sk_attno, 53 tupdesc, 54 &isNull); 55 56 /* assume sk_func is strict */ 57 if (isNull) 58 return false; 59 if (key->sk_flags & SK_ISNULL) 60 return false; 61 62 test = FunctionCall2Coll(&key->sk_func, key->sk_collation, 63 datum, key->sk_argument); 64 65 if (!DatumGetBool(test)) 66 return false; 67 68 key++; 69 scanKeySize--; 70 } 71 #endif 72 73 return true; 74 } 75 76 /* 77 * _hash_datum2hashkey -- given a Datum, call the index's hash function 78 * 79 * The Datum is assumed to be of the index's column type, so we can use the 80 * "primary" hash function that's tracked for us by the generic index code. 81 */ 82 uint32 83 _hash_datum2hashkey(Relation rel, Datum key) 84 { 85 FmgrInfo *procinfo; 86 Oid collation; 87 88 /* XXX assumes index has only one attribute */ 89 procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC); 90 collation = rel->rd_indcollation[0]; 91 92 return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key)); 93 } 94 95 /* 96 * _hash_datum2hashkey_type -- given a Datum of a specified type, 97 * hash it in a fashion compatible with this index 98 * 99 * This is much more expensive than _hash_datum2hashkey, so use it only in 100 * cross-type situations. 
/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *			hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
	RegProcedure hash_proc;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
								  keytype,
								  keytype,
								  HASHSTANDARD_PROC);
	if (!RegProcedureIsValid(hash_proc))
		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
			 HASHSTANDARD_PROC, keytype, keytype,
			 RelationGetRelationName(rel));
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 *
 * First reduce the key modulo the larger table size (highmask + 1); if
 * that exceeds the highest bucket actually created so far (maxbucket),
 * the target bucket has not been split off yet, so reduce modulo the
 * smaller table size (lowmask + 1) instead.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *			bucket
 *
 * num_bucket is a 1-based bucket count; the result indexes the metapage's
 * spares array.  Early (small) splitpoint groups are allocated in a single
 * phase each; later groups are subdivided into 2^HASH_SPLITPOINT_PHASE_BITS
 * phases apiece.
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

	/* group = ceil(log2(num_bucket)) */
	splitpoint_group = pg_ceil_log2_32(num_bucket);

	/* single-phase groups: group number is the phase number */
	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value */

	return splitpoint_phases;
}
/*
 * _hash_get_totalbuckets -- returns total number of buckets allocated till
 *			the given splitpoint phase.
 *
 * Inverse of _hash_spareindex: maps a 0-based splitpoint phase to the
 * cumulative bucket count once that phase is fully allocated.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

	/* single-phase groups: phase p completes 2^p buckets */
	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 *
 * Raises an INDEX_CORRUPTED error (never returns) if any check fails.
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* page type must be one of the acceptable types in 'flags' */
		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" contains corrupted page at block %u",
							RelationGetRelationName(rel),
							BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}

/*
 * hashoptions -- parse and validate the reloptions applicable to a hash
 *		index (currently just fillfactor).
 */
bytea *
hashoptions(Datum reloptions, bool validate)
{
	static const relopt_parse_elt tab[] = {
		{"fillfactor", RELOPT_TYPE_INT, offsetof(HashOptions, fillfactor)},
	};

	return (bytea *) build_reloptions(reloptions, validate,
									  RELOPT_KIND_HASH,
									  sizeof(HashOptions),
									  tab, lengthof(tab));
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very very cheaply ...
	 */
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}
317 */ 318 bool 319 _hash_convert_tuple(Relation index, 320 Datum *user_values, bool *user_isnull, 321 Datum *index_values, bool *index_isnull) 322 { 323 uint32 hashkey; 324 325 /* 326 * We do not insert null values into hash indexes. This is okay because 327 * the only supported search operator is '=', and we assume it is strict. 328 */ 329 if (user_isnull[0]) 330 return false; 331 332 hashkey = _hash_datum2hashkey(index, user_values[0]); 333 index_values[0] = UInt32GetDatum(hashkey); 334 index_isnull[0] = false; 335 return true; 336 } 337 338 /* 339 * _hash_binsearch - Return the offset number in the page where the 340 * specified hash value should be sought or inserted. 341 * 342 * We use binary search, relying on the assumption that the existing entries 343 * are ordered by hash key. 344 * 345 * Returns the offset of the first index entry having hashkey >= hash_value, 346 * or the page's max offset plus one if hash_value is greater than all 347 * existing hash keys in the page. This is the appropriate place to start 348 * a search, or to insert a new item. 
/*
 * _hash_binsearch - Return the offset number in the page where the
 *					 specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page) + 1;
	lower = FirstOffsetNumber;

	while (upper > lower)
	{
		OffsetNumber off;
		IndexTuple	itup;
		uint32		hashkey;

		/*
		 * NOTE(review): (upper + lower) / 2 looks safe from overflow here
		 * because OffsetNumber operands should promote to int before the
		 * addition — confirm OffsetNumber is a 16-bit type.
		 */
		off = (upper + lower) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey < hash_value)
			lower = off + 1;	/* desired place is strictly after off */
		else
			upper = off;		/* off might be the desired place */
	}

	return lower;
}

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page);
	lower = FirstOffsetNumber - 1;

	while (upper > lower)
	{
		IndexTuple	itup;
		OffsetNumber off;
		uint32		hashkey;

		/* round the midpoint up, so the loop terminates when upper==lower+1 */
		off = (upper + lower + 1) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey > hash_value)
			upper = off - 1;	/* desired place is strictly before off */
		else
			lower = off;		/* off might be the desired place */
	}

	return lower;
}
421 */ 422 BlockNumber 423 _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket) 424 { 425 Bucket old_bucket; 426 uint32 mask; 427 Buffer metabuf; 428 HashMetaPage metap; 429 BlockNumber blkno; 430 431 /* 432 * To get the old bucket from the current bucket, we need a mask to modulo 433 * into lower half of table. This mask is stored in meta page as 434 * hashm_lowmask, but here we can't rely on the same, because we need a 435 * value of lowmask that was prevalent at the time when bucket split was 436 * started. Masking the most significant bit of new bucket would give us 437 * old bucket. 438 */ 439 mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1; 440 old_bucket = new_bucket & mask; 441 442 metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); 443 metap = HashPageGetMeta(BufferGetPage(metabuf)); 444 445 blkno = BUCKET_TO_BLKNO(metap, old_bucket); 446 447 _hash_relbuf(rel, metabuf); 448 449 return blkno; 450 } 451 452 /* 453 * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket 454 * that will be generated after split from old bucket. 455 * 456 * This is used to find the new bucket from old bucket based on current table 457 * half. It is mainly required to finish the incomplete splits where we are 458 * sure that not more than one bucket could have split in progress from old 459 * bucket. 
460 */ 461 BlockNumber 462 _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket) 463 { 464 Bucket new_bucket; 465 Buffer metabuf; 466 HashMetaPage metap; 467 BlockNumber blkno; 468 469 metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); 470 metap = HashPageGetMeta(BufferGetPage(metabuf)); 471 472 new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket, 473 metap->hashm_lowmask, 474 metap->hashm_maxbucket); 475 blkno = BUCKET_TO_BLKNO(metap, new_bucket); 476 477 _hash_relbuf(rel, metabuf); 478 479 return blkno; 480 } 481 482 /* 483 * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be 484 * generated after split from current (old) bucket. 485 * 486 * This is used to find the new bucket from old bucket. New bucket can be 487 * obtained by OR'ing old bucket with most significant bit of current table 488 * half (lowmask passed in this function can be used to identify msb of 489 * current table half). There could be multiple buckets that could have 490 * been split from current bucket. We need the first such bucket that exists. 491 * Caller must ensure that no more than one split has happened from old 492 * bucket. 493 */ 494 Bucket 495 _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, 496 uint32 lowmask, uint32 maxbucket) 497 { 498 Bucket new_bucket; 499 500 new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); 501 if (new_bucket > maxbucket) 502 { 503 lowmask = lowmask >> 1; 504 new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); 505 } 506 507 return new_bucket; 508 } 509 510 /* 511 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has 512 * told us were killed. 513 * 514 * scan->opaque, referenced locally through so, contains information about the 515 * current page and killed tuples thereon (generally, this should only be 516 * called if so->numKilled > 0). 517 * 518 * The caller does not have a lock on the page and may or may not have the 519 * page pinned in a buffer. 
/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * The caller does not have a lock on the page and may or may not have the
 * page pinned in a buffer.  Note that read-lock is sufficient for setting
 * LP_DEAD status (which is only a hint).
 *
 * The caller must have pin on bucket buffer, but may or may not have pin
 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 *
 * There are never any scans active in a bucket at the time VACUUM begins,
 * because VACUUM takes a cleanup lock on the primary bucket page and scans
 * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
 * but before it finishes the entire bucket, but it can never pass VACUUM,
 * because VACUUM always locks the next page before releasing the lock on
 * the previous one.  Therefore, we don't have to worry about accidentally
 * killing a TID that has been reused for an unrelated tuple.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;	/* saved: so->numKilled is reset
											 * below */
	int			i;
	bool		killedsomething = false;
	bool		havePin = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);
	Assert(HashScanPosIsValid(so->currPos));

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;

	blkno = so->currPos.currPage;
	if (HashScanPosIsPinned(so->currPos))
	{
		/*
		 * We already have pin on this buffer, so, all we need to do is
		 * acquire lock on it.
		 */
		havePin = true;
		buf = so->currPos.buf;
		LockBuffer(buf, BUFFER_LOCK_SHARE);
	}
	else
		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);

	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		HashScanPosItem *currItem = &so->currPos.items[itemIndex];

		offnum = currItem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);

		/*
		 * Starting at the remembered offset, search forward for the entry
		 * whose heap TID matches; only that entry is marked dead.
		 */
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint. Whenever
	 * we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(buf, true);
	}

	/*
	 * If the pin was already held (by the scan position, or because the page
	 * is the primary bucket page), drop only the lock we took above;
	 * otherwise release both the lock and the pin acquired by _hash_getbuf.
	 */
	if (so->hashso_bucket_buf == so->currPos.buf ||
		havePin)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);
}