1 /*------------------------------------------------------------------------- 2 * 3 * hash.h 4 * header file for postgres hash access method implementation 5 * 6 * 7 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * src/include/access/hash.h 11 * 12 * NOTES 13 * modeled after Margo Seltzer's hash implementation for unix. 14 * 15 *------------------------------------------------------------------------- 16 */ 17 #ifndef HASH_H 18 #define HASH_H 19 20 #include "access/amapi.h" 21 #include "access/itup.h" 22 #include "access/sdir.h" 23 #include "catalog/pg_am_d.h" 24 #include "common/hashfn.h" 25 #include "lib/stringinfo.h" 26 #include "storage/bufmgr.h" 27 #include "storage/lockdefs.h" 28 #include "utils/hsearch.h" 29 #include "utils/relcache.h" 30 31 /* 32 * Mapping from hash bucket number to physical block number of bucket's 33 * starting page. Beware of multiple evaluations of argument! 34 */ 35 typedef uint32 Bucket; 36 37 #define InvalidBucket ((Bucket) 0xFFFFFFFF) 38 39 #define BUCKET_TO_BLKNO(metap,B) \ 40 ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1) 41 42 /* 43 * Special space for hash index pages. 44 * 45 * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at. 46 * Additional bits in the flag word are used for more transient purposes. 47 * 48 * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE. 49 * However, we ensure that each used page type has a distinct bit so that 50 * we can OR together page types for uses such as the allowable-page-types 51 * argument of _hash_checkpage(). 52 */ 53 #define LH_UNUSED_PAGE (0) 54 #define LH_OVERFLOW_PAGE (1 << 0) 55 #define LH_BUCKET_PAGE (1 << 1) 56 #define LH_BITMAP_PAGE (1 << 2) 57 #define LH_META_PAGE (1 << 3) 58 #define LH_BUCKET_BEING_POPULATED (1 << 4) 59 #define LH_BUCKET_BEING_SPLIT (1 << 5) 60 #define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6) 61 #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7) 62 63 #define LH_PAGE_TYPE \ 64 (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE) 65 66 /* 67 * In an overflow page, hasho_prevblkno stores the block number of the previous 68 * page in the bucket chain; in a bucket page, hasho_prevblkno stores the 69 * hashm_maxbucket value as of the last time the bucket was last split, or 70 * else as of the time the bucket was created. The latter convention is used 71 * to determine whether a cached copy of the metapage is too stale to be used 72 * without needing to lock or pin the metapage. 73 * 74 * hasho_nextblkno is always the block number of the next page in the 75 * bucket chain, or InvalidBlockNumber if there are no more such pages. 76 */ 77 typedef struct HashPageOpaqueData 78 { 79 BlockNumber hasho_prevblkno; /* see above */ 80 BlockNumber hasho_nextblkno; /* see above */ 81 Bucket hasho_bucket; /* bucket number this pg belongs to */ 82 uint16 hasho_flag; /* page type code + flag bits, see above */ 83 uint16 hasho_page_id; /* for identification of hash indexes */ 84 } HashPageOpaqueData; 85 86 typedef HashPageOpaqueData *HashPageOpaque; 87 88 #define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0) 89 #define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0) 90 #define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0) 91 #define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0) 92 93 /* 94 * The page ID is for the convenience of pg_filedump and similar utilities, 95 * which otherwise would have a hard time telling pages of different index 96 * types apart. It should be the last 2 bytes on the page. This is more or 97 * less "free" due to alignment considerations. 98 */ 99 #define HASHO_PAGE_ID 0xFF80 100 101 typedef struct HashScanPosItem /* what we remember about each match */ 102 { 103 ItemPointerData heapTid; /* TID of referenced heap item */ 104 OffsetNumber indexOffset; /* index item's location within page */ 105 } HashScanPosItem; 106 107 typedef struct HashScanPosData 108 { 109 Buffer buf; /* if valid, the buffer is pinned */ 110 BlockNumber currPage; /* current hash index page */ 111 BlockNumber nextPage; /* next overflow page */ 112 BlockNumber prevPage; /* prev overflow or bucket page */ 113 114 /* 115 * The items array is always ordered in index order (ie, increasing 116 * indexoffset). When scanning backwards it is convenient to fill the 117 * array back-to-front, so we start at the last slot and fill downwards. 118 * Hence we need both a first-valid-entry and a last-valid-entry counter. 119 * itemIndex is a cursor showing which entry was last returned to caller. 120 */ 121 int firstItem; /* first valid index in items[] */ 122 int lastItem; /* last valid index in items[] */ 123 int itemIndex; /* current index in items[] */ 124 125 HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ 126 } HashScanPosData; 127 128 #define HashScanPosIsPinned(scanpos) \ 129 ( \ 130 AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ 131 !BufferIsValid((scanpos).buf)), \ 132 BufferIsValid((scanpos).buf) \ 133 ) 134 135 #define HashScanPosIsValid(scanpos) \ 136 ( \ 137 AssertMacro(BlockNumberIsValid((scanpos).currPage) || \ 138 !BufferIsValid((scanpos).buf)), \ 139 BlockNumberIsValid((scanpos).currPage) \ 140 ) 141 142 #define HashScanPosInvalidate(scanpos) \ 143 do { \ 144 (scanpos).buf = InvalidBuffer; \ 145 (scanpos).currPage = InvalidBlockNumber; \ 146 (scanpos).nextPage = InvalidBlockNumber; \ 147 (scanpos).prevPage = InvalidBlockNumber; \ 148 (scanpos).firstItem = 0; \ 149 (scanpos).lastItem = 0; \ 150 (scanpos).itemIndex = 0; \ 151 } while (0) 152 153 /* 154 * HashScanOpaqueData is private state for a hash index scan. 155 */ 156 typedef struct HashScanOpaqueData 157 { 158 /* Hash value of the scan key, ie, the hash key we seek */ 159 uint32 hashso_sk_hash; 160 161 /* remember the buffer associated with primary bucket */ 162 Buffer hashso_bucket_buf; 163 164 /* 165 * remember the buffer associated with primary bucket page of bucket being 166 * split. it is required during the scan of the bucket which is being 167 * populated during split operation. 168 */ 169 Buffer hashso_split_bucket_buf; 170 171 /* Whether scan starts on bucket being populated due to split */ 172 bool hashso_buc_populated; 173 174 /* 175 * Whether scanning bucket being split? The value of this parameter is 176 * referred only when hashso_buc_populated is true. 177 */ 178 bool hashso_buc_split; 179 /* info about killed items if any (killedItems is NULL if never used) */ 180 int *killedItems; /* currPos.items indexes of killed items */ 181 int numKilled; /* number of currently stored items */ 182 183 /* 184 * Identify all the matching items on a page and save them in 185 * HashScanPosData 186 */ 187 HashScanPosData currPos; /* current position data */ 188 } HashScanOpaqueData; 189 190 typedef HashScanOpaqueData *HashScanOpaque; 191 192 /* 193 * Definitions for metapage. 194 */ 195 196 #define HASH_METAPAGE 0 /* metapage is always block 0 */ 197 198 #define HASH_MAGIC 0x6440640 199 #define HASH_VERSION 4 200 201 /* 202 * spares[] holds the number of overflow pages currently allocated at or 203 * before a certain splitpoint. For example, if spares[3] = 7 then there are 204 * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The 205 * value in spares[ovflpoint] increases as overflow pages are added at the 206 * end of the index. Once ovflpoint increases (ie, we have actually allocated 207 * the bucket pages belonging to that splitpoint) the number of spares at the 208 * prior splitpoint cannot change anymore. 209 * 210 * ovflpages that have been recycled for reuse can be found by looking at 211 * bitmaps that are stored within ovflpages dedicated for the purpose. 212 * The blknos of these bitmap pages are kept in mapp[]; nmaps is the 213 * number of currently existing bitmaps. 214 * 215 * The limitation on the size of spares[] comes from the fact that there's 216 * no point in having more than 2^32 buckets with only uint32 hashcodes. 217 * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is 218 * adjusted in such a way to accommodate multi phased allocation of buckets 219 * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE). 220 * 221 * There is no particular upper limit on the size of mapp[], other than 222 * needing to fit into the metapage. (With 8K block size, 1024 bitmaps 223 * limit us to 256 GB of overflow space...). For smaller block size we 224 * can not use 1024 bitmaps as it will lead to the meta page data crossing 225 * the block size boundary. So we use BLCKSZ to determine the maximum number 226 * of bitmaps. 227 */ 228 #define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024) 229 230 #define HASH_SPLITPOINT_PHASE_BITS 2 231 #define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS) 232 #define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1) 233 #define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10 234 235 /* defines max number of splitpoint phases a hash index can have */ 236 #define HASH_MAX_SPLITPOINT_GROUP 32 237 #define HASH_MAX_SPLITPOINTS \ 238 (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \ 239 HASH_SPLITPOINT_PHASES_PER_GRP) + \ 240 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) 241 242 typedef struct HashMetaPageData 243 { 244 uint32 hashm_magic; /* magic no. for hash tables */ 245 uint32 hashm_version; /* version ID */ 246 double hashm_ntuples; /* number of tuples stored in the table */ 247 uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */ 248 uint16 hashm_bsize; /* index page size (bytes) */ 249 uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a power 250 * of 2 */ 251 uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */ 252 uint32 hashm_maxbucket; /* ID of maximum bucket in use */ 253 uint32 hashm_highmask; /* mask to modulo into entire table */ 254 uint32 hashm_lowmask; /* mask to modulo into lower half of table */ 255 uint32 hashm_ovflpoint; /* splitpoint from which ovflpage being 256 * allocated */ 257 uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */ 258 uint32 hashm_nmaps; /* number of bitmap pages */ 259 RegProcedure hashm_procid; /* hash function id from pg_proc */ 260 uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each 261 * splitpoint */ 262 BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */ 263 } HashMetaPageData; 264 265 typedef HashMetaPageData *HashMetaPage; 266 267 typedef struct HashOptions 268 { 269 int32 varlena_header_; /* varlena header (do not touch directly!) */ 270 int fillfactor; /* page fill factor in percent (0..100) */ 271 } HashOptions; 272 273 #define HashGetFillFactor(relation) \ 274 (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ 275 relation->rd_rel->relam == HASH_AM_OID), \ 276 (relation)->rd_options ? \ 277 ((HashOptions *) (relation)->rd_options)->fillfactor : \ 278 HASH_DEFAULT_FILLFACTOR) 279 #define HashGetTargetPageUsage(relation) \ 280 (BLCKSZ * HashGetFillFactor(relation) / 100) 281 282 /* 283 * Maximum size of a hash index item (it's okay to have only one per page) 284 */ 285 #define HashMaxItemSize(page) \ 286 MAXALIGN_DOWN(PageGetPageSize(page) - \ 287 SizeOfPageHeaderData - \ 288 sizeof(ItemIdData) - \ 289 MAXALIGN(sizeof(HashPageOpaqueData))) 290 291 #define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT 292 293 #define HASH_MIN_FILLFACTOR 10 294 #define HASH_DEFAULT_FILLFACTOR 75 295 296 /* 297 * Constants 298 */ 299 #define BYTE_TO_BIT 3 /* 2^3 bits/byte */ 300 #define ALL_SET ((uint32) ~0) 301 302 /* 303 * Bitmap pages do not contain tuples. They do contain the standard 304 * page headers and trailers; however, everything in between is a 305 * giant bit array. The number of bits that fit on a page obviously 306 * depends on the page size and the header/trailer overhead. We require 307 * the number of bits per page to be a power of 2. 308 */ 309 #define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) 310 #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) 311 #define BMPG_SHIFT(metap) ((metap)->hashm_bmshift) 312 #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) 313 314 #define HashPageGetBitmap(page) \ 315 ((uint32 *) PageGetContents(page)) 316 317 #define HashGetMaxBitmapSize(page) \ 318 (PageGetPageSize((Page) page) - \ 319 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData)))) 320 321 #define HashPageGetMeta(page) \ 322 ((HashMetaPage) PageGetContents(page)) 323 324 /* 325 * The number of bits in an ovflpage bitmap word. 326 */ 327 #define BITS_PER_MAP 32 /* Number of bits in uint32 */ 328 329 /* Given the address of the beginning of a bit map, clear/set the nth bit */ 330 #define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) 331 #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) 332 #define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) 333 334 /* 335 * page-level and high-level locking modes (see README) 336 */ 337 #define HASH_READ BUFFER_LOCK_SHARE 338 #define HASH_WRITE BUFFER_LOCK_EXCLUSIVE 339 #define HASH_NOLOCK (-1) 340 341 /* 342 * When a new operator class is declared, we require that the user supply 343 * us with an amproc function for hashing a key of the new type, returning 344 * a 32-bit hash value. We call this the "standard" hash function. We 345 * also allow an optional "extended" hash function which accepts a salt and 346 * returns a 64-bit hash value. This is highly recommended but, for reasons 347 * of backward compatibility, optional. 348 * 349 * When the salt is 0, the low 32 bits of the value returned by the extended 350 * hash function should match the value that would have been returned by the 351 * standard hash function. 352 */ 353 #define HASHSTANDARD_PROC 1 354 #define HASHEXTENDED_PROC 2 355 #define HASHOPTIONS_PROC 3 356 #define HASHNProcs 3 357 358 359 /* public routines */ 360 361 extern IndexBuildResult *hashbuild(Relation heap, Relation index, 362 struct IndexInfo *indexInfo); 363 extern void hashbuildempty(Relation index); 364 extern bool hashinsert(Relation rel, Datum *values, bool *isnull, 365 ItemPointer ht_ctid, Relation heapRel, 366 IndexUniqueCheck checkUnique, 367 struct IndexInfo *indexInfo); 368 extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir); 369 extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); 370 extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys); 371 extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, 372 ScanKey orderbys, int norderbys); 373 extern void hashendscan(IndexScanDesc scan); 374 extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info, 375 IndexBulkDeleteResult *stats, 376 IndexBulkDeleteCallback callback, 377 void *callback_state); 378 extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info, 379 IndexBulkDeleteResult *stats); 380 extern bytea *hashoptions(Datum reloptions, bool validate); 381 extern bool hashvalidate(Oid opclassoid); 382 383 /* private routines */ 384 385 /* hashinsert.c */ 386 extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel); 387 extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, 388 Size itemsize, IndexTuple itup); 389 extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, 390 OffsetNumber *itup_offsets, uint16 nitups); 391 392 /* hashovfl.c */ 393 extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); 394 extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, 395 Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, 396 Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy); 397 extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage); 398 extern void _hash_squeezebucket(Relation rel, 399 Bucket bucket, BlockNumber bucket_blkno, 400 Buffer bucket_buf, 401 BufferAccessStrategy bstrategy); 402 extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno); 403 404 /* hashpage.c */ 405 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, 406 int access, int flags); 407 extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, 408 BlockNumber blkno, int flags); 409 extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, 410 bool force_refresh); 411 extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, 412 int access, 413 HashMetaPage *cachedmetap); 414 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); 415 extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, 416 uint32 flag, bool initpage); 417 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, 418 ForkNumber forkNum); 419 extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, 420 int access, int flags, 421 BufferAccessStrategy bstrategy); 422 extern void _hash_relbuf(Relation rel, Buffer buf); 423 extern void _hash_dropbuf(Relation rel, Buffer buf); 424 extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so); 425 extern uint32 _hash_init(Relation rel, double num_tuples, 426 ForkNumber forkNum); 427 extern void _hash_init_metabuffer(Buffer buf, double num_tuples, 428 RegProcedure procid, uint16 ffactor, bool initpage); 429 extern void _hash_pageinit(Page page, Size size); 430 extern void _hash_expandtable(Relation rel, Buffer metabuf); 431 extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, 432 Bucket obucket, uint32 maxbucket, uint32 highmask, 433 uint32 lowmask); 434 435 /* hashsearch.c */ 436 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); 437 extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); 438 439 /* hashsort.c */ 440 typedef struct HSpool HSpool; /* opaque struct in hashsort.c */ 441 442 extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets); 443 extern void _h_spooldestroy(HSpool *hspool); 444 extern void _h_spool(HSpool *hspool, ItemPointer self, 445 Datum *values, bool *isnull); 446 extern void _h_indexbuild(HSpool *hspool, Relation heapRel); 447 448 /* hashutil.c */ 449 extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup); 450 extern uint32 _hash_datum2hashkey(Relation rel, Datum key); 451 extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype); 452 extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, 453 uint32 highmask, uint32 lowmask); 454 extern uint32 _hash_spareindex(uint32 num_bucket); 455 extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase); 456 extern void _hash_checkpage(Relation rel, Buffer buf, int flags); 457 extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); 458 extern bool _hash_convert_tuple(Relation index, 459 Datum *user_values, bool *user_isnull, 460 Datum *index_values, bool *index_isnull); 461 extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); 462 extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); 463 extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket); 464 extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket); 465 extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, 466 uint32 lowmask, uint32 maxbucket); 467 extern void _hash_kill_items(IndexScanDesc scan); 468 469 /* hash.c */ 470 extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, 471 Buffer bucket_buf, BlockNumber bucket_blkno, 472 BufferAccessStrategy bstrategy, 473 uint32 maxbucket, uint32 highmask, uint32 lowmask, 474 double *tuples_removed, double *num_index_tuples, 475 bool split_cleanup, 476 IndexBulkDeleteCallback callback, void *callback_state); 477 478 #endif /* HASH_H */ 479