/*-------------------------------------------------------------------------
 *
 * hash.h
 *	  header file for postgres hash access method implementation
 *
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/hash.h
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
 *
 *-------------------------------------------------------------------------
 */
#ifndef HASH_H
#define HASH_H

#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/lockdefs.h"
#include "utils/hsearch.h"
#include "utils/relcache.h"

/*
 * Mapping from hash bucket number to physical block number of bucket's
 * starting page.  Beware of multiple evaluations of argument!
 */
typedef uint32 Bucket;

#define InvalidBucket	((Bucket) 0xFFFFFFFF)

/* bucket B's primary page; block 0 is the metapage, hence the trailing +1 */
#define BUCKET_TO_BLKNO(metap,B) \
	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)

/*
 * Special space for hash index pages.
 *
 * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
 * Additional bits in the flag word are used for more transient purposes.
 *
 * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
 * However, we ensure that each used page type has a distinct bit so that
 * we can OR together page types for uses such as the allowable-page-types
 * argument of _hash_checkpage().
 */
#define LH_UNUSED_PAGE			(0)
#define LH_OVERFLOW_PAGE		(1 << 0)
#define LH_BUCKET_PAGE			(1 << 1)
#define LH_BITMAP_PAGE			(1 << 2)
#define LH_META_PAGE			(1 << 3)
#define LH_BUCKET_BEING_POPULATED	(1 << 4)
#define LH_BUCKET_BEING_SPLIT	(1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES	(1 << 7)

#define LH_PAGE_TYPE \
	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)

/*
 * In an overflow page, hasho_prevblkno stores the block number of the previous
 * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
 * hashm_maxbucket value as of the last time the bucket was split, or
 * else as of the time the bucket was created.  The latter convention is used
 * to determine whether a cached copy of the metapage is too stale to be used
 * without needing to lock or pin the metapage.
 *
 * hasho_nextblkno is always the block number of the next page in the
 * bucket chain, or InvalidBlockNumber if there are no more such pages.
 */
typedef struct HashPageOpaqueData
{
	BlockNumber hasho_prevblkno;	/* see above */
	BlockNumber hasho_nextblkno;	/* see above */
	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
	uint16		hasho_flag;		/* page type code + flag bits, see above */
	uint16		hasho_page_id;	/* for identification of hash indexes */
} HashPageOpaqueData;

typedef HashPageOpaqueData *HashPageOpaque;

/* convenience tests for the transient flag bits in hasho_flag */
#define H_NEEDS_SPLIT_CLEANUP(opaque)	(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)

/*
 * The page ID is for the convenience of pg_filedump and similar utilities,
 * which otherwise would have a hard time telling pages of different index
 * types apart.  It should be the last 2 bytes on the page.  This is more or
 * less "free" due to alignment considerations.
 */
#define HASHO_PAGE_ID		0xFF80

typedef struct HashScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
} HashScanPosItem;


/*
 *	HashScanOpaqueData is private state for a hash index scan.
 */
typedef struct HashScanOpaqueData
{
	/* Hash value of the scan key, ie, the hash key we seek */
	uint32		hashso_sk_hash;

	/*
	 * We also want to remember which buffer we're currently examining in the
	 * scan. We keep the buffer pinned (but not locked) across hashgettuple
	 * calls, in order to avoid doing a ReadBuffer() for every tuple in the
	 * index.
	 */
	Buffer		hashso_curbuf;

	/* remember the buffer associated with primary bucket */
	Buffer		hashso_bucket_buf;

	/*
	 * remember the buffer associated with primary bucket page of bucket being
	 * split.  it is required during the scan of the bucket which is being
	 * populated during split operation.
	 */
	Buffer		hashso_split_bucket_buf;

	/* Current position of the scan, as an index TID */
	ItemPointerData hashso_curpos;

	/* Current position of the scan, as a heap TID */
	ItemPointerData hashso_heappos;

	/* Whether scan starts on bucket being populated due to split */
	bool		hashso_buc_populated;

	/*
	 * Whether scanning bucket being split?  The value of this parameter is
	 * referred only when hashso_buc_populated is true.
	 */
	bool		hashso_buc_split;
	/* info about killed items if any (killedItems is NULL if never used) */
	HashScanPosItem *killedItems;	/* tids and offset numbers of killed items */
	int			numKilled;		/* number of currently stored items */
} HashScanOpaqueData;

typedef HashScanOpaqueData *HashScanOpaque;

/*
 * Definitions for metapage.
 */

#define HASH_METAPAGE	0		/* metapage is always block 0 */

#define HASH_MAGIC		0x6440640
#define HASH_VERSION	4

/*
 * spares[] holds the number of overflow pages currently allocated at or
 * before a certain splitpoint. For example, if spares[3] = 7 then there are
 * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
 * value in spares[ovflpoint] increases as overflow pages are added at the
 * end of the index.  Once ovflpoint increases (ie, we have actually allocated
 * the bucket pages belonging to that splitpoint) the number of spares at the
 * prior splitpoint cannot change anymore.
 *
 * ovflpages that have been recycled for reuse can be found by looking at
 * bitmaps that are stored within ovflpages dedicated for the purpose.
 * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
 * number of currently existing bitmaps.
 *
 * The limitation on the size of spares[] comes from the fact that there's
 * no point in having more than 2^32 buckets with only uint32 hashcodes.
 * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
 * adjusted in such a way to accommodate multi phased allocation of buckets
 * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
 *
 * There is no particular upper limit on the size of mapp[], other than
 * needing to fit into the metapage.  (With 8K block size, 1024 bitmaps
 * limit us to 256 GB of overflow space...).  For smaller block size we
 * can not use 1024 bitmaps as it will lead to the meta page data crossing
 * the block size boundary.  So we use BLCKSZ to determine the maximum number
 * of bitmaps.
 */
#define HASH_MAX_BITMAPS			Min(BLCKSZ / 8, 1024)

#define HASH_SPLITPOINT_PHASE_BITS	2
#define HASH_SPLITPOINT_PHASES_PER_GRP	(1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK		(HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE	10

/* defines max number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP	32
#define HASH_MAX_SPLITPOINTS \
	(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
	  HASH_SPLITPOINT_PHASES_PER_GRP) + \
	 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)

typedef struct HashMetaPageData
{
	uint32		hashm_magic;	/* magic no. for hash tables */
	uint32		hashm_version;	/* version ID */
	double		hashm_ntuples;	/* number of tuples stored in the table */
	uint16		hashm_ffactor;	/* target fill factor (tuples/bucket) */
	uint16		hashm_bsize;	/* index page size (bytes) */
	uint16		hashm_bmsize;	/* bitmap array size (bytes) - must be a power
								 * of 2 */
	uint16		hashm_bmshift;	/* log2(bitmap array size in BITS) */
	uint32		hashm_maxbucket;	/* ID of maximum bucket in use */
	uint32		hashm_highmask;	/* mask to modulo into entire table */
	uint32		hashm_lowmask;	/* mask to modulo into lower half of table */
	uint32		hashm_ovflpoint;	/* splitpoint from which ovflpgs being
									 * allocated */
	uint32		hashm_firstfree;	/* lowest-number free ovflpage (bit#) */
	uint32		hashm_nmaps;	/* number of bitmap pages */
	RegProcedure hashm_procid;	/* hash procedure id from pg_proc */
	uint32		hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each
													 * splitpoint */
	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */
} HashMetaPageData;

typedef HashMetaPageData *HashMetaPage;

/*
 * Maximum size of a hash index item (it's okay to have only one per page)
 */
#define HashMaxItemSize(page) \
	MAXALIGN_DOWN(PageGetPageSize(page) - \
				  SizeOfPageHeaderData - \
				  sizeof(ItemIdData) - \
				  MAXALIGN(sizeof(HashPageOpaqueData)))

#define INDEX_MOVED_BY_SPLIT_MASK	0x2000

#define HASH_MIN_FILLFACTOR			10
#define HASH_DEFAULT_FILLFACTOR		75

/*
 * Constants
 */
#define BYTE_TO_BIT				3	/* 2^3 bits/byte */
#define ALL_SET					((uint32) ~0)

/*
 * Bitmap pages do not contain tuples.  They do contain the standard
 * page headers and trailers; however, everything in between is a
 * giant bit array.  The number of bits that fit on a page obviously
 * depends on the page size and the header/trailer overhead.  We require
 * the number of bits per page to be a power of 2.
 */
#define BMPGSZ_BYTE(metap)		((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(metap)		((metap)->hashm_bmshift)
#define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1)

#define HashPageGetBitmap(page) \
	((uint32 *) PageGetContents(page))

#define HashGetMaxBitmapSize(page) \
	(PageGetPageSize((Page) page) - \
	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))

#define HashPageGetMeta(page) \
	((HashMetaPage) PageGetContents(page))

/*
 * The number of bits in an ovflpage bitmap word.
 */
#define BITS_PER_MAP	32		/* Number of bits in uint32 */

/* Given the address of the beginning of a bit map, clear/set the nth bit */
#define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
#define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
#define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))

/*
 * page-level and high-level locking modes (see README)
 */
#define HASH_READ		BUFFER_LOCK_SHARE
#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
#define HASH_NOLOCK		(-1)

/*
 * Strategy number. There's only one valid strategy for hashing: equality.
 */
#define HTEqualStrategyNumber			1
#define HTMaxStrategyNumber				1

/*
 * When a new operator class is declared, we require that the user supply
 * us with an amproc procedure for hashing a key of the new type.
 * Since we only have one such proc in amproc, it's number 1.
 */
#define HASHPROC		1
#define HASHNProcs		1


/* public routines */

extern IndexBuildResult *hashbuild(Relation heap, Relation index,
		  struct IndexInfo *indexInfo);
extern void hashbuildempty(Relation index);
extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
		   ItemPointer ht_ctid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   struct IndexInfo *indexInfo);
extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys);
extern void hashendscan(IndexScanDesc scan);
extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
			   IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback,
			   void *callback_state);
extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
				  IndexBulkDeleteResult *stats);
extern bytea *hashoptions(Datum reloptions, bool validate);
extern bool hashvalidate(Oid opclassoid);

extern Datum hash_any(register const unsigned char *k, register int keylen);
extern Datum hash_uint32(uint32 k);

/* private routines */

/* hashinsert.c */
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
			   Size itemsize, IndexTuple itup);
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
					OffsetNumber *itup_offsets, uint16 nitups);

/* hashovfl.c */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
				   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
				   Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel,
					Bucket bucket, BlockNumber bucket_blkno,
					Buffer bucket_buf,
					BufferAccessStrategy bstrategy);
extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);

/* hashpage.c */
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
			 int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
								   BlockNumber blkno, int flags);
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
					 bool force_refresh);
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
								int access,
								HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
			  uint32 flag, bool initpage);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
				ForkNumber forkNum);
extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
						   int access, int flags,
						   BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
extern uint32 _hash_init(Relation rel, double num_tuples,
		   ForkNumber forkNum);
extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
					  RegProcedure procid, uint16 ffactor, bool initpage);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
				   Bucket obucket, uint32 maxbucket, uint32 highmask,
				   uint32 lowmask);

/* hashsearch.c */
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);

/* hashsort.c */
typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */

extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
extern void _h_spooldestroy(HSpool *hspool);
extern void _h_spool(HSpool *hspool, ItemPointer self,
		 Datum *values, bool *isnull);
extern void _h_indexbuild(HSpool *hspool, Relation heapRel);

/* hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,
					Datum *user_values, bool *user_isnull,
					Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
								   uint32 lowmask, uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);

/* hash.c */
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
				  Buffer bucket_buf, BlockNumber bucket_blkno,
				  BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool bucket_has_garbage,
				  IndexBulkDeleteCallback callback, void *callback_state);

#endif							/* HASH_H */