1 /*-------------------------------------------------------------------------
2  *
3  * hash.h
4  *	  header file for postgres hash access method implementation
5  *
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/include/access/hash.h
11  *
12  * NOTES
13  *		modeled after Margo Seltzer's hash implementation for unix.
14  *
15  *-------------------------------------------------------------------------
16  */
17 #ifndef HASH_H
18 #define HASH_H
19 
20 #include "access/amapi.h"
21 #include "access/itup.h"
22 #include "access/sdir.h"
23 #include "catalog/pg_am_d.h"
24 #include "common/hashfn.h"
25 #include "lib/stringinfo.h"
26 #include "storage/bufmgr.h"
27 #include "storage/lockdefs.h"
28 #include "utils/hsearch.h"
29 #include "utils/relcache.h"
30 
31 /*
32  * Mapping from hash bucket number to physical block number of bucket's
33  * starting page.  Beware of multiple evaluations of argument!
34  */
typedef uint32 Bucket;

/* sentinel "no such bucket" value (all-ones bucket number) */
#define InvalidBucket	((Bucket) 0xFFFFFFFF)

/*
 * Block number of bucket B's primary page: B itself, plus the overflow
 * pages allocated before B's splitpoint (hashm_spares), plus 1 because
 * block 0 is the metapage.  Beware: B is evaluated more than once.
 */
#define BUCKET_TO_BLKNO(metap,B) \
		((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
41 
42 /*
43  * Special space for hash index pages.
44  *
45  * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
46  * Additional bits in the flag word are used for more transient purposes.
47  *
48  * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
49  * However, we ensure that each used page type has a distinct bit so that
50  * we can OR together page types for uses such as the allowable-page-types
51  * argument of _hash_checkpage().
52  */
/* page-type bits: exactly one of these is set on any used page */
#define LH_UNUSED_PAGE			(0)
#define LH_OVERFLOW_PAGE		(1 << 0)
#define LH_BUCKET_PAGE			(1 << 1)
#define LH_BITMAP_PAGE			(1 << 2)
#define LH_META_PAGE			(1 << 3)
/* transient state bits (see comment above) */
#define LH_BUCKET_BEING_POPULATED	(1 << 4)
#define LH_BUCKET_BEING_SPLIT	(1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)

/* mask covering all page-type bits, for the (flag & LH_PAGE_TYPE) test */
#define LH_PAGE_TYPE \
	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
65 
66 /*
67  * In an overflow page, hasho_prevblkno stores the block number of the previous
68  * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
 * hashm_maxbucket value as of the last time the bucket was split, or
70  * else as of the time the bucket was created.  The latter convention is used
71  * to determine whether a cached copy of the metapage is too stale to be used
72  * without needing to lock or pin the metapage.
73  *
74  * hasho_nextblkno is always the block number of the next page in the
75  * bucket chain, or InvalidBlockNumber if there are no more such pages.
76  */
/* "special space" stored at the end of every hash index page */
typedef struct HashPageOpaqueData
{
	BlockNumber hasho_prevblkno;	/* previous page in bucket chain, or
									 * cached hashm_maxbucket (see above) */
	BlockNumber hasho_nextblkno;	/* next page in bucket chain, or
									 * InvalidBlockNumber (see above) */
	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
	uint16		hasho_flag;		/* page type code + flag bits, see above */
	uint16		hasho_page_id;	/* for identification of hash indexes */
} HashPageOpaqueData;

typedef HashPageOpaqueData *HashPageOpaque;
87 
/* convenience tests on hasho_flag bits; "opaque" is a HashPageOpaque */
#define H_NEEDS_SPLIT_CLEANUP(opaque)	(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
92 
93 /*
94  * The page ID is for the convenience of pg_filedump and similar utilities,
95  * which otherwise would have a hard time telling pages of different index
96  * types apart.  It should be the last 2 bytes on the page.  This is more or
97  * less "free" due to alignment considerations.
98  */
99 #define HASHO_PAGE_ID		0xFF80
100 
typedef struct HashScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
} HashScanPosItem;
106 
/*
 * Scan position state for one index page: the pinned buffer, links to the
 * neighboring pages of the bucket chain, and the matching items found on
 * the page.
 */
typedef struct HashScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */
	BlockNumber currPage;		/* current hash index page */
	BlockNumber nextPage;		/* next overflow page */
	BlockNumber prevPage;		/* prev overflow or bucket page */

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	HashScanPosItem items[MaxIndexTuplesPerPage];	/* MUST BE LAST */
} HashScanPosData;
127 
/*
 * Does the scan position hold a buffer pin?  (Asserts that a pinned buffer
 * implies a valid current page.)
 */
#define HashScanPosIsPinned(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BufferIsValid((scanpos).buf) \
)

/*
 * Is the scan positioned on some page?  The pin may or may not still be
 * held; only currPage is consulted.
 */
#define HashScanPosIsValid(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BlockNumberIsValid((scanpos).currPage) \
)

/*
 * Reset a scan position to "no position".  Note this only overwrites the
 * fields; it does not itself release any buffer pin.
 */
#define HashScanPosInvalidate(scanpos) \
	do { \
		(scanpos).buf = InvalidBuffer; \
		(scanpos).currPage = InvalidBlockNumber; \
		(scanpos).nextPage = InvalidBlockNumber; \
		(scanpos).prevPage = InvalidBlockNumber; \
		(scanpos).firstItem = 0; \
		(scanpos).lastItem = 0; \
		(scanpos).itemIndex = 0; \
	} while (0)
152 
153 /*
154  *	HashScanOpaqueData is private state for a hash index scan.
155  */
typedef struct HashScanOpaqueData
{
	/* Hash value of the scan key, ie, the hash key we seek */
	uint32		hashso_sk_hash;

	/* remember the buffer associated with primary bucket */
	Buffer		hashso_bucket_buf;

	/*
	 * remember the buffer associated with primary bucket page of bucket being
	 * split.  it is required during the scan of the bucket which is being
	 * populated during split operation.
	 */
	Buffer		hashso_split_bucket_buf;

	/* Whether scan starts on bucket being populated due to split */
	bool		hashso_buc_populated;

	/*
	 * Whether we are scanning the bucket being split.  Consulted only when
	 * hashso_buc_populated is true.
	 */
	bool		hashso_buc_split;
	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * Identify all the matching items on a page and save them in
	 * HashScanPosData
	 */
	HashScanPosData currPos;	/* current position data */
} HashScanOpaqueData;

typedef HashScanOpaqueData *HashScanOpaque;
191 
192 /*
193  * Definitions for metapage.
194  */
195 
#define HASH_METAPAGE	0		/* metapage is always block 0 */

#define HASH_MAGIC		0x6440640	/* expected hashm_magic value */
#define HASH_VERSION	4		/* expected hashm_version value */
200 
201 /*
202  * spares[] holds the number of overflow pages currently allocated at or
203  * before a certain splitpoint. For example, if spares[3] = 7 then there are
204  * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
205  * value in spares[ovflpoint] increases as overflow pages are added at the
206  * end of the index.  Once ovflpoint increases (ie, we have actually allocated
207  * the bucket pages belonging to that splitpoint) the number of spares at the
208  * prior splitpoint cannot change anymore.
209  *
210  * ovflpages that have been recycled for reuse can be found by looking at
211  * bitmaps that are stored within ovflpages dedicated for the purpose.
212  * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
213  * number of currently existing bitmaps.
214  *
215  * The limitation on the size of spares[] comes from the fact that there's
216  * no point in having more than 2^32 buckets with only uint32 hashcodes.
217  * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
218  * adjusted in such a way to accommodate multi phased allocation of buckets
219  * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
220  *
221  * There is no particular upper limit on the size of mapp[], other than
222  * needing to fit into the metapage.  (With 8K block size, 1024 bitmaps
223  * limit us to 256 GB of overflow space...).  For smaller block size we
224  * can not use 1024 bitmaps as it will lead to the meta page data crossing
225  * the block size boundary.  So we use BLCKSZ to determine the maximum number
226  * of bitmaps.
227  */
#define HASH_MAX_BITMAPS			Min(BLCKSZ / 8, 1024)

/*
 * Splitpoint groups after the first HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE
 * are allocated in 2^HASH_SPLITPOINT_PHASE_BITS phases apiece.
 */
#define HASH_SPLITPOINT_PHASE_BITS	2
#define HASH_SPLITPOINT_PHASES_PER_GRP	(1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK		(HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE	10

/* defines max number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP	32
#define HASH_MAX_SPLITPOINTS \
	(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
	  HASH_SPLITPOINT_PHASES_PER_GRP) + \
	 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
241 
/*
 * On-disk contents of the metapage (block HASH_METAPAGE).  See the comments
 * above for the semantics of hashm_spares[] and hashm_mapp[].
 */
typedef struct HashMetaPageData
{
	uint32		hashm_magic;	/* magic no. for hash tables */
	uint32		hashm_version;	/* version ID */
	double		hashm_ntuples;	/* number of tuples stored in the table */
	uint16		hashm_ffactor;	/* target fill factor (tuples/bucket) */
	uint16		hashm_bsize;	/* index page size (bytes) */
	uint16		hashm_bmsize;	/* bitmap array size (bytes) - must be a power
								 * of 2 */
	uint16		hashm_bmshift;	/* log2(bitmap array size in BITS) */
	uint32		hashm_maxbucket;	/* ID of maximum bucket in use */
	uint32		hashm_highmask; /* mask to modulo into entire table */
	uint32		hashm_lowmask;	/* mask to modulo into lower half of table */
	uint32		hashm_ovflpoint;	/* splitpoint from which ovflpage being
									 * allocated */
	uint32		hashm_firstfree;	/* lowest-number free ovflpage (bit#) */
	uint32		hashm_nmaps;	/* number of bitmap pages */
	RegProcedure hashm_procid;	/* hash function id from pg_proc */
	uint32		hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each
													 * splitpoint */
	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */
} HashMetaPageData;

typedef HashMetaPageData *HashMetaPage;
266 
/* parsed relation-options (reloptions) representation for hash indexes */
typedef struct HashOptions
{
	int32		varlena_header_;	/* varlena header (do not touch directly!) */
	int			fillfactor;		/* page fill factor in percent (0..100) */
} HashOptions;
272 
/*
 * Fetch the index's fillfactor reloption, or HASH_DEFAULT_FILLFACTOR if
 * none is set.  Only valid for hash indexes (asserted).  Note that
 * "relation" is evaluated more than once; pass a simple expression.
 */
#define HashGetFillFactor(relation) \
	(AssertMacro((relation)->rd_rel->relkind == RELKIND_INDEX && \
				 (relation)->rd_rel->relam == HASH_AM_OID), \
	 (relation)->rd_options ? \
	 ((HashOptions *) (relation)->rd_options)->fillfactor :	\
	 HASH_DEFAULT_FILLFACTOR)
/* target number of usable bytes per page, given the fillfactor */
#define HashGetTargetPageUsage(relation) \
	(BLCKSZ * HashGetFillFactor(relation) / 100)
281 
282 /*
283  * Maximum size of a hash index item (it's okay to have only one per page)
284  */
285 #define HashMaxItemSize(page) \
286 	MAXALIGN_DOWN(PageGetPageSize(page) - \
287 				  SizeOfPageHeaderData - \
288 				  sizeof(ItemIdData) - \
289 				  MAXALIGN(sizeof(HashPageOpaqueData)))
290 
291 #define INDEX_MOVED_BY_SPLIT_MASK	INDEX_AM_RESERVED_BIT
292 
293 #define HASH_MIN_FILLFACTOR			10
294 #define HASH_DEFAULT_FILLFACTOR		75
295 
296 /*
297  * Constants
298  */
#define BYTE_TO_BIT				3	/* 2^3 bits/byte */
#define ALL_SET					((uint32) ~0)	/* bitmap word, all bits set */
301 
302 /*
303  * Bitmap pages do not contain tuples.  They do contain the standard
304  * page headers and trailers; however, everything in between is a
305  * giant bit array.  The number of bits that fit on a page obviously
306  * depends on the page size and the header/trailer overhead.  We require
307  * the number of bits per page to be a power of 2.
308  */
/* size (bytes/bits), shift, and mask helpers for a bitmap page's bit array */
#define BMPGSZ_BYTE(metap)		((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(metap)		((metap)->hashm_bmshift)
#define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1)
313 
/* the bit array of a bitmap page starts right after the page header */
#define HashPageGetBitmap(page) \
	((uint32 *) PageGetContents(page))

/* bytes available for the bit array on a bitmap page */
#define HashGetMaxBitmapSize(page) \
	(PageGetPageSize((Page) (page)) - \
	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))

/* metapage contents start right after the page header */
#define HashPageGetMeta(page) \
	((HashMetaPage) PageGetContents(page))
323 
324 /*
325  * The number of bits in an ovflpage bitmap word.
326  */
327 #define BITS_PER_MAP	32		/* Number of bits in uint32 */
328 
329 /* Given the address of the beginning of a bit map, clear/set the nth bit */
330 #define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
331 #define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
332 #define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
333 
334 /*
335  * page-level and high-level locking modes (see README)
336  */
#define HASH_READ		BUFFER_LOCK_SHARE	/* shared buffer lock */
#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE	/* exclusive buffer lock */
#define HASH_NOLOCK		(-1)	/* sentinel: take no buffer lock */
340 
341 /*
342  * When a new operator class is declared, we require that the user supply
343  * us with an amproc function for hashing a key of the new type, returning
344  * a 32-bit hash value.  We call this the "standard" hash function.  We
345  * also allow an optional "extended" hash function which accepts a salt and
346  * returns a 64-bit hash value.  This is highly recommended but, for reasons
347  * of backward compatibility, optional.
348  *
349  * When the salt is 0, the low 32 bits of the value returned by the extended
350  * hash function should match the value that would have been returned by the
351  * standard hash function.
352  */
#define HASHSTANDARD_PROC		1	/* required 32-bit hash support proc */
#define HASHEXTENDED_PROC		2	/* optional 64-bit salted hash proc */
#define HASHOPTIONS_PROC		3	/* opclass options proc */
#define HASHNProcs				3	/* total number of support procs */
357 
358 
/* public routines */

/*
 * Index access method entry points; these are installed into the AM's
 * IndexAmRoutine (see access/amapi.h for the expected contracts).
 */
extern IndexBuildResult *hashbuild(Relation heap, Relation index,
								   struct IndexInfo *indexInfo);
extern void hashbuildempty(Relation index);
extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
					   ItemPointer ht_ctid, Relation heapRel,
					   IndexUniqueCheck checkUnique,
					   struct IndexInfo *indexInfo);
extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
					   ScanKey orderbys, int norderbys);
extern void hashendscan(IndexScanDesc scan);
extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
											 IndexBulkDeleteResult *stats,
											 IndexBulkDeleteCallback callback,
											 void *callback_state);
extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
												IndexBulkDeleteResult *stats);
extern bytea *hashoptions(Datum reloptions, bool validate);
extern bool hashvalidate(Oid opclassoid);
382 
383 /* private routines */
384 
/* hashinsert.c -- inserting index tuples into pages */
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
								   Size itemsize, IndexTuple itup);
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
								OffsetNumber *itup_offsets, uint16 nitups);
391 
/* hashovfl.c -- overflow-page and bitmap-page management */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
									  Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
									  Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel,
								Bucket bucket, BlockNumber bucket_blkno,
								Buffer bucket_buf,
								BufferAccessStrategy bstrategy);
extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);
403 
/* hashpage.c -- buffer access, page initialization, and bucket splitting */
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
						   int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
												 BlockNumber blkno, int flags);
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
										 bool force_refresh);
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
											  int access,
											  HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
						  uint32 flag, bool initpage);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
							  ForkNumber forkNum);
extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
										 int access, int flags,
										 BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
extern uint32 _hash_init(Relation rel, double num_tuples,
						 ForkNumber forkNum);
extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
								  RegProcedure procid, uint16 ffactor, bool initpage);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
							   Bucket obucket, uint32 maxbucket, uint32 highmask,
							   uint32 lowmask);
434 
/* hashsearch.c -- locating the first/next match during a scan */
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
438 
/* hashsort.c -- tuple spooling used during index build */
typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */

extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
extern void _h_spooldestroy(HSpool *hspool);
extern void _h_spool(HSpool *hspool, ItemPointer self,
					 Datum *values, bool *isnull);
extern void _h_indexbuild(HSpool *hspool, Relation heapRel);
447 
/* hashutil.c -- assorted support routines (hashing, searching, checking) */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
								   uint32 highmask, uint32 lowmask);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,
								Datum *user_values, bool *user_isnull,
								Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
												 uint32 lowmask, uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);
468 
/* hash.c -- top-level AM glue; bucket cleanup shared by vacuum and split */
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
							  Buffer bucket_buf, BlockNumber bucket_blkno,
							  BufferAccessStrategy bstrategy,
							  uint32 maxbucket, uint32 highmask, uint32 lowmask,
							  double *tuples_removed, double *num_index_tuples,
							  bool split_cleanup,
							  IndexBulkDeleteCallback callback, void *callback_state);
477 
478 #endif							/* HASH_H */
479