1 /*-------------------------------------------------------------------------
2  *
3  * hash.h
4  *	  header file for postgres hash access method implementation
5  *
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/include/access/hash.h
11  *
12  * NOTES
13  *		modeled after Margo Seltzer's hash implementation for unix.
14  *
15  *-------------------------------------------------------------------------
16  */
17 #ifndef HASH_H
18 #define HASH_H
19 
20 #include "access/amapi.h"
21 #include "access/itup.h"
22 #include "access/sdir.h"
23 #include "fmgr.h"
24 #include "lib/stringinfo.h"
25 #include "storage/bufmgr.h"
26 #include "storage/lockdefs.h"
27 #include "utils/hsearch.h"
28 #include "utils/relcache.h"
29 
30 /*
31  * Mapping from hash bucket number to physical block number of bucket's
32  * starting page.  Beware of multiple evaluations of argument!
33  */
34 typedef uint32 Bucket;
35 
36 #define InvalidBucket	((Bucket) 0xFFFFFFFF)
37 
38 #define BUCKET_TO_BLKNO(metap,B) \
39 		((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
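
/*
 * A worked example (a sketch, assuming no overflow pages have been allocated
 * yet, so every hashm_spares[] entry is zero): the trailing "+ 1" accounts
 * for the metapage at block 0, and the spares term adds the overflow pages
 * allocated before this bucket's splitpoint.  Hence
 *
 *		blkno = BUCKET_TO_BLKNO(metap, 3);
 *
 * yields block 4, and bucket 0 always maps to block 1.
 */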
40 
41 /*
42  * Special space for hash index pages.
43  *
44  * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
45  * Additional bits in the flag word are used for more transient purposes.
46  *
47  * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
48  * However, we ensure that each used page type has a distinct bit so that
49  * we can OR together page types for uses such as the allowable-page-types
50  * argument of _hash_checkpage().
51  */
52 #define LH_UNUSED_PAGE			(0)
53 #define LH_OVERFLOW_PAGE		(1 << 0)
54 #define LH_BUCKET_PAGE			(1 << 1)
55 #define LH_BITMAP_PAGE			(1 << 2)
56 #define LH_META_PAGE			(1 << 3)
57 #define LH_BUCKET_BEING_POPULATED	(1 << 4)
58 #define LH_BUCKET_BEING_SPLIT	(1 << 5)
59 #define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
60 #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
61 
62 #define LH_PAGE_TYPE \
63 	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
64 
65 /*
66  * In an overflow page, hasho_prevblkno stores the block number of the previous
67  * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
 * hashm_maxbucket value as of the last time the bucket was split, or else as
 * of the time the bucket was created.  The latter convention lets us determine
 * whether a cached copy of the metapage is too stale to use without having to
 * lock or pin the metapage itself.
72  *
73  * hasho_nextblkno is always the block number of the next page in the
74  * bucket chain, or InvalidBlockNumber if there are no more such pages.
75  */
76 typedef struct HashPageOpaqueData
77 {
78 	BlockNumber hasho_prevblkno;	/* see above */
79 	BlockNumber hasho_nextblkno;	/* see above */
80 	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
81 	uint16		hasho_flag;		/* page type code + flag bits, see above */
82 	uint16		hasho_page_id;	/* for identification of hash indexes */
83 } HashPageOpaqueData;
84 
85 typedef HashPageOpaqueData *HashPageOpaque;
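
/*
 * A sketch of how the staleness test mentioned above plays out (variable
 * names here are illustrative, not taken from any particular function): a
 * reader that located a bucket via a cached metapage copy can check the
 * bucket page's hasho_prevblkno against the cached hashm_maxbucket:
 *
 *		if (opaque->hasho_prevblkno <= cachedmetap->hashm_maxbucket)
 *			... the cache knew about this bucket's last split; proceed ...
 *		else
 *			... the bucket has since been split; refresh the cache, e.g.
 *			via _hash_getcachedmetap(), and retry ...
 */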
86 
87 #define H_NEEDS_SPLIT_CLEANUP(opaque)	(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
88 #define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
89 #define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
90 #define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
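
/*
 * For example (a minimal sketch): given a buffer known to contain a hash
 * index page, the special space is fetched and the page type tested like so:
 *
 *		Page			page = BufferGetPage(buf);
 *		HashPageOpaque	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 *
 *		if ((opaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE)
 *			... follow the chain via opaque->hasho_nextblkno ...
 */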
91 
92 /*
93  * The page ID is for the convenience of pg_filedump and similar utilities,
94  * which otherwise would have a hard time telling pages of different index
95  * types apart.  It should be the last 2 bytes on the page.  This is more or
96  * less "free" due to alignment considerations.
97  */
98 #define HASHO_PAGE_ID		0xFF80
99 
100 typedef struct HashScanPosItem	/* what we remember about each match */
101 {
102 	ItemPointerData heapTid;	/* TID of referenced heap item */
103 	OffsetNumber indexOffset;	/* index item's location within page */
104 } HashScanPosItem;
105 
106 
107 /*
108  *	HashScanOpaqueData is private state for a hash index scan.
109  */
110 typedef struct HashScanOpaqueData
111 {
112 	/* Hash value of the scan key, ie, the hash key we seek */
113 	uint32		hashso_sk_hash;
114 
115 	/*
116 	 * We also want to remember which buffer we're currently examining in the
117 	 * scan. We keep the buffer pinned (but not locked) across hashgettuple
118 	 * calls, in order to avoid doing a ReadBuffer() for every tuple in the
119 	 * index.
120 	 */
121 	Buffer		hashso_curbuf;
122 
123 	/* remember the buffer associated with primary bucket */
124 	Buffer		hashso_bucket_buf;
125 
126 	/*
127 	 * remember the buffer associated with primary bucket page of bucket being
128 	 * split.  it is required during the scan of the bucket which is being
129 	 * populated during split operation.
130 	 */
131 	Buffer		hashso_split_bucket_buf;
132 
133 	/* Current position of the scan, as an index TID */
134 	ItemPointerData hashso_curpos;
135 
136 	/* Current position of the scan, as a heap TID */
137 	ItemPointerData hashso_heappos;
138 
139 	/* Whether scan starts on bucket being populated due to split */
140 	bool		hashso_buc_populated;
141 
142 	/*
143 	 * Whether scanning bucket being split?  The value of this parameter is
144 	 * referred only when hashso_buc_populated is true.
145 	 */
146 	bool		hashso_buc_split;
147 	/* info about killed items if any (killedItems is NULL if never used) */
148 	HashScanPosItem *killedItems;	/* tids and offset numbers of killed items */
149 	int			numKilled;		/* number of currently stored items */
150 } HashScanOpaqueData;
151 
152 typedef HashScanOpaqueData *HashScanOpaque;
153 
154 /*
155  * Definitions for metapage.
156  */
157 
158 #define HASH_METAPAGE	0		/* metapage is always block 0 */
159 
160 #define HASH_MAGIC		0x6440640
161 #define HASH_VERSION	4
162 
163 /*
164  * spares[] holds the number of overflow pages currently allocated at or
165  * before a certain splitpoint. For example, if spares[3] = 7 then there are
166  * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
167  * value in spares[ovflpoint] increases as overflow pages are added at the
168  * end of the index.  Once ovflpoint increases (ie, we have actually allocated
169  * the bucket pages belonging to that splitpoint) the number of spares at the
170  * prior splitpoint cannot change anymore.
171  *
172  * ovflpages that have been recycled for reuse can be found by looking at
173  * bitmaps that are stored within ovflpages dedicated for the purpose.
174  * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
175  * number of currently existing bitmaps.
176  *
177  * The limitation on the size of spares[] comes from the fact that there's
178  * no point in having more than 2^32 buckets with only uint32 hashcodes.
 * (Note: HASH_MAX_SPLITPOINTS, which is the size of spares[], is sized to
 * accommodate the multi-phase allocation of buckets that begins after
 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE groups.)
182  *
183  * There is no particular upper limit on the size of mapp[], other than
184  * needing to fit into the metapage.  (With 8K block size, 1024 bitmaps
 * limit us to 256 GB of overflow space...).  For smaller block sizes we
 * cannot use 1024 bitmaps, since the metapage data would then cross the
 * block-size boundary; so BLCKSZ is used to determine the maximum number
 * of bitmaps.
189  */
190 #define HASH_MAX_BITMAPS			Min(BLCKSZ / 8, 1024)
191 
192 #define HASH_SPLITPOINT_PHASE_BITS	2
193 #define HASH_SPLITPOINT_PHASES_PER_GRP	(1 << HASH_SPLITPOINT_PHASE_BITS)
194 #define HASH_SPLITPOINT_PHASE_MASK		(HASH_SPLITPOINT_PHASES_PER_GRP - 1)
195 #define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE	10
196 
/* defines the maximum number of splitpoint phases a hash index can have */
198 #define HASH_MAX_SPLITPOINT_GROUP	32
199 #define HASH_MAX_SPLITPOINTS \
200 	(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
201 	  HASH_SPLITPOINT_PHASES_PER_GRP) + \
202 	 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
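
/*
 * Worked out with the values above: the first 10 splitpoint groups get one
 * phase each and every later group gets 4 phases, so
 *
 *		HASH_MAX_SPLITPOINTS = (32 - 10) * 4 + 10 = 98
 *
 * which is the number of hashm_spares[] slots in the metapage.
 */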
203 
204 typedef struct HashMetaPageData
205 {
206 	uint32		hashm_magic;	/* magic no. for hash tables */
207 	uint32		hashm_version;	/* version ID */
208 	double		hashm_ntuples;	/* number of tuples stored in the table */
209 	uint16		hashm_ffactor;	/* target fill factor (tuples/bucket) */
210 	uint16		hashm_bsize;	/* index page size (bytes) */
211 	uint16		hashm_bmsize;	/* bitmap array size (bytes) - must be a power
212 								 * of 2 */
213 	uint16		hashm_bmshift;	/* log2(bitmap array size in BITS) */
214 	uint32		hashm_maxbucket;	/* ID of maximum bucket in use */
215 	uint32		hashm_highmask; /* mask to modulo into entire table */
216 	uint32		hashm_lowmask;	/* mask to modulo into lower half of table */
217 	uint32		hashm_ovflpoint;	/* splitpoint from which ovflpgs being
218 									 * allocated */
219 	uint32		hashm_firstfree;	/* lowest-number free ovflpage (bit#) */
220 	uint32		hashm_nmaps;	/* number of bitmap pages */
221 	RegProcedure hashm_procid;	/* hash procedure id from pg_proc */
222 	uint32		hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each
223 													 * splitpoint */
224 	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */
225 } HashMetaPageData;
226 
227 typedef HashMetaPageData *HashMetaPage;
228 
229 /*
230  * Maximum size of a hash index item (it's okay to have only one per page)
231  */
232 #define HashMaxItemSize(page) \
233 	MAXALIGN_DOWN(PageGetPageSize(page) - \
234 				  SizeOfPageHeaderData - \
235 				  sizeof(ItemIdData) - \
236 				  MAXALIGN(sizeof(HashPageOpaqueData)))
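
/*
 * For example (a sketch assuming 8 kB pages and 8-byte MAXALIGN, as on most
 * 64-bit builds): 8192 - 24 (page header) - 4 (one line pointer) - 16
 * (special space) = 8148, which MAXALIGN_DOWN rounds to 8144 bytes.
 */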
237 
238 #define INDEX_MOVED_BY_SPLIT_MASK	0x2000
239 
240 #define HASH_MIN_FILLFACTOR			10
241 #define HASH_DEFAULT_FILLFACTOR		75
242 
243 /*
244  * Constants
245  */
246 #define BYTE_TO_BIT				3	/* 2^3 bits/byte */
247 #define ALL_SET					((uint32) ~0)
248 
249 /*
250  * Bitmap pages do not contain tuples.  They do contain the standard
251  * page headers and trailers; however, everything in between is a
252  * giant bit array.  The number of bits that fit on a page obviously
253  * depends on the page size and the header/trailer overhead.  We require
254  * the number of bits per page to be a power of 2.
255  */
256 #define BMPGSZ_BYTE(metap)		((metap)->hashm_bmsize)
257 #define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT)
258 #define BMPG_SHIFT(metap)		((metap)->hashm_bmshift)
259 #define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1)
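
/*
 * For example (a sketch assuming the default 8 kB BLCKSZ): the largest
 * power-of-2 bitmap that fits in a page is 4096 bytes, so
 *
 *		BMPGSZ_BYTE(metap) == 4096		(hashm_bmsize)
 *		BMPGSZ_BIT(metap)  == 32768		(overflow pages tracked per bitmap)
 *		BMPG_SHIFT(metap)  == 15		(log2 of the above)
 *		BMPG_MASK(metap)   == 32767
 */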
260 
261 #define HashPageGetBitmap(page) \
262 	((uint32 *) PageGetContents(page))
263 
264 #define HashGetMaxBitmapSize(page) \
265 	(PageGetPageSize((Page) page) - \
266 	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
267 
268 #define HashPageGetMeta(page) \
269 	((HashMetaPage) PageGetContents(page))
270 
271 /*
272  * The number of bits in an ovflpage bitmap word.
273  */
274 #define BITS_PER_MAP	32		/* Number of bits in uint32 */
275 
276 /* Given the address of the beginning of a bit map, clear/set the nth bit */
277 #define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
278 #define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
279 #define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
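
/*
 * For example (a sketch of how the overflow-page code uses these): a logical
 * overflow-page bit number is split into a bitmap page index and a bit
 * within that page's bitmap before being cleared; "mappage" here stands for
 * the Page fetched from the block listed in hashm_mapp[bitmappage]:
 *
 *		uint32	bitmappage = bitno >> BMPG_SHIFT(metap);
 *		uint32	bitmapbit = bitno & BMPG_MASK(metap);
 *		uint32 *freep = HashPageGetBitmap(mappage);
 *
 *		CLRBIT(freep, bitmapbit);
 */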
280 
281 /*
282  * page-level and high-level locking modes (see README)
283  */
284 #define HASH_READ		BUFFER_LOCK_SHARE
285 #define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
286 #define HASH_NOLOCK		(-1)
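
/*
 * For example (a minimal sketch): these are passed to the buffer-fetching
 * routines along with the set of expected page types, e.g.
 *
 *		buf = _hash_getbuf(rel, blkno, HASH_READ,
 *						   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 *
 * HASH_NOLOCK asks for the buffer to be pinned but not locked.
 */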
287 
288 /*
289  *	Strategy number. There's only one valid strategy for hashing: equality.
290  */
291 #define HTEqualStrategyNumber			1
292 #define HTMaxStrategyNumber				1
293 
294 /*
295  *	When a new operator class is declared, we require that the user supply
296  *	us with an amproc procedure for hashing a key of the new type.
297  *	Since we only have one such proc in amproc, it's number 1.
298  */
299 #define HASHPROC		1
300 #define HASHNProcs		1
301 
302 
303 /* public routines */
304 
305 extern IndexBuildResult *hashbuild(Relation heap, Relation index,
306 		  struct IndexInfo *indexInfo);
307 extern void hashbuildempty(Relation index);
308 extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
309 		   ItemPointer ht_ctid, Relation heapRel,
310 		   IndexUniqueCheck checkUnique,
311 		   struct IndexInfo *indexInfo);
312 extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
313 extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
314 extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
315 extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
316 		   ScanKey orderbys, int norderbys);
317 extern void hashendscan(IndexScanDesc scan);
318 extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
319 			   IndexBulkDeleteResult *stats,
320 			   IndexBulkDeleteCallback callback,
321 			   void *callback_state);
322 extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
323 				  IndexBulkDeleteResult *stats);
324 extern bytea *hashoptions(Datum reloptions, bool validate);
325 extern bool hashvalidate(Oid opclassoid);
326 
327 extern Datum hash_any(register const unsigned char *k, register int keylen);
328 extern Datum hash_uint32(uint32 k);
329 
330 /* private routines */
331 
332 /* hashinsert.c */
333 extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
334 extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
335 			   Size itemsize, IndexTuple itup);
336 extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
337 					OffsetNumber *itup_offsets, uint16 nitups);
338 
339 /* hashovfl.c */
340 extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
341 extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
342 				   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
343 				   Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
344 extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
345 extern void _hash_squeezebucket(Relation rel,
346 					Bucket bucket, BlockNumber bucket_blkno,
347 					Buffer bucket_buf,
348 					BufferAccessStrategy bstrategy);
349 extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);
350 
351 /* hashpage.c */
352 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
353 			 int access, int flags);
354 extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
355 								   BlockNumber blkno, int flags);
356 extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
357 					 bool force_refresh);
358 extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
359 								int access,
360 								HashMetaPage *cachedmetap);
361 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
362 extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
363 			  uint32 flag, bool initpage);
364 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
365 				ForkNumber forkNum);
366 extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
367 						   int access, int flags,
368 						   BufferAccessStrategy bstrategy);
369 extern void _hash_relbuf(Relation rel, Buffer buf);
370 extern void _hash_dropbuf(Relation rel, Buffer buf);
371 extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
372 extern uint32 _hash_init(Relation rel, double num_tuples,
373 		   ForkNumber forkNum);
374 extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
375 					  RegProcedure procid, uint16 ffactor, bool initpage);
376 extern void _hash_pageinit(Page page, Size size);
377 extern void _hash_expandtable(Relation rel, Buffer metabuf);
378 extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
379 				   Bucket obucket, uint32 maxbucket, uint32 highmask,
380 				   uint32 lowmask);
381 
382 /* hashsearch.c */
383 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
384 extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
385 extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
386 
387 /* hashsort.c */
388 typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */
389 
390 extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
391 extern void _h_spooldestroy(HSpool *hspool);
392 extern void _h_spool(HSpool *hspool, ItemPointer self,
393 		 Datum *values, bool *isnull);
394 extern void _h_indexbuild(HSpool *hspool, Relation heapRel);
395 
396 /* hashutil.c */
397 extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
398 extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
399 extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
400 extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
401 					 uint32 highmask, uint32 lowmask);
402 extern uint32 _hash_log2(uint32 num);
403 extern uint32 _hash_spareindex(uint32 num_bucket);
404 extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
405 extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
406 extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
407 extern bool _hash_convert_tuple(Relation index,
408 					Datum *user_values, bool *user_isnull,
409 					Datum *index_values, bool *index_isnull);
410 extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
411 extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
412 extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
413 extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
414 extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
415 								   uint32 lowmask, uint32 maxbucket);
416 extern void _hash_kill_items(IndexScanDesc scan);
417 
418 /* hash.c */
419 extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
420 				  Buffer bucket_buf, BlockNumber bucket_blkno,
421 				  BufferAccessStrategy bstrategy,
422 				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
423 				  double *tuples_removed, double *num_index_tuples,
424 				  bool bucket_has_garbage,
425 				  IndexBulkDeleteCallback callback, void *callback_state);
426 
427 #endif							/* HASH_H */
428