1 /*-------------------------------------------------------------------------
2 *
3 * hash.h
4 * header file for postgres hash access method implementation
5 *
6 *
7 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/include/access/hash.h
11 *
12 * NOTES
13 * modeled after Margo Seltzer's hash implementation for unix.
14 *
15 *-------------------------------------------------------------------------
16 */
17 #ifndef HASH_H
18 #define HASH_H
19
20 #include "access/amapi.h"
21 #include "access/itup.h"
22 #include "access/sdir.h"
23 #include "fmgr.h"
24 #include "lib/stringinfo.h"
25 #include "storage/bufmgr.h"
26 #include "storage/lockdefs.h"
27 #include "utils/hsearch.h"
28 #include "utils/relcache.h"
29
30 /*
31 * Mapping from hash bucket number to physical block number of bucket's
32 * starting page. Beware of multiple evaluations of argument!
 *
 * The macro argument B is evaluated three times, so it must be free of
 * side effects.  The trailing "+ 1" accounts for the metapage, which is
 * always block 0 (HASH_METAPAGE), so bucket 0 starts at block 1.  The
 * spares[] term skips over the overflow pages allocated before this
 * bucket's splitpoint.
33 */
34 typedef uint32 Bucket;
35
36 #define InvalidBucket ((Bucket) 0xFFFFFFFF) /* sentinel: never a real bucket number */
37
38 #define BUCKET_TO_BLKNO(metap,B) \
39 ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
40
41 /*
42 * Rotate the high 32 bits and the low 32 bits separately. The standard
43 * hash function sometimes rotates the low 32 bits by one bit when
44 * combining elements. We want extended hash functions to be compatible with
45 * that algorithm when the seed is 0, so we can't just do a normal rotation.
46 * This works, though.
 *
 * Mechanics: each 32-bit half of v is rotated left by one bit,
 * independently.  The first mask keeps the left-shifted bits within their
 * own half (dropping bit 31 -> 32 and bit 63 -> carry), and the ">> 31"
 * term carries bit 31 back into bit 0 and bit 63 into bit 32.
47 */
48 #define ROTATE_HIGH_AND_LOW_32BITS(v) \
49 ((((v) << 1) & UINT64CONST(0xfffffffefffffffe)) | \
50 (((v) >> 31) & UINT64CONST(0x100000001)))
51
52 /*
53 * Special space for hash index pages.
54 *
55 * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
56 * Additional bits in the flag word are used for more transient purposes.
57 *
58 * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
59 * However, we ensure that each used page type has a distinct bit so that
60 * we can OR together page types for uses such as the allowable-page-types
61 * argument of _hash_checkpage().
62 */
 /* Page type codes: a used page has exactly one of these bits set. */
63 #define LH_UNUSED_PAGE (0)
64 #define LH_OVERFLOW_PAGE (1 << 0)
65 #define LH_BUCKET_PAGE (1 << 1)
66 #define LH_BITMAP_PAGE (1 << 2)
67 #define LH_META_PAGE (1 << 3)
 /* Transient status bits; these are NOT part of LH_PAGE_TYPE. */
68 #define LH_BUCKET_BEING_POPULATED (1 << 4)
69 #define LH_BUCKET_BEING_SPLIT (1 << 5)
70 #define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
71 #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
72
 /* Mask selecting the page-type bits of hasho_flag (see comment above). */
73 #define LH_PAGE_TYPE \
74 (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
75
76 /*
77 * In an overflow page, hasho_prevblkno stores the block number of the previous
78 * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
79 * hashm_maxbucket value as of the last time the bucket was last split, or
80 * else as of the time the bucket was created. The latter convention is used
81 * to determine whether a cached copy of the metapage is too stale to be used
82 * without needing to lock or pin the metapage.
83 *
84 * hasho_nextblkno is always the block number of the next page in the
85 * bucket chain, or InvalidBlockNumber if there are no more such pages.
86 */
 /* Special space stored at the end of every hash index page. */
87 typedef struct HashPageOpaqueData
88 {
89 BlockNumber hasho_prevblkno; /* see above */
90 BlockNumber hasho_nextblkno; /* see above */
91 Bucket hasho_bucket; /* bucket number this pg belongs to */
92 uint16 hasho_flag; /* page type code + flag bits, see above */
93 uint16 hasho_page_id; /* for identification of hash indexes */
94 } HashPageOpaqueData;
95
96 typedef HashPageOpaqueData *HashPageOpaque;
97
 /* Convenience tests for the transient status bits in hasho_flag. */
98 #define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
99 #define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
100 #define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
101 #define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
102
103 /*
104 * The page ID is for the convenience of pg_filedump and similar utilities,
105 * which otherwise would have a hard time telling pages of different index
106 * types apart. It should be the last 2 bytes on the page. This is more or
107 * less "free" due to alignment considerations.
108 */
109 #define HASHO_PAGE_ID 0xFF80
110
111 typedef struct HashScanPosItem /* what we remember about each match */
112 {
113 ItemPointerData heapTid; /* TID of referenced heap item */
114 OffsetNumber indexOffset; /* index item's location within page */
115 } HashScanPosItem;
116
117 typedef struct HashScanPosData
118 {
119 Buffer buf; /* if valid, the buffer is pinned */
120 BlockNumber currPage; /* current hash index page */
121 BlockNumber nextPage; /* next overflow page */
122 BlockNumber prevPage; /* prev overflow or bucket page */
123
124 /*
125 * The items array is always ordered in index order (ie, increasing
126 * indexoffset). When scanning backwards it is convenient to fill the
127 * array back-to-front, so we start at the last slot and fill downwards.
128 * Hence we need both a first-valid-entry and a last-valid-entry counter.
129 * itemIndex is a cursor showing which entry was last returned to caller.
130 */
131 int firstItem; /* first valid index in items[] */
132 int lastItem; /* last valid index in items[] */
133 int itemIndex; /* current index in items[] */
134
135 HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
136 } HashScanPosData;
137
 /*
 * Does the scan position hold a buffer pin?  Also asserts internal
 * consistency: a valid buffer implies a valid current page.
 */
138 #define HashScanPosIsPinned(scanpos) \
139 ( \
140 AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
141 !BufferIsValid((scanpos).buf)), \
142 BufferIsValid((scanpos).buf) \
143 )
144
 /*
 * Does the scan position reference a valid current page?  (It may be
 * valid without the buffer being pinned; see HashScanPosIsPinned.)
 */
145 #define HashScanPosIsValid(scanpos) \
146 ( \
147 AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
148 !BufferIsValid((scanpos).buf)), \
149 BlockNumberIsValid((scanpos).currPage) \
150 )
151
152 #define HashScanPosInvalidate(scanpos) \
153 do { \
154 (scanpos).buf = InvalidBuffer; \
155 (scanpos).currPage = InvalidBlockNumber; \
156 (scanpos).nextPage = InvalidBlockNumber; \
157 (scanpos).prevPage = InvalidBlockNumber; \
158 (scanpos).firstItem = 0; \
159 (scanpos).lastItem = 0; \
160 (scanpos).itemIndex = 0; \
161 } while (0)
162
163 /*
164 * HashScanOpaqueData is private state for a hash index scan.
165 */
166 typedef struct HashScanOpaqueData
167 {
168 /* Hash value of the scan key, ie, the hash key we seek */
169 uint32 hashso_sk_hash;
170
171 /* remember the buffer associated with primary bucket */
172 Buffer hashso_bucket_buf;
173
174 /*
175 * remember the buffer associated with primary bucket page of bucket being
176 * split. it is required during the scan of the bucket which is being
177 * populated during split operation.
178 */
179 Buffer hashso_split_bucket_buf;
180
181 /* Whether scan starts on bucket being populated due to split */
182 bool hashso_buc_populated;
183
184 /*
 * Whether we are currently scanning the bucket being split.  This flag
 * is consulted only when hashso_buc_populated is true.
187 */
188 bool hashso_buc_split;
189 /* info about killed items if any (killedItems is NULL if never used) */
190 int *killedItems; /* currPos.items indexes of killed items */
191 int numKilled; /* number of currently stored items */
192
193 /*
194 * Identify all the matching items on a page and save them in
195 * HashScanPosData
196 */
197 HashScanPosData currPos; /* current position data */
198 } HashScanOpaqueData;
199
200 typedef HashScanOpaqueData *HashScanOpaque;
201
202 /*
203 * Definitions for metapage.
204 */
205
206 #define HASH_METAPAGE 0 /* metapage is always block 0 */
207
208 #define HASH_MAGIC 0x6440640
209 #define HASH_VERSION 4
210
211 /*
212 * spares[] holds the number of overflow pages currently allocated at or
213 * before a certain splitpoint. For example, if spares[3] = 7 then there are
214 * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The
215 * value in spares[ovflpoint] increases as overflow pages are added at the
216 * end of the index. Once ovflpoint increases (ie, we have actually allocated
217 * the bucket pages belonging to that splitpoint) the number of spares at the
218 * prior splitpoint cannot change anymore.
219 *
220 * ovflpages that have been recycled for reuse can be found by looking at
221 * bitmaps that are stored within ovflpages dedicated for the purpose.
222 * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
223 * number of currently existing bitmaps.
224 *
225 * The limitation on the size of spares[] comes from the fact that there's
226 * no point in having more than 2^32 buckets with only uint32 hashcodes.
227 * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
228 * adjusted in such a way to accommodate multi phased allocation of buckets
229 * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
230 *
231 * There is no particular upper limit on the size of mapp[], other than
232 * needing to fit into the metapage. (With 8K block size, 1024 bitmaps
233 * limit us to 256 GB of overflow space...). For smaller block size we
234 * can not use 1024 bitmaps as it will lead to the meta page data crossing
235 * the block size boundary. So we use BLCKSZ to determine the maximum number
236 * of bitmaps.
237 */
 /* Cap on bitmap pages; bounded by BLCKSZ so the metapage fits (see above). */
238 #define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024)
239
 /*
 * Buckets are allocated in "splitpoint groups".  The first
 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE groups are each allocated in a
 * single phase; later groups are allocated in
 * HASH_SPLITPOINT_PHASES_PER_GRP phases apiece (see spares[] comment
 * above).
 */
240 #define HASH_SPLITPOINT_PHASE_BITS 2
241 #define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
242 #define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
243 #define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
244
245 /* defines max number of splitpoint phases a hash index can have */
246 #define HASH_MAX_SPLITPOINT_GROUP 32
247 #define HASH_MAX_SPLITPOINTS \
248 (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
249 HASH_SPLITPOINT_PHASES_PER_GRP) + \
250 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
251
 /* On-disk contents of the hash index metapage (always block HASH_METAPAGE). */
252 typedef struct HashMetaPageData
253 {
254 uint32 hashm_magic; /* magic no. for hash tables */
255 uint32 hashm_version; /* version ID */
256 double hashm_ntuples; /* number of tuples stored in the table */
257 uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */
258 uint16 hashm_bsize; /* index page size (bytes) */
259 uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a power
260 * of 2 */
261 uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */
262 uint32 hashm_maxbucket; /* ID of maximum bucket in use */
263 uint32 hashm_highmask; /* mask to modulo into entire table */
264 uint32 hashm_lowmask; /* mask to modulo into lower half of table */
265 uint32 hashm_ovflpoint; /* splitpoint from which ovflpgs being
266 * allocated */
267 uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */
268 uint32 hashm_nmaps; /* number of bitmap pages */
269 RegProcedure hashm_procid; /* hash function id from pg_proc */
270 uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before each
271 * splitpoint */
272 BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */
273 } HashMetaPageData;
274
275 typedef HashMetaPageData *HashMetaPage;
276
277 /*
278 * Maximum size of a hash index item (it's okay to have only one per page)
 *
 * Derivation: full page size, less the page header, one line pointer,
 * and the (maxaligned) special space, rounded down to a maxalign
 * boundary.
279 */
280 #define HashMaxItemSize(page) \
281 MAXALIGN_DOWN(PageGetPageSize(page) - \
282 SizeOfPageHeaderData - \
283 sizeof(ItemIdData) - \
284 MAXALIGN(sizeof(HashPageOpaqueData)))
285
 /* t_info bit marking index tuples moved by a bucket split (AM-reserved bit) */
286 #define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT
287
 /* Minimum and default fillfactor settings for hash indexes */
288 #define HASH_MIN_FILLFACTOR 10
289 #define HASH_DEFAULT_FILLFACTOR 75
290
291 /*
292 * Constants
293 */
294 #define BYTE_TO_BIT 3 /* 2^3 bits/byte */
295 #define ALL_SET ((uint32) ~0)
296
297 /*
298 * Bitmap pages do not contain tuples. They do contain the standard
299 * page headers and trailers; however, everything in between is a
300 * giant bit array. The number of bits that fit on a page obviously
301 * depends on the page size and the header/trailer overhead. We require
302 * the number of bits per page to be a power of 2.
303 */
304 #define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) /* bitmap size in bytes */
305 #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) /* ... in bits */
306 #define BMPG_SHIFT(metap) ((metap)->hashm_bmshift) /* log2(bitmap size in bits) */
307 #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) /* bit-offset mask (size is a power of 2) */
308
 /* Get pointer to the bit array stored in a bitmap page's content area. */
309 #define HashPageGetBitmap(page) \
310 ((uint32 *) PageGetContents(page))
311
312 #define HashGetMaxBitmapSize(page) \
313 (PageGetPageSize((Page) page) - \
314 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
315
 /* Get pointer to the metapage data stored in the page's content area. */
316 #define HashPageGetMeta(page) \
317 ((HashMetaPage) PageGetContents(page))
318
319 /*
320 * The number of bits in an ovflpage bitmap word.
321 */
322 #define BITS_PER_MAP 32 /* Number of bits in uint32 */
323
324 /* Given the address of the beginning of a bit map, clear/set the nth bit */
 /* Beware: N is evaluated twice in each macro, so it must be side-effect free. */
325 #define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
326 #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
327 #define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
328
329 /*
330 * page-level and high-level locking modes (see README)
331 */
332 #define HASH_READ BUFFER_LOCK_SHARE
333 #define HASH_WRITE BUFFER_LOCK_EXCLUSIVE
334 #define HASH_NOLOCK (-1)
335
336 /*
337 * Strategy number. There's only one valid strategy for hashing: equality.
338 */
339 #define HTEqualStrategyNumber 1
340 #define HTMaxStrategyNumber 1
341
342 /*
343 * When a new operator class is declared, we require that the user supply
344 * us with an amproc function for hashing a key of the new type, returning
345 * a 32-bit hash value. We call this the "standard" hash function. We
346 * also allow an optional "extended" hash function which accepts a salt and
347 * returns a 64-bit hash value. This is highly recommended but, for reasons
348 * of backward compatibility, optional.
349 *
350 * When the salt is 0, the low 32 bits of the value returned by the extended
351 * hash function should match the value that would have been returned by the
352 * standard hash function.
353 */
354 #define HASHSTANDARD_PROC 1
355 #define HASHEXTENDED_PROC 2
356 #define HASHNProcs 2
357
358
359 /* public routines */
360
361 extern IndexBuildResult *hashbuild(Relation heap, Relation index,
362 struct IndexInfo *indexInfo);
363 extern void hashbuildempty(Relation index);
364 extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
365 ItemPointer ht_ctid, Relation heapRel,
366 IndexUniqueCheck checkUnique,
367 struct IndexInfo *indexInfo);
368 extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
369 extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
370 extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
371 extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
372 ScanKey orderbys, int norderbys);
373 extern void hashendscan(IndexScanDesc scan);
374 extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
375 IndexBulkDeleteResult *stats,
376 IndexBulkDeleteCallback callback,
377 void *callback_state);
378 extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
379 IndexBulkDeleteResult *stats);
380 extern bytea *hashoptions(Datum reloptions, bool validate);
381 extern bool hashvalidate(Oid opclassoid);
382
383 extern Datum hash_any(register const unsigned char *k, register int keylen);
384 extern Datum hash_any_extended(register const unsigned char *k,
385 register int keylen, uint64 seed);
386 extern Datum hash_uint32(uint32 k);
387 extern Datum hash_uint32_extended(uint32 k, uint64 seed);
388
389 /* private routines */
390
391 /* hashinsert.c */
392 extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
393 extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
394 Size itemsize, IndexTuple itup);
395 extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
396 OffsetNumber *itup_offsets, uint16 nitups);
397
398 /* hashovfl.c */
399 extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
400 extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
401 Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
402 Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
403 extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
404 extern void _hash_squeezebucket(Relation rel,
405 Bucket bucket, BlockNumber bucket_blkno,
406 Buffer bucket_buf,
407 BufferAccessStrategy bstrategy);
408 extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);
409
410 /* hashpage.c */
411 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
412 int access, int flags);
413 extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
414 BlockNumber blkno, int flags);
415 extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
416 bool force_refresh);
417 extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
418 int access,
419 HashMetaPage *cachedmetap);
420 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
421 extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
422 uint32 flag, bool initpage);
423 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
424 ForkNumber forkNum);
425 extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
426 int access, int flags,
427 BufferAccessStrategy bstrategy);
428 extern void _hash_relbuf(Relation rel, Buffer buf);
429 extern void _hash_dropbuf(Relation rel, Buffer buf);
430 extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
431 extern uint32 _hash_init(Relation rel, double num_tuples,
432 ForkNumber forkNum);
433 extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
434 RegProcedure procid, uint16 ffactor, bool initpage);
435 extern void _hash_pageinit(Page page, Size size);
436 extern void _hash_expandtable(Relation rel, Buffer metabuf);
437 extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
438 Bucket obucket, uint32 maxbucket, uint32 highmask,
439 uint32 lowmask);
440
441 /* hashsearch.c */
442 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
443 extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
444
445 /* hashsort.c */
446 typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
447
448 extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
449 extern void _h_spooldestroy(HSpool *hspool);
450 extern void _h_spool(HSpool *hspool, ItemPointer self,
451 Datum *values, bool *isnull);
452 extern void _h_indexbuild(HSpool *hspool, Relation heapRel);
453
454 /* hashutil.c */
455 extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
456 extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
457 extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
458 extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
459 uint32 highmask, uint32 lowmask);
460 extern uint32 _hash_log2(uint32 num);
461 extern uint32 _hash_spareindex(uint32 num_bucket);
462 extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
463 extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
464 extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
465 extern bool _hash_convert_tuple(Relation index,
466 Datum *user_values, bool *user_isnull,
467 Datum *index_values, bool *index_isnull);
468 extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
469 extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
470 extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
471 extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
472 extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
473 uint32 lowmask, uint32 maxbucket);
474 extern void _hash_kill_items(IndexScanDesc scan);
475
476 /* hash.c */
477 extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
478 Buffer bucket_buf, BlockNumber bucket_blkno,
479 BufferAccessStrategy bstrategy,
480 uint32 maxbucket, uint32 highmask, uint32 lowmask,
481 double *tuples_removed, double *num_index_tuples,
482 bool bucket_has_garbage,
483 IndexBulkDeleteCallback callback, void *callback_state);
484
485 #endif /* HASH_H */
486