1 /*-------------------------------------------------------------------------
2 *
3 * hashpage.c
4 * Hash table page management code for the Postgres hash access method
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/hash/hashpage.c
12 *
13 * NOTES
14 * Postgres hash pages look like ordinary relation pages. The opaque
15 * data at high addresses includes information about the page including
16 * whether a page is an overflow page or a true bucket, the bucket
17 * number, and the block numbers of the preceding and following pages
18 * in the same bucket.
19 *
20 * The first page in a hash relation, page zero, is special -- it stores
21 * information describing the hash table; it is referred to as the
22 * "meta page." Pages one and higher store the actual data.
23 *
24 * There are also bitmap pages, which are not manipulated here;
25 * see hashovfl.c.
26 *
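 *	As a rough sketch (the authoritative definition is HashPageOpaqueData in
 *	access/hash.h), the opaque data at the end of each page holds:
 *
 *		hasho_prevblkno		previous overflow page in the bucket chain; for a
 *							primary bucket page, the hashm_maxbucket value as
 *							of the bucket's last split
 *		hasho_nextblkno		next overflow page in the bucket chain
 *		hasho_bucket		bucket number this page belongs to
 *		hasho_flag			page type flags (bucket, overflow, bitmap, meta)
 *		hasho_page_id		constant identifying hash pages
 *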
27 *-------------------------------------------------------------------------
28 */
29 #include "postgres.h"
30
31 #include "access/hash.h"
32 #include "access/hash_xlog.h"
33 #include "miscadmin.h"
34 #include "port/pg_bitutils.h"
35 #include "storage/lmgr.h"
36 #include "storage/predicate.h"
37 #include "storage/smgr.h"
38
39 static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
40 uint32 nblocks);
41 static void _hash_splitbucket(Relation rel, Buffer metabuf,
42 Bucket obucket, Bucket nbucket,
43 Buffer obuf,
44 Buffer nbuf,
45 HTAB *htab,
46 uint32 maxbucket,
47 uint32 highmask, uint32 lowmask);
48 static void log_split_page(Relation rel, Buffer buf);
49
50
51 /*
52 * _hash_getbuf() -- Get a buffer by block number for read or write.
53 *
54 * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
55 * 'flags' is a bitwise OR of the allowed page types.
56 *
57 * This must be used only to fetch pages that are expected to be valid
58 * already. _hash_checkpage() is applied using the given flags.
59 *
60 * When this routine returns, the appropriate lock is set on the
61 * requested buffer and its reference count has been incremented
62 * (ie, the buffer is "locked and pinned").
63 *
64 * P_NEW is disallowed because this routine can only be used
65 * to access pages that are known to be before the filesystem EOF.
66 * Extending the index should be done with _hash_getnewbuf.
67 */
68 Buffer
69 _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
70 {
71 Buffer buf;
72
73 if (blkno == P_NEW)
74 elog(ERROR, "hash AM does not use P_NEW");
75
76 buf = ReadBuffer(rel, blkno);
77
78 if (access != HASH_NOLOCK)
79 LockBuffer(buf, access);
80
81 /* ref count and lock type are correct */
82
83 _hash_checkpage(rel, buf, flags);
84
85 return buf;
86 }
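/*
 * Illustrative only (a sketch of the typical caller pattern, not code from
 * this module): fetch a page with a shared lock, inspect it, then release
 * both lock and pin:
 *
 *		Buffer	buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
 *		Page	page = BufferGetPage(buf);
 *
 *		... examine tuples on "page" ...
 *
 *		_hash_relbuf(rel, buf);		-- drops both the lock and the pin
 */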
87
88 /*
89 * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
90 *
91 * We read the page and try to acquire a cleanup lock. If we get it,
92 * we return the buffer; otherwise, we return InvalidBuffer.
93 */
94 Buffer
95 _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
96 {
97 Buffer buf;
98
99 if (blkno == P_NEW)
100 elog(ERROR, "hash AM does not use P_NEW");
101
102 buf = ReadBuffer(rel, blkno);
103
104 if (!ConditionalLockBufferForCleanup(buf))
105 {
106 ReleaseBuffer(buf);
107 return InvalidBuffer;
108 }
109
110 /* ref count and lock type are correct */
111
112 _hash_checkpage(rel, buf, flags);
113
114 return buf;
115 }
116
117 /*
118 * _hash_getinitbuf() -- Get and initialize a buffer by block number.
119 *
120 * This must be used only to fetch pages that are known to be before
121 * the index's filesystem EOF, but are to be filled from scratch.
122 * _hash_pageinit() is applied automatically. Otherwise it has
123 * effects similar to _hash_getbuf() with access = HASH_WRITE.
124 *
125 * When this routine returns, a write lock is set on the
126 * requested buffer and its reference count has been incremented
127 * (ie, the buffer is "locked and pinned").
128 *
129 * P_NEW is disallowed because this routine can only be used
130 * to access pages that are known to be before the filesystem EOF.
131 * Extending the index should be done with _hash_getnewbuf.
132 */
133 Buffer
134 _hash_getinitbuf(Relation rel, BlockNumber blkno)
135 {
136 Buffer buf;
137
138 if (blkno == P_NEW)
139 elog(ERROR, "hash AM does not use P_NEW");
140
141 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
142 NULL);
143
144 /* ref count and lock type are correct */
145
146 /* initialize the page */
147 _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
148
149 return buf;
150 }
151
152 /*
153 * _hash_initbuf() -- Get and initialize a buffer by bucket number.
154 */
155 void
156 _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
157 bool initpage)
158 {
159 HashPageOpaque pageopaque;
160 Page page;
161
162 page = BufferGetPage(buf);
163
164 /* initialize the page */
165 if (initpage)
166 _hash_pageinit(page, BufferGetPageSize(buf));
167
168 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
169
170 /*
171 * Set hasho_prevblkno with current hashm_maxbucket. This value will be
172 * used to validate cached HashMetaPageData. See
173 * _hash_getbucketbuf_from_hashkey().
174 */
175 pageopaque->hasho_prevblkno = max_bucket;
176 pageopaque->hasho_nextblkno = InvalidBlockNumber;
177 pageopaque->hasho_bucket = num_bucket;
178 pageopaque->hasho_flag = flag;
179 pageopaque->hasho_page_id = HASHO_PAGE_ID;
180 }
181
182 /*
183 * _hash_getnewbuf() -- Get a new page at the end of the index.
184 *
185 * This has the same API as _hash_getinitbuf, except that we are adding
186 * a page to the index, and hence expect the page to be past the
187 * logical EOF. (However, we have to support the case where it isn't,
188 * since a prior try might have crashed after extending the filesystem
189 * EOF but before updating the metapage to reflect the added page.)
190 *
191 * It is caller's responsibility to ensure that only one process can
192 * extend the index at a time. In practice, this function is called
193 * only while holding write lock on the metapage, because adding a page
194 * is always associated with an update of metapage data.
195 */
196 Buffer
197 _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
198 {
199 BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
200 Buffer buf;
201
202 if (blkno == P_NEW)
203 elog(ERROR, "hash AM does not use P_NEW");
204 if (blkno > nblocks)
205 elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
206 RelationGetRelationName(rel));
207
208 /* smgr insists we use P_NEW to extend the relation */
209 if (blkno == nblocks)
210 {
211 buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
212 if (BufferGetBlockNumber(buf) != blkno)
213 elog(ERROR, "unexpected hash relation size: %u, should be %u",
214 BufferGetBlockNumber(buf), blkno);
215 LockBuffer(buf, HASH_WRITE);
216 }
217 else
218 {
219 buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
220 NULL);
221 }
222
223 /* ref count and lock type are correct */
224
225 /* initialize the page */
226 _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
227
228 return buf;
229 }
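/*
 * Typical call pattern (illustrative; see _hash_init() and
 * _hash_expandtable() below): while holding the metapage write lock,
 *
 *		blkno = BUCKET_TO_BLKNO(metap, bucket);
 *		buf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
 */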
230
231 /*
232 * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
233 *
234 * This is identical to _hash_getbuf() but also allows a buffer access
235 * strategy to be specified. We use this for VACUUM operations.
236 */
237 Buffer
238 _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
239 int access, int flags,
240 BufferAccessStrategy bstrategy)
241 {
242 Buffer buf;
243
244 if (blkno == P_NEW)
245 elog(ERROR, "hash AM does not use P_NEW");
246
247 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
248
249 if (access != HASH_NOLOCK)
250 LockBuffer(buf, access);
251
252 /* ref count and lock type are correct */
253
254 _hash_checkpage(rel, buf, flags);
255
256 return buf;
257 }
258
259 /*
260 * _hash_relbuf() -- release a locked buffer.
261 *
262 * Lock and pin (refcount) are both dropped.
263 */
264 void
265 _hash_relbuf(Relation rel, Buffer buf)
266 {
267 UnlockReleaseBuffer(buf);
268 }
269
270 /*
271 * _hash_dropbuf() -- release an unlocked buffer.
272 *
273 * This is used to unpin a buffer on which we hold no lock.
274 */
275 void
276 _hash_dropbuf(Relation rel, Buffer buf)
277 {
278 ReleaseBuffer(buf);
279 }
280
281 /*
282 * _hash_dropscanbuf() -- release buffers used in scan.
283 *
284 * This routine unpins the buffers used during scan on which we
285 * hold no lock.
286 */
287 void
288 _hash_dropscanbuf(Relation rel, HashScanOpaque so)
289 {
290 /* release pin we hold on primary bucket page */
291 if (BufferIsValid(so->hashso_bucket_buf) &&
292 so->hashso_bucket_buf != so->currPos.buf)
293 _hash_dropbuf(rel, so->hashso_bucket_buf);
294 so->hashso_bucket_buf = InvalidBuffer;
295
296 /* release pin we hold on primary bucket page of bucket being split */
297 if (BufferIsValid(so->hashso_split_bucket_buf) &&
298 so->hashso_split_bucket_buf != so->currPos.buf)
299 _hash_dropbuf(rel, so->hashso_split_bucket_buf);
300 so->hashso_split_bucket_buf = InvalidBuffer;
301
302 /* release any pin we still hold */
303 if (BufferIsValid(so->currPos.buf))
304 _hash_dropbuf(rel, so->currPos.buf);
305 so->currPos.buf = InvalidBuffer;
306
307 /* reset split scan */
308 so->hashso_buc_populated = false;
309 so->hashso_buc_split = false;
310 }
311
312
313 /*
314 * _hash_init() -- Initialize the metadata page of a hash index,
315 * the initial buckets, and the initial bitmap page.
316 *
317 * The initial number of buckets is dependent on num_tuples, an estimate
318 * of the number of tuples to be loaded into the index initially. The
319 * chosen number of buckets is returned.
320 *
321 * We are fairly cavalier about locking here, since we know that no one else
322 * could be accessing this index. In particular the rule about not holding
323 * multiple buffer locks is ignored.
324 */
325 uint32
326 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
327 {
328 Buffer metabuf;
329 Buffer buf;
330 Buffer bitmapbuf;
331 Page pg;
332 HashMetaPage metap;
333 RegProcedure procid;
334 int32 data_width;
335 int32 item_width;
336 int32 ffactor;
337 uint32 num_buckets;
338 uint32 i;
339 bool use_wal;
340
341 /* safety check */
342 if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
343 elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
344 RelationGetRelationName(rel));
345
346 /*
347 * WAL log creation of pages if the relation is persistent, or this is the
348 * init fork. Init forks for unlogged relations always need to be WAL
349 * logged.
350 */
351 use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
352
353 /*
354 * Determine the target fill factor (in tuples per bucket) for this index.
355 * The idea is to make the fill factor correspond to pages about as full
356 * as the user-settable fillfactor parameter says. We can compute it
357 * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
358 */
359 data_width = sizeof(uint32);
360 item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
361 sizeof(ItemIdData); /* include the line pointer */
362 ffactor = HashGetTargetPageUsage(rel) / item_width;
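/*
 * Worked example (a sketch assuming 8 kB pages, 8-byte MAXALIGN, and the
 * default hash fillfactor of 75): item_width = MAXALIGN(8) + MAXALIGN(4)
 * + 4 = 20 bytes, the target page usage is roughly 8192 * 75 / 100 =
 * 6144 bytes, and so ffactor works out to about 307 tuples per bucket.
 */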
363 /* keep to a sane range */
364 if (ffactor < 10)
365 ffactor = 10;
366
367 procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
368
369 /*
370 * We initialize the metapage, the first N bucket pages, and the first
371 * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
372 * calls to occur. This ensures that the smgr level has the right idea of
373 * the physical index length.
374 *
375 * Critical section not required, because on error the creation of the
376 * whole relation will be rolled back.
377 */
378 metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
379 _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
380 MarkBufferDirty(metabuf);
381
382 pg = BufferGetPage(metabuf);
383 metap = HashPageGetMeta(pg);
384
385 /* XLOG stuff */
386 if (use_wal)
387 {
388 xl_hash_init_meta_page xlrec;
389 XLogRecPtr recptr;
390
391 xlrec.num_tuples = num_tuples;
392 xlrec.procid = metap->hashm_procid;
393 xlrec.ffactor = metap->hashm_ffactor;
394
395 XLogBeginInsert();
396 XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
397 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
398
399 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
400
401 PageSetLSN(BufferGetPage(metabuf), recptr);
402 }
403
404 num_buckets = metap->hashm_maxbucket + 1;
405
406 /*
407 * Release buffer lock on the metapage while we initialize buckets.
408 * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
409 * won't accomplish anything. It's a bad idea to hold buffer locks for
410 * long intervals in any case, since that can block the bgwriter.
411 */
412 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
413
414 /*
415 * Initialize and WAL Log the first N buckets
416 */
417 for (i = 0; i < num_buckets; i++)
418 {
419 BlockNumber blkno;
420
421 /* Allow interrupts, in case N is huge */
422 CHECK_FOR_INTERRUPTS();
423
424 blkno = BUCKET_TO_BLKNO(metap, i);
425 buf = _hash_getnewbuf(rel, blkno, forkNum);
426 _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
427 MarkBufferDirty(buf);
428
429 if (use_wal)
430 log_newpage(&rel->rd_node,
431 forkNum,
432 blkno,
433 BufferGetPage(buf),
434 true);
435 _hash_relbuf(rel, buf);
436 }
437
438 /* Now reacquire buffer lock on metapage */
439 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
440
441 /*
442 * Initialize bitmap page
443 */
444 bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
445 _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
446 MarkBufferDirty(bitmapbuf);
447
448 /* add the new bitmap page to the metapage's list of bitmaps */
449 /* metapage already has a write lock */
450 if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
451 ereport(ERROR,
452 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
453 errmsg("out of overflow pages in hash index \"%s\"",
454 RelationGetRelationName(rel))));
455
456 metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
457
458 metap->hashm_nmaps++;
459 MarkBufferDirty(metabuf);
460
461 /* XLOG stuff */
462 if (use_wal)
463 {
464 xl_hash_init_bitmap_page xlrec;
465 XLogRecPtr recptr;
466
467 xlrec.bmsize = metap->hashm_bmsize;
468
469 XLogBeginInsert();
470 XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
471 XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
472
473 /*
474 * This is safe only because nobody else can be modifying the index at
475 * this stage; it's only visible to the transaction that is creating
476 * it.
477 */
478 XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
479
480 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
481
482 PageSetLSN(BufferGetPage(bitmapbuf), recptr);
483 PageSetLSN(BufferGetPage(metabuf), recptr);
484 }
485
486 /* all done */
487 _hash_relbuf(rel, bitmapbuf);
488 _hash_relbuf(rel, metabuf);
489
490 return num_buckets;
491 }
492
493 /*
494 * _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
495 */
496 void
497 _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
498 uint16 ffactor, bool initpage)
499 {
500 HashMetaPage metap;
501 HashPageOpaque pageopaque;
502 Page page;
503 double dnumbuckets;
504 uint32 num_buckets;
505 uint32 spare_index;
506 uint32 lshift;
507
508 /*
509 * Choose the number of initial bucket pages to match the fill factor
510 * given the estimated number of tuples. We round the result up to the
511 * total number of buckets that has to be allocated before using its
512 * hashm_spares element. However, always force at least 2 bucket pages. The
513 * upper limit is determined by considerations explained in
514 * _hash_expandtable().
515 */
516 dnumbuckets = num_tuples / ffactor;
517 if (dnumbuckets <= 2.0)
518 num_buckets = 2;
519 else if (dnumbuckets >= (double) 0x40000000)
520 num_buckets = 0x40000000;
521 else
522 num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
523
524 spare_index = _hash_spareindex(num_buckets);
525 Assert(spare_index < HASH_MAX_SPLITPOINTS);
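/*
 * Worked example (a sketch, using the ffactor of roughly 307 that
 * _hash_init() computes for default settings): an estimate of 100,000
 * tuples gives dnumbuckets of about 326, which rounds up to the 512
 * buckets of splitpoint 9, so num_buckets = 512 and spare_index = 9.
 */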
526
527 page = BufferGetPage(buf);
528 if (initpage)
529 _hash_pageinit(page, BufferGetPageSize(buf));
530
531 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
532 pageopaque->hasho_prevblkno = InvalidBlockNumber;
533 pageopaque->hasho_nextblkno = InvalidBlockNumber;
534 pageopaque->hasho_bucket = -1;
535 pageopaque->hasho_flag = LH_META_PAGE;
536 pageopaque->hasho_page_id = HASHO_PAGE_ID;
537
538 metap = HashPageGetMeta(page);
539
540 metap->hashm_magic = HASH_MAGIC;
541 metap->hashm_version = HASH_VERSION;
542 metap->hashm_ntuples = 0;
543 metap->hashm_nmaps = 0;
544 metap->hashm_ffactor = ffactor;
545 metap->hashm_bsize = HashGetMaxBitmapSize(page);
546
547 /* find largest bitmap array size that will fit in page size */
548 lshift = pg_leftmost_one_pos32(metap->hashm_bsize);
549 Assert(lshift > 0);
550 metap->hashm_bmsize = 1 << lshift;
551 metap->hashm_bmshift = lshift + BYTE_TO_BIT;
552 Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
553
554 /*
555 * Label the index with its primary hash support function's OID. This is
556 * pretty useless for normal operation (in fact, hashm_procid is not used
557 * anywhere), but it might be handy for forensic purposes so we keep it.
558 */
559 metap->hashm_procid = procid;
560
561 /*
562 * We initialize the index with N buckets, 0 .. N-1, occupying physical
563 * blocks 1 to N. The first freespace bitmap page is in block N+1.
564 */
565 metap->hashm_maxbucket = num_buckets - 1;
566
567 /*
568 * Set highmask to the next ((2 ^ x) - 1) value, which should be
569 * sufficient to cover num_buckets.
570 */
571 metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1;
572 metap->hashm_lowmask = (metap->hashm_highmask >> 1);
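/*
 * Continuing the sketch above: with num_buckets = 512, hashm_maxbucket is
 * 511, hashm_highmask becomes pg_nextpower2_32(513) - 1 = 1023, and
 * hashm_lowmask becomes 511.
 */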
573
574 MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
575 MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
576
577 /* Set up mapping for one spare page after the initial splitpoints */
578 metap->hashm_spares[spare_index] = 1;
579 metap->hashm_ovflpoint = spare_index;
580 metap->hashm_firstfree = 0;
581
582 /*
583 * Set pd_lower just past the end of the metadata. This is essential,
584 * because without doing so, metadata will be lost if xlog.c compresses
585 * the page.
586 */
587 ((PageHeader) page)->pd_lower =
588 ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
589 }
590
591 /*
592 * _hash_pageinit() -- Initialize a new hash index page.
593 */
594 void
595 _hash_pageinit(Page page, Size size)
596 {
597 PageInit(page, size, sizeof(HashPageOpaqueData));
598 }
599
600 /*
601 * Attempt to expand the hash table by creating one new bucket.
602 *
603 * This will silently do nothing if we can't get a cleanup lock on the old
604 * or new bucket.
605 *
606 * Complete the pending splits and remove the tuples from old bucket,
607 * if there are any left over from the previous split.
608 *
609 * The caller must hold a pin, but no lock, on the metapage buffer.
610 * The buffer is returned in the same state.
611 */
612 void
613 _hash_expandtable(Relation rel, Buffer metabuf)
614 {
615 HashMetaPage metap;
616 Bucket old_bucket;
617 Bucket new_bucket;
618 uint32 spare_ndx;
619 BlockNumber start_oblkno;
620 BlockNumber start_nblkno;
621 Buffer buf_nblkno;
622 Buffer buf_oblkno;
623 Page opage;
624 Page npage;
625 HashPageOpaque oopaque;
626 HashPageOpaque nopaque;
627 uint32 maxbucket;
628 uint32 highmask;
629 uint32 lowmask;
630 bool metap_update_masks = false;
631 bool metap_update_splitpoint = false;
632
633 restart_expand:
634
635 /*
636 * Write-lock the meta page. It used to be necessary to acquire a
637 * heavyweight lock to begin a split, but that is no longer required.
638 */
639 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
640
641 _hash_checkpage(rel, metabuf, LH_META_PAGE);
642 metap = HashPageGetMeta(BufferGetPage(metabuf));
643
644 /*
645 * Check to see if split is still needed; someone else might have already
646 * done one while we waited for the lock.
647 *
648 * Make sure this stays in sync with _hash_doinsert()
649 */
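/*
 * Illustrative numbers: with hashm_ffactor = 307 and 512 buckets
 * (hashm_maxbucket = 511), the test below allows a split only once
 * hashm_ntuples exceeds 307 * 512 = 157,184.
 */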
650 if (metap->hashm_ntuples <=
651 (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
652 goto fail;
653
654 /*
655 * Can't split anymore if maxbucket has reached its maximum possible
656 * value.
657 *
658 * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
659 * the calculation maxbucket+1 mustn't overflow). Currently we restrict
660 * to half that to prevent failure of pg_ceil_log2_32() and insufficient
661 * space in hashm_spares[]. It's moot anyway because an index with 2^32
662 * buckets would certainly overflow BlockNumber and hence
663 * _hash_alloc_buckets() would fail, but if we supported buckets smaller
664 * than a disk block then this would be an independent constraint.
665 *
666 * If you change this, see also the maximum initial number of buckets in
667 * _hash_init().
668 */
669 if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
670 goto fail;
671
672 /*
673 * Determine which bucket is to be split, and attempt to take cleanup lock
674 * on the old bucket. If we can't get the lock, give up.
675 *
676 * The cleanup lock protects us not only against other backends, but
677 * against our own backend as well.
678 *
679 * The cleanup lock is mainly to protect the split from concurrent
680 * inserts. See src/backend/access/hash/README, Lock Definitions for
681 * further details. Due to this locking restriction, if there is any
682 * pending scan, the split will give up, which is not good but harmless.
683 */
684 new_bucket = metap->hashm_maxbucket + 1;
685
686 old_bucket = (new_bucket & metap->hashm_lowmask);
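/*
 * Illustrative numbers: if hashm_maxbucket is 11 and hashm_lowmask is 7,
 * then new_bucket is 12 (binary 1100) and old_bucket is 12 & 7 = 4; the
 * tuples currently in bucket 4 whose hash value maps to 12 under the new
 * masks will be relocated to bucket 12.
 */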
687
688 start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
689
690 buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
691 if (!buf_oblkno)
692 goto fail;
693
694 opage = BufferGetPage(buf_oblkno);
695 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
696
697 /*
698 * We want to finish any pending split from this bucket before starting a
699 * new one: there is no apparent benefit in deferring it, and finishing a
700 * split that involves multiple buckets (in case the new split also fails)
701 * would complicate the code. We don't need to consider the new bucket for
702 * completing the split here, as a re-split of the new bucket cannot start
703 * while there is still a pending split from the old bucket.
704 */
705 if (H_BUCKET_BEING_SPLIT(oopaque))
706 {
707 /*
708 * Copy bucket mapping info now; refer to the comment in code below where
709 * we copy this information before calling _hash_splitbucket to see
710 * why this is okay.
711 */
712 maxbucket = metap->hashm_maxbucket;
713 highmask = metap->hashm_highmask;
714 lowmask = metap->hashm_lowmask;
715
716 /*
717 * Release the lock on metapage and old_bucket, before completing the
718 * split.
719 */
720 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
721 LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
722
723 _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
724 highmask, lowmask);
725
726 /* release the pin on old buffer and retry for expand. */
727 _hash_dropbuf(rel, buf_oblkno);
728
729 goto restart_expand;
730 }
731
732 /*
733 * Clean up the tuples remaining from the previous split. This operation
734 * requires a cleanup lock, and we already have one on the old bucket, so
735 * let's do it. We also don't want to allow further splits from the bucket
736 * until the garbage of the previous split is cleaned. This has two
737 * advantages: first, it helps avoid bloat due to the garbage, and second,
738 * during cleanup of the bucket we can always be sure that the garbage
739 * tuples belong to the most recently split bucket. By contrast, if we
740 * allowed cleanup of the bucket after the meta page was updated to
741 * indicate the new split but before the actual split, the cleanup
742 * operation could not decide whether a tuple had already been moved to
743 * the newly created bucket, and might end up deleting such tuples.
744 */
745 if (H_NEEDS_SPLIT_CLEANUP(oopaque))
746 {
747 /*
748 * Copy bucket mapping info now; refer to the comment in code below
749 * where we copy this information before calling _hash_splitbucket to
750 * see why this is okay.
751 */
752 maxbucket = metap->hashm_maxbucket;
753 highmask = metap->hashm_highmask;
754 lowmask = metap->hashm_lowmask;
755
756 /* Release the metapage lock. */
757 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
758
759 hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
760 maxbucket, highmask, lowmask, NULL, NULL, true,
761 NULL, NULL);
762
763 _hash_dropbuf(rel, buf_oblkno);
764
765 goto restart_expand;
766 }
767
768 /*
769 * There shouldn't be any active scan on new bucket.
770 *
771 * Note: it is safe to compute the new bucket's blkno here, even though we
772 * may still need to update the BUCKET_TO_BLKNO mapping. This is because
773 * the current value of hashm_spares[hashm_ovflpoint] correctly shows
774 * where we are going to put a new splitpoint's worth of buckets.
775 */
776 start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
777
778 /*
779 * If the split point is increasing we need to allocate a new batch of
780 * bucket pages.
781 */
782 spare_ndx = _hash_spareindex(new_bucket + 1);
783 if (spare_ndx > metap->hashm_ovflpoint)
784 {
785 uint32 buckets_to_add;
786
787 Assert(spare_ndx == metap->hashm_ovflpoint + 1);
788
789 /*
790 * We treat allocation of buckets as a separate WAL-logged action.
791 * Even if we fail after this operation, we won't leak bucket pages;
792 * rather, the next split will consume this space. In any case, even
793 * without failure we don't use all the space in one split operation.
794 */
795 buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
796 if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
797 {
798 /* can't split due to BlockNumber overflow */
799 _hash_relbuf(rel, buf_oblkno);
800 goto fail;
801 }
802 }
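/*
 * Illustrative numbers: when new_bucket is 256 and the previous ovflpoint
 * was 8, _hash_spareindex(257) returns 9, so we pre-allocate
 * _hash_get_totalbuckets(9) - 256 = 512 - 256 = 256 bucket pages here in
 * one go; larger splitpoint groups are allocated in smaller phases.
 */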
803
804 /*
805 * Physically allocate the new bucket's primary page. We want to do this
806 * before changing the metapage's mapping info, in case we can't get the
807 * disk space. Ideally, we wouldn't need to check for a cleanup lock on the
808 * new bucket, as no other backend can find this bucket until the meta page
809 * is updated. However, it is good to be consistent with the old bucket locking.
810 */
811 buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
812 if (!IsBufferCleanupOK(buf_nblkno))
813 {
814 _hash_relbuf(rel, buf_oblkno);
815 _hash_relbuf(rel, buf_nblkno);
816 goto fail;
817 }
818
819 /*
820 * Since we are scribbling on the pages in the shared buffers, establish a
821 * critical section. Any failure in this next code leaves us with a big
822 * problem: the metapage is effectively corrupt but could get written back
823 * to disk.
824 */
825 START_CRIT_SECTION();
826
827 /*
828 * Okay to proceed with split. Update the metapage bucket mapping info.
829 */
830 metap->hashm_maxbucket = new_bucket;
831
832 if (new_bucket > metap->hashm_highmask)
833 {
834 /* Starting a new doubling */
835 metap->hashm_lowmask = metap->hashm_highmask;
836 metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
837 metap_update_masks = true;
838 }
839
840 /*
841 * If the split point is increasing we need to adjust the hashm_spares[]
842 * array and hashm_ovflpoint so that future overflow pages will be created
843 * beyond this new batch of bucket pages.
844 */
845 if (spare_ndx > metap->hashm_ovflpoint)
846 {
847 metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
848 metap->hashm_ovflpoint = spare_ndx;
849 metap_update_splitpoint = true;
850 }
851
852 MarkBufferDirty(metabuf);
853
854 /*
855 * Copy bucket mapping info now; this saves re-accessing the meta page
856 * inside _hash_splitbucket's inner loop. Note that once we drop the
857 * split lock, other splits could begin, so these values might be out of
858 * date before _hash_splitbucket finishes. That's okay, since all it
859 * needs is to tell which of these two buckets to map hashkeys into.
860 */
861 maxbucket = metap->hashm_maxbucket;
862 highmask = metap->hashm_highmask;
863 lowmask = metap->hashm_lowmask;
864
865 opage = BufferGetPage(buf_oblkno);
866 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
867
868 /*
869 * Mark the old bucket to indicate that split is in progress. (At
870 * operation end, we will clear the split-in-progress flag.) Also, for a
871 * primary bucket page, hasho_prevblkno stores the number of buckets that
872 * existed as of the last split, so we must update that value here.
873 */
874 oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
875 oopaque->hasho_prevblkno = maxbucket;
876
877 MarkBufferDirty(buf_oblkno);
878
879 npage = BufferGetPage(buf_nblkno);
880
881 /*
882 * initialize the new bucket's primary page and mark it to indicate that
883 * split is in progress.
884 */
885 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
886 nopaque->hasho_prevblkno = maxbucket;
887 nopaque->hasho_nextblkno = InvalidBlockNumber;
888 nopaque->hasho_bucket = new_bucket;
889 nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
890 nopaque->hasho_page_id = HASHO_PAGE_ID;
891
892 MarkBufferDirty(buf_nblkno);
893
894 /* XLOG stuff */
895 if (RelationNeedsWAL(rel))
896 {
897 xl_hash_split_allocate_page xlrec;
898 XLogRecPtr recptr;
899
900 xlrec.new_bucket = maxbucket;
901 xlrec.old_bucket_flag = oopaque->hasho_flag;
902 xlrec.new_bucket_flag = nopaque->hasho_flag;
903 xlrec.flags = 0;
904
905 XLogBeginInsert();
906
907 XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
908 XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
909 XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
910
911 if (metap_update_masks)
912 {
913 xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
914 XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
915 XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
916 }
917
918 if (metap_update_splitpoint)
919 {
920 xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
921 XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
922 sizeof(uint32));
923 XLogRegisterBufData(2,
924 (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
925 sizeof(uint32));
926 }
927
928 XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
929
930 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
931
932 PageSetLSN(BufferGetPage(buf_oblkno), recptr);
933 PageSetLSN(BufferGetPage(buf_nblkno), recptr);
934 PageSetLSN(BufferGetPage(metabuf), recptr);
935 }
936
937 END_CRIT_SECTION();
938
939 /* drop lock, but keep pin */
940 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
941
942 /* Relocate records to the new bucket */
943 _hash_splitbucket(rel, metabuf,
944 old_bucket, new_bucket,
945 buf_oblkno, buf_nblkno, NULL,
946 maxbucket, highmask, lowmask);
947
948 /* all done, now release the pins on primary buckets. */
949 _hash_dropbuf(rel, buf_oblkno);
950 _hash_dropbuf(rel, buf_nblkno);
951
952 return;
953
954 /* Here if decide not to split or fail to acquire old bucket lock */
955 fail:
956
957 /* We didn't write the metapage, so just drop lock */
958 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
959 }
960
961
962 /*
963 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
964 *
965 * This does not need to initialize the new bucket pages; we'll do that as
966 * each one is used by _hash_expandtable(). But we have to extend the logical
967 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
968 * sync with ours, so that we don't get complaints from smgr.
969 *
970 * We do this by writing a page of zeroes at the end of the splitpoint range.
971 * We expect that the filesystem will ensure that the intervening pages read
972 * as zeroes too. On many filesystems this "hole" will not be allocated
973 * immediately, which means that the index file may end up more fragmented
974 * than if we forced it all to be allocated now; but since we don't scan
975 * hash indexes sequentially anyway, that probably doesn't matter.
976 *
977 * XXX It's annoying that this code is executed with the metapage lock held.
978 * We need to interlock against _hash_addovflpage() adding a new overflow page
979 * concurrently, but it'd likely be better to use LockRelationForExtension
980 * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
981 * so it may not be worth worrying about.
982 *
983 * Returns true if successful, or false if allocation failed due to
984 * BlockNumber overflow.
985 */
986 static bool
987 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
988 {
989 BlockNumber lastblock;
990 PGAlignedBlock zerobuf;
991 Page page;
992 HashPageOpaque ovflopaque;
993
994 lastblock = firstblock + nblocks - 1;
995
996 /*
997 * Check for overflow in block number calculation; if so, we cannot extend
998 * the index anymore.
999 */
1000 if (lastblock < firstblock || lastblock == InvalidBlockNumber)
1001 return false;
1002
1003 page = (Page) zerobuf.data;
1004
1005 /*
1006 * Initialize the page. Just zeroing the page won't work; see
1007 * _hash_freeovflpage for similar usage. We take care to make the special
1008 * space valid for the benefit of tools such as pageinspect.
1009 */
1010 _hash_pageinit(page, BLCKSZ);
1011
1012 ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
1013
1014 ovflopaque->hasho_prevblkno = InvalidBlockNumber;
1015 ovflopaque->hasho_nextblkno = InvalidBlockNumber;
1016 ovflopaque->hasho_bucket = -1;
1017 ovflopaque->hasho_flag = LH_UNUSED_PAGE;
1018 ovflopaque->hasho_page_id = HASHO_PAGE_ID;
1019
1020 if (RelationNeedsWAL(rel))
1021 log_newpage(&rel->rd_node,
1022 MAIN_FORKNUM,
1023 lastblock,
1024 zerobuf.data,
1025 true);
1026
1027 RelationOpenSmgr(rel);
1028 PageSetChecksumInplace(page, lastblock);
1029 smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false);
1030
1031 return true;
1032 }
1033
1034
1035 /*
1036 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
1037 *
1038 * This routine partitions the tuples between the old and new bucket, and is
1039 * also used to finish incomplete split operations. To finish a previously
1040 * interrupted split, the caller must fill htab with the TIDs of tuples that
1041 * have already been moved; such tuples are then skipped. If htab is NULL,
1042 * all the tuples that belong to the new bucket are moved, i.e. this is a
1043 * fresh split.
1044 *
1045 * We are splitting a bucket that consists of a base bucket page and zero
1046 * or more overflow (bucket chain) pages. We must relocate tuples that
1047 * belong in the new bucket.
1048 *
1049 * The caller must hold cleanup locks on both buckets to ensure that
1050 * no one else is trying to access them (see README).
1051 *
1052 * The caller must hold a pin, but no lock, on the metapage buffer.
1053 * The buffer is returned in the same state. (The metapage is only
1054 * touched if it becomes necessary to add or remove overflow pages.)
1055 *
1056 * Split needs to retain pin on primary bucket pages of both old and new
1057 * buckets till end of operation. This is to prevent vacuum from starting
1058 * while a split is in progress.
1059 *
1060 * In addition, the caller must have created the new bucket's base page,
1061 * which is passed in buffer nbuf, pinned and write-locked. The lock will be
1062 * released here and pin must be released by the caller. (The API is set up
1063 * this way because we must do _hash_getnewbuf() before releasing the metapage
1064 * write lock. So instead of passing the new bucket's start block number, we
1065 * pass an actual buffer.)
1066 */
1067 static void
1068 _hash_splitbucket(Relation rel,
1069 Buffer metabuf,
1070 Bucket obucket,
1071 Bucket nbucket,
1072 Buffer obuf,
1073 Buffer nbuf,
1074 HTAB *htab,
1075 uint32 maxbucket,
1076 uint32 highmask,
1077 uint32 lowmask)
1078 {
1079 Buffer bucket_obuf;
1080 Buffer bucket_nbuf;
1081 Page opage;
1082 Page npage;
1083 HashPageOpaque oopaque;
1084 HashPageOpaque nopaque;
1085 OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
1086 IndexTuple itups[MaxIndexTuplesPerPage];
1087 Size all_tups_size = 0;
1088 int i;
1089 uint16 nitups = 0;
1090
1091 bucket_obuf = obuf;
1092 opage = BufferGetPage(obuf);
1093 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1094
1095 bucket_nbuf = nbuf;
1096 npage = BufferGetPage(nbuf);
1097 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1098
1099 /* Copy the predicate locks from old bucket to new bucket. */
1100 PredicateLockPageSplit(rel,
1101 BufferGetBlockNumber(bucket_obuf),
1102 BufferGetBlockNumber(bucket_nbuf));
1103
1104 /*
1105 * Partition the tuples in the old bucket between the old bucket and the
1106 * new bucket, advancing along the old bucket's overflow bucket chain and
1107 * adding overflow pages to the new bucket as needed. Outer loop iterates
1108 * once per page in old bucket.
1109 */
1110 for (;;)
1111 {
1112 BlockNumber oblkno;
1113 OffsetNumber ooffnum;
1114 OffsetNumber omaxoffnum;
1115
1116 /* Scan each tuple in old page */
1117 omaxoffnum = PageGetMaxOffsetNumber(opage);
1118 for (ooffnum = FirstOffsetNumber;
1119 ooffnum <= omaxoffnum;
1120 ooffnum = OffsetNumberNext(ooffnum))
1121 {
1122 IndexTuple itup;
1123 Size itemsz;
1124 Bucket bucket;
1125 bool found = false;
1126
1127 /* skip dead tuples */
1128 if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
1129 continue;
1130
1131 /*
1132 * Before inserting a tuple, probe the hash table containing TIDs
1133 * of tuples belonging to the new bucket. If we find a match, skip
1134 * that tuple; otherwise fetch the item's hash key (conveniently
1135 * stored in the item) and determine which bucket it now belongs
1136 * in.
1137 */
1138 itup = (IndexTuple) PageGetItem(opage,
1139 PageGetItemId(opage, ooffnum));
1140
1141 if (htab)
1142 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
1143
1144 if (found)
1145 continue;
1146
1147 bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
1148 maxbucket, highmask, lowmask);
1149
1150 if (bucket == nbucket)
1151 {
1152 IndexTuple new_itup;
1153
1154 /*
1155 * make a copy of index tuple as we have to scribble on it.
1156 */
1157 new_itup = CopyIndexTuple(itup);
1158
1159 /*
1160 * mark the index tuple as moved by split, such tuples are
1161 * skipped by scan if there is split in progress for a bucket.
1162 */
1163 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
1164
1165 /*
1166 * insert the tuple into the new bucket. if it doesn't fit on
1167 * the current page in the new bucket, we must allocate a new
1168 * overflow page and place the tuple on that page instead.
1169 */
1170 itemsz = IndexTupleSize(new_itup);
1171 itemsz = MAXALIGN(itemsz);
1172
1173 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
1174 {
1175 /*
1176 * Change the shared buffer state in critical section,
1177 * otherwise any error could make it unrecoverable.
1178 */
1179 START_CRIT_SECTION();
1180
1181 _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
1182 MarkBufferDirty(nbuf);
1183 /* log the split operation before releasing the lock */
1184 log_split_page(rel, nbuf);
1185
1186 END_CRIT_SECTION();
1187
1188 /* drop lock, but keep pin */
1189 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1190
1191 /* be tidy */
1192 for (i = 0; i < nitups; i++)
1193 pfree(itups[i]);
1194 nitups = 0;
1195 all_tups_size = 0;
1196
1197 /* chain to a new overflow page */
1198 nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
1199 npage = BufferGetPage(nbuf);
1200 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1201 }
1202
1203 itups[nitups++] = new_itup;
1204 all_tups_size += itemsz;
1205 }
1206 else
1207 {
1208 /*
1209 * the tuple stays on this page, so nothing to do.
1210 */
1211 Assert(bucket == obucket);
1212 }
1213 }
1214
1215 oblkno = oopaque->hasho_nextblkno;
1216
1217 /* retain the pin on the old primary bucket */
1218 if (obuf == bucket_obuf)
1219 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1220 else
1221 _hash_relbuf(rel, obuf);
1222
1223 /* Exit loop if no more overflow pages in old bucket */
1224 if (!BlockNumberIsValid(oblkno))
1225 {
1226 /*
1227 * Change the shared buffer state in critical section, otherwise
1228 * any error could make it unrecoverable.
1229 */
1230 START_CRIT_SECTION();
1231
1232 _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
1233 MarkBufferDirty(nbuf);
1234 /* log the split operation before releasing the lock */
1235 log_split_page(rel, nbuf);
1236
1237 END_CRIT_SECTION();
1238
1239 if (nbuf == bucket_nbuf)
1240 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1241 else
1242 _hash_relbuf(rel, nbuf);
1243
1244 /* be tidy */
1245 for (i = 0; i < nitups; i++)
1246 pfree(itups[i]);
1247 break;
1248 }
1249
1250 /* Else, advance to next old page */
1251 obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
1252 opage = BufferGetPage(obuf);
1253 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1254 }
1255
1256 /*
1257 * We're at the end of the old bucket chain, so we're done partitioning
1258 * the tuples. Mark the old and new buckets to indicate split is
1259 * finished.
1260 *
1261 * To avoid deadlocks due to locking order of buckets, first lock the old
1262 * bucket and then the new bucket.
1263 */
1264 LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
1265 opage = BufferGetPage(bucket_obuf);
1266 oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
1267
1268 LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
1269 npage = BufferGetPage(bucket_nbuf);
1270 nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1271
1272 START_CRIT_SECTION();
1273
1274 oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
1275 nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
1276
1277 /*
1278 * After the split is finished, mark the old bucket to indicate that it
1279 * contains deletable tuples. We will clear split-cleanup flag after
1280 * deleting such tuples either at the end of split or at the next split
1281 * from old bucket or at the time of vacuum.
1282 */
1283 oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
1284
1285 /*
1286 * Now mark the buffers dirty. We don't release the locks here; the
1287 * caller is responsible for releasing them.
1288 */
1289 MarkBufferDirty(bucket_obuf);
1290 MarkBufferDirty(bucket_nbuf);
1291
1292 if (RelationNeedsWAL(rel))
1293 {
1294 XLogRecPtr recptr;
1295 xl_hash_split_complete xlrec;
1296
1297 xlrec.old_bucket_flag = oopaque->hasho_flag;
1298 xlrec.new_bucket_flag = nopaque->hasho_flag;
1299
1300 XLogBeginInsert();
1301
1302 XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
1303
1304 XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
1305 XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
1306
1307 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
1308
1309 PageSetLSN(BufferGetPage(bucket_obuf), recptr);
1310 PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
1311 }
1312
1313 END_CRIT_SECTION();
1314
1315 /*
1316 * If possible, clean up the old bucket. We might not be able to do this
1317 * if someone else has a pin on it, but if not then we can go ahead. This
1318 * isn't absolutely necessary, but it reduces bloat; if we don't do it
1319 * now, VACUUM will do it eventually, but maybe not until new overflow
1320 * pages have been allocated. Note that there's no need to clean up the
1321 * new bucket.
1322 */
1323 if (IsBufferCleanupOK(bucket_obuf))
1324 {
1325 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
1326 hashbucketcleanup(rel, obucket, bucket_obuf,
1327 BufferGetBlockNumber(bucket_obuf), NULL,
1328 maxbucket, highmask, lowmask, NULL, NULL, true,
1329 NULL, NULL);
1330 }
1331 else
1332 {
1333 LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
1334 LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
1335 }
1336 }
1337
1338 /*
1339 * _hash_finish_split() -- Finish the previously interrupted split operation
1340 *
1341 * To complete the split operation, we build a hash table of the TIDs already
1342 * present in the new bucket; the split operation then uses it to skip tuples
1343 * that were moved before the split was interrupted.
1344 *
1345 * The caller must hold a pin, but no lock, on the metapage and old bucket's
1346 * primary page buffer. The buffers are returned in the same state. (The
1347 * metapage is only touched if it becomes necessary to add or remove overflow
1348 * pages.)
1349 */
1350 void
1351 _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
1352 uint32 maxbucket, uint32 highmask, uint32 lowmask)
1353 {
1354 HASHCTL hash_ctl;
1355 HTAB *tidhtab;
1356 Buffer bucket_nbuf = InvalidBuffer;
1357 Buffer nbuf;
1358 Page npage;
1359 BlockNumber nblkno;
1360 BlockNumber bucket_nblkno;
1361 HashPageOpaque npageopaque;
1362 Bucket nbucket;
1363 bool found;
1364
1365 /* Initialize hash tables used to track TIDs */
1366 hash_ctl.keysize = sizeof(ItemPointerData);
1367 hash_ctl.entrysize = sizeof(ItemPointerData);
1368 hash_ctl.hcxt = CurrentMemoryContext;
1369
1370 tidhtab =
1371 hash_create("bucket ctids",
1372 256, /* arbitrary initial size */
1373 &hash_ctl,
1374 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1375
1376 bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
1377
1378 /*
1379 * Scan the new bucket and build hash table of TIDs
1380 */
1381 for (;;)
1382 {
1383 OffsetNumber noffnum;
1384 OffsetNumber nmaxoffnum;
1385
1386 nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
1387 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
1388
1389 /* remember the primary bucket buffer to acquire cleanup lock on it. */
1390 if (nblkno == bucket_nblkno)
1391 bucket_nbuf = nbuf;
1392
1393 npage = BufferGetPage(nbuf);
1394 npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1395
1396 /* Scan each tuple in new page */
1397 nmaxoffnum = PageGetMaxOffsetNumber(npage);
1398 for (noffnum = FirstOffsetNumber;
1399 noffnum <= nmaxoffnum;
1400 noffnum = OffsetNumberNext(noffnum))
1401 {
1402 IndexTuple itup;
1403
1404 /* Fetch the item's TID and insert it in hash table. */
1405 itup = (IndexTuple) PageGetItem(npage,
1406 PageGetItemId(npage, noffnum));
1407
1408 (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
1409
1410 Assert(!found);
1411 }
1412
1413 nblkno = npageopaque->hasho_nextblkno;
1414
1415 /*
1416 * release our lock without modifying the buffer, making sure to
1417 * retain the pin on the primary bucket.
1418 */
1419 if (nbuf == bucket_nbuf)
1420 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
1421 else
1422 _hash_relbuf(rel, nbuf);
1423
1424 /* Exit loop if no more overflow pages in new bucket */
1425 if (!BlockNumberIsValid(nblkno))
1426 break;
1427 }
1428
1429 /*
1430 * Conditionally get the cleanup lock on the old and new buckets to perform
1431 * the split operation. If we don't get the cleanup locks, silently give
1432 * up; the next insertion into the old bucket will try again to complete
1433 * the split.
1434 */
1435 if (!ConditionalLockBufferForCleanup(obuf))
1436 {
1437 hash_destroy(tidhtab);
1438 return;
1439 }
1440 if (!ConditionalLockBufferForCleanup(bucket_nbuf))
1441 {
1442 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
1443 hash_destroy(tidhtab);
1444 return;
1445 }
1446
1447 npage = BufferGetPage(bucket_nbuf);
1448 npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
1449 nbucket = npageopaque->hasho_bucket;
1450
1451 _hash_splitbucket(rel, metabuf, obucket,
1452 nbucket, obuf, bucket_nbuf, tidhtab,
1453 maxbucket, highmask, lowmask);
1454
1455 _hash_dropbuf(rel, bucket_nbuf);
1456 hash_destroy(tidhtab);
1457 }
1458
1459 /*
1460 * log_split_page() -- Log the split operation
1461 *
1462 * We log the split operation when a page in the new bucket gets full,
1463 * so we log the entire page.
1464 *
1465 * 'buf' must be locked by the caller which is also responsible for unlocking
1466 * it.
1467 */
1468 static void
1469 log_split_page(Relation rel, Buffer buf)
1470 {
1471 if (RelationNeedsWAL(rel))
1472 {
1473 XLogRecPtr recptr;
1474
1475 XLogBeginInsert();
1476
1477 XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
1478
1479 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
1480
1481 PageSetLSN(BufferGetPage(buf), recptr);
1482 }
1483 }
1484
1485 /*
1486 * _hash_getcachedmetap() -- Returns cached metapage data.
1487 *
1488 * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
1489 * the metapage. If not set, we'll set it before returning if we have to
1490 * refresh the cache, and return with a pin but no lock on it; caller is
1491 * responsible for releasing the pin.
1492 *
1493 * We refresh the cache if it's not initialized yet or force_refresh is true.
1494 */
1495 HashMetaPage
1496 _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
1497 {
1498 Page page;
1499
1500 Assert(metabuf);
1501 if (force_refresh || rel->rd_amcache == NULL)
1502 {
1503 char *cache = NULL;
1504
1505 /*
1506 * It's important that we don't set rd_amcache to an invalid value.
1507 * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
1508 * install a pointer to the newly-allocated storage in the actual
1509 * relcache entry until both have succeeded.
1510 */
1511 if (rel->rd_amcache == NULL)
1512 cache = MemoryContextAlloc(rel->rd_indexcxt,
1513 sizeof(HashMetaPageData));
1514
1515 /* Read the metapage. */
1516 if (BufferIsValid(*metabuf))
1517 LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
1518 else
1519 *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
1520 LH_META_PAGE);
1521 page = BufferGetPage(*metabuf);
1522
1523 /* Populate the cache. */
1524 if (rel->rd_amcache == NULL)
1525 rel->rd_amcache = cache;
1526 memcpy(rel->rd_amcache, HashPageGetMeta(page),
1527 sizeof(HashMetaPageData));
1528
1529 /* Release metapage lock, but keep the pin. */
1530 LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
1531 }
1532
1533 return (HashMetaPage) rel->rd_amcache;
1534 }
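/*
 * Illustrative call pattern (mirroring _hash_getbucketbuf_from_hashkey()
 * below):
 *
 *		Buffer		metabuf = InvalidBuffer;
 *		HashMetaPage metap = _hash_getcachedmetap(rel, &metabuf, false);
 *
 *		... use metap ...
 *
 *		if (BufferIsValid(metabuf))
 *			_hash_dropbuf(rel, metabuf);
 */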
1535
1536 /*
1537 * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
1538 * hashkey.
1539 *
1540 * Bucket pages do not move or get removed once they are allocated. This gives
1541 * us an opportunity to use the previously saved metapage contents to reach
1542 * the target bucket buffer, instead of reading from the metapage every time.
1543 * This saves one buffer access every time we want to reach the target bucket
1544 * buffer, which is a very helpful saving in bufmgr traffic and contention.
1545 *
1546 * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
1547 * bucket buffer has to be locked for reading or writing.
1548 *
1549 * The out parameter cachedmetap is set with metapage contents used for
1550 * hashkey to bucket buffer mapping. Some callers need this info to reach the
1551 * old bucket in case of bucket split, see _hash_doinsert().
1552 */
1553 Buffer
1554 _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
1555 HashMetaPage *cachedmetap)
1556 {
1557 HashMetaPage metap;
1558 Buffer buf;
1559 Buffer metabuf = InvalidBuffer;
1560 Page page;
1561 Bucket bucket;
1562 BlockNumber blkno;
1563 HashPageOpaque opaque;
1564
1565 /* We read from the target bucket buffer, hence locking is a must. */
1566 Assert(access == HASH_READ || access == HASH_WRITE);
1567
1568 metap = _hash_getcachedmetap(rel, &metabuf, false);
1569 Assert(metap != NULL);
1570
1571 /*
1572 * Loop until we get a lock on the correct target bucket.
1573 */
1574 for (;;)
1575 {
1576 /*
1577 * Compute the target bucket number, and convert to block number.
1578 */
1579 bucket = _hash_hashkey2bucket(hashkey,
1580 metap->hashm_maxbucket,
1581 metap->hashm_highmask,
1582 metap->hashm_lowmask);
1583
1584 blkno = BUCKET_TO_BLKNO(metap, bucket);
1585
1586 /* Fetch the primary bucket page for the bucket */
1587 buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
1588 page = BufferGetPage(buf);
1589 opaque = (HashPageOpaque) PageGetSpecialPointer(page);
1590 Assert(opaque->hasho_bucket == bucket);
1591 Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
1592
1593 /*
1594 * If this bucket hasn't been split, we're done.
1595 */
1596 if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
1597 break;
1598
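/*
 * Hypothetical values: if our cached copy says hashm_maxbucket is 511 but
 * this primary bucket page recorded hasho_prevblkno = 1023 when it was
 * last split, the cached metapage predates that split and must be
 * refreshed before the bucket mapping can be trusted.
 */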
1599 /* Drop lock on this buffer, update cached metapage, and retry. */
1600 _hash_relbuf(rel, buf);
1601 metap = _hash_getcachedmetap(rel, &metabuf, true);
1602 Assert(metap != NULL);
1603 }
1604
1605 if (BufferIsValid(metabuf))
1606 _hash_dropbuf(rel, metabuf);
1607
1608 if (cachedmetap)
1609 *cachedmetap = metap;
1610
1611 return buf;
1612 }
1613