/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
    Relation    heapRel;        /* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
                              ItemPointer tid,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = HTMaxStrategyNumber;
    amroutine->amsupport = HASHNProcs;
    amroutine->amoptsprocnum = HASHOPTIONS_PROC;
    amroutine->amcanorder = false;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = false;
    amroutine->amcanmulticol = false;
    amroutine->amoptionalkey = false;
    amroutine->amsearcharray = false;
    amroutine->amsearchnulls = false;
    amroutine->amstorage = false;
    amroutine->amclusterable = false;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = false;
    amroutine->amcaninclude = false;
    amroutine->amusemaintenanceworkmem = false;
    amroutine->amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL;
    amroutine->amkeytype = INT4OID;

    amroutine->ambuild = hashbuild;
    amroutine->ambuildempty = hashbuildempty;
    amroutine->aminsert = hashinsert;
    amroutine->ambulkdelete = hashbulkdelete;
    amroutine->amvacuumcleanup = hashvacuumcleanup;
    amroutine->amcanreturn = NULL;
    amroutine->amcostestimate = hashcostestimate;
    amroutine->amoptions = hashoptions;
    amroutine->amproperty = NULL;
    amroutine->ambuildphasename = NULL;
    amroutine->amvalidate = hashvalidate;
    amroutine->amadjustmembers = hashadjustmembers;
    amroutine->ambeginscan = hashbeginscan;
    amroutine->amrescan = hashrescan;
    amroutine->amgettuple = hashgettuple;
    amroutine->amgetbitmap = hashgetbitmap;
    amroutine->amendscan = hashendscan;
    amroutine->ammarkpos = NULL;
    amroutine->amrestrpos = NULL;
    amroutine->amestimateparallelscan = NULL;
    amroutine->aminitparallelscan = NULL;
    amroutine->amparallelrescan = NULL;

    PG_RETURN_POINTER(amroutine);
}
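
/*
 * For example, an index created with "CREATE INDEX t_col_idx ON t USING
 * hash (col)" (illustrative names) is routed through this handler: the
 * "hash" row in pg_am points at hashhandler, and the IndexAmRoutine built
 * above supplies the callbacks used to build and scan such an index.
 */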

/*
 * hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    BlockNumber relpages;
    double      reltuples;
    double      allvisfrac;
    uint32      num_buckets;
    long        sort_threshold;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly.  To prevent that scenario, we can sort the
     * tuples by (expected) bucket number.  However, such a sort is useless
     * overhead when the index does fit in RAM.  We choose to sort if the
     * initial index size exceeds maintenance_work_mem, or the number of
     * buffers usable for the index, whichever is less.  (Limiting by the
     * number of buffers should reduce thrashing between PG buffers and kernel
     * buffers, which seems useful even if no physical I/O results.  Limiting
     * by maintenance_work_mem is useful to allow easy testing of the sort
     * code path, and may be useful to DBAs as an additional control knob.)
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page.  Also, "initial index size" accounting does not include the
     * metapage, nor the first bitmap page.
     */
    sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
    if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
        sort_threshold = Min(sort_threshold, NBuffers);
    else
        sort_threshold = Min(sort_threshold, NLocBuffer);
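
    /*
     * For instance, with the default maintenance_work_mem of 64MB and 8kB
     * pages, sort_threshold starts out at 64 * 1024 / 8 = 8192 pages; with
     * the default shared_buffers of 128MB (16384 buffers) the Min() above
     * leaves it at 8192, so the sorted (spooling) build path below is taken
     * only when at least 8192 initial buckets are needed.
     */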

    if (num_buckets >= (uint32) sort_threshold)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;
    buildstate.heapRel = heap;

    /* do the heap scan */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       hashbuildCallback,
                                       (void *) &buildstate, NULL);
    pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
                                 buildstate.indtuples);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool, buildstate.heapRel);
        _h_spooldestroy(buildstate.spool);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    return result;
}

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 */
void
hashbuildempty(Relation index)
{
    _hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
                  ItemPointer tid,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    HashBuildState *buildstate = (HashBuildState *) state;
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(index,
                             values, isnull,
                             index_values, index_isnull))
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, tid, index_values, index_isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = index_form_tuple(RelationGetDescr(index),
                                index_values, index_isnull);
        itup->t_tid = *tid;
        _hash_doinsert(index, itup, buildstate->heapRel);
        pfree(itup);
    }

    buildstate->indtuples += 1;
}

/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
           ItemPointer ht_ctid, Relation heapRel,
           IndexUniqueCheck checkUnique,
           bool indexUnchanged,
           IndexInfo *indexInfo)
{
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(rel,
                             values, isnull,
                             index_values, index_isnull))
        return false;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
    itup->t_tid = *ht_ctid;

    _hash_doinsert(rel, itup, heapRel);

    pfree(itup);
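
    /*
     * The return value of aminsert is only meaningful for deferred
     * unique-index checks, which hash indexes do not support (amcanunique
     * is false above), so always report that no conflict was found.
     */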
    return false;
}


/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;
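
    /*
     * (Two distinct keys can share a hash code, so a match found below may
     * be a false positive; with xs_recheck set, the executor re-evaluates
     * the original scan qual against the heap tuple and discards such
     * matches.)
     */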

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    if (!HashScanPosIsValid(so->currPos))
        res = _hash_first(scan, dir);
    else
    {
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so remember it for later.  (We'll deal with all such
             * tuples at once right after leaving the index page or at end of
             * scan.)  If the caller reverses the indexscan direction, it is
             * quite possible that the same item might get entered multiple
             * times.  But we don't detect that; instead, we just forget any
             * excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = (int *)
                    palloc(MaxIndexTuplesPerPage * sizeof(int));

            if (so->numKilled < MaxIndexTuplesPerPage)
                so->killedItems[so->numKilled++] = so->currPos.itemIndex;
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }

    return res;
}


/*
 * hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;
    int64       ntids = 0;
    HashScanPosItem *currItem;

    res = _hash_first(scan, ForwardScanDirection);

    while (res)
    {
        currItem = &so->currPos.items[so->currPos.itemIndex];

        /*
         * _hash_first and _hash_next handle eliminating dead index entries
         * whenever scan->ignore_killed_tuples is true.  Therefore, there's
         * nothing to do here except add the results to the TIDBitmap.
         */
        tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
        ntids++;

        res = _hash_next(scan, ForwardScanDirection);
    }

    return ntids;
}


/*
 * hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    HashScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    HashScanPosInvalidate(so->currPos);
    so->hashso_bucket_buf = InvalidBuffer;
    so->hashso_split_bucket_buf = InvalidBuffer;

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;

    so->killedItems = NULL;
    so->numKilled = 0;

    scan->opaque = so;

    return scan;
}

/*
 * hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
           ScanKey orderbys, int norderbys)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    /* set position invalid (this will cause _hash_first call) */
    HashScanPosInvalidate(so->currPos);

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
    {
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    }

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;
}

/*
 * hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    if (so->killedItems != NULL)
        pfree(so->killedItems);
    pfree(so);
    scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes tuples that were moved to another bucket by a
 * split.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;
    HashMetaPage cachedmetap;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * We need a copy of the metapage so that we can use its hashm_spares[]
     * values to compute bucket page addresses, but a cached copy should be
     * good enough.  (If not, we'll detect that further down and refresh the
     * cache as necessary.)
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(cachedmetap != NULL);

    orig_maxbucket = cachedmetap->hashm_maxbucket;
    orig_ntuples = cachedmetap->hashm_ntuples;

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        Buffer      bucket_buf;
        Buffer      buf;
        HashPageOpaque bucket_opaque;
        Page        page;
        bool        split_cleanup = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

        blkno = bucket_blkno;

        /*
         * We need to acquire a cleanup lock on the primary bucket page to
         * wait out concurrent scans before deleting the dead tuples.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE);

        page = BufferGetPage(buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /*
         * If the bucket contains tuples that were moved by a split, then we
         * need to delete those tuples.  We can't delete them while the split
         * operation on the bucket is unfinished, as scans still need them.
         */
        if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
            H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
        {
            split_cleanup = true;

            /*
             * This bucket might have been split since we last held a lock on
             * the metapage.  If so, hashm_maxbucket, hashm_highmask and
             * hashm_lowmask might be old enough to cause us to fail to remove
             * tuples left behind by the most recent split.  To prevent that,
             * now that the primary page of the target bucket has been locked
             * (and thus can't be further split), check whether we need to
             * update our cached metapage data.
             */
            Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
            if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            {
                cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
                Assert(cachedmetap != NULL);
            }
        }

        bucket_buf = buf;

        hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
                          cachedmetap->hashm_maxbucket,
                          cachedmetap->hashm_highmask,
                          cachedmetap->hashm_lowmask, &tuples_removed,
                          &num_index_tuples, split_cleanup,
                          callback, callback_state);

        _hash_dropbuf(rel, bucket_buf);

        /* Advance to next bucket */
        cur_bucket++;
    }

    if (BufferIsInvalid(metabuf))
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

    /* Write-lock metapage and check for split since we started */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
        cur_maxbucket = cachedmetap->hashm_maxbucket;
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */
    START_CRIT_SECTION();

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_update_meta_page xlrec;
        XLogRecPtr  recptr;

        xlrec.ntuples = metap->hashm_ntuples;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

        XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    _hash_relbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        return NULL;

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

    return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer      buf;
    Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool        bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer      next_buf;
        Page        page;
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;
        bool        retain_pin = false;
        bool        clear_dead_marking = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple  itup;
            Bucket      bucket;
            bool        kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we rely strictly on the results of
             * the callback function; refer to btvacuumpage for the detailed
             * reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that were moved by split */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion if it belongs elsewhere */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to the current
                     * bucket or to new_bucket.  This is ensured because we
                     * don't allow further splits from a bucket that contains
                     * garbage.  See comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            /* No ereport(ERROR) until changes are logged */
            START_CRIT_SECTION();

            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;

            /*
             * Mark the page as clean if vacuum removes the DEAD tuples from
             * an index page.  We do this by clearing the
             * LH_PAGE_HAS_DEAD_TUPLES flag.
             */
            if (tuples_removed && *tuples_removed > 0 &&
                H_HAS_DEAD_TUPLES(opaque))
            {
                opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
                clear_dead_marking = true;
            }

            MarkBufferDirty(buf);

            /* XLOG stuff */
            if (RelationNeedsWAL(rel))
            {
                xl_hash_delete xlrec;
                XLogRecPtr  recptr;

                xlrec.clear_dead_marking = clear_dead_marking;
                xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;

                XLogBeginInsert();
                XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

                /*
                 * bucket buffer needs to be registered to ensure that we can
                 * acquire a cleanup lock on it during replay.
                 */
                if (!xlrec.is_primary_bucket_page)
                    XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

                XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
                XLogRegisterBufData(1, (char *) deletable,
                                    ndeletable * sizeof(OffsetNumber));

                recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
                PageSetLSN(BufferGetPage(buf), recptr);
            }

            END_CRIT_SECTION();
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * Lock the bucket page to clear the garbage flag and squeeze the bucket.
     * If the current buffer is the same as the bucket buffer, then we
     * already have a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * were moved by the split.  We purposely clear the flag before squeezing
     * the bucket, so that after a restart vacuum won't again try to delete
     * the moved-by-split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page        page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    /*
     * If we have deleted anything, try to compact free space.  To squeeze
     * the bucket we must have a cleanup lock, else it could affect the
     * ordering of tuples seen by a scan that started before the squeeze.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}