/*-------------------------------------------------------------------------
 *
 * hash.c
 *	  Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hash.c
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
	HSpool	   *spool;			/* NULL if not using spooling */
	double		indtuples;		/* # tuples accepted into index */
	Relation	heapRel;		/* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
							  ItemPointer tid,
							  Datum *values,
							  bool *isnull,
							  bool tupleIsAlive,
							  void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = HTMaxStrategyNumber;
	amroutine->amsupport = HASHNProcs;
	amroutine->amoptsprocnum = HASHOPTIONS_PROC;
	amroutine->amcanorder = false;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = false;
	amroutine->amcanmulticol = false;
	amroutine->amoptionalkey = false;
	amroutine->amsearcharray = false;
	amroutine->amsearchnulls = false;
	amroutine->amstorage = false;
	amroutine->amclusterable = false;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = false;
	amroutine->amcaninclude = false;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_BULKDEL;
	amroutine->amkeytype = INT4OID;
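	/*
	 * Note: amkeytype is INT4OID because hash index tuples store only the
	 * uint32 hash code of the key, never the original key value (see the
	 * lossy-scan handling in hashgettuple below).
	 */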

	amroutine->ambuild = hashbuild;
	amroutine->ambuildempty = hashbuildempty;
	amroutine->aminsert = hashinsert;
	amroutine->ambulkdelete = hashbulkdelete;
	amroutine->amvacuumcleanup = hashvacuumcleanup;
	amroutine->amcanreturn = NULL;
	amroutine->amcostestimate = hashcostestimate;
	amroutine->amoptions = hashoptions;
	amroutine->amproperty = NULL;
	amroutine->ambuildphasename = NULL;
	amroutine->amvalidate = hashvalidate;
	amroutine->amadjustmembers = hashadjustmembers;
	amroutine->ambeginscan = hashbeginscan;
	amroutine->amrescan = hashrescan;
	amroutine->amgettuple = hashgettuple;
	amroutine->amgetbitmap = hashgetbitmap;
	amroutine->amendscan = hashendscan;
	amroutine->ammarkpos = NULL;
	amroutine->amrestrpos = NULL;
	amroutine->amestimateparallelscan = NULL;
	amroutine->aminitparallelscan = NULL;
	amroutine->amparallelrescan = NULL;

	PG_RETURN_POINTER(amroutine);
}

/*
 *	hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	BlockNumber relpages;
	double		reltuples;
	double		allvisfrac;
	uint32		num_buckets;
	long		sort_threshold;
	HashBuildState buildstate;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* Estimate the number of rows currently present in the table */
	estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

	/* Initialize the hash index metadata page and initial buckets */
	num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

	/*
	 * If we just insert the tuples into the index in scan order, then
	 * (assuming their hash codes are pretty random) there will be no locality
	 * of access to the index, and if the index is bigger than available RAM
	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
	 * tuples by (expected) bucket number.  However, such a sort is useless
	 * overhead when the index does fit in RAM.  We choose to sort if the
	 * initial index size exceeds maintenance_work_mem, or the number of
	 * buffers usable for the index, whichever is less.  (Limiting by the
	 * number of buffers should reduce thrashing between PG buffers and kernel
	 * buffers, which seems useful even if no physical I/O results.  Limiting
	 * by maintenance_work_mem is useful to allow easy testing of the sort
	 * code path, and may be useful to DBAs as an additional control knob.)
	 *
	 * NOTE: this test will need adjustment if a bucket is ever different from
	 * one page.  Also, "initial index size" accounting does not include the
	 * metapage, nor the first bitmap page.
	 */
	sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
	if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
		sort_threshold = Min(sort_threshold, NBuffers);
	else
		sort_threshold = Min(sort_threshold, NLocBuffer);
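	/*
	 * For illustration only (example settings, not values computed here):
	 * with the default 8 kB BLCKSZ and maintenance_work_mem set to 64 MB
	 * (65536 kB), the initial threshold is 65536 * 1024 / 8192 = 8192 pages.
	 * Since each bucket is currently one page, the spool-and-sort path below
	 * is taken only when at least 8192 initial buckets are needed, and a
	 * smaller NBuffers or NLocBuffer lowers that threshold further.
	 */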

	if (num_buckets >= (uint32) sort_threshold)
		buildstate.spool = _h_spoolinit(heap, index, num_buckets);
	else
		buildstate.spool = NULL;

	/* prepare to build the index */
	buildstate.indtuples = 0;
	buildstate.heapRel = heap;

	/* do the heap scan */
	reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
									   hashbuildCallback,
									   (void *) &buildstate, NULL);
	pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
								 buildstate.indtuples);

	if (buildstate.spool)
	{
		/* sort the tuples and insert them into the index */
		_h_indexbuild(buildstate.spool, buildstate.heapRel);
		_h_spooldestroy(buildstate.spool);
	}

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	return result;
}

/*
 *	hashbuildempty() -- build an empty hash index in the initialization fork
 */
void
hashbuildempty(Relation index)
{
	_hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
				  ItemPointer tid,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	HashBuildState *buildstate = (HashBuildState *) state;
	Datum		index_values[1];
	bool		index_isnull[1];
	IndexTuple	itup;

	/* convert data to a hash key; on failure, do not insert anything */
	if (!_hash_convert_tuple(index,
							 values, isnull,
							 index_values, index_isnull))
		return;

	/* Either spool the tuple for sorting, or just put it into the index */
	if (buildstate->spool)
		_h_spool(buildstate->spool, tid, index_values, index_isnull);
	else
	{
		/* form an index tuple and point it at the heap tuple */
		itup = index_form_tuple(RelationGetDescr(index),
								index_values, index_isnull);
		itup->t_tid = *tid;
		_hash_doinsert(index, itup, buildstate->heapRel);
		pfree(itup);
	}

	buildstate->indtuples += 1;
}

/*
 *	hashinsert() -- insert an index tuple into a hash table.
 *
 *	Hash on the heap tuple's key, form an index tuple with hash code.
 *	Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
		   ItemPointer ht_ctid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   bool indexUnchanged,
		   IndexInfo *indexInfo)
{
	Datum		index_values[1];
	bool		index_isnull[1];
	IndexTuple	itup;

	/* convert data to a hash key; on failure, do not insert anything */
	if (!_hash_convert_tuple(rel,
							 values, isnull,
							 index_values, index_isnull))
		return false;

	/* form an index tuple and point it at the heap tuple */
	itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
	itup->t_tid = *ht_ctid;

	_hash_doinsert(rel, itup, heapRel);

	pfree(itup);

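	/*
	 * aminsert's result is only meaningful for uniqueness checking, which
	 * hash indexes don't support (amcanunique is false above), so false is
	 * the only sensible answer here.
	 */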
	return false;
}


/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	if (!HashScanPosIsValid(so->currPos))
		res = _hash_first(scan, dir);
	else
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so remember it for later.  (We'll deal with all such
			 * tuples at once right after leaving the index page or at end of
			 * scan.)  If the caller reverses the scan direction, it is quite
			 * possible that the same item gets entered multiple times.  We
			 * don't detect that; instead, we just forget any excess entries.
			 */
			if (so->killedItems == NULL)
				so->killedItems = (int *)
					palloc(MaxIndexTuplesPerPage * sizeof(int));

			if (so->numKilled < MaxIndexTuplesPerPage)
				so->killedItems[so->numKilled++] = so->currPos.itemIndex;
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}

	return res;
}


/*
 *	hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;
	int64		ntids = 0;
	HashScanPosItem *currItem;

	res = _hash_first(scan, ForwardScanDirection);

	while (res)
	{
		currItem = &so->currPos.items[so->currPos.itemIndex];

		/*
		 * _hash_first and _hash_next handle eliminating dead index entries
		 * whenever scan->ignore_killed_tuples is true.  Therefore, there's
		 * nothing to do here except add the results to the TIDBitmap.
		 */
		tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
		ntids++;

		res = _hash_next(scan, ForwardScanDirection);
	}

	return ntids;
}


/*
 *	hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	HashScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
	HashScanPosInvalidate(so->currPos);
	so->hashso_bucket_buf = InvalidBuffer;
	so->hashso_split_bucket_buf = InvalidBuffer;

	so->hashso_buc_populated = false;
	so->hashso_buc_split = false;

	so->killedItems = NULL;
	so->numKilled = 0;

	scan->opaque = so;

	return scan;
}

/*
 *	hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	if (HashScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_hash_kill_items(scan);
	}

	_hash_dropscanbuf(rel, so);

	/* set position invalid (this will cause _hash_first call) */
	HashScanPosInvalidate(so->currPos);

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	}

	so->hashso_buc_populated = false;
	so->hashso_buc_split = false;
}

/*
 *	hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	if (HashScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_hash_kill_items(scan);
	}

	_hash_dropscanbuf(rel, so);

	if (so->killedItems != NULL)
		pfree(so->killedItems);
	pfree(so);
	scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that were moved to another bucket
 * by a split.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf = InvalidBuffer;
	HashMetaPage metap;
	HashMetaPage cachedmetap;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * We need a copy of the metapage so that we can use its hashm_spares[]
	 * values to compute bucket page addresses, but a cached copy should be
	 * good enough.  (If not, we'll detect that further down and refresh the
	 * cache as necessary.)
	 */
	cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
	Assert(cachedmetap != NULL);

	orig_maxbucket = cachedmetap->hashm_maxbucket;
	orig_ntuples = cachedmetap->hashm_ntuples;

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		Buffer		bucket_buf;
		Buffer		buf;
		HashPageOpaque bucket_opaque;
		Page		page;
		bool		split_cleanup = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

		blkno = bucket_blkno;

		/*
		 * We need to acquire a cleanup lock on the primary bucket page to
		 * wait out concurrent scans before deleting the dead tuples.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);

		page = BufferGetPage(buf);
		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If the bucket contains tuples that were moved by a split, we need
		 * to delete them.  We can't do that while the split of this bucket is
		 * still in progress, because in-progress scans still need those
		 * tuples.
		 */
		if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
			H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
		{
			split_cleanup = true;

			/*
			 * This bucket might have been split since we last held a lock on
			 * the metapage.  If so, hashm_maxbucket, hashm_highmask and
			 * hashm_lowmask might be old enough to cause us to fail to remove
			 * tuples left behind by the most recent split.  To prevent that,
			 * now that the primary page of the target bucket has been locked
			 * (and thus can't be further split), check whether we need to
			 * update our cached metapage data.
			 */
			Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
			if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
			{
				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
				Assert(cachedmetap != NULL);
			}
		}

		bucket_buf = buf;

		hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
						  cachedmetap->hashm_maxbucket,
						  cachedmetap->hashm_highmask,
						  cachedmetap->hashm_lowmask, &tuples_removed,
						  &num_index_tuples, split_cleanup,
						  callback, callback_state);

		_hash_dropbuf(rel, bucket_buf);

		/* Advance to next bucket */
		cur_bucket++;
	}

	if (BufferIsInvalid(metabuf))
		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

	/* Write-lock metapage and check for split since we started */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
		Assert(cachedmetap != NULL);
		cur_maxbucket = cachedmetap->hashm_maxbucket;
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */
	START_CRIT_SECTION();

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
		 * (Note: we still return estimated_count = false, because using this
		 * count is better than not updating reltuples at all.)
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_update_meta_page xlrec;
		XLogRecPtr	recptr;

		xlrec.ntuples = metap->hashm_ntuples;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

		XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	_hash_relbuf(rel, metabuf);

	/* return statistics */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->estimated_count = false;
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	Relation	rel = info->index;
	BlockNumber num_pages;

	/* If hashbulkdelete wasn't called, return NULL signifying no change */
	/* Note: this covers the analyze_only case too */
	if (stats == NULL)
		return NULL;

	/* update statistics */
	num_pages = RelationGetNumberOfBlocks(rel);
	stats->num_pages = num_pages;

	return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is not
 * ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
				  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool split_cleanup,
				  IndexBulkDeleteCallback callback, void *callback_state)
{
	BlockNumber blkno;
	Buffer		buf;
	Bucket		new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
	bool		bucket_dirty = false;

	blkno = bucket_blkno;
	buf = bucket_buf;

	if (split_cleanup)
		new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
														lowmask, maxbucket);

	/* Scan each page in bucket */
	for (;;)
	{
		HashPageOpaque opaque;
		OffsetNumber offno;
		OffsetNumber maxoffno;
		Buffer		next_buf;
		Page		page;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;
		bool		retain_pin = false;
		bool		clear_dead_marking = false;

		vacuum_delay_point();

		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* Scan each tuple in page */
		maxoffno = PageGetMaxOffsetNumber(page);
		for (offno = FirstOffsetNumber;
			 offno <= maxoffno;
			 offno = OffsetNumberNext(offno))
		{
			ItemPointer htup;
			IndexTuple	itup;
			Bucket		bucket;
			bool		kill_tuple = false;

			itup = (IndexTuple) PageGetItem(page,
											PageGetItemId(page, offno));
			htup = &(itup->t_tid);

			/*
			 * To remove the dead tuples, we rely strictly on the results of
			 * the callback function; see btvacuumpage for the detailed
			 * reasoning.
			 */
			if (callback && callback(htup, callback_state))
			{
				kill_tuple = true;
				if (tuples_removed)
					*tuples_removed += 1;
			}
			else if (split_cleanup)
			{
				/* delete the tuples that were moved by a split */
				bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
											  maxbucket,
											  highmask,
											  lowmask);
				/* mark the item for deletion */
				if (bucket != cur_bucket)
				{
					/*
					 * We expect each tuple to belong to either the current
					 * bucket or new_bucket.  This is ensured because we don't
					 * allow further splits from a bucket that contains
					 * garbage.  See comments in _hash_expandtable.
					 */
					Assert(bucket == new_bucket);
					kill_tuple = true;
				}
			}

			if (kill_tuple)
			{
				/* mark the item for deletion */
				deletable[ndeletable++] = offno;
			}
			else
			{
				/* we're keeping it, so count it */
				if (num_index_tuples)
					*num_index_tuples += 1;
			}
		}

		/* retain the pin on primary bucket page till end of bucket scan */
		if (blkno == bucket_blkno)
			retain_pin = true;
		else
			retain_pin = false;

		blkno = opaque->hasho_nextblkno;

		/*
		 * Apply deletions, advance to next page and write page if needed.
		 */
		if (ndeletable > 0)
		{
			/* No ereport(ERROR) until changes are logged */
			START_CRIT_SECTION();

			PageIndexMultiDelete(page, deletable, ndeletable);
			bucket_dirty = true;

			/*
			 * Mark the page as clean if vacuum removed DEAD tuples from it.
			 * We do this by clearing the LH_PAGE_HAS_DEAD_TUPLES flag.
			 */
			if (tuples_removed && *tuples_removed > 0 &&
				H_HAS_DEAD_TUPLES(opaque))
			{
				opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
				clear_dead_marking = true;
			}

			MarkBufferDirty(buf);

			/* XLOG stuff */
			if (RelationNeedsWAL(rel))
			{
				xl_hash_delete xlrec;
				XLogRecPtr	recptr;

				xlrec.clear_dead_marking = clear_dead_marking;
				xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;

				XLogBeginInsert();
				XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

				/*
				 * bucket buffer needs to be registered to ensure that we can
				 * acquire a cleanup lock on it during replay.
				 */
				if (!xlrec.is_primary_bucket_page)
					XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

				XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
				XLogRegisterBufData(1, (char *) deletable,
									ndeletable * sizeof(OffsetNumber));

				recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
				PageSetLSN(BufferGetPage(buf), recptr);
			}

			END_CRIT_SECTION();
		}

		/* bail out if there are no more pages to scan. */
		if (!BlockNumberIsValid(blkno))
			break;

		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);

		/*
		 * release the lock on previous page after acquiring the lock on next
		 * page
		 */
		if (retain_pin)
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, buf);

		buf = next_buf;
	}

	/*
	 * Lock the bucket page to clear the garbage flag and squeeze the bucket.
	 * If the current buffer is the same as the bucket buffer, then we already
	 * hold a lock on the bucket page.
	 */
	if (buf != bucket_buf)
	{
		_hash_relbuf(rel, buf);
		LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * Clear the garbage flag from the bucket after deleting the tuples that
	 * were moved by a split.  We purposely clear the flag before squeezing
	 * the bucket, so that after a restart, vacuum won't again try to delete
	 * the moved-by-split tuples.
	 */
	if (split_cleanup)
	{
		HashPageOpaque bucket_opaque;
		Page		page;

		page = BufferGetPage(bucket_buf);
		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		MarkBufferDirty(bucket_buf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr;

			XLogBeginInsert();
			XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	/*
	 * If we have deleted anything, try to compact free space.  For squeezing
	 * the bucket, we must have a cleanup lock; otherwise the squeeze could
	 * upset the ordering of tuples seen by a scan that started before it.
	 */
	if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
							bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}