/*-------------------------------------------------------------------------
 *
 * hashutil.c
 *	  Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "port/pg_bitutils.h"
#include "storage/buf_internals.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

#define CALC_NEW_BUCKET(old_bucket, lowmask) \
			old_bucket | (lowmask + 1)
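
/*
 * CALC_NEW_BUCKET sets the bit just above "lowmask" in old_bucket, i.e. the
 * most significant bit of the current table half, yielding the bucket number
 * produced when old_bucket is split.  Illustrative example (values chosen
 * only for the example): old_bucket = 3 (binary 011) with lowmask = 3 gives
 * 3 | 4 = 7 (binary 111).
 */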

/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	/*
	 * Currently, we can't check any of the scan conditions since we do not
	 * have the original index entry value to supply to the sk_func. Always
	 * return true; we expect that hashgettuple already set the recheck flag
	 * to make the main indexscan code do it.
	 */
#ifdef NOT_USED
	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
	ScanKey		key = scan->keyData;
	int			scanKeySize = scan->numberOfKeys;

	while (scanKeySize > 0)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		datum = index_getattr(itup,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* assume sk_func is strict */
		if (isNull)
			return false;
		if (key->sk_flags & SK_ISNULL)
			return false;

		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
			return false;

		key++;
		scanKeySize--;
	}
#endif

	return true;
}

/*
 * _hash_datum2hashkey -- given a Datum, call the index's hash function
 *
 * The Datum is assumed to be of the index's column type, so we can use the
 * "primary" hash function that's tracked for us by the generic index code.
 */
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
	FmgrInfo   *procinfo;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}

/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *			hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
	RegProcedure hash_proc;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
								  keytype,
								  keytype,
								  HASHSTANDARD_PROC);
	if (!RegProcedureIsValid(hash_proc))
		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
			 HASHSTANDARD_PROC, keytype, keytype,
			 RelationGetRelationName(rel));
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

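	/*
	 * Linear-hashing bucket choice: mask with the wider highmask first; if
	 * that names a bucket that doesn't exist yet (> maxbucket), fall back to
	 * the narrower lowmask.  Illustrative example (values chosen only for
	 * the example): with maxbucket = 5, lowmask = 3, highmask = 7, a hashkey
	 * ending in binary 110 maps first to bucket 6, which doesn't exist, so
	 * it is re-masked with lowmask to bucket 2.
	 */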
	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *					   bucket
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

	splitpoint_group = pg_ceil_log2_32(num_bucket);

	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

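	/*
	 * Multi-phase groups contribute HASH_SPLITPOINT_PHASES_PER_GRP phases
	 * apiece.  Worked example, assuming the usual settings of
	 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10 and
	 * HASH_SPLITPOINT_PHASE_BITS = 2: num_bucket = 1500 gives
	 * splitpoint_group = 11, so we count 10 single-phase groups, plus
	 * (11 - 10) * 4 = 4 phases for group 10, plus ((1499 >> 8) & 3) = 1 for
	 * the phase within group 11, for a spare index of 15.
	 */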
	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value. */

	return splitpoint_phases;
}

/*
 *	_hash_get_totalbuckets -- returns the total number of buckets allocated
 *							up to and including the given splitpoint phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

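	/*
	 * Worked example, under the same assumed settings as above
	 * (HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10,
	 * HASH_SPLITPOINT_PHASE_BITS = 2): splitpoint_phase = 15 falls in group
	 * 11, which starts with 1 << 10 = 1024 buckets; each of its phases adds
	 * 1024 / 4 = 256 buckets, and phase 15 is the second phase of the group,
	 * so the total is 1024 + 2 * 256 = 1536.
	 */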
	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" contains corrupted page at block %u",
							RelationGetRelationName(rel),
							BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}

bytea *
hashoptions(Datum reloptions, bool validate)
{
	static const relopt_parse_elt tab[] = {
		{"fillfactor", RELOPT_TYPE_INT, offsetof(HashOptions, fillfactor)},
	};

	return (bytea *) build_reloptions(reloptions, validate,
									  RELOPT_KIND_HASH,
									  sizeof(HashOptions),
									  tab, lengthof(tab));
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very very cheaply ...
	 */
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}

/*
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 *		passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
bool
_hash_convert_tuple(Relation index,
					Datum *user_values, bool *user_isnull,
					Datum *index_values, bool *index_isnull)
{
	uint32		hashkey;

	/*
	 * We do not insert null values into hash indexes.  This is okay because
	 * the only supported search operator is '=', and we assume it is strict.
	 */
	if (user_isnull[0])
		return false;

	hashkey = _hash_datum2hashkey(index, user_values[0]);
	index_values[0] = UInt32GetDatum(hashkey);
	index_isnull[0] = false;
	return true;
}

/*
 * _hash_binsearch - Return the offset number in the page where the
 *					 specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page) + 1;
	lower = FirstOffsetNumber;

	while (upper > lower)
	{
		OffsetNumber off;
		IndexTuple	itup;
		uint32		hashkey;

		off = (upper + lower) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey < hash_value)
			lower = off + 1;
		else
			upper = off;
	}

	return lower;
}

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page);
	lower = FirstOffsetNumber - 1;

	while (upper > lower)
	{
		IndexTuple	itup;
		OffsetNumber off;
		uint32		hashkey;

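		/*
		 * Round the midpoint up: when the tuple at "off" still satisfies
		 * hashkey <= hash_value we keep it by setting lower = off, so
		 * rounding down could leave the range unchanged and loop forever
		 * once upper == lower + 1.
		 */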
		off = (upper + lower + 1) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey > hash_value)
			upper = off - 1;
		else
			lower = off;
	}

	return lower;
}

/*
 *	_hash_get_oldblock_from_newbucket() -- get the block number of the bucket
 *			from which the current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
	Bucket		old_bucket;
	uint32		mask;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/*
	 * To get the old bucket from the current bucket, we need a mask to map
	 * it into the lower half of the table.  Such a mask is stored in the
	 * meta page as hashm_lowmask, but we can't rely on that here, because we
	 * need the value of lowmask that was in effect when the bucket split
	 * started.  Masking off the most significant bit of the new bucket gives
	 * us the old bucket.
	 */
	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
	old_bucket = new_bucket & mask;
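
	/*
	 * Illustrative example: new_bucket = 11 (binary 1011) gives fls() = 4,
	 * hence mask = 7, so old_bucket = 11 & 7 = 3 (binary 011).
	 */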

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	blkno = BUCKET_TO_BLKNO(metap, old_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newblock_from_oldbucket() -- get the block number of the bucket
 *			that will be generated when the old bucket is split.
 *
 * This is used to find the new bucket from the old bucket based on the
 * current table half.  It is mainly needed to finish incomplete splits,
 * where we are sure that no more than one split can be in progress from
 * the old bucket.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 *			generated when the current (old) bucket is split.
 *
 * This is used to find the new bucket from the old bucket.  The new bucket
 * can be obtained by OR'ing the old bucket with the most significant bit of
 * the current table half (the lowmask passed to this function identifies
 * that bit).  Multiple buckets could have been split from the current
 * bucket; we need the first such bucket that exists.  The caller must
 * ensure that no more than one split has happened from the old bucket.
 */
Bucket
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
								   uint32 lowmask, uint32 maxbucket)
{
	Bucket		new_bucket;

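	/*
	 * Illustrative example (values chosen only for the example): with
	 * old_bucket = 3, lowmask = 7 and maxbucket = 10, CALC_NEW_BUCKET first
	 * yields 3 | 8 = 11, which doesn't exist yet, so we retry with the
	 * previous half's lowmask (3) and get 3 | 4 = 7.
	 */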
	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	if (new_bucket > maxbucket)
	{
		lowmask = lowmask >> 1;
		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	}

	return new_bucket;
}

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * The caller does not have a lock on the page and may or may not have the
 * page pinned in a buffer.  Note that read-lock is sufficient for setting
 * LP_DEAD status (which is only a hint).
 *
 * The caller must have pin on bucket buffer, but may or may not have pin
 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 *
 * There are never any scans active in a bucket at the time VACUUM begins,
 * because VACUUM takes a cleanup lock on the primary bucket page and scans
 * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
 * but before it finishes the entire bucket, but it can never pass VACUUM,
 * because VACUUM always locks the next page before releasing the lock on
 * the previous one.  Therefore, we don't have to worry about accidentally
 * killing a TID that has been reused for an unrelated tuple.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;
	int			i;
	bool		killedsomething = false;
	bool		havePin = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);
	Assert(HashScanPosIsValid(so->currPos));

	/*
	 * Always reset the scan state, so we don't look for the same items on
	 * other pages.
	 */
	so->numKilled = 0;

	blkno = so->currPos.currPage;
	if (HashScanPosIsPinned(so->currPos))
	{
		/*
		 * We already have a pin on this buffer, so all we need to do is
		 * acquire a lock on it.
		 */
		havePin = true;
		buf = so->currPos.buf;
		LockBuffer(buf, BUFFER_LOCK_SHARE);
	}
	else
		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);

	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		HashScanPosItem *currItem = &so->currPos.items[itemIndex];

		offnum = currItem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);

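		/*
		 * The remembered offset may no longer be exact (for example, later
		 * insertions on this page can shift items to higher offsets), so
		 * scan forward from it and confirm each candidate by heap TID; if
		 * the tuple can't be re-found we simply leave it alone, since
		 * LP_DEAD is only a hint.
		 */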
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint. Whenever
	 * we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(buf, true);
	}

	if (so->hashso_bucket_buf == so->currPos.buf ||
		havePin)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);
}