/*-------------------------------------------------------------------------
 *
 * hashutil.c
 *	  Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "storage/buf_internals.h"

#define CALC_NEW_BUCKET(old_bucket, lowmask) \
			old_bucket | (lowmask + 1)
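
/*
 * For example, with lowmask = 3 (binary 011), old bucket 1 (binary 001)
 * maps to new bucket 1 | 4 = 5 (binary 101): the new bucket number is the
 * old one with the next-higher bit set.
 */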

/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	/*
	 * Currently, we can't check any of the scan conditions since we do not
	 * have the original index entry value to supply to the sk_func. Always
	 * return true; we expect that hashgettuple already set the recheck flag
	 * to make the main indexscan code do it.
	 */
#ifdef NOT_USED
	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
	ScanKey		key = scan->keyData;
	int			scanKeySize = scan->numberOfKeys;

	while (scanKeySize > 0)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		datum = index_getattr(itup,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* assume sk_func is strict */
		if (isNull)
			return false;
		if (key->sk_flags & SK_ISNULL)
			return false;

		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
			return false;

		key++;
		scanKeySize--;
	}
#endif

	return true;
}

/*
 * _hash_datum2hashkey -- given a Datum, call the index's hash procedure
 *
 * The Datum is assumed to be of the index's column type, so we can use the
 * "primary" hash procedure that's tracked for us by the generic index code.
 */
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
	FmgrInfo   *procinfo;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	procinfo = index_getprocinfo(rel, 1, HASHPROC);
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}
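
/*
 * For instance, for a single-column hash index on an int4 attribute with
 * the default opclass, the support function looked up here is hashint4(),
 * so hashing the key 42 simply computes hashint4(42).
 */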

/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *			hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
	RegProcedure hash_proc;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
								  keytype,
								  keytype,
								  HASHPROC);
	if (!RegProcedureIsValid(hash_proc))
		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
			 HASHPROC, keytype, keytype,
			 RelationGetRelationName(rel));
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}
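
/*
 * For example, with maxbucket = 10, highmask = 15 and lowmask = 7: a
 * hashkey whose low four bits are 13 first maps to bucket 13, which does
 * not exist yet (13 > 10), so it is masked down to bucket 13 & 7 = 5.
 */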

/*
 * _hash_log2 -- returns ceil(lg2(num))
 */
uint32
_hash_log2(uint32 num)
{
	uint32		i,
				limit;

	limit = 1;
	for (i = 0; limit < num; limit <<= 1, i++)
		;
	return i;
}
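
/*
 * For example, _hash_log2(1) = 0, _hash_log2(4) = 2, and _hash_log2(5) = 3,
 * since 2^3 = 8 is the smallest power of two >= 5.
 */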

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *					   bucket
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

	splitpoint_group = _hash_log2(num_bucket);

	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value */

	return splitpoint_phases;
}
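
/*
 * Worked example (assuming HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10 and
 * HASH_SPLITPOINT_PHASE_BITS = 2, per access/hash.h): for num_bucket = 1536,
 * _hash_log2 gives group 11, so we count 10 single-phase groups, plus
 * (11 - 10) << 2 = 4 phases for group 10, plus ((1535 >> 8) & 3) = 1 for
 * the phase reached within group 11, giving spare index 15.
 */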

/*
 *	_hash_get_totalbuckets -- returns the total number of buckets allocated
 *							up to (and including) the given splitpoint phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}
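
/*
 * Continuing the example above (same assumed constants): splitpoint phase 15
 * lies in group 11, so total_buckets starts at 1 << 10 = 1024; phase 15 is
 * the second phase of that group, adding (1024 >> 2) * 2 = 512, for a total
 * of 1536 -- the inverse of _hash_spareindex(1536) = 15.
 */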

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" contains corrupted page at block %u",
							RelationGetRelationName(rel),
							BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}

/*
 * hashoptions -- parse and validate the reloptions for a hash index
 */
bytea *
hashoptions(Datum reloptions, bool validate)
{
	return default_reloptions(reloptions, validate, RELOPT_KIND_HASH);
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very very cheaply ...
	 */
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}

/*
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 *		passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
bool
_hash_convert_tuple(Relation index,
					Datum *user_values, bool *user_isnull,
					Datum *index_values, bool *index_isnull)
{
	uint32		hashkey;

	/*
	 * We do not insert null values into hash indexes.  This is okay because
	 * the only supported search operator is '=', and we assume it is strict.
	 */
	if (user_isnull[0])
		return false;

	hashkey = _hash_datum2hashkey(index, user_values[0]);
	index_values[0] = UInt32GetDatum(hashkey);
	index_isnull[0] = false;
	return true;
}

/*
 * _hash_binsearch - Return the offset number in the page where the
 *					 specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page) + 1;
	lower = FirstOffsetNumber;

	while (upper > lower)
	{
		OffsetNumber off;
		IndexTuple	itup;
		uint32		hashkey;

		off = (upper + lower) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey < hash_value)
			lower = off + 1;
		else
			upper = off;
	}

	return lower;
}
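
/*
 * For example, on a page whose tuples carry hash keys (10, 20, 20, 30) at
 * offsets 1..4, searching for 20 returns offset 2 (the first match),
 * searching for 25 returns offset 4 (the insertion point), and searching
 * for 40 returns offset 5 (max offset plus one).
 */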

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page);
	lower = FirstOffsetNumber - 1;

	while (upper > lower)
	{
		IndexTuple	itup;
		OffsetNumber off;
		uint32		hashkey;

		off = (upper + lower + 1) / 2;
		Assert(OffsetNumberIsValid(off));

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey > hash_value)
			upper = off - 1;
		else
			lower = off;
	}

	return lower;
}
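
/*
 * With the same example page -- hash keys (10, 20, 20, 30) at offsets 1..4
 * -- searching for 20 returns offset 3 (the last match), while searching
 * for 5 returns 0, i.e. one before FirstOffsetNumber.
 */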

/*
 *	_hash_get_oldblock_from_newbucket() -- get the block number of the bucket
 *			from which the current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
	Bucket		old_bucket;
	uint32		mask;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/*
	 * To get the old bucket from the current bucket, we need a mask that
	 * maps it into the lower half of the table.  Such a mask is stored in
	 * the meta page as hashm_lowmask, but we can't rely on that here,
	 * because we need the value of lowmask that was in effect when this
	 * bucket split started.  Masking off the most significant bit of the
	 * new bucket gives us the old bucket.
	 */
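	/*
	 * For example, if new_bucket is 5 (binary 101), fls returns 3, so mask
	 * is (1 << 2) - 1 = 3 and old_bucket is 5 & 3 = 1 -- bucket 5 is
	 * always split from bucket 1.
	 */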
	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
	old_bucket = new_bucket & mask;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	blkno = BUCKET_TO_BLKNO(metap, old_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newblock_from_oldbucket() -- get the block number of the bucket
 *			that will be generated after a split from the old bucket.
 *
 * This is used to find the new bucket from the old bucket based on the
 * current table half.  It is mainly needed to finish incomplete splits,
 * where we can be sure that no more than one split from the old bucket is
 * in progress.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 *			generated after a split from the current (old) bucket.
 *
 * This is used to find the new bucket from the old bucket.  The new bucket
 * is obtained by OR'ing the old bucket with the most significant bit of the
 * current table half (the lowmask passed to this function identifies that
 * bit).  Multiple buckets may have been split from the old bucket over
 * time; we want the first such bucket that actually exists.  The caller
 * must ensure that no more than one split from the old bucket is in
 * progress.
 */
Bucket
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
								   uint32 lowmask, uint32 maxbucket)
{
	Bucket		new_bucket;

	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	if (new_bucket > maxbucket)
	{
		lowmask = lowmask >> 1;
		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	}

	return new_bucket;
}
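
/*
 * For example, with old_bucket = 1, lowmask = 3 and maxbucket = 4:
 * CALC_NEW_BUCKET first yields 1 | 4 = 5, which exceeds maxbucket, so we
 * retry with lowmask = 1 and get 1 | 2 = 3, the existing bucket that was
 * split from bucket 1 in the previous table half.
 */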

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and the killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;
	int			i;
	bool		killedsomething = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);

	/*
	 * Always reset the scan state, so we don't look for the same items on
	 * other pages.
	 */
	so->numKilled = 0;

	page = BufferGetPage(so->hashso_curbuf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		offnum = so->killedItems[i].indexOffset;

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as a dirty hint.
	 * Whenever we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(so->hashso_curbuf, true);
	}
}