/*-------------------------------------------------------------------------
 *
 * hashfn.c
 *		Generic hashing functions, and hash functions for use in dynahash.c
 *		hashtables
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/common/hashfn.c
 *
 * NOTES
 *	  It is expected that every bit of a hash function's 32-bit result is
 *	  as random as every other; failure to ensure this is likely to lead
 *	  to poor performance of hash tables.  In most cases a hash
 *	  function should use hash_bytes() or its variant hash_bytes_uint32(),
 *	  or the wrappers hash_any() and hash_uint32() defined in hashfn.h.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "common/hashfn.h"


/*
 * This hash function was written by Bob Jenkins
 * (bob_jenkins@burtleburtle.net), and superficially adapted
 * for PostgreSQL by Neil Conway. For more information on this
 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
 *
 * In the current code, we have adopted Bob's 2006 update of his hash
 * function to fetch the data a word at a time when it is suitably aligned.
 * This makes for a useful speedup, at the cost of having to maintain
 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
 * It also uses two separate mixing functions mix() and final(), instead
 * of a slower multi-purpose function.
 */

/* Get a bit mask of the bits set in non-uint32 aligned addresses */
#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)

/* Rotate a uint32 value left by k bits - note multiple evaluation! */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))

/*----------
 * mix -- mix 3 32-bit values reversibly.
 *
 * This is reversible, so any information in (a,b,c) before mix() is
 * still in (a,b,c) after mix().
 *
 * If four pairs of (a,b,c) inputs are run through mix(), or through
 * mix() in reverse, there are at least 32 bits of the output that
 * are sometimes the same for one pair and different for another pair.
 * This was tested for:
 * * pairs that differed by one bit, by two bits, in any combination
 *	 of top bits of (a,b,c), or in any combination of bottom bits of
 *	 (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *	 the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *	 is commonly produced by subtraction) looks like a single 1-bit
 *	 difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *	 all zero plus a counter that starts at zero.
 *
 * This does not achieve avalanche.  There are input bits of (a,b,c)
 * that fail to affect some output bits of (a,b,c), especially of a.  The
 * most thoroughly mixed value is c, but it doesn't really even achieve
 * avalanche in c.
 *
 * This allows some parallelism.  Read-after-writes are good at doubling
 * the number of bits affected, so the goal of mixing pulls in the opposite
 * direction from the goal of parallelism.  I did what I could.  Rotates
 * seem to cost as much as shifts on every machine I could lay my hands on,
 * and rotates are much kinder to the top and bottom bits, so I used rotates.
 *----------
 */
#define mix(a,b,c) \
{ \
  a -= c;  a ^= rot(c, 4);	c += b; \
  b -= a;  b ^= rot(a, 6);	a += c; \
  c -= b;  c ^= rot(b, 8);	b += a; \
  a -= c;  a ^= rot(c,16);	c += b; \
  b -= a;  b ^= rot(a,19);	a += c; \
  c -= b;  c ^= rot(b, 4);	b += a; \
}

/*----------
 * final -- final mixing of 3 32-bit values (a,b,c) into c
 *
 * Pairs of (a,b,c) values differing in only a few bits will usually
 * produce values of c that look totally different.  This was tested for
 * * pairs that differed by one bit, by two bits, in any combination
 *	 of top bits of (a,b,c), or in any combination of bottom bits of
 *	 (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *	 the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *	 is commonly produced by subtraction) looks like a single 1-bit
 *	 difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *	 all zero plus a counter that starts at zero.
 *
 * The use of separate functions for mix() and final() allows for a
 * substantial performance increase since final() does not need to
 * do well in reverse, but it does need to affect all output bits.
 * mix(), on the other hand, does not need to affect all output
 * bits (affecting 32 bits is enough).  The original hash function had
 * a single mixing operation that had to satisfy both sets of requirements
 * and was slower as a result.
 *----------
 */
#define final(a,b,c) \
{ \
  c ^= b; c -= rot(b,14); \
  a ^= c; a -= rot(c,11); \
  b ^= a; b -= rot(a,25); \
  c ^= b; c -= rot(b,16); \
  a ^= c; a -= rot(c, 4); \
  b ^= a; b -= rot(a,14); \
  c ^= b; c -= rot(b,24); \
}

/*
 * hash_bytes() -- hash a variable-length key into a 32-bit value
 *		k		: the key (the unaligned variable-length array of bytes)
 *		len		: the length of the key, counting by bytes
 *
 * Returns a uint32 value.  Every bit of the key affects every bit of
 * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
 * About 6*len+35 instructions. The best hash table sizes are powers
 * of 2.  There is no need to do mod a prime (mod is sooo slow!).
 * If you need less than 32 bits, use a bitmask.
 *
 * This procedure must never throw elog(ERROR); the ResourceOwner code
 * relies on this not to fail.
 *
 * Note: we could easily change this function to return a 64-bit hash value
 * by using the final values of both b and c.  b is perhaps a little less
 * well mixed than c, however.
 */
uint32
hash_bytes(const unsigned char *k, int keylen)
{
	uint32		a,
				b,
				c,
				len;

	/* Set up the internal state */
	len = keylen;
	a = b = c = 0x9e3779b9 + len + 3923095;

	/* If the source pointer is word-aligned, we use word-wide fetches */
	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
	{
		/* Code path for aligned source data */
		const uint32 *ka = (const uint32 *) k;

		/* handle most of the key */
		while (len >= 12)
		{
			a += ka[0];
			b += ka[1];
			c += ka[2];
			mix(a, b, c);
			ka += 3;
			len -= 12;
		}

		/* handle the last 11 bytes */
		k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	else
	{
		/* Code path for non-aligned source data */

		/* handle most of the key */
		while (len >= 12)
		{
#ifdef WORDS_BIGENDIAN
			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else							/* !WORDS_BIGENDIAN */
			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif							/* WORDS_BIGENDIAN */
			mix(a, b, c);
			k += 12;
			len -= 12;
		}

		/* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += k[7];
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += k[3];
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ((uint32) k[7] << 24);
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ((uint32) k[3] << 24);
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}

	final(a, b, c);

	/* report the result */
	return c;
}
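
/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * file; the HASHFN_USAGE_EXAMPLES guard macro and the helper name are
 * hypothetical).  It follows the advice in the hash_bytes() comment above:
 * hash arbitrary bytes and reduce the result to a power-of-2 table size
 * with a bitmask rather than a modulo by a prime.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static uint32
example_bucket_for_key(const char *key, uint32 table_size_power_of_2)
{
	uint32		h = hash_bytes((const unsigned char *) key,
							   (int) strlen(key));

	/* table_size_power_of_2 must be a power of two for the mask to work */
	return h & (table_size_power_of_2 - 1);
}
#endif							/* HASHFN_USAGE_EXAMPLES */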

/*
 * hash_bytes_extended() -- hash into a 64-bit value, using an optional seed
 *		k		: the key (the unaligned variable-length array of bytes)
 *		len		: the length of the key, counting by bytes
 *		seed	: a 64-bit seed (0 means no seed)
 *
 * Returns a uint64 value.  Otherwise similar to hash_bytes.
 */
uint64
hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
{
	uint32		a,
				b,
				c,
				len;

	/* Set up the internal state */
	len = keylen;
	a = b = c = 0x9e3779b9 + len + 3923095;

	/* If the seed is non-zero, use it to perturb the internal state. */
	if (seed != 0)
	{
		/*
		 * In essence, the seed is treated as part of the data being hashed,
		 * but for simplicity, we pretend that it's padded with four bytes of
		 * zeroes so that the seed constitutes a 12-byte chunk.
		 */
		a += (uint32) (seed >> 32);
		b += (uint32) seed;
		mix(a, b, c);
	}

	/* If the source pointer is word-aligned, we use word-wide fetches */
	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
	{
		/* Code path for aligned source data */
		const uint32 *ka = (const uint32 *) k;

		/* handle most of the key */
		while (len >= 12)
		{
			a += ka[0];
			b += ka[1];
			c += ka[2];
			mix(a, b, c);
			ka += 3;
			len -= 12;
		}

		/* handle the last 11 bytes */
		k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	else
	{
		/* Code path for non-aligned source data */

		/* handle most of the key */
		while (len >= 12)
		{
#ifdef WORDS_BIGENDIAN
			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else							/* !WORDS_BIGENDIAN */
			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif							/* WORDS_BIGENDIAN */
			mix(a, b, c);
			k += 12;
			len -= 12;
		}

		/* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += k[7];
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += k[3];
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ((uint32) k[7] << 24);
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ((uint32) k[3] << 24);
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}

	final(a, b, c);

	/* report the result */
	return ((uint64) b << 32) | c;
}
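
/*
 * Illustrative sketch (editor's addition; the guard macro, the helper name,
 * and the seed constants are hypothetical).  Passing different non-zero
 * seeds to hash_bytes_extended() yields effectively independent hash values
 * for the same key, which is the usual way to derive several hash functions
 * from one, e.g. for a Bloom filter.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_two_independent_hashes(const char *key, uint64 *h1, uint64 *h2)
{
	const unsigned char *bytes = (const unsigned char *) key;
	int			len = (int) strlen(key);

	/* different non-zero seeds perturb the internal state differently */
	*h1 = hash_bytes_extended(bytes, len, UINT64CONST(0x5851F42D4C957F2D));
	*h2 = hash_bytes_extended(bytes, len, UINT64CONST(0x9E3779B97F4A7C15));
}
#endif							/* HASHFN_USAGE_EXAMPLES */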

/*
 * hash_bytes_uint32() -- hash a 32-bit value to a 32-bit value
 *
 * This has the same result as
 *		hash_bytes(&k, sizeof(uint32))
 * but is faster and doesn't force the caller to store k into memory.
 */
uint32
hash_bytes_uint32(uint32 k)
{
	uint32		a,
				b,
				c;

	a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
	a += k;

	final(a, b, c);

	/* report the result */
	return c;
}
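
/*
 * Illustrative sketch (editor's addition; the guard macro and the helper
 * name are hypothetical).  It demonstrates the equivalence claimed in the
 * comment above: hashing a uint32 with hash_bytes_uint32() matches hashing
 * its in-memory representation with hash_bytes().
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_uint32_hash_equivalence(uint32 k)
{
	uint32		direct = hash_bytes_uint32(k);
	uint32		via_bytes = hash_bytes((const unsigned char *) &k,
									   (int) sizeof(uint32));

	Assert(direct == via_bytes);
}
#endif							/* HASHFN_USAGE_EXAMPLES */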

/*
 * hash_bytes_uint32_extended() -- hash 32-bit value to 64-bit value, with seed
 *
 * Like hash_bytes_uint32, this is a convenience function.
 */
uint64
hash_bytes_uint32_extended(uint32 k, uint64 seed)
{
	uint32		a,
				b,
				c;

	a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;

	if (seed != 0)
	{
		a += (uint32) (seed >> 32);
		b += (uint32) seed;
		mix(a, b, c);
	}

	a += k;

	final(a, b, c);

	/* report the result */
	return ((uint64) b << 32) | c;
}

/*
 * string_hash: hash function for keys that are NUL-terminated strings.
 *
 * NOTE: this is the default hash function if none is specified.
 */
uint32
string_hash(const void *key, Size keysize)
{
	/*
	 * If the string exceeds keysize-1 bytes, we want to hash only that many,
	 * because when it is copied into the hash table it will be truncated at
	 * that length.
	 */
	Size		s_len = strlen((const char *) key);

	s_len = Min(s_len, keysize - 1);
	return hash_bytes((const unsigned char *) key, (int) s_len);
}
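
/*
 * Illustrative sketch (editor's addition; the guard macro, the helper name,
 * and the sample keys are hypothetical).  With keysize = 8 only the first
 * keysize - 1 = 7 bytes participate, so keys that differ only beyond that
 * point hash identically, mirroring the truncation that the hash table
 * applies when it copies string keys.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_string_hash_truncation(void)
{
	uint32		h1 = string_hash("abcdefg-first", 8);
	uint32		h2 = string_hash("abcdefg-second", 8);

	Assert(h1 == h2);
}
#endif							/* HASHFN_USAGE_EXAMPLES */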

/*
 * tag_hash: hash function for fixed-size tag values
 */
uint32
tag_hash(const void *key, Size keysize)
{
	return hash_bytes((const unsigned char *) key, (int) keysize);
}

/*
 * uint32_hash: hash function for keys that are uint32 or int32
 *
 * (tag_hash works for this case too, but is slower)
 */
uint32
uint32_hash(const void *key, Size keysize)
{
	Assert(keysize == sizeof(uint32));
	return hash_bytes_uint32(*((const uint32 *) key));
}
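
/*
 * Illustrative sketch (editor's addition) of how these functions are meant
 * to be plugged into dynahash tables: a minimal hash_create() call, assuming
 * backend code with the dynahash API from utils/hsearch.h available.  The
 * entry struct, table name, and the HASHFN_USAGE_EXAMPLES guard macro are
 * hypothetical.  With HASH_BLOBS instead of HASH_FUNCTION, dynahash itself
 * chooses uint32_hash for 4-byte keys and tag_hash otherwise.
 */
#ifdef HASHFN_USAGE_EXAMPLES
#include "utils/hsearch.h"

typedef struct ExampleEntry
{
	uint32		key;			/* hash key; must be the first field */
	int			payload;
} ExampleEntry;

static HTAB *
example_create_uint32_table(void)
{
	HASHCTL		ctl;

	MemSet(&ctl, 0, sizeof(ctl));
	ctl.keysize = sizeof(uint32);
	ctl.entrysize = sizeof(ExampleEntry);
	ctl.hash = uint32_hash;

	return hash_create("example uint32 table", 128, &ctl,
					   HASH_ELEM | HASH_FUNCTION);
}
#endif							/* HASHFN_USAGE_EXAMPLES */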