1 /*-------------------------------------------------------------------------
2  *
3  * hashfunc.c
4  *	  Support functions for hash access method.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/hash/hashfunc.c
12  *
13  * NOTES
14  *	  These functions are stored in pg_amproc.  For each operator class
15  *	  defined for hash indexes, they compute the hash value of the argument.
16  *
17  *	  Additional hash functions appear in /utils/adt/ files for various
18  *	  specialized datatypes.
19  *
20  *	  It is expected that every bit of a hash function's 32-bit result is
21  *	  as random as every other; failure to ensure this is likely to lead
22  *	  to poor performance of hash joins, for example.  In most cases a hash
23  *	  function should use hash_any() or its variant hash_uint32().
24  *-------------------------------------------------------------------------
25  */
26 
27 #include "postgres.h"
28 
29 #include "access/hash.h"
30 #include "catalog/pg_collation.h"
31 #include "common/hashfn.h"
32 #include "utils/builtins.h"
33 #include "utils/float.h"
34 #include "utils/pg_locale.h"
35 
36 /*
37  * Datatype-specific hash functions.
38  *
39  * These support both hash indexes and hash joins.
40  *
41  * NOTE: some of these are also used by catcache operations, without
42  * any direct connection to hash indexes.  Also, the common hash_any
43  * routine is also used by dynahash tables.
44  */
45 
46 /* Note: this is used for both "char" and boolean datatypes */
47 Datum
hashchar(PG_FUNCTION_ARGS)48 hashchar(PG_FUNCTION_ARGS)
49 {
50 	return hash_uint32((int32) PG_GETARG_CHAR(0));
51 }
52 
53 Datum
hashcharextended(PG_FUNCTION_ARGS)54 hashcharextended(PG_FUNCTION_ARGS)
55 {
56 	return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1));
57 }
58 
59 Datum
hashint2(PG_FUNCTION_ARGS)60 hashint2(PG_FUNCTION_ARGS)
61 {
62 	return hash_uint32((int32) PG_GETARG_INT16(0));
63 }
64 
65 Datum
hashint2extended(PG_FUNCTION_ARGS)66 hashint2extended(PG_FUNCTION_ARGS)
67 {
68 	return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1));
69 }
70 
71 Datum
hashint4(PG_FUNCTION_ARGS)72 hashint4(PG_FUNCTION_ARGS)
73 {
74 	return hash_uint32(PG_GETARG_INT32(0));
75 }
76 
77 Datum
hashint4extended(PG_FUNCTION_ARGS)78 hashint4extended(PG_FUNCTION_ARGS)
79 {
80 	return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1));
81 }
82 
83 Datum
hashint8(PG_FUNCTION_ARGS)84 hashint8(PG_FUNCTION_ARGS)
85 {
86 	/*
87 	 * The idea here is to produce a hash value compatible with the values
88 	 * produced by hashint4 and hashint2 for logically equal inputs; this is
89 	 * necessary to support cross-type hash joins across these input types.
90 	 * Since all three types are signed, we can xor the high half of the int8
91 	 * value if the sign is positive, or the complement of the high half when
92 	 * the sign is negative.
93 	 */
94 	int64		val = PG_GETARG_INT64(0);
95 	uint32		lohalf = (uint32) val;
96 	uint32		hihalf = (uint32) (val >> 32);
97 
98 	lohalf ^= (val >= 0) ? hihalf : ~hihalf;
99 
100 	return hash_uint32(lohalf);
101 }
102 
103 Datum
hashint8extended(PG_FUNCTION_ARGS)104 hashint8extended(PG_FUNCTION_ARGS)
105 {
106 	/* Same approach as hashint8 */
107 	int64		val = PG_GETARG_INT64(0);
108 	uint32		lohalf = (uint32) val;
109 	uint32		hihalf = (uint32) (val >> 32);
110 
111 	lohalf ^= (val >= 0) ? hihalf : ~hihalf;
112 
113 	return hash_uint32_extended(lohalf, PG_GETARG_INT64(1));
114 }
115 
116 Datum
hashoid(PG_FUNCTION_ARGS)117 hashoid(PG_FUNCTION_ARGS)
118 {
119 	return hash_uint32((uint32) PG_GETARG_OID(0));
120 }
121 
122 Datum
hashoidextended(PG_FUNCTION_ARGS)123 hashoidextended(PG_FUNCTION_ARGS)
124 {
125 	return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1));
126 }
127 
128 Datum
hashenum(PG_FUNCTION_ARGS)129 hashenum(PG_FUNCTION_ARGS)
130 {
131 	return hash_uint32((uint32) PG_GETARG_OID(0));
132 }
133 
134 Datum
hashenumextended(PG_FUNCTION_ARGS)135 hashenumextended(PG_FUNCTION_ARGS)
136 {
137 	return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1));
138 }
139 
140 Datum
hashfloat4(PG_FUNCTION_ARGS)141 hashfloat4(PG_FUNCTION_ARGS)
142 {
143 	float4		key = PG_GETARG_FLOAT4(0);
144 	float8		key8;
145 
146 	/*
147 	 * On IEEE-float machines, minus zero and zero have different bit patterns
148 	 * but should compare as equal.  We must ensure that they have the same
149 	 * hash value, which is most reliably done this way:
150 	 */
151 	if (key == (float4) 0)
152 		PG_RETURN_UINT32(0);
153 
154 	/*
155 	 * To support cross-type hashing of float8 and float4, we want to return
156 	 * the same hash value hashfloat8 would produce for an equal float8 value.
157 	 * So, widen the value to float8 and hash that.  (We must do this rather
158 	 * than have hashfloat8 try to narrow its value to float4; that could fail
159 	 * on overflow.)
160 	 */
161 	key8 = key;
162 
163 	/*
164 	 * Similarly, NaNs can have different bit patterns but they should all
165 	 * compare as equal.  For backwards-compatibility reasons we force them to
166 	 * have the hash value of a standard float8 NaN.  (You'd think we could
167 	 * replace key with a float4 NaN and then widen it; but on some old
168 	 * platforms, that way produces a different bit pattern.)
169 	 */
170 	if (isnan(key8))
171 		key8 = get_float8_nan();
172 
173 	return hash_any((unsigned char *) &key8, sizeof(key8));
174 }
175 
176 Datum
hashfloat4extended(PG_FUNCTION_ARGS)177 hashfloat4extended(PG_FUNCTION_ARGS)
178 {
179 	float4		key = PG_GETARG_FLOAT4(0);
180 	uint64		seed = PG_GETARG_INT64(1);
181 	float8		key8;
182 
183 	/* Same approach as hashfloat4 */
184 	if (key == (float4) 0)
185 		PG_RETURN_UINT64(seed);
186 	key8 = key;
187 	if (isnan(key8))
188 		key8 = get_float8_nan();
189 
190 	return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed);
191 }
192 
193 Datum
hashfloat8(PG_FUNCTION_ARGS)194 hashfloat8(PG_FUNCTION_ARGS)
195 {
196 	float8		key = PG_GETARG_FLOAT8(0);
197 
198 	/*
199 	 * On IEEE-float machines, minus zero and zero have different bit patterns
200 	 * but should compare as equal.  We must ensure that they have the same
201 	 * hash value, which is most reliably done this way:
202 	 */
203 	if (key == (float8) 0)
204 		PG_RETURN_UINT32(0);
205 
206 	/*
207 	 * Similarly, NaNs can have different bit patterns but they should all
208 	 * compare as equal.  For backwards-compatibility reasons we force them to
209 	 * have the hash value of a standard NaN.
210 	 */
211 	if (isnan(key))
212 		key = get_float8_nan();
213 
214 	return hash_any((unsigned char *) &key, sizeof(key));
215 }
216 
217 Datum
hashfloat8extended(PG_FUNCTION_ARGS)218 hashfloat8extended(PG_FUNCTION_ARGS)
219 {
220 	float8		key = PG_GETARG_FLOAT8(0);
221 	uint64		seed = PG_GETARG_INT64(1);
222 
223 	/* Same approach as hashfloat8 */
224 	if (key == (float8) 0)
225 		PG_RETURN_UINT64(seed);
226 	if (isnan(key))
227 		key = get_float8_nan();
228 
229 	return hash_any_extended((unsigned char *) &key, sizeof(key), seed);
230 }
231 
232 Datum
hashoidvector(PG_FUNCTION_ARGS)233 hashoidvector(PG_FUNCTION_ARGS)
234 {
235 	oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);
236 
237 	return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
238 }
239 
240 Datum
hashoidvectorextended(PG_FUNCTION_ARGS)241 hashoidvectorextended(PG_FUNCTION_ARGS)
242 {
243 	oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);
244 
245 	return hash_any_extended((unsigned char *) key->values,
246 							 key->dim1 * sizeof(Oid),
247 							 PG_GETARG_INT64(1));
248 }
249 
250 Datum
hashname(PG_FUNCTION_ARGS)251 hashname(PG_FUNCTION_ARGS)
252 {
253 	char	   *key = NameStr(*PG_GETARG_NAME(0));
254 
255 	return hash_any((unsigned char *) key, strlen(key));
256 }
257 
258 Datum
hashnameextended(PG_FUNCTION_ARGS)259 hashnameextended(PG_FUNCTION_ARGS)
260 {
261 	char	   *key = NameStr(*PG_GETARG_NAME(0));
262 
263 	return hash_any_extended((unsigned char *) key, strlen(key),
264 							 PG_GETARG_INT64(1));
265 }
266 
267 Datum
hashtext(PG_FUNCTION_ARGS)268 hashtext(PG_FUNCTION_ARGS)
269 {
270 	text	   *key = PG_GETARG_TEXT_PP(0);
271 	Oid			collid = PG_GET_COLLATION();
272 	pg_locale_t mylocale = 0;
273 	Datum		result;
274 
275 	if (!collid)
276 		ereport(ERROR,
277 				(errcode(ERRCODE_INDETERMINATE_COLLATION),
278 				 errmsg("could not determine which collation to use for string hashing"),
279 				 errhint("Use the COLLATE clause to set the collation explicitly.")));
280 
281 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
282 		mylocale = pg_newlocale_from_collation(collid);
283 
284 	if (!mylocale || mylocale->deterministic)
285 	{
286 		result = hash_any((unsigned char *) VARDATA_ANY(key),
287 						  VARSIZE_ANY_EXHDR(key));
288 	}
289 	else
290 	{
291 #ifdef USE_ICU
292 		if (mylocale->provider == COLLPROVIDER_ICU)
293 		{
294 			int32_t		ulen = -1;
295 			UChar	   *uchar = NULL;
296 			Size		bsize;
297 			uint8_t    *buf;
298 
299 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
300 
301 			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
302 									uchar, ulen, NULL, 0);
303 			buf = palloc(bsize);
304 			ucol_getSortKey(mylocale->info.icu.ucol,
305 							uchar, ulen, buf, bsize);
306 
307 			result = hash_any(buf, bsize);
308 
309 			pfree(buf);
310 		}
311 		else
312 #endif
313 			/* shouldn't happen */
314 			elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
315 	}
316 
317 	/* Avoid leaking memory for toasted inputs */
318 	PG_FREE_IF_COPY(key, 0);
319 
320 	return result;
321 }
322 
323 Datum
hashtextextended(PG_FUNCTION_ARGS)324 hashtextextended(PG_FUNCTION_ARGS)
325 {
326 	text	   *key = PG_GETARG_TEXT_PP(0);
327 	Oid			collid = PG_GET_COLLATION();
328 	pg_locale_t mylocale = 0;
329 	Datum		result;
330 
331 	if (!collid)
332 		ereport(ERROR,
333 				(errcode(ERRCODE_INDETERMINATE_COLLATION),
334 				 errmsg("could not determine which collation to use for string hashing"),
335 				 errhint("Use the COLLATE clause to set the collation explicitly.")));
336 
337 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
338 		mylocale = pg_newlocale_from_collation(collid);
339 
340 	if (!mylocale || mylocale->deterministic)
341 	{
342 		result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
343 								   VARSIZE_ANY_EXHDR(key),
344 								   PG_GETARG_INT64(1));
345 	}
346 	else
347 	{
348 #ifdef USE_ICU
349 		if (mylocale->provider == COLLPROVIDER_ICU)
350 		{
351 			int32_t		ulen = -1;
352 			UChar	   *uchar = NULL;
353 			Size		bsize;
354 			uint8_t    *buf;
355 
356 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
357 
358 			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
359 									uchar, ulen, NULL, 0);
360 			buf = palloc(bsize);
361 			ucol_getSortKey(mylocale->info.icu.ucol,
362 							uchar, ulen, buf, bsize);
363 
364 			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
365 
366 			pfree(buf);
367 		}
368 		else
369 #endif
370 			/* shouldn't happen */
371 			elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
372 	}
373 
374 	PG_FREE_IF_COPY(key, 0);
375 
376 	return result;
377 }
378 
379 /*
380  * hashvarlena() can be used for any varlena datatype in which there are
381  * no non-significant bits, ie, distinct bitpatterns never compare as equal.
382  */
383 Datum
hashvarlena(PG_FUNCTION_ARGS)384 hashvarlena(PG_FUNCTION_ARGS)
385 {
386 	struct varlena *key = PG_GETARG_VARLENA_PP(0);
387 	Datum		result;
388 
389 	result = hash_any((unsigned char *) VARDATA_ANY(key),
390 					  VARSIZE_ANY_EXHDR(key));
391 
392 	/* Avoid leaking memory for toasted inputs */
393 	PG_FREE_IF_COPY(key, 0);
394 
395 	return result;
396 }
397 
398 Datum
hashvarlenaextended(PG_FUNCTION_ARGS)399 hashvarlenaextended(PG_FUNCTION_ARGS)
400 {
401 	struct varlena *key = PG_GETARG_VARLENA_PP(0);
402 	Datum		result;
403 
404 	result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
405 							   VARSIZE_ANY_EXHDR(key),
406 							   PG_GETARG_INT64(1));
407 
408 	PG_FREE_IF_COPY(key, 0);
409 
410 	return result;
411 }
412