/* Set of hash utility functions to help maintain the invariant that
    if a==b then hash(a)==hash(b)

   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
*/
#include "Python.h"

#ifdef __APPLE__
#  include <libkern/OSByteOrder.h>
#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
#  include <endian.h>
#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
#  include <sys/endian.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

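/* The hash secret below is only zero-initialized here; it is populated
   elsewhere during interpreter startup as part of hash randomization
   (see the PYTHONHASHSEED environment variable). */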
_Py_HashSecret_t _Py_HashSecret = {{0}};

#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
extern PyHash_FuncDef PyHash_Func;
#else
static PyHash_FuncDef PyHash_Func;
#endif

/* Count _Py_HashBytes() calls */
#ifdef Py_HASH_STATS
#define Py_HASH_STATS_MAX 32
static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
#endif

/* For numeric types, the hash of a number x is based on the reduction
   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
   hash(x) == hash(y) whenever x and y are numerically equal, even if
   x and y have different types.

   A quick summary of the hashing strategy:

   (1) First define the 'reduction of x modulo P' for any rational
   number x; this is a standard extension of the usual notion of
   reduction modulo P for integers.  If x == p/q (written in lowest
   terms), the reduction is interpreted as the reduction of p times
   the inverse of the reduction of q, all modulo P; if q is exactly
   divisible by P then define the reduction to be infinity.  So we've
   got a well-defined map

      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.

   (2) Now for a rational number x, define hash(x) by:

      reduce(x)   if x >= 0
      -reduce(-x) if x < 0

   If the result of the reduction is infinity (this is impossible for
   integers, floats and Decimals) then use the predefined hash value
   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
   _PyHASH_INF and -_PyHASH_INF are also used for the
   hashes of float and Decimal infinities.

   NaNs hash with a pointer hash.  Having distinct hash values prevents
   catastrophic pileups from distinct NaN instances which used to always
   have the same hash value but would compare unequal.

   A selling point for the above strategy is that it makes it possible
   to compute hashes of decimal and binary floating-point numbers
   efficiently, even if the exponent of the binary or decimal number
   is large.  The key point is that

      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)

   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
   binary or decimal float is never infinity, since the denominator is a power
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
   for nonnegative x,

      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS

      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS

   and reduce(10**e) can be computed efficiently by the usual modular
   exponentiation algorithm.  For reduce(2**e) it's even better: since
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.

   */
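/* Worked example of the strategy above (assuming a 64-bit build, where
   _PyHASH_BITS == 61 and P == _PyHASH_MODULUS == 2**61 - 1):

      0.5 == 2**-1, so reduce(0.5) == 2**((-1) mod 61) == 2**60,
      hence hash(0.5) == 2**60 == 1152921504606846976.

      2.5 == 5/2, and the inverse of 2 modulo P is 2**60 (because
      2 * 2**60 == 2**61 == 1 (mod P)), so
      reduce(2.5) == 5 * 2**60 (mod P) == 2**60 + 2,
      hence hash(2.5) == 1152921504606846978.
*/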

Py_hash_t _Py_HashPointer(const void *);

Py_hash_t
_Py_HashDouble(PyObject *inst, double v)
{
    int e, sign;
    double m;
    Py_uhash_t x, y;

    if (!Py_IS_FINITE(v)) {
        if (Py_IS_INFINITY(v))
            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
        else
            return _Py_HashPointer(inst);
    }

    m = frexp(v, &e);

    sign = 1;
    if (m < 0) {
        sign = -1;
        m = -m;
    }

    /* process 28 bits at a time;  this should work well both for binary
       and hexadecimal floating point. */
    x = 0;
    while (m) {
        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
        m *= 268435456.0;  /* 2**28 */
        e -= 28;
        y = (Py_uhash_t)m;  /* pull out integer part */
        m -= y;
        x += y;
        if (x >= _PyHASH_MODULUS)
            x -= _PyHASH_MODULUS;
    }

    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);

    x = x * sign;
    if (x == (Py_uhash_t)-1)
        x = (Py_uhash_t)-2;
    return (Py_hash_t)x;
}

Py_hash_t
_Py_HashPointerRaw(const void *p)
{
    size_t y = (size_t)p;
    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
       excessive hash collisions for dicts and sets */
    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
    return (Py_hash_t)y;
}

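/* Same as _Py_HashPointerRaw(), but never returns -1, which is reserved
   for signalling errors. */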
Py_hash_t
_Py_HashPointer(const void *p)
{
    Py_hash_t x = _Py_HashPointerRaw(p);
    if (x == -1) {
        x = -2;
    }
    return x;
}

Py_hash_t
_Py_HashBytes(const void *src, Py_ssize_t len)
{
    Py_hash_t x;
    /*
      We make the hash of the empty string 0, rather than (prefix ^ suffix):
      returning (prefix ^ suffix) would leak bits of the hash secret.
    */
    if (len == 0) {
        return 0;
    }

#ifdef Py_HASH_STATS
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
#endif

#if Py_HASH_CUTOFF > 0
    if (len < Py_HASH_CUTOFF) {
        /* Optimize hashing of very small strings with inline DJBX33A. */
        Py_uhash_t hash;
        const unsigned char *p = src;
        hash = 5381; /* DJBX33A starts with 5381 */

        switch(len) {
            /* ((hash << 5) + hash) + *p == hash * 33 + *p */
            case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
            case 1: hash = ((hash << 5) + hash) + *p++; break;
            default:
                Py_UNREACHABLE();
        }
        hash ^= len;
        hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
        x = (Py_hash_t)hash;
    }
    else
#endif /* Py_HASH_CUTOFF */
        x = PyHash_Func.hash(src, len);

    if (x == -1)
        return -2;
    return x;
}

void
_PyHash_Fini(void)
{
#ifdef Py_HASH_STATS
    fprintf(stderr, "len   calls    total\n");
    Py_ssize_t total = 0;
    for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
        total += hashstats[i];
        fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
    }
    total += hashstats[0];
    fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
#endif
}

PyHash_FuncDef *
PyHash_GetFuncDef(void)
{
    return &PyHash_Func;
}

/* Optimized memcpy() for Windows */
#ifdef _MSC_VER
#  if SIZEOF_PY_UHASH_T == 4
#    define PY_UHASH_CPY(dst, src) do {                                    \
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
       } while(0)
#  elif SIZEOF_PY_UHASH_T == 8
#    define PY_UHASH_CPY(dst, src) do {                                    \
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
       dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
       } while(0)
#  else
#    error SIZEOF_PY_UHASH_T must be 4 or 8
#  endif /* SIZEOF_PY_UHASH_T */
#else /* not Windows */
#  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
#endif /* _MSC_VER */


#if Py_HASH_ALGORITHM == Py_HASH_FNV
/* **************************************************************************
 * Modified Fowler-Noll-Vo (FNV) hash function
 */
static Py_hash_t
fnv(const void *src, Py_ssize_t len)
{
    const unsigned char *p = src;
    Py_uhash_t x;
    Py_ssize_t remainder, blocks;
    union {
        Py_uhash_t value;
        unsigned char bytes[SIZEOF_PY_UHASH_T];
    } block;

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    remainder = len % SIZEOF_PY_UHASH_T;
    if (remainder == 0) {
        /* Process at least one block byte by byte to reduce hash collisions
         * for strings with common prefixes. */
        remainder = SIZEOF_PY_UHASH_T;
    }
    blocks = (len - remainder) / SIZEOF_PY_UHASH_T;

    x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
    x ^= (Py_uhash_t) *p << 7;
    while (blocks--) {
        PY_UHASH_CPY(block.bytes, p);
        x = (_PyHASH_MULTIPLIER * x) ^ block.value;
        p += SIZEOF_PY_UHASH_T;
    }
    /* add remainder */
    for (; remainder > 0; remainder--)
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
    x ^= (Py_uhash_t) len;
    x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
    if (x == (Py_uhash_t) -1) {
        x = (Py_uhash_t) -2;
    }
    return x;
}

static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
                                     16 * SIZEOF_PY_HASH_T};

#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */


/* **************************************************************************
 <MIT License>
 Copyright (c) 2013  Marek Majkowski <marek@popcount.org>

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 </MIT License>

 Original location:
    https://github.com/majek/csiphash/

 Solution inspired by code from:
    Samuel Neves (supercop/crypto_auth/siphash24/little)
    djb (supercop/crypto_auth/siphash24/little2)
    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)

 Modified for Python by Christian Heimes:
    - C89 / MSVC compatibility
    - _rotl64() on Windows
    - letoh64() fallback
*/
/* byte swap little endian to host endian
 * Endian conversion not only ensures that the hash function returns the same
 * value on all platforms; it is also required for a good dispersion of
 * the hash values' least significant bits.
 */
#if PY_LITTLE_ENDIAN
#  define _le64toh(x) ((uint64_t)(x))
#elif defined(__APPLE__)
#  define _le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(HAVE_LETOH64)
#  define _le64toh(x) le64toh(x)
#else
#  define _le64toh(x) (((uint64_t)(x) << 56) | \
                      (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
                      (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
                      (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
                      (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
                      (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
                      (((uint64_t)(x) >> 40) & 0xff00ULL) | \
                      ((uint64_t)(x)  >> 56))
#endif


#ifdef _MSC_VER
#  define ROTATE(x, b)  _rotl64(x, b)
#else
#  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
#endif

#define HALF_ROUND(a,b,c,d,s,t)     \
    a += b; c += d;                 \
    b = ROTATE(b, s) ^ a;           \
    d = ROTATE(d, t) ^ c;           \
    a = ROTATE(a, 32);

#define SINGLE_ROUND(v0,v1,v2,v3)   \
    HALF_ROUND(v0,v1,v2,v3,13,16);  \
    HALF_ROUND(v2,v1,v0,v3,17,21);

#define DOUBLE_ROUND(v0,v1,v2,v3)   \
    SINGLE_ROUND(v0,v1,v2,v3);      \
    SINGLE_ROUND(v0,v1,v2,v3);


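/* SipHash-1-3: one SINGLE_ROUND per 8-byte block plus three finalization
   rounds.  The siphash24() variant below uses DOUBLE_ROUND instead, i.e.
   two compression rounds per block and four finalization rounds. */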
static uint64_t
siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
    uint64_t b = (uint64_t)src_sz << 56;
    const uint8_t *in = (const uint8_t*)src;

    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;

    uint64_t t;
    uint8_t *pt;

    while (src_sz >= 8) {
        uint64_t mi;
        memcpy(&mi, in, sizeof(mi));
        mi = _le64toh(mi);
        in += sizeof(mi);
        src_sz -= sizeof(mi);
        v3 ^= mi;
        SINGLE_ROUND(v0,v1,v2,v3);
        v0 ^= mi;
    }

    t = 0;
    pt = (uint8_t *)&t;
    switch (src_sz) {
        case 7: pt[6] = in[6]; /* fall through */
        case 6: pt[5] = in[5]; /* fall through */
        case 5: pt[4] = in[4]; /* fall through */
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
        case 3: pt[2] = in[2]; /* fall through */
        case 2: pt[1] = in[1]; /* fall through */
        case 1: pt[0] = in[0]; /* fall through */
    }
    b |= _le64toh(t);

    v3 ^= b;
    SINGLE_ROUND(v0,v1,v2,v3);
    v0 ^= b;
    v2 ^= 0xff;
    SINGLE_ROUND(v0,v1,v2,v3);
    SINGLE_ROUND(v0,v1,v2,v3);
    SINGLE_ROUND(v0,v1,v2,v3);

    /* modified */
    t = (v0 ^ v1) ^ (v2 ^ v3);
    return t;
}

#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static uint64_t
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
    uint64_t b = (uint64_t)src_sz << 56;
    const uint8_t *in = (const uint8_t*)src;

    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;

    uint64_t t;
    uint8_t *pt;

    while (src_sz >= 8) {
        uint64_t mi;
        memcpy(&mi, in, sizeof(mi));
        mi = _le64toh(mi);
        in += sizeof(mi);
        src_sz -= sizeof(mi);
        v3 ^= mi;
        DOUBLE_ROUND(v0,v1,v2,v3);
        v0 ^= mi;
    }

    t = 0;
    pt = (uint8_t *)&t;
    switch (src_sz) {
        case 7: pt[6] = in[6]; /* fall through */
        case 6: pt[5] = in[5]; /* fall through */
        case 5: pt[4] = in[4]; /* fall through */
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
        case 3: pt[2] = in[2]; /* fall through */
        case 2: pt[1] = in[1]; /* fall through */
        case 1: pt[0] = in[0]; /* fall through */
    }
    b |= _le64toh(t);

    v3 ^= b;
    DOUBLE_ROUND(v0,v1,v2,v3);
    v0 ^= b;
    v2 ^= 0xff;
    DOUBLE_ROUND(v0,v1,v2,v3);
    DOUBLE_ROUND(v0,v1,v2,v3);

    /* modified */
    t = (v0 ^ v1) ^ (v2 ^ v3);
    return t;
}
#endif

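/* Keyed variant of the bytes hash: always uses SipHash-1-3 with the
   caller-supplied key as k0 and k1 fixed to 0, regardless of which
   Py_HASH_ALGORITHM is configured for str/bytes hashing. */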
uint64_t
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
{
    return siphash13(key, 0, src, src_sz);
}


#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
    return (Py_hash_t)siphash13(
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
        src, src_sz);
}

static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
#endif

#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
    return (Py_hash_t)siphash24(
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
        src, src_sz);
}

static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
#endif

#ifdef __cplusplus
}
#endif