/*
   xxHash - Extremely Fast Hash algorithm
   Development source file for `xxh3`
   Copyright (C) 2019-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at :
   - xxHash source repository : https://github.com/Cyan4973/xxHash
*/

/* Note :
   This file is separated for development purposes.
   It will be integrated into `xxhash.c` when the development phase is complete.
*/

#ifndef XXH3_H
#define XXH3_H


/* ===   Dependencies   === */

#undef XXH_INLINE_ALL   /* in case it's already defined */
#define XXH_INLINE_ALL
#include "xxhash.h"


/* ===   Compiler specifics   === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXH_RESTRICT   restrict
#else
/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
#  define XXH_RESTRICT   /* disable */
#endif

#if defined(__GNUC__)
#  if defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#    define inline __inline__  /* clang bug */
#    include <arm_neon.h>
#    undef inline
#  endif
#elif defined(_MSC_VER)
#  include <intrin.h>
#endif



/* ==========================================
 * Vectorization detection
 * ========================================== */
#define XXH_SCALAR 0
#define XXH_SSE2   1
#define XXH_AVX2   2
#define XXH_NEON   3
#define XXH_VSX    4

#ifndef XXH_VECTOR    /* can be defined on command line */
#  if defined(__AVX2__)
#    define XXH_VECTOR XXH_AVX2
#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#    define XXH_VECTOR XXH_SSE2
#  elif defined(__GNUC__) /* msvc support maybe later */ \
  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
  && defined(__LITTLE_ENDIAN__) /* ARM big endian is a thing */
#    define XXH_VECTOR XXH_NEON
#  elif defined(__PPC64__) && defined(__VSX__) && defined(__GNUC__)
#    define XXH_VECTOR XXH_VSX
#  else
#    define XXH_VECTOR XXH_SCALAR
#  endif
#endif

/* control alignment of accumulator,
 * for compatibility with fast vector loads */
#ifndef XXH_ACC_ALIGN
#  if XXH_VECTOR == 0   /* scalar */
#     define XXH_ACC_ALIGN 8
#  elif XXH_VECTOR == 1  /* sse2 */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == 2  /* avx2 */
#     define XXH_ACC_ALIGN 32
#  elif XXH_VECTOR == 3  /* neon */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == 4  /* vsx */
#     define XXH_ACC_ALIGN 16
#  endif
#endif

/* U64 XXH_mult32to64(U32 a, U32 b) { return (U64)a * (U64)b; } */
#if defined(_MSC_VER) && defined(_M_IX86)
#    include <intrin.h>
#    define XXH_mult32to64(x, y) __emulu(x, y)
#else
#    define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF))
#endif

/* VSX stuff */
#if XXH_VECTOR == XXH_VSX
#  include <altivec.h>
#  undef vector
typedef __vector unsigned long long U64x2;
typedef __vector unsigned U32x4;
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE U64x2 XXH_vsxMultOdd(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXH_FORCE_INLINE U64x2 XXH_vsxMultEven(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
#endif


/* ==========================================
 * XXH3 default settings
 * ========================================== */

#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */

#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
#  error "default keyset is not large enough"
#endif

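/* The default secret ("kSecret") :
 * used as-is when no seed or custom secret is provided,
 * and as the base material for XXH3_initKeySeed() when a seed is. */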
XXH_ALIGN(64) static const BYTE kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};


static XXH128_hash_t
XXH3_mul128(U64 ll1, U64 ll2)
{
/* __uint128_t seems a bad choice with current emscripten, see https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 */
#if !defined(__wasm__) && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t lll = (__uint128_t)ll1 * ll2;
    XXH128_hash_t const r128 = { (U64)(lll), (U64)(lll >> 64) };
    return r128;

#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    U64 llhigh;
    U64 const lllow = _umul128(ll1, ll2, &llhigh);
    XXH128_hash_t const r128 = { lllow, llhigh };
    return r128;

#else /* Portable scalar version */

    /* emulate 64x64->128b multiplication, using four 32x32->64 */
    U32 const h1 = (U32)(ll1 >> 32);
    U32 const h2 = (U32)(ll2 >> 32);
    U32 const l1 = (U32)ll1;
    U32 const l2 = (U32)ll2;

    U64 const llh  = XXH_mult32to64(h1, h2);
    U64 const llm1 = XXH_mult32to64(l1, h2);
    U64 const llm2 = XXH_mult32to64(h1, l2);
    U64 const lll  = XXH_mult32to64(l1, l2);

    U64 const t = lll + (llm1 << 32);
    U64 const carry1 = t < lll;

    U64 const lllow = t + (llm2 << 32);
    U64 const carry2 = lllow < t;
    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;

    XXH128_hash_t const r128 = { lllow, llhigh };
    return r128;

#endif
}


#if defined(__GNUC__) && defined(__i386__)
/* GCC is stupid and tries to vectorize this.
 * This tells GCC that it is wrong. */
__attribute__((__target__("no-sse")))
#endif
static U64
XXH3_mul128_fold64(U64 ll1, U64 ll2)
{
/* __uint128_t seems a bad choice with current emscripten, see https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 */
#if !defined(__wasm__) && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t lll = (__uint128_t)ll1 * ll2;
    return (U64)lll ^ (U64)(lll >> 64);

#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    U64 llhigh;
    U64 const lllow = _umul128(ll1, ll2, &llhigh);
    return lllow ^ llhigh;

    /* We have to do it manually on 32-bit.
     * This is a modified, unrolled, widened, and optimized version of the
     * mulqdu routine from Hacker's Delight.
     *
     *   https://www.hackersdelight.org/hdcodetxt/mulqdu.c.txt
     *
     * This was modified to use U32->U64 multiplication instead
     * of U16->U32, to add the high and low values in the end,
     * be endian-independent, and I added a partial assembly
     * implementation for ARM. */

    /* An easy 128-bit folding multiply on ARMv6T2 and ARMv7-A/R can be done with
     * the mighty umaal (Unsigned Multiply Accumulate Accumulate Long) which takes 4 cycles
     * or less, doing a long multiply and adding two 32-bit integers:
     *
     *     void umaal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
     *     {
     *         U64 prodAcc = (U64)Rn * (U64)Rm;
     *         prodAcc += *RdLo;
     *         prodAcc += *RdHi;
     *         *RdLo = prodAcc & 0xFFFFFFFF;
     *         *RdHi = prodAcc >> 32;
     *     }
     *
     * This is compared to umlal which adds to a single 64-bit integer:
     *
     *     void umlal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
     *     {
     *         U64 prodAcc = (U64)Rn * (U64)Rm;
     *         prodAcc += (*RdLo | ((U64)*RdHi << 32));
     *         *RdLo = prodAcc & 0xFFFFFFFF;
     *         *RdHi = prodAcc >> 32;
     *     }
     *
     * Getting the compiler to emit them is like pulling teeth, and checking
     * for it is annoying because ARMv7-M lacks this instruction. However, it
     * is worth it, because this is an otherwise expensive operation. */

     /* GCC-compatible, ARMv6t2 or ARMv7+, non-M variant, and 32-bit */
#elif defined(__GNUC__) /* GCC-compatible */ \
    && defined(__ARM_ARCH) && !defined(__aarch64__) && !defined(__arm64__) /* 32-bit ARM */\
    && !defined(__ARM_ARCH_7M__) /* <- Not ARMv7-M  vv*/ \
        && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \
    && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */

    U32 w[4] = { 0 };
    U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 };
    U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 };
    U32 k;

    /* U64 t = (U64)u[1] * (U64)v[1];
     * w[3] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umull %0, %1, %2, %3"
            : "=r" (w[3]), "=r" (k)
            : "r" (u[1]), "r" (v[1]));

    /* t = (U64)u[0] * (U64)v[1] + w[2] + k;
     * w[2] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[2]), "+r" (k)
            : "r" (u[0]), "r" (v[1]));
    w[1] = k;
    k = 0;

    /* t = (U64)u[1] * (U64)v[0] + w[2] + k;
     * w[2] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[2]), "+r" (k)
            : "r" (u[1]), "r" (v[0]));

    /* t = (U64)u[0] * (U64)v[0] + w[1] + k;
     * w[1] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[1]), "+r" (k)
            : "r" (u[0]), "r" (v[0]));
    w[0] = k;

    return (w[1] | ((U64)w[0] << 32)) ^ (w[3] | ((U64)w[2] << 32));

#else /* Portable scalar version */

    /* emulate 64x64->128b multiplication, using four 32x32->64 */
    U32 const h1 = (U32)(ll1 >> 32);
    U32 const h2 = (U32)(ll2 >> 32);
    U32 const l1 = (U32)ll1;
    U32 const l2 = (U32)ll2;

    U64 const llh  = XXH_mult32to64(h1, h2);
    U64 const llm1 = XXH_mult32to64(l1, h2);
    U64 const llm2 = XXH_mult32to64(h1, l2);
    U64 const lll  = XXH_mult32to64(l1, l2);

    U64 const t = lll + (llm1 << 32);
    U64 const carry1 = t < lll;

    U64 const lllow = t + (llm2 << 32);
    U64 const carry2 = lllow < t;
    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;

    return llhigh ^ lllow;

#endif
}

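/* XXH3_avalanche() :
 * Final mixing stage : folds the high bits back into the low bits
 * (xorshift, multiply by PRIME64_3, xorshift) to improve dispersion
 * of the returned hash. */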
static XXH64_hash_t XXH3_avalanche(U64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}


/* ==========================================
 * Short keys
 * ========================================== */

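/* Each routine below handles one input-length range : it reads a few bytes,
 * combines them with secret material and the seed, and finishes with
 * XXH3_avalanche(). XXH3_len_0to16_64b() dispatches on `len`. */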
XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(keyPtr != NULL);
    {   BYTE const c1 = ((const BYTE*)data)[0];
        BYTE const c2 = ((const BYTE*)data)[len >> 1];
        BYTE const c3 = ((const BYTE*)data)[len - 1];
        U32  const combined = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
        U64  const keyed = (U64)combined ^ (XXH_readLE32(keyPtr) + seed);
        U64  const mixed = keyed * PRIME64_1;
        return XXH3_avalanche(mixed);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(keyPtr != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   U32 const in1 = XXH_readLE32(data);
        U32 const in2 = XXH_readLE32((const BYTE*)data + len - 4);
        U64 const in64 = in1 + ((U64)in2 << 32);
        U64 const keyed = in64 ^ (XXH_readLE64(keyPtr) + seed);
        U64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
        return XXH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(keyPtr != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   const U64* const key64 = (const U64*) keyPtr;
        U64 const ll1 = XXH_readLE64(data) ^ (XXH_readLE64(key64) + seed);
        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) ^ (XXH_readLE64(key64+1) - seed);
        U64 const acc = len + (ll1 + ll2) + XXH3_mul128_fold64(ll1, ll2);
        return XXH3_avalanche(acc);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_0to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_64b(data, len, keyPtr, seed);
        if (len >= 4) return XXH3_len_4to8_64b(data, len, keyPtr, seed);
        if (len) return XXH3_len_1to3_64b(data, len, keyPtr, seed);
        return 0;
    }
}


/* ===    Long Keys    === */

#define STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(U64))

typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;

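/* XXH3_accumulate_512() :
 * Processes one stripe of STRIPE_LEN (64) bytes :
 * for each 64-bit accumulator lane, the input is xored with secret material,
 * the two 32-bit halves of that value are multiplied together (32x32->64)
 * and added to the lane, together with the raw input (same lane in 64-bit
 * mode, swapped lane in 128-bit mode). The SIMD paths below implement the
 * same scheme with vector registers. */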
XXH_FORCE_INLINE void
XXH3_accumulate_512(      void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT data,
                    const void* XXH_RESTRICT key,
                    XXH3_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
        const         __m256i* const xdata = (const __m256i *) data;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const         __m256i* const xkey  = (const __m256i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const d   = _mm256_loadu_si256 (xdata+i);
            __m256i const k   = _mm256_loadu_si256 (xkey+i);
            __m256i const dk  = _mm256_xor_si256 (d,k);                                  /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const mul = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3_acc_128bits) {
                __m256i const dswap = _mm256_shuffle_epi32(d, _MM_SHUFFLE(1,0,3,2));
                __m256i const add = _mm256_add_epi64(xacc[i], dswap);
                xacc[i]  = _mm256_add_epi64(mul, add);
            } else {  /* XXH3_acc_64bits */
                __m256i const add = _mm256_add_epi64(xacc[i], d);
                xacc[i]  = _mm256_add_epi64(mul, add);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;   /* presumed */
        const         __m128i* const xdata = (const __m128i *) data;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const         __m128i* const xkey  = (const __m128i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const d   = _mm_loadu_si128 (xdata+i);
            __m128i const k   = _mm_loadu_si128 (xkey+i);
            __m128i const dk  = _mm_xor_si128 (d,k);                                 /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const mul = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31));    /* uint64 mul[2] = {dk0*dk1,dk2*dk3} */
            if (accWidth == XXH3_acc_128bits) {
                __m128i const dswap = _mm_shuffle_epi32(d, _MM_SHUFFLE(1,0,3,2));
                __m128i const add = _mm_add_epi64(xacc[i], dswap);
                xacc[i]  = _mm_add_epi64(mul, add);
            } else {  /* XXH3_acc_64bits */
                __m128i const add = _mm_add_epi64(xacc[i], d);
                xacc[i]  = _mm_add_epi64(mul, add);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint32_t const* const xdata = (const uint32_t *) data;
        uint32_t const* const xkey  = (const uint32_t *) key;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */

            /* data_vec = xdata[i]; */
            uint32x4_t const data_vec    = vld1q_u32(xdata + (i * 4));
            /* key_vec  = xkey[i];  */
            uint32x4_t const key_vec     = vld1q_u32(xkey  + (i * 4));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t       data_key;

            if (accWidth == XXH3_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u32(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64 = vreinterpretq_u64_u32(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }

            data_key = veorq_u32(data_vec, key_vec);

            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */

            /* data_vec = xdata[i]; */
            uint32x4_t const data_vec    = vld1q_u32(xdata + (i * 4));
            /* key_vec  = xkey[i];  */
            uint32x4_t const key_vec     = vld1q_u32(xkey  + (i * 4));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key    = veorq_u32(data_vec, key_vec);
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64  (vreinterpretq_u64_u32(data_key));
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64 (vreinterpretq_u64_u32(data_key), 32);
            if (accWidth == XXH3_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u32(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64 = vreinterpretq_u64_u32(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX) && 0   /* <=========================== DISABLED : MUST BE VALIDATED */
    /* note : vsx code path currently not tested in CI (limitation of cross-compiler and/or emulator)
     *        for vsx code path to be shipped and supported, it is critical to create a CI test for it */
          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
    U64x2 const* const xdata = (U64x2 const*) data;   /* no alignment restriction */
    U64x2 const* const xkey  = (U64x2 const*) key;    /* no alignment restriction */
    U64x2 const v32 = { 32,  32 };

    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xdata[i]; */
        /* key_vec = xkey[i]; */
#ifdef __BIG_ENDIAN__
        /* byteswap */
        U64x2 const data_vec = vec_revb(vec_vsx_ld(0, xdata + i));  /* note : vec_revb is power9+ */
        U64x2 const key_vec = vec_revb(vec_vsx_ld(0, xkey + i));    /* note : vec_revb is power9+ */
#else
        U64x2 const data_vec = vec_vsx_ld(0, xdata + i);
        U64x2 const key_vec = vec_vsx_ld(0, xkey + i);
#endif
        U64x2 const data_key = data_vec ^ key_vec;
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product = XXH_vsxMultOdd((U32x4)data_key, shuffled);

        xacc[i] += product;

        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3_acc_128bits */
            U64x2 const data_swapped = vec_permi(data_vec, data_vec, 2);   /* <===== untested !!! */
            xacc[i] += data_swapped;
        }
    }

#else   /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const char* const xdata = (const char*) data;  /* no alignment restriction */
    const char* const xkey  = (const char*) key;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i+=2) {
        U64 const in1 = XXH_readLE64(xdata + 8*i);
        U64 const in2 = XXH_readLE64(xdata + 8*(i+1));
        U64 const key1  = XXH_readLE64(xkey + 8*i);
        U64 const key2  = XXH_readLE64(xkey + 8*(i+1));
        U64 const data_key1 = key1 ^ in1;
        U64 const data_key2 = key2 ^ in2;
        xacc[i]   += XXH_mult32to64(data_key1 & 0xFFFFFFFF, data_key1 >> 32);
        xacc[i+1] += XXH_mult32to64(data_key2 & 0xFFFFFFFF, data_key2 >> 32);
        if (accWidth == XXH3_acc_128bits) {
            xacc[i]   += in2;
            xacc[i+1] += in1;
        } else {  /* XXH3_acc_64bits */
            xacc[i]   += in1;
            xacc[i+1] += in2;
        }
    }
#endif
}

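/* XXH3_scrambleAcc() :
 * Re-mixes the accumulators once per block :
 * each lane is xorshifted (>> 47), xored with secret material,
 * then multiplied by PRIME32_1. */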
XXH_FORCE_INLINE void
XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT key)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        const         __m256i* const xkey = (const __m256i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i data = xacc[i];
            __m256i const shifted = _mm256_srli_epi64(data, 47);
            data = _mm256_xor_si256(data, shifted);

            {   __m256i const k   = _mm256_loadu_si256 (xkey+i);
                __m256i const dk  = _mm256_xor_si256   (data, k);

                __m256i const dk1 = _mm256_mul_epu32 (dk, prime32);

                __m256i const d2  = _mm256_shuffle_epi32 (dk, 0x31);
                __m256i const dk2 = _mm256_mul_epu32 (d2, prime32);
                __m256i const dk2h= _mm256_slli_epi64 (dk2, 32);

                xacc[i] = _mm256_add_epi64(dk1, dk2h);
        }   }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const         __m128i* const xkey = (const __m128i *) key;   /* not really aligned, just for ptr arithmetic */
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i data = xacc[i];
            __m128i const shifted = _mm_srli_epi64(data, 47);
            data = _mm_xor_si128(data, shifted);

            {   __m128i const k   = _mm_loadu_si128 (xkey+i);
                __m128i const dk  = _mm_xor_si128   (data,k);

                __m128i const dk1 = _mm_mul_epu32 (dk, prime32);

                __m128i const d2  = _mm_shuffle_epi32 (dk, 0x31);
                __m128i const dk2 = _mm_mul_epu32 (d2, prime32);
                __m128i const dk2h= _mm_slli_epi64(dk2, 32);

                xacc[i] = _mm_add_epi64(dk1, dk2h);
        }   }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
        uint32_t const* const xkey = (uint32_t const*) key;
        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const   acc_vec  = xacc[i];
            uint64x2_t const   shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const   data_vec = veorq_u64   (acc_vec, shifted);

            /* key_vec  = xkey[i]; */
            uint32x4_t const   key_vec  = vld1q_u32   (xkey + (i * 4));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const   data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32    (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const   prod_hi = vmull_u32    (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }

#elif (XXH_VECTOR == XXH_VSX)

          U64x2* const xacc =       (U64x2*) acc;
    const U64x2* const xkey = (const U64x2*) key;
    /* constants */
    U64x2 const v32  = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;

    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec  = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xkey[i]; */
#ifdef __BIG_ENDIAN__
        /* swap 32-bit words */
        U64x2 const key_vec  = vec_rl(vec_vsx_ld(0, xkey + i), v32);
#else
        U64x2 const key_vec  = vec_vsx_ld(0, xkey + i);
#endif
        U64x2 const data_key = data_vec ^ key_vec;

        /* data_key *= PRIME32_1 */

        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF);  */
        U64x2 const prod_lo  = XXH_vsxMultOdd((U32x4)data_key, prime);
        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
        U64x2 const prod_hi  = XXH_vsxMultEven((U32x4)data_key, prime);
        xacc[i] = prod_lo + (prod_hi << v32);
    }

#else   /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const char* const xkey = (const char*) key;   /* no alignment restriction */
    int i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);

    for (i=0; i < (int)ACC_NB; i++) {
        U64 const key64 = XXH_readLE64(xkey + 8*i);
        U64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

/* assumption : nbStripes will not overflow secret size */
XXH_FORCE_INLINE void
XXH3_accumulate(       U64* XXH_RESTRICT acc,
                const void* XXH_RESTRICT data,
                const void* XXH_RESTRICT secret,
                      size_t nbStripes,
                      XXH3_accWidth_e accWidth)
{
    size_t n;
    /* Clang doesn't unroll this loop without the pragma. Unrolling can be up to 1.4x faster.
     * The unroll statement seems detrimental for WASM (@aras-p) and ARM though.
     */
#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__) && !defined(__ARM_ARCH) && !defined(__EMSCRIPTEN__)
#  pragma clang loop unroll(enable)
#endif

    for (n = 0; n < nbStripes; n++ ) {
        XXH3_accumulate_512(acc,
               (const char*)data   + n*STRIPE_LEN,
               (const char*)secret + n*XXH_SECRET_CONSUME_RATE,
                            accWidth);
    }
}

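/* XXH3_hashLong_internal_loop() :
 * Core loop for long inputs : consumes the input in blocks of
 * nb_rounds * STRIPE_LEN bytes, scrambling the accumulators after each
 * full block, then processes the last partial block and, if the length
 * is not a multiple of STRIPE_LEN, one final (overlapping) stripe. */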
/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 *        However, it auto-vectorizes AVX2 better if it is `FORCE_INLINE`.
 *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
 */
#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3_hashLong_internal_loop( U64* XXH_RESTRICT acc,
                      const void* XXH_RESTRICT data, size_t len,
                      const void* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3_accumulate(acc, (const char*)data + n*block_len, secret, nb_rounds, accWidth);
        XXH3_scrambleAcc(acc, (const char*)secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3_accumulate(acc, (const char*)data + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const void* const p = (const char*)data + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
            XXH3_accumulate_512(acc, p, (const char*)secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}

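/* Final merge : each pair of accumulators is xored with secret material and
 * folded through a 64x64->128 multiply (XXH3_mix2Accs); the four folds are
 * summed into `start`, then avalanched into the final hash value. */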
XXH_FORCE_INLINE U64
XXH3_mix2Accs(const U64* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    const U64* const key64 = (const U64*)secret;
    return XXH3_mul128_fold64(
               acc[0] ^ XXH_readLE64(key64),
               acc[1] ^ XXH_readLE64(key64+1) );
}

static XXH64_hash_t
XXH3_mergeAccs(const U64* XXH_RESTRICT acc, const void* XXH_RESTRICT secret, U64 start)
{
    U64 result64 = start;

    result64 += XXH3_mix2Accs(acc+0, (const char*)secret +  0);
    result64 += XXH3_mix2Accs(acc+2, (const char*)secret + 16);
    result64 += XXH3_mix2Accs(acc+4, (const char*)secret + 32);
    result64 += XXH3_mix2Accs(acc+6, (const char*)secret + 48);

    return XXH3_avalanche(result64);
}

#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };

XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_internal(const void* XXH_RESTRICT data, size_t len,
                       const void* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, data, len, secret, secretSize, XXH3_acc_64bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
#define XXH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3_mergeAccs(acc, (const char*)secret + XXH_SECRET_MERGEACCS_START, (U64)len * PRIME64_1);
}


XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b_defaultSecret(const void* XXH_RESTRICT data, size_t len)
{
    return XXH3_hashLong_internal(data, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT data, size_t len,
                             const void* XXH_RESTRICT secret, size_t secretSize)
{
    return XXH3_hashLong_internal(data, len, secret, secretSize);
}


XXH_FORCE_INLINE void XXH_writeLE64(void* dst, U64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* XXH3_initKeySeed() :
 * destination `customSecret` is presumed allocated and same size as `kSecret`.
 */
XXH_FORCE_INLINE void XXH3_initKeySeed(void* customSecret, U64 seed64)
{
          char* const dst = (char*)customSecret;
    const char* const src = (const char*)kSecret;
    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
    int i;

    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

    for (i=0; i < nbRounds; i++) {
        XXH_writeLE64(dst + 16*i,     XXH_readLE64(src + 16*i)     + seed64);
        XXH_writeLE64(dst + 16*i + 8, XXH_readLE64(src + 16*i + 8) - seed64);
    }
}


/* XXH3_hashLong_64b_withSeed() :
 * Generate a custom key,
 * based on alteration of default kSecret with the seed,
 * and then use this key for long mode hashing.
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 */
XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) char secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed==0) return XXH3_hashLong_64b_defaultSecret(data, len);
    XXH3_initKeySeed(secret, seed);
    return XXH3_hashLong_internal(data, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE U64 XXH3_mix16B(const void* XXH_RESTRICT data,
                                 const void* XXH_RESTRICT key, U64 seed64)
{
    const U64* const key64 = (const U64*)key;
    U64 const ll1 = XXH_readLE64(data);
    U64 const ll2 = XXH_readLE64((const BYTE*)data+8);
    return XXH3_mul128_fold64(
               ll1 ^ (XXH_readLE64(key64)   + seed64),
               ll2 ^ (XXH_readLE64(key64+1) - seed64) );
}


XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_17to128_64b(const void* XXH_RESTRICT data, size_t len,
                     const void* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)secret;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   U64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3_mix16B(p+48, key+96, seed);
                    acc += XXH3_mix16B(p+len-64, key+112, seed);
                }
                acc += XXH3_mix16B(p+32, key+64, seed);
                acc += XXH3_mix16B(p+len-48, key+80, seed);
            }
            acc += XXH3_mix16B(p+16, key+32, seed);
            acc += XXH3_mix16B(p+len-32, key+48, seed);
        }
        acc += XXH3_mix16B(p+0, key+0, seed);
        acc += XXH3_mix16B(p+len-16, key+16, seed);

        return XXH3_avalanche(acc);
    }
}

#define XXH3_MIDSIZE_MAX 240

XXH_NO_INLINE XXH64_hash_t
XXH3_len_129to240_64b(const void* XXH_RESTRICT data, size_t len,
                      const void* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)secret;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    #define XXH3_MIDSIZE_STARTOFFSET 3
    #define XXH3_MIDSIZE_LASTOFFSET  17

    {   U64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        for (i=0; i<8; i++) {
            acc += XXH3_mix16B(p+(16*i), key+(16*i), seed);
        }
        acc = XXH3_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3_mix16B(p+(16*i), key+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXH3_mix16B(p + len - 16, key + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
        return XXH3_avalanche(acc);
    }
}

/* ===   Public entry point   === */

XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len)
{
    if (len <= 16) return XXH3_len_0to16_64b(data, len, kSecret, 0);
    if (len <= 128) return XXH3_len_17to128_64b(data, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b(data, len, kSecret, sizeof(kSecret), 0);
    return XXH3_hashLong_64b_defaultSecret(data, len);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
     if (len <= 16) return XXH3_len_0to16_64b(data, len, secret, 0);
     if (len <= 128) return XXH3_len_17to128_64b(data, len, secret, secretSize, 0);
     if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b(data, len, secret, secretSize, 0);
     return XXH3_hashLong_64b_withSecret(data, len, secret, secretSize);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3_len_0to16_64b(data, len, kSecret, seed);
    if (len <= 128) return XXH3_len_17to128_64b(data, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b(data, len, kSecret, sizeof(kSecret), seed);
    return XXH3_hashLong_64b_withSeed(data, len, seed);
}
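
/* Illustrative one-shot usage (not part of the library; `buffer`, `size`,
 * `seed`, `sec` and `secSize` are placeholders) :
 *
 *     XXH64_hash_t const h1 = XXH3_64bits(buffer, size);
 *     XXH64_hash_t const h2 = XXH3_64bits_withSeed(buffer, size, seed);
 *     XXH64_hash_t const h3 = XXH3_64bits_withSecret(buffer, size, sec, secSize);
 */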

/* ===   XXH3 streaming   === */

XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
{
    return (XXH3_state_t*)XXH_malloc(sizeof(XXH3_state_t));
}

XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
{
    XXH_free(statePtr);
    return XXH_OK;
}

XXH_PUBLIC_API void
XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
{
    memcpy(dst_state, src_state, sizeof(*dst_state));
}

static void
XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
                           XXH64_hash_t seed,
                           const void* secret, size_t secretSize)
{
    XXH_ASSERT(statePtr != NULL);
    memset(statePtr, 0, sizeof(*statePtr));
    statePtr->acc[0] = PRIME32_3;
    statePtr->acc[1] = PRIME64_1;
    statePtr->acc[2] = PRIME64_2;
    statePtr->acc[3] = PRIME64_3;
    statePtr->acc[4] = PRIME64_4;
    statePtr->acc[5] = PRIME32_2;
    statePtr->acc[6] = PRIME64_5;
    statePtr->acc[7] = PRIME32_1;
    statePtr->seed = seed;
    XXH_ASSERT(secret != NULL);
    statePtr->secret = secret;
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset(XXH3_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3_initKeySeed(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}

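/* XXH3_consumeStripes() :
 * Streaming helper : accumulates `totalStripes` stripes while tracking the
 * position within the current block (*nbStripesSoFarPtr), and inserts a
 * scrambling step whenever a block boundary is crossed. */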
XXH_FORCE_INLINE void
XXH3_consumeStripes( U64* acc,
                            XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
                            const void* data, size_t totalStripes,
                            const void* secret, size_t secretLimit,
                            XXH3_accWidth_e accWidth)
{
    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
        /* need a scrambling operation */
        size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
        XXH3_accumulate(acc, data, (const char*)secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
        XXH3_scrambleAcc(acc, (const char*)secret + secretLimit);
        XXH3_accumulate(acc, (const char*)data + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
        *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
    } else {
        XXH3_accumulate(acc, data, (const char*)secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
        *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
    }
}

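/* XXH3_update() :
 * Streaming ingestion : small inputs are collected into state->buffer;
 * once the buffer is full (XXH3_INTERNALBUFFER_SIZE bytes), it is consumed
 * through XXH3_consumeStripes(), then the input is consumed directly in
 * full-buffer quantities, and any remaining tail is buffered for the next
 * update() or for digest(). */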
XXH_FORCE_INLINE XXH_errorcode
XXH3_update(XXH3_state_t* state, const void* input, size_t len, XXH3_accWidth_e accWidth)
{
    if (input==NULL)
#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
        return XXH_OK;
#else
        return XXH_ERROR;
#endif

    {   const BYTE* p = (const BYTE*)input;
        const BYTE* const bEnd = p + len;

        state->totalLen += len;

        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {  /* fill in tmp buffer */
            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
            state->bufferedSize += (XXH32_hash_t)len;
            return XXH_OK;
        }
        /* input now > XXH3_INTERNALBUFFER_SIZE */

        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0);   /* clean multiple */

        if (state->bufferedSize) {   /* some data within internal buffer: fill then consume it */
            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
            p += loadSize;
            XXH3_consumeStripes(state->acc,
                               &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                                state->secret, state->secretLimit,
                                accWidth);
            state->bufferedSize = 0;
        }

        /* consume input by full buffer quantities */
        if (p+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
            const BYTE* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
            do {
                XXH3_consumeStripes(state->acc,
                                   &state->nbStripesSoFar, state->nbStripesPerBlock,
                                    p, XXH3_INTERNALBUFFER_STRIPES,
                                    state->secret, state->secretLimit,
                                    accWidth);
                p += XXH3_INTERNALBUFFER_SIZE;
            } while (p<=limit);
        }

        if (p < bEnd) { /* some remaining input data : buffer it */
            XXH_memcpy(state->buffer, p, (size_t)(bEnd-p));
            state->bufferedSize = (XXH32_hash_t)(bEnd-p);
        }
    }

    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, input, len, XXH3_acc_64bits);
}


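/* XXH3_digest_long() :
 * Works on a local copy of the accumulators, so the state is left untouched
 * and can keep ingesting data after a digest. Buffered data is consumed
 * stripe by stripe; if needed, the last stripe is rebuilt from previously
 * consumed bytes so that a full STRIPE_LEN is always accumulated. */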
XXH_FORCE_INLINE void
XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
{
    memcpy(acc, state->acc, sizeof(state->acc));  /* digest locally, state remains unaltered, and can continue ingesting more data afterwards */
    if (state->bufferedSize >= STRIPE_LEN) {
        size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
        XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;
        XXH3_consumeStripes(acc,
                           &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, totalNbStripes,
                            state->secret, state->secretLimit,
                            accWidth);
        if (state->bufferedSize % STRIPE_LEN) {  /* one last partial stripe */
            XXH3_accumulate_512(acc,
                                state->buffer + state->bufferedSize - STRIPE_LEN,
                   (const char*)state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
        }
    } else {  /* bufferedSize < STRIPE_LEN */
        if (state->bufferedSize) { /* one last stripe */
            char lastStripe[STRIPE_LEN];
            size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
            memcpy(lastStripe, (const char*)state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
            memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
            XXH3_accumulate_512(acc,
                                lastStripe,
                   (const char*)state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
    }   }
}

XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
{
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3_digest_long(acc, state, XXH3_acc_64bits);
        return XXH3_mergeAccs(acc, (const char*)state->secret + XXH_SECRET_MERGEACCS_START, (U64)state->totalLen * PRIME64_1);
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}
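
/* Usage sketch (illustration only, not part of the library) :
 * incremental 64-bit hashing over several input fragments.
 * Assumes the XXH3_createState() / XXH3_freeState() helpers from the
 * streaming API are available; `chunks`, `sizes` and `nbChunks` below
 * are hypothetical caller-provided inputs.
 *
 *    XXH64_hash_t hashChunks(const void* const* chunks, const size_t* sizes, size_t nbChunks)
 *    {
 *        XXH64_hash_t h = 0;
 *        XXH3_state_t* const state = XXH3_createState();
 *        if (state == NULL) return 0;
 *        if (XXH3_64bits_reset(state) == XXH_OK) {
 *            size_t n;
 *            for (n = 0; n < nbChunks; n++)
 *                (void)XXH3_64bits_update(state, chunks[n], sizes[n]);
 *            h = XXH3_64bits_digest(state);
 *        }
 *        XXH3_freeState(state);
 *        return h;
 *    }
 *
 * Note : digest() leaves the state untouched, so more data can be ingested
 * and digested again afterwards.
 */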

/* ==========================================
 * XXH3 128 bits (=> XXH128)
 * ========================================== */

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(keyPtr != NULL);
    {   const U32* const key32 = (const U32*) keyPtr;
        BYTE const c1 = ((const BYTE*)data)[0];
        BYTE const c2 = ((const BYTE*)data)[len >> 1];
        BYTE const c3 = ((const BYTE*)data)[len - 1];
        U32  const combinedl = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
        U32  const combinedh = XXH_swap32(combinedl);
        U64  const keyedl = (U64)combinedl ^ (XXH_readLE32(key32)   + seed);
        U64  const keyedh = (U64)combinedh ^ (XXH_readLE32(key32+1) - seed);
        U64  const mixedl = keyedl * PRIME64_1;
        U64  const mixedh = keyedh * PRIME64_2;
        XXH128_hash_t const h128 = { XXH3_avalanche(mixedl) /*low64*/, XXH3_avalanche(mixedh) /*high64*/ };
        return h128;
    }
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(keyPtr != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   U32 const in1 = XXH_readLE32(data);
        U32 const in2 = XXH_readLE32((const BYTE*)data + len - 4);
        U64 const in64l = in1 + ((U64)in2 << 32);
        U64 const in64h = XXH_swap64(in64l);
        U64 const keyedl = in64l ^ (XXH_readLE64(keyPtr) + seed);
        U64 const keyedh = in64h ^ (XXH_readLE64((const char*)keyPtr + 8) - seed);
        U64 const mix64l1 = len + ((keyedl ^ (keyedl >> 51)) * PRIME32_1);
        U64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
        U64 const mix64h1 = ((keyedh ^ (keyedh >> 47)) * PRIME64_1) - len;
        U64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
        {   XXH128_hash_t const h128 = { XXH3_avalanche(mix64l2) /*low64*/, XXH3_avalanche(mix64h2) /*high64*/ };
            return h128;
    }   }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(keyPtr != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   const U64* const key64 = (const U64*) keyPtr;
        U64 const ll1 = XXH_readLE64(data) ^ (XXH_readLE64(key64) + seed);
        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) ^ (XXH_readLE64(key64+1) - seed);
        U64 const inlow = ll1 ^ ll2;
        XXH128_hash_t m128 = XXH3_mul128(inlow, PRIME64_1);
        m128.high64 += ll2 * PRIME64_1;
        m128.low64  ^= (m128.high64 >> 32);
        {   XXH128_hash_t h128 = XXH3_mul128(m128.low64, PRIME64_2);
            h128.high64 += m128.high64 * PRIME64_2;
            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}

/* Assumption : `secret` size is >= 16
 * Note : it should be >= XXH3_SECRET_SIZE_MIN anyway */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const void* data, size_t len, const void* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(data, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(data, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(data, len, secret, seed);
        {   XXH128_hash_t const h128 = { 0, 0 };
            return h128;
    }   }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT data, size_t len,
                            const void* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, data, len, secret, secretSize, XXH3_acc_128bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   U64 const low64 = XXH3_mergeAccs(acc, (const char*)secret + XXH_SECRET_MERGEACCS_START, (U64)len * PRIME64_1);
        U64 const high64 = XXH3_mergeAccs(acc, (const char*)secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((U64)len * PRIME64_2));
        XXH128_hash_t const h128 = { low64, high64 };
        return h128;
    }
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but the difference is large and easily measurable */
XXH3_hashLong_128b_defaultSecret(const void* data, size_t len)
{
    return XXH3_hashLong_128b_internal(data, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but the difference is large and easily measurable */
XXH3_hashLong_128b_withSecret(const void* data, size_t len,
                              const void* secret, size_t secretSize)
{
    return XXH3_hashLong_128b_internal(data, len, secret, secretSize);
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but the difference is large and easily measurable */
XXH3_hashLong_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) char secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(data, len);
    XXH3_initKeySeed(secret, seed);
    return XXH3_hashLong_128b_internal(data, len, secret, sizeof(secret));
}

XXH_NO_INLINE XXH128_hash_t
XXH3_len_129to240_128b(const void* XXH_RESTRICT data, size_t len,
                       const void* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)secret;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   U64 acc1 = len * PRIME64_1;
        U64 acc2 = 0;
        int const nbRounds = (int)len / 32;
        int i;
        for (i=0; i<4; i++) {
            acc1 += XXH3_mix16B(p+(32*i),    key+(32*i),     seed);
            acc2 += XXH3_mix16B(p+(32*i)+16, key+(32*i)+16, -seed);
        }
        acc1 = XXH3_avalanche(acc1);
        acc2 = XXH3_avalanche(acc2);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc1 += XXH3_mix16B(p+(32*i)   , key+(32*(i-4))    + XXH3_MIDSIZE_STARTOFFSET,  seed);
            acc2 += XXH3_mix16B(p+(32*i)+16, key+(32*(i-4))+16 + XXH3_MIDSIZE_STARTOFFSET, -seed);
        }
        /* last bytes */
        acc1 += XXH3_mix16B(p + len - 16, key + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET     ,  seed);
        acc2 += XXH3_mix16B(p + len - 32, key + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, -seed);

        {   U64 const low64 = acc1 + acc2;
            U64 const high64 = (acc1 * PRIME64_1) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
            return h128;
        }
    }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_17to128_128b(const void* XXH_RESTRICT data, size_t len,
                      const void* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)secret;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   U64 acc1 = len * PRIME64_1;
        U64 acc2 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc1 += XXH3_mix16B(p+48, key+96, seed);
                    acc2 += XXH3_mix16B(p+len-64, key+112, seed);
                }
                acc1 += XXH3_mix16B(p+32, key+64, seed);
                acc2 += XXH3_mix16B(p+len-48, key+80, seed);
            }
            acc1 += XXH3_mix16B(p+16, key+32, seed);
            acc2 += XXH3_mix16B(p+len-32, key+48, seed);
        }
        acc1 += XXH3_mix16B(p+0, key+0, seed);
        acc2 += XXH3_mix16B(p+len-16, key+16, seed);

        {   U64 const low64 = acc1 + acc2;
            U64 const high64 = (acc1 * PRIME64_1) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
            return h128;
        }
    }
}

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len)
{
    if (len <= 16) return XXH3_len_0to16_128b(data, len, kSecret, 0);
    if (len <= 128) return XXH3_len_17to128_128b(data, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b(data, len, kSecret, sizeof(kSecret), 0);
    return XXH3_hashLong_128b_defaultSecret(data, len);
}

XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    /* If an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash. */
    if (len <= 16) return XXH3_len_0to16_128b(data, len, secret, 0);
    if (len <= 128) return XXH3_len_17to128_128b(data, len, secret, secretSize, 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b(data, len, secret, secretSize, 0);
    return XXH3_hashLong_128b_withSecret(data, len, secret, secretSize);
}
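
/* Caller-side sketch (illustration only, not part of the library) :
 * since XXH3_128bits_withSecret() treats the secret requirements as a
 * contract pre-condition, a caller can validate them once, up front.
 * `mySecret` and `mySecretSize` are hypothetical.
 *
 *    int secretIsUsable(const void* mySecret, size_t mySecretSize)
 *    {
 *        return (mySecret != NULL) && (mySecretSize >= XXH3_SECRET_SIZE_MIN);
 *    }
 *
 * Only when secretIsUsable() returns non-zero should the secret be passed
 * to XXH3_128bits_withSecret() or its streaming counterpart.
 */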

XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3_len_0to16_128b(data, len, kSecret, seed);
    if (len <= 128) return XXH3_len_17to128_128b(data, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b(data, len, kSecret, sizeof(kSecret), seed);
    return XXH3_hashLong_128b_withSeed(data, len, seed);
}

XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* data, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(data, len, seed);
}
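
/* Usage sketch (illustration only, not part of the library) :
 * one-shot 128-bit hashing of a memory buffer.
 * `myData`, `mySize` and `mySeed` are hypothetical caller-provided inputs.
 *
 *    XXH128_hash_t hashBuffer(const void* myData, size_t mySize, XXH64_hash_t mySeed)
 *    {
 *        return XXH128(myData, mySize, mySeed);
 *    }
 *
 * XXH128() is equivalent to XXH3_128bits_withSeed(). With a seed of 0,
 * XXH3_128bits() can be used directly, which selects the default secret
 * and skips secret generation.
 */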


/* ===   XXH3 128-bit streaming   === */

/* All the functions are actually the same as the 64-bit streaming variant.
   The only differences are within the reset function (different initial acc values
   for indexes 0,5,6,7) and near the end of the digest function. */

static void
XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
                            XXH64_hash_t seed,
                            const void* secret, size_t secretSize)
{
    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
}

XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH3_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3_initKeySeed(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, input, len, XXH3_acc_128bits);
}

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
{
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3_digest_long(acc, state, XXH3_acc_128bits);
        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   U64 const low64 = XXH3_mergeAccs(acc, (const char*)state->secret + XXH_SECRET_MERGEACCS_START, (U64)state->totalLen * PRIME64_1);
            U64 const high64 = XXH3_mergeAccs(acc, (const char*)state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((U64)state->totalLen * PRIME64_2));
            XXH128_hash_t const h128 = { low64, high64 };
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}

/* 128-bit utility functions */

#include <string.h>   /* memcmp */

/* return : 1 if equal, 0 if different */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding bytes */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2  */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : this expects that, in most cases, hash values will differ */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}
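
/* Usage sketch (illustration only, not part of the library) :
 * sorting an array of XXH128_hash_t values with stdlib's qsort(),
 * using XXH128_cmp() as the comparator.
 * `hashes` and `nbHashes` are hypothetical.
 *
 *    #include <stdlib.h>
 *    void sortHashes(XXH128_hash_t* hashes, size_t nbHashes)
 *    {
 *        qsort(hashes, nbHashes, sizeof(XXH128_hash_t), XXH128_cmp);
 *    }
 *
 * The resulting order compares high64 first, then low64 : it is a total
 * order over hash values, with no relationship to the original inputs.
 */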


/*======   Canonical representation   ======*/
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    memcpy(dst, &hash.high64, sizeof(hash.high64));
    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
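
/* Usage sketch (illustration only, not part of the library) :
 * serializing a 128-bit hash into its canonical (big-endian) form,
 * then reading it back.
 *
 *    int canonicalRoundTrip(XXH128_hash_t h)
 *    {
 *        XXH128_canonical_t canon;
 *        XXH128_canonicalFromHash(&canon, h);
 *        {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *            return XXH128_isEqual(h, back);
 *        }
 *    }
 *
 * canonicalRoundTrip() should always return 1 : the canonical form stores
 * high64 then low64 in big-endian order, so the value survives a round trip
 * on any platform, regardless of its native endianness.
 */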


#endif  /* XXH3_H */