1 /*
2 xxHash - Extremely Fast Hash algorithm
3 Development source file for `xxh3`
4 Copyright (C) 2019-present, Yann Collet.
5
6 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are
10 met:
11
12 * Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following disclaimer
16 in the documentation and/or other materials provided with the
17 distribution.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 You can contact the author at :
32 - xxHash source repository : https://github.com/Cyan4973/xxHash
33 */
34
35 /* Note :
36 This file is separated for development purposes.
37 It will be integrated into `xxhash.c` when development phase is complete.
38 */
39
40 #ifndef XXH3_H
41 #define XXH3_H
42
43
44 /* === Dependencies === */
45
46 #undef XXH_INLINE_ALL /* in case it's already defined */
47 #define XXH_INLINE_ALL
48 #include "xxhash.h"
49
50
51 /* === Compiler specifics === */
52
53 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
54 # define XXH_RESTRICT restrict
55 #else
56 /* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
57 # define XXH_RESTRICT /* disable */
58 #endif
59
60 #if defined(__GNUC__)
61 # if defined(__AVX2__)
62 # include <immintrin.h>
63 # elif defined(__SSE2__)
64 # include <emmintrin.h>
65 # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
66 # define inline __inline__ /* clang bug */
67 # include <arm_neon.h>
68 # undef inline
69 # endif
70 #elif defined(_MSC_VER)
71 # include <intrin.h>
72 #endif
73
74
75
76 /* ==========================================
77 * Vectorization detection
78 * ========================================== */
79 #define XXH_SCALAR 0
80 #define XXH_SSE2 1
81 #define XXH_AVX2 2
82 #define XXH_NEON 3
83 #define XXH_VSX 4
84
85 #ifndef XXH_VECTOR /* can be defined on command line */
86 # if defined(__AVX2__)
87 # define XXH_VECTOR XXH_AVX2
88 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
89 # define XXH_VECTOR XXH_SSE2
90 # elif defined(__GNUC__) /* msvc support maybe later */ \
91 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
92 && defined(__LITTLE_ENDIAN__) /* ARM big endian is a thing */
93 # define XXH_VECTOR XXH_NEON
94 # elif defined(__PPC64__) && defined(__VSX__) && defined(__GNUC__)
95 # define XXH_VECTOR XXH_VSX
96 # else
97 # define XXH_VECTOR XXH_SCALAR
98 # endif
99 #endif
100
101 /* control alignment of accumulator,
102 * for compatibility with fast vector loads */
103 #ifndef XXH_ACC_ALIGN
104 # if XXH_VECTOR == 0 /* scalar */
105 # define XXH_ACC_ALIGN 8
106 # elif XXH_VECTOR == 1 /* sse2 */
107 # define XXH_ACC_ALIGN 16
108 # elif XXH_VECTOR == 2 /* avx2 */
109 # define XXH_ACC_ALIGN 32
110 # elif XXH_VECTOR == 3 /* neon */
111 # define XXH_ACC_ALIGN 16
112 # elif XXH_VECTOR == 4 /* vsx */
113 # define XXH_ACC_ALIGN 16
114 # endif
115 #endif
116
117 /* U64 XXH_mult32to64(U32 a, U64 b) { return (U64)a * (U64)b; } */
118 #if defined(_MSC_VER) && defined(_M_IX86)
119 # include <intrin.h>
120 # define XXH_mult32to64(x, y) __emulu(x, y)
121 #else
122 # define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF))
123 #endif
124
/* VSX stuff */
#if XXH_VECTOR == XXH_VSX
#  include <altivec.h>
#  undef vector  /* <altivec.h> defines `vector` as a macro; it clashes with identifiers */
typedef __vector unsigned long long U64x2;  /* two 64-bit unsigned lanes */
typedef __vector unsigned U32x4;            /* four 32-bit unsigned lanes */
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
/* Multiplies the odd-indexed 32-bit lanes of a and b (vmulouw),
 * producing two 64-bit products. Inline asm is used because the
 * corresponding intrinsic is not uniformly available. */
XXH_FORCE_INLINE U64x2 XXH_vsxMultOdd(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
/* Multiplies the even-indexed 32-bit lanes of a and b (vmuleuw),
 * producing two 64-bit products. */
XXH_FORCE_INLINE U64x2 XXH_vsxMultEven(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
#endif
143
144
145 /* ==========================================
146 * XXH3 default settings
147 * ========================================== */
148
149 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
150
151 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
152 # error "default keyset is not large enough"
153 #endif
154
/* Default secret, used when the caller does not supply a custom one.
 * XXH_SECRET_DEFAULT_SIZE (192) bytes of fixed pseudorandom data.
 * Aligned on 64 bytes to favor aligned vector loads.
 * NOTE(review): the exact provenance of these constants is not shown here;
 * they must not be modified, as they define the hash's output values. */
XXH_ALIGN(64) static const BYTE kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};
170
171
/* Full 64x64 -> 128-bit unsigned multiply.
 * Returns both halves of the product as an XXH128_hash_t (low64, high64).
 * Three implementations, selected at compile time:
 *  1. native __uint128_t (GCC/Clang, and platforms advertising 128-bit ints)
 *  2. the _umul128 intrinsic (MSVC x64 / IA64)
 *  3. a portable emulation built from four 32x32->64 multiplies. */
static XXH128_hash_t
XXH3_mul128(U64 ll1, U64 ll2)
{
/* __uint128_t seems a bad choice with emscripten current, see https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 */
#if !defined(__wasm__) && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t lll = (__uint128_t)ll1 * ll2;
    XXH128_hash_t const r128 = { (U64)(lll), (U64)(lll >> 64) };
    return r128;

#elif defined(_M_X64) || defined(_M_IA64)

/* NOTE(review): `#pragma intrinsic` is an MSVC-family pragma, yet it is
 * guarded by `#ifndef _MSC_VER` — confirm this inversion is intentional
 * (e.g. for non-MSVC compilers defining _M_X64) against upstream xxHash. */
#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    U64 llhigh;
    U64 const lllow = _umul128(ll1, ll2, &llhigh);
    XXH128_hash_t const r128 = { lllow, llhigh };
    return r128;

#else /* Portable scalar version */

    /* emulate 64x64->128b multiplication, using four 32x32->64 */
    U32 const h1 = (U32)(ll1 >> 32);
    U32 const h2 = (U32)(ll2 >> 32);
    U32 const l1 = (U32)ll1;
    U32 const l2 = (U32)ll2;

    U64 const llh  = XXH_mult32to64(h1, h2);   /* high x high */
    U64 const llm1 = XXH_mult32to64(l1, h2);   /* cross terms */
    U64 const llm2 = XXH_mult32to64(h1, l2);
    U64 const lll  = XXH_mult32to64(l1, l2);   /* low x low */

    /* add the cross terms into the low half, tracking carries manually */
    U64 const t = lll + (llm1 << 32);
    U64 const carry1 = t < lll;

    U64 const lllow = t + (llm2 << 32);
    U64 const carry2 = lllow < t;
    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;

    XXH128_hash_t const r128 = { lllow, llhigh };
    return r128;

#endif
}
217
218
/* 64x64 -> 128-bit multiply, folded to 64 bits by XOR-ing the two halves.
 * This is the core mixing primitive of XXH3.
 * Same selection logic as XXH3_mul128(), plus a hand-written umull/umaal
 * inline-asm path for 32-bit ARM (ARMv6T2+, non-M profile). */
#if defined(__GNUC__) && defined(__i386__)
/* GCC is stupid and tries to vectorize this.
 * This tells GCC that it is wrong. */
__attribute__((__target__("no-sse")))
#endif
static U64
XXH3_mul128_fold64(U64 ll1, U64 ll2)
{
/* __uint128_t seems a bad choice with emscripten current, see https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 */
#if !defined(__wasm__) && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t lll = (__uint128_t)ll1 * ll2;
    return (U64)lll ^ (U64)(lll >> 64);

#elif defined(_M_X64) || defined(_M_IA64)

/* NOTE(review): same inverted-looking `#ifndef _MSC_VER` guard as in
 * XXH3_mul128() — confirm against upstream xxHash. */
#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    U64 llhigh;
    U64 const lllow = _umul128(ll1, ll2, &llhigh);
    return lllow ^ llhigh;

    /* We have to do it out manually on 32-bit.
     * This is a modified, unrolled, widened, and optimized version of the
     * mulqdu routine from Hacker's Delight.
     *
     * https://www.hackersdelight.org/hdcodetxt/mulqdu.c.txt
     *
     * This was modified to use U32->U64 multiplication instead
     * of U16->U32, to add the high and low values in the end,
     * be endian-independent, and I added a partial assembly
     * implementation for ARM. */

    /* An easy 128-bit folding multiply on ARMv6T2 and ARMv7-A/R can be done with
     * the mighty umaal (Unsigned Multiply Accumulate Accumulate Long) which takes 4 cycles
     * or less, doing a long multiply and adding two 32-bit integers:
     *
     *     void umaal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
     *     {
     *         U64 prodAcc = (U64)Rn * (U64)Rm;
     *         prodAcc += *RdLo;
     *         prodAcc += *RdHi;
     *         *RdLo = prodAcc & 0xFFFFFFFF;
     *         *RdHi = prodAcc >> 32;
     *     }
     *
     * This is compared to umlal which adds to a single 64-bit integer:
     *
     *     void umlal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
     *     {
     *         U64 prodAcc = (U64)Rn * (U64)Rm;
     *         prodAcc += (*RdLo | ((U64)*RdHi << 32);
     *         *RdLo = prodAcc & 0xFFFFFFFF;
     *         *RdHi = prodAcc >> 32;
     *     }
     *
     * Getting the compiler to emit them is like pulling teeth, and checking
     * for it is annoying because ARMv7-M lacks this instruction. However, it
     * is worth it, because this is an otherwise expensive operation. */

    /* GCC-compatible, ARMv6t2 or ARMv7+, non-M variant, and 32-bit */
#elif defined(__GNUC__) /* GCC-compatible */ \
    && defined(__ARM_ARCH) && !defined(__aarch64__) && !defined(__arm64__) /* 32-bit ARM */\
    && !defined(__ARM_ARCH_7M__) /* <- Not ARMv7-M  vv*/ \
    && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \
    && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */

    /* w[] accumulates the four 32-bit words of the 128-bit product,
     * most significant first; u/v hold the operands split into halves. */
    U32 w[4] = { 0 };
    U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 };
    U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 };
    U32 k;

    /* U64 t = (U64)u[1] * (U64)v[1];
     * w[3] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umull %0, %1, %2, %3"
            : "=r" (w[3]), "=r" (k)
            : "r" (u[1]), "r" (v[1]));

    /* t = (U64)u[0] * (U64)v[1] + w[2] + k;
     * w[2] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[2]), "+r" (k)
            : "r" (u[0]), "r" (v[1]));
    w[1] = k;
    k = 0;

    /* t = (U64)u[1] * (U64)v[0] + w[2] + k;
     * w[2] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[2]), "+r" (k)
            : "r" (u[1]), "r" (v[0]));

    /* t = (U64)u[0] * (U64)v[0] + w[1] + k;
     * w[1] = t & 0xFFFFFFFF;
     * k = t >> 32; */
    __asm__("umaal %0, %1, %2, %3"
            : "+r" (w[1]), "+r" (k)
            : "r" (u[0]), "r" (v[0]));
    w[0] = k;

    /* fold: (high 64 bits) ^ (low 64 bits) */
    return (w[1] | ((U64)w[0] << 32)) ^ (w[3] | ((U64)w[2] << 32));

#else /* Portable scalar version */

    /* emulate 64x64->128b multiplication, using four 32x32->64 */
    U32 const h1 = (U32)(ll1 >> 32);
    U32 const h2 = (U32)(ll2 >> 32);
    U32 const l1 = (U32)ll1;
    U32 const l2 = (U32)ll2;

    U64 const llh  = XXH_mult32to64(h1, h2);
    U64 const llm1 = XXH_mult32to64(l1, h2);
    U64 const llm2 = XXH_mult32to64(h1, l2);
    U64 const lll  = XXH_mult32to64(l1, l2);

    U64 const t = lll + (llm1 << 32);
    U64 const carry1 = t < lll;

    U64 const lllow = t + (llm2 << 32);
    U64 const carry2 = lllow < t;
    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;

    return llhigh ^ lllow;

#endif
}
349
350
XXH3_avalanche(U64 h64)351 static XXH64_hash_t XXH3_avalanche(U64 h64)
352 {
353 h64 ^= h64 >> 37;
354 h64 *= PRIME64_3;
355 h64 ^= h64 >> 32;
356 return h64;
357 }
358
359
360 /* ==========================================
361 * Short keys
362 * ========================================== */
363
364 XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_1to3_64b(const void * data,size_t len,const void * keyPtr,XXH64_hash_t seed)365 XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
366 {
367 XXH_ASSERT(data != NULL);
368 XXH_ASSERT(1 <= len && len <= 3);
369 XXH_ASSERT(keyPtr != NULL);
370 { BYTE const c1 = ((const BYTE*)data)[0];
371 BYTE const c2 = ((const BYTE*)data)[len >> 1];
372 BYTE const c3 = ((const BYTE*)data)[len - 1];
373 U32 const combined = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
374 U64 const keyed = (U64)combined ^ (XXH_readLE32(keyPtr) + seed);
375 U64 const mixed = keyed * PRIME64_1;
376 return XXH3_avalanche(mixed);
377 }
378 }
379
380 XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_4to8_64b(const void * data,size_t len,const void * keyPtr,XXH64_hash_t seed)381 XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
382 {
383 XXH_ASSERT(data != NULL);
384 XXH_ASSERT(keyPtr != NULL);
385 XXH_ASSERT(4 <= len && len <= 8);
386 { U32 const in1 = XXH_readLE32(data);
387 U32 const in2 = XXH_readLE32((const BYTE*)data + len - 4);
388 U64 const in64 = in1 + ((U64)in2 << 32);
389 U64 const keyed = in64 ^ (XXH_readLE64(keyPtr) + seed);
390 U64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
391 return XXH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
392 }
393 }
394
395 XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_9to16_64b(const void * data,size_t len,const void * keyPtr,XXH64_hash_t seed)396 XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
397 {
398 XXH_ASSERT(data != NULL);
399 XXH_ASSERT(keyPtr != NULL);
400 XXH_ASSERT(9 <= len && len <= 16);
401 { const U64* const key64 = (const U64*) keyPtr;
402 U64 const ll1 = XXH_readLE64(data) ^ (XXH_readLE64(key64) + seed);
403 U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) ^ (XXH_readLE64(key64+1) - seed);
404 U64 const acc = len + (ll1 + ll2) + XXH3_mul128_fold64(ll1, ll2);
405 return XXH3_avalanche(acc);
406 }
407 }
408
409 XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_0to16_64b(const void * data,size_t len,const void * keyPtr,XXH64_hash_t seed)410 XXH3_len_0to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
411 {
412 XXH_ASSERT(len <= 16);
413 { if (len > 8) return XXH3_len_9to16_64b(data, len, keyPtr, seed);
414 if (len >= 4) return XXH3_len_4to8_64b(data, len, keyPtr, seed);
415 if (len) return XXH3_len_1to3_64b(data, len, keyPtr, seed);
416 return 0;
417 }
418 }
419
420
/* === Long Keys === */

#define STRIPE_LEN 64   /* bytes of input consumed per accumulation round */
#define XXH_SECRET_CONSUME_RATE 8  /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(U64))  /* nb of 64-bit accumulators (8) */

/* Selects whether the accumulator loop feeds a 64-bit or 128-bit result. */
typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
428
/* Accumulates one 64-byte stripe of `data` into the accumulator `acc`,
 * using 64 bytes of `key` (the secret) for keying.
 * One vectorized implementation per target (AVX2 / SSE2 / NEON / VSX),
 * plus a universal scalar fallback; all compute the same per-lane
 * formula: acc += (dataWord) [or swapped pair in 128-bit mode]
 *          acc += lo32(data^key) * hi32(data^key).
 * `acc` must be aligned as per XXH_ACC_ALIGN; `data` and `key` have no
 * alignment restriction (unaligned loads are used). */
XXH_FORCE_INLINE void
XXH3_accumulate_512( void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT data,
                     const void* XXH_RESTRICT key,
                     XXH3_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
        const __m256i* const xdata = (const __m256i *) data;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const __m256i* const xkey  = (const __m256i *) key;    /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const d   = _mm256_loadu_si256 (xdata+i);
            __m256i const k   = _mm256_loadu_si256 (xkey+i);
            __m256i const dk  = _mm256_xor_si256 (d,k);   /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
            /* multiply each even 32-bit lane by the odd lane above it */
            __m256i const mul = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3_acc_128bits) {
                /* 128-bit mode adds the 64-bit-swapped input words instead */
                __m256i const dswap = _mm256_shuffle_epi32(d, _MM_SHUFFLE(1,0,3,2));
                __m256i const add = _mm256_add_epi64(xacc[i], dswap);
                xacc[i] = _mm256_add_epi64(mul, add);
            } else {  /* XXH3_acc_64bits */
                __m256i const add = _mm256_add_epi64(xacc[i], d);
                xacc[i] = _mm256_add_epi64(mul, add);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;   /* presumed */
        const __m128i* const xdata = (const __m128i *) data;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const __m128i* const xkey  = (const __m128i *) key;    /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const d   = _mm_loadu_si128 (xdata+i);
            __m128i const k   = _mm_loadu_si128 (xkey+i);
            __m128i const dk  = _mm_xor_si128 (d,k);   /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
            __m128i const mul = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31)); /* uint64 mul[2] = {dk0*dk1,dk2*dk3} */
            if (accWidth == XXH3_acc_128bits) {
                __m128i const dswap = _mm_shuffle_epi32(d, _MM_SHUFFLE(1,0,3,2));
                __m128i const add = _mm_add_epi64(xacc[i], dswap);
                xacc[i] = _mm_add_epi64(mul, add);
            } else {  /* XXH3_acc_64bits */
                __m128i const add = _mm_add_epi64(xacc[i], d);
                xacc[i] = _mm_add_epi64(mul, add);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint32_t const* const xdata = (const uint32_t *) data;
        uint32_t const* const xkey  = (const uint32_t *) key;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */

            /* data_vec = xdata[i]; */
            uint32x4_t const data_vec    = vld1q_u32(xdata + (i * 4));
            /* key_vec  = xkey[i];  */
            uint32x4_t const key_vec     = vld1q_u32(xkey  + (i * 4));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t       data_key;

            if (accWidth == XXH3_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u32(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64  = vreinterpretq_u64_u32(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }

            data_key = veorq_u32(data_vec, key_vec);

            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */

            /* data_vec = xdata[i]; */
            uint32x4_t const data_vec    = vld1q_u32(xdata + (i * 4));
            /* key_vec  = xkey[i];  */
            uint32x4_t const key_vec     = vld1q_u32(xkey  + (i * 4));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key    = veorq_u32(data_vec, key_vec);
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64  (vreinterpretq_u64_u32(data_key));
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64 (vreinterpretq_u64_u32(data_key), 32);
            if (accWidth == XXH3_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u32(data_vec));
            } else {  /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64  = vreinterpretq_u64_u32(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX) && 0   /* <=========================== DISABLED : MUST BE VALIDATED */
    /* note : vsx code path currently not tested in CI (limitation of cross-compiler and/or emulator)
     *        for vsx code path to be shipped and supported, it is critical to create a CI test for it */
    U64x2* const xacc  =       (U64x2*) acc;    /* presumed aligned */
    U64x2 const* const xdata = (U64x2 const*) data;   /* no alignment restriction */
    U64x2 const* const xkey  = (U64x2 const*) key;    /* no alignment restriction */
    U64x2 const v32 = { 32,  32 };

    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xdata[i]; */
        /* key_vec = xkey[i]; */
#ifdef __BIG_ENDIAN__
        /* byteswap */
        U64x2 const data_vec = vec_revb(vec_vsx_ld(0, xdata + i));   /* note : vec_revb is power9+ */
        U64x2 const key_vec  = vec_revb(vec_vsx_ld(0, xkey + i));    /* note : vec_revb is power9+ */
#else
        U64x2 const data_vec = vec_vsx_ld(0, xdata + i);
        U64x2 const key_vec  = vec_vsx_ld(0, xkey + i);
#endif
        U64x2 const data_key = data_vec ^ key_vec;
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product  = XXH_vsxMultOdd((U32x4)data_key, shuffled);

        xacc[i] += product;

        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3_acc_128bits */
            U64x2 const data_swapped = vec_permi(data_vec, data_vec, 2);  /* <===== untested !!! */
            xacc[i] += data_swapped;
        }
    }

#else   /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const char* const xdata = (const char*) data;  /* no alignment restriction */
    const char* const xkey  = (const char*) key;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    /* process two lanes per iteration, mirroring the vector paths */
    for (i=0; i < ACC_NB; i+=2) {
        U64 const in1 = XXH_readLE64(xdata + 8*i);
        U64 const in2 = XXH_readLE64(xdata + 8*(i+1));
        U64 const key1 = XXH_readLE64(xkey + 8*i);
        U64 const key2 = XXH_readLE64(xkey + 8*(i+1));
        U64 const data_key1 = key1 ^ in1;
        U64 const data_key2 = key2 ^ in2;
        xacc[i]   += XXH_mult32to64(data_key1 & 0xFFFFFFFF, data_key1 >> 32);
        xacc[i+1] += XXH_mult32to64(data_key2 & 0xFFFFFFFF, data_key2 >> 32);
        if (accWidth == XXH3_acc_128bits) {
            /* 128-bit mode cross-adds the pair of input words */
            xacc[i]   += in2;
            xacc[i+1] += in1;
        } else {  /* XXH3_acc_64bits */
            xacc[i]   += in1;
            xacc[i+1] += in2;
        }
    }
#endif
}
623
/* Scrambles the accumulators once per block, to prevent the additive
 * accumulation from converging. Per 64-bit lane:
 *     acc = ((acc ^ (acc >> 47)) ^ keyWord) * PRIME32_1
 * where the 64x32 multiply keeps only the low 64 bits.
 * One implementation per target, mirroring XXH3_accumulate_512(). */
XXH_FORCE_INLINE void
XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT key)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        const __m256i* const xkey = (const __m256i *) key;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* data = acc ^ (acc >> 47) */
            __m256i data = xacc[i];
            __m256i const shifted = _mm256_srli_epi64(data, 47);
            data = _mm256_xor_si256(data, shifted);

            {   __m256i const k   = _mm256_loadu_si256 (xkey+i);
                __m256i const dk  = _mm256_xor_si256   (data, k);

                /* 64x32 multiply by PRIME32_1, composed from two 32x32 parts:
                 * low32*prime + ((hi32*prime) << 32) */
                __m256i const dk1 = _mm256_mul_epu32 (dk, prime32);

                __m256i const d2  = _mm256_shuffle_epi32 (dk, 0x31);
                __m256i const dk2 = _mm256_mul_epu32 (d2, prime32);
                __m256i const dk2h= _mm256_slli_epi64 (dk2, 32);

                xacc[i] = _mm256_add_epi64(dk1, dk2h);
        }   }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const __m128i* const xkey = (const __m128i *) key;   /* not really aligned, just for ptr arithmetic */
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* data = acc ^ (acc >> 47) */
            __m128i data = xacc[i];
            __m128i const shifted = _mm_srli_epi64(data, 47);
            data = _mm_xor_si128(data, shifted);

            {   __m128i const k   = _mm_loadu_si128 (xkey+i);
                __m128i const dk  = _mm_xor_si128   (data,k);

                __m128i const dk1 = _mm_mul_epu32 (dk, prime32);

                __m128i const d2  = _mm_shuffle_epi32 (dk, 0x31);
                __m128i const dk2 = _mm_mul_epu32 (d2, prime32);
                __m128i const dk2h= _mm_slli_epi64(dk2, 32);

                xacc[i] = _mm_add_epi64(dk1, dk2h);
        }   }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
        uint32_t const* const xkey = (uint32_t const*) key;
        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const acc_vec  = xacc[i];
            uint64x2_t const shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const data_vec = veorq_u64   (acc_vec, shifted);

            /* key_vec  = xkey[i]; */
            uint32x4_t const key_vec  = vld1q_u32   (xkey + (i * 4));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32  (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const prod_hi = vmull_u32    (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }

#elif (XXH_VECTOR == XXH_VSX)

    U64x2* const xacc = (U64x2*) acc;
    const U64x2* const xkey = (const U64x2*) key;
    /* constants */
    U64x2 const v32  = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;

    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec  = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xkey[i]; */
#ifdef __BIG_ENDIAN__
        /* swap 32-bit words */
        U64x2 const key_vec  = vec_rl(vec_vsx_ld(0, xkey + i), v32);
#else
        U64x2 const key_vec  = vec_vsx_ld(0, xkey + i);
#endif
        U64x2 const data_key = data_vec ^ key_vec;

        /* data_key *= PRIME32_1 */

        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF);  */
        U64x2 const prod_lo  = XXH_vsxMultOdd((U32x4)data_key, prime);
        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
        U64x2 const prod_hi  = XXH_vsxMultEven((U32x4)data_key, prime);
        xacc[i] = prod_lo + (prod_hi << v32);
    }

#else   /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) U64* const xacc = (U64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const char* const xkey = (const char*) key;   /* no alignment restriction */
    int i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);

    for (i=0; i < (int)ACC_NB; i++) {
        U64 const key64 = XXH_readLE64(xkey + 8*i);
        U64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}
759
760 /* assumption : nbStripes will not overflow secret size */
761 XXH_FORCE_INLINE void
XXH3_accumulate(U64 * XXH_RESTRICT acc,const void * XXH_RESTRICT data,const void * XXH_RESTRICT secret,size_t nbStripes,XXH3_accWidth_e accWidth)762 XXH3_accumulate( U64* XXH_RESTRICT acc,
763 const void* XXH_RESTRICT data,
764 const void* XXH_RESTRICT secret,
765 size_t nbStripes,
766 XXH3_accWidth_e accWidth)
767 {
768 size_t n;
769 /* Clang doesn't unroll this loop without the pragma. Unrolling can be up to 1.4x faster.
770 * The unroll statement seems detrimental for WASM (@aras-p) and ARM though.
771 */
772 #if defined(__clang__) && !defined(__OPTIMIZE_SIZE__) && !defined(__ARM_ARCH) && !defined(__EMSCRIPTEN__)
773 # pragma clang loop unroll(enable)
774 #endif
775
776 for (n = 0; n < nbStripes; n++ ) {
777 XXH3_accumulate_512(acc,
778 (const char*)data + n*STRIPE_LEN,
779 (const char*)secret + n*XXH_SECRET_CONSUME_RATE,
780 accWidth);
781 }
782 }
783
/* note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`,
 * and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 * However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE`
 * Pretty much every other modes and compilers prefer `FORCE_INLINE`.
 */
/* Main long-input loop: splits `data` into full blocks of
 * `nb_rounds` stripes, accumulating each block then scrambling the
 * accumulators, then handles the trailing partial block and — when the
 * input is not stripe-aligned — one final (overlapping) stripe keyed
 * with an offset secret so it differs from the scrambler's key. */
#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3_hashLong_internal_loop( U64* XXH_RESTRICT acc,
                      const void* XXH_RESTRICT data, size_t len,
                      const void* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3_accumulate(acc, (const char*)data + n*block_len, secret, nb_rounds, accWidth);
        /* scramble keyed with the last STRIPE_LEN bytes of the secret */
        XXH3_scrambleAcc(acc, (const char*)secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3_accumulate(acc, (const char*)data + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe : reads the final STRIPE_LEN bytes of input,
         * overlapping the previous stripe when len is not a multiple of STRIPE_LEN */
        if (len & (STRIPE_LEN - 1)) {
            const void* const p = (const char*)data + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
            XXH3_accumulate_512(acc, p, (const char*)secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}
825
826 XXH_FORCE_INLINE U64
XXH3_mix2Accs(const U64 * XXH_RESTRICT acc,const void * XXH_RESTRICT secret)827 XXH3_mix2Accs(const U64* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
828 {
829 const U64* const key64 = (const U64*)secret;
830 return XXH3_mul128_fold64(
831 acc[0] ^ XXH_readLE64(key64),
832 acc[1] ^ XXH_readLE64(key64+1) );
833 }
834
835 static XXH64_hash_t
XXH3_mergeAccs(const U64 * XXH_RESTRICT acc,const void * XXH_RESTRICT secret,U64 start)836 XXH3_mergeAccs(const U64* XXH_RESTRICT acc, const void* XXH_RESTRICT secret, U64 start)
837 {
838 U64 result64 = start;
839
840 result64 += XXH3_mix2Accs(acc+0, (const char*)secret + 0);
841 result64 += XXH3_mix2Accs(acc+2, (const char*)secret + 16);
842 result64 += XXH3_mix2Accs(acc+4, (const char*)secret + 32);
843 result64 += XXH3_mix2Accs(acc+6, (const char*)secret + 48);
844
845 return XXH3_avalanche(result64);
846 }
847
848 #define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
849 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
850
851 XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_internal(const void * XXH_RESTRICT data,size_t len,const void * XXH_RESTRICT secret,size_t secretSize)852 XXH3_hashLong_internal(const void* XXH_RESTRICT data, size_t len,
853 const void* XXH_RESTRICT secret, size_t secretSize)
854 {
855 XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[ACC_NB] = XXH3_INIT_ACC;
856
857 XXH3_hashLong_internal_loop(acc, data, len, secret, secretSize, XXH3_acc_64bits);
858
859 /* converge into final hash */
860 XXH_STATIC_ASSERT(sizeof(acc) == 64);
861 #define XXH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */
862 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
863 return XXH3_mergeAccs(acc, (const char*)secret + XXH_SECRET_MERGEACCS_START, (U64)len * PRIME64_1);
864 }
865
866
/* Long-input 64-bit hash using the built-in default secret. */
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b_defaultSecret(const void* XXH_RESTRICT data, size_t len)
{
    return XXH3_hashLong_internal(data, len, kSecret, sizeof(kSecret));
}
872
/* Long-input 64-bit hash using a caller-provided secret
 * (contract : secretSize >= XXH3_SECRET_SIZE_MIN, checked by callers). */
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT data, size_t len,
                             const void* XXH_RESTRICT secret, size_t secretSize)
{
    return XXH3_hashLong_internal(data, len, secret, secretSize);
}
879
880
XXH_writeLE64(void * dst,U64 v64)881 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, U64 v64)
882 {
883 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
884 memcpy(dst, &v64, sizeof(v64));
885 }
886
887 /* XXH3_initKeySeed() :
888 * destination `customSecret` is presumed allocated and same size as `kSecret`.
889 */
XXH3_initKeySeed(void * customSecret,U64 seed64)890 XXH_FORCE_INLINE void XXH3_initKeySeed(void* customSecret, U64 seed64)
891 {
892 char* const dst = (char*)customSecret;
893 const char* const src = (const char*)kSecret;
894 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
895 int i;
896
897 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
898
899 for (i=0; i < nbRounds; i++) {
900 XXH_writeLE64(dst + 16*i, XXH_readLE64(src + 16*i) + seed64);
901 XXH_writeLE64(dst + 16*i + 8, XXH_readLE64(src + 16*i + 8) - seed64);
902 }
903 }
904
905
/* XXH3_hashLong_64b_withSeed() :
 * Generate a custom key,
 * based on alteration of default kSecret with the seed,
 * and then use this key for long mode hashing.
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 */
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) char secret[XXH_SECRET_DEFAULT_SIZE];
    /* seed==0 must match the default-secret variant exactly, and skips key generation */
    if (seed==0) return XXH3_hashLong_64b_defaultSecret(data, len);
    XXH3_initKeySeed(secret, seed);
    return XXH3_hashLong_internal(data, len, secret, sizeof(secret));
}
921
922
XXH3_mix16B(const void * XXH_RESTRICT data,const void * XXH_RESTRICT key,U64 seed64)923 XXH_FORCE_INLINE U64 XXH3_mix16B(const void* XXH_RESTRICT data,
924 const void* XXH_RESTRICT key, U64 seed64)
925 {
926 const U64* const key64 = (const U64*)key;
927 U64 const ll1 = XXH_readLE64(data);
928 U64 const ll2 = XXH_readLE64((const BYTE*)data+8);
929 return XXH3_mul128_fold64(
930 ll1 ^ (XXH_readLE64(key64) + seed64),
931 ll2 ^ (XXH_readLE64(key64+1) - seed64) );
932 }
933
934
935 XXH_FORCE_INLINE XXH64_hash_t
XXH3_len_17to128_64b(const void * XXH_RESTRICT data,size_t len,const void * XXH_RESTRICT secret,size_t secretSize,XXH64_hash_t seed)936 XXH3_len_17to128_64b(const void* XXH_RESTRICT data, size_t len,
937 const void* XXH_RESTRICT secret, size_t secretSize,
938 XXH64_hash_t seed)
939 {
940 const BYTE* const p = (const BYTE*)data;
941 const char* const key = (const char*)secret;
942
943 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
944 XXH_ASSERT(16 < len && len <= 128);
945
946 { U64 acc = len * PRIME64_1;
947 if (len > 32) {
948 if (len > 64) {
949 if (len > 96) {
950 acc += XXH3_mix16B(p+48, key+96, seed);
951 acc += XXH3_mix16B(p+len-64, key+112, seed);
952 }
953 acc += XXH3_mix16B(p+32, key+64, seed);
954 acc += XXH3_mix16B(p+len-48, key+80, seed);
955 }
956 acc += XXH3_mix16B(p+16, key+32, seed);
957 acc += XXH3_mix16B(p+len-32, key+48, seed);
958 }
959 acc += XXH3_mix16B(p+0, key+0, seed);
960 acc += XXH3_mix16B(p+len-16, key+16, seed);
961
962 return XXH3_avalanche(acc);
963 }
964 }
965
966 #define XXH3_MIDSIZE_MAX 240
967
/* 64-bit hash for inputs of 129..XXH3_MIDSIZE_MAX bytes.
 * First 128 bytes use the secret directly; further 16-byte rounds reuse the
 * secret at a small odd offset so the key stream differs from round one. */
XXH_NO_INLINE XXH64_hash_t
XXH3_len_129to240_64b(const void* XXH_RESTRICT data, size_t len,
                      const void* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)secret;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    /* odd offsets, so the reused secret bytes differ from the first pass */
    #define XXH3_MIDSIZE_STARTOFFSET 3
    #define XXH3_MIDSIZE_LASTOFFSET  17

    { U64 acc = len * PRIME64_1;
      int const nbRounds = (int)len / 16;
      int i;
      /* first 8 rounds : one 16-byte mix per 16-byte secret segment */
      for (i=0; i<8; i++) {
          acc += XXH3_mix16B(p+(16*i), key+(16*i), seed);
      }
      acc = XXH3_avalanche(acc);
      XXH_ASSERT(nbRounds >= 8);
      /* remaining full rounds : secret reused from XXH3_MIDSIZE_STARTOFFSET */
      for (i=8 ; i < nbRounds; i++) {
          acc += XXH3_mix16B(p+(16*i), key+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
      }
      /* last bytes : the final (possibly overlapping) 16 bytes of input */
      acc += XXH3_mix16B(p + len - 16, key + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
      return XXH3_avalanche(acc);
    }
}
998
999 /* === Public entry point === */
1000
/* Default 64-bit entry point : dispatch on input length,
 * using the built-in secret and seed==0. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len)
{
    if (len <= 16) return XXH3_len_0to16_64b(data, len, kSecret, 0);
    if (len <= 128) return XXH3_len_17to128_64b(data, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b(data, len, kSecret, sizeof(kSecret), 0);
    return XXH3_hashLong_64b_defaultSecret(data, len);
}
1008
/* 64-bit entry point with a caller-provided secret.
 * Contract : secretSize >= XXH3_SECRET_SIZE_MIN (assert-only, not checked in release). */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
    if (len <= 16) return XXH3_len_0to16_64b(data, len, secret, 0);
    if (len <= 128) return XXH3_len_17to128_64b(data, len, secret, secretSize, 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b(data, len, secret, secretSize, 0);
    return XXH3_hashLong_64b_withSecret(data, len, secret, secretSize);
}
1022
/* 64-bit entry point with a seed; long inputs derive a custom secret from it. */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3_len_0to16_64b(data, len, kSecret, seed);
    if (len <= 128) return XXH3_len_17to128_64b(data, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b(data, len, kSecret, sizeof(kSecret), seed);
    return XXH3_hashLong_64b_withSeed(data, len, seed);
}
1031
1032 /* === XXH3 streaming === */
1033
/* Allocates a streaming state. May return NULL on allocation failure;
 * the state is NOT initialized — call one of the reset functions first. */
XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
{
    return (XXH3_state_t*)XXH_malloc(sizeof(XXH3_state_t));
}
1038
/* Releases a streaming state previously obtained from XXH3_createState().
 * Accepts NULL (XXH_free is expected to tolerate it, like free()). */
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
{
    XXH_free(statePtr);
    return XXH_OK;
}
1044
/* Duplicates a streaming state. Both states must be valid and distinct.
 * Note : the `secret` pointer is copied as-is; if src points at its own
 * customSecret, dst will still reference src's buffer. */
XXH_PUBLIC_API void
XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
{
    memcpy(dst_state, src_state, sizeof(*dst_state));
}
1050
1051 static void
XXH3_64bits_reset_internal(XXH3_state_t * statePtr,XXH64_hash_t seed,const void * secret,size_t secretSize)1052 XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
1053 XXH64_hash_t seed,
1054 const void* secret, size_t secretSize)
1055 {
1056 XXH_ASSERT(statePtr != NULL);
1057 memset(statePtr, 0, sizeof(*statePtr));
1058 statePtr->acc[0] = PRIME32_3;
1059 statePtr->acc[1] = PRIME64_1;
1060 statePtr->acc[2] = PRIME64_2;
1061 statePtr->acc[3] = PRIME64_3;
1062 statePtr->acc[4] = PRIME64_4;
1063 statePtr->acc[5] = PRIME32_2;
1064 statePtr->acc[6] = PRIME64_5;
1065 statePtr->acc[7] = PRIME32_1;
1066 statePtr->seed = seed;
1067 XXH_ASSERT(secret != NULL);
1068 statePtr->secret = secret;
1069 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
1070 statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
1071 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
1072 }
1073
/* Resets the state for default streaming (built-in secret, seed==0). */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset(XXH3_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}
1081
1082 XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecret(XXH3_state_t * statePtr,const void * secret,size_t secretSize)1083 XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
1084 {
1085 if (statePtr == NULL) return XXH_ERROR;
1086 XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
1087 if (secret == NULL) return XXH_ERROR;
1088 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
1089 return XXH_OK;
1090 }
1091
/* Resets the state for seeded streaming : derives a custom secret from
 * the seed and points the state at it. */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3_initKeySeed(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}
1101
/* Consumes `totalStripes` stripes of `data` into `acc`, tracking the
 * position inside the current secret block via *nbStripesSoFarPtr and
 * scrambling the accumulators when a block boundary is crossed.
 * Pre-condition : totalStripes may cross at most ONE block boundary. */
XXH_FORCE_INLINE void
XXH3_consumeStripes( U64* acc,
                     XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
                     const void* data, size_t totalStripes,
                     const void* secret, size_t secretLimit,
                     XXH3_accWidth_e accWidth)
{
    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
        /* need a scrambling operation : finish the block, scramble, restart */
        size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
        XXH3_accumulate(acc, data, (const char*)secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
        XXH3_scrambleAcc(acc, (const char*)secret + secretLimit);
        XXH3_accumulate(acc, (const char*)data + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
        *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
    } else {
        /* stays within the current block */
        XXH3_accumulate(acc, data, (const char*)secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
        *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
    }
}
1122
/* Streaming ingestion shared by the 64-bit and 128-bit variants.
 * Buffers input until a full internal buffer is available, then consumes
 * it in XXH3_INTERNALBUFFER_SIZE chunks; any remainder is re-buffered.
 * Returns XXH_ERROR on NULL input unless XXH_ACCEPT_NULL_INPUT_POINTER. */
XXH_FORCE_INLINE XXH_errorcode
XXH3_update(XXH3_state_t* state, const void* input, size_t len, XXH3_accWidth_e accWidth)
{
    if (input==NULL)
#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
        return XXH_OK;
#else
        return XXH_ERROR;
#endif

    {   const BYTE* p = (const BYTE*)input;
        const BYTE* const bEnd = p + len;

        state->totalLen += len;

        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {  /* fill in tmp buffer */
            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
            state->bufferedSize += (XXH32_hash_t)len;
            return XXH_OK;
        }
        /* input now > XXH3_INTERNALBUFFER_SIZE */

        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0);   /* clean multiple */

        if (state->bufferedSize) {   /* some data within internal buffer: fill then consume it */
            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
            p += loadSize;
            XXH3_consumeStripes(state->acc,
                                &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                                state->secret, state->secretLimit,
                                accWidth);
            state->bufferedSize = 0;
        }

        /* consume input by full buffer quantities */
        if (p+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
            const BYTE* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
            do {
                XXH3_consumeStripes(state->acc,
                                    &state->nbStripesSoFar, state->nbStripesPerBlock,
                                    p, XXH3_INTERNALBUFFER_STRIPES,
                                    state->secret, state->secretLimit,
                                    accWidth);
                p += XXH3_INTERNALBUFFER_SIZE;
            } while (p<=limit);
        }

        if (p < bEnd) { /* some remaining input data : buffer it */
            XXH_memcpy(state->buffer, p, (size_t)(bEnd-p));
            state->bufferedSize = (XXH32_hash_t)(bEnd-p);
        }
    }

    return XXH_OK;
}
1181
/* 64-bit streaming ingestion : thin wrapper selecting 64-bit accumulation width. */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, input, len, XXH3_acc_64bits);
}
1187
1188
/* Finishes accumulation over a COPY of the state's accumulators,
 * so the state itself is untouched and can keep ingesting afterwards.
 * Handles the still-buffered tail : full stripes, then one final
 * (possibly overlapping) stripe covering the last STRIPE_LEN bytes. */
XXH_FORCE_INLINE void
XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
{
    memcpy(acc, state->acc, sizeof(state->acc)); /* digest locally, state remains unaltered, and can continue ingesting more data afterwards */
    if (state->bufferedSize >= STRIPE_LEN) {
        size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
        XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;   /* local copy : do not mutate state */
        XXH3_consumeStripes(acc,
                            &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, totalNbStripes,
                            state->secret, state->secretLimit,
                            accWidth);
        if (state->bufferedSize % STRIPE_LEN) {  /* one last partial stripe */
            XXH3_accumulate_512(acc,
                                state->buffer + state->bufferedSize - STRIPE_LEN,
                                (const char*)state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
        }
    } else {  /* bufferedSize < STRIPE_LEN */
        if (state->bufferedSize) { /* one last stripe */
            /* reconstruct a full stripe from the end of the previous buffer
             * contents plus the current buffered bytes */
            char lastStripe[STRIPE_LEN];
            size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
            memcpy(lastStripe, (const char*)state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
            memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
            XXH3_accumulate_512(acc,
                                lastStripe,
                                (const char*)state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
    }   }
}
1219
/* Produces the current 64-bit hash of all data ingested so far.
 * Long inputs use the accumulator path; short inputs re-hash the buffer
 * (which still holds the entire input when totalLen <= XXH3_MIDSIZE_MAX). */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
{
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3_digest_long(acc, state, XXH3_acc_64bits);
        return XXH3_mergeAccs(acc, (const char*)state->secret + XXH_SECRET_MERGEACCS_START, (U64)state->totalLen * PRIME64_1);
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}
1232
1233 /* ==========================================
1234 * XXH3 128 bits (=> XXH128)
1235 * ========================================== */
1236
/* 128-bit hash for inputs of 1..3 bytes : pack the bytes plus the length
 * into one 32-bit word, key the low/high halves with separate secret words,
 * then multiply-avalanche each half independently. */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(keyPtr != NULL);
    {   const U32* const key32 = (const U32*) keyPtr;
        BYTE const c1 = ((const BYTE*)data)[0];
        BYTE const c2 = ((const BYTE*)data)[len >> 1];  /* middle byte (or first, when len<2) */
        BYTE const c3 = ((const BYTE*)data)[len - 1];
        U32  const combinedl = ((U32)c1) + (((U32)c2) << 8) + (((U32)c3) << 16) + (((U32)len) << 24);
        U32  const combinedh = XXH_swap32(combinedl);   /* high half uses byte-swapped packing */
        U64  const keyedl = (U64)combinedl ^ (XXH_readLE32(key32) + seed);
        U64  const keyedh = (U64)combinedh ^ (XXH_readLE32(key32+1) - seed);
        U64  const mixedl = keyedl * PRIME64_1;
        U64  const mixedh = keyedh * PRIME64_2;
        XXH128_hash_t const h128 = { XXH3_avalanche(mixedl) /*low64*/, XXH3_avalanche(mixedh) /*high64*/ };
        return h128;
    }
}
1257
1258
/* 128-bit hash for inputs of 4..8 bytes : combine the first and last 4 bytes
 * (overlapping when len<8) into one 64-bit word; the high half uses the
 * byte-swapped word. Each half is keyed, xorshift-mixed, then avalanched. */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(keyPtr != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   U32 const in1 = XXH_readLE32(data);
        U32 const in2 = XXH_readLE32((const BYTE*)data + len - 4);
        U64 const in64l = in1 + ((U64)in2 << 32);
        U64 const in64h = XXH_swap64(in64l);
        U64 const keyedl = in64l ^ (XXH_readLE64(keyPtr) + seed);
        U64 const keyedh = in64h ^ (XXH_readLE64((const char*)keyPtr + 8) - seed);
        U64 const mix64l1 = len + ((keyedl ^ (keyedl >> 51)) * PRIME32_1);
        U64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
        U64 const mix64h1 = ((keyedh ^ (keyedh >> 47)) * PRIME64_1) - len;
        U64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
        {   XXH128_hash_t const h128 = { XXH3_avalanche(mix64l2) /*low64*/, XXH3_avalanche(mix64h2) /*high64*/ };
            return h128;
    }   }
}
1279
/* 128-bit hash for inputs of 9..16 bytes : key the first and last 8 bytes
 * (overlapping), then two chained 64x64->128 multiplies with cross-feeding
 * between the halves before avalanching each. */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
{
    XXH_ASSERT(data != NULL);
    XXH_ASSERT(keyPtr != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   const U64* const key64 = (const U64*) keyPtr;
        U64 const ll1 = XXH_readLE64(data) ^ (XXH_readLE64(key64) + seed);
        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) ^ (XXH_readLE64(key64+1) - seed);
        U64 const inlow = ll1 ^ ll2;
        XXH128_hash_t m128 = XXH3_mul128(inlow, PRIME64_1);
        m128.high64 += ll2 * PRIME64_1;
        m128.low64  ^= (m128.high64 >> 32);
        {   XXH128_hash_t h128 = XXH3_mul128(m128.low64, PRIME64_2);
            h128.high64 += m128.high64 * PRIME64_2;
            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}
1300
/* Assumption : `secret` size is >= 16
 * Note : it should be >= XXH3_SECRET_SIZE_MIN anyway */
/* 128-bit hash for inputs of 0..16 bytes : dispatch to the size-specific
 * kernel; the empty input hashes to {0, 0}. */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_0to16_128b(const void* data, size_t len, const void* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(data, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(data, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(data, len, secret, seed);
        {   XXH128_hash_t const h128 = { 0, 0 };
            return h128;
    }   }
}
1314
/* Long-input 128-bit hash : same accumulation loop as the 64-bit variant
 * (in 128-bit width), then two independent merges — the low word merges
 * from the start of the secret, the high word from its end. */
XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT data, size_t len,
                            const void* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) U64 acc[ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, data, len, secret, secretSize, XXH3_acc_128bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   U64 const low64 = XXH3_mergeAccs(acc, (const char*)secret + XXH_SECRET_MERGEACCS_START, (U64)len * PRIME64_1);
        U64 const high64 = XXH3_mergeAccs(acc, (const char*)secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((U64)len * PRIME64_2));
        XXH128_hash_t const h128 = { low64, high64 };
        return h128;
    }
}
1332
/* Long-input 128-bit hash using the built-in default secret. */
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_128b_defaultSecret(const void* data, size_t len)
{
    return XXH3_hashLong_128b_internal(data, len, kSecret, sizeof(kSecret));
}
1338
/* Long-input 128-bit hash using a caller-provided secret
 * (contract : secretSize >= XXH3_SECRET_SIZE_MIN, checked by callers). */
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_128b_withSecret(const void* data, size_t len,
                              const void* secret, size_t secretSize)
{
    return XXH3_hashLong_128b_internal(data, len, secret, secretSize);
}
1345
/* Long-input 128-bit hash with a seed : derives a custom secret first
 * (seed==0 short-circuits to the default-secret variant, matching it exactly). */
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3_hashLong_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) char secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(data, len);
    XXH3_initKeySeed(secret, seed);
    return XXH3_hashLong_128b_internal(data, len, secret, sizeof(secret));
}
1354
/* 128-bit hash for inputs of 129..XXH3_MIDSIZE_MAX bytes : two accumulators
 * consume alternating 16-byte lanes; later rounds reuse the secret at
 * XXH3_MIDSIZE_STARTOFFSET, and the final two mixes cover the input tail. */
XXH_NO_INLINE XXH128_hash_t
XXH3_len_129to240_128b(const void* XXH_RESTRICT data, size_t len,
                       const void* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    const BYTE* const p = (const BYTE*)data;
    const char* const key = (const char*)secret;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   U64 acc1 = len * PRIME64_1;
        U64 acc2 = 0;
        int const nbRounds = (int)len / 32;
        int i;
        /* first 4 rounds of 32 bytes : acc2 uses the negated seed */
        for (i=0; i<4; i++) {
            acc1 += XXH3_mix16B(p+(32*i),    key+(32*i),    seed);
            acc2 += XXH3_mix16B(p+(32*i)+16, key+(32*i)+16, -seed);
        }
        acc1 = XXH3_avalanche(acc1);
        acc2 = XXH3_avalanche(acc2);
        XXH_ASSERT(nbRounds >= 4);
        /* remaining rounds reuse the secret from XXH3_MIDSIZE_STARTOFFSET */
        for (i=4 ; i < nbRounds; i++) {
            acc1 += XXH3_mix16B(p+(32*i)   , key+(32*(i-4))    + XXH3_MIDSIZE_STARTOFFSET, seed);
            acc2 += XXH3_mix16B(p+(32*i)+16, key+(32*(i-4))+16 + XXH3_MIDSIZE_STARTOFFSET, -seed);
        }
        /* last bytes : final 32 bytes of input, keyed from the end of the
         * minimum-size secret region */
        acc1 += XXH3_mix16B(p + len - 16, key + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET     , seed);
        acc2 += XXH3_mix16B(p + len - 32, key + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, -seed);

        {   U64 const low64 = acc1 + acc2;
            U64 const high64 = (acc1 * PRIME64_1) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
            return h128;
        }
    }
}
1392
1393 XXH_FORCE_INLINE XXH128_hash_t
XXH3_len_17to128_128b(const void * XXH_RESTRICT data,size_t len,const void * XXH_RESTRICT secret,size_t secretSize,XXH64_hash_t seed)1394 XXH3_len_17to128_128b(const void* XXH_RESTRICT data, size_t len,
1395 const void* XXH_RESTRICT secret, size_t secretSize,
1396 XXH64_hash_t seed)
1397 {
1398 const BYTE* const p = (const BYTE*)data;
1399 const char* const key = (const char*)secret;
1400
1401 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
1402 XXH_ASSERT(16 < len && len <= 128);
1403
1404 { U64 acc1 = len * PRIME64_1;
1405 U64 acc2 = 0;
1406 if (len > 32) {
1407 if (len > 64) {
1408 if (len > 96) {
1409 acc1 += XXH3_mix16B(p+48, key+96, seed);
1410 acc2 += XXH3_mix16B(p+len-64, key+112, seed);
1411 }
1412 acc1 += XXH3_mix16B(p+32, key+64, seed);
1413 acc2 += XXH3_mix16B(p+len-48, key+80, seed);
1414 }
1415 acc1 += XXH3_mix16B(p+16, key+32, seed);
1416 acc2 += XXH3_mix16B(p+len-32, key+48, seed);
1417 }
1418 acc1 += XXH3_mix16B(p+0, key+0, seed);
1419 acc2 += XXH3_mix16B(p+len-16, key+16, seed);
1420
1421 { U64 const low64 = acc1 + acc2;
1422 U64 const high64 = (acc1 * PRIME64_1) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);
1423 XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
1424 return h128;
1425 }
1426 }
1427 }
1428
/* Default 128-bit entry point : dispatch on input length,
 * using the built-in secret and seed==0. */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len)
{
    if (len <= 16) return XXH3_len_0to16_128b(data, len, kSecret, 0);
    if (len <= 128) return XXH3_len_17to128_128b(data, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b(data, len, kSecret, sizeof(kSecret), 0);
    return XXH3_hashLong_128b_defaultSecret(data, len);
}
1436
/* 128-bit entry point with a caller-provided secret.
 * Contract : secretSize >= XXH3_SECRET_SIZE_MIN (assert-only, not checked in release). */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
    if (len <= 16) return XXH3_len_0to16_128b(data, len, secret, 0);
    if (len <= 128) return XXH3_len_17to128_128b(data, len, secret, secretSize, 0);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b(data, len, secret, secretSize, 0);
    return XXH3_hashLong_128b_withSecret(data, len, secret, secretSize);
}
1450
/* 128-bit entry point with a seed; long inputs derive a custom secret from it. */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3_len_0to16_128b(data, len, kSecret, seed);
    if (len <= 128) return XXH3_len_17to128_128b(data, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b(data, len, kSecret, sizeof(kSecret), seed);
    return XXH3_hashLong_128b_withSeed(data, len, seed);
}
1459
/* Convenience alias : XXH128() == XXH3_128bits_withSeed(). */
XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* data, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(data, len, seed);
}
1465
1466
1467 /* === XXH3 128-bit streaming === */
1468
1469 /* all the functions are actually the same as for 64-bit streaming variant,
1470 just the reset one is different (different initial acc values for 0,5,6,7),
1471 and near the end of the digest function */
1472
/* 128-bit streaming reset : currently identical to the 64-bit reset
 * (the accumulation state is shared between both widths). */
static void
XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
                            XXH64_hash_t seed,
                            const void* secret, size_t secretSize)
{
    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
}
1480
/* Resets the state for default 128-bit streaming (built-in secret, seed==0). */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH3_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}
1488
1489 XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH3_state_t * statePtr,const void * secret,size_t secretSize)1490 XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
1491 {
1492 if (statePtr == NULL) return XXH_ERROR;
1493 XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
1494 if (secret == NULL) return XXH_ERROR;
1495 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
1496 return XXH_OK;
1497 }
1498
/* Resets the state for seeded 128-bit streaming : derives a custom secret
 * from the seed and points the state at it. */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3_initKeySeed(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}
1508
/* 128-bit streaming ingestion : thin wrapper selecting 128-bit accumulation width. */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
{
    return XXH3_update(state, input, len, XXH3_acc_128bits);
}
1514
/* Produces the current 128-bit hash of all data ingested so far.
 * Long inputs merge the accumulators twice (low word from the secret's
 * start, high word from its end); short inputs re-hash the buffer. */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
{
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3_digest_long(acc, state, XXH3_acc_128bits);
        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   U64 const low64 = XXH3_mergeAccs(acc, (const char*)state->secret + XXH_SECRET_MERGEACCS_START, (U64)state->totalLen * PRIME64_1);
            U64 const high64 = XXH3_mergeAccs(acc, (const char*)state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((U64)state->totalLen * PRIME64_2));
            XXH128_hash_t const h128 = { low64, high64 };
            return h128;
        }
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}
1532
1533 /* 128-bit utility functions */
1534
1535 #include <string.h> /* memcmp */
1536
/* return : 1 if equal, 0 if different */
XXH128_isEqual(XXH128_hash_t h1,XXH128_hash_t h2)1538 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
1539 {
1540 /* note : XXH128_hash_t is compact, it has no padding byte */
1541 return !(memcmp(&h1, &h2, sizeof(h1)));
1542 }
1543
1544 /* This prototype is compatible with stdlib's qsort().
1545 * return : >0 if *h128_1 > *h128_2
1546 * <0 if *h128_1 < *h128_2
1547 * =0 if *h128_1 == *h128_2 */
XXH128_cmp(const void * h128_1,const void * h128_2)1548 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
1549 {
1550 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
1551 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
1552 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
1553 /* note : bets that, in most cases, hash values are different */
1554 if (hcmp) return hcmp;
1555 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
1556 }
1557
1558
1559 /*====== Canonical representation ======*/
1560 XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t * dst,XXH128_hash_t hash)1561 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
1562 {
1563 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
1564 if (XXH_CPU_LITTLE_ENDIAN) {
1565 hash.high64 = XXH_swap64(hash.high64);
1566 hash.low64 = XXH_swap64(hash.low64);
1567 }
1568 memcpy(dst, &hash.high64, sizeof(hash.high64));
1569 memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
1570 }
1571
1572 XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t * src)1573 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
1574 {
1575 XXH128_hash_t h;
1576 h.high64 = XXH_readBE64(src);
1577 h.low64 = XXH_readBE64(src->digest + 8);
1578 return h;
1579 }
1580
1581
1582
1583 #endif /* XXH3_H */
1584