1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 /*
6 xxHash - Extremely Fast Hash algorithm
7 Development source file for `xxh3`
8 Copyright (C) 2019-present, Yann Collet.
9
10 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
11
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are
14 met:
15
16 * Redistributions of source code must retain the above copyright
17 notice, this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above
19 copyright notice, this list of conditions and the following disclaimer
20 in the documentation and/or other materials provided with the
21 distribution.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
35 You can contact the author at :
36 - xxHash source repository : https://github.com/Cyan4973/xxHash
37 */
38
39 /* RocksDB Note: This file contains a preview release (xxhash repository
40 version 0.7.2) of XXH3 that is unlikely to be compatible with the final
41 version of XXH3. We have therefore renamed this XXH3p ("preview"), for
42 clarity so that we can continue to use this version even after
43 integrating a newer incompatible version.
44 */
45
46 /* Note :
47 This file is separated for development purposes.
48 It will be integrated into `xxhash.c` when development phase is complete.
49 */
50
51 #ifndef XXH3p_H
52 #define XXH3p_H
53
54
55 /* === Dependencies === */
56
57 #undef XXH_INLINE_ALL /* in case it's already defined */
58 #define XXH_INLINE_ALL
59 #include "xxhash.h"
60
61
62 /* === Compiler specifics === */
63
64 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
65 # define XXH_RESTRICT restrict
66 #else
67 /* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
68 # define XXH_RESTRICT /* disable */
69 #endif
70
71 #if defined(__GNUC__)
72 # if defined(__AVX2__)
73 # include <immintrin.h>
74 # elif defined(__SSE2__)
75 # include <emmintrin.h>
76 # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
77 # define inline __inline__ /* clang bug */
78 # include <arm_neon.h>
79 # undef inline
80 # endif
81 #elif defined(_MSC_VER)
82 # include <intrin.h>
83 #endif
84
85 /*
86 * Sanity check.
87 *
88 * XXH3 only requires these features to be efficient:
89 *
90 * - Usable unaligned access
91 * - A 32-bit or 64-bit ALU
92 * - If 32-bit, a decent ADC instruction
93 * - A 32 or 64-bit multiply with a 64-bit result
94 *
95 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
96 * classic 16-bit only subset of ARM's instruction set.
97 *
98 * First of all, Thumb-1 lacks support for the UMULL instruction which
99 * performs the important long multiply. This means numerous __aeabi_lmul
100 * calls.
101 *
102 * Second of all, the 8 functional registers are just not enough.
103 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
104 * Lo registers, and this shuffling results in thousands more MOVs than A32.
105 *
106 * A32 and T32 don't have this limitation. They can access all 14 registers,
107 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
108 *
109 * If compiling Thumb-1 for a target which supports ARM instructions, we
110 * will give a warning.
111 *
112 * Usually, if this happens, it is because of an accident and you probably
 * need to specify -march, as you probably meant to compile for a newer
114 * architecture.
115 */
116 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
117 # warning "XXH3 is highly inefficient without ARM or Thumb-2."
118 #endif
119
120 /* ==========================================
121 * Vectorization detection
122 * ========================================== */
123 #define XXH_SCALAR 0
124 #define XXH_SSE2 1
125 #define XXH_AVX2 2
126 #define XXH_NEON 3
127 #define XXH_VSX 4
128
129 #ifndef XXH_VECTOR /* can be defined on command line */
130 # if defined(__AVX2__)
131 # define XXH_VECTOR XXH_AVX2
132 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
133 # define XXH_VECTOR XXH_SSE2
134 # elif defined(__GNUC__) /* msvc support maybe later */ \
135 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
136 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
137 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
138 # define XXH_VECTOR XXH_NEON
139 # elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
140 # define XXH_VECTOR XXH_VSX
141 # else
142 # define XXH_VECTOR XXH_SCALAR
143 # endif
144 #endif
145
146 /* control alignment of accumulator,
147 * for compatibility with fast vector loads */
148 #ifndef XXH_ACC_ALIGN
149 # if XXH_VECTOR == 0 /* scalar */
150 # define XXH_ACC_ALIGN 8
151 # elif XXH_VECTOR == 1 /* sse2 */
152 # define XXH_ACC_ALIGN 16
153 # elif XXH_VECTOR == 2 /* avx2 */
154 # define XXH_ACC_ALIGN 32
155 # elif XXH_VECTOR == 3 /* neon */
156 # define XXH_ACC_ALIGN 16
157 # elif XXH_VECTOR == 4 /* vsx */
158 # define XXH_ACC_ALIGN 16
159 # endif
160 #endif
161
162 /* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
163 #if defined(_MSC_VER) && defined(_M_IX86)
164 # include <intrin.h>
165 # define XXH_mult32to64(x, y) __emulu(x, y)
166 #else
167 # define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
168 #endif
169
170 /* VSX stuff. It's a lot because VSX support is mediocre across compilers and
171 * there is a lot of mischief with endianness. */
172 #if XXH_VECTOR == XXH_VSX
173 # include <altivec.h>
174 # undef vector
175 typedef __vector unsigned long long U64x2;
176 typedef __vector unsigned char U8x16;
177 typedef __vector unsigned U32x4;
178
179 #ifndef XXH_VSX_BE
180 # if defined(__BIG_ENDIAN__) \
181 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
182 # define XXH_VSX_BE 1
183 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
184 # warning "-maltivec=be is not recommended. Please use native endianness."
185 # define XXH_VSX_BE 1
186 # else
187 # define XXH_VSX_BE 0
188 # endif
189 #endif
190
191 /* We need some helpers for big endian mode. */
192 #if XXH_VSX_BE
193 /* A wrapper for POWER9's vec_revb. */
194 # ifdef __POWER9_VECTOR__
195 # define XXH_vec_revb vec_revb
196 # else
/*
 * Byte-swaps each 64-bit lane of `val` (fallback for pre-POWER9 targets,
 * where the vec_revb intrinsic is unavailable).
 * The permute mask maps byte i of each 8-byte lane to byte 7-i, so one
 * vec_perm reverses endianness lane-by-lane.
 */
XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
{
    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
203 # endif
204
205 /* Power8 Crypto gives us vpermxor which is very handy for
206 * PPC64EB.
207 *
208 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
209 * {
210 * U8x16 ret;
211 * for (int i = 0; i < 16; i++) {
212 * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
213 * }
214 * return ret;
215 * }
216 *
217 * Because both of the main loops load the key, swap, and xor it with input,
218 * we can combine the key swap into this instruction.
219 */
220 # ifdef vec_permxor
221 # define XXH_vec_permxor vec_permxor
222 # else
223 # define XXH_vec_permxor __builtin_crypto_vpermxor
224 # endif
225 #endif /* XXH_VSX_BE */
226 /*
227 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
228 * vec_mule.
229 *
230 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
231 * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
232 * GCC needs inline assembly. */
233 #if __has_builtin(__builtin_altivec_vmuleuw)
234 # define XXH_vec_mulo __builtin_altivec_vmulouw
235 # define XXH_vec_mule __builtin_altivec_vmuleuw
236 #else
237 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_vec_mulo(U32x4 a,U32x4 b)238 XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
239 U64x2 result;
240 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
241 return result;
242 }
XXH_vec_mule(U32x4 a,U32x4 b)243 XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
244 U64x2 result;
245 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
246 return result;
247 }
248 #endif /* __has_builtin(__builtin_altivec_vmuleuw) */
249 #endif /* XXH_VECTOR == XXH_VSX */
250
/* prefetch
 * can be disabled, by declaring XXH_NO_PREFETCH build macro */
#if defined(XXH_NO_PREFETCH)
#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
#else
/* Fix: MSVC predefines _M_IX86 for 32-bit x86; _M_I86 was the 16-bit-era
 * macro and is never defined by modern MSVC, so 32-bit MSVC builds were
 * silently falling through to the disabled branch.
 * _mm_prefetch() is not defined outside of x86/x64. */
#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#  else
#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
#  endif
#endif  /* XXH_NO_PREFETCH */
265
266
267 /* ==========================================
268 * XXH3 default settings
269 * ========================================== */
270
271 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3p_SECRET_SIZE_MIN */
272
273 #if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
274 # error "default keyset is not large enough"
275 #endif
276
/*
 * Default 192-byte secret ("key") used when the caller does not provide a
 * custom one. XXH_SECRET_DEFAULT_SIZE is checked against
 * XXH3p_SECRET_SIZE_MIN just above. These are the pseudorandom constants
 * shipped with the XXH3 preview this file was taken from; changing any byte
 * changes all hash outputs. The XXH_ALIGN(64) is presumably so vector code
 * paths can load from it efficiently — TODO confirm against users.
 */
XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};
292
293 /*
294 * GCC for x86 has a tendency to use SSE in this loop. While it
295 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
296 * slows it down because instead of free register swap shifts, it
297 * must use pshufd and punpckl/hd.
298 *
299 * To prevent this, we use this attribute to shut off SSE.
300 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
/*
 * Computes the full 128-bit product of two unsigned 64-bit integers,
 * returned as XXH128_hash_t { low64, high64 }. One of three implementations
 * is selected at preprocessor time: native __uint128_t (GCC/Clang 64-bit),
 * _umul128 (MSVC x64/IA64), or a portable 32x32->64 schoolbook multiply.
 */
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * Despite being a 32-bit platform, Clang (and emscripten) define this
     * type despite not having the arithmetic for it. This results in a
     * laggy compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
    /* Precedence note: && binds tighter than ||, so this reads as
     * (GNUC, not wasm, has __int128) OR (_INTEGRAL_MAX_BITS >= 128). */
#if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif defined(_M_X64) || defined(_M_IA64)

/* NOTE(review): #ifndef _MSC_VER looks inverted for an MSVC pragma, but it
 * matches the upstream preview; unknown pragmas are ignored elsewhere —
 * confirm intent against upstream xxHash before changing. */
#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t const r128 = { product_low, product_high };
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown
     * below with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
     *     ---------
     *         2 7 | // D2 cross  = (15 / 10) + (45 % 10) + 21
     *     + 6 7 | | // D2 upper  = (27 / 10) + (45 / 10) + 63
     *     ---------
     *       6 9 7 5
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
     *     UINT64_MAX. This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARMv6+ A32/T32, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication,
     *     and allows this to be calculated in only 4 instructions which
     *     is comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be
     *     a couple of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128 = { lower, upper };
    return r128;
#endif
}
405
406 /*
407 * We want to keep the attribute here because a target switch
408 * disables inlining.
409 *
410 * Does a 64-bit to 128-bit multiply, then XOR folds it.
411 * The reason for the separate function is to prevent passing
412 * too many structs around by value. This will hopefully inline
413 * the multiply, but we don't force it.
414 */
415 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
416 __attribute__((__target__("no-sse")))
417 #endif
418 static xxh_u64
XXH3p_mul128_fold64(xxh_u64 lhs,xxh_u64 rhs)419 XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
420 {
421 XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
422 return product.low64 ^ product.high64;
423 }
424
425
XXH3p_avalanche(xxh_u64 h64)426 static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
427 {
428 h64 ^= h64 >> 37;
429 h64 *= PRIME64_3;
430 h64 ^= h64 >> 32;
431 return h64;
432 }
433
434
435 /* ==========================================
436 * Short keys
437 * ========================================== */
438
439 XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_1to3_64b(const xxh_u8 * input,size_t len,const xxh_u8 * secret,XXH64_hash_t seed)440 XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
441 {
442 XXH_ASSERT(input != NULL);
443 XXH_ASSERT(1 <= len && len <= 3);
444 XXH_ASSERT(secret != NULL);
445 { xxh_u8 const c1 = input[0];
446 xxh_u8 const c2 = input[len >> 1];
447 xxh_u8 const c3 = input[len - 1];
448 xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
449 xxh_u64 const keyed = (xxh_u64)combined ^ (XXH_readLE32(secret) + seed);
450 xxh_u64 const mixed = keyed * PRIME64_1;
451 return XXH3p_avalanche(mixed);
452 }
453 }
454
455 XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_4to8_64b(const xxh_u8 * input,size_t len,const xxh_u8 * secret,XXH64_hash_t seed)456 XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
457 {
458 XXH_ASSERT(input != NULL);
459 XXH_ASSERT(secret != NULL);
460 XXH_ASSERT(4 <= len && len <= 8);
461 { xxh_u32 const input_lo = XXH_readLE32(input);
462 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
463 xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
464 xxh_u64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
465 xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
466 return XXH3p_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
467 }
468 }
469
470 XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_9to16_64b(const xxh_u8 * input,size_t len,const xxh_u8 * secret,XXH64_hash_t seed)471 XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
472 {
473 XXH_ASSERT(input != NULL);
474 XXH_ASSERT(secret != NULL);
475 XXH_ASSERT(9 <= len && len <= 16);
476 { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
477 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
478 xxh_u64 const acc = len + (input_lo + input_hi) + XXH3p_mul128_fold64(input_lo, input_hi);
479 return XXH3p_avalanche(acc);
480 }
481 }
482
483 XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_0to16_64b(const xxh_u8 * input,size_t len,const xxh_u8 * secret,XXH64_hash_t seed)484 XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
485 {
486 XXH_ASSERT(len <= 16);
487 { if (len > 8) return XXH3p_len_9to16_64b(input, len, secret, seed);
488 if (len >= 4) return XXH3p_len_4to8_64b(input, len, secret, seed);
489 if (len) return XXH3p_len_1to3_64b(input, len, secret, seed);
490 /*
491 * RocksDB modification from XXH3 preview: zero result for empty
492 * string can be problematic for multiplication-based algorithms.
493 * Return a hash of the seed instead.
494 */
495 return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
496 }
497 }
498
499
500 /* === Long Keys === */
501
502 #define STRIPE_LEN 64
503 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
504 #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
505
506 typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;
507
/*
 * Core of the long-key loop: folds one 64-byte stripe of `input` into the
 * 64-byte accumulator `acc`, using 64 bytes of `secret`.
 *
 * Per 64-bit accumulator lane (as the scalar variant at the bottom shows):
 *   key  = input ^ secret
 *   acc += lo32(key) * hi32(key)          (32x32 -> 64 multiply)
 *   acc += input                          (same lane, 64-bit mode)
 *   or: neighbor lane += input            (lane swapped, 128-bit mode)
 * The AVX2/SSE2/NEON/VSX branches are intended to compute the same result
 * with vector instructions.
 *
 * `acc` is presumed suitably aligned (asserted per branch); `input` and
 * `secret` have no alignment restriction.
 */
XXH_FORCE_INLINE void
XXH3p_accumulate_512(       void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret,
                    XXH3p_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
        const __m256i* const xinput = (const __m256i *) input;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3p_acc_128bits) {
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm256_add_epi64(product, sum);
            } else {  /* XXH3p_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm256_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
        const __m128i* const xinput = (const __m128i *) input;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXH3p_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm_add_epi64(product, sum);
            } else {  /* XXH3p_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput = (const uint8_t *) input;
        uint8_t const* const xsecret  = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *    ext    v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly
             *    zip1   v3.2s, v0.2s, v1.2s // first zip
             *    zip2   v2.2s, v0.2s, v1.2s // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1 // Interleave high and low bits and overwrite. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t data_key;

            if (accWidth == XXH3p_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }

            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));

            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64 (data_key);
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
            if (accWidth == XXH3p_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX)
          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
    U64x2 const* const xinput = (U64x2 const*) input;   /* no alignment restriction */
    U64x2 const* const xsecret = (U64x2 const*) secret;    /* no alignment restriction */
    U64x2 const v32 = { 32, 32 };
#if XXH_VSX_BE
    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xinput[i]; */
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap */
        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
        xacc[i] += product;

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3p_acc_128bits */
            /* swap high and low halves */
            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
            xacc[i] += data_swapped;
        }
    }

#else   /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xinput = (const xxh_u8*) input;  /* no alignment restriction */
    const xxh_u8* const xsecret  = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_val;
        } else {
            /* 128-bit mode: raw input goes to the paired lane (0<->1, 2<->3, ...) */
            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        }
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
#endif
}
699
700 XXH_FORCE_INLINE void
XXH3p_scrambleAcc(void * XXH_RESTRICT acc,const void * XXH_RESTRICT secret)701 XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
702 {
703 #if (XXH_VECTOR == XXH_AVX2)
704
705 XXH_ASSERT((((size_t)acc) & 31) == 0);
706 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
707 const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
708 const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
709
710 size_t i;
711 for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
712 /* xacc[i] ^= (xacc[i] >> 47) */
713 __m256i const acc_vec = xacc[i];
714 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
715 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
716 /* xacc[i] ^= xsecret; */
717 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
718 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
719
720 /* xacc[i] *= PRIME32_1; */
721 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
722 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
723 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
724 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
725 }
726 }
727
728 #elif (XXH_VECTOR == XXH_SSE2)
729
730 XXH_ASSERT((((size_t)acc) & 15) == 0);
731 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
732 const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
733 const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
734
735 size_t i;
736 for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
737 /* xacc[i] ^= (xacc[i] >> 47) */
738 __m128i const acc_vec = xacc[i];
739 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
740 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
741 /* xacc[i] ^= xsecret; */
742 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
743 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
744
745 /* xacc[i] *= PRIME32_1; */
746 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
747 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
748 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
749 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
750 }
751 }
752
753 #elif (XXH_VECTOR == XXH_NEON)
754
755 XXH_ASSERT((((size_t)acc) & 15) == 0);
756
757 { uint64x2_t* const xacc = (uint64x2_t*) acc;
758 uint8_t const* const xsecret = (uint8_t const*) secret;
759 uint32x2_t const prime = vdup_n_u32 (PRIME32_1);
760
761 size_t i;
762 for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
763 /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
764 uint64x2_t const acc_vec = xacc[i];
765 uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47);
766 uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted);
767
768 /* key_vec = xsecret[i]; */
769 uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
770 /* data_key = data_vec ^ key_vec; */
771 uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec);
772 /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
773 uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key));
774
775 /* data_key *= PRIME32_1 */
776
777 /* prod_hi = (data_key >> 32) * PRIME32_1; */
778 uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime);
779 /* xacc[i] = prod_hi << 32; */
780 xacc[i] = vshlq_n_u64(prod_hi, 32);
781 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
782 xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
783 } }
784
785 #elif (XXH_VECTOR == XXH_VSX)
786
787 U64x2* const xacc = (U64x2*) acc;
788 const U64x2* const xsecret = (const U64x2*) secret;
789 /* constants */
790 U64x2 const v32 = { 32, 32 };
791 U64x2 const v47 = { 47, 47 };
792 U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
793 size_t i;
794 #if XXH_VSX_BE
795 /* endian swap */
796 U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
797 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
798 #endif
799 for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
800 U64x2 const acc_vec = xacc[i];
801 U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
802 /* key_vec = xsecret[i]; */
803 #if XXH_VSX_BE
804 /* swap bytes words */
805 U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
806 U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
807 #else
808 U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
809 U64x2 const data_key = data_vec ^ key_vec;
810 #endif
811
812 /* data_key *= PRIME32_1 */
813
814 /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
815 U64x2 const prod_even = XXH_vec_mule((U32x4)data_key, prime);
816 /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
817 U64x2 const prod_odd = XXH_vec_mulo((U32x4)data_key, prime);
818 xacc[i] = prod_odd + (prod_even << v32);
819 }
820
821 #else /* scalar variant of Scrambler - universal */
822
823 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
824 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
825 size_t i;
826 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
827 for (i=0; i < ACC_NB; i++) {
828 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
829 xxh_u64 acc64 = xacc[i];
830 acc64 ^= acc64 >> 47;
831 acc64 ^= key64;
832 acc64 *= PRIME32_1;
833 xacc[i] = acc64;
834 }
835
836 #endif
837 }
838
839 #define XXH_PREFETCH_DIST 384
840
841 /* assumption : nbStripes will not overflow secret size */
842 XXH_FORCE_INLINE void
XXH3p_accumulate(xxh_u64 * XXH_RESTRICT acc,const xxh_u8 * XXH_RESTRICT input,const xxh_u8 * XXH_RESTRICT secret,size_t nbStripes,XXH3p_accWidth_e accWidth)843 XXH3p_accumulate( xxh_u64* XXH_RESTRICT acc,
844 const xxh_u8* XXH_RESTRICT input,
845 const xxh_u8* XXH_RESTRICT secret,
846 size_t nbStripes,
847 XXH3p_accWidth_e accWidth)
848 {
849 size_t n;
850 for (n = 0; n < nbStripes; n++ ) {
851 const xxh_u8* const in = input + n*STRIPE_LEN;
852 XXH_PREFETCH(in + XXH_PREFETCH_DIST);
853 XXH3p_accumulate_512(acc,
854 in,
855 secret + n*XXH_SECRET_CONSUME_RATE,
856 accWidth);
857 }
858 }
859
/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 * and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 * However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE`
 * Pretty much every other modes and compilers prefer `FORCE_INLINE`.
 */

#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3p_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
                      const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH3p_accWidth_e accWidth)
{
    /* Long-input core loop : consume `input` one block at a time, where a
     * block is the number of stripes the secret can feed before the
     * accumulators must be scrambled. */
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3p_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        /* re-mix accumulators after every full block, using the secret's tail */
        XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3p_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe : taken from the very end of input, so it may overlap
         * bytes already consumed above */
        if (len & (STRIPE_LEN - 1)) {
            const xxh_u8* const p = input + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
            XXH3p_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}
902
903 XXH_FORCE_INLINE xxh_u64
XXH3p_mix2Accs(const xxh_u64 * XXH_RESTRICT acc,const xxh_u8 * XXH_RESTRICT secret)904 XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
905 {
906 return XXH3p_mul128_fold64(
907 acc[0] ^ XXH_readLE64(secret),
908 acc[1] ^ XXH_readLE64(secret+8) );
909 }
910
911 static XXH64_hash_t
XXH3p_mergeAccs(const xxh_u64 * XXH_RESTRICT acc,const xxh_u8 * XXH_RESTRICT secret,xxh_u64 start)912 XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
913 {
914 xxh_u64 result64 = start;
915
916 result64 += XXH3p_mix2Accs(acc+0, secret + 0);
917 result64 += XXH3p_mix2Accs(acc+2, secret + 16);
918 result64 += XXH3p_mix2Accs(acc+4, secret + 32);
919 result64 += XXH3p_mix2Accs(acc+6, secret + 48);
920
921 return XXH3p_avalanche(result64);
922 }
923
924 #define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
925 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
926
XXH_FORCE_INLINE XXH64_hash_t
XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    /* 64-bit long-input path : accumulate all stripes, then merge. */
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
#define XXH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    /* seed the merge with a length-dependent start value */
    return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
}
941
942
/* Long-input hashing using the built-in default secret. */
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
{
    return XXH3p_hashLong_internal(input, len, kSecret, sizeof(kSecret));
}
948
/* Long-input hashing using a caller-provided secret (contract: size checked upstream). */
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    return XXH3p_hashLong_internal(input, len, secret, secretSize);
}
955
956
XXH_writeLE64(void * dst,xxh_u64 v64)957 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
958 {
959 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
960 memcpy(dst, &v64, sizeof(v64));
961 }
962
963 /* XXH3p_initCustomSecret() :
964 * destination `customSecret` is presumed allocated and same size as `kSecret`.
965 */
XXH3p_initCustomSecret(xxh_u8 * customSecret,xxh_u64 seed64)966 XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
967 {
968 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
969 int i;
970
971 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
972
973 for (i=0; i < nbRounds; i++) {
974 XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64);
975 XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
976 }
977 }
978
979
980 /* XXH3p_hashLong_64b_withSeed() :
981 * Generate a custom key,
982 * based on alteration of default kSecret with the seed,
983 * and then use this key for long mode hashing.
984 * This operation is decently fast but nonetheless costs a little bit of time.
985 * Try to avoid it whenever possible (typically when seed==0).
986 */
987 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_64b_withSeed(const xxh_u8 * input,size_t len,XXH64_hash_t seed)988 XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
989 {
990 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
991 if (seed==0) return XXH3p_hashLong_64b_defaultSecret(input, len);
992 XXH3p_initCustomSecret(secret, seed);
993 return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
994 }
995
996
XXH3p_mix16B(const xxh_u8 * XXH_RESTRICT input,const xxh_u8 * XXH_RESTRICT secret,xxh_u64 seed64)997 XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
998 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
999 {
1000 xxh_u64 const input_lo = XXH_readLE64(input);
1001 xxh_u64 const input_hi = XXH_readLE64(input+8);
1002 return XXH3p_mul128_fold64(
1003 input_lo ^ (XXH_readLE64(secret) + seed64),
1004 input_hi ^ (XXH_readLE64(secret+8) - seed64) );
1005 }
1006
1007
XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    /* Mid-size 64-bit path (17..128 bytes) : mix 16-byte chunks taken
     * symmetrically from both ends of the input, each keyed with its own
     * secret segment. The nested ifs enable 2 extra mixes per additional
     * 32 bytes of input, so chunks may overlap but all bytes are covered. */
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * PRIME64_1;   /* length-dependent starting value */
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3p_mix16B(input+48, secret+96, seed);
                    acc += XXH3p_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3p_mix16B(input+32, secret+64, seed);
                acc += XXH3p_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3p_mix16B(input+16, secret+32, seed);
            acc += XXH3p_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3p_mix16B(input+0, secret+0, seed);
        acc += XXH3p_mix16B(input+len-16, secret+16, seed);

        return XXH3p_avalanche(acc);
    }
}
1035
1036 #define XXH3p_MIDSIZE_MAX 240
1037
XXH_NO_INLINE XXH64_hash_t
XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    /* Mid-size 64-bit path (129..240 bytes) : one 16-byte mix per 16 input
     * bytes, plus a final mix over the last 16 bytes. */
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    #define XXH3p_MIDSIZE_STARTOFFSET 3
    #define XXH3p_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        /* first 8 rounds consume secret offsets 0..127 directly */
        for (i=0; i<8; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*i), seed);
        }
        /* intermediate avalanche so later rounds remix a dispersed value */
        acc = XXH3p_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
        /* remaining rounds reuse the secret from a 3-byte-shifted base */
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes : full 16-byte chunk anchored at the end of input */
        acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
        return XXH3p_avalanche(acc);
    }
}
1065
1066 /* === Public entry point === */
1067
XXH3p_64bits(const void * input,size_t len)1068 XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
1069 {
1070 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
1071 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1072 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1073 return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
1074 }
1075
1076 XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSecret(const void * input,size_t len,const void * secret,size_t secretSize)1077 XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1078 {
1079 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
1080 /* if an action must be taken should `secret` conditions not be respected,
1081 * it should be done here.
1082 * For now, it's a contract pre-condition.
1083 * Adding a check and a branch here would cost performance at every hash */
1084 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1085 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1086 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1087 return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1088 }
1089
1090 XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSeed(const void * input,size_t len,XXH64_hash_t seed)1091 XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1092 {
1093 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
1094 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1095 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1096 return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
1097 }
1098
1099 /* === XXH3 streaming === */
1100
XXH_PUBLIC_API XXH3p_state_t* XXH3p_createState(void)
{
    /* Allocate an *uninitialized* streaming state; caller must invoke one of
     * the XXH3p_*_reset* functions before use.
     * NOTE(review): presumably returns NULL on allocation failure, per the
     * usual XXH_malloc contract — confirm against its definition. */
    return (XXH3p_state_t*)XXH_malloc(sizeof(XXH3p_state_t));
}
1105
XXH_PUBLIC_API XXH_errorcode XXH3p_freeState(XXH3p_state_t* statePtr)
{
    /* Release a state obtained from XXH3p_createState(). Always succeeds. */
    XXH_free(statePtr);
    return XXH_OK;
}
1111
XXH_PUBLIC_API void
XXH3p_copyState(XXH3p_state_t* dst_state, const XXH3p_state_t* src_state)
{
    /* Shallow copy is sufficient : when `secret` points at kSecret or at the
     * state's own customSecret, the copied pointer stays valid.
     * NOTE(review): if src uses customSecret, dst->secret still points into
     * src — verify this aliasing is intended by the streaming design. */
    memcpy(dst_state, src_state, sizeof(*dst_state));
}
1117
/* Zero the state, then install the starting accumulators, seed and secret.
 * Preconditions (asserted): statePtr and secret non-NULL,
 * secretSize >= XXH3p_SECRET_SIZE_MIN. */
static void
XXH3p_64bits_reset_internal(XXH3p_state_t* statePtr,
                           XXH64_hash_t seed,
                           const xxh_u8* secret, size_t secretSize)
{
    XXH_ASSERT(statePtr != NULL);
    memset(statePtr, 0, sizeof(*statePtr));
    /* must match the XXH3p_INIT_ACC pattern used by the one-shot paths */
    statePtr->acc[0] = PRIME32_3;
    statePtr->acc[1] = PRIME64_1;
    statePtr->acc[2] = PRIME64_2;
    statePtr->acc[3] = PRIME64_3;
    statePtr->acc[4] = PRIME64_4;
    statePtr->acc[5] = PRIME32_2;
    statePtr->acc[6] = PRIME64_5;
    statePtr->acc[7] = PRIME32_1;
    statePtr->seed = seed;
    XXH_ASSERT(secret != NULL);
    statePtr->secret = secret;
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* secretLimit = last position where a full stripe of secret still fits */
    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
}
1140
XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_reset(XXH3p_state_t* statePtr)
{
    /* Reset for seedless hashing with the built-in default secret. */
    if (statePtr == NULL) return XXH_ERROR;
    XXH3p_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}
1148
1149 XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_reset_withSecret(XXH3p_state_t * statePtr,const void * secret,size_t secretSize)1150 XXH3p_64bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize)
1151 {
1152 if (statePtr == NULL) return XXH_ERROR;
1153 XXH3p_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
1154 if (secret == NULL) return XXH_ERROR;
1155 if (secretSize < XXH3p_SECRET_SIZE_MIN) return XXH_ERROR;
1156 return XXH_OK;
1157 }
1158
XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed)
{
    /* Reset for seeded hashing : derive a custom secret from the seed and
     * point the state at it (the kSecret passed to reset_internal is only
     * a placeholder, overridden on the next two lines). */
    if (statePtr == NULL) return XXH_ERROR;
    XXH3p_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3p_initCustomSecret(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}
1168
/* Feed `totalStripes` stripes into the accumulators, resuming from secret
 * position *nbStripesSoFarPtr, scrambling once when a block boundary is
 * crossed, and updating *nbStripesSoFarPtr.
 * NOTE(review): handles at most one boundary crossing per call — callers
 * appear to cap totalStripes at XXH3p_INTERNALBUFFER_STRIPES; confirm that
 * this is always <= nbStripesPerBlock. */
XXH_FORCE_INLINE void
XXH3p_consumeStripes( xxh_u64* acc,
                     XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
                     const xxh_u8* input, size_t totalStripes,
                     const xxh_u8* secret, size_t secretLimit,
                     XXH3p_accWidth_e accWidth)
{
    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
        /* need a scrambling operation */
        size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
        /* finish the current block ... */
        XXH3p_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
        XXH3p_scrambleAcc(acc, secret + secretLimit);
        /* ... then start the next one from the beginning of the secret */
        XXH3p_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
        *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
    } else {
        XXH3p_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
        *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
    }
}
1189
/* Streaming ingestion : small inputs are appended to the internal buffer;
 * larger inputs are flushed through XXH3p_consumeStripes() in whole
 * XXH3p_INTERNALBUFFER_SIZE quantities, with any tail re-buffered. */
XXH_FORCE_INLINE XXH_errorcode
XXH3p_update(XXH3p_state_t* state, const xxh_u8* input, size_t len, XXH3p_accWidth_e accWidth)
{
    if (input==NULL)
#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
        return XXH_OK;
#else
        return XXH_ERROR;
#endif

    {   const xxh_u8* const bEnd = input + len;

        state->totalLen += len;

        if (state->bufferedSize + len <= XXH3p_INTERNALBUFFER_SIZE) {  /* fits entirely : fill in tmp buffer */
            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
            state->bufferedSize += (XXH32_hash_t)len;
            return XXH_OK;
        }
        /* input now > XXH3p_INTERNALBUFFER_SIZE */

        #define XXH3p_INTERNALBUFFER_STRIPES (XXH3p_INTERNALBUFFER_SIZE / STRIPE_LEN)
        XXH_STATIC_ASSERT(XXH3p_INTERNALBUFFER_SIZE % STRIPE_LEN == 0);   /* clean multiple */

        if (state->bufferedSize) {   /* some input within internal buffer: fill then consume it */
            size_t const loadSize = XXH3p_INTERNALBUFFER_SIZE - state->bufferedSize;
            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
            input += loadSize;
            XXH3p_consumeStripes(state->acc,
                                &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3p_INTERNALBUFFER_STRIPES,
                                state->secret, state->secretLimit,
                                accWidth);
            state->bufferedSize = 0;
        }

        /* consume input by full buffer quantities */
        if (input+XXH3p_INTERNALBUFFER_SIZE <= bEnd) {
            const xxh_u8* const limit = bEnd - XXH3p_INTERNALBUFFER_SIZE;
            do {
                XXH3p_consumeStripes(state->acc,
                                    &state->nbStripesSoFar, state->nbStripesPerBlock,
                                    input, XXH3p_INTERNALBUFFER_STRIPES,
                                    state->secret, state->secretLimit,
                                    accWidth);
                input += XXH3p_INTERNALBUFFER_SIZE;
            } while (input<=limit);
        }

        if (input < bEnd) {  /* some remaining input : buffer it for the next update/digest */
            XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
            state->bufferedSize = (XXH32_hash_t)(bEnd-input);
        }
    }

    return XXH_OK;
}
1247
XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_update(XXH3p_state_t* state, const void* input, size_t len)
{
    /* Public 64-bit streaming update : thin wrapper selecting 64-bit width. */
    return XXH3p_update(state, (const xxh_u8*)input, len, XXH3p_acc_64bits);
}
1253
1254
/* Compute the final accumulators for a long-input digest, working on a local
 * copy so the caller's state can keep ingesting input afterwards. */
XXH_FORCE_INLINE void
XXH3p_digest_long (XXH64_hash_t* acc, const XXH3p_state_t* state, XXH3p_accWidth_e accWidth)
{
    memcpy(acc, state->acc, sizeof(state->acc)); /* digest locally, state remains unaltered, and can continue ingesting more input afterwards */
    if (state->bufferedSize >= STRIPE_LEN) {
        /* consume all complete stripes still sitting in the buffer */
        size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
        XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;  /* local copy: state is const */
        XXH3p_consumeStripes(acc,
                            &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, totalNbStripes,
                            state->secret, state->secretLimit,
                            accWidth);
        if (state->bufferedSize % STRIPE_LEN) {  /* one last partial stripe */
            /* final stripe is anchored at the *end* of buffered data, so it
             * overlaps bytes already consumed above */
            XXH3p_accumulate_512(acc,
                                state->buffer + state->bufferedSize - STRIPE_LEN,
                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
        }
    } else {  /* bufferedSize < STRIPE_LEN */
        if (state->bufferedSize) {  /* one last stripe */
            /* not enough fresh bytes for a stripe : prepend the tail of the
             * previous buffer contents to reconstruct a full STRIPE_LEN. */
            xxh_u8 lastStripe[STRIPE_LEN];
            size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
            memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
            memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
            XXH3p_accumulate_512(acc,
                                lastStripe,
                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
    }   }
}
1285
XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits_digest (const XXH3p_state_t* state)
{
    /* Produce the current 64-bit hash without disturbing the state. */
    if (state->totalLen > XXH3p_MIDSIZE_MAX) {
        /* long path : finalize accumulators locally, then merge */
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3p_digest_long(acc, state, XXH3p_acc_64bits);
        return XXH3p_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
    }
    /* len <= XXH3p_MIDSIZE_MAX : short code
     * (everything is still in the buffer, so reuse the one-shot paths) */
    if (state->seed)
        return XXH3p_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3p_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}
1298
1299 /* ==========================================
1300 * XXH3 128 bits (=> XXH128)
1301 * ========================================== */
1302
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* 128-bit hash of 1..3 bytes : pack the first, middle and last byte
     * plus the length into one 32-bit word; the high half uses the
     * byte-swapped packing so both lanes differ. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];   /* middle byte (== c1 or c3 for len<3) */
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24);
        xxh_u32 const combinedh = XXH_swap32(combinedl);
        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret)   + seed);
        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed);
        xxh_u64 const mixedl = keyed_lo * PRIME64_1;
        xxh_u64 const mixedh = keyed_hi * PRIME64_5;
        XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ };
        return h128;
    }
}
1322
1323
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* 128-bit hash of 4..8 bytes : read the first and last 4 bytes
     * (overlapping when len < 8), combine into one 64-bit word, and derive
     * the second lane from its byte-swap. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const input_64_hi = XXH_swap64(input_64_lo);
        xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret)     + seed);
        xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed);
        /* xorshift-multiply rounds; the length is folded in additively
         * (low lane) and subtractively (high lane) */
        xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1);
        xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
        xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len;
        xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
        {   XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ };
            return h128;
    }   }
}
1344
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* 128-bit hash of 9..16 bytes : key the first and last 8 bytes
     * (overlapping when len < 16), then widen through two 64x64->128
     * multiplies. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXH_readLE64(input)           ^ (XXH_readLE64(secret)   + seed);
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
        xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5);  /* fold length in */
        m128.low64 += lenContrib;
        m128.high64 += input_hi * PRIME64_1;
        /* cross-feed high bits into the low lane before the second multiply */
        m128.low64 ^= (m128.high64 >> 32);
        {   XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
            h128.high64 += m128.high64 * PRIME64_2;
            h128.low64  = XXH3p_avalanche(h128.low64);
            h128.high64 = XXH3p_avalanche(h128.high64);
            return h128;
    }   }
}
1365
1366 /* Assumption : `secret` size is >= 16
1367 * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
1368 XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_0to16_128b(const xxh_u8 * input,size_t len,const xxh_u8 * secret,XXH64_hash_t seed)1369 XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1370 {
1371 XXH_ASSERT(len <= 16);
1372 { if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed);
1373 if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed);
1374 if (len) return XXH3p_len_1to3_128b(input, len, secret, seed);
1375 { XXH128_hash_t const h128 = { 0, 0 };
1376 return h128;
1377 } }
1378 }
1379
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    /* 128-bit long-input path : same accumulation as the 64-bit path (but
     * in 128-bit width), then two independent merges — one keyed from the
     * start of the secret, one from its end — form the two output lanes. */
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   xxh_u64 const low64  = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
        xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
        XXH128_hash_t const h128 = { low64, high64 };
        return h128;
    }
}
1397
/* 128-bit long-input hashing with the built-in default secret. */
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
{
    return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
}
1403
/* 128-bit long-input hashing with a caller-provided secret. */
XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
                              const xxh_u8* secret, size_t secretSize)
{
    return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
}
1410
1411 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_128b_withSeed(const xxh_u8 * input,size_t len,XXH64_hash_t seed)1412 XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
1413 {
1414 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
1415 if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
1416 XXH3p_initCustomSecret(secret, seed);
1417 return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
1418 }
1419
1420
/* Fold two 16-byte input lanes into the running 128-bit accumulator.
 * Each lane's mix feeds one half, while the *other* lane's raw bytes are
 * xored in as a plain sum, cross-linking the two halves. */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3p_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}
1430
XXH_NO_INLINE XXH128_hash_t
XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    /* Mid-size 128-bit path (129..240 bytes) : one 32-byte mix per 32 input
     * bytes, plus a final mix over the last 32 bytes with a negated seed. */
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        /* first 4 rounds consume secret offsets 0..127 directly */
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
        }
        /* intermediate avalanche so later rounds remix dispersed values */
        acc.low64 = XXH3p_avalanche(acc.low64);
        acc.high64 = XXH3p_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        /* remaining rounds reuse the secret from a 3-byte-shifted base */
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
        }
        /* last bytes : the two 16-byte lanes are taken end-first, with the seed negated */
        acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);

        {   xxh_u64 const low64  = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}
1463
1464
/* 128-bit hash for short-to-medium inputs (17..128 bytes).
 * Mixes symmetric pairs of 16-byte lanes taken from the start and the end
 * of the input (they may overlap), with deeper pairs only for longer
 * inputs, then folds the accumulator into the final 128-bit value. */
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        /* nested thresholds: each extra 32 bytes of input adds one more
         * symmetric pair, consumed innermost (deepest pair) first */
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        /* always mix the outermost pair: first 16 and last 16 bytes */
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        /* final fold: combine both halves into the 128-bit result */
        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}
1493
XXH3p_128bits(const void * input,size_t len)1494 XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
1495 {
1496 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
1497 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1498 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1499 return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
1500 }
1501
1502 XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSecret(const void * input,size_t len,const void * secret,size_t secretSize)1503 XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1504 {
1505 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
1506 /* if an action must be taken should `secret` conditions not be respected,
1507 * it should be done here.
1508 * For now, it's a contract pre-condition.
1509 * Adding a check and a branch here would cost performance at every hash */
1510 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1511 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1512 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1513 return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1514 }
1515
1516 XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSeed(const void * input,size_t len,XXH64_hash_t seed)1517 XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1518 {
1519 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
1520 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1521 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1522 return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
1523 }
1524
1525 XXH_PUBLIC_API XXH128_hash_t
XXH128(const void * input,size_t len,XXH64_hash_t seed)1526 XXH128(const void* input, size_t len, XXH64_hash_t seed)
1527 {
1528 return XXH3p_128bits_withSeed(input, len, seed);
1529 }
1530
1531
1532 /* === XXH3 128-bit streaming === */
1533
1534 /* all the functions are actually the same as for 64-bit streaming variant,
1535 just the reset one is different (different initial acc values for 0,5,6,7),
1536 and near the end of the digest function */
1537
1538 static void
XXH3p_128bits_reset_internal(XXH3p_state_t * statePtr,XXH64_hash_t seed,const xxh_u8 * secret,size_t secretSize)1539 XXH3p_128bits_reset_internal(XXH3p_state_t* statePtr,
1540 XXH64_hash_t seed,
1541 const xxh_u8* secret, size_t secretSize)
1542 {
1543 XXH3p_64bits_reset_internal(statePtr, seed, secret, secretSize);
1544 }
1545
1546 XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_reset(XXH3p_state_t * statePtr)1547 XXH3p_128bits_reset(XXH3p_state_t* statePtr)
1548 {
1549 if (statePtr == NULL) return XXH_ERROR;
1550 XXH3p_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
1551 return XXH_OK;
1552 }
1553
1554 XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_reset_withSecret(XXH3p_state_t * statePtr,const void * secret,size_t secretSize)1555 XXH3p_128bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize)
1556 {
1557 if (statePtr == NULL) return XXH_ERROR;
1558 XXH3p_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
1559 if (secret == NULL) return XXH_ERROR;
1560 if (secretSize < XXH3p_SECRET_SIZE_MIN) return XXH_ERROR;
1561 return XXH_OK;
1562 }
1563
1564 XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_reset_withSeed(XXH3p_state_t * statePtr,XXH64_hash_t seed)1565 XXH3p_128bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed)
1566 {
1567 if (statePtr == NULL) return XXH_ERROR;
1568 XXH3p_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
1569 XXH3p_initCustomSecret(statePtr->customSecret, seed);
1570 statePtr->secret = statePtr->customSecret;
1571 return XXH_OK;
1572 }
1573
1574 XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_update(XXH3p_state_t * state,const void * input,size_t len)1575 XXH3p_128bits_update(XXH3p_state_t* state, const void* input, size_t len)
1576 {
1577 return XXH3p_update(state, (const xxh_u8*)input, len, XXH3p_acc_128bits);
1578 }
1579
/* Produce the 128-bit hash of everything fed to the stream so far.
 * Long inputs (> XXH3p_MIDSIZE_MAX total) finish the striped accumulator
 * and merge it twice with different secret offsets (one per output half);
 * shorter inputs are still fully buffered and simply re-hashed with the
 * one-shot routines. */
XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits_digest (const XXH3p_state_t* state)
{
    if (state->totalLen > XXH3p_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3p_digest_long(acc, state, XXH3p_acc_128bits);
        /* both merge reads (start-offset and end-offset) must fit in the secret */
        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        /* low half merges with the secret's head region; high half with its
         * tail region, using a complemented length-based starting value */
        { xxh_u64 const low64 = XXH3p_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
          xxh_u64 const high64 = XXH3p_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * PRIME64_2));
          XXH128_hash_t const h128 = { low64, high64 };
          return h128;
        }
    }
    /* len <= XXH3p_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3p_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3p_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}
1597
1598 /* 128-bit utility functions */
1599
1600 #include <string.h> /* memcmp */
1601
1602 /* return : 1 is equal, 0 if different */
XXH128_isEqual(XXH128_hash_t h1,XXH128_hash_t h2)1603 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
1604 {
1605 /* note : XXH128_hash_t is compact, it has no padding byte */
1606 return !(memcmp(&h1, &h2, sizeof(h1)));
1607 }
1608
1609 /* This prototype is compatible with stdlib's qsort().
1610 * return : >0 if *h128_1 > *h128_2
1611 * <0 if *h128_1 < *h128_2
1612 * =0 if *h128_1 == *h128_2 */
XXH128_cmp(const void * h128_1,const void * h128_2)1613 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
1614 {
1615 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
1616 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
1617 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
1618 /* note : bets that, in most cases, hash values are different */
1619 if (hcmp) return hcmp;
1620 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
1621 }
1622
1623
1624 /*====== Canonical representation ======*/
1625 XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t * dst,XXH128_hash_t hash)1626 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
1627 {
1628 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
1629 if (XXH_CPU_LITTLE_ENDIAN) {
1630 hash.high64 = XXH_swap64(hash.high64);
1631 hash.low64 = XXH_swap64(hash.low64);
1632 }
1633 memcpy(dst, &hash.high64, sizeof(hash.high64));
1634 memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
1635 }
1636
1637 XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t * src)1638 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
1639 {
1640 XXH128_hash_t h;
1641 h.high64 = XXH_readBE64(src);
1642 h.low64 = XXH_readBE64(src->digest + 8);
1643 return h;
1644 }
1645
1646
1647
1648 #endif /* XXH3p_H */
1649