//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
/*
   xxHash - Extremely Fast Hash algorithm
   Development source file for `xxh3`
   Copyright (C) 2019-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at:
   - xxHash source repository: https://github.com/Cyan4973/xxHash
*/

/* RocksDB Note: This file contains a preview release (xxhash repository
   version 0.7.2) of XXH3 that is unlikely to be compatible with the final
   version of XXH3. We have therefore renamed this XXH3p ("preview") for
   clarity, so that we can continue to use this version even after
   integrating a newer, incompatible version.
*/

/* Note:
   This file is separated for development purposes.
   It will be integrated into `xxhash.c` when the development phase is complete.
*/

#ifndef XXH3p_H
#define XXH3p_H


/* ===   Dependencies   === */

#undef XXH_INLINE_ALL   /* in case it's already defined */
#define XXH_INLINE_ALL
#include "xxhash.h"


/* ===   Compiler specifics   === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXH_RESTRICT   restrict
#else
/* note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
#  define XXH_RESTRICT   /* disable */
#endif

#if defined(__GNUC__)
#  if defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#    define inline __inline__  /* clang bug */
#    include <arm_neon.h>
#    undef inline
#  endif
#elif defined(_MSC_VER)
#  include <intrin.h>
#endif

/*
 * Sanity check.
 *
 * XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *
 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
 * classic 16-bit only subset of ARM's instruction set.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply. This means numerous __aeabi_lmul
 * calls.
 *
 * Second of all, the 8 functional registers are just not enough.
 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
 * Lo registers, and this shuffling results in thousands more MOVs than A32.
 *
 * A32 and T32 don't have this limitation. They can access all 14 registers,
 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
 *
 * If compiling Thumb-1 for a target which supports ARM instructions, we
 * will give a warning.
 *
 * Usually, if this happens, it is because of an accident and you probably
 * need to specify -march, as you probably meant to compile for a newer
 * architecture.
 */
#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
#endif

/* ==========================================
 * Vectorization detection
 * ========================================== */
#define XXH_SCALAR 0
#define XXH_SSE2   1
#define XXH_AVX2   2
#define XXH_NEON   3
#define XXH_VSX    4

#ifndef XXH_VECTOR    /* can be defined on command line */
#  if defined(__AVX2__)
#    define XXH_VECTOR XXH_AVX2
#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#    define XXH_VECTOR XXH_SSE2
#  elif defined(__GNUC__) /* msvc support maybe later */ \
  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
#    define XXH_VECTOR XXH_NEON
#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
#    define XXH_VECTOR XXH_VSX
#  else
#    define XXH_VECTOR XXH_SCALAR
#  endif
#endif
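
/* For example (illustrative), the detection above can be overridden on the
 * command line, e.g. `-DXXH_VECTOR=XXH_SCALAR` (or `-DXXH_VECTOR=0`) to force
 * the portable scalar code path; any of the constants above works the same
 * way. */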

/* control alignment of accumulator,
 * for compatibility with fast vector loads */
#ifndef XXH_ACC_ALIGN
#  if XXH_VECTOR == 0   /* scalar */
#     define XXH_ACC_ALIGN 8
#  elif XXH_VECTOR == 1  /* sse2 */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == 2  /* avx2 */
#     define XXH_ACC_ALIGN 32
#  elif XXH_VECTOR == 3  /* neon */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == 4  /* vsx */
#     define XXH_ACC_ALIGN 16
#  endif
#endif

/* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u32 b) { return (xxh_u64)a * (xxh_u64)b; } */
#if defined(_MSC_VER) && defined(_M_IX86)
#    include <intrin.h>
#    define XXH_mult32to64(x, y) __emulu(x, y)
#else
#    define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
#endif

/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
 * there is a lot of mischief with endianness. */
#if XXH_VECTOR == XXH_VSX
#  include <altivec.h>
#  undef vector
typedef __vector unsigned long long U64x2;
typedef __vector unsigned char U8x16;
typedef __vector unsigned U32x4;

#ifndef XXH_VSX_BE
#  if defined(__BIG_ENDIAN__) \
  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXH_VSX_BE 1
#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
#    warning "-maltivec=be is not recommended. Please use native endianness."
#    define XXH_VSX_BE 1
#  else
#    define XXH_VSX_BE 0
#  endif
#endif

/* We need some helpers for big endian mode. */
#if XXH_VSX_BE
/* A wrapper for POWER9's vec_revb. */
#  ifdef __POWER9_VECTOR__
#    define XXH_vec_revb vec_revb
#  else
XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
{
    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
#  endif

/* Power8 Crypto gives us vpermxor which is very handy for
 * PPC64EB.
 *
 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
 * {
 *     U8x16 ret;
 *     for (int i = 0; i < 16; i++) {
 *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
 *     }
 *     return ret;
 * }
 *
 * Because both of the main loops load the key, swap, and xor it with input,
 * we can combine the key swap into this instruction.
 */
#  ifdef vec_permxor
#    define XXH_vec_permxor vec_permxor
#  else
#    define XXH_vec_permxor __builtin_crypto_vpermxor
#  endif
#endif  /* XXH_VSX_BE */
/*
 * Because we reinterpret the multiply, there are endianness quirks:
 * vec_mulo actually becomes vec_mule.
 *
 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
 * Clang has an easy way to control this: we can just use the builtin, which doesn't swap.
 * GCC needs inline assembly. */
#ifndef __has_builtin
#  define __has_builtin(x) 0   /* GCC < 10 doesn't define __has_builtin; fall back to the inline asm below */
#endif
#if __has_builtin(__builtin_altivec_vmuleuw)
#  define XXH_vec_mulo __builtin_altivec_vmulouw
#  define XXH_vec_mule __builtin_altivec_vmuleuw
#else
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
#endif /* XXH_VECTOR == XXH_VSX */

/* prefetch
 * can be disabled by declaring the XXH_NO_PREFETCH build macro */
#if defined(XXH_NO_PREFETCH)
#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
#else
#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() is not defined outside of x86/x64 */
#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#  else
#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
#  endif
#endif  /* XXH_NO_PREFETCH */


/* ==========================================
 * XXH3 default settings
 * ========================================== */

#define XXH_SECRET_DEFAULT_SIZE 192   /* must be >= XXH3p_SECRET_SIZE_MIN */

#if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
#  error "default keyset is not large enough"
#endif

XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};

/*
 * GCC for x86 has a tendency to use SSE in this routine. While it
 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
 * slows it down because instead of free register swap shifts, it
 * must use pshufd and punpckl/hd.
 *
 * To prevent this, we use this attribute to shut off SSE.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * On some 32-bit platforms, Clang (and Emscripten) define this type
     * despite not having the arithmetic for it. This results in a laggy
     * compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
#if (defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__)) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t const r128 = { product_low, product_high };
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown
     * below with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
     *     ---------
     *         2 7 | // D2 cross  = (15 / 10) + (45 % 10) + 21
     *     + 6 7 | | // D2 upper  = (27 / 10) + (45 / 10) + 63
     *     ---------
     *       6 9 7 5
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
     *     UINT64_MAX. This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARMv6+ A32/T32, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication,
     *     and allows this to be calculated in only 4 instructions which
     *     is comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be
     *     a couple of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128 = { lower, upper };
    return r128;
#endif
}

/*
 * We want to keep the attribute here because a target switch
 * disables inlining.
 *
 * Does a 64-bit to 128-bit multiply, then XOR folds it.
 * The reason for the separate function is to prevent passing
 * too many structs around by value. This will hopefully inline
 * the multiply, but we don't force it.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static xxh_u64
XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}
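
/* Illustrative sanity check (not part of the library; assumes <assert.h>):
 * with lhs = 2^64 - 1 and rhs = 2, the 128-bit product is 2^65 - 2, i.e.
 * high64 == 1 and low64 == 0xFFFFFFFFFFFFFFFE, so the XOR fold yields
 * all-ones:
 *
 *     XXH128_hash_t const p = XXH_mult64to128(~0ULL, 2);
 *     assert(p.high64 == 1 && p.low64 == 0xFFFFFFFFFFFFFFFEULL);
 *     assert(XXH3p_mul128_fold64(~0ULL, 2) == 0xFFFFFFFFFFFFFFFFULL);
 */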


static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}


/* ==========================================
 * Short keys
 * ========================================== */

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32  const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
        xxh_u64  const keyed = (xxh_u64)combined ^ (XXH_readLE32(secret) + seed);
        xxh_u64  const mixed = keyed * PRIME64_1;
        return XXH3p_avalanche(mixed);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
        xxh_u64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
        return XXH3p_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXH_readLE64(input)           ^ (XXH_readLE64(secret)     + seed);
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
        xxh_u64 const acc = len + (input_lo + input_hi) + XXH3p_mul128_fold64(input_lo, input_hi);
        return XXH3p_avalanche(acc);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3p_len_9to16_64b(input, len, secret, seed);
        if (len >= 4) return XXH3p_len_4to8_64b(input, len, secret, seed);
        if (len) return XXH3p_len_1to3_64b(input, len, secret, seed);
        /*
         * RocksDB modification from XXH3 preview: zero result for empty
         * string can be problematic for multiplication-based algorithms.
         * Return a hash of the seed instead.
         */
        return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
    }
}


/* ===    Long Keys    === */

#define STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))

typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;

XXH_FORCE_INLINE void
XXH3p_accumulate_512(      void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret,
                    XXH3p_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
        const         __m256i* const xinput = (const __m256i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3p_acc_128bits) {
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i]  = _mm256_add_epi64(product, sum);
            } else {  /* XXH3p_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i]  = _mm256_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;
        const         __m128i* const xinput = (const __m128i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXH3p_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
                xacc[i]  = _mm_add_epi64(product, sum);
            } else {  /* XXH3p_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i]  = _mm_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput = (const uint8_t *) input;
        uint8_t const* const xsecret  = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t       data_key;

            if (accWidth == XXH3p_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }

            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));

            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t const data_key    = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64  (data_key);
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
            if (accWidth == XXH3p_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX)
          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
    U64x2 const* const xinput = (U64x2 const*) input;   /* no alignment restriction */
    U64x2 const* const xsecret  = (U64x2 const*) secret;    /* no alignment restriction */
    U64x2 const v32 = { 32,  32 };
#if XXH_VSX_BE
    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xinput[i]; */
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap */
        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
        xacc[i] += product;

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3p_acc_128bits */
            /* swap high and low halves */
            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
            xacc[i] += data_swapped;
        }
    }

#else   /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xinput = (const xxh_u8*) input;  /* no alignment restriction */
    const xxh_u8* const xsecret  = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_val;
        } else {
            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        }
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
#endif
}

XXH_FORCE_INLINE void
XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec     = xacc[i];
            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec     = xacc[i];
            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
        uint8_t const* const xsecret = (uint8_t const*) secret;
        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const   acc_vec  = xacc[i];
            uint64x2_t const   shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const   data_vec = veorq_u64   (acc_vec, shifted);

            /* key_vec  = xsecret[i]; */
            uint32x4_t const   key_vec  = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const   data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32    (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const   prod_hi = vmull_u32    (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }

#elif (XXH_VECTOR == XXH_VSX)

          U64x2* const xacc =       (U64x2*) acc;
    const U64x2* const xsecret = (const U64x2*) secret;
    /* constants */
    U64x2 const v32  = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;
#if XXH_VSX_BE
    /* endian swap */
    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec  = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* swap the secret bytes, as in the accumulator above */
        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif

        /* data_key *= PRIME32_1 */

        /* prod_even = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF);  */
        U64x2 const prod_even  = XXH_vec_mule((U32x4)data_key, prime);
        /* prod_odd = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
        U64x2 const prod_odd  = XXH_vec_mulo((U32x4)data_key, prime);
        xacc[i] = prod_odd + (prod_even << v32);
    }

#else   /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
        xxh_u64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

#define XXH_PREFETCH_DIST 384

/* assumption : nbStripes will not overflow secret size */
XXH_FORCE_INLINE void
XXH3p_accumulate(       xxh_u64* XXH_RESTRICT acc,
                const xxh_u8* XXH_RESTRICT input,
                const xxh_u8* XXH_RESTRICT secret,
                      size_t nbStripes,
                      XXH3p_accWidth_e accWidth)
{
    size_t n;
    for (n = 0; n < nbStripes; n++ ) {
        const xxh_u8* const in = input + n*STRIPE_LEN;
        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
        XXH3p_accumulate_512(acc,
                            in,
                            secret + n*XXH_SECRET_CONSUME_RATE,
                            accWidth);
    }
}

/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 *        However, it auto-vectorizes AVX2 better if it is `FORCE_INLINE`.
 *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
 */

#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3p_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
                      const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3p_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3p_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3p_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const xxh_u8* const p = input + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
            XXH3p_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}
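
/* Worked example (illustrative): with the default 192-byte kSecret,
 * nb_rounds = (192 - 64) / 8 = 16 stripes per block and block_len = 1024.
 * A 10000-byte input thus runs 9 full blocks (9216 bytes), scrambling the
 * accumulators after each, then 12 stripes of the final partial block, and
 * one last stripe over bytes [9936, 10000), which deliberately overlaps
 * the preceding stripes. */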

XXH_FORCE_INLINE xxh_u64
XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
{
    return XXH3p_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

static XXH64_hash_t
XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;

    result64 += XXH3p_mix2Accs(acc+0, secret +  0);
    result64 += XXH3p_mix2Accs(acc+2, secret + 16);
    result64 += XXH3p_mix2Accs(acc+4, secret + 32);
    result64 += XXH3p_mix2Accs(acc+6, secret + 48);

    return XXH3p_avalanche(result64);
}

#define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
#define XXH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
}


XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
{
    return XXH3p_hashLong_internal(input, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    return XXH3p_hashLong_internal(input, len, secret, secretSize);
}


XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* XXH3p_initCustomSecret() :
 * destination `customSecret` is presumed allocated and same size as `kSecret`.
 */
XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
{
    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
    int i;

    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

    for (i=0; i < nbRounds; i++) {
        XXH_writeLE64(customSecret + 16*i,     XXH_readLE64(kSecret + 16*i)     + seed64);
        XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
    }
}
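
/* Illustrative: with XXH_SECRET_DEFAULT_SIZE == 192 this runs 12 rounds, so
 * customSecret[0..8) holds readLE64(kSecret) + seed64, customSecret[8..16)
 * holds readLE64(kSecret+8) - seed64, and so on, alternating +seed64 and
 * -seed64 every 8 bytes across the whole buffer. */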


/* XXH3p_hashLong_64b_withSeed() :
 * Generate a custom key,
 * based on alteration of default kSecret with the seed,
 * and then use this key for long mode hashing.
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 */
XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed==0) return XXH3p_hashLong_64b_defaultSecret(input, len);
    XXH3p_initCustomSecret(secret, seed);
    return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
                                 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
    xxh_u64 const input_lo = XXH_readLE64(input);
    xxh_u64 const input_hi = XXH_readLE64(input+8);
    return XXH3p_mul128_fold64(
               input_lo ^ (XXH_readLE64(secret)   + seed64),
               input_hi ^ (XXH_readLE64(secret+8) - seed64) );
}


XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3p_mix16B(input+48, secret+96, seed);
                    acc += XXH3p_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3p_mix16B(input+32, secret+64, seed);
                acc += XXH3p_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3p_mix16B(input+16, secret+32, seed);
            acc += XXH3p_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3p_mix16B(input+0, secret+0, seed);
        acc += XXH3p_mix16B(input+len-16, secret+16, seed);

        return XXH3p_avalanche(acc);
    }
}
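
/* Worked example (illustrative): for len == 100 every branch above is taken,
 * so eight overlapping 16-byte windows get mixed, pairing both ends inward:
 * input[0..16) and input[84..100), input[16..32) and input[68..84),
 * input[32..48) and input[52..68), input[48..64) and input[36..52),
 * each against its own 16 bytes of secret. */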

#define XXH3p_MIDSIZE_MAX 240

XXH_NO_INLINE XXH64_hash_t
XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    #define XXH3p_MIDSIZE_STARTOFFSET 3
    #define XXH3p_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        for (i=0; i<8; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*i), seed);
        }
        acc = XXH3p_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
        return XXH3p_avalanche(acc);
    }
}
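
/* Worked example (illustrative): for len == 240, nbRounds == 15. The first
 * 8 rounds mix input[0..128) against secret[0..128) and avalanche the
 * accumulator; rounds 8..14 mix input[128..240) against the secret starting
 * at XXH3p_MIDSIZE_STARTOFFSET; the final mix re-covers the last 16 bytes
 * with a different secret offset before the closing avalanche. */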

/* ===   Public entry point   === */

XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
     if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
     if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
     if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
     return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
}
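
/* Example use of the one-shot API (illustrative sketch):
 *
 *     const char data[] = "hello world";
 *     XXH64_hash_t const h  = XXH3p_64bits(data, sizeof(data) - 1);
 *     XXH64_hash_t const hs = XXH3p_64bits_withSeed(data, sizeof(data) - 1, 42);
 */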

/* ===   XXH3 streaming   === */

XXH_PUBLIC_API XXH3p_state_t* XXH3p_createState(void)
{
    return (XXH3p_state_t*)XXH_malloc(sizeof(XXH3p_state_t));
}

XXH_PUBLIC_API XXH_errorcode XXH3p_freeState(XXH3p_state_t* statePtr)
{
    XXH_free(statePtr);
    return XXH_OK;
}

XXH_PUBLIC_API void
XXH3p_copyState(XXH3p_state_t* dst_state, const XXH3p_state_t* src_state)
{
    memcpy(dst_state, src_state, sizeof(*dst_state));
}

static void
XXH3p_64bits_reset_internal(XXH3p_state_t* statePtr,
                           XXH64_hash_t seed,
                           const xxh_u8* secret, size_t secretSize)
{
    XXH_ASSERT(statePtr != NULL);
    memset(statePtr, 0, sizeof(*statePtr));
    statePtr->acc[0] = PRIME32_3;
    statePtr->acc[1] = PRIME64_1;
    statePtr->acc[2] = PRIME64_2;
    statePtr->acc[3] = PRIME64_3;
    statePtr->acc[4] = PRIME64_4;
    statePtr->acc[5] = PRIME32_2;
    statePtr->acc[6] = PRIME64_5;
    statePtr->acc[7] = PRIME32_1;
    statePtr->seed = seed;
    XXH_ASSERT(secret != NULL);
    statePtr->secret = secret;
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_reset(XXH3p_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3p_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize)
{
    if (statePtr == NULL) return XXH_ERROR;
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3p_SECRET_SIZE_MIN) return XXH_ERROR;
    XXH3p_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3p_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3p_initCustomSecret(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}

XXH_FORCE_INLINE void
XXH3p_consumeStripes( xxh_u64* acc,
                    XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
                    const xxh_u8* input, size_t totalStripes,
                    const xxh_u8* secret, size_t secretLimit,
                    XXH3p_accWidth_e accWidth)
{
    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
        /* need a scrambling operation */
        size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
        XXH3p_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
        XXH3p_scrambleAcc(acc, secret + secretLimit);
        XXH3p_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
        *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
    } else {
        XXH3p_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
        *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
    }
}
1189 
1190 XXH_FORCE_INLINE XXH_errorcode
XXH3p_update(XXH3p_state_t * state,const xxh_u8 * input,size_t len,XXH3p_accWidth_e accWidth)1191 XXH3p_update(XXH3p_state_t* state, const xxh_u8* input, size_t len, XXH3p_accWidth_e accWidth)
1192 {
1193     if (input==NULL)
1194 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1195         return XXH_OK;
1196 #else
1197         return XXH_ERROR;
1198 #endif
1199 
1200     {   const xxh_u8* const bEnd = input + len;
1201 
1202         state->totalLen += len;
1203 
1204         if (state->bufferedSize + len <= XXH3p_INTERNALBUFFER_SIZE) {  /* fill in tmp buffer */
1205             XXH_memcpy(state->buffer + state->bufferedSize, input, len);
1206             state->bufferedSize += (XXH32_hash_t)len;
1207             return XXH_OK;
1208         }
1209         /* input now > XXH3p_INTERNALBUFFER_SIZE */
1210 
1211         #define XXH3p_INTERNALBUFFER_STRIPES (XXH3p_INTERNALBUFFER_SIZE / STRIPE_LEN)
1212         XXH_STATIC_ASSERT(XXH3p_INTERNALBUFFER_SIZE % STRIPE_LEN == 0);   /* clean multiple */
1213 
1214         if (state->bufferedSize) {   /* some input within internal buffer: fill then consume it */
1215             size_t const loadSize = XXH3p_INTERNALBUFFER_SIZE - state->bufferedSize;
1216             XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
1217             input += loadSize;
1218             XXH3p_consumeStripes(state->acc,
1219                                &state->nbStripesSoFar, state->nbStripesPerBlock,
1220                                 state->buffer, XXH3p_INTERNALBUFFER_STRIPES,
1221                                 state->secret, state->secretLimit,
1222                                 accWidth);
1223             state->bufferedSize = 0;
1224         }
1225 
1226         /* consume input by full buffer quantities */
1227         if (input+XXH3p_INTERNALBUFFER_SIZE <= bEnd) {
1228             const xxh_u8* const limit = bEnd - XXH3p_INTERNALBUFFER_SIZE;
1229             do {
1230                 XXH3p_consumeStripes(state->acc,
1231                                    &state->nbStripesSoFar, state->nbStripesPerBlock,
1232                                     input, XXH3p_INTERNALBUFFER_STRIPES,
1233                                     state->secret, state->secretLimit,
1234                                     accWidth);
1235                 input += XXH3p_INTERNALBUFFER_SIZE;
1236             } while (input<=limit);
1237         }
1238 
        if (input < bEnd) { /* some remaining input: buffer it */
            XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
            state->bufferedSize = (XXH32_hash_t)(bEnd-input);
        }
    }

    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_64bits_update(XXH3p_state_t* state, const void* input, size_t len)
{
    return XXH3p_update(state, (const xxh_u8*)input, len, XXH3p_acc_64bits);
}
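
/* Illustrative sketch (not part of the original xxHash sources): one way a
 * caller might drive the 64-bit streaming API above. The 64 KB chunk size is
 * an arbitrary choice for the example; any split of the input produces the
 * same digest as the one-shot XXH3p_64bits(). Kept under `#if 0` so it does
 * not affect compilation. */
#if 0
static XXH64_hash_t
example_streaming_hash64(const void* data, size_t len)
{
    XXH3p_state_t state;
    const xxh_u8* p = (const xxh_u8*)data;
    if (XXH3p_64bits_reset(&state) != XXH_OK) return 0;
    while (len > 0) {   /* feed input in chunks of at most 64 KB */
        size_t const chunk = (len < 65536) ? len : 65536;
        if (XXH3p_64bits_update(&state, p, chunk) != XXH_OK) return 0;
        p += chunk;
        len -= chunk;
    }
    return XXH3p_64bits_digest(&state);
}
#endif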


XXH_FORCE_INLINE void
XXH3p_digest_long (XXH64_hash_t* acc, const XXH3p_state_t* state, XXH3p_accWidth_e accWidth)
{
    memcpy(acc, state->acc, sizeof(state->acc));  /* digest locally, state remains unaltered, and can continue ingesting more input afterwards */
    if (state->bufferedSize >= STRIPE_LEN) {
        size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
        XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;
        XXH3p_consumeStripes(acc,
                           &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, totalNbStripes,
                            state->secret, state->secretLimit,
                            accWidth);
        if (state->bufferedSize % STRIPE_LEN) {  /* one last partial stripe */
            XXH3p_accumulate_512(acc,
                                state->buffer + state->bufferedSize - STRIPE_LEN,
                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
        }
    } else {  /* bufferedSize < STRIPE_LEN */
        if (state->bufferedSize) { /* one last stripe */
            xxh_u8 lastStripe[STRIPE_LEN];
            size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
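            /* Rebuild one full stripe: the `catchupSize` bytes still sitting
             * at the end of the internal buffer (left over from the last time
             * the buffer was filled) are prepended to the `bufferedSize` fresh
             * bytes, so XXH3p_accumulate_512 always sees STRIPE_LEN bytes. */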
            memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
            memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
            XXH3p_accumulate_512(acc,
                                lastStripe,
                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
                                accWidth);
    }   }
}

XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits_digest (const XXH3p_state_t* state)
{
    if (state->totalLen > XXH3p_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3p_digest_long(acc, state, XXH3p_acc_64bits);
        return XXH3p_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
    }
    /* len <= XXH3p_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3p_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3p_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}

/* ==========================================
 * XXH3 128 bits (=> XXH128)
 * ========================================== */

XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32  const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24);
        xxh_u32  const combinedh = XXH_swap32(combinedl);
        xxh_u64  const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret)   + seed);
        xxh_u64  const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed);
        xxh_u64  const mixedl = keyed_lo * PRIME64_1;
        xxh_u64  const mixedh = keyed_hi * PRIME64_5;
        XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ };
        return h128;
    }
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const input_64_hi = XXH_swap64(input_64_lo);
        xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed);
        xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1);
        xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
        xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len;
        xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
        {   XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ };
            return h128;
    }   }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
        xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5);
        m128.low64 += lenContrib;
        m128.high64 += input_hi * PRIME64_1;
        m128.low64  ^= (m128.high64 >> 32);
        {   XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
            h128.high64 += m128.high64 * PRIME64_2;
            h128.low64   = XXH3p_avalanche(h128.low64);
            h128.high64  = XXH3p_avalanche(h128.high64);
            return h128;
    }   }
}

/* Assumption : `secret` size is >= 16
 * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3p_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t const h128 = { 0, 0 };
            return h128;
    }   }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   xxh_u64 const low64 = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
        xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
        XXH128_hash_t const h128 = { low64, high64 };
        return h128;
    }
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
{
    return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
                              const xxh_u8* secret, size_t secretSize)
{
    return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
    XXH3p_initCustomSecret(secret, seed);
    return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3p_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}

XXH_NO_INLINE XXH128_hash_t
XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
        }
        acc.low64 = XXH3p_avalanche(acc.low64);
        acc.high64 = XXH3p_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);

        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}

XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
}

XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
     if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
     if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
     if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
     return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}

XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
}

XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3p_128bits_withSeed(input, len, seed);
}
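
/* Illustrative sketch (not part of the original xxHash sources): one-shot
 * 128-bit hashing through the XXH128 convenience wrapper. The message and
 * seed below are arbitrary example values. Kept under `#if 0` so it does not
 * affect compilation. */
#if 0
static void example_oneshot_hash128(void)
{
    static const char msg[] = "an example message";
    XXH64_hash_t const seed = 0x9E3779B185EBCA87ULL;  /* any 64-bit value works */
    XXH128_hash_t const h = XXH128(msg, sizeof(msg)-1, seed);
    /* h.low64 and h.high64 together form the 128-bit digest */
    (void)h;
}
#endif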


/* ===   XXH3 128-bit streaming   === */

/* All the functions mirror the 64-bit streaming variant: the reset is routed
   through the same internal initializer, and only the end of the digest
   function differs, merging the accumulators into a 128-bit result. */

static void
XXH3p_128bits_reset_internal(XXH3p_state_t* statePtr,
                           XXH64_hash_t seed,
                           const xxh_u8* secret, size_t secretSize)
{
    XXH3p_64bits_reset_internal(statePtr, seed, secret, secretSize);
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_reset(XXH3p_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3p_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize)
{
    if (statePtr == NULL) return XXH_ERROR;
    /* validate the secret first, as in the 64-bit variant above */
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3p_SECRET_SIZE_MIN) return XXH_ERROR;
    XXH3p_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3p_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
    XXH3p_initCustomSecret(statePtr->customSecret, seed);
    statePtr->secret = statePtr->customSecret;
    return XXH_OK;
}

XXH_PUBLIC_API XXH_errorcode
XXH3p_128bits_update(XXH3p_state_t* state, const void* input, size_t len)
{
    return XXH3p_update(state, (const xxh_u8*)input, len, XXH3p_acc_128bits);
}

XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits_digest (const XXH3p_state_t* state)
{
    if (state->totalLen > XXH3p_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
        XXH3p_digest_long(acc, state, XXH3p_acc_128bits);
        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        {   xxh_u64 const low64 = XXH3p_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
            xxh_u64 const high64 = XXH3p_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * PRIME64_2));
            XXH128_hash_t const h128 = { low64, high64 };
            return h128;
        }
    }
    /* len <= XXH3p_MIDSIZE_MAX : short code */
    if (state->seed)
        return XXH3p_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3p_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
}

/* 128-bit utility functions */

#include <string.h>   /* memcmp */

/* return : 1 if equal, 0 if different */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2  */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : bets that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}
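
/* Illustrative sketch (not part of the original xxHash sources): since
 * XXH128_cmp matches the qsort() comparator signature, an array of
 * XXH128_hash_t can be sorted directly. The hashed strings here are
 * arbitrary example values. Kept under `#if 0`. */
#if 0
#include <stdlib.h>   /* qsort */
static void example_sort_hashes(void)
{
    XXH128_hash_t hashes[3];
    hashes[0] = XXH128("b", 1, 0);
    hashes[1] = XXH128("a", 1, 0);
    hashes[2] = XXH128("c", 1, 0);
    qsort(hashes, 3, sizeof(hashes[0]), XXH128_cmp);
    /* hashes[] is now ordered by (high64, low64) */
}
#endif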


/*======   Canonical representation   ======*/
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    memcpy(dst, &hash.high64, sizeof(hash.high64));
    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
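
/* Illustrative sketch (not part of the original xxHash sources): the
 * canonical form is a fixed big-endian byte layout (high64 first), suitable
 * for storing or transmitting digests; XXH128_hashFromCanonical inverts it
 * exactly. Kept under `#if 0`. */
#if 0
static int example_canonical_roundtrip(const void* data, size_t len)
{
    XXH128_hash_t const h = XXH3p_128bits(data, len);
    XXH128_canonical_t canon;
    XXH128_canonicalFromHash(&canon, h);       /* serialize to 16 bytes */
    {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
        return XXH128_isEqual(h, back);        /* always 1 */
    }
}
#endif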



#endif  /* XXH3p_H */