//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
/*
   xxHash - Extremely Fast Hash algorithm
   Development source file for `xxh3`
   Copyright (C) 2019-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at :
   - xxHash source repository : https://github.com/Cyan4973/xxHash
*/

/* RocksDB Note: This file contains a preview release (xxhash repository
   version 0.7.2) of XXH3 that is unlikely to be compatible with the final
   version of XXH3. We have therefore renamed this XXH3p ("preview"), for
   clarity, so that we can continue to use this version even after
   integrating a newer incompatible version.
*/

/* Note :
   This file is separated for development purposes.
   It will be integrated into `xxhash.c` when development phase is complete.
*/

#ifndef XXH3p_H
#define XXH3p_H


/* ===   Dependencies   === */

#undef XXH_INLINE_ALL   /* in case it's already defined */
#define XXH_INLINE_ALL
#include "xxhash.h"


/* ===   Compiler specifics   === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXH_RESTRICT   restrict
#else
/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
#  define XXH_RESTRICT   /* disable */
#endif

#if defined(__GNUC__)
#  if defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#    define inline __inline__  /* clang bug */
#    include <arm_neon.h>
#    undef inline
#  endif
#elif defined(_MSC_VER)
#  include <intrin.h>
#endif

/*
 * Sanity check.
 *
 * XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *
 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
 * classic 16-bit only subset of ARM's instruction set.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply. This means numerous __aeabi_lmul
 * calls.
 *
 * Second of all, the 8 functional registers are just not enough.
 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
 * Lo registers, and this shuffling results in thousands more MOVs than A32.
 *
 * A32 and T32 don't have this limitation. They can access all 14 registers,
 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
 *
 * If compiling Thumb-1 for a target which supports ARM instructions, we
 * will give a warning.
 *
 * Usually, if this happens, it is because of an accident: you probably
 * need to specify -march, as you likely meant to compile for a newer
 * architecture.
 */
#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
#endif

/* ==========================================
 * Vectorization detection
 * ========================================== */
#define XXH_SCALAR 0
#define XXH_SSE2   1
#define XXH_AVX2   2
#define XXH_NEON   3
#define XXH_VSX    4

#ifndef XXH_VECTOR    /* can be defined on command line */
#  if defined(__AVX2__)
#    define XXH_VECTOR XXH_AVX2
#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#    define XXH_VECTOR XXH_SSE2
#  elif defined(__GNUC__) /* msvc support maybe later */ \
  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
#    define XXH_VECTOR XXH_NEON
#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
#    define XXH_VECTOR XXH_VSX
#  else
#    define XXH_VECTOR XXH_SCALAR
#  endif
#endif
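
/* Informative note: each branch above keys off the compiler's predefined
 * macros, e.g. building with -mavx2 defines __AVX2__ and selects XXH_AVX2.
 * Since XXH_VECTOR is only set under #ifndef, a build can also force a path
 * on the command line, e.g. -DXXH_VECTOR=XXH_SCALAR. */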

/* control alignment of accumulator,
 * for compatibility with fast vector loads */
#ifndef XXH_ACC_ALIGN
#  if XXH_VECTOR == 0   /* scalar */
#     define XXH_ACC_ALIGN 8
#  elif XXH_VECTOR == 1  /* sse2 */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == 2  /* avx2 */
#     define XXH_ACC_ALIGN 32
#  elif XXH_VECTOR == 3  /* neon */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == 4  /* vsx */
#     define XXH_ACC_ALIGN 16
#  endif
#endif

/* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
#if defined(_MSC_VER) && defined(_M_IX86)
#    include <intrin.h>
#    define XXH_mult32to64(x, y) __emulu(x, y)
#else
#    define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
#endif
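
/* Worked example (informative): the masking keeps only the low 32 bits of
 * each operand, so the macro computes a full 32x32->64 product. For instance,
 * XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001, a result a
 * plain 32-bit multiply would truncate to 0x00000001. */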

/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
 * there is a lot of mischief with endianness. */
#if XXH_VECTOR == XXH_VSX
#  include <altivec.h>
#  undef vector
typedef __vector unsigned long long U64x2;
typedef __vector unsigned char U8x16;
typedef __vector unsigned U32x4;

#ifndef XXH_VSX_BE
#  if defined(__BIG_ENDIAN__) \
  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXH_VSX_BE 1
#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
#    warning "-maltivec=be is not recommended. Please use native endianness."
#    define XXH_VSX_BE 1
#  else
#    define XXH_VSX_BE 0
#  endif
#endif

/* We need some helpers for big endian mode. */
#if XXH_VSX_BE
/* A wrapper for POWER9's vec_revb. */
#  ifdef __POWER9_VECTOR__
#    define XXH_vec_revb vec_revb
#  else
XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
{
    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
#  endif

/* Power8 Crypto gives us vpermxor which is very handy for
 * PPC64EB.
 *
 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
 * {
 *     U8x16 ret;
 *     for (int i = 0; i < 16; i++) {
 *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
 *     }
 *     return ret;
 * }
 *
 * Because both of the main loops load the key, swap, and xor it with input,
 * we can combine the key swap into this instruction.
 */
#  ifdef vec_permxor
#    define XXH_vec_permxor vec_permxor
#  else
#    define XXH_vec_permxor __builtin_crypto_vpermxor
#  endif
#endif  /* XXH_VSX_BE */
/*
 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
 * vec_mule.
 *
 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
 * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
 * GCC needs inline assembly. */
#if __has_builtin(__builtin_altivec_vmuleuw)
#  define XXH_vec_mulo __builtin_altivec_vmulouw
#  define XXH_vec_mule __builtin_altivec_vmuleuw
#else
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
#endif /* XXH_VECTOR == XXH_VSX */

/* prefetch
 * can be disabled, by declaring XXH_NO_PREFETCH build macro */
#if defined(XXH_NO_PREFETCH)
#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
#else
#  if defined(_MSC_VER) && \
      (defined(_M_X64) ||  \
       defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#  else
#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
#  endif
#endif  /* XXH_NO_PREFETCH */


/* ==========================================
 * XXH3 default settings
 * ========================================== */

#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3p_SECRET_SIZE_MIN */

#if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
#  error "default keyset is not large enough"
#endif

XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};

/*
 * GCC for x86 has a tendency to use SSE in this loop. While it
 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
 * slows it down because instead of free register swap shifts, it
 * must use pshufd and punpckl/hd.
 *
 * To prevent this, we use this attribute to shut off SSE.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * Despite being 32-bit platforms, Clang and Emscripten still define this
     * type without having native arithmetic for it. The result is a slow
     * compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
#if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t const r128 = { product_low, product_high };
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown
     * below with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // lhs = 93
     *         x 7 5 // rhs = 75
     *     ----------
     *           1 5 // lo_lo = (93 % 10) * (75 % 10)
     *         4 5 | // hi_lo = (93 / 10) * (75 % 10)
     *         2 1 | // lo_hi = (93 % 10) * (75 / 10)
     *     + 6 3 | | // hi_hi = (93 / 10) * (75 / 10)
     *     ---------
     *         2 7 | // cross  = (15 / 10) + (45 % 10) + 21
     *     + 6 9 | | // upper  = (27 / 10) + (45 / 10) + 63
     *     ---------
     *       6 9 7 5
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
     *     UINT64_MAX. This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARMv6+ A32/T32, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication,
     *     and allows this to be calculated in only 4 instructions which
     *     is comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be
     *     a couple of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128 = { lower, upper };
    return r128;
#endif
}
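
/* Sanity-check example (informative): XXH_mult64to128(0xFFFFFFFFFFFFFFFFULL, 2)
 * yields low64 == 0xFFFFFFFFFFFFFFFE and high64 == 1, since
 * (2^64 - 1) * 2 == 2^65 - 2. All three paths above agree on this result. */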

/*
 * We want to keep the attribute here because a target switch
 * disables inlining.
 *
 * Does a 64-bit to 128-bit multiply, then XOR folds it.
 * The reason for the separate function is to prevent passing
 * too many structs around by value. This will hopefully inline
 * the multiply, but we don't force it.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static xxh_u64
XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}
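
/* Informative note: XOR-folding the two 64-bit halves keeps a dependency on
 * every bit of the 128-bit product while reducing it to a 64-bit value. */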


static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}
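
/* Informative note: this is a conventional xorshift-multiply-xorshift
 * finalizer. The shifts fold high bits into low bits and the multiply
 * spreads them back up, so flipping a single input bit is expected to
 * change about half of the output bits. */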


/* ==========================================
 * Short keys
 * ========================================== */

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32  const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
        xxh_u64  const keyed = (xxh_u64)combined ^ (XXH_readLE32(secret) + seed);
        xxh_u64  const mixed = keyed * PRIME64_1;
        return XXH3p_avalanche(mixed);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
        xxh_u64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
        return XXH3p_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXH_readLE64(input)           ^ (XXH_readLE64(secret)     + seed);
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
        xxh_u64 const acc = len + (input_lo + input_hi) + XXH3p_mul128_fold64(input_lo, input_hi);
        return XXH3p_avalanche(acc);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3p_len_9to16_64b(input, len, secret, seed);
        if (len >= 4) return XXH3p_len_4to8_64b(input, len, secret, seed);
        if (len) return XXH3p_len_1to3_64b(input, len, secret, seed);
        /*
         * RocksDB modification from XXH3 preview: zero result for empty
         * string can be problematic for multiplication-based algorithms.
         * Return a hash of the seed instead.
         */
        return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
    }
}


/* ===    Long Keys    === */

#define STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))

typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;
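
/* Informative note: with STRIPE_LEN == 64 and 8-byte lanes, ACC_NB == 8, so
 * each 64-byte input stripe feeds 8 accumulator lanes while the secret
 * pointer advances by XXH_SECRET_CONSUME_RATE (8) bytes per stripe. */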

XXH_FORCE_INLINE void
XXH3p_accumulate_512(      void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret,
                    XXH3p_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
        const         __m256i* const xinput = (const __m256i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3p_acc_128bits) {
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i]  = _mm256_add_epi64(product, sum);
            } else {  /* XXH3p_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i]  = _mm256_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;
        const         __m128i* const xinput = (const __m128i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXH3p_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
                xacc[i]  = _mm_add_epi64(product, sum);
            } else {  /* XXH3p_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i]  = _mm_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput = (const uint8_t *) input;
        uint8_t const* const xsecret  = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t       data_key;

            if (accWidth == XXH3p_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }

            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));

            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t const data_key    = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64  (data_key);
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
            if (accWidth == XXH3p_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else {  /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
    U64x2 const* const xinput = (U64x2 const*) input;   /* no alignment restriction */
    U64x2 const* const xsecret  = (U64x2 const*) secret;    /* no alignment restriction */
    U64x2 const v32 = { 32,  32 };
#if XXH_VSX_BE
    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xinput[i]; */
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap */
        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
        xacc[i] += product;

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_vec;
        } else {  /* XXH3p_acc_128bits */
            /* swap high and low halves */
            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
            xacc[i] += data_swapped;
        }
    }

#else   /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on 32-byte boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xinput = (const xxh_u8*) input;  /* no alignment restriction */
    const xxh_u8* const xsecret  = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_val;
        } else {
            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        }
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
#endif
}

XXH_FORCE_INLINE void
XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec     = xacc[i];
            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec     = xacc[i];
            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
        uint8_t const* const xsecret = (uint8_t const*) secret;
        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const   acc_vec  = xacc[i];
            uint64x2_t const   shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const   data_vec = veorq_u64   (acc_vec, shifted);

            /* key_vec  = xsecret[i]; */
            uint32x4_t const   key_vec  = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const   data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32    (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const   prod_hi = vmull_u32    (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }

#elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)

          U64x2* const xacc =       (U64x2*) acc;
    const U64x2* const xsecret = (const U64x2*) secret;
    /* constants */
    U64x2 const v32  = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;
#if XXH_VSX_BE
    /* endian swap */
    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec  = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* swap bytes and words */
        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif

        /* data_key *= PRIME32_1 */

        /* prod_even = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF);  */
        U64x2 const prod_even  = XXH_vec_mule((U32x4)data_key, prime);
        /* prod_odd = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
        U64x2 const prod_odd  = XXH_vec_mulo((U32x4)data_key, prime);
        xacc[i] = prod_odd + (prod_even << v32);
    }

#else   /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned on 32-byte boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
        xxh_u64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

#define XXH_PREFETCH_DIST 384

/* assumption : nbStripes will not overflow secret size */
XXH_FORCE_INLINE void
XXH3p_accumulate(       xxh_u64* XXH_RESTRICT acc,
                const xxh_u8* XXH_RESTRICT input,
                const xxh_u8* XXH_RESTRICT secret,
                      size_t nbStripes,
                      XXH3p_accWidth_e accWidth)
{
    size_t n;
    for (n = 0; n < nbStripes; n++ ) {
        const xxh_u8* const in = input + n*STRIPE_LEN;
        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
        XXH3p_accumulate_512(acc,
                            in,
                            secret + n*XXH_SECRET_CONSUME_RATE,
                            accWidth);
    }
}

/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 *        However, it auto-vectorizes AVX2 better when it is `FORCE_INLINE`.
 *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
 */

#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3p_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
                      const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3p_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3p_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3p_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const xxh_u8* const p = input + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
            XXH3p_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}
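
/* Worked example (informative): with the default 192-byte kSecret,
 * nb_rounds = (192 - 64) / 8 = 16 stripes per block, so block_len = 1024
 * bytes and the accumulators are scrambled once per KiB of input. */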

XXH_FORCE_INLINE xxh_u64
XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
{
    return XXH3p_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

static XXH64_hash_t
XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;

    result64 += XXH3p_mix2Accs(acc+0, secret +  0);
    result64 += XXH3p_mix2Accs(acc+2, secret + 16);
    result64 += XXH3p_mix2Accs(acc+4, secret + 32);
    result64 += XXH3p_mix2Accs(acc+6, secret + 48);

    return XXH3p_avalanche(result64);
}

#define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
#define XXH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
}


XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
{
    return XXH3p_hashLong_internal(input, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    return XXH3p_hashLong_internal(input, len, secret, secretSize);
}


XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* XXH3p_initCustomSecret() :
 * destination `customSecret` is presumed allocated and same size as `kSecret`.
 */
XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
{
    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
    int i;

    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

    for (i=0; i < nbRounds; i++) {
        XXH_writeLE64(customSecret + 16*i,     XXH_readLE64(kSecret + 16*i)     + seed64);
        XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
    }
}


/* XXH3p_hashLong_64b_withSeed() :
 * Generate a custom key,
 * based on alteration of default kSecret with the seed,
 * and then use this key for long mode hashing.
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 */
XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed==0) return XXH3p_hashLong_64b_defaultSecret(input, len);
    XXH3p_initCustomSecret(secret, seed);
    return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
                                 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
    xxh_u64 const input_lo = XXH_readLE64(input);
    xxh_u64 const input_hi = XXH_readLE64(input+8);
    return XXH3p_mul128_fold64(
               input_lo ^ (XXH_readLE64(secret)   + seed64),
               input_hi ^ (XXH_readLE64(secret+8) - seed64) );
}


XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3p_mix16B(input+48, secret+96, seed);
                    acc += XXH3p_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3p_mix16B(input+32, secret+64, seed);
                acc += XXH3p_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3p_mix16B(input+16, secret+32, seed);
            acc += XXH3p_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3p_mix16B(input+0, secret+0, seed);
        acc += XXH3p_mix16B(input+len-16, secret+16, seed);

        return XXH3p_avalanche(acc);
    }
}
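
/* Informative note: the spans are mixed in symmetric front/back pairs. For
 * 96 < len <= 128, eight 16-byte spans are mixed: offsets 0, 16, 32, 48 from
 * the front and len-64, len-48, len-32, len-16 from the back; together they
 * always cover the whole input, overlapping when len < 128. */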

#define XXH3p_MIDSIZE_MAX 240

XXH_NO_INLINE XXH64_hash_t
XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    #define XXH3p_MIDSIZE_STARTOFFSET 3
    #define XXH3p_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        for (i=0; i<8; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*i), seed);
        }
        acc = XXH3p_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
        return XXH3p_avalanche(acc);
    }
}
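
/* Worked example (informative): for len == 240, nbRounds == 15. Rounds 0-7
 * consume secret[0..128), rounds 8-14 restart at secret + 3
 * (XXH3p_MIDSIZE_STARTOFFSET), and the final mix always covers the last 16
 * input bytes, so the tail is never left out. */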

/* ===   Public entry point   === */

XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
}
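
/* Usage sketch (informative, assuming the declarations from xxhash.h):
 *
 *     const char data[] = "hello world";
 *     XXH64_hash_t h0 = XXH3p_64bits(data, sizeof(data) - 1);
 *     XXH64_hash_t h1 = XXH3p_64bits_withSeed(data, sizeof(data) - 1, 42);
 *
 * A custom secret passed to XXH3p_64bits_withSecret() must be at least
 * XXH3p_SECRET_SIZE_MIN bytes long; that precondition is only asserted above. */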

/* ===   XXH3 streaming   === */

/* RocksDB Note: unused & removed due to bug in preview version */

/* ==========================================
 * XXH3 128 bits (=> XXH128)
 * ========================================== */

XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32  const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24);
        xxh_u32  const combinedh = XXH_swap32(combinedl);
        xxh_u64  const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret)   + seed);
        xxh_u64  const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed);
        xxh_u64  const mixedl = keyed_lo * PRIME64_1;
        xxh_u64  const mixedh = keyed_hi * PRIME64_5;
        XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ };
        return h128;
    }
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const input_64_hi = XXH_swap64(input_64_lo);
        xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed);
        xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1);
        xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
        xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len;
        xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
        {   XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ };
            return h128;
    }   }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
        xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5);
        m128.low64 += lenContrib;
        m128.high64 += input_hi * PRIME64_1;
        m128.low64  ^= (m128.high64 >> 32);
        {   XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
            h128.high64 += m128.high64 * PRIME64_2;
            h128.low64   = XXH3p_avalanche(h128.low64);
            h128.high64  = XXH3p_avalanche(h128.high64);
            return h128;
    }   }
}

/* Assumption : `secret` size is >= 16
 * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3p_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t const h128 = { 0, 0 };
            return h128;
    }   }
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {   xxh_u64 const low64 = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
        xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
        XXH128_hash_t const h128 = { low64, high64 };
        return h128;
    }
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. The reason is unclear (uop cache, maybe?), but the difference is large and easily measurable. */
XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
{
    return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. The reason is unclear (uop cache, maybe?), but the difference is large and easily measurable. */
XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
                              const xxh_u8* secret, size_t secretSize)
{
    return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
}

XXH_NO_INLINE XXH128_hash_t    /* It's important for performance that XXH3p_hashLong is not inlined. The reason is unclear (uop cache, maybe?), but the difference is large and easily measurable. */
XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
    XXH3p_initCustomSecret(secret, seed);
    return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3p_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}

XXH_NO_INLINE XXH128_hash_t
XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
        }
        acc.low64 = XXH3p_avalanche(acc.low64);
        acc.high64 = XXH3p_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);

        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}

XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
}

XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* if an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash */
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}
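
/* Usage sketch (illustrative only, not part of this file) : the caller owns
 * the secret buffer and must guarantee secretSize >= XXH3p_SECRET_SIZE_MIN,
 * since, per the contract above, this entry point performs no runtime check.
 * `fillWithEntropy`, `buf` and `bufSize` are hypothetical.
 *
 *     unsigned char mySecret[XXH3p_SECRET_SIZE_MIN];
 *     fillWithEntropy(mySecret, sizeof(mySecret));
 *     {   XXH128_hash_t const h = XXH3p_128bits_withSecret(buf, bufSize, mySecret, sizeof(mySecret));
 *         (void)h;
 *     }
 */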

XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
}

XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3p_128bits_withSeed(input, len, seed);
}
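
/* Usage sketch (illustrative only, not part of this file) : one-shot hashing
 * of an in-memory buffer. XXH128() simply forwards to XXH3p_128bits_withSeed();
 * `data`, `size` and `seed` are caller-supplied.
 *
 *     XXH128_hash_t const h = XXH128(data, size, seed);
 *     // h.low64 and h.high64 together form the 128-bit digest
 */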


/* ===   XXH3 128-bit streaming   === */

/* RocksDB Note: unused & removed due to a bug in the preview version */

/* 128-bit utility functions */

#include <string.h>   /* memcmp */

/* return : 1 if equal, 0 if different */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding bytes */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1  > *h128_2
 *          <0 if *h128_1  < *h128_2
 *          =0 if *h128_1 == *h128_2  */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : assumes that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}
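
/* Usage sketch (illustrative only, not part of this file) : because XXH128_cmp
 * matches the qsort() comparator prototype, an array of hashes can be sorted
 * directly; `hashes` and `count` are caller-supplied.
 *
 *     #include <stdlib.h>   // for qsort
 *     qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
 */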


/*======   Canonical representation   ======*/
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    memcpy(dst, &hash.high64, sizeof(hash.high64));
    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
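
/* Usage sketch (illustrative only, not part of this file) : the canonical form
 * is a fixed big-endian byte layout, safe to store or transmit across machines
 * of different endianness. Round-trip, with `h` a previously computed hash :
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h);           // serialize, big-endian
 *     {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *         XXH_ASSERT(XXH128_isEqual(h, back));       // round-trips exactly
 *     }
 */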



#endif  /* XXH3p_H */