1 /*
2  * xxHash - Extremely Fast Hash algorithm
3  * Header File
4  * Copyright (C) 2012-2020 Yann Collet
5  *
6  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are
10  * met:
11  *
12  *    * Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    * Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following disclaimer
16  *      in the documentation and/or other materials provided with the
17  *      distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * You can contact the author at:
32  *   - xxHash homepage: https://www.xxhash.com
33  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
34  */
35 
36 /* TODO: update */
37 /* Notice extracted from xxHash homepage:
38 
39 xxHash is an extremely fast hash algorithm, running at RAM speed limits.
40 It also successfully passes all tests from the SMHasher suite.
41 
Comparison (single thread, Windows 7 32-bit, using SMHasher on a Core 2 Duo @3GHz)
44 
45 Name            Speed       Q.Score   Author
46 xxHash          5.4 GB/s     10
47 CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
49 SpookyHash      2.0 GB/s     10       Bob Jenkins
50 SBox            1.4 GB/s      9       Bret Mulvey
51 Lookup3         1.2 GB/s      9       Bob Jenkins
52 SuperFastHash   1.2 GB/s      1       Paul Hsieh
53 CityHash64      1.05 GB/s    10       Pike & Alakuijala
54 FNV             0.55 GB/s     5       Fowler, Noll, Vo
55 CRC32           0.43 GB/s     9
56 MD5-32          0.33 GB/s    10       Ronald L. Rivest
57 SHA1-32         0.28 GB/s    10
58 
59 Q.Score is a measure of quality of the hash function.
60 It depends on successfully passing SMHasher test set.
61 10 is a perfect score.
62 
63 Note: SMHasher's CRC32 implementation is not the fastest one.
64 Other speed-oriented implementations can be faster,
65 especially in combination with PCLMUL instruction:
66 https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
67 
68 A 64-bit version, named XXH64, is available since r35.
69 It offers much better speed, but for 64-bit applications only.
70 Name     Speed on 64 bits    Speed on 32 bits
71 XXH64       13.8 GB/s            1.9 GB/s
72 XXH32        6.8 GB/s            6.0 GB/s
73 */
74 
75 #if defined(__cplusplus)
76 extern "C" {
77 
78 #endif
79 
80 /* ****************************
81  *  INLINE mode
82  ******************************/
83 /*!
84  * XXH_INLINE_ALL (and XXH_PRIVATE_API)
85  * Use these build macros to inline xxhash into the target unit.
86  * Inlining improves performance on small inputs, especially when the length is
87  * expressed as a compile-time constant:
88  *
89  *      https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
90  *
91  * It also keeps xxHash symbols private to the unit, so they are not exported.
92  *
93  * Usage:
94  *     #define XXH_INLINE_ALL
95  *     #include "xxhash.h"
96  *
97  * Do not compile and link xxhash.o as a separate object, as it is not useful.
98  */
99 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) && \
100     !defined(XXH_INLINE_ALL_31684351384)
101 /* this section should be traversed only once */
102   #define XXH_INLINE_ALL_31684351384
103 /* give access to the advanced API, required to compile implementations */
104   #undef XXH_STATIC_LINKING_ONLY                       /* avoid macro redef */
105   #define XXH_STATIC_LINKING_ONLY
106 /* make all functions private */
107   #undef XXH_PUBLIC_API
108   #if defined(__GNUC__)
109     #define XXH_PUBLIC_API static __inline __attribute__((unused))
110   #elif defined(__cplusplus) || \
111       (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
112     #define XXH_PUBLIC_API static inline
113   #elif defined(_MSC_VER)
114     #define XXH_PUBLIC_API static __inline
115   #else
116   /* note: this version may generate warnings for unused static functions */
117     #define XXH_PUBLIC_API static
118   #endif
119 
120 /*
121  * This part deals with the special case where a unit wants to inline xxHash,
122  * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
123  * as part of some previously included *.h header file.
124  * Without further action, the new include would just be ignored,
125  * and functions would effectively _not_ be inlined (silent failure).
126  * The following macros solve this situation by prefixing all inlined names,
127  * avoiding naming collision with previous inclusions.
128  */
129   #ifdef XXH_NAMESPACE
130     #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
131   /*
132    * Note: Alternative: #undef all symbols (it's a pretty large list).
133    * Without #error: it compiles, but functions are actually not inlined.
134    */
135   #endif
136   #define XXH_NAMESPACE XXH_INLINE_
137 /*
138  * Some identifiers (enums, type names) are not symbols, but they must
139  * still be renamed to avoid redeclaration.
 * Alternative solution: do not redeclare them.
 * However, this requires some #ifdefs and is a more dispersed action,
 * whereas renaming can be achieved in a single block.
143  */
144   #define XXH_IPREF(Id) XXH_INLINE_##Id
145   #define XXH_OK XXH_IPREF(XXH_OK)
146   #define XXH_ERROR XXH_IPREF(XXH_ERROR)
147   #define XXH_errorcode XXH_IPREF(XXH_errorcode)
148   #define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
149   #define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
150   #define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
151   #define XXH32_state_s XXH_IPREF(XXH32_state_s)
152   #define XXH32_state_t XXH_IPREF(XXH32_state_t)
153   #define XXH64_state_s XXH_IPREF(XXH64_state_s)
154   #define XXH64_state_t XXH_IPREF(XXH64_state_t)
155   #define XXH3_state_s XXH_IPREF(XXH3_state_s)
156   #define XXH3_state_t XXH_IPREF(XXH3_state_t)
157   #define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
158 /* Ensure the header is parsed again, even if it was previously included */
159   #undef XXHASH_H_5627135585666179
160   #undef XXHASH_H_STATIC_13879238742
161 #endif                                 /* XXH_INLINE_ALL || XXH_PRIVATE_API */
162 
163 /* ****************************************************************
164  *  Stable API
165  *****************************************************************/
166 #ifndef XXHASH_H_5627135585666179
167   #define XXHASH_H_5627135585666179 1
168 
169   /* specific declaration modes for Windows */
170   #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
171     #if defined(WIN32) && defined(_MSC_VER) && \
172         (defined(XXH_IMPORT) || defined(XXH_EXPORT))
173       #ifdef XXH_EXPORT
174         #define XXH_PUBLIC_API __declspec(dllexport)
175       #elif XXH_IMPORT
176         #define XXH_PUBLIC_API __declspec(dllimport)
177       #endif
178     #else
179       #define XXH_PUBLIC_API                                  /* do nothing */
180     #endif
181   #endif
182 
183   /*!
184    * XXH_NAMESPACE, aka Namespace Emulation:
185    *
186    * If you want to include _and expose_ xxHash functions from within your own
187    * library, but also want to avoid symbol collisions with other libraries
188    * which may also include xxHash, you can use XXH_NAMESPACE to automatically
189    * prefix any public symbol from xxhash library with the value of
190    * XXH_NAMESPACE (therefore, avoid empty or numeric values).
191    *
192    * Note that no change is required within the calling program as long as it
193    * includes `xxhash.h`: Regular symbol names will be automatically translated
194    * by this header.
195    */
196   #ifdef XXH_NAMESPACE
197     #define XXH_CAT(A, B) A##B
198     #define XXH_NAME2(A, B) XXH_CAT(A, B)
199     #define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
200     /* XXH32 */
201     #define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
202     #define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
203     #define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
204     #define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
205     #define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
206     #define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
207     #define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
208     #define XXH32_canonicalFromHash \
209       XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
210     #define XXH32_hashFromCanonical \
211       XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
212     /* XXH64 */
213     #define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
214     #define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
215     #define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
216     #define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
217     #define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
218     #define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
219     #define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
220     #define XXH64_canonicalFromHash \
221       XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
222     #define XXH64_hashFromCanonical \
223       XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
224     /* XXH3_64bits */
225     #define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
226     #define XXH3_64bits_withSecret \
227       XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
228     #define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
229     #define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
230     #define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
231     #define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
232     #define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
233     #define XXH3_64bits_reset_withSeed \
234       XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
235     #define XXH3_64bits_reset_withSecret \
236       XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
237     #define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
238     #define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
239     #define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
240     /* XXH3_128bits */
241     #define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
242     #define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
243     #define XXH3_128bits_withSeed \
244       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
245     #define XXH3_128bits_withSecret \
246       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
247     #define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
248     #define XXH3_128bits_reset_withSeed \
249       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
250     #define XXH3_128bits_reset_withSecret \
251       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
252     #define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
253     #define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
254     #define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
255     #define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
256     #define XXH128_canonicalFromHash \
257       XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
258     #define XXH128_hashFromCanonical \
259       XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
260   #endif
261 
262   /* *************************************
263    *  Version
264    ***************************************/
265   #define XXH_VERSION_MAJOR 0
266   #define XXH_VERSION_MINOR 8
267   #define XXH_VERSION_RELEASE 0
268   #define XXH_VERSION_NUMBER                                   \
269     (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
270      XXH_VERSION_RELEASE)
271 XXH_PUBLIC_API unsigned XXH_versionNumber(void);
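/*
 * A minimal sketch (illustrative) of checking that the linked library matches
 * the compiled header, which can be useful with dynamic linking. The helper
 * name is arbitrary.
 *
 *     #include <assert.h>
 *     #include "xxhash.h"
 *
 *     static void xxh_check_version(void) {
 *       // header macro and linked library should agree
 *       assert(XXH_versionNumber() == XXH_VERSION_NUMBER);
 *     }
 */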
272 
273   /* ****************************
274    *  Definitions
275    ******************************/
276   #include <stddef.h>                                             /* size_t */
277 typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
278 
279   /*-**********************************************************************
280    *  32-bit hash
281    ************************************************************************/
282   #if !defined(__VMS) &&       \
283       (defined(__cplusplus) || \
284        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
285     #include <stdint.h>
286 typedef uint32_t XXH32_hash_t;
287   #else
288     #include <limits.h>
289     #if UINT_MAX == 0xFFFFFFFFUL
290 typedef unsigned int  XXH32_hash_t;
291     #else
292       #if ULONG_MAX == 0xFFFFFFFFUL
293 typedef unsigned long XXH32_hash_t;
294       #else
295         #error "unsupported platform: need a 32-bit type"
296       #endif
297     #endif
298   #endif
299 
300 /*!
301  * XXH32():
 *  Calculate the 32-bit hash of a sequence of "length" bytes stored at memory
 *  address "input".
 *  The memory between input & input+length must be valid
 *  (allocated and read-accessible).
 *  "seed" can be used to alter the result predictably.
 *  Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
307  *
308  * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
309  * and offers true 64/128 bit hash results. It provides a superior level of
310  * dispersion, and greatly reduces the risks of collisions.
311  */
312 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
313                                   XXH32_hash_t seed);
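
/*
 * A one-shot usage sketch (illustrative): hash a NUL-terminated string with
 * seed 0. The helper name is arbitrary.
 *
 *     #include <string.h>
 *     #include "xxhash.h"
 *
 *     static XXH32_hash_t hash_string32(const char *s) {
 *       return XXH32(s, strlen(s), 0);   // seed = 0
 *     }
 */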
314 
315 /*******   Streaming   *******/
316 
317 /*
 * Streaming functions generate the xxHash value from an incremental input.
319  * This method is slower than single-call functions, due to state management.
320  * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
321  *
322  * An XXH state must first be allocated using `XXH*_createState()`.
323  *
324  * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
325  *
326  * Then, feed the hash state by calling `XXH*_update()` as many times as
327  * necessary.
328  *
329  * The function returns an error code, with 0 meaning OK, and any other value
330  * meaning there is an error.
331  *
332  * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
333  * This function returns the nn-bits hash as an int or long long.
334  *
335  * It's still possible to continue inserting input into the hash state after a
336  * digest, and generate new hash values later on by invoking `XXH*_digest()`.
337  *
338  * When done, release the state using `XXH*_freeState()`.
339  */
340 
341 typedef struct XXH32_state_s XXH32_state_t;              /* incomplete type */
342 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
343 XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t *statePtr);
344 XXH_PUBLIC_API void           XXH32_copyState(XXH32_state_t *      dst_state,
345                                               const XXH32_state_t *src_state);
346 
347 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
348                                          XXH32_hash_t   seed);
349 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
350                                           const void *input, size_t length);
351 XXH_PUBLIC_API XXH32_hash_t  XXH32_digest(const XXH32_state_t *statePtr);
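
/*
 * A streaming usage sketch (illustrative): produces the same result as
 * hashing the concatenated input in one shot, but fed in several updates.
 *
 *     XXH32_state_t *const state = XXH32_createState();
 *     if (state != NULL) {
 *       XXH32_hash_t hash;
 *       (void)XXH32_reset(state, 0);             // seed = 0
 *       (void)XXH32_update(state, "hello ", 6);  // feed as many chunks as needed
 *       (void)XXH32_update(state, "world", 5);
 *       hash = XXH32_digest(state);
 *       XXH32_freeState(state);
 *     }
 */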
352 
353 /*******   Canonical representation   *******/
354 
355 /*
356  * The default return values from XXH functions are unsigned 32 and 64 bit
357  * integers.
 * This is the simplest and fastest format for further post-processing.
359  *
360  * However, this leaves open the question of what is the order on the byte
361  * level, since little and big endian conventions will store the same number
362  * differently.
363  *
364  * The canonical representation settles this issue by mandating big-endian
365  * convention, the same convention as human-readable numbers (large digits
366  * first).
367  *
368  * When writing hash values to storage, sending them over a network, or printing
369  * them, it's highly recommended to use the canonical representation to ensure
370  * portability across a wider range of systems, present and future.
371  *
372  * The following functions allow transformation of hash values to and from
373  * canonical format.
374  */
375 
376 typedef struct {
377 
378   unsigned char digest[4];
379 
380 } XXH32_canonical_t;
381 
382 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
383                                             XXH32_hash_t       hash);
384 XXH_PUBLIC_API XXH32_hash_t
385 XXH32_hashFromCanonical(const XXH32_canonical_t *src);
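
/*
 * A sketch (illustrative) of converting to and from canonical form, e.g.
 * before writing a hash to disk or sending it over a network; `buf` and `len`
 * are assumed inputs.
 *
 *     XXH32_canonical_t canon;
 *     XXH32_canonicalFromHash(&canon, XXH32(buf, len, 0));
 *     // canon.digest holds 4 bytes in a fixed, big-endian order
 *
 *     // later, possibly on a machine with different endianness:
 *     XXH32_hash_t const h = XXH32_hashFromCanonical(&canon);
 */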
386 
387   #ifndef XXH_NO_LONG_LONG
388     /*-**********************************************************************
389      *  64-bit hash
390      ************************************************************************/
391     #if !defined(__VMS) &&                                     \
392         (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
393                                   (__STDC_VERSION__ >= 199901L) /* C99 */))
394       #include <stdint.h>
395 typedef uint64_t XXH64_hash_t;
396     #else
/* the following type must have a width of 64 bits */
398 typedef unsigned long long XXH64_hash_t;
399     #endif
400 
401 /*!
402  * XXH64():
 * Returns the 64-bit hash of the sequence of @length bytes stored at memory
 * address @input.
405  * @seed can be used to alter the result predictably.
406  *
407  * This function usually runs faster on 64-bit systems, but slower on 32-bit
408  * systems (see benchmark).
409  *
410  * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
411  * and offers true 64/128 bit hash results. It provides a superior level of
412  * dispersion, and greatly reduces the risks of collisions.
413  */
414 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
415                                   XXH64_hash_t seed);
416 
417 /*******   Streaming   *******/
418 typedef struct XXH64_state_s XXH64_state_t;              /* incomplete type */
419 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
420 XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t *statePtr);
421 XXH_PUBLIC_API void           XXH64_copyState(XXH64_state_t *      dst_state,
422                                               const XXH64_state_t *src_state);
423 
424 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
425                                          XXH64_hash_t   seed);
426 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *statePtr,
427                                           const void *input, size_t length);
428 XXH_PUBLIC_API XXH64_hash_t  XXH64_digest(const XXH64_state_t *statePtr);
429 
430 /*******   Canonical representation   *******/
431 typedef struct {
432 
433   unsigned char digest[sizeof(XXH64_hash_t)];
434 
435 } XXH64_canonical_t;
436 
437 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
438                                             XXH64_hash_t       hash);
439 XXH_PUBLIC_API XXH64_hash_t
440 XXH64_hashFromCanonical(const XXH64_canonical_t *src);
441 
442 /*-**********************************************************************
443  *  XXH3 64-bit variant
444  ************************************************************************/
445 
446 /* ************************************************************************
447  * XXH3 is a new hash algorithm featuring:
448  *  - Improved speed for both small and large inputs
449  *  - True 64-bit and 128-bit outputs
450  *  - SIMD acceleration
451  *  - Improved 32-bit viability
452  *
453  * Speed analysis methodology is explained here:
454  *
455  *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
456  *
 * In general, expect XXH3 to run roughly 2x faster on large inputs and over
 * 3x faster on small ones compared to XXH64, though exact differences depend
 * on the platform.
460  *
461  * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
462  * on all platforms.
463  *
464  * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
465  *
466  * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
467  * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
468  * explained in the implementation.
469  *
470  * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
471  * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
472  *
473  * XXH3 offers 2 variants, _64bits and _128bits.
474  * When only 64 bits are needed, prefer calling the _64bits variant, as it
475  * reduces the amount of mixing, resulting in faster speed on small inputs.
476  *
477  * It's also generally simpler to manipulate a scalar return type than a struct.
478  *
479  * The 128-bit version adds additional strength, but it is slightly slower.
480  *
481  * Return values of XXH3 and XXH128 are officially finalized starting
482  * with v0.8.0 and will no longer change in future versions.
483  * Avoid storing values from before that release in long-term storage.
484  *
485  * Results produced by v0.7.x are not comparable with results from v0.7.y.
486  * However, the API is completely stable, and it can safely be used for
487  * ephemeral data (local sessions).
488  *
489  * The API supports one-shot hashing, streaming mode, and custom secrets.
490  */
491 
492 /* XXH3_64bits():
493  * default 64-bit variant, using default secret and default seed of 0.
494  * It's the fastest variant. */
495 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
496 
497 /*
498  * XXH3_64bits_withSeed():
499  * This variant generates a custom secret on the fly
500  * based on default secret altered using the `seed` value.
501  * While this operation is decently fast, note that it's not completely free.
502  * Note: seed==0 produces the same results as XXH3_64bits().
503  */
504 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
505                                                  XXH64_hash_t seed);
506 
507     /*
508      * XXH3_64bits_withSecret():
509      * It's possible to provide any blob of bytes as a "secret" to generate the
510      * hash. This makes it more difficult for an external actor to prepare an
511      * intentional collision. The main condition is that secretSize *must* be
512      * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced
513      * hash values depends on secret's entropy. Technically, the secret must
514      * look like a bunch of random bytes. Avoid "trivial" or structured data
515      * such as repeated sequences or a text document. Whenever unsure about the
516      * "randomness" of the blob of bytes, consider relabelling it as a "custom
517      * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate
518      * a high entropy secret derived from the custom seed.
519      */
520     #define XXH3_SECRET_SIZE_MIN 136
521 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
522                                                    const void *secret,
523                                                    size_t      secretSize);
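
/*
 * A usage sketch (illustrative): hashing with a caller-provided secret.
 * `my_secret` is a hypothetical buffer, and `data`/`len` are assumed inputs;
 * in practice the secret would typically be filled once by
 * `XXH3_generateSecret()` (see below) or by a CSPRNG.
 *
 *     static unsigned char my_secret[XXH3_SECRET_SIZE_MIN + 56];
 *     // ... fill my_secret with high-entropy bytes, once, at startup ...
 *     XXH64_hash_t const h =
 *         XXH3_64bits_withSecret(data, len, my_secret, sizeof(my_secret));
 */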
524 
525 /*******   Streaming   *******/
526 /*
527  * Streaming requires state maintenance.
528  * This operation costs memory and CPU.
529  * As a consequence, streaming is slower than one-shot hashing.
530  * For better performance, prefer one-shot functions whenever applicable.
531  */
532 typedef struct XXH3_state_s XXH3_state_t;
533 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
534 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
535 XXH_PUBLIC_API void          XXH3_copyState(XXH3_state_t *      dst_state,
536                                             const XXH3_state_t *src_state);
537 
538 /*
539  * XXH3_64bits_reset():
540  * Initialize with default parameters.
541  * digest will be equivalent to `XXH3_64bits()`.
542  */
543 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr);
544 /*
545  * XXH3_64bits_reset_withSeed():
546  * Generate a custom secret from `seed`, and store it into `statePtr`.
547  * digest will be equivalent to `XXH3_64bits_withSeed()`.
548  */
549 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
550                                                         XXH64_hash_t  seed);
551 /*
552  * XXH3_64bits_reset_withSecret():
 * `secret` is only referenced, not copied:
 * it _must outlive_ the hash streaming session.
554  * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
555  * and the quality of produced hash values depends on secret's entropy
556  * (secret's content should look like a bunch of random bytes).
557  * When in doubt about the randomness of a candidate `secret`,
558  * consider employing `XXH3_generateSecret()` instead (see below).
559  */
560 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
561     XXH3_state_t *statePtr, const void *secret, size_t secretSize);
562 
563 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *statePtr,
564                                                 const void *  input,
565                                                 size_t        length);
566 XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest(const XXH3_state_t *statePtr);
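
/*
 * A streaming sketch (illustrative): a single state can be reused for many
 * independent hashes by resetting it between messages, avoiding repeated
 * allocations.
 *
 *     XXH3_state_t *const state = XXH3_createState();
 *     if (state != NULL) {
 *       XXH64_hash_t h1, h2;
 *       (void)XXH3_64bits_reset(state);
 *       (void)XXH3_64bits_update(state, "first message", 13);
 *       h1 = XXH3_64bits_digest(state);
 *       (void)XXH3_64bits_reset(state);   // start a new, independent hash
 *       (void)XXH3_64bits_update(state, "second message", 14);
 *       h2 = XXH3_64bits_digest(state);
 *       XXH3_freeState(state);
 *     }
 */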
567 
568 /* note : canonical representation of XXH3 is the same as XXH64
569  * since they both produce XXH64_hash_t values */
570 
571 /*-**********************************************************************
572  *  XXH3 128-bit variant
573  ************************************************************************/
574 
575 typedef struct {
576 
577   XXH64_hash_t low64;
578   XXH64_hash_t high64;
579 
580 } XXH128_hash_t;
581 
582 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *data, size_t len);
583 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *data, size_t len,
584                                                    XXH64_hash_t seed);
585 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *data,
586                                                      size_t      len,
587                                                      const void *secret,
588                                                      size_t      secretSize);
589 
590 /*******   Streaming   *******/
591 /*
592  * Streaming requires state maintenance.
593  * This operation costs memory and CPU.
594  * As a consequence, streaming is slower than one-shot hashing.
595  * For better performance, prefer one-shot functions whenever applicable.
596  *
597  * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
598  * Use already declared XXH3_createState() and XXH3_freeState().
599  *
 * All reset and streaming functions have the same meaning as their 64-bit
 * counterparts.
602  */
603 
604 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr);
605 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
606                                                          XXH64_hash_t  seed);
607 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
608     XXH3_state_t *statePtr, const void *secret, size_t secretSize);
609 
610 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *statePtr,
611                                                  const void *  input,
612                                                  size_t        length);
613 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *statePtr);
614 
/* The following helper functions make it possible to compare XXH128_hash_t
 * values. Since XXH128_hash_t is a structure, this capability is not offered
 * by the language.
618  * Note: For better performance, these functions can be inlined using
619  * XXH_INLINE_ALL */
620 
621 /*!
622  * XXH128_isEqual():
623  * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
624  */
625 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
626 
627 /*!
628  * XXH128_cmp():
629  *
630  * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
631  *
632  * return: >0 if *h128_1  > *h128_2
633  *         =0 if *h128_1 == *h128_2
634  *         <0 if *h128_1  < *h128_2
635  */
636 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2);
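
/*
 * A sketch (illustrative) of sorting an array of XXH128_hash_t values with
 * the standard library, using XXH128_cmp as the comparator. The helper name
 * is arbitrary.
 *
 *     #include <stdlib.h>
 *
 *     static void sort_hashes(XXH128_hash_t *hashes, size_t count) {
 *       qsort(hashes, count, sizeof(*hashes), XXH128_cmp);
 *     }
 */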
637 
638 /*******   Canonical representation   *******/
639 typedef struct {
640 
641   unsigned char digest[sizeof(XXH128_hash_t)];
642 
643 } XXH128_canonical_t;
644 
645 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
646                                              XXH128_hash_t       hash);
647 XXH_PUBLIC_API XXH128_hash_t
648 XXH128_hashFromCanonical(const XXH128_canonical_t *src);
649 
650   #endif                                                /* XXH_NO_LONG_LONG */
651 
652 #endif                                         /* XXHASH_H_5627135585666179 */
653 
654 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
655   #define XXHASH_H_STATIC_13879238742
656 /* ****************************************************************************
657  * This section contains declarations which are not guaranteed to remain stable.
658  * They may change in future versions, becoming incompatible with a different
659  * version of the library.
660  * These declarations should only be used with static linking.
661  * Never use them in association with dynamic linking!
662  *****************************************************************************
663  */
664 
665 /*
666  * These definitions are only present to allow static allocation
667  * of XXH states, on stack or in a struct, for example.
668  * Never **ever** access their members directly.
669  */
670 
671 struct XXH32_state_s {
672 
673   XXH32_hash_t total_len_32;
674   XXH32_hash_t large_len;
675   XXH32_hash_t v1;
676   XXH32_hash_t v2;
677   XXH32_hash_t v3;
678   XXH32_hash_t v4;
679   XXH32_hash_t mem32[4];
680   XXH32_hash_t memsize;
681   XXH32_hash_t
682       reserved; /* never read nor write, might be removed in a future version */
683 
684 };                                            /* typedef'd to XXH32_state_t */
685 
686   #ifndef XXH_NO_LONG_LONG       /* defined when there is no 64-bit support */
687 
688 struct XXH64_state_s {
689 
690   XXH64_hash_t total_len;
691   XXH64_hash_t v1;
692   XXH64_hash_t v2;
693   XXH64_hash_t v3;
694   XXH64_hash_t v4;
695   XXH64_hash_t mem64[4];
696   XXH32_hash_t memsize;
697   XXH32_hash_t reserved32;                   /* required for padding anyway */
698   XXH64_hash_t reserved64; /* never read nor write, might be removed in a future
699                               version */
700 
701 };                                            /* typedef'd to XXH64_state_t */
702 
703     #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)  /* C11+ */
704       #include <stdalign.h>
705       #define XXH_ALIGN(n) alignas(n)
706     #elif defined(__GNUC__)
707       #define XXH_ALIGN(n) __attribute__((aligned(n)))
708     #elif defined(_MSC_VER)
709       #define XXH_ALIGN(n) __declspec(align(n))
710     #else
711       #define XXH_ALIGN(n)                                      /* disabled */
712     #endif
713 
714     /* Old GCC versions only accept the attribute after the type in structures.
715      */
716     #if !(defined(__STDC_VERSION__) &&              \
717           (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
718         && defined(__GNUC__)
719       #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
720     #else
721       #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
722     #endif
723 
724     #define XXH3_INTERNALBUFFER_SIZE 256
725     #define XXH3_SECRET_DEFAULT_SIZE 192
726 struct XXH3_state_s {
727 
728   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
729   /* used to store a custom secret generated from a seed */
730   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
731   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
732   XXH32_hash_t         bufferedSize;
733   XXH32_hash_t         reserved32;
734   size_t               nbStripesSoFar;
735   XXH64_hash_t         totalLen;
736   size_t               nbStripesPerBlock;
737   size_t               secretLimit;
738   XXH64_hash_t         seed;
739   XXH64_hash_t         reserved64;
740   const unsigned char *extSecret; /* reference to external secret;
741                                    * if == NULL, use .customSecret instead */
742   /* note: there may be some padding at the end due to alignment on 64 bytes */
743 
744 };                                             /* typedef'd to XXH3_state_t */
745 
746     #undef XXH_ALIGN_MEMBER
747 
    /* When the XXH3_state_t structure is merely allocated on the stack,
749      * it should be initialized with XXH3_INITSTATE() or a memset()
750      * in case its first reset uses XXH3_NNbits_reset_withSeed().
751      * This init can be omitted if the first reset uses default or _withSecret
752      * mode. This operation isn't necessary when the state is created with
753      * XXH3_createState(). Note that this doesn't prepare the state for a
754      * streaming operation, it's still necessary to use XXH3_NNbits_reset*()
755      * afterwards.
756      */
757     #define XXH3_INITSTATE(XXH3_state_ptr) \
758       { (XXH3_state_ptr)->seed = 0; }
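
    /*
     * A sketch (illustrative) of a stack-allocated state. This is only
     * possible with XXH_STATIC_LINKING_ONLY, since it needs the full struct
     * definition above; `data`, `len` and `seed` are assumed inputs.
     *
     *     XXH3_state_t state;
     *     XXH3_INITSTATE(&state);           // required before _withSeed reset
     *     (void)XXH3_64bits_reset_withSeed(&state, seed);
     *     (void)XXH3_64bits_update(&state, data, len);
     *     XXH64_hash_t const hash = XXH3_64bits_digest(&state);
     *     // no XXH3_freeState(): the state lives on the stack
     */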
759 
760 /* ===   Experimental API   === */
761 /* Symbols defined below must be considered tied to a specific library version.
762  */
763 
764 /*
765  * XXH3_generateSecret():
766  *
767  * Derive a high-entropy secret from any user-defined content, named customSeed.
768  * The generated secret can be used in combination with `*_withSecret()`
769  * functions. The `_withSecret()` variants are useful to provide a higher level
770  * of protection than 64-bit seed, as it becomes much more difficult for an
771  * external actor to guess how to impact the calculation logic.
772  *
773  * The function accepts as input a custom seed of any length and any content,
774  * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
775  * into an already allocated buffer secretBuffer.
 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
777  *
778  * The generated secret can then be used with any `*_withSecret()` variant.
779  * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
780  * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
781  * are part of this list. They all accept a `secret` parameter
782  * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
783  * _and_ feature very high entropy (consist of random-looking bytes).
784  * These conditions can be a high bar to meet, so
785  * this function can be used to generate a secret of proper quality.
786  *
 * customSeed can be anything. It can have any size, even a small one, and its
 * content can be anything, even a stupidly "low entropy" source such as a
 * bunch of zeroes. The resulting `secret` will nonetheless provide all
 * expected qualities.
791  *
792  * Supplying NULL as the customSeed copies the default secret into
793  * `secretBuffer`. When customSeedSize > 0, supplying NULL as customSeed is
794  * undefined behavior.
795  */
796 XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
797                                         const void *customSeed,
798                                         size_t      customSeedSize);
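
/*
 * A sketch (illustrative) of deriving a proper secret from an arbitrary,
 * possibly low-entropy custom seed, then using it with a `_withSecret()`
 * variant; `data` and `len` are assumed inputs.
 *
 *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *     const char custom[] = "any content, of any length";
 *     XXH3_generateSecret(secret, custom, sizeof(custom));
 *     XXH64_hash_t const h =
 *         XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 */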
799 
800 /* simple short-cut to pre-selected XXH3_128bits variant */
801 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
802                                     XXH64_hash_t seed);
803 
804   #endif                                                /* XXH_NO_LONG_LONG */
805 
806   #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
807     #define XXH_IMPLEMENTATION
808   #endif
809 
810 #endif /* defined(XXH_STATIC_LINKING_ONLY) && \
811           !defined(XXHASH_H_STATIC_13879238742) */
812 
813 /* ======================================================================== */
814 /* ======================================================================== */
815 /* ======================================================================== */
816 
817 /*-**********************************************************************
818  * xxHash implementation
819  *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c,
 * which was then #included here whenever inlining was activated.
 *
 * However, inlining requires the implementation to be visible to the compiler,
 * hence included alongside the header.
 * This construction created issues with a few build and install systems,
 * as it required xxhash.c to be stored in the /include directory.
828  *
829  * xxHash implementation is now directly integrated within xxhash.h.
830  * As a consequence, xxhash.c is no longer needed in /include.
831  *
832  * xxhash.c is still available and is still useful.
833  * In a "normal" setup, when xxhash is not inlined,
834  * xxhash.h only exposes the prototypes and public symbols,
835  * while xxhash.c can be built into an object file xxhash.o
836  * which can then be linked into the final binary.
837  ************************************************************************/
838 
839 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) || \
840      defined(XXH_IMPLEMENTATION)) &&                        \
841     !defined(XXH_IMPLEM_13a8737387)
842   #define XXH_IMPLEM_13a8737387
843 
844   /* *************************************
845    *  Tuning parameters
846    ***************************************/
847   /*!
848    * XXH_FORCE_MEMORY_ACCESS:
849    * By default, access to unaligned memory is controlled by `memcpy()`, which
850    * is safe and portable.
851    *
852    * Unfortunately, on some target/compiler combinations, the generated assembly
853    * is sub-optimal.
854    *
   * The switch below allows selecting a different access method
   * in the search for improved performance.
857    * Method 0 (default):
858    *     Use `memcpy()`. Safe and portable. Default.
859    * Method 1:
860    *     `__attribute__((packed))` statement. It depends on compiler extensions
861    *     and is therefore not portable.
862    *     This method is safe if your compiler supports it, and *generally* as
863    *     fast or faster than `memcpy`.
864    * Method 2:
865    *     Direct access via cast. This method doesn't depend on the compiler but
866    *     violates the C standard.
867    *     It can generate buggy code on targets which do not support unaligned
868    *     memory accesses.
869    *     But in some circumstances, it's the only known way to get the most
870    *     performance (example: GCC + ARMv6)
   * Method 3:
   *     Byteshift. This can generate the best code on old compilers which
   *     don't inline small `memcpy()` calls, and it might also be faster on
   *     big-endian systems which lack a native byteswap instruction.
   *     See https://stackoverflow.com/a/32095106/646947 for details.
   * Prefer these methods in priority order (0 > 1 > 2 > 3).
877    */
878   #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \
879                                      line for example */
880     #if !defined(__clang__) && defined(__GNUC__) &&                \
881         defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \
882         (__ARM_ARCH == 6)
883       #define XXH_FORCE_MEMORY_ACCESS 2
884     #elif !defined(__clang__) &&                            \
885         ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
886          (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
887       #define XXH_FORCE_MEMORY_ACCESS 1
888     #endif
889   #endif
890 
891   /*!
892    * XXH_ACCEPT_NULL_INPUT_POINTER:
893    * If the input pointer is NULL, xxHash's default behavior is to dereference
894    * it, triggering a segfault. When this macro is enabled, xxHash actively
895    * checks the input for a null pointer. If it is, the result for null input
896    * pointers is the same as a zero-length input.
897    */
898   #ifndef XXH_ACCEPT_NULL_INPUT_POINTER        /* can be defined externally */
899     #define XXH_ACCEPT_NULL_INPUT_POINTER 0
900   #endif
901 
902   /*!
903    * XXH_FORCE_ALIGN_CHECK:
904    * This is an important performance trick
905    * for architectures without decent unaligned memory access performance.
906    * It checks for input alignment, and when conditions are met,
907    * uses a "fast path" employing direct 32-bit/64-bit read,
908    * resulting in _dramatically faster_ read speed.
909    *
910    * The check costs one initial branch per hash, which is generally negligible,
911    * but not zero. Moreover, it's not useful to generate binary for an
   * additional code path if memory access uses the same instruction for both
   * aligned and unaligned addresses.
914    *
915    * In these cases, the alignment check can be removed by setting this macro to
916    * 0. Then the code will always use unaligned memory access. Align check is
917    * automatically disabled on x86, x64 & arm64, which are platforms known to
918    * offer good unaligned memory accesses performance.
919    *
920    * This option does not affect XXH3 (only XXH32 and XXH64).
921    */
922   #ifndef XXH_FORCE_ALIGN_CHECK                /* can be defined externally */
923     #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \
924         defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)  /* visual */
925       #define XXH_FORCE_ALIGN_CHECK 0
926     #else
927       #define XXH_FORCE_ALIGN_CHECK 1
928     #endif
929   #endif
930 
931   /*!
932    * XXH_NO_INLINE_HINTS:
933    *
934    * By default, xxHash tries to force the compiler to inline almost all
935    * internal functions.
936    *
937    * This can usually improve performance due to reduced jumping and improved
938    * constant folding, but significantly increases the size of the binary which
939    * might not be favorable.
940    *
941    * Additionally, sometimes the forced inlining can be detrimental to
942    * performance, depending on the architecture.
943    *
944    * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
945    * compiler full control on whether to inline or not.
946    *
947    * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
948    * -fno-inline with GCC or Clang, this will automatically be defined.
949    */
950   #ifndef XXH_NO_INLINE_HINTS
951     #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
952         || defined(__NO_INLINE__)                       /* -O0, -fno-inline */
953       #define XXH_NO_INLINE_HINTS 1
954     #else
955       #define XXH_NO_INLINE_HINTS 0
956     #endif
957   #endif
958 
959   /*!
960    * XXH_REROLL:
   * Whether to reroll XXH32_finalize and XXH64_finalize,
   * instead of using an unrolled jump table / if-statement loop.
963    *
964    * This is automatically defined on -Os/-Oz on GCC and Clang.
965    */
966   #ifndef XXH_REROLL
967     #if defined(__OPTIMIZE_SIZE__)
968       #define XXH_REROLL 1
969     #else
970       #define XXH_REROLL 0
971     #endif
972   #endif
973 
974   /* *************************************
975    *  Includes & Memory related functions
976    ***************************************/
977   /*!
978    * Modify the local functions below should you wish to use
979    * different memory routines for malloc() and free()
980    */
981   #include <stdlib.h>
982 
983 static void *XXH_malloc(size_t s) {
984 
985   return malloc(s);
986 
987 }
988 
989 static void XXH_free(void *p) {
990 
991   free(p);
992 
993 }
994 
995   /*! and for memcpy() */
996   #include <string.h>
997 static void *XXH_memcpy(void *dest, const void *src, size_t size) {
998 
999   return memcpy(dest, src, size);
1000 
1001 }
1002 
1003   #include <limits.h>                                         /* ULLONG_MAX */
1004 
1005   /* *************************************
1006    *  Compiler Specific Options
1007    ***************************************/
1008   #ifdef _MSC_VER                              /* Visual Studio warning fix */
1009     #pragma warning(disable : 4127) /* disable: C4127: conditional expression \
1010                                        is constant */
1011   #endif
1012 
1013   #if XXH_NO_INLINE_HINTS                         /* disable inlining hints */
1014     #if defined(__GNUC__)
1015       #define XXH_FORCE_INLINE static __attribute__((unused))
1016     #else
1017       #define XXH_FORCE_INLINE static
1018     #endif
1019     #define XXH_NO_INLINE static
1020   /* enable inlining hints */
1021   #elif defined(_MSC_VER)                                  /* Visual Studio */
1022     #define XXH_FORCE_INLINE static __forceinline
1023     #define XXH_NO_INLINE static __declspec(noinline)
1024   #elif defined(__GNUC__)
1025     #define XXH_FORCE_INLINE \
1026       static __inline__ __attribute__((always_inline, unused))
1027     #define XXH_NO_INLINE static __attribute__((noinline))
1028   #elif defined(__cplusplus) || \
1029       (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
1030     #define XXH_FORCE_INLINE static inline
1031     #define XXH_NO_INLINE static
1032   #else
1033     #define XXH_FORCE_INLINE static
1034     #define XXH_NO_INLINE static
1035   #endif
1036 
1037   /* *************************************
1038    *  Debug
1039    ***************************************/
1040   /*
1041    * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
1042    * compiler's command line options. The value must be a number.
1043    */
1044   #ifndef XXH_DEBUGLEVEL
1045     #ifdef DEBUGLEVEL                                   /* backwards compat */
1046       #define XXH_DEBUGLEVEL DEBUGLEVEL
1047     #else
1048       #define XXH_DEBUGLEVEL 0
1049     #endif
1050   #endif
1051 
1052   #if (XXH_DEBUGLEVEL >= 1)
1053     #include <assert.h>          /* note: can still be disabled with NDEBUG */
1054     #define XXH_ASSERT(c) assert(c)
1055   #else
1056     #define XXH_ASSERT(c) ((void)0)
1057   #endif
1058 
1059   /* note: use after variable declarations */
1060   #define XXH_STATIC_ASSERT(c)            \
1061     do {                                  \
1062                                           \
1063       enum { XXH_sa = 1 / (int)(!!(c)) }; \
1064                                           \
1065     } while (0)
1066 
1067   /* *************************************
1068    *  Basic Types
1069    ***************************************/
1070   #if !defined(__VMS) &&       \
1071       (defined(__cplusplus) || \
1072        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
1073     #include <stdint.h>
1074 typedef uint8_t xxh_u8;
1075   #else
1076 typedef unsigned char xxh_u8;
1077   #endif
1078 typedef XXH32_hash_t xxh_u32;
1079 
1080   #ifdef XXH_OLD_NAMES
1081     #define BYTE xxh_u8
1082     #define U8 xxh_u8
1083     #define U32 xxh_u32
1084   #endif
1085 
1086 /* ***   Memory access   *** */
1087 
1088   #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1089   /*
1090    * Manual byteshift. Best for old compilers which don't inline memcpy.
1091    * We actually directly use XXH_readLE32 and XXH_readBE32.
1092    */
1093   #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
1094 
1095 /*
 * Force direct memory access. Only works on CPUs which support unaligned
 * memory access in hardware.
1098  */
1099 static xxh_u32 XXH_read32(const void *memPtr) {
1100 
1101   return *(const xxh_u32 *)memPtr;
1102 
1103 }
1104 
1105   #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
1106 
1107     /*
1108      * __pack instructions are safer but compiler specific, hence potentially
1109      * problematic for some compilers.
1110      *
1111      * Currently only defined for GCC and ICC.
1112      */
1113     #ifdef XXH_OLD_NAMES
1114 typedef union {
1115 
1116   xxh_u32 u32;
1117 
1118 } __attribute__((packed)) unalign;
1119 
1120     #endif
1121 static xxh_u32 XXH_read32(const void *ptr) {
1122 
1123   typedef union {
1124 
1125     xxh_u32 u32;
1126 
1127   } __attribute__((packed)) xxh_unalign;
1128 
1129   return ((const xxh_unalign *)ptr)->u32;
1130 
1131 }
1132 
1133   #else
1134 
1135 /*
1136  * Portable and safe solution. Generally efficient.
1137  * see: https://stackoverflow.com/a/32095106/646947
1138  */
1139 static xxh_u32 XXH_read32(const void *memPtr) {
1140 
1141   xxh_u32 val;
1142   memcpy(&val, memPtr, sizeof(val));
1143   return val;
1144 
1145 }
1146 
1147   #endif                                  /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
1148 
/* ***   Endianness   *** */
1150 typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;
1151 
1152   /*!
1153    * XXH_CPU_LITTLE_ENDIAN:
1154    * Defined to 1 if the target is little endian, or 0 if it is big endian.
1155    * It can be defined externally, for example on the compiler command line.
1156    *
1157    * If it is not defined, a runtime check (which is usually constant folded)
1158    * is used instead.
1159    */
1160   #ifndef XXH_CPU_LITTLE_ENDIAN
1161     /*
1162      * Try to detect endianness automatically, to avoid the nonstandard behavior
1163      * in `XXH_isLittleEndian()`
1164      */
1165     #if defined(_WIN32) /* Windows is always little endian */ \
1166         || defined(__LITTLE_ENDIAN__) ||                      \
1167         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1168       #define XXH_CPU_LITTLE_ENDIAN 1
1169     #elif defined(__BIG_ENDIAN__) || \
1170         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
1171       #define XXH_CPU_LITTLE_ENDIAN 0
1172     #else
1173 /*
1174  * runtime test, presumed to simplify to a constant by compiler
1175  */
1176 static int XXH_isLittleEndian(void) {
1177 
1178   /*
1179    * Portable and well-defined behavior.
1180    * Don't use static: it is detrimental to performance.
1181    */
1182   const union {
1183 
1184     xxh_u32 u;
1185     xxh_u8  c[4];
1186 
1187   } one = {1};
1188 
1189   return one.c[0];
1190 
1191 }
1192 
1193       #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
1194     #endif
1195   #endif
1196 
1197   /* ****************************************
1198    *  Compiler-specific Functions and Macros
1199    ******************************************/
1200   #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
1201 
1202   #ifdef __has_builtin
1203     #define XXH_HAS_BUILTIN(x) __has_builtin(x)
1204   #else
1205     #define XXH_HAS_BUILTIN(x) 0
1206   #endif
1207 
1208   #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \
1209       XXH_HAS_BUILTIN(__builtin_rotateleft64)
1210     #define XXH_rotl32 __builtin_rotateleft32
1211     #define XXH_rotl64 __builtin_rotateleft64
  /* Note: although _rotl exists for MinGW (GCC under Windows), performance
   * seems poor */
1214   #elif defined(_MSC_VER)
1215     #define XXH_rotl32(x, r) _rotl(x, r)
1216     #define XXH_rotl64(x, r) _rotl64(x, r)
1217   #else
1218     #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
1219     #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
1220   #endif
1221 
1222   #if defined(_MSC_VER)                                    /* Visual Studio */
1223     #define XXH_swap32 _byteswap_ulong
1224   #elif XXH_GCC_VERSION >= 403
1225     #define XXH_swap32 __builtin_bswap32
1226   #else
1227 static xxh_u32 XXH_swap32(xxh_u32 x) {
1228 
1229   return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |
1230          ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff);
1231 
1232 }
1233 
1234   #endif
1235 
1236 /* ***************************
1237  *  Memory reads
1238  *****************************/
1239 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
1240 
1241   /*
1242    * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
1243    *
1244    * This is ideal for older compilers which don't inline memcpy.
1245    */
1246   #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1247 
1248 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *memPtr) {
1249 
1250   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1251   return bytePtr[0] | ((xxh_u32)bytePtr[1] << 8) | ((xxh_u32)bytePtr[2] << 16) |
1252          ((xxh_u32)bytePtr[3] << 24);
1253 
1254 }
1255 
1256 XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void *memPtr) {
1257 
1258   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1259   return bytePtr[3] | ((xxh_u32)bytePtr[2] << 8) | ((xxh_u32)bytePtr[1] << 16) |
1260          ((xxh_u32)bytePtr[0] << 24);
1261 
1262 }
1263 
1264   #else
1265 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *ptr) {
1266 
1267   return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
1268 
1269 }
1270 
1271 static xxh_u32 XXH_readBE32(const void *ptr) {
1272 
1273   return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
1274 
1275 }
1276 
1277   #endif
1278 
1279 XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void *  ptr,
1280                                             XXH_alignment align) {
1281 
1282   if (align == XXH_unaligned) {
1283 
1284     return XXH_readLE32(ptr);
1285 
1286   } else {
1287 
1288     return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32 *)ptr
1289                                  : XXH_swap32(*(const xxh_u32 *)ptr);
1290 
1291   }
1292 
1293 }
1294 
1295 /* *************************************
1296  *  Misc
1297  ***************************************/
1298 XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
1299 
1300   return XXH_VERSION_NUMBER;
1301 
1302 }
1303 
1304 /* *******************************************************************
1305  *  32-bit hash functions
1306  *********************************************************************/
1307 static const xxh_u32 XXH_PRIME32_1 =
1308     0x9E3779B1U;                      /* 0b10011110001101110111100110110001 */
1309 static const xxh_u32 XXH_PRIME32_2 =
1310     0x85EBCA77U;                      /* 0b10000101111010111100101001110111 */
1311 static const xxh_u32 XXH_PRIME32_3 =
1312     0xC2B2AE3DU;                      /* 0b11000010101100101010111000111101 */
1313 static const xxh_u32 XXH_PRIME32_4 =
1314     0x27D4EB2FU;                      /* 0b00100111110101001110101100101111 */
1315 static const xxh_u32 XXH_PRIME32_5 =
1316     0x165667B1U;                      /* 0b00010110010101100110011110110001 */
1317 
1318   #ifdef XXH_OLD_NAMES
1319     #define PRIME32_1 XXH_PRIME32_1
1320     #define PRIME32_2 XXH_PRIME32_2
1321     #define PRIME32_3 XXH_PRIME32_3
1322     #define PRIME32_4 XXH_PRIME32_4
1323     #define PRIME32_5 XXH_PRIME32_5
1324   #endif
1325 
1326 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
1327 
1328   acc += input * XXH_PRIME32_2;
1329   acc = XXH_rotl32(acc, 13);
1330   acc *= XXH_PRIME32_1;
1331   #if defined(__GNUC__) && defined(__SSE4_1__) && \
1332       !defined(XXH_ENABLE_AUTOVECTORIZE)
1333   /*
1334    * UGLY HACK:
1335    * This inline assembly hack forces acc into a normal register. This is the
1336    * only thing that prevents GCC and Clang from autovectorizing the XXH32
   * loop (pragmas and attributes don't work for some reason) without globally
1338    * disabling SSE4.1.
1339    *
1340    * The reason we want to avoid vectorization is because despite working on
1341    * 4 integers at a time, there are multiple factors slowing XXH32 down on
1342    * SSE4:
   * - There's a ridiculous amount of latency from pmulld (10 cycles on newer
   *   chips!), making it slightly slower to multiply four integers at once
   *   than to multiply four integers independently. Even on Sandy/Ivy Bridge,
   *   where pmulld was fastest, it is still not worth going into SSE just to
   *   multiply unless doing a long operation.
1348    *
1349    * - Four instructions are required to rotate,
   *      movdqa tmp,  v // not required with VEX encoding
1351    *      pslld  tmp, 13 // tmp <<= 13
1352    *      psrld  v,   19 // x >>= 19
1353    *      por    v,  tmp // x |= tmp
1354    *   compared to one for scalar:
1355    *      roll   v, 13    // reliably fast across the board
1356    *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
1357    *
1358    * - Instruction level parallelism is actually more beneficial here because
1359    *   the SIMD actually serializes this operation: While v1 is rotating, v2
1360    *   can load data, while v3 can multiply. SSE forces them to operate
1361    *   together.
1362    *
1363    * How this hack works:
   * __asm__(""       // Declare an assembly block but don't declare any instructions
   *          :       // However, as an Input/Output Operand,
   *          "+r"    // constrain a read/write operand (+) as a general purpose register (r),
   *          (acc)   // and set acc as the operand
   * );
1369    *
   * Because of the 'r', the compiler has promised that acc will be in a
   * general purpose register, and the '+' says that it will be 'read/write',
1372    * so it has to assume it has changed. It is like volatile without all the
1373    * loads and stores.
1374    *
1375    * Since the argument has to be in a normal register (not an SSE register),
1376    * each time XXH32_round is called, it is impossible to vectorize.
1377    */
1378   __asm__("" : "+r"(acc));
1379   #endif
1380   return acc;
1381 
1382 }
1383 
1384 /* mix all bits */
1385 static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
1386 
1387   h32 ^= h32 >> 15;
1388   h32 *= XXH_PRIME32_2;
1389   h32 ^= h32 >> 13;
1390   h32 *= XXH_PRIME32_3;
1391   h32 ^= h32 >> 16;
1392   return (h32);
1393 
1394 }
1395 
1396   #define XXH_get32bits(p) XXH_readLE32_align(p, align)
1397 
1398 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
1399                               XXH_alignment align) {
1400 
1403   #define XXH_PROCESS1                           \
1404     do {                                         \
1405                                                  \
1406       h32 += (*ptr++) * XXH_PRIME32_5;           \
1407       h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
1408                                                  \
1409     } while (0)
1410 
1411   #define XXH_PROCESS4                           \
1412     do {                                         \
1413                                                  \
1414       h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
1415       ptr += 4;                                  \
1416       h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
1417                                                  \
1418     } while (0)
1419 
1420   /* Compact rerolled version */
1421   if (XXH_REROLL) {
1422 
1423     len &= 15;
1424     while (len >= 4) {
1425 
1426       XXH_PROCESS4;
1427       len -= 4;
1428 
1429     }
1430 
1431     while (len > 0) {
1432 
1433       XXH_PROCESS1;
1434       --len;
1435 
1436     }
1437 
1438     return XXH32_avalanche(h32);
1439 
1440   } else {
1441 
1442     switch (len & 15) /* or switch(bEnd - p) */ {
1443 
1444       case 12:
1445         XXH_PROCESS4;
1446         /* fallthrough */
1447       case 8:
1448         XXH_PROCESS4;
1449         /* fallthrough */
1450       case 4:
1451         XXH_PROCESS4;
1452         return XXH32_avalanche(h32);
1453 
1454       case 13:
1455         XXH_PROCESS4;
1456         /* fallthrough */
1457       case 9:
1458         XXH_PROCESS4;
1459         /* fallthrough */
1460       case 5:
1461         XXH_PROCESS4;
1462         XXH_PROCESS1;
1463         return XXH32_avalanche(h32);
1464 
1465       case 14:
1466         XXH_PROCESS4;
1467         /* fallthrough */
1468       case 10:
1469         XXH_PROCESS4;
1470         /* fallthrough */
1471       case 6:
1472         XXH_PROCESS4;
1473         XXH_PROCESS1;
1474         XXH_PROCESS1;
1475         return XXH32_avalanche(h32);
1476 
1477       case 15:
1478         XXH_PROCESS4;
1479         /* fallthrough */
1480       case 11:
1481         XXH_PROCESS4;
1482         /* fallthrough */
1483       case 7:
1484         XXH_PROCESS4;
1485         /* fallthrough */
1486       case 3:
1487         XXH_PROCESS1;
1488         /* fallthrough */
1489       case 2:
1490         XXH_PROCESS1;
1491         /* fallthrough */
1492       case 1:
1493         XXH_PROCESS1;
1494         /* fallthrough */
1495       case 0:
1496         return XXH32_avalanche(h32);
1497 
1498     }
1499 
1500     XXH_ASSERT(0);
1501     return h32;                 /* reaching this point is deemed impossible */
1502 
1503   }
1504 
1505 }
1506 
1507   #ifdef XXH_OLD_NAMES
1508     #define PROCESS1 XXH_PROCESS1
1509     #define PROCESS4 XXH_PROCESS4
1510   #else
1511     #undef XXH_PROCESS1
1512     #undef XXH_PROCESS4
1513   #endif
1514 
1515 XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
1516                                             xxh_u32 seed, XXH_alignment align) {
1517 
1518   const xxh_u8 *bEnd = input + len;
1519   xxh_u32       h32;
1520 
1521   #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
1522       (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
1523   if (input == NULL) {
1524 
1525     len = 0;
1526     bEnd = input = (const xxh_u8 *)(size_t)16;
1527 
1528   }
1529 
1530   #endif
1531 
1532   if (len >= 16) {
1533 
1534     const xxh_u8 *const limit = bEnd - 15;
1535     xxh_u32             v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1536     xxh_u32             v2 = seed + XXH_PRIME32_2;
1537     xxh_u32             v3 = seed + 0;
1538     xxh_u32             v4 = seed - XXH_PRIME32_1;
1539 
1540     do {
1541 
1542       v1 = XXH32_round(v1, XXH_get32bits(input));
1543       input += 4;
1544       v2 = XXH32_round(v2, XXH_get32bits(input));
1545       input += 4;
1546       v3 = XXH32_round(v3, XXH_get32bits(input));
1547       input += 4;
1548       v4 = XXH32_round(v4, XXH_get32bits(input));
1549       input += 4;
1550 
1551     } while (input < limit);
1552 
1553     h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +
1554           XXH_rotl32(v4, 18);
1555 
1556   } else {
1557 
1558     h32 = seed + XXH_PRIME32_5;
1559 
1560   }
1561 
1562   h32 += (xxh_u32)len;
1563 
1564   return XXH32_finalize(h32, input, len & 15, align);
1565 
1566 }
1567 
1568 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
1569                                   XXH32_hash_t seed) {
1570 
1571   #if 0
1572     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
1573     XXH32_state_t state;
1574     XXH32_reset(&state, seed);
1575     XXH32_update(&state, (const xxh_u8*)input, len);
1576     return XXH32_digest(&state);
1577 
1578   #else
1579 
1580   if (XXH_FORCE_ALIGN_CHECK) {
1581 
1582     if ((((size_t)input) & 3) ==
1583         0) {        /* Input is 4-bytes aligned, leverage the speed benefit */
1584       return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned);
1585 
1586     }
1587 
1588   }
1589 
1590   return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned);
1591   #endif
1592 
1593 }
1594 
1595 /*******   Hash streaming   *******/
1596 
1597 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) {
1598 
1599   return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t));
1600 
1601 }
1602 
1603 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
1604 
1605   XXH_free(statePtr);
1606   return XXH_OK;
1607 
1608 }
1609 
1610 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *      dstState,
1611                                     const XXH32_state_t *srcState) {
1612 
1613   memcpy(dstState, srcState, sizeof(*dstState));
1614 
1615 }
1616 
1617 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
1618                                          XXH32_hash_t   seed) {
1619 
1620   XXH32_state_t state; /* using a local state to memcpy() in order to avoid
1621                           strict-aliasing warnings */
1622   memset(&state, 0, sizeof(state));
1623   state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1624   state.v2 = seed + XXH_PRIME32_2;
1625   state.v3 = seed + 0;
1626   state.v4 = seed - XXH_PRIME32_1;
1627   /* do not write into reserved, planned to be removed in a future version */
1628   memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
1629   return XXH_OK;
1630 
1631 }
1632 
1633 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
1634                                           const void *input, size_t len) {
1635 
1636   if (input == NULL)
1637   #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
1638       (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
1639     return XXH_OK;
1640   #else
1641     return XXH_ERROR;
1642   #endif
1643 
1644   {
1645 
1646     const xxh_u8 *      p = (const xxh_u8 *)input;
1647     const xxh_u8 *const bEnd = p + len;
1648 
1649     state->total_len_32 += (XXH32_hash_t)len;
1650     state->large_len |=
1651         (XXH32_hash_t)((len >= 16) | (state->total_len_32 >= 16));
1652 
1653     if (state->memsize + len < 16) {                  /* fill in tmp buffer */
1654       XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input, len);
1655       state->memsize += (XXH32_hash_t)len;
1656       return XXH_OK;
1657 
1658     }
1659 
1660     if (state->memsize) {            /* some data left from previous update */
1661       XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input,
1662                  16 - state->memsize);
1663       {
1664 
1665         const xxh_u32 *p32 = state->mem32;
1666         state->v1 = XXH32_round(state->v1, XXH_readLE32(p32));
1667         p32++;
1668         state->v2 = XXH32_round(state->v2, XXH_readLE32(p32));
1669         p32++;
1670         state->v3 = XXH32_round(state->v3, XXH_readLE32(p32));
1671         p32++;
1672         state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
1673 
1674       }
1675 
1676       p += 16 - state->memsize;
1677       state->memsize = 0;
1678 
1679     }
1680 
1681     if (p <= bEnd - 16) {
1682 
1683       const xxh_u8 *const limit = bEnd - 16;
1684       xxh_u32             v1 = state->v1;
1685       xxh_u32             v2 = state->v2;
1686       xxh_u32             v3 = state->v3;
1687       xxh_u32             v4 = state->v4;
1688 
1689       do {
1690 
1691         v1 = XXH32_round(v1, XXH_readLE32(p));
1692         p += 4;
1693         v2 = XXH32_round(v2, XXH_readLE32(p));
1694         p += 4;
1695         v3 = XXH32_round(v3, XXH_readLE32(p));
1696         p += 4;
1697         v4 = XXH32_round(v4, XXH_readLE32(p));
1698         p += 4;
1699 
1700       } while (p <= limit);
1701 
1702       state->v1 = v1;
1703       state->v2 = v2;
1704       state->v3 = v3;
1705       state->v4 = v4;
1706 
1707     }
1708 
1709     if (p < bEnd) {
1710 
1711       XXH_memcpy(state->mem32, p, (size_t)(bEnd - p));
1712       state->memsize = (unsigned)(bEnd - p);
1713 
1714     }
1715 
1716   }
1717 
1718   return XXH_OK;
1719 
1720 }
1721 
1722 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
1723 
1724   xxh_u32 h32;
1725 
1726   if (state->large_len) {
1727 
1728     h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) +
1729           XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
1730 
1731   } else {
1732 
1733     h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
1734 
1735   }
1736 
1737   h32 += state->total_len_32;
1738 
1739   return XXH32_finalize(h32, (const xxh_u8 *)state->mem32, state->memsize,
1740                         XXH_aligned);
1741 
1742 }
1743 
1744 /*******   Canonical representation   *******/
1745 
1746 /*
 * The default return values from XXH functions are unsigned 32- and 64-bit
 * integers.
 *
 * The canonical representation uses the big-endian convention, the same
 * convention as human-readable numbers (most significant digits first).
1752  *
1753  * This way, hash values can be written into a file or buffer, remaining
1754  * comparable across different systems.
1755  *
1756  * The following functions allow transformation of hash values to and from their
1757  * canonical format.
1758  */
1759 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
1760                                             XXH32_hash_t       hash) {
1761 
1762   XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
1763   if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
1764   memcpy(dst, &hash, sizeof(*dst));
1765 
1766 }
1767 
1768 XXH_PUBLIC_API XXH32_hash_t
1769 XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
1770 
1771   return XXH_readBE32(src);
1772 
1773 }
1774 
1775   #ifndef XXH_NO_LONG_LONG
1776 
1777 /* *******************************************************************
1778  *  64-bit hash functions
1779  *********************************************************************/
1780 
1781 /*******   Memory access   *******/
1782 
1783 typedef XXH64_hash_t xxh_u64;
1784 
1785     #ifdef XXH_OLD_NAMES
1786       #define U64 xxh_u64
1787     #endif
1788 
1789     /*!
1790      * XXH_REROLL_XXH64:
1791      * Whether to reroll the XXH64_finalize() loop.
1792      *
1793      * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
1794      * performance gain on 64-bit hosts, as only one jump is required.
1795      *
1796      * However, on 32-bit hosts, because arithmetic needs to be done with two
1797      * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't
1798      * beneficial to unroll. The code becomes ridiculously large (the largest
1799      * function in the binary on i386!), and rerolling it saves anywhere from
1800      * 3kB to 20kB. It is also slightly faster because it fits into cache better
1801      * and is more likely to be inlined by the compiler.
1802      *
1803      * If XXH_REROLL is defined, this is ignored and the loop is always
1804      * rerolled.
1805      */
1806     #ifndef XXH_REROLL_XXH64
1807       #if (defined(__ILP32__) ||                                              \
1808            defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
1809           || !(defined(__x86_64__) || defined(_M_X64) ||                      \
1810                defined(_M_AMD64) /* x86-64 */                                 \
1811                || defined(_M_ARM64) || defined(__aarch64__) ||                \
1812                defined(__arm64__) /* aarch64 */                               \
1813                || defined(__PPC64__) || defined(__PPC64LE__) ||               \
1814                defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */       \
1815                || defined(__mips64__) || defined(__mips64)) /* mips64 */      \
1816           || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX)  /* check limits */
1817         #define XXH_REROLL_XXH64 1
1818       #else
1819         #define XXH_REROLL_XXH64 0
1820       #endif
1821     #endif                                    /* !defined(XXH_REROLL_XXH64) */
1822 
1823     #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1824     /*
1825      * Manual byteshift. Best for old compilers which don't inline memcpy.
1826      * We actually directly use XXH_readLE64 and XXH_readBE64.
1827      */
1828     #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
1829 
/* Force direct memory access. Only works on CPUs which support unaligned
 * memory access in hardware. */
1832 static xxh_u64 XXH_read64(const void *memPtr) {
1833 
1834   return *(const xxh_u64 *)memPtr;
1835 
1836 }
1837 
1838     #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
1839 
1840       /*
1841        * __pack instructions are safer, but compiler specific, hence potentially
1842        * problematic for some compilers.
1843        *
1844        * Currently only defined for GCC and ICC.
1845        */
1846       #ifdef XXH_OLD_NAMES
1847 typedef union {
1848 
1849   xxh_u32 u32;
1850   xxh_u64 u64;
1851 
1852 } __attribute__((packed)) unalign64;
1853 
1854       #endif
1855 static xxh_u64 XXH_read64(const void *ptr) {
1856 
1857   typedef union {
1858 
1859     xxh_u32 u32;
1860     xxh_u64 u64;
1861 
1862   } __attribute__((packed)) xxh_unalign64;
1863 
1864   return ((const xxh_unalign64 *)ptr)->u64;
1865 
1866 }
1867 
1868     #else
1869 
1870 /*
1871  * Portable and safe solution. Generally efficient.
1872  * see: https://stackoverflow.com/a/32095106/646947
1873  */
1874 static xxh_u64 XXH_read64(const void *memPtr) {
1875 
1876   xxh_u64 val;
1877   memcpy(&val, memPtr, sizeof(val));
1878   return val;
1879 
1880 }
1881 
    #endif                                     /* XXH_FORCE_MEMORY_ACCESS */
1883 
1884     #if defined(_MSC_VER)                                  /* Visual Studio */
1885       #define XXH_swap64 _byteswap_uint64
1886     #elif XXH_GCC_VERSION >= 403
1887       #define XXH_swap64 __builtin_bswap64
1888     #else
1889 static xxh_u64 XXH_swap64(xxh_u64 x) {
1890 
1891   return ((x << 56) & 0xff00000000000000ULL) |
1892          ((x << 40) & 0x00ff000000000000ULL) |
1893          ((x << 24) & 0x0000ff0000000000ULL) |
1894          ((x << 8) & 0x000000ff00000000ULL) |
1895          ((x >> 8) & 0x00000000ff000000ULL) |
1896          ((x >> 24) & 0x0000000000ff0000ULL) |
1897          ((x >> 40) & 0x000000000000ff00ULL) |
1898          ((x >> 56) & 0x00000000000000ffULL);
1899 
1900 }
1901 
1902     #endif
1903 
1904     /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
1905     #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1906 
1907 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *memPtr) {
1908 
1909   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1910   return bytePtr[0] | ((xxh_u64)bytePtr[1] << 8) | ((xxh_u64)bytePtr[2] << 16) |
1911          ((xxh_u64)bytePtr[3] << 24) | ((xxh_u64)bytePtr[4] << 32) |
1912          ((xxh_u64)bytePtr[5] << 40) | ((xxh_u64)bytePtr[6] << 48) |
1913          ((xxh_u64)bytePtr[7] << 56);
1914 
1915 }
1916 
1917 XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void *memPtr) {
1918 
1919   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1920   return bytePtr[7] | ((xxh_u64)bytePtr[6] << 8) | ((xxh_u64)bytePtr[5] << 16) |
1921          ((xxh_u64)bytePtr[4] << 24) | ((xxh_u64)bytePtr[3] << 32) |
1922          ((xxh_u64)bytePtr[2] << 40) | ((xxh_u64)bytePtr[1] << 48) |
1923          ((xxh_u64)bytePtr[0] << 56);
1924 
1925 }
1926 
1927     #else
1928 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *ptr) {
1929 
1930   return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
1931 
1932 }
1933 
1934 static xxh_u64 XXH_readBE64(const void *ptr) {
1935 
1936   return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
1937 
1938 }
1939 
1940     #endif
1941 
1942 XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void *  ptr,
1943                                             XXH_alignment align) {
1944 
1945   if (align == XXH_unaligned)
1946     return XXH_readLE64(ptr);
1947   else
1948     return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64 *)ptr
1949                                  : XXH_swap64(*(const xxh_u64 *)ptr);
1950 
1951 }
1952 
1953 /*******   xxh64   *******/
1954 
1955 static const xxh_u64 XXH_PRIME64_1 =
1956     0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111
1957                             */
1958 static const xxh_u64 XXH_PRIME64_2 =
1959     0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111
1960                             */
1961 static const xxh_u64 XXH_PRIME64_3 =
1962     0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001
1963                             */
1964 static const xxh_u64 XXH_PRIME64_4 =
1965     0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011
1966                             */
1967 static const xxh_u64 XXH_PRIME64_5 =
1968     0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101
1969                             */
1970 
1971     #ifdef XXH_OLD_NAMES
1972       #define PRIME64_1 XXH_PRIME64_1
1973       #define PRIME64_2 XXH_PRIME64_2
1974       #define PRIME64_3 XXH_PRIME64_3
1975       #define PRIME64_4 XXH_PRIME64_4
1976       #define PRIME64_5 XXH_PRIME64_5
1977     #endif
1978 
1979 static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) {
1980 
1981   acc += input * XXH_PRIME64_2;
1982   acc = XXH_rotl64(acc, 31);
1983   acc *= XXH_PRIME64_1;
1984   return acc;
1985 
1986 }
1987 
1988 static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) {
1989 
1990   val = XXH64_round(0, val);
1991   acc ^= val;
1992   acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
1993   return acc;
1994 
1995 }
1996 
1997 static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
1998 
1999   h64 ^= h64 >> 33;
2000   h64 *= XXH_PRIME64_2;
2001   h64 ^= h64 >> 29;
2002   h64 *= XXH_PRIME64_3;
2003   h64 ^= h64 >> 32;
2004   return h64;
2005 
2006 }
2007 
2008     #define XXH_get64bits(p) XXH_readLE64_align(p, align)
2009 
2010 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
2011                               XXH_alignment align) {
2012 
2015     #define XXH_PROCESS1_64                        \
2016       do {                                         \
2017                                                    \
2018         h64 ^= (*ptr++) * XXH_PRIME64_5;           \
2019         h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
2020                                                    \
2021       } while (0)
2022 
2023     #define XXH_PROCESS4_64                                        \
2024       do {                                                         \
2025                                                                    \
2026         h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;      \
2027         ptr += 4;                                                  \
2028         h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
2029                                                                    \
2030       } while (0)
2031 
2032     #define XXH_PROCESS8_64                                        \
2033       do {                                                         \
2034                                                                    \
2035         xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));     \
2036         ptr += 8;                                                  \
2037         h64 ^= k1;                                                 \
2038         h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
2039                                                                    \
2040       } while (0)
2041 
2042   /* Rerolled version for 32-bit targets is faster and much smaller. */
2043   if (XXH_REROLL || XXH_REROLL_XXH64) {
2044 
2045     len &= 31;
2046     while (len >= 8) {
2047 
2048       XXH_PROCESS8_64;
2049       len -= 8;
2050 
2051     }
2052 
2053     if (len >= 4) {
2054 
2055       XXH_PROCESS4_64;
2056       len -= 4;
2057 
2058     }
2059 
2060     while (len > 0) {
2061 
2062       XXH_PROCESS1_64;
2063       --len;
2064 
2065     }
2066 
2067     return XXH64_avalanche(h64);
2068 
2069   } else {
2070 
2071     switch (len & 31) {
2072 
2073       case 24:
2074         XXH_PROCESS8_64;
2075         /* fallthrough */
2076       case 16:
2077         XXH_PROCESS8_64;
2078         /* fallthrough */
2079       case 8:
2080         XXH_PROCESS8_64;
2081         return XXH64_avalanche(h64);
2082 
2083       case 28:
2084         XXH_PROCESS8_64;
2085         /* fallthrough */
2086       case 20:
2087         XXH_PROCESS8_64;
2088         /* fallthrough */
2089       case 12:
2090         XXH_PROCESS8_64;
2091         /* fallthrough */
2092       case 4:
2093         XXH_PROCESS4_64;
2094         return XXH64_avalanche(h64);
2095 
2096       case 25:
2097         XXH_PROCESS8_64;
2098         /* fallthrough */
2099       case 17:
2100         XXH_PROCESS8_64;
2101         /* fallthrough */
2102       case 9:
2103         XXH_PROCESS8_64;
2104         XXH_PROCESS1_64;
2105         return XXH64_avalanche(h64);
2106 
2107       case 29:
2108         XXH_PROCESS8_64;
2109         /* fallthrough */
2110       case 21:
2111         XXH_PROCESS8_64;
2112         /* fallthrough */
2113       case 13:
2114         XXH_PROCESS8_64;
2115         /* fallthrough */
2116       case 5:
2117         XXH_PROCESS4_64;
2118         XXH_PROCESS1_64;
2119         return XXH64_avalanche(h64);
2120 
2121       case 26:
2122         XXH_PROCESS8_64;
2123         /* fallthrough */
2124       case 18:
2125         XXH_PROCESS8_64;
2126         /* fallthrough */
2127       case 10:
2128         XXH_PROCESS8_64;
2129         XXH_PROCESS1_64;
2130         XXH_PROCESS1_64;
2131         return XXH64_avalanche(h64);
2132 
2133       case 30:
2134         XXH_PROCESS8_64;
2135         /* fallthrough */
2136       case 22:
2137         XXH_PROCESS8_64;
2138         /* fallthrough */
2139       case 14:
2140         XXH_PROCESS8_64;
2141         /* fallthrough */
2142       case 6:
2143         XXH_PROCESS4_64;
2144         XXH_PROCESS1_64;
2145         XXH_PROCESS1_64;
2146         return XXH64_avalanche(h64);
2147 
2148       case 27:
2149         XXH_PROCESS8_64;
2150         /* fallthrough */
2151       case 19:
2152         XXH_PROCESS8_64;
2153         /* fallthrough */
2154       case 11:
2155         XXH_PROCESS8_64;
2156         XXH_PROCESS1_64;
2157         XXH_PROCESS1_64;
2158         XXH_PROCESS1_64;
2159         return XXH64_avalanche(h64);
2160 
2161       case 31:
2162         XXH_PROCESS8_64;
2163         /* fallthrough */
2164       case 23:
2165         XXH_PROCESS8_64;
2166         /* fallthrough */
2167       case 15:
2168         XXH_PROCESS8_64;
2169         /* fallthrough */
2170       case 7:
2171         XXH_PROCESS4_64;
2172         /* fallthrough */
2173       case 3:
2174         XXH_PROCESS1_64;
2175         /* fallthrough */
2176       case 2:
2177         XXH_PROCESS1_64;
2178         /* fallthrough */
2179       case 1:
2180         XXH_PROCESS1_64;
2181         /* fallthrough */
2182       case 0:
2183         return XXH64_avalanche(h64);
2184 
2185     }
2186 
2187   }
2188 
2189   /* impossible to reach */
2190   XXH_ASSERT(0);
2191   return 0;          /* unreachable, but some compilers complain without it */
2192 
2193 }
2194 
2195     #ifdef XXH_OLD_NAMES
2196       #define PROCESS1_64 XXH_PROCESS1_64
2197       #define PROCESS4_64 XXH_PROCESS4_64
2198       #define PROCESS8_64 XXH_PROCESS8_64
2199     #else
2200       #undef XXH_PROCESS1_64
2201       #undef XXH_PROCESS4_64
2202       #undef XXH_PROCESS8_64
2203     #endif
2204 
2205 XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
2206                                             xxh_u64 seed, XXH_alignment align) {
2207 
2208   const xxh_u8 *bEnd = input + len;
2209   xxh_u64       h64;
2210 
2211     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2212         (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2213   if (input == NULL) {
2214 
2215     len = 0;
2216     bEnd = input = (const xxh_u8 *)(size_t)32;
2217 
2218   }
2219 
2220     #endif
2221 
2222   if (len >= 32) {
2223 
2224     const xxh_u8 *const limit = bEnd - 32;
2225     xxh_u64             v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2226     xxh_u64             v2 = seed + XXH_PRIME64_2;
2227     xxh_u64             v3 = seed + 0;
2228     xxh_u64             v4 = seed - XXH_PRIME64_1;
2229 
2230     do {
2231 
2232       v1 = XXH64_round(v1, XXH_get64bits(input));
2233       input += 8;
2234       v2 = XXH64_round(v2, XXH_get64bits(input));
2235       input += 8;
2236       v3 = XXH64_round(v3, XXH_get64bits(input));
2237       input += 8;
2238       v4 = XXH64_round(v4, XXH_get64bits(input));
2239       input += 8;
2240 
2241     } while (input <= limit);
2242 
2243     h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
2244           XXH_rotl64(v4, 18);
2245     h64 = XXH64_mergeRound(h64, v1);
2246     h64 = XXH64_mergeRound(h64, v2);
2247     h64 = XXH64_mergeRound(h64, v3);
2248     h64 = XXH64_mergeRound(h64, v4);
2249 
2250   } else {
2251 
2252     h64 = seed + XXH_PRIME64_5;
2253 
2254   }
2255 
2256   h64 += (xxh_u64)len;
2257 
2258   return XXH64_finalize(h64, input, len, align);
2259 
2260 }
2261 
2262 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
2263                                   XXH64_hash_t seed) {
2264 
2265     #if 0
2266     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
2267     XXH64_state_t state;
2268     XXH64_reset(&state, seed);
2269     XXH64_update(&state, (const xxh_u8*)input, len);
2270     return XXH64_digest(&state);
2271 
2272     #else
2273 
2274   if (XXH_FORCE_ALIGN_CHECK) {
2275 
2276     if ((((size_t)input) & 7) ==
2277         0) {        /* Input is aligned, let's leverage the speed advantage */
2278       return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned);
2279 
2280     }
2281 
2282   }
2283 
2284   return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned);
2285 
2286     #endif
2287 
2288 }
2289 
2290 /*******   Hash Streaming   *******/
2291 
2292 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) {
2293 
2294   return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t));
2295 
2296 }
2297 
2298 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
2299 
2300   XXH_free(statePtr);
2301   return XXH_OK;
2302 
2303 }
2304 
2305 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *      dstState,
2306                                     const XXH64_state_t *srcState) {
2307 
2308   memcpy(dstState, srcState, sizeof(*dstState));
2309 
2310 }
2311 
2312 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
2313                                          XXH64_hash_t   seed) {
2314 
2315   XXH64_state_t state; /* use a local state to memcpy() in order to avoid
2316                           strict-aliasing warnings */
2317   memset(&state, 0, sizeof(state));
2318   state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2319   state.v2 = seed + XXH_PRIME64_2;
2320   state.v3 = seed + 0;
2321   state.v4 = seed - XXH_PRIME64_1;
2322   /* do not write into reserved64, might be removed in a future version */
2323   memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
2324   return XXH_OK;
2325 
2326 }
2327 
2328 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
2329                                           const void *input, size_t len) {
2330 
2331   if (input == NULL)
2332     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2333         (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2334     return XXH_OK;
2335     #else
2336     return XXH_ERROR;
2337     #endif
2338 
2339   {
2340 
2341     const xxh_u8 *      p = (const xxh_u8 *)input;
2342     const xxh_u8 *const bEnd = p + len;
2343 
2344     state->total_len += len;
2345 
2346     if (state->memsize + len < 32) {                  /* fill in tmp buffer */
2347       XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input, len);
2348       state->memsize += (xxh_u32)len;
2349       return XXH_OK;
2350 
2351     }
2352 
2353     if (state->memsize) {                             /* tmp buffer is full */
2354       XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input,
2355                  32 - state->memsize);
2356       state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0));
2357       state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1));
2358       state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2));
2359       state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3));
2360       p += 32 - state->memsize;
2361       state->memsize = 0;
2362 
2363     }
2364 
2365     if (p + 32 <= bEnd) {
2366 
2367       const xxh_u8 *const limit = bEnd - 32;
2368       xxh_u64             v1 = state->v1;
2369       xxh_u64             v2 = state->v2;
2370       xxh_u64             v3 = state->v3;
2371       xxh_u64             v4 = state->v4;
2372 
2373       do {
2374 
2375         v1 = XXH64_round(v1, XXH_readLE64(p));
2376         p += 8;
2377         v2 = XXH64_round(v2, XXH_readLE64(p));
2378         p += 8;
2379         v3 = XXH64_round(v3, XXH_readLE64(p));
2380         p += 8;
2381         v4 = XXH64_round(v4, XXH_readLE64(p));
2382         p += 8;
2383 
2384       } while (p <= limit);
2385 
2386       state->v1 = v1;
2387       state->v2 = v2;
2388       state->v3 = v3;
2389       state->v4 = v4;
2390 
2391     }
2392 
2393     if (p < bEnd) {
2394 
2395       XXH_memcpy(state->mem64, p, (size_t)(bEnd - p));
2396       state->memsize = (unsigned)(bEnd - p);
2397 
2398     }
2399 
2400   }
2401 
2402   return XXH_OK;
2403 
2404 }
2405 
2406 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
2407 
2408   xxh_u64 h64;
2409 
2410   if (state->total_len >= 32) {
2411 
2412     xxh_u64 const v1 = state->v1;
2413     xxh_u64 const v2 = state->v2;
2414     xxh_u64 const v3 = state->v3;
2415     xxh_u64 const v4 = state->v4;
2416 
2417     h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
2418           XXH_rotl64(v4, 18);
2419     h64 = XXH64_mergeRound(h64, v1);
2420     h64 = XXH64_mergeRound(h64, v2);
2421     h64 = XXH64_mergeRound(h64, v3);
2422     h64 = XXH64_mergeRound(h64, v4);
2423 
2424   } else {
2425 
2426     h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2427 
2428   }
2429 
2430   h64 += (xxh_u64)state->total_len;
2431 
2432   return XXH64_finalize(h64, (const xxh_u8 *)state->mem64,
2433                         (size_t)state->total_len, XXH_aligned);
2434 
2435 }
2436 
2437 /******* Canonical representation   *******/
2438 
2439 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
2440                                             XXH64_hash_t       hash) {
2441 
2442   XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2443   if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2444   memcpy(dst, &hash, sizeof(*dst));
2445 
2446 }
2447 
2448 XXH_PUBLIC_API XXH64_hash_t
2449 XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
2450 
2451   return XXH_readBE64(src);
2452 
2453 }
2454 
2455   /* *********************************************************************
2456    *  XXH3
2457    *  New generation hash designed for speed on small keys and vectorization
2458    ************************************************************************ */
2459 
2460   /* ===   Compiler specifics   === */
2461 
2462     #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L  /* >= C99 */
2463       #define XXH_RESTRICT restrict
2464     #else
2465       /* Note: it might be useful to define __restrict or __restrict__ for some
2466        * C++ compilers */
2467       #define XXH_RESTRICT                                       /* disable */
2468     #endif
2469 
2470     #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
2471         (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
2472         defined(__clang__)
2473       #define XXH_likely(x) __builtin_expect(x, 1)
2474       #define XXH_unlikely(x) __builtin_expect(x, 0)
2475     #else
2476       #define XXH_likely(x) (x)
2477       #define XXH_unlikely(x) (x)
2478     #endif
2479 
2480     #if defined(__GNUC__)
2481       #if defined(__AVX2__)
2482         #include <immintrin.h>
2483       #elif defined(__SSE2__)
2484         #include <emmintrin.h>
2485       #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
2486         #define inline __inline__                 /* circumvent a clang bug */
2487         #include <arm_neon.h>
2488         #undef inline
2489       #endif
2490     #elif defined(_MSC_VER)
2491       #include <intrin.h>
2492     #endif
2493 
2494     /*
2495      * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
2496      * remaining a true 64-bit/128-bit hash function.
2497      *
2498      * This is done by prioritizing a subset of 64-bit operations that can be
2499      * emulated without too many steps on the average 32-bit machine.
2500      *
2501      * For example, these two lines seem similar, and run equally fast on
2502      * 64-bit:
2503      *
2504      *   xxh_u64 x;
2505      *   x ^= (x >> 47); // good
2506      *   x ^= (x >> 13); // bad
2507      *
2508      * However, to a 32-bit machine, there is a major difference.
2509      *
2510      * x ^= (x >> 47) looks like this:
2511      *
2512      *   x.lo ^= (x.hi >> (47 - 32));
2513      *
2514      * while x ^= (x >> 13) looks like this:
2515      *
2516      *   // note: funnel shifts are not usually cheap.
2517      *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
2518      *   x.hi ^= (x.hi >> 13);
2519      *
2520      * The first one is significantly faster than the second, simply because the
2521      * shift is larger than 32. This means:
2522      *  - All the bits we need are in the upper 32 bits, so we can ignore the
     *    lower 32 bits in the shift.
2524      *  - The shift result will always fit in the lower 32 bits, and therefore,
2525      *    we can ignore the upper 32 bits in the xor.
2526      *
2527      * Thanks to this optimization, XXH3 only requires these features to be
2528      * efficient:
2529      *
2530      *  - Usable unaligned access
2531      *  - A 32-bit or 64-bit ALU
2532      *      - If 32-bit, a decent ADC instruction
2533      *  - A 32 or 64-bit multiply with a 64-bit result
2534      *  - For the 128-bit variant, a decent byteswap helps short inputs.
2535      *
2536      * The first two are already required by XXH32, and almost all 32-bit and
2537      * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
2538      *
2539      * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
2540      * notable exception.
2541      *
2542      * First of all, Thumb-1 lacks support for the UMULL instruction which
2543      * performs the important long multiply. This means numerous __aeabi_lmul
2544      * calls.
2545      *
2546      * Second of all, the 8 functional registers are just not enough.
2547      * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
2548      * need Lo registers, and this shuffling results in thousands more MOVs than
2549      * A32.
2550      *
2551      * A32 and T32 don't have this limitation. They can access all 14 registers,
2552      * do a 32->64 multiply with UMULL, and the flexible operand allowing free
2553      * shifts is helpful, too.
2554      *
2555      * Therefore, we do a quick sanity check.
2556      *
2557      * If compiling Thumb-1 for a target which supports ARM instructions, we
2558      * will emit a warning, as it is not a "sane" platform to compile for.
2559      *
2560      * Usually, if this happens, it is because of an accident and you probably
2561      * need to specify -march, as you likely meant to compile for a newer
2562      * architecture.
2563      *
2564      * Credit: large sections of the vectorial and asm source code paths
2565      *         have been contributed by @easyaspi314
2566      */
2567     #if defined(__thumb__) && !defined(__thumb2__) && \
2568         defined(__ARM_ARCH_ISA_ARM)
2569       #warning "XXH3 is highly inefficient without ARM or Thumb-2."
2570     #endif
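    /*
     * Illustrative sketch (not part of the library): how the "shift by more
     * than 32" trick above would look if a 64-bit value were emulated with a
     * hypothetical pair of 32-bit words. Only the high word feeds the shift
     * and only the low word changes, so no funnel shift is needed.
     *
     *   typedef struct { xxh_u32 lo, hi; } xxh_u64_pair;  // hypothetical emulation type
     *
     *   static xxh_u64_pair xorshift47(xxh_u64_pair x)
     *   {
     *       x.lo ^= (x.hi >> (47 - 32));  // x ^= (x >> 47) on a 64-bit machine
     *       return x;
     *   }
     */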
2571 
2572     /* ==========================================
2573      * Vectorization detection
2574      * ========================================== */
2575     #define XXH_SCALAR 0                         /* Portable scalar version */
2576     #define XXH_SSE2 1                 /* SSE2 for Pentium 4 and all x86_64 */
2577     #define XXH_AVX2 2                    /* AVX2 for Haswell and Bulldozer */
2578     #define XXH_AVX512 3                  /* AVX512 for Skylake and Icelake */
2579     #define XXH_NEON 4             /* NEON for most ARMv7-A and all AArch64 */
2580     #define XXH_VSX 5                     /* VSX and ZVector for POWER8/z13 */
2581 
2582     #ifndef XXH_VECTOR                    /* can be defined on command line */
2583       #if defined(__AVX512F__)
2584         #define XXH_VECTOR XXH_AVX512
2585       #elif defined(__AVX2__)
2586         #define XXH_VECTOR XXH_AVX2
2587       #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
2588           (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
2589         #define XXH_VECTOR XXH_SSE2
2590       #elif defined(__GNUC__) /* msvc support maybe later */                   \
2591           && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&                 \
2592           (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
2593            || (defined(__BYTE_ORDER__) &&                                      \
2594                __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
2595         #define XXH_VECTOR XXH_NEON
2596       #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
2597           (defined(__s390x__) && defined(__VEC__)) &&             \
2598               defined(__GNUC__)                             /* TODO: IBM XL */
2599         #define XXH_VECTOR XXH_VSX
2600       #else
2601         #define XXH_VECTOR XXH_SCALAR
2602       #endif
2603     #endif
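    /*
     * Note: the detection above can be overridden from the build command
     * line, e.g. -DXXH_VECTOR=0 (XXH_SCALAR) to force the portable code path,
     * which can be useful for benchmarking or when cross-compiling for a
     * target that cannot run the autodetected instruction set.
     */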
2604 
2605     /*
2606      * Controls the alignment of the accumulator,
2607      * for compatibility with aligned vector loads, which are usually faster.
2608      */
2609     #ifndef XXH_ACC_ALIGN
2610       #if defined(XXH_X86DISPATCH)
2611         #define XXH_ACC_ALIGN 64           /* for compatibility with avx512 */
2612       #elif XXH_VECTOR == XXH_SCALAR                              /* scalar */
2613         #define XXH_ACC_ALIGN 8
2614       #elif XXH_VECTOR == XXH_SSE2                                  /* sse2 */
2615         #define XXH_ACC_ALIGN 16
2616       #elif XXH_VECTOR == XXH_AVX2                                  /* avx2 */
2617         #define XXH_ACC_ALIGN 32
2618       #elif XXH_VECTOR == XXH_NEON                                  /* neon */
2619         #define XXH_ACC_ALIGN 16
2620       #elif XXH_VECTOR == XXH_VSX                                    /* vsx */
2621         #define XXH_ACC_ALIGN 16
2622       #elif XXH_VECTOR == XXH_AVX512                              /* avx512 */
2623         #define XXH_ACC_ALIGN 64
2624       #endif
2625     #endif
2626 
2627     #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
2628         XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
2629       #define XXH_SEC_ALIGN XXH_ACC_ALIGN
2630     #else
2631       #define XXH_SEC_ALIGN 8
2632     #endif
2633 
2634     /*
2635      * UGLY HACK:
2636      * GCC usually generates the best code with -O3 for xxHash.
2637      *
2638      * However, when targeting AVX2, it is overzealous in its unrolling
2639      * resulting in code roughly 3/4 the speed of Clang.
2640      *
2641      * There are other issues, such as GCC splitting _mm256_loadu_si256 into
2642      * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
2643      * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
2644      *
2645      * That is why when compiling the AVX2 version, it is recommended to use
2646      * either -O2 -mavx2 -march=haswell or -O2 -mavx2
2647      * -mno-avx256-split-unaligned-load for decent performance, or to use Clang
2648      * instead.
2649      *
2650      * Fortunately, we can control the first one with a pragma that forces GCC
2651      * into -O2, but the other one we can't control without "failed to inline
2652      * always inline function due to target mismatch" warnings.
2653      */
2654     #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
2655         && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2656         && defined(__OPTIMIZE__) &&                                      \
2657         !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
2658       #pragma GCC push_options
2659       #pragma GCC optimize("-O2")
2660     #endif
2661 
2662     #if XXH_VECTOR == XXH_NEON
2663       /*
2664        * NEON's setup for vmlal_u32 is a little more complicated than it is on
2665        * SSE2, AVX2, and VSX.
2666        *
2667        * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
2668        * upcast.
2669        *
2670        * To do the same operation, the 128-bit 'Q' register needs to be split
       * into two 64-bit 'D' registers, performing this operation:
       *
       *   [                a                 |                 b                ]
       *            |              '---------. .--------'                |
       *            |                         x                          |
       *            |              .---------' '--------.                |
       *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32     ]
2677        *
2678        * Due to significant changes in aarch64, the fastest method for aarch64
2679        * is completely different than the fastest method for ARMv7-A.
2680        *
2681        * ARMv7-A treats D registers as unions overlaying Q registers, so
2682        * modifying D11 will modify the high half of Q5. This is similar to how
2683        * modifying AH will only affect bits 8-15 of AX on x86.
2684        *
2685        * VZIP takes two registers, and puts even lanes in one register and odd
2686        * lanes in the other.
2687        *
2688        * On ARMv7-A, this strangely modifies both parameters in place instead of
2689        * taking the usual 3-operand form.
2690        *
2691        * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on
2692        * the lower and upper halves of the Q register to end up with the high
2693        * and low halves where we want - all in one instruction.
2694        *
       *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
2697        *
       * Unfortunately we need inline assembly for this: instructions modifying
       * two registers at once are not possible in GCC or Clang's IR, and they
       * have to create a copy.
2701        *
2702        * aarch64 requires a different approach.
2703        *
2704        * In order to make it easier to write a decent compiler for aarch64, many
2705        * quirks were removed, such as conditional execution.
2706        *
2707        * NEON was also affected by this.
2708        *
2709        * aarch64 cannot access the high bits of a Q-form register, and writes to
2710        * a D-form register zero the high bits, similar to how writes to W-form
2711        * scalar registers (or DWORD registers on x86_64) work.
2712        *
2713        * The formerly free vget_high intrinsics now require a vext (with a few
2714        * exceptions)
2715        *
2716        * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
2717        * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
2718        * only modify one operand.
2719        *
2720        * The equivalent of the VZIP.32 on the lower and upper halves would be
2721        * this mess:
2722        *
       *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
       *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
       *   zip2    v0.2s, v0.2s, v2.2s     // v0 = { v0[1], v2[1] }
2726        *
2727        * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
2728        * (SHRN):
2729        *
2730        *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
2731        *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
2732        *
2733        * This is available on ARMv7-A, but is less efficient than a single
2734        * VZIP.32.
2735        */
2736 
2737       /*
2738        * Function-like macro:
       * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
       * {
       *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
2744        *     outHi = (uint32x2_t)(in >> 32);
2745        *     in = UNDEFINED;
2746        * }
2747        */
2748       #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
2749           && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__)
2750         #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
2751           do {                                                                                         \
2752                                                                                                        \
2753             /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
2754              * %f0 = upper D half */                                                                   \
2755             /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
2756              */                                                                                        \
2757             /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
2758              */                                                                                        \
2759             __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
2760             (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
2761             (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
2762                                                                                                        \
2763           } while (0)
2764 
2765       #else
2766         #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
2767           do {                                       \
2768                                                      \
2769             (outLo) = vmovn_u64(in);                 \
2770             (outHi) = vshrn_n_u64((in), 32);         \
2771                                                      \
2772           } while (0)
2773 
2774       #endif
2775     #endif                                        /* XXH_VECTOR == XXH_NEON */
2776 
2777     /*
2778      * VSX and Z Vector helpers.
2779      *
2780      * This is very messy, and any pull requests to clean this up are welcome.
2781      *
2782      * There are a lot of problems with supporting VSX and s390x, due to
2783      * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
2784      */
2785     #if XXH_VECTOR == XXH_VSX
2786       #if defined(__s390x__)
2787         #include <s390intrin.h>
2788       #else
        /* gcc's altivec.h has the unwanted side effect of unconditionally
         * #defining the bool, vector, and pixel keywords, which breaks
         * programs already using these identifiers for other purposes.
         * The paragraph defining these macros is skipped when
         * __APPLE_ALTIVEC__ is defined.
         * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
         * compiler, but it seems that, in some cases, it isn't. Force the
         * macro to be defined here, so that the keywords are not altered.
2797          */
2798         #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
2799           #define __APPLE_ALTIVEC__
2800         #endif
2801         #include <altivec.h>
2802       #endif
2803 
2804 typedef __vector unsigned long long xxh_u64x2;
2805 typedef __vector unsigned char      xxh_u8x16;
2806 typedef __vector unsigned           xxh_u32x4;
2807 
2808       #ifndef XXH_VSX_BE
2809         #if defined(__BIG_ENDIAN__) ||  \
2810             (defined(__BYTE_ORDER__) && \
2811              __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
2812           #define XXH_VSX_BE 1
2813         #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
2814             __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
2815           #warning \
2816               "-maltivec=be is not recommended. Please use native endianness."
2817           #define XXH_VSX_BE 1
2818         #else
2819           #define XXH_VSX_BE 0
2820         #endif
2821       #endif                                        /* !defined(XXH_VSX_BE) */
2822 
2823       #if XXH_VSX_BE
2824         /* A wrapper for POWER9's vec_revb. */
2825         #if defined(__POWER9_VECTOR__) || \
2826             (defined(__clang__) && defined(__s390x__))
2827           #define XXH_vec_revb vec_revb
2828         #else
2829 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
2830 
2831   xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
2832                                0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
2833   return vec_perm(val, val, vByteSwap);
2834 
2835 }
2836 
2837         #endif
2838       #endif                                                  /* XXH_VSX_BE */
2839 
2840 /*
2841  * Performs an unaligned load and byte swaps it on big endian.
2842  */
2843 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
2844 
2845   xxh_u64x2 ret;
2846   memcpy(&ret, ptr, sizeof(xxh_u64x2));
2847       #if XXH_VSX_BE
2848   ret = XXH_vec_revb(ret);
2849       #endif
2850   return ret;
2851 
2852 }
2853 
2854       /*
       * vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
       *
       * These intrinsics weren't added until GCC 8, despite existing for a
       * while, and they are endian dependent. Also, their meanings swap
       * depending on the version.
       */
2861       #if defined(__s390x__)
2862       /* s390x is always big endian, no issue on this platform */
2863         #define XXH_vec_mulo vec_mulo
2864         #define XXH_vec_mule vec_mule
2865       #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
2866         /* Clang has a better way to control this, we can just use the builtin
2867          * which doesn't swap. */
2868         #define XXH_vec_mulo __builtin_altivec_vmulouw
2869         #define XXH_vec_mule __builtin_altivec_vmuleuw
2870       #else
2871 /* gcc needs inline assembly */
2872 /* Adapted from
2873  * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
2874 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) {
2875 
2876   xxh_u64x2 result;
2877   __asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
2878   return result;
2879 
2880 }
2881 
2882 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
2883 
2884   xxh_u64x2 result;
2885   __asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
2886   return result;
2887 
2888 }
2889 
2890       #endif                                  /* XXH_vec_mulo, XXH_vec_mule */
2891     #endif                                         /* XXH_VECTOR == XXH_VSX */
2892 
2893     /* prefetch
2894      * can be disabled, by declaring XXH_NO_PREFETCH build macro */
2895     #if defined(XXH_NO_PREFETCH)
2896       #define XXH_PREFETCH(ptr) (void)(ptr)                     /* disabled */
2897     #else
      #if defined(_MSC_VER) && \
          (defined(_M_X64) ||  \
           defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
2902         #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
2903         #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
2904       #elif defined(__GNUC__) && \
2905           ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
2906         #define XXH_PREFETCH(ptr) \
2907           __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
2908       #else
2909         #define XXH_PREFETCH(ptr) (void)(ptr)                   /* disabled */
2910       #endif
2911     #endif                                               /* XXH_NO_PREFETCH */
2912 
2913   /* ==========================================
2914    * XXH3 default settings
2915    * ========================================== */
2916 
    #define XXH_SECRET_DEFAULT_SIZE 192  /* must be >= XXH3_SECRET_SIZE_MIN */
2918 
2919     #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
2920       #error "default keyset is not large enough"
2921     #endif
2922 
2923 /* Pseudorandom secret taken directly from FARSH */
2924 XXH_ALIGN(64)
2925 static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
2926 
2927     0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c,
2928     0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb,
2929     0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e,
2930     0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
2931     0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6,
2932     0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb,
2933     0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97,
2934     0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
2935     0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7,
2936     0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31,
2937     0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83,
2938     0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
2939     0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26,
2940     0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc,
2941     0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f,
2942     0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
2943 
2944 };
2945 
2946     #ifdef XXH_OLD_NAMES
2947       #define kSecret XXH3_kSecret
2948     #endif
2949 
2950     /*
2951      * Calculates a 32-bit to 64-bit long multiply.
2952      *
2953      * Wraps __emulu on MSVC x86 because it tends to call __allmul when it
2954      * doesn't need to (but it shouldn't need to anyways, it is about 7
2955      * instructions to do a 64x64 multiply...). Since we know that this will
2956      * _always_ emit MULL, we use that instead of the normal method.
2957      *
2958      * If you are compiling for platforms like Thumb-1 and don't have a better
2959      * option, you may also want to write your own long multiply routine here.
2960      *
2961      * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
2962      * {
2963 
2964      *    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
2965      * }
2966      */
2967     #if defined(_MSC_VER) && defined(_M_IX86)
2968       #include <intrin.h>
2969       #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
2970     #else
2971       /*
2972        * Downcast + upcast is usually better than masking on older compilers
2973        * like GCC 4.2 (especially 32-bit ones), all without affecting newer
2974        * compilers.
2975        *
2976        * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
2977        * operands and perform a full 64x64 multiply -- entirely redundant on
2978        * 32-bit.
2979        */
2980       #define XXH_mult32to64(x, y) \
2981         ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
2982     #endif
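/*
 * Illustrative example of XXH_mult32to64 (plain arithmetic, not part of the
 * algorithm itself):
 *   XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
 * i.e. the full 64-bit product of the two 32-bit operands, with no
 * truncation to 32 bits.
 */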
2983 
2984 /*
2985  * Calculates a 64->128-bit long multiply.
2986  *
2987  * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
2988  */
2989 static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
2990 
2991     /*
2992      * GCC/Clang __uint128_t method.
2993      *
2994      * On most 64-bit targets, GCC and Clang define a __uint128_t type.
2995      * This is usually the best way as it usually uses a native long 64-bit
2996      * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
2997      *
2998      * Usually.
2999      *
     * Despite wasm being a 32-bit platform, Clang (and Emscripten) still
     * define this type even though there is no native arithmetic for it.
     * This results in a slow compiler-builtin call which calculates a full
     * 128-bit multiply. In that case it is best to use the portable one.
3004      * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
3005      */
3006     #if defined(__GNUC__) && !defined(__wasm__) && \
3007             defined(__SIZEOF_INT128__) ||          \
3008         (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
3009 
3010   __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
3011   XXH128_hash_t     r128;
3012   r128.low64 = (xxh_u64)(product);
3013   r128.high64 = (xxh_u64)(product >> 64);
3014   return r128;
3015 
3016       /*
3017        * MSVC for x64's _umul128 method.
3018        *
3019        * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
3020        * *HighProduct);
3021        *
3022        * This compiles to single operand MUL on x64.
3023        */
3024     #elif defined(_M_X64) || defined(_M_IA64)
3025 
3026       #ifndef _MSC_VER
3027         #pragma intrinsic(_umul128)
3028       #endif
3029   xxh_u64       product_high;
3030   xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
3031   XXH128_hash_t r128;
3032   r128.low64 = product_low;
3033   r128.high64 = product_high;
3034   return r128;
3035 
3036     #else
3037   /*
3038    * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
3039    *
3040    * This is a fast and simple grade school multiply, which is shown below
3041    * with base 10 arithmetic instead of base 0x100000000.
3042    *
3043    *           9 3 // D2 lhs = 93
3044    *         x 7 5 // D2 rhs = 75
3045    *     ----------
3046    *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
3047    *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
3048    *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
3049    *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
3050    *     ---------
3051    *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
3052    *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
3053    *     ---------
3054    *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
3055    *
3056    * The reasons for adding the products like this are:
3057    *  1. It avoids manual carry tracking. Just like how
3058    *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
3059    *     This avoids a lot of complexity.
3060    *
3061    *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
3062    *     instruction available in ARM's Digital Signal Processing extension
3063    *     in 32-bit ARMv6 and later, which is shown below:
3064    *
3065    *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
3066    *         {
3067 
3068    *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
3069    *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
3070    *             *RdHi = (xxh_u32)(product >> 32);
3071    *         }
3072    *
3073    *     This instruction was designed for efficient long multiplication, and
3074    *     allows this to be calculated in only 4 instructions at speeds
3075    *     comparable to some 64-bit ALUs.
3076    *
3077    *  3. It isn't terrible on other platforms. Usually this will be a couple
3078    *     of 32-bit ADD/ADCs.
3079    */
3080 
3081   /* First calculate all of the cross products. */
3082   xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
3083   xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
3084   xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
3085   xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
3086 
3087   /* Now add the products together. These will never overflow. */
3088   xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
3089   xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
3090   xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
3091 
3092   XXH128_hash_t r128;
3093   r128.low64 = lower;
3094   r128.high64 = upper;
3095   return r128;
3096     #endif
3097 
3098 }
3099 
3100 /*
3101  * Does a 64-bit to 128-bit multiply, then XOR folds it.
3102  *
3103  * The reason for the separate function is to prevent passing too many structs
3104  * around by value. This will hopefully inline the multiply, but we don't force
3105  * it.
3106  */
3107 static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
3108 
3109   XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
3110   return product.low64 ^ product.high64;
3111 
3112 }
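/*
 * Worked example (illustrative only): with lhs == rhs == (1ULL << 32),
 * XXH_mult64to128 returns { .low64 = 0, .high64 = 1 } since the product is
 * exactly 2^64, and XXH3_mul128_fold64 therefore returns 0 ^ 1 == 1.
 */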
3113 
3114 /* Seems to produce slightly better code on GCC for some reason. */
3115 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
3116 
3117   XXH_ASSERT(0 <= shift && shift < 64);
3118   return v64 ^ (v64 >> shift);
3119 
3120 }
3121 
3122 /*
3123  * This is a fast avalanche stage,
3124  * suitable when input bits are already partially mixed
3125  */
3126 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) {
3127 
3128   h64 = XXH_xorshift64(h64, 37);
3129   h64 *= 0x165667919E3779F9ULL;
3130   h64 = XXH_xorshift64(h64, 32);
3131   return h64;
3132 
3133 }
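/*
 * Expanded, XXH3_avalanche is simply (a sketch of the same computation):
 *   h64 ^= h64 >> 37;
 *   h64 *= 0x165667919E3779F9ULL;
 *   h64 ^= h64 >> 32;
 */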
3134 
3135 /*
3136  * This is a stronger avalanche,
3137  * inspired by Pelle Evensen's rrmxmx
3138  * preferable when input has not been previously mixed
3139  */
3140 static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) {
3141 
3142   /* this mix is inspired by Pelle Evensen's rrmxmx */
3143   h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
3144   h64 *= 0x9FB21C651E98DF25ULL;
3145   h64 ^= (h64 >> 35) + len;
3146   h64 *= 0x9FB21C651E98DF25ULL;
3147   return XXH_xorshift64(h64, 28);
3148 
3149 }
3150 
3151 /* ==========================================
3152  * Short keys
3153  * ==========================================
 * One of the shortcomings of XXH32 and XXH64 was that their performance was
 * sub-optimal on short lengths. They used an iterative algorithm which
 * strongly favored lengths that were a multiple of 4 or 8.
3157  *
3158  * Instead of iterating over individual inputs, we use a set of single shot
3159  * functions which piece together a range of lengths and operate in constant
3160  * time.
3161  *
3162  * Additionally, the number of multiplies has been significantly reduced. This
3163  * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
3164  *
3165  * Depending on the platform, this may or may not be faster than XXH32, but it
3166  * is almost guaranteed to be faster than XXH64.
3167  */
3168 
3169 /*
3170  * At very short lengths, there isn't enough input to fully hide secrets, or use
3171  * the entire secret.
3172  *
3173  * There is also only a limited amount of mixing we can do before significantly
3174  * impacting performance.
3175  *
3176  * Therefore, we use different sections of the secret and always mix two secret
3177  * samples with an XOR. This should have no effect on performance on the
3178  * seedless or withSeed variants because everything _should_ be constant folded
3179  * by modern compilers.
3180  *
3181  * The XOR mixing hides individual parts of the secret and increases entropy.
3182  *
3183  * This adds an extra layer of strength for custom secrets.
3184  */
3185 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8 *input, size_t len,
3186                                                 const xxh_u8 *secret,
3187                                                 XXH64_hash_t  seed) {
3188 
3189   XXH_ASSERT(input != NULL);
3190   XXH_ASSERT(1 <= len && len <= 3);
3191   XXH_ASSERT(secret != NULL);
3192   /*
3193    * len = 1: combined = { input[0], 0x01, input[0], input[0] }
3194    * len = 2: combined = { input[1], 0x02, input[0], input[1] }
3195    * len = 3: combined = { input[2], 0x03, input[0], input[1] }
3196    */
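  /*
   * Illustrative example: for the 2-byte input "ab" (0x61, 0x62),
   * c1 = 0x61 and c2 = c3 = 0x62, so combined == 0x62610262, matching the
   * len = 2 row above when read in little-endian byte order.
   */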
3197   {
3198 
3199     xxh_u8 const  c1 = input[0];
3200     xxh_u8 const  c2 = input[len >> 1];
3201     xxh_u8 const  c3 = input[len - 1];
3202     xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
3203                              ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
3204     xxh_u64 const bitflip =
3205         (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
3206     xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
3207     return XXH64_avalanche(keyed);
3208 
3209   }
3210 
3211 }
3212 
3213 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
3214                                                 const xxh_u8 *secret,
3215                                                 XXH64_hash_t  seed) {
3216 
3217   XXH_ASSERT(input != NULL);
3218   XXH_ASSERT(secret != NULL);
  XXH_ASSERT(4 <= len && len <= 8);
3220   seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
3221   {
3222 
3223     xxh_u32 const input1 = XXH_readLE32(input);
3224     xxh_u32 const input2 = XXH_readLE32(input + len - 4);
3225     xxh_u64 const bitflip =
3226         (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed;
3227     xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
3228     xxh_u64 const keyed = input64 ^ bitflip;
3229     return XXH3_rrmxmx(keyed, len);
3230 
3231   }
3232 
3233 }
3234 
3235 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
3236                                                  size_t        len,
3237                                                  const xxh_u8 *secret,
3238                                                  XXH64_hash_t  seed) {
3239 
3240   XXH_ASSERT(input != NULL);
3241   XXH_ASSERT(secret != NULL);
3242   XXH_ASSERT(8 <= len && len <= 16);
3243   {
3244 
3245     xxh_u64 const bitflip1 =
3246         (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed;
3247     xxh_u64 const bitflip2 =
3248         (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed;
3249     xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
3250     xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
3251     xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi +
3252                         XXH3_mul128_fold64(input_lo, input_hi);
3253     return XXH3_avalanche(acc);
3254 
3255   }
3256 
3257 }
3258 
3259 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8 *input,
3260                                                  size_t        len,
3261                                                  const xxh_u8 *secret,
3262                                                  XXH64_hash_t  seed) {
3263 
3264   XXH_ASSERT(len <= 16);
3265   {
3266 
3267     if (XXH_likely(len > 8))
3268       return XXH3_len_9to16_64b(input, len, secret, seed);
3269     if (XXH_likely(len >= 4))
3270       return XXH3_len_4to8_64b(input, len, secret, seed);
3271     if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
3272     return XXH64_avalanche(
3273         seed ^ (XXH_readLE64(secret + 56) ^ XXH_readLE64(secret + 64)));
3274 
3275   }
3276 
3277 }
3278 
3279 /*
3280  * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
3281  * multiplication by zero, affecting hashes of lengths 17 to 240.
3282  *
3283  * However, they are very unlikely.
3284  *
3285  * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
3286  * unseeded non-cryptographic hashes, it does not attempt to defend itself
3287  * against specially crafted inputs, only random inputs.
3288  *
3289  * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
3290  * cancelling out the secret is taken an arbitrary number of times (addressed
3291  * in XXH3_accumulate_512), this collision is very unlikely with random inputs
3292  * and/or proper seeding:
3293  *
3294  * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
3295  * function that is only called up to 16 times per hash with up to 240 bytes of
3296  * input.
3297  *
3298  * This is not too bad for a non-cryptographic hash function, especially with
3299  * only 64 bit outputs.
3300  *
3301  * The 128-bit variant (which trades some speed for strength) is NOT affected
3302  * by this, although it is always a good idea to use a proper seed if you care
3303  * about strength.
3304  */
3305 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
3306                                      const xxh_u8 *XXH_RESTRICT secret,
3307                                      xxh_u64                    seed64) {
3308 
3309     #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */ \
3310         && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */     \
3311         &&                                                             \
3312         !defined(                                                      \
3313             XXH_ENABLE_AUTOVECTORIZE)  /* Define to disable like XXH32 hack */
3314   /*
3315    * UGLY HACK:
3316    * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
3317    * slower code.
3318    *
3319    * By forcing seed64 into a register, we disrupt the cost model and
3320    * cause it to scalarize. See `XXH32_round()`
3321    *
3322    * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
3323    * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
3324    * GCC 9.2, despite both emitting scalar code.
3325    *
3326    * GCC generates much better scalar code than Clang for the rest of XXH3,
3327    * which is why finding a more optimal codepath is an interest.
3328    */
3329   __asm__("" : "+r"(seed64));
3330     #endif
3331   {
3332 
3333     xxh_u64 const input_lo = XXH_readLE64(input);
3334     xxh_u64 const input_hi = XXH_readLE64(input + 8);
3335     return XXH3_mul128_fold64(input_lo ^ (XXH_readLE64(secret) + seed64),
3336                               input_hi ^ (XXH_readLE64(secret + 8) - seed64));
3337 
3338   }
3339 
3340 }
3341 
3342 /* For mid range keys, XXH3 uses a Mum-hash variant. */
3343 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
3344     const xxh_u8 *XXH_RESTRICT input, size_t len,
3345     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
3346 
3347   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3348   (void)secretSize;
3349   XXH_ASSERT(16 < len && len <= 128);
3350 
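  /*
   * Illustrative coverage (assuming len == 100, for example): the mixed
   * 16-byte blocks start at offsets 0, 16, 32, 48 from the front and at
   * len-16, len-32, len-48, len-64 from the back, so the two halves overlap
   * in the middle and every input byte is mixed at least once.
   */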
3351   {
3352 
3353     xxh_u64 acc = len * XXH_PRIME64_1;
3354     if (len > 32) {
3355 
3356       if (len > 64) {
3357 
3358         if (len > 96) {
3359 
3360           acc += XXH3_mix16B(input + 48, secret + 96, seed);
3361           acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
3362 
3363         }
3364 
3365         acc += XXH3_mix16B(input + 32, secret + 64, seed);
3366         acc += XXH3_mix16B(input + len - 48, secret + 80, seed);
3367 
3368       }
3369 
3370       acc += XXH3_mix16B(input + 16, secret + 32, seed);
3371       acc += XXH3_mix16B(input + len - 32, secret + 48, seed);
3372 
3373     }
3374 
3375     acc += XXH3_mix16B(input + 0, secret + 0, seed);
3376     acc += XXH3_mix16B(input + len - 16, secret + 16, seed);
3377 
3378     return XXH3_avalanche(acc);
3379 
3380   }
3381 
3382 }
3383 
3384     #define XXH3_MIDSIZE_MAX 240
3385 
3386 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
3387     const xxh_u8 *XXH_RESTRICT input, size_t len,
3388     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
3389 
3390   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3391   (void)secretSize;
3392   XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
3393 
3394     #define XXH3_MIDSIZE_STARTOFFSET 3
3395     #define XXH3_MIDSIZE_LASTOFFSET 17
3396 
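  /*
   * Illustrative numbers (assuming len == 240, the maximum): nbRounds == 15,
   * so the first loop below reads the secret at offsets 0, 16, ..., 112 and
   * the second loop re-reads it at offsets 3, 19, ..., 99 (shifted by
   * XXH3_MIDSIZE_STARTOFFSET), while the last 16 input bytes are mixed with
   * the secret at offset XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET.
   */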
3397   {
3398 
3399     xxh_u64   acc = len * XXH_PRIME64_1;
3400     int const nbRounds = (int)len / 16;
3401     int       i;
3402     for (i = 0; i < 8; i++) {
3403 
3404       acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed);
3405 
3406     }
3407 
3408     acc = XXH3_avalanche(acc);
3409     XXH_ASSERT(nbRounds >= 8);
3410     #if defined(__clang__)                                /* Clang */ \
3411         && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
3412         && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
3413       /*
3414        * UGLY HACK:
       * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
       * Everywhere else, it uses scalar code.
3417        *
3418        * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
3419        * would still be slower than UMAAL (see XXH_mult64to128).
3420        *
3421        * Unfortunately, Clang doesn't handle the long multiplies properly and
3422        * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
3423        * scalarized into an ugly mess of VMOV.32 instructions.
3424        *
3425        * This mess is difficult to avoid without turning autovectorization
3426        * off completely, but they are usually relatively minor and/or not
3427        * worth it to fix.
3428        *
3429        * This loop is the easiest to fix, as unlike XXH32, this pragma
3430        * _actually works_ because it is a loop vectorization instead of an
3431        * SLP vectorization.
3432        */
3433       #pragma clang loop vectorize(disable)
3434     #endif
3435     for (i = 8; i < nbRounds; i++) {
3436 
3437       acc +=
3438           XXH3_mix16B(input + (16 * i),
3439                       secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
3440 
3441     }
3442 
3443     /* last bytes */
3444     acc += XXH3_mix16B(input + len - 16,
3445                        secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET,
3446                        seed);
3447     return XXH3_avalanche(acc);
3448 
3449   }
3450 
3451 }
3452 
3453   /* =======     Long Keys     ======= */
3454 
3455     #define XXH_STRIPE_LEN 64
3456     #define XXH_SECRET_CONSUME_RATE \
3457       8                 /* nb of secret bytes consumed at each accumulation */
3458     #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
3459 
3460     #ifdef XXH_OLD_NAMES
3461       #define STRIPE_LEN XXH_STRIPE_LEN
3462       #define ACC_NB XXH_ACC_NB
3463     #endif
3464 
3465 XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
3466 
3467   if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
3468   memcpy(dst, &v64, sizeof(v64));
3469 
3470 }
3471 
3472     /* Several intrinsic functions below are supposed to accept __int64 as
3473      * argument, as documented in
3474      * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However,
3475      * several environments do not define __int64 type, requiring a workaround.
3476      */
3477     #if !defined(__VMS) &&                                     \
3478         (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
3479                                   (__STDC_VERSION__ >= 199901L) /* C99 */))
3480 typedef int64_t xxh_i64;
3481     #else
3482 /* the following type must have a width of 64-bit */
3483 typedef long long xxh_i64;
3484     #endif
3485 
3486   /*
3487    * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
3488    * most optimized.
3489    *
3490    * It is a hardened version of UMAC, based off of FARSH's implementation.
3491    *
3492    * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
3493    * implementations, and it is ridiculously fast.
3494    *
3495    * We harden it by mixing the original input to the accumulators as well as
3496    * the product.
3497    *
3498    * This means that in the (relatively likely) case of a multiply by zero, the
3499    * original input is preserved.
3500    *
3501    * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
3502    * cross-pollination, as otherwise the upper and lower halves would be
3503    * essentially independent.
3504    *
3505    * This doesn't matter on 64-bit hashes since they all get merged together in
3506    * the end, so we skip the extra step.
3507    *
3508    * Both XXH3_64bits and XXH3_128bits use this subroutine.
3509    */
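  /*
   * Per 64-bit lane, one stripe performs (scalar sketch, mirroring
   * XXH3_accumulate_512_scalar further below):
   *   data_key    = input64 ^ secret64;
   *   acc[i ^ 1] += input64;   (the SIMD variants add a swapped data vector)
   *   acc[i]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);
   */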
3510 
3511     #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
3512 
3513       #ifndef XXH_TARGET_AVX512
3514         #define XXH_TARGET_AVX512               /* disable attribute target */
3515       #endif
3516 
3517 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
3518     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
3519     const void *XXH_RESTRICT secret) {
3520 
3521   XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
3522   XXH_ASSERT((((size_t)acc) & 63) == 0);
3523   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3524 
3525   {
3526 
3527     /* data_vec    = input[0]; */
3528     __m512i const data_vec = _mm512_loadu_si512(input);
3529     /* key_vec     = secret[0]; */
3530     __m512i const key_vec = _mm512_loadu_si512(secret);
3531     /* data_key    = data_vec ^ key_vec; */
3532     __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
3533     /* data_key_lo = data_key >> 32; */
3534     __m512i const data_key_lo =
3535         _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
3536     /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3537     __m512i const product = _mm512_mul_epu32(data_key, data_key_lo);
3538     /* xacc[0] += swap(data_vec); */
3539     __m512i const data_swap =
3540         _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
3541     __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
3542     /* xacc[0] += product; */
3543     *xacc = _mm512_add_epi64(product, sum);
3544 
3545   }
3546 
3547 }
3548 
3549 /*
3550  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
3551  *
3552  * Multiplication isn't perfect, as explained by Google in HighwayHash:
3553  *
3554  *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
3555  *  // varying degrees. In descending order of goodness, bytes
3556  *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
3557  *  // As expected, the upper and lower bytes are much worse.
3558  *
3559  * Source:
3560  * https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
3561  *
3562  * Since our algorithm uses a pseudorandom secret to add some variance into the
3563  * mix, we don't need to (or want to) mix as often or as much as HighwayHash
3564  * does.
3565  *
3566  * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
3567  * extraction.
3568  *
3569  * Both XXH3_64bits and XXH3_128bits use this subroutine.
3570  */
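/*
 * Per 64-bit lane, the scramble is equivalent to (scalar sketch, mirroring
 * XXH3_scrambleAcc_scalar further below):
 *   acc ^= acc >> 47;
 *   acc ^= secret64;
 *   acc *= XXH_PRIME32_1;
 */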
3571 
3572 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
3573     void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
3574 
3575   XXH_ASSERT((((size_t)acc) & 63) == 0);
3576   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3577   {
3578 
3579     XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc;
3580     const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
3581 
3582     /* xacc[0] ^= (xacc[0] >> 47) */
3583     __m512i const acc_vec = *xacc;
3584     __m512i const shifted = _mm512_srli_epi64(acc_vec, 47);
3585     __m512i const data_vec = _mm512_xor_si512(acc_vec, shifted);
3586     /* xacc[0] ^= secret; */
3587     __m512i const key_vec = _mm512_loadu_si512(secret);
3588     __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
3589 
3590     /* xacc[0] *= XXH_PRIME32_1; */
3591     __m512i const data_key_hi =
3592         _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
3593     __m512i const prod_lo = _mm512_mul_epu32(data_key, prime32);
3594     __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
3595     *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
3596 
3597   }
3598 
3599 }
3600 
3601 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
3602     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
3603 
3604   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
3605   XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
3606   XXH_ASSERT(((size_t)customSecret & 63) == 0);
3607   (void)(&XXH_writeLE64);
3608   {
3609 
3610     int const     nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
3611     __m512i const seed = _mm512_mask_set1_epi64(
3612         _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
3613 
3614     XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret;
3615     XXH_ALIGN(64) __m512i *const       dest = (__m512i *)customSecret;
3616     int                                i;
3617     for (i = 0; i < nbRounds; ++i) {
3618 
      /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not
       * 'void const*', so passing the pointer directly would warn
       * "discards 'const' qualifier". */
3621       union {
3622 
3623         XXH_ALIGN(64) const __m512i *cp;
3624         XXH_ALIGN(64) void *p;
3625 
3626       } remote_const_void;
3627 
3628       remote_const_void.cp = src + i;
3629       dest[i] =
3630           _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
3631 
3632     }
3633 
3634   }
3635 
3636 }
3637 
3638     #endif
3639 
3640     #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
3641 
3642       #ifndef XXH_TARGET_AVX2
3643         #define XXH_TARGET_AVX2                 /* disable attribute target */
3644       #endif
3645 
3646 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
3647     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
3648     const void *XXH_RESTRICT secret) {
3649 
3650   XXH_ASSERT((((size_t)acc) & 31) == 0);
3651   {
3652 
3653     XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
    /* Unaligned. This is mainly for pointer arithmetic, and because
     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3657     const __m256i *const xinput = (const __m256i *)input;
3658     /* Unaligned. This is mainly for pointer arithmetic, and because
3659      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3660     const __m256i *const xsecret = (const __m256i *)secret;
3661 
3662     size_t i;
3663     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
3664 
3665       /* data_vec    = xinput[i]; */
3666       __m256i const data_vec = _mm256_loadu_si256(xinput + i);
3667       /* key_vec     = xsecret[i]; */
3668       __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
3669       /* data_key    = data_vec ^ key_vec; */
3670       __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
3671       /* data_key_lo = data_key >> 32; */
3672       __m256i const data_key_lo =
3673           _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
3674       /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3675       __m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
3676       /* xacc[i] += swap(data_vec); */
3677       __m256i const data_swap =
3678           _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
3679       __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
3680       /* xacc[i] += product; */
3681       xacc[i] = _mm256_add_epi64(product, sum);
3682 
3683     }
3684 
3685   }
3686 
3687 }
3688 
3689 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
3690     void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
3691 
3692   XXH_ASSERT((((size_t)acc) & 31) == 0);
3693   {
3694 
3695     XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
3696     /* Unaligned. This is mainly for pointer arithmetic, and because
3697      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3698     const __m256i *const xsecret = (const __m256i *)secret;
3699     const __m256i        prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
3700 
3701     size_t i;
3702     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
3703 
3704       /* xacc[i] ^= (xacc[i] >> 47) */
3705       __m256i const acc_vec = xacc[i];
3706       __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
3707       __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
3708       /* xacc[i] ^= xsecret; */
3709       __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
3710       __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
3711 
3712       /* xacc[i] *= XXH_PRIME32_1; */
3713       __m256i const data_key_hi =
3714           _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
3715       __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
3716       __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
3717       xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
3718 
3719     }
3720 
3721   }
3722 
3723 }
3724 
3725 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
3726     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
3727 
3728   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
3729   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
3730   XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
3731   (void)(&XXH_writeLE64);
3732   XXH_PREFETCH(customSecret);
3733   {
3734 
3735     __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
3736                                            -(xxh_i64)seed64, (xxh_i64)seed64);
3737 
3738     XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
3739     XXH_ALIGN(64) __m256i *            dest = (__m256i *)customSecret;
3740 
3741       #if defined(__GNUC__) || defined(__clang__)
3742     /*
     * On GCC & Clang, marking 'dest' as modified will cause the compiler to:
     *   - not extract the secret from SSE registers in the internal loop
     *   - use fewer registers, and avoid pushing these registers onto the stack
3746      * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
3747      * customSecret, and on aarch64, this prevented LDP from merging two
3748      * loads together for free. Putting the loads together before the stores
3749      * properly generates LDP.
3750      */
3751     __asm__("" : "+r"(dest));
3752       #endif
3753 
    /* GCC -O2 needs the loop unrolled manually */
3755     dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
3756     dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
3757     dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
3758     dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
3759     dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
3760     dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);
3761 
3762   }
3763 
3764 }
3765 
3766     #endif
3767 
3768     #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
3769 
3770       #ifndef XXH_TARGET_SSE2
3771         #define XXH_TARGET_SSE2                 /* disable attribute target */
3772       #endif
3773 
3774 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
3775     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
3776     const void *XXH_RESTRICT secret) {
3777 
3778   /* SSE2 is just a half-scale version of the AVX2 version. */
3779   XXH_ASSERT((((size_t)acc) & 15) == 0);
3780   {
3781 
3782     XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
3783     /* Unaligned. This is mainly for pointer arithmetic, and because
3784      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3785     const __m128i *const xinput = (const __m128i *)input;
3786     /* Unaligned. This is mainly for pointer arithmetic, and because
3787      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3788     const __m128i *const xsecret = (const __m128i *)secret;
3789 
3790     size_t i;
3791     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
3792 
3793       /* data_vec    = xinput[i]; */
3794       __m128i const data_vec = _mm_loadu_si128(xinput + i);
3795       /* key_vec     = xsecret[i]; */
3796       __m128i const key_vec = _mm_loadu_si128(xsecret + i);
3797       /* data_key    = data_vec ^ key_vec; */
3798       __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
3799       /* data_key_lo = data_key >> 32; */
3800       __m128i const data_key_lo =
3801           _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
3802       /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3803       __m128i const product = _mm_mul_epu32(data_key, data_key_lo);
3804       /* xacc[i] += swap(data_vec); */
3805       __m128i const data_swap =
3806           _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
3807       __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
3808       /* xacc[i] += product; */
3809       xacc[i] = _mm_add_epi64(product, sum);
3810 
3811     }
3812 
3813   }
3814 
3815 }
3816 
3817 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
3818     void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
3819 
3820   XXH_ASSERT((((size_t)acc) & 15) == 0);
3821   {
3822 
3823     XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
3824     /* Unaligned. This is mainly for pointer arithmetic, and because
3825      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3826     const __m128i *const xsecret = (const __m128i *)secret;
3827     const __m128i        prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
3828 
3829     size_t i;
3830     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
3831 
3832       /* xacc[i] ^= (xacc[i] >> 47) */
3833       __m128i const acc_vec = xacc[i];
3834       __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
3835       __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
3836       /* xacc[i] ^= xsecret[i]; */
3837       __m128i const key_vec = _mm_loadu_si128(xsecret + i);
3838       __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
3839 
3840       /* xacc[i] *= XXH_PRIME32_1; */
3841       __m128i const data_key_hi =
3842           _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
3843       __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
3844       __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
3845       xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
3846 
3847     }
3848 
3849   }
3850 
3851 }
3852 
3853 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
3854     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
3855 
3856   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
3857   (void)(&XXH_writeLE64);
3858   {
3859 
3860     int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
3861 
3862       #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
    /* MSVC 32-bit mode does not support _mm_set_epi64x before VS2015 */
3864     XXH_ALIGN(16)
3865     const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
3866     __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
3867       #else
3868     __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
3869       #endif
3870     int i;
3871 
3872     XXH_ALIGN(64) const float *const  src = (float const *)XXH3_kSecret;
3873     XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
3874       #if defined(__GNUC__) || defined(__clang__)
3875     /*
     * On GCC & Clang, marking 'dest' as modified will cause the compiler to:
     *   - not extract the secret from SSE registers in the internal loop
     *   - use fewer registers, and avoid pushing these registers onto the stack
3879      */
3880     __asm__("" : "+r"(dest));
3881       #endif
3882 
3883     for (i = 0; i < nbRounds; ++i) {
3884 
3885       dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);
3886 
3887     }
3888 
3889   }
3890 
3891 }
3892 
3893     #endif
3894 
3895     #if (XXH_VECTOR == XXH_NEON)
3896 
3897 XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
3898     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
3899     const void *XXH_RESTRICT secret) {
3900 
3901   XXH_ASSERT((((size_t)acc) & 15) == 0);
3902   {
3903 
3904     XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
3905     /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
3906      */
3907     uint8_t const *const xinput = (const uint8_t *)input;
3908     uint8_t const *const xsecret = (const uint8_t *)secret;
3909 
3910     size_t i;
3911     for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
3912 
3913       /* data_vec = xinput[i]; */
3914       uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
3915       /* key_vec  = xsecret[i];  */
3916       uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3917       uint64x2_t data_key;
3918       uint32x2_t data_key_lo, data_key_hi;
3919       /* xacc[i] += swap(data_vec); */
3920       uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
3921       uint64x2_t const swapped = vextq_u64(data64, data64, 1);
3922       xacc[i] = vaddq_u64(xacc[i], swapped);
3923       /* data_key = data_vec ^ key_vec; */
3924       data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
3925       /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
3926        * data_key_hi = (uint32x2_t) (data_key >> 32);
3927        * data_key = UNDEFINED; */
3928       XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3929       /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
3930       xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
3931 
3932     }
3933 
3934   }
3935 
3936 }
3937 
3938 XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT       acc,
3939                                             const void *XXH_RESTRICT secret) {
3940 
3941   XXH_ASSERT((((size_t)acc) & 15) == 0);
3942 
3943   {
3944 
3945     uint64x2_t *   xacc = (uint64x2_t *)acc;
3946     uint8_t const *xsecret = (uint8_t const *)secret;
3947     uint32x2_t     prime = vdup_n_u32(XXH_PRIME32_1);
3948 
3949     size_t i;
3950     for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
3951 
3952       /* xacc[i] ^= (xacc[i] >> 47); */
3953       uint64x2_t acc_vec = xacc[i];
3954       uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
3955       uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
3956 
3957       /* xacc[i] ^= xsecret[i]; */
3958       uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3959       uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
3960 
3961       /* xacc[i] *= XXH_PRIME32_1 */
3962       uint32x2_t data_key_lo, data_key_hi;
3963       /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
3964        * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
3965        * xacc[i] = UNDEFINED; */
3966       XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3967       { /*
3968          * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
3969          *
3970          * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
3971          * incorrectly "optimize" this:
3972          *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
3973          *   shifted = vshll_n_u32(tmp, 32);
3974          * to this:
3975          *   tmp     = "vmulq_u64"(a, b); // no such thing!
3976          *   shifted = vshlq_n_u64(tmp, 32);
3977          *
3978          * However, unlike SSE, Clang lacks a 64-bit multiply routine
3979          * for NEON, and it scalarizes two 64-bit multiplies instead.
3980          *
3981          * vmull_u32 has the same timing as vmul_u32, and it avoids
3982          * this bug completely.
3983          * See https://bugs.llvm.org/show_bug.cgi?id=39967
3984          */
3985         uint64x2_t prod_hi = vmull_u32(data_key_hi, prime);
3986         /* xacc[i] = prod_hi << 32; */
3987         xacc[i] = vshlq_n_u64(prod_hi, 32);
3988         /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
3989         xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
3990 
3991       }
3992 
3993     }
3994 
3995   }
3996 
3997 }
3998 
3999     #endif
4000 
4001     #if (XXH_VECTOR == XXH_VSX)
4002 
4003 XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
4004                                               const void *XXH_RESTRICT input,
4005                                               const void *XXH_RESTRICT secret) {
4006 
4007   xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;       /* presumed aligned */
4008   xxh_u64x2 const *const xinput =
4009       (xxh_u64x2 const *)input;                 /* no alignment restriction */
4010   xxh_u64x2 const *const xsecret =
4011       (xxh_u64x2 const *)secret;                /* no alignment restriction */
4012   xxh_u64x2 const v32 = {32, 32};
4013   size_t          i;
4014   for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4015 
4016     /* data_vec = xinput[i]; */
4017     xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
4018     /* key_vec = xsecret[i]; */
4019     xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4020     xxh_u64x2 const data_key = data_vec ^ key_vec;
4021     /* shuffled = (data_key << 32) | (data_key >> 32); */
4022     xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4023     /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled &
4024      * 0xFFFFFFFF); */
4025     xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4026     xacc[i] += product;
4027 
4028         /* swap high and low halves */
4029       #ifdef __s390x__
4030     xacc[i] += vec_permi(data_vec, data_vec, 2);
4031       #else
4032     xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
4033       #endif
4034 
4035   }
4036 
4037 }
4038 
4039 XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT       acc,
4040                                            const void *XXH_RESTRICT secret) {
4041 
4042   XXH_ASSERT((((size_t)acc) & 15) == 0);
4043 
4044   {
4045 
4046     xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;
4047     const xxh_u64x2 *const xsecret = (const xxh_u64x2 *)secret;
4048     /* constants */
4049     xxh_u64x2 const v32 = {32, 32};
4050     xxh_u64x2 const v47 = {47, 47};
4051     xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
4052                              XXH_PRIME32_1};
4053     size_t          i;
4054     for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4055 
4056       /* xacc[i] ^= (xacc[i] >> 47); */
4057       xxh_u64x2 const acc_vec = xacc[i];
4058       xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
4059 
4060       /* xacc[i] ^= xsecret[i]; */
4061       xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4062       xxh_u64x2 const data_key = data_vec ^ key_vec;
4063 
4064       /* xacc[i] *= XXH_PRIME32_1 */
4065       /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime &
4066        * 0xFFFFFFFF);  */
4067       xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
4068       /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
4069       xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
4070       xacc[i] = prod_odd + (prod_even << v32);
4071 
4072     }
4073 
4074   }
4075 
4076 }
4077 
4078     #endif
4079 
4080 /* scalar variants - universal */
4081 
4082 XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
4083     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4084     const void *XXH_RESTRICT secret) {
4085 
4086   XXH_ALIGN(XXH_ACC_ALIGN)
4087   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
4088   const xxh_u8 *const xinput =
4089       (const xxh_u8 *)input;                    /* no alignment restriction */
4090   const xxh_u8 *const xsecret =
4091       (const xxh_u8 *)secret;                   /* no alignment restriction */
4092   size_t i;
4093   XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
4094   for (i = 0; i < XXH_ACC_NB; i++) {
4095 
4096     xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
4097     xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);
4098     xacc[i ^ 1] += data_val;                         /* swap adjacent lanes */
4099     xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4100 
4101   }
4102 
4103 }
4104 
4105 XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT       acc,
4106                                               const void *XXH_RESTRICT secret) {
4107 
4108   XXH_ALIGN(XXH_ACC_ALIGN)
4109   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
4110   const xxh_u8 *const xsecret =
4111       (const xxh_u8 *)secret;                   /* no alignment restriction */
4112   size_t i;
4113   XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0);
4114   for (i = 0; i < XXH_ACC_NB; i++) {
4115 
4116     xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i);
4117     xxh_u64       acc64 = xacc[i];
4118     acc64 = XXH_xorshift64(acc64, 47);
4119     acc64 ^= key64;
4120     acc64 *= XXH_PRIME32_1;
4121     xacc[i] = acc64;
4122 
4123   }
4124 
4125 }
4126 
4127 XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
4128     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4129 
4130   /*
4131    * We need a separate pointer for the hack below,
4132    * which requires a non-const pointer.
4133    * Any decent compiler will optimize this out otherwise.
4134    */
4135   const xxh_u8 *kSecretPtr = XXH3_kSecret;
4136   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
4137 
4138     #if defined(__clang__) && defined(__aarch64__)
4139   /*
4140    * UGLY HACK:
4141    * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
4142    * placed sequentially, in order, at the top of the unrolled loop.
4143    *
4144    * While MOVK is great for generating constants (2 cycles for a 64-bit
4145    * constant compared to 4 cycles for LDR), long MOVK chains stall the
4146    * integer pipelines:
4147    *   I   L   S
4148    * MOVK
4149    * MOVK
4150    * MOVK
4151    * MOVK
4152    * ADD
4153    * SUB      STR
4154    *          STR
4155    * By forcing loads from memory (as the asm line causes Clang to assume
4156    * that XXH3_kSecretPtr has been changed), the pipelines are used more
4157    * efficiently:
4158    *   I   L   S
4159    *      LDR
4160    *  ADD LDR
4161    *  SUB     STR
4162    *          STR
4163    * XXH3_64bits_withSeed, len == 256, Snapdragon 835
4164    *   without hack: 2654.4 MB/s
4165    *   with hack:    3202.9 MB/s
4166    */
4167   __asm__("" : "+r"(kSecretPtr));
4168     #endif
4169   /*
4170    * Note: in debug mode, this overrides the asm optimization
4171    * and Clang will emit MOVK chains again.
4172    */
4173   XXH_ASSERT(kSecretPtr == XXH3_kSecret);
4174 
4175   {
4176 
4177     int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
4178     int       i;
4179     for (i = 0; i < nbRounds; i++) {
4180 
4181       /*
4182        * The asm hack causes Clang to assume that kSecretPtr aliases with
4183        * customSecret, and on aarch64, this prevented LDP from merging two
4184        * loads together for free. Putting the loads together before the stores
4185        * properly generates LDP.
4186        */
4187       xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64;
4188       xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64;
4189       XXH_writeLE64((xxh_u8 *)customSecret + 16 * i, lo);
4190       XXH_writeLE64((xxh_u8 *)customSecret + 16 * i + 8, hi);
4191 
4192     }
4193 
4194   }
4195 
4196 }
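/*
 * Equivalently (a sketch of the derivation; the SIMD variants compute the
 * same values): for each 16-byte block b of the default secret,
 *   customSecret[16*b ..]     = readLE64(XXH3_kSecret + 16*b)     + seed64
 *   customSecret[16*b + 8 ..] = readLE64(XXH3_kSecret + 16*b + 8) - seed64
 * with both results stored little-endian.
 */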
4197 
4198 typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
4199                                       const void *);
4200 typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
4201 typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
4202 
4203     #if (XXH_VECTOR == XXH_AVX512)
4204 
4205       #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
4206       #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
4207       #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
4208 
4209     #elif (XXH_VECTOR == XXH_AVX2)
4210 
4211       #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
4212       #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
4213       #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
4214 
4215     #elif (XXH_VECTOR == XXH_SSE2)
4216 
4217       #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
4218       #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
4219       #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
4220 
4221     #elif (XXH_VECTOR == XXH_NEON)
4222 
4223       #define XXH3_accumulate_512 XXH3_accumulate_512_neon
4224       #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
4225       #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4226 
4227     #elif (XXH_VECTOR == XXH_VSX)
4228 
4229       #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
4230       #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
4231       #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4232 
4233     #else                                                         /* scalar */
4234 
4235       #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
4236       #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
4237       #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4238 
4239     #endif
4240 
4241     #ifndef XXH_PREFETCH_DIST
4242       #ifdef __clang__
4243         #define XXH_PREFETCH_DIST 320
4244       #else
4245         #if (XXH_VECTOR == XXH_AVX512)
4246           #define XXH_PREFETCH_DIST 512
4247         #else
4248           #define XXH_PREFETCH_DIST 384
4249         #endif
4250       #endif                                                   /* __clang__ */
4251     #endif                                             /* XXH_PREFETCH_DIST */
4252 
4253 /*
4254  * XXH3_accumulate()
4255  * Loops over XXH3_accumulate_512().
4256  * Assumption: nbStripes will not overflow the secret size
4257  */
4258 XXH_FORCE_INLINE void XXH3_accumulate(xxh_u64 *XXH_RESTRICT      acc,
4259                                       const xxh_u8 *XXH_RESTRICT input,
4260                                       const xxh_u8 *XXH_RESTRICT secret,
4261                                       size_t                     nbStripes,
4262                                       XXH3_f_accumulate_512      f_acc512) {
4263 
4264   size_t n;
4265   for (n = 0; n < nbStripes; n++) {
4266 
4267     const xxh_u8 *const in = input + n * XXH_STRIPE_LEN;
4268     XXH_PREFETCH(in + XXH_PREFETCH_DIST);
4269     f_acc512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
4270 
4271   }
4272 
4273 }
4274 
4275 XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
4276     xxh_u64 *XXH_RESTRICT acc, const xxh_u8 *XXH_RESTRICT input, size_t len,
4277     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
4278     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
4279 
4280   size_t const nbStripesPerBlock =
4281       (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
4282   size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
4283   size_t const nb_blocks = (len - 1) / block_len;
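  /*
   * Illustrative numbers (assuming the default 192-byte secret):
   * nbStripesPerBlock = (192 - 64) / 8 = 16, so block_len = 16 * 64 = 1024
   * bytes and the accumulators are scrambled once per 1 KB block.
   */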
4284 
4285   size_t n;
4286 
4287   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4288 
4289   for (n = 0; n < nb_blocks; n++) {
4290 
4291     XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock,
4292                     f_acc512);
4293     f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
4294 
4295   }
4296 
4297   /* last partial block */
4298   XXH_ASSERT(len > XXH_STRIPE_LEN);
4299   {
4300 
4301     size_t const nbStripes =
4302         ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
4303     XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
4304     XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes,
4305                     f_acc512);
4306 
4307     /* last stripe */
4308     {
4309 
4310       const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
4311     #define XXH_SECRET_LASTACC_START \
4312       7  /* not aligned on 8, last secret is different from acc & scrambler */
4313       f_acc512(acc, p,
4314                secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
4315 
4316     }
4317 
4318   }
4319 
4320 }
4321 
4322 XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64 *XXH_RESTRICT acc,
4323                                        const xxh_u8 *XXH_RESTRICT  secret) {
4324 
4325   return XXH3_mul128_fold64(acc[0] ^ XXH_readLE64(secret),
4326                             acc[1] ^ XXH_readLE64(secret + 8));
4327 
4328 }
4329 
4330 static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
4331                                    const xxh_u8 *XXH_RESTRICT  secret,
4332                                    xxh_u64                     start) {
4333 
4334   xxh_u64 result64 = start;
4335   size_t  i = 0;
4336 
4337   for (i = 0; i < 4; i++) {
4338 
4339     result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
4340     #if defined(__clang__)                                /* Clang */ \
4341         && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
4342         && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
4343         && !defined(XXH_ENABLE_AUTOVECTORIZE)          /* Define to disable */
4344     /*
4345      * UGLY HACK:
     * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
     * the one in XXH3_len_129to240_64b. Speeds up the shorter keys in the
     * > 240 byte range.
4348      * XXH3_64bits, len == 256, Snapdragon 835:
4349      *   without hack: 2063.7 MB/s
4350      *   with hack:    2560.7 MB/s
4351      */
4352     __asm__("" : "+r"(result64));
4353     #endif
4354 
4355   }
4356 
4357   return XXH3_avalanche(result64);
4358 
4359 }
4360 
4361     #define XXH3_INIT_ACC                                              \
4362       {                                                                \
4363                                                                        \
4364         XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
4365             XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
4366                                                                        \
4367       }
4368 
4369 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
4370     const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
4371     size_t secretSize, XXH3_f_accumulate_512 f_acc512,
4372     XXH3_f_scrambleAcc f_scramble) {
4373 
4374   XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
4375 
4376   XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len,
4377                               (const xxh_u8 *)secret, secretSize, f_acc512,
4378                               f_scramble);
4379 
4380   /* converge into final hash */
4381   XXH_STATIC_ASSERT(sizeof(acc) == 64);
4382     /* do not align on 8, so that the secret is different from the accumulator
4383      */
4384     #define XXH_SECRET_MERGEACCS_START 11
4385   XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
4386   return XXH3_mergeAccs(acc,
4387                         (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
4388                         (xxh_u64)len * XXH_PRIME64_1);
4389 
4390 }
4391 
4392 /*
4393  * It's important for performance that XXH3_hashLong is not inlined.
4394  */
4395 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(
4396     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
4397     const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
4398 
4399   (void)seed64;
4400   return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
4401                                     XXH3_accumulate_512, XXH3_scrambleAcc);
4402 
4403 }
4404 
4405 /*
4406  * It's important for performance that XXH3_hashLong is not inlined.
4407  * Since the function is not inlined, the compiler may not be able to understand
4408  * that, in some scenarios, its `secret` argument is actually a compile time
4409  * constant. This variant enforces that the compiler can detect that, and uses
4410  * this opportunity to streamline the generated code for better performance.
4411  */
4412 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_default(
4413     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
4414     const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
4415 
4416   (void)seed64;
4417   (void)secret;
4418   (void)secretLen;
4419   return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret,
4420                                     sizeof(XXH3_kSecret), XXH3_accumulate_512,
4421                                     XXH3_scrambleAcc);
4422 
4423 }
4424 
4425 /*
4426  * XXH3_hashLong_64b_withSeed():
 * Generate a custom key, based on an alteration of the default XXH3_kSecret
 * with the seed, and then use this key for long-mode hashing.
4429  *
4430  * This operation is decently fast but nonetheless costs a little bit of time.
4431  * Try to avoid it whenever possible (typically when seed==0).
4432  *
4433  * It's important for performance that XXH3_hashLong is not inlined. Not sure
4434  * why (uop cache maybe?), but the difference is large and easily measurable.
4435  */
4436 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(
4437     const void *input, size_t len, XXH64_hash_t seed,
4438     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
4439     XXH3_f_initCustomSecret f_initSec) {
4440 
4441   if (seed == 0)
4442     return XXH3_hashLong_64b_internal(
4443         input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
4444   {
4445 
4446     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
4447     f_initSec(secret, seed);
4448     return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
4449                                       f_acc512, f_scramble);
4450 
4451   }
4452 
4453 }
4454 
4455 /*
4456  * It's important for performance that XXH3_hashLong is not inlined.
4457  */
4458 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const void *  input,
4459                                                       size_t        len,
4460                                                       XXH64_hash_t  seed,
4461                                                       const xxh_u8 *secret,
4462                                                       size_t        secretLen) {
4463 
4464   (void)secret;
4465   (void)secretLen;
4466   return XXH3_hashLong_64b_withSeed_internal(
4467       input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc,
4468       XXH3_initCustomSecret);
4469 
4470 }
4471 
4472 typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void *XXH_RESTRICT, size_t,
4473                                           XXH64_hash_t,
4474                                           const xxh_u8 *XXH_RESTRICT, size_t);
4475 
4476 XXH_FORCE_INLINE XXH64_hash_t
4477 XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
4478                      XXH64_hash_t seed64, const void *XXH_RESTRICT secret,
4479                      size_t secretLen, XXH3_hashLong64_f f_hashLong) {
4480 
4481   XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
4482   /*
   * If an action is to be taken when the `secretLen` condition is not
   * respected, it should be done here.
   * For now, it's a contract pre-condition.
   * Adding a check and a branch here would cost performance at every hash.
   * Also, note that the function signature doesn't offer room to return an
   * error.
4488    */
4489   if (len <= 16)
4490     return XXH3_len_0to16_64b((const xxh_u8 *)input, len,
4491                               (const xxh_u8 *)secret, seed64);
4492   if (len <= 128)
4493     return XXH3_len_17to128_64b((const xxh_u8 *)input, len,
4494                                 (const xxh_u8 *)secret, secretLen, seed64);
4495   if (len <= XXH3_MIDSIZE_MAX)
4496     return XXH3_len_129to240_64b((const xxh_u8 *)input, len,
4497                                  (const xxh_u8 *)secret, secretLen, seed64);
4498   return f_hashLong(input, len, seed64, (const xxh_u8 *)secret, secretLen);
4499 
4500 }
4501 
4502 /* ===   Public entry point   === */
4503 
4504 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
4505 
4506   return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
4507                               XXH3_hashLong_64b_default);
4508 
4509 }
4510 
4511 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
4512                                                    size_t      len,
4513                                                    const void *secret,
4514                                                    size_t      secretSize) {
4515 
4516   return XXH3_64bits_internal(input, len, 0, secret, secretSize,
4517                               XXH3_hashLong_64b_withSecret);
4518 
4519 }
4520 
4521 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
4522                                                  XXH64_hash_t seed) {
4523 
4524   return XXH3_64bits_internal(input, len, seed, XXH3_kSecret,
4525                               sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
4526 
4527 }
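
/*
 * Usage sketch for the three one-shot entry points above (illustrative only,
 * not part of the library). `data`, `size` and `mySecret` are placeholder
 * names; a custom secret must hold at least XXH3_SECRET_SIZE_MIN bytes.
 *
 *   XXH64_hash_t h1 = XXH3_64bits(data, size);
 *   XXH64_hash_t h2 = XXH3_64bits_withSeed(data, size, 20210328ULL);
 *   XXH64_hash_t h3 = XXH3_64bits_withSecret(data, size,
 *                                            mySecret, sizeof(mySecret));
 */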
4528 
4529 /* ===   XXH3 streaming   === */
4530 
/*
 * Allocates a pointer that is always aligned to `align`.
 *
 * This must be freed with `XXH_alignedFree()`.
 *
 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in
 * AVX2, nor, on 32-bit, for the 16 byte aligned loads in SSE2 and NEON.
 *
 * This underalignment previously caused a rather obvious crash which went
 * completely unnoticed due to XXH3_createState() not actually being tested.
 * Credit to RedSpah for noticing this bug.
 *
 * The alignment is done manually: functions like posix_memalign or _mm_malloc
 * are avoided. To maintain portability, we would have to write a fallback
 * like this anyway, and besides, testing for the existence of library
 * functions without relying on external build tools is impossible.
 *
 * The method is simple: overallocate, manually align, and store the offset
 * to the original allocation immediately before the returned pointer.
 *
 * `align` must be a power of 2, with 8 <= align <= 128.
 */
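/*
 * Worked example of the offset arithmetic below (illustrative values only):
 * with align == 64 and XXH_malloc() returning base == 0x1009, the offset is
 * 64 - (0x1009 & 63) == 64 - 9 == 55, so ptr == base + 55 == 0x1040, which is
 * 64-byte aligned; ptr[-1] stores 55, which XXH_alignedFree() later subtracts
 * to recover the original allocation.
 */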
4554 static void *XXH_alignedMalloc(size_t s, size_t align) {
4555 
4556   XXH_ASSERT(align <= 128 && align >= 8);                    /* range check */
4557   XXH_ASSERT((align & (align - 1)) == 0);                     /* power of 2 */
4558   XXH_ASSERT(s != 0 && s < (s + align));                  /* empty/overflow */
4559   {  /* Overallocate to make room for manual realignment and an offset byte */
4560     xxh_u8 *base = (xxh_u8 *)XXH_malloc(s + align);
4561     if (base != NULL) {
4562 
4563       /*
4564        * Get the offset needed to align this pointer.
4565        *
4566        * Even if the returned pointer is aligned, there will always be
4567        * at least one byte to store the offset to the original pointer.
4568        */
4569       size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
4570       /* Add the offset for the now-aligned pointer */
4571       xxh_u8 *ptr = base + offset;
4572 
4573       XXH_ASSERT((size_t)ptr % align == 0);
4574 
4575       /* Store the offset immediately before the returned pointer. */
4576       ptr[-1] = (xxh_u8)offset;
4577       return ptr;
4578 
4579     }
4580 
4581     return NULL;
4582 
4583   }
4584 
4585 }
4586 
4587 /*
4588  * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
4589  * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
4590  */
4591 static void XXH_alignedFree(void *p) {
4592 
4593   if (p != NULL) {
4594 
4595     xxh_u8 *ptr = (xxh_u8 *)p;
    /* Get the offset byte we added in XXH_alignedMalloc(). */
4597     xxh_u8 offset = ptr[-1];
4598     /* Free the original malloc'd pointer */
4599     xxh_u8 *base = ptr - offset;
4600     XXH_free(base);
4601 
4602   }
4603 
4604 }
4605 
4606 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
4607 
4608   XXH3_state_t *const state =
4609       (XXH3_state_t *)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
4610   if (state == NULL) return NULL;
4611   XXH3_INITSTATE(state);
4612   return state;
4613 
4614 }
4615 
4616 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
4617 
4618   XXH_alignedFree(statePtr);
4619   return XXH_OK;
4620 
4621 }
4622 
4623 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *      dst_state,
4624                                    const XXH3_state_t *src_state) {
4625 
4626   memcpy(dst_state, src_state, sizeof(*dst_state));
4627 
4628 }
4629 
4630 static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr,
4631                                        XXH64_hash_t seed, const void *secret,
4632                                        size_t secretSize) {
4633 
4634   size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
4635   size_t const initLength =
4636       offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
4637   XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
4638   XXH_ASSERT(statePtr != NULL);
4639   /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
4640   memset((char *)statePtr + initStart, 0, initLength);
4641   statePtr->acc[0] = XXH_PRIME32_3;
4642   statePtr->acc[1] = XXH_PRIME64_1;
4643   statePtr->acc[2] = XXH_PRIME64_2;
4644   statePtr->acc[3] = XXH_PRIME64_3;
4645   statePtr->acc[4] = XXH_PRIME64_4;
4646   statePtr->acc[5] = XXH_PRIME32_2;
4647   statePtr->acc[6] = XXH_PRIME64_5;
4648   statePtr->acc[7] = XXH_PRIME32_1;
4649   statePtr->seed = seed;
4650   statePtr->extSecret = (const unsigned char *)secret;
4651   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4652   statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
4653   statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
4654 
4655 }
4656 
4657 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
4658 
4659   if (statePtr == NULL) return XXH_ERROR;
4660   XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret,
4661                              XXH_SECRET_DEFAULT_SIZE);
4662   return XXH_OK;
4663 
4664 }
4665 
4666 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
4667     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
4668 
4669   if (statePtr == NULL) return XXH_ERROR;
4670   XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
4671   if (secret == NULL) return XXH_ERROR;
4672   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4673   return XXH_OK;
4674 
4675 }
4676 
4677 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
4678                                                         XXH64_hash_t  seed) {
4679 
4680   if (statePtr == NULL) return XXH_ERROR;
4681   if (seed == 0) return XXH3_64bits_reset(statePtr);
4682   if (seed != statePtr->seed)
4683     XXH3_initCustomSecret(statePtr->customSecret, seed);
4684   XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4685   return XXH_OK;
4686 
4687 }
4688 
/* Note : when XXH3_consumeStripes() is invoked,
 * there must be a guarantee that at least one more byte will be consumed from
 * input afterwards, so that the function can blindly consume all stripes
 * using the "normal" secret segment */
4694 XXH_FORCE_INLINE void XXH3_consumeStripes(
4695     xxh_u64 *XXH_RESTRICT acc, size_t *XXH_RESTRICT nbStripesSoFarPtr,
4696     size_t nbStripesPerBlock, const xxh_u8 *XXH_RESTRICT input,
4697     size_t nbStripes, const xxh_u8 *XXH_RESTRICT secret, size_t secretLimit,
4698     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
4699 
4700   XXH_ASSERT(nbStripes <=
4701              nbStripesPerBlock); /* can handle max 1 scramble per invocation */
4702   XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
4703   if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
4704 
4705     /* need a scrambling operation */
4706     size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
4707     size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
4708     XXH3_accumulate(acc, input,
4709                     secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
4710                     nbStripesToEndofBlock, f_acc512);
4711     f_scramble(acc, secret + secretLimit);
4712     XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret,
4713                     nbStripesAfterBlock, f_acc512);
4714     *nbStripesSoFarPtr = nbStripesAfterBlock;
4715 
4716   } else {
4717 
4718     XXH3_accumulate(acc, input,
4719                     secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
4720                     nbStripes, f_acc512);
4721     *nbStripesSoFarPtr += nbStripes;
4722 
4723   }
4724 
4725 }
4726 
4727 /*
4728  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
4729  */
4730 XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
4731                                            const xxh_u8 *input, size_t len,
4732                                            XXH3_f_accumulate_512 f_acc512,
4733                                            XXH3_f_scrambleAcc    f_scramble) {
4734 
4735   if (input == NULL)
4736     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
4737         (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
4738     return XXH_OK;
4739     #else
4740     return XXH_ERROR;
4741     #endif
4742 
4743   {
4744 
4745     const xxh_u8 *const        bEnd = input + len;
4746     const unsigned char *const secret =
4747         (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4748 
4749     state->totalLen += len;
4750 
4751     if (state->bufferedSize + len <=
4752         XXH3_INTERNALBUFFER_SIZE) {                   /* fill in tmp buffer */
4753       XXH_memcpy(state->buffer + state->bufferedSize, input, len);
4754       state->bufferedSize += (XXH32_hash_t)len;
4755       return XXH_OK;
4756 
4757     }
4758 
4759       /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4760 
4761     #define XXH3_INTERNALBUFFER_STRIPES \
4762       (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
4763     XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
4764                       0);                                 /* clean multiple */
4765 
4766     /*
     * Internal buffer is partially filled (always, except at the beginning).
4768      * Complete it, then consume it.
4769      */
4770     if (state->bufferedSize) {
4771 
4772       size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
4773       XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
4774       input += loadSize;
4775       XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
4776                           state->nbStripesPerBlock, state->buffer,
4777                           XXH3_INTERNALBUFFER_STRIPES, secret,
4778                           state->secretLimit, f_acc512, f_scramble);
4779       state->bufferedSize = 0;
4780 
4781     }
4782 
4783     XXH_ASSERT(input < bEnd);
4784 
4785     /* Consume input by a multiple of internal buffer size */
4786     if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) {
4787 
4788       const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
4789       do {
4790 
4791         XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
4792                             state->nbStripesPerBlock, input,
4793                             XXH3_INTERNALBUFFER_STRIPES, secret,
4794                             state->secretLimit, f_acc512, f_scramble);
4795         input += XXH3_INTERNALBUFFER_SIZE;
4796 
4797       } while (input < limit);
4798 
4799       /* for last partial stripe */
4800       memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN,
4801              input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
4802 
4803     }
4804 
4805     XXH_ASSERT(input < bEnd);
4806 
4807     /* Some remaining input (always) : buffer it */
4808     XXH_memcpy(state->buffer, input, (size_t)(bEnd - input));
4809     state->bufferedSize = (XXH32_hash_t)(bEnd - input);
4810 
4811   }
4812 
4813   return XXH_OK;
4814 
4815 }
4816 
4817 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
4818                                                 const void *input, size_t len) {
4819 
4820   return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
4821                      XXH3_scrambleAcc);
4822 
4823 }
4824 
4825 XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t *       acc,
4826                                        const XXH3_state_t * state,
4827                                        const unsigned char *secret) {
4828 
4829   /*
4830    * Digest on a local copy. This way, the state remains unaltered, and it can
4831    * continue ingesting more input afterwards.
4832    */
4833   memcpy(acc, state->acc, sizeof(state->acc));
4834   if (state->bufferedSize >= XXH_STRIPE_LEN) {
4835 
4836     size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
4837     size_t       nbStripesSoFar = state->nbStripesSoFar;
4838     XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock,
4839                         state->buffer, nbStripes, secret, state->secretLimit,
4840                         XXH3_accumulate_512, XXH3_scrambleAcc);
4841     /* last stripe */
4842     XXH3_accumulate_512(acc,
4843                         state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
4844                         secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4845 
4846   } else {                                 /* bufferedSize < XXH_STRIPE_LEN */
4847 
4848     xxh_u8       lastStripe[XXH_STRIPE_LEN];
4849     size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
4850     XXH_ASSERT(state->bufferedSize >
4851                0);                   /* there is always some input buffered */
4852     memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize,
4853            catchupSize);
4854     memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
4855     XXH3_accumulate_512(acc, lastStripe,
4856                         secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4857 
4858   }
4859 
4860 }
4861 
4862 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
4863 
4864   const unsigned char *const secret =
4865       (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4866   if (state->totalLen > XXH3_MIDSIZE_MAX) {
4867 
4868     XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
4869     XXH3_digest_long(acc, state, secret);
4870     return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
4871                           (xxh_u64)state->totalLen * XXH_PRIME64_1);
4872 
4873   }
4874 
4875   /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
4876   if (state->seed)
4877     return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen,
4878                                 state->seed);
4879   return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
4880                                 secret, state->secretLimit + XXH_STRIPE_LEN);
4881 
4882 }
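
/*
 * Streaming usage sketch (illustrative only, not part of the library):
 * feed input in chunks of any size, then read the digest. `nextChunk()`,
 * `buf` and `bufSize` are hypothetical placeholders; error handling is
 * omitted for brevity.
 *
 *   XXH3_state_t *const state = XXH3_createState();
 *   XXH3_64bits_reset(state);                  // or ..._withSeed / _withSecret
 *   while (nextChunk(&buf, &bufSize))          // hypothetical input source
 *     XXH3_64bits_update(state, buf, bufSize);
 *   XXH64_hash_t const hash = XXH3_64bits_digest(state);
 *   XXH3_freeState(state);
 */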
4883 
4884     #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
4885 
4886 XXH_PUBLIC_API void XXH3_generateSecret(void *      secretBuffer,
4887                                         const void *customSeed,
4888                                         size_t      customSeedSize) {
4889 
4890   XXH_ASSERT(secretBuffer != NULL);
4891   if (customSeedSize == 0) {
4892 
4893     memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4894     return;
4895 
4896   }
4897 
4898   XXH_ASSERT(customSeed != NULL);
4899 
4900   {
4901 
4902     size_t const       segmentSize = sizeof(XXH128_hash_t);
4903     size_t const       nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
4904     XXH128_canonical_t scrambler;
4905     XXH64_hash_t       seeds[12];
4906     size_t             segnb;
4907     XXH_ASSERT(nbSegments == 12);
4908     XXH_ASSERT(segmentSize * nbSegments ==
4909                XXH_SECRET_DEFAULT_SIZE);                  /* exact multiple */
4910     XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
4911 
4912     /*
4913      * Copy customSeed to seeds[], truncating or repeating as necessary.
4914      */
4915     {
4916 
4917       size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
4918       size_t filled = toFill;
4919       memcpy(seeds, customSeed, toFill);
4920       while (filled < sizeof(seeds)) {
4921 
4922         toFill = XXH_MIN(filled, sizeof(seeds) - filled);
4923         memcpy((char *)seeds + filled, seeds, toFill);
4924         filled += toFill;
4925 
4926       }
4927 
4928     }
4929 
4930     /* generate secret */
4931     memcpy(secretBuffer, &scrambler, sizeof(scrambler));
4932     for (segnb = 1; segnb < nbSegments; segnb++) {
4933 
4934       size_t const       segmentStart = segnb * segmentSize;
4935       XXH128_canonical_t segment;
4936       XXH128_canonicalFromHash(&segment,
4937                                XXH128(&scrambler, sizeof(scrambler),
4938                                       XXH_readLE64(seeds + segnb) + segnb));
4939       memcpy((char *)secretBuffer + segmentStart, &segment, sizeof(segment));
4940 
4941     }
4942 
4943   }
4944 
4945 }
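
/*
 * Usage sketch for XXH3_generateSecret() (illustrative only, not part of the
 * library): expand arbitrary seed material into a full-size secret, then hash
 * with it. `seedMaterial`/`seedMaterialSize` are placeholder names; the
 * destination buffer must hold XXH_SECRET_DEFAULT_SIZE bytes.
 *
 *   unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
 *   XXH3_generateSecret(secret, seedMaterial, seedMaterialSize);
 *   XXH64_hash_t const hash =
 *       XXH3_64bits_withSecret(data, size, secret, sizeof(secret));
 */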
4946 
4947 /* ==========================================
4948  * XXH3 128 bits (a.k.a XXH128)
4949  * ==========================================
4950  * XXH3's 128-bit variant has better mixing and strength than the 64-bit
4951  * variant, even without counting the significantly larger output size.
4952  *
4953  * For example, extra steps are taken to avoid the seed-dependent collisions
4954  * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
4955  *
4956  * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that long inputs hash about as fast as with the 64-bit
 * version, since this path uses only a slight modification of the 64-bit loop.
4959  *
4960  * XXH128 is also more oriented towards 64-bit machines. It is still extremely
4961  * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
4962  */
4963 
4964 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8 *input,
4965                                                   size_t        len,
4966                                                   const xxh_u8 *secret,
4967                                                   XXH64_hash_t  seed) {
4968 
4969   /* A doubled version of 1to3_64b with different constants. */
4970   XXH_ASSERT(input != NULL);
4971   XXH_ASSERT(1 <= len && len <= 3);
4972   XXH_ASSERT(secret != NULL);
4973   /*
4974    * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
4975    * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
4976    * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
4977    */
4978   {
4979 
4980     xxh_u8 const  c1 = input[0];
4981     xxh_u8 const  c2 = input[len >> 1];
4982     xxh_u8 const  c3 = input[len - 1];
4983     xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
4984                               ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
4985     xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
4986     xxh_u64 const bitflipl =
4987         (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
4988     xxh_u64 const bitfliph =
4989         (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed;
4990     xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
4991     xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
4992     XXH128_hash_t h128;
4993     h128.low64 = XXH64_avalanche(keyed_lo);
4994     h128.high64 = XXH64_avalanche(keyed_hi);
4995     return h128;
4996 
4997   }
4998 
4999 }
5000 
5001 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8 *input,
5002                                                   size_t        len,
5003                                                   const xxh_u8 *secret,
5004                                                   XXH64_hash_t  seed) {
5005 
5006   XXH_ASSERT(input != NULL);
5007   XXH_ASSERT(secret != NULL);
5008   XXH_ASSERT(4 <= len && len <= 8);
5009   seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
5010   {
5011 
5012     xxh_u32 const input_lo = XXH_readLE32(input);
5013     xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
5014     xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
5015     xxh_u64 const bitflip =
5016         (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed;
5017     xxh_u64 const keyed = input_64 ^ bitflip;
5018 
    /* Shift len left so that the added term is even: summed with the odd
     * XXH_PRIME64_1, the multiplier stays odd, avoiding even multipliers. */
5021     XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
5022 
5023     m128.high64 += (m128.low64 << 1);
5024     m128.low64 ^= (m128.high64 >> 3);
5025 
5026     m128.low64 = XXH_xorshift64(m128.low64, 35);
5027     m128.low64 *= 0x9FB21C651E98DF25ULL;
5028     m128.low64 = XXH_xorshift64(m128.low64, 28);
5029     m128.high64 = XXH3_avalanche(m128.high64);
5030     return m128;
5031 
5032   }
5033 
5034 }
5035 
5036 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8 *input,
5037                                                    size_t        len,
5038                                                    const xxh_u8 *secret,
5039                                                    XXH64_hash_t  seed) {
5040 
5041   XXH_ASSERT(input != NULL);
5042   XXH_ASSERT(secret != NULL);
5043   XXH_ASSERT(9 <= len && len <= 16);
5044   {
5045 
5046     xxh_u64 const bitflipl =
5047         (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed;
5048     xxh_u64 const bitfliph =
5049         (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed;
5050     xxh_u64 const input_lo = XXH_readLE64(input);
5051     xxh_u64       input_hi = XXH_readLE64(input + len - 8);
5052     XXH128_hash_t m128 =
5053         XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
5054     /*
     * Put len in the middle of m128 to ensure that the length gets mixed into
     * both the low and high bits in the 128x64 multiply below.
5057      */
5058     m128.low64 += (xxh_u64)(len - 1) << 54;
5059     input_hi ^= bitfliph;
5060     /*
5061      * Add the high 32 bits of input_hi to the high 32 bits of m128, then
5062      * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
5063      * the high 64 bits of m128.
5064      *
5065      * The best approach to this operation is different on 32-bit and 64-bit.
5066      */
5067     if (sizeof(void *) < sizeof(xxh_u64)) {                       /* 32-bit */
5068       /*
5069        * 32-bit optimized version, which is more readable.
5070        *
5071        * On 32-bit, it removes an ADC and delays a dependency between the two
5072        * halves of m128.high64, but it generates an extra mask on 64-bit.
5073        */
5074       m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
5075                      XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
5076 
5077     } else {
5078 
5079       /*
5080        * 64-bit optimized (albeit more confusing) version.
5081        *
5082        * Uses some properties of addition and multiplication to remove the mask:
5083        *
5084        * Let:
5085        *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
5086        *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
5087        *    c = XXH_PRIME32_2
5088        *
5089        *    a + (b * c)
5090        * Inverse Property: x + y - x == y
5091        *    a + (b * (1 + c - 1))
5092        * Distributive Property: x * (y + z) == (x * y) + (x * z)
5093        *    a + (b * 1) + (b * (c - 1))
5094        * Identity Property: x * 1 == x
5095        *    a + b + (b * (c - 1))
5096        *
5097        * Substitute a, b, and c:
5098        *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 -
5099        * 1))
5100        *
5101        * Since input_hi.hi + input_hi.lo == input_hi, we get this:
5102        *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
5103        */
5104       m128.high64 +=
5105           input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
5106 
5107     }
5108 
5109     /* m128 ^= XXH_swap64(m128 >> 64); */
5110     m128.low64 ^= XXH_swap64(m128.high64);
5111 
5112     {                      /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
5113       XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
5114       h128.high64 += m128.high64 * XXH_PRIME64_2;
5115 
5116       h128.low64 = XXH3_avalanche(h128.low64);
5117       h128.high64 = XXH3_avalanche(h128.high64);
5118       return h128;
5119 
5120     }
5121 
5122   }
5123 
5124 }
5125 
5126 /*
5127  * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
5128  */
5129 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8 *input,
5130                                                    size_t        len,
5131                                                    const xxh_u8 *secret,
5132                                                    XXH64_hash_t  seed) {
5133 
5134   XXH_ASSERT(len <= 16);
5135   {
5136 
5137     if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
5138     if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
5139     if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
5140     {
5141 
5142       XXH128_hash_t h128;
5143       xxh_u64 const bitflipl =
5144           XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72);
5145       xxh_u64 const bitfliph =
5146           XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88);
5147       h128.low64 = XXH64_avalanche(seed ^ bitflipl);
5148       h128.high64 = XXH64_avalanche(seed ^ bitfliph);
5149       return h128;
5150 
5151     }
5152 
5153   }
5154 
5155 }
5156 
5157 /*
5158  * A bit slower than XXH3_mix16B, but handles multiply by zero better.
5159  */
5160 XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc,
5161                                              const xxh_u8 *input_1,
5162                                              const xxh_u8 *input_2,
5163                                              const xxh_u8 *secret,
5164                                              XXH64_hash_t  seed) {
5165 
5166   acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
5167   acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
5168   acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
5169   acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
5170   return acc;
5171 
5172 }
5173 
5174 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b(
5175     const xxh_u8 *XXH_RESTRICT input, size_t len,
5176     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
5177 
5178   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5179   (void)secretSize;
5180   XXH_ASSERT(16 < len && len <= 128);
5181 
5182   {
5183 
5184     XXH128_hash_t acc;
5185     acc.low64 = len * XXH_PRIME64_1;
5186     acc.high64 = 0;
5187     if (len > 32) {
5188 
5189       if (len > 64) {
5190 
5191         if (len > 96) {
5192 
5193           acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96,
5194                               seed);
5195 
5196         }
5197 
5198         acc =
5199             XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
5200 
5201       }
5202 
5203       acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
5204 
5205     }
5206 
5207     acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
5208     {
5209 
5210       XXH128_hash_t h128;
5211       h128.low64 = acc.low64 + acc.high64;
5212       h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
5213                     ((len - seed) * XXH_PRIME64_2);
5214       h128.low64 = XXH3_avalanche(h128.low64);
5215       h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
5216       return h128;
5217 
5218     }
5219 
5220   }
5221 
5222 }
5223 
5224 XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b(
5225     const xxh_u8 *XXH_RESTRICT input, size_t len,
5226     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
5227 
5228   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5229   (void)secretSize;
5230   XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
5231 
5232   {
5233 
5234     XXH128_hash_t acc;
5235     int const     nbRounds = (int)len / 32;
5236     int           i;
5237     acc.low64 = len * XXH_PRIME64_1;
5238     acc.high64 = 0;
5239     for (i = 0; i < 4; i++) {
5240 
5241       acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
5242                           secret + (32 * i), seed);
5243 
5244     }
5245 
5246     acc.low64 = XXH3_avalanche(acc.low64);
5247     acc.high64 = XXH3_avalanche(acc.high64);
5248     XXH_ASSERT(nbRounds >= 4);
5249     for (i = 4; i < nbRounds; i++) {
5250 
5251       acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
5252                           secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
5253                           seed);
5254 
5255     }
5256 
5257     /* last bytes */
5258     acc = XXH128_mix32B(
5259         acc, input + len - 16, input + len - 32,
5260         secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
5261         0ULL - seed);
5262 
5263     {
5264 
5265       XXH128_hash_t h128;
5266       h128.low64 = acc.low64 + acc.high64;
5267       h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
5268                     ((len - seed) * XXH_PRIME64_2);
5269       h128.low64 = XXH3_avalanche(h128.low64);
5270       h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
5271       return h128;
5272 
5273     }
5274 
5275   }
5276 
5277 }
5278 
5279 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(
5280     const void *XXH_RESTRICT input, size_t len,
5281     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
5282     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
5283 
5284   XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5285 
5286   XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len, secret,
5287                               secretSize, f_acc512, f_scramble);
5288 
5289   /* converge into final hash */
5290   XXH_STATIC_ASSERT(sizeof(acc) == 64);
5291   XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
5292   {
5293 
5294     XXH128_hash_t h128;
5295     h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
5296                                 (xxh_u64)len * XXH_PRIME64_1);
5297     h128.high64 = XXH3_mergeAccs(
5298         acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
5299         ~((xxh_u64)len * XXH_PRIME64_2));
5300     return h128;
5301 
5302   }
5303 
5304 }
5305 
5306 /*
5307  * It's important for performance that XXH3_hashLong is not inlined.
5308  */
5309 XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_default(
5310     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5311     const void *XXH_RESTRICT secret, size_t secretLen) {
5312 
5313   (void)seed64;
5314   (void)secret;
5315   (void)secretLen;
5316   return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret,
5317                                      sizeof(XXH3_kSecret), XXH3_accumulate_512,
5318                                      XXH3_scrambleAcc);
5319 
5320 }
5321 
5322 /*
5323  * It's important for performance that XXH3_hashLong is not inlined.
5324  */
5325 XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(
5326     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5327     const void *XXH_RESTRICT secret, size_t secretLen) {
5328 
5329   (void)seed64;
5330   return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
5331                                      secretLen, XXH3_accumulate_512,
5332                                      XXH3_scrambleAcc);
5333 
5334 }
5335 
5336 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(
5337     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5338     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
5339     XXH3_f_initCustomSecret f_initSec) {
5340 
5341   if (seed64 == 0)
5342     return XXH3_hashLong_128b_internal(
5343         input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
5344   {
5345 
5346     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5347     f_initSec(secret, seed64);
5348     return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
5349                                        sizeof(secret), f_acc512, f_scramble);
5350 
5351   }
5352 
5353 }
5354 
5355 /*
5356  * It's important for performance that XXH3_hashLong is not inlined.
5357  */
5358 XXH_NO_INLINE XXH128_hash_t
5359 XXH3_hashLong_128b_withSeed(const void *input, size_t len, XXH64_hash_t seed64,
5360                             const void *XXH_RESTRICT secret, size_t secretLen) {
5361 
5362   (void)secret;
5363   (void)secretLen;
5364   return XXH3_hashLong_128b_withSeed_internal(
5365       input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc,
5366       XXH3_initCustomSecret);
5367 
5368 }
5369 
5370 typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void *XXH_RESTRICT, size_t,
5371                                             XXH64_hash_t,
5372                                             const void *XXH_RESTRICT, size_t);
5373 
5374 XXH_FORCE_INLINE XXH128_hash_t
5375 XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
5376                       const void *XXH_RESTRICT secret, size_t secretLen,
5377                       XXH3_hashLong128_f f_hl128) {
5378 
5379   XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
5380   /*
   * If an action is to be taken when the `secret` conditions are not
   * respected, it should be done here.
5383    * For now, it's a contract pre-condition.
5384    * Adding a check and a branch here would cost performance at every hash.
5385    */
5386   if (len <= 16)
5387     return XXH3_len_0to16_128b((const xxh_u8 *)input, len,
5388                                (const xxh_u8 *)secret, seed64);
5389   if (len <= 128)
5390     return XXH3_len_17to128_128b((const xxh_u8 *)input, len,
5391                                  (const xxh_u8 *)secret, secretLen, seed64);
5392   if (len <= XXH3_MIDSIZE_MAX)
5393     return XXH3_len_129to240_128b((const xxh_u8 *)input, len,
5394                                   (const xxh_u8 *)secret, secretLen, seed64);
5395   return f_hl128(input, len, seed64, secret, secretLen);
5396 
5397 }
5398 
5399 /* ===   Public XXH128 API   === */
5400 
5401 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
5402 
5403   return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
5404                                sizeof(XXH3_kSecret),
5405                                XXH3_hashLong_128b_default);
5406 
5407 }
5408 
5409 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
5410                                                      size_t      len,
5411                                                      const void *secret,
5412                                                      size_t      secretSize) {
5413 
5414   return XXH3_128bits_internal(input, len, 0, (const xxh_u8 *)secret,
5415                                secretSize, XXH3_hashLong_128b_withSecret);
5416 
5417 }
5418 
5419 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void * input,
5420                                                    size_t       len,
5421                                                    XXH64_hash_t seed) {
5422 
5423   return XXH3_128bits_internal(input, len, seed, XXH3_kSecret,
5424                                sizeof(XXH3_kSecret),
5425                                XXH3_hashLong_128b_withSeed);
5426 
5427 }
5428 
5429 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
5430                                     XXH64_hash_t seed) {
5431 
5432   return XXH3_128bits_withSeed(input, len, seed);
5433 
5434 }
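
/*
 * Usage sketch (illustrative only, not part of the library): one-shot 128-bit
 * hashing. XXH128() is a convenience alias for XXH3_128bits_withSeed(); use
 * XXH3_128bits() when no seed is needed.
 *
 *   XXH128_hash_t const h = XXH128(data, size, mySeed);
 *   // h.low64 and h.high64 together form the 128-bit result
 */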
5435 
5436 /* ===   XXH3 128-bit streaming   === */
5437 
5438 /*
 * All the functions are actually the same as in the 64-bit streaming variant.
 * The only difference is the finalization routine.
5441  */
5442 
5443 static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
5444                                         XXH64_hash_t seed, const void *secret,
5445                                         size_t secretSize) {
5446 
5447   XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
5448 
5449 }
5450 
5451 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
5452 
5453   if (statePtr == NULL) return XXH_ERROR;
5454   XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
5455                               XXH_SECRET_DEFAULT_SIZE);
5456   return XXH_OK;
5457 
5458 }
5459 
5460 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
5461     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
5462 
5463   if (statePtr == NULL) return XXH_ERROR;
5464   XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
5465   if (secret == NULL) return XXH_ERROR;
5466   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5467   return XXH_OK;
5468 
5469 }
5470 
5471 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
5472                                                          XXH64_hash_t  seed) {
5473 
5474   if (statePtr == NULL) return XXH_ERROR;
5475   if (seed == 0) return XXH3_128bits_reset(statePtr);
5476   if (seed != statePtr->seed)
5477     XXH3_initCustomSecret(statePtr->customSecret, seed);
5478   XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
5479   return XXH_OK;
5480 
5481 }
5482 
5483 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
5484                                                  const void *  input,
5485                                                  size_t        len) {
5486 
5487   return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
5488                      XXH3_scrambleAcc);
5489 
5490 }
5491 
5492 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
5493 
5494   const unsigned char *const secret =
5495       (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5496   if (state->totalLen > XXH3_MIDSIZE_MAX) {
5497 
5498     XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
5499     XXH3_digest_long(acc, state, secret);
5500     XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >=
5501                sizeof(acc) + XXH_SECRET_MERGEACCS_START);
5502     {
5503 
5504       XXH128_hash_t h128;
5505       h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
5506                                   (xxh_u64)state->totalLen * XXH_PRIME64_1);
5507       h128.high64 =
5508           XXH3_mergeAccs(acc,
5509                          secret + state->secretLimit + XXH_STRIPE_LEN -
5510                              sizeof(acc) - XXH_SECRET_MERGEACCS_START,
5511                          ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
5512       return h128;
5513 
5514     }
5515 
5516   }
5517 
5518   /* len <= XXH3_MIDSIZE_MAX : short code */
5519   if (state->seed)
5520     return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen,
5521                                  state->seed);
5522   return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
5523                                  secret, state->secretLimit + XXH_STRIPE_LEN);
5524 
5525 }
5526 
5527   /* 128-bit utility functions */
5528 
5529     #include <string.h>                                   /* memcmp, memcpy */
5530 
/* return : 1 if equal, 0 if different */
5532 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
5533 
5534   /* note : XXH128_hash_t is compact, it has no padding byte */
5535   return !(memcmp(&h1, &h2, sizeof(h1)));
5536 
5537 }
5538 
5539 /* This prototype is compatible with stdlib's qsort().
5540  * return : >0 if *h128_1  > *h128_2
5541  *          <0 if *h128_1  < *h128_2
5542  *          =0 if *h128_1 == *h128_2  */
5543 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
5544 
5545   XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
5546   XXH128_hash_t const h2 = *(const XXH128_hash_t *)h128_2;
5547   int const           hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
5548   /* note : bets that, in most cases, hash values are different */
5549   if (hcmp) return hcmp;
5550   return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
5551 
5552 }
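
/*
 * Usage sketch (illustrative only, not part of the library): sorting an array
 * of 128-bit hashes with stdlib's qsort() and the comparator above.
 * `hashes` and `nbHashes` are placeholder names.
 *
 *   #include <stdlib.h>   // qsort
 *   qsort(hashes, nbHashes, sizeof(XXH128_hash_t), XXH128_cmp);
 */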
5553 
5554 /*======   Canonical representation   ======*/
5555 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
5556                                              XXH128_hash_t       hash) {
5557 
5558   XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
5559   if (XXH_CPU_LITTLE_ENDIAN) {
5560 
5561     hash.high64 = XXH_swap64(hash.high64);
5562     hash.low64 = XXH_swap64(hash.low64);
5563 
5564   }
5565 
5566   memcpy(dst, &hash.high64, sizeof(hash.high64));
5567   memcpy((char *)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
5568 
5569 }
5570 
5571 XXH_PUBLIC_API XXH128_hash_t
5572 XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
5573 
5574   XXH128_hash_t h;
5575   h.high64 = XXH_readBE64(src);
5576   h.low64 = XXH_readBE64(src->digest + 8);
5577   return h;
5578 
5579 }
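
/*
 * Usage sketch (illustrative only, not part of the library): the canonical
 * form is a fixed big-endian byte layout, suitable for storage or network
 * transmission; a round trip restores the original value on any platform.
 *
 *   XXH128_canonical_t canon;
 *   XXH128_canonicalFromHash(&canon, h);                 // h : XXH128_hash_t
 *   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *   // XXH128_isEqual(h, back) reports 1
 */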
5580 
5581     /* Pop our optimization override from above */
5582     #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
5583         && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
5584         && defined(__OPTIMIZE__) &&                                      \
5585         !defined(__OPTIMIZE_SIZE__)                  /* respect -O0 and -Os */
5586       #pragma GCC pop_options
5587     #endif
5588 
5589   #endif                                                /* XXH_NO_LONG_LONG */
5590 
5591 #endif                                                /* XXH_IMPLEMENTATION */
5592 
5593 #if defined(__cplusplus)
5594 
5595 }
5596 
5597 #endif
5598 
5599