1 /* 2 * xxHash - Extremely Fast Hash algorithm 3 * Header File 4 * Copyright (C) 2012-2020 Yann Collet 5 * 6 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are 10 * met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above 15 * copyright notice, this list of conditions and the following disclaimer 16 * in the documentation and/or other materials provided with the 17 * distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * You can contact the author at: 32 * - xxHash homepage: https://www.xxhash.com 33 * - xxHash source repository: https://github.com/Cyan4973/xxHash 34 */ 35 36 /* TODO: update */ 37 /* Notice extracted from xxHash homepage: 38 39 xxHash is an extremely fast hash algorithm, running at RAM speed limits. 40 It also successfully passes all tests from the SMHasher suite. 41 42 Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo 43 @3GHz) 44 45 Name Speed Q.Score Author 46 xxHash 5.4 GB/s 10 47 CrapWow 3.2 GB/s 2 Andrew 48 MurmurHash 3a 2.7 GB/s 10 Austin Appleby 49 SpookyHash 2.0 GB/s 10 Bob Jenkins 50 SBox 1.4 GB/s 9 Bret Mulvey 51 Lookup3 1.2 GB/s 9 Bob Jenkins 52 SuperFastHash 1.2 GB/s 1 Paul Hsieh 53 CityHash64 1.05 GB/s 10 Pike & Alakuijala 54 FNV 0.55 GB/s 5 Fowler, Noll, Vo 55 CRC32 0.43 GB/s 9 56 MD5-32 0.33 GB/s 10 Ronald L. Rivest 57 SHA1-32 0.28 GB/s 10 58 59 Q.Score is a measure of quality of the hash function. 60 It depends on successfully passing the SMHasher test set. 61 10 is a perfect score. 62 63 Note: SMHasher's CRC32 implementation is not the fastest one. 64 Other speed-oriented implementations can be faster, 65 especially in combination with the PCLMUL instruction: 66 https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 67 68 A 64-bit version, named XXH64, is available since r35. 69 It offers much better speed, but for 64-bit applications only. 70 Name Speed on 64 bits Speed on 32 bits 71 XXH64 13.8 GB/s 1.9 GB/s 72 XXH32 6.8 GB/s 6.0 GB/s 73 */ 74 75 #if defined(__cplusplus) 76 extern "C" { 77 78 #endif 79 80 /* **************************** 81 * INLINE mode 82 ******************************/ 83 /*! 84 * XXH_INLINE_ALL (and XXH_PRIVATE_API) 85 * Use these build macros to inline xxhash into the target unit.
86 * Inlining improves performance on small inputs, especially when the length is 87 * expressed as a compile-time constant: 88 * 89 * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html 90 * 91 * It also keeps xxHash symbols private to the unit, so they are not exported. 92 * 93 * Usage: 94 * #define XXH_INLINE_ALL 95 * #include "xxhash.h" 96 * 97 * Do not compile and link xxhash.o as a separate object, as it is not useful. 98 */ 99 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) && \ 100 !defined(XXH_INLINE_ALL_31684351384) 101 /* this section should be traversed only once */ 102 #define XXH_INLINE_ALL_31684351384 103 /* give access to the advanced API, required to compile implementations */ 104 #undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ 105 #define XXH_STATIC_LINKING_ONLY 106 /* make all functions private */ 107 #undef XXH_PUBLIC_API 108 #if defined(__GNUC__) 109 #define XXH_PUBLIC_API static __inline __attribute__((unused)) 110 #elif defined(__cplusplus) || \ 111 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) 112 #define XXH_PUBLIC_API static inline 113 #elif defined(_MSC_VER) 114 #define XXH_PUBLIC_API static __inline 115 #else 116 /* note: this version may generate warnings for unused static functions */ 117 #define XXH_PUBLIC_API static 118 #endif 119 120 /* 121 * This part deals with the special case where a unit wants to inline xxHash, 122 * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such 123 * as part of some previously included *.h header file. 124 * Without further action, the new include would just be ignored, 125 * and functions would effectively _not_ be inlined (silent failure). 126 * The following macros solve this situation by prefixing all inlined names, 127 * avoiding naming collision with previous inclusions. 128 */ 129 #ifdef XXH_NAMESPACE 130 #error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" 131 /* 132 * Note: Alternative: #undef all symbols (it's a pretty large list). 133 * Without #error: it compiles, but functions are actually not inlined. 134 */ 135 #endif 136 #define XXH_NAMESPACE XXH_INLINE_ 137 /* 138 * Some identifiers (enums, type names) are not symbols, but they must 139 * still be renamed to avoid redeclaration. 140 * Alternative solution: do not redeclare them. 141 * However, this requires some #ifdefs, and is a more dispersed action. 
142 * Meanwhile, renaming can be achieved in a single block. 143 */ 144 #define XXH_IPREF(Id) XXH_INLINE_##Id 145 #define XXH_OK XXH_IPREF(XXH_OK) 146 #define XXH_ERROR XXH_IPREF(XXH_ERROR) 147 #define XXH_errorcode XXH_IPREF(XXH_errorcode) 148 #define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) 149 #define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) 150 #define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) 151 #define XXH32_state_s XXH_IPREF(XXH32_state_s) 152 #define XXH32_state_t XXH_IPREF(XXH32_state_t) 153 #define XXH64_state_s XXH_IPREF(XXH64_state_s) 154 #define XXH64_state_t XXH_IPREF(XXH64_state_t) 155 #define XXH3_state_s XXH_IPREF(XXH3_state_s) 156 #define XXH3_state_t XXH_IPREF(XXH3_state_t) 157 #define XXH128_hash_t XXH_IPREF(XXH128_hash_t) 158 /* Ensure the header is parsed again, even if it was previously included */ 159 #undef XXHASH_H_5627135585666179 160 #undef XXHASH_H_STATIC_13879238742 161 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ 162 163 /* **************************************************************** 164 * Stable API 165 *****************************************************************/ 166 #ifndef XXHASH_H_5627135585666179 167 #define XXHASH_H_5627135585666179 1 168 169 /* specific declaration modes for Windows */ 170 #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) 171 #if defined(WIN32) && defined(_MSC_VER) && \ 172 (defined(XXH_IMPORT) || defined(XXH_EXPORT)) 173 #ifdef XXH_EXPORT 174 #define XXH_PUBLIC_API __declspec(dllexport) 175 #elif defined(XXH_IMPORT) 176 #define XXH_PUBLIC_API __declspec(dllimport) 177 #endif 178 #else 179 #define XXH_PUBLIC_API /* do nothing */ 180 #endif 181 #endif 182 183 /*! 184 * XXH_NAMESPACE, aka Namespace Emulation: 185 * 186 * If you want to include _and expose_ xxHash functions from within your own 187 * library, but also want to avoid symbol collisions with other libraries 188 * which may also include xxHash, you can use XXH_NAMESPACE to automatically 189 * prefix any public symbol from the xxhash library with the value of 190 * XXH_NAMESPACE (therefore, avoid empty or numeric values). 191 * 192 * Note that no change is required within the calling program as long as it 193 * includes `xxhash.h`: Regular symbol names will be automatically translated 194 * by this header.
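 *
 * For illustration, with a hypothetical prefix `MYLIB_`:
 *   #define XXH_NAMESPACE MYLIB_
 *   #include "xxhash.h"
 * The library then exports symbols such as MYLIB_XXH32() and MYLIB_XXH64(),
 * while calling code keeps using the regular names XXH32(), XXH64(), etc.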
195 */ 196 #ifdef XXH_NAMESPACE 197 #define XXH_CAT(A, B) A##B 198 #define XXH_NAME2(A, B) XXH_CAT(A, B) 199 #define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) 200 /* XXH32 */ 201 #define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) 202 #define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) 203 #define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) 204 #define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) 205 #define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) 206 #define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) 207 #define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) 208 #define XXH32_canonicalFromHash \ 209 XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) 210 #define XXH32_hashFromCanonical \ 211 XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) 212 /* XXH64 */ 213 #define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) 214 #define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) 215 #define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) 216 #define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) 217 #define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) 218 #define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) 219 #define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) 220 #define XXH64_canonicalFromHash \ 221 XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) 222 #define XXH64_hashFromCanonical \ 223 XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) 224 /* XXH3_64bits */ 225 #define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) 226 #define XXH3_64bits_withSecret \ 227 XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) 228 #define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) 229 #define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) 230 #define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) 231 #define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) 232 #define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) 233 #define XXH3_64bits_reset_withSeed \ 234 XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) 235 #define XXH3_64bits_reset_withSecret \ 236 XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) 237 #define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) 238 #define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) 239 #define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) 240 /* XXH3_128bits */ 241 #define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) 242 #define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) 243 #define XXH3_128bits_withSeed \ 244 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) 245 #define XXH3_128bits_withSecret \ 246 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) 247 #define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) 248 #define XXH3_128bits_reset_withSeed \ 249 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) 250 #define XXH3_128bits_reset_withSecret \ 251 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) 252 #define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) 253 #define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) 254 #define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) 255 #define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) 256 #define XXH128_canonicalFromHash \ 257 XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) 258 #define XXH128_hashFromCanonical \ 259 XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) 260 #endif 261 262 /* 
************************************* 263 * Version 264 ***************************************/ 265 #define XXH_VERSION_MAJOR 0 266 #define XXH_VERSION_MINOR 8 267 #define XXH_VERSION_RELEASE 0 268 #define XXH_VERSION_NUMBER \ 269 (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \ 270 XXH_VERSION_RELEASE) 271 XXH_PUBLIC_API unsigned XXH_versionNumber(void); 272 273 /* **************************** 274 * Definitions 275 ******************************/ 276 #include <stddef.h> /* size_t */ 277 typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode; 278 279 /*-********************************************************************** 280 * 32-bit hash 281 ************************************************************************/ 282 #if !defined(__VMS) && \ 283 (defined(__cplusplus) || \ 284 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) 285 #include <stdint.h> 286 typedef uint32_t XXH32_hash_t; 287 #else 288 #include <limits.h> 289 #if UINT_MAX == 0xFFFFFFFFUL 290 typedef unsigned int XXH32_hash_t; 291 #else 292 #if ULONG_MAX == 0xFFFFFFFFUL 293 typedef unsigned long XXH32_hash_t; 294 #else 295 #error "unsupported platform: need a 32-bit type" 296 #endif 297 #endif 298 #endif 299 300 /*! 301 * XXH32(): 302 * Calculate the 32-bit hash of the sequence of "length" bytes stored at memory 303 * address "input". The memory between input & input+length must be valid 304 * (allocated and read-accessible). "seed" can be used to alter the result 305 * predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher 306 * benchmark): 5.4 GB/s 307 * 308 * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, 309 * and offers true 64/128 bit hash results. It provides a superior level of 310 * dispersion, and greatly reduces the risks of collisions. 311 */ 312 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length, 313 XXH32_hash_t seed); 314 315 /******* Streaming *******/ 316 317 /* 318 * Streaming functions generate the xxHash value from an incremental input. 319 * This method is slower than single-call functions, due to state management. 320 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. 321 * 322 * An XXH state must first be allocated using `XXH*_createState()`. 323 * 324 * Start a new hash by initializing the state with a seed using `XXH*_reset()`. 325 * 326 * Then, feed the hash state by calling `XXH*_update()` as many times as 327 * necessary. 328 * 329 * `XXH*_update()` returns an error code, with 0 meaning OK, and any other 330 * value meaning there is an error. 331 * 332 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. 333 * This function returns the nn-bit hash as an int or long long. 334 * 335 * It's still possible to continue inserting input into the hash state after a 336 * digest, and generate new hash values later on by invoking `XXH*_digest()`. 337 * 338 * When done, release the state using `XXH*_freeState()`.
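 *
 * A minimal sketch of this workflow (error handling omitted; `chunk`,
 * `chunkSize` and the `more_input` condition are application-side
 * placeholders):
 *   XXH32_state_t* const state = XXH32_createState();
 *   XXH32_reset(state, 0);                          // seed = 0
 *   while (more_input) XXH32_update(state, chunk, chunkSize);
 *   XXH32_hash_t const hash = XXH32_digest(state);
 *   XXH32_freeState(state);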
339 */ 340 341 typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ 342 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void); 343 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr); 344 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dst_state, 345 const XXH32_state_t *src_state); 346 347 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, 348 XXH32_hash_t seed); 349 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr, 350 const void *input, size_t length); 351 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr); 352 353 /******* Canonical representation *******/ 354 355 /* 356 * The default return values from XXH functions are unsigned 32 and 64 bit 357 * integers. 358 * This is the simplest and fastest format for further post-processing. 359 * 360 * However, this leaves open the question of what the order is at the byte 361 * level, since little and big endian conventions will store the same number 362 * differently. 363 * 364 * The canonical representation settles this issue by mandating big-endian 365 * convention, the same convention as human-readable numbers (large digits 366 * first). 367 * 368 * When writing hash values to storage, sending them over a network, or printing 369 * them, it's highly recommended to use the canonical representation to ensure 370 * portability across a wider range of systems, present and future. 371 * 372 * The following functions allow transformation of hash values to and from 373 * canonical format. 374 */ 375 376 typedef struct { 377 378 unsigned char digest[4]; 379 380 } XXH32_canonical_t; 381 382 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, 383 XXH32_hash_t hash); 384 XXH_PUBLIC_API XXH32_hash_t 385 XXH32_hashFromCanonical(const XXH32_canonical_t *src); 386 387 #ifndef XXH_NO_LONG_LONG 388 /*-********************************************************************** 389 * 64-bit hash 390 ************************************************************************/ 391 #if !defined(__VMS) && \ 392 (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ 393 (__STDC_VERSION__ >= 199901L) /* C99 */)) 394 #include <stdint.h> 395 typedef uint64_t XXH64_hash_t; 396 #else 397 /* the following type must have a width of 64-bit */ 398 typedef unsigned long long XXH64_hash_t; 399 #endif 400 401 /*! 402 * XXH64(): 403 * Returns the 64-bit hash of the sequence of @length bytes stored at memory 404 * address @input. 405 * @seed can be used to alter the result predictably. 406 * 407 * This function usually runs faster on 64-bit systems, but slower on 32-bit 408 * systems (see benchmark). 409 * 410 * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems, 411 * and offers true 64/128 bit hash results. It provides a superior level of 412 * dispersion, and greatly reduces the risks of collisions.
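 *
 * Typical one-shot usage (a sketch; `buffer` and `size` are application
 * data):
 *   XXH64_hash_t const h = XXH64(buffer, size, 0);  // seed = 0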
413 */ 414 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length, 415 XXH64_hash_t seed); 416 417 /******* Streaming *******/ 418 typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ 419 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void); 420 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr); 421 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dst_state, 422 const XXH64_state_t *src_state); 423 424 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, 425 XXH64_hash_t seed); 426 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *statePtr, 427 const void *input, size_t length); 428 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *statePtr); 429 430 /******* Canonical representation *******/ 431 typedef struct { 432 433 unsigned char digest[sizeof(XXH64_hash_t)]; 434 435 } XXH64_canonical_t; 436 437 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, 438 XXH64_hash_t hash); 439 XXH_PUBLIC_API XXH64_hash_t 440 XXH64_hashFromCanonical(const XXH64_canonical_t *src); 441 442 /*-********************************************************************** 443 * XXH3 64-bit variant 444 ************************************************************************/ 445 446 /* ************************************************************************ 447 * XXH3 is a new hash algorithm featuring: 448 * - Improved speed for both small and large inputs 449 * - True 64-bit and 128-bit outputs 450 * - SIMD acceleration 451 * - Improved 32-bit viability 452 * 453 * Speed analysis methodology is explained here: 454 * 455 * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html 456 * 457 * In general, expect XXH3 to run about ~2x faster on large inputs and >3x 458 * faster on small ones compared to XXH64, though exact differences depend on 459 * the platform. 460 * 461 * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash 462 * on all platforms. 463 * 464 * It benefits greatly from SIMD and 64-bit arithmetic, but does not require them. 465 * 466 * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run 467 * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are 468 * explained in the implementation. 469 * 470 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, 471 * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro. 472 * 473 * XXH3 offers 2 variants, _64bits and _128bits. 474 * When only 64 bits are needed, prefer calling the _64bits variant, as it 475 * reduces the amount of mixing, resulting in faster speed on small inputs. 476 * 477 * It's also generally simpler to manipulate a scalar return type than a struct. 478 * 479 * The 128-bit version adds additional strength, but it is slightly slower. 480 * 481 * Return values of XXH3 and XXH128 are officially finalized starting 482 * with v0.8.0 and will no longer change in future versions. 483 * Avoid storing values from before that release in long-term storage. 484 * 485 * Results produced by one v0.7.x release are not comparable with those from a different v0.7.y release. 486 * However, the API is completely stable, and it can safely be used for 487 * ephemeral data (local sessions). 488 * 489 * The API supports one-shot hashing, streaming mode, and custom secrets. 490 */ 491 492 /* XXH3_64bits(): 493 * default 64-bit variant, using default secret and default seed of 0. 494 * It's the fastest variant.
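 * Example (a sketch; `buffer` and `size` are application data):
 *   XXH64_hash_t const h = XXH3_64bits(buffer, size);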
*/ 495 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len); 496 497 /* 498 * XXH3_64bits_withSeed(): 499 * This variant generates a custom secret on the fly 500 * based on the default secret altered using the `seed` value. 501 * While this operation is decently fast, note that it's not completely free. 502 * Note: seed==0 produces the same results as XXH3_64bits(). 503 */ 504 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len, 505 XXH64_hash_t seed); 506 507 /* 508 * XXH3_64bits_withSecret(): 509 * It's possible to provide any blob of bytes as a "secret" to generate the 510 * hash. This makes it more difficult for an external actor to prepare an 511 * intentional collision. The main condition is that secretSize *must* be 512 * large enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced 513 * hash values depends on the secret's entropy. Technically, the secret must 514 * look like a bunch of random bytes. Avoid "trivial" or structured data 515 * such as repeated sequences or a text document. Whenever unsure about the 516 * "randomness" of the blob of bytes, consider relabelling it as a "custom 517 * seed" instead, and employ "XXH3_generateSecret()" (see below) to generate 518 * a high entropy secret derived from the custom seed. 519 */ 520 #define XXH3_SECRET_SIZE_MIN 136 521 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len, 522 const void *secret, 523 size_t secretSize); 524 525 /******* Streaming *******/ 526 /* 527 * Streaming requires state maintenance. 528 * This operation costs memory and CPU. 529 * As a consequence, streaming is slower than one-shot hashing. 530 * For better performance, prefer one-shot functions whenever applicable. 531 */ 532 typedef struct XXH3_state_s XXH3_state_t; 533 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void); 534 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr); 535 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state, 536 const XXH3_state_t *src_state); 537 538 /* 539 * XXH3_64bits_reset(): 540 * Initialize with default parameters. 541 * digest will be equivalent to `XXH3_64bits()`. 542 */ 543 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr); 544 /* 545 * XXH3_64bits_reset_withSeed(): 546 * Generate a custom secret from `seed`, and store it into `statePtr`. 547 * digest will be equivalent to `XXH3_64bits_withSeed()`. 548 */ 549 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr, 550 XXH64_hash_t seed); 551 /* 552 * XXH3_64bits_reset_withSecret(): 553 * `secret` is referenced, and it _must outlive_ the hash streaming session. 554 * As with the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, 555 * and the quality of produced hash values depends on the secret's entropy 556 * (the secret's content should look like a bunch of random bytes). 557 * When in doubt about the randomness of a candidate `secret`, 558 * consider employing `XXH3_generateSecret()` instead (see below).
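 * A sketch of a secret-based streaming session; the caller keeps `secret`
 * alive for the whole session, as required above (`chunk`/`chunkSize` are
 * application data):
 *   XXH3_state_t* const st = XXH3_createState();
 *   XXH3_64bits_reset_withSecret(st, secret, secretSize);
 *   XXH3_64bits_update(st, chunk, chunkSize);       // repeat as needed
 *   XXH64_hash_t const h = XXH3_64bits_digest(st);
 *   XXH3_freeState(st);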
559 */ 560 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret( 561 XXH3_state_t *statePtr, const void *secret, size_t secretSize); 562 563 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *statePtr, 564 const void * input, 565 size_t length); 566 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *statePtr); 567 568 /* note: the canonical representation of XXH3 is the same as XXH64 569 * since they both produce XXH64_hash_t values */ 570 571 /*-********************************************************************** 572 * XXH3 128-bit variant 573 ************************************************************************/ 574 575 typedef struct { 576 577 XXH64_hash_t low64; 578 XXH64_hash_t high64; 579 580 } XXH128_hash_t; 581 582 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *data, size_t len); 583 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *data, size_t len, 584 XXH64_hash_t seed); 585 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *data, 586 size_t len, 587 const void *secret, 588 size_t secretSize); 589 590 /******* Streaming *******/ 591 /* 592 * Streaming requires state maintenance. 593 * This operation costs memory and CPU. 594 * As a consequence, streaming is slower than one-shot hashing. 595 * For better performance, prefer one-shot functions whenever applicable. 596 * 597 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). 598 * Use the already declared XXH3_createState() and XXH3_freeState(). 599 * 600 * All reset and streaming functions have the same meaning as their 64-bit 601 * counterparts. 602 */ 603 604 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr); 605 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr, 606 XXH64_hash_t seed); 607 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret( 608 XXH3_state_t *statePtr, const void *secret, size_t secretSize); 609 610 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *statePtr, 611 const void * input, 612 size_t length); 613 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *statePtr); 614 615 /* The following helper functions make it possible to compare XXH128_hash_t values. 616 * Since XXH128_hash_t is a structure, this capability is not offered by the 617 * language. 618 * Note: For better performance, these functions can be inlined using 619 * XXH_INLINE_ALL */ 620 621 /*! 622 * XXH128_isEqual(): 623 * Return: 1 if `h1` and `h2` are equal, 0 if they are not. 624 */ 625 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); 626 627 /*! 628 * XXH128_cmp(): 629 * 630 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
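 * For instance, an array of hashes can be sorted with (a sketch; `hashes`
 * and `count` are application data):
 *   qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);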
631 * 632 * return: >0 if *h128_1 > *h128_2 633 * =0 if *h128_1 == *h128_2 634 * <0 if *h128_1 < *h128_2 635 */ 636 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2); 637 638 /******* Canonical representation *******/ 639 typedef struct { 640 641 unsigned char digest[sizeof(XXH128_hash_t)]; 642 643 } XXH128_canonical_t; 644 645 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst, 646 XXH128_hash_t hash); 647 XXH_PUBLIC_API XXH128_hash_t 648 XXH128_hashFromCanonical(const XXH128_canonical_t *src); 649 650 #endif /* XXH_NO_LONG_LONG */ 651 652 #endif /* XXHASH_H_5627135585666179 */ 653 654 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) 655 #define XXHASH_H_STATIC_13879238742 656 /* **************************************************************************** 657 * This section contains declarations which are not guaranteed to remain stable. 658 * They may change in future versions, becoming incompatible with a different 659 * version of the library. 660 * These declarations should only be used with static linking. 661 * Never use them in association with dynamic linking! 662 ***************************************************************************** 663 */ 664 665 /* 666 * These definitions are only present to allow static allocation 667 * of XXH states, on stack or in a struct, for example. 668 * Never **ever** access their members directly. 669 */ 670 671 struct XXH32_state_s { 672 673 XXH32_hash_t total_len_32; 674 XXH32_hash_t large_len; 675 XXH32_hash_t v1; 676 XXH32_hash_t v2; 677 XXH32_hash_t v3; 678 XXH32_hash_t v4; 679 XXH32_hash_t mem32[4]; 680 XXH32_hash_t memsize; 681 XXH32_hash_t 682 reserved; /* never read nor write, might be removed in a future version */ 683 684 }; /* typedef'd to XXH32_state_t */ 685 686 #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ 687 688 struct XXH64_state_s { 689 690 XXH64_hash_t total_len; 691 XXH64_hash_t v1; 692 XXH64_hash_t v2; 693 XXH64_hash_t v3; 694 XXH64_hash_t v4; 695 XXH64_hash_t mem64[4]; 696 XXH32_hash_t memsize; 697 XXH32_hash_t reserved32; /* required for padding anyway */ 698 XXH64_hash_t reserved64; /* never read nor write, might be removed in a future 699 version */ 700 701 }; /* typedef'd to XXH64_state_t */ 702 703 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ 704 #include <stdalign.h> 705 #define XXH_ALIGN(n) alignas(n) 706 #elif defined(__GNUC__) 707 #define XXH_ALIGN(n) __attribute__((aligned(n))) 708 #elif defined(_MSC_VER) 709 #define XXH_ALIGN(n) __declspec(align(n)) 710 #else 711 #define XXH_ALIGN(n) /* disabled */ 712 #endif 713 714 /* Old GCC versions only accept the attribute after the type in structures. 
715 */ 716 #if !(defined(__STDC_VERSION__) && \ 717 (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ 718 && defined(__GNUC__) 719 #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) 720 #else 721 #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type 722 #endif 723 724 #define XXH3_INTERNALBUFFER_SIZE 256 725 #define XXH3_SECRET_DEFAULT_SIZE 192 726 struct XXH3_state_s { 727 728 XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); 729 /* used to store a custom secret generated from a seed */ 730 XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); 731 XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); 732 XXH32_hash_t bufferedSize; 733 XXH32_hash_t reserved32; 734 size_t nbStripesSoFar; 735 XXH64_hash_t totalLen; 736 size_t nbStripesPerBlock; 737 size_t secretLimit; 738 XXH64_hash_t seed; 739 XXH64_hash_t reserved64; 740 const unsigned char *extSecret; /* reference to external secret; 741 * if == NULL, use .customSecret instead */ 742 /* note: there may be some padding at the end due to alignment on 64 bytes */ 743 744 }; /* typedef'd to XXH3_state_t */ 745 746 #undef XXH_ALIGN_MEMBER 747 748 /* When the XXH3_state_t structure is merely emplaced on stack, 749 * it should be initialized with XXH3_INITSTATE() or a memset() 750 * in case its first reset uses XXH3_NNbits_reset_withSeed(). 751 * This init can be omitted if the first reset uses default or _withSecret 752 * mode. This operation isn't necessary when the state is created with 753 * XXH3_createState(). Note that this doesn't prepare the state for a 754 * streaming operation, it's still necessary to use XXH3_NNbits_reset*() 755 * afterwards. 756 */ 757 #define XXH3_INITSTATE(XXH3_state_ptr) \ 758 { (XXH3_state_ptr)->seed = 0; } 759 760 /* === Experimental API === */ 761 /* Symbols defined below must be considered tied to a specific library version. 762 */ 763 764 /* 765 * XXH3_generateSecret(): 766 * 767 * Derive a high-entropy secret from any user-defined content, named customSeed. 768 * The generated secret can be used in combination with `*_withSecret()` 769 * functions. The `_withSecret()` variants are useful to provide a higher level 770 * of protection than a 64-bit seed, as it becomes much more difficult for an 771 * external actor to guess how to impact the calculation logic. 772 * 773 * The function accepts as input a custom seed of any length and any content, 774 * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE 775 * into an already allocated buffer secretBuffer. 776 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long. 777 * 778 * The generated secret can then be used with any `*_withSecret()` variant. 779 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`, 780 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()` 781 * are part of this list. They all accept a `secret` parameter 782 * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN) 783 * _and_ feature very high entropy (consist of random-looking bytes). 784 * These conditions can be a high bar to meet, so 785 * this function can be used to generate a secret of proper quality. 786 * 787 * customSeed can be anything. It can have any size, even small ones, 788 * and its content can be anything, even a stupidly "low entropy" source such as a 789 * bunch of zeroes. The resulting `secret` will nonetheless provide all expected 790 * qualities.
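 * A sketch of the intended flow (`myKey`/`myKeySize` are any
 * application-chosen blob):
 *   unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *   XXH3_generateSecret(secret, myKey, myKeySize);
 *   XXH64_hash_t const h =
 *       XXH3_64bits_withSecret(data, len, secret, sizeof(secret));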
791 * 792 * Supplying NULL as the customSeed copies the default secret into 793 * `secretBuffer`. When customSeedSize > 0, supplying NULL as customSeed is 794 * undefined behavior. 795 */ 796 XXH_PUBLIC_API void XXH3_generateSecret(void * secretBuffer, 797 const void *customSeed, 798 size_t customSeedSize); 799 800 /* simple short-cut to pre-selected XXH3_128bits variant */ 801 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len, 802 XXH64_hash_t seed); 803 804 #endif /* XXH_NO_LONG_LONG */ 805 806 #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) 807 #define XXH_IMPLEMENTATION 808 #endif 809 810 #endif /* defined(XXH_STATIC_LINKING_ONLY) && \ 811 !defined(XXHASH_H_STATIC_13879238742) */ 812 813 /* ======================================================================== */ 814 /* ======================================================================== */ 815 /* ======================================================================== */ 816 817 /*-********************************************************************** 818 * xxHash implementation 819 *-********************************************************************** 820 * xxHash's implementation used to be hosted inside xxhash.c. 821 * 822 * However, inlining requires implementation to be visible to the compiler, 823 * hence be included alongside the header. 824 * Previously, implementation was hosted inside xxhash.c, 825 * which was then #included when inlining was activated. 826 * This construction created issues with a few build and install systems, 827 * as it required xxhash.c to be stored in /include directory. 828 * 829 * xxHash implementation is now directly integrated within xxhash.h. 830 * As a consequence, xxhash.c is no longer needed in /include. 831 * 832 * xxhash.c is still available and is still useful. 833 * In a "normal" setup, when xxhash is not inlined, 834 * xxhash.h only exposes the prototypes and public symbols, 835 * while xxhash.c can be built into an object file xxhash.o 836 * which can then be linked into the final binary. 837 ************************************************************************/ 838 839 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) || \ 840 defined(XXH_IMPLEMENTATION)) && \ 841 !defined(XXH_IMPLEM_13a8737387) 842 #define XXH_IMPLEM_13a8737387 843 844 /* ************************************* 845 * Tuning parameters 846 ***************************************/ 847 /*! 848 * XXH_FORCE_MEMORY_ACCESS: 849 * By default, access to unaligned memory is controlled by `memcpy()`, which 850 * is safe and portable. 851 * 852 * Unfortunately, on some target/compiler combinations, the generated assembly 853 * is sub-optimal. 854 * 855 * The below switch allow selection of a different access method 856 * in the search for improved performance. 857 * Method 0 (default): 858 * Use `memcpy()`. Safe and portable. Default. 859 * Method 1: 860 * `__attribute__((packed))` statement. It depends on compiler extensions 861 * and is therefore not portable. 862 * This method is safe if your compiler supports it, and *generally* as 863 * fast or faster than `memcpy`. 864 * Method 2: 865 * Direct access via cast. This method doesn't depend on the compiler but 866 * violates the C standard. 867 * It can generate buggy code on targets which do not support unaligned 868 * memory accesses. 869 * But in some circumstances, it's the only known way to get the most 870 * performance (example: GCC + ARMv6) 871 * Method 3: 872 * Byteshift. 
This can generate the best code on old compilers which don't 873 * inline small `memcpy()` calls, and it might also be faster on 874 * big-endian systems which lack a native byteswap instruction. See 875 * https://stackoverflow.com/a/32095106/646947 for details. Prefer these 876 * methods in priority order (0 > 1 > 2 > 3). 877 */ 878 #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \ 879 line for example */ 880 #if !defined(__clang__) && defined(__GNUC__) && \ 881 defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && \ 882 (__ARM_ARCH == 6) 883 #define XXH_FORCE_MEMORY_ACCESS 2 884 #elif !defined(__clang__) && \ 885 ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ 886 (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) 887 #define XXH_FORCE_MEMORY_ACCESS 1 888 #endif 889 #endif 890 891 /*! 892 * XXH_ACCEPT_NULL_INPUT_POINTER: 893 * If the input pointer is NULL, xxHash's default behavior is to dereference 894 * it, triggering a segfault. When this macro is enabled, xxHash actively 895 * checks the input for a null pointer. If it is NULL, the result for a null 896 * input pointer is the same as for a zero-length input. 897 */ 898 #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ 899 #define XXH_ACCEPT_NULL_INPUT_POINTER 0 900 #endif 901 902 /*! 903 * XXH_FORCE_ALIGN_CHECK: 904 * This is an important performance trick 905 * for architectures without decent unaligned memory access performance. 906 * It checks for input alignment, and when conditions are met, 907 * uses a "fast path" employing direct 32-bit/64-bit reads, 908 * resulting in _dramatically faster_ read speed. 909 * 910 * The check costs one initial branch per hash, which is generally negligible, 911 * but not zero. Moreover, it's not useful to generate a binary for an 912 * additional code path if memory access uses the same instruction for both 913 * aligned and unaligned addresses. 914 * 915 * In these cases, the alignment check can be removed by setting this macro to 916 * 0. Then the code will always use unaligned memory access. The align check is 917 * automatically disabled on x86, x64 & arm64, which are platforms known to 918 * offer good unaligned memory access performance. 919 * 920 * This option does not affect XXH3 (only XXH32 and XXH64). 921 */ 922 #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ 923 #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \ 924 defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */ 925 #define XXH_FORCE_ALIGN_CHECK 0 926 #else 927 #define XXH_FORCE_ALIGN_CHECK 1 928 #endif 929 #endif 930 931 /*! 932 * XXH_NO_INLINE_HINTS: 933 * 934 * By default, xxHash tries to force the compiler to inline almost all 935 * internal functions. 936 * 937 * This can usually improve performance due to reduced jumping and improved 938 * constant folding, but significantly increases the size of the binary, which 939 * might not be favorable. 940 * 941 * Additionally, sometimes the forced inlining can be detrimental to 942 * performance, depending on the architecture. 943 * 944 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the 945 * compiler full control on whether to inline or not. 946 * 947 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using 948 * -fno-inline with GCC or Clang, this will automatically be defined.
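 *
 * It can also be forced from the build command, e.g.:
 *   cc -O2 -DXXH_NO_INLINE_HINTS=1 -c xxhash.c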
949 */ 950 #ifndef XXH_NO_INLINE_HINTS 951 #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ 952 || defined(__NO_INLINE__) /* -O0, -fno-inline */ 953 #define XXH_NO_INLINE_HINTS 1 954 #else 955 #define XXH_NO_INLINE_HINTS 0 956 #endif 957 #endif 958 959 /*! 960 * XXH_REROLL: 961 * Whether to reroll XXH32_finalize, and XXH64_finalize, 962 * instead of using an unrolled jump table/if statement loop. 963 * 964 * This is automatically defined on -Os/-Oz on GCC and Clang. 965 */ 966 #ifndef XXH_REROLL 967 #if defined(__OPTIMIZE_SIZE__) 968 #define XXH_REROLL 1 969 #else 970 #define XXH_REROLL 0 971 #endif 972 #endif 973 974 /* ************************************* 975 * Includes & Memory related functions 976 ***************************************/ 977 /*! 978 * Modify the local functions below should you wish to use 979 * different memory routines for malloc() and free() 980 */ 981 #include <stdlib.h> 982 983 static void *XXH_malloc(size_t s) { 984 985 return malloc(s); 986 987 } 988 989 static void XXH_free(void *p) { 990 991 free(p); 992 993 } 994 995 /*! and for memcpy() */ 996 #include <string.h> 997 static void *XXH_memcpy(void *dest, const void *src, size_t size) { 998 999 return memcpy(dest, src, size); 1000 1001 } 1002 1003 #include <limits.h> /* ULLONG_MAX */ 1004 1005 /* ************************************* 1006 * Compiler Specific Options 1007 ***************************************/ 1008 #ifdef _MSC_VER /* Visual Studio warning fix */ 1009 #pragma warning(disable : 4127) /* disable: C4127: conditional expression \ 1010 is constant */ 1011 #endif 1012 1013 #if XXH_NO_INLINE_HINTS /* disable inlining hints */ 1014 #if defined(__GNUC__) 1015 #define XXH_FORCE_INLINE static __attribute__((unused)) 1016 #else 1017 #define XXH_FORCE_INLINE static 1018 #endif 1019 #define XXH_NO_INLINE static 1020 /* enable inlining hints */ 1021 #elif defined(_MSC_VER) /* Visual Studio */ 1022 #define XXH_FORCE_INLINE static __forceinline 1023 #define XXH_NO_INLINE static __declspec(noinline) 1024 #elif defined(__GNUC__) 1025 #define XXH_FORCE_INLINE \ 1026 static __inline__ __attribute__((always_inline, unused)) 1027 #define XXH_NO_INLINE static __attribute__((noinline)) 1028 #elif defined(__cplusplus) || \ 1029 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ 1030 #define XXH_FORCE_INLINE static inline 1031 #define XXH_NO_INLINE static 1032 #else 1033 #define XXH_FORCE_INLINE static 1034 #define XXH_NO_INLINE static 1035 #endif 1036 1037 /* ************************************* 1038 * Debug 1039 ***************************************/ 1040 /* 1041 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the 1042 * compiler's command line options. The value must be a number. 
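 *
 * For example:
 *   cc -DXXH_DEBUGLEVEL=1 -c xxhash.c  // enables XXH_ASSERT() (see below)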
1043 */ 1044 #ifndef XXH_DEBUGLEVEL 1045 #ifdef DEBUGLEVEL /* backwards compat */ 1046 #define XXH_DEBUGLEVEL DEBUGLEVEL 1047 #else 1048 #define XXH_DEBUGLEVEL 0 1049 #endif 1050 #endif 1051 1052 #if (XXH_DEBUGLEVEL >= 1) 1053 #include <assert.h> /* note: can still be disabled with NDEBUG */ 1054 #define XXH_ASSERT(c) assert(c) 1055 #else 1056 #define XXH_ASSERT(c) ((void)0) 1057 #endif 1058 1059 /* note: use after variable declarations */ 1060 #define XXH_STATIC_ASSERT(c) \ 1061 do { \ 1062 \ 1063 enum { XXH_sa = 1 / (int)(!!(c)) }; \ 1064 \ 1065 } while (0) 1066 1067 /* ************************************* 1068 * Basic Types 1069 ***************************************/ 1070 #if !defined(__VMS) && \ 1071 (defined(__cplusplus) || \ 1072 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) 1073 #include <stdint.h> 1074 typedef uint8_t xxh_u8; 1075 #else 1076 typedef unsigned char xxh_u8; 1077 #endif 1078 typedef XXH32_hash_t xxh_u32; 1079 1080 #ifdef XXH_OLD_NAMES 1081 #define BYTE xxh_u8 1082 #define U8 xxh_u8 1083 #define U32 xxh_u32 1084 #endif 1085 1086 /* *** Memory access *** */ 1087 1088 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) 1089 /* 1090 * Manual byteshift. Best for old compilers which don't inline memcpy. 1091 * We actually directly use XXH_readLE32 and XXH_readBE32. 1092 */ 1093 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) 1094 1095 /* 1096 * Force direct memory access. Only works on CPUs which support unaligned memory 1097 * access in hardware. 1098 */ 1099 static xxh_u32 XXH_read32(const void *memPtr) { 1100 1101 return *(const xxh_u32 *)memPtr; 1102 1103 } 1104 1105 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) 1106 1107 /* 1108 * __pack instructions are safer but compiler specific, hence potentially 1109 * problematic for some compilers. 1110 * 1111 * Currently only defined for GCC and ICC. 1112 */ 1113 #ifdef XXH_OLD_NAMES 1114 typedef union { 1115 1116 xxh_u32 u32; 1117 1118 } __attribute__((packed)) unalign; 1119 1120 #endif 1121 static xxh_u32 XXH_read32(const void *ptr) { 1122 1123 typedef union { 1124 1125 xxh_u32 u32; 1126 1127 } __attribute__((packed)) xxh_unalign; 1128 1129 return ((const xxh_unalign *)ptr)->u32; 1130 1131 } 1132 1133 #else 1134 1135 /* 1136 * Portable and safe solution. Generally efficient. 1137 * see: https://stackoverflow.com/a/32095106/646947 1138 */ 1139 static xxh_u32 XXH_read32(const void *memPtr) { 1140 1141 xxh_u32 val; 1142 memcpy(&val, memPtr, sizeof(val)); 1143 return val; 1144 1145 } 1146 1147 #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ 1148 1149 /* *** Endianness *** */ 1150 typedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess; 1151 1152 /*! 1153 * XXH_CPU_LITTLE_ENDIAN: 1154 * Defined to 1 if the target is little endian, or 0 if it is big endian. 1155 * It can be defined externally, for example on the compiler command line. 1156 * 1157 * If it is not defined, a runtime check (which is usually constant folded) 1158 * is used instead.
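 *
 * For example, on a known little-endian target:
 *   cc -DXXH_CPU_LITTLE_ENDIAN=1 -c xxhash.c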
1159 */ 1160 #ifndef XXH_CPU_LITTLE_ENDIAN 1161 /* 1162 * Try to detect endianness automatically, to avoid the nonstandard behavior 1163 * in `XXH_isLittleEndian()` 1164 */ 1165 #if defined(_WIN32) /* Windows is always little endian */ \ 1166 || defined(__LITTLE_ENDIAN__) || \ 1167 (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 1168 #define XXH_CPU_LITTLE_ENDIAN 1 1169 #elif defined(__BIG_ENDIAN__) || \ 1170 (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 1171 #define XXH_CPU_LITTLE_ENDIAN 0 1172 #else 1173 /* 1174 * runtime test, presumed to simplify to a constant by compiler 1175 */ 1176 static int XXH_isLittleEndian(void) { 1177 1178 /* 1179 * Portable and well-defined behavior. 1180 * Don't use static: it is detrimental to performance. 1181 */ 1182 const union { 1183 1184 xxh_u32 u; 1185 xxh_u8 c[4]; 1186 1187 } one = {1}; 1188 1189 return one.c[0]; 1190 1191 } 1192 1193 #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() 1194 #endif 1195 #endif 1196 1197 /* **************************************** 1198 * Compiler-specific Functions and Macros 1199 ******************************************/ 1200 #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) 1201 1202 #ifdef __has_builtin 1203 #define XXH_HAS_BUILTIN(x) __has_builtin(x) 1204 #else 1205 #define XXH_HAS_BUILTIN(x) 0 1206 #endif 1207 1208 #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \ 1209 XXH_HAS_BUILTIN(__builtin_rotateleft64) 1210 #define XXH_rotl32 __builtin_rotateleft32 1211 #define XXH_rotl64 __builtin_rotateleft64 1212 /* Note: although _rotl exists for minGW (GCC under windows), performance 1213 * seems poor */ 1214 #elif defined(_MSC_VER) 1215 #define XXH_rotl32(x, r) _rotl(x, r) 1216 #define XXH_rotl64(x, r) _rotl64(x, r) 1217 #else 1218 #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r)))) 1219 #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) 1220 #endif 1221 1222 #if defined(_MSC_VER) /* Visual Studio */ 1223 #define XXH_swap32 _byteswap_ulong 1224 #elif XXH_GCC_VERSION >= 403 1225 #define XXH_swap32 __builtin_bswap32 1226 #else 1227 static xxh_u32 XXH_swap32(xxh_u32 x) { 1228 1229 return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | 1230 ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); 1231 1232 } 1233 1234 #endif 1235 1236 /* *************************** 1237 * Memory reads 1238 *****************************/ 1239 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; 1240 1241 /* 1242 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. 1243 * 1244 * This is ideal for older compilers which don't inline memcpy. 1245 */ 1246 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) 1247 1248 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *memPtr) { 1249 1250 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr; 1251 return bytePtr[0] | ((xxh_u32)bytePtr[1] << 8) | ((xxh_u32)bytePtr[2] << 16) | 1252 ((xxh_u32)bytePtr[3] << 24); 1253 1254 } 1255 1256 XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void *memPtr) { 1257 1258 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr; 1259 return bytePtr[3] | ((xxh_u32)bytePtr[2] << 8) | ((xxh_u32)bytePtr[1] << 16) | 1260 ((xxh_u32)bytePtr[0] << 24); 1261 1262 } 1263 1264 #else 1265 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *ptr) { 1266 1267 return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); 1268 1269 } 1270 1271 static xxh_u32 XXH_readBE32(const void *ptr) { 1272 1273 return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); 1274 1275 } 1276 1277 #endif 1278 1279 XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void * ptr, 1280 XXH_alignment align) { 1281 1282 if (align == XXH_unaligned) { 1283 1284 return XXH_readLE32(ptr); 1285 1286 } else { 1287 1288 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32 *)ptr 1289 : XXH_swap32(*(const xxh_u32 *)ptr); 1290 1291 } 1292 1293 } 1294 1295 /* ************************************* 1296 * Misc 1297 ***************************************/ 1298 XXH_PUBLIC_API unsigned XXH_versionNumber(void) { 1299 1300 return XXH_VERSION_NUMBER; 1301 1302 } 1303 1304 /* ******************************************************************* 1305 * 32-bit hash functions 1306 *********************************************************************/ 1307 static const xxh_u32 XXH_PRIME32_1 = 1308 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ 1309 static const xxh_u32 XXH_PRIME32_2 = 1310 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ 1311 static const xxh_u32 XXH_PRIME32_3 = 1312 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ 1313 static const xxh_u32 XXH_PRIME32_4 = 1314 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ 1315 static const xxh_u32 XXH_PRIME32_5 = 1316 0x165667B1U; /* 0b00010110010101100110011110110001 */ 1317 1318 #ifdef XXH_OLD_NAMES 1319 #define PRIME32_1 XXH_PRIME32_1 1320 #define PRIME32_2 XXH_PRIME32_2 1321 #define PRIME32_3 XXH_PRIME32_3 1322 #define PRIME32_4 XXH_PRIME32_4 1323 #define PRIME32_5 XXH_PRIME32_5 1324 #endif 1325 1326 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) { 1327 1328 acc += input * XXH_PRIME32_2; 1329 acc = XXH_rotl32(acc, 13); 1330 acc *= XXH_PRIME32_1; 1331 #if defined(__GNUC__) && defined(__SSE4_1__) && \ 1332 !defined(XXH_ENABLE_AUTOVECTORIZE) 1333 /* 1334 * UGLY HACK: 1335 * This inline assembly hack forces acc into a normal register. This is the 1336 * only thing that prevents GCC and Clang from autovectorizing the XXH32 1337 * loop (pragmas and attributes don't work for some reason) without globally 1338 * disabling SSE4.1. 1339 * 1340 * The reason we want to avoid vectorization is because despite working on 1341 * 4 integers at a time, there are multiple factors slowing XXH32 down on 1342 * SSE4: 1343 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on 1344 * newer chips!) making it slightly slower to multiply four integers at 1345 * once compared to four integers independently. Even when pmulld was 1346 * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE 1347 * just to multiply unless doing a long operation. 1348 * 1349 * - Four instructions are required to rotate, 1350 * movdqa tmp, v // not required with VEX encoding 1351 * pslld tmp, 13 // tmp <<= 13 1352 * psrld v, 19 // x >>= 19 1353 * por v, tmp // x |= tmp 1354 * compared to one for scalar: 1355 * roll v, 13 // reliably fast across the board 1356 * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason 1357 * 1358 * - Instruction level parallelism is actually more beneficial here because 1359 * the SIMD actually serializes this operation: While v1 is rotating, v2 1360 * can load data, while v3 can multiply. SSE forces them to operate 1361 * together. 1362 * 1363 * How this hack works: 1364 * __asm__("" // Declare an assembly block but don't declare any 1365 * instructions : // However, as an Input/Output Operand, 1366 * "+r" // constrain a read/write operand (+) as a general purpose 1367 * register (r).
(acc) // and set acc as the operand 1368 * ); 1369 * 1370 * Because of the 'r', the compiler has promised that acc will be in a 1371 * general purpose register and the '+' says that it will be 'read/write', 1372 * so it has to assume it has changed. It is like volatile without all the 1373 * loads and stores. 1374 * 1375 * Since the argument has to be in a normal register (not an SSE register), 1376 * each time XXH32_round is called, it is impossible to vectorize. 1377 */ 1378 __asm__("" : "+r"(acc)); 1379 #endif 1380 return acc; 1381 1382 } 1383 1384 /* mix all bits */ 1385 static xxh_u32 XXH32_avalanche(xxh_u32 h32) { 1386 1387 h32 ^= h32 >> 15; 1388 h32 *= XXH_PRIME32_2; 1389 h32 ^= h32 >> 13; 1390 h32 *= XXH_PRIME32_3; 1391 h32 ^= h32 >> 16; 1392 return (h32); 1393 1394 } 1395 1396 #define XXH_get32bits(p) XXH_readLE32_align(p, align) 1397 1398 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len, 1399 XXH_alignment align) { 1400 1401 /* dummy comment */ 1402 1403 #define XXH_PROCESS1 \ 1404 do { \ 1405 \ 1406 h32 += (*ptr++) * XXH_PRIME32_5; \ 1407 h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ 1408 \ 1409 } while (0) 1410 1411 #define XXH_PROCESS4 \ 1412 do { \ 1413 \ 1414 h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ 1415 ptr += 4; \ 1416 h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ 1417 \ 1418 } while (0) 1419 1420 /* Compact rerolled version */ 1421 if (XXH_REROLL) { 1422 1423 len &= 15; 1424 while (len >= 4) { 1425 1426 XXH_PROCESS4; 1427 len -= 4; 1428 1429 } 1430 1431 while (len > 0) { 1432 1433 XXH_PROCESS1; 1434 --len; 1435 1436 } 1437 1438 return XXH32_avalanche(h32); 1439 1440 } else { 1441 1442 switch (len & 15) /* or switch(bEnd - p) */ { 1443 1444 case 12: 1445 XXH_PROCESS4; 1446 /* fallthrough */ 1447 case 8: 1448 XXH_PROCESS4; 1449 /* fallthrough */ 1450 case 4: 1451 XXH_PROCESS4; 1452 return XXH32_avalanche(h32); 1453 1454 case 13: 1455 XXH_PROCESS4; 1456 /* fallthrough */ 1457 case 9: 1458 XXH_PROCESS4; 1459 /* fallthrough */ 1460 case 5: 1461 XXH_PROCESS4; 1462 XXH_PROCESS1; 1463 return XXH32_avalanche(h32); 1464 1465 case 14: 1466 XXH_PROCESS4; 1467 /* fallthrough */ 1468 case 10: 1469 XXH_PROCESS4; 1470 /* fallthrough */ 1471 case 6: 1472 XXH_PROCESS4; 1473 XXH_PROCESS1; 1474 XXH_PROCESS1; 1475 return XXH32_avalanche(h32); 1476 1477 case 15: 1478 XXH_PROCESS4; 1479 /* fallthrough */ 1480 case 11: 1481 XXH_PROCESS4; 1482 /* fallthrough */ 1483 case 7: 1484 XXH_PROCESS4; 1485 /* fallthrough */ 1486 case 3: 1487 XXH_PROCESS1; 1488 /* fallthrough */ 1489 case 2: 1490 XXH_PROCESS1; 1491 /* fallthrough */ 1492 case 1: 1493 XXH_PROCESS1; 1494 /* fallthrough */ 1495 case 0: 1496 return XXH32_avalanche(h32); 1497 1498 } 1499 1500 XXH_ASSERT(0); 1501 return h32; /* reaching this point is deemed impossible */ 1502 1503 } 1504 1505 } 1506 1507 #ifdef XXH_OLD_NAMES 1508 #define PROCESS1 XXH_PROCESS1 1509 #define PROCESS4 XXH_PROCESS4 1510 #else 1511 #undef XXH_PROCESS1 1512 #undef XXH_PROCESS4 1513 #endif 1514 1515 XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len, 1516 xxh_u32 seed, XXH_alignment align) { 1517 1518 const xxh_u8 *bEnd = input + len; 1519 xxh_u32 h32; 1520 1521 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ 1522 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) 1523 if (input == NULL) { 1524 1525 len = 0; 1526 bEnd = input = (const xxh_u8 *)(size_t)16; 1527 1528 } 1529 1530 #endif 1531 1532 if (len >= 16) { 1533 1534 const xxh_u8 *const limit = bEnd - 15; 1535 xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; 1536 xxh_u32 v2 =
seed + XXH_PRIME32_2; 1537 xxh_u32 v3 = seed + 0; 1538 xxh_u32 v4 = seed - XXH_PRIME32_1; 1539 1540 do { 1541 1542 v1 = XXH32_round(v1, XXH_get32bits(input)); 1543 input += 4; 1544 v2 = XXH32_round(v2, XXH_get32bits(input)); 1545 input += 4; 1546 v3 = XXH32_round(v3, XXH_get32bits(input)); 1547 input += 4; 1548 v4 = XXH32_round(v4, XXH_get32bits(input)); 1549 input += 4; 1550 1551 } while (input < limit); 1552 1553 h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + 1554 XXH_rotl32(v4, 18); 1555 1556 } else { 1557 1558 h32 = seed + XXH_PRIME32_5; 1559 1560 } 1561 1562 h32 += (xxh_u32)len; 1563 1564 return XXH32_finalize(h32, input, len & 15, align); 1565 1566 } 1567 1568 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len, 1569 XXH32_hash_t seed) { 1570 1571 #if 0 1572 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ 1573 XXH32_state_t state; 1574 XXH32_reset(&state, seed); 1575 XXH32_update(&state, (const xxh_u8*)input, len); 1576 return XXH32_digest(&state); 1577 1578 #else 1579 1580 if (XXH_FORCE_ALIGN_CHECK) { 1581 1582 if ((((size_t)input) & 3) == 1583 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ 1584 return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned); 1585 1586 } 1587 1588 } 1589 1590 return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned); 1591 #endif 1592 1593 } 1594 1595 /******* Hash streaming *******/ 1596 1597 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) { 1598 1599 return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t)); 1600 1601 } 1602 1603 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) { 1604 1605 XXH_free(statePtr); 1606 return XXH_OK; 1607 1608 } 1609 1610 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t * dstState, 1611 const XXH32_state_t *srcState) { 1612 1613 memcpy(dstState, srcState, sizeof(*dstState)); 1614 1615 } 1616 1617 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr, 1618 XXH32_hash_t seed) { 1619 1620 XXH32_state_t state; /* using a local state to memcpy() in order to avoid 1621 strict-aliasing warnings */ 1622 memset(&state, 0, sizeof(state)); 1623 state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; 1624 state.v2 = seed + XXH_PRIME32_2; 1625 state.v3 = seed + 0; 1626 state.v4 = seed - XXH_PRIME32_1; 1627 /* do not write into reserved, planned to be removed in a future version */ 1628 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); 1629 return XXH_OK; 1630 1631 } 1632 1633 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state, 1634 const void *input, size_t len) { 1635 1636 if (input == NULL) 1637 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ 1638 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) 1639 return XXH_OK; 1640 #else 1641 return XXH_ERROR; 1642 #endif 1643 1644 { 1645 1646 const xxh_u8 * p = (const xxh_u8 *)input; 1647 const xxh_u8 *const bEnd = p + len; 1648 1649 state->total_len_32 += (XXH32_hash_t)len; 1650 state->large_len |= 1651 (XXH32_hash_t)((len >= 16) | (state->total_len_32 >= 16)); 1652 1653 if (state->memsize + len < 16) { /* fill in tmp buffer */ 1654 XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input, len); 1655 state->memsize += (XXH32_hash_t)len; 1656 return XXH_OK; 1657 1658 } 1659 1660 if (state->memsize) { /* some data left from previous update */ 1661 XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input, 1662 16 - state->memsize); 1663 { 1664 1665 const xxh_u32 *p32 = state->mem32; 1666 state->v1 = XXH32_round(state->v1, 
XXH_readLE32(p32)); 1667 p32++; 1668 state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); 1669 p32++; 1670 state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); 1671 p32++; 1672 state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); 1673 1674 } 1675 1676 p += 16 - state->memsize; 1677 state->memsize = 0; 1678 1679 } 1680 1681 if (p <= bEnd - 16) { 1682 1683 const xxh_u8 *const limit = bEnd - 16; 1684 xxh_u32 v1 = state->v1; 1685 xxh_u32 v2 = state->v2; 1686 xxh_u32 v3 = state->v3; 1687 xxh_u32 v4 = state->v4; 1688 1689 do { 1690 1691 v1 = XXH32_round(v1, XXH_readLE32(p)); 1692 p += 4; 1693 v2 = XXH32_round(v2, XXH_readLE32(p)); 1694 p += 4; 1695 v3 = XXH32_round(v3, XXH_readLE32(p)); 1696 p += 4; 1697 v4 = XXH32_round(v4, XXH_readLE32(p)); 1698 p += 4; 1699 1700 } while (p <= limit); 1701 1702 state->v1 = v1; 1703 state->v2 = v2; 1704 state->v3 = v3; 1705 state->v4 = v4; 1706 1707 } 1708 1709 if (p < bEnd) { 1710 1711 XXH_memcpy(state->mem32, p, (size_t)(bEnd - p)); 1712 state->memsize = (unsigned)(bEnd - p); 1713 1714 } 1715 1716 } 1717 1718 return XXH_OK; 1719 1720 } 1721 1722 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) { 1723 1724 xxh_u32 h32; 1725 1726 if (state->large_len) { 1727 1728 h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + 1729 XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); 1730 1731 } else { 1732 1733 h32 = state->v3 /* == seed */ + XXH_PRIME32_5; 1734 1735 } 1736 1737 h32 += state->total_len_32; 1738 1739 return XXH32_finalize(h32, (const xxh_u8 *)state->mem32, state->memsize, 1740 XXH_aligned); 1741 1742 } 1743 1744 /******* Canonical representation *******/ 1745 1746 /* 1747 * The default return values from XXH functions are unsigned 32 and 64 bit 1748 * integers. 1749 * 1750 * The canonical representation uses big endian convention, the same convention 1751 * as human-readable numbers (large digits first). 1752 * 1753 * This way, hash values can be written into a file or buffer, remaining 1754 * comparable across different systems. 1755 * 1756 * The following functions allow transformation of hash values to and from their 1757 * canonical format. 1758 */ 1759 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst, 1760 XXH32_hash_t hash) { 1761 1762 XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); 1763 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); 1764 memcpy(dst, &hash, sizeof(*dst)); 1765 1766 } 1767 1768 XXH_PUBLIC_API XXH32_hash_t 1769 XXH32_hashFromCanonical(const XXH32_canonical_t *src) { 1770 1771 return XXH_readBE32(src); 1772 1773 } 1774 1775 #ifndef XXH_NO_LONG_LONG 1776 1777 /* ******************************************************************* 1778 * 64-bit hash functions 1779 *********************************************************************/ 1780 1781 /******* Memory access *******/ 1782 1783 typedef XXH64_hash_t xxh_u64; 1784 1785 #ifdef XXH_OLD_NAMES 1786 #define U64 xxh_u64 1787 #endif 1788 1789 /*! 1790 * XXH_REROLL_XXH64: 1791 * Whether to reroll the XXH64_finalize() loop. 1792 * 1793 * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a 1794 * performance gain on 64-bit hosts, as only one jump is required. 1795 * 1796 * However, on 32-bit hosts, because arithmetic needs to be done with two 1797 * 32-bit registers, and 64-bit arithmetic needs to be simulated, it isn't 1798 * beneficial to unroll. The code becomes ridiculously large (the largest 1799 * function in the binary on i386!), and rerolling it saves anywhere from 1800 * 3kB to 20kB. 
It is also slightly faster because it fits into cache better 1801 * and is more likely to be inlined by the compiler. 1802 * 1803 * If XXH_REROLL is defined, this is ignored and the loop is always 1804 * rerolled. 1805 */ 1806 #ifndef XXH_REROLL_XXH64 1807 #if (defined(__ILP32__) || \ 1808 defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ 1809 || !(defined(__x86_64__) || defined(_M_X64) || \ 1810 defined(_M_AMD64) /* x86-64 */ \ 1811 || defined(_M_ARM64) || defined(__aarch64__) || \ 1812 defined(__arm64__) /* aarch64 */ \ 1813 || defined(__PPC64__) || defined(__PPC64LE__) || \ 1814 defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ 1815 || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ 1816 || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ 1817 #define XXH_REROLL_XXH64 1 1818 #else 1819 #define XXH_REROLL_XXH64 0 1820 #endif 1821 #endif /* !defined(XXH_REROLL_XXH64) */ 1822 1823 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) 1824 /* 1825 * Manual byteshift. Best for old compilers which don't inline memcpy. 1826 * We actually directly use XXH_readLE64 and XXH_readBE64. 1827 */ 1828 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) 1829 1830 /* Force direct memory access. Only works on CPUs which support unaligned 1831 * memory access in hardware */ 1832 static xxh_u64 XXH_read64(const void *memPtr) { 1833 1834 return *(const xxh_u64 *)memPtr; 1835 1836 } 1837 1838 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) 1839 1840 /* 1841 * __pack instructions are safer, but compiler-specific, hence potentially 1842 * problematic for some compilers. 1843 * 1844 * Currently only defined for GCC and ICC. 1845 */ 1846 #ifdef XXH_OLD_NAMES 1847 typedef union { 1848 1849 xxh_u32 u32; 1850 xxh_u64 u64; 1851 1852 } __attribute__((packed)) unalign64; 1853 1854 #endif 1855 static xxh_u64 XXH_read64(const void *ptr) { 1856 1857 typedef union { 1858 1859 xxh_u32 u32; 1860 xxh_u64 u64; 1861 1862 } __attribute__((packed)) xxh_unalign64; 1863 1864 return ((const xxh_unalign64 *)ptr)->u64; 1865 1866 } 1867 1868 #else 1869 1870 /* 1871 * Portable and safe solution. Generally efficient. 1872 * see: https://stackoverflow.com/a/32095106/646947 1873 */ 1874 static xxh_u64 XXH_read64(const void *memPtr) { 1875 1876 xxh_u64 val; 1877 memcpy(&val, memPtr, sizeof(val)); 1878 return val; 1879 1880 } 1881 1882 #endif /* XXH_FORCE_MEMORY_ACCESS */ 1883 1884 #if defined(_MSC_VER) /* Visual Studio */ 1885 #define XXH_swap64 _byteswap_uint64 1886 #elif XXH_GCC_VERSION >= 403 1887 #define XXH_swap64 __builtin_bswap64 1888 #else 1889 static xxh_u64 XXH_swap64(xxh_u64 x) { 1890 1891 return ((x << 56) & 0xff00000000000000ULL) | 1892 ((x << 40) & 0x00ff000000000000ULL) | 1893 ((x << 24) & 0x0000ff0000000000ULL) | 1894 ((x << 8) & 0x000000ff00000000ULL) | 1895 ((x >> 8) & 0x00000000ff000000ULL) | 1896 ((x >> 24) & 0x0000000000ff0000ULL) | 1897 ((x >> 40) & 0x000000000000ff00ULL) | 1898 ((x >> 56) & 0x00000000000000ffULL); 1899 1900 } 1901 1902 #endif 1903 1904 /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
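 * It assembles the value one byte at a time, e.g. for little endian:
 *
 *   bytePtr[0] | (bytePtr[1] << 8) | ... | ((xxh_u64)bytePtr[7] << 56)
 *
 * exactly as XXH_readLE64() below is written, so it never issues a wide
 * (and potentially unaligned) memory access.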
*/ 1905 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3)) 1906 1907 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *memPtr) { 1908 1909 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr; 1910 return bytePtr[0] | ((xxh_u64)bytePtr[1] << 8) | ((xxh_u64)bytePtr[2] << 16) | 1911 ((xxh_u64)bytePtr[3] << 24) | ((xxh_u64)bytePtr[4] << 32) | 1912 ((xxh_u64)bytePtr[5] << 40) | ((xxh_u64)bytePtr[6] << 48) | 1913 ((xxh_u64)bytePtr[7] << 56); 1914 1915 } 1916 1917 XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void *memPtr) { 1918 1919 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr; 1920 return bytePtr[7] | ((xxh_u64)bytePtr[6] << 8) | ((xxh_u64)bytePtr[5] << 16) | 1921 ((xxh_u64)bytePtr[4] << 24) | ((xxh_u64)bytePtr[3] << 32) | 1922 ((xxh_u64)bytePtr[2] << 40) | ((xxh_u64)bytePtr[1] << 48) | 1923 ((xxh_u64)bytePtr[0] << 56); 1924 1925 } 1926 1927 #else 1928 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *ptr) { 1929 1930 return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); 1931 1932 } 1933 1934 static xxh_u64 XXH_readBE64(const void *ptr) { 1935 1936 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); 1937 1938 } 1939 1940 #endif 1941 1942 XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void * ptr, 1943 XXH_alignment align) { 1944 1945 if (align == XXH_unaligned) 1946 return XXH_readLE64(ptr); 1947 else 1948 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64 *)ptr 1949 : XXH_swap64(*(const xxh_u64 *)ptr); 1950 1951 } 1952 1953 /******* xxh64 *******/ 1954 1955 static const xxh_u64 XXH_PRIME64_1 = 1956 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 1957 */ 1958 static const xxh_u64 XXH_PRIME64_2 = 1959 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 1960 */ 1961 static const xxh_u64 XXH_PRIME64_3 = 1962 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 1963 */ 1964 static const xxh_u64 XXH_PRIME64_4 = 1965 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 1966 */ 1967 static const xxh_u64 XXH_PRIME64_5 = 1968 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 1969 */ 1970 1971 #ifdef XXH_OLD_NAMES 1972 #define PRIME64_1 XXH_PRIME64_1 1973 #define PRIME64_2 XXH_PRIME64_2 1974 #define PRIME64_3 XXH_PRIME64_3 1975 #define PRIME64_4 XXH_PRIME64_4 1976 #define PRIME64_5 XXH_PRIME64_5 1977 #endif 1978 1979 static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) { 1980 1981 acc += input * XXH_PRIME64_2; 1982 acc = XXH_rotl64(acc, 31); 1983 acc *= XXH_PRIME64_1; 1984 return acc; 1985 1986 } 1987 1988 static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) { 1989 1990 val = XXH64_round(0, val); 1991 acc ^= val; 1992 acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; 1993 return acc; 1994 1995 } 1996 1997 static xxh_u64 XXH64_avalanche(xxh_u64 h64) { 1998 1999 h64 ^= h64 >> 33; 2000 h64 *= XXH_PRIME64_2; 2001 h64 ^= h64 >> 29; 2002 h64 *= XXH_PRIME64_3; 2003 h64 ^= h64 >> 32; 2004 return h64; 2005 2006 } 2007 2008 #define XXH_get64bits(p) XXH_readLE64_align(p, align) 2009 2010 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len, 2011 XXH_alignment align) { 2012 2013 2014 2015 #define XXH_PROCESS1_64 \ 2016 do { \ 2017 \ 2018 h64 ^= (*ptr++) * XXH_PRIME64_5; \ 2019 h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \ 2020 \ 2021 } while (0) 2022 2023 #define XXH_PROCESS4_64 \ 2024 do { \ 2025 \ 2026 h64
^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \ 2027 ptr += 4; \ 2028 h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \ 2029 \ 2030 } while (0) 2031 2032 #define XXH_PROCESS8_64 \ 2033 do { \ 2034 \ 2035 xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ 2036 ptr += 8; \ 2037 h64 ^= k1; \ 2038 h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4; \ 2039 \ 2040 } while (0) 2041 2042 /* Rerolled version for 32-bit targets is faster and much smaller. */ 2043 if (XXH_REROLL || XXH_REROLL_XXH64) { 2044 2045 len &= 31; 2046 while (len >= 8) { 2047 2048 XXH_PROCESS8_64; 2049 len -= 8; 2050 2051 } 2052 2053 if (len >= 4) { 2054 2055 XXH_PROCESS4_64; 2056 len -= 4; 2057 2058 } 2059 2060 while (len > 0) { 2061 2062 XXH_PROCESS1_64; 2063 --len; 2064 2065 } 2066 2067 return XXH64_avalanche(h64); 2068 2069 } else { 2070 2071 switch (len & 31) { 2072 2073 case 24: 2074 XXH_PROCESS8_64; 2075 /* fallthrough */ 2076 case 16: 2077 XXH_PROCESS8_64; 2078 /* fallthrough */ 2079 case 8: 2080 XXH_PROCESS8_64; 2081 return XXH64_avalanche(h64); 2082 2083 case 28: 2084 XXH_PROCESS8_64; 2085 /* fallthrough */ 2086 case 20: 2087 XXH_PROCESS8_64; 2088 /* fallthrough */ 2089 case 12: 2090 XXH_PROCESS8_64; 2091 /* fallthrough */ 2092 case 4: 2093 XXH_PROCESS4_64; 2094 return XXH64_avalanche(h64); 2095 2096 case 25: 2097 XXH_PROCESS8_64; 2098 /* fallthrough */ 2099 case 17: 2100 XXH_PROCESS8_64; 2101 /* fallthrough */ 2102 case 9: 2103 XXH_PROCESS8_64; 2104 XXH_PROCESS1_64; 2105 return XXH64_avalanche(h64); 2106 2107 case 29: 2108 XXH_PROCESS8_64; 2109 /* fallthrough */ 2110 case 21: 2111 XXH_PROCESS8_64; 2112 /* fallthrough */ 2113 case 13: 2114 XXH_PROCESS8_64; 2115 /* fallthrough */ 2116 case 5: 2117 XXH_PROCESS4_64; 2118 XXH_PROCESS1_64; 2119 return XXH64_avalanche(h64); 2120 2121 case 26: 2122 XXH_PROCESS8_64; 2123 /* fallthrough */ 2124 case 18: 2125 XXH_PROCESS8_64; 2126 /* fallthrough */ 2127 case 10: 2128 XXH_PROCESS8_64; 2129 XXH_PROCESS1_64; 2130 XXH_PROCESS1_64; 2131 return XXH64_avalanche(h64); 2132 2133 case 30: 2134 XXH_PROCESS8_64; 2135 /* fallthrough */ 2136 case 22: 2137 XXH_PROCESS8_64; 2138 /* fallthrough */ 2139 case 14: 2140 XXH_PROCESS8_64; 2141 /* fallthrough */ 2142 case 6: 2143 XXH_PROCESS4_64; 2144 XXH_PROCESS1_64; 2145 XXH_PROCESS1_64; 2146 return XXH64_avalanche(h64); 2147 2148 case 27: 2149 XXH_PROCESS8_64; 2150 /* fallthrough */ 2151 case 19: 2152 XXH_PROCESS8_64; 2153 /* fallthrough */ 2154 case 11: 2155 XXH_PROCESS8_64; 2156 XXH_PROCESS1_64; 2157 XXH_PROCESS1_64; 2158 XXH_PROCESS1_64; 2159 return XXH64_avalanche(h64); 2160 2161 case 31: 2162 XXH_PROCESS8_64; 2163 /* fallthrough */ 2164 case 23: 2165 XXH_PROCESS8_64; 2166 /* fallthrough */ 2167 case 15: 2168 XXH_PROCESS8_64; 2169 /* fallthrough */ 2170 case 7: 2171 XXH_PROCESS4_64; 2172 /* fallthrough */ 2173 case 3: 2174 XXH_PROCESS1_64; 2175 /* fallthrough */ 2176 case 2: 2177 XXH_PROCESS1_64; 2178 /* fallthrough */ 2179 case 1: 2180 XXH_PROCESS1_64; 2181 /* fallthrough */ 2182 case 0: 2183 return XXH64_avalanche(h64); 2184 2185 } 2186 2187 } 2188 2189 /* impossible to reach */ 2190 XXH_ASSERT(0); 2191 return 0; /* unreachable, but some compilers complain without it */ 2192 2193 } 2194 2195 #ifdef XXH_OLD_NAMES 2196 #define PROCESS1_64 XXH_PROCESS1_64 2197 #define PROCESS4_64 XXH_PROCESS4_64 2198 #define PROCESS8_64 XXH_PROCESS8_64 2199 #else 2200 #undef XXH_PROCESS1_64 2201 #undef XXH_PROCESS4_64 2202 #undef XXH_PROCESS8_64 2203 #endif 2204 2205 XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, 
size_t len, 2206 xxh_u64 seed, XXH_alignment align) { 2207 2208 const xxh_u8 *bEnd = input + len; 2209 xxh_u64 h64; 2210 2211 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ 2212 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) 2213 if (input == NULL) { 2214 2215 len = 0; 2216 bEnd = input = (const xxh_u8 *)(size_t)32; 2217 2218 } 2219 2220 #endif 2221 2222 if (len >= 32) { 2223 2224 const xxh_u8 *const limit = bEnd - 32; 2225 xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; 2226 xxh_u64 v2 = seed + XXH_PRIME64_2; 2227 xxh_u64 v3 = seed + 0; 2228 xxh_u64 v4 = seed - XXH_PRIME64_1; 2229 2230 do { 2231 2232 v1 = XXH64_round(v1, XXH_get64bits(input)); 2233 input += 8; 2234 v2 = XXH64_round(v2, XXH_get64bits(input)); 2235 input += 8; 2236 v3 = XXH64_round(v3, XXH_get64bits(input)); 2237 input += 8; 2238 v4 = XXH64_round(v4, XXH_get64bits(input)); 2239 input += 8; 2240 2241 } while (input <= limit); 2242 2243 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + 2244 XXH_rotl64(v4, 18); 2245 h64 = XXH64_mergeRound(h64, v1); 2246 h64 = XXH64_mergeRound(h64, v2); 2247 h64 = XXH64_mergeRound(h64, v3); 2248 h64 = XXH64_mergeRound(h64, v4); 2249 2250 } else { 2251 2252 h64 = seed + XXH_PRIME64_5; 2253 2254 } 2255 2256 h64 += (xxh_u64)len; 2257 2258 return XXH64_finalize(h64, input, len, align); 2259 2260 } 2261 2262 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len, 2263 XXH64_hash_t seed) { 2264 2265 #if 0 2266 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ 2267 XXH64_state_t state; 2268 XXH64_reset(&state, seed); 2269 XXH64_update(&state, (const xxh_u8*)input, len); 2270 return XXH64_digest(&state); 2271 2272 #else 2273 2274 if (XXH_FORCE_ALIGN_CHECK) { 2275 2276 if ((((size_t)input) & 7) == 2277 0) { /* Input is aligned, let's leverage the speed advantage */ 2278 return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned); 2279 2280 } 2281 2282 } 2283 2284 return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned); 2285 2286 #endif 2287 2288 } 2289 2290 /******* Hash Streaming *******/ 2291 2292 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) { 2293 2294 return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t)); 2295 2296 } 2297 2298 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) { 2299 2300 XXH_free(statePtr); 2301 return XXH_OK; 2302 2303 } 2304 2305 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t * dstState, 2306 const XXH64_state_t *srcState) { 2307 2308 memcpy(dstState, srcState, sizeof(*dstState)); 2309 2310 } 2311 2312 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr, 2313 XXH64_hash_t seed) { 2314 2315 XXH64_state_t state; /* use a local state to memcpy() in order to avoid 2316 strict-aliasing warnings */ 2317 memset(&state, 0, sizeof(state)); 2318 state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; 2319 state.v2 = seed + XXH_PRIME64_2; 2320 state.v3 = seed + 0; 2321 state.v4 = seed - XXH_PRIME64_1; 2322 /* do not write into reserved64, might be removed in a future version */ 2323 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); 2324 return XXH_OK; 2325 2326 } 2327 2328 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state, 2329 const void *input, size_t len) { 2330 2331 if (input == NULL) 2332 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ 2333 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) 2334 return XXH_OK; 2335 #else 2336 return XXH_ERROR; 2337 #endif 2338 2339 { 2340 2341 const xxh_u8 * p = (const xxh_u8 *)input; 2342 const xxh_u8 
*const bEnd = p + len; 2343 2344 state->total_len += len; 2345 2346 if (state->memsize + len < 32) { /* fill in tmp buffer */ 2347 XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input, len); 2348 state->memsize += (xxh_u32)len; 2349 return XXH_OK; 2350 2351 } 2352 2353 if (state->memsize) { /* tmp buffer is full */ 2354 XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input, 2355 32 - state->memsize); 2356 state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0)); 2357 state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1)); 2358 state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2)); 2359 state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3)); 2360 p += 32 - state->memsize; 2361 state->memsize = 0; 2362 2363 } 2364 2365 if (p + 32 <= bEnd) { 2366 2367 const xxh_u8 *const limit = bEnd - 32; 2368 xxh_u64 v1 = state->v1; 2369 xxh_u64 v2 = state->v2; 2370 xxh_u64 v3 = state->v3; 2371 xxh_u64 v4 = state->v4; 2372 2373 do { 2374 2375 v1 = XXH64_round(v1, XXH_readLE64(p)); 2376 p += 8; 2377 v2 = XXH64_round(v2, XXH_readLE64(p)); 2378 p += 8; 2379 v3 = XXH64_round(v3, XXH_readLE64(p)); 2380 p += 8; 2381 v4 = XXH64_round(v4, XXH_readLE64(p)); 2382 p += 8; 2383 2384 } while (p <= limit); 2385 2386 state->v1 = v1; 2387 state->v2 = v2; 2388 state->v3 = v3; 2389 state->v4 = v4; 2390 2391 } 2392 2393 if (p < bEnd) { 2394 2395 XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); 2396 state->memsize = (unsigned)(bEnd - p); 2397 2398 } 2399 2400 } 2401 2402 return XXH_OK; 2403 2404 } 2405 2406 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) { 2407 2408 xxh_u64 h64; 2409 2410 if (state->total_len >= 32) { 2411 2412 xxh_u64 const v1 = state->v1; 2413 xxh_u64 const v2 = state->v2; 2414 xxh_u64 const v3 = state->v3; 2415 xxh_u64 const v4 = state->v4; 2416 2417 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + 2418 XXH_rotl64(v4, 18); 2419 h64 = XXH64_mergeRound(h64, v1); 2420 h64 = XXH64_mergeRound(h64, v2); 2421 h64 = XXH64_mergeRound(h64, v3); 2422 h64 = XXH64_mergeRound(h64, v4); 2423 2424 } else { 2425 2426 h64 = state->v3 /*seed*/ + XXH_PRIME64_5; 2427 2428 } 2429 2430 h64 += (xxh_u64)state->total_len; 2431 2432 return XXH64_finalize(h64, (const xxh_u8 *)state->mem64, 2433 (size_t)state->total_len, XXH_aligned); 2434 2435 } 2436 2437 /******* Canonical representation *******/ 2438 2439 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst, 2440 XXH64_hash_t hash) { 2441 2442 XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); 2443 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); 2444 memcpy(dst, &hash, sizeof(*dst)); 2445 2446 } 2447 2448 XXH_PUBLIC_API XXH64_hash_t 2449 XXH64_hashFromCanonical(const XXH64_canonical_t *src) { 2450 2451 return XXH_readBE64(src); 2452 2453 } 2454 2455 /* ********************************************************************* 2456 * XXH3 2457 * New generation hash designed for speed on small keys and vectorization 2458 ************************************************************************ */ 2459 2460 /* === Compiler specifics === */ 2461 2462 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ 2463 #define XXH_RESTRICT restrict 2464 #else 2465 /* Note: it might be useful to define __restrict or __restrict__ for some 2466 * C++ compilers */ 2467 #define XXH_RESTRICT /* disable */ 2468 #endif 2469 2470 #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ 2471 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ 2472 
defined(__clang__) 2473 #define XXH_likely(x) __builtin_expect(x, 1) 2474 #define XXH_unlikely(x) __builtin_expect(x, 0) 2475 #else 2476 #define XXH_likely(x) (x) 2477 #define XXH_unlikely(x) (x) 2478 #endif 2479 2480 #if defined(__GNUC__) 2481 #if defined(__AVX2__) 2482 #include <immintrin.h> 2483 #elif defined(__SSE2__) 2484 #include <emmintrin.h> 2485 #elif defined(__ARM_NEON__) || defined(__ARM_NEON) 2486 #define inline __inline__ /* circumvent a clang bug */ 2487 #include <arm_neon.h> 2488 #undef inline 2489 #endif 2490 #elif defined(_MSC_VER) 2491 #include <intrin.h> 2492 #endif 2493 2494 /* 2495 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while 2496 * remaining a true 64-bit/128-bit hash function. 2497 * 2498 * This is done by prioritizing a subset of 64-bit operations that can be 2499 * emulated without too many steps on the average 32-bit machine. 2500 * 2501 * For example, these two lines seem similar, and run equally fast on 2502 * 64-bit: 2503 * 2504 * xxh_u64 x; 2505 * x ^= (x >> 47); // good 2506 * x ^= (x >> 13); // bad 2507 * 2508 * However, to a 32-bit machine, there is a major difference. 2509 * 2510 * x ^= (x >> 47) looks like this: 2511 * 2512 * x.lo ^= (x.hi >> (47 - 32)); 2513 * 2514 * while x ^= (x >> 13) looks like this: 2515 * 2516 * // note: funnel shifts are not usually cheap. 2517 * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); 2518 * x.hi ^= (x.hi >> 13); 2519 * 2520 * The first one is significantly faster than the second, simply because the 2521 * shift is larger than 32. This means: 2522 * - All the bits we need are in the upper 32 bits, so we can ignore the 2523 * lower 32 bits in the shift. 2524 * - The shift result will always fit in the lower 32 bits, and therefore, 2525 * we can ignore the upper 32 bits in the xor. 2526 * 2527 * Thanks to this optimization, XXH3 only requires these features to be 2528 * efficient: 2529 * 2530 * - Usable unaligned access 2531 * - A 32-bit or 64-bit ALU 2532 * - If 32-bit, a decent ADC instruction 2533 * - A 32 or 64-bit multiply with a 64-bit result 2534 * - For the 128-bit variant, a decent byteswap helps short inputs. 2535 * 2536 * The first two are already required by XXH32, and almost all 32-bit and 2537 * 64-bit platforms which can run XXH32 can run XXH3 efficiently. 2538 * 2539 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one 2540 * notable exception. 2541 * 2542 * First, Thumb-1 lacks the UMULL instruction, which 2543 * performs the important long multiply. This means numerous __aeabi_lmul 2544 * calls. 2545 * 2546 * Second, its 8 usable registers are simply not enough. 2547 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic 2548 * need Lo registers, and this shuffling results in thousands more MOVs than 2549 * A32. 2550 * 2551 * A32 and T32 don't have this limitation. They can access all 14 registers, 2552 * do a 32->64 multiply with UMULL, and their flexible second operand, 2553 * which allows free shifts, helps as well. 2554 * 2555 * Therefore, we do a quick sanity check. 2556 * 2557 * When compiling Thumb-1 for a target that supports ARM instructions, we 2558 * emit a warning, as it is not a "sane" platform to compile for. 2559 * 2560 * This usually happens by accident: you probably need to specify -march, 2561 * as you likely meant to compile for a newer 2562 * architecture.
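 * (Illustrative example only: with GCC or Clang, a flag along the lines of
 * -march=armv7-a selects an ISA with Thumb-2, which avoids the Thumb-1
 * limitations described above; the exact flag depends on your toolchain
 * and target.)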
2563 * 2564 * Credit: large sections of the vectorial and asm source code paths 2565 * have been contributed by @easyaspi314 2566 */ 2567 #if defined(__thumb__) && !defined(__thumb2__) && \ 2568 defined(__ARM_ARCH_ISA_ARM) 2569 #warning "XXH3 is highly inefficient without ARM or Thumb-2." 2570 #endif 2571 2572 /* ========================================== 2573 * Vectorization detection 2574 * ========================================== */ 2575 #define XXH_SCALAR 0 /* Portable scalar version */ 2576 #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ 2577 #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ 2578 #define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */ 2579 #define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */ 2580 #define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */ 2581 2582 #ifndef XXH_VECTOR /* can be defined on command line */ 2583 #if defined(__AVX512F__) 2584 #define XXH_VECTOR XXH_AVX512 2585 #elif defined(__AVX2__) 2586 #define XXH_VECTOR XXH_AVX2 2587 #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ 2588 (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) 2589 #define XXH_VECTOR XXH_SSE2 2590 #elif defined(__GNUC__) /* msvc support maybe later */ \ 2591 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ 2592 (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ 2593 || (defined(__BYTE_ORDER__) && \ 2594 __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) 2595 #define XXH_VECTOR XXH_NEON 2596 #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \ 2597 (defined(__s390x__) && defined(__VEC__)) && \ 2598 defined(__GNUC__) /* TODO: IBM XL */ 2599 #define XXH_VECTOR XXH_VSX 2600 #else 2601 #define XXH_VECTOR XXH_SCALAR 2602 #endif 2603 #endif 2604 2605 /* 2606 * Controls the alignment of the accumulator, 2607 * for compatibility with aligned vector loads, which are usually faster. 2608 */ 2609 #ifndef XXH_ACC_ALIGN 2610 #if defined(XXH_X86DISPATCH) 2611 #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ 2612 #elif XXH_VECTOR == XXH_SCALAR /* scalar */ 2613 #define XXH_ACC_ALIGN 8 2614 #elif XXH_VECTOR == XXH_SSE2 /* sse2 */ 2615 #define XXH_ACC_ALIGN 16 2616 #elif XXH_VECTOR == XXH_AVX2 /* avx2 */ 2617 #define XXH_ACC_ALIGN 32 2618 #elif XXH_VECTOR == XXH_NEON /* neon */ 2619 #define XXH_ACC_ALIGN 16 2620 #elif XXH_VECTOR == XXH_VSX /* vsx */ 2621 #define XXH_ACC_ALIGN 16 2622 #elif XXH_VECTOR == XXH_AVX512 /* avx512 */ 2623 #define XXH_ACC_ALIGN 64 2624 #endif 2625 #endif 2626 2627 #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \ 2628 XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 2629 #define XXH_SEC_ALIGN XXH_ACC_ALIGN 2630 #else 2631 #define XXH_SEC_ALIGN 8 2632 #endif 2633 2634 /* 2635 * UGLY HACK: 2636 * GCC usually generates the best code with -O3 for xxHash. 2637 * 2638 * However, when targeting AVX2, it is overzealous in its unrolling 2639 * resulting in code roughly 3/4 the speed of Clang. 2640 * 2641 * There are other issues, such as GCC splitting _mm256_loadu_si256 into 2642 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which 2643 * only applies to Sandy and Ivy Bridge... which don't even support AVX2. 2644 * 2645 * That is why when compiling the AVX2 version, it is recommended to use 2646 * either -O2 -mavx2 -march=haswell or -O2 -mavx2 2647 * -mno-avx256-split-unaligned-load for decent performance, or to use Clang 2648 * instead. 
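 * An illustrative command line for this workaround (adjust to your build
 * system):
 *
 *   gcc -O2 -mavx2 -mno-avx256-split-unaligned-load -c xxhash.c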
2649 * 2650 * Fortunately, we can control the first one with a pragma that forces GCC 2651 * into -O2, but the other one we can't control without "failed to inline 2652 * always inline function due to target mismatch" warnings. 2653 */ 2654 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ 2655 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 2656 && defined(__OPTIMIZE__) && \ 2657 !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ 2658 #pragma GCC push_options 2659 #pragma GCC optimize("-O2") 2660 #endif 2661 2662 #if XXH_VECTOR == XXH_NEON 2663 /* 2664 * NEON's setup for vmlal_u32 is a little more complicated than it is on 2665 * SSE2, AVX2, and VSX. 2666 * 2667 * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an 2668 * upcast. 2669 * 2670 * To do the same operation, the 128-bit 'Q' register needs to be split 2671 * into two 64-bit 'D' registers, performing this operation:: 2672 * 2673 * [ a | b ] | 2674 * '---------. .--------' | | x | 2675 * | .---------' '--------. | 2676 * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] 2677 * 2678 * Due to significant changes in aarch64, the fastest method for aarch64 2679 * is completely different than the fastest method for ARMv7-A. 2680 * 2681 * ARMv7-A treats D registers as unions overlaying Q registers, so 2682 * modifying D11 will modify the high half of Q5. This is similar to how 2683 * modifying AH will only affect bits 8-15 of AX on x86. 2684 * 2685 * VZIP takes two registers, and puts even lanes in one register and odd 2686 * lanes in the other. 2687 * 2688 * On ARMv7-A, this strangely modifies both parameters in place instead of 2689 * taking the usual 3-operand form. 2690 * 2691 * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on 2692 * the lower and upper halves of the Q register to end up with the high 2693 * and low halves where we want - all in one instruction. 2694 * 2695 * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], 2696 * d11[1] } 2697 * 2698 * Unfortunately we need inline assembly for this: Instructions modifying 2699 * two registers at once is not possible in GCC or Clang's IR, and they 2700 * have to create a copy. 2701 * 2702 * aarch64 requires a different approach. 2703 * 2704 * In order to make it easier to write a decent compiler for aarch64, many 2705 * quirks were removed, such as conditional execution. 2706 * 2707 * NEON was also affected by this. 2708 * 2709 * aarch64 cannot access the high bits of a Q-form register, and writes to 2710 * a D-form register zero the high bits, similar to how writes to W-form 2711 * scalar registers (or DWORD registers on x86_64) work. 2712 * 2713 * The formerly free vget_high intrinsics now require a vext (with a few 2714 * exceptions) 2715 * 2716 * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the 2717 * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to 2718 * only modify one operand. 
2719 * 2720 * The equivalent of the VZIP.32 on the lower and upper halves would be 2721 * this mess: 2722 * 2723 * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } 2724 * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } 2725 * zip2 v0.2s, v0.2s, v2.2s // v0 = { v0[1], v2[1] } 2726 * 2727 * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 2728 * (SHRN): 2729 * 2730 * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); 2731 * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); 2732 * 2733 * This is available on ARMv7-A, but is less efficient than a single 2734 * VZIP.32. 2735 */ 2736 2737 /* 2738 * Function-like macro: 2739 * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t 2740 * &outHi) 2741 * { 2742 2743 * outLo = (uint32x2_t)(in & 0xFFFFFFFF); 2744 * outHi = (uint32x2_t)(in >> 32); 2745 * in = UNDEFINED; 2746 * } 2747 */ 2748 #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ 2749 && defined(__GNUC__) && !defined(__aarch64__) && !defined(__arm64__) 2750 #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ 2751 do { \ 2752 \ 2753 /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \ 2754 * %f0 = upper D half */ \ 2755 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \ 2756 */ \ 2757 /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \ 2758 */ \ 2759 __asm__("vzip.32 %e0, %f0" : "+w"(in)); \ 2760 (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \ 2761 (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ 2762 \ 2763 } while (0) 2764 2765 #else 2766 #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ 2767 do { \ 2768 \ 2769 (outLo) = vmovn_u64(in); \ 2770 (outHi) = vshrn_n_u64((in), 32); \ 2771 \ 2772 } while (0) 2773 2774 #endif 2775 #endif /* XXH_VECTOR == XXH_NEON */ 2776 2777 /* 2778 * VSX and Z Vector helpers. 2779 * 2780 * This is very messy, and any pull requests to clean this up are welcome. 2781 * 2782 * There are a lot of problems with supporting VSX and s390x, due to 2783 * inconsistent intrinsics, spotty coverage, and multiple endiannesses. 2784 */ 2785 #if XXH_VECTOR == XXH_VSX 2786 #if defined(__s390x__) 2787 #include <s390intrin.h> 2788 #else 2789 /* gcc's altivec.h can have the unwanted consequence of unconditionally 2790 * #defining the bool, vector, and pixel keywords, 2791 * breaking programs that already use these identifiers for 2792 * other purposes. The paragraph defining these macros is skipped when 2793 * __APPLE_ALTIVEC__ is defined. 2794 * __APPLE_ALTIVEC__ is _generally_ defined automatically by the 2795 * compiler, but it seems that, in some cases, it isn't. Force the build 2796 * macro to be defined, so that keywords are not altered. 2797 */ 2798 #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) 2799 #define __APPLE_ALTIVEC__ 2800 #endif 2801 #include <altivec.h> 2802 #endif 2803 2804 typedef __vector unsigned long long xxh_u64x2; 2805 typedef __vector unsigned char xxh_u8x16; 2806 typedef __vector unsigned xxh_u32x4; 2807 2808 #ifndef XXH_VSX_BE 2809 #if defined(__BIG_ENDIAN__) || \ 2810 (defined(__BYTE_ORDER__) && \ 2811 __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 2812 #define XXH_VSX_BE 1 2813 #elif defined(__VEC_ELEMENT_REG_ORDER__) && \ 2814 __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ 2815 #warning \ 2816 "-maltivec=be is not recommended. Please use native endianness."
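/* Note: -maltivec=be selects big-endian vector element ordering even on
 * little-endian targets, which is why it is treated as big endian here. */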
2817 #define XXH_VSX_BE 1 2818 #else 2819 #define XXH_VSX_BE 0 2820 #endif 2821 #endif /* !defined(XXH_VSX_BE) */ 2822 2823 #if XXH_VSX_BE 2824 /* A wrapper for POWER9's vec_revb. */ 2825 #if defined(__POWER9_VECTOR__) || \ 2826 (defined(__clang__) && defined(__s390x__)) 2827 #define XXH_vec_revb vec_revb 2828 #else 2829 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) { 2830 2831 xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 2832 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08}; 2833 return vec_perm(val, val, vByteSwap); 2834 2835 } 2836 2837 #endif 2838 #endif /* XXH_VSX_BE */ 2839 2840 /* 2841 * Performs an unaligned load, and byte-swaps the result on big endian. 2842 */ 2843 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { 2844 2845 xxh_u64x2 ret; 2846 memcpy(&ret, ptr, sizeof(xxh_u64x2)); 2847 #if XXH_VSX_BE 2848 ret = XXH_vec_revb(ret); 2849 #endif 2850 return ret; 2851 2852 } 2853 2854 /* 2855 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC. 2856 * 2857 * These intrinsics weren't added until GCC 8, despite existing for a 2858 * while, and they are endian-dependent. Worse, their meaning swaps 2859 * depending on the version. 2860 */ 2861 #if defined(__s390x__) 2862 /* s390x is always big endian, no issue on this platform */ 2863 #define XXH_vec_mulo vec_mulo 2864 #define XXH_vec_mule vec_mule 2865 #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) 2866 /* Clang has a better way to control this: we can just use the builtin, 2867 * which doesn't swap. */ 2868 #define XXH_vec_mulo __builtin_altivec_vmulouw 2869 #define XXH_vec_mule __builtin_altivec_vmuleuw 2870 #else 2871 /* gcc needs inline assembly */ 2872 /* Adapted from 2873 * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h.
*/ 2874 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) { 2875 2876 xxh_u64x2 result; 2877 __asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b)); 2878 return result; 2879 2880 } 2881 2882 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) { 2883 2884 xxh_u64x2 result; 2885 __asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b)); 2886 return result; 2887 2888 } 2889 2890 #endif /* XXH_vec_mulo, XXH_vec_mule */ 2891 #endif /* XXH_VECTOR == XXH_VSX */ 2892 2893 /* prefetch 2894 * can be disabled, by declaring XXH_NO_PREFETCH build macro */ 2895 #if defined(XXH_NO_PREFETCH) 2896 #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 2897 #else 2898 #if defined(_MSC_VER) && \ 2899 (defined(_M_X64) || \ 2900 defined( \ 2901 _M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ 2902 #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ 2903 #define XXH_PREFETCH(ptr) _mm_prefetch((const char *)(ptr), _MM_HINT_T0) 2904 #elif defined(__GNUC__) && \ 2905 ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) 2906 #define XXH_PREFETCH(ptr) \ 2907 __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) 2908 #else 2909 #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 2910 #endif 2911 #endif /* XXH_NO_PREFETCH */ 2912 2913 /* ========================================== 2914 * XXH3 default settings 2915 * ========================================== */ 2916 2917 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ 2918 2919 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) 2920 #error "default keyset is not large enough" 2921 #endif 2922 2923 /* Pseudorandom secret taken directly from FARSH */ 2924 XXH_ALIGN(64) 2925 static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 2926 2927 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 2928 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 2929 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e, 2930 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, 2931 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 2932 0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 2933 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97, 2934 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, 2935 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 2936 0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 2937 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83, 2938 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 2939 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 2940 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 2941 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f, 2942 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, 2943 2944 }; 2945 2946 #ifdef XXH_OLD_NAMES 2947 #define kSecret XXH3_kSecret 2948 #endif 2949 2950 /* 2951 * Calculates a 32-bit to 64-bit long multiply. 2952 * 2953 * Wraps __emulu on MSVC x86 because it tends to call __allmul when it 2954 * doesn't need to (but it shouldn't need to anyways, it is about 7 2955 * instructions to do a 64x64 multiply...). Since we know that this will 2956 * _always_ emit MULL, we use that instead of the normal method. 
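 * (For reference: on 32-bit x86, a MUL with a 32-bit operand already
 * leaves the full 64-bit product in EDX:EAX, so a single MULL is all that
 * is needed.)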
2957 * 2958 * If you are compiling for platforms like Thumb-1 and don't have a better 2959 * option, you may also want to write your own long multiply routine here. 2960 * 2961 * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) 2962 * { 2963 2964 * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); 2965 * } 2966 */ 2967 #if defined(_MSC_VER) && defined(_M_IX86) 2968 #include <intrin.h> 2969 #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) 2970 #else 2971 /* 2972 * Downcast + upcast is usually better than masking on older compilers 2973 * like GCC 4.2 (especially 32-bit ones), all without affecting newer 2974 * compilers. 2975 * 2976 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both 2977 * operands and perform a full 64x64 multiply -- entirely redundant on 2978 * 32-bit. 2979 */ 2980 #define XXH_mult32to64(x, y) \ 2981 ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) 2982 #endif 2983 2984 /* 2985 * Calculates a 64->128-bit long multiply. 2986 * 2987 * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. 2988 */ 2989 static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) { 2990 2991 /* 2992 * GCC/Clang __uint128_t method. 2993 * 2994 * On most 64-bit targets, GCC and Clang define a __uint128_t type. 2995 * This is usually the best way as it usually uses a native long 64-bit 2996 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. 2997 * 2998 * Usually. 2999 * 3000 * Despite being a 32-bit platform, Clang (and emscripten) define this type 3001 * despite not having the arithmetic for it. This results in a laggy 3002 * compiler builtin call which calculates a full 128-bit multiply. 3003 * In that case it is best to use the portable one. 3004 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 3005 */ 3006 #if defined(__GNUC__) && !defined(__wasm__) && \ 3007 defined(__SIZEOF_INT128__) || \ 3008 (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) 3009 3010 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; 3011 XXH128_hash_t r128; 3012 r128.low64 = (xxh_u64)(product); 3013 r128.high64 = (xxh_u64)(product >> 64); 3014 return r128; 3015 3016 /* 3017 * MSVC for x64's _umul128 method. 3018 * 3019 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 3020 * *HighProduct); 3021 * 3022 * This compiles to single operand MUL on x64. 3023 */ 3024 #elif defined(_M_X64) || defined(_M_IA64) 3025 3026 #ifndef _MSC_VER 3027 #pragma intrinsic(_umul128) 3028 #endif 3029 xxh_u64 product_high; 3030 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); 3031 XXH128_hash_t r128; 3032 r128.low64 = product_low; 3033 r128.high64 = product_high; 3034 return r128; 3035 3036 #else 3037 /* 3038 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. 3039 * 3040 * This is a fast and simple grade school multiply, which is shown below 3041 * with base 10 arithmetic instead of base 0x100000000. 
3042 * 3043 * 9 3 // D2 lhs = 93 3044 * x 7 5 // D2 rhs = 75 3045 * ---------- 3046 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 3047 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 3048 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 3049 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 3050 * --------- 3051 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 3052 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 3053 * --------- 3054 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 3055 * 3056 * The reasons for adding the products like this are: 3057 * 1. It avoids manual carry tracking. Just like how 3058 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. 3059 * This avoids a lot of complexity. 3060 * 3061 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL 3062 * instruction available in ARM's Digital Signal Processing extension 3063 * in 32-bit ARMv6 and later, which is shown below: 3064 * 3065 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) 3066 * { 3067 3068 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; 3069 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); 3070 * *RdHi = (xxh_u32)(product >> 32); 3071 * } 3072 * 3073 * This instruction was designed for efficient long multiplication, and 3074 * allows this to be calculated in only 4 instructions at speeds 3075 * comparable to some 64-bit ALUs. 3076 * 3077 * 3. It isn't terrible on other platforms. Usually this will be a couple 3078 * of 32-bit ADD/ADCs. 3079 */ 3080 3081 /* First calculate all of the cross products. */ 3082 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); 3083 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); 3084 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); 3085 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); 3086 3087 /* Now add the products together. These will never overflow. */ 3088 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; 3089 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; 3090 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); 3091 3092 XXH128_hash_t r128; 3093 r128.low64 = lower; 3094 r128.high64 = upper; 3095 return r128; 3096 #endif 3097 3098 } 3099 3100 /* 3101 * Does a 64-bit to 128-bit multiply, then XOR folds it. 3102 * 3103 * The reason for the separate function is to prevent passing too many structs 3104 * around by value. This will hopefully inline the multiply, but we don't force 3105 * it. 3106 */ 3107 static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) { 3108 3109 XXH128_hash_t product = XXH_mult64to128(lhs, rhs); 3110 return product.low64 ^ product.high64; 3111 3112 } 3113 3114 /* Seems to produce slightly better code on GCC for some reason. 
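 * (That is, routing the operation through this small helper, rather than
 * writing v64 ^ (v64 >> shift) inline at each call site.)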
*/ 3115 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { 3116 3117 XXH_ASSERT(0 <= shift && shift < 64); 3118 return v64 ^ (v64 >> shift); 3119 3120 } 3121 3122 /* 3123 * This is a fast avalanche stage, 3124 * suitable when input bits are already partially mixed 3125 */ 3126 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) { 3127 3128 h64 = XXH_xorshift64(h64, 37); 3129 h64 *= 0x165667919E3779F9ULL; 3130 h64 = XXH_xorshift64(h64, 32); 3131 return h64; 3132 3133 } 3134 3135 /* 3136 * This is a stronger avalanche, 3137 * inspired by Pelle Evensen's rrmxmx 3138 * preferable when input has not been previously mixed 3139 */ 3140 static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) { 3141 3142 /* this mix is inspired by Pelle Evensen's rrmxmx */ 3143 h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); 3144 h64 *= 0x9FB21C651E98DF25ULL; 3145 h64 ^= (h64 >> 35) + len; 3146 h64 *= 0x9FB21C651E98DF25ULL; 3147 return XXH_xorshift64(h64, 28); 3148 3149 } 3150 3151 /* ========================================== 3152 * Short keys 3153 * ========================================== 3154 * One of the shortcomings of XXH32 and XXH64 was that their performance was 3155 * sub-optimal on short lengths. It used an iterative algorithm which strongly 3156 * favored lengths that were a multiple of 4 or 8. 3157 * 3158 * Instead of iterating over individual inputs, we use a set of single shot 3159 * functions which piece together a range of lengths and operate in constant 3160 * time. 3161 * 3162 * Additionally, the number of multiplies has been significantly reduced. This 3163 * reduces latency, especially when emulating 64-bit multiplies on 32-bit. 3164 * 3165 * Depending on the platform, this may or may not be faster than XXH32, but it 3166 * is almost guaranteed to be faster than XXH64. 3167 */ 3168 3169 /* 3170 * At very short lengths, there isn't enough input to fully hide secrets, or use 3171 * the entire secret. 3172 * 3173 * There is also only a limited amount of mixing we can do before significantly 3174 * impacting performance. 3175 * 3176 * Therefore, we use different sections of the secret and always mix two secret 3177 * samples with an XOR. This should have no effect on performance on the 3178 * seedless or withSeed variants because everything _should_ be constant folded 3179 * by modern compilers. 3180 * 3181 * The XOR mixing hides individual parts of the secret and increases entropy. 3182 * 3183 * This adds an extra layer of strength for custom secrets. 
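 * As a concrete instance of this pattern, XXH3_len_1to3_64b() below
 * builds its mixing constant from two 32-bit secret samples:
 *
 *   bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
 *   keyed   = combined ^ bitflip;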
3184 */ 3185 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8 *input, size_t len, 3186 const xxh_u8 *secret, 3187 XXH64_hash_t seed) { 3188 3189 XXH_ASSERT(input != NULL); 3190 XXH_ASSERT(1 <= len && len <= 3); 3191 XXH_ASSERT(secret != NULL); 3192 /* 3193 * len = 1: combined = { input[0], 0x01, input[0], input[0] } 3194 * len = 2: combined = { input[1], 0x02, input[0], input[1] } 3195 * len = 3: combined = { input[2], 0x03, input[0], input[1] } 3196 */ 3197 { 3198 3199 xxh_u8 const c1 = input[0]; 3200 xxh_u8 const c2 = input[len >> 1]; 3201 xxh_u8 const c3 = input[len - 1]; 3202 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | 3203 ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 3204 xxh_u64 const bitflip = 3205 (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed; 3206 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; 3207 return XXH64_avalanche(keyed); 3208 3209 } 3210 3211 } 3212 3213 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len, 3214 const xxh_u8 *secret, 3215 XXH64_hash_t seed) { 3216 3217 XXH_ASSERT(input != NULL); 3218 XXH_ASSERT(secret != NULL); 3219 XXH_ASSERT(4 <= len && len <= 8); /* len == 8 is routed here by XXH3_len_0to16_64b() */ 3220 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 3221 { 3222 3223 xxh_u32 const input1 = XXH_readLE32(input); 3224 xxh_u32 const input2 = XXH_readLE32(input + len - 4); 3225 xxh_u64 const bitflip = 3226 (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed; 3227 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); 3228 xxh_u64 const keyed = input64 ^ bitflip; 3229 return XXH3_rrmxmx(keyed, len); 3230 3231 } 3232 3233 } 3234 3235 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input, 3236 size_t len, 3237 const xxh_u8 *secret, 3238 XXH64_hash_t seed) { 3239 3240 XXH_ASSERT(input != NULL); 3241 XXH_ASSERT(secret != NULL); 3242 XXH_ASSERT(8 <= len && len <= 16); 3243 { 3244 3245 xxh_u64 const bitflip1 = 3246 (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed; 3247 xxh_u64 const bitflip2 = 3248 (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed; 3249 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; 3250 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; 3251 xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi + 3252 XXH3_mul128_fold64(input_lo, input_hi); 3253 return XXH3_avalanche(acc); 3254 3255 } 3256 3257 } 3258 3259 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8 *input, 3260 size_t len, 3261 const xxh_u8 *secret, 3262 XXH64_hash_t seed) { 3263 3264 XXH_ASSERT(len <= 16); 3265 { 3266 3267 if (XXH_likely(len > 8)) 3268 return XXH3_len_9to16_64b(input, len, secret, seed); 3269 if (XXH_likely(len >= 4)) 3270 return XXH3_len_4to8_64b(input, len, secret, seed); 3271 if (len) return XXH3_len_1to3_64b(input, len, secret, seed); 3272 return XXH64_avalanche( 3273 seed ^ (XXH_readLE64(secret + 56) ^ XXH_readLE64(secret + 64))); 3274 3275 } 3276 3277 } 3278 3279 /* 3280 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to 3281 * multiplication by zero, affecting hashes of lengths 17 to 240. 3282 * 3283 * However, they are very unlikely. 3284 * 3285 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all 3286 * unseeded non-cryptographic hashes, it does not attempt to defend itself 3287 * against specially crafted inputs, only random inputs.
3288 * 3289 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes 3290 * cancelling out the secret is taken an arbitrary number of times (addressed 3291 * in XXH3_accumulate_512), this collision is very unlikely with random inputs 3292 * and/or proper seeding: 3293 * 3294 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a 3295 * function that is only called up to 16 times per hash with up to 240 bytes of 3296 * input. 3297 * 3298 * This is not too bad for a non-cryptographic hash function, especially with 3299 * only 64 bit outputs. 3300 * 3301 * The 128-bit variant (which trades some speed for strength) is NOT affected 3302 * by this, although it is always a good idea to use a proper seed if you care 3303 * about strength. 3304 */ 3305 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input, 3306 const xxh_u8 *XXH_RESTRICT secret, 3307 xxh_u64 seed64) { 3308 3309 #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 3310 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ 3311 && \ 3312 !defined( \ 3313 XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ 3314 /* 3315 * UGLY HACK: 3316 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in 3317 * slower code. 3318 * 3319 * By forcing seed64 into a register, we disrupt the cost model and 3320 * cause it to scalarize. See `XXH32_round()` 3321 * 3322 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, 3323 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on 3324 * GCC 9.2, despite both emitting scalar code. 3325 * 3326 * GCC generates much better scalar code than Clang for the rest of XXH3, 3327 * which is why finding a more optimal codepath is an interest. 3328 */ 3329 __asm__("" : "+r"(seed64)); 3330 #endif 3331 { 3332 3333 xxh_u64 const input_lo = XXH_readLE64(input); 3334 xxh_u64 const input_hi = XXH_readLE64(input + 8); 3335 return XXH3_mul128_fold64(input_lo ^ (XXH_readLE64(secret) + seed64), 3336 input_hi ^ (XXH_readLE64(secret + 8) - seed64)); 3337 3338 } 3339 3340 } 3341 3342 /* For mid range keys, XXH3 uses a Mum-hash variant. 
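 * Each 16-byte block is folded via XXH3_mix16B() above, essentially:
 *
 *   acc += XXH3_mul128_fold64(in_lo ^ (XXH_readLE64(sec) + seed),
 *                             in_hi ^ (XXH_readLE64(sec + 8) - seed));
 *
 * and the accumulated sum is finished with a single XXH3_avalanche().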
*/ 3343 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b( 3344 const xxh_u8 *XXH_RESTRICT input, size_t len, 3345 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { 3346 3347 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 3348 (void)secretSize; 3349 XXH_ASSERT(16 < len && len <= 128); 3350 3351 { 3352 3353 xxh_u64 acc = len * XXH_PRIME64_1; 3354 if (len > 32) { 3355 3356 if (len > 64) { 3357 3358 if (len > 96) { 3359 3360 acc += XXH3_mix16B(input + 48, secret + 96, seed); 3361 acc += XXH3_mix16B(input + len - 64, secret + 112, seed); 3362 3363 } 3364 3365 acc += XXH3_mix16B(input + 32, secret + 64, seed); 3366 acc += XXH3_mix16B(input + len - 48, secret + 80, seed); 3367 3368 } 3369 3370 acc += XXH3_mix16B(input + 16, secret + 32, seed); 3371 acc += XXH3_mix16B(input + len - 32, secret + 48, seed); 3372 3373 } 3374 3375 acc += XXH3_mix16B(input + 0, secret + 0, seed); 3376 acc += XXH3_mix16B(input + len - 16, secret + 16, seed); 3377 3378 return XXH3_avalanche(acc); 3379 3380 } 3381 3382 } 3383 3384 #define XXH3_MIDSIZE_MAX 240 3385 3386 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b( 3387 const xxh_u8 *XXH_RESTRICT input, size_t len, 3388 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { 3389 3390 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 3391 (void)secretSize; 3392 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 3393 3394 #define XXH3_MIDSIZE_STARTOFFSET 3 3395 #define XXH3_MIDSIZE_LASTOFFSET 17 3396 3397 { 3398 3399 xxh_u64 acc = len * XXH_PRIME64_1; 3400 int const nbRounds = (int)len / 16; 3401 int i; 3402 for (i = 0; i < 8; i++) { 3403 3404 acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed); 3405 3406 } 3407 3408 acc = XXH3_avalanche(acc); 3409 XXH_ASSERT(nbRounds >= 8); 3410 #if defined(__clang__) /* Clang */ \ 3411 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 3412 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 3413 /* 3414 * UGLY HACK: 3415 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. 3416 * In everywhere else, it uses scalar code. 3417 * 3418 * For 64->128-bit multiplies, even if the NEON was 100% optimal, it 3419 * would still be slower than UMAAL (see XXH_mult64to128). 3420 * 3421 * Unfortunately, Clang doesn't handle the long multiplies properly and 3422 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then 3423 * scalarized into an ugly mess of VMOV.32 instructions. 3424 * 3425 * This mess is difficult to avoid without turning autovectorization 3426 * off completely, but they are usually relatively minor and/or not 3427 * worth it to fix. 3428 * 3429 * This loop is the easiest to fix, as unlike XXH32, this pragma 3430 * _actually works_ because it is a loop vectorization instead of an 3431 * SLP vectorization. 
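 * (The pragma below applies only to the loop that immediately follows
 * it; the rest of the function is left to the compiler's usual
 * heuristics.)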
3432 */ 3433 #pragma clang loop vectorize(disable) 3434 #endif 3435 for (i = 8; i < nbRounds; i++) { 3436 3437 acc += 3438 XXH3_mix16B(input + (16 * i), 3439 secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed); 3440 3441 } 3442 3443 /* last bytes */ 3444 acc += XXH3_mix16B(input + len - 16, 3445 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, 3446 seed); 3447 return XXH3_avalanche(acc); 3448 3449 } 3450 3451 } 3452 3453 /* ======= Long Keys ======= */ 3454 3455 #define XXH_STRIPE_LEN 64 3456 #define XXH_SECRET_CONSUME_RATE \ 3457 8 /* nb of secret bytes consumed at each accumulation */ 3458 #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) 3459 3460 #ifdef XXH_OLD_NAMES 3461 #define STRIPE_LEN XXH_STRIPE_LEN 3462 #define ACC_NB XXH_ACC_NB 3463 #endif 3464 3465 XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) { 3466 3467 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); 3468 memcpy(dst, &v64, sizeof(v64)); 3469 3470 } 3471 3472 /* Several intrinsic functions below are supposed to accept __int64 as 3473 * argument, as documented in 3474 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . However, 3475 * several environments do not define __int64 type, requiring a workaround. 3476 */ 3477 #if !defined(__VMS) && \ 3478 (defined(__cplusplus) || (defined(__STDC_VERSION__) && \ 3479 (__STDC_VERSION__ >= 199901L) /* C99 */)) 3480 typedef int64_t xxh_i64; 3481 #else 3482 /* the following type must have a width of 64-bit */ 3483 typedef long long xxh_i64; 3484 #endif 3485 3486 /* 3487 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the 3488 * most optimized. 3489 * 3490 * It is a hardened version of UMAC, based off of FARSH's implementation. 3491 * 3492 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD 3493 * implementations, and it is ridiculously fast. 3494 * 3495 * We harden it by mixing the original input to the accumulators as well as 3496 * the product. 3497 * 3498 * This means that in the (relatively likely) case of a multiply by zero, the 3499 * original input is preserved. 3500 * 3501 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve 3502 * cross-pollination, as otherwise the upper and lower halves would be 3503 * essentially independent. 3504 * 3505 * This doesn't matter on 64-bit hashes since they all get merged together in 3506 * the end, so we skip the extra step. 3507 * 3508 * Both XXH3_64bits and XXH3_128bits use this subroutine. 
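 * In scalar terms, one lane i of the stripe accumulation is roughly
 * (sketch only; the vectorized versions below are the reference):
 *
 *   data_val    = XXH_readLE64(input + 8 * i);
 *   data_key    = data_val ^ XXH_readLE64(secret + 8 * i);
 *   acc[i ^ 1] += data_val; // mix the swapped input into the neighbor lane
 *   acc[i]     += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);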
3509 */ 3510 3511 #if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH) 3512 3513 #ifndef XXH_TARGET_AVX512 3514 #define XXH_TARGET_AVX512 /* disable attribute target */ 3515 #endif 3516 3517 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512( 3518 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, 3519 const void *XXH_RESTRICT secret) { 3520 3521 XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc; 3522 XXH_ASSERT((((size_t)acc) & 63) == 0); 3523 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 3524 3525 { 3526 3527 /* data_vec = input[0]; */ 3528 __m512i const data_vec = _mm512_loadu_si512(input); 3529 /* key_vec = secret[0]; */ 3530 __m512i const key_vec = _mm512_loadu_si512(secret); 3531 /* data_key = data_vec ^ key_vec; */ 3532 __m512i const data_key = _mm512_xor_si512(data_vec, key_vec); 3533 /* data_key_lo = data_key >> 32; */ 3534 __m512i const data_key_lo = 3535 _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); 3536 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 3537 __m512i const product = _mm512_mul_epu32(data_key, data_key_lo); 3538 /* xacc[0] += swap(data_vec); */ 3539 __m512i const data_swap = 3540 _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); 3541 __m512i const sum = _mm512_add_epi64(*xacc, data_swap); 3542 /* xacc[0] += product; */ 3543 *xacc = _mm512_add_epi64(product, sum); 3544 3545 } 3546 3547 } 3548 3549 /* 3550 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. 3551 * 3552 * Multiplication isn't perfect, as explained by Google in HighwayHash: 3553 * 3554 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to 3555 * // varying degrees. In descending order of goodness, bytes 3556 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. 3557 * // As expected, the upper and lower bytes are much worse. 3558 * 3559 * Source: 3560 * https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 3561 * 3562 * Since our algorithm uses a pseudorandom secret to add some variance into the 3563 * mix, we don't need to (or want to) mix as often or as much as HighwayHash 3564 * does. 3565 * 3566 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid 3567 * extraction. 3568 * 3569 * Both XXH3_64bits and XXH3_128bits use this subroutine. 
3570 */ 3571 3572 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512( 3573 void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) { 3574 3575 XXH_ASSERT((((size_t)acc) & 63) == 0); 3576 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 3577 { 3578 3579 XXH_ALIGN(64) __m512i *const xacc = (__m512i *)acc; 3580 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); 3581 3582 /* xacc[0] ^= (xacc[0] >> 47) */ 3583 __m512i const acc_vec = *xacc; 3584 __m512i const shifted = _mm512_srli_epi64(acc_vec, 47); 3585 __m512i const data_vec = _mm512_xor_si512(acc_vec, shifted); 3586 /* xacc[0] ^= secret; */ 3587 __m512i const key_vec = _mm512_loadu_si512(secret); 3588 __m512i const data_key = _mm512_xor_si512(data_vec, key_vec); 3589 3590 /* xacc[0] *= XXH_PRIME32_1; */ 3591 __m512i const data_key_hi = 3592 _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); 3593 __m512i const prod_lo = _mm512_mul_epu32(data_key, prime32); 3594 __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32); 3595 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); 3596 3597 } 3598 3599 } 3600 3601 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512( 3602 void *XXH_RESTRICT customSecret, xxh_u64 seed64) { 3603 3604 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); 3605 XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); 3606 XXH_ASSERT(((size_t)customSecret & 63) == 0); 3607 (void)(&XXH_writeLE64); 3608 { 3609 3610 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); 3611 __m512i const seed = _mm512_mask_set1_epi64( 3612 _mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64); 3613 3614 XXH_ALIGN(64) const __m512i *const src = (const __m512i *)XXH3_kSecret; 3615 XXH_ALIGN(64) __m512i *const dest = (__m512i *)customSecret; 3616 int i; 3617 for (i = 0; i < nbRounds; ++i) { 3618 3619 /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void 3620 * const*', this will warn "discards ‘const’ qualifier". */ 3621 union { 3622 3623 XXH_ALIGN(64) const __m512i *cp; 3624 XXH_ALIGN(64) void *p; 3625 3626 } remote_const_void; 3627 3628 remote_const_void.cp = src + i; 3629 dest[i] = 3630 _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); 3631 3632 } 3633 3634 } 3635 3636 } 3637 3638 #endif 3639 3640 #if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH) 3641 3642 #ifndef XXH_TARGET_AVX2 3643 #define XXH_TARGET_AVX2 /* disable attribute target */ 3644 #endif 3645 3646 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2( 3647 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, 3648 const void *XXH_RESTRICT secret) { 3649 3650 XXH_ASSERT((((size_t)acc) & 31) == 0); 3651 { 3652 3653 XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc; 3654 /* Unaligned. This is mainly for pointer arithmetic, and because 3655 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 3656 */ 3657 const __m256i *const xinput = (const __m256i *)input; 3658 /* Unaligned. This is mainly for pointer arithmetic, and because 3659 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
 */
    const __m256i *const xsecret = (const __m256i *)secret;

    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {

      /* data_vec = xinput[i]; */
      __m256i const data_vec = _mm256_loadu_si256(xinput + i);
      /* key_vec = xsecret[i]; */
      __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
      /* data_key = data_vec ^ key_vec; */
      __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
      /* data_key_lo = data_key >> 32; */
      __m256i const data_key_lo =
          _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
      /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
      __m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
      /* xacc[i] += swap(data_vec); */
      __m256i const data_swap =
          _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
      __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
      /* xacc[i] += product; */
      xacc[i] = _mm256_add_epi64(product, sum);

    }

  }

}

XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {

  XXH_ASSERT((((size_t)acc) & 31) == 0);
  {

    XXH_ALIGN(32) __m256i *const xacc = (__m256i *)acc;
    /* Unaligned. This is mainly for pointer arithmetic, and because
     * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
    const __m256i *const xsecret = (const __m256i *)secret;
    const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);

    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {

      /* xacc[i] ^= (xacc[i] >> 47) */
      __m256i const acc_vec = xacc[i];
      __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
      __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
      /* xacc[i] ^= xsecret; */
      __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
      __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);

      /* xacc[i] *= XXH_PRIME32_1; */
      __m256i const data_key_hi =
          _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
      __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
      __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
      xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));

    }

  }

}

XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {

  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
  XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
  (void)(&XXH_writeLE64);
  XXH_PREFETCH(customSecret);
  {

    __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64,
                                           -(xxh_i64)seed64, (xxh_i64)seed64);

    XXH_ALIGN(64) const __m256i *const src = (const __m256i *)XXH3_kSecret;
    XXH_ALIGN(64) __m256i *dest = (__m256i *)customSecret;

#if defined(__GNUC__) || defined(__clang__)
    /*
     * On GCC & Clang, marking 'dest' as modified causes the compiler to:
     *   - not extract the secret from sse registers in the internal loop
     *   - use fewer common registers, and avoid pushing them onto the stack
     * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
     * customSecret, and on aarch64, this prevented LDP from merging two
     * loads together for free.
 * Putting the loads together before the stores properly generates LDP.
 */
    __asm__("" : "+r"(dest));
#endif

    /* GCC -O2 needs the loop unrolled manually */
    dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
    dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
    dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
    dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
    dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
    dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);

  }

}

#endif

#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)

#ifndef XXH_TARGET_SSE2
#define XXH_TARGET_SSE2 /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
    const void *XXH_RESTRICT secret) {

  /* SSE2 is just a half-scale version of the AVX2 version. */
  XXH_ASSERT((((size_t)acc) & 15) == 0);
  {

    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
    /* Unaligned. This is mainly for pointer arithmetic, and because
     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
    const __m128i *const xinput = (const __m128i *)input;
    /* Unaligned. This is mainly for pointer arithmetic, and because
     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
    const __m128i *const xsecret = (const __m128i *)secret;

    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {

      /* data_vec = xinput[i]; */
      __m128i const data_vec = _mm_loadu_si128(xinput + i);
      /* key_vec = xsecret[i]; */
      __m128i const key_vec = _mm_loadu_si128(xsecret + i);
      /* data_key = data_vec ^ key_vec; */
      __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
      /* data_key_lo = data_key >> 32; */
      __m128i const data_key_lo =
          _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
      /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
      __m128i const product = _mm_mul_epu32(data_key, data_key_lo);
      /* xacc[i] += swap(data_vec); */
      __m128i const data_swap =
          _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
      __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
      /* xacc[i] += product; */
      xacc[i] = _mm_add_epi64(product, sum);

    }

  }

}

XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
    void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {

  XXH_ASSERT((((size_t)acc) & 15) == 0);
  {

    XXH_ALIGN(16) __m128i *const xacc = (__m128i *)acc;
    /* Unaligned. This is mainly for pointer arithmetic, and because
     * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
    const __m128i *const xsecret = (const __m128i *)secret;
    const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);

    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {

      /* xacc[i] ^= (xacc[i] >> 47) */
      __m128i const acc_vec = xacc[i];
      __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
      __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
      /* xacc[i] ^= xsecret[i]; */
      __m128i const key_vec = _mm_loadu_si128(xsecret + i);
      __m128i const data_key = _mm_xor_si128(data_vec, key_vec);

      /* xacc[i] *= XXH_PRIME32_1; */
      __m128i const data_key_hi =
          _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
      __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
      __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
      xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));

    }

  }

}

XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
    void *XXH_RESTRICT customSecret, xxh_u64 seed64) {

  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
  (void)(&XXH_writeLE64);
  {

    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);

#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
    /* MSVC 32-bit mode does not support _mm_set_epi64x before VS2015 */
    XXH_ALIGN(16)
    const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, -(xxh_i64)seed64};
    __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
#else
    __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
#endif
    int i;

    XXH_ALIGN(64) const float *const src = (float const *)XXH3_kSecret;
    XXH_ALIGN(XXH_SEC_ALIGN) __m128i *dest = (__m128i *)customSecret;
#if defined(__GNUC__) || defined(__clang__)
    /*
     * On GCC & Clang, marking 'dest' as modified causes the compiler to:
     *   - not extract the secret from sse registers in the internal loop
     *   - use fewer common registers, and avoid pushing them onto the stack
     */
    __asm__("" : "+r"(dest));
#endif

    for (i = 0; i < nbRounds; ++i) {

      dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src + i * 4)), seed);

    }

  }

}

#endif

#if (XXH_VECTOR == XXH_NEON)

XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
    void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
    const void *XXH_RESTRICT secret) {

  XXH_ASSERT((((size_t)acc) & 15) == 0);
  {

    XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *)acc;
    /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
3906 */ 3907 uint8_t const *const xinput = (const uint8_t *)input; 3908 uint8_t const *const xsecret = (const uint8_t *)secret; 3909 3910 size_t i; 3911 for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { 3912 3913 /* data_vec = xinput[i]; */ 3914 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); 3915 /* key_vec = xsecret[i]; */ 3916 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); 3917 uint64x2_t data_key; 3918 uint32x2_t data_key_lo, data_key_hi; 3919 /* xacc[i] += swap(data_vec); */ 3920 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); 3921 uint64x2_t const swapped = vextq_u64(data64, data64, 1); 3922 xacc[i] = vaddq_u64(xacc[i], swapped); 3923 /* data_key = data_vec ^ key_vec; */ 3924 data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); 3925 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); 3926 * data_key_hi = (uint32x2_t) (data_key >> 32); 3927 * data_key = UNDEFINED; */ 3928 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); 3929 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ 3930 xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi); 3931 3932 } 3933 3934 } 3935 3936 } 3937 3938 XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT acc, 3939 const void *XXH_RESTRICT secret) { 3940 3941 XXH_ASSERT((((size_t)acc) & 15) == 0); 3942 3943 { 3944 3945 uint64x2_t * xacc = (uint64x2_t *)acc; 3946 uint8_t const *xsecret = (uint8_t const *)secret; 3947 uint32x2_t prime = vdup_n_u32(XXH_PRIME32_1); 3948 3949 size_t i; 3950 for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { 3951 3952 /* xacc[i] ^= (xacc[i] >> 47); */ 3953 uint64x2_t acc_vec = xacc[i]; 3954 uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); 3955 uint64x2_t data_vec = veorq_u64(acc_vec, shifted); 3956 3957 /* xacc[i] ^= xsecret[i]; */ 3958 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); 3959 uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); 3960 3961 /* xacc[i] *= XXH_PRIME32_1 */ 3962 uint32x2_t data_key_lo, data_key_hi; 3963 /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); 3964 * data_key_hi = (uint32x2_t) (xacc[i] >> 32); 3965 * xacc[i] = UNDEFINED; */ 3966 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); 3967 { /* 3968 * prod_hi = (data_key >> 32) * XXH_PRIME32_1; 3969 * 3970 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will 3971 * incorrectly "optimize" this: 3972 * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); 3973 * shifted = vshll_n_u32(tmp, 32); 3974 * to this: 3975 * tmp = "vmulq_u64"(a, b); // no such thing! 3976 * shifted = vshlq_n_u64(tmp, 32); 3977 * 3978 * However, unlike SSE, Clang lacks a 64-bit multiply routine 3979 * for NEON, and it scalarizes two 64-bit multiplies instead. 3980 * 3981 * vmull_u32 has the same timing as vmul_u32, and it avoids 3982 * this bug completely. 
3983 * See https://bugs.llvm.org/show_bug.cgi?id=39967 3984 */ 3985 uint64x2_t prod_hi = vmull_u32(data_key_hi, prime); 3986 /* xacc[i] = prod_hi << 32; */ 3987 xacc[i] = vshlq_n_u64(prod_hi, 32); 3988 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ 3989 xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); 3990 3991 } 3992 3993 } 3994 3995 } 3996 3997 } 3998 3999 #endif 4000 4001 #if (XXH_VECTOR == XXH_VSX) 4002 4003 XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc, 4004 const void *XXH_RESTRICT input, 4005 const void *XXH_RESTRICT secret) { 4006 4007 xxh_u64x2 *const xacc = (xxh_u64x2 *)acc; /* presumed aligned */ 4008 xxh_u64x2 const *const xinput = 4009 (xxh_u64x2 const *)input; /* no alignment restriction */ 4010 xxh_u64x2 const *const xsecret = 4011 (xxh_u64x2 const *)secret; /* no alignment restriction */ 4012 xxh_u64x2 const v32 = {32, 32}; 4013 size_t i; 4014 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 4015 4016 /* data_vec = xinput[i]; */ 4017 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); 4018 /* key_vec = xsecret[i]; */ 4019 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); 4020 xxh_u64x2 const data_key = data_vec ^ key_vec; 4021 /* shuffled = (data_key << 32) | (data_key >> 32); */ 4022 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); 4023 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 4024 * 0xFFFFFFFF); */ 4025 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); 4026 xacc[i] += product; 4027 4028 /* swap high and low halves */ 4029 #ifdef __s390x__ 4030 xacc[i] += vec_permi(data_vec, data_vec, 2); 4031 #else 4032 xacc[i] += vec_xxpermdi(data_vec, data_vec, 2); 4033 #endif 4034 4035 } 4036 4037 } 4038 4039 XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT acc, 4040 const void *XXH_RESTRICT secret) { 4041 4042 XXH_ASSERT((((size_t)acc) & 15) == 0); 4043 4044 { 4045 4046 xxh_u64x2 *const xacc = (xxh_u64x2 *)acc; 4047 const xxh_u64x2 *const xsecret = (const xxh_u64x2 *)secret; 4048 /* constants */ 4049 xxh_u64x2 const v32 = {32, 32}; 4050 xxh_u64x2 const v47 = {47, 47}; 4051 xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, 4052 XXH_PRIME32_1}; 4053 size_t i; 4054 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 4055 4056 /* xacc[i] ^= (xacc[i] >> 47); */ 4057 xxh_u64x2 const acc_vec = xacc[i]; 4058 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); 4059 4060 /* xacc[i] ^= xsecret[i]; */ 4061 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); 4062 xxh_u64x2 const data_key = data_vec ^ key_vec; 4063 4064 /* xacc[i] *= XXH_PRIME32_1 */ 4065 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 4066 * 0xFFFFFFFF); */ 4067 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); 4068 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ 4069 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); 4070 xacc[i] = prod_odd + (prod_even << v32); 4071 4072 } 4073 4074 } 4075 4076 } 4077 4078 #endif 4079 4080 /* scalar variants - universal */ 4081 4082 XXH_FORCE_INLINE void XXH3_accumulate_512_scalar( 4083 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input, 4084 const void *XXH_RESTRICT secret) { 4085 4086 XXH_ALIGN(XXH_ACC_ALIGN) 4087 xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */ 4088 const xxh_u8 *const xinput = 4089 (const xxh_u8 *)input; /* no alignment restriction */ 4090 const xxh_u8 *const xsecret = 4091 (const xxh_u8 *)secret; /* no alignment 
restriction */ 4092 size_t i; 4093 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0); 4094 for (i = 0; i < XXH_ACC_NB; i++) { 4095 4096 xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i); 4097 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8); 4098 xacc[i ^ 1] += data_val; /* swap adjacent lanes */ 4099 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); 4100 4101 } 4102 4103 } 4104 4105 XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT acc, 4106 const void *XXH_RESTRICT secret) { 4107 4108 XXH_ALIGN(XXH_ACC_ALIGN) 4109 xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */ 4110 const xxh_u8 *const xsecret = 4111 (const xxh_u8 *)secret; /* no alignment restriction */ 4112 size_t i; 4113 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0); 4114 for (i = 0; i < XXH_ACC_NB; i++) { 4115 4116 xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i); 4117 xxh_u64 acc64 = xacc[i]; 4118 acc64 = XXH_xorshift64(acc64, 47); 4119 acc64 ^= key64; 4120 acc64 *= XXH_PRIME32_1; 4121 xacc[i] = acc64; 4122 4123 } 4124 4125 } 4126 4127 XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar( 4128 void *XXH_RESTRICT customSecret, xxh_u64 seed64) { 4129 4130 /* 4131 * We need a separate pointer for the hack below, 4132 * which requires a non-const pointer. 4133 * Any decent compiler will optimize this out otherwise. 4134 */ 4135 const xxh_u8 *kSecretPtr = XXH3_kSecret; 4136 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); 4137 4138 #if defined(__clang__) && defined(__aarch64__) 4139 /* 4140 * UGLY HACK: 4141 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are 4142 * placed sequentially, in order, at the top of the unrolled loop. 4143 * 4144 * While MOVK is great for generating constants (2 cycles for a 64-bit 4145 * constant compared to 4 cycles for LDR), long MOVK chains stall the 4146 * integer pipelines: 4147 * I L S 4148 * MOVK 4149 * MOVK 4150 * MOVK 4151 * MOVK 4152 * ADD 4153 * SUB STR 4154 * STR 4155 * By forcing loads from memory (as the asm line causes Clang to assume 4156 * that XXH3_kSecretPtr has been changed), the pipelines are used more 4157 * efficiently: 4158 * I L S 4159 * LDR 4160 * ADD LDR 4161 * SUB STR 4162 * STR 4163 * XXH3_64bits_withSeed, len == 256, Snapdragon 835 4164 * without hack: 2654.4 MB/s 4165 * with hack: 3202.9 MB/s 4166 */ 4167 __asm__("" : "+r"(kSecretPtr)); 4168 #endif 4169 /* 4170 * Note: in debug mode, this overrides the asm optimization 4171 * and Clang will emit MOVK chains again. 4172 */ 4173 XXH_ASSERT(kSecretPtr == XXH3_kSecret); 4174 4175 { 4176 4177 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; 4178 int i; 4179 for (i = 0; i < nbRounds; i++) { 4180 4181 /* 4182 * The asm hack causes Clang to assume that kSecretPtr aliases with 4183 * customSecret, and on aarch64, this prevented LDP from merging two 4184 * loads together for free. Putting the loads together before the stores 4185 * properly generates LDP. 
4186 */ 4187 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64; 4188 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64; 4189 XXH_writeLE64((xxh_u8 *)customSecret + 16 * i, lo); 4190 XXH_writeLE64((xxh_u8 *)customSecret + 16 * i + 8, hi); 4191 4192 } 4193 4194 } 4195 4196 } 4197 4198 typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *, 4199 const void *); 4200 typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *); 4201 typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64); 4202 4203 #if (XXH_VECTOR == XXH_AVX512) 4204 4205 #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 4206 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 4207 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 4208 4209 #elif (XXH_VECTOR == XXH_AVX2) 4210 4211 #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 4212 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 4213 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 4214 4215 #elif (XXH_VECTOR == XXH_SSE2) 4216 4217 #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 4218 #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 4219 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 4220 4221 #elif (XXH_VECTOR == XXH_NEON) 4222 4223 #define XXH3_accumulate_512 XXH3_accumulate_512_neon 4224 #define XXH3_scrambleAcc XXH3_scrambleAcc_neon 4225 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 4226 4227 #elif (XXH_VECTOR == XXH_VSX) 4228 4229 #define XXH3_accumulate_512 XXH3_accumulate_512_vsx 4230 #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx 4231 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 4232 4233 #else /* scalar */ 4234 4235 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar 4236 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar 4237 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 4238 4239 #endif 4240 4241 #ifndef XXH_PREFETCH_DIST 4242 #ifdef __clang__ 4243 #define XXH_PREFETCH_DIST 320 4244 #else 4245 #if (XXH_VECTOR == XXH_AVX512) 4246 #define XXH_PREFETCH_DIST 512 4247 #else 4248 #define XXH_PREFETCH_DIST 384 4249 #endif 4250 #endif /* __clang__ */ 4251 #endif /* XXH_PREFETCH_DIST */ 4252 4253 /* 4254 * XXH3_accumulate() 4255 * Loops over XXH3_accumulate_512(). 
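 * Each stripe reads 64 bytes of input, while the secret pointer advances by
 * only XXH_SECRET_CONSUME_RATE (8) bytes per stripe, so consecutive stripes
 * use heavily overlapping 64-byte windows of the secret.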
4256 * Assumption: nbStripes will not overflow the secret size 4257 */ 4258 XXH_FORCE_INLINE void XXH3_accumulate(xxh_u64 *XXH_RESTRICT acc, 4259 const xxh_u8 *XXH_RESTRICT input, 4260 const xxh_u8 *XXH_RESTRICT secret, 4261 size_t nbStripes, 4262 XXH3_f_accumulate_512 f_acc512) { 4263 4264 size_t n; 4265 for (n = 0; n < nbStripes; n++) { 4266 4267 const xxh_u8 *const in = input + n * XXH_STRIPE_LEN; 4268 XXH_PREFETCH(in + XXH_PREFETCH_DIST); 4269 f_acc512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE); 4270 4271 } 4272 4273 } 4274 4275 XXH_FORCE_INLINE void XXH3_hashLong_internal_loop( 4276 xxh_u64 *XXH_RESTRICT acc, const xxh_u8 *XXH_RESTRICT input, size_t len, 4277 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, 4278 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { 4279 4280 size_t const nbStripesPerBlock = 4281 (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; 4282 size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; 4283 size_t const nb_blocks = (len - 1) / block_len; 4284 4285 size_t n; 4286 4287 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 4288 4289 for (n = 0; n < nb_blocks; n++) { 4290 4291 XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock, 4292 f_acc512); 4293 f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); 4294 4295 } 4296 4297 /* last partial block */ 4298 XXH_ASSERT(len > XXH_STRIPE_LEN); 4299 { 4300 4301 size_t const nbStripes = 4302 ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; 4303 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); 4304 XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes, 4305 f_acc512); 4306 4307 /* last stripe */ 4308 { 4309 4310 const xxh_u8 *const p = input + len - XXH_STRIPE_LEN; 4311 #define XXH_SECRET_LASTACC_START \ 4312 7 /* not aligned on 8, last secret is different from acc & scrambler */ 4313 f_acc512(acc, p, 4314 secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); 4315 4316 } 4317 4318 } 4319 4320 } 4321 4322 XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64 *XXH_RESTRICT acc, 4323 const xxh_u8 *XXH_RESTRICT secret) { 4324 4325 return XXH3_mul128_fold64(acc[0] ^ XXH_readLE64(secret), 4326 acc[1] ^ XXH_readLE64(secret + 8)); 4327 4328 } 4329 4330 static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc, 4331 const xxh_u8 *XXH_RESTRICT secret, 4332 xxh_u64 start) { 4333 4334 xxh_u64 result64 = start; 4335 size_t i = 0; 4336 4337 for (i = 0; i < 4; i++) { 4338 4339 result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i); 4340 #if defined(__clang__) /* Clang */ \ 4341 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ 4342 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 4343 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 4344 /* 4345 * UGLY HACK: 4346 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as 4347 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
4348 * XXH3_64bits, len == 256, Snapdragon 835: 4349 * without hack: 2063.7 MB/s 4350 * with hack: 2560.7 MB/s 4351 */ 4352 __asm__("" : "+r"(result64)); 4353 #endif 4354 4355 } 4356 4357 return XXH3_avalanche(result64); 4358 4359 } 4360 4361 #define XXH3_INIT_ACC \ 4362 { \ 4363 \ 4364 XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ 4365 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \ 4366 \ 4367 } 4368 4369 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal( 4370 const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret, 4371 size_t secretSize, XXH3_f_accumulate_512 f_acc512, 4372 XXH3_f_scrambleAcc f_scramble) { 4373 4374 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 4375 4376 XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len, 4377 (const xxh_u8 *)secret, secretSize, f_acc512, 4378 f_scramble); 4379 4380 /* converge into final hash */ 4381 XXH_STATIC_ASSERT(sizeof(acc) == 64); 4382 /* do not align on 8, so that the secret is different from the accumulator 4383 */ 4384 #define XXH_SECRET_MERGEACCS_START 11 4385 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 4386 return XXH3_mergeAccs(acc, 4387 (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START, 4388 (xxh_u64)len * XXH_PRIME64_1); 4389 4390 } 4391 4392 /* 4393 * It's important for performance that XXH3_hashLong is not inlined. 4394 */ 4395 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret( 4396 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, 4397 const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) { 4398 4399 (void)seed64; 4400 return XXH3_hashLong_64b_internal(input, len, secret, secretLen, 4401 XXH3_accumulate_512, XXH3_scrambleAcc); 4402 4403 } 4404 4405 /* 4406 * It's important for performance that XXH3_hashLong is not inlined. 4407 * Since the function is not inlined, the compiler may not be able to understand 4408 * that, in some scenarios, its `secret` argument is actually a compile time 4409 * constant. This variant enforces that the compiler can detect that, and uses 4410 * this opportunity to streamline the generated code for better performance. 4411 */ 4412 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_default( 4413 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, 4414 const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) { 4415 4416 (void)seed64; 4417 (void)secret; 4418 (void)secretLen; 4419 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, 4420 sizeof(XXH3_kSecret), XXH3_accumulate_512, 4421 XXH3_scrambleAcc); 4422 4423 } 4424 4425 /* 4426 * XXH3_hashLong_64b_withSeed(): 4427 * Generate a custom key based on alteration of default XXH3_kSecret with the 4428 * seed, and then use this key for long mode hashing. 4429 * 4430 * This operation is decently fast but nonetheless costs a little bit of time. 4431 * Try to avoid it whenever possible (typically when seed==0). 4432 * 4433 * It's important for performance that XXH3_hashLong is not inlined. Not sure 4434 * why (uop cache maybe?), but the difference is large and easily measurable. 
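 *
 * Sketch of the flow below: seed == 0 reuses the built-in XXH3_kSecret;
 * any other seed derives a per-seed customSecret on the stack via
 * f_initSec() and hashes with that instead.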
4435 */ 4436 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal( 4437 const void *input, size_t len, XXH64_hash_t seed, 4438 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble, 4439 XXH3_f_initCustomSecret f_initSec) { 4440 4441 if (seed == 0) 4442 return XXH3_hashLong_64b_internal( 4443 input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble); 4444 { 4445 4446 XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 4447 f_initSec(secret, seed); 4448 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), 4449 f_acc512, f_scramble); 4450 4451 } 4452 4453 } 4454 4455 /* 4456 * It's important for performance that XXH3_hashLong is not inlined. 4457 */ 4458 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const void * input, 4459 size_t len, 4460 XXH64_hash_t seed, 4461 const xxh_u8 *secret, 4462 size_t secretLen) { 4463 4464 (void)secret; 4465 (void)secretLen; 4466 return XXH3_hashLong_64b_withSeed_internal( 4467 input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc, 4468 XXH3_initCustomSecret); 4469 4470 } 4471 4472 typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void *XXH_RESTRICT, size_t, 4473 XXH64_hash_t, 4474 const xxh_u8 *XXH_RESTRICT, size_t); 4475 4476 XXH_FORCE_INLINE XXH64_hash_t 4477 XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len, 4478 XXH64_hash_t seed64, const void *XXH_RESTRICT secret, 4479 size_t secretLen, XXH3_hashLong64_f f_hashLong) { 4480 4481 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); 4482 /* 4483 * If an action is to be taken if `secretLen` condition is not respected, 4484 * it should be done here. 4485 * For now, it's a contract pre-condition. 4486 * Adding a check and a branch here would cost performance at every hash. 4487 * Also, note that function signature doesn't offer room to return an error. 4488 */ 4489 if (len <= 16) 4490 return XXH3_len_0to16_64b((const xxh_u8 *)input, len, 4491 (const xxh_u8 *)secret, seed64); 4492 if (len <= 128) 4493 return XXH3_len_17to128_64b((const xxh_u8 *)input, len, 4494 (const xxh_u8 *)secret, secretLen, seed64); 4495 if (len <= XXH3_MIDSIZE_MAX) 4496 return XXH3_len_129to240_64b((const xxh_u8 *)input, len, 4497 (const xxh_u8 *)secret, secretLen, seed64); 4498 return f_hashLong(input, len, seed64, (const xxh_u8 *)secret, secretLen); 4499 4500 } 4501 4502 /* === Public entry point === */ 4503 4504 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) { 4505 4506 return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), 4507 XXH3_hashLong_64b_default); 4508 4509 } 4510 4511 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input, 4512 size_t len, 4513 const void *secret, 4514 size_t secretSize) { 4515 4516 return XXH3_64bits_internal(input, len, 0, secret, secretSize, 4517 XXH3_hashLong_64b_withSecret); 4518 4519 } 4520 4521 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len, 4522 XXH64_hash_t seed) { 4523 4524 return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, 4525 sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); 4526 4527 } 4528 4529 /* === XXH3 streaming === */ 4530 4531 /* 4532 * Malloc's a pointer that is always aligned to align. 4533 * 4534 * This must be freed with `XXH_alignedFree()`. 4535 * 4536 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte 4537 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 4538 * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. 
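 *
 * Illustrative layout after XXH_alignedMalloc(s, align) (a sketch; sizes
 * are not to scale):
 *
 *   base                         ptr = base + offset (aligned)
 *   v                            v
 *   +------- offset bytes -------+--------- s bytes ---------+
 *   | padding ...    offset byte |         user data         |
 *   +----------------------------+---------------------------+
 *
 * ptr[-1] stores `offset` so that XXH_alignedFree() can recover base.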
 *
 * This underalignment previously caused a rather obvious crash which went
 * completely unnoticed due to XXH3_createState() not actually being tested.
 * Credit to RedSpah for noticing this bug.
 *
 * The alignment is done manually: functions like posix_memalign or _mm_malloc
 * are avoided. To remain portable, we would have to write a fallback like
 * this anyway, and testing for the existence of library functions without
 * relying on external build tools is impossible.
 *
 * The method is simple: overallocate, manually align, and store the offset
 * to the original pointer immediately before the returned pointer.
 *
 * Align must be a power of 2 and 8 <= align <= 128.
 */
static void *XXH_alignedMalloc(size_t s, size_t align) {

  XXH_ASSERT(align <= 128 && align >= 8);  /* range check */
  XXH_ASSERT((align & (align - 1)) == 0);  /* power of 2 */
  XXH_ASSERT(s != 0 && s < (s + align));   /* empty/overflow */
  {  /* Overallocate to make room for manual realignment and an offset byte */
    xxh_u8 *base = (xxh_u8 *)XXH_malloc(s + align);
    if (base != NULL) {

      /*
       * Get the offset needed to align this pointer.
       *
       * Even if the returned pointer is aligned, there will always be
       * at least one byte to store the offset to the original pointer.
       */
      size_t offset = align - ((size_t)base & (align - 1));  /* base % align */
      /* Add the offset for the now-aligned pointer */
      xxh_u8 *ptr = base + offset;

      XXH_ASSERT((size_t)ptr % align == 0);

      /* Store the offset immediately before the returned pointer. */
      ptr[-1] = (xxh_u8)offset;
      return ptr;

    }

    return NULL;

  }

}

/*
 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
 */
static void XXH_alignedFree(void *p) {

  if (p != NULL) {

    xxh_u8 *ptr = (xxh_u8 *)p;
    /* Get the offset byte we added in XXH_alignedMalloc.
*/ 4597 xxh_u8 offset = ptr[-1]; 4598 /* Free the original malloc'd pointer */ 4599 xxh_u8 *base = ptr - offset; 4600 XXH_free(base); 4601 4602 } 4603 4604 } 4605 4606 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) { 4607 4608 XXH3_state_t *const state = 4609 (XXH3_state_t *)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); 4610 if (state == NULL) return NULL; 4611 XXH3_INITSTATE(state); 4612 return state; 4613 4614 } 4615 4616 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) { 4617 4618 XXH_alignedFree(statePtr); 4619 return XXH_OK; 4620 4621 } 4622 4623 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t * dst_state, 4624 const XXH3_state_t *src_state) { 4625 4626 memcpy(dst_state, src_state, sizeof(*dst_state)); 4627 4628 } 4629 4630 static void XXH3_64bits_reset_internal(XXH3_state_t *statePtr, 4631 XXH64_hash_t seed, const void *secret, 4632 size_t secretSize) { 4633 4634 size_t const initStart = offsetof(XXH3_state_t, bufferedSize); 4635 size_t const initLength = 4636 offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; 4637 XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); 4638 XXH_ASSERT(statePtr != NULL); 4639 /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ 4640 memset((char *)statePtr + initStart, 0, initLength); 4641 statePtr->acc[0] = XXH_PRIME32_3; 4642 statePtr->acc[1] = XXH_PRIME64_1; 4643 statePtr->acc[2] = XXH_PRIME64_2; 4644 statePtr->acc[3] = XXH_PRIME64_3; 4645 statePtr->acc[4] = XXH_PRIME64_4; 4646 statePtr->acc[5] = XXH_PRIME32_2; 4647 statePtr->acc[6] = XXH_PRIME64_5; 4648 statePtr->acc[7] = XXH_PRIME32_1; 4649 statePtr->seed = seed; 4650 statePtr->extSecret = (const unsigned char *)secret; 4651 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 4652 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; 4653 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; 4654 4655 } 4656 4657 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) { 4658 4659 if (statePtr == NULL) return XXH_ERROR; 4660 XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, 4661 XXH_SECRET_DEFAULT_SIZE); 4662 return XXH_OK; 4663 4664 } 4665 4666 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret( 4667 XXH3_state_t *statePtr, const void *secret, size_t secretSize) { 4668 4669 if (statePtr == NULL) return XXH_ERROR; 4670 XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize); 4671 if (secret == NULL) return XXH_ERROR; 4672 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; 4673 return XXH_OK; 4674 4675 } 4676 4677 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr, 4678 XXH64_hash_t seed) { 4679 4680 if (statePtr == NULL) return XXH_ERROR; 4681 if (seed == 0) return XXH3_64bits_reset(statePtr); 4682 if (seed != statePtr->seed) 4683 XXH3_initCustomSecret(statePtr->customSecret, seed); 4684 XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); 4685 return XXH_OK; 4686 4687 } 4688 4689 /* Note : when XXH3_consumeStripes() is invoked, 4690 * there must be a guarantee that at least one more byte must be consumed from 4691 * input 4692 * so that the function can blindly consume all stripes using the "normal" 4693 * secret segment */ 4694 XXH_FORCE_INLINE void XXH3_consumeStripes( 4695 xxh_u64 *XXH_RESTRICT acc, size_t *XXH_RESTRICT nbStripesSoFarPtr, 4696 size_t nbStripesPerBlock, const xxh_u8 *XXH_RESTRICT input, 4697 size_t nbStripes, const xxh_u8 *XXH_RESTRICT secret, size_t secretLimit, 4698 XXH3_f_accumulate_512 f_acc512, 
XXH3_f_scrambleAcc f_scramble) { 4699 4700 XXH_ASSERT(nbStripes <= 4701 nbStripesPerBlock); /* can handle max 1 scramble per invocation */ 4702 XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); 4703 if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { 4704 4705 /* need a scrambling operation */ 4706 size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; 4707 size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; 4708 XXH3_accumulate(acc, input, 4709 secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, 4710 nbStripesToEndofBlock, f_acc512); 4711 f_scramble(acc, secret + secretLimit); 4712 XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, 4713 nbStripesAfterBlock, f_acc512); 4714 *nbStripesSoFarPtr = nbStripesAfterBlock; 4715 4716 } else { 4717 4718 XXH3_accumulate(acc, input, 4719 secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, 4720 nbStripes, f_acc512); 4721 *nbStripesSoFarPtr += nbStripes; 4722 4723 } 4724 4725 } 4726 4727 /* 4728 * Both XXH3_64bits_update and XXH3_128bits_update use this routine. 4729 */ 4730 XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state, 4731 const xxh_u8 *input, size_t len, 4732 XXH3_f_accumulate_512 f_acc512, 4733 XXH3_f_scrambleAcc f_scramble) { 4734 4735 if (input == NULL) 4736 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ 4737 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) 4738 return XXH_OK; 4739 #else 4740 return XXH_ERROR; 4741 #endif 4742 4743 { 4744 4745 const xxh_u8 *const bEnd = input + len; 4746 const unsigned char *const secret = 4747 (state->extSecret == NULL) ? state->customSecret : state->extSecret; 4748 4749 state->totalLen += len; 4750 4751 if (state->bufferedSize + len <= 4752 XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ 4753 XXH_memcpy(state->buffer + state->bufferedSize, input, len); 4754 state->bufferedSize += (XXH32_hash_t)len; 4755 return XXH_OK; 4756 4757 } 4758 4759 /* total input is now > XXH3_INTERNALBUFFER_SIZE */ 4760 4761 #define XXH3_INTERNALBUFFER_STRIPES \ 4762 (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) 4763 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 4764 0); /* clean multiple */ 4765 4766 /* 4767 * Internal buffer is partially filled (always, except at beginning) 4768 * Complete it, then consume it. 
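 *
 * Recap of the overall update strategy (illustrative):
 *
 *   1. if the input fits, append it to the internal buffer and return;
 *   2. otherwise, top the buffer up to XXH3_INTERNALBUFFER_SIZE and
 *      consume its XXH3_INTERNALBUFFER_STRIPES stripes;
 *   3. consume full buffer-sized chunks directly from the input;
 *   4. buffer the remainder (at least 1 byte always stays buffered).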
4769 */ 4770 if (state->bufferedSize) { 4771 4772 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; 4773 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); 4774 input += loadSize; 4775 XXH3_consumeStripes(state->acc, &state->nbStripesSoFar, 4776 state->nbStripesPerBlock, state->buffer, 4777 XXH3_INTERNALBUFFER_STRIPES, secret, 4778 state->secretLimit, f_acc512, f_scramble); 4779 state->bufferedSize = 0; 4780 4781 } 4782 4783 XXH_ASSERT(input < bEnd); 4784 4785 /* Consume input by a multiple of internal buffer size */ 4786 if (input + XXH3_INTERNALBUFFER_SIZE < bEnd) { 4787 4788 const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; 4789 do { 4790 4791 XXH3_consumeStripes(state->acc, &state->nbStripesSoFar, 4792 state->nbStripesPerBlock, input, 4793 XXH3_INTERNALBUFFER_STRIPES, secret, 4794 state->secretLimit, f_acc512, f_scramble); 4795 input += XXH3_INTERNALBUFFER_SIZE; 4796 4797 } while (input < limit); 4798 4799 /* for last partial stripe */ 4800 memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, 4801 input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); 4802 4803 } 4804 4805 XXH_ASSERT(input < bEnd); 4806 4807 /* Some remaining input (always) : buffer it */ 4808 XXH_memcpy(state->buffer, input, (size_t)(bEnd - input)); 4809 state->bufferedSize = (XXH32_hash_t)(bEnd - input); 4810 4811 } 4812 4813 return XXH_OK; 4814 4815 } 4816 4817 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state, 4818 const void *input, size_t len) { 4819 4820 return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512, 4821 XXH3_scrambleAcc); 4822 4823 } 4824 4825 XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t * acc, 4826 const XXH3_state_t * state, 4827 const unsigned char *secret) { 4828 4829 /* 4830 * Digest on a local copy. This way, the state remains unaltered, and it can 4831 * continue ingesting more input afterwards. 4832 */ 4833 memcpy(acc, state->acc, sizeof(state->acc)); 4834 if (state->bufferedSize >= XXH_STRIPE_LEN) { 4835 4836 size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; 4837 size_t nbStripesSoFar = state->nbStripesSoFar; 4838 XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock, 4839 state->buffer, nbStripes, secret, state->secretLimit, 4840 XXH3_accumulate_512, XXH3_scrambleAcc); 4841 /* last stripe */ 4842 XXH3_accumulate_512(acc, 4843 state->buffer + state->bufferedSize - XXH_STRIPE_LEN, 4844 secret + state->secretLimit - XXH_SECRET_LASTACC_START); 4845 4846 } else { /* bufferedSize < XXH_STRIPE_LEN */ 4847 4848 xxh_u8 lastStripe[XXH_STRIPE_LEN]; 4849 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; 4850 XXH_ASSERT(state->bufferedSize > 4851 0); /* there is always some input buffered */ 4852 memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, 4853 catchupSize); 4854 memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); 4855 XXH3_accumulate_512(acc, lastStripe, 4856 secret + state->secretLimit - XXH_SECRET_LASTACC_START); 4857 4858 } 4859 4860 } 4861 4862 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) { 4863 4864 const unsigned char *const secret = 4865 (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; 4866 if (state->totalLen > XXH3_MIDSIZE_MAX) { 4867 4868 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; 4869 XXH3_digest_long(acc, state, secret); 4870 return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, 4871 (xxh_u64)state->totalLen * XXH_PRIME64_1); 4872 4873 } 4874 4875 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ 4876 if (state->seed) 4877 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, 4878 state->seed); 4879 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), 4880 secret, state->secretLimit + XXH_STRIPE_LEN); 4881 4882 } 4883 4884 #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) 4885 4886 XXH_PUBLIC_API void XXH3_generateSecret(void * secretBuffer, 4887 const void *customSeed, 4888 size_t customSeedSize) { 4889 4890 XXH_ASSERT(secretBuffer != NULL); 4891 if (customSeedSize == 0) { 4892 4893 memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); 4894 return; 4895 4896 } 4897 4898 XXH_ASSERT(customSeed != NULL); 4899 4900 { 4901 4902 size_t const segmentSize = sizeof(XXH128_hash_t); 4903 size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize; 4904 XXH128_canonical_t scrambler; 4905 XXH64_hash_t seeds[12]; 4906 size_t segnb; 4907 XXH_ASSERT(nbSegments == 12); 4908 XXH_ASSERT(segmentSize * nbSegments == 4909 XXH_SECRET_DEFAULT_SIZE); /* exact multiple */ 4910 XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); 4911 4912 /* 4913 * Copy customSeed to seeds[], truncating or repeating as necessary. 4914 */ 4915 { 4916 4917 size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds)); 4918 size_t filled = toFill; 4919 memcpy(seeds, customSeed, toFill); 4920 while (filled < sizeof(seeds)) { 4921 4922 toFill = XXH_MIN(filled, sizeof(seeds) - filled); 4923 memcpy((char *)seeds + filled, seeds, toFill); 4924 filled += toFill; 4925 4926 } 4927 4928 } 4929 4930 /* generate secret */ 4931 memcpy(secretBuffer, &scrambler, sizeof(scrambler)); 4932 for (segnb = 1; segnb < nbSegments; segnb++) { 4933 4934 size_t const segmentStart = segnb * segmentSize; 4935 XXH128_canonical_t segment; 4936 XXH128_canonicalFromHash(&segment, 4937 XXH128(&scrambler, sizeof(scrambler), 4938 XXH_readLE64(seeds + segnb) + segnb)); 4939 memcpy((char *)secretBuffer + segmentStart, &segment, sizeof(segment)); 4940 4941 } 4942 4943 } 4944 4945 } 4946 4947 /* ========================================== 4948 * XXH3 128 bits (a.k.a XXH128) 4949 * ========================================== 4950 * XXH3's 128-bit variant has better mixing and strength than the 64-bit 4951 * variant, even without counting the significantly larger output size. 4952 * 4953 * For example, extra steps are taken to avoid the seed-dependent collisions 4954 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). 4955 * 4956 * This strength naturally comes at the cost of some speed, especially on short 4957 * lengths. Note that longer hashes are about as fast as the 64-bit version 4958 * due to it using only a slight modification of the 64-bit loop. 4959 * 4960 * XXH128 is also more oriented towards 64-bit machines. It is still extremely 4961 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 4962 */ 4963 4964 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8 *input, 4965 size_t len, 4966 const xxh_u8 *secret, 4967 XXH64_hash_t seed) { 4968 4969 /* A doubled version of 1to3_64b with different constants. 
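 *
 * Worked example (illustrative): for input = { 0xAB, 0xCD }, len = 2, the
 * combined word built below is
 *   combinedl = (input[0] << 16) | (input[1] << 24) | input[1] | (len << 8)
 *             = 0xCDAB02CD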
*/ 4970 XXH_ASSERT(input != NULL); 4971 XXH_ASSERT(1 <= len && len <= 3); 4972 XXH_ASSERT(secret != NULL); 4973 /* 4974 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } 4975 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } 4976 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } 4977 */ 4978 { 4979 4980 xxh_u8 const c1 = input[0]; 4981 xxh_u8 const c2 = input[len >> 1]; 4982 xxh_u8 const c3 = input[len - 1]; 4983 xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) | 4984 ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 4985 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); 4986 xxh_u64 const bitflipl = 4987 (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed; 4988 xxh_u64 const bitfliph = 4989 (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed; 4990 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; 4991 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; 4992 XXH128_hash_t h128; 4993 h128.low64 = XXH64_avalanche(keyed_lo); 4994 h128.high64 = XXH64_avalanche(keyed_hi); 4995 return h128; 4996 4997 } 4998 4999 } 5000 5001 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8 *input, 5002 size_t len, 5003 const xxh_u8 *secret, 5004 XXH64_hash_t seed) { 5005 5006 XXH_ASSERT(input != NULL); 5007 XXH_ASSERT(secret != NULL); 5008 XXH_ASSERT(4 <= len && len <= 8); 5009 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 5010 { 5011 5012 xxh_u32 const input_lo = XXH_readLE32(input); 5013 xxh_u32 const input_hi = XXH_readLE32(input + len - 4); 5014 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); 5015 xxh_u64 const bitflip = 5016 (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed; 5017 xxh_u64 const keyed = input_64 ^ bitflip; 5018 5019 /* Shift len to the left to ensure it is even, this avoids even multiplies. 5020 */ 5021 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); 5022 5023 m128.high64 += (m128.low64 << 1); 5024 m128.low64 ^= (m128.high64 >> 3); 5025 5026 m128.low64 = XXH_xorshift64(m128.low64, 35); 5027 m128.low64 *= 0x9FB21C651E98DF25ULL; 5028 m128.low64 = XXH_xorshift64(m128.low64, 28); 5029 m128.high64 = XXH3_avalanche(m128.high64); 5030 return m128; 5031 5032 } 5033 5034 } 5035 5036 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8 *input, 5037 size_t len, 5038 const xxh_u8 *secret, 5039 XXH64_hash_t seed) { 5040 5041 XXH_ASSERT(input != NULL); 5042 XXH_ASSERT(secret != NULL); 5043 XXH_ASSERT(9 <= len && len <= 16); 5044 { 5045 5046 xxh_u64 const bitflipl = 5047 (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed; 5048 xxh_u64 const bitfliph = 5049 (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed; 5050 xxh_u64 const input_lo = XXH_readLE64(input); 5051 xxh_u64 input_hi = XXH_readLE64(input + len - 8); 5052 XXH128_hash_t m128 = 5053 XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); 5054 /* 5055 * Put len in the middle of m128 to ensure that the length gets mixed to 5056 * both the low and high bits in the 128x64 multiply below. 5057 */ 5058 m128.low64 += (xxh_u64)(len - 1) << 54; 5059 input_hi ^= bitfliph; 5060 /* 5061 * Add the high 32 bits of input_hi to the high 32 bits of m128, then 5062 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to 5063 * the high 64 bits of m128. 5064 * 5065 * The best approach to this operation is different on 32-bit and 64-bit. 
5066 */ 5067 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ 5068 /* 5069 * 32-bit optimized version, which is more readable. 5070 * 5071 * On 32-bit, it removes an ADC and delays a dependency between the two 5072 * halves of m128.high64, but it generates an extra mask on 64-bit. 5073 */ 5074 m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + 5075 XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); 5076 5077 } else { 5078 5079 /* 5080 * 64-bit optimized (albeit more confusing) version. 5081 * 5082 * Uses some properties of addition and multiplication to remove the mask: 5083 * 5084 * Let: 5085 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) 5086 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) 5087 * c = XXH_PRIME32_2 5088 * 5089 * a + (b * c) 5090 * Inverse Property: x + y - x == y 5091 * a + (b * (1 + c - 1)) 5092 * Distributive Property: x * (y + z) == (x * y) + (x * z) 5093 * a + (b * 1) + (b * (c - 1)) 5094 * Identity Property: x * 1 == x 5095 * a + b + (b * (c - 1)) 5096 * 5097 * Substitute a, b, and c: 5098 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 5099 * 1)) 5100 * 5101 * Since input_hi.hi + input_hi.lo == input_hi, we get this: 5102 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 5103 */ 5104 m128.high64 += 5105 input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); 5106 5107 } 5108 5109 /* m128 ^= XXH_swap64(m128 >> 64); */ 5110 m128.low64 ^= XXH_swap64(m128.high64); 5111 5112 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ 5113 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); 5114 h128.high64 += m128.high64 * XXH_PRIME64_2; 5115 5116 h128.low64 = XXH3_avalanche(h128.low64); 5117 h128.high64 = XXH3_avalanche(h128.high64); 5118 return h128; 5119 5120 } 5121 5122 } 5123 5124 } 5125 5126 /* 5127 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN 5128 */ 5129 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8 *input, 5130 size_t len, 5131 const xxh_u8 *secret, 5132 XXH64_hash_t seed) { 5133 5134 XXH_ASSERT(len <= 16); 5135 { 5136 5137 if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); 5138 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); 5139 if (len) return XXH3_len_1to3_128b(input, len, secret, seed); 5140 { 5141 5142 XXH128_hash_t h128; 5143 xxh_u64 const bitflipl = 5144 XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72); 5145 xxh_u64 const bitfliph = 5146 XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88); 5147 h128.low64 = XXH64_avalanche(seed ^ bitflipl); 5148 h128.high64 = XXH64_avalanche(seed ^ bitfliph); 5149 return h128; 5150 5151 } 5152 5153 } 5154 5155 } 5156 5157 /* 5158 * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
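 *
 * The extra robustness comes from the cross-XOR below: even when one of the
 * XXH3_mix16B() products degenerates (e.g. a multiply by zero), the raw
 * 64-bit reads of the opposite input block are still folded into the
 * accumulator.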
5159 */ 5160 XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc, 5161 const xxh_u8 *input_1, 5162 const xxh_u8 *input_2, 5163 const xxh_u8 *secret, 5164 XXH64_hash_t seed) { 5165 5166 acc.low64 += XXH3_mix16B(input_1, secret + 0, seed); 5167 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); 5168 acc.high64 += XXH3_mix16B(input_2, secret + 16, seed); 5169 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); 5170 return acc; 5171 5172 } 5173 5174 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b( 5175 const xxh_u8 *XXH_RESTRICT input, size_t len, 5176 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { 5177 5178 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 5179 (void)secretSize; 5180 XXH_ASSERT(16 < len && len <= 128); 5181 5182 { 5183 5184 XXH128_hash_t acc; 5185 acc.low64 = len * XXH_PRIME64_1; 5186 acc.high64 = 0; 5187 if (len > 32) { 5188 5189 if (len > 64) { 5190 5191 if (len > 96) { 5192 5193 acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96, 5194 seed); 5195 5196 } 5197 5198 acc = 5199 XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed); 5200 5201 } 5202 5203 acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed); 5204 5205 } 5206 5207 acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed); 5208 { 5209 5210 XXH128_hash_t h128; 5211 h128.low64 = acc.low64 + acc.high64; 5212 h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) + 5213 ((len - seed) * XXH_PRIME64_2); 5214 h128.low64 = XXH3_avalanche(h128.low64); 5215 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 5216 return h128; 5217 5218 } 5219 5220 } 5221 5222 } 5223 5224 XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b( 5225 const xxh_u8 *XXH_RESTRICT input, size_t len, 5226 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) { 5227 5228 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 5229 (void)secretSize; 5230 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 5231 5232 { 5233 5234 XXH128_hash_t acc; 5235 int const nbRounds = (int)len / 32; 5236 int i; 5237 acc.low64 = len * XXH_PRIME64_1; 5238 acc.high64 = 0; 5239 for (i = 0; i < 4; i++) { 5240 5241 acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, 5242 secret + (32 * i), seed); 5243 5244 } 5245 5246 acc.low64 = XXH3_avalanche(acc.low64); 5247 acc.high64 = XXH3_avalanche(acc.high64); 5248 XXH_ASSERT(nbRounds >= 4); 5249 for (i = 4; i < nbRounds; i++) { 5250 5251 acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16, 5252 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), 5253 seed); 5254 5255 } 5256 5257 /* last bytes */ 5258 acc = XXH128_mix32B( 5259 acc, input + len - 16, input + len - 32, 5260 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 5261 0ULL - seed); 5262 5263 { 5264 5265 XXH128_hash_t h128; 5266 h128.low64 = acc.low64 + acc.high64; 5267 h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) + 5268 ((len - seed) * XXH_PRIME64_2); 5269 h128.low64 = XXH3_avalanche(h128.low64); 5270 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 5271 return h128; 5272 5273 } 5274 5275 } 5276 5277 } 5278 5279 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal( 5280 const void *XXH_RESTRICT input, size_t len, 5281 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, 5282 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) { 5283 5284 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 5285 5286 
XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(
    const void *XXH_RESTRICT input, size_t len,
    const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {

  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

  XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len, secret,
                              secretSize, f_acc512, f_scramble);

  /* converge into final hash */
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
  {

    XXH128_hash_t h128;
    h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                                (xxh_u64)len * XXH_PRIME64_1);
    h128.high64 = XXH3_mergeAccs(
        acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
        ~((xxh_u64)len * XXH_PRIME64_2));
    return h128;

  }

}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_default(
    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
    const void *XXH_RESTRICT secret, size_t secretLen) {

  (void)seed64;
  (void)secret;
  (void)secretLen;
  return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret,
                                     sizeof(XXH3_kSecret), XXH3_accumulate_512,
                                     XXH3_scrambleAcc);

}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(
    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
    const void *XXH_RESTRICT secret, size_t secretLen) {

  (void)seed64;
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
                                     secretLen, XXH3_accumulate_512,
                                     XXH3_scrambleAcc);

}

XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(
    const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
    XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
    XXH3_f_initCustomSecret f_initSec) {

  if (seed64 == 0)
    return XXH3_hashLong_128b_internal(
        input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
  {

    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    f_initSec(secret, seed64);
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
                                       sizeof(secret), f_acc512, f_scramble);

  }

}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void *input, size_t len, XXH64_hash_t seed64,
                            const void *XXH_RESTRICT secret, size_t secretLen) {

  (void)secret;
  (void)secretLen;
  return XXH3_hashLong_128b_withSeed_internal(
      input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc,
      XXH3_initCustomSecret);

}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void *XXH_RESTRICT, size_t,
                                            XXH64_hash_t,
                                            const void *XXH_RESTRICT, size_t);
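/*
 * Note (illustrative): because XXH3_hashLong_128b_withSeed_internal above
 * falls back to the default secret when seed64 == 0, hashing with seed 0 is
 * exactly the unseeded variant, for short and long inputs alike. `buf` and
 * `n` below are hypothetical:
 */
#if 0
  XXH128_hash_t const a = XXH3_128bits(buf, n);
  XXH128_hash_t const b = XXH3_128bits_withSeed(buf, n, 0);
  /* XXH128_isEqual(a, b) != 0, whatever buf and n contain */
#endif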
XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
                      const void *XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128) {

  XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
  /*
   * If an action is to be taken if `secret` conditions are not respected,
   * it should be done here.
   * For now, it's a contract pre-condition.
   * Adding a check and a branch here would cost performance at every hash.
   */
  if (len <= 16)
    return XXH3_len_0to16_128b((const xxh_u8 *)input, len,
                               (const xxh_u8 *)secret, seed64);
  if (len <= 128)
    return XXH3_len_17to128_128b((const xxh_u8 *)input, len,
                                 (const xxh_u8 *)secret, secretLen, seed64);
  if (len <= XXH3_MIDSIZE_MAX)
    return XXH3_len_129to240_128b((const xxh_u8 *)input, len,
                                  (const xxh_u8 *)secret, secretLen, seed64);
  return f_hl128(input, len, seed64, secret, secretLen);

}

/* === Public XXH128 API === */

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {

  return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
                               sizeof(XXH3_kSecret),
                               XXH3_hashLong_128b_default);

}

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
                                                     size_t len,
                                                     const void *secret,
                                                     size_t secretSize) {

  return XXH3_128bits_internal(input, len, 0, (const xxh_u8 *)secret,
                               secretSize, XXH3_hashLong_128b_withSecret);

}

XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *input,
                                                   size_t len,
                                                   XXH64_hash_t seed) {

  return XXH3_128bits_internal(input, len, seed, XXH3_kSecret,
                               sizeof(XXH3_kSecret),
                               XXH3_hashLong_128b_withSeed);

}

XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
                                    XXH64_hash_t seed) {

  return XXH3_128bits_withSeed(input, len, seed);

}

/* === XXH3 128-bit streaming === */

/*
 * All the functions are actually the same as for the 64-bit streaming variant.
 * The only difference is the finalization routine.
 */

static void XXH3_128bits_reset_internal(XXH3_state_t *statePtr,
                                        XXH64_hash_t seed, const void *secret,
                                        size_t secretSize) {

  XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);

}

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {

  if (statePtr == NULL) return XXH_ERROR;
  XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret,
                              XXH_SECRET_DEFAULT_SIZE);
  return XXH_OK;

}

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
    XXH3_state_t *statePtr, const void *secret, size_t secretSize) {

  if (statePtr == NULL) return XXH_ERROR;
  XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
  if (secret == NULL) return XXH_ERROR;
  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
  return XXH_OK;

}

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
                                                         XXH64_hash_t seed) {

  if (statePtr == NULL) return XXH_ERROR;
  if (seed == 0) return XXH3_128bits_reset(statePtr);
  if (seed != statePtr->seed)
    XXH3_initCustomSecret(statePtr->customSecret, seed);
  XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
  return XXH_OK;

}

XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
                                                 const void *input,
                                                 size_t len) {

  return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
                     XXH3_scrambleAcc);

}
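/*
 * Streaming usage sketch (illustrative only; `part1`/`part2` and their
 * lengths are hypothetical). Feeding the input in pieces through the
 * functions above, then finalizing with XXH3_128bits_digest() below, yields
 * the same hash as a one-shot XXH3_128bits() over the whole input:
 */
#if 0
  XXH3_state_t *const st = XXH3_createState();
  if (st != NULL && XXH3_128bits_reset(st) == XXH_OK) {

    XXH3_128bits_update(st, part1, part1_len);
    XXH3_128bits_update(st, part2, part2_len);
    {

      XXH128_hash_t const h = XXH3_128bits_digest(st);
      /* h equals XXH3_128bits() over part1 and part2 concatenated */
      (void)h;

    }

  }

  XXH3_freeState(st);
#endif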
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {

  const unsigned char *const secret =
      (state->extSecret == NULL) ? state->customSecret : state->extSecret;
  if (state->totalLen > XXH3_MIDSIZE_MAX) {

    XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
    XXH3_digest_long(acc, state, secret);
    XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >=
               sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    {

      XXH128_hash_t h128;
      h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
                                  (xxh_u64)state->totalLen * XXH_PRIME64_1);
      h128.high64 =
          XXH3_mergeAccs(acc,
                         secret + state->secretLimit + XXH_STRIPE_LEN -
                             sizeof(acc) - XXH_SECRET_MERGEACCS_START,
                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
      return h128;

    }

  }

  /* len <= XXH3_MIDSIZE_MAX : short code */
  if (state->seed)
    return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen,
                                 state->seed);
  return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                 secret, state->secretLimit + XXH_STRIPE_LEN);

}

/* 128-bit utility functions */

#include <string.h>                                    /* memcmp, memcpy */

/* return : 1 if equal, 0 if different */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {

  /* note : XXH128_hash_t is compact, it has no padding byte */
  return !(memcmp(&h1, &h2, sizeof(h1)));

}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1 > *h128_2
 *          <0 if *h128_1 < *h128_2
 *          =0 if *h128_1 == *h128_2 */
XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {

  XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
  XXH128_hash_t const h2 = *(const XXH128_hash_t *)h128_2;
  int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
  /* note : this bets that, in most cases, hash values are different */
  if (hcmp) return hcmp;
  return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);

}

/*====== Canonical representation ======*/
XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
                                             XXH128_hash_t hash) {

  XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
  if (XXH_CPU_LITTLE_ENDIAN) {

    hash.high64 = XXH_swap64(hash.high64);
    hash.low64 = XXH_swap64(hash.low64);

  }

  memcpy(dst, &hash.high64, sizeof(hash.high64));
  memcpy((char *)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));

}

XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t *src) {

  XXH128_hash_t h;
  h.high64 = XXH_readBE64(src);
  h.low64 = XXH_readBE64(src->digest + 8);
  return h;

}

/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2                    /* AVX2 */            \
    && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */  \
    && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
#pragma GCC pop_options
#endif

#endif                                             /* XXH_NO_LONG_LONG */

#endif                                             /* XXH_IMPLEMENTATION */

#if defined(__cplusplus)

}

#endif