1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #include "util/hash.h"
11 
12 #include <string>
13 
14 #include "port/lang.h"
15 #include "util/coding.h"
16 #include "util/hash128.h"
17 #include "util/math128.h"
18 #include "util/xxhash.h"
19 #include "util/xxph3.h"
20 
21 namespace ROCKSDB_NAMESPACE {
22 
23 uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64;
24 
Hash(const char * data,size_t n,uint32_t seed)25 uint32_t Hash(const char* data, size_t n, uint32_t seed) {
26   // MurmurHash1 - fast but mediocre quality
27   // https://github.com/aappleby/smhasher/wiki/MurmurHash1
28   //
29   const uint32_t m = 0xc6a4a793;
30   const uint32_t r = 24;
31   const char* limit = data + n;
32   uint32_t h = static_cast<uint32_t>(seed ^ (n * m));
33 
34   // Pick up four bytes at a time
35   while (data + 4 <= limit) {
36     uint32_t w = DecodeFixed32(data);
37     data += 4;
38     h += w;
39     h *= m;
40     h ^= (h >> 16);
41   }
42 
43   // Pick up remaining bytes
44   switch (limit - data) {
45     // Note: The original hash implementation used data[i] << shift, which
46     // promotes the char to int and then performs the shift. If the char is
47     // negative, the shift is undefined behavior in C++. The hash algorithm is
48     // part of the format definition, so we cannot change it; to obtain the same
49     // behavior in a legal way we just cast to uint32_t, which will do
50     // sign-extension. To guarantee compatibility with architectures where chars
51     // are unsigned we first cast the char to int8_t.
52     case 3:
53       h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
54       FALLTHROUGH_INTENDED;
55     case 2:
56       h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
57       FALLTHROUGH_INTENDED;
58     case 1:
59       h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
60       h *= m;
61       h ^= (h >> r);
62       break;
63   }
64   return h;
65 }
66 
67 // We are standardizing on a preview release of XXH3, because that's
68 // the best available at time of standardizing.
69 //
70 // In testing (mostly Intel Skylake), this hash function is much more
71 // thorough than Hash32 and is almost universally faster. Hash() only
72 // seems faster when passing runtime-sized keys of the same small size
73 // (less than about 24 bytes) thousands of times in a row; this seems
74 // to allow the branch predictor to work some magic. XXH3's speed is
75 // much less dependent on branch prediction.
76 //
77 // Hashing with a prefix extractor is potentially a common case of
78 // hashing objects of small, predictable size. We could consider
79 // bundling hash functions specialized for particular lengths with
80 // the prefix extractors.
Hash64(const char * data,size_t n,uint64_t seed)81 uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
82   return XXPH3_64bits_withSeed(data, n, seed);
83 }
84 
Hash64(const char * data,size_t n)85 uint64_t Hash64(const char* data, size_t n) {
86   // Same as seed = 0
87   return XXPH3_64bits(data, n);
88 }
89 
GetSlicePartsNPHash64(const SliceParts & data,uint64_t seed)90 uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
91   // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation.
92   size_t concat_len = 0;
93   for (int i = 0; i < data.num_parts; ++i) {
94     concat_len += data.parts[i].size();
95   }
96   std::string concat_data;
97   concat_data.reserve(concat_len);
98   for (int i = 0; i < data.num_parts; ++i) {
99     concat_data.append(data.parts[i].data(), data.parts[i].size());
100   }
101   assert(concat_data.size() == concat_len);
102   return NPHash64(concat_data.data(), concat_len, seed);
103 }
104 
Hash128(const char * data,size_t n,uint64_t seed)105 Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
106   auto h = XXH3_128bits_withSeed(data, n, seed);
107   return (Unsigned128{h.high64} << 64) | (h.low64);
108 }
109 
Hash128(const char * data,size_t n)110 Unsigned128 Hash128(const char* data, size_t n) {
111   // Same as seed = 0
112   auto h = XXH3_128bits(data, n);
113   return (Unsigned128{h.high64} << 64) | (h.low64);
114 }
115 
Hash2x64(const char * data,size_t n,uint64_t * high64,uint64_t * low64)116 void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) {
117   // Same as seed = 0
118   auto h = XXH3_128bits(data, n);
119   *high64 = h.high64;
120   *low64 = h.low64;
121 }
122 
Hash2x64(const char * data,size_t n,uint64_t seed,uint64_t * high64,uint64_t * low64)123 void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
124               uint64_t* low64) {
125   auto h = XXH3_128bits_withSeed(data, n, seed);
126   *high64 = h.high64;
127   *low64 = h.low64;
128 }
129 
130 namespace {
131 
XXH3_avalanche(uint64_t h64)132 inline uint64_t XXH3_avalanche(uint64_t h64) {
133   h64 ^= h64 >> 37;
134   h64 *= 0x165667919E3779F9U;
135   h64 ^= h64 >> 32;
136   return h64;
137 }
138 
XXH3_unavalanche(uint64_t h64)139 inline uint64_t XXH3_unavalanche(uint64_t h64) {
140   h64 ^= h64 >> 32;
141   h64 *= 0x8da8ee41d6df849U;  // inverse of 0x165667919E3779F9U
142   h64 ^= h64 >> 37;
143   return h64;
144 }
145 
146 }  // namespace
147 
BijectiveHash2x64(uint64_t in_high64,uint64_t in_low64,uint64_t seed,uint64_t * out_high64,uint64_t * out_low64)148 void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
149                        uint64_t* out_high64, uint64_t* out_low64) {
150   // Adapted from XXH3_len_9to16_128b
151   const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
152   const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
153   Unsigned128 tmp128 =
154       Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U);
155   uint64_t lo = Lower64of128(tmp128);
156   uint64_t hi = Upper64of128(tmp128);
157   lo += 0x3c0000000000000U;  // (len - 1) << 54
158   in_high64 ^= bitfliph;
159   hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76});
160   lo ^= EndianSwapValue(hi);
161   tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU);
162   lo = Lower64of128(tmp128);
163   hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU);
164   *out_low64 = XXH3_avalanche(lo);
165   *out_high64 = XXH3_avalanche(hi);
166 }
167 
BijectiveUnhash2x64(uint64_t in_high64,uint64_t in_low64,uint64_t seed,uint64_t * out_high64,uint64_t * out_low64)168 void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
169                          uint64_t* out_high64, uint64_t* out_low64) {
170   // Inverted above (also consulting XXH3_len_9to16_128b)
171   const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
172   const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
173   uint64_t lo = XXH3_unavalanche(in_low64);
174   uint64_t hi = XXH3_unavalanche(in_high64);
175   lo *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
176   hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU));
177   hi *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
178   lo ^= EndianSwapValue(hi);
179   lo -= 0x3c0000000000000U;
180   lo *= 0x887493432badb37U;  // inverse of 0x9E3779B185EBCA87U
181   hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U));
182   uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47;  // inverse of 0x85EBCA77
183   hi -= tmp32;
184   hi = (hi & 0xFFFFFFFF00000000U) -
185        ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32;
186   hi ^= bitfliph;
187   lo ^= hi ^ bitflipl;
188   *out_high64 = hi;
189   *out_low64 = lo;
190 }
191 
BijectiveHash2x64(uint64_t in_high64,uint64_t in_low64,uint64_t * out_high64,uint64_t * out_low64)192 void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
193                        uint64_t* out_high64, uint64_t* out_low64) {
194   BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
195 }
196 
BijectiveUnhash2x64(uint64_t in_high64,uint64_t in_low64,uint64_t * out_high64,uint64_t * out_low64)197 void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
198                          uint64_t* out_high64, uint64_t* out_low64) {
199   BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
200 }
201 }  // namespace ROCKSDB_NAMESPACE
202